R - How to 'create' or plot missing data? - r

I have a datset, AIS_dat, which looks at the number of boats (BoatCount) present at three sites (Site) on different days of the week (Day), before and during a Covid lockdown.
rm(list = ls())
setwd('K:/SoundTrap/Boats/PSD Output/Duty cycle data/TOL analysis')
getwd()
AIS_dat<-read.csv("AllSitesConcat_dBcalcs_50-24000Hz_matchedCameraCounts.csv")
str(AIS_dat)
#set factors
AIS_dat$Lockdown <- as.factor(AIS_dat$Lockdown)
#change order of sites
AIS_dat$Site<-factor(AIS_dat$Site,
level=c('Kawau','Tiritiri','Noises'))
#change order of days
AIS_dat$Day<-factor(AIS_dat$Day,
level=c('Mon','Tue','Wed','Thu','Fri','Sat','Sun'))
#hour of day as factor
AIS_dat$Hour <- as.factor(AIS_dat$Hour)
#Look at variation between sites
bp<-ggplot(AIS_dat, aes(x=Day,y=BoatCount,fill=factor(Site))) +
geom_boxplot()+
ylab(expression("Number of Boats"))+
xlab("Day of Week")+
scale_fill_manual(values = get_pal("Kereru"),
name="Site") +
theme_bw()
bp
bp<-bp+theme(axis.text.x = element_text(angle = 0,size=14),
axis.text.y = element_text(size=14),
axis.title.x = element_text(size=14),
axis.title.y =element_text(size=14),
#legend.title = element_text(size = 14),
#legend.text = element_text(size = 14)
) #rotate x-axis labels
bp<-bp+facet_grid(rows=vars(Lockdown)) #separate plot for each season
bp
The plot looks like this
...which is awesome. BUT, because there is no data for Tiritiri under 'During', we only see two sites, and it is a bit misleading as the order of the sites then changes. I'd like to have missing data in this panel to make it clear which site is which. How would I do this? Would I have to add missing data to my original datasheet somehow? I tried this but it didn't work, so if that is the best approach I am not sure which way to do it?
The dataset looks like this (but has 17143 rows):

You can combine site and day into a new column used for the x axis ticks. The function facet will arrange the x tick values by default in a fixed way:
library(tidyverse)
set.seed(1337)
before_data <-
tribble(
~step, ~Site,
"before", "K",
"before", "T",
"before", "N"
) %>%
mutate(
value = rnorm(10) %>% list()
) %>%
unnest(value)
during_data <-
tribble(
~step, ~Site,
"during", "K",
"during", "N"
) %>%
mutate(
value = rnorm(10) %>% list()
) %>%
unnest(value)
data <- bind_rows(before_data, during_data) %>% mutate(day = "Monday")
data %>%
mutate(x = paste0(day, Site)) %>%
ggplot(aes(x, value, color = Site)) +
geom_boxplot() +
facet_grid(~step)
Created on 2021-09-13 by the reprex package (v2.0.1)

Related

Order the plotting of data by the contents of a date column

I have dataset as follows:
df = data.frame(name = c('Ger1', 'Ger2', 'Ger3', 'Ger4', 'Ger5', 'Ger6'),
issued = c(UKS, USD, UKS, UKS, USD, USD),
mat = c(2024-01-31, 2023-01-31, 2026-10-22, 2022-07-22, 2029-01-31, 2025-06-07)
volume = c(0.476, 0.922, 0.580, 1.259, 0.932, 0.417)
I currently plot (and filter) the data using the following code:
plot1<- ggplot(subset(df, issued == "UKS")) +
geom_bar(stat="identity", aes(x=volume,y=name),fill="#1170aa")+
theme(title=element_text(size=12),
panel.background = element_rect(fill='white',color='black'),
legend.position='right')+
labs(title = "Total carriage by Volume on the day", x = "Volume", y = "Name")
I'd like to be able to order this data using the 'mat' column as guide, namely with the data that has the earliest 'mat' date at the top of the Y axis and the most distant 'mat' date at the bottom. Does anyone have any advice on how to achieve this?
Edit: I use grid arrange to plot it against another chart.
grid.arrange(plot1,plot2,ncol=2)
Sadly I get the following error:
Error in `-.POSIXt`(Maturity) : unary '-' is not defined for "POSIXt" objects
You can use stats::reorder() inside aes() to reorder the bars. If a factor is supplied you don't need to supply a FUN, but for a continuous variable like Date you can specify the way to sort. In your data (although you didn't post it this way in the question), it seems your mat variable is POSIXlt. This format cannot be directly operated on as a numeric. Instead, I suggest using POSIXct and then it will work. See ?stats::reorder for more info on how to control this. Another option is to set levels of the factor in the data before passing to ggplot() which might be a better option if you have complex sorting to do.
library(tidyverse)
df <- data.frame(name = c('Ger1', 'Ger2', 'Ger3', 'Ger4', 'Ger5', 'Ger6'),
issued = c("UKS", "USD", "UKS", "UKS", "USD", "USD"),
mat = c("2024-01-31", "2023-01-31", "2026-10-22", "2022-07-22", "2029-01-31", "2025-06-07"),
volume = c(0.476, 0.922, 0.580, 1.259, 0.932, 0.417))
df %>%
mutate(mat = as.POSIXct(mat)) %>%
filter(issued == "UKS") %>%
# mutate(name = fct_reorder(.f = name, .x = mat)) %>% ggplot(aes(volume, name)) +
ggplot(aes(x = volume, y = reorder(x = name, X = mat, FUN = sort))) +
geom_col(fill = "#1170aa") +
labs(title = "Total carriage by Volume on the day", x = "Volume", y = "Name") +
theme(
title = element_text(size = 12),
panel.background = element_rect(fill = 'white', color = 'black'),
legend.position = 'right'
)
Created on 2022-02-07 by the reprex package (v2.0.1)
So, I was able to fix the ordering issue by appending the mat column data to the name, and then adding:
+scale_y_discrete(limits=rev)
To the end of the ggplot code.

How to aggregate data from years to decades and plot them?

This is the graph that I would like to reproduce:
but for that I have to change the years column because on the graph the x axis is in decades. By what means could I accomplish this ?
This is what I did to extract the data from the site (https://ourworldindata.org/famines) :
library(rvest)
library(dplyr)
library(tidyr)
library(ggplot2)
col_link <- "https://ourworldindata.org/famines#famines-by-world-region-since-1860"
col_page <- read_html(col_link)
col_table <- col_page %>% html_nodes("table#tablepress-73") %>%
html_table() %>% . [[1]]
data1 <- col_table %>%
select(Year, `Excess Mortality midpoint`)
Year `Excess Mortality midpoint`
<chr> <chr>
1 1846–52 1,000,000
2 1860-1 2,000,000
3 1863-67 30,000
4 1866-7 961,043
5 1868 100,000
6 1868-70 1,500,000
7 1870–1871 1,000,000
8 1876–79 750,000
9 1876–79 7,176,346
10 1877–79 11,000,000
# ... with 67 more rows
Firstly, to convert the periods to decades, you need to extract a year for each period, based on which the calculation will be made. From your comment above, it looks like you need to extract the end year for each period. Given the data, regular expressions are used below to do this (and packages dplyr and stringr).
col_table <- col_table %>%
mutate(Year = case_when(
grepl("^\\d{4}$",Year) ~ Year,
grepl("\\d{4}[–-]\\d{4}",Year) ~ str_sub(Year, start= -4),
grepl("\\d{4}[–-]\\d{2}$",Year) ~ paste0(str_sub(Year,1,2),str_sub(Year,-2)),
grepl("\\d{4}[–-]\\d{1}$",Year) ~ paste0(str_sub(Year,1,3),str_sub(Year,-1))))
What this part of code is doing, is to detect the different cases and extract the proper year. Below there are examples for all cases, that are present on the dataset and what this part of code will result to.
1868 -> 1868
1878-1880 -> 1880
1846–52 -> 1852
1860-1 -> 1861
Now we have the year, so the next step is to extract the decade. To do so, we need to make sure that Year column is numeric and apply the necessary calculation (check here for it: https://stackoverflow.com/a/48966643/8864619)
col_table <- col_table %>%
mutate(Decade = as.numeric(Year) - as.numeric(Year) %% 10)
To reproduce the plot we need to group by decade and make sure that the Excess Mortality midpoint column is numeric to be able to get the sum of victims per decade.
col_table <- col_table %>%
mutate(`Excess Mortality midpoint` = as.numeric(gsub(",", "", `Excess Mortality midpoint`))) %>%
group_by(Decade) %>%
summarize(val = sum(`Excess Mortality midpoint`)) %>%
ungroup()
For the plot itself, ggplot2 is used:
ylab <- c(5, 10, 15, 20, 25)
options(scipen=999)
p <- ggplot(data = col_table, aes(x=factor(Decade),y=val)) +
geom_bar(stat = "identity", fill = "navy") +
scale_x_discrete(labels = col_table %>% distinct(Decade) %>% mutate(Decade = paste0(Decade,"s")) %>% pull()) +
geom_text(aes(label=format(val,big.mark=",")), size=2,vjust=-0.3) +
scale_y_continuous(labels = paste(ylab, "millions"),breaks = 10^6 * ylab) +
ggtitle('Famine victims worldwide')+
theme(panel.background = element_blank(),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(size = 0.05, linetype = 'solid',
colour = "black"),
axis.title.x = element_blank(),
axis.title.y = element_blank())
p
So, putting everything together, the following code should get you a column for the year and a column for the relevant decade, which should be then used to create the plot you want to:
library(rvest)
library(dplyr)
library(stringr)
library(ggplot2)
col_link <- "https://ourworldindata.org/famines#famines-by-world-region-since-1860"
col_page <- read_html(col_link)
col_table <- col_page %>% html_nodes("table#tablepress-73") %>% html_table() %>% . [[1]]
col_table <- col_table %>%
mutate(Year = case_when(
grepl("^\\d{4}$",Year) ~Year,
grepl("\\d{4}[–-]\\d{4}",Year) ~ str_sub(Year, start= -4),
grepl("\\d{4}[–-]\\d{2}$",Year) ~ paste0(str_sub(Year,1,2),str_sub(Year,-2)),
grepl("\\d{4}[–-]\\d{1}$",Year) ~ paste0(str_sub(Year,1,3),str_sub(Year,-1)))) %>%
mutate(Decade = as.numeric(Year) - as.numeric(Year)%%10) %>%
mutate(`Excess Mortality midpoint` = as.numeric(gsub(",", "", `Excess Mortality midpoint`))) %>%
group_by(Decade) %>%
summarize(val = sum(`Excess Mortality midpoint`)) %>%
ungroup()
ylab <- c(5, 10, 15, 20, 25)
options(scipen=999)
p <- ggplot(data = col_table, aes(x=factor(Decade),y=val)) +
geom_bar(stat = "identity", fill = "navy") +
scale_x_discrete(labels = col_table %>% distinct(Decade) %>% mutate(Decade = paste0(Decade,"s")) %>% pull()) +
geom_text(aes(label=format(val,big.mark=",")), size=2,vjust=-0.3) +
scale_y_continuous(labels = paste(ylab, "millions"),breaks = 10^6 * ylab) +
ggtitle('Famine victims worldwide')+
theme(panel.background = element_blank(),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(size = 0.05, linetype = 'solid',
colour = "black"),
axis.title.x = element_blank(),
axis.title.y = element_blank())
p
Here's the reproduced plot:
First, strsplit, make a proper year matrix, combine back with famines divided by number of years and reshape to long format (lines 1:6). Next, aggregate sums by decade and barplot it.
r <- strsplit(data1$Year, '-|–|, ') |>
rapply(\(y) unlist(lapply(y, \(x) f(max(as.numeric(y)), x))), how='r') |>
{\(.) t(sapply(., \(x) `length<-`(x, max(lengths(.)))))}() |>
{\(.) cbind(`colnames<-`(., paste0('year.', seq_len(dim(.)[2]))),
n=dim(.)[2] - rowSums(is.na(.)))}() |>
{\(.) data.frame(., f=as.numeric(gsub('\\D', '',
data1$`Excess Mortality midpoint`))/
.[, 'n'])}()|>
reshape(1:3, direction='long') |>
stats:::aggregate.formula(formula=f ~ as.integer(substr(year, 1, 3)),
FUN=sum) |>
t()
## plot
op <- par(mar=c(5, 5, 4, 2)+.1) ## set/store old pars
b <- barplot(r, axes=FALSE, ylim=c(0, max(r[2, ])*1.05),
main='Famine victims', )
abline(h=asq, col='lightgrey', lty=3)
barplot(r, names.arg=paste0(r[1, ], '0s'), col='#20254c',
cex.names=.8, axes=FALSE, add=TRUE)
asq <- seq(0, max(axTicks(2)), 2e6)
axis(2, asq, labels=FALSE)
mtext(paste(asq/1e6, 'Million'), 2, 1, at=asq, las=2)
text(b, r[2, ] + 5e5, labels=formatC(r[2, ], format='d', big.mark=','), cex=.7)
box()
par(op) ## restore old pars
In line 2, I used this helper function f() to fill up the pseudo-years:
f <- \(x1, x2, n1=nchar(x1)) {
u <- lapply(list(x1, x2), as.character)
s <- c(n1 - nchar(u[[2]]) + 1L, n1)
as.integer(`substr<-`(u[[1]], s[1], s[2], u[[2]]))
}
You can refine the aggregation method yourself to make the result exactly look like the original, but maybe this is better :)

How do you compare similarities between variables in an R data frame, based on two categorical variables and one numeric variables

I have a dataframe with three variables of interest: LGA(Location), Offence Category and Total (numeric)
What I am hoping to do, is compare the distance/similarity between each LGA, based on the Total value, in order to create a heat map or similar structure. Is this possible? And if so, what would the process be?
Here is a snippet of the data frame:
I don't really understand your question, but here is an example of a heatmap and a clustered heatmap for 'similar' data:
# Load libraries
library(tidyverse)
library(readxl)
library(httr)
# Find some data
url1 <- "https://www.bocsar.nsw.gov.au/Documents/lga/NewSouthWales.xlsx"
# Get the data and remove missing data points (NA's)
GET(url1, write_disk(tf <- tempfile(fileext = ".xlsx")))
df <- read_excel(path = tf, 2L, skip = 5) %>%
na.omit()
df2 <- df %>%
# format the data to "long format" for plotting
pivot_longer(cols = -c(`Premises type`)) %>%
# Change "Premises type" and "name" to factors
mutate(`Premises type` = factor(
`Premises type`, levels = unique(`Premises type`))
) %>%
mutate(name = factor(
name, levels = unique(name))
) %>%
# Remove the "Total" counts
filter(`Premises type` != "Total")
# Define colours for text (white for dark fill, black for light fill)
hcl <- farver::decode_colour(viridisLite::inferno(length(df2$value)), "rgb", "hcl")
label_col <- ifelse(hcl[, "l"] > 50, "black", "white")
# Plot the data (log scale for fill)
ggplot(df2, aes(y = fct_rev(`Premises type`),
x = name, fill = log(value))) +
geom_tile() +
geom_text(aes(label = value, color = factor(value)),
show.legend = FALSE, size = 2.5) +
theme(axis.text.x = element_text(angle = 45, hjust = 1.05),
axis.title = element_blank()) +
scale_color_manual(values = label_col) +
scale_fill_viridis_c(option = "inferno", na.value = "black")
And a clustered heatmap (similar Premises Type / Crime types cluster together):
# Load the raw data and format for pheatmap (expects a matrix)
dm <- read_excel(path = tf, 2L, skip = 5) %>%
na.omit() %>%
column_to_rownames(var = "Premises type")
# Plot the data
pheatmap::pheatmap(as.matrix(dm), scale = "row")
Edit
I haven't used it before, so I don't know if the output is correct, but based on this SO post you can use cluster::daisy() to get the gower dissimilarity for "Premises Type" then plot using pheatmap, e.g.
library(cluster)
pheatmap::pheatmap(as.matrix(daisy(dm)))
Edit 2
You only need two variables for this heatmap (i.e. "Local government Area" (Character) and "Total" (Numeric) should be fine):
# Load libraries
library(tidyverse)
library(readxl)
library(httr)
library(cluster)
library(pheatmap)
# Find some data
url1 <- "https://www.bocsar.nsw.gov.au/Documents/lga/NewSouthWales.xlsx"
# Get the data and remove missing data points (NA's)
GET(url1, write_disk(tf <- tempfile(fileext = ".xlsx")))
df <- read_excel(path = tf, 2L, skip = 5) %>%
na.omit()
# Select two variables, then set the Premises type as the rownames
df3 <- df %>%
select(`Premises type`, Robbery) %>%
column_to_rownames(var = "Premises type")
# (in your case, use "column_to_rownames(`Local government Area`)"
# Then plot the heatmap
pheatmap(daisy(as.matrix(df3)),
labels_row = rownames(df3),
labels_col = rownames(df3))

How can I transform my data frame from wide to long in R?

I have issues with transforming my data frame from wide to long. I'm well aware that there are plenty of excellent vignettes out there, which explain gather() or pivot_longer() very precisely (e.g. https://www.storybench.org/pivoting-data-from-columns-to-rows-and-back-in-the-tidyverse/). Nevertheless, I'm still stuck for days now and this drives me crazy. Thus, I dediced to ask the internet. You.
I have a data frame that looks like this:
id <- c(1,2,3)
year <- c(2018,2003,2011)
lvl <- c("A","B","C")
item.1 <- factor(c("A","A","C"),levels = lvl)
item.2 <- factor(c("C","B","A"),levels = lvl)
item.3 <- factor(c("B","B","C"),levels = lvl)
df <- data.frame(id,year,item.1,item.2,item.3)
So we have an id variable for each observation (e.g. movies). We have a year variable, indicating when the observation took place (e.g. when the movie was released). And we have three factor variables that assessed different characteristics of the observation (e.g. cast, storyline and film music). Those three factor variables share the same factor levels "A","B" or "C" (e.g. cast of the movie was "excellent", "okay" or "shitty").
But in my wildest dreams, the data more look like this:
id.II <- c(rep(1, 9), rep(2, 9), rep(3,9))
year.II <- c(rep(2018, 9), rep(2003, 9), rep(2011,9))
item.II <- rep(c(c(1,1,1),c(2,2,2),c(3,3,3)),3)
rating.II <- rep(c("A", "B", "C"), 9)
number.II <- c(1,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,1)
df.II <- data.frame(id.II,year.II,item.II,rating.II,number.II)
So now the data frame would be way more useable for further analysis. For example, the next step would be to calculate for each year the number (or even better percentage) of movies that were rated as "excellent".
year.III <- factor(c(rep(2018, 3), rep(2003, 3), rep(2011,3)))
item.III <- factor(rep(c(1, 2, 3), 3))
number.A.III <- c(1,0,0,1,0,0,0,1,0)
df.III <- data.frame(year.III,item.III,number.A.III)
ggplot(data=df.III, aes(x=year.III, y=number.A.III, group=item.III)) +
geom_line(aes(color=item.III))+
geom_point(aes(color=item.III))+
theme(panel.background = element_blank(),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
legend.position = "bottom")+
labs(colour="Item")
Or even more important to me, show for each item (cast, storytelling, film music) the percentage of being rated as "excellent", "okay" and "shitty".
item.IV <- factor(rep(c(c(1,1,1),c(2,2,2),c(3,3,3)),3))
rating.IV <- factor(rep(c("A", "B", "C"), 9))
number.IV <- c(2,0,1,1,1,1,0,2,1)
df.IV <- data.frame(item.IV,rating.IV,number.IV)
df.IV
ggplot(df.IV,aes(fill=rating.IV,y=number.IV,x=item.IV))+
geom_bar(position= position_fill(reverse = TRUE), stat="identity")+
theme(axis.title.y = element_text(size = rel(1.2), angle = 0),
axis.title.x = element_blank(),
panel.background = element_blank(),
legend.title = element_blank(),
legend.position = "bottom")+
labs(x = "Item")+
coord_flip()+
scale_x_discrete(limits = rev(levels(df.IV$item.IV)))+
scale_y_continuous(labels = scales::percent)
My primary question is: How do I transform the data frame df into df.II?
That would make my day. Wrong. My weekend.
And if you could then also give a hint how to proceed from df.II to df.III and df.IV that would be absolutely mindblowing. However, I don't want to burden you too much with my problems.
Best wishes
Jascha
Does this achieve what you need?
library(tidyverse)
df_long <- df %>%
pivot_longer(cols = item.1:item.3, names_to = "item", values_to = "rating") %>%
mutate(
item = str_remove(item, "item.")
)
df2 <- crossing(
df_long,
rating_all = unique(df_long$rating)
) %>%
mutate(n = rating_all == rating) %>%
group_by(id, year, item, rating_all) %>%
summarise(n = sum(n))
df3 <- df2 %>%
filter(item == "3")

Removing the non-business days from ggplot axis [duplicate]

This question already has answers here:
R + ggplot2: how to hide missing dates from x-axis?
(3 answers)
Closed 5 years ago.
I would like to plot the development of different indexes with ggplot2. My problem is, that 2018-02-03 and 2018-02-04 are non-working days, and thus there is no data available for these days, but when plotting ggplot2 adds them by extrapolating. How could I omit the non-business days, so that I get 2018-02-02 followed by 2018-02-05?
library(tidyverse)
library(quantmod)
#retrieve data
getSymbols("BTCUSD=X;^DJI;^VIX;^GDAXI", from="2017-01-01")
#merge all dataframes together
df <- merge(`BTCUSD=X`,`DJI`, all = TRUE)
df <- merge(df, `VIX`, all = TRUE)
df <- merge(df, `GDAXI`, all = TRUE)
#creating a dataframe with first column as date that comes from xts object extracted by index()
df <- data.frame(date=index(df), coredata(df))
#selecting columns and filtering the time series start date
df_1 <- df%>% select(date, contains("Close"))%>% na.omit() %>% filter(date>"2018-01-25")
#df_1 <- df_1 %>%mutate(BTCUSD.X.Close=BTCUSD.X.Close/BTCUSD.X.Close[1], DJI.Close=DJI.Close/DJI.Close[1], GDAXI.Close=GDAXI.Close/GDAXI.Close[1], VIX.Close=VIX.Close/VIX.Close[1])
df_1 <- df_1 %>% gather(var, closing, 2:5)
png("indexes.png", width = 9, height = 6, units = 'in', res = 600)
plot_1 <- ggplot(data=df_1)+
geom_line(aes(x=date, y=closing))+
facet_wrap(~var, scales = "free")+
scale_x_date(breaks = df_1$date, date_minor_breaks = "1 day", date_labels = "%y-%m-%d")+
theme(text = element_text(size=7), axis.text.x = element_text(angle = 90, hjust = 1))
plot_1
dev.off()
plot_1
The package bdscale was designed for this purpose, once added, you can substitute your scale_x_date line with:
scale_x_bd(
business.dates = df_1$date,
max.major.breaks = 10,
labels = date_format("%y-%m-%d")
)
To produce this plot...
Fully reproducible code
library(tidyverse)
library(quantmod)
library(bdscale)
library(scales)
getSymbols("BTCUSD=X;^DJI;^VIX;^GDAXI", from = "2017-01-01")
df <- merge(`BTCUSD=X`,`DJI`, all = TRUE) %>%
merge(`VIX`, all = TRUE) %>%
merge(`GDAXI`, all = TRUE)
df <- data.frame(date = index(df), coredata(df))
df_1 <- df %>%
select(date, contains("Close")) %>%
na.omit %>%
filter(date > "2018-01-25") %>%
gather(var, closing,2:5)
ggplot(data = df_1, aes(x = date, y = closing)) +
geom_line() +
facet_wrap(~var, scales = "free") +
scale_x_bd(business.dates = df_1$date,
max.major.breaks = 10,
labels = date_format("%y-%m-%d")) +
theme(text = element_text(size = 7),
axis.text.x = element_text(angle = 90, hjust = 1))

Resources