Optimize code for scatter plot generation in R - r

The executable code below generates a scatter plot that depends on the date (date2) he chooses and three lines are also generated, referring to mean, mean+standard deviation and mean-standard deviation, which are based on the day of the week (Week) that is chosen.
As you can see, I used vector i to generate the mean and standard deviation. But I would like to optimize this, that is, when he chooses the date, he already understands what day of the week it is, so he doesn't need to use this i vector.
For example, I put it to generate scatterplot date 10/04/2021, so the code would need to know it's a Saturday, without having to set vector i to 3.
Can you help me with this question?
The link to download the database is:https://docs.google.com/spreadsheets/d/1W_hzuRq7D6X12BdwaXeM-cjg2A5MIKDx/edit?usp=sharing&ouid=102073768617937039119&rtpof=true&sd=true
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
df<-read_excel('C:/Users/Downloads/database_test1.xlsx')
df<-subset(df,df$date2<df$date1)
dim_data<-dim(df)
day<-c(seq.Date(from = as.Date(df$date2[1]),
to = as.Date(df$date2[dim_data[1]]),
by = "1 day"))
df_grouped <- df %>%
mutate(across(starts_with("date"), as.Date)) %>%
group_by(date2) %>%
summarise(Id = first(Id),
date1 = first(date1),
Week = first(Week),
D = first(D),
D1 = sum(D1)) %>%
select(Id,date1,date2,Week,D,D1)
df_grouped <- df_grouped %>% mutate(date1=format(date1,"%d/%m/%Y"),
date2=format(date2,"%d/%m/%Y"))
df_grouped<-data.frame(df_grouped)
DS=c("Thursday","Friday","Saturday")
i<-3
df_OC<-subset(df_grouped,is.na(D))
ds_OC<-subset(df_OC,df_OC$Week==DS[i])
#Mean and Standard Deviation
mean_Week<-mean(as.numeric(ds_OC[,"D1"]) )
sdeviation_Week<-sd(as.numeric(ds_OC[,"D1"]))
#create scatter plot
scatter_date <- function(dt, dta = df) {
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Types", xlim = c(0, 7),
ylim = c((min(.$val) %/% 10) * 10, (max(.$val) %/% 10 + 1) * 15))
abline(h=mean_Week, col='blue')
abline(h=(mean_Week + sdeviation_Week), col='green',lty=2)
abline(h=(mean_Week - sdeviation_Week), col='orange',lty=2)
}
scatter_date("2021-04-10",df)
Generated images

You could create a lookup table:
library(tibble)
lookup <- df %>%
select(date2, Week) %>%
distinct() %>%
mutate(date2 = ymd(date2)) %>%
deframe()
lookup
#> 2021-03-04 2021-04-02 2021-04-03 2021-04-08 2021-04-09 2021-04-10
#> "Thursday" "Friday" "Saturday" "Thursday" "Friday" "Saturday"
So now
lookup["2021-04-10"]
#> "Saturday"
To use this with your scatterplot function you need to move some of your code into your function.
One more idea of optimization:
# You could put this lines into one pipe
df_grouped <- df %>%
mutate(across(starts_with("date"), as.Date)) %>%
group_by(date2) %>%
summarise(Id = first(Id),
date1 = first(date1),
Week = first(Week),
D = first(D),
D1 = sum(D1)) %>%
select(Id, date1, date2, Week, D, D1) %>%
mutate(date1 = format(date1, "%d/%m/%Y"),
date2 = format(date2, "%d/%m/%Y"))
# you don't need this line
# df_grouped<-data.frame(df_grouped)
Two more hints:
Use a space after ",". This makes the code easier to read.
Avoid using different types of quoting marks: use either " or ' not both (unless you have to use both).

According to https://stackoverflow.com/a/68948847/8282674 you can adapt your scatter_date with a switch statment and calculate every mean in there. The other way with less changes in your code, would be to remove DS=c("Thursday","Friday","Saturday") to calculate the weekday in the scatter_date function directly:
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
df<-readxl::read_excel('C:/Users/Downloads/database_test1.xlsx')
df<-subset(df,df$date2<df$date1)
# translate the days
df %>% dplyr::mutate(Week = ifelse(Week=="Thursday", "quinta-feira", Week),
Week = ifelse(Week=="Friday", "sexta-feira", Week),
Week = ifelse(Week=="Saturday", "sábado", Week)) -> df
dim_data<-dim(df)
day<-c(seq.Date(from = as.Date(df$date2[1]),
to = as.Date(df$date2[dim_data[1]]),
by = "1 day"))
df_grouped <- df %>%
mutate(across(starts_with("date"), as.Date)) %>%
group_by(date2) %>%
summarise(Id = first(Id),
date1 = first(date1),
Week = first(Week),
D = first(D),
D1 = sum(D1)) %>%
select(Id,date1,date2,Week,D,D1)
df_grouped <- df_grouped %>% mutate(date1=format(date1,"%d/%m/%Y"),
date2=format(date2,"%d/%m/%Y"))
df_grouped<-data.frame(df_grouped)
#create scatter plot
scatter_date <- function(dt, dta = df) {
# get the week day
my_day <- weekdays(as.Date(dt))
df_OC<-subset(df_grouped,is.na(D))
ds_OC<-subset(df_OC,df_OC$Week==my_day) # omit 'i' and DS
mean_Week<-mean(as.numeric(ds_OC[,"D1"]) )
sdeviation_Week<-sd(as.numeric(ds_OC[,"D1"]))
mean_Week_pos <- (mean_Week + sdeviation_Week)
mean_Week_neg <- (mean_Week - sdeviation_Week)
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Types", xlim = c(0, 7),
main = paste0(my_day, ": (", mean_Week, ",+",mean_Week_pos, ",-", mean_Week_neg,")"),
ylim = c((min(.$val) %/% 10) * 10, (max(.$val) %/% 10 + 1) * 15))
abline(h=mean_Week, col='blue')
abline(h= mean_Week_pos, col='green',lty=2)
abline(h= mean_Week_neg, col='orange',lty=2)
}
scatter_date("2021-04-10",df)
scatter_date("2021-04-9",df)
scatter_date("2021-04-8",df)

Related

Change row group labels in gt table (with superscript/subscript and line breaks). Customising row group labels in R

I have the following data and table:
library(gt)
library(dplyr)
a <- rnorm(21, mean = 112, sd =12)
colour <- rep(c("Blue", "Red", "Green"), 7)
data <- data.frame(colour, a) %>%
group_by(colour) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = html("[H<sub>2</sub>O]")) %>%
rename(cat = colour)
b <- rnorm(21, mean = 60, sd =12)
day <- rep(c("2", "4", "6"), 7)
data2 <- data.frame(day, b) %>%
group_by(day) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = html("[H<sub>2</sub>O] Additition <br> (Days)")) %>%
rename(cat = day)
bind_rows(data, data2) %>%
group_by(grp) %>%
gt(rowname_col = "cat")
bind_rows(data, data2) %>%
group_by(grp) %>%
gt() %>%
tab_options(row_group.as_column = TRUE)
The row group labels appear literally as '[H<sub>2<\sub>O]', rather than [H2O] etc. It is likely that I am using HTML wrong and it needs to be used with another package/function. I have also tried using cols_label but doesn't recognise these as columns in the dataframe.
Is there also a way to have the row groups column vertically centered, rather than at the top where is currently is? How do you bold these row groups?
The html function won't work outside of a gt table, so you'll have to create the row groups using tab_row_group and add the html labels there.
data <- data.frame(colour, a) %>%
group_by(colour) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = "color") %>%
rename(cat = colour)
data2 <- data.frame(day, b) %>%
group_by(day) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = "day") %>%
rename(cat = day)
bind_rows(data, data2) %>%
gt() %>%
tab_row_group(
label = html("[H<sub>2</sub>O]"),
rows = grp == "color"
) %>%
tab_row_group(
label = html("[H<sub>2</sub>O] Additition <br> (Days)"),
rows = grp == "day"
) %>%
cols_hide(grp)

Adjust line in graph in R

The graph below generates a scatter plot based on date2. In addition, a horizontal line that refers to the mean is generated. Each day of the week has a different mean as you can see.
Note that in abline I specified h=mean_saturday, as 10/4 is a Saturday. But I didn't want to always have to change this part of the abline to show the right mean line, but my idea is to leave it automatically, that is, when I enter the date 10/4/2021 in the code, the code already recognize that the 10th it's Saturday and inserts the appropriate mean line. Any idea how to do this?
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
library(tibble)
df <- structure(
list(Id=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
date1 = c("2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20"),
date2 = c("2021-07-01","2021-07-01","2021-07-01","2021-07-01","2021-04-02",
"2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-03",
"2021-04-03","2021-04-03","2021-04-03","2021-04-03","2021-04-08","2021-04-08",
"2021-04-09","2021-04-09","2021-04-10","2021-04-10"),
Week= c("Thursday","Thursday","Thursday","Thursday","Friday","Friday","Friday","Friday",
"Friday","Friday","Saturday","Saturday","Saturday","Saturday","Saturday","Thursday",
"Thursday","Friday","Friday","Saturday","Saturday"),
D = c("","","Ho","","","","","","Ho","","","","","","","","","","","",""),
DR01 = c(2,1,4,3,3,4,1,6,3,7,2,3,4,6,7,8,4,2,6,2,3)),
class = "data.frame", row.names = c(NA, -21L))
mean_thursday=4
mean_friday=5
mean_saturday=6
scatter_date <- function(dt, dta = df) {
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Types", xlim = c(0, 7),
ylim = c((min(.$val) %/% 10) * 10, (max(.$val) %/% 10 + 1) * 15))
abline(h=mean_saturday, col='blue')
}
scatter_date("2021-04-10",df)
You could try to convert the input date in your scatter_date function to a date and get the weekday: my_day <- weekdays(as.Date(dt)) add that to a switch statment for your means:
my_mean <- switch(
my_day,
"Saturday" = mean_saturday,
"Friday" = mean_friday,
"Thursday" = mean_thursday,
0) # add here your other days
and replace mean_saturday in abline(h=mean_saturday, col='blue') with my_mean
here the full code:
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
library(tibble)
df <- structure(
list(Id=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
date1 = c("2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20"),
date2 = c("2021-07-01","2021-07-01","2021-07-01","2021-07-01","2021-04-02",
"2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-03",
"2021-04-03","2021-04-03","2021-04-03","2021-04-03","2021-04-08","2021-04-08",
"2021-04-09","2021-04-09","2021-04-10","2021-04-10"),
Week= c("Thursday","Thursday","Thursday","Thursday","Friday","Friday","Friday","Friday",
"Friday","Friday","Saturday","Saturday","Saturday","Saturday","Saturday","Thursday",
"Thursday","Friday","Friday","Saturday","Saturday"),
D = c("","","Ho","","","","","","Ho","","","","","","","","","","","",""),
DR01 = c(2,1,4,3,3,4,1,6,3,7,2,3,4,6,7,8,4,2,6,2,3)),
class = "data.frame", row.names = c(NA, -21L))
mean_thursday=4
mean_friday=5
mean_saturday=6
scatter_date <- function(dt, dta = df) {
my_day <- weekdays(as.Date(dt))
my_mean <- switch(
my_day,
"Saturday" = mean_saturday,
"Friday" = mean_friday,
"Thursday" = mean_thursday,
0) # add here your other days
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Types", xlim = c(0, 7), main = paste0(my_day, ":", my_mean),
ylim = c((min(.$val) %/% 10) * 10, (max(.$val) %/% 10 + 1) * 15))
abline(h=my_mean, col='blue')
}
# testing the different means
scatter_date("2021-04-10",df)
scatter_date("2021-04-9",df)
scatter_date("2021-04-8",df)
One way would be to define a data.frame containing the mean for the days of interest and then use weekdays to extract the corresponding mean from that table.
Instead of
mean_thursday=4
mean_friday=5
mean_saturday=6
I would go for something like:
mean_df <- data.frame(mean = c(4:6),
day = c('Thursday', 'Friday', 'Saturday'))
and then
abline(h=subset(mean_df, day == weekdays(as.Date(dt)))$mean, col='blue')
which will be the only change in your function.

Numbers of years having rainy days in the range of 81–119% of long term average

A day with precipitation >= 2.5 mm is called a rainy day. I could able to calculate monthwise rainy days using the following code
library(seas)
library(tidyverse)
library(zoo)
library(lubridate)
data(mscdata)
dat.int <- (mksub(mscdata, id=1108447))
dat.int %>%
as_tibble() %>% # for easier viewing
mutate(yearmon = as.yearmon(dat.int$date, "%b %y")) %>%
dplyr::select(-date, -year, -yday, -t_max, -t_min, -t_mean) %>%
pivot_longer(cols = -yearmon, names_to = "variable", values_to = "value") %>%
group_by(yearmon, variable) %>%
summarise(rainy_days = sum(value > 2.5)) %>%
pivot_wider(names_from = "variable", values_from = "rainy_days")
Then I have calculated the longterm average using the following code
dat.int %>%
as_tibble() %>% # for easier viewing
mutate(yearmon = as.yearmon(dat.int$date, "%b %y")) %>%
dplyr::select(-date, -year, -yday, -t_max, -t_min, -t_mean) %>%
pivot_longer(cols = -yearmon, names_to = "variable", values_to = "value") %>%
group_by(yearmon, variable) %>%
summarise(rainy_days = sum(value > 2.5)) %>%
mutate(year = year(yearmon)) %>%
group_by(variable) %>%
summarize(value = as.integer(round(mean(rainy_days, na.rm = T)))) %>%
pivot_wider(names_from = "variable", values_from = "value")
Now two thresholds should be calculated as: lower threshold = 0.81*long term average and upper threshold = 1.19*long term average. Then calculate the number of years having rainy days between these two thresholds. Now I want to calculate the number of years having rainy days in the range of 81–119% of long term average (between lower and upper threshold).
Edit: Based on OP's comments and wanting to summarize by total precip, rain and snow.
library(dplyr)
library(lubridate)
dat.int %>%
mutate(month = month(ymd(date))) %>%
group_by(year, month) %>%
summarize_at(vars(precip,rain,snow), funs(days = sum(. >= 2.5,na.rm = TRUE))) %>%
group_by(year) %>%
summarize_at(vars(ends_with("days")), funs(yearly = sum(.))) %>%
summarize_at(vars(-year), list(~ sum(. > mean(.) * 0.81 & . < mean(.) * 1.19))) %>%
rename_all(list(~ gsub("days_yearly","in_range",.))) summarize(years = n())
# precip_in_range rain_in_range snow_in_range
# <int> <int> <int>
#1 26 24 6

Combining multiple data.frame in R?

I am computing statistics (i.e., mean, max, median etc) for winter season comprised of Months 11 & 12 of previous year and Months 1-4 of following year.
mydate <- as.data.frame(seq(as.Date("2010-01-01"), to= as.Date("2019-12-31"), by="day"))
colnames(mydate) <- "Date"
DF <- data.frame(A = runif(3652,0,10),
J = runif(3652,0,8),
X = runif(3652,0,12),
Z = runif(3652,0,10),
mydate)
mydata <- DF %>% mutate(Year = year(Date), Month = month(Date)) %>%
pivot_longer(-c(Date,Year,Month), names_to = "variable", values_to = "values") %>%
filter(Month == 11 | Month == 12 | Month == 01 | Month == 02 | Month == 03 | Month == 04) %>%
mutate(W_Year = ifelse(Month > 10, Year+1, Year)) %>%
filter(W_Year != 2019) %>%
group_by(W_Year, variable) %>%
mutate(Cumulative = cumsum(values)) %>%
mutate(NewDate = ymd(paste("2020", Month, day(Date), sep = "-"))) %>%
ungroup() %>%
group_by(variable, NewDate) %>%
summarise(Median = median(Cumulative))
I would then need to combine mydata with the data.frame of accumulated values for the most recent year, which in my case is the year 2019 as an additional column.
X = c("A", "J","X", "Z")
Data2019 <- DF %>% mutate(Year = year(Date), Month = month(Date)) %>%
pivot_longer(-c(Date,Year,Month), names_to = "variable", values_to = "values") %>%
filter(between(Month,5,10)) %>%
filter(Year == 2019) %>%
group_by(Year, variable) %>%
mutate(Precipitation = cumsum(values)) %>%
mutate(NewDate = ymd(paste("2020", Month,day(Date), sep = "-"))) %>%
ungroup() %>%
group_by(variable, NewDate) %>%
select(c(4,6,7)) %>%
slice(match(X, variable))
While combining the two data.frame, i am getting mis-match error for number of rows- which i believe is due to leap year but do not know how to overcome this problem. Any way forward would help. Thank you,
Data_plot <- data.frame(mydata, Data2019[,2])

how to make auto-separated years in a calendar with echarts4r

I'm trying to make calendar with echarts4r package.
library(tidyverse)
library(echarts4r)
dates <- seq.Date(as.Date("2017-01-01"), as.Date("2018-12-31"), by = "day")
values <- rnorm(length(dates), 20, 6)
year <- data.frame(date = dates, values = values)
year %>%
e_charts(date) %>%
e_calendar(range = "2017",top="40") %>%
e_calendar(range = "2018",top="260") %>%
e_heatmap(values, coord.system = "calendar") %>%
e_visual_map(max = 30) %>%
e_title("Calendar", "Heatmap")%>%
e_tooltip("item")
But this one didn't plot 2018 year.
How to make auto-separated years in a calendar?
Is any solution like fill from ggplot?
Expected output : this
The API is admittedly clunky and unintuitive but it is doable. You need to add the two calendars as you do already, reference their index in your e_heatmap function (so that the heatmaps is plotted against the correct calendar). Also, I use e_data in order to pass the values (x) for the second calendar. Make sure to adjust to position of the calendars so that they do not overlap (i.e.: top = 300).
dates18 <- seq.Date(as.Date("2018-01-01"), as.Date("2018-12-31"), by = "day")
dates17 <- seq.Date(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "day")
values <- rnorm(length(dates18), 20, 6)
df <- data.frame(date18 = dates18, date17 = dates17, values = values)
df %>%
e_charts(date18) %>%
e_calendar(range = "2018") %>%
e_heatmap(values, coord.system = "calendar", calendarIndex = 0, name = "2018") %>%
e_data(df, date17) %>%
e_calendar(range = "2017", top = 300) %>%
e_heatmap(values, coord.system = "calendar", calendarIndex = 1, name = "2017") %>%
e_visual_map(max = 30)
Update
Since version 0.2.0 the above can be done by grouping the data by year which is much clearer and easier:
dates <- seq.Date(as.Date("2017-01-01"), as.Date("2018-12-31"), by = "day")
values <- rnorm(length(dates), 20, 6)
year <- data.frame(date = dates, values = values)
year %>%
dplyr::mutate(year = format(date, "%Y")) %>% # get year from date
group_by(year) %>%
e_charts(date) %>%
e_calendar(range = "2017",top="40") %>%
e_calendar(range = "2018",top="260") %>%
e_heatmap(values, coord_system = "calendar") %>%
e_visual_map(max = 30) %>%
e_title("Calendar", "Heatmap")%>%
e_tooltip("item")

Resources