How to optimize code that generates graph in R - r

Could you help me optimize the code below? As you can see, I'm using the same date twice, once for graph generation and once for subset y generation. The result is correct, but I'd like some help trying to optimize to at least use the date only once and another optimizing that you find necessary. Every help is welcome.
Thank you very much!
library(dplyr)
library(lubridate)
library(tidyverse)
#dataset
df <- structure(
list(date1 = c("2021-06-28","2021-06-28","2021-06-28","2021-06-28","2021-06-28",
"2021-06-28","2021-06-28","2021-06-28"),
date2 = c("2021-04-02","2021-04-03","2021-04-08","2021-04-09","2021-04-10","2021-07-01","2021-07-02","2021-07-03"),
Week= c("Friday","Saturday","Thursday","Friday","Saturday","Thursday","Friday","Monday"),
DR01 = c(4,1,4,3,3,4,3,6), DR02= c(4,2,6,7,3,2,7,4),DR03= c(9,5,4,3,3,2,1,5),
DR04 = c(5,4,3,3,6,2,1,9),DR05 = c(5,4,5,3,6,2,1,9),
DR06 = c(2,4,3,3,5,6,7,8),DR07 = c(2,5,4,4,9,4,7,8)),
class = "data.frame", row.names = c(NA, -8L))
#Generate graph
dmda<-"2021-07-01"
dta<-df
datas<-dta %>%
filter(date2 == ymd(dmda)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name))
colnames(datas)<-c("Days","Numbers")
attach(datas)
plot(Numbers ~ Days, ylim=c(0,20))
model <- nls(Numbers ~ b1*Days^2+b2,start = list(b1 = 47,b2 = 0))
new.data <- data.frame(Days = seq(min(Days),max(Days),len = 45))
lines(new.data$Days,predict(model,newdata = new.data))
#Add the y points to the graph
df[, 1:2] = lapply(df[, 1:2], FUN = as_date)
get_cutoff = function(date) {
date2 = as_date(date)
date1 = df[1,1]
as.numeric(date2 - date1 + 1)
}
subset_data = function(date, start_index) {
date = as_date(date)
if (date > df[1,1]) {
end_index = start_index + get_cutoff(date) - 1
df[, -c(start_index:end_index)] %>%
filter(date2 == date)
} else {
return(df)
}
}
y<-subset_data("2021-07-01", 4)
y
pivot_longer(y,
cols=c(starts_with("DR"))) %>%
mutate(day = parse_number(name)) -> new_y
new_y
lines(x=new_y$day, y=new_y$value, col="red")
points(x=new_y$day, y=new_y$value, col="red")

Make these changes:
only load packages used
can eliminate lubridate
don't need dta
in filter we don't need to convert dmda to Date class
pivot_wider can transform the names
don't use attach
the model is linear in the parameters so use lm, not nls
replace the new.data/lines with curve
don't overwrite df
simplify the cutoff calculation
use type = "o" to reduce points/lines to just lines
use subset in lines
Now assuming that df and dmda have been defined as in the question we have this.
library(dplyr)
library(tidyr)
datas <- df %>%
filter(date2 == dmda) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)",
names_to = "Days", values_to = "Numbers",
names_transform = list(Days = as.numeric))
plot(Numbers ~ Days, datas, ylim=c(0,20))
model <- lm(Numbers ~ I(Days^2), datas)
rng <- range(datas$Days)
curve(predict(model, list(Days = x)), rng[1], rng[2], add = TRUE)
# assume this for cutoff. You may or may not need to change this line.
cutoff <- as.numeric(as.Date(dmda) - first(as.Date(df$date1))) + 1
lines(Numbers ~ Days, datas, subset = seq_len(nrow(datas)) > cutoff,
type = "o" , col = "red")

I used ggplot rather than base R plotting functions since you are already working in the tidyverse. The following will do the trick to plot it all on a single graph.
dmda<-"2021-07-01"
dta<-df
## Rather than rely on column position, explicitly set the number
## of days desired for highlighting on plot
num_days <- 3
model <- nls(Numbers ~ b1*Days^2+b2,start = list(b1 = 47,b2 = 0))
new.data <- data.frame(Days = seq(min(Days),max(Days),len = 45)) %>%
mutate(Numbers = predict(model, newdata = .))
datas<-dta %>%
filter(date2 == ymd(dmda)) %>%
summarize(across(starts_with("DR"), sum)) %>%
## Can convert data to numeric and create column names inside pivot_longer
pivot_longer(everything(), names_pattern = "DR(.+)",
values_to = "Numbers", names_to = "Days",
names_transform = list(Days = as.numeric, Numbers = as.numeric)) %>%
## Create flag for whether the values are in the final number of days
mutate(subs = 1:n() > (n() - num_days))
plt <- ggplot(datas, aes(x = Days, y = Numbers)) +
geom_point(aes(color = subs)) +
geom_line(data = filter(datas, subs == TRUE), color = "red") +
geom_line(data = new.data, color = "black") +
scale_y_continuous(limits = c(0, 20)) +
scale_color_manual(values = c("black", "red"))
plt

Related

Plotting Backtested Workflow_Set data

I'm trying to view how this model performs against prior actual close. I'm using a workflow_set model and have no issues extracting the forecast. I've supplied a reproducible example below. I'd like to be able to plot actual, with a backtested trend line along with the forecast.
tickers <- "TSLA"
first.date <- Sys.Date() - 3000
last.date <- Sys.Date()
freq.data <- "daily"
stocks <- BatchGetSymbols::BatchGetSymbols(tickers = tickers,
first.date = first.date,
last.date = last.date,
freq.data = freq.data ,
do.cache = FALSE,
thresh.bad.data = 0)
stocks <- stocks %>% as.data.frame() %>% select(Date = df.tickers.ref.date, Close = df.tickers.price.close)
time_val_split <-
stocks %>%
sliding_period(
Date,
period = "day",
every = 52)
data_extended <- stocks %>%
future_frame(
.length_out = 60,
.bind_data = TRUE
) %>%
ungroup()
train_tbl <- data_extended %>% drop_na()
future_tbl <- data_extended %>% filter(is.na(Close))
base_rec <- recipe(Close ~ Date, train_tbl) %>%
step_timeseries_signature(Date) %>%
step_rm(matches("(.xts$)|(.iso$)|(.lbl)|(hour)|(minute)|(second)|(am.pm)|(mweek)|(qday)|(week2)|(week3)|(week4)")) %>%
step_dummy(all_nominal(), one_hot = TRUE) %>%
step_normalize(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
step_rm(Date)
cubist_spec <-
cubist_rules(committees = tune(),
neighbors = tune()) %>%
set_engine("Cubist")
rf_spec <-
rand_forest(mtry = tune(),
min_n = tune(),
trees = 1000) %>%
set_engine("ranger") %>%
set_mode("regression")
base <-
workflow_set(
preproc = list(base_date = base_rec),
models = list(
cubist_base = cubist_spec,
cart_base = cart_spec
))
all_workflows <-
bind_rows(
base
)
cores <- parallel::detectCores(logical = FALSE)
clusters <- parallel::makePSOCKcluster(cores)
doParallel::registerDoParallel(clusters)
wflwset_tune_results <-
all_workflows %>%
workflow_map(
fn = "tune_race_anova",
seed = 1,
resamples = time_val_split,
grid = 2,
verbose = TRUE)
doParallel::stopImplicitCluster()
best_for_each_mod <- wflwset_tune_results %>%
rank_results(select_best = TRUE) %>%
filter(.metric == "rmse") %>%
select(wflow_id, .config, mean, preprocessor, model)
b_mod <- best_for_each_mod %>%
arrange(mean) %>%
head(1) %>%
select(wflow_id) %>% as.character()
best_param <- wflwset_tune_results %>% extract_workflow_set_result(id = b_mod) %>% select_best(metric = "rmse")
# Finalize model with best param
best_finalized <- wflwset_tune_results %>%
extract_workflow(b_mod) %>%
finalize_workflow(best_param) %>%
fit(train_tbl)
At this point the model has been trained but I can't seem to figure out how to run it against prior actuals. My goal is to bind the backed results with the predictions below.
prediction_tbl <- best_finalized %>%
predict(new_data = future_tbl) %>%
bind_cols(future_tbl) %>%
select(.pred, Date) %>%
mutate(type = "prediction") %>%
rename(Close = .pred)
train_tbl %>% mutate(type = "actual") %>% rbind(prediction_tbl) %>%
ggplot(aes(Date, Close, color = type)) +
geom_line(size = 2)
Based on your comment, I'd recommend using pivot_longer() after binding the future_tbl to your predictions. This lets you keep everything in one pipeline, rather than having to create two separate dataframes then bind them together. Here's an example plotting the prediction & actual values against mpg. Hope this helps!
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
# split data
set.seed(123)
mtcars <- as_tibble(mtcars)
cars_split <- initial_split(mtcars)
cars_train <- training(cars_split)
cars_test <- testing(cars_split)
# plot truth & prediction against another variable
workflow() %>%
add_model(linear_reg() %>% set_engine("lm")) %>%
add_recipe(recipe(qsec ~ ., data = cars_train)) %>%
fit(cars_train) %>%
predict(cars_test) %>%
bind_cols(cars_test) %>%
pivot_longer(cols = c(.pred, qsec),
names_to = "comparison",
values_to = "value") %>%
ggplot(aes(x = mpg,
y = value,
color = comparison)) +
geom_point(alpha = 0.75)
Created on 2021-11-18 by the reprex package (v2.0.1)

R Highcharter - highlight same group in multiple stacked columns chart + order groups in columns

R newbie here :)
I have recently started using R library Highcharter as an alternative to ggplot2.
This is the sample code I am currently working on:
library(highcharter)
library(dplyr)
## Sample dataframe
YEAR <- c(2019,2020,2021)
CATEGORY <- c("dog", "cat", "mouse")
SAMPLE_DATA <- expand.grid(YEAR, CATEGORY)
names(SAMPLE_DATA)[1] <- "CATEGORY"
names(SAMPLE_DATA)[2] <- "YEAR"
SAMPLE_DATA$VALUE <- runif(n = 9, min = 400, max = 900)
## Chart
SAMPLE_DATA <- SAMPLE_DATA %>%
group_by(YEAR, CATEGORY) %>%
summarise(VALUE = sum(VALUE, na.rm = T))
highchart() %>%
hc_add_series(data = SAMPLE_DATA, hcaes(x = YEAR, y = round(VALUE,0), group = CATEGORY), type = "column") %>%
hc_plotOptions(column = list(stacking = "normal"))
What I am trying to do is:
Sort how the group "CATEGORY" is piled in each column, based on ascending/descending "VALUE"
Have that effect which highlights the same group in all columns as you hover over it
Does anyone have an idea? Thank you!
This is a late answer but I believe this is what you want.
Adding the data again because I think you swapped some column names on accident:
YEAR <- c(2019,2020,2021)
CATEGORY <- c("dog", "cat", "mouse")
SAMPLE_DATA <- expand.grid(YEAR, CATEGORY)
names(SAMPLE_DATA)[1] <- "YEAR"
names(SAMPLE_DATA)[2] <- "CATEGORY"
SAMPLE_DATA$VALUE <- runif(n = 9, min = 400, max = 900)
## Chart
SAMPLE_DATA <- SAMPLE_DATA %>%
group_by(YEAR, CATEGORY) %>%
summarise(VALUE = sum(VALUE, na.rm = T))
Creating plot:
SAMPLE_DATA %>%
ungroup() %>%
mutate(YEAR = factor(YEAR) %>% fct_reorder(VALUE, .desc = TRUE)) %>%
mutate(year_index = as.numeric(YEAR)) %>%
hchart(
type = "column",
hcaes(x = year_index,
y = VALUE,
group = CATEGORY,
name = YEAR),
) %>%
hc_xAxis(type = "category", labels = list(step = 1)) %>%
hc_plotOptions(series = list(stacking = TRUE))

Inserting new data into a table

I would like a little help with the following question: note that this code generates a coefficient from a date I have chosen, in this case for the day 03/07 (dmda), it gave a coefficient equal to 15.55. In this case, I would like to generate a new table, where there is a column with dates and the other column with the coefficient corresponding to those dates. For the column dates, only the dates of date2 after the day considered in date1 (28/06) will be considered, in this case, the dates are: 01/07, 02/07 and 03/07.
So the table will look like this:
Thanks!
library(dplyr)
library(tidyverse)
library(lubridate)
df1 <- structure(
list(date1 = c("2021-06-28","2021-06-28","2021-06-28","2021-06-28","2021-06-28",
"2021-06-28","2021-06-28","2021-06-28"),
date2 = c("2021-04-02","2021-04-03","2021-04-08","2021-04-09","2021-04-10","2021-07-01","2021-07-02","2021-07-03"),
Week= c("Friday","Saturday","Thursday","Friday","Saturday","Thursday","Friday","Monday"),
DR01 = c(14,11,14,13,13,14,13,16), DR02= c(14,12,16,17,13,12,17,14),DR03= c(19,15,14,13,13,12,11,15),
DR04 = c(15,14,13,13,16,12,11,19),DR05 = c(15,14,15,13,16,12,11,19),
DR06 = c(21,14,13,13,15,16,17,18),DR07 = c(12,15,14,14,19,14,17,18)),
class = "data.frame", row.names = c(NA, -8L))
dmda<-"2021-07-03"
datas<-df1 %>%
filter(date2 == ymd(dmda)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name))
colnames(datas)<-c("Days","Numbers")
mod <- nls(Numbers ~ b1*Days^2+b2,start = list(b1 = 47,b2 = 0), data = datas)
coef(mod)[2]
> coef(mod)[2]
b2
15.55011
We may subset the data where the 'date2' is greater than date1', get the 'date2' column extracted as a vector. Loop over the dates with map (from purrr), do the transformation within the loop, build the nls and extract the coefficient in a tibble, and use _dfr to collapse the list to a single tibble
library(purrr)
library(dplyr)
dates <- subset(df1, date2 > date1, select = date2)$date2
map_dfr(dates, ~ {
datas <- df1 %>%
filter(date2 == ymd(.x)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name))
colnames(datas)<-c("Days","Numbers")
mod <- nls(Numbers ~ b1*Days^2+b2,start = list(b1 = 47,b2 = 0), data = datas)
tibble(dates = .x, coef = coef(mod)[2])
}) %>%
mutate(dates = format(ymd(dates), "%d/%m/%Y"))
# A tibble: 3 × 2
dates coef
<chr> <dbl>
1 01/07/2021 12.2
2 02/07/2021 12.4
3 03/07/2021 15.6

Optimize code for scatter plot generation in R

The executable code below generates a scatter plot that depends on the date (date2) he chooses and three lines are also generated, referring to mean, mean+standard deviation and mean-standard deviation, which are based on the day of the week (Week) that is chosen.
As you can see, I used vector i to generate the mean and standard deviation. But I would like to optimize this, that is, when he chooses the date, he already understands what day of the week it is, so he doesn't need to use this i vector.
For example, I put it to generate scatterplot date 10/04/2021, so the code would need to know it's a Saturday, without having to set vector i to 3.
Can you help me with this question?
The link to download the database is:https://docs.google.com/spreadsheets/d/1W_hzuRq7D6X12BdwaXeM-cjg2A5MIKDx/edit?usp=sharing&ouid=102073768617937039119&rtpof=true&sd=true
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
df<-read_excel('C:/Users/Downloads/database_test1.xlsx')
df<-subset(df,df$date2<df$date1)
dim_data<-dim(df)
day<-c(seq.Date(from = as.Date(df$date2[1]),
to = as.Date(df$date2[dim_data[1]]),
by = "1 day"))
df_grouped <- df %>%
mutate(across(starts_with("date"), as.Date)) %>%
group_by(date2) %>%
summarise(Id = first(Id),
date1 = first(date1),
Week = first(Week),
D = first(D),
D1 = sum(D1)) %>%
select(Id,date1,date2,Week,D,D1)
df_grouped <- df_grouped %>% mutate(date1=format(date1,"%d/%m/%Y"),
date2=format(date2,"%d/%m/%Y"))
df_grouped<-data.frame(df_grouped)
DS=c("Thursday","Friday","Saturday")
i<-3
df_OC<-subset(df_grouped,is.na(D))
ds_OC<-subset(df_OC,df_OC$Week==DS[i])
#Mean and Standard Deviation
mean_Week<-mean(as.numeric(ds_OC[,"D1"]) )
sdeviation_Week<-sd(as.numeric(ds_OC[,"D1"]))
#create scatter plot
scatter_date <- function(dt, dta = df) {
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Types", xlim = c(0, 7),
ylim = c((min(.$val) %/% 10) * 10, (max(.$val) %/% 10 + 1) * 15))
abline(h=mean_Week, col='blue')
abline(h=(mean_Week + sdeviation_Week), col='green',lty=2)
abline(h=(mean_Week - sdeviation_Week), col='orange',lty=2)
}
scatter_date("2021-04-10",df)
Generated images
You could create a lookup table:
library(tibble)
lookup <- df %>%
select(date2, Week) %>%
distinct() %>%
mutate(date2 = ymd(date2)) %>%
deframe()
lookup
#> 2021-03-04 2021-04-02 2021-04-03 2021-04-08 2021-04-09 2021-04-10
#> "Thursday" "Friday" "Saturday" "Thursday" "Friday" "Saturday"
So now
lookup["2021-04-10"]
#> "Saturday"
To use this with your scatterplot function you need to move some of your code into your function.
One more idea of optimization:
# You could put this lines into one pipe
df_grouped <- df %>%
mutate(across(starts_with("date"), as.Date)) %>%
group_by(date2) %>%
summarise(Id = first(Id),
date1 = first(date1),
Week = first(Week),
D = first(D),
D1 = sum(D1)) %>%
select(Id, date1, date2, Week, D, D1) %>%
mutate(date1 = format(date1, "%d/%m/%Y"),
date2 = format(date2, "%d/%m/%Y"))
# you don't need this line
# df_grouped<-data.frame(df_grouped)
Two more hints:
Use a space after ",". This makes the code easier to read.
Avoid using different types of quoting marks: use either " or ' not both (unless you have to use both).
According to https://stackoverflow.com/a/68948847/8282674 you can adapt your scatter_date with a switch statment and calculate every mean in there. The other way with less changes in your code, would be to remove DS=c("Thursday","Friday","Saturday") to calculate the weekday in the scatter_date function directly:
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
df<-readxl::read_excel('C:/Users/Downloads/database_test1.xlsx')
df<-subset(df,df$date2<df$date1)
# translate the days
df %>% dplyr::mutate(Week = ifelse(Week=="Thursday", "quinta-feira", Week),
Week = ifelse(Week=="Friday", "sexta-feira", Week),
Week = ifelse(Week=="Saturday", "sábado", Week)) -> df
dim_data<-dim(df)
day<-c(seq.Date(from = as.Date(df$date2[1]),
to = as.Date(df$date2[dim_data[1]]),
by = "1 day"))
df_grouped <- df %>%
mutate(across(starts_with("date"), as.Date)) %>%
group_by(date2) %>%
summarise(Id = first(Id),
date1 = first(date1),
Week = first(Week),
D = first(D),
D1 = sum(D1)) %>%
select(Id,date1,date2,Week,D,D1)
df_grouped <- df_grouped %>% mutate(date1=format(date1,"%d/%m/%Y"),
date2=format(date2,"%d/%m/%Y"))
df_grouped<-data.frame(df_grouped)
#create scatter plot
scatter_date <- function(dt, dta = df) {
# get the week day
my_day <- weekdays(as.Date(dt))
df_OC<-subset(df_grouped,is.na(D))
ds_OC<-subset(df_OC,df_OC$Week==my_day) # omit 'i' and DS
mean_Week<-mean(as.numeric(ds_OC[,"D1"]) )
sdeviation_Week<-sd(as.numeric(ds_OC[,"D1"]))
mean_Week_pos <- (mean_Week + sdeviation_Week)
mean_Week_neg <- (mean_Week - sdeviation_Week)
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Types", xlim = c(0, 7),
main = paste0(my_day, ": (", mean_Week, ",+",mean_Week_pos, ",-", mean_Week_neg,")"),
ylim = c((min(.$val) %/% 10) * 10, (max(.$val) %/% 10 + 1) * 15))
abline(h=mean_Week, col='blue')
abline(h= mean_Week_pos, col='green',lty=2)
abline(h= mean_Week_neg, col='orange',lty=2)
}
scatter_date("2021-04-10",df)
scatter_date("2021-04-9",df)
scatter_date("2021-04-8",df)

Adjust line in graph in R

The graph below generates a scatter plot based on date2. In addition, a horizontal line that refers to the mean is generated. Each day of the week has a different mean as you can see.
Note that in abline I specified h=mean_saturday, as 10/4 is a Saturday. But I didn't want to always have to change this part of the abline to show the right mean line, but my idea is to leave it automatically, that is, when I enter the date 10/4/2021 in the code, the code already recognize that the 10th it's Saturday and inserts the appropriate mean line. Any idea how to do this?
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
library(tibble)
df <- structure(
list(Id=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
date1 = c("2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20"),
date2 = c("2021-07-01","2021-07-01","2021-07-01","2021-07-01","2021-04-02",
"2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-03",
"2021-04-03","2021-04-03","2021-04-03","2021-04-03","2021-04-08","2021-04-08",
"2021-04-09","2021-04-09","2021-04-10","2021-04-10"),
Week= c("Thursday","Thursday","Thursday","Thursday","Friday","Friday","Friday","Friday",
"Friday","Friday","Saturday","Saturday","Saturday","Saturday","Saturday","Thursday",
"Thursday","Friday","Friday","Saturday","Saturday"),
D = c("","","Ho","","","","","","Ho","","","","","","","","","","","",""),
DR01 = c(2,1,4,3,3,4,1,6,3,7,2,3,4,6,7,8,4,2,6,2,3)),
class = "data.frame", row.names = c(NA, -21L))
mean_thursday=4
mean_friday=5
mean_saturday=6
scatter_date <- function(dt, dta = df) {
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Types", xlim = c(0, 7),
ylim = c((min(.$val) %/% 10) * 10, (max(.$val) %/% 10 + 1) * 15))
abline(h=mean_saturday, col='blue')
}
scatter_date("2021-04-10",df)
You could try to convert the input date in your scatter_date function to a date and get the weekday: my_day <- weekdays(as.Date(dt)) add that to a switch statment for your means:
my_mean <- switch(
my_day,
"Saturday" = mean_saturday,
"Friday" = mean_friday,
"Thursday" = mean_thursday,
0) # add here your other days
and replace mean_saturday in abline(h=mean_saturday, col='blue') with my_mean
here the full code:
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
library(tibble)
df <- structure(
list(Id=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
date1 = c("2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20"),
date2 = c("2021-07-01","2021-07-01","2021-07-01","2021-07-01","2021-04-02",
"2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-03",
"2021-04-03","2021-04-03","2021-04-03","2021-04-03","2021-04-08","2021-04-08",
"2021-04-09","2021-04-09","2021-04-10","2021-04-10"),
Week= c("Thursday","Thursday","Thursday","Thursday","Friday","Friday","Friday","Friday",
"Friday","Friday","Saturday","Saturday","Saturday","Saturday","Saturday","Thursday",
"Thursday","Friday","Friday","Saturday","Saturday"),
D = c("","","Ho","","","","","","Ho","","","","","","","","","","","",""),
DR01 = c(2,1,4,3,3,4,1,6,3,7,2,3,4,6,7,8,4,2,6,2,3)),
class = "data.frame", row.names = c(NA, -21L))
mean_thursday=4
mean_friday=5
mean_saturday=6
scatter_date <- function(dt, dta = df) {
my_day <- weekdays(as.Date(dt))
my_mean <- switch(
my_day,
"Saturday" = mean_saturday,
"Friday" = mean_friday,
"Thursday" = mean_thursday,
0) # add here your other days
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Types", xlim = c(0, 7), main = paste0(my_day, ":", my_mean),
ylim = c((min(.$val) %/% 10) * 10, (max(.$val) %/% 10 + 1) * 15))
abline(h=my_mean, col='blue')
}
# testing the different means
scatter_date("2021-04-10",df)
scatter_date("2021-04-9",df)
scatter_date("2021-04-8",df)
One way would be to define a data.frame containing the mean for the days of interest and then use weekdays to extract the corresponding mean from that table.
Instead of
mean_thursday=4
mean_friday=5
mean_saturday=6
I would go for something like:
mean_df <- data.frame(mean = c(4:6),
day = c('Thursday', 'Friday', 'Saturday'))
and then
abline(h=subset(mean_df, day == weekdays(as.Date(dt)))$mean, col='blue')
which will be the only change in your function.

Resources