Could you help me optimize the code below? As you can see, I'm using the same date twice, once for graph generation and once for subset y generation. The result is correct, but I'd like some help trying to optimize to at least use the date only once and another optimizing that you find necessary. Every help is welcome.
Thank you very much!
library(dplyr)
library(lubridate)
library(tidyverse)
#dataset
df <- structure(
list(date1 = c("2021-06-28","2021-06-28","2021-06-28","2021-06-28","2021-06-28",
"2021-06-28","2021-06-28","2021-06-28"),
date2 = c("2021-04-02","2021-04-03","2021-04-08","2021-04-09","2021-04-10","2021-07-01","2021-07-02","2021-07-03"),
Week= c("Friday","Saturday","Thursday","Friday","Saturday","Thursday","Friday","Monday"),
DR01 = c(4,1,4,3,3,4,3,6), DR02= c(4,2,6,7,3,2,7,4),DR03= c(9,5,4,3,3,2,1,5),
DR04 = c(5,4,3,3,6,2,1,9),DR05 = c(5,4,5,3,6,2,1,9),
DR06 = c(2,4,3,3,5,6,7,8),DR07 = c(2,5,4,4,9,4,7,8)),
class = "data.frame", row.names = c(NA, -8L))
#Generate graph
dmda<-"2021-07-01"
dta<-df
datas<-dta %>%
filter(date2 == ymd(dmda)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name))
colnames(datas)<-c("Days","Numbers")
attach(datas)
plot(Numbers ~ Days, ylim=c(0,20))
model <- nls(Numbers ~ b1*Days^2+b2,start = list(b1 = 47,b2 = 0))
new.data <- data.frame(Days = seq(min(Days),max(Days),len = 45))
lines(new.data$Days,predict(model,newdata = new.data))
#Add the y points to the graph
df[, 1:2] = lapply(df[, 1:2], FUN = as_date)
get_cutoff = function(date) {
date2 = as_date(date)
date1 = df[1,1]
as.numeric(date2 - date1 + 1)
}
subset_data = function(date, start_index) {
date = as_date(date)
if (date > df[1,1]) {
end_index = start_index + get_cutoff(date) - 1
df[, -c(start_index:end_index)] %>%
filter(date2 == date)
} else {
return(df)
}
}
y<-subset_data("2021-07-01", 4)
y
pivot_longer(y,
cols=c(starts_with("DR"))) %>%
mutate(day = parse_number(name)) -> new_y
new_y
lines(x=new_y$day, y=new_y$value, col="red")
points(x=new_y$day, y=new_y$value, col="red")
Make these changes:
only load packages used
can eliminate lubridate
don't need dta
in filter we don't need to convert dmda to Date class
pivot_wider can transform the names
don't use attach
the model is linear in the parameters so use lm, not nls
replace the new.data/lines with curve
don't overwrite df
simplify the cutoff calculation
use type = "o" to reduce points/lines to just lines
use subset in lines
Now assuming that df and dmda have been defined as in the question we have this.
library(dplyr)
library(tidyr)
datas <- df %>%
filter(date2 == dmda) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)",
names_to = "Days", values_to = "Numbers",
names_transform = list(Days = as.numeric))
plot(Numbers ~ Days, datas, ylim=c(0,20))
model <- lm(Numbers ~ I(Days^2), datas)
rng <- range(datas$Days)
curve(predict(model, list(Days = x)), rng[1], rng[2], add = TRUE)
# assume this for cutoff. You may or may not need to change this line.
cutoff <- as.numeric(as.Date(dmda) - first(as.Date(df$date1))) + 1
lines(Numbers ~ Days, datas, subset = seq_len(nrow(datas)) > cutoff,
type = "o" , col = "red")
I used ggplot rather than base R plotting functions since you are already working in the tidyverse. The following will do the trick to plot it all on a single graph.
dmda<-"2021-07-01"
dta<-df
## Rather than rely on column position, explicitly set the number
## of days desired for highlighting on plot
num_days <- 3
model <- nls(Numbers ~ b1*Days^2+b2,start = list(b1 = 47,b2 = 0))
new.data <- data.frame(Days = seq(min(Days),max(Days),len = 45)) %>%
mutate(Numbers = predict(model, newdata = .))
datas<-dta %>%
filter(date2 == ymd(dmda)) %>%
summarize(across(starts_with("DR"), sum)) %>%
## Can convert data to numeric and create column names inside pivot_longer
pivot_longer(everything(), names_pattern = "DR(.+)",
values_to = "Numbers", names_to = "Days",
names_transform = list(Days = as.numeric, Numbers = as.numeric)) %>%
## Create flag for whether the values are in the final number of days
mutate(subs = 1:n() > (n() - num_days))
plt <- ggplot(datas, aes(x = Days, y = Numbers)) +
geom_point(aes(color = subs)) +
geom_line(data = filter(datas, subs == TRUE), color = "red") +
geom_line(data = new.data, color = "black") +
scale_y_continuous(limits = c(0, 20)) +
scale_color_manual(values = c("black", "red"))
plt
Could you help me solve following issue:
I have two codes that were made to generate the same scatter plot. The first one works normally, generates the graph and the lines without any problems. It is a code that requires vector i to generate the mean and standard deviation(sd).
Code 2, on the other hand, does not require vector i, but the result is not the desired one regarding the construction of lines in relation to the mean and sd. In my opinion it was to work.
Could the problem be with the ylim?
I hope someone helps me with this! =)
Thank you so much!
First code
library(dplyr)
library(tidyr)
library(lubridate)
data <- structure(
list(Id=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
date1 = c("2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20",
"2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20",
"2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20",
"2021-06-20","2021-06-20","2021-06-20","2021-06-20"),
date2 = c("2021-07-01","2021-07-01","2021-07-01","2021-07-01","2021-04-02",
"2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-03",
"2021-04-03","2021-04-03","2021-04-03","2021-04-03","2021-04-08","2021-04-08",
"2021-04-09","2021-04-09","2021-04-10","2021-04-10"),
Week= c("Thursday","Thursday","Thursday","Thursday","Friday","Friday","Friday","Friday",
"Friday","Friday","Saturday","Saturday","Saturday","Saturday","Saturday","Thursday",
"Thursday","Friday","Friday","Saturday","Saturday"),
DTPE = c("Ho","Ho","Ho","Ho","","","","","","","","","","","","","","","","Ho","Ho"),
D1 = c(8,1,9, 3,5,4,7,6,3,8,2,3,4,6,7,8,4,2,6,2,3), DR01 = c(4,1,4,3,3,4,3,6,3,7,2,3,4,6,7,8,4,2,6,7,3),
DR02 = c(8,1,4,3,3,4,1,6,3,7,2,3,4,6,7,8,4,2,6,2,3), DR03 = c(7,5,4,3,3,4,1,5,3,3,2,3,4,6,7,8,4,2,6,4,3),
DR04= c(4,5,6,7,3,2,7,4,2,1,2,3,4,6,7,8,4,2,6,4,3),DR05 = c(9,5,4,3,3,2,1,5,3,7,2,3,4,7,7,8,4,2,6,4,3)),
class = "data.frame", row.names = c(NA, -21L))
graph <- function(dt, dta = data) {
dim_data<-dim(data)
day<-c(seq.Date(from = as.Date(data$date2[1]), by = "days",
length = dim_data[1]
))
data_grouped <- data %>%
mutate(across(starts_with("date"), as.Date)) %>%
group_by(date2) %>%
summarise(Id = first(Id),
date1 = first(date1),
Week = first(Week),
DTPE = first(DTPE),
D1 = sum(D1)) %>%
select(Id,date1,date2,Week,DTPE,D1)
data_grouped <- data_grouped %>% mutate(date1=format(date1,"%d/%m/%Y"),
date2=format(date2,"%d/%m/%Y"))
data_grouped<-data.frame(data_grouped)
data_grouped %>%
mutate(DTPE = na_if(DTPE, ""))
DS=c("Thursday","Friday","Saturday")
i<-2
df_OC<-subset(data_grouped, DTPE == "")
ds_CO<-subset(df_OC,df_OC$Week==DS[i])
mean<-mean(as.numeric(ds_CO[,"D1"]) )
sd<-sd(as.numeric(ds_CO[,"D1"]))
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Number", xlim = c(0, 45),cex=1.5,cex.lab=1.5,
cex.axis=1.5, cex.main=2, cex.sub=2, lwd=2.5, ylim = c((min(.$val) %/% 10) * 15, (max(.$val) %/% 10 + 1) * 100))
abline(h=mean, col='blue') +
abline(h=(mean + sd), col='green',lty=2)
abline(h=(mean - sd), col='orange',lty=2)
}
graph("2021-04-09",data)
Second code
library(dplyr)
library(tidyr)
library(lubridate)
data <- structure(
list(Id=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
date1 = c("2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20",
"2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20",
"2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20",
"2021-06-20","2021-06-20","2021-06-20","2021-06-20"),
date2 = c("2021-07-01","2021-07-01","2021-07-01","2021-07-01","2021-04-02",
"2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-03",
"2021-04-03","2021-04-03","2021-04-03","2021-04-03","2021-04-08","2021-04-08",
"2021-04-09","2021-04-09","2021-04-10","2021-04-10"),
Week= c("Thursday","Thursday","Thursday","Thursday","Friday","Friday","Friday","Friday",
"Friday","Friday","Saturday","Saturday","Saturday","Saturday","Saturday","Thursday",
"Thursday","Friday","Friday","Saturday","Saturday"),
DTPE = c("Ho","Ho","Ho","Ho","","","","","","","","","","","","","","","","Ho","Ho"),
D1 = c(8,1,9, 3,5,4,7,6,3,8,2,3,4,6,7,8,4,2,6,2,3), DR01 = c(4,1,4,3,3,4,3,6,3,7,2,3,4,6,7,8,4,2,6,7,3),
DR02 = c(8,1,4,3,3,4,1,6,3,7,2,3,4,6,7,8,4,2,6,2,3), DR03 = c(7,5,4,3,3,4,1,5,3,3,2,3,4,6,7,8,4,2,6,4,3),
DR04= c(4,5,6,7,3,2,7,4,2,1,2,3,4,6,7,8,4,2,6,4,3),DR05 = c(9,5,4,3,3,2,1,5,3,7,2,3,4,7,7,8,4,2,6,4,3)),
class = "data.frame", row.names = c(NA, -21L))
graph <- function(dt, dta = data) {
dim_data<-dim(data)
day<-c(seq.Date(from = as.Date(data$date2[1]), by = "days",
length = dim_data[1]
))
data_grouped <- data %>%
mutate(across(starts_with("date"), as.Date)) %>%
group_by(date2) %>%
summarise(Id = first(Id),
date1 = first(date1),
Week = first(Week),
DTPE = first(DTPE),
D1 = sum(D1)) %>%
select(Id,date1,date2,Week,DTPE,D1)
data_grouped <- data_grouped %>% mutate(date1=format(date1,"%d/%m/%Y"),
date2=format(date2,"%d/%m/%Y"))
data_grouped<-data.frame(data_grouped)
data_grouped %>%
mutate(DTPE = na_if(DTPE, ""))
# get the week day
my_day <- weekdays(as.Date(dt))
df_OC<-subset(data_grouped, DTPE == "")
ds_CO<-subset(df_OC,df_OC$Week == my_day)
mean<-mean(as.numeric(ds_CO[,"D1"]) )
sd<-sd(as.numeric(ds_CO[,"D1"]))
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Number", xlim = c(0, 45),cex=1.5,cex.lab=1.5,
cex.axis=1.5, cex.main=2, cex.sub=2, lwd=2.5, ylim = c((min(.$val) %/% 10) * 15, (max(.$val) %/% 10 + 1) * 100))
abline(h=mean, col='blue') +
abline(h=(mean + sd), col='green',lty=2)
abline(h=(mean - sd), col='orange',lty=2)
}
graph("2021-04-09",data)
You've messed up a lot with these data transformations.
Below, however, I present my code that works according to your expectations.
The main problem here was my_day <- weekdays (as.Date (dt)), In my system I was getting "piątek" and you didn't have such a day in your data, right?
library(dplyr)
library(tidyr)
library(lubridate)
data <- structure(
list(Id=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
date1 = c("2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20",
"2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20",
"2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20",
"2021-06-20","2021-06-20","2021-06-20","2021-06-20"),
date2 = c("2021-07-01","2021-07-01","2021-07-01","2021-07-01","2021-04-02",
"2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-03",
"2021-04-03","2021-04-03","2021-04-03","2021-04-03","2021-04-08","2021-04-08",
"2021-04-09","2021-04-09","2021-04-10","2021-04-10"),
Week= c("Thursday","Thursday","Thursday","Thursday","Friday","Friday","Friday","Friday",
"Friday","Friday","Saturday","Saturday","Saturday","Saturday","Saturday","Thursday",
"Thursday","Friday","Friday","Saturday","Saturday"),
DTPE = c("Ho","Ho","Ho","Ho","","","","","","","","","","","","","","","","Ho","Ho"),
D1 = c(8,1,9, 3,5,4,7,6,3,8,2,3,4,6,7,8,4,2,6,2,3), DR01 = c(4,1,4,3,3,4,3,6,3,7,2,3,4,6,7,8,4,2,6,7,3),
DR02 = c(8,1,4,3,3,4,1,6,3,7,2,3,4,6,7,8,4,2,6,2,3), DR03 = c(7,5,4,3,3,4,1,5,3,3,2,3,4,6,7,8,4,2,6,4,3),
DR04= c(4,5,6,7,3,2,7,4,2,1,2,3,4,6,7,8,4,2,6,4,3),DR05 = c(9,5,4,3,3,2,1,5,3,7,2,3,4,7,7,8,4,2,6,4,3)),
class = "data.frame", row.names = c(NA, -21L))
graph <- function(dt, dta = data) {
dim_data<-dim(data)
day<-c(seq.Date(from = as.Date(data$date2[1]), by = "days",
length = dim_data[1]
))
data_grouped <- data %>%
mutate(across(starts_with("date"), as.Date)) %>%
group_by(date2) %>%
summarise(Id = first(Id),
date1 = first(date1),
Week = first(Week),
DTPE = first(DTPE),
D1 = sum(D1)) %>%
select(Id,date1,date2,Week,DTPE,D1)
#data_grouped <- data_grouped %>% mutate(date1=format(date1,"%d/%m/%Y"),
# date2=format(date2,"%d/%m/%Y"))
#data_grouped<-data.frame(data_grouped)
data_grouped %>%
mutate(DTPE = na_if(DTPE, ""))
# get the week day
#my_day <- weekdays(as.Date(dt))
df_OC<-subset(data_grouped, DTPE == "")
ds_CO = df_OC %>% filter(weekdays(date2) %in% weekdays(as.Date(dt)))
#ds_CO<-subset(df_OC,df_OC$Week == my_day)
mean<-mean(ds_CO$D1)
sd<-sd(ds_CO$D1)
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Number", xlim = c(0, 45),cex=1.5,cex.lab=1.5,
cex.axis=1.5, cex.main=2, cex.sub=2, lwd=2.5, ylim = c((min(.$val) %/% 10) * 15, (max(.$val) %/% 10 + 1) * 100))
abline(h=mean, col='blue') +
abline(h=(mean + sd), col='green',lty=2)
abline(h=(mean - sd), col='orange',lty=2)
}
graph("2021-04-09",data)
Finally, I recommend:
Keep your data in tibble,
do not unnecessarily transform the date into a string several times and vice versa,
use ggplo2. The charts will be much nicer.
The graph below generates a scatter plot based on date2. In addition, a horizontal line that refers to the mean is generated. Each day of the week has a different mean as you can see.
Note that in abline I specified h=mean_saturday, as 10/4 is a Saturday. But I didn't want to always have to change this part of the abline to show the right mean line, but my idea is to leave it automatically, that is, when I enter the date 10/4/2021 in the code, the code already recognize that the 10th it's Saturday and inserts the appropriate mean line. Any idea how to do this?
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
library(tibble)
df <- structure(
list(Id=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
date1 = c("2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20"),
date2 = c("2021-07-01","2021-07-01","2021-07-01","2021-07-01","2021-04-02",
"2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-03",
"2021-04-03","2021-04-03","2021-04-03","2021-04-03","2021-04-08","2021-04-08",
"2021-04-09","2021-04-09","2021-04-10","2021-04-10"),
Week= c("Thursday","Thursday","Thursday","Thursday","Friday","Friday","Friday","Friday",
"Friday","Friday","Saturday","Saturday","Saturday","Saturday","Saturday","Thursday",
"Thursday","Friday","Friday","Saturday","Saturday"),
D = c("","","Ho","","","","","","Ho","","","","","","","","","","","",""),
DR01 = c(2,1,4,3,3,4,1,6,3,7,2,3,4,6,7,8,4,2,6,2,3)),
class = "data.frame", row.names = c(NA, -21L))
mean_thursday=4
mean_friday=5
mean_saturday=6
scatter_date <- function(dt, dta = df) {
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Types", xlim = c(0, 7),
ylim = c((min(.$val) %/% 10) * 10, (max(.$val) %/% 10 + 1) * 15))
abline(h=mean_saturday, col='blue')
}
scatter_date("2021-04-10",df)
You could try to convert the input date in your scatter_date function to a date and get the weekday: my_day <- weekdays(as.Date(dt)) add that to a switch statment for your means:
my_mean <- switch(
my_day,
"Saturday" = mean_saturday,
"Friday" = mean_friday,
"Thursday" = mean_thursday,
0) # add here your other days
and replace mean_saturday in abline(h=mean_saturday, col='blue') with my_mean
here the full code:
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
library(tibble)
df <- structure(
list(Id=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
date1 = c("2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20"),
date2 = c("2021-07-01","2021-07-01","2021-07-01","2021-07-01","2021-04-02",
"2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-03",
"2021-04-03","2021-04-03","2021-04-03","2021-04-03","2021-04-08","2021-04-08",
"2021-04-09","2021-04-09","2021-04-10","2021-04-10"),
Week= c("Thursday","Thursday","Thursday","Thursday","Friday","Friday","Friday","Friday",
"Friday","Friday","Saturday","Saturday","Saturday","Saturday","Saturday","Thursday",
"Thursday","Friday","Friday","Saturday","Saturday"),
D = c("","","Ho","","","","","","Ho","","","","","","","","","","","",""),
DR01 = c(2,1,4,3,3,4,1,6,3,7,2,3,4,6,7,8,4,2,6,2,3)),
class = "data.frame", row.names = c(NA, -21L))
mean_thursday=4
mean_friday=5
mean_saturday=6
scatter_date <- function(dt, dta = df) {
my_day <- weekdays(as.Date(dt))
my_mean <- switch(
my_day,
"Saturday" = mean_saturday,
"Friday" = mean_friday,
"Thursday" = mean_thursday,
0) # add here your other days
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Types", xlim = c(0, 7), main = paste0(my_day, ":", my_mean),
ylim = c((min(.$val) %/% 10) * 10, (max(.$val) %/% 10 + 1) * 15))
abline(h=my_mean, col='blue')
}
# testing the different means
scatter_date("2021-04-10",df)
scatter_date("2021-04-9",df)
scatter_date("2021-04-8",df)
One way would be to define a data.frame containing the mean for the days of interest and then use weekdays to extract the corresponding mean from that table.
Instead of
mean_thursday=4
mean_friday=5
mean_saturday=6
I would go for something like:
mean_df <- data.frame(mean = c(4:6),
day = c('Thursday', 'Friday', 'Saturday'))
and then
abline(h=subset(mean_df, day == weekdays(as.Date(dt)))$mean, col='blue')
which will be the only change in your function.