Related
In the data below, I have 7 patients who have undergone surgeries for 9 conditions, two of the patients have undergone surgery on the same day for two different reasons. When I try to summarise this data using tbl_summary function, it shows total number of observations as 9. I want to add total number of patients, which is n = 7.I am not understanding how do I add that information in the table. Could you guys please help me with this?
library(data.table)
library(tidyverse)
library(gtsummary)
participant.index = c(1,2,3,3,4,5,5,6,7)
repeat.instance = c(1,1,1,1,1,1,1,1,1)
indication.surgery = c("ibs","infection", "infection", "renalstones", "ibs", "infection",
"infection", "ibs","tumour")
date.surgery =c("2019-01-10", "2019-01-01", "2018-01-01", "2018-01-01", "2017-09-10",
"2000-09-09","2015-01-10","2015-01-10","2006-09-09")
mydata = data.table(participant.index,repeat.instance,indication.surgery,date.surgery)
mydata%>%
select(indication.surgery)%>%
tbl_summary
label = list(indication.surgery ~
"Indication for Surgery"),
statistic = list(all_categorical() ~ "{n} ({p}%)"),
missing_text = "(Missing Observations)",
percent = c("cell")
) %>%
modify_table_styling(spanning_header = "Indications for Surgery") %>%
modify_caption("**Table 1. Indication for Surgery**") %>%
modify_footnote(all_stat_cols() ~ "Number of observations, Frequency (%)")
Sure! If you add an indicator for the first participant index and include that new variable in the summary table, you can include it in the summary. Example Below!
library(data.table)
library(tidyverse)
library(gtsummary)
participant.index <- c(1, 2, 3, 3, 4, 5, 5, 6, 7)
repeat.instance <- c(1, 1, 1, 1, 1, 1, 1, 1, 1)
indication.surgery <- c(
"ibs", "infection", "infection", "renalstones", "ibs", "infection",
"infection", "ibs", "tumour"
)
date.surgery <- c(
"2019-01-10", "2019-01-01", "2018-01-01", "2018-01-01", "2017-09-10",
"2000-09-09", "2015-01-10", "2015-01-10", "2006-09-09"
)
mydata <-
data.table(participant.index, repeat.instance,
indication.surgery, date.surgery) %>%
group_by(participant.index) %>%
mutate(first.participant.index = row_number() == 1L,
.before = 1L) %>%
ungroup()
mydata %>%
select(first.participant.index, indication.surgery) %>%
tbl_summary(
label =
list(first.participant.index ~ "No. Unique Participants",
indication.surgery ~ "Indication for Surgery"),
statistic = list(all_categorical() ~ "{n} ({p}%)",
first.participant.index ~ "{n}"),
missing_text = "(Missing Observations)",
percent = c("cell")
) %>%
modify_table_styling(spanning_header = "Indications for Surgery") %>%
modify_caption("**Table 1. Indication for Surgery**") %>%
modify_footnote(all_stat_cols() ~ "Number of observations, Frequency (%)")
My question is regarding to for loops and alternatives:
I have multiple consecutive for loops for a big data frame. In every for loop there are new variables/dataframes defined which are necessary for computing the following for loop. Is there a way (maybe lapply or similar) which I can use in this case so that the process will be faster?
databackend2 = data.frame()
databackend2 = structure(list( Class = c("T1", "T1", "T2", "T3", "T5", "Q12"), Places = c("Orlando", "Orlando", "Boston", "LA", "New York", "New York"), Names = c("Planist", "Plantist", "Engi", "Engi", "Shifter", "Automatist"), Final.Work= c(0.08, 0.05, 0.06, 0.05, 0.055, 0.043), Parent = c(NA, NA, NA, "Planist", "Engi", "Engi"), d_stage = c(1, 2, 2, 3, 5, 2))
databackend3 = data.frame()
databackend3 = structure(list( Type = NA, Places = c("Orlando", "Colorado", "Boston", "LA", "New York", "Florida"), D.Stage = c(1.4, 1.5, 2.3, 3.4, 5.1, 2.5), X4= c(3, 3, 4, 5, 5, 4), X5=c(4, 5, 5, 6, 6, 6), Names = c("Sum", "Plantist", "Engi", "Fieldor", "Shifter", "Automatist"), Cu.No.Of.Emp = c(32, 7, 8, 9, 2, 6), Sh.fact = c(NA, 1, 1, 3, 3, 4), Cu.Stage = c(1, 1, 2, 3, 5, 2), One.Target.Stage = c(3, 3, 4, 5, 5, 4), Two.Target.Stage = c(4, 5, 5, 6, 6, 6))
for (names in all_names) {
n_cu_norm_fact = n_cu = databackend3 %>% filter(Names == names) %>% pull(Cu.No..Of.Emp)
sh_fact_nas = databackend3 %>% filter(Roles == role) %>% pull(Sh.fact)
if (is.na(n_cu) | (n_cu == 0)) {
n_cu = 0
n_cu_norm_fact = 1
}
n_emp_df[n_emp_df$names == names, "n_cu"] = n_cu
df_names_one = data.frame()
df_names_two = data.frame()
all_places_names = databackend2 %>% filter(Names == names) %>% pull(Places) %>% unique()
sum_of_cu_work_names_df = data.frame()
for (place in all_places_names) {
ds_cu = databackend3 %>% filter(Places == place) %>% pull(Cu.Stage)
df_i = databackend2 %>% filter(Places == place, Places == place, d_stage == ds_cu) %>% select(Class, Final.Work)
sum_of_cu_work_names_df = sum_of_cu_work_names_df %>% bind_rows(df_i)
}
sum_of_cu_work_names = sum_of_cu_work_names_df$Final.Work %>% sum()
sum_of_cu_work_names = ifelse(n_cu == 0, 1, sum_of_cu_work_names)
for (place in all_places_names) {
ds_cu = databackend3 %>% filter(Places == place) %>% pull(Cu.Stage)
ds_target_one = databackend3 %>% filter(Places == place) %>% pull(One.Target.Stage)
ds_target_two = databackend3 %>% filter(Places == place) %>% pull(Two.Target.Stage)
df_names_place_target_one = databackend2 %>% filter(Names == names, Places == place, d_stage == ds_target_one)
df_names_places_target_two = databackend2 %>% filter(Names == names, Places == place, d_stage == ds_target_two)
df_names_place_target_cu = databackend2 %>% filter(Names == names, Places == place, d_stage == ds_cu) %>% select(Class, Final.Work)
colnames(df_names_place_cu)[2] = "Cu.Work"
df_names_place_target_one = df_names_place_target_one %>% left_join(df_names_place_cu)
df_names_place_target_two = df_names_place_target_one %>% left_join(df_names_place_cu)
df_names_place_target_one$work_norm = df_names_place_target_one$Final.Work / sum_of_cu_work_names
df_names_place_target_two$work_norm = df_names_place_target_two$Final.Work / sum_of_cu_work_names
for (class in df_names_place_target_one$Class) {
source_names = df_names_place_target_one %>% filter(Class == class) %>% slice(1) %>% pull(Parent)
if (is.na(source_names)) next
n_source_names = databackend3 %>% filter(Names == source_names) %>% pull(Cu.No..Of.Emp)
n_source_names = ifelse(n_source_names == 0, 1, n_source_names)
sh_fact_source = databackend3 %>% filter(Names == source_names) %>% pull(Sh.fact)
work_old = df_names_place_target_one %>% filter(Class == class) %>% pull(Final.Work)
sum_of_cu_work_source = databackend2 %>% filter(Names == source_names, d_stage == ds_cu) %>% pull(Final.Work) %>% sum()
df_names_place_target_one[df_names_place_target_one$Class == class, "work_norm"] = (n_source_names / sh_fact_source) * (work_old / sum_of_cu_work_source) / (n_cu_norm_fact / sh_fact_names)
}
for (class in df_names_place_target_two$Class) {
source_names = df_names_place_target_two %>% filter(Class == class) %>% slice(1) %>% pull(Parent)
if (is.na(source_names)) next
n_source_names = databackend3 %>% filter(Names == source_names) %>% pull(Cu.No..Of.Emp)
n_source_names = ifelse(n_source_names == 0, 1, n_source_names)
sh_fact_source = databackend3 %>% filter(Names == source_names) %>% pull(Sh.fact)
work_old = df_names_place_target_two %>% filter(Class == class) %>% pull(Final.Work)
sum_of_cu_work_source = databackend2 %>% filter(Names == source_names, d_stage == ds_cu) %>% pull(Final.Work) %>% sum()
df_names_place_target_two[df_names_place_target_two$Class == class, "work_norm"] = (n_source_names / sh_fact_source) * (work_old / sum_of_cu_work_source) / (n_cu_norm_fact / sh_fact_names)
}
df_names_one = df_names_one %>% bind_rows(df_names_place_target_one)
df_names_two = df_names_two %>% bind_rows(df_names_place_target_two) }
write.csv2(df_names_one, file = paste0("debugging\\one\\", names, ".csv"), row.names = FALSE)
write.csv2(df_names_two, file = paste0("debugging\\bl\\", names, ".csv"), row.names = FALSE)
sum_work_norm_one = df_names_one %>% pull(work_norm) %>% sum(na.rm = TRUE)
sum_work_norm_two = df_names_two %>% pull(work_norm) %>% sum(na.rm = TRUE)
n_emp_df[n_emp_df$names == names, "n_target_one"] = ifelse(n_cu > 0, sum_work_norm_one * n_cu, sum_work_norm_one * 1)
n_emp_df[n_emp_df$names == names, "n_target_two"] = ifelse(n_cu > 0, sum_work_norm_two * n_cu, sum_work_norm_two * 1)
}
The graph below generates a scatter plot based on date2. In addition, a horizontal line that refers to the mean is generated. Each day of the week has a different mean as you can see.
Note that in abline I specified h=mean_saturday, as 10/4 is a Saturday. But I didn't want to always have to change this part of the abline to show the right mean line, but my idea is to leave it automatically, that is, when I enter the date 10/4/2021 in the code, the code already recognize that the 10th it's Saturday and inserts the appropriate mean line. Any idea how to do this?
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
library(tibble)
df <- structure(
list(Id=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
date1 = c("2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20"),
date2 = c("2021-07-01","2021-07-01","2021-07-01","2021-07-01","2021-04-02",
"2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-03",
"2021-04-03","2021-04-03","2021-04-03","2021-04-03","2021-04-08","2021-04-08",
"2021-04-09","2021-04-09","2021-04-10","2021-04-10"),
Week= c("Thursday","Thursday","Thursday","Thursday","Friday","Friday","Friday","Friday",
"Friday","Friday","Saturday","Saturday","Saturday","Saturday","Saturday","Thursday",
"Thursday","Friday","Friday","Saturday","Saturday"),
D = c("","","Ho","","","","","","Ho","","","","","","","","","","","",""),
DR01 = c(2,1,4,3,3,4,1,6,3,7,2,3,4,6,7,8,4,2,6,2,3)),
class = "data.frame", row.names = c(NA, -21L))
mean_thursday=4
mean_friday=5
mean_saturday=6
scatter_date <- function(dt, dta = df) {
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Types", xlim = c(0, 7),
ylim = c((min(.$val) %/% 10) * 10, (max(.$val) %/% 10 + 1) * 15))
abline(h=mean_saturday, col='blue')
}
scatter_date("2021-04-10",df)
You could try to convert the input date in your scatter_date function to a date and get the weekday: my_day <- weekdays(as.Date(dt)) add that to a switch statment for your means:
my_mean <- switch(
my_day,
"Saturday" = mean_saturday,
"Friday" = mean_friday,
"Thursday" = mean_thursday,
0) # add here your other days
and replace mean_saturday in abline(h=mean_saturday, col='blue') with my_mean
here the full code:
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
library(tibble)
df <- structure(
list(Id=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
date1 = c("2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20","2021-07-20",
"2021-07-20","2021-07-20","2021-07-20","2021-07-20"),
date2 = c("2021-07-01","2021-07-01","2021-07-01","2021-07-01","2021-04-02",
"2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-03",
"2021-04-03","2021-04-03","2021-04-03","2021-04-03","2021-04-08","2021-04-08",
"2021-04-09","2021-04-09","2021-04-10","2021-04-10"),
Week= c("Thursday","Thursday","Thursday","Thursday","Friday","Friday","Friday","Friday",
"Friday","Friday","Saturday","Saturday","Saturday","Saturday","Saturday","Thursday",
"Thursday","Friday","Friday","Saturday","Saturday"),
D = c("","","Ho","","","","","","Ho","","","","","","","","","","","",""),
DR01 = c(2,1,4,3,3,4,1,6,3,7,2,3,4,6,7,8,4,2,6,2,3)),
class = "data.frame", row.names = c(NA, -21L))
mean_thursday=4
mean_friday=5
mean_saturday=6
scatter_date <- function(dt, dta = df) {
my_day <- weekdays(as.Date(dt))
my_mean <- switch(
my_day,
"Saturday" = mean_saturday,
"Friday" = mean_friday,
"Thursday" = mean_thursday,
0) # add here your other days
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Types", xlim = c(0, 7), main = paste0(my_day, ":", my_mean),
ylim = c((min(.$val) %/% 10) * 10, (max(.$val) %/% 10 + 1) * 15))
abline(h=my_mean, col='blue')
}
# testing the different means
scatter_date("2021-04-10",df)
scatter_date("2021-04-9",df)
scatter_date("2021-04-8",df)
One way would be to define a data.frame containing the mean for the days of interest and then use weekdays to extract the corresponding mean from that table.
Instead of
mean_thursday=4
mean_friday=5
mean_saturday=6
I would go for something like:
mean_df <- data.frame(mean = c(4:6),
day = c('Thursday', 'Friday', 'Saturday'))
and then
abline(h=subset(mean_df, day == weekdays(as.Date(dt)))$mean, col='blue')
which will be the only change in your function.
I have the following data tables and I would like to make a single data table out of all three.
library(dplyr)
set.seed(123)
dt.Ger <- data.table(date = seq(as.Date('2020-01-01'), by = '1 day', length.out = 365),
Germany = rnorm(365, 2, 1), check.names = FALSE)
dt.Aut <- data.table(date = seq(as.Date('2020-01-01'), by = '1 day', length.out = 365),
Austria = rnorm(365, 4, 2), check.names = FALSE)
dt.Den <- data.table(date = seq(as.Date('2020-01-01'), by = '1 day', length.out = 365),
Denmark = rnorm(365, 3, 1), check.names = FALSE)
dt.Ger <- dt.Ger %>%
mutate(month = format(date, '%b'),
date = format(date, '%d')) %>%
tidyr::pivot_wider(names_from = date, values_from = Germany)
dt.Aut <- dt.Aut %>%
mutate(month = format(date, '%b'),
date = format(date, '%d')) %>%
tidyr::pivot_wider(names_from = date, values_from = Austria)
dt.Den <- dt.Den %>%
mutate(month = format(date, '%b'),
date = format(date, '%d')) %>%
tidyr::pivot_wider(names_from = date, values_from = Denmark)
Now I would like to link all tables together, i.e. first dt.Ger, then possibly add two empty lines and then append dt.Aut, now add again two empty lines and finally add dt.Den. Ideally, it would be great if Germany were the first headline, then Austria (in the second empty line before dt.Aut) and then Denmark (in the second empty line before dt.Den).
So that I only have a single table as a return. This table should look something like this (I only did it with SnippingTool, so it only serves to explain):
EDIT:
Using
l <- list(dt.Ger, dt.Aut, dt.Den)
l.result <- rbindlist(l)
yields to:
And I want to get an extra space/line/row (at the red parts) where Germany, Austria and Denmark is written.
I'm still not sure, what you are trying to achive - for me it seems you are better of working with a list of data.tables.
Furthermore, I switched to using dcast instead of pivot_wider so you can drop tidyr / dplyr.
However, here is an approach inserting NAs inbetween the different data.tables using rbindlist:
library(data.table)
set.seed(123)
dt.Ger <- data.table(date = seq(as.Date('2020-01-01'), by = '1 day', length.out = 365),
Germany = rnorm(365, 2, 1), check.names = FALSE)
dt.Aut <- data.table(date = seq(as.Date('2020-01-01'), by = '1 day', length.out = 365),
Austria = rnorm(365, 4, 2), check.names = FALSE)
dt.Den <- data.table(date = seq(as.Date('2020-01-01'), by = '1 day', length.out = 365),
Denmark = rnorm(365, 3, 1), check.names = FALSE)
# or rather date ~ month?
dt.Ger[, c("month", "date") := list(format(date, '%b'), format(date, '%d'))]
dt.Ger <- dcast(dt.Ger, month ~ date, value.var = "Germany")
dt.Aut[, c("month", "date") := list(format(date, '%b'), format(date, '%d'))]
dt.Aut <- dcast(dt.Aut, month ~ date, value.var = "Austria")
dt.Den[, c("month", "date") := list(format(date, '%b'), format(date, '%d'))]
dt.Den <- dcast(dt.Den, month ~ date, value.var = "Denmark")
# use a list of data.tables:
recommended <- list(Germany = dt.Ger, Austria = dt.Aut, Denmark = dt.Den)
DT <- rbindlist(list(data.table(month = c("", "Germany")), dt.Ger, data.table(month = c("", "Austria")), dt.Aut, data.table(month = c("", "Denmark")), dt.Den), fill = TRUE) # [, V1 := NULL]
DT[,(names(DT)):= lapply(.SD, as.character), .SDcols = names(DT)]
for (j in seq_len(ncol(DT))){
set(DT, which(is.na(DT[[j]])), j, "")
}
print(DT)
I am trying to run some models on some monthly time series data. The times series data are not of equal length and also not starting/ending from/in the same month. What I have is a numeric month column and a numeric year column. I have created a time series from those two variables and made a tsibble out of it so that I can use the fable package. This is what I am doing to process the time series data,
I am posting a simulated data here.
# Packages
library(tidyverse)
library(tsibble)
library(fable)
library(fabletools)
# Simulated data
id <- c(rep (222, 28), rep(111, 36), rep(555, 16))
year <- c(rep(2014, 12), rep(2015, 12), rep(2016, 4),
rep(2014, 12), rep(2015, 12), rep(2016, 12),
rep(2015, 12), rep(2016, 4))
mnt <- c(seq(1, 12, by = 1), seq(1, 12, by = 1), seq(1, 4, by = 1),
seq(1, 12, by = 1), seq(1, 12, by = 1), seq(1, 12, by = 1),
seq(1, 12, by = 1), seq(1, 4, by = 1))
value <- rnorm(80, mean = 123, sd = 50)
dataf <- data.frame(id, mnt, year, value)
To make it a tsibble I am converting my month variable mnt into a character,
dataf$mnt[dataf$mnt == 1] <- "Jan"
dataf$mnt[dataf$mnt == 2] <- "Feb"
dataf$mnt[dataf$mnt == 3] <- "Mar"
dataf$mnt[dataf$mnt == 4] <- "Apr"
dataf$mnt[dataf$mnt == 5] <- "May"
dataf$mnt[dataf$mnt == 6] <- "Jun"
dataf$mnt[dataf$mnt == 7] <- "Jul"
dataf$mnt[dataf$mnt == 8] <- "Aug"
dataf$mnt[dataf$mnt == 9] <- "Sep"
dataf$mnt[dataf$mnt == 10] <- "Oct"
dataf$mnt[dataf$mnt == 11] <- "Nov"
dataf$mnt[dataf$mnt == 12] <- "Dec"
Adding month and year together
dataf %>% unite("time", mnt:year, sep = " ")
Make a tsibble
tsbl <- as_tsibble(dataf, index = time, key = id)
At this point, I am having this error,
> tsbl <- as_tsibble(dataf, index = time, key = id)
Error: `var` must evaluate to a single number or a column name, not a function
Call `rlang::last_error()` to see a backtrace.
The remaining codes are this,
# Fitting arima
fit <- tsbl %>%
fill_gaps(b = 0) %>%
model(
arima = ARIMA(value),
)
fit
# One month ahead forecast
fc <- fit %>%
forecast(h = 1)
fc
# Accuracy measure
accuracy_table <- accuracy(fit)
Any idea how to preprocess my data to run forecasting models from fable package?
You have two small issues where you are creating the time column. The first is that you aren't reassigning your results back to the dataf dataframe, but only posting results to the console. Resolving that will cure your error that you posted.
The next piece is that you'll need a compatible data type. A character isn't quite enough, and you'll want something like the tsibble function yearmonth() to get the job done. For that, you'll see I flipped the order of your unite() call.
The relevant piece:
dataf <- dataf %>% unite("time", c(year, mnt), sep = " ") %>%
mutate(time = yearmonth(time))