Replace multiple for loops with something more efficient in R - r

My question is regarding to for loops and alternatives:
I have multiple consecutive for loops for a big data frame. In every for loop there are new variables/dataframes defined which are necessary for computing the following for loop. Is there a way (maybe lapply or similar) which I can use in this case so that the process will be faster?
databackend2 = data.frame()
databackend2 = structure(list( Class = c("T1", "T1", "T2", "T3", "T5", "Q12"), Places = c("Orlando", "Orlando", "Boston", "LA", "New York", "New York"), Names = c("Planist", "Plantist", "Engi", "Engi", "Shifter", "Automatist"), Final.Work= c(0.08, 0.05, 0.06, 0.05, 0.055, 0.043), Parent = c(NA, NA, NA, "Planist", "Engi", "Engi"), d_stage = c(1, 2, 2, 3, 5, 2))
databackend3 = data.frame()
databackend3 = structure(list( Type = NA, Places = c("Orlando", "Colorado", "Boston", "LA", "New York", "Florida"), D.Stage = c(1.4, 1.5, 2.3, 3.4, 5.1, 2.5), X4= c(3, 3, 4, 5, 5, 4), X5=c(4, 5, 5, 6, 6, 6), Names = c("Sum", "Plantist", "Engi", "Fieldor", "Shifter", "Automatist"), Cu.No.Of.Emp = c(32, 7, 8, 9, 2, 6), Sh.fact = c(NA, 1, 1, 3, 3, 4), Cu.Stage = c(1, 1, 2, 3, 5, 2), One.Target.Stage = c(3, 3, 4, 5, 5, 4), Two.Target.Stage = c(4, 5, 5, 6, 6, 6))
for (names in all_names) {
n_cu_norm_fact = n_cu = databackend3 %>% filter(Names == names) %>% pull(Cu.No..Of.Emp)
sh_fact_nas = databackend3 %>% filter(Roles == role) %>% pull(Sh.fact)
if (is.na(n_cu) | (n_cu == 0)) {
n_cu = 0
n_cu_norm_fact = 1
}
n_emp_df[n_emp_df$names == names, "n_cu"] = n_cu
df_names_one = data.frame()
df_names_two = data.frame()
all_places_names = databackend2 %>% filter(Names == names) %>% pull(Places) %>% unique()
sum_of_cu_work_names_df = data.frame()
for (place in all_places_names) {
ds_cu = databackend3 %>% filter(Places == place) %>% pull(Cu.Stage)
df_i = databackend2 %>% filter(Places == place, Places == place, d_stage == ds_cu) %>% select(Class, Final.Work)
sum_of_cu_work_names_df = sum_of_cu_work_names_df %>% bind_rows(df_i)
}
sum_of_cu_work_names = sum_of_cu_work_names_df$Final.Work %>% sum()
sum_of_cu_work_names = ifelse(n_cu == 0, 1, sum_of_cu_work_names)
for (place in all_places_names) {
ds_cu = databackend3 %>% filter(Places == place) %>% pull(Cu.Stage)
ds_target_one = databackend3 %>% filter(Places == place) %>% pull(One.Target.Stage)
ds_target_two = databackend3 %>% filter(Places == place) %>% pull(Two.Target.Stage)
df_names_place_target_one = databackend2 %>% filter(Names == names, Places == place, d_stage == ds_target_one)
df_names_places_target_two = databackend2 %>% filter(Names == names, Places == place, d_stage == ds_target_two)
df_names_place_target_cu = databackend2 %>% filter(Names == names, Places == place, d_stage == ds_cu) %>% select(Class, Final.Work)
colnames(df_names_place_cu)[2] = "Cu.Work"
df_names_place_target_one = df_names_place_target_one %>% left_join(df_names_place_cu)
df_names_place_target_two = df_names_place_target_one %>% left_join(df_names_place_cu)
df_names_place_target_one$work_norm = df_names_place_target_one$Final.Work / sum_of_cu_work_names
df_names_place_target_two$work_norm = df_names_place_target_two$Final.Work / sum_of_cu_work_names
for (class in df_names_place_target_one$Class) {
source_names = df_names_place_target_one %>% filter(Class == class) %>% slice(1) %>% pull(Parent)
if (is.na(source_names)) next
n_source_names = databackend3 %>% filter(Names == source_names) %>% pull(Cu.No..Of.Emp)
n_source_names = ifelse(n_source_names == 0, 1, n_source_names)
sh_fact_source = databackend3 %>% filter(Names == source_names) %>% pull(Sh.fact)
work_old = df_names_place_target_one %>% filter(Class == class) %>% pull(Final.Work)
sum_of_cu_work_source = databackend2 %>% filter(Names == source_names, d_stage == ds_cu) %>% pull(Final.Work) %>% sum()
df_names_place_target_one[df_names_place_target_one$Class == class, "work_norm"] = (n_source_names / sh_fact_source) * (work_old / sum_of_cu_work_source) / (n_cu_norm_fact / sh_fact_names)
}
for (class in df_names_place_target_two$Class) {
source_names = df_names_place_target_two %>% filter(Class == class) %>% slice(1) %>% pull(Parent)
if (is.na(source_names)) next
n_source_names = databackend3 %>% filter(Names == source_names) %>% pull(Cu.No..Of.Emp)
n_source_names = ifelse(n_source_names == 0, 1, n_source_names)
sh_fact_source = databackend3 %>% filter(Names == source_names) %>% pull(Sh.fact)
work_old = df_names_place_target_two %>% filter(Class == class) %>% pull(Final.Work)
sum_of_cu_work_source = databackend2 %>% filter(Names == source_names, d_stage == ds_cu) %>% pull(Final.Work) %>% sum()
df_names_place_target_two[df_names_place_target_two$Class == class, "work_norm"] = (n_source_names / sh_fact_source) * (work_old / sum_of_cu_work_source) / (n_cu_norm_fact / sh_fact_names)
}
df_names_one = df_names_one %>% bind_rows(df_names_place_target_one)
df_names_two = df_names_two %>% bind_rows(df_names_place_target_two) }
write.csv2(df_names_one, file = paste0("debugging\\one\\", names, ".csv"), row.names = FALSE)
write.csv2(df_names_two, file = paste0("debugging\\bl\\", names, ".csv"), row.names = FALSE)
sum_work_norm_one = df_names_one %>% pull(work_norm) %>% sum(na.rm = TRUE)
sum_work_norm_two = df_names_two %>% pull(work_norm) %>% sum(na.rm = TRUE)
n_emp_df[n_emp_df$names == names, "n_target_one"] = ifelse(n_cu > 0, sum_work_norm_one * n_cu, sum_work_norm_one * 1)
n_emp_df[n_emp_df$names == names, "n_target_two"] = ifelse(n_cu > 0, sum_work_norm_two * n_cu, sum_work_norm_two * 1)
}

Related

highlight cell if previous column meets specific condition R

I have a dataframe
library(flextable)
df = structure(list(col1 = c(1, NA, 1, 1, 1), col2 = c(NA, 1, NA,
1, 1), col3 = c(1, 1, NA, 1, NA), col4 = c(1, 1, 1, 1, NA)), class = "data.frame", row.names = c(NA,
-5L))
df %>% flextable()
I want
to return the last 3 columns highlighted based on the following logic:
red if it is blank
green if and only if the preceeding column was blank.
Based on this, I am trying to create a color matrix to identify the green highlights, but have hit a brick wall.
To identify the red matrix, I used the following code ifelse(is.na(df),"red","").
what would be the best method to identify the green labels
Not the prettiest, but works
df=data.frame(col1 = c(1,NA,1,1,1,1),
col2 = c(NA,1,NA,1,1,1),
col3 = c(1,1,NA,1, NA,1),
col4 = c(1,1,1,1,NA,1))
df %>% flextable()
red = ifelse(is.na(df),1,0)
green = data.frame()
for(n in 1:(ncol(red)-1)){
print(n)
r=ifelse(red[,n]==1 & red[,n+1] == 0,1,0)
green = rbind(green, r)
}
green = t(green)
colnames(green) = paste0("col",2:4)
green
red[,2:4]
ft = df[,2:4] %>%
flextable() %>%
bg(i = ~ is.na(col2), j = 1,bg='red') %>%
bg(i = ~ is.na(col3), j = 2,bg='red') %>%
bg(i = ~ is.na(col4), j = 3,bg='red') %>%
bg(i = ~ green[,1]==1,j = 1, bg='green') %>%
bg(i = ~ green[,2]==1,j = 2, bg='green') %>%
bg(i = ~ green[,3]==1,j = 3, bg='green')
ft

Vectorised wilcox.test function dplyr

I have tried to create a vectorised version of the wilcox.test function in R. It doesn't return the same p values as the original wilcox.test function. Does anyone understand why ?
library(tidyverse)
vect_wilcox <- function(df, grouping_variable, x, y){
df %>%
group_by({{grouping_variable}}) %>%
group_map(~ broom::tidy(wilcox.test(pull(.x[x]), pull(.x[y])), paired = TRUE)) %>%
enframe %>%
unnest(value) %>%
mutate(df %>% distinct({{grouping_variable}}))
}
df <- tribble(
~session, ~name_var, ~time_pt1, ~time_pt2,
1, "fio2", 90, NA,
2, "fio2", 100, 80,
3, "fio2", 100, 70,
4, "fio2", 90, 70,
1, "ph", 7.24, NA,
2, "ph", 7.19, 7.38,
3, "ph", 7.2, 7.2,
4, "ph", 7.37, 7.33
)
new_wilcox <- vect_wilcox(df, grouping_variable = name_var, x= "time_pt1", y="time_pt2")
d3 <- df %>%
pivot_longer(col = 3:4, names_to = "time_point", values_to = "value") %>%
pivot_wider(
names_from = c(name_var, time_point),
values_from = value,
names_sep = "_")
pval = format(wilcox.test(d3$fio2_time_pt1,d3$fio2_time_pt2,paired=T)$p.value,digits=3)
pval = c(pval,format(wilcox.test(d3$ph_time_pt1,d3$ph_time_pt2,paired=T)$p.value,digits=3))
(comp<- new_wilcox %>%
select(name_var, p.value) %>%
mutate(old_p.value = pval) %>%
rename(new_p.value = p.value))
dput(comp)
Thanks a lot !
There were two errors:
a parenthesis error in group_map
the variable reassignement with mutate(df %>% distinct({{grouping_variable}}))obviously didn't work so I changed group_map all together for group_modify.
This function works:
vect_wilcox <- function(df, grouping_variable, x, y){
df %>%
group_by({{grouping_variable}}) %>%
group_modify(~ broom::tidy(
wilcox.test(
x = pull(.x[x]),
y = pull(.x[y]),
paired = TRUE))) %>%
ungroup()
}

Using summary_row() values to calculate group percentage with {gt} package?

I am trying to calculate the percentage for an entire group while using the summary_rows() function from the {gt} package. The problem I am encountering is how to create a function that uses summary_rows() values to calculate a percentage rowwise.
Sample df:
df <- tibble(
division = c("Science", "Science", "Science"),
department = c("Biology", "Biology", "Biology"),
course_num = c('101', '201', "301"),
widthraws = c(1, 2, 3),
unsucessful = c(0, 0 , 0),
successfull = c(1, 3, 4),
total_enrolled = c(2, 5, 7),
percent_successful = c(.50, .60, .57)
)
Sample of gt table:
df %>%
group_by(division, department) %>%
gt() %>%
summary_rows(
groups = TRUE,
columns = 4:7,
missing_text = " ",
fns = list(
total = ~sum(.)),
)
What I would want is the total row of the percent_successful column to be .57. Open to other ideas that would help me achieve this too.
Compute the percentage for total outside and add a layer
library(gt)
library(dplyr)
total_success_perc <- with(df, round(sum(successfull)/sum(total_enrolled), 2))
df %>%
group_by(division, department) %>%
gt() %>%
summary_rows(
groups = TRUE,
columns = 4:7,
missing_text = " ",
fns = list(
total = ~sum(.)),
) %>%
summary_rows(groups = TRUE, columns = 8, missing_text = " ",
fns = list(total = ~ c(total_success_perc)))
-output
library(tidyverse)
library(gt)
df <- tibble(
division = c("Science", "Science", "Science"),
department = c("Biology", "Biology", "Biology"),
course_num = c('101', '201', "301"),
widthraws = c(1, 2, 3),
unsucessful = c(0, 0 , 0),
successfull = c(1, 3, 4),
total_enrolled = c(2, 5, 7),
percent_successful = c(.50, .60, .57)
)
df %>%
group_by(division, department) %>%
gt() %>%
summary_rows(
groups = TRUE,
columns = c(widthraws:percent_successful),
missing_text = " ",
fns = list(
total = ~sum(.),
max = ~max(.),
min = ~min(.),
medain = ~median(.))
)

An error keeps appearing that the sample doesn't work in R Studio cloud and I don't know why

sharp_null_thought_experiment <-
function() {
final_data %>%
mutate(
OUTCOME_Z_0 = rnorm(n(), sd = 0.5007117),
OUTCOME_Z_1 = OUTCOME_Z_0,
Z = sample(rep(c(0, 1), times = c(sum(final_data$treatment_group=="control"), sum(final_data$treatment_group=="treatment"))), size = n()),
OUTCOME = if_else(Z == 0, OUTCOME_Z_0, OUTCOME_Z_1)
) %>%
difference_in_means(OUTCOME ~ Z, data = .) %>%
tidy
}
sampling_distribution_sharp_null <- rerun(1000, sharp_null_thought_experiment()) %>%
bind_rows
sampling_distribution_sharp_null %>%
summarise(mean(estimate>=results$estimate))

creating summary tables with tidyverse

Q1. Is there a more direct (but still tidyverse) way to create a summary table like this?
library(tidyverse)
library(knitr)
library(kableExtra)
df <- data.frame(group=c(1, 1, 1, 1, 0, 0, 0, 0),
v1=c(1, 2, 3, 4, 5, 6, 1, 2),
v2=c(4, 3, 2, 5, 3, 5, 3, 8),
v3=c(0, 1, 0, 1, 1, 0, 1, 1))
df %>%
group_by(group) %>%
summarise(v1=paste0(round(mean(v1), 2),
" (",
round(sd(v1), 2),
")"),
v2=paste0(round(mean(v2), 2),
" (",
round(sd(v2), 2),
")"),
v3=round(mean(v3)*100, 1)
) %>%
dplyr::select(-group) %>%
t() %>%
`rownames<-` (c("v1 mean (SD)",
"v2 mean (SD)",
"Percent v3")) %>%
kable("html",
col.names=c("Group 0", "Group 1")) %>%
kable_styling()
Q2. Related to this, is there a way to combine two levels of summarise (e.g., no grouping + grouping) without repeating the summarise code?
all <-
df %>%
summarise(v1=paste0(round(mean(v1), 2),
" (",
round(sd(v1), 2),
")"),
v2=paste0(round(mean(v2), 2),
" (",
round(sd(v2), 2),
")"),
v3=round(mean(v3)*100, 1)
) %>%
t() %>%
`rownames<-` (c("v1 mean (SD)",
"v2 mean (SD)",
"Percent v3"))
groups <-
df %>%
group_by(group) %>%
summarise(v1=paste0(round(mean(v1), 2),
" (",
round(sd(v1), 2),
")"),
v2=paste0(round(mean(v2), 2),
" (",
round(sd(v2), 2),
")"),
v3=round(mean(v3)*100, 1)
) %>%
dplyr::select(-group) %>%
t() %>%
`rownames<-` (c("v1 mean (SD)",
"v2 mean (SD)",
"Percent v3"))
all %>%
cbind(groups) %>%
kable("html",
col.names=c("All", "Group 0", "Group 1")) %>%
kable_styling()
One solution (especially if you want to expand the number of columns v1, v2, ... in the future) to make your code a bit more concise might be, to put paste0(round(mean(v1), 2)," (", round(sd(v1), 2), ")") into a function: paste_mean_and_sd = function(df_col){paste0(round(mean(df_col), 2)," (", round(sd(df_col), 2), ")")}.
That would shorten your "pipeline" and make it more easily readable:
... %>% summarise(v1 = paste_mean_and_sd(v1), v2 = paste_mean_and_sd(v2), v3=round(mean(v3)*100, 1)) %>% ...
This is the minimum I can think of.
cat_var <- "v3"
df_cal <- function(x, var) {
if (var[1] %in% cat_var) return(as.character(round(mean(x), 1)))
paste0(mean(x), " (", round(sd(x), 2), ")")
}
df_tall <- df %>% gather(var, x, v1:v3) %>% group_by(var)
all <- df_tall %>% summarise(stat = df_cal(x, var)) %>% mutate(group = -1)
groups <- df_tall %>% group_by(group, var) %>% summarise(stat = df_cal(x, var))
bind_rows(all, groups) %>%
ungroup() %>%
mutate(var = factor(var, labels = c(
"v1 mean (SD)", "v2 mean (SD)", "Precent v3"
))) %>%
spread(group, stat) %>%
kable("html", col.names = c(" ", "All", "Group 0", "Group 1")) %>%
kable_styling()

Resources