Dynamic formula not working with startsWith and colnames - r

I'm writing a function to create tables and need some conditional rules for formatting. One rule is based on a column name, but when I send it down using as.formula it seems to evaluate too early. I've made an example here:
library(tidyverse)
library(rlang)
library(flextable)
a <- as_tibble(x = cbind(Year = c(2018, 2019, 2020),
                         a = 1:3,
                         b.1 = c("a", "b", "c"),
                         b.2 = c("d", "e", "f"),
                         fac = c("This", "This", "That")))
foo <- function(x, y, z, ...) {
  y_var <- enquo(y)
  x %>%
    filter(Year %in% c(2018, 2019),
           ...) %>%
    mutate(!!quo_name(y_var) := factor(!!y_var,
                                       levels = z,
                                       ordered = TRUE)) %>%
    arrange(!!y_var)
}
to.table <- function(x, y, z, ...) {
  y_var <- enquo(y)
  df.in <- foo(x = x,
               y = !!y_var,
               z = z)
  cond <- paste("~!is.na(", quo_name(y_var), ")")
  cond.2 <- paste("~startsWith(colnames(", df.in, "),\"b\")")
  flextable(df.in) %>%
    bold(i = as.formula(cond),
         part = "body") %>%
    bg(i = as.formula(cond.2),
       bg = "Red3",
       j = as.formula(cond.2))
}
to.table(x = a,
         y = Year,
         z = c(2020, 2018, 2019),
         fac == "This")
Error in startsWith(colnames(2:3), "b") : non-character object(s)
From the error I've been receiving, it looks like the expression is evaluated before it gets put through as.formula, as those two columns are the correct answer.
Proof:
df.in <- foo(x = a,
             y = Year,
             z = c(2020, 2018, 2019),
             fac == "This")
startsWith(colnames(df.in), prefix = "b")
[1] FALSE FALSE TRUE TRUE FALSE
What am I missing here? If anyone has a solution, or a suggestion on how to do things differently (potentially using quosures or other tidyverse-friendly methods), I would much appreciate it.
Extension:
To make this a bit clearer, I should elaborate on the intended use of this example. I'm trying to take column names generated dynamically in a function (represented by foo) that start with a specified prefix (generally 3 columns), and then check those columns for a specified value that I can highlight in a specific color.
Additionally, in the answer below, cond is used in both i = designations; the two separate conditions will likely never overlap.

We could specify j with the column names of the created data, i.e. startsWith returns a logical vector from the column names based on those that start with 'b'; use that logical vector to extract the matching column names with [ (nm1).
to.table <- function(x, y, z, ...) {
  y_var <- enquo(y)
  df.in <- foo(x = x,
               y = !!y_var,
               z = z)
  cond <- as.formula(glue::glue('~ !is.na({quo_name(y_var)})'))
  nm1 <- names(df.in)[startsWith(names(df.in), prefix = "b")]
  flextable(df.in) %>%
    bold(i = cond,
         part = "body") %>%
    bg(i = cond,
       bg = "Red3",
       j = nm1)
}
-testing
to.table(x = a,
         y = Year,
         z = c(2020, 2018, 2019),
         fac == "This")
-output
In the OP's post, the formula created for 'cond' is fine (glue just makes it a bit more flexible), whereas the second one, 'cond.2', returns
paste("~startsWith(colnames(", df.in, "),\"b\")")
[1] "~startsWith(colnames( 2:3 ),\"b\")" "~startsWith(colnames( c(\"1\", \"2\") ),\"b\")"
[3] "~startsWith(colnames( c(\"a\", \"b\") ),\"b\")" "~startsWith(colnames( c(\"d\", \"e\") ),\"b\")"
[5] "~startsWith(colnames( c(\"This\", \"This\") ),\"b\")"
This is because df.in is a data.frame, and paste vectorizes over its columns: the startsWith(colnames( string is pasted onto each column's deparsed values, so each line returned contains one column's values.
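To see the recycling in isolation, here is a minimal sketch with a made-up two-column data frame (d is hypothetical): paste coerces the data frame via as.character, which deparses each column, then recycles the surrounding strings over those columns:
d <- data.frame(p = 1:2, q = c("x", "y"))
paste("~startsWith(colnames(", d, "),\"b\")")
# [1] "~startsWith(colnames( 1:2 ),\"b\")"
# [2] "~startsWith(colnames( c(\"x\", \"y\") ),\"b\")"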
If we want to color columns whose names start with either 'a' or 'b', change startsWith to grep, which can take a regex as its pattern
to.table <- function(x, y, z, ...) {
  y_var <- enquo(y)
  df.in <- foo(x = x,
               y = !!y_var,
               z = z)
  cond <- as.formula(glue::glue('~ !is.na({quo_name(y_var)})'))
  nm1 <- grep("^(a|b)", names(df.in), value = TRUE)
  flextable(df.in) %>%
    bold(i = cond,
         part = "body") %>%
    bg(i = cond,
       bg = "Red3",
       j = nm1)
}
to.table(x = a,
         y = Year,
         z = c(2020, 2018, 2019),
         fac == "This")
-output
If we want to color based on the value of 'a'
to.table <- function(x, y, z, ...) {
  y_var <- enquo(y)
  df.in <- foo(x = x,
               y = !!y_var,
               z = z)
  cond <- as.formula(glue::glue('~ !is.na({quo_name(y_var)})'))
  nm1 <- names(df.in)[startsWith(names(df.in), prefix = "b")]
  flextable(df.in) %>%
    bold(i = cond,
         part = "body") %>%
    bg(i = ~ a == 2,
       bg = "Red3",
       j = nm1)
}
to.table(x = a,
         y = Year,
         z = c(2020, 2018, 2019),
         fac == "This")
-output
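Putting the extension together, a hedged sketch that takes the prefix and the value to highlight as arguments (to.table2, prefix and val are hypothetical names, and ... is forwarded to foo so the filter condition reaches it):
to.table2 <- function(x, y, z, prefix, val, ...) {
  y_var <- enquo(y)
  df.in <- foo(x = x, y = !!y_var, z = z, ...)
  cond <- as.formula(glue::glue('~ !is.na({quo_name(y_var)})'))
  nm1 <- names(df.in)[startsWith(names(df.in), prefix = prefix)]
  ft <- flextable(df.in) %>% bold(i = cond, part = "body")
  # highlight cells in each prefixed column that equal the requested value
  for (nm in nm1) {
    ft <- bg(ft, i = as.formula(glue::glue('~ {nm} == "{val}"')),
             bg = "Red3", j = nm)
  }
  ft
}
to.table2(x = a, y = Year, z = c(2020, 2018, 2019),
          prefix = "b", val = "e", fac == "This")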

Related

Extract each data frame name from a list of data frames in lapply function

I have a list of many data frames and am trying to perform manipulations to each data frame in the list. I created this lapply function, and the list is then merged together. However, when trying to rename certain columns so that they include the respective data frame name:
paste(deparse(substitute(x)), "_start")
the dataframe names are being extracted like this :
x[[i]]_start_1
Here is the full code:
df_list <- lapply(df_list, function(x) {
  lookup <- c(start = paste(deparse(substitute(x)), "_start"),
              end = paste(deparse(substitute(x)), "_end"),
              top = paste(deparse(substitute(x)), "_top"),
              left = paste(deparse(substitute(x)), "_left"),
              height = paste(deparse(substitute(x)), "_height"),
              width = paste(deparse(substitute(x)), "_width"),
              type = paste(deparse(substitute(x)), "_type"),
              value = paste(deparse(substitute(x)), "_value"))
  x <- x %>% rename_with(.fn = ~ lookup[.x], .cols = intersect(names(.), names(lookup)))
  x <- arrange(x, creativeId)
  x <- x[, -1]
  x <- x %>% distinct()
  x$counter <- with(x, ave(creativeId, with(rle(creativeId), rep(seq_along(values), lengths)), FUN = seq_along))
  x <- x %>% relocate(counter)
  x <- x %>% pivot_wider(names_from = counter, values_from = -names(.)[1:2])
})
new_df <- Reduce(function(x,y) merge(x,y,all=TRUE), df_list)
Please let me know if there is a workaround so that the data frame names are printed correctly. Thank you!
We may use Map to pass each data frame together with its name:
df_list2 <- Map(function(x, nm) {
  lookup <- c(start = paste0(nm, "_start"),
              end = paste0(nm, "_end"),
              top = paste0(nm, "_top"),
              left = paste0(nm, "_left"),
              height = paste0(nm, "_height"),
              width = paste0(nm, "_width"),
              type = paste0(nm, "_type"),
              value = paste0(nm, "_value"))
  x <- x %>%
    rename_with(.fn = ~ lookup[.x], .cols = intersect(names(.), names(lookup)))
  x <- arrange(x, creativeId)
  x <- x[, -1]
  x <- x %>% distinct()
  x$counter <- with(x, ave(creativeId,
                           with(rle(creativeId), rep(seq_along(values), lengths)), FUN = seq_along))
  x <- x %>% relocate(counter)
  x <- x %>% pivot_wider(names_from = counter, values_from = -names(.)[1:2])
}, df_list, names(df_list))
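As a usage note, the same name-aware iteration is available in the tidyverse via purrr::imap, which passes each element together with its name; a minimal sketch of just the lookup construction (the rest of the body would be unchanged):
library(purrr)
df_list2 <- imap(df_list, function(x, nm) {
  suffixes <- c("start", "end", "top", "left", "height", "width", "type", "value")
  lookup <- setNames(paste0(nm, "_", suffixes), suffixes)
  x %>% rename_with(.fn = ~ lookup[.x], .cols = intersect(names(.), names(lookup)))
})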

Processing a data frame in r by subgroup: is it possible to get rid of the 'for' loop?

I frequently work with data frames and have to run some sophisticated data wrangling / manipulations by subgroup that is defined in one of the columns. I am aware of dplyr and group_by and know that many things could be solved using group_by. However, often I have to do some pretty intricate calculations and end up just using the 'for' loop.
I was wondering about the existence of some other general approach or paradigm that is faster/more elegant. Maybe map (that I am not very familiar with)?
Below is an example. Notice - it is fake and meaningless. So let's ignore why I need to do those things, or the fact that there could be 2 consecutive NAs in a column, etc. That's not the focus of my question. The point is that often I have to operate "within the constraints of a subgroup" and then - inside that subgroup - I have to do operations columnwise, rowwise and sometimes even cellwise.
I also realize that I could probably put most of that code inside a function, split my data frame into a list based on 'group', apply this function to each element of that list and then do.call(rbind...) at the end. But is this the only way?
Thanks a lot for any hints!
library(dplyr)
library(forcats)
set.seed(123)
x <- tibble(group = c(rep('a', 10), rep('b', 10), rep('c', 10)),
            attrib = c(sample(c("one", "two", "three", "four"), 10, replace = T),
                       sample(c("one", "two", "three"), 10, replace = T),
                       sample(c("one", "three", "four"), 10, replace = T)),
            v1 = sample(c(1:5, NA), 30, replace = T),
            v2 = sample(c(1:5, NA), 30, replace = T),
            v3 = sample(c(1:5, NA), 30, replace = T),
            n1 = abs(rnorm(30)), n2 = abs(rnorm(30)), n3 = abs(rnorm(30)))
v_vars = paste0("v", 1:3)
n_vars = paste0("n", 1:3)
results <- NULL # Placeholder for final results
for (i in seq(length(unique(x$group)))) { # loop through groups
  mygroup <- unique(x$group)[i]
  mysubtable <- x %>% filter(group == mygroup)
  # IMPUTE NAs in v columns
  # Replace every NA with a mean of values above and below it; and if it's the first or
  # the last value, with the mean of 2 values below or above it.
  for (v in v_vars) { # loop through v columns
    which_nas <- which(is.na(mysubtable[[v]])) # create index of NAs for column v
    if (length(which_nas) == 0) next else {
      for (na in which_nas) { # loop through indexes of column values that are NAs
        if (na == 1) {
          mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na + 1],
                                        mysubtable[[v]][na + 2]), na.rm = TRUE)
        } else if (na == nrow(mysubtable)) {
          mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na - 2],
                                        mysubtable[[v]][na - 1]), na.rm = TRUE)
        } else {
          mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na - 1],
                                        mysubtable[[v]][na + 1]), na.rm = TRUE)
        }
      } # end of loop through NA indexes
    } # end of else
  } # end of loop through v vars
  # Aggregate v columns (mean) for each value of column 'attrib'
  result1 <- mysubtable %>% group_by(attrib) %>%
    summarize_at(v_vars, mean)
  # Aggregate n columns (sum) for each value of column 'attrib'
  result2 <- mysubtable %>% group_by(attrib) %>%
    summarize_at(n_vars, sum)
  # final result should contain the name of the group
  results[[i]] <- cbind(mygroup, result1, result2[-1])
}
results <- do.call(rbind, results)
Maybe this example is too simple, but in this case, the only thing you need to pull out is the imputation.
my_impute <- function(x) {
  which_nas <- which(is.na(x))
  for (na in which_nas) {
    if (na == 1) {
      x[na] <- mean(c(x[na + 1], x[na + 2]), na.rm = TRUE)
    } else if (na == length(x)) {
      x[na] <- mean(c(x[na - 2], x[na - 1]), na.rm = TRUE)
    } else {
      x[na] <- mean(c(x[na - 1], x[na + 1]), na.rm = TRUE)
    }
  }
  x
}
Then you just need to group appropriately and impute and summarize.
x2 <- x %>% group_by(group) %>% mutate_at(v_vars, my_impute) %>%
  group_by(group, attrib)
full_join(x2 %>% summarize_at(v_vars, mean),
          x2 %>% summarize_at(n_vars, sum))
My usual method for things like this, where similar calculations need to be on a bunch of columns, is to put it in long format. Here it feels a little like the long way round, but perhaps this would be useful to see.
library(tidyr) # gather/spread/separate
x %>% mutate(row = 1:n()) %>%
  gather("variable", "value", c(v_vars, n_vars)) %>%
  separate(variable, c("var", "x"), sep = 1) %>%
  spread(var, value) %>%
  arrange(group, x, row) %>%
  group_by(group, x) %>%
  mutate(v = my_impute(v)) %>%
  group_by(group, attrib, x) %>%
  summarize(v = mean(v), n = sum(n)) %>%
  gather("var", "value", v, n) %>%
  mutate(X = paste0(var, x)) %>%
  select(-x, -var) %>%
  spread(X, value)
More generally, split-apply-combine is probably the way to go, as you suggest in your question; here's a way using the tidyverse.
doX <- function(x) {
  x2 <- x %>% mutate_at(v_vars, my_impute) %>% group_by(attrib)
  full_join(x2 %>% summarize_at(v_vars, mean),
            x2 %>% summarize_at(n_vars, sum))
}
library(purrr) # for map
x %>% group_by(group) %>% nest() %>%
  mutate(result = map(data, doX)) %>% select(-data) %>% unnest()
The more traditional method is with do.call, split, and rbind; here I don't make the effort to keep the group information.
do.call(rbind, lapply(split(x, x$group), doX))
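If you do want to keep the group labels with the split/lapply route, a minimal sketch is to bind each split element's name back on as a column (assuming doX, as defined above, drops the group column):
res_list <- lapply(split(x, x$group), doX)
do.call(rbind, Map(cbind, mygroup = names(res_list), res_list))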
The first thing to do is to change your data imputing into a function. I made some simple modifications to have it accept a vector and simplified the call to mean.
fx_na_rm <- function(z) {
  which_nas <- which(is.na(z))
  if (length(which_nas) > 0) {
    for (na in which_nas) { # loop through indexes of column values that are NAs
      if (na == 1) {
        z[na] <- mean(z[na + (1:2)], na.rm = TRUE)
      } else if (na == length(z)) {
        z[na] <- mean(z[na - (1:2)], na.rm = TRUE)
      } else {
        z[na] <- mean(z[c(na - 1, na + 1)], na.rm = TRUE)
      }
    } # end of loop through NA indexes
  }
  return(z)
}
I like data.table, so here's a solution that uses it. Since you apply different functions to the n and v variable groups, purrr and most other approaches will also be a little awkward.
library(data.table)
dt <- copy(as.data.table(x))
v_vars = paste0("v", 1:3)
n_vars = paste0("n", 1:3)
dt[, (v_vars) := lapply(.SD, as.numeric), .SDcols = v_vars]
dt[, (v_vars) := lapply(.SD, fx_na_rm), by = group, .SDcols = v_vars]
# see https://stackoverflow.com/questions/50626316/r-data-table-apply-function-a-to-some-columns-and-function-b-to-some-others
scols <- list(v_vars, n_vars)
funs <- rep(c(mean, sum), lengths(scols))
dt[, setNames(Map(function(f, x) f(x), funs, .SD), unlist(scols))
, by = .(group,attrib)
, .SDcols = unlist(scols)]
The for loop itself is difficult to vectorize because each result can depend on previously imputed values. Here is my attempt, which does not produce output identical to yours:
# not identical
fx_na_rm2 <- function(z) {
  which_nas <- which(is.na(z))
  if (length(which_nas) > 0) {
    ind <- c(rbind(which_nas - 1 + 2 * (which_nas == 1) + -1 * (which_nas == length(z)),
                   which_nas + 1 + 1 * (which_nas == 1) + -2 * (which_nas == length(z))))
    z[which_nas] <- colMeans(matrix(z[ind], nrow = 2), na.rm = T)
  }
  return(z)
}
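As a side note, for interior NAs the neighbour-mean rule is exactly linear interpolation, so zoo::na.approx covers that case in one call; the endpoint rule above (mean of the two nearest observed values) differs from na.approx's edge behaviour, so this is only a sketch for interior gaps:
library(zoo)
z <- c(1, NA, 3, NA, 5)
na.approx(z) # each interior NA becomes the mean of its neighbours
# [1] 1 2 3 4 5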

Is there a way I can simplify the code below using vectors?

I am using R. I need to create a new column in a data frame that is the sum of the three variables. The sum should only take place if there are numeric values for each of the three variables. In other words, if there are any NAs or blanks the sum should not take place.
I have written the code below which works, but would like to simplify it. I am interested in using vectors to avoid repetition in my code.
data.x <- data.frame('time' = c(1:11),
                     'x' = c(5, 3, "", 'ND', 2, 'ND', 7, 8, 'ND', 1, " "))
data.x[data.x == ''] <- 'NA'
data.x[data.x == ' '] <- 'NA'
data.x[data.x == 'ND'] <- 'NA'
data.x.na.omit <- na.omit(data.x)
data.y <- data.frame('time' = c(1:8),
                     'y' = c(5, 2, 3, 1, 2, NA, NA, 8))
data.y[data.y == ''] <- 'NA'
data.y[data.y == ' '] <- 'NA'
data.y[data.y == 'ND'] <- 'NA'
data.y.na.omit <- na.omit(data.y)
data.z <- data.frame('time' = c(1:5),
                     'z' = c(1:5))
data.z[data.z == ''] <- 'NA'
data.z[data.z == ' '] <- 'NA'
data.z[data.z == 'ND'] <- 'NA'
data.z.na.omit <- na.omit(data.z)
data.x.y <- merge.data.frame(data.x.na.omit, data.y.na.omit, by.x = "time", by.y = "time")
data.x.y.z <- merge.data.frame(data.x.y, data.z.na.omit, by.x = "time", by.y = "time")
data.x.y.z$x <- as.numeric(data.x.y.z$x)
data.x.y.z$y <- as.numeric(data.x.y.z$y)
data.x.y.z$z <- as.numeric(data.x.y.z$z)
data.x.y.z$result <- data.x.y.z$x + data.x.y.z$y + data.x.y.z$z
I don't see particularly good ways to use vectors to avoid repetition. I would suggest the following, though:
Removing NA rows by evaluating the result column once, so you don't have to do this for each of x, y and z.
Setting stringsAsFactors to FALSE, so a single line like data.x$x <- as.numeric(data.x$x) automatically coerces non-numeric strings to NA and you don't have to replace them separately.
Bringing the data in as a single dataframe (by padding the bottom of columns y and z with NA), rather than creating data.x, data.y and data.z and then merging.
For example, code with these suggestions might look like this:
# Create merged data
data <- data.frame('time' = c(1:11),
                   'x' = c(5, 3, "", 'ND', 2, 'ND', 7, 8, 'ND', 1, " "),
                   'y' = c(5, 2, 3, 1, 2, NA, NA, 8, rep(NA, 3)),
                   'z' = c(1:5, rep(NA, 6)),
                   stringsAsFactors = F)
# Convert x, y and z to numeric
for (col in c("x", "y", "z"))
  class(data[, col]) <- "numeric"
# Add x, y and z together
data$result <- data$x + data$y + data$z
# Remove NAs at the end
data <- na.omit(data)
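A small design note: because + already propagates NA, the explicit three-term sum generalizes to any number of columns with rowSums; without na.rm, any NA in a row yields NA, which matches the "only sum complete rows" requirement:
data$result <- rowSums(data[c("x", "y", "z")])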
If your data sources are such that you can't bring them in as a single dataframe, but you have to merge them, then you could replace the "Create merged data" section with something like this:
# Create separate data
data.x <- data.frame('time' = c(1:11),
                     'x' = c(5, 3, "", 'ND', 2, 'ND', 7, 8, 'ND', 1, " "),
                     stringsAsFactors = F)
data.y <- data.frame('time' = c(1:8),
                     'y' = c(5, 2, 3, 1, 2, NA, NA, 8),
                     stringsAsFactors = F)
data.z <- data.frame('time' = c(1:5),
                     'z' = c(1:5),
                     stringsAsFactors = F)
# Merge data
data.xy <- merge(data.x, data.y)
data <- merge(data.xy, data.z)
# Now continue main code suggestion from the 'Convert x, y and z to numeric' section
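If more sources appear later, the pairwise merge calls can also be collapsed with Reduce; a one-line equivalent of the two merges above (merging on the shared time column):
data <- Reduce(merge, list(data.x, data.y, data.z))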

Can't use !!arg in dplyr for mutate call

I can use !! to filter by a user-given variable but not to modify that same variable. The following function throws an error when created, but it works just fine if I delete the mutate call.
avg_dims <- function(x, y) {
  y <- enquo(y)
  x %>%
    filter(!!y != "TOTAL") %>%
    mutate(!!y = "MEAN") %>%
    group_by(var1, var2)
}
The name of the column on the LHS of the assignment goes with the assignment operator (:=) instead of the = operator. Also, the name should be either a string or a symbol, so we can convert the quosure ('y' from enquo) to a string (quo_name) and then do the evaluation (!!):
avg_dims <- function(x, y) {
  y <- enquo(y)
  y1 <- rlang::quo_name(y)
  x %>%
    filter(!!y != "TOTAL") %>%
    mutate(!!y1 := "MEAN") %>%
    group_by(var1, var2)
}
avg_dims(df1, varN)
data
set.seed(24)
df1 <- data.frame(var1 = rep(LETTERS[1:3], each = 4),
                  var2 = rep(letters[1:2], each = 6),
                  varN = sample(c("TOTAL", "hello", 'bc'), 12, replace = TRUE),
                  stringsAsFactors = FALSE)
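As a usage note, with a recent rlang (>= 0.4) the enquo()/quo_name() pair can be written with the curly-curly operator, using a glue string on the LHS of := for the name; a minimal sketch assuming a current rlang/dplyr:
avg_dims <- function(x, y) {
  x %>%
    filter({{ y }} != "TOTAL") %>%
    mutate("{{ y }}" := "MEAN") %>%
    group_by(var1, var2)
}
avg_dims(df1, varN)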

Diff-in-diff estimation with resampling from large dataset

I have a large dataset on which to perform a diff-in-diff estimation. Given the size of the dataset, the standard errors in my t-statistic denominators are tiny, and the coefficients come out (spuriously) statistically significant.
I want to reduce the number of elements in the dataset step by step, and at each step resample a large number of times, re-estimating the interaction coefficient and its standard error each time.
Then I want to take the average estimates and standard errors and plot them on a graph, to show at what point (if any) they are no longer statistically different from zero.
My code follows with a toy example. Three concerns:
I am not sure this is the most efficient way to tackle the problem.
I cannot retrieve, and therefore cannot plot, the confidence interval.
I am not sure the sampling is representative, given the existence of different groups.
Toy example (credit: Torres-Reyna, 2015)
library(foreign)
library(dplyr)
library(ggplot2)
# example data and diff-in-diff variables (from the Torres-Reyna example)
mydata <- read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata$time <- ifelse(mydata$year >= 1994, 1, 0)
mydata$treated <- ifelse(mydata$country %in% c("E", "F", "G"), 1, 0)
mydata$did <- mydata$time * mydata$treated
df_0 <- NULL
for (i in 1:length(seq(5, nrow(mydata) - 1, 5))) {
  index <- seq(5, nrow(mydata), 5)[i]
  df_1 <- NULL
  for (j in 1:10) {
    mydata_temp <- mydata[sample(nrow(mydata), index), ]
    didreg <- lm(y ~ treated + time + did, data = mydata_temp)
    out <- summary(didreg)
    new_line <- c(out$coefficients[, 1][4], out$coefficients[, 2][4], index)
    new_line <- data.frame(t(new_line))
    names(new_line) <- c("c", "s", "i")
    df_1 <- rbind(df_1, new_line)
  }
  df_0 <- rbind(df_0, df_1)
}
df_0 <- df_0 %>% group_by(i) %>% summarise(coefficient <- mean(c, na.rm = T),
                                           standard_error <- mean(s, na.rm = T))
names(df_0) <- c("i", "c", "s")
View(df_0)
Consider the following refactored code using base R functions: within, %in%, nested lapply, setNames, aggregate, and do.call. This approach avoids calling rbind in a loop and compactly rewrites the code without constantly using $ column referencing.
library(foreign)
mydata = read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata <- within(mydata, {
  time <- ifelse(year >= 1994, 1, 0)
  treated <- ifelse(country %in% c("E", "F", "G"), 1, 0)
  did <- time * treated
})
# OUTER LIST OF DATA FRAMES
df_0_list <- lapply(1:length(seq(5, nrow(mydata) - 1, 5)), function(i) {
  index <- seq(5, nrow(mydata), 5)[i]
  # INNER LIST OF DATA FRAMES
  df_1_list <- lapply(1:100, function(j) {
    mydata_temp <- mydata[sample(nrow(mydata), index), ]
    didreg <- lm(y ~ treated + time + did, data = mydata_temp)
    out <- summary(didreg)
    new_line <- c(out$coefficients[, 1][4], out$coefficients[, 2][4], index)
    new_line <- setNames(data.frame(t(new_line)), c("c", "s", "i"))
  })
  # APPEND ALL INNER DFS
  df <- do.call(rbind, df_1_list)
  return(df)
})
# APPEND ALL OUTER DFS
df_0 <- do.call(rbind, df_0_list)
# AGGREGATE WITH NEW COLUMNS
df_0 <- within(aggregate(cbind(c, s) ~ i, df_0, function(x) mean(x, na.rm = TRUE)), {
  upper <- c + s
  lower <- c - s
})
# RUN PLOT
within(df_0, {
  plot(i, c, ylim = c(min(c) - 5000000000, max(c) + 5000000000), type = "l",
       cex.lab = 0.75, cex.axis = 0.75, cex.main = 0.75, cex.sub = 0.75)
  polygon(c(i, rev(i)), c(lower, rev(upper)),
          col = "grey75", border = FALSE)
  lines(i, c, lwd = 2)
})
In the end I solved it like this:
Is this the most efficient way?
library(foreign)
library(dplyr)
mydata <- read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata$time <- ifelse(mydata$year >= 1994, 1, 0)
mydata$treated <- ifelse(mydata$country == "E" |
                           mydata$country == "F" |
                           mydata$country == "G", 1, 0)
mydata$did <- mydata$time * mydata$treated
df_0 <- NULL
for (i in 1:length(seq(5, nrow(mydata) - 1, 5))) {
  index <- seq(5, nrow(mydata), 5)[i]
  df_1 <- NULL
  for (j in 1:100) {
    mydata_temp <- mydata[sample(nrow(mydata), index), ]
    didreg <- lm(y ~ treated + time + did, data = mydata_temp)
    out <- summary(didreg)
    new_line <- c(out$coefficients[, 1][4], out$coefficients[, 2][4], index)
    new_line <- data.frame(t(new_line))
    names(new_line) <- c("c", "s", "i")
    df_1 <- rbind(df_1, new_line)
  }
  df_0 <- rbind(df_0, df_1)
}
df_0 <- df_0 %>% group_by(i) %>% summarise(c = mean(c, na.rm = T),
                                           s = mean(s, na.rm = T))
df_0 <- df_0 %>% group_by(i) %>% mutate(upper = c + s, lower = c - s)
df <- df_0
plot(df$i, df$c, ylim = c(min(df_0$c) - 5000000000, max(df_0$c) + 5000000000), type = "l")
polygon(c(df$i, rev(df$i)), c(df$lower, rev(df$upper)), col = "grey75", border = FALSE)
lines(df$i, df$c, lwd = 2)
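For the confidence-interval concern, a hedged sketch: scale the averaged standard error by 1.96 for an approximate 95% band and draw it with geom_ribbon (whether averaging resampled standard errors this way is statistically sound is a separate question):
library(ggplot2)
ggplot(df_0, aes(x = i, y = c)) +
  geom_ribbon(aes(ymin = c - 1.96 * s, ymax = c + 1.96 * s), fill = "grey75") +
  geom_line() +
  labs(x = "sample size", y = "interaction coefficient")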
