Rowwise filtering in dplyr - R

I wanted to use dplyr instead of apply() (with MARGIN = 1) in order to filter a dataset rowwise according to a logical expression, i.e. for this example I'd like to remove all rows that have one or more values of 99.
However, I was surprised by the poor performance of dplyr. Any ideas on how to speed this up in dplyr? Also, I would have thought that the rowwise function would pipe the individual rows, but apparently not (see below). How can I use the rowwise function?
library(tidyverse)
s <- tibble(rows = seq(from = 250, to = 5000, by = 250)) # my original dataset has 400K rows...
s$num <- map(s$rows, ~ rnorm(.x * 6))
s$num <- map(s$num, ~ replace(.x, sample(1:length(.x), size = length(.x) / 20), 99))
s$mat <- map(s$num, ~ as_data_frame(matrix(.x, ncol = 6)))
help_an <- function(vec) {
  # browser()  # uncomment to pause here and inspect what is actually passed in
  return(!any(vec == 99))
}
help_dp_t <- function(df) {
  clo1 <- proc.time()
  a <- as_data_frame(t(df)) %>% summarise_all(help_an)
  df2 <- filter(df, t(a)[, 1])
  b <- tibble(time = (proc.time() - clo1)[3], df = list(df2))
  return(b)
}
s$dplyr <- map(s$mat, ~ dplyr::mutate(help_dp_t(.x)))
help_lap <- function(df) {
  clo1 <- proc.time()
  a_base <- df[apply(df, 1, function(x) !any(x == 99)), ]
  b <- tibble(time = (proc.time() - clo1)[3], df = list(a_base))
  return(b)
}
s$lapply <- map(s$mat, ~ mutate(help_lap(.x)))
s$equal_dplyr_lapply <- map2_lgl(s$dplyr, s$lapply, ~ all.equal(.x$df, .y$df))
s$dplyr_time <- map_dbl(s$dplyr, "time")
s$lapply_time <- map_dbl(s$lapply, "time")

ggplot(gather(s, ... = c(7, 8)), aes(x = rows, y = value, color = key)) +
  geom_line()
I tried the following with rowwise, but the rowwise pipe sends the entire df to the help_an function rather than a single row vector.
help_dp_r <- function(df) {
  clo1 <- proc.time()
  df2 <- df %>%
    rowwise() %>%
    mutate(cond = help_an(.)) ### . is not passed on as a vector, but as the entire df??
  b <- tibble(time = (proc.time() - clo1)[3], df = list(df2))
}
s$dplyr_r <- map(s$mat, ~ dplyr::mutate(help_dp_r(.x)))
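For what it's worth, the row-wise check itself can be vectorised, which avoids both rowwise() and the transpose trick. A minimal sketch (help_vec is a hypothetical helper name; it assumes all columns are numeric, as in the simulated data above):
help_vec <- function(df) {
  # keep rows that contain no 99 anywhere; df == 99 yields a logical matrix,
  # so rowSums() checks every row in one vectorised pass
  filter(df, rowSums(df == 99) == 0)
}
In dplyr 1.0+ a genuinely row-wise version can also be written as df %>% rowwise() %>% filter(!any(c_across(everything()) == 99)), though the vectorised form above is usually much faster.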

Related

Optimizing large db merge using split() function

I need to perform a conceptually straightforward double left-merge followed by a simple series of matching functions (see: Straightforward Solution below). However, given that the databases I have to merge are large, I tried to unpack the merging procedure with a for-loop, which does the trick but is inefficient to say the least (see: For-loops Solution below).
Is there a solution based on splitting and naming at least the largest db?
Below there is a toy example.
For reference, in my data:
db_m1 ~50k lines (for ~5k unique m1)
db_m2 ~25k lines (for ~5k unique m1 and m2)
db_p ~100m lines
set.seed(0)
db_m1 <- data.frame(
  y = rep(1, 20),
  id = sort(rep(paste0("id_", c(letters[1:4])), 5)),
  m1 = rep(c(1, 2), 10),
  x1 = sample(LETTERS, 20, TRUE),
  x2 = sample(LETTERS, 20, TRUE))

set.seed(0)
db_m2 <- data.frame(
  y = rep(1, 20),
  m1 = sample(c(1:5), 20, TRUE),
  m2 = sample(c(6:10), 20, TRUE))

set.seed(0)
db_p <- data.frame(
  m2 = sample(c(6:10), 100, TRUE),
  y1 = sample(LETTERS, 100, TRUE),
  y2 = sample(LETTERS, 10, TRUE))
Straightforward Solution:
final_dplyr <- db_m1 %>%
  dplyr::left_join(db_m2) %>%
  dplyr::left_join(db_p) %>%
  dplyr::mutate(match_1 = ifelse(x1 == y1 | x1 == y2, 1, 0),
                match_2 = ifelse(x2 == y1 | x2 == y2, 1, 0),
                sum_matches = mapply(sum, match_1, match_2),
                final_1 = ifelse(as.numeric(sum_matches) >= 1, 1, 0),
                final_2 = ifelse(as.numeric(sum_matches) >= 2, 1, 0)) %>%
  group_by(id, m2) %>%
  dplyr::mutate(n_p = n(),
                n_p = ifelse(all(is.na(y1)), NA, n_p)) %>%
  group_by(y, id, m1, m2, n_p) %>%
  dplyr::summarise(match_1 = sum(match_1, na.rm = T),
                   match_2 = sum(match_2, na.rm = T),
                   final_1 = sum(final_1),
                   final_2 = sum(final_2))
For-loops Solution:
fn_final <- function(db_m1, db_m2, db_p) {
  matches_final <- vector("list", length = length(unique(db_m1$y)))
  for (i in 1:length(unique(db_m1$y))) {
    matches <- vector("list", length = length(unique(db_m1$m1)))
    for (j in 1:length(unique(db_m1$m1))) {
      temp_db_m1 <- db_m1 %>% dplyr::filter(y == unique(db_m1$y)[i], m1 == unique(db_m1$m1)[j])
      temp_db_m2 <- db_m2 %>% dplyr::filter(y == unique(db_m1$y)[i], m1 == unique(db_m1$m1)[j])
      m_vector <- unique(temp_db_m2$m2)
      temp_db_p <- db_p %>%
        dplyr::filter(m2 %in% m_vector)
      final <- db_m1 %>%
        dplyr::left_join(db_m2) %>%
        dplyr::left_join(db_p) %>%
        dplyr::mutate(match_1 = ifelse(x1 == y1 | x1 == y2, 1, 0),
                      match_2 = ifelse(x2 == y1 | x2 == y2, 1, 0),
                      sum_matches = mapply(sum, match_1, match_2),
                      final_1 = ifelse(as.numeric(sum_matches) >= 1, 1, 0),
                      final_2 = ifelse(as.numeric(sum_matches) >= 2, 1, 0)) %>%
        group_by(id, m2) %>%
        dplyr::mutate(n_p = n(),
                      n_p = ifelse(all(is.na(y1)), NA, n_p)) %>%
        group_by(y, id, m1, m2, n_p) %>%
        dplyr::summarise(match_1 = sum(match_1, na.rm = T),
                         match_2 = sum(match_2, na.rm = T),
                         final_1 = sum(final_1),
                         final_2 = sum(final_2))
      matches[[j]] <- final
    }
    matches_all <- do.call(rbind, matches)
    matches_final[[i]] <- matches_all
  }
  final <- do.call(rbind, matches_final) %>%
    dplyr::filter(!is.na(n_p)) %>%
    unique()
  return(final)
}
final_for <- fn_final(db_m1, db_m2, db_p)
Below is a possible solution; should it be optimized further?
db_m1_s <- split(db_m1, f = list(db_m1$y,db_m1$m1))
db_m2_s <- split(db_m2, f = list(db_m2$y,db_m2$m1))
db_p_s <- split(db_p, f = list(db_p$m2))
match_fn <- function(temp_db_m1, temp_db_m2, temp_db_p) {
  final <- temp_db_m1 %>%
    dplyr::left_join(temp_db_m2) %>%
    dplyr::left_join(temp_db_p) %>%
    dplyr::mutate(match_1 = ifelse(x1 == y1 | x1 == y2, 1, 0),
                  match_2 = ifelse(x2 == y1 | x2 == y2, 1, 0),
                  sum_matches = mapply(sum, match_1, match_2),
                  final_1 = ifelse(as.numeric(sum_matches) >= 1, 1, 0),
                  final_2 = ifelse(as.numeric(sum_matches) >= 2, 1, 0)) %>%
    group_by(id, m2) %>%
    dplyr::mutate(n_p = n(),
                  n_p = ifelse(all(is.na(y1)), NA, n_p)) %>%
    group_by(y, id, m1, m2, n_p) %>%
    dplyr::summarise(match_1 = sum(match_1, na.rm = T),
                     match_2 = sum(match_2, na.rm = T),
                     final_1 = sum(final_1),
                     final_2 = sum(final_2))
  return(final)
}
fn_final <- function(db_m1, db_m1_s, db_m2_s, db_p_s) {
  m <- names(db_m1_s)
  matches_1 <- vector("list", length = length(m))
  for (i in 1:length(m)) {
    temp_db_m1 <- db_m1_s[[m[i]]]
    temp_db_m2 <- db_m2_s[[m[i]]]
    n <- as.character(sort(unique(temp_db_m2$m2)))
    matches_2 <- vector("list", length = length(n))
    for (j in 1:length(n)) {
      temp_db_p <- db_p_s[[n[j]]]
      final <- match_fn(temp_db_m1, temp_db_m2, temp_db_p)
      matches_2[[j]] <- final
    }
    matches_all <- do.call(rbind, matches_2)
    matches_1[[i]] <- matches_all
  }
  matches_0 <- do.call(rbind, matches_1) %>%
    dplyr::filter(!is.na(n_p)) %>%
    unique()
  return(matches_0)
}
final_for <- fn_final(db_m1, db_m1_s, db_m2_s, db_p_s)
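As a rough sketch of the same split-apply-combine idea without explicit for-loops (assuming the split lists db_m1_s, db_m2_s and db_p_s defined above), the two nested loops can also be expressed as nested lapply() calls:
# one result per y/m1 split; the inner lapply walks the m2 keys present in that split
results <- lapply(names(db_m1_s), function(nm) {
  temp_db_m1 <- db_m1_s[[nm]]
  temp_db_m2 <- db_m2_s[[nm]]
  keys <- as.character(sort(unique(temp_db_m2$m2)))
  do.call(rbind, lapply(keys, function(k) match_fn(temp_db_m1, temp_db_m2, db_p_s[[k]])))
})
final_lapply <- do.call(rbind, results) %>%
  dplyr::filter(!is.na(n_p)) %>%
  unique()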

Processing a data frame in R by subgroup: is it possible to get rid of the 'for' loop?

I frequently work with data frames and have to run some sophisticated data wrangling / manipulations by subgroup that is defined in one of the columns. I am aware of dplyr and group_by and know that many things could be solved using group_by. However, often I have to do some pretty intricate calculations and end up just using the 'for' loop.
I was wondering about the existence of some other general approach or paradigm that is faster/more elegant. Maybe map (that I am not very familiar with)?
Below is an example. Notice - it is fake and meaningless. So let's ignore why I need to do those things or the fact that there could be 2 consecutive NAs in a column, etc. That's not the focus of my question. The point is that often I have to operate "within the constraints of a subgroup" and then - inside that subgroup - I have to do operations columnwise, rowwise and sometimes even cellwise.
I also realize that I could probably put most of that code inside a function, split my data frame into a list based on 'group', apply this function to each element of that list and then do.call(rbind...) at the end. But is this the only way?
Thanks a lot for any hints!
library(dplyr)
library(forcats)

set.seed(123)
x <- tibble(group = c(rep('a', 10), rep('b', 10), rep('c', 10)),
            attrib = c(sample(c("one", "two", "three", "four"), 10, replace = T),
                       sample(c("one", "two", "three"), 10, replace = T),
                       sample(c("one", "three", "four"), 10, replace = T)),
            v1 = sample(c(1:5, NA), 30, replace = T),
            v2 = sample(c(1:5, NA), 30, replace = T),
            v3 = sample(c(1:5, NA), 30, replace = T),
            n1 = abs(rnorm(30)), n2 = abs(rnorm(30)), n3 = abs(rnorm(30)))

v_vars = paste0("v", 1:3)
n_vars = paste0("n", 1:3)
results <- NULL # placeholder for final results

for (i in seq(length(unique(x$group)))) { # loop through groups
  mygroup <- unique(x$group)[i]
  mysubtable <- x %>% filter(group == mygroup)

  # IMPUTE NAs in v columns:
  # replace every NA with a mean of the values above and below it; and if it's the first or
  # the last value, with the mean of the 2 values below or above it.
  for (v in v_vars) { # loop through v columns
    which_nas <- which(is.na(mysubtable[[v]])) # index of NAs for column v
    if (length(which_nas) == 0) next else {
      for (na in which_nas) { # loop through indexes of column values that are NAs
        if (na == 1) {
          mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na + 1],
                                        mysubtable[[v]][na + 2]), na.rm = TRUE)
        } else if (na == nrow(mysubtable)) {
          mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na - 2],
                                        mysubtable[[v]][na - 1]), na.rm = TRUE)
        } else {
          mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na - 1],
                                        mysubtable[[v]][na + 1]), na.rm = TRUE)
        }
      } # end of loop through NA indexes
    } # end of else
  } # end of loop through v vars

  # Aggregate v columns (mean) for each value of column 'attrib'
  result1 <- mysubtable %>% group_by(attrib) %>%
    summarize_at(v_vars, mean)

  # Aggregate n columns (sum) for each value of column 'attrib'
  result2 <- mysubtable %>% group_by(attrib) %>%
    summarize_at(n_vars, sum)

  # final result should contain the name of the group
  results[[i]] <- cbind(mygroup, result1, result2[-1])
}
results <- do.call(rbind, results)
Maybe this example is too simple, but in this case, the only thing you need to pull out is the imputation.
my_impute <- function(x) {
  which_nas <- which(is.na(x))
  for (na in which_nas) {
    if (na == 1) {
      x[na] <- mean(c(x[na + 1], x[na + 2]), na.rm = TRUE)
    } else if (na == length(x)) {
      x[na] <- mean(c(x[na - 2], x[na - 1]), na.rm = TRUE)
    } else {
      x[na] <- mean(c(x[na - 1], x[na + 1]), na.rm = TRUE)
    }
  }
  x
}
Then you just need to group appropriately and impute and summarize.
x2 <- x %>% group_by(group) %>% mutate_at(v_vars, my_impute) %>%
group_by(group, attrib)
full_join(x2 %>% summarize_at(v_vars, mean),
x2 %>% summarize_at(n_vars, sum))
My usual method for things like this, where similar calculations need to be run on a bunch of columns, is to put the data in long format. Here it feels a little like the long way round, but perhaps it is useful to see.
library(tidyr) # for gather/separate/spread
x %>% mutate(row = 1:n()) %>% gather("variable", "value", c(v_vars, n_vars)) %>%
  separate(variable, c("var", "x"), sep = 1) %>% spread(var, value) %>%
  arrange(group, x, row) %>% group_by(group, x) %>%
  mutate(v = my_impute(v)) %>% group_by(group, attrib, x) %>%
  summarize(v = mean(v), n = sum(n)) %>%
  gather("var", "value", v, n) %>% mutate(X = paste0(var, x)) %>%
  select(-x, -var) %>% spread(X, value)
More generally, split-apply-combine is probably the way to go, as you suggest in your question; here's a way using the tidyverse.
doX <- function(x) {
  x2 <- x %>% mutate_at(v_vars, my_impute) %>% group_by(attrib)
  full_join(x2 %>% summarize_at(v_vars, mean),
            x2 %>% summarize_at(n_vars, sum))
}

x %>% group_by(group) %>% nest() %>%
  mutate(result = map(data, doX)) %>% select(-data) %>% unnest()
The more traditional method is with do.call, split, and rbind; here I don't make the effort to keep the group information.
do.call(rbind, lapply(split(x, x$group), doX))
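If the group names should be kept, one possible variant (assuming dplyr is attached, since bind_rows() can label the pieces via .id) is:
# split() returns a named list, so bind_rows() can record which group each row came from
dplyr::bind_rows(lapply(split(x, x$group), doX), .id = "group")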
The first thing to do is to change your data imputing into a function. I made some simple modifications to have it accept a vector and simplified the call to mean.
fx_na_rm <- function(z) {
  which_nas <- which(is.na(z))
  if (length(which_nas) > 0) {
    for (na in which_nas) { # loop through indexes of column values that are NAs
      if (na == 1) {
        z[na] <- mean(z[na + (1:2)], na.rm = TRUE)
      } else if (na == length(z)) { # was nrow(mysubtable); the function now works on a vector
        z[na] <- mean(z[na - (1:2)], na.rm = TRUE)
      } else {
        z[na] <- mean(z[c(na - 1, na + 1)], na.rm = TRUE)
      }
    } # end of loop through NA indexes
  }
  return(z)
}
I like data.table, so here's a solution that uses it. Since you apply different functions to the n and v variable groups, most purrr (or other) solutions will also look a little awkward.
library(data.table)

dt <- copy(as.data.table(x))
v_vars = paste0("v", 1:3)
n_vars = paste0("n", 1:3)

dt[, (v_vars) := lapply(.SD, as.numeric), .SDcols = v_vars]
dt[, (v_vars) := lapply(.SD, fx_na_rm), by = group, .SDcols = v_vars]

# see https://stackoverflow.com/questions/50626316/r-data-table-apply-function-a-to-some-columns-and-function-b-to-some-others
scols <- list(v_vars, n_vars)
funs <- rep(c(mean, sum), lengths(scols))
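# apply mean() to the v columns and sum() to the n columns, per group/attrib, in one call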
dt[, setNames(Map(function(f, x) f(x), funs, .SD), unlist(scols))
   , by = .(group, attrib)
   , .SDcols = unlist(scols)]
The for loop itself is difficult to vectorize because the results can depend on earlier iterations. Here is my attempt, which does not produce output identical to yours:
# not identical
fx_na_rm2 <- function(z) {
  which_nas <- which(is.na(z))
  if (length(which_nas) > 0) {
    ind <- c(rbind(which_nas - 1 + 2 * (which_nas == 1) + -1 * (which_nas == length(z)),
                   which_nas + 1 + 1 * (which_nas == 1) + -2 * (which_nas == length(z))))
    z[which_nas] <- colMeans(matrix(z[ind], nrow = 2), na.rm = T)
  }
  return(z)
}
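A small check on a toy vector (values chosen purely for illustration) shows where the two versions can disagree: the loop version uses already-imputed neighbours, while the vectorised version always works from the original values.
z <- c(NA, 2, NA, 4, NA)
fx_na_rm(z)  # 2.0 2.0 3.0 4.0 3.5  (the last NA sees the already-imputed z[3])
fx_na_rm2(z) # 2.0 2.0 3.0 4.0 4.0  (computed from the original values only)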

Apply a function to multiple datasets using lapply

I have a large number of datasets for which I want to create the same variable. I would like to create a function to avoid having to repeat the same code many times.
I tried the code below: the first 3 lines describe the creation of the variable that I am trying to apply through the function created below.
data1 <- data1 %>%
  dplyr::group_by(id) %>%
  dplyr::mutate(new_var = sum(score))

list_data <- c(data1, data2, data3)

my_func <- function(x) {
  x <- x %>%
    dplyr::group_by(id) %>%
    dplyr::mutate(new_var = sum(score))
}

lapply(list_data, my_func)
I obtain the error message:
no applicable method for 'group_by' applied to an object of class "character"
Could you please help me figure this out?
For me this works fine (note that list_data is built with list() rather than c()):
library(dplyr) # for %>%, group_by and mutate

my_func <- function(x) {
  x <- x %>%
    dplyr::group_by(id) %>%
    dplyr::mutate(new_var = sum(score))
}

data1 <- data.frame(id = rep(1:3, each = 3), score = 1:9)
data2 <- data.frame(id = rep(1:3, each = 3), score = 11:19)
data3 <- data.frame(id = rep(1:3, each = 3), score = 21:29)

list_data <- list(data1, data2, data3)
lapply(list_data, my_func)
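The likely culprit (my reading of the error, not something stated in the original post) is the use of c() rather than list() when collecting the data frames: c() flattens them into one long list of column vectors, so lapply() hands character columns to group_by(). A quick way to see the difference:
str(c(data1, data2), max.level = 1)    # 4 separate column vectors
str(list(data1, data2), max.level = 1) # 2 intact data frames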

Diff-in-diff estimation with resampling from large dataset

I have a large dataset on which to perform a diff-in-diff estimation. Given the size of the dataset, my t-statistics are inflated and the coefficients come out (spuriously) statistically significant.
I want to reduce the number of elements in the database step by step and, for each step, resample a large number of times, re-estimating the interaction coefficient and its standard error each time.
Then I want to take the average estimates and standard errors and plot them on a graph, to show at what point (if any) they stop being statistically different from zero.
My code follows with a toy example.
I am not sure this is the most efficient way to tackle the problem.
I cannot retrieve (and thus plot) the confidence interval.
I am not sure the sampling is representative given the existence of different groups.
Toy example (credit: Torres-Reyna, 2015):
library(foreign)
library(dplyr)
library(ggplot2)

# load the Panel101 toy data and build the diff-in-diff variables
mydata <- read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata$time <- ifelse(mydata$year >= 1994, 1, 0)
mydata$treated <- ifelse(mydata$country %in% c("E", "F", "G"), 1, 0)
mydata$did <- mydata$time * mydata$treated

df_0 <- NULL
for (i in 1:length(seq(5, nrow(mydata) - 1, 5))) {
  index <- seq(5, nrow(mydata), 5)[i]
  df_1 <- NULL
  for (j in 1:10) {
    mydata_temp <- mydata[sample(nrow(mydata), index), ]
    didreg = lm(y ~ treated + time + did, data = mydata_temp)
    out <- summary(didreg)
    new_line <- c(out$coefficients[, 1][4], out$coefficients[, 2][4], index)
    new_line <- data.frame(t(new_line))
    names(new_line) <- c("c", "s", "i")
    df_1 <- rbind(df_1, new_line)
  }
  df_0 <- rbind(df_0, df_1)
}
df_0 <- df_0 %>% group_by(i) %>% summarise(coefficient <- mean(c, na.rm = T),
                                           standard_error <- mean(s, na.rm = T))
names(df_0) <- c("i", "c", "s")
View(df_0)
Consider the following refactored code using base R functions: within, %in%, nested lapply, setNames, aggregate, and do.call. This approach avoids calling rbind inside a loop and rewrites the code compactly without repeated $ column referencing.
library(foreign)
mydata = read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata <- within(mydata, {
  time <- ifelse(year >= 1994, 1, 0)
  treated <- ifelse(country %in% c("E", "F", "G"), 1, 0)
  did <- time * treated
})

# OUTER LIST OF DATA FRAMES
df_0_list <- lapply(1:length(seq(5, nrow(mydata) - 1, 5)), function(i) {
  index <- seq(5, nrow(mydata), 5)[i]

  # INNER LIST OF DATA FRAMES
  df_1_list <- lapply(1:100, function(j) {
    mydata_temp <- mydata[sample(nrow(mydata), index), ]
    didreg <- lm(y ~ treated + time + did, data = mydata_temp)
    out <- summary(didreg)
    new_line <- c(out$coefficients[, 1][4], out$coefficients[, 2][4], index)
    new_line <- setNames(data.frame(t(new_line)), c("c", "s", "i"))
  })

  # APPEND ALL INNER DFS
  df <- do.call(rbind, df_1_list)
  return(df)
})

# APPEND ALL OUTER DFS
df_0 <- do.call(rbind, df_0_list)

# AGGREGATE WITH NEW COLUMNS
df_0 <- within(aggregate(cbind(c, s) ~ i, df_0, function(x) mean(x, na.rm = TRUE)), {
  upper = c + s
  lower = c - s
})

# RUN PLOT
within(df_0, {
  plot(i, c, ylim = c(min(c) - 5000000000, max(c) + 5000000000), type = "l",
       cex.lab = 0.75, cex.axis = 0.75, cex.main = 0.75, cex.sub = 0.75)
  polygon(c(i, rev(i)), c(lower, rev(upper)),
          col = "grey75", border = FALSE)
  lines(i, c, lwd = 2)
})
In the end I solved it like this (is this the most efficient way?):
library(foreign)
library(dplyr)

mydata = read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata$time = ifelse(mydata$year >= 1994, 1, 0)
mydata$treated = ifelse(mydata$country == "E" |
                        mydata$country == "F" |
                        mydata$country == "G", 1, 0)
mydata$did = mydata$time * mydata$treated

df_0 <- NULL
for (i in 1:length(seq(5, nrow(mydata) - 1, 5))) {
  index <- seq(5, nrow(mydata), 5)[i]
  df_1 <- NULL
  for (j in 1:100) {
    mydata_temp <- mydata[sample(nrow(mydata), index), ]
    didreg = lm(y ~ treated + time + did, data = mydata_temp)
    out <- summary(didreg)
    new_line <- c(out$coefficients[, 1][4], out$coefficients[, 2][4], index)
    new_line <- data.frame(t(new_line))
    names(new_line) <- c("c", "s", "i")
    df_1 <- rbind(df_1, new_line)
  }
  df_0 <- rbind(df_0, df_1)
}

df_0 <- df_0 %>% group_by(i) %>% summarise(c = mean(c, na.rm = T),
                                           s = mean(s, na.rm = T))
df_0 <- df_0 %>% group_by(i) %>% mutate(upper = c + s, lower = c - s)
df <- df_0

plot(df$i, df$c, ylim = c(min(df_0$c) - 5000000000, max(df_0$c) + 5000000000), type = "l")
polygon(c(df$i, rev(df$i)), c(df$lower, rev(df$upper)), col = "grey75", border = FALSE)
lines(df$i, df$c, lwd = 2)
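Since ggplot2 is already loaded in the toy example, the band could also be drawn with a ribbon; a minimal sketch, assuming df_0 has the columns i, c, upper and lower built above:
library(ggplot2)
ggplot(df_0, aes(x = i, y = c)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = "grey75") +
  geom_line()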

Apply different data to a function in R

I have the following data frame:
library(tidyverse)
set.seed(1234)
df <- data.frame(
  x = seq(1, 100, 1),
  y = rnorm(100)
)
Where I apply a smooth spline using different knots:
nknots <- seq(4, 15, 1)
output <- map(nknots, ~ smooth.spline(x = df$x, y = df$y, nknots = .x))
What I need to do now is to apply the same function using 2-point and 3-point averages:
df_2 <- df %>%
  group_by(., x = round(.$x/2)*2) %>%
  summarise_all(funs(mean))

df_3 <- df %>%
  group_by(., x = round(.$x/3)*3) %>%
  summarise_all(funs(mean))
In summary, I need to apply the function I used in output with the following data frames:
df
df_2
df_3
Of course, this is a minimal example, so I am looking for an efficient way of doing it. Preferably with the purrr package.
Using lapply and the zoo library to calculate the moving average in a simpler, more elegant manner:
library(zoo)

lapply(1:3, function(roll) {
  dftemp <- as.data.frame(rollmean(df, roll))
  map(nknots, ~ smooth.spline(x = dftemp$x, y = dftemp$y, nknots = .x))
})
Here's one possible solution:
library(tidyverse)

set.seed(1234)
df <- data.frame(x = seq(1, 100, 1),
                 y = rnorm(100))

# function to get v-point averages
GetAverages = function(v) {
  df %>%
    group_by(., x = round(.$x/v)*v) %>%
    summarise_all(funs(mean))
}
# specify number of knots
nknots <- seq(4, 15, 1)
dt_res = tibble(v = 1:3) %>%                     # specify v-point averages
  mutate(d = map(v, GetAverages)) %>%            # get data for each v-point
  crossing(., data.frame(nknots = nknots)) %>%   # combine each dataset with a knot
  mutate(res = map2(d, nknots, ~ smooth.spline(x = .x$x, y = .x$y, nknots = .y))) # apply smooth spline
You can use dt_res$res[dt_res$v == 1] to see all results for your original dataset, dt_res$res[dt_res$v == 2] to see results for your 2-point estimate, etc.
