Apply statistical test to many variables: improve speed - r

I have a dataframe with 40 rows and ~40000 columns. The 40 rows are split into group "A" and group "B" (20 each). For each column, I would like to apply a statistical test (wilcox.test()) comparing the two groups. I started using a for loop to run through the 40000 columns but it was very slow.
Minimal Reproducible Example (MRE):
library(tidyverse)
set.seed(123)
metrics <- paste("metric_", 1:40000, sep = "")
patient_IDs <- paste("patientID_", 1:40, sep = "")
m <- matrix(sample(1:20, 1600000, replace = TRUE), ncol = 40000, nrow = 40,
            dimnames = list(patient_IDs, metrics))
test_data <- as.data.frame(m)
test_data$group <- c(rep("A", 20), rep("B", 20))
# Collate list of metrics to analyse ("check") for significance
list_to_check <- colnames(test_data)[1:40000]
Original 'loop' method (this is what I want to vectorise):
# Create a variable to store the results
results_A_vs_B <- c()
# Loop through the "list_to_check" and,
# for each 'value', compare group A with group B
# and load the results into the "results_A_vs_B" variable
for (i in list_to_check) {
  outcome <- wilcox.test(test_data[test_data$group == "A", ][[i]],
                         test_data[test_data$group == "B", ][[i]],
                         exact = FALSE)
  if (!is.nan(outcome$p.value) && outcome$p.value <= 0.05) {
    results_A_vs_B[i] <- paste(outcome$p.value, sep = "\t")
  }
}
# Format the results into a dataframe
summarised_results_A_vs_B <- as.data.frame(results_A_vs_B) %>%
  rownames_to_column(var = "A vs B") %>%
  rename("Wilcox Test P-value" = "results_A_vs_B")
Benchmarking the answers so far:
# Ronak Shah's "Map" approach
Map_func <- function(dataset, list_to_check) {
  tmp <- split(dataset[list_to_check], dataset$group)
  stack(Map(function(x, y) wilcox.test(x, y, exact = FALSE)$p.value, tmp[[1]], tmp[[2]]))
}
# @Onyambu's data.table method
dt_func <- function(dataset, list_to_check) {
  melt(setDT(dataset), measure.vars = list_to_check)[
    , dcast(.SD, rowid(group) + variable ~ group)][
    , wilcox.test(A, B, exact = FALSE)$p.value, variable]
}
# @Park's dplyr method (with some minor tweaks)
dplyr_func <- function(dataset, list_to_check) {
  dataset %>%
    summarise(across(all_of(list_to_check),
                     ~ wilcox.test(.x ~ group, exact = FALSE)$p.value)) %>%
    pivot_longer(cols = everything(),
                 names_to = "Metrics",
                 values_to = "Wilcox Test P-value")
}
library(microbenchmark)
res_map <- microbenchmark(Map_func(test_data, list_to_check), times = 10)
res_dplyr <- microbenchmark(dplyr_func(test_data, list_to_check), times = 2)
library(data.table)
res_dt <- microbenchmark(dt_func(test_data, list_to_check), times = 10)
autoplot(rbind(res_map, res_dt, res_dplyr))
# Excluding dplyr
autoplot(rbind(res_map, res_dt))
--
Running the code on a server took a couple of seconds longer, but the difference between Map and data.table was more pronounced (laptop = 4 cores, server = 8 cores):
autoplot(rbind(res_map, res_dt))
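Neither the Map nor the data.table version parallelises wilcox.test itself, so the extra server cores are largely idle for this step. If you do want to use them, one option is to spread the columns across workers. This is a minimal sketch of my own (par_func is a hypothetical helper, not from the answers; it assumes a Unix-like machine since parallel::mclapply relies on forking):
library(parallel)
# Hypothetical helper: run the column-wise tests on several cores at once
par_func <- function(dataset, list_to_check, cores = 4) {
  A <- dataset[dataset$group == "A", ]
  B <- dataset[dataset$group == "B", ]
  p_values <- mclapply(list_to_check,
                       function(x) wilcox.test(A[[x]], B[[x]], exact = FALSE)$p.value,
                       mc.cores = cores)
  data.frame(metric = list_to_check, p_value = unlist(p_values))
}
# res_par <- par_func(test_data, list_to_check, cores = 4)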

Here is another option -
Map_approach <- function(dataset, list_to_check) {
  tmp <- split(dataset[list_to_check], dataset$group)
  stack(Map(function(x, y) wilcox.test(x, y)$p.value, tmp[[1]], tmp[[2]]))
}
Map_approach(data_subset, list_to_check)
# values ind
#1 5.359791e-05 value_1
#2 5.499685e-08 value_2
#3 1.503951e-06 value_3
#4 6.179352e-08 value_4
#5 5.885650e-08 value_5
Testing it on a larger sample, Map is slightly faster than the for loop.
n <- 1e6
data_subset <- data.frame(patient_ID = 1:n,
                          group = c(rep("A", n/2), rep("B", n/2)),
                          value_1 = c(sample(1:10, n/2, replace = TRUE),
                                      sample(5:15, n/2, replace = TRUE)),
                          value_2 = c(sample(1:5, n/2, replace = TRUE),
                                      sample(15:(n/2), n/2, replace = TRUE)),  # parentheses needed: 15:n/2 means (15:n)/2
                          value_3 = c(sample(1:12, n/2, replace = TRUE),
                                      sample(8:17, n/2, replace = TRUE)),
                          value_4 = c(sample(5:10, n/2, replace = TRUE),
                                      sample(15:25, n/2, replace = TRUE)),
                          value_5 = c(sample(20:40, n/2, replace = TRUE),
                                      sample(10:15, n/2, replace = TRUE)))
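The wilcox_loop() used in the benchmark below is not shown in the thread; it is presumably the question's loop wrapped into a function. A minimal sketch of such a wrapper (my assumption, returning all p-values rather than only the significant ones so the two outputs are comparable) could be:
# Assumed wrapper around the question's loop (not shown in the original post)
wilcox_loop <- function(dataset, list_to_check) {
  results <- c()
  for (i in list_to_check) {
    outcome <- wilcox.test(dataset[dataset$group == "A", ][[i]],
                           dataset[dataset$group == "B", ][[i]],
                           exact = FALSE)
    results[i] <- outcome$p.value
  }
  results
}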
microbenchmark::microbenchmark(loop = wilcox_loop(data_subset, list_to_check),
                               Map = Map_approach(data_subset, list_to_check))
#Unit: seconds
# expr      min       lq     mean   median       uq      max neval cld
# loop 5.254573 5.631162 5.788624 5.734480 5.920424 6.756319   100   b
#  Map 4.710790 5.084783 5.201711 5.160722 5.309048 5.721540   100  a

Could you try this code? It's slightly faster on my computer.
wilcox_loop2 <- function(data_subset, list_to_check) {
  A <- data_subset[data_subset$group == "A", ]
  B <- data_subset[data_subset$group == "B", ]
  outcome <- sapply(list_to_check, function(x) wilcox.test(A[[x]], B[[x]],
                                                           exact = FALSE)$p.value)
  as.data.frame(outcome) %>%
    rownames_to_column(var = "A vs B") %>%
    rename("Wilcox Test P-value" = "outcome")
}
I'm not sure whether splitting the data into A and B up front like this is acceptable for your use case...
My timings look like this:
microbenchmark::microbenchmark(origin = wilcox_loop(data_subset, list_to_check),
                               test = wilcox_loop2(data_subset, list_to_check))
Unit: milliseconds
   expr      min       lq     mean   median       uq     max neval cld
 origin 4.815601 5.006951 6.490757 5.385502 6.790752 21.5876   100   b
   test 3.817801 4.116151 5.146963 4.330500 4.870651 15.8271   100  a
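Another route worth mentioning for the full 40,000-column problem is the matrixTests package, which, as far as I know, provides a vectorised two-sample Wilcoxon test via row_wilcoxon_twosample(). Treat the sketch below as an assumption about that API rather than a benchmarked answer; the metrics need to be rows, so the data is transposed first:
library(matrixTests)  # assumed available on CRAN
A <- t(as.matrix(test_data[test_data$group == "A", list_to_check]))
B <- t(as.matrix(test_data[test_data$group == "B", list_to_check]))
res <- row_wilcoxon_twosample(A, B, exact = FALSE)
head(res$pvalue)  # one p-value per metric (column name assumed)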

Related

Modify Moving Average calc from looping to a vector

I'm working on a moving-average calculator that works as intended but takes a long time to run because it currently loops through the formula instead of being vectorized. The data set is about ~16,000 rows. The MA formula filters out the top quartile of values appearing in the previous 45 days of usd_price. Any tips/changes to get this running more efficiently as a vector calc?
The dput output is:
> dput(data)
structure(list(loan_price = c(50000, 60000, 40000, 35000, 1e+05,
95000), cad_price = c(62500, 75000, 50000, 43750, 125000, 118750
), day = structure(c(1642118400, 1641772800, 1639958400, 1639785600,
1638316800, 1640995200), tzone = "UTC", class = c("POSIXct", "POSIXt")), fourtyfive_avg = c(251435.529507523, 251435.529507523,
251435.529507523, 251435.529507523, 251435.529507523, 251435.529507523
), Loan = c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
library(readxl)
library(tidyverse)
library(zoo)
library(dplyr)
data <- data %>% mutate(day = lubridate::parse_date_time(day, "ymd"))
myfunc <- function(x){
  fourtyfive_days <- as.Date(x - ddays(45))
  data <- data %>%
    filter(day <= x) %>%
    filter(day >= fourtyfive_days) %>%
    filter(loan_price <= quantile(loan_price, probs = 0.75)) %>%
    summarize(fourtyfive_avg = mean(loan_price))
  return(data$fourtyfive_avg)
}
data$fourtyfive_avg <- sapply(data$day, simplify = TRUE, FUN = myfunc)
dplyr verbs carry noticeable per-call overhead, so when a function like this is applied row by row via sapply(), moving away from tidy functions improves the speed significantly. You could also try the data.table package, but 16,000 rows isn't a lot of data, so I'm not sure how necessary that is; I typically only use it when the number of rows in my data is in the millions. You also have to account for the time it takes to build the data.table and set the key, so I've included that in the benchmark below.
The functions we'll be comparing:
library(data.table)
library(tidyverse)
library(lubridate)
library(microbenchmark)
## dplyr
myfunc <- function(x){
  fourtyfive_days <- as.Date(x - ddays(45))
  data <- data %>%
    filter(day <= x) %>%
    filter(day >= fourtyfive_days) %>%
    filter(loan_price <= quantile(loan_price, probs = 0.75)) %>%
    summarize(fourtyfive_avg = mean(loan_price))
  return(data$fourtyfive_avg)
}
# data$fourtyfive_avg <- sapply(data$day, simplify = TRUE, FUN = myfunc)
## base
mybasefunc <- function(x){
  fortyfive_days <- as.Date(x - ddays(45))
  data <- data[data$day >= fortyfive_days & data$day <= x, ]
  q75 <- quantile(data$loan_price, probs = 0.75)
  data <- data[data$loan_price <= q75, ]
  fortyfive_avg <- mean(data$loan_price)
  return(fortyfive_avg)
}
# data$fortyfive_avg <- sapply(data$day, simplify = TRUE, FUN = mybasefunc)
## data.table
# dat <- data.table(data, key = c("day", "loan_price"))
mydtfunc <- function(x){
  fortyfive_days <- as.Date(x - ddays(45))
  dat <- dat[day >= fortyfive_days & day <= x]
  q75 <- quantile(dat$loan_price, probs = 0.75)
  dat <- dat[dat$loan_price <= q75]
  fortyfive_avg <- mean(dat$loan_price)
  return(fortyfive_avg)
}
# dat[ , fortyfive_avg := sapply(day, mydtfunc), ]
And benchmarking:
set.seed(1)
m <- microbenchmark("dplyr" = {data$fourtyfive_avg <- sapply(data$day, simplify = TRUE, FUN = myfunc)},
                    "base" = {data$fortyfive_avg <- sapply(data$day, simplify = TRUE, FUN = mybasefunc)},
                    "dt" = {dat <- data.table(data, key = c("day", "loan_price")); dat[ , fortyfive_avg := sapply(day, mydtfunc), ]})
m
#> Unit: milliseconds
#>  expr     min       lq      mean   median       uq     max neval cld
#> dplyr 29.6519 30.71400 32.594319 32.42595 33.68590 44.0838   100   c
#>  base  4.2316  4.37210  4.631541  4.42665  4.58615 12.3656   100  a
#>    dt  5.6883  5.83755  6.254143  5.97100  6.11905 15.6615   100   b
autoplot(m)
The benchmark seems pretty conclusive: you can see a significant improvement by moving away from dplyr to either base or data.table. It's worth noting I don't use data.table often, so there may be a more efficient way to accomplish what I've done, but it's still much faster than dplyr.
Created on 2022-01-31 by the reprex package (v2.0.1)
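For a fully vectorised take on the rolling window itself, the slider package can index a sliding window directly by date. This is a sketch of my own (assuming slider::slide_index_dbl() accepts a lubridate duration for .before, which I believe it does; not benchmarked against the versions above):
library(slider)     # assumed available
library(lubridate)
# For each day, take loan_price over the previous 45 days (inclusive),
# drop values above that window's 75th percentile, and average the rest
data$fourtyfive_avg <- slide_index_dbl(
  data$loan_price, data$day,
  ~ mean(.x[.x <= quantile(.x, probs = 0.75)]),
  .before = ddays(45)
)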

chi square over multiple groups and variables

I have a huge dataset with several groups (factors with 2 to 6 levels) and dichotomous variables (0, 1).
example data
DF <- data.frame(
group1 = sample(x = c("A","B","C","D"), size = 100, replace = T),
group2 = sample(x = c("red","blue","green"), size = 100, replace = T),
group3 = sample(x = c("tiny","small","big","huge"), size = 100, replace = T),
var1 = sample(x = 0:1, size = 100, replace = T),
var2 = sample(x = 0:1, size = 100, replace = T),
var3 = sample(x = 0:1, size = 100, replace = T),
var4 = sample(x = 0:1, size = 100, replace = T),
var5 = sample(x = 0:1, size = 100, replace = T))
I want to do a chi square for every group, across all the variables.
library(tidyverse)
library(rstatix)
chisq_test(DF$group1, DF$var1)
chisq_test(DF$group1, DF$var2)
chisq_test(DF$group1, DF$var3)
...
etc
I managed to make it work by using two nested for loops, but I'm sure there is a better solution
groups <- c("group1","group2","group3")
vars <- c("var1","var2","var3","var4","var5")
results <- data.frame()
for(i in groups){
  for(j in vars){
    test <- chisq_test(DF[,i], DF[,j])
    test <- mutate(test, group = i, var = j)
    results <- rbind(results, test)
  }
}
results
I think I need some kind of apply function, but I can't figure it out
Here is one way to do it with apply. I am sure there is an even more elegant way to do it with dplyr. (Note that here I extract the p.value of the test, but you can extract something else or the whole test result if you prefer).
res <- apply(DF[, 1:3], 2, function(x) {
  apply(DF[, 4:8], 2,  # columns 4:8 are var1 to var5
        function(y) chisq.test(x, y)$p.value)
})
Here's a quick and easy dplyr solution, that involves transforming the data into long format keyed by group and var, then running the chi-sq test on each combination of group and var.
DF %>%
  pivot_longer(starts_with("group"), names_to = "group", values_to = "group_val") %>%
  pivot_longer(starts_with("var"), names_to = "var", values_to = "var_val") %>%
  group_by(group, var) %>%
  summarise(chisq_test(group_val, var_val)) %>%
  ungroup()
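For completeness, here is a base-R sketch of my own (only chisq.test from stats; it assumes the groups and vars vectors defined in the question) that enumerates every group/var pair and returns a tidy data frame of p-values:
# Enumerate all group/var combinations, then run one test per pair
pairs <- expand.grid(group = groups, var = vars, stringsAsFactors = FALSE)
pairs$p.value <- mapply(function(g, v) chisq.test(DF[[g]], DF[[v]])$p.value,
                        pairs$group, pairs$var)
pairs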

Efficiently fill 2D matrices by rows in a list in R

I have a list of 2D matrices. Each matrix is filled using the function fillMatrices. This function adds a number of individuals to each day 0 in a matrix and updates the columns a_M, b_M and c_M. The numbers of individuals come from an initial matrix ind. The code works but it is slow when the number of matrices within the list is large. For example with n = 10000:
user system elapsed
3.73 0.83 4.55
If possible, I would like to reduce the elapsed time to <= 1 sec and increase n to 720,000 matrices. So, I am looking for a way to optimize only section 3. Here is the code:
###############################################
###############################################
## Section 3
## Run the function "fillMatrices"
indexTime <- 1
dt_t_1 <- do.call(rbind, lapply(list_matrices, function(x) x[1, ]))
dt_t <- fillMatrices(dt_t_1 = dt_t_1, species = c("a_M", "b_M", "c_M"),
                     maxDuration = 5, matrixColumns = col_mat)
## Fill the matrices within the list
system.time(for(i in 1:n){
  list_matrices[[i]][indexTime + 1, ] <- dt_t[, i]
})
## test <- list_matrices[[1]]
The code of section 1 is used to initialize the matrices, and the function fillMatrices can be found in section 2. In my example, the function is used to fill the matrices for one species; in reality it is applied three times, once per species, by changing the argument species = c("a_M", "b_M", "c_M"). How can I speed up my code? Any advice would be much appreciated.
Here are the codes of sections 1 and 2:
rm(list=ls(all=TRUE))
library(ff)
library(dplyr)
set.seed(12345)
## Define the number of individuals
n <- 10000
###############################################
###############################################
## Section 1
## Build the list of 2D matrices
v_date <- as.vector(outer(c(paste(seq(0, 1, by = 1), "day", sep="_"), paste(seq(2, 5, by = 1), "days", sep="_")), c("a_M", "b_M", "c_M"), paste, sep="|"))
col_mat <- c("year", "day", "time", "ID", "died", v_date)
list_matrices <- list()
for(i in 1:n){
print(i)
list_matrices[[i]] <- ff(-999, dim=c(3650, length(col_mat)), dimnames=list(NULL, col_mat), vmode="double", overwrite = TRUE)
}
## test <- list_matrices[[1]]
## dim(list_matrices[[1]])
## Fill the first row of each matrix
for(i in 1:n){
print(i)
list_matrices[[i]][1,] <- c(1, 1, 1, i-1, 0, rep(0, length(v_date)))
}
## test <- list_matrices[[2]]
## Build the matrix "individual"
ind <- as.matrix(data.frame(year = rep(1, n), day = rep(1, n), time = rep(1, n), died = rep(0, n), ID = (seq(1, n, 1))- 1, a_M = sample(1:10, n, replace = T), b_M = sample(1:10, n, replace = T), c_M = sample(1:10, n, replace = T)))
## print(ind)
###############################################
###############################################
## Section 2
## Function to convert a data frame into a matrix
convertDFToMat <- function(x){
mat <- as.matrix(x[,-1])
ifelse(is(x[,1], "data.frame"), rownames(mat) <- pull(x[,1]), rownames(mat) <- x[,1])
## Convert character matrix into numeric matrix
mat <- apply(mat, 2, as.numeric)
return(mat)
}
## Define the function that is used to fill the matrices within the list
fillMatrices <- function(dt_t_1, species, maxDuration, matrixColumns){
## Format data
dt <- as.data.frame(dt_t_1) %>%
reshape::melt(id = c("ID")) %>%
arrange(ID) %>%
dplyr::mutate_all(as.character)
## summary(dt)
## Break out the variable "variable" into different columns, with one row for each individual-day
dt_reshape_filter_1 <- dt %>%
dplyr::filter(!variable %in% c("year", "day", "time", "ID", "died")) %>%
dplyr::mutate(day = variable %>% gsub(pattern = "\\_.*", replacement = "", x = .), col = variable %>% gsub(pattern = ".*\\|", replacement = "", x = .)) %>%
dplyr::select(-variable) %>%
tidyr::spread(col, value) %>%
dplyr::mutate_all(as.numeric) %>%
dplyr::arrange(ID, day)
## summary(dt_reshape_filter_1)
## Apply requested transformations and build the data frame
dt_transform <- dt_reshape_filter_1 %>%
dplyr::rename_at(vars(species), ~ c("a", "b", "c")) %>%
dplyr::mutate(day = day + 1) %>%
dplyr::filter(day < maxDuration + 1) %>%
dplyr::bind_rows(tibble(ID = ind[,c("ID")], day = 0, a = ind[,c("a_M")], b = ind[,c("b_M")])) %>%
dplyr::mutate(c = a + b) %>%
dplyr::rename_at(vars("a", "b", "c"), ~ species) %>%
dplyr::arrange(ID, day)
## summary(dt_transform)
## Take different columns of the data frame and gather them into a single column
dt_gather <- dt_transform %>%
tidyr::gather(variable, value, species) %>%
dplyr::mutate(day = if_else(day > 1, paste0(day, "_days"), paste0(day, "_day"))) %>%
tidyr::unite(variable, c("day", "variable"), sep = "|") %>%
dplyr::rename(var2 = ID) %>%
dplyr::mutate_all(as.character)
## summary(dt_gather)
## Add the other columns in the data frame and convert the resulting data frame into a matrix
dt_reshape_filter_2 <- dt %>%
dplyr::rename(var2 = ID) %>%
dplyr::filter(variable %in% c("year", "day", "time", "ID", "died")) %>%
tidyr::spread(variable, value) %>%
dplyr::arrange(as.numeric(var2)) %>%
dplyr::mutate(year = ind[,c("year")],
day = ind[,c("day")],
time = ind[,c("time")],
ID = ind[,c("ID")],
died = ind[,c("died")]) %>%
tidyr::gather(variable, value, c(year, day, time, ID, died)) %>%
dplyr::arrange(as.numeric(var2)) %>%
dplyr::mutate_all(as.character)
## summary(dt_reshape_filter_2)
## Build the output matrix
dt_bind <- bind_rows(dt_reshape_filter_2, dt_gather) %>%
tidyr::spread(var2, value) %>%
dplyr::arrange(match(variable, matrixColumns)) %>%
dplyr::select("variable", as.character(ind[,c("ID")]))
## summary(dt_bind)
dt_mat <- convertDFToMat(dt_bind)
## summary(dt_mat)
return(dt_mat)
}
Making a 3D array instead of a list of 2D matrices gives you more options:
library(ff)
library(dplyr)
set.seed(12345)
## Define the number of individuals
n <- 10000L
n_row <- 3650L
#array way:
v_date <- as.vector(outer(c(paste(seq(0, 1, by = 1), "day", sep="_"), paste(seq(2, 5, by = 1), "days", sep="_")), c("a_M", "b_M", "c_M"), paste, sep="|"))
col_mat <- c("year", "day", "time", "ID", "died", v_date)
arr1 <- ff(-999L, dim = c(n_row, length(col_mat), n), dimnames = list(NULL, col_mat, NULL))
## Fill the first row of each matrix slice
arr1[1, , ] <- c(1L, 1L, 1L, NA, 0L, rep(0L, length(v_date)))
arr1[1, 4, ] <- seq_len(n)-1L
## Build the matrix "individual"
ind <- as.matrix(data.frame(year = rep(1L, n), day = rep(1L, n), time = rep(1L, n), died = rep(0L, n), ID = (seq(1L, n, 1L))- 1L, a_M = sample(1L:10L, n, replace = T), b_M = sample(1L:10L, n, replace = T), c_M = sample(1L:10L, n, replace = T)))
##fill the matrix
indexTime <- 1L
dt_t <- fillMatrices(dt_t_1 = t(arr1[1, ,]), species = c("a_M", "b_M", "c_M"), maxDuration = 5, matrixColumns = col_mat)
## reassign
system.time(
arr1[indexTime + 1, ,] <- dt_t
)
user system elapsed
0.05 0.70 0.7
# for comparison
#> system.time(for(i in 1:n){
#+ list_matrices[[i]][indexTime + 1,] <- dt_t[,i]
#+ })
# user system elapsed
# 4.75 1.08 5.90
As far as I can tell, it's giving me the same results as your original approach but does so a lot faster.
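If the whole array fits in RAM (for n = 10000 and n_row = 3650 the integer array above is roughly 3 to 4 GB), a plain base-R array could be swapped in for ff; this is a minimal sketch of my own, reusing the objects defined above:
# Plain in-memory array (assumes enough RAM; otherwise keep ff)
arr2 <- array(-999L, dim = c(n_row, length(col_mat), n),
              dimnames = list(NULL, col_mat, NULL))
arr2[1, , ] <- c(1L, 1L, 1L, NA, 0L, rep(0L, length(v_date)))
arr2[1, 4, ] <- seq_len(n) - 1L
system.time(arr2[indexTime + 1, , ] <- dt_t)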

How to randomly select and bind data columns based on their median values in R?

I have two dataframes in wide format. Each of the columns is a time series of page hits for various wikipedia articles.
set.seed(123)
library(tidyr)
time = as.Date('2009-01-01') + 0:9
wiki_1 <- data.frame(
W = sample(1:1000,10,replace = T),
X = sample(1:100,10,replace = T),
Y = sample(1:10,10,replace = T),
Z = sample(1:10,10, replace = T)
)
wiki_2 <- data.frame(
A = sample(500:1000,10,replace = T),
B = sample(90:100,10,replace = T),
C = sample(1:10,10,replace = T),
D = sample(1:10,10,replace = T)
)
I want to combine one of the columns from the first dataset (wiki_1) with n columns from the second dataset (wiki_2). But this selection should be based on how close the median values of the columns in wiki_2 are to those in wiki_1 e.g. by order of magnitude.
In this example, for n = 2, Y should be matched with C and D because of how close their median values are.
median(wiki_1$Y) # 7
median(wiki_2$C) # 6
median(wiki_2$D) # 4.5
I'm not sure how to implement the difference in median values criterion to get the desired result.
Additionally, it would be useful to be able to randomly sample from the columns in wiki_2 that satisfy the criterion as my real dataset has many more columns.
This is what I'm working with so far:
library(zoo)
df <- zoo(cbind(subset(wiki_1, select = "Y"),
                subset(wiki_2, select = c("C", "D"))), time)
I think this is what you're after. I added a column to wiki_2 in order to allow more than 2 matches to show the random selection of matching columns.
set.seed(123)
library(tidyr)
time = as.Date('2009-01-01') + 0:9
wiki_1 <- data.frame(
W = sample(1:1000,10,replace = T),
X = sample(1:100,10,replace = T),
Y = sample(1:10,10,replace = T),
Z = sample(1:10,10, replace = T)
)
wiki_2 <- data.frame(
A = sample(500:1000,10,replace = T),
B = sample(90:100,10,replace = T),
C = sample(1:10,10,replace = T),
D = sample(1:10,10,replace = T),
E = sample(1:20,10,replace = T)
)
selectColsByMedian <- function(df1, df2, ref_v, n_v, cutoff_v) {
  #' Select Columns By Median
  #' @description Select any number of columns from a test data.frame whose median value is
  #' close to the median value of a specified column from a reference data.frame. "Close to"
  #' is determined as the absolute value of the difference in medians being less than the specified cutoff.
  #' Outputs a new data.frame containing the reference data.frame's test column and all matching columns
  #' from the test data.frame.
  #' @param df1 reference data.frame
  #' @param df2 test data.frame
  #' @param ref_v column from reference data.frame to test against
  #' @param n_v number of columns from df2 to select
  #' @param cutoff_v value used to determine if test columns' medians are close enough
  #' @return data.frame with 1 column from df1 and matching columns from df2
  ## Get median of ref
  med_v <- median(df1[, ref_v], na.rm = T)
  ## Get the other medians (use df2, not the global wiki_2)
  otherMed_v <- apply(df2, 2, function(x) median(x, na.rm = T))
  ## Get differences
  medDiff_v <- sapply(otherMed_v, function(x) abs(med_v - x))
  ## Get whichever are within range (and order them)
  inRange_v <- sort(medDiff_v[medDiff_v < cutoff_v])
  inRangeCols_v <- names(inRange_v)
  ## Select random sample, if needed
  if (length(inRangeCols_v) > n_v){
    whichRandom_v <- sample(1:length(inRangeCols_v), size = n_v, replace = F)
  } else {
    whichRandom_v <- 1:length(inRangeCols_v)
  }
  finalCols_v <- inRangeCols_v[whichRandom_v]
  ## Final output
  out_df <- cbind(df1[, ref_v], df2[, finalCols_v])
  colnames(out_df) <- c(ref_v, finalCols_v)
  ## Return
  return(out_df)
} # selectColsByMedian
### 3 matching columns, select 2
match3pick2_df <- selectColsByMedian(df1 = wiki_1, df2 = wiki_2, ref_v = "Y", n_v = 2, cutoff_v = 12)
match3pick2_df2 <- selectColsByMedian(df1 = wiki_1, df2 = wiki_2, ref_v = "Y", n_v = 2, cutoff_v = 12)
### 2 matching columns, select 2
match2pick2_df <- selectColsByMedian(df1 = wiki_1, df2 = wiki_2, ref_v = "Y", n_v = 2, cutoff_v = 10)
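If, as in the question, the combined columns should end up as a time-indexed series, the returned data.frame can be wrapped in zoo; a small sketch assuming the time vector defined at the top of the question:
library(zoo)
z <- zoo(as.matrix(match3pick2_df), order.by = time)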
Here is my solution. I've added more rows to wiki_2 to demonstrate the row subsetting (it also works if nrow(wiki_1) == nrow(wiki_2)).
set.seed(123)
wiki_1 <- data.frame(
W = sample(1:1000,10,replace = T),
X = sample(1:100,10,replace = T),
Y = sample(1:10,10,replace = T),
Z = sample(1:10,10, replace = T)
)
wiki_2 <- data.frame(
A = sample(500:1000,100,replace = T),
B = sample(90:100,100,replace = T),
C = sample(1:10,100,replace = T),
D = sample(1:10,100,replace = T)
)
combineMedianComp <- function(data1, data2, col, n){
  if(nrow(data1) > nrow(data2)) stop("Rows in 'data2' need to be greater or equal to rows in 'data1'")
  medRef <- median(data1[[col]], na.rm = T) # median of desired column
  medComp <- sapply(data2, function(x){abs(medRef - median(x, na.rm = T))}) # absolute difference in medians for each column in data2 ('wiki_2')
  cols <- names(sort(medComp)[seq_len(n)]) # sort this vector in ascending order, select top n
  d2 <- data2[, c(cols)] # select columns in data2 that have medians closest to 'medRef'
  d2 <- d2[sample(seq_len(nrow(d2)), size = nrow(data1), replace = F), ] # subset rows to match the number of rows in data1
  # merge data
  res <- do.call(cbind, list(data1[col], d2))
  return(res)
}
combineMedianComp(data1 = wiki_1, data2 = wiki_2, col = "Y", n = 2)
You can do:
time = as.Date('2009-01-01') + 0:9
close_median <- function(df1, df2, to_match = NULL){
  # get median of the column to match
  m <- median(df1[[to_match]])
  # get absolute difference of that median from the medians of the other data's columns
  mat_cols <- apply(df2, 2, function(x) abs(m - median(x)))
  # get the top 2 matched columns
  cols <- sort(names(sort(mat_cols)[1:2]))
  return(cbind(df1[to_match], df2[cols], row.names = time))
}
close_median(wiki_1, wiki_2, 'Y')
            Y  C  D
2009-01-01  8  9 10
2009-01-02  7  8  1
2009-01-03  1  7  7
2009-01-04 10  3 10
2009-01-05  2  1  1
2009-01-06  3 10  3
2009-01-07  6  2  3
2009-01-08  5  8 10
2009-01-09  3  8  5
2009-01-10 10  8  3

Rowsum with Dynamic Column Range

I am working with a large health insurance dataset and I am interested in participants with certain claims codes. One of my inclusion criteria is that the participant has to have been insured for one year before and one year after the claim date. E.g., if they were injured 9/27/2017, they need insurance from 9/27/2016-9/27/2018.
I have tried doing a simple rowSums and using apply, but both have the same issue: "in from:to : numerical expression has # elements: only the first used". Right now, I have the range saved as variables in the dataframe. I think I understand why I am having the issue: the sequence operator is expecting a single number and receiving a vector. How can I get it to conditionally select columns to sum? I will include my code below.
In my example, I am just trying to count the number of months a participant is insured for 6 month before and after their accident. The ins_#_# variables are a simple YES/NO for whether or not participants were insured that month. Any guidance is appreciated!
library(tidyverse)
set.seed(1)
df <- data.frame(id = seq(1, 100),
                 injury_date = sample(seq(as.Date('2017/01/01'), as.Date('2017/12/31'), by = "day"), 100),
                 ins_07_16 = sample(c(0, 1), 100, replace = TRUE),
                 ins_08_16 = sample(c(0, 1), 100, replace = TRUE),
                 ins_09_16 = sample(c(0, 1), 100, replace = TRUE),
                 ins_10_16 = sample(c(0, 1), 100, replace = TRUE),
                 ins_11_16 = sample(c(0, 1), 100, replace = TRUE),
                 ins_12_16 = sample(c(0, 1), 100, replace = TRUE),
                 ins_01_17 = sample(c(0, 1), 100, replace = TRUE),
                 ins_02_17 = sample(c(0, 1), 100, replace = TRUE),
                 ins_03_17 = sample(c(0, 1), 100, replace = TRUE),
                 ins_04_17 = sample(c(0, 1), 100, replace = TRUE),
                 ins_05_17 = sample(c(0, 1), 100, replace = TRUE),
                 ins_06_17 = sample(c(0, 1), 100, replace = TRUE),
                 ins_07_17 = sample(c(0, 1), 100, replace = TRUE),
                 ins_08_17 = sample(c(0, 1), 100, replace = TRUE),
                 ins_09_17 = sample(c(0, 1), 100, replace = TRUE),
                 ins_10_17 = sample(c(0, 1), 100, replace = TRUE),
                 ins_11_17 = sample(c(0, 1), 100, replace = TRUE),
                 ins_12_17 = sample(c(0, 1), 100, replace = TRUE),
                 ins_01_18 = sample(c(0, 1), 100, replace = TRUE),
                 ins_02_18 = sample(c(0, 1), 100, replace = TRUE),
                 ins_03_18 = sample(c(0, 1), 100, replace = TRUE),
                 ins_04_18 = sample(c(0, 1), 100, replace = TRUE),
                 ins_05_18 = sample(c(0, 1), 100, replace = TRUE),
                 ins_06_18 = sample(c(0, 1), 100, replace = TRUE))
df <- df %>%
  mutate(month = as.numeric(format(as.Date(injury_date), "%m")), # pulling month of injury
         low_mo = month + 2,
         high_mo = month + 14)
df$insured <- rowSums(df[df$low_mo:df$high_mo]) #only uses first element
df$insured <- apply(df[df$low_mo:df$high_mo], 1, sum) #only uses first element
Edit:
Although I did not specify that I wanted a fast solution, I am working with a lot of data, so I tested which of @akrun's solutions was the fastest. I changed the dataframe so it had 1e5 (100,000) rows. The results are below in case anyone is curious.
microbenchmark(o1 <- sapply(seq_len(nrow(df)), function(i) sum(df[i, df$low_mo[i]:df$high_mo[i]])),
               o2 <- {colInd <- Map(`:`, df$low_mo, df$high_mo);
                      rowInd <- rep(seq_len(nrow(df)), lengths(colInd));
                      as.vector(tapply(df[-(1:2)][cbind(rowInd, unlist(colInd) - 2)],
                                       rowInd, FUN = sum))},
               o3 <- {colInd1 <- Map(function(x, y) which(!seq_along(df) %in% x:y), df$low_mo, df$high_mo);
                      rowInd1 <- rep(seq_len(nrow(df)), lengths(colInd1));
                      rowSums(replace(df, cbind(rowInd1, unlist(colInd1)), NA)[-(1:2)], na.rm = TRUE)},
               times = 5)
Unit: milliseconds
 expr        min         lq       mean     median         uq        max neval
   o1 20408.5072 20757.0285 20903.9386 20986.2275 21069.3163 21298.6137     5
   o2   433.5463   436.3066   448.6448   455.6551   456.8836   460.8325     5
   o3   470.6834   482.4449   492.9594   485.6210   504.1353   521.9122     5
> identical(o1, o2)
[1] TRUE
> identical(o2, o3)
[1] TRUE
There are a couple of ways to do this. One is to loop over the sequence of rows, subset the dataset by the row index and by the columns generated from the 'low_mo':'high_mo' sequence for that row, and take the sum:
o1 <- sapply(seq_len(nrow(df)), function(i) sum(df[i, df$low_mo[i]:df$high_mo[i]]))
Another option is to extract the elements by row/column index and then do a group-wise sum:
colInd <- Map(`:`, df$low_mo, df$high_mo)
rowInd <- rep(seq_len(nrow(df)), lengths(colInd))
o2 <- as.vector(tapply(df[-(1:2)][cbind(rowInd, unlist(colInd) - 2)],
                       rowInd, FUN = sum))
identical(o1, o2)
#[1] TRUE
A third approach is to set the column values that are not in the sequence to NA and use rowSums:
colInd1 <- Map(function(x, y) which(!seq_along(df) %in% x:y), df$low_mo, df$high_mo)
rowInd1 <- rep(seq_len(nrow(df)), lengths(colInd1))
o3 <- rowSums(replace(df, cbind(rowInd1, unlist(colInd1)), NA)[-(1:2)],
              na.rm = TRUE)
identical(o1, o3)
#[1] TRUE
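One more option (my addition, not benchmarked above): build a logical "in-window" mask with outer() and call rowSums() once. This assumes, as in the example, that 'low_mo' and 'high_mo' are column positions within df and that the insurance columns all match the pattern "ins_":
ins_cols <- grep("^ins_", names(df))            # positions 3:26 in the example
in_window <- outer(df$low_mo, ins_cols, "<=") &
             outer(df$high_mo, ins_cols, ">=")  # TRUE where a month falls inside the window
o4 <- rowSums(as.matrix(df[ins_cols]) * in_window)
# o4 should match o1/o2/o3 above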
