I have a long format data frame that looks like this:
What I'd like to do is (1) randomly split the trials from Condition 1 into 4 groups and then calculate the mean of Y for each ID and (2) carry out this same procedure for the trials in Condition 2. This is what this new output would look like:
Is there a concise way to do this using the tidyverse? I'm just getting started and am having a having a hard time with this!
Assuming that your data frame is not that small, this should do exactly what you need.
set.seed(2)
df <- tibble(ID = c(rep(1,50),rep(2,50)),
Condition = rep(c(1,1,2,2,2,1,1,2,2,2,1,1,1,2,2,1,1,1,2,2),5),
Y = rnorm(100,100,20)) ##Creates some fake data
result_df <- df %>% mutate(randomizer = sapply(1:length(row_number()), function(i)sample(c(1,2,3,4),1))) %>%
group_by(Condition) %>%
group_by(ID, Condition, randomizer) %>%
summarize(mean_y = mean(Y, na.rm = TRUE)) %>%
mutate(condition_status = paste0("Condition",Condition,"M",randomizer)) %>%
ungroup() %>%
dplyr::select(-Condition, -randomizer) %>%
spread(condition_status, mean_y)
result_df
Not the best way but it does work
x <- data.frame(c(rep(1,10), rep(2,10)), c(1,1,2,2,2,1,1,2,2,2,1,1,1,2,2,1,1,1,2,2), c(100,200,80,58,89,100,200,80,58,89,95,72,99,120,130,95,72,99,120,130))
colnames(x) <- c("ID", "Condition", "Y")
id <- unique(x[,1])
con <- unique(x[,2])
#multiple by the number of groups to split into
y <- data.frame(matrix(, ncol = (length(id) * 4) + 1, nrow = length(id)))
y[,1] <- id
colnames(y) <- c("ID", "Condition1M1", "Condition1M2", "Condition1M3", "Condition1M4", "Condition2M1", "Condition2M2", "Condition2M3", "Condition2M4")
x1 <- x[which(x[,2] == con[1]),]
x2 <- x[which(x[,2] == con[2]),]
#specify sample size
n <- 5
x11 <- x1[sample(nrow(x1), n, replace = FALSE),]
x12 <- x1[sample(nrow(x1), n, replace = FALSE),]
x13 <- x1[sample(nrow(x1), n, replace = FALSE),]
x14 <- x1[sample(nrow(x1), n, replace = FALSE),]
x21 <- x2[sample(nrow(x2), n, replace = FALSE),]
x22 <- x2[sample(nrow(x2), n, replace = FALSE),]
x23 <- x2[sample(nrow(x2), n, replace = FALSE),]
x24 <- x2[sample(nrow(x2), n, replace = FALSE),]
y[1,2] <- mean(x11[which(x11[,1] == id[1]),3])
y[1,3] <- mean(x12[which(x12[,1] == id[1]),3])
y[1,4] <- mean(x13[which(x13[,1] == id[1]),3])
y[1,5] <- mean(x14[which(x14[,1] == id[1]),3])
y[2,2] <- mean(x21[which(x21[,1] == id[2]),3])
y[2,3] <- mean(x22[which(x22[,1] == id[2]),3])
y[2,4] <- mean(x23[which(x23[,1] == id[2]),3])
y[2,5] <- mean(x24[which(x24[,1] == id[2]),3])
y[1,6] <- mean(x11[which(x11[,1] == id[1]),3])
y[1,7] <- mean(x12[which(x12[,1] == id[1]),3])
y[1,8] <- mean(x13[which(x13[,1] == id[1]),3])
y[1,9] <- mean(x14[which(x14[,1] == id[1]),3])
y[2,6] <- mean(x21[which(x21[,1] == id[2]),3])
y[2,7] <- mean(x22[which(x22[,1] == id[2]),3])
y[2,8] <- mean(x23[which(x23[,1] == id[2]),3])
y[2,9] <- mean(x24[which(x24[,1] == id[2]),3])
Results:
ID Condition1M1 Condition1M2 Condition1M3 Condition1M4 Condition2M1 Condition2M2 Condition2M3 Condition2M4
1 1 133.3333 100 150 133.3333 133.3333 100 150 133.3333
2 2 125.0000 125 125 120.0000 125.0000 125 125 120.0000
Related
I am using this to get the mean and standard deviation of a car mpg
df1 <- mtcars; df1$rownames = rownames(df1)
df2 <- mtcars; df2$rownames = rownames(df2)
df2$mpg = df2$mpg + rnorm(nrow(df2),0,3)
data = rbind(df1, df2)
I use a function to get the probabilities that a car has lower mpg than other cars
df = plyr::ddply(data,~rownames,summarise,mean=mean(mpg),sd=sd(mpg))
f <- function(x, y){
n1 = df$mean[x]; n2 = df$mean[y]; sd1 = df$sd[x]; sd2 = df$sd[y]
pnorm(0, mean = n1 - n2, sd = sqrt(sd1^2 + sd2^2))
}
res <- outer(X = 1:nrow(df), Y = 1:nrow(df), f)
dimnames(res) <- list(df$rownames, df$rownames)
res <- data.frame(res)
res <- tibble::rownames_to_column(res, 'p1')
datalong_2 <- tidyr::gather(res, 'p2', 'value', -1) # output
Now I want to have the probabilities of a car have the lowest mpg than the rest of the cars. I tied this:
cars = unique(datalong_2$p1)
win <- data.frame(sapply(1:length(cars), function(x) setNames(prod(subset(datalong_2, p1 == cars[x] & p2 != cars[x])$value),cars[x])))
colnames(win) <- "prob"
win$prob <- round(win$prob,4)
But the probabilities do not add up to one. How can I change this code to get a table that have the probability of each car of having the lowest mpg than the rest of the cars?
Here is a comparison of dplyr/data.table methods to return the probabilities
library(dplyr)
library(data.table)
library(tidyr)
library(tibble)
# // input data
df <- mtcars[1] %>%
rownames_to_column("car")
-testing
# // dplyr
system.time({
out <- df %>%
uncount(10000, .id = "run") %>%
rowwise() %>%
mutate(sim_mpg = rpois(1, lambda = mpg)) %>%
group_by(run) %>%
arrange(sim_mpg) %>%
mutate(lowest_mpg = row_number() == 1) %>%
group_by(car) %>%
summarize(chance_lowest = mean(lowest_mpg),
orig_mpg = first(mpg))
})
# user system elapsed
# 1.715 0.074 1.787
# // data.table
system.time({
df_expand <- setDT(df)[rep(seq_len(.N), 10000)][, run := rowid(car)]
out2 <- df_expand[, sim_mpg := rpois(1, lambda = mpg), 1:nrow(df_expand)
][order(sim_mpg), lowest_mpg := seq_len(.N) == 1 ,run
][, .(chance_lowest = mean(lowest_mpg), orig_mpg = first(mpg)), .(car)]
})
# user system elapsed
# 0.704 0.050 0.757
sum(out$chance_lowest)
#[1] 1
I have the following dataframe:
id <- c("A", "B", "C")
col1 <- c(1, 3, 5)
col2 <- c(6, 12, 9)
col3 <- c(2, 4, 30)
df <- data.frame(id, col1, col2, col3)
Essentially, I want every i to be replaced by 20, 25, 30, 35, 40. This loop works but it works very, very slowly.
library(dplyr)
library(tibble)
library(foreach)
library(tidyverse)
library(purrr)
id <- c("A", "B", "C")
col1 <- c(1, 3, 5)
col2 <- c(6, 12, 9)
col3 <- c(2, 4, 30)
df <- data.frame(id, col1, col2, col3)
vals <- c(seq(from=20, to=40, by=5))
final <- foreach(i = vals, .combine='cbind') %do% {
# if cell is greater than i, then code 0
df_2 <- df %>% mutate(across(starts_with("col"), ~ +(. < i)))
# transpose the dataset
rownames(df_2) <- df_2$id
df_2$id <- NULL
df_2_t <- as.data.frame(t(df_2))
# sum the rows
df_2_t <- cbind(id = rownames(df_2_t), df_2_t)
rownames(df_2_t) <- 1:nrow(df_2_t)
df_2_t <- df_2_t %>%
mutate(sum = rowSums(.[2:ncol(.)]))
# merge a new column
id2 <- c("col1", "col2", "col3")
D <- c(3, 4, 5)
id_d <- data.frame(id2, D)
df_2_t_d <- left_join(df_2_t, id_d, by = c("id" = "id2"))
# divide D by the number of letters (there are 3 letter columns -- A, B, C)
df_2_t_d$letters <- rep(3)
df_2_t_d <- df_2_t_d %>%
mutate(frac = D/letters)
# recode all 1s to the frac
letters <- grep("^A|^B|^C", names(df_2_t_d))
df_2_t_d[letters] <- apply(df_2_t_d[letters], 2, function(x) ifelse(x == 1, df_2_t_d$frac, 0))
# drop two columns
df_2_t_d <- select(df_2_t_d, -c(D, letters))
# transpose again
rownames(df_2_t_d) <- df_2_t_d$id
df_2_t_d$id <- NULL
df_2_t_d2 <- as.data.frame(t(df_2_t_d))
df_2_t_d2_sum <- df_2_t_d2 %>%
mutate(rowSums(.[1:3])) %>%
transmute(!!paste0('sum_', i) := rowSums(select(., starts_with('col'))))
}
df_2_t_d2 <- cbind(list_name = rownames(df_2_t_d2), df_2_t_d2)
rownames(df_2_t_d2) <- 1:nrow(df_2_t_d2)
df_2_t_d2 <- select(df_2_t_d2, list_name)
abc <- cbind(df_2_t_d2, df_2_t_d2_sum)
View(abc)
If there's any way to speed it up, suggestions are welcome!
Here's a way to do this map_dfc :
library(dplyr)
library(purrr)
vals <- seq(from=20, to=40, by=5)
bind_cols(
df, map_dfc(vals, function(x) df %>%
mutate(across(starts_with("col"), ~ +(. < x))) %>%
transmute(!!paste0('sum_', x) := rowSums(select(., starts_with('col'))))))
Or in base R :
cols <- grep('col', names(df))
df[paste0('sum_', vals)] <- lapply(vals, function(x) rowSums(+(df[cols] < x)))
df
# id col1 col2 col3 sum_20 sum_25 sum_30 sum_35 sum_40
#1 A 1 6 2 3 3 3 3 3
#2 B 3 12 4 3 3 3 3 3
#3 C 5 9 30 2 2 2 3 3
I have a list of 2D matrices. Each matrix is filled using the function fillMatrices. This function adds a number of individuals to each day 0 in a matrix and updates the columns a_M, b_M and c_M. The numbers of individuals come from an initial matrix ind. The code works but it is slow when the number of matrices within the list is large. For example with n = 10000:
user system elapsed
3.73 0.83 4.55
If possible, I would like to reduce the elapsed time to <= 1 sec and increase the n to 720000 matrices. So, I am looking for way to optimize only the section 3. Here is the code:
###############################################
###############################################
## Section 3
## Run the function "fillMatrices"
indexTime <- 1
dt_t_1 <- do.call(rbind, lapply(list_matrices, function(x) x[1,]))
dt_t <- fillMatrices(dt_t_1 = dt_t_1, species = c("a_M", "b_M", "c_M"), maxDuration = 5, matrixColumns = col_mat)
## Fill the matrices within the list
system.time(for(i in 1:n){
list_matrices[[i]][indexTime + 1,] <- dt_t[,i]
})
## test <- list_matrices[[1]]
The code of the section 1 is used to initialize the matrices and the function fillMatrices can be found in the section 2. In my example, the function is used to fill matrices for one species. In reality, the function is used for 3 species (i.e., is applied three times) by changing the argument species = c("a_M", "b_M", "c_M"). How can I speed up my code? Any advice would be much appreciated.
Here are the codes of sections 1 and 2:
rm(list=ls(all=TRUE))
library(ff)
library(dplyr)
set.seed(12345)
## Define the number of individuals
n <- 10000
###############################################
###############################################
## Section 1
## Build the list of 2D matrices
v_date <- as.vector(outer(c(paste(seq(0, 1, by = 1), "day", sep="_"), paste(seq(2, 5, by = 1), "days", sep="_")), c("a_M", "b_M", "c_M"), paste, sep="|"))
col_mat <- c("year", "day", "time", "ID", "died", v_date)
list_matrices <- list()
for(i in 1:n){
print(i)
list_matrices[[i]] <- ff(-999, dim=c(3650, length(col_mat)), dimnames=list(NULL, col_mat), vmode="double", overwrite = TRUE)
}
## test <- list_matrices[[1]]
## dim(list_matrices[[1]])
## Fill the first row of each matrix
for(i in 1:n){
print(i)
list_matrices[[i]][1,] <- c(1, 1, 1, i-1, 0, rep(0, length(v_date)))
}
## test <- list_matrices[[2]]
## Build the matrix "individual"
ind <- as.matrix(data.frame(year = rep(1, n), day = rep(1, n), time = rep(1, n), died = rep(0, n), ID = (seq(1, n, 1))- 1, a_M = sample(1:10, n, replace = T), b_M = sample(1:10, n, replace = T), c_M = sample(1:10, n, replace = T)))
## print(ind)
###############################################
###############################################
## Section 2
## Function to convert a data frame into a matrix
convertDFToMat <- function(x){
mat <- as.matrix(x[,-1])
ifelse(is(x[,1], "data.frame"), rownames(mat) <- pull(x[,1]), rownames(mat) <- x[,1])
## Convert character matrix into numeric matrix
mat <- apply(mat, 2, as.numeric)
return(mat)
}
## Define the function that is used to fill the matrices within the list
fillMatrices <- function(dt_t_1, species, maxDuration, matrixColumns){
## Format data
dt <- as.data.frame(dt_t_1) %>%
reshape::melt(id = c("ID")) %>%
arrange(ID) %>%
dplyr::mutate_all(as.character)
## summary(dt)
## Break out the variable "variable" into different columns, with one row for each individual-day
dt_reshape_filter_1 <- dt %>%
dplyr::filter(!variable %in% c("year", "day", "time", "ID", "died")) %>%
dplyr::mutate(day = variable %>% gsub(pattern = "\\_.*", replacement = "", x = .), col = variable %>% gsub(pattern = ".*\\|", replacement = "", x = .)) %>%
dplyr::select(-variable) %>%
tidyr::spread(col, value) %>%
dplyr::mutate_all(as.numeric) %>%
dplyr::arrange(ID, day)
## summary(dt_reshape_filter_1)
## Apply requested transformations and build the data frame
dt_transform <- dt_reshape_filter_1 %>%
dplyr::rename_at(vars(species), ~ c("a", "b", "c")) %>%
dplyr::mutate(day = day + 1) %>%
dplyr::filter(day < maxDuration + 1) %>%
dplyr::bind_rows(tibble(ID = ind[,c("ID")], day = 0, a = ind[,c("a_M")], b = ind[,c("b_M")])) %>%
dplyr::mutate(c = a + b) %>%
dplyr::rename_at(vars("a", "b", "c"), ~ species) %>%
dplyr::arrange(ID, day)
## summary(dt_transform)
## Take different columns of the data frame and gather them into a single column
dt_gather <- dt_transform %>%
tidyr::gather(variable, value, species) %>%
dplyr::mutate(day = if_else(day > 1, paste0(day, "_days"), paste0(day, "_day"))) %>%
tidyr::unite(variable, c("day", "variable"), sep = "|") %>%
dplyr::rename(var2 = ID) %>%
dplyr::mutate_all(as.character)
## summary(dt_gather)
## Add the other columns in the data frame and convert the resulting data frame into a matrix
dt_reshape_filter_2 <- dt %>%
dplyr::rename(var2 = ID) %>%
dplyr::filter(variable %in% c("year", "day", "time", "ID", "died")) %>%
tidyr::spread(variable, value) %>%
dplyr::arrange(as.numeric(var2)) %>%
dplyr::mutate(year = ind[,c("year")],
day = ind[,c("day")],
time = ind[,c("time")],
ID = ind[,c("ID")],
died = ind[,c("died")]) %>%
tidyr::gather(variable, value, c(year, day, time, ID, died)) %>%
dplyr::arrange(as.numeric(var2)) %>%
dplyr::mutate_all(as.character)
## summary(dt_reshape_filter_2)
## Build the output matrix
dt_bind <- bind_rows(dt_reshape_filter_2, dt_gather) %>%
tidyr::spread(var2, value) %>%
dplyr::arrange(match(variable, matrixColumns)) %>%
dplyr::select("variable", as.character(ind[,c("ID")]))
## summary(dt_bind)
dt_mat <- convertDFToMat(dt_bind)
## summary(dt_mat)
return(dt_mat)
}
Making a 3D array instead of a 2D list of matrices gives you more options
library(ff)
library(dplyr)
set.seed(12345)
## Define the number of individuals
n <- 10000L
n_row <- 3650L
#array way:
v_date <- as.vector(outer(c(paste(seq(0, 1, by = 1), "day", sep="_"), paste(seq(2, 5, by = 1), "days", sep="_")), c("a_M", "b_M", "c_M"), paste, sep="|"))
col_mat <- c("year", "day", "time", "ID", "died", v_date)
arr1 <- ff(-999L, dim = c(n_row, length(col_mat), n), dimnames = list(NULL, col_mat, NULL))
## Fill the first row of each matrix slice
arr1[1, , ] <- c(1L, 1L, 1L, NA, 0L, rep(0L, length(v_date)))
arr1[1, 4, ] <- seq_len(n)-1L
## Build the matrix "individual"
ind <- as.matrix(data.frame(year = rep(1L, n), day = rep(1L, n), time = rep(1L, n), died = rep(0L, n), ID = (seq(1L, n, 1L))- 1L, a_M = sample(1L:10L, n, replace = T), b_M = sample(1L:10L, n, replace = T), c_M = sample(1L:10L, n, replace = T)))
##fill the matrix
indexTime <- 1L
dt_t <- fillMatrices(dt_t_1 = t(arr1[1, ,]), species = c("a_M", "b_M", "c_M"), maxDuration = 5, matrixColumns = col_mat)
## reassign
system.time(
arr1[indexTime + 1, ,] <- dt_t
)
user system elapsed
0.05 0.70 0.7
# for comparison
#> system.time(for(i in 1:n){
#+ list_matrices[[i]][indexTime + 1,] <- dt_t[,i]
#+ })
# user system elapsed
# 4.75 1.08 5.90
As far as I can tell, it's giving me the same results as your original approach but does so a lot faster.
I wanted to use dplyr instead of apply,1 in order to filter a dataset rowwise according to a logical expression, ie for this example I´d like to remove all rows that have one or more values of 99.
However, I was surprised by the poor performance in dplyr. Any ideas if I can speed this up in dplyr? Also, I would have thought that the rowwise function would pipe the individual rows, but apparently not (see below). How can I use the rowwise function?
library(tidyverse)
s <- tibble(rows = seq(from = 250, to = 5000, by = 250)) #my original dataset has 400K rows...
s$num <- map(s$rows, ~ rnorm(.x * 6))
s$num <-
map(s$num, ~ replace(.x, sample(1:length(.x), size = length(.x) / 20), 99))
s$mat <- map(s$num, ~ as_data_frame(matrix(.x, ncol = 6)))
help_an <- function(vec) {
browser()
return(!any(vec == 99))
}
help_dp_t <- function(df) {
clo1 <- proc.time()
a <- as_data_frame(t(df)) %>% summarise_all(help_an)
df2 <- filter(df, t(a)[, 1])
b <- tibble(time = (proc.time() - clo1)[3], df = list(df2))
return(b)
}
s$dplyr <- map(s$mat, ~ dplyr::mutate(help_dp_t(.x)))
help_lap <- function(df) {
clo1 <- proc.time()
a_base <- df[apply(df, 1, function(x)
! any(x == 99)), ]
b <- tibble(time = (proc.time() - clo1)[3], df = list(a_base))
return(b)
}
s$lapply <- map(s$mat, ~ mutate(help_lap(.x)))
s$equal_dplyr_lapply <-
map2_lgl(s$dplyr, s$lapply, ~ all.equal(.x$df, .y$df))
s$dplyr_time <- map_dbl(s$dplyr, "time")
s$lapply_time <- map_dbl(s$lapply, "time")
ggplot(gather(s, ... = c(7, 8)), aes(x = rows, y = value, color = key)) +
geom_line()
I tried the following with rowwise, but the rowwise pipe does not send a vector, but the entire df to the help_an function.
help_dp_r <- function(df) {
clo1 <- proc.time()
df2 <-
df %>% rowwise() %>% mutate(cond = help_an(.)) ### . is not passed on as a vector, but the entire df??
b <- tibble(time = (proc.time() - clo1)[3], df = list(df2))
}
s$dplyr_r <- map(s$mat, ~ dplyr::mutate(help_dp_r(.x)))
I have a data frame with I obsevations, and each observation belongs to one of g categories.
set.seed(9782)
I <- 500
g <- 10
library(dplyr)
anon_id <- function(n = 1, length = 12) {
randomString <- c(1:n)
for (i in 1:n)
{
randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
length, replace = TRUE),
collapse = "")
}
return(randomString)
}
df <- data.frame(id = anon_id(n = I, length = 16),
group = sample(1:g, I, T))
I want to randomly assign each observation to one of J "urns", given some vector of probabilities p. That is the probability of being assign to urn J=1 is p[1]. The added complexity is that I want to do this block by block.
If I ignore the blocks, I can do this easily:
J <- 3
p <- c(0.25, 0.5, 0.25)
df1 <- df %>% mutate(urn = sample(x = c(1:J), size = I, replace = T, prob = p))
I thought about this method to do it by "block"
# Block randomization
randomize_block <- function(g) {
df1 <- df %>% filter(group==g)
size <- nrow(df1)
df1 <- df1 %>% mutate(urn = sample(x = c(1:J),
size = size,
replace = T,
prob = p))
return(df1)
}
df2 <- lapply(1:g, randomize_block)
df2 <- data.table::rbindlist(df2)
Is there a better way?
Not sure if this is better, but here is a base R technique with data.frame df, that has group name "group" as well as urn assignments 1:J with assignment probabilities in vector p of length J.
# get urn assignment
urnAssignment <- lapply(unique(df$group),
function(i) sample(1:J, nrow(df[group==i,]), replace =T, prob=p))
# get a list that collects position of observations
obsOrder <- lapply(unique(df$group),
function(i) which(df$group == i))
df$urnAssignment <- unlist(urnAssignment)[unlist(obsOrder)]
randomizr::block_ra does exactly what you want.
library(randomizr)
library(janitor) #just for the tabyl function
block_rand <- as.tibble(randomizr::block_ra(blocks = df$group, conditions = c("urn_1","urn_2","urn_3")))
df2 <- as.tibble(bind_cols(df, block_rand))
df2 %>% janitor::tabyl(group, value)
This does the trick using dplyr:
randomize <- function(data, groups=2, block_id = NULL, p=NULL, seed=9782) {
if(is.null(p)) p <- rep(1/groups, groups)
if(is.null(block_id)){
df1 <- data %>%
mutate(Treatment = sample(x = c(1:groups),
size = n(),
replace = T,
prob = p))
return(df1)
}else{
df1 <- data %>% group_by_(block_id) %>%
mutate(Treatment = sample(x = c(1:groups),
size = n(),
replace = T,
prob = p))
}
}
df1 <- randomize(data = df, groups = J, block_id = "group", p = p, seed = 9782)