for inside foreach parallel not populating a dataframe in R - r

I am having an issue populating a foreach. Suppose I have the following dataframe, the consequence of this dataframe is exactly what my real one looks like:
Elec2 <- rep(rep(rep(27:1, each = 81), each = 18), times = 100)
Ind <- rep(1:18, times = 218700)
Cond <- rep(1:3, times = 1312200)
Trial <- rep(rep(1:100, each = 2187), each = 18)
DVAR <- rbeta (3936600, 0.7, 1,5)
data <- cbind(DVAR, Ind, Cond, Trial, Elec1, Elec2)
I am trying the following code of parallelisation:
distinct_pairs <-
data %>%
select(Elec1, Elec2) %>%
distinct()
cl <- makeCluster(2) #values here are adjusted to cores, used 2 for the example
registerDoParallel(cl)
output <- foreach (i = 1:nrow(distinct_pairs), .packages='glmmTMB',
.combine = rbind,
.errorhandling="pass",
.verbose = T) %dopar% {
dep <- distinct_pairs[i,]
dat1 <- subset(data, dep$Elec1 == data$Elec1 & dep$Elec2 == data$Elec2)
df[i,]$Elec1 <- dep[i,]$Elec1
df[i,]$Elec2 <- dep[i,]$Elec2
for (j in 1:18) { #By individual
dat2 <- subset(dat1, dat1$Ind==j)
model <- glmmTMB(DVAR ~ Cond, family=beta_family('logit'), data=dat2)
results <- summary(model)
est <- results$coefficients$cond[2,1]
ste <- results$coefficients$cond[2,2]
df[j,] <- c(est,ste)
}
return(df)
}
output <- as.data.frame(output, row.names = FALSE)
As you can see I am expecting a dataframe with the results of the iterations est & ste plus the identification of the electrodes Elec1 & Elec2. If I run the lines independently one by one it seems to work fine, but i cannot make it work the way I expect.
First loop takes a pair of electrodes, every row in distinct_pairs is a pair of electrodes, numbered from 1 to 27 for Elec1 and for Elec2.
Problem is I am unable to get the data of the for loop written in the final output dataframe.
I am sure the problem is pretty basic, but I appreciate any insight as I seem to be missing something.
Thanks!
[[UPDATE WITH SOLUTION]]
In case anyone is interested, here is the solution.
output <- foreach (i = 1:10, .packages='glmmTMB',
.combine = rbind,
.errorhandling="pass",
.inorder = TRUE,
.verbose = T) %dopar% {
dat1 <- subset(data, distinct_pairs[i,]$Elec1 == data$Elec1 & distinct_pairs[i,]$Elec2 == data$Elec2)
df <- data.frame('Elec1'=rep(distinct_pairs[i,]$Elec1,18),'Elec2'=rep(distinct_pairs[i,]$Elec2,18),'est'=rep(NA,18),'ste'=rep(NA,18))
for (j in 1:18) {
dat2 <- subset(dat1, dat1$Ind==j)
model <- glmmTMB(DVAR ~ Condition, family=beta_family('logit'), data=dat2)
results <- summary(model)
est <- results$coefficients$cond[2,1]
ste <- results$coefficients$cond[2,2]
df[j,c('est','ste')] <- c(est,ste)
}
return(df)
}
Which returns exactly what I was looking for:
> head(output)
Elec1 Elec2 est ste
1 1 1 0.034798615 0.03530296
2 1 1 -0.005363760 0.03392442
3 1 1 -0.017349123 0.03404430
4 1 1 -0.034819068 0.03196078
5 1 1 0.002301062 0.03163825
6 1 1 0.003575131 0.03452420

I am definetly not sure if I got the problem, could you also provide an Elec1 in your data Example?
An idea:
Foreach might not find df, you could create the data frame at the beginning of your loop with something like
df <- data.frame('Elec1'=rep(NA,18),'Elec2'=rep(NA,18),'est'=rep(NA,18),'ste'=rep(NA,18))
maybe add then below in the for loop: df[j,c('est','ste')] <- c(est,ste)

Related

Combining the Results of Several Loops Together

I wrote the following code that generates a single random number, subtracts this random number from some constant, records this result - and then repeats this process 100 times:
# 1 random number
results <- list()
for (i in 1:100) {
iteration = i
number_i_1 = mean(rnorm(1,10,2))
difference_i_1 = 10 - number_i_1
results_tmp = data.frame(iteration, number_i_1, difference_i_1)
results[[i]] <- results_tmp
}
results_df_1 <- do.call(rbind.data.frame, results)
To do this for 2 random numbers and 3 random numbers - the above code only needs to be slightly modified:
# 2 random numbers
results <- list()
for (i in 1:100) {
iteration = i
number_i_2 = mean(rnorm(2,10,2))
difference_i_2 = 10 - number_i_2
results_tmp = data.frame( number_i_2, difference_i_2)
results[[i]] <- results_tmp
}
results_df_2 <- do.call(rbind.data.frame, results)
# 3 random numbers
results <- list()
for (i in 1:100) {
iteration = i
number_i_3 = mean(rnorm(3,10,2))
difference_i_3 = 10 - number_i_3
results_tmp = data.frame( number_i_3, difference_i_3)
results[[i]] <- results_tmp
}
results_df_3 <- do.call(rbind.data.frame, results)
My Question: I would like to repeat this general process 20 times and store all the results in a single data frame. For example (note: the actual data frame would have 20 pairs of such columns):
final_frame = cbind(results_df_1 , results_df_2, results_df_3)
iteration number_i_1 difference_i_1 number_i_2 difference_i_2 number_i_3 difference_i_3
1 1 12.534059 -2.5340585 9.623655 0.3763455 9.327020 0.67298023
2 2 9.893728 0.1062721 10.135650 -0.1356502 10.037904 -0.03790384
3 3 8.895232 1.1047680 9.848402 0.1515981 7.588531 2.41146943
4 4 11.648550 -1.6485504 8.509288 1.4907120 10.294153 -0.29415334
5 5 9.045034 0.9549660 9.351834 0.6481655 11.084067 -1.08406691
6 6 9.230139 0.7698612 8.163164 1.8368356 7.846356 2.15364367
And then make two mean files (note: each of these two files would also have 20 rows):
mean_numbers = data_frame(iterations = c(1:3), mean_number = c(mean(final_frame$number_i_1),mean(final_frame$number_i_2), mean(final_frame$number_i_3) ) )
mean_differences = data_frame(iterations = c(1:3), mean_differences = c(mean(final_frame$difference_i_1),mean(final_frame$difference_i_1), mean(final_frame$difference_i_1) ) )
Can someone please show me how to do this?
Your initial objective can be simplified like this:
results <- list()
for (i in seq_len(100)) {
#Samples from 1 to 20 numbers, averages them
a <- unlist(lapply(seq_len(20), function(x) mean(rnorm(x, 10, 2))))
#Creates names for this vector
names(a) <- paste0(rep("number_i_", 20), 1:20)
#differences
b <- 10-a
#and it's names
names(b) <- paste0(rep("diff_i_", 20), 1:20)
#creating 40c df (there are better structures for this specially if the final outcome is to separate them)
c <- as.data.frame(cbind(rbind(a), rbind(b)))
#storing in list
results[[i]] <- c
}
results_df_3 <- do.call(rbind.data.frame, results)
There are even more elegant ways to write this but it will be enough for you to get there.
The format in your last section does not make sense to what you want to achieve. If it is to create a summary of the means for each number of samples taken, like this:
mockfdf <- data.frame(nsamp = 1:20, meanmeans = rnorm(20))#summarized means go here
mockddf <- data.frame(nsamp = 1:20, diffmeans = rnorm(20))#summarized means go here
Then you can easily separate the dataframes for differences and means and process them a lot better by using separate dataframes for each.

How to create new variable at the end of each loop iteration in R

I am trying to create a variable that is a function of 4 other variables. I have the following code:
set.seed(123)
iter <- 1000
group <- c('A','B','C','D','E','F')
for (i in group) {
df <- df1[df1$group == i,]
x_ <- vector(mode="numeric", length=1000)
assign(eval(paste0("X_", i)), globalenv()) #This is the issue
a <- rnorm(iter, mean=df$a, sd=df$sea)
b <- rnorm(iter, mean=df$b, sd=df$seb)
c <- rnorm(iter, mean=df$c, sd=df$sec)
z <- rnorm(iter, mean=df$zbar, sd=df$se_z)
X_[i] = (a + c*(z-df$zbar))/(-b)
}
I am unable to create a unique group-specific variable (e.g. X_A, X_B, ...) and I am unsure why the -assign( )- function is not working properly. The dataframe df1 has 6 rows (one for each group) and then the number of columns is equal to the number of variables plus a string variable for group. I am not trying to append this new variables X_[i] to the dataset I am just trying to place it in the global environment. I believe the issue lies in my assigning the placement of the variable, but it isn't generating a numeric variable X.
df1 is a dataframe with 6 observations of 9 variables containing a, sea, b, seb, c, sec, zbar, se_z. These are just the means and standard deviations of a, b, c, and z, respectively. The 9th variable is group which contains A, B, ..., F. When I use the code df <-df1[df1$group == i,] I am trying to create a unique X variable for each group entity.
Try something like this:
dynamicVariableName <- paste0("X_", i)
assign(dynamicVariableName, (a + c*(z-df$zbar))/(-b))
Alternatively to the answer from #ErrorJordan, you can write your loop like that:
set.seed(123)
iter <- 1000
group <- c('A','B','C','D','E','F')
for(i in group)
{
df <- df1[df1$group == i,]
a <- rnorm(iter, mean=df$a, sd=df$sea)
b <- rnorm(iter, mean=df$b, sd=df$seb)
c <- rnorm(iter, mean=df$c, sd=df$sec)
z <- rnorm(iter, mean=df$zbar, sd=df$se_z)
X <- (a + c*(z-df$zbar))/(-b)
assign(paste0("X_",i),X,.GlobalEnv)
}
As suggested by #MrFlick, you can also stored your data into a list, to do so you can just modify your loop to get:
set.seed(123)
iter <- 1000
group <- c('A','B','C','D','E','F')
X = vector("list",length(group))
names(X) = group
for(i in 1:length(group))
{
df <- df1[df1$group == group[i],]
a <- rnorm(iter, mean=df$a, sd=df$sea)
b <- rnorm(iter, mean=df$b, sd=df$seb)
c <- rnorm(iter, mean=df$c, sd=df$sec)
z <- rnorm(iter, mean=df$zbar, sd=df$se_z)
X[[i]] <- (a + c*(z-df$zbar))/(-b)
}
df1 dataframe
df1 = data.frame(a = c(1:6),
b = c(1:6),
c = c(1:6),
zbar = c(1:6),
sea = rep(1,6),
seb = rep(1,6),
sec = rep(1,6),
se_z = rep(1,6),
group = group)
It's a little hard to parse what you want to do, but I'm assuming it's something like
for each value in group make an object (in the global env) called X_A, X_B, ...
for each one of those objects, assign it the value (a + c*(z-df$zbar))/(-b)
I think this should do that for you:
set.seed(123)
group <- c('A','B','C','D','E','F')
for (i in group) {
df <- df1[df1$group == i,]
a <- rnorm(iter, mean=df$a, sd=df$sea)
b <- rnorm(iter, mean=df$b, sd=df$seb)
c <- rnorm(iter, mean=df$c, sd=df$sec)
z <- rnorm(iter, mean=df$zbar, sd=df$se_z)
assign(paste0("X_", i), (a + c*(z-df$zbar))/(-b), globalenv())
}
Note that in the code example you gave, the command iter <- 1000 has no effect, and the command x_ <- vector(mode="numeric", length=1000) also has no effect. By that I mean, you make those objects, but never subsequently use them in any further computation. If those commands should do something meaningful I'll need your help in explaining their intended purpose.

Vectorization of a nested for-loop that inputs all paired combinations

I thought that the following problem must have been answered or a function must exist to do it, but I was unable to find an answer.
I have a nested loop that takes a row from one 3-col. data frame and copies it next to each of the other rows, to form a 6-col. data frame (with all possible combinations). This works fine, but with a medium sized data set (800 rows), the loops take forever to complete the task.
I will demonstrate on a sample data set:
Sdat <- data.frame(
x = c(10,20,30,40),
y = c(15,25,35,45),
ID =c(1,2,3,4)
)
compar <- data.frame(matrix(nrow=0, ncol=6)) # to contain all combinations
names(compar) <- c("x","y", "ID", "x","y", "ID")
N <- nrow(Sdat) # how many different points we have
for (i in 1:N)
{
for (j in 1:N)
{
Temp1 <- Sdat[i,] # data from 1st point
Temp2 <- Sdat[j,] # data from 2nd point
C <- cbind(Temp1, Temp2)
compar <- rbind(C,compar)
}
}
These loops provide exactly the output that I need for further analysis. Any suggestion for vectorizing this section?
You can do:
ind <- seq_len(nrow(Sdat))
grid <- expand.grid(ind, ind)
compar <- cbind(Sdat[grid[, 1], ], Sdat[grid[, 2], ])
A naive solution using rep (assuming you are happy with a data frame output):
compar <- data.frame(x = rep(Sdat$x, each = N),
y = rep(Sdat$y, each = N),
id = rep(1:n, each = N),
x1 = rep(Sdat$x, N),
y1 = rep(Sdat$y, N),
id_1 = rep(1:n, N))

creating a function for processing my dataframe calculations

I am doing systematic calculations for my created dataframe. I have the code for the calculations but I would like to:
1) Wite it as a function and calling it for the dataframe I created.
2) reset the calculations for next ID in the dataframe.
I would appreciate your help and advice on this.
The dataframe is created in R using the following code:
#Create a dataframe
dosetimes <- c(0,6,12,18)
df <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
#Add back dose information
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
#Time-dependent covariate
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
#The calculations are done in a for-loop. Here is the code for it:
#values needed for the calculation
C <- 2
V <- 10
k <- C/V
#I would like this part to be written as a function
for(i in 2:nrow(df))
{
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
The other thing is that the previous code assumes the subject ID=1 for all time points. If subject ID=2 when the WT (weight) changes to 120. How can I reset the calculations and make it automated for all subject IDs in the dataframe? In this case the original dataframe would be like this:
#code:
rm(list=ls(all=TRUE))
dosetimes <- c(0,6,12,18)
df <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
df$ID[(df$WT>=120)==T] <- 2
df$TIME[df$ID==2] <- c(seq(0,20,1))
Thank you in advance!
In general, when doing calculations on different subject's data, I like to split the dataframe by ID, pass the vector of individual subject data into a for loop, do all the calculations, build a vector containing all the newly calculated data and then collapse the resultant and return the dataframe with all the numbers you want. This allows for a lot of control over what you do for each subject
subjects = split(df, df$ID)
forResults = vector("list", length=length(subjects))
# initialize these constants
C <- 2
V <- 10
k <- C/V
myFunc = function(data, resultsArray){
for(k in seq_along(subjects)){
df = subjects[[k]]
df$A1 = 100 # I assume this should be 100 for t=0 for each subject?
# you could vectorize this nested for loop..
for(i in 2:nrow(df)) {
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
# you can add all sorts of other calculations you want to do on each subject's data
# when you're done doing calculations, put the resultant into
# the resultsArray and we'll rebuild the dataframe with all the new variables
resultsArray[[k]] = df
# if you're not using RStudio, then you want to use dev.new() to instantiate a new plot canvas
# dev.new() # dont need this if you're using RStudio (which doesnt allow multiple plots open)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
}
# collapse the results vector into a dataframe
resultsDF = do.call(rbind, resultsArray)
return(resultsDF)
}
results = myFunc(subjects, forResults)
Do you want this:
ddf <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
myfn = function(df){
dosetimes <- c(0,6,12,18)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
#Add back dose information
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
#Time-dependent covariate
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
#The calculations are done in a for-loop. Here is the code for it:
#values needed for the calculation
C <- 2
V <- 10
k <- C/V
#I would like this part to be written as a function
for(i in 2:nrow(df))
{
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
}
myfn(ddf)
For multiple calls:
for(i in 1:N) {
myfn(ddf[ddf$ID==i,])
readline(prompt="Press <Enter> to continue...")
}

R: Row resampling loop speed improvement

I'm subsampling rows from a dataframe with c("x","y","density") columns at a variety of c("s_size","reps"). Reps= replicates, s_size= number of rows subsampled from the whole dataframe.
> head(data_xyz)
x y density
1 6 1 0
2 7 1 17600
3 8 1 11200
4 12 1 14400
5 13 1 0
6 14 1 8000
#Subsampling###################
subsample_loop <- function(s_size, reps, int) {
tm1 <- system.time( #start timer
{
subsample_bound = data.frame()
#Perform Subsampling of the general
for (s_size in seq(1,s_size,int)){
for (reps in 1:reps) {
subsample <- sample.df.rows(s_size, data_xyz)
assign(paste("sample" ,"_","n", s_size, "_", "r", reps , sep=""), subsample)
subsample_replicate <- subsample[,] #temporary variable
subsample_replicate <- cbind(subsample, rep(s_size,(length(subsample_replicate[,1]))),
rep(reps,(length(subsample_replicate[,1]))))
subsample_bound <- rbind(subsample_bound, subsample_replicate)
}
}
}) #end timer
colnames(subsample_bound) <- c("x","y","density","s_size","reps")
subsample_bound
} #end function
Here's the function call:
source("R/functions.R")
subsample_data <- subsample_loop(s_size=206, reps=5, int=10)
Here's the row subsample function:
# Samples a number of rows in a dataframe, outputs a dataframe of the same # of columns
# df Data Frame
# N number of samples to be taken
sample.df.rows <- function (N, df, ...)
{
df[sample(nrow(df), N, replace=FALSE,...), ]
}
It's way too slow, I've tried a few times with apply functions and had no luck. I'll be doing somewhere around 1,000-10,000 replicates for each s_size from 1:250.
Let me know what you think! Thanks in advance.
=========================================================================
UPDATE EDIT: Sample data from which to sample:
https://www.dropbox.com/s/47mpo36xh7lck0t/density.csv
Joran's code in a function (in a sourced function.R file):
foo <- function(i,j,data){
res <- data[sample(nrow(data),i,replace = FALSE),]
res$s_size <- i
res$reps <- rep(j,i)
res
}
resampling_custom <- function(dat, s_size, int, reps) {
ss <- rep(seq(1,s_size,by = int),each = reps)
id <- rep(seq_len(reps),times = s_size/int)
out <- do.call(rbind,mapply(foo,i = ss,j = id,MoreArgs = list(data = dat),SIMPLIFY = FALSE))
}
Calling the function
set.seed(2)
out <- resampling_custom(dat=retinal_xyz, s_size=206, int=5, reps=10)
outputs data, unfortunately with this warning message:
Warning message:
In mapply(foo, i = ss, j = id, MoreArgs = list(data = dat), SIMPLIFY = FALSE) :
longer argument not a multiple of length of shorter
I put very little thought into actually optimizing this, I was just concentrating on doing something that's at least reasonable while matching your procedure.
Your big problem is that you are growing objects via rbind and cbind. Basically anytime you see someone write data.frame() or c() and expand that object using rbind, cbind or c, you can be very sure that the resulting code will essentially be the slowest possible way of doing what ever task is being attempted.
This version is around 12-13 times faster, and I'm sure you could squeeze some more out of this if you put some real thought into it:
s_size <- 200
int <- 10
reps <- 30
ss <- rep(seq(1,s_size,by = int),each = reps)
id <- rep(seq_len(reps),times = s_size/int)
foo <- function(i,j,data){
res <- data[sample(nrow(data),i,replace = FALSE),]
res$s_size <- i
res$reps <- rep(j,i)
res
}
out <- do.call(rbind,mapply(foo,i = ss,j = id,MoreArgs = list(data = dat),SIMPLIFY = FALSE))
The best part about R is that not only is this way, way faster, it's also way less code.

Resources