Apply function between lists of data frames - r

I have 2 lists (my.listA and my.listB) in R including 3 data frames each:
# Three 3-row data frames with numeric columns x and y.
da1 <- data.frame(x = c(1, 2, 3), y = c(4, 5, 6))
da2 <- data.frame(x = c(3, 2, 1), y = c(6, 5, 4))
da3 <- data.frame(x = c(5, 4, 1), y = c(8, 5, 7))
my.listA <- list(da1, da2, da3)
# Three single-cell data frames; column z holds the multiplier paired with the
# data frame at the same position in my.listA.
db1 <- data.frame(z = 2)
db2 <- data.frame(z = 3)
db3 <- data.frame(z = 4)
my.listB <- list(db1, db2, db3)
I am trying to obtain a new list (my.listAB) so that it includes 3 data frames showing the element by element product of the data frames in my.listA and my.listB paired according to the number at the end of the data frames' names, that is, the product of elements in da1 by elements in db1, the product of da2 by db2 and the product of da3 by db3.
This would be my desired result:
# Expected output: one data frame per pair, holding the element-wise products.
dab1 <- data.frame(x = c(2, 4, 6), y = c(8, 10, 12))
dab2 <- data.frame(x = c(9, 6, 3), y = c(18, 15, 12))
dab3 <- data.frame(x = c(20, 16, 4), y = c(32, 20, 28))
my.listAB <- list(dab1, dab2, dab3)
I tried the following, but it did not work:
# Attempt from the question, kept as posted (it does not produce the desired list).
# my.listAB is reassigned on every pass instead of being filled at position i,
# and my.listB[[i]] is a whole data frame rather than the value in its z column.
for (i in 1:3) {
my.listAB <- my.listA[[i]]*my.listB[[i]]
};
Ideally someone could guide me towards a solution using the lapply function?
Many thanks!

You can use
# Multiply each data frame in my.listA by the scalar stored in column z of the
# data frame at the same position in my.listB. seq_along() keeps the call valid
# for lists of any length (including empty ones), not just three elements.
l <- lapply(seq_along(my.listA), function(x) my.listA[[x]] * my.listB[[x]]$z)
or
# Same computation with an explicit loop. The result list is preallocated and
# the index comes from seq_along(), so nothing is hard-coded to three elements.
l <- vector("list", length(my.listA))
for (x in seq_along(my.listA)) {
  l[[x]] <- my.listA[[x]] * my.listB[[x]]$z
}

In addition to the lapply and for loop options suggested by @lukeA in the comments, you could also try Map
# Map() calls `*` on the i-th data frame of my.listA and the i-th value of
# unlist(my.listB), returning the products as a list.
r1 <- Map(`*`, my.listA,unlist(my.listB))
# Compare with my.listAB (the list built from dab1, dab2 and dab3).
identical(r1, my.listAB)
#[1] TRUE

Related

How to add many data frame columns efficiently in R

I need to add several thousand columns to a data frame. Currently, I have a list of 93 lists, where each of the embedded lists contains 4 data frames, each with 19 variables. I want to add each column of all those data frames to an outside file. My code looks like:
# Names of the 93 variables; each one is used as a column-name prefix via vars[v].
# NOTE(review): 'hdd45_F0' appears twice in the hdd group (second and last entry);
# presumably the last one is a typo for another threshold. Confirm, because both
# entries would build the same output column names.
vars <- c('tmin_F','tavg_F','tmax_F','pp','etr_grass','etr_alfalfa','vpd','rhmin','rhmax','dtr_F','us','shum','pp_def_grass','pp_def_alfalfa','rw_tot','fdd28_F0','fdd32_F0','fdd35_F0',
'fdd356_F0','fdd36_F0','fdd38_F0','fdd39_F0','fdd392_F0','fdd40_F0','fdd41_F0','fdd44_F0','fdd45_F0','fdd464_F0','fdd48_F0','fdd50_F0','fdd52_F0','fdd536_F0','fdd55_F0',
'fdd57_F0','fdd59_F0','fdd60_F0','fdd65_F0','fdd70_F0','fdd72_F0','hdd40_F0','hdd45_F0','hdd50_F0','hdd55_F0','hdd57_F0','hdd60_F0','hdd65_F0','hdd45_F0',
'cdd45_F0','cdd50_F0','cdd55_F0','cdd57_F0','cdd60_F0','cdd65_F0','cdd70_F0','cdd72_F0',
'gdd32_F0','gdd35_F0','gdd356_F0','gdd38_F0','gdd39_F0','gdd392_F0','gdd40_F0','gdd41_F0','gdd44_F0','gdd45_F0',
'gdd464_F0','gdd48_F0','gdd50_F0','gdd52_F0','gdd536_F0','gdd55_F0','gdd57_F0','gdd59_F0','gdd60_F0','gdd65_F0','gdd70_F0','gdd72_F0',
'gddmod_32_59_F0','gddmod_32_788_F0','gddmod_356_788_F0','gddmod_392_86_F0','gddmod_41_86_F0','gddmod_464_86_F0','gddmod_48_86_F0','gddmod_50_86_F0','gddmod_536_95_F0',
'sdd77_F0','sdd86_F0','sdd95_F0','sdd97_F0','sdd99_F0','sdd104_F0','sdd113_F0')
# One value per variable (15 explicit entries followed by 78 repeats of 15);
# not referenced by the code shown here.
windows <- c(15,15,15,29,29,29,15,15,15,15,29,29,29,29,15,rep(15,78))
# Labels of the four nested data frames; used as part of each new column name.
perc_list <- c('obs','smoothed_obs','windowed_obs','smoothed_windowed_obs')
# Percentile labels, one per source column; used as the suffix of each new column name.
percs <- c('00','02','05','10','20','25','30','33','40','50','60','66','70','75','80','90','95','98','100')
# Column positions 1..19 to read from each nested data frame.
vcols <- seq(1,19,1)
# For every variable (93), nested data frame (4) and percentile (19), add one
# column to normals_1981_2010 named <var>_daily<list>_perc<percentile>, taken
# from column vcols[p] of percents[[v]][[pl]]. That is 93 * 4 * 19 = 7068
# separate mutate() calls, each applied to the data frame as grown so far.
for (v in 1:93){
for (pl in 1:4){
for (p in 1:19){
normals_1981_2010 <- normals_1981_2010 %>% mutate(!!paste0(vars[v],'_daily',perc_list[pl],'_perc',percs[p]) := percents[[v]][[pl]][,vcols[p]])}}
# Progress indicator: prints the index of the variable just finished.
print(v)}
The code starts fast, but very quickly slows to a crawl as the outside data frame grows in size. I didn't realize this would be a problem. How do I add all these extra columns efficiently? Is there a better way to do this than by using mutate? I've tried add_column, but that does not work. Maybe it doesn't like the loop or something.
Your example is not reproducible as is (the object normals_1981_2010 doesn't exist but is called within the loop), so I am unsure I understood your question.
If I did though, this should work:
First, I am reproducing your dataset structure, except that instead of 93 lists, I set it up to have 5; instead of 4 nested tables within each, I set it up to have 3 tables; and instead of each table having 19 columns, I set them up to have 3 columns.
# Build a small stand-in for the real data: a list of 5 elements, each itself a
# list of 3 twelve-row data frames whose column names record their position.
df_list <- lapply(1:5, function(outer_idx) {
  lapply(1:3, function(inner_idx) {
    tbl <- data.frame(a = 1:12,
                      b = letters[1:12],
                      c = month.abb[1:12])
    names(tbl) <- paste0(names(tbl), "_nest_", outer_idx, "subnest_", inner_idx)
    tbl
  })
})
df_list # preview the structure.
Then, answering your question:
# Now, how to bind everything together:
# Step 1: within each outer element, place its nested tables side by side.
# Iterating over df_list itself (rather than a fixed 1:5) means the same code
# works unchanged for a list of any length, e.g. the 93 elements of the real data.
df_out <- lapply(df_list, bind_cols)
# Final step: place the per-element wide tables side by side.
df_out <- bind_cols(df_out)
ncol(df_out) # Here I have 5*3*3 = 45 columns, but you will have 93*4*19 = 7068 columns
# [1] 45

How to perform selective rbind on datasets in R

I am trying to use rbind to append different datasets countrywise. The list of the datasets is
# Names of the datasets to read.
data <- c('a1','a2','a3','b1','b2','bu3','bu4','c1','c3')
# Country prefixes. Every entry must be a quoted string: a bare `c` would insert
# the function c() and turn the whole vector into a list.
code <- c('a','b','bu','c')
The structure of the data is somewhat like this -
# Country a, year 1
countrya1 <- rep("a", 3)
yeara1 <- rep("1", 3)
inca1 <- c("1", "2", "3")
a1 <- data.frame(countrya1, yeara1, inca1)
# Country a, year 2
countrya2 <- rep("a", 3)
yeara2 <- rep("2", 3)
inca2 <- c("1", "4", "3")
a2 <- data.frame(countrya2, yeara2, inca2)
# Country b, year 1
countryb1 <- rep("b", 3)
yearb1 <- rep("1", 3)
incb1 <- c("1", "2", "7")
b1 <- data.frame(countryb1, yearb1, incb1)
# Country b, year 2
countryb2 <- rep("b", 3)
yearb2 <- rep("2", 3)
incb2 <- c("6", "2", "3")
b2 <- data.frame(countryb2, yearb2, incb2)
The code that I used to combine all the datasets is as follows -
# Read every file named in `data` and stack the results row-wise.
# lapply() visits each element of `data` (a loop over `length(data)` alone
# would only visit the last one), and a single rbind call avoids re-copying a
# growing data frame on every iteration. An empty `data` yields NULL.
df <- do.call(rbind, lapply(data, read.dta))
This binds all the datasets together in df.
Is there a way to bind a1,a2,a3 together and b1,b2,b3 together and so on. In short, I want to bind the datasets by 'code'. Is there a way to do it in R?
Thanks in advance for the help.
We can split the 'data' by creating a group without the numbers and then with read.dta, read the datasets and rbind the datasets of same name
# Group the dataset names by prefix (the name with its digits removed), then
# read each group's files and stack them into one data frame per prefix.
prefix <- sub("\\d+", "", data)
lst <- lapply(
  split(data, prefix),
  function(files) do.call(rbind, lapply(files, read.dta))
)
If we want to use the 'code', then use grep by looping through the 'code'
# Anchor each code at the start of the name and allow only digits after it, so
# that 'b' selects b1 and b2 but not bu3 or bu4 (an unanchored pattern matches
# every name that merely contains the code).
lapply(code, function(x) {
  files <- grep(paste0("^", x, "[0-9]+$"), data, value = TRUE)
  do.call(rbind, lapply(files, read.dta))
})
data
# Country prefixes used to group the dataset names; each entry is a quoted string.
code <- c('a','b','bu','c')

Building forvalues loops in R

[Working with R 3.2.2]
I have three data frames with the same variables. I need to modify the value of some variables and change the name of the variables (rename the columns). Instead of doing this data frame by data frame, I would like to use a loop.
This is the code I want to run:
# Scale the three measurement columns of every data frame by 1/10
vlist <- c("var1", "var2", "var3")
dataframe0[vlist] <- dataframe0[vlist] / 10
dataframe1[vlist] <- dataframe1[vlist] / 10
dataframe2[vlist] <- dataframe2[vlist] / 10
# Give the measurement columns descriptive names
names(dataframe0)[names(dataframe0) == "var1"] <- "temp_min"
names(dataframe0)[names(dataframe0) == "var2"] <- "temp_max"
names(dataframe0)[names(dataframe0) == "var3"] <- "prep"
names(dataframe1)[names(dataframe1) == "var1"] <- "temp_min"
names(dataframe1)[names(dataframe1) == "var2"] <- "temp_max"
names(dataframe1)[names(dataframe1) == "var3"] <- "prep"
names(dataframe2)[names(dataframe2) == "var1"] <- "temp_min"
names(dataframe2)[names(dataframe2) == "var2"] <- "temp_max"
names(dataframe2)[names(dataframe2) == "var3"] <- "prep"
I know the logic to do it with programs like Stata, with a forvalues loop:
#Change the values of the variables
forvalues i=0/2 {
dataframe`i'[,vlist] <- dataframe`i'[,vlist]/10
#Change the name of the variables
colnames(dataframe`i')[colnames(dataframe`i')=="var1"] <- "temp_min"
colnames(dataframe`i')[colnames(dataframe`i')=="var2"] <- "temp_max"
colnames(dataframe`i')[colnames(dataframe`i')=="var3"] <- "prep"
}
But, I am not able to reproduce it in R. How should I proceed? Thanks in advance!
I would work with a list of data frames; you can still 'split' it afterwards if really needed:
# Example data: three identical data frames collected in a list.
df1 <- data.frame("id"=1:10,"var1"=11:20,"var2"=11:20,"var3"=11:20,"test"=1:10)
df2 <- df1
df3 <- df1
dflist <- list(df1,df2,df3)
for (i in seq_along(dflist)) {
  # Scale the 'test' column of the i-th data frame held in dflist.
  # NOTE(review): the question scales var1..var3; swap in those names if that
  # is the intent.
  dflist[[i]]['test'] <- dflist[[i]]['test'] / 10
  # Rename var1..var3 in one pass; relies on the three columns appearing in
  # this order.
  colnames(dflist[[i]])[colnames(dflist[[i]]) %in% c('var1', 'var2', 'var3')] <- c('temp_min', 'temp_max', 'prep')
  # eventually reassign df1-3 to their list value:
  # assign(paste0("df",i),dflist[[i]])
}
The interest of using a list is that you can access them a little more easily in a programmatic way.
I did change your code from 3 calls to only one, as colnames give a vector you can subset it and replace in one pass, this is assuming your var1 to var3 are always in the same order.
Addendum: if you want a single dataset at end you can use do.call(rbind,dflist) or with data.table package rbindlist(dflist).
More details on working with list of data.frames in Gregor's answer here

How to add/merge dataframes in a list

I used the following for loop to read 7 csv files and add them to a list.
# Read file1.csv ... file7.csv and collect the resulting data frames in a list.
# lapply() builds the list in one step, so no separate position counter is needed.
# NOTE: the variable keeps the name `list` because the surrounding text refers to
# it by that name; calls to the function list() still work after this assignment.
list <- lapply(1:7, function(i) {
  read.csv(paste0("file", i, ".csv"), header = FALSE)
})
So now I have a list named "list" containing 7 dataframes, right?
Each of the 8 dataframes contain the same three columns (NAME, SURNAME, AGE).
I now want to add:
df <- data.frame(NAME,SURNAME,AGE) ## to each dataframe in the list.
Did that help at all? My question is, how can I achieve that for all 7 objects in the list automatically!
If the 'lst' has seven data.frames and want to 'rbind' the 8th dataset to each of the datasets in the list, we can use Map
# list(d1) has length one, so it is recycled: d1 is appended to every element of lst.
Map(rbind, lst, list(d1))
Or using lapply
# Evaluates rbind(x, d1) for each element x of lst.
lapply(lst, rbind, d1)
Update
If the 'lst' is of length 8, and wants to rbind the first 7 elements with the dataset in the 8th element, then you can just do
# lst[-8] drops the 8th element; lst[8] is a length-one list that is recycled
# against the remaining elements.
Map(rbind, lst[-8], lst[8])
data
# Seven 10 x 3 data frames of values drawn from 0..10 (seeded for reproducibility).
set.seed(24)
lst <- lapply(1:7, function(i) {
  draws <- sample(0:10, 3*10, replace = TRUE)
  as.data.frame(matrix(draws, ncol = 3))
})
# The data frame to append: 10 x 3 values drawn from 1..20.
set.seed(49)
d1 <- as.data.frame(
  matrix(sample(1:20, 3*10, replace = TRUE), ncol = 3)
)
Or, if the ultimate goal is to just ensure all 8 CSV files make it into one data.frame:
# generate some sample files
# Build all eight file names up front; sprintf() is vectorised over 1:8.
files <- sprintf("iris%d.csv", 1:8)
# Write one copy of iris to each file name (one path per write.csv call).
for (f in files) {
  write.csv(iris, f, row.names = FALSE)
}
# make one happy data frame
do.call(rbind, lapply(files, read.csv))

merge data frames based on non-identical values in R

I have two data frames. First one looks like
# Two-row table of loci; Pos holds "<chromosome>:<start>..<end>".
dat <- data.frame(matrix(NA, nrow = 2, ncol = 3))
colnames(dat) <- c("Locus", "Pos", "NVAR")
dat[1, ] <- c("ACTC1-001_1", "chr15:35087734..35087734", "1")
dat[2, ] <- c("ACTC1-001_2 ", "chr15:35086890..35086919", "2")
where chr15:35086890..35086919 indicates all the numbers within this range.
The second looks like:
# Two-row table of variants; VAR holds "<chromosome>:<position>".
dat2 <- data.frame(matrix(NA, nrow = 2, ncol = 3))
colnames(dat2) <- c("VAR", "REF.ALT", " FUNC")
dat2[1, ] <- c("chr1:116242719", "T/A", "intergenic")
dat2[2, ] <- c("chr1:116242855", "A/G", "intergenic")
I want to merge these by the values in dat$Pos and dat2$VAR. If the single number in a cell in dat2$VAR is contained within the range of a cell in dat$Pos, I want to merge those rows. If this occurs more than once (dat2$VAR in more than one range in dat$Pos), I want it merged each time. What's the easiest way to do this?
Here is a solution, quite short but not particularly efficient so I would not recommend it for large data. However, you seemed to indicate your data was not that large so give it a try and let me know:
library(plyr)
# Expand every row of dat into one row per position covered by its Pos range,
# tagging each with a VAR key of the form "<chromosome>:<position>".
exploded.dat <- adply(dat, 1, function(row) {
  chr_and_range <- strsplit(row$Pos, ":")[[1]]
  bounds <- strsplit(chr_and_range[2], "..", fixed = TRUE)[[1]]
  positions <- seq(from = bounds[1], to = bounds[2])
  data.frame(VAR = paste(chr_and_range[1], positions, sep = ":"), row)
})
# Join on the generated key, pairing each dat2 row with every expanded locus row
# that shares its VAR value.
merge(dat2, exploded.dat, by = "VAR")
If it is too slow or uses too much memory for your needs, you'll have to implement something a bit more complex and this other question looks like a good starting point: Merge by Range in R - Applying Loops.
Please try this out and let us know how it works. Without a larger data set it is a bit hard to troubleshoot. If for whatever reason it does not work, please share a few more rows from your data tables (specifically ones that would match).
SPLICE THE DATA
# Drop the "chrN:" prefix from each Pos entry, keeping the "start..end" part.
range.strings <- do.call(rbind, strsplit(dat$Pos, ":"))[, 2]
# Split "start..end" into a two-column character matrix (start, end).
range.strings <- do.call(rbind, strsplit(range.strings, "\\.\\."))
mins <- as.numeric(range.strings[, 1])
maxs <- as.numeric(range.strings[, 2])
# Position part of each dat2$VAR entry. Base strsplit() is used here, as on the
# lines above, so no additional package has to be loaded.
d2.vars <- as.numeric(do.call(rbind, strsplit(dat2$VAR, ":"))[, 2])
# seq_along() numbers the elements 1..n even when there is only one element
# (seq() on a single number would count up to that number instead).
names(d2.vars) <- seq_along(d2.vars)
FIND THE MATCHES
# row number is the row in dat
# col number is the row in dat2
# Both range bounds are inclusive, so a position equal to the start of a range
# (including single-position ranges where start == end) counts as a match.
# outer() always returns a matrix, even when dat or dat2 has a single row.
# NOTE(review): only positions are compared; the chromosome prefix is not
# checked here.
matches <- outer(mins, d2.vars, "<=") & outer(maxs, d2.vars, ">=")
MERGE
# Pair every dat row with every dat2 row whose position falls inside its range.
# which(..., arr.ind = TRUE) lists the (row, col) index of each TRUE cell, so a
# position lying in several ranges yields one merged row per range.
# (Assigning to dat$VAR inside an anonymous function would only change a local
# copy, so the pairs are built by indexing instead.)
hits <- which(matrix(matches, nrow = nrow(dat)), arr.ind = TRUE)
merged <- cbind(
  dat[hits[, "row"], , drop = FALSE],
  dat2[hits[, "col"], , drop = FALSE]
)
rownames(merged) <- NULL
merged

Resources