Apply function on multiple lists in R - r

I have four lists each with multiple data frames.
I need to apply the same function on the lists.
How can I do this?
Sample data:
df1 <- data.frame(x = 1:3, y = letters[1:3])
df2 <- data.frame(x = 4:6, y = letters[4:6])
df3 <- data.frame(x = 7:9, y = letters[7:9])
df4 <- data.frame(x = 10:12, y = letters[10:12])
list1 <- list(df1,df2)
list2 <- list(df3,df4)
In my real data I import based on a pattern in the filename and thus my list elements will have the following names (sample data):
names(list1) <- c("./1. Data/df1.csv", "./1. Data/df2.csv")
names(list2) <- c("./1. Data/df3.csv", "./1. Data/df4.csv")
And this is one of the functions I want to run on all lists.
# Derive clean element names from the file paths stored in names(x).
#
# x: a named list whose names are file paths such as "./1. Data/df1.csv".
# Returns a character vector with the directory part and a trailing ".csv"
# extension removed (e.g. "df1"), or NULL when x has no names.
# The value is returned visibly, so it can be used as
# names(x) <- element.name(x) or passed to setNames().
element.name <- function(x) {
  paths <- names(x)
  if (is.null(paths)) {
    return(NULL)
  }
  # Anchor the pattern so only the extension at the end is stripped,
  # not a ".csv" that happens to occur in the middle of a file name.
  sub("\\.csv$", "", basename(paths))
}
which will give the desired output
names(list1) <- element.name(list1)
names(list1)
[1] "df1" "df2"
I've tried using a for loop but I end up overwriting my output, so I hope some of you can help me out, since I need to run a lot of functions on my lists.

You could create a list of your lists, and then use lapply to apply the function element.name to every list. You can use setNames to avoid problems linked to assigning names inside the function. You can then use list2env to get your lists back into the global environment.
# Bundle the lists under their own names, rename the elements of each one,
# then write the renamed lists back over list1 / list2.
setNames(list(list1, list2), c("list1", "list2")) |>
  lapply(function(x) setNames(x, element.name(x))) |>
  # envir must be given: without it list2env() fills a brand-new environment
  # and the global list1 / list2 stay unchanged.
  list2env(envir = .GlobalEnv)
output
> list1
$df1
x y
1 1 a
2 2 b
3 3 c
$df2
x y
1 4 d
2 5 e
3 6 f
> list2
$df3
x y
1 7 g
2 8 h
3 9 i
$df4
x y
1 10 j
2 11 k
3 12 l

Here is an approach using data.table::fread
library(data.table)
# create dummy CSVs -------------------------------------------------------
DT1 <- data.frame(x = 1:3, y = letters[1:3])
DT2 <- data.frame(x = 4:6, y = letters[4:6])
DT3 <- data.frame(x = 7:9, y = letters[7:9])
DT4 <- data.frame(x = 10:12, y = letters[10:12])
mapply(write.csv, x = list(DT1, DT2, DT3, DT4), file = list("DT1.csv", "DT2.csv", "DT3.csv", "DT4.csv"), row.names = FALSE)
# read in CSVs ------------------------------------------------------------
# Escape the dot so only real ".csv" files match; the pattern ".csv$" would
# also match any name that merely ends in "csv" (e.g. "notacsv").
csv_paths <- list.files(path = ".", pattern = "\\.csv$")
# might need to split this into different steps due to different csv formats?
# Read every file and name each table after its file name without extension.
DT_list <- setNames(
  lapply(csv_paths, fread),
  tools::file_path_sans_ext(basename(csv_paths))
)
# apply a function to each data.table -------------------------------------
lapply(DT_list, function(DT){DT[, test := x*2]})
If you want to stick with the given dummy data just merge the lists:
list1 <- list(df1,df2)
list2 <- list(df3,df4)
DT_list <- setNames(c(list1, list2), tools::file_path_sans_ext(basename(csv_paths)))

Related

Adding an ID column in a list of dataframes in R

I have a list of XML files that I am merging together, but I am trying to figure out how to add an "id" column to each data frame based on the file name.
# BUILD DATAFRAME LIST
list_filenames <- list.files(pattern = ".xml$")
df_list <- lapply(list_filenames, function(f) {
list_ids <- as.list(list_filenames)
doc <- xmlParse(f, useInternalNodes = TRUE)
doc2 <- xmlToDataFrame(doc, nodes = getNodeSet(doc, "//Event"))
mapply(cbind, doc2, "id" = list_ids, SIMPLIFY = F) # Code that kind of works
})
final_df <- do.call(rbind, df_list)
I'm hoping to get something that looks like this:
ex_df <- cbind(x = c(3, 2, 10, 12),
y = c("a", "b", "c", "d"),
id = c("file_name_1", "file_name_1", "file_name_2", "file_name_2")) %>%
as.data.frame()
> ex_df
x y id
1 3 a file_name_1
2 2 b file_name_1
3 10 c file_name_2
4 12 d file_name_2
We can use Map
nm1 <- sub("\\.xml$", "", list_filenames)
out <- do.call(rbind, Map(cbind, df_list, id = nm1))
In the OP's code, we are looping over the list_filenames and then in the second line, using the full set of list_filenames in
as.list(list_filenames)
Instead it would be just 'f' i.e
df_list <- lapply(list_filenames, function(f) {
list_id <- sub("\\.xml$", "", f) #####
doc <- xmlParse(f, useInternalNodes = TRUE)
doc2 <- xmlToDataFrame(doc, nodes = getNodeSet(doc, "//Event"))
doc2$id = list_id
doc2
})
Then, we could rbind the list elements
do.call(rbind, df_list)

Setting colnames of several data frames based on a list variable

I have a list of multiple data frames which are built the same way. I would like to change the name of the 1 column of each data frame to the name of the data frame itself and append some text. From several different answers I figured lapply and working on lists would be the best way to go.
Example data:
df1 <- data.frame(A = 1, B = 2, C = 3)
df2 <- data.frame(A = 1, B = 2, C = 3)
dfList <- list(df1,df2)
col1 <- names(dfList)
df<-lapply(dfList, function(x) {
names(x)[1:2] <- c(col1[1:length(col1)]"appended text","Col2","Col3");x
})
The problem seems to be with calling the correct entry in the "col1" variable for each data frame within my code.
Any ideas on how I should address/ express this correctly? Thanks a lot!
df1 <- data.frame(A = 1, B = 2, C = 3)
df2 <- data.frame(A = 1, B = 2, C = 3)
# Name the list elements so each data frame's name can be looked up.
dfList <- list(df1 = df1, df2 = df2)
names(dfList)
# seq_along() is safe for an empty list, unlike 1:length().
for (i in seq_along(dfList)) {
  # Rename the first column of each data frame to the data frame's own name.
  # Indexing by position also stays correct if two elements share a name.
  names(dfList[[i]])[1] <- names(dfList)[i]
}
dfList
Here is one option with tidyverse
library(tidyverse)
map(dfList, ~ .x %>%
rename(Aappended_text = A))
If this is based on the column index, create a function
# Append `new_name` to the names of the columns selected by `index` in every
# data frame of `lst`.
#
# lst: a list of data frames.
# new_name: text appended to each selected column name.
# index: column positions (or names) to rename.
# Returns a list of data frames with the selected columns renamed.
fName <- function(lst, new_name, index) {
  map(lst, function(df) {
    # rename_with() replaces the deprecated rename_at() + funs() combination.
    rename_with(df, function(nm) paste0(nm, new_name), .cols = all_of(index))
  })
}
fName(dfList, "appended_text", 1)
I'm not sure if I'm understanding your question completely, but is this what you're after:
df1 <- data.frame(A = 1, B = 2, C = 3)
df2 <- data.frame(A = 1, B = 2, C = 3)
dfList <- list(df1, df2)
# Append the text to the first column name of every data frame.
df <- lapply(dfList, function(x) {
  # Assign only the first name: rebuilding the whole vector with
  # 2:length(colnames(x)) breaks on a one-column data frame, because 2:1
  # counts downwards instead of being empty.
  colnames(x)[1] <- paste0(colnames(x)[1], "appended text")
  x
})
Output:
> df
[[1]]
Aappended text B C
1 1 2 3
[[2]]
Aappended text B C
1 1 2 3
You can simply use lapply
lapply(dfList, function(x) {
names(x)[1L] <- "some text"
x
})
But if you want to rename by the name of the data frame elements in your list, first you need to name them, e.g. dfList <- list(df1 = df1, df2 = df2). You can't access the names directly with lapply(x, ...), so you need to lapply over your list by index, for example:
lapply(seq_along(dfList), function(i) {
names(dfList[[i]])[1L] <- names(dfList[i])
dfList[[i]]
})

extracting a dataframe from a list over many objects

I have over a 1000 objects (z) in R, each containing three dataframes (df1, df2, df3) with different structures.
z1$df1 … z1000$df1
z1$df2 … z1000$df2
z1$df3 … z1000$df3
I created a list of these objects (list1 thus contains z1 thru z1000) and tried to use lapply to extract one type of dataframe (df2) for all objects, and then merge them to one single dataframe.
Extraction:
For a single object it would look like this:
df15<- z15$df2 # I transferred the index of z to the extracted df
I tried some code with lapply, ignoring the transfer of the index (I can create another list for that). However I don’t know what function I should use.
List2 <- lapply(list1, function(x))
I try to avoid using a loop because there's so many and vectorization is so much quicker. I have the idea I'm looking at it from the wrong angle.
Subsequent merging can be done as follows:
merged <- do.call(rbind, list2)
Thanks for any suggestions.
It sounds like you want to pull out all the df1s and rbind them together then do the same for the other dataframes. You can use purrr::map_dfr to extract a column from each element of the list and rowbind them together.
library('tidyverse')
dummy_df <- list(
df1 = iris,
df2 = cars,
df3 = CO2)
list1 <- list(
z1 = dummy_df,
z2 = dummy_df,
z3 = dummy_df)
df1 <- map_dfr(list1, 'df1')
df2 <- map_dfr(list1, 'df2')
df3 <- map_dfr(list1, 'df3')
If you wanted to do it in base R, you can use lapply.
df1 <- lapply(list1, function(x) x$df1)
df1_merged <- do.call(rbind, df1)
One option could be using lapply to extract data.frame and then use bind_rows from dplyr.
## The data
df1 <- data.frame(id = c(1:10), name = c(LETTERS[1:10]), stringsAsFactors = FALSE)
df2 <- data.frame(id = 11:20, name = LETTERS[11:20], stringsAsFactors = FALSE)
df3 <- data.frame(id = 21:30, name = LETTERS[15:24], stringsAsFactors = FALSE)
df4 <- data.frame(id = 121:130, name = LETTERS[15:24], stringsAsFactors = FALSE)
z1 <- list(df1 = df1, df2 = df2, df3 = df3)
z2 <- list(df1 = df1, df2 = df2, df3 = df3)
z3 <- list(df1 = df1, df2 = df2, df3 = df3)
z4 <- list(df1 = df1, df2 = df2, df3 = df4) #DFs can contain different data
# z <- list(z1, z2, z3, z4)
# Dynamically populate list z with many list objects.
# mget() already returns a named list, so no as.list() wrapper is needed.
z <- mget(paste0("z", 1:4))
# Extract the same data frame from every object and stack the rows.
# dplyr:: is spelled out because dplyr is not attached in this snippet, and
# [[ ]] avoids the partial name matching that $ performs on lists.
df1_all <- dplyr::bind_rows(lapply(z, function(x) x[["df1"]]))
df2_all <- dplyr::bind_rows(lapply(z, function(x) x[["df2"]]))
df3_all <- dplyr::bind_rows(lapply(z, function(x) x[["df3"]]))
## Result for df3_all
> tail(df3_all)
## id name
## 35 125 S
## 36 126 T
## 37 127 U
## 38 128 V
## 39 129 W
## 40 130 X
Try this:
lapply(list1, "[[", "df2")
or if you want to rbind them together:
do.call("rbind", lapply(list1, "[[", "df2"))
The row names in the resulting data frame will identify the origin of each row.
No packages are used.
Note
We can use this input to test the code above. BOD is a built-in data frame:
z <- list(df1 = BOD, df2 = BOD, df3 = BOD)
list1 <- list(z1 = z, z2 = z)
There's also data.table::rbindlist, which is likely faster than do.call(rbind, lapply(...)) or dplyr::bind_rows
library(data.table)
rbindlist(lapply(list1, "[[", "df2"))

How to modify multiple data frames without making a list of them and then using lapply?

I have 20 data frames and in each of them I want to format the same column in the same way. Of course, I could make a list of the dfs and then use lapply. Instead, my goal is to modify the dfs such that in the end I do not have to access them as elements of a list but as dfs. Here is an example:
df1 <- data.frame(col1 = rnorm(5), col2 = rnorm(5))
df2 <- data.frame(col1 = rnorm(5), col2 = rnorm(5))
Now, suppose I want to add 1 to every value of col1 in df1 and df2. Of course, I could do
df_list <- lapply(list(df1, df2), function(df) {
df$col1 <- df$col1 + 1
return(df)
})
But now df1 returns the original df instead of the modified one. How to do it?
One option based on the OP's code would be to use list2env after naming the list elements
names(df_list) <- paste0("df", 1:2)
list2env(df_list, envir = .GlobalEnv)
If we need to avoid creating the list (it is recommended to have a list of datasets instead of creating individual objects in the global environment), then use assign with for loop
for(obj in paste0('df', 1:2)) {
assign(obj, `[<-`(get(obj), 'col1', value = get(obj)[['col1']] +1))
}
You could use a hack from @g-grothendieck in this question:
http://stackoverflow.com/questions/1826519/how-to-assign-from-a-function-which-returns-more-than-one-value
and do this:
list[df1, df2] <- lapply(list(df1, df2), function(df) {
df$col1 <- df$col1 + 1
return(df)
})
the hack
# Placeholder object of class "result"; writing `list[a, b] <- value`
# dispatches to the `[<-.result` method defined below.
list <- structure(NA,class="result")
# Replacement method that assigns the elements of `value` to the targets
# written inside the brackets, evaluating each assignment in the parent frame.
# x: the placeholder object; returned unchanged.
# ...: the targets written inside `[ ]` (used unevaluated, via match.call()).
# value: the object on the right-hand side of `<-`.
"[<-.result" <- function(x,...,value) {
# Capture the call as a list so the bracket arguments stay unevaluated.
args <- as.list(match.call())
# Drop the first two entries (function name and `x`) and the last one
# (`value`), which leaves only the targets.
args <- args[-c(1:2,length(args))]
# Make `value` exactly as long as the number of targets.
length(value) <- length(args)
for(i in seq(along=args)) {
a <- args[[i]]
# Skip empty slots; otherwise run `<target> <- value[[i]]` in the parent frame.
if(!missing(a)) eval.parent(substitute(a <- v,list(a=a,v=value[[i]])))
}
# Return the placeholder itself; the loop above performs the real assignments.
x
}
full code and results
df1 <- data.frame(col1 = rnorm(5), col2 = rnorm(5))
# col1 col2
# 1 -0.5451934 0.5043287
# 2 -1.4047701 -0.1184588
# 3 0.1745109 0.8279085
# 4 -0.5066673 -0.3269411
# 5 0.4838625 -0.3895784
df2 <- data.frame(col1 = rnorm(5), col2 = rnorm(5))
# col1 col2
# 1 0.4168078 -0.44654445
# 2 -1.9991098 -0.06179699
# 3 -1.0625996 1.21098946
# 4 0.4977718 0.45834008
# 5 -1.6181048 0.97917877
list[df1, df2] <- lapply(list(df1, df2), function(df) {
df$col1 <- df$col1 + 1
return(df)
})
# > df1
# col1 col2
# 1 0.4548066 0.5043287
# 2 -0.4047701 -0.1184588
# 3 1.1745109 0.8279085
# 4 0.4933327 -0.3269411
# 5 1.4838625 -0.3895784
# > df2
# col1 col2
# 1 1.41680778 -0.44654445
# 2 -0.99910976 -0.06179699
# 3 -0.06259959 1.21098946
# 4 1.49777179 0.45834008
# 5 -0.61810483 0.97917877
You could avoid the function (and its temporary environment) with a loop like this:
df1 <- data.frame(col1 = 1:5, col2 = rnorm(5))
df2 <- data.frame(col1 = rep(0, 5), col2 = rnorm(5))
df1 # before
# Look each data frame up by name, update it and store it back under the
# same name; this avoids building code as text for eval(parse(text = ...)).
for (d in c("df1", "df2")) {
  updated <- get(d)
  updated[["col1"]] <- updated[["col1"]] + 1
  assign(d, updated)
}
df1 # after
Option 2:
df1 <- data.frame(col1 = 1:5, col2 = rnorm(5))
df2 <- data.frame(col1 = rep(0, 5), col2 = rnorm(5))
df1 # before
df2 # before
eval(parse(text = unlist(lapply(c("df1", "df2"), function(x) {
expr.dummy <- quote(df$col1 <- df$col1 +1) # df will be replaced by df1, df2
gsub("df", x, deparse(expr.dummy))
}))))
df1 # after
df2 # after

how to lapply to one column in a list of data tables

I have a list of dt with same structure, some columns are numeric some characters.
dt1 <- data.table(x = c(1:5), y = "test")
dt2 <- data.table(x = c(1:5), y = "test")
mylist <- list(A = dt1, B = dt2)
I want to apply a function, say sum or mean that cannot be applied across the whole datatable because there are some character columns.
I have tried different combinations of lapply(mylist$y,sum) or lapply(mylist[2],sum) but it doesn't work.
You can create an anonymous function inside lapply in which you subset and perform the needed calculation (promoting my comment to an answer):
lapply(mylist, function(i) i[, sum(x)])
# or:
lapply(mylist, function(i) sum(i[["x"]]))
which gives:
$A
[1] 76
$B
[1] 99
Another example giving you the number of unique y-values for x > 3:
lapply(mylist, function(i) i[x>3, uniqueN(y)])
which gives:
$A
[1] 10
$B
[1] 14
Used data:
dt1 <- data.table(x = c(1:5), y = letters)
dt2 <- data.table(x = c(1:7), y = letters)
mylist <- list(A = dt1, B = dt2)
I really think the purrr package makes these problems easier to think about by letting you break the problem up into bite sized pieces:
library(purrr)
library(tibble)
# tibble() replaces the deprecated data_frame(), which purrr does not export.
dt1 <- tibble(x = 1:5, y = letters[1:5])
dt2 <- tibble(x = 1:5, y = letters[1:5])
mylist <- list(A = dt1, B = dt2)
# Pull column "y" out of every table, then take the length of each result.
map(mylist, "y") %>%
  map(length)
You can also use something like this to apply a function conditionally
map(mylist, ~map_if(., is.numeric, sum))
You could also use nested lapply() functions like so:
dt1 <- data.table(x = c(1:5), y = letters[1:5])
dt2 <- data.table(x = c(6:10), y = letters[1:5])
mylist <- list(A = dt1, B = dt2)
lapply(lapply(mylist, function(x) x[[1]]), mean)
# $A
# [1] 3
# $B
# [1] 8
Many options here it looks like. With my code, it might be interesting to see what lapply() returns and how the other lapply() deals with it to understand why it works.

Resources