How to Add Dataframe name to Columns from Multiple Dataframes - r

The goal is to rename a list of dataframes columns, but while adding the dataframe name to the new column name.
ex: from x to a_x and b_x.
Why? Because I plan to later merge the sets and would like clean ids for the columns.
a = data.frame(x = c(1,2))
b = data.frame(x = c(3,4))
frameList = list(a = a, b = b)
newName = c(*frameName+'_'+'x')
names = lapply(names, setNames, nm = newName)
list2env(names,.GlobalEnv)

Here is one way for you. I looped through each data frame in frameList using the length of frameList. For column names in each data frame, I took the name of a data frame (i.e., names(frameList)) and past it to column names in the data frame.
a = data.frame(x = c(1,2), y = 1:2)
b = data.frame(x = c(3,4), y = 1:2)
frameList = list(a = a, b = b)
lapply(1:length(names(frameList)), function(x) {
names(frameList[[x]]) <- paste(names(frameList)[x], names(frameList[[x]]), sep = "_")
return(frameList[[x]])
})
[[1]]
a_x a_y
1 1 1
2 2 2
[[2]]
b_x b_y
1 3 1
2 4 2

Or another option is Map
Map(function(x, y) setNames(x, paste(y, names(x), sep="_")), frameList, names(frameList))
#$a
# a_x a_y
#1 1 1
#2 2 2
#$b
# b_x b_y
#1 3 1
#2 4 2
Or with tidyverse
library(purrr)
library(dplyr)
f1 <- function(x, y) paste(y, x, sep="_")
map2(frameList, names(frameList), ~ .x %>%
rename_all(f1, .y))
If we need it in the reverse order, this is more simple
map2(frameList, names(frameList), ~ .x %>%
rename_all(paste, sep="_", .y))

Related

R find index of a variable and subset a lsit

I have a list that looks like this
#Make dataframes
df1 = data.frame(x = c("a", "b", "c"), y = 1:3, stringsAsFactors = F)
df2 = df1 %>% mutate(y = y*2)
df3 = df1 %>% mutate(y = y*3)
#Make a name for each dataframe
myvar = "fname"
#Combine name and dataframe into a list
mylist = list(myvar, df1)
#Add the other dataframes and name to the list (done in a loop for my bigger dataset
list2 = list(myvar, df2)
mylist = rbind(mylist, list2)
list3 = list(myvar, df3)
mylist = rbind(mylist, list3)
I want to pull a subset of the list with all the data associated with "c"
x y
3 c 3
x y
3 c 6
x y
3 c 9
This is what I tried but it doesn't work
#Find all instances of "c"
picksite = "c"
site_indices = which(mylist[,2] == picksite)
mylist[site_indices,]
Any suggestions on how to do this, or even a link to better understand lists? Thanks so much.
Wrapping the which inside of lapply will solve this problem:
lapply(mylist[,2], FUN = function(i) i[which(i$x == "c"),])
$mylist
x y
3 c 3
$list2
x y
3 c 6
$list3
x y
3 c 9
Using tidyverse, we can loop over the list with map and use if_any to filter
library(dplyr)
library(purrr)
map(mylist[,2], ~ .x %>%
filter(if_any(everything(), ~ .x == "c")))
-output
$mylist
x y
1 c 3
$list2
x y
1 c 6
$list3
x y
1 c 9

Apply function on multiple lists in R

I have four lists each with multiple data frames.
I need to apply the same function on the lists.
How can I do this?
Sample data:
df1 <- data.frame(x = 1:3, y = letters[1:3])
df2 <- data.frame(x = 4:6, y = letters[4:6])
df3 <- data.frame(x = 7:9, y = letters[7:9])
df4 <- data.frame(x = 10:12, y = letters[10:12])
list1 <- list(df1,df2)
list2 <- list(df3,df4)
In my real data I import based on a pattern in the filename and thus my list elements will have the following names (sample data):
names(list1) <- c("./1. Data/df1.csv", "./1. Data/df2.csv")
names(list2) <- c("./1. Data/df3.csv", "./1. Data/df4.csv")
And this is one of the functions I want to run on all lists.
element.name <- function(x) {
all_filenames <- names(x) %>%
basename() %>%
as.list()
names(x) <- all_filenames
names(x) <- gsub("\\.csv", "", names(x))
}
which will give the desired output
names(list1) <- element.name(list1)
names(list1)
[1] "df1" [2] "df2"
I've tried using a for loop but I end up overwriting my output, so I hope some of you can help me out, since I need to run a lot of functions on my lists.
You could create a list of your lists, and then use lapply to apply to every list the function element.name. You can use setNames to avoid problems linked the assignment on names. You can then use list2env to get your data.frames back to the global environment.
setNames(list(list1, list2), c('list1', 'list2')) |>
lapply(function(x) setNames(x, element.name(x))) |>
list2env()
output
> list1
$df1
x y
1 1 a
2 2 b
3 3 c
$df2
x y
1 4 d
2 5 e
3 6 f
> list2
$df3
x y
1 7 g
2 8 h
3 9 i
$df4
x y
1 10 j
2 11 k
3 12 l
Here is an approach using data.table::fread
library(data.table)
# create dummy CSVs -------------------------------------------------------
DT1 <- data.frame(x = 1:3, y = letters[1:3])
DT2 <- data.frame(x = 4:6, y = letters[4:6])
DT3 <- data.frame(x = 7:9, y = letters[7:9])
DT4 <- data.frame(x = 10:12, y = letters[10:12])
mapply(write.csv, x = list(DT1, DT2, DT3, DT4), file = list("DT1.csv", "DT2.csv", "DT3.csv", "DT4.csv"), row.names = FALSE)
# read in CSVs ------------------------------------------------------------
csv_paths <- list.files(path = ".", pattern = ".csv$")
# might need to split this into different steps due to different csv formats?
DT_list <- setNames(lapply(csv_paths, fread), tools::file_path_sans_ext(basename(csv_paths)))
# apply a function to each data.table -------------------------------------
lapply(DT_list, function(DT){DT[, test := x*2]})
If you want to stick with the given dummy data just merge the lists:
list1 <- list(df1,df2)
list2 <- list(df3,df4)
DT_list <- setNames(c(list1, list2), tools::file_path_sans_ext(basename(csv_paths)))

R lapply function with two arguments that are not fixed

I found a similar question asked before. My question is a bit more complex than the previous one. For my question, the y parameter is not fixed.
In the function(X,Y){SOME FUNCTION}, X is a list of characters and Y is a list of dataframe. Basically, I want the function to work on the pair of X and Y in sequence respectively, and produce the output as one list. For example, the first element of X list and the first element of Y list, the second element of X list and the second element of Y list, the third element of X list and the third element of Y list,...
Example of X, Y
X <- c("1", "2")
y1 <- data.frame("person.1" = "Amy", "bestfood..1" = "fish", "bestthing..1" = "book",
"person.2" = "Mike", "bestfood..2" = "fish", "bestthing..2" = "book")
y2 <- data.frame("person.1" = "Amy","bestfood..1" = "carrot", "bestthing..1" = "cloth",
"person.2" = "Mike","bestfood..2" = "carrot", "bestthing..2" = "cloth")
Y <- list(y1,y2)
The function:
addID <- function(X, Y) {
rowlength <- length(Y)
df <- as.data.frame(matrix(NA, nrow = rowlength, ncol = 3))
colnames(df) <- c("ID", "Person", "Food")
df[1:nrow(df), 1] <- X
# name
namecols <-grep("person",colnames(Y))
for (i in 1:length(namecols)) {
name <- Y[1, namecols[i]]
df[i, 2] <- as.character(name)
}
# food
foodcols <-
grep("bestfood",colnames(Y))
for (i in 1:length(foodcols)) {
food <- Y[1, foodcols[i]]
df[i, 3] <- as.character(foodcols)
}
return(df)
}
}
I tried to use lapply but can't figure out the way to include the X list. When I try this:
lapply(Y, function, X=X)
The function doesn't work properly. I wonder if there are other ways to include X in it(I tried the function on individual character and dataframe, it works just fine. )
I hope this is clear. If not, please point it out, I will try my best to clarify. Thanks in advance.
UPDATE:
I tried Map as suggested by comments. It returns: incorrect number of dimensions. I added some details in the function. It seems like R stucks on the last line.
outcome <- Map(addID, Y, X)
I get
error in Y[1, namecols[i]] : incorrect number of dimensions
In addition: Warning message:
In `[<-.data.frame`(`*tmp*`, 1:nrow(df), 1, value = list(person.1 = 1L, :
provided 6 variables to replace 1 variables
The outcome should looks like:
z1 <- data.frame(ID = c(1,2), Person = c("Amy","Mike"), Food = c("fish", "fish"))
z2 <- data.frame(ID = c(1,2), Person = c("Amy","Mike"), Food = c("carrot", "carrot"))
outcome <- list(z1,z2)
We could do this easily in tidyverse
library(dplyr)
library(tidyr)
bind_rows(Y, .id = 'ID') %>%
select(ID, starts_with('person'), contains('food')) %>%
pivot_longer(cols = -ID, names_to = c(".value"),
names_pattern = "([^.]+)\\.+\\d+")
-output
# A tibble: 4 x 3
ID person bestfood
<chr> <chr> <chr>
1 1 Amy fish
2 1 Mike fish
3 2 Amy carrot
4 2 Mike carrot
With the OP's function, if we modify, it would work
addID <- function(X, Y) {
rowlength <- length(Y)
df <- as.data.frame(matrix(NA, nrow = rowlength, ncol = 3))
colnames(df) <- c("ID", "Person", "Food")
df[1:nrow(df), 1] <- X
namecols <- grep("person",colnames(Y))
df[, 2] <- unlist(Y[namecols])
foodcols <- grep("bestfood", colnames(Y))
df[,3] <- unlist(Y[foodcols])
return(unique(df))
}
-testing
Map(addID, X, Y)
$`1`
ID Person Food
1 1 Amy fish
2 1 Mike fish
$`2`
ID Person Food
1 2 Amy carrot
2 2 Mike carrot

R - co-locate columns with the same name after merge

Situation
I have two data frames, df1 and df2with the same column headings
x <- c(1,2,3)
y <- c(3,2,1)
z <- c(3,2,1)
names <- c("id","val1","val2")
df1 <- data.frame(x, y, z)
names(df1) <- names
a <- c(1, 2, 3)
b <- c(1, 2, 3)
c <- c(3, 2, 1)
df2 <- data.frame(a, b, c)
names(df2) <- names
And am performing a merge
#library(dplyr) # not needed for merge
joined_df <- merge(x=df1, y=df2, c("id"),all=TRUE)
This gives me the columns in the joined_df as id, val1.x, val2.x, val1.y, val2.y
Question
Is there a way to co-locate the columns that had the same heading in the original data frames, to give the column order in the joined data frame as id, val1.x, val1.y, val2.x, val2.y?
Note that in my actual data frame I have 115 columns, so I'd like to stay clear of using joned_df <- joined_df[, c(1, 2, 4, 3, 5)] if possible.
Update/Edit: also, I would like to maintain the original order of column headings, so sorting alphabetically is not an option (-on my actual data, I realise it would work with the example I have given).
My desired output is
id val1.x val1.y val2.x val2.y
1 1 3 1 3 3
2 2 2 2 2 2
3 3 1 3 1 1
Update with solution for general case
The accepted answer solves my issue nicely.
I've adapted the code slightly here to use the original column names, without having to hard-code them in the rep function.
#specify columns used in merge
merge_cols <- c("id")
# identify duplicate columns and remove those used in the 'merge'
dup_cols <- names(df1)
dup_cols <- dup_cols [! dup_cols %in% merge_cols]
# replicate each duplicate column name and append an 'x' and 'y'
dup_cols <- rep(dup_cols, each=2)
var <- c("x", "y")
newnames <- paste(dup_cols, ".", var, sep = "")
#create new column names and sort the joined df by those names
newnames <- c(merge_cols, newnames)
joined_df <- joined_df[newnames]
How about something like this
numrep <- rep(1:2, each = 2)
numrep
var <- c("x", "y")
var
newnames <- paste("val", numrep, ".", var, sep = "")
newdf <- cbind(joined_df$id, joined_df[newnames])
names(newdf)[1] <- "id"
Which should give you the dataframe like this
id val1.x val1.y val2.x val2.y
1 1 3 1 3 3
2 2 2 2 2 2
3 3 1 3 1 1

aggregate values in dataframe by partly matching rownames in R

I'm thumbling around with the following problem, but to no evail:
d <- data.frame(value = 1:4, row.names = c("abc", "abcd", "ef", "gh"))
value
abc 1
abcd 2
ef 3
gh 4
l <- nrow(d)
wordmat <- matrix(rep(NA, l^2), l, l, dimnames = list(row.names(d), row.names(d)))
for (i in 1:ncol(wordmat)) {
rid <- agrep(colnames(wordmat)[i], rownames(wordmat), max = 0)
d$matchid[i] <- paste(rid, collapse = ";")
}
# desired output:
(d_agg <- data.frame(value = c(3, 3, 4), row.names = c("abc;abcd", "ef", "gh")))
value
abc;abcd 3
ef 3
gh 4
is there a function for this?
Here's a possible solution that you might be able to modify to suit your needs.
Some notes:
I couldn't figure out how to deal with rownames() directly, particularly in the last stage, so this depends on you being happy with copying your row names as a new variable.
The function below "hard-codes" the variable names, functions, and so on. That is to say, it is not by any means a generalized function, but one which might be useful as you look further into this problem.
Here's the function.
matches <- function(data, ...) {
temp = vector("list", nrow(data))
for (i in 1:nrow(data)) {
temp1 = agrep(data$RowNames[i], data$RowNames, value = TRUE, ...)
temp[[i]] = data.frame(RowNames = paste(temp1, collapse = "; "),
value = sum(data[temp1, "value"]))
}
temp = do.call(rbind, temp)
temp[!duplicated(temp$RowNames), ]
}
Note that the function needs a column called RowNames, so we'll create that, and then test the function.
d <- data.frame(value = 1:4, row.names = c("abc", "abcd", "ef", "gh"))
d$RowNames <- rownames(d)
matches(d)
# RowNames value
# 1 abc; abcd 3
# 3 ef 3
# 4 gh 4
matches(d, max.distance = 2)
# RowNames value
# 1 abc; abcd 3
# 3 abc; abcd; ef; gh 10
matches(d, max.distance = 4)
# RowNames value
# 1 abc; abcd; ef; gh 10
This works for your example but may need tweaking for the real thing:
d <- data.frame(value = 1:4, row.names = c("abc", "abcd", "ef", "gh"))
rowclust <- hclust(as.dist(adist(rownames(d))), method="single")
rowgroups <- cutree(rowclust, h=1.5)
rowagg <- aggregate(d, list(rowgroups), sum)
rowname <- unclass(by(rownames(d), rowgroups, paste, collapse=";"))
rownames(rowagg) <- rowname
rowagg
Group.1 value
abc;abcd 1 3
ef 2 3
gh 3 4

Resources