assign names to data frames columns in a list - r

I have a list of data frames
# Create dummy data
df1<-data.frame( c(1,2,3),c(2,3,4))
df2<-data.frame(c(5,6,7),c(4,5,6))
# Create a list
l<-list(df1, df2)
I would like to assign names to columns. As l[[1]][,1] gives me access to the first column, I thought I could assign 'names' as the first column name by:
l<-lapply(l, function(x)names(x[[1]][,1]<-"names"))
But this gives me an error
Error in x[[1]][, 1] <- "names" :
incorrect number of subscripts on matrix
Edit: Added some dput
initial data
dput(lapply(head(results1, 2), head, 2))
list(structure(c(1.27679607834331, 1.05090175857491), .Dim = 2:1, .Dimnames = list(
c("..a15.pdf", "..a17.pdf"), "x")), structure(c(2.096687569578,
2.19826038300833), .Dim = 2:1, .Dimnames = list(c("..a15.pdf",
"..a17.pdf"), "x")))
after trying to assign the name
dput(lapply(head(results1, 2), head, 2))
list(structure(c(1.27679607834331, 1.05090175857491), .Dim = 2:1, .Dimnames = list(
c("..a15.pdf", "..a17.pdf"), "names")), structure(c(2.096687569578,
2.19826038300833), .Dim = 2:1, .Dimnames = list(c("..a15.pdf",
"..a17.pdf"), "names")))
Output:
results1[1]
[[1]]
names
..a15.pdf 1.27679608
..a17.pdf 1.05090176
..a18.pdf 1.51820192
..a21.pdf 2.30296037
..a2TTT.pdf 1.48568732

You can subset the names of the dataframe:
l <- lapply(l, function(x) {names(x)[1] <-"names";x})
l
In tidyverse -
library(dplyr)
library(purrr)
l <- map(l, ~.x %>% rename_with(~'names', 1))
From the updated data it seems you have a list of matrices, and what looks like the first column is actually the row names, which you can convert to a column and name.
lapply(results1, function(x) {
mat <- cbind.data.frame(names = rownames(x), x)
rownames(mat) <- NULL
mat
})
#[[1]]
# names x
#1 ..a15.pdf 1.28
#2 ..a17.pdf 1.05
#[[2]]
# names x
#1 ..a15.pdf 2.1
#2 ..a17.pdf 2.2

Related

Get the name of the list object then add that name as a new column of the each list

I have a list of dataframes. I want to extract each list element's name and add it as the first column of the corresponding dataframe. Then I want to unlist them and make a single dataframe by combining the dataframes by row. They all have the same dimensions.
A small section of my list of dataframes below
mylist <- list(SiO2 = structure(c(5.121, 0.00836394378003293, 0.0199499373432604,
5, 10, 1.87883863763252, 0.0836503954062112, 2.1240292640167), .Dim = c(1L,
8L), .Dimnames = list(NULL, c("Analyte_Mean", "SDBetweenGroup",
"SDWithinGroup", "replicates", "NumSamples", "FValue", "PValue",
"FCritical"))), Al2O3 = structure(c(2.0812, 0.0053103672189408,
0.0159059737205869, 5, 10, 0.442687747035557, 0.903289230024797,
2.1240292640167), .Dim = c(1L, 8L), .Dimnames = list(NULL, c("Analyte_Mean",
"SDBetweenGroup", "SDWithinGroup", "replicates", "NumSamples",
"FValue", "PValue", "FCritical"))))
I have the following to get the names of the list objects, but I am unsure how to pass each name back as a column of its dataframe:
names(mylist)
to make a single dataframe I have
new_list <- as.data.frame(do.call(rbind, mylist))
Any help is appreciated.
If I've understood you correctly, I'd probably use the purrr library. imap_dfr will pass both your matrix and its name to the function, and then bind the results into a single dataframe.
library(purrr)
new_list <- imap_dfr(mylist, function(mat, name) {
result <- as.data.frame(mat)
result$name <- name
result
})
gives
> new_list
Analyte_Mean SDBetweenGroup SDWithinGroup replicates NumSamples FValue PValue FCritical name
1 5.1210 0.008363944 0.01994994 5 10 1.8788386 0.0836504 2.124029 SiO2
2 2.0812 0.005310367 0.01590597 5 10 0.4426877 0.9032892 2.124029 Al2O3
You can use purrr::map_df -
purrr::map_df(mylist, data.frame, .id = 'name')
# name Analyte_Mean SDBetweenGroup SDWithinGroup replicates NumSamples
#1 SiO2 5.1210 0.008363944 0.01994994 5 10
#2 Al2O3 2.0812 0.005310367 0.01590597 5 10
# FValue PValue FCritical
#1 1.8788386 0.0836504 2.124029
#2 0.4426877 0.9032892 2.124029
Or in base R -
do.call(rbind, Map(cbind.data.frame, mylist, name = names(mylist)))

How to collapse values in a list to allow a list column in a dataframe to be converted to a vector?

I have a dataframe, df:
df <- structure(list(ID = c("ID1", "ID2", "ID3"), values = list(A = "test",
B = c("test2", "test3"), C = "test4")), row.names = c(NA,
-3L), class = "data.frame")
df
ID values
1 ID1 test
2 ID2 test2, test3
3 ID3 test4
sapply(df, class)
ID values
"character" "list"
I'm trying to create a function that will run through each row of df$values, and if the length is greater than one, paste the values into one string. So the data frame will look the same, but will have a different structure:
df
ID values
1 ID1 test
2 ID2 test2, test3
3 ID3 test4
dput(df)
structure(list(ID = c("ID1", "ID2", "ID3"), values = c("test",
"test2, test3", "test4")), class = "data.frame", row.names = c(NA,
-3L))
sapply(df, class)
ID values
"character" "character"
(Note how in the end result, both columns are character columns, rather than a character column and a list).
I tried making a function to do this, but it doesn't work (and is very messy):
newcol <- NULL
for (i in nrow(df)) {
row <- df$values[i] %>%
unlist(., use.names = FALSE)
if (length(row) == 1) {
newcol = rbind(row, newcol)
} else if (length(row)>1) {
row = paste0(row[1], ", ", row[2])
newcol = rbind(row, newcol)
}
}
df$values <- newcol
Is there an easier way to do this (that works), and that can do it for any size of list entry (e.g. if df$values had a row entry with four values: "test6", "test7", "test8", "test9")?
We can use sapply with toString :
# Collapse every list element into one comma-separated string. vapply()
# guarantees a character vector even when df$values is empty, whereas
# sapply() would hand back an empty list in that case.
df$values <- vapply(df$values, toString, character(1))
sapply(df, class)
# ID values
#"character" "character"
str(df)
#'data.frame': 3 obs. of 2 variables:
# $ ID : chr "ID1" "ID2" "ID3"
# $ values: chr "test" "test2, test3" "test4"
toString(x) is shorthand for paste(x, collapse = ", "). The collapse can also be written out explicitly:
# Explicit equivalent of toString(): join the elements of each list entry
# with ", " so that c("test2", "test3") becomes "test2, test3".
# vapply() keeps the result a character vector even for an empty column.
df$values <- vapply(df$values, paste0, character(1), collapse = ", ")
Using tidyverse
library(dplyr)
library(purrr)
df <- df %>%
mutate(values = map_chr(values, toString))

extracting a dataframe from a list over many objects

I have over 1000 objects (z) in R, each containing three dataframes (df1, df2, df3) with different structures.
z1$df1 … z1000$df1
z1$df2 … z1000$df2
z1$df3 … z1000$df3
I created a list of these objects (list1 thus contains z1 thru z1000) and tried to use lapply to extract one type of dataframe (df2) for all objects, and then merge them to one single dataframe.
Extraction:
For a single object it would look like this:
df15<- z15$df2 # I transferred the index of z to the extracted df
I tried some code with lapply, ignoring the transfer of the index (I can create another list for that). However I don’t know what function I should use.
List2 <- lapply(list1, function(x))
I try to avoid using a loop because there's so many and vectorization is so much quicker. I have the idea I'm looking at it from the wrong angle.
Subsequent merging can be done as follows:
merged <- do.call(rbind, list2)
Thanks for any suggestions.
It sounds like you want to pull out all the df1s and rbind them together, then do the same for the other dataframes. You can use purrr::map_dfr to extract the named element (here, a dataframe) from each element of the list and row-bind the results together.
library('tidyverse')
dummy_df <- list(
df1 = iris,
df2 = cars,
df3 = CO2)
list1 <- list(
z1 = dummy_df,
z2 = dummy_df,
z3 = dummy_df)
df1 <- map_dfr(list1, 'df1')
df2 <- map_dfr(list1, 'df2')
df3 <- map_dfr(list1, 'df3')
If you wanted to do it in base R, you can use lapply.
df1 <- lapply(list1, function(x) x$df1)
df1_merged <- do.call(rbind, df1)
One option could be using lapply to extract data.frame and then use bind_rows from dplyr.
## The data
df1 <- data.frame(id = c(1:10), name = c(LETTERS[1:10]), stringsAsFactors = FALSE)
df2 <- data.frame(id = 11:20, name = LETTERS[11:20], stringsAsFactors = FALSE)
df3 <- data.frame(id = 21:30, name = LETTERS[15:24], stringsAsFactors = FALSE)
df4 <- data.frame(id = 121:130, name = LETTERS[15:24], stringsAsFactors = FALSE)
z1 <- list(df1 = df1, df2 = df2, df3 = df3)
z2 <- list(df1 = df1, df2 = df2, df3 = df3)
z3 <- list(df1 = df1, df2 = df2, df3 = df3)
z4 <- list(df1 = df1, df2 = df2, df3 = df4) #DFs can contain different data
# z <- list(z1, z2, z3, z4)
# Dynamically populate list z with many list object
z <- as.list(mget(paste("z",1:4,sep="")))
df1_all <- bind_rows(lapply(z, function(x) x$df1))
df2_all <- bind_rows(lapply(z, function(x) x$df2))
df3_all <- bind_rows(lapply(z, function(x) x$df3))
## Result for df3_all
> tail(df3_all)
## id name
## 35 125 S
## 36 126 T
## 37 127 U
## 38 128 V
## 39 129 W
## 40 130 X
Try this:
lapply(list1, "[[", "df2")
or if you want to rbind them together:
do.call("rbind", lapply(list1, "[[", "df2"))
The row names in the resulting data frame will identify the origin of each row.
No packages are used.
Note
We can use this input to test the code above. BOD is a built-in data frame:
z <- list(df1 = BOD, df2 = BOD, df3 = BOD)
list1 <- list(z1 = z, z2 = z)
There's also data.table::rbindlist, which is likely faster than do.call(rbind, lapply(...)) or dplyr::bind_rows:
library(data.table)
rbindlist(lapply(list1, "[[", "df2"))

Convert named vector to data frame using attribute values

I have a vector of characters. Each element of the vector has a name attribute which represents the row index of a data frame and the column index of a data frame, separated by a period. Here's a toy data set:
# Create vector of characters
a <- c("foo","bar","dog","cat")
# Assign attributes. The data frame is 2x2:
attr(a, "names") <- c("1.1", "1.2", "2.1", "2.2")
I am trying to use the attribute names to convert the vector into a data frame, where each element in the data frame is the value in the vector and the element's row is the number before the period in the attribute name and the element's column is the number after the decimal in the attribute name. The toy example's output should look like:
data.frame(var1 = c("foo","dog"), var2 = c("bar", "cat"))
My actual vector is quite large so I am looking to do this efficiently.
You can use indexing by row/column value to do this efficiently:
# Split every "row.col" name once and reuse the pieces for both indices.
name.parts <- strsplit(names(a), ".", fixed = TRUE)
row.nums <- as.integer(vapply(name.parts, `[`, character(1), 1))
col.nums <- as.integer(vapply(name.parts, `[`, character(1), 2))
# Pre-fill with NA so that positions absent from names(a) stay empty.
mat <- matrix(NA, max(row.nums), max(col.nums))
# A two-column (row, col) index matrix assigns all elements in one step.
mat[cbind(row.nums, col.nums)] <- a
mat
# [,1] [,2]
# [1,] "foo" "bar"
# [2,] "dog" "cat"
Split a on the suffix values and coerce that to a data frame. Omit:
- the stringsAsFactors = FALSE if you prefer factor columns;
- the unname if row names on the result are acceptable.
Code--
as.data.frame(split(unname(a), sub(".*[.]", "", names(a))), stringsAsFactors = FALSE)
giving:
X1 X2
1 foo bar
2 dog cat
I would probably use regex to extract row and column positions, as follows.
# Row index = text before the first period, column index = text after the
# last period of each name.
my.rows <- as.integer(sub("\\..*$", "", names(a)))
my.cols <- as.integer(sub("^.*\\.", "", names(a)))
# Fill a matrix in one vectorised step: each (row, col) pair in the index
# matrix receives the matching element of a. This also copes with an empty
# index set, where a 1:length(a) loop would iterate over c(1, 0).
cells <- matrix(NA, nrow = max(my.rows), ncol = max(my.cols))
cells[cbind(my.rows, my.cols)] <- a
# stringsAsFactors = FALSE keeps character columns on R versions before 4.0.
new.data <- data.frame(cells, stringsAsFactors = FALSE)
new.data
We can use dplyr and tidyr. b2 is the final output.
library(dplyr)
library(tidyr)
# Long table: one row per vector element, keyed by its "row.col" name.
# tibble() replaces the deprecated data_frame().
b <- tibble(Name = names(a), Value = unname(a))
b2 <- b %>%
  # "1.2" -> Group = "1" (row), Var = "2" (column); split on the literal period.
  separate(Name, into = c("Group", "Var"), sep = "\\.") %>%
  # One output column per Var, one output row per Group.
  pivot_wider(names_from = Var, values_from = Value) %>%
  select(-Group)

R: Find matching string then copying row

I have a multi-step problem. First step: match text in one string (df1) from one column to another range of columns (df2). There is no order of which columns match and the match could occur anywhere within the range. Once the match is found, copy the df2 row match into df1. Finally, repeat for the entire column.
df1= structure(list(Assay = c("ATG_AR_trans_up","NVS_PXR_cis","BSK_VCAM1_up"), p.value = c(0.01,0.05,0.0001)), .Names = c("Assay", "p.value"),row.names = c(NA, 3L), class = "data.frame")
df1
Assay p.value
ATG_AR_trans_up 0.01
NVS_hPXR 0.065
BSK_VCAM1_up 0.001
df2=structure(list(GeneID = c("AR", "VACM1", "TR", "ER", "PXR"), Assay1= c("ATG_ARE_cis", "BSK_hEDG_VCAM1", "NVS_TR_tran", "ATG_ER_UP", "NVS_PXRE_UP"), Assay2= c("ATG_AR_trans_up", "BSK_BE3K_VCAM1", "NA", "ATG_ERE_cis", "ATG_PXRE_cis"), Assay3= c("NVS_AR_trans", "BSK_VCAM1_UP", "NA", "NVS_ERa_CIS", "NVS_PXR_cis"), Assay4= c("Tox21_AR_ARE","NA", "NA", "Tox21_ERaERb_lig", "NA")), .Names = c("GeneID", "Assay1", "Assay2", "Assay3", "Assay4"),row.names = c(NA, 5L), class = "data.frame")
df2
GeneID Assay1 Assay 2 Assay3
AR ATG_ARE_cis NVS_hAR ATG_AR_trans_up
VACM1 BSK_hEGF_CAM1 BSK_VCAM1_up BSK_VCAM1_down
TR NVS_TR_tran NA NA
ER ATG_ER_UP ATG_ERE_cis NVS_ERa_CIS
PXR ATG_PXR_down ATG_PXRE_cis NVS_hPXR
Essentially becomes
df
Assay p.value GeneID Assay1 Assay2 Assay3
ATG_AR_trans_up 0.01 AR ATG_ARE_cis NVS_hAR ATG_AR_trans_up
NVS_hPXR 0.065 PXR ATG_PXR_down ATG_PXRE_cis NVS_hPXR
BSK_VCAM1_up 0.001 VCAM1 BSK_hEGF_CAM1 BSK_VCAM1_up BSK_VCAM1_down
For brevity I shortened the df substantially, but it is around 88 Assays and some 4,000 rows to go through for just one match (there are about 30). So my initial instinct is to loop, but I was told grep might be a helpful package (even though it is not for R 3.2.2). Any help would be appreciated though.
Since OP was interested in a grep solution, another way to do it would be,
# Collapse every row of df2 into a single string so that one grepl() call
# can search all columns of that row at once. paste() is vectorised over
# the columns, so no row-wise apply() is needed.
asDF2 <- do.call(paste, unname(as.list(df2)))
# For each df1 row, bind it to every df2 row whose text contains the assay
# name (case-insensitively). Rows without a match yield NULL, which rbind()
# drops. seq_len() keeps the loop empty when df1 has no rows.
do.call(rbind, lapply(seq_len(nrow(df1)), function(i) {
  matchIX <- grepl(df1$Assay[i], asDF2, ignore.case = TRUE)
  if (any(matchIX)) {
    cbind(df1[i, ], df2[matchIX, ])
  }
}))
The first line creates a character vector with concatenated row assay names of df2. The second line loops through df1 and finds match in asDF2 using grepl
Or equivalently,
# Transpose df2 once, outside the loop, so each of its rows becomes a column.
# grepl() is handed the data frame itself and coerces it with as.character(),
# which turns every column (i.e. every original df2 row) into one string.
df2_rows <- data.frame(t(df2), stringsAsFactors = FALSE)
# Bind each df1 row to all df2 rows that mention its assay name; rows
# without a match yield NULL, which rbind() drops.
do.call(rbind, lapply(seq_len(nrow(df1)), function(i) {
  matchIX <- grepl(df1$Assay[i], df2_rows, ignore.case = TRUE)
  if (any(matchIX)) {
    cbind(df1[i, ], df2[matchIX, ])
  }
}))
Note that the above variants can match multiple rows of df2 to a single row of df1.
NOTE
To test I added new rows to original data frames as
df1 <- rbind(df1, data.frame(Assay="NoMatch", p.value=.2))
df2 <- rbind(df2,
data.frame(GeneID="My", Assay1="NVS_PXR_cis", Assay2="NA", Assay3="NA", Assay4="NA"))
This can be easily done with reshaping. I put all the assays into all caps because that was messing up the matching.
library(dplyr)
library(tidyr)
library(stringi)
# Normalise assay names (ASCII, upper case) so that differences in case do
# not prevent a match between df1 and df2.
normalise_assay <- function(x) {
  stri_trans_toupper(iconv(x, to = "ASCII"))
}

# Tag every df2 row with an id so the complete row can be re-attached after
# the match has been found in long format.
df2_ID <- df2 %>% mutate(new_ID = row_number())

result <-
  df2_ID %>%
  # starts_with() picks up every assay column, however many there are.
  select(new_ID, starts_with("Assay")) %>%
  # Long format: one row per (df2 row, assay column) pair.
  pivot_longer(
    cols = starts_with("Assay"),
    names_to = "assay_number",
    values_to = "Assay"
  ) %>%
  mutate(Assay = normalise_assay(Assay)) %>%
  # Keep only the assay names that also occur in df1.
  inner_join(
    df1 %>% mutate(Assay = normalise_assay(Assay)),
    by = "Assay"
  ) %>%
  # Re-attach the full df2 row that contained the match.
  inner_join(df2_ID, by = "new_ID")
Since you're new to R, I think you are right that the most intuitive way to do this is with a for-loop. This is not the most concise or most efficient way to do this, but it should be clear what's going on.
# Creating example data
# Build df1 column by column so that p stays numeric; routing mixed values
# through a single matrix would coerce the p-values to character strings.
df1 <- data.frame(
  assay = c("aa", "bb", "ee"),
  p = c(.9, .5, .7),
  stringsAsFactors = FALSE
)
df2 <- as.data.frame(
  matrix(data = c("G1", "G2", "aa", "dd", "bb", "ee", "cc", "ff"), nrow = 2),
  stringsAsFactors = FALSE
)
names(df2) <- c("GeneID", "assay1", "assay2", "assay3")
# Building a dataframe to store output: one row per df1 row, df2's columns
df3 <- as.data.frame(matrix(data = NA, nrow = nrow(df1), ncol = ncol(df2)))
names(df3) <- names(df2)
# Populating dataframe with output
# seq_len() keeps both loops empty when there are no rows or columns.
for (i in seq_len(nrow(df1))) {
  # Row of the first cell in df2 equal to this assay (NA when none matches,
  # which leaves the df3 row filled with NA).
  index <- which(df2 == as.character(df1$assay[i]), arr.ind = TRUE)[1]
  for (j in seq_len(ncol(df3))) {
    df3[i, j] <- as.character(df2[index, j])
  }
}
df <- cbind(df1, df3)
Edit after clarification from user:
I just created a triple for loop to check your values. Basically what it does is it looks for a match. It does this by looping through all columns and all the values from that column.
However my code is not perfect yet (also a beginner in R) and I just wanted to post it so that maybe we can work something out together :).
So I first convert your data to a data.frame. After that I create an empty output which I later fill per match found.
The improvements in this method would be that with this solution the function append will also append the column names which will result in multiple useless column names.
df3 <- as.data.frame(df1)
df4 <- as.data.frame(df2)
# One slot per df3 row; rows without a match stay NULL and are dropped by
# rbind() below. Collecting rows in a list avoids append(), which would
# flatten the data frames into one long list of columns.
matched <- vector("list", nrow(df3))
for (j in seq_len(nrow(df3))) {
  found <- FALSE
  # Skip column 1 of df4 (the gene id); search the remaining columns.
  for (i in seq_len(ncol(df4))[-1]) {
    for (p in seq_len(nrow(df4))) {
      # isTRUE() treats an NA comparison as "no match" instead of an error.
      if (!found && isTRUE(df3[j, 1] == df4[p, i])) {
        # Bind the df4 row that actually matched (row p, not row j).
        matched[[j]] <- cbind(df3[j, ], df4[p, ])
        found <- TRUE
      }
    }
  }
}
output <- do.call(rbind, matched)
if (is.null(output)) {
  # No match at all: keep the original empty-data-frame result.
  output <- data.frame()
} else {
  rownames(output) <- NULL
}
Assuming each entry in df1 matches at most one entry in df2, the following solves your problem:
assay <- as.character(df1[, 1])

# For each assay, the first row of df2 whose column `col` matches it
# (case-insensitively), or NA when that column holds no match.
first_hit <- function(col) {
  vapply(assay, function(pattern) {
    hits <- grep(pattern, df2[, col], ignore.case = TRUE)
    if (length(hits) > 0) hits[1] else NA_integer_
  }, integer(1), USE.NAMES = FALSE)
}
m1 <- first_hit(2)
m2 <- first_hit(3)
m3 <- first_hit(4)
m4 <- first_hit(5)
# Per assay, take the row found in the first column that matched. Summing
# the indices instead would point at an unrelated row whenever an assay
# matched in more than one column.
m0 <- apply(cbind(m1, m2, m3, m4), 1, function(r) r[!is.na(r)][1])
# An NA subscript yields an all-NA df2 row, so unmatched assays are kept
# (a 0 subscript would give a zero-row frame and make cbind() fail).
df <- cbind(df1, df2[m0, ])
rownames(df) <- NULL
Edit: Generalization
Since you have more than 80 assay columns, you can generalize it as follows:
assay <- as.character(df1[, 1])
# Every df2 column except the first (the gene id) is searched.
stopifnot(ncol(df2) >= 2)
assay_cols <- seq_len(ncol(df2))[-1]
# Storing Assay column matches in a list: one integer vector per df2 column,
# holding for each assay the first matching df2 row (NA when none matches).
m <- lapply(assay_cols, function(col) {
  vapply(assay, function(pattern) {
    hits <- grep(pattern, df2[, col], ignore.case = TRUE)
    if (length(hits) > 0) hits[1] else NA_integer_
  }, integer(1), USE.NAMES = FALSE)
})
# Getting row subscript for df2: rows = assays, columns = searched columns;
# per assay, use the row found in the first column that matched.
m1 <- do.call(cbind, m)
m2 <- apply(m1, 1, function(r) r[!is.na(r)][1])
# An NA subscript yields an all-NA df2 row, so unmatched assays are kept,
# and a single cbind() replaces growing df with rbind() inside a loop.
df <- cbind(df1, df2[m2, ])
rownames(df) <- NULL

Resources