Convert data.frame to list of lists - r

I am trying to figure out how to convert a data.frame to a list of lists. Suppose I had (feel free to modify this if you need to capture more attributes for later):
v <- list(
row1=list(col1 = as.Date("2011-01-23"), col2="A"),
row2=list(col1 = as.Date("2012-03-03"), col2="B"))
d <- do.call(rbind, lapply(v, as.data.frame))
d$col3 <- 2
How do I get d back to a list of lists (similar to v)? The end result should be equivalent to the result of:
vr <- list(
row1=list(col1 = as.Date("2011-01-23"), col2="A", col3=2),
row2=list(col1 = as.Date("2012-03-03"), col2="B", col3=2))

You can do
out <- lapply(split(d, rownames(d)), as.list)
out
#$row1
#$row1$col1
#[1] "2011-01-23"
#$row1$col2
#[1] "A"
#$row1$col3
#[1] 2
#$row2
#$row2$col1
#[1] "2012-03-03"
#$row2$col2
#[1] "B"
#$row2$col3
#[1] 2
If you add stringsAsFactors = FALSE when creating d, i.e.
d <- do.call(rbind, lapply(v, as.data.frame, stringsAsFactors = FALSE))
d$col3 <- 2
then
identical(out, vr)
returns TRUE.

You have to go through the rows, turning each one into a list, before you use them as the elements of the main list. I hope the code below helps:
# Build one list per row, named after the row names of `d`.
# apply() is avoided on purpose: it first coerces the data frame to a matrix,
# which turns the Date and numeric columns into character strings.
setNames(
  lapply(seq_len(nrow(d)), function(i) as.list(d[i, , drop = FALSE])),
  rownames(d)
)

Related

Using attribute tables to apply a function between specific elements in lists

I have two list objects. l1 contains information that has been read in from files. l2 is a list of values whose names share components with those in l1. I have assigned attributes to both lists based on the names of their elements. I would like to reach my expected results using the attributes that I have assigned to my lists.
For example: I would like to apply the function mean() between the elements of l1 whose id attribute is "2013_mean" and the elements of l2 whose year attribute is "2013". I would like to do the same for the elements whose year attribute is "2016".
# File List
oldl1 <- list(2,3,4,5)
names(oldl1) <- c("C:/Users/2013_mean.csv",
"C:/Users/2013_median.csv",
"C:/Users/2016_mean.csv",
"C:/Users/2016_median.csv"
)
newl1 <- list(2,3,4,5,8,9)
names(newl1) <- c("C:/Users/2013_mean.csv",
"C:/Users/2013_median.csv",
"C:/Users/2016_mean.csv",
"C:/Users/2016_median.csv",
"C:/Users/2017_mean.csv",
"C:/Users/2017_median.csv"
)
attributes(l1) <- data.frame(id = sub("\\.csv", "", basename(names(l1))),
year = trimws(basename(names(l1)), whitespace = "_.*"))
# Other List
l2 <- list(8,9,10,15,1)
names(l2) <- c("2013_A",
"2013_B",
"2013_C",
"2016_D",
"2016_E")
attributes(l2) <- data.frame(year = trimws(names(l2), whitespace = "_.*"))
expected <- list(mean(c(l1[[1]], l2[[1]])),
mean(c(l1[[1]], l2[[2]])),
mean(c(l1[[1]], l2[[3]])),
mean(c(l1[[3]], l2[[4]])),
mean(c(l1[[3]], l2[[5]]))
)
We may use the attributes to split and match and get the mean
yrs <- intersect(attr(l1, "year"), attr(l2, "year"))
i1 <- grepl("mean", attr(l1, "id"))
i12 <- attr(l1, "year") %in% yrs
i1 <- i1 & i12
i2 <- attr(l2, "year") %in% yrs
l2new <- l2[i2]
l1new <- l1[i1]
attr(l1new, "year") <- attr(l1, "year")[i1]
out <- do.call(c, Map(function(x, y) lapply(x, function(z)
mean(c(z, y))), split(l2new, attr(l2, 'year')[i2]), l1new))
names(out) <- NULL
-checking with OP's expected
> identical(out, expected)
[1] TRUE
Or another option is to convert the list with attributes to a data.frame, do a merge and use rowMeans and then convert to list with as.list
as.list(rowMeans(merge(transform(data.frame(attributes(l2)),
l2 = unlist(l2)),
subset(transform(data.frame(attributes(l1)), l1 = unlist(l1)),
grepl("mean", id), select = c(year, l1)), all.x = TRUE)[-1]))
-output
[[1]]
[1] 5
[[2]]
[1] 5.5
[[3]]
[1] 6
[[4]]
[1] 9.5
[[5]]
[1] 2.5

How to get a specific list from a list of lists?

This is such a basic question and for some reason I can't figure out how to get this right. Suppose I have a list of lists
v <- list(
list(a=1, b=2, c=3),
list(a=4, b=5, c=6),
list(a=7, b=8, c=9))
How do I pull out a list of all elements that are named "a"? I.e., I would like to get list(1, 4, 7) when asking for a.
We can use pluck
library(tidyverse)
map(v, pluck, "a")
#[[1]]
#[1] 1
#[[2]]
# [1] 4
#[[3]]
# [1] 7
The corresponding method in base R would be
lapply(v, `[[`, "a")
In base R, we can use
unlist(v)[names(unlist(v))=="a"]
Or, if you prefer not to use unlist twice:
(x <- unlist(v))[names(x)=="a"]

R add columns based on modifying other columns of dataframes within a list

I would like to add a new column D to data.frames in a list that contains the first part of column B. But I'm not sure how to address elements within lists down to the column level.
create some data
df1 <- data.frame(A = "hey", B = "wass.7", C = "up")
df2 <- data.frame(A = "how", B = "are.1", C = "you")
dfList <- list(df1,df2)
desired output:
# a new column removing the last part of column B
[[1]]
A B C D
1 hey wass.7 up wass
[[2]]
A B C D
1 how are.1 you are
for each data frame I did this, which worked
df1$D<-sub('\\..*', '', df1$B)
in a function I tried this, which is probably
not correctly addressing the columns and returns
"unexpected symbol..."
dfList <- lapply(rapply(dfList, function(x)
x$D<-sub('\\..*', '', x$B) how = "list"),
as.data.frame)
the lapply(rapply) part is copied from Using gsub in list of dataframes with R
Check this out
lapply(dfList, function(x){
x$D <-sub('\\..*', '', x$B);
x
})
[[1]]
A B C D
1 hey wass.7 up wass
[[2]]
A B C D
1 how are.1 you are
The rapply solution does work. However, you need a comma before the how argument to resolve the error. Additionally, you will NOT be able to add a new column this way; you can only replace existing ones. Since rapply is a recursive call, it will run sub() on every element of the nested list, i.e. across ALL columns of ALL dataframes.
Otherwise use a simple lapply per @JilberUrbina's answer.
# Rebuild the example data. FALSE is spelled out because `F` is an ordinary
# variable that can be reassigned.
df1 <- data.frame(A = "hey", B = "wass.7", C = "up", stringsAsFactors = FALSE)
df2 <- data.frame(A = "how", B = "are.1", C = "you", stringsAsFactors = FALSE)
dfList <- list(df1, df2)
# rapply() applies the substitution to every column of every data frame, so
# everything from the first dot onwards is stripped in ALL columns. The
# how = "list" result is passed through as.data.frame() element by element.
dfList <- lapply(
  rapply(dfList, function(x) sub("\\..*", "", x), how = "list"),
  as.data.frame
)
dfList
# [[1]]
# A B C
# 1 hey wass up
# [[2]]
# A B C
# 1 how are you

Isolate alphabetical strings within a larger string

Is there a way to isolate parts of a string that are in alphabetical order?
In other words, if you have a string like this: hjubcdepyvb
Could you just pull out the portion in alphabetical order?: bcde
I have thought about using the is.unsorted() function, but I'm not sure how to apply this to only a portion of a string.
Here's one way by converting to ASCII and back:
input <- "hjubcdepyvb"
# Byte values of the characters (the ASCII codes for plain ASCII input).
codes <- as.integer(charToRaw(input))
# TRUE where a character is exactly one code above its predecessor.
follows_prev <- diff(codes) == 1
# Keep a character if it follows its predecessor or is followed by its
# successor, so that both ends of every sequence are retained.
in_sequence <- c(FALSE, follows_prev) | c(follows_prev, FALSE)
filt <- codes[in_sequence]
# Convert the remaining codes back to a character string.
rawToChar(as.raw(filt))
#[1] "bcde"
Note that this will concatenate any parts that are in alphabetical order.
i.e. If input is "abcxasdicfgaqwe" then output would be abcfg.
If you wanted to get separate vectors for each sequential string, you could do the following
input <- "abcxasdicfgaqwe"
spl_asc <- as.integer(charToRaw(input))
d1 <- diff(spl_asc) == 1
r <- rle(c(FALSE, d1) | c(d1, FALSE)) # Find boundaries
cm <- cumsum(c(1, r$lengths)) # Map these to string positions
substring(input, cm[-length(cm)], cm[-1] - 1)[r$values] # Extract matching strings
Finally, I had to come up with a way to use regex:
input <- c("abcxasdicfgaqwe", "xufasiuxaboqdasdij", "abcikmcapnoploDEFgnm",
"acfhgik")
(rg <- paste0("(", paste0(c(letters[-26], LETTERS[-26]),
"(?=", c(letters[-1], LETTERS[-1]), ")", collapse = "|"), ")+."))
#[1] "(a(?=b)|b(?=c)|c(?=d)|d(?=e)|e(?=f)|f(?=g)|g(?=h)|h(?=i)|i(?=j)|j(?=k)|
#k(?=l)|l(?=m)|m(?=n)|n(?=o)|o(?=p)|p(?=q)|q(?=r)|r(?=s)|s(?=t)|t(?=u)|u(?=v)|
#v(?=w)|w(?=x)|x(?=y)|y(?=z)|A(?=B)|B(?=C)|C(?=D)|D(?=E)|E(?=F)|F(?=G)|G(?=H)|
#H(?=I)|I(?=J)|J(?=K)|K(?=L)|L(?=M)|M(?=N)|N(?=O)|O(?=P)|P(?=Q)|Q(?=R)|R(?=S)|
#S(?=T)|T(?=U)|U(?=V)|V(?=W)|W(?=X)|X(?=Y)|Y(?=Z))+."
regmatches(input, gregexpr(rg, input, perl = TRUE))
#[[1]]
#[1] "abc" "fg"
#
#[[2]]
#[1] "ab" "ij"
#
#[[3]]
#[1] "abc" "nop" "DEF"
#
#[[4]]
#character(0)
This regular expression will identify consecutive upper or lower case letters (but not mixed case). As demonstrated, it works for character vectors and produces a list of vectors with all the matches identified. If no match is found, the output is character(0).
Using factor integer conversion:
input <- "hjubcdepyvb"
# Split once and reuse the characters for both the lookup and the output.
chars <- unlist(strsplit(input, ""))
# Position of each character in the alphabet; NA for anything outside a-z.
alpha_pos <- as.integer(factor(chars, levels = letters))
# TRUE where a character directly follows its predecessor in the alphabet.
d1 <- diff(alpha_pos) == 1
# Characters outside a-z yield NA here. Treat them as breaks so they neither
# join a sequence nor leak the string "NA" into the pasted result.
d1[is.na(d1)] <- FALSE
filt <- c(FALSE, d1) | c(d1, FALSE)
paste(chars[filt], collapse = "")
# [1] "bcde"
# Extract every run of two or more consecutive lowercase letters (such as
# "bcde") from a string.
#
# x: character string to scan. If several strings are supplied, their
#    characters are scanned as one concatenated sequence.
#
# Returns a character vector with one element per run, in order of
# appearance, or character(0) when there is no run. Each element is named by
# the index of its stretch when the input is cut into consecutive run and
# non-run stretches (e.g. "2" and "4" for "hjubcdepyvblltpqrs").
myf <- function(x) {
  chars <- unlist(strsplit(x, ""))
  # Fewer than two characters can never form a run.
  if (length(chars) < 2L) {
    return(character(0))
  }
  # Alphabet position of each character; NA for anything that is not a-z.
  pos <- match(chars, letters)
  # TRUE where a character is the direct alphabetical successor of the
  # previous one. Comparisons involving NA count as a break.
  step <- c(FALSE, diff(pos) == 1L)
  step[is.na(step)] <- FALSE
  # A character belongs to a run if it continues one or is continued by its
  # right-hand neighbour.
  member <- step | c(step[-1L], FALSE)
  if (!any(member)) {
    return(character(0))
  }
  # Give every run its own label (0 outside runs) so that two runs that touch
  # each other, as in "abab", are kept apart instead of being merged.
  label <- cumsum(member & !step) * member
  runs <- rle(label)
  run_id <- rep(seq_along(runs$lengths), runs$lengths)
  groups <- split(which(member), run_id[member])
  vapply(groups, function(idx) paste(chars[idx], collapse = ""), character(1))
}
myf(x = "hjubcdepyvblltpqrs")
# 2 4
#"bcde" "pqrs"

Retrieving ids of splitted list in R

Say I have a list, like so:
# Build 100 records, each holding a randomly chosen location and two vectors
# of uniform random numbers.
n_items <- 100L
my.list <- vector("list", n_items) # preallocate instead of growing the list
for (i in seq_len(n_items)) {
  my.list[[i]] <- list(
    location = sample(paste0("Location", 1:5), 1, replace = TRUE),
    val1 = runif(100),
    val2 = runif(30)
  )
}
Now I split it by location
loc <- sapply(my.list, function(x){x$location})
my.list.split <- split(my.list, loc)
Is there a way to associate each element of my.list.split to the original my.list, i.e., finding its ID in my.list?
Here's one way to find the IDs:
IDs <- seq_along(my.list) # generate a vector of IDs
IDs.split <- split(IDs, loc) # split the IDs along loc
This returns a list which includes vectors of IDs for each location.
If you give my.list some names, then your my.list.split will also have names which you can use to refer back, if necessary.
# Syntactically different, but functionally equivalent way of creating the list.
my.list <- lapply(seq_len(100), function(i) {
  list(
    location = sample(paste0("Location", 1:5), 1, replace = TRUE),
    val1 = runif(100),
    val2 = runif(30)
  )
})
# Added: name every element so the names are carried into the split below.
names(my.list) <- paste0("id_", seq_along(my.list))
# vapply() guarantees a character vector with one location per element.
loc <- vapply(my.list, function(item) item$location, character(1))
my.list.split <- split(my.list, loc)
So, now everything has a unique id:
my.list.split[[1]]
# $id_11
# $id_11$location
# [1] "Location1"
#
# $id_11$val1
# [1] 0.997154684 0.348063634 0.373797808 0.569167679 0.417461443 0.799423830 0.147882721
# [8] 0.489438012 0.292867337 0.072622654 0.583932815 0.060452664 0.083562011 0.613114462
# ....
# $id_11$val2
# [1] 0.68983774 0.41056046 0.18620312 0.61078253 0.85947881 0.50736945 0.01362270 0.70022800
Another way if for some reason you don't want to set IDs first:
match(unlist(my.list.split, FALSE), my.list)
You can then set the names with names() or whatever if that's what you're trying to do.
split() divides your list into a nested list according to loc. unlist() with recursive set to FALSE will remove the items from my.list.split so that they are in the same shape as my.list. Then all you have to do is match() them to see which items refer to what indices in the original object.
Proof that the match is correct (should return TRUE unless I've made a horrible mistake):
ul <- unlist(my.list.split, FALSE)
m <- match(ul, my.list)
identical(my.list[m], unname(ul))

Resources