Extract colnames from a nested list of data.frames

I have a nested list of data.frames; what is the easiest way to get the column names of all the data.frames?
Example:
d = data.frame(a = 1:3, b = 1:3, c = 1:3)
l = list(a = d, list(b = d, c = d))
Result:
$a
[1] "a" "b" "c"
$b
[1] "a" "b" "c"
$c
[1] "a" "b" "c"

There are already a couple of answers, but let me add another approach using rapply2() from the rawr package.
devtools::install_github('raredd/rawr')
library(rawr)
library(purrr)
rapply2(l = l, FUN = colnames) %>%
  flatten()
$a
[1] "a" "b" "c"
$b
[1] "a" "b" "c"
$c
[1] "a" "b" "c"

Here is a base R solution.
You can define a custom function to flatten your nested list (which can handle nested lists of any depth, e.g., more than 2 levels), i.e.,
flatten <- function(x) {
  islist <- sapply(x, class) %in% "list"
  r <- c(x[!islist], unlist(x[islist], recursive = FALSE))
  if (!sum(islist)) return(r)
  flatten(r)
}
and then use the following code to obtain the colnames
out <- Map(colnames, flatten(l))
such that
> out
$a
[1] "a" "b" "c"
$b
[1] "a" "b" "c"
$c
[1] "a" "b" "c"
Example with a deeper nested list
l <- list(a = d, list(b = d, list(c = list(e = list(f= list(g = d))))))
> l
$a
a b c
1 1 1 1
2 2 2 2
3 3 3 3
[[2]]
[[2]]$b
a b c
1 1 1 1
2 2 2 2
3 3 3 3
[[2]][[2]]
[[2]][[2]]$c
[[2]][[2]]$c$e
[[2]][[2]]$c$e$f
[[2]][[2]]$c$e$f$g
a b c
1 1 1 1
2 2 2 2
3 3 3 3
and you will get
> out
$a
[1] "a" "b" "c"
$b
[1] "a" "b" "c"
$c.e.f.g
[1] "a" "b" "c"

Here is an attempt to do this in as vectorized a way as possible,
i1 <- names(unlist(l, TRUE, TRUE))
#[1] "a.a1" "a.a2" "a.a3" "a.b1" "a.b2" "a.b3" "a.c1" "a.c2" "a.c3" "b.a1" "b.a2" "b.a3" "b.b1" "b.b2" "b.b3" "b.c1" "b.c2" "b.c3" "c.a1" "c.a2" "c.a3" "c.b1" "c.b2" "c.b3" "c.c1" "c.c2" "c.c3"
i2 <- names(split(i1, gsub('\\d+', '', i1)))
#[1] "a.a" "a.b" "a.c" "b.a" "b.b" "b.c" "c.a" "c.b" "c.c"
We can now split i2 on everything before the dot, which will give,
split(i2, sub('\\..*', '', i2))
# $a
# [1] "a.a" "a.b" "a.c"
# $b
# [1] "b.a" "b.b" "b.c"
# $c
# [1] "c.a" "c.b" "c.c"
To get them fully cleaned, we need to loop over and apply a simple regex,
lapply(split(i2, sub('\\..*', '', i2)), function(i)sub('.*\\.', '', i))
which gives,
$a
[1] "a" "b" "c"
$b
[1] "a" "b" "c"
$c
[1] "a" "b" "c"
The code, compacted:
i1 <- names(unlist(l, TRUE, TRUE))
i2 <- names(split(i1, gsub('\\d+', '', i1)))
final_res <- lapply(split(i2, sub('\\..*', '', i2)), function(i)sub('.*\\.', '', i))

Try this
d = data.frame(a = 1:3, b = 1:3, c = 1:3)
l = list(a = d, list(b = d, c = d))
foo <- function(x, f) {
  if (is.data.frame(x)) return(f(x))
  lapply(x, foo, f = f)
}
foo(l, names)
The crux here is that data.frames actually are special lists, so it matters what you test for.
Small explanation: what is needed here is recursion, since each element may be either a data.frame or another list, so at every step you decide whether to apply names or to go deeper into the recursion and call foo again.
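A quick check (using the d from the question) shows why testing for data.frame first matters:
is.list(d)        # TRUE -- a data.frame is a list of columns under the hood
is.data.frame(d)  # TRUE
# so the data.frame test must come first; otherwise the recursion would
# descend into the individual columns instead of stopping at the data.frame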

First create l1, a nested list with only the colnames
l1 <- lapply(l, function(x) if (is.data.frame(x)) {
  list(colnames(x))  # necessary to list it for the unlist() step afterwards
} else {
  lapply(x, colnames)
})
Then unlist l1
unlist(l1, recursive=F)
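which should give back the flat list from the question:
#$a
#[1] "a" "b" "c"
#$b
#[1] "a" "b" "c"
#$c
#[1] "a" "b" "c"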

Here is one way using purrr functions map_depth and vec_depth
library(purrr)
return_names <- function(x) {
  if (inherits(x, "list"))
    return(map_depth(x, vec_depth(x) - 2, names))
  else
    return(names(x))
}
map(l, return_names)
#$a
#[1] "a" "b" "c"
#[[2]]
#[[2]]$b
#[1] "a" "b" "c"
#[[2]]$c
#[1] "a" "b" "c"

Using an external package, this is also straightforward with rrapply() from the rrapply package (and works for arbitrary levels of nesting):
library(rrapply)
rrapply(l, classes = "data.frame", f = colnames, how = "flatten")
#> $a
#> [1] "a" "b" "c"
#>
#> $b
#> [1] "a" "b" "c"
#>
#> $c
#> [1] "a" "b" "c"
## deeply nested list
l2 <- list(a = d, list(b = d, list(c = list(e = list(f = list(g = d))))))
rrapply(l2, classes = "data.frame", f = colnames, how = "flatten")
#> $a
#> [1] "a" "b" "c"
#>
#> $b
#> [1] "a" "b" "c"
#>
#> $g
#> [1] "a" "b" "c"

Related

Names of nested list containing dots (e.g. "c.2")

How can I get the names of the leaves of a nested list (containing a data.frame)
p <- list(a=1,b=list(b1=2,b2=3),c=list(c1=list(c11='a',c12='x'),c.2=data.frame("t"=1)))
into a vector format:
[[1]]
[1] "a"
[[2]]
[1] "b" "b1"
[[3]]
[1] "b" "b2"
[[4]]
[1] "c" "c1" "c11"
[[5]]
[1] "c" "c1" "c12"
[[6]]
[1] "c" "c.2"
The problem is that my list contains names with a dot (e.g. "c.2"). By using unlist, one gets "c.c.2" and I (or possibly strsplit) can't tell whether the dot is a delimiter added by unlist or part of the name. That is the difference from this question.
It should ignore data.frames. My approach so far is adapted from here, but struggles with the dots created by unlist:
listNames = function(l, maxDepth = 2) {
  n = 0
  listNames_rec = function(l, n) {
    if (!is.list(l) | is.data.frame(l) | n >= maxDepth) TRUE
    else {
      n = n + 1
      # print(n)
      lapply(l, listNames_rec, n)
    }
  }
  n = names(unlist(listNames_rec(l, n)))
  return(n)
}
listNames(p, maxDepth = 3)
[1] "a" "b.b1" "b.b2" "c.c1.c11" "c.c1.c12" "c.c.2"
Like this?
subnames <- function(L, s) {
  if (!is.list(L) || is.data.frame(L)) return(L)
  names(L) <- gsub(".", s, names(L), fixed = TRUE)
  lapply(L, subnames, s)
}
res <- listNames(subnames(p, ":"), maxDepth = 3)
gsub(":", ".",
gsub(".", "$", res, fixed = TRUE),
fixed = TRUE
)
#[1] "a" "b$b1" "b$b2" "c$c1$c11" "c$c1$c12" "c$c.2"
Not a full answer, but I imagine the rrapply package could help you here.
One option could be to extract all names:
library(rrapply)
library(dplyr)
rrapply(p, how = "melt") %>%
select(-value)
# L1 L2 L3
# 1 a <NA> <NA>
# 2 b b1 <NA>
# 3 b b2 <NA>
# 4 c c1 c11
# 5 c c1 c12
# 6 c c.2 t
The problem here is that the data.frame's internal column name (the t in row 6) is included above too, so you could extract the data.frame entries separately:
#extract data frame name
rrapply(p, classes = "data.frame", how = "melt") %>%
select(-value)
# L1 L2
# 1 c c.2
Then you could play around with these two datasets and, for example, filter out the duplicated rows while keeping the data.frame names:
rrapply(p, how = "melt") %>%
bind_rows(rrapply(p, classes = "data.frame", how = "melt"))
#then filter etc...
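One way to do that filtering, as a sketch (full_melt and df_melt are just names I use here for the two melted data frames above, and the join is assumed to be on the L1/L2 columns shown in their output):
library(rrapply)
library(dplyr)

full_melt <- rrapply(p, how = "melt") %>% select(-value)
df_melt   <- rrapply(p, classes = "data.frame", how = "melt") %>% select(-value)

# drop the rows that descended into the data.frame's columns,
# then add the data.frame's own row back in
full_melt %>%
  anti_join(df_melt, by = c("L1", "L2")) %>%
  bind_rows(df_melt)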
A way might be:
listNames = function(l, n, N) {
  if (!is.list(l) | is.data.frame(l) | n < 1) list(rev(N))
  else unlist(Map(listNames, l, n = n - 1, N = lapply(names(l), c, N)), FALSE, FALSE)
}
listNames(p, 3, NULL)
#[[1]]
#[1] "a"
#
#[[2]]
#[1] "b" "b1"
#
#[[3]]
#[1] "b" "b2"
#
#[[4]]
#[1] "c" "c1" "c11"
#
#[[5]]
#[1] "c" "c1" "c12"
#
#[[6]]
#[1] "c" "c.2"

Create a list containing a variable number of lists

I need to create a list from rows of a dataframe in the following format:
df <- data.frame(y1 = c("a", "d"), y2 = c("b", "e"), y3 = c("c", "f"))
df$y1 <- as.character(df$y1)
df$y2 <- as.character(df$y2)
df$y3 <- as.character(df$y3)
x <- list(
list(y1 = df$y1[1],
y2 = df$y2[1],
y3 = df$y3[1]),
list(y1 = df$y1[2],
y2 = df$y2[2],
y3 = df$y3[2])
)
> x
[[1]]
[[1]]$`y1`
[1] "a"
[[1]]$y2
[1] "b"
[[1]]$y3
[1] "c"
[[2]]
[[2]]$`y1`
[1] "d"
[[2]]$y2
[1] "e"
[[2]]$y3
[1] "f"
This is an example when there are two rows in the dataframe. How can I achieve this when the number of rows in the dataframe is variable? So for every row in the dataframe, there should be a list.
We may also use apply by going over the rows and applying as.list to each:
apply(df, 1, as.list)
[[1]]
[[1]]$y1
[1] "a"
[[1]]$y2
[1] "b"
[[1]]$y3
[1] "c"
[[2]]
[[2]]$y1
[1] "d"
[[2]]$y2
[1] "e"
[[2]]$y3
[1] "f"
We first split every row of the dataframe, and then for every row we convert each element into a separate list element using as.list:
lapply(split(df, 1:nrow(df)), as.list)
#$`1`
#$`1`$y1
#[1] "a"
#$`1`$y2
#[1] "b"
#$`1`$y3
#[1] "c"
#$`2`
#$`2`$y1
#[1] "d"
#$`2`$y2
#[1] "e"
#$`2`$y3
#[1] "f"
We can use transpose from purrr
library(purrr)
transpose(df)
#[[1]]
#[[1]]$y1
#[1] "a"
#[[1]]$y2
#[1] "b"
#[[1]]$y3
#[1] "c"
#[[2]]
#[[2]]$y1
#[1] "d"
#[[2]]$y2
#[1] "e"
#[[2]]$y3
#[1] "f"

R - How do I check if an element is in a list of vectors?

Ok, my question might be a bit weirder than what the title suggests.
I have this list:
x <- list(
c("a", "d"),
c("a", "c"),
c("d", "e"),
c("e", "f"),
c("b", "c"),
c("f", "c"), # row 6
c("c", "e"),
c("f", "b"),
c("b", "a")
)
And I need to copy this stuff into another list called T. The only condition is that both letters of the pair must not already be in T. If one of them is already in T and the other isn't, that's fine.
Basically, in this example I would take the first 5 positions and copy them into T one after another, because either one or both letters are new to T.
Then I would skip the 6th position, because the letter "f" is already in the 4th position of T and the letter "c" is already in the 2nd and 5th positions of T.
Then I would skip the remaining 3 positions for the same reason (the letters "c", "e", "f", "b", "a" are all already in T at this point).
I tried doing this
for (i in 1:length(T)) {
  if (!(*first letter* %in% T && *second letter* %in% T)) {
    T[[i]] <- c(*first letter*, *second letter*)
  }
}
But it's like the "if" isn't even there, and I'm pretty sure I'm using %in% in the wrong way.
Any suggestions? I hope what I wrote makes sense, I'm new to R and to this site in general.
Thanks for your time
Effectively, for each element of the list, you want to lose it if both of its elements exist in earlier elements. A logical index is helpful here.
# Make a logical vector the length of x.
lose <- logical(length(x))
Now you can run a loop over the length of lose and compare each element of x against all the elements that came before it. Using seq_len saves us the headache of having to guard against the special case i = 1 (seq_len(0) returns a zero-length integer rather than 0).
for (i in seq_along(lose)) {
  lose[i] <- all(x[[i]] %in% unique(unlist(x[seq_len(i - 1)])))
}
Now let's use the logical vector to subset x to T
T <- x[!lose]
T
#> [[1]]
#> [1] "a" "d"
#>
#> [[2]]
#> [1] "a" "c"
#>
#> [[3]]
#> [1] "d" "e"
#>
#> [[4]]
#> [1] "e" "f"
#>
#> [[5]]
#> [1] "b" "c"
You can put the set of all previous elements in a list cum.sets, then use Map to check if all elements of the current vector are in the lagged cumulative set.
cum.sets <- lapply(seq_along(x), function(y) unlist(x[1:y]))
keep <- unlist(
  Map(function(x, y) !all(x %in% y),
      x,
      c(NA, cum.sets[-length(cum.sets)])))
x[keep]
# [[1]]
# [1] "a" "d"
#
# [[2]]
# [1] "a" "c"
#
# [[3]]
# [1] "d" "e"
#
# [[4]]
# [1] "e" "f"
#
# [[5]]
# [1] "b" "c"
tidyverse version (same output)
library(tidyverse)
cum.sets <- imap(x, ~ unlist(x[1:.y]))
keep <- map2_lgl(x, lag(cum.sets), ~!all(.x %in% .y))
x[keep]
You can use Reduce. In this case, if all the new values are not already in the list, concatenate them to the list; otherwise drop them. The initial value is the first element of the list:
Reduce(function(i, y) c(i, if (!all(y %in% unlist(i))) list(y)), x[-1], init = x[1])
[[1]]
[1] "a" "d"
[[2]]
[1] "a" "c"
[[3]]
[1] "d" "e"
[[4]]
[1] "e" "f"
[[5]]
[1] "b" "c"
The most straightforward option is to store unique entries in another vector as you loop through your input data.
Here's a solution that doesn't depend on the positions (1 or 2) of the letters within each pair or on the order of your input list.
dat <- list(c('a','d'), c('a','c'), c('d','e'), c('e','f'), c('b','c'),
            c('f','c'), c('c','e'), c('f','b'), c('b','a'))
Dat <- list()
idx <- list()
for (i in dat) {
  if (!all(i %in% idx)) {
    Dat <- append(Dat, list(i))
    ## append to idx if not previously observed
    if (!i[1] %in% idx) idx <- append(idx, i[1])
    if (!i[2] %in% idx) idx <- append(idx, i[2])
  }
}
print(Dat)
#> [[1]]
#> [1] "a" "d"
#>
#> [[2]]
#> [1] "a" "c"
#>
#> [[3]]
#> [1] "d" "e"
#>
#> [[4]]
#> [1] "e" "f"
#>
#> [[5]]
#> [1] "b" "c"
On another note, I'd advise against using T as your vector name as it's used as TRUE in R.
We can unlist, check duplicated values with duplicated, reformat as a matrix and filter out pairs of TRUE values:
x[colSums(matrix(duplicated(unlist(x)), nrow = 2)) != 2]
# [[1]]
# [1] "a" "d"
#
# [[2]]
# [1] "a" "c"
#
# [[3]]
# [1] "d" "e"
#
# [[4]]
# [1] "e" "f"
#
# [[5]]
# [1] "b" "c"
#
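For readers unpacking that one-liner, here is a sketch of the intermediate steps (the intermediate values are written out as comments, assuming the x from the question):
v <- unlist(x)              # "a" "d" "a" "c" "d" "e" ... all letters flattened
dup <- duplicated(v)        # TRUE for any letter already seen earlier
m <- matrix(dup, nrow = 2)  # one column per original pair
colSums(m)                  # 2 means both letters of the pair were already seen
x[colSums(m) != 2]          # keep pairs where at least one letter is new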
And I recommend you don't use T as a variable name; it means TRUE by default (though it's discouraged to rely on that), and reassigning it could lead to unpleasant debugging.

Applying as.numeric only to elements of a list that can be coerced to numeric (in R)

I have a function which returns a list containing individual character vectors which I would like to convert to numeric. Most of the time, all the elements of the list can easily be coerced to numeric, and so a simple lapply(x, FUN = as.numeric) works fine.
e.g.
l <- list(a = c("1","1"), b = c("2","2"))
l
$a
[1] "1" "1"
$b
[1] "2" "2"
lapply(l, FUN = as.numeric)
$a
[1] 1 1
$b
[1] 2 2
However, in some situations, the vectors contain genuinely non-numeric characters:
e.g.
l <- list(a = c("1","1"), b = c("a","b"))
l
$a
[1] "1" "1"
$b
[1] "a" "b"
lapply(l, FUN = as.numeric)
$a
[1] 1 1
$b
[1] NA NA
The solution I have come up with works, but it feels a little convoluted:
l.id <- unlist(lapply(l, FUN = function(x){all(!is.na(suppressWarnings(as.numeric(x))))}))
l.id
a b
TRUE FALSE
l[l.id] <- lapply(l[l.id], FUN = as.numeric)
l
$a
[1] 1 1
$b
[1] "a" "b"
So I was just wondering if anyone out there had a more streamlined and elegant solution to suggest.
Thanks!
One option would be to check whether all the elements in the vector contain only numbers and, if so, convert to numeric; otherwise leave it as it is.
lapply(l, function(x) if(all(grepl('^[0-9.]+$', x))) as.numeric(x) else x)
Or we can use type.convert to automatically convert the class, but the character vectors will be converted to factor class.
lapply(l, type.convert)
You could also do something like
lapply(l, function(x) if(is.numeric(t <- type.convert(x))) t else x)
# $a
# [1] 1 1
#
# $b
# [1] "a" "b"
This doesn't convert anything except when type.convert() yields a numeric. Or, for this simple case, we can use as.is = TRUE, but note that this will not always give us what we want.
lapply(l, type.convert, as.is = TRUE)
# $a
# [1] 1 1
#
# $b
# [1] "a" "b"

Aggregate strings using c() in dplyr summarize or aggregate

I want to aggregate some strings using c() as the aggregation function in dplyr. I first tried the following:
> InsectSprays$spray = as.character(InsectSprays$spray)
> dt = tbl_df(InsectSprays)
> dt %>% group_by(count) %>% summarize(c(spray))
Error: expecting a single value
But using c() function in aggregate() works:
> da = aggregate(spray ~ count, InsectSprays, c)
> head(da)
count spray
1 0 C, C
2 1 C, C, C, C, E, E
3 2 C, C, D, E
Searching on Stack Overflow hinted that, instead of the c() function, using paste() with collapse would solve the problem:
dt %>% group_by(count) %>% summarize(s=paste(spray, collapse=","))
or
dt %>% group_by(count) %>% summarize(paste( c(spray), collapse=","))
My question is: why does the c() function work in aggregate() but not in dplyr's summarize()?
If you have a closer look, you can find that c() actually does work (to a certain extent) when we use do(). But to my understanding, dplyr does not currently allow this type of printing of lists:
> InsectSprays$spray = as.character(InsectSprays$spray)
> dt = tbl_df(InsectSprays)
> doC <- dt %>% group_by(count) %>% do(s = c(.$spray))
> head(doC)
Source: local data frame [6 x 2]
count s
1 0 <chr[2]>
2 1 <chr[6]>
3 2 <chr[4]>
4 3 <chr[8]>
5 4 <chr[4]>
6 5 <chr[7]>
> head(doC)[[2]]
[[1]]
[1] "C" "C"
[[2]]
[1] "C" "C" "C" "C" "E" "E"
[[3]]
[1] "C" "C" "D" "E"
[[4]]
[1] "C" "C" "D" "D" "E" "E" "E" "E"
[[5]]
[1] "C" "D" "D" "E"
[[6]]
[1] "D" "D" "D" "D" "D" "E" "E"
