Find unique level of list of data set each column - r

I have a list of 18 datasets, each with several columns. How can I write a loop that, for each column index, finds the intersection of the values in that column across all datasets, and returns a list of these intersections (one per column index)?
df1 <- data.frame(id = c(1:5), loc = c("a","b","c","a","b"))
df2 <- data.frame(id = c(3:7), ta = c("c","b","d","a","b"))
df3 <- data.frame(id = c(1:5), az = c("d","a","e","d","b"))
df <- list(df1, df2, df3)
df <- lapply(df, function(i) lapply(i, function(j) as.character(j)))
intersect(df[[1]][1], df[[2]][1], df[[3]][1])
intersect(df[[1]][2], df[[2]][2], df[[3]][2])

With tidyverse, we can use map/reduce
library(purrr)
library(dplyr)
map(df, pull, 1) %>%
reduce(intersect)
#[1] 3 4 5
Or as a function
# Intersect the values of column `ind` across every data frame in `lstA`.
# `ind` is handed straight to pull(), so it selects the column to compare.
f1 <- function(lstA, ind) {
  picked <- map(lstA, pull, ind)
  reduce(picked, intersect)
}
f1(df, 1)
#[1] 3 4 5
f1(df, 2)
#[1] "a" "b"

You may use Reduce on the intersect function and the [ in an sapply to choose sub list number.
Single:
Reduce(intersect, sapply(df, `[`, 1))
# [1] "3" "4" "5"
Reduce(intersect, sapply(df, `[`, 2))
# [1] "a" "b"
Or altogether:
# For each column index, pull that column out of every dataset with `[[`
# (no reliance on sapply() simplification) and intersect them pairwise.
lapply(1:2, function(i) Reduce(intersect, lapply(df, `[[`, i)))
# [[1]]
# [1] "3" "4" "5"
#
# [[2]]
# [1] "a" "b"

Related

Names of nested list containing dots (e.g. "c.2")

How can I get the names of the leaves of a nested list (containing a data frame), e.g.
p <- list(a=1,b=list(b1=2,b2=3),c=list(c1=list(c11='a',c12='x'),c.2=data.frame("t"=1)))
into a vector format:
[[1]]
[1] "a"
[[2]]
[1] "b" "b1"
[[3]]
[1] "b" "b2"
[[4]]
[1] "c" "c1" "c11"
[[5]]
[1] "c" "c1" "c12"
[[6]]
[1] "c" "c.2"
The problem is that my list contains names with a dot (e.g. "c.2"). Using unlist, one gets "c.c.2", and neither I nor strsplit can tell whether a dot is a delimiter inserted by unlist or part of a name. That is the difference from this question.
It should ignore data.frames (i.e. not descend into them). My approach so far is adapted from here, but it struggles with the dots created by unlist:
# Collect the unlist()-style names of the leaves of a nested list.
#
# l        : a (possibly nested) list.
# maxDepth : number of list levels to descend; anything nested deeper is
#            treated as a leaf.
#
# Returns a character vector with one entry per leaf, built by unlist(), which
# joins the names along the path with ".". A "." that is part of an element
# name therefore cannot be told apart from a separator. Data frames are
# treated as leaves and are not descended into.
listNames <- function(l, maxDepth = 2) {
  listNames_rec <- function(l, depth) {
    # All three conditions are scalar, so use short-circuit `||`, not `|`.
    if (!is.list(l) || is.data.frame(l) || depth >= maxDepth) {
      TRUE
    } else {
      lapply(l, listNames_rec, depth + 1)
    }
  }
  names(unlist(listNames_rec(l, 0)))
}
listNames(p, maxDepth = 3)
[1] "a" "b.b1" "b.b2" "c.c1.c11" "c.c1.c12" "c.c.2"
Like this?
# Recursively replace every "." in the names of a nested list with `s`.
# Non-list values and data frames are returned unchanged (their own names
# are left alone).
subnames <- function(L, s) {
  is_branch <- is.list(L) && !is.data.frame(L)
  if (is_branch) {
    names(L) <- gsub(".", s, names(L), fixed = TRUE)
    lapply(L, subnames, s)
  } else {
    L
  }
}
res <- listNames(subnames(p, ":"), maxDepth = 3)
gsub(":", ".",
gsub(".", "$", res, fixed = TRUE),
fixed = TRUE
)
#[1] "a" "b$b1" "b$b2" "c$c1$c11" "c$c1$c12" "c$c.2"
Not a full answer but I imagine rrapply package could help you here?
One option could be to extract all names:
library(rrapply)
library(dplyr)
rrapply(p, how = "melt") %>%
select(-value)
# L1 L2 L3
# 1 a <NA> <NA>
# 2 b b1 <NA>
# 3 b b2 <NA>
# 4 c c1 c11
# 5 c c1 c12
# 6 c c.2 t
The problem here is that data.frame names are included above too so you could extract them separately:
#extract data frame name
rrapply(p, classes = "data.frame", how = "melt") %>%
select(-value)
# L1 L2
# 1 c c.2
Then you could play around with these two datasets and perhaps extract duplicates but keep dataframe names
rrapply(p, how = "melt") %>%
bind_rows(rrapply(p, classes = "data.frame", how = "melt"))
#then filter etc...
A way might be:
# Return the name path of every leaf of a nested list as a list of character
# vectors, so that names containing "." stay unambiguous.
#
# l : a (possibly nested) named list.
# n : how many list levels may still be descended; with n < 1, `l` is a leaf.
# N : names collected so far, innermost first; pass NULL at the top level.
#
# Data frames and empty lists are treated as leaves.
listNames <- function(l, n, N) {
  # Scalar conditions, hence short-circuit `||`. An empty list has no children
  # and would make Map() fail on mixed zero-length inputs, so it is a leaf.
  if (!is.list(l) || is.data.frame(l) || n < 1 || length(l) == 0L) {
    list(rev(N))
  } else {
    # Prepend each child's name to the path, recurse, then flatten one level.
    child_paths <- lapply(names(l), c, N)
    unlist(Map(listNames, l, n = n - 1, N = child_paths), FALSE, FALSE)
  }
}
listNames(p, 3, NULL)
#[[1]]
#[1] "a"
#
#[[2]]
#[1] "b" "b1"
#
#[[3]]
#[1] "b" "b2"
#
#[[4]]
#[1] "c" "c1" "c11"
#
#[[5]]
#[1] "c" "c1" "c12"
#
#[[6]]
#[1] "c" "c.2"

R dplyr mutate_at accessing colnames

How could one access the column name being processed by dplyr::mutate_at?
Let's say we would like to convert a column of a data frame into factors with levels stored in a separate list.
df <- data.frame("C1"=c("A","B","C"), "C2"=c("D","E","F"))
df
C1 C2
1 A D
2 B E
3 C F
lst <- list("C2"=c("F","E","D"), "C3"=c("G","H","I"))
lst
$C2
[1] "F" "E" "D"
$C3
[1] "G" "H" "I"
All of the following either trigger an error or replace all the column values with NA:
df %>%
mutate_at(vars(C2), function(x) factor(x, levels=lst$.))
df %>%
mutate_at(vars(C2), function(x) factor(x, levels=lst[[colnames(.)]]))
df %>%
mutate_at(vars(C2), function(x){col = as.name(.); factor(x, levels=lst$col))
You can use Map in base R or map2 from purrr after getting the common columns using intersect.
# Names that appear both in the levels list and in the data frame.
cols <- intersect(names(lst), names(df))
# Pair each shared column with its own entry of lst and convert it to a factor
# using that entry as the levels.
df[cols] <- Map(function(x, y) factor(x, levels = y), df[cols], lst[cols])
Or
df[cols] <- purrr::map2(df[cols], lst[cols], ~factor(.x, levels = .y))

How to remove duplicate elements from two lists (pairwise)?

I have two very large lists (13000 elements each). I would like to remove the duplicates pairwise, i.e. remove the object at position i from both lists if the two lists hold the same object at that position.
The function unique() works very well for a single list, but does not work pairwise.
a = matrix(c(50,70,45,89), ncol = 2)
b = matrix(c(45,86), ncol = 2)
c = matrix(c(20,35), ncol = 2)
df1 = list(a,b,c)
df2 = list(a,b,a)
df3 = cbind(df1,df2)
v = unique(df3, incomparables = FALSE)
In the end, the expected result would be df1 = list(c) and df2 = list(a). Do you have a good approach for this? Thank you a lot!
If you only have single element for each component of your list, then you can:
df1 <- list("a", "b", "c")
df2 <- list("a", "b", "a")
comp <- unlist(df1) != unlist(df2)
df1[comp]
[[1]]
[1] "c"
df2[comp]
[[1]]
[1] "a"
is that what you were looking for?
a more generic (whatever you'd have in your lists) solution using purrr would be:
comp2 <- !purrr::map2_lgl(df1, df2, identical)
df1[comp2]
[[1]]
[1] "c"
df2[comp2]
[[1]]
[1] "a"
You can try
Filter(length, Map(function(x, y) x[x != y], df1, df2))
#[[1]]
#[1] "c"
Filter(length, Map(function(x, y) x[x != y], df2, df1))
#[[1]]
#[1] "a"

Making new dataframes from old dataframes by column number

I'm trying to re-organize my dataframes by Column orders
for Example
x <- data.frame("A" = c(1,1), "B" = c(2,2), "C" = c(3,3))
y <- data.frame("A" = c(2,2), "B" = c(3,3), "C" = c(4,4))
z <- data.frame("A" = c(3,3), "B" = c(4,4), "C" = c(5,5))
Say I have dataframes as above.
What I want to do is make new dataframes grouped by the columns of the above dataframes. (Simply put, I want to put all the "A"s, "B"s and "C"s into 3 new dataframes.)
the below dataframes are my wanted results
a <- data.frame("A" = c(1,1), "A" = c(2,2), "A" = c(3,3))
b <- data.frame("B" = c(2,2), "B" = c(3,3), "B" = c(4,4))
c <- data.frame("C" = c(3,3), "C" = c(4,4), "C" = c(5,5))
We can do this with tidyverse
library(tidyverse)
list(x, y, z) %>%
transpose %>%
map(~ do.call(cbind, .x))
Or with base R
lapply(names(x), function(nm) cbind(x[, nm], y[, nm], z[, nm]))
Assuming you have equal number of columns in all the dataframes, one way is to use lapply over list of dataframes and subset them sequentially.
lst1 <- list(x, y, z)
# For each column position i, take column i from every data frame in lst1
# (single-bracket `[` keeps the column name) and bind them side by side.
lapply(seq_len(ncol(x)), function(i) cbind.data.frame(lapply(lst1, `[`, i)))
#[[1]]
# A A A
#1 1 2 3
#2 1 2 3
#[[2]]
# B B B
#1 2 3 4
#2 2 3 4
#[[3]]
# C C C
#1 3 4 5
#2 3 4 5
If your dataframes are not already sorted by names you might want to do that first.
lst1 <- lapply(list(x, y, z), function(i) i[order(names(i))])
We can also use purrr using the same logic
library(purrr)
map(seq_len(ncol(x)), ~cbind.data.frame(map(lst1, `[`, .)))

Sorting lists by purrr::transpose but some value is missing

I want to sort the lists depending on the number of "a" in each element.
library("purrr")
data1 <- c("apple","appreciate","available","account","adapt")
data2 <- c("tab","banana","cable","tatabox","aaaaaaa")
list1 <- list(data1,data2)
ca <- lapply(list1, function(x) str_count(x, "a"))
t2 <- Map(split, list1, ca)
t3 <- transpose(t2)
> t3
$`1`
$`1`[[1]]
[1] "apple" "account"
$`1`[[2]]
[1] "tab" "cable"
$`2`
$`2`[[1]]
[1] "appreciate" "adapt"
$`2`[[2]]
[1] "tatabox"
$`3`
$`3`[[1]]
[1] "available"
$`3`[[2]]
[1] "banana"
It lost the "aaaaaaa" which is in data2. How can I fix this problem?
I found a solution:
data1 <- c("apple","appreciate","available","account","adapt")
data2 <- c("tab","banana","cable","tatabox","aaaaaaa","aaaaaaaaaaa")
list1 <- list(data1,data2)
ca <- lapply(list1, function(x) str_count(x, "a"))
k11<- flatten(Map(split, list1, ca))
k1<-split(k11, as.integer(names(k11)))
Citing the words of lionel: "transpose() treats lists of lists as implicitly rectangular tables". It was not designed for edge cases like the current one. However, you can get the required output by placing the longer element at the beginning: transpose(t2[2:1]).
However, this workaround does not generalize. I prefer the following approach: combine the sublists into a single list and split it again:
> t3 <- do.call(c, t2)
> split(t3, names(t3))
$`1`
$`1`$`1`
[1] "apple" "account"
$`1`$`1`
[1] "tab" "cable"
$`2`
$`2`$`2`
[1] "appreciate" "adapt"
$`2`$`2`
[1] "tatabox"
$`3`
$`3`$`3`
[1] "available"
$`3`$`3`
[1] "banana"
$`7`
$`7`$`7`
[1] "aaaaaaa"
Edit
A function for named and unnamed input:
data1 <- c("apple","appreciate","available","account","adapt")
data2 <- c("tab","banana","cable","tatabox","aaaaaaa","aaaaaaaaaaa")
list1 <- list(data1,data2)
names(list1) <- c("atf","bdfs")
# Group the strings of a list of character vectors by how many "a" they
# contain, keeping track of which input element each string came from.
#
# x : a list of character vectors; unnamed input is given names via
#     make.names(seq_along(x)) because stack() needs a named list.
#
# Returns a list with one entry per distinct "a" count; each entry is the
# matching strings split by originating element of `x` (elements with no
# string for that count are dropped).
f <- function(x) {
  if (is.null(names(x))) {
    names(x) <- make.names(seq_along(x))
  }
  dtf <- stack(x)
  # Namespace-qualified so the function works even when stringr is not attached.
  a_count <- stringr::str_count(dtf$values, "a")
  by_count <- split(dtf, a_count)
  lapply(by_count, function(y) split(y$values, y$ind, drop = TRUE))
}
f(list1)
We can also do this using pipe with purrr functions
map(list1, str_count, "a") %>%
map2(list1, ., split) %>%
flatten %>%
split(names(.))
#$`1`
#$`1`$`1`
#[1] "apple" "account"
#$`1`$`1`
#[1] "tab" "cable"
#$`2`
#$`2`$`2`
#[1] "appreciate" "adapt"
#$`2`$`2`
#[1] "tatabox"
#$`3`
#$`3`$`3`
#[1] "available"
#$`3`$`3`
#[1] "banana"
#$`7`
#$`7`$`7`
#[1] "aaaaaaa"

Resources