cbind with partially nested list - r

I'm trying to cbind or unnest or as.data.table a partially nested list.
id <- c(1,2)
A <- c("A1","A2","A3")
B <- c("B1")
AB <- list(A=A,B=B)
ABAB <- list(AB,AB)
nested_list <- list(id=id,ABAB=ABAB)
The length of id is the same as the length of ABAB (2 in this case). I don't know how to unlist one part of this list (ABAB) and cbind the other part (id). Here's my desired result as a data.table:
data.table(id=c(1,1,1,2,2,2),A=c("A1","A2","A3","A1","A2","A3"),B=rep("B1",6))
id A B
1: 1 A1 B1
2: 1 A2 B1
3: 1 A3 B1
4: 2 A1 B1
5: 2 A2 B1
6: 2 A3 B1

I haven't tested for more general cases, but this works for the OP example:
library(data.table)
as.data.table(nested_list)[, lapply(ABAB, as.data.table)[[1]], id]
# id A B
#1: 1 A1 B1
#2: 1 A2 B1
#3: 1 A3 B1
#4: 2 A1 B1
#5: 2 A2 B1
#6: 2 A3 B1
Or another option (which is probably faster, but is more verbose):
rbindlist(lapply(nested_list$ABAB, as.data.table),
          idcol = 'id')[, id := nested_list$id[id]]
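To see what the idcol step contributes before the remapping, here is a quick illustration using the question's nested_list (just for reference, not a new approach):
rbindlist(lapply(nested_list$ABAB, as.data.table), idcol = 'id')
#    id  A  B
# 1:  1 A1 B1
# 2:  1 A2 B1
# 3:  1 A3 B1
# 4:  2 A1 B1
# 5:  2 A2 B1
# 6:  2 A3 B1
# id here is just the list position (1, 2); id := nested_list$id[id] then replaces it
# with the corresponding values of nested_list$id, which in this example are 1 and 2 as well.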

This is some super ugly base R, but produces the desired output.
Reduce(rbind, Map(function(x, y) setNames(data.frame(x, y), c("id", "A", "B")),
                  as.list(nested_list[[1]]),
                  lapply(unlist(nested_list[-1], recursive=FALSE),
                         function(x) Reduce(cbind, x))))
id A B
1 1 A1 B1
2 1 A2 B1
3 1 A3 B1
4 2 A1 B1
5 2 A2 B1
6 2 A3 B1
lapply takes the list of two elements (each containing the A and B variables) extracted with unlist and recursive=FALSE, and returns a list of character matrices in which the B variable is filled in by recycling. The list of individual id values from as.list(nested_list[[1]]) and the list of matrices are fed to Map, which converts corresponding pairs to a data.frame, gives the columns the desired names, and returns a list of data.frames. Finally, this list of data.frames is fed to Reduce, which rbinds the results into a single data.frame.
The final Reduce(rbind, ...) could be replaced by data.table's rbindlist if desired.
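For instance, here is a sketch of that substitution: the same Map() pipeline, but fed to rbindlist() instead of Reduce(rbind, ...):
library(data.table)
rbindlist(Map(function(x, y) setNames(data.frame(x, y), c("id", "A", "B")),
              as.list(nested_list[[1]]),
              lapply(unlist(nested_list[-1], recursive = FALSE),
                     function(x) Reduce(cbind, x))))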

Here's another hideous solution:
max_length = max(unlist(lapply(nested_list, function(x) lapply(x, lengths))))
data.frame(id = do.call(c, lapply(nested_list$id, rep, max_length)),
           do.call(rbind, lapply(nested_list$ABAB, function(x)
             do.call(cbind, lapply(x, function(y) {
               if (length(y) < max_length) {
                 rep(y, max_length)
               } else {
                 y
               }
             })))))
# id A B
#1 1 A1 B1
#2 1 A2 B1
#3 1 A3 B1
#4 2 A1 B1
#5 2 A2 B1
#6 2 A3 B1

And one more, also inelegant, but I'd gone too far by the time I saw the other answers.
restructure <- function(nested_l) {
  ids <- as.numeric(max(unlist(lapply(unlist(nested_l, recursive = FALSE), function(x) {
    lapply(x, length)
  }))))
  temp = data.frame(rep(nested_l$id, each = ids),
                    sapply(1:length(nested_l$id), function(x) {
                      out <- unlist(lapply(nested_l[[2]], function(y) {
                        return(y[x])
                      }))
                    }))
  names(temp) <- c("id", unique(substring(unlist(nested_l[2]), first = 1, last = 1)))
  return(temp)
}
> restructure(nested_list)
id A B
1 1 A1 B1
2 1 A2 B1
3 1 A3 B1
4 2 A1 B1
5 2 A2 B1
6 2 A3 B1

Joining the party:
library(tidyverse)
temp <- map(nested_list,~map(.x,~expand.grid(.x)))
df <- map_df(1:2,~cbind(temp$id[[.x]],temp$ABAB[[.x]]))
Var1 A B
1 1 A1 B1
2 1 A2 B1
3 1 A3 B1
4 2 A1 B1
5 2 A2 B1
6 2 A3 B1
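If the first column should be called id to match the desired output, it can be renamed afterwards (dplyr's rename comes with the tidyverse):
df <- rename(df, id = Var1)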

Related

Dependents and Precedents in R

I need help flagging the number of dependents and precedents in R. My data frame contains some formulas (strings), and I want to add "col3", which should contain: 0 for A1, 1 for A2 (because A2 depends on A1 - one dependency) and 2 for A3 (because A3 depends on A2/A1).
col1 <- c('A1','A2','A3', 'A6','A4','A7')
col2 <- c('X1+Y1','A1+Y2', 'A4+Y3+A2', 'Y5+A1','A2+A1+A3','A2+A1')
df <- data.frame(col1, col2, stringsAsFactors=F)
My output should look like this:
col1 col2 col3
1 A1 Y1 0
2 A2 A1+Y2 1
3 A3 A4+Y3+A2 5
4 A6 Y5+A1 1
5 A4 A2+A1+Y3 3
6 A7 A2+A1 3
I have a data frame with 100+ rows in this format. I'd appreciate any help with this.
The code below produces the correct output.
col0 <- c('A1','A2','A3', 'A6','A4','A7')
col2 <- c('X1+Y1','A1+Y2', 'A1+Y3+A2', 'Y5+A2','A2+A1+A3','A2+A3')
df <- data.frame(col0, col2, stringsAsFactors=F)
library(tidyr)
library(dplyr)
df1 <- df %>%
  separate(col2, into = as.character(c(1:4)), sep = "\\+") %>%
  replace(is.na(.), "")
df1$OOE <- 0
for (i in 1:nrow(df1)) {
  for (j in 2:ncol(df1)) {
    for (k in 1:nrow(df1)) {
      if (df1[i, j] == df1$col0[k]) df1$OOE[i] = df1$OOE[k] + df1$OOE[i] + 1
    }
  }
}
col0 1 2 3 4 OOE
1 A1 X1 Y1 0
2 A2 A1 Y2 1
3 A3 A1 Y3 A2 3
4 A6 Y5 A2 2
5 A4 A2 A1 A3 7
6 A7 A2 A3 6
If AX can have a dependency on AY where Y>X, we need a tree-like structure to find the dependencies. I knew about the igraph package, but it seems too complex for this task. We just need some reference semantics, and after some research the data.tree package seems appropriate. Here is the code:
col1 <- c('A1','A2','A3', 'A6','A4','A7')
col2 <- c('X1+Y1','A1+Y2', 'A1+Y3+A2', 'Y5+A2','A2+A1+A3','A2+A3')
df <- data.frame(col1, col2, stringsAsFactors=F)
require(data.tree)
# Create the graph/forest based on the data
getForest <- function(data) {
  res <- new.env()
  for (i in 1:nrow(data)) {
    nname <- data$col1[i]
    if (!exists(nname, where=res))
      assign(nname, Node$new(nname), pos=res)
    par <- get(nname, envir=res)
    # Add the children
    deps <- unlist(regmatches(data$col2[i], gregexpr("A\\d+", data$col2[i])))
    for (ch in deps) {
      if (!exists(ch, where=res))
        assign(ch, Node$new(ch), pos=res)
      child <- get(ch, envir=res)
      par$AddChildNode(child)
    }
  }
  # Return the nodes
  res
}
f <- getForest(df)
# Function to get the dependency level
getLevel <- function(node) {
  if (node$count == 0) {
    return(0)
  } else {
    return(length(node$children) + sum(sapply(node$children, getLevel)))
  }
}
#Add dependency level to data frame
df$col3 <- sapply(df$col1, function(x) {getLevel(get(x,f))})
df
# col1 col2 col3
#1 A1 X1+Y1 0
#2 A2 A1+Y2 1
#3 A3 A1+Y3+A2 3
#4 A6 Y5+A2 2
#5 A4 A2+A1+A3 7
#6 A7 A2+A3 6

Comparing two columns of two dataframes (logical operators)

I would like to compare two columns simultaneously. My data looks like this:
a <- data.frame("a1" = c(1,1,1,3,4), "a2" = c(2,1,2,1,2))
b <- data.frame("b1" = c(1,1,3,1,3), "b2" = c(2,2,1,2,1))
cbind(a, b)
# a1 a2 b1 b2
# 1 1 2 1 2
# 2 1 1 1 2
# 3 1 2 3 1
# 4 3 1 1 2
# 5 4 2 3 1
I would like to identify all rows of a where a1 is not in b1, or where a1 is in b1 but the corresponding a2 is not among the b2 values for that a1. So the second question is: when a1 is in b1, is the a2 of that row also among the b2 values of the rows of b where b1 equals that a1?
Example for line 2: I check whether a1 = 1 appears anywhere in b1 = c(1,1,3,1,3). It does, so I want to check whether a2 = 1 in line 2 (where a1 = 1) appears anywhere in b2 where b1 = a1 = 1, so here b2 = c(2, 2, 2). For line 2, a2 = 1 is not in b2 = c(2, 2, 2), so the result should show me this line.
The first question is easy to answer with the following code:
a[which(!(a$a1 %in% b$b1)), ]
# a1 a2
# 5 4 2
But I can't solve the second problem. Maybe I am using the logical operators in the wrong way. My result should look like this:
a1 a2
2 1 1
4 4 2
Following the explanation in your edit, you want the rows of a where either a1 does not appear in b1 at all, or a1 does appear in b1 but the row's a2 is not among the b2 values of the rows of b for which b1 equals that a1.
In R, you can write this like that:
cond <- sapply(seq(nrow(a)),  # check each row, one by one
               function(i) {
                 !(a$a1[i] %in% b$b1) |                  # a1 of this row is not in b1, or
                   !(a$a2[i] %in% b$b2[b$b1 == a$a1[i]]) # a2 of this row is not among the b2 values for which b1 equals this row's a1
               })
a[cond, ]
# a1 a2
#2 1 1
#5 4 2
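For comparison, the same condition can be written with mapply() instead of indexing by row number (an equivalent sketch, not part of the original answer):
cond <- !mapply(function(x1, x2) x1 %in% b$b1 && x2 %in% b$b2[b$b1 == x1],
                a$a1, a$a2)
a[cond, ]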
Obviously not a nice solution, but it works with my data (unequal numbers of rows in the two datasets, values not in the same positions) - shown here with new example data, because I chose the first example badly.
a <- data.frame("a1" = c(1,1,1,3,4), "a2" = c(2,1,2,1,2))
b <- data.frame("b1" = c(1,3,1,1), "b2" = c(2,1,2,2))
test <- function(data1, data2) {
  for (i in unique(data1[data1$a1 %in% data2$b1, "a1"])) {
    temp_data1 <- data1[data1$a1 == i, c("a1", "a2")]
    temp_data2 <- data2[data2$b1 == i, c("b1", "b2")]
    for (j in unique(temp_data1$a2)) {
      test <- j %in% unique(temp_data2$b2)
      if (test == FALSE) {
        print(unique(temp_data1[temp_data1$a1 == i & temp_data1$a2 == j, ]))
      }
    }
  }
  for (k in unique(data1[which(!(data1$a1 %in% data2$b1)), "a1"])) {
    print(unique(data1[data1$a1 == k, c("a1", "a2")]))
  }
}
test(a, b)
a1 a2
2 1 1
a1 a2
5 4 2
Based on your answer I improved the function test(). This version returns a dataframe:
a <- data.frame(a1=c(1,1,1,3,4), a2=c(2,1,2,1,2))
b <- data.frame(b1=c(1,1,3,1,3), b2=c(2,2,1,2,1))
test <- function(a, b) {
  R <- subset(a, !a1 %in% b$b1)
  I <- unique(a$a1[a$a1 %in% b$b1])
  for (i in I) {
    ai <- subset(a, a1 == i)
    bi <- subset(b, b1 == i)
    J <- unique(bi$b2)
    for (j in unique(ai$a2)) if (!j %in% J) R <- rbind(subset(ai, a2 == j), R)
  }
  R
}
test(a, b)
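For the example data above, this should return the two flagged rows collected in a single data frame, along the lines of:
#   a1 a2
# 2  1  1
# 5  4  2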

Recursively building a list of dataframes (in R) - is a loop the only option?

I have a dataframe with n columns and I need to obtain combinations of its variables:
E.g.:
df <- data.frame(A = c("a1","a2","a3","a4","a5","a6"),
                 B = c("a1","a1","a3","a3","a5","a5"),
                 C = c("a1","a1","a1","a3","a4","a4"),
                 D = c("a1","a1","a1","a3","a4","a5"))
I need to make a list with n-1 elements, each containing the unique combinations of the dataframe variables. The first element contains the unique rows over all columns, from the first to the last. For each subsequent element I drop the first column of the previously appended dataframe. Like this:
myList <- list(unique(df[, 1:ncol(df)]),
               unique(df[, 2:ncol(df)]),
               unique(df[, 3:ncol(df)]))
I managed to solve this with a for loop:
myList <- list()
for (i in 1:(ncol(df) - 1)) {
  myList[[i]] <- unique(df[, i:ncol(df)])
}
but I was left wondering whether there was a faster and more elegant way to do this.
With sapply():
sapply(1:(ncol(df) - 1),
       FUN = function(x, nc, df) unique(df[, x:nc]), nc = ncol(df), df = df)
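sapply() only stays a list here because the pieces have different shapes; to rule out simplification entirely, lapply() (or sapply(..., simplify = FALSE)) gives the same result:
lapply(1:(ncol(df) - 1), function(x) unique(df[, x:ncol(df)]))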
An elegant solution would be recursion:
func <- function(df, n, lst) {
  if (ncol(df) == n) return(lst)
  func(df, n + 1, c(lst, list(unique(df[n:ncol(df)]))))
}
#> func(df,1, list())
#[[1]]
# A B C D
#1 a1 a1 a1 a1
#2 a2 a1 a1 a1
#3 a3 a3 a1 a1
#4 a4 a3 a3 a3
#5 a5 a5 a4 a4
#6 a6 a5 a4 a5
#[[2]]
# B C D
#1 a1 a1 a1
#3 a3 a1 a1
#4 a3 a3 a3
#5 a5 a4 a4
#6 a5 a4 a5
#[[3]]
# C D
#1 a1 a1
#4 a3 a3
#5 a4 a4
#6 a4 a5

expand.grid with separate variable for each column

I would like to achieve the following data.frame in R:
i1 i2 i3
1 A1 A2 A3
2 No A2 A3
3 A1 No A3
4 No No A3
5 A1 A2 No
6 No A2 No
7 A1 No No
8 No No No
In each column the value can either be the string "A" concatenated with the column number, or "No". The data.frame should contain all possible combinations.
My idea was to use expand.grid, but I don't know how to create the list dynamically. Or is there a better approach?
expand.grid(list(c("A1", "No"), c("A2", "No"), c("A3", "No")))
I guess you could create your own helper function, something like this:
MyList <- function(n) expand.grid(lapply(paste0("A", seq_len(n)), c, "No"))
Then simply pass it the number of elements (e.g., 3)
MyList(3)
# Var1 Var2 Var3
# 1 A1 A2 A3
# 2 No A2 A3
# 3 A1 No A3
# 4 No No A3
# 5 A1 A2 No
# 6 No A2 No
# 7 A1 No No
# 8 No No No
Alternatively, you could also try data.table's CJ equivalent, which should be much more efficient than expand.grid for a big n:
library(data.table)
DTCJ <- function(n) do.call(CJ, lapply(paste0("A", seq_len(n)), c, "No"))
DTCJ(3) # will return a sorted cross join
# V1 V2 V3
# 1: A1 A2 A3
# 2: A1 A2 No
# 3: A1 No A3
# 4: A1 No No
# 5: No A2 A3
# 6: No A2 No
# 7: No No A3
# 8: No No No
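CJ() sorts the result by default; if the unsorted cross join is preferred, it also accepts sorted = FALSE. A small variation on the helper above (the helper name is just illustrative; the columns are still named V1, V2, ...):
DTCJ_unsorted <- function(n)
  do.call(CJ, c(lapply(paste0("A", seq_len(n)), c, "No"), sorted = FALSE))
DTCJ_unsorted(3)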
Another option is using Map with expand.grid
n <- 3
expand.grid(Map(c, paste0('A', seq_len(n)), 'No'))
Or
expand.grid(as.data.frame(rbind(paste0('A', seq_len(n)), 'No')))
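To see what that builds: for n = 3 the rbind() call produces a 2-by-3 character matrix whose columns, after as.data.frame(), each hold one "Ai" together with "No":
rbind(paste0('A', seq_len(3)), 'No')
#      [,1] [,2] [,3]
# [1,] "A1" "A2" "A3"
# [2,] "No" "No" "No"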
Another option, only using the most fundamental functions in R, is to use the indices:
df <- data.frame(V1 = c('A','A','A','A', rep('No',4)),
                 V2 = c('A','A','No','No','A','A','No','No'),
                 V3 = c('A','No','A','No','A','No','A','No'),
                 stringsAsFactors = FALSE)
to get the row and col indices of the elements we need to change:
rindex <- which(df != 'No') %% nrow(df)
cindex <- ceiling(which(df != 'No')/nrow(df))
the solution is basically a one-liner:
df[matrix(c(rindex,cindex),ncol=2)] <- paste0(df[matrix(c(rindex,cindex),ncol=2)],cindex)
> df
V1 V2 V3
1 A1 A2 A3
2 A1 A2 No
3 A1 No A3
4 A1 No No
5 No A2 A3
6 No A2 No
7 No No A3
8 No No No
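As a side note, arrayInd() computes both indices in one step and also covers the case where a non-"No" entry sits in the last row, where the %% trick would give a row index of 0. A sketch on a fresh copy of the data (df2 is just an illustrative name, since df was modified above):
df2 <- data.frame(V1 = c('A','A','A','A', rep('No',4)),
                  V2 = c('A','A','No','No','A','A','No','No'),
                  V3 = c('A','No','A','No','A','No','A','No'),
                  stringsAsFactors = FALSE)
idx <- arrayInd(which(df2 != 'No'), dim(df2))   # row/col indices of the entries to change
df2[idx] <- paste0(df2[idx], idx[, 2])          # append the column number to each "A"
df2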

R keep only last submission?

My data looks something like this:
DF<- data.frame( id=c("A1","A2","A3","A1"), submission=c(1,1,1,2))
What is the best way of keeping only the last submission for each id? That is:
DF<- data.frame( id=c("A2","A3","A1"), submission=c(1,1,2))
Thanks!
Here are a few options in base R:
DF[!duplicated(DF$id, fromLast=TRUE),]
# id submission
# 2 A2 1
# 3 A3 1
# 4 A1 2
do.call(rbind, by(DF, DF$id, FUN=tail, 1))
# id submission
# A1 A1 2
# A2 A2 1
# A3 A3 1
aggregate(submission ~ id, DF, tail, 1)
# id submission
# 1 A1 2
# 2 A2 1
# 3 A3 1
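Since data.table appears elsewhere on this page, a grouped .SD[.N] version is another option (a sketch, not part of the original answers); like the by()/tail() approach it keeps the last row per id:
library(data.table)
as.data.table(DF)[, .SD[.N], by = id]
#    id submission
# 1: A1          2
# 2: A2          1
# 3: A3          1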
