Merge a data.table and a list

I want to add a list to my data.table. Let's consider this data.table:
dt = data.table(id = 1:3)
lst <- list()
lst[[2]] <- cbind(a=10:12, b=5:7)
dt[-nrow(dt), lst:=lst]
dt
# id lst
#1: 1
#2: 2 10,11,12, 5, 6, 7
#3: 3
Is it possible to "unlist" the lst, so that the data.table will look like this?
id a b
1.0: 1
2.0: 2
2.1: 2 10 5
2.2: 2 11 6
2.3: 2 12 7
3.0: 3
There is also a speed issue, as the data I am working with consists of billions of rows.

An option, as mentioned in the comments:
rbindlist(list(dt,
               rbindlist(lapply(lst, as.data.table), idcol = 'id')),
          use.names = TRUE, fill = TRUE)[order(id)]
output:
id a b
1: 1 NA NA
2: 2 NA NA
3: 2 10 5
4: 2 11 6
5: 2 12 7
6: 3 NA NA

You can run lapply across the list and bind the rows of each item to an empty row whenever that item is non-empty:
dt <- data.table(id = 1:3)
lst <- list()
lst[[2]] <- cbind(a=10:12, b=5:7)
create_table <- function(x, lst) {
  if (!is.null(lst[[x]])) {
    # Empty row plus items in list
    rbindlist(
      list(data.table(id = x), data.table(id = x, lst[[x]])),
      use.names = TRUE, fill = TRUE
    )
  } else {
    data.table(id = x)
  }
}
aux_lst <- rbindlist(
  lapply(seq(lst), create_table, lst = lst),
  use.names = TRUE, fill = TRUE
)
aux_lst[dt, on = .(id)] # Keeps all IDs in dt
If the list is named and the id column relates to those names, replace seq with names, as in the sketch below.
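A hedged sketch of that named-list variant (the name "two" is made up for illustration; note the id column then becomes character, so dt's id would need to match):
lst_named <- list()
lst_named[["two"]] <- cbind(a = 10:12, b = 5:7)
aux_lst <- rbindlist(
  lapply(names(lst_named), create_table, lst = lst_named),
  use.names = TRUE, fill = TRUE
)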

Some reformatting is needed, but you can use rbindlist:
# create all entries in lst
length(lst) <- nrow(dt)
# identify table sizes
lens = sapply(lst, NROW)
# use data.tables instead of matrices
# fill empty tables with a blank template
template = data.table(a=NA_real_, b=NA_real_)
dtlist = replace(lapply(lst, as.data.table), lens == 0, list(template))
# expand dt to match tables
replens = pmax(lens, 1L)
cbind(dt[rep(1:.N, replens)], rbindlist(dtlist))
id a b
1: 1 NA NA
2: 2 10 5
3: 2 11 6
4: 2 12 7
5: 3 NA NA

library(data.table)
dt = data.table(id = 1:3)
lst <- list()
lst[[2]] <- cbind(a=10:12, b=5:7)
unique(rbindlist(lapply(1:length(lst), function(i) {
  data.table(id = i, lst[[i]])[dt, on = .(id)]
}), fill = TRUE))[order(id)]
id a b
1: 1 NA NA
2: 2 NA NA
3: 2 10 5
4: 2 11 6
5: 2 12 7
6: 3 NA NA

Related

Post-processing of full_join output to remove multiplicity

I have two data frames (df1, df2) and performed a full_join using the common column of interest, col1.
df1 <- data.frame(col1=c('A','D','C','C','E','E','I'),col2=c(4,7,8,3,2,4,9))
df2 <- data.frame(col1=c('A','A','B','C','C','E','E','I'),col2=c(4,1,6,8,3,2,1,9))
df1 %>% full_join(df2, by = "col1")
# col1 col2.x col2.y
# 1 A 4 4
# 2 A 4 1
# 3 D 7 NA
# 4 C 8 8
# 5 C 8 3
# 6 C 3 8
# 7 C 3 3
# 8 E 2 2
# 9 E 2 1
# 10 E 4 2
# 11 E 4 1
# 12 I 9 9
# 13 B NA 6
As expected, the full_join produces every pairwise combination for repeated values of the joining column, and I wish to avoid that. I wish to arrive at the following output. What kind of post-processing approaches do you suggest?
# col1 col2.x col2.y
# 1 A 4 4
# 2 A NA 1
# 3 D 7 NA
# 4 C 8 8
# 5 C 3 3
# 6 E 2 2
# 7 E 4 1
# 8 I 9 9
# 9 B NA 6
More information:
Case 1: When a value appears twice in both inputs, I do not need four rows in the output:
# 4 C 8 8
# 5 C 8 3
# 6 C 3 8
# 7 C 3 3
instead, I want only two as:
# 4 C 8 8
# 5 C 3 3
Case 2: Similarly, I want the rows paired up positionally even when the values differ:
# 8 E 2 2
# 9 E 2 1
# 10 E 4 2
# 11 E 4 1
instead, I want only two rows as below:
# 8 E 2 2
# 9 E 4 1
A possible solution in two steps using the data.table package:
0) load the package & convert to data.tables
library(data.table)
setDT(df1)
setDT(df2)
1) define helper function
unlistSD <- function(x) {
  l <- length(x)
  ls <- sapply(x, lengths)
  m <- max(ls)
  newSD <- vector(mode = "list", length = l)
  for (i in 1:l) {
    u <- unlist(x[[i]])
    lu <- length(u)
    if (lu < m) {
      u <- c(u, rep(NA_real_, m - lu))
    }
    newSD[[i]] <- u
  }
  return(setNames(as.list(newSD), names(x)))
}
2) merge and apply helper function
merge(df1[, .(col2 = list(col2)), by = col1],
      df2[, .(col2 = list(col2)), by = col1],
      by = "col1", all = TRUE
)[, unlistSD(.SD), by = col1]
which gives the following result:
col1 col2.x col2.y
1: A 4 4
2: A NA 1
3: C 8 8
4: C 3 3
5: D 7 NA
6: E 2 2
7: E 4 1
8: I 9 9
9: B NA 6
Another possibility with base R:
unlistDF <- function(d, groupcols) {
  ds <- split(d[, setdiff(names(d), groupcols)], d[, groupcols])
  ls <- lapply(ds, function(x) max(sapply(x, lengths)))
  dl <- lapply(ds, function(x) lapply(as.list(x), unlist))
  du <- Map(function(x, y) {
    lapply(x, function(i) {
      if (length(i) < y) {
        c(i, rep(NA_real_, y - length(i)))
      } else i
    })
  }, x = dl, y = ls)
  ld <- lapply(du, as.data.frame)
  cbind(d[rep(1:nrow(d), ls), groupcols, drop = FALSE],
        do.call(rbind.data.frame, c(ld, make.row.names = FALSE)),
        row.names = NULL)
}
Now you can use this function as follows in combination with merge:
df <- merge(aggregate(col2 ~ col1, df1, as.list),
            aggregate(col2 ~ col1, df2, as.list),
            by = "col1", all = TRUE)
unlistDF(df, "col1")

Issue with local variables in R custom function

I've got a dataset
>view(interval)
# V1 V2 V3 ID
# 1 NA 1 2 1
# 2 2 2 3 2
# 3 3 NA 1 3
# 4 4 2 2 4
# 5 NA 5 1 5
>dput(interval)
structure(list(V1 = c(NA, 2, 3, 4, NA),
V2 = c(1, 2, NA, 2, 5),
V3 = c(2, 3, 1, 2, 1), ID = 1:5), row.names = c(NA, -5L), class = "data.frame")
I would like to extract the previous non-NA value (or the next one, if the NA is in the first row) for every row, and store it as a local variable in a custom function, because I have to perform other operations on every row based on this value (which should change for every row I apply the function to).
I've written this function to print the local variables, but when I apply it the output is not what I want:
myFunction <- function(x) {
  position <- as.data.frame(which(is.na(interval), arr.ind = TRUE))
  tempVar <- ifelse(interval$ID == 1,
                    interval[position$row + 1, position$col],
                    interval[position$row - 1, position$col])
  return(tempVar)
}
I was expecting to get something like this
# [1] 2
# [2] 2
# [3] 4
But I get something pretty messed up instead.
Here's attempt number 1:
dat <- read.table(header=TRUE, text='
V1 V2 V3 ID
NA 1 2 1
2 2 3 2
3 NA 1 3
4 2 2 4
NA 5 1 5')
myfunc1 <- function(x) {
  ind <- which(is.na(x), arr.ind = TRUE)
  # since it appears you want them in row-first sorted order
  ind <- ind[order(ind[,1], ind[,2]),]
  # catch first-row NA
  ind[,1] <- ifelse(ind[,1] == 1L, 2L, ind[,1] - 1L)
  x[ind]
}
myfunc1(dat)
# [1] 2 2 4
The problem with this is when there is a second "stacked" NA:
dat2 <- dat
dat2[2,1] <- NA
dat2
# V1 V2 V3 ID
# 1 NA 1 2 1
# 2 NA 2 3 2
# 3 3 NA 1 3
# 4 4 2 2 4
# 5 NA 5 1 5
myfunc1(dat2)
# [1] NA NA 2 4
One fix/safeguard against this is to use zoo::na.locf, which takes the "last observation carried forward". Since the top row is a special case, we do it twice, the second time in reverse. This gives us the "next" non-NA value in the column (up or down, depending).
library(zoo)
myfunc2 <- function(x) {
  ind <- which(is.na(x), arr.ind = TRUE)
  # since it appears you want them in row-first sorted order
  ind <- ind[order(ind[,1], ind[,2]),]
  # this is to guard against stacked NA
  x <- apply(x, 2, zoo::na.locf, na.rm = FALSE)
  # this special case is when there are one or more NAs at the top of a column
  x <- apply(x, 2, zoo::na.locf, fromLast = TRUE, na.rm = FALSE)
  x[ind]
}
myfunc2(dat2)
# [1] 3 3 2 4

R: Combine multiple columns as pairs of column cells in same row

I'd like to combine/pair multiple columns in a data frame as pairs of column cells in the same row. As an example, df1 should be transformed to df2.
df1
col1 col2 col3
1 2 3
0 0 1
df2
c1 c2
1 2
1 3
2 3
0 0
0 1
0 1
The solution should be scalable for df1s with (way) more than three columns.
I thought about melt/reshape/dcast but found no solution yet. There are no NAs in the data frame. Thank you!
EDIT: Reshape just produced errors, so I thought about
comb1 <- combn(df1[1,], 2)
comb2 <- t(comb1)
and looping and appending through all rows. This is inefficient, considering 2 million rows.
Here's the approach I would take.
Create a function that uses rbindlist from "data.table" and combn from base R. The function looks like this:
lengthener <- function(indf) {
  temp <- rbindlist(
    combn(names(indf), 2, FUN = function(x) indf[x], simplify = FALSE),
    use.names = FALSE, idcol = TRUE)
  setorder(temp[, .id := sequence(.N), by = .id], .id)[, .id := NULL][]
}
Here's the sample data from the other answer, and the application of the function on it:
df1 = as.data.frame(matrix(c(1,2,3,4,0,0,1,1), byrow = TRUE, nrow = 2))
lengthener(df1)
# V1 V2
# 1: 1 2
# 2: 1 3
# 3: 1 4
# 4: 2 3
# 5: 2 4
# 6: 3 4
# 7: 0 0
# 8: 0 1
# 9: 0 1
# 10: 0 1
# 11: 0 1
# 12: 1 1
Test it out on some larger data too:
set.seed(1)
M <- as.data.frame(matrix(sample(100, 100*100, TRUE), 100))
system.time(out <- lengthener(M))
# user system elapsed
# 0.19 0.00 0.19
out
# V1 V2
# 1: 27 66
# 2: 27 27
# 3: 27 68
# 4: 27 66
# 5: 27 56
# ---
# 494996: 33 13
# 494997: 33 66
# 494998: 80 13
# 494999: 80 66
# 495000: 13 66
System time for the other approach:
funAMK <- function(indf) {
  nrow_combn = nrow(t(combn(indf[1,], m = 2)))
  nrow_df = nrow(indf) * nrow_combn
  df2 = data.frame(V1 = rep(0, nrow_df), V2 = rep(0, nrow_df))
  for (i in 1:nrow(indf)) {
    df2[(((i-1)*nrow_combn)+1):(i*nrow_combn), ] = data.frame(t(combn(indf[i,], m = 2)))
  }
  df2
}
> system.time(funAMK(M))
user system elapsed
16.03 0.16 16.37
Your edit is very similar to my answer below; you just need to rbind the result at each iteration over the rows of df1. Using data.table is a good way to speed up rbind, see this answer for more.
EDIT: Unfortunately, when I switched to the data.table approach, it turned out that rbindlist() led to a wrong answer (as pointed out in the comment below). Therefore, although it may be slightly slower, I think that preallocating a data frame and using rbind may be the best option.
EDIT2: switched the preallocated df to a more general number of rows.
df1 = as.data.frame(matrix(c(1,2,3,4,0,0,1,1), byrow = TRUE, nrow = 2))
nrow_combn = nrow(t(combn(df1[1,], m = 2)))
nrow_df = nrow(df1) * nrow_combn
df2 = data.frame(V1 = rep(0, nrow_df), V2 = rep(0, nrow_df))
for (i in 1:nrow(df1)) {
  df2[(((i-1)*nrow_combn)+1):(i*nrow_combn), ] = data.frame(t(combn(df1[i,], m = 2)))
}

List of lists in R into a data.frame - inconsistent variable names

I have a list of lists and I want to convert it into a data frame. The challenge is that some variable names are missing from the inner lists (not NAs; the variable is missing completely).
To illustrate with an example: from
my_list <- list()
my_list[[1]] <- list(a = 1, b = 2, c = 3)
my_list[[2]] <- list(a = 4, c = 6)
I would like to get
a b c
[1,] 1 2 3
[2,] 4 NA 6
Another option is
library(reshape2)
as.data.frame(acast(melt(my_list), L1~L2, value.var='value'))
# a b c
#1 1 2 3
#2 4 NA 6
Or as #David Arenburg suggested a wrapper for melt/dcast would be recast
recast(my_list, L1 ~ L2, value.var = 'value')[, -1] # [, -1] drops the L1 id column
# a b c
#1 1 2 3
#2 4 NA 6
You can use the bind_rows function from the dplyr package :
my_list <- list()
my_list[[1]] <- list(a = 1, b = 2, c = 3)
my_list[[2]] <- list(a = 4, c = 6)
dplyr::bind_rows(lapply(my_list, as.data.frame))
This outputs:
Source: local data frame [2 x 3]
a b c
1 1 2 3
2 4 NA 6
Another answer; this one requires converting the arguments to data.frames first:
library(plyr)
lista <- list(a=1, b=2, c =3)
listb <- list(a=4, c=6)
lista <- as.data.frame(lista)
listb <- as.data.frame(listb)
my_list <- list(lista, listb)
my_list <- do.call(rbind.fill, my_list)
my_list
a b c
1 1 2 3
2 4 NA 6
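A hedged aside, assuming a reasonably current version of data.table: rbindlist() can also do this directly on the original list of lists from the question, matching columns by name and filling the gaps with NA:
library(data.table)
my_list <- list(list(a = 1, b = 2, c = 3), list(a = 4, c = 6))
rbindlist(my_list, fill = TRUE)
#    a  b c
# 1: 1  2 3
# 2: 4 NA 6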

Split different lengths values and bind to columns

I've got a rather large (around 100k observations) data set, similar to this:
data <- data.frame(
  ID = seq(1, 5, 1),
  Values = c("1,2,3", "4", " ", "4,1,6,5,1,1,6", "0,0"),
  stringsAsFactors = FALSE)
data
ID Values
1 1 1,2,3
2 2 4
3 3
4 4 4,1,6,5,1,1,6
5 5 0,0
I want to split the Values column by "," with NA for the missing cells:
ID v1 v2 v3 v4 v5 v6 v7
1 1 2 3 NA NA NA NA
2 4 NA NA NA NA NA NA
3 NA NA NA NA NA NA NA
4 4 1 6 5 1 1 6
5 0 0 NA NA NA NA NA
...
Best attempt was strsplit + rbind:
df <- data.frame(do.call(
  "rbind",
  strsplit(as.character(data$Values), split = ",", fixed = FALSE)
))
But the rbind function just recycles all 'short' rows instead of setting NA, as the small demo below shows.
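A minimal base R illustration of that recycling (with made-up values):
do.call("rbind", strsplit(c("1,2,3", "4"), ","))
#      [,1] [,2] [,3]
# [1,] "1"  "2"  "3"
# [2,] "4"  "4"  "4"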
I have found a similar problem.
Many thanks, Leo
I would suggest looking at my cSplit function or approaching the problem manually.
The cSplit approach would simply be:
cSplit(data, "Values", ",")
# ID Values_1 Values_2 Values_3 Values_4 Values_5 Values_6 Values_7
# 1: 1 1 2 3 NA NA NA NA
# 2: 2 4 NA NA NA NA NA NA
# 3: 3 NA NA NA NA NA NA NA
# 4: 4 4 1 6 5 1 1 6
# 5: 5 0 0 NA NA NA NA NA
Approaching the problem manually would look like:
## Split up the values
Split <- strsplit(data$Values, ",", fixed = TRUE)
## How long is each list element?
Ncol <- vapply(Split, length, 1L)
## Create an empty character matrix to store the results
M <- matrix(NA_character_, nrow = nrow(data),
            ncol = max(Ncol),
            dimnames = list(NULL, paste0("V", sequence(max(Ncol)))))
## Use matrix indexing to figure out where to put the results
M[cbind(rep(1:nrow(data), Ncol),
        sequence(Ncol))] <- unlist(Split, use.names = FALSE)
## Bind the values back together, here as a "data.table" (faster)
data.table(ID = data$ID, M)
That's pretty much what goes on in cSplit, but the function has a few other options and some basic error checking that might make it a little slower than a purely manual approach (or a function written to address your specific problem).
Both of these approaches would be faster than a "data.table" + "reshape2" approach. Also, since each row is treated individually, you shouldn't have any problems even if you have duplicated ID values--your output should have the same number of rows as your input.
Benchmarks
I've done benchmarks on more rows and on data that would give "wider" results (since that's implied in your comments to David's answer).
Here is the sample data:
set.seed(1)
a <- sample(0:100, 100000, TRUE)
Values <- vapply(a, function(x)
  paste(sample(0:100, x, TRUE), collapse = ","), character(1L))
Values[sample(length(Values), length(Values) * .15)] <- ""
ID <- c(1:80000, 1:20000)
data <- data.frame(ID, Values, stringsAsFactors = FALSE)
DT <- as.data.table(data)
Here are the functions to test:
fun1a <- function(inDT) {
  data2 <- inDT[, list(Values = unlist(
    strsplit(Values, ","))), by = ID]
  data2[, Var := paste0("v", seq_len(.N)), by = ID]
  dcast.data.table(data2, ID ~ Var,
                   fill = NA_character_,
                   value.var = "Values")
}
fun1b <- function(inDT) {
  data2 <- inDT[, list(Values = unlist(
    strsplit(Values, ",", fixed = TRUE),
    use.names = FALSE)), by = ID]
  data2[, Var := paste0("v", seq_len(.N)), by = ID]
  dcast.data.table(data2, ID ~ Var,
                   fill = NA_character_,
                   value.var = "Values")
}
fun2 <- function(inDT) {
  cSplit(inDT, "Values", ",")
}
fun3 <- function(inDF) {
  Split <- strsplit(inDF$Values, ",", fixed = TRUE)
  Ncol <- vapply(Split, length, 1L)
  M <- matrix(NA_character_, nrow = nrow(inDF),
              ncol = max(Ncol),
              dimnames = list(NULL, paste0("V", sequence(max(Ncol)))))
  M[cbind(rep(1:nrow(inDF), Ncol),
          sequence(Ncol))] <- unlist(Split, use.names = FALSE)
  data.table(ID = inDF$ID, M)
}
Here are the results:
library(microbenchmark)
microbenchmark(fun2(DT), fun3(data), times = 20)
# Unit: seconds
# expr min lq median uq max neval
# fun2(DT) 4.810942 5.173103 5.498279 5.622279 6.003339 20
# fun3(data) 3.847228 3.929311 4.058728 4.160082 4.664568 20
## Didn't want to microbenchmark here...
system.time(fun1a(DT))
# user system elapsed
# 16.92 0.50 17.59
system.time(fun1b(DT)) # fixed = TRUE & use.names = FALSE
# user system elapsed
# 11.54 0.42 12.01
NOTE: The results of fun1a and fun1b would not be the same as those of fun2 and fun3 because of the duplicated IDs.
Here's a data.table approach combined with reshape2 (it should be very efficient):
library(data.table) # Loading `data.table` package
data2 <- setDT(data)[, list(Values = unlist(strsplit(Values, ","))), by = ID] # splitting the values by `,` for each `ID`
data2[, Var := paste0("v", seq_len(.N)), by = ID] # Adding the `Var` variable
library(reshape2) # Loading `reshape2` package
dcast.data.table(data2, ID ~ Var, fill = NA_character_, value.var = "Values") # casting to wide format
# ID v1 v2 v3 v4 v5 v6 v7
# 1: 1 1 2 3 NA NA NA NA
# 2: 2 4 NA NA NA NA NA NA
# 3: 3 NA NA NA NA NA NA NA
# 4: 4 4 1 6 5 1 1 6
# 5: 5 0 0 NA NA NA NA NA
