Extract and collapse non-missing elements by row in a data.table - r

I would like to extract all unique non-missing elements in each row and then collapse them using &&&&. Here is a small example:
#Load needed libraries:
library(data.table)
#Generate the data:
set.seed(1)
n_rows<-10
#Define function to apply to rows:
function_non_missing <- function(x){
  x <- x[!is.na(x)]          # drop NA values
  x <- x[x != "NA"]          # drop literal "NA" strings left by coercion
  x <- unique(x[order(x)])   # sort, then deduplicate
  paste(x, collapse = "&&&&")
}
data <- data.table(
  a = sample(c(1, 2, NA, NA), n_rows, replace = TRUE),
  b = sample(c(1, 2, NA, NA), n_rows, replace = TRUE),
  c = sample(c(1, 2, NA, NA), n_rows, replace = TRUE)
)
> data
a b c
1: 1 NA 1
2: NA NA NA
3: NA 1 1
4: 1 1 1
5: 2 1 1
6: 1 2 1
7: NA 2 2
8: NA 2 1
9: 2 2 1
10: 2 NA 2
#Obtain results
data[, paste(.SD), by = 1:nrow(data)][, function_non_missing(V1), by = nrow]
nrow V1
1: 1 1
2: 2
3: 3 1
4: 4 1
5: 5 1&&&&2
6: 6 1&&&&2
7: 7 2
8: 8 1&&&&2
9: 9 1&&&&2
10: 10 2
The above code looks very convoluted and I believe there might be better solutions.

Using melt():
data[, row := .I
][, melt(.SD, id.vars = "row")
][order(row, value), paste0(unique(value[!is.na(value)]), collapse = "&&&"), by = row]
row V1
1: 1 1
2: 2
3: 3 1
4: 4 1
5: 5 1&&&2
6: 6 1&&&2
7: 7 2
8: 8 1&&&2
9: 9 1&&&2
10: 10 2
Alternatively, using your original function:
data[, function_non_missing(unlist(.SD)), by = 1:nrow(data)]
nrow V1
1: 1 1
2: 2
3: 3 2
4: 4 1&&&&2
5: 5 1&&&&2
6: 6 1&&&&2
7: 7 1
8: 8 2
9: 9 1&&&&2
10: 10 1&&&&2

Probably using apply?
library(data.table)
data[, col := apply(.SD, 1, function(x)
paste(sort(unique(na.omit(x))), collapse = "&&&"))]
data
# a b c col
# 1: 1 NA 1 1
# 2: NA NA NA
# 3: NA 1 1 1
# 4: 1 1 1 1
# 5: 2 1 1 1&&&2
# 6: 1 2 1 1&&&2
# 7: NA 2 2 2
# 8: NA 2 1 1&&&2
# 9: 2 2 1 1&&&2
#10: 2 NA 2 2

Related

Is there some way to keep variable names from .SD + .SDcols together with non-.SD variable names in data.table?

Given a data.table
library(data.table)
DT = data.table(x=rep(c("b","a","c"),each=3), v=c(1,1,1,2,2,1,1,2,2), y=c(1,3,6), a=1:9, b=9:1)
DT
x v y a b
1: b 1 1 1 9
2: b 1 3 2 8
3: b 1 6 3 7
4: a 2 1 4 6
5: a 2 3 5 5
6: a 1 6 6 4
7: c 1 1 7 3
8: c 2 3 8 2
9: c 2 6 9 1
if one does
DT[, .(a, .SD), .SDcols=x:y]
a .SD.x .SD.v .SD.y
1: 1 b 1 1
2: 2 b 1 3
3: 3 b 1 6
4: 4 a 2 1
5: 5 a 2 3
6: 6 a 1 6
7: 7 c 1 1
8: 8 c 2 3
9: 9 c 2 6
the variables from .SDcols become prefixed by .SD. On the other hand, if one tries, as in https://stackoverflow.com/a/62282856/997979,
DT[, c(.(a), .SD), .SDcols=x:y]
V1 x v y
1: 1 b 1 1
2: 2 b 1 3
3: 3 b 1 6
4: 4 a 2 1
5: 5 a 2 3
6: 6 a 1 6
7: 7 c 1 1
8: 8 c 2 3
9: 9 c 2 6
the other variable name (a) becomes lost. (It is for this reason that I am re-asking the question, which I initially marked as a duplicate of the one linked above.)
Is there some way to keep the names from both .SD variables and non .SD variables?
The goal is to be able to simultaneously use .() to select variables without quotes and select variables through .SDcols = patterns("...").
Thanks in advance!
Not really sure why... but it works ;-)
DT[, .(a, (.SD)), .SDcols=x:y]
# a x v y
# 1: 1 b 1 1
# 2: 2 b 1 3
# 3: 3 b 1 6
# 4: 4 a 2 1
# 5: 5 a 2 3
# 6: 6 a 1 6
# 7: 7 c 1 1
# 8: 8 c 2 3
# 9: 9 c 2 6
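This also combines with the pattern-based column selection that the question names as the goal. A minimal sketch using the DT defined above; the regex is only for illustration:
# (.SD) keeps the .SDcols names, while a keeps its name from .()
DT[, .(a, (.SD)), .SDcols = patterns("^[xvy]$")]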

column mean for subset of rows in R data.table

I have a data.table like this:
example <- data.table(
id=rep(1:2,each=9),
day=rep(rep(1:3,each=3),2),
time=rep(1:3,6),
mean_day=rnorm(18)
)
I need to compute the mean across multiple days, but over a different set of days for each id.
To get the mean across days 1 to 2 for the first individual, I've tried the following (inspired by this post):
example[id==1, mean_over_days := mean(mean_day,na.rm=T), by=(day %in% 1:2)]
> example
id day time mean_day mean_over_days
1: 1 1 1 -1.53685184 -0.8908466
2: 1 1 2 0.77445521 -0.8908466
3: 1 1 3 -0.56048917 -0.8908466
4: 1 2 1 -1.78388960 -0.8908466
5: 1 2 2 0.01787129 -0.8908466
6: 1 2 3 -2.25617538 -0.8908466
7: 1 3 1 -0.44886190 -0.0955414
8: 1 3 2 -1.31086985 -0.0955414
9: 1 3 3 1.47310754 -0.0955414
10: 2 1 1 0.53560356 NA
11: 2 1 2 1.16654996 NA
12: 2 1 3 -0.06704728 NA
13: 2 2 1 -0.83897719 NA
14: 2 2 2 -0.85209939 NA
15: 2 2 3 -0.41392341 NA
16: 2 3 1 -0.03014190 NA
17: 2 3 2 0.43835822 NA
18: 2 3 3 -1.62432188 NA
I want all the lines for id==1 in column mean_over_days to have the same value (-0.8908466), but for day 3 this column instead holds the mean over day 3 only.
How can I change the code to correct this?
Don't subset in by; it creates separate groups for the TRUE and FALSE values of day %in% 1:2, which is why day 3 got its own mean. Subset inside j instead:
library(data.table)
# the trailing [] prints the updated table
example[id==1, mean_over_days := mean(mean_day[day %in% 1:2], na.rm=TRUE)][]
# id day time mean_day mean_over_days
# 1: 1 1 1 -0.56047565 0.4471527
# 2: 1 1 2 -0.23017749 0.4471527
# 3: 1 1 3 1.55870831 0.4471527
# 4: 1 2 1 0.07050839 0.4471527
# 5: 1 2 2 0.12928774 0.4471527
# 6: 1 2 3 1.71506499 0.4471527
# 7: 1 3 1 0.46091621 0.4471527
# 8: 1 3 2 -1.26506123 0.4471527
# 9: 1 3 3 -0.68685285 0.4471527
#10: 2 1 1 -0.44566197 NA
#11: 2 1 2 1.22408180 NA
#12: 2 1 3 0.35981383 NA
#13: 2 2 1 0.40077145 NA
#14: 2 2 2 0.11068272 NA
#15: 2 2 3 -0.55584113 NA
#16: 2 3 1 1.78691314 NA
#17: 2 3 2 0.49785048 NA
#18: 2 3 3 -1.96661716 NA
data
set.seed(123)
example <- data.table(
id=rep(1:2,each=9),
day=rep(rep(1:3,each=3),2),
time=rep(1:3,6),
mean_day=rnorm(18)
)
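To handle a different set of days per id in one pass (the stated need), one option is an update join against a small lookup table. A sketch, assuming the per-id day ranges are known up front; the lookup table and its ranges here are hypothetical:
# hypothetical lookup: which days to average over, per id
days_lookup <- data.table(id = 1:2, from = c(1, 1), to = c(2, 3))
example[days_lookup, on = .(id),
        mean_over_days := mean(mean_day[day %between% c(i.from, i.to)], na.rm = TRUE),
        by = .EACHI]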

Tracking the first incidence of each episode

I am currently using R to process a data set that looks like the following:
age ep
1 0
2 0
3 1
4 1
5 1
6 1
7 0
8 0
9 1
10 1
11 0
I want to create a variable that will keep track of the first occurrence of ep=1 per series of ep=1. These series will have ep=0 prior to the first ep=1 and ep=0 following the last ep=1 of each series.
I would like the data set to look like this after processing:
age ep first
1 0 NA
2 0 NA
3 1 1
4 1 NA
5 1 NA
6 1 NA
7 0 NA
8 0 NA
9 1 1
10 1 NA
11 0 NA
I am working in data table as this data set is rather large, so I'd prefer to process the data using code for data tables, however if this isn't possible I can convert to a data frame and use other code. Any assistance would be greatly appreciated.
A fast data.table method ...
library(data.table)
dt <- fread("age ep
1 0
2 0
3 1
4 1
5 1
6 1
7 0
8 0
9 1
10 1
11 0")
# flag rows where ep is 1 but the previous ep was not
dt[!shift(ep) & ep, first := 1]
# or more explicit:
dt[shift(ep) != 1 & ep == 1, first := 1]
dt
# age ep first
# 1: 1 0 NA
# 2: 2 0 NA
# 3: 3 1 1
# 4: 4 1 NA
# 5: 5 1 NA
# 6: 6 1 NA
# 7: 7 0 NA
# 8: 8 0 NA
# 9: 9 1 1
# 10: 10 1 NA
# 11: 11 0 NA
Note: if your object is not already a data.table, you can coerce it to one with:
setDT(dt)
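One edge case worth noting: shift(ep) is NA for the first row, so an episode starting at the very first row would not be flagged by either form above. A small sketch, assuming such rows should be flagged too; shift()'s fill argument makes the first-row comparison well-defined:
# fill = 0 supplies a previous value of 0 for row 1
dt[shift(ep, fill = 0) != 1 & ep == 1, first := 1]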
Another option using an update join
dt[, first := dt[dt[, .I[1], by=rleid(ep)]$V1][ep == 1][dt, on=.(age), ep]]
dt
# age ep first
# 1: 1 0 NA
# 2: 2 0 NA
# 3: 3 1 1
# 4: 4 1 NA
# 5: 5 1 NA
# 6: 6 1 NA
# 7: 7 0 NA
# 8: 8 0 NA
# 9: 9 1 1
#10: 10 1 NA
#11: 11 0 NA
Using data provided by @Khaynes.
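The one-liner above is dense; a sketch of the same logic unpacked into steps, with intermediate names used only for exposition:
# row index of the first row of each run of equal ep values
firsts <- dt[dt[, .I[1], by = rleid(ep)]$V1]
# keep only runs that are episodes (ep == 1)
episode_starts <- firsts[ep == 1]
# join back onto dt by age: ep is 1 for episode starts, NA otherwise
dt[, first := episode_starts[dt, on = .(age), ep]]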
An approach using fifelse
dt[, first := fifelse( ep == 1 & shift( ep , type = "lag" ) == 0L, 1L, NA_integer_) ]
dt
# age ep first
# 1: 1 0 NA
# 2: 2 0 NA
# 3: 3 1 1
# 4: 4 1 NA
# 5: 5 1 NA
# 6: 6 1 NA
# 7: 7 0 NA
# 8: 8 0 NA
# 9: 9 1 1
# 10: 10 1 NA
# 11: 11 0 NA
Another update join version, using mult="first" to only overwrite the first matching row in the group:
dt[, rid := rleid(ep)][dt[ep==1], on=.(rid), mult="first", first := 1]
dt
# age ep rid first
# 1: 1 0 1 NA
# 2: 2 0 1 NA
# 3: 3 1 2 1
# 4: 4 1 2 NA
# 5: 5 1 2 NA
# 6: 6 1 2 NA
# 7: 7 0 3 NA
# 8: 8 0 3 NA
# 9: 9 1 4 1
#10: 10 1 4 NA
#11: 11 0 5 NA
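If the rid helper column is not wanted in the result, it can be dropped afterwards:
dt[, rid := NULL]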

from two lists to one by binding elements

I have two lists with two elements each,
l1 <- list(data.table(id=1:5, group=1), data.table(id=1:5, group=1))
l2 <- list(data.table(id=1:5, group=2), data.table(id=1:5, group=2))
and I would like to rbind(.) the corresponding elements, resulting in a new list with two elements.
> l
[[1]]
id group
1: 1 1
2: 2 1
3: 3 1
4: 4 1
5: 5 1
6: 1 2
7: 2 2
8: 3 2
9: 4 2
10: 5 2
[[2]]
id group
1: 1 1
2: 2 1
3: 3 1
4: 4 1
5: 5 1
6: 1 2
7: 2 2
8: 3 2
9: 4 2
10: 5 2
However, I only find examples where rbind(.) is applied to bind across elements. I suspect that the solution lies somewhere in lapply(.), but lapply(c(l1,l2), rbind) concatenates the lists first, producing a list of four elements.
You can use mapply or Map. mapply (which stands for multivariate apply) applies the supplied function to the first elements of the arguments, then the second, then the third, and so on. Map is quite literally a wrapper around mapply that does not try to simplify the result (try running mapply with and without SIMPLIFY = TRUE). Shorter arguments are recycled as necessary.
mapply(x=l1, y=l2, function(x,y) rbind(x,y), SIMPLIFY = F)
#[[1]]
# id group
# 1: 1 1
# 2: 2 1
# 3: 3 1
# 4: 4 1
# 5: 5 1
# 6: 1 2
# 7: 2 2
# 8: 3 2
# 9: 4 2
#10: 5 2
#
#[[2]]
# id group
# 1: 1 1
# 2: 2 1
# 3: 3 1
# 4: 4 1
# 5: 5 1
# 6: 1 2
# 7: 2 2
# 8: 3 2
# 9: 4 2
#10: 5 2
As @Parfait pointed out, you can do this with Map:
Map(rbind, l1, l2)
#[[1]]
# id group
# 1: 1 1
# 2: 2 1
# 3: 3 1
# 4: 4 1
# 5: 5 1
# 6: 1 2
# 7: 2 2
# 8: 3 2
# 9: 4 2
#10: 5 2
#
#[[2]]
# id group
# 1: 1 1
# 2: 2 1
# 3: 3 1
# 4: 4 1
# 5: 5 1
# 6: 1 2
# 7: 2 2
# 8: 3 2
# 9: 4 2
#10: 5 2
Using tidyverse
library(tidyverse)
map2(l1, l2, bind_rows)

Shifting row values by lag value in another column

I have a rather large dataset and I am interested in "marching" values forward through time based on values from another column. For example, if I have a Value = 3 at Time = 0 and a DesiredShift = 2, I want the 3 to shift down two rows to be at Time = 2. Here is a reproducible example.
Build reproducible fake data
library(data.table)
set.seed(1)
rowsPerID <- 8
dat <- CJ(1:2, 1:rowsPerID)
setnames(dat, c("ID","Time"))
dat[, Value := rpois(.N, 4)]
dat[, Shift := sample(0:2, size=.N, replace=TRUE)]
Fake Data
# ID Time Value Shift
# 1: 1 1 3 2
# 2: 1 2 3 2
# 3: 1 3 4 1
# 4: 1 4 7 2
# 5: 1 5 2 2
# 6: 1 6 7 0
# 7: 1 7 7 1
# 8: 1 8 5 0
# 9: 2 1 5 0
# 10: 2 2 1 1
# 11: 2 3 2 0
# 12: 2 4 2 1
# 13: 2 5 5 2
# 14: 2 6 3 1
# 15: 2 7 5 1
# 16: 2 8 4 1
I want each Value to shift forward according to the Shift column. So the DesiredOutput column for row 3 will be equal to 3, since the value at Time=1 is Value = 3 with Shift = 2. Row 4 shows 3+4=7 since 3 shifts down 2 rows and 4 shifts down 1. I would like to be able to do this by ID group, and hopefully take advantage of data.table since speed is of interest for this problem.
Desired Result
# ID Time Value Shift DesiredOutput
# 1: 1 1 3 2 NA
# 2: 1 2 3 2 NA
# 3: 1 3 4 1 3
# 4: 1 4 7 2 3+4 = 7
# 5: 1 5 2 2 NA
# 6: 1 6 7 0 7+7 = 14
# 7: 1 7 7 1 2
# 8: 1 8 5 0 7+5 = 12
# 9: 2 1 5 0 5
# 10: 2 2 1 1 NA
# 11: 2 3 2 0 1+2 = 3
# 12: 2 4 2 1 NA
# 13: 2 5 5 2 2
# 14: 2 6 3 1 NA
# 15: 2 7 5 1 3+5=8
# 16: 2 8 4 1 5
I was hoping to get this working using the data.table::shift function, but I am unsure how to make this work using multiple lag parameters.
Try this:
# target row index for each value after shifting
dat[, TargetIndex := .I + Shift]
# sum all values landing on the same target row
toMerge = dat[, list(Out = sum(Value)), by='TargetIndex']
# reset TargetIndex to each row's own index for joining
dat[, TargetIndex := .I]
# dat = merge(dat, toMerge, by='TargetIndex', all=TRUE)
dat[toMerge, on='TargetIndex', DesiredOutput := i.Out]
> dat
# ID Time Value Shift TargetIndex DesiredOutput
# 1: 1 1 3 2 1 NA
# 2: 1 2 3 2 2 NA
# 3: 1 3 4 1 3 3
# 4: 1 4 7 2 4 7
# 5: 1 5 2 2 5 NA
# 6: 1 6 7 0 6 14
# 7: 1 7 7 1 7 2
# 8: 1 8 5 0 8 12
# 9: 2 1 5 0 9 5
# 10: 2 2 1 1 10 NA
# 11: 2 3 2 0 11 3
# 12: 2 4 2 1 12 NA
# 13: 2 5 5 2 13 2
# 14: 2 6 3 1 14 NA
# 15: 2 7 5 1 15 8
# 16: 2 8 4 1 16 5
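Note that .I indexes rows globally, so in principle a large Shift near the end of one ID could land on the next ID's rows. A sketch of a group-aware variant, assuming shifts should stay within each ID; rowid(ID) numbers rows within each group:
# number rows within each ID so shifts cannot cross group boundaries
dat[, TargetIndex := rowid(ID) + Shift]
toMerge <- dat[, .(Out = sum(Value)), by = .(ID, TargetIndex)]
dat[, TargetIndex := rowid(ID)]
dat[toMerge, on = .(ID, TargetIndex), DesiredOutput := i.Out]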
