column mean for subset of rows in R data.table - r

I have a data.table like this:
example <- data.table(
id=rep(1:2,each=9),
day=rep(rep(1:3,each=3),2),
time=rep(1:3,6),
mean_day=rnorm(18)
)
I need to compute the mean across multiple days, but different ones between id's.
To get the mean across days 1 to 2 for the first individual I've tried the following (inspired by this post):
example[id==1, mean_over_days := mean(mean_day,na.rm=T), by=(day %in% 1:2)]
> example
id day time mean_day mean_over_days
1: 1 1 1 -1.53685184 -0.8908466
2: 1 1 2 0.77445521 -0.8908466
3: 1 1 3 -0.56048917 -0.8908466
4: 1 2 1 -1.78388960 -0.8908466
5: 1 2 2 0.01787129 -0.8908466
6: 1 2 3 -2.25617538 -0.8908466
7: 1 3 1 -0.44886190 -0.0955414
8: 1 3 2 -1.31086985 -0.0955414
9: 1 3 3 1.47310754 -0.0955414
10: 2 1 1 0.53560356 NA
11: 2 1 2 1.16654996 NA
12: 2 1 3 -0.06704728 NA
13: 2 2 1 -0.83897719 NA
14: 2 2 2 -0.85209939 NA
15: 2 2 3 -0.41392341 NA
16: 2 3 1 -0.03014190 NA
17: 2 3 2 0.43835822 NA
18: 2 3 3 -1.62432188 NA
I want all the lines for id==1 of column mean_over_days to have the same value (-0.8908466), but it happens that for day 3 this column has the mean over that day only.
How can I change the code to correct this?

Don't subset in by it would create different groups for TRUE and FALSE values.
library(data.table)
example[id==1, mean_over_days := mean(mean_day[day %in% 1:2],na.rm=TRUE)][]
# id day time mean_day mean_over_days
# 1: 1 1 1 -0.56047565 0.4471527
# 2: 1 1 2 -0.23017749 0.4471527
# 3: 1 1 3 1.55870831 0.4471527
# 4: 1 2 1 0.07050839 0.4471527
# 5: 1 2 2 0.12928774 0.4471527
# 6: 1 2 3 1.71506499 0.4471527
# 7: 1 3 1 0.46091621 0.4471527
# 8: 1 3 2 -1.26506123 0.4471527
# 9: 1 3 3 -0.68685285 0.4471527
#10: 2 1 1 -0.44566197 NA
#11: 2 1 2 1.22408180 NA
#12: 2 1 3 0.35981383 NA
#13: 2 2 1 0.40077145 NA
#14: 2 2 2 0.11068272 NA
#15: 2 2 3 -0.55584113 NA
#16: 2 3 1 1.78691314 NA
#17: 2 3 2 0.49785048 NA
#18: 2 3 3 -1.96661716 NA
data
set.seed(123)
example <- data.table(
id=rep(1:2,each=9),
day=rep(rep(1:3,each=3),2),
time=rep(1:3,6),
mean_day=rnorm(18)
)

Related

Extract and collapse non-missing elements by row in the data.table

I would like to extract all unique non missing elements in a row and then collapse them using &&&&. Here comes a small example:
#Load needed libraries:
library(data.table)
#Generate the data:
set.seed(1)
n_rows<-10
#Define function to apply to rows:
function_non_missing<-function(x){
x<-x[!is.na(x)]
x<-x[x!="NA"]
x<-unique(x[order(x)])
paste(x,collapse="&&&&")
}
data<-data.table(
a=sample(c(1,2,NA,NA),n_rows,replace=TRUE),
b=sample(c(1,2,NA,NA),n_rows,replace=TRUE),
c=sample(c(1,2,NA,NA),n_rows,replace=TRUE)
)
> data
a b c
1: 1 NA 1
2: NA NA NA
3: NA 1 1
4: 1 1 1
5: 2 1 1
6: 1 2 1
7: NA 2 2
8: NA 2 1
9: 2 2 1
10: 2 NA 2
#Obtain results
data[,paste(.SD),by=1:nrow(data)][,function_non_missing(V1),by=nrow]
nrow V1
1: 1 1
2: 2
3: 3 1
4: 4 1
5: 5 1&&&&2
6: 6 1&&&&2
7: 7 2
8: 8 1&&&&2
9: 9 1&&&&2
10: 10 2
The above code looks very convoluted and I believe there might be better solutions.
Using melt() / dcast():
data[, row := .I
][, melt(.SD, id.vars = "row")
][order(row, value), paste0(unique(value[!is.na(value)]), collapse = "&&&"), by = row]
row V1
1: 1 1
2: 2
3: 3 1
4: 4 1
5: 5 1&&&2
6: 6 1&&&2
7: 7 2
8: 8 1&&&2
9: 9 1&&&2
10: 10 2
Alterntively using your original function:
data[, function_non_missing(unlist(.SD)), by = 1:nrow(data)]
nrow V1
1: 1 1
2: 2
3: 3 2
4: 4 1&&&&2
5: 5 1&&&&2
6: 6 1&&&&2
7: 7 1
8: 8 2
9: 9 1&&&&2
10: 10 1&&&&2
Probably using apply?
library(data.table)
data[, col := apply(.SD, 1, function(x)
paste(sort(unique(na.omit(x))), collapse = "&&&"))]
data
# a b c col
# 1: 1 NA 1 1
# 2: NA NA NA
# 3: NA 1 1 1
# 4: 1 1 1 1
# 5: 2 1 1 1&&&2
# 6: 1 2 1 1&&&2
# 7: NA 2 2 2
# 8: NA 2 1 1&&&2
# 9: 2 2 1 1&&&2
#10: 2 NA 2 2

How to compute a new variable based on the number of days since a particular type of record

I'm trying to create a variable that shows the number of days since a particular event occurred. This is a follow up to this previous question, using the same data.
The data looks like this (note dates are in DD-MM-YYYY format):
ID date drug score
A 28/08/2016 2 3
A 29/08/2016 1 4
A 30/08/2016 2 4
A 2/09/2016 2 4
A 3/09/2016 1 4
A 4/09/2016 2 4
B 8/08/2016 1 3
B 9/08/2016 2 4
B 10/08/2016 2 3
B 11/08/2016 1 3
C 30/11/2016 2 4
C 2/12/2016 1 5
C 3/12/2016 2 1
C 5/12/2016 1 4
C 6/12/2016 2 4
C 8/12/2016 1 2
C 9/12/2016 1 2
For 'drug': 1=drug taken, 2=no drug taken.
Each time the value of drug is 1, if that ID has a previous record that is also drug==1, then I need to generate a new value 'lagtime' that shows the number of days (not the number of rows!) since the previous time the drug was taken.
So the output I am looking for is:
ID date drug score lagtime
A 28/08/2016 2 3
A 29/08/2016 1 4
A 30/08/2016 2 4
A 2/09/2016 2 4
A 3/09/2016 1 4 5
A 4/09/2016 2 4
B 8/08/2016 1 3
B 9/08/2016 2 4
B 10/08/2016 2 3
B 11/08/2016 1 3 3
C 30/11/2016 2 4
C 2/12/2016 1 5
C 3/12/2016 2 1
C 5/12/2016 1 4 3
C 6/12/2016 2 4
C 8/12/2016 1 2 3
C 9/12/2016 1 2 1
So I need a way to generate (mutate?) this lagtime score that is calculated as the date for each drug==1 record, minus the date of the previous drug==1 record, grouped by ID.
This has me completely bamboozled.
Here's code for the example data:
data<-data.frame(ID=c("A","A","A","A","A","A","B","B","B","B","C","C","C","C","C","C","C"),
date=as.Date(c("28/08/2016","29/08/2016","30/08/2016","2/09/2016","3/09/2016","4/09/2016","8/08/2016","9/08/2016","10/08/2016","11/08/2016","30/11/2016","2/12/2016","3/12/2016","5/12/2016","6/12/2016","8/12/2016","9/12/2016"),format= "%d/%m/%Y"),
drug=c(2,1,2,2,1,2,1,2,2,1,2,1,2,1,2,1,1),
score=c(3,4,4,4,4,4,3,4,3,3,4,5,1,4,4,2,2))
We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(data)), grouped by 'ID', specify the i (drug ==1), get the difference of 'date' (diff(date)), concatenate with NA as the diff output length is 1 less than the original vector, convert to integer and assign (:=) to create the 'lagtime'. By default, all other values will be NA
library(data.table)
setDT(data)[drug==1, lagtime := as.integer(c(NA, diff(date))), ID]
data
# ID date drug score lagtime
# 1: A 2016-08-28 2 3 NA
# 2: A 2016-08-29 1 4 NA
# 3: A 2016-08-30 2 4 NA
# 4: A 2016-09-02 2 4 NA
# 5: A 2016-09-03 1 4 5
# 6: A 2016-09-04 2 4 NA
# 7: B 2016-08-08 1 3 NA
# 8: B 2016-08-09 2 4 NA
# 9: B 2016-08-10 2 3 NA
#10: B 2016-08-11 1 3 3
#11: C 2016-11-30 2 4 NA
#12: C 2016-12-02 1 5 NA
#13: C 2016-12-03 2 1 NA
#14: C 2016-12-05 1 4 3
#15: C 2016-12-06 2 4 NA
#16: C 2016-12-08 1 2 3
#17: C 2016-12-09 1 2 1

Shifting row values by lag value in another column

I have a rather large dataset and I am interested in "marching" values forward through time based on values from another column. For example, if I have a Value = 3 at Time = 0 and a DesiredShift = 2, I want the 3 to shift down two rows to be at Time = 2. Here is a reproducible example.
Build reproducible fake data
library(data.table)
set.seed(1)
rowsPerID <- 8
dat <- CJ(1:2, 1:rowsPerID)
setnames(dat, c("ID","Time"))
dat[, Value := rpois(.N, 4)]
dat[, Shift := sample(0:2, size=.N, replace=TRUE)]
Fake Data
# ID Time Value Shift
# 1: 1 1 3 2
# 2: 1 2 3 2
# 3: 1 3 4 1
# 4: 1 4 7 2
# 5: 1 5 2 2
# 6: 1 6 7 0
# 7: 1 7 7 1
# 8: 1 8 5 0
# 9: 2 1 5 0
# 10: 2 2 1 1
# 11: 2 3 2 0
# 12: 2 4 2 1
# 13: 2 5 5 2
# 14: 2 6 3 1
# 15: 2 7 5 1
# 16: 2 8 4 1
I want each Value to shift forward according the the Shift column. So the
DesiredOutput column for row 3 will be equal to 3 since the value at Time=1 is
Value = 3 and Shift = 2.
Row 4 shows 3+4=7 since 3 shifts down 2 and 4 shifts down 1.
I would like to be able to do this by ID group and hopefully take advantage
of data.table since speed is of interest for this problem.
Desired Result
# ID Time Value Shift DesiredOutput
# 1: 1 1 3 2 NA
# 2: 1 2 3 2 NA
# 3: 1 3 4 1 3
# 4: 1 4 7 2 3+4 = 7
# 5: 1 5 2 2 NA
# 6: 1 6 7 0 7+7 = 14
# 7: 1 7 7 1 2
# 8: 1 8 5 0 7+5 = 12
# 9: 2 1 5 0 5
# 10: 2 2 1 1 NA
# 11: 2 3 2 0 1+2 = 3
# 12: 2 4 2 1 NA
# 13: 2 5 5 2 2
# 14: 2 6 3 1 NA
# 15: 2 7 5 1 3+5=8
# 16: 2 8 4 1 5
I was hoping to get this working using the data.table::shift function, but I am unsure how to make this work using multiple lag parameters.
Try this:
dat[, TargetIndex:= .I + Shift]
toMerge = dat[, list(Out = sum(Value)), by='TargetIndex']
dat[, TargetIndex:= .I]
# dat = merge(dat, toMerge, by='TargetIndex', all=TRUE)
dat[toMerge, on='TargetIndex', DesiredOutput:= i.Out]
> dat
# ID Time Value Shift TargetIndex DesiredOutput
# 1: 1 1 3 2 1 NA
# 2: 1 2 3 2 2 NA
# 3: 1 3 4 1 3 3
# 4: 1 4 7 2 4 7
# 5: 1 5 2 2 5 NA
# 6: 1 6 7 0 6 14
# 7: 1 7 7 1 7 2
# 8: 1 8 5 0 8 12
# 9: 2 1 5 0 9 5
# 10: 2 2 1 1 10 NA
# 11: 2 3 2 0 11 3
# 12: 2 4 2 1 12 NA
# 13: 2 5 5 2 13 2
# 14: 2 6 3 1 14 NA
# 15: 2 7 5 1 15 8
# 16: 2 8 4 1 16 5

Replace all values in a data.table given a condition

How would you replace all values in a data.table given a condition?
For example
ppp <- data.table(A=1:6,B=6:1,C=1:6,D=3:8)
A B C D
1 6 1 3
2 5 2 4
3 4 3 5
4 3 4 6
5 2 5 7
6 1 6 8
I want to replace all "6" by NA
A B C D
1 NA 1 3
2 5 2 4
3 4 3 5
4 3 4 NA
5 2 5 7
NA 1 6 8
I've tried something like
ppp[,ifelse(.SD==6,NA,.SD)]
but it doesn't work, it produces a much wider table.
A native data.table way to do this would be:
for(col in names(ppp)) set(ppp, i=which(ppp[[col]]==6), j=col, value=NA)
# Test
> ppp
A B C D
1: 1 NA 1 3
2: 2 5 2 4
3: 3 4 3 5
4: 4 3 4 NA
5: 5 2 5 7
6: NA 1 NA 8
This approach - while perhaps more verbose - is nevertheless going to be significantly faster than ppp[ppp == 6] <- NA, because it avoids the copying of all columns.
Even easier:
ppp[ppp == 6] <- NA
ppp
A B C D
1: 1 NA 1 3
2: 2 5 2 4
3: 3 4 3 5
4: 4 3 4 NA
5: 5 2 5 7
6: NA 1 NA 8
Importantly, this doesn't change its class:
is.data.table(ppp)
[1] TRUE

R data.table conditional selection

I have a data.table
set.seed(1)
DT <- data.table(tag = rep(LETTERS[1:4],each = 2, times = 3),
year = rep(1:3, each = 8), month = rep(c(1:2), 12),
value = runif(24, 1, 10))
DT
tag year month value
1: A 1 1 3.389578
2: A 1 2 4.349115
3: B 1 1 6.155680
4: B 1 2 9.173870
5: C 1 1 2.815137
6: C 1 2 9.085507
7: D 1 1 9.502077
8: D 1 2 6.947180
9: A 2 1 6.662026
10: A 2 2 1.556076
11: B 2 1 2.853771
12: B 2 2 2.589011
13: C 2 1 7.183206
14: C 2 2 4.456933
15: D 2 1 7.928573
16: D 2 2 5.479293
17: A 3 1 7.458567
18: A 3 2 9.927155
19: B 3 1 4.420317
20: B 3 2 7.997007
21: C 3 1 9.412347
22: C 3 2 2.909283
23: D 3 1 6.865064
24: D 3 2 2.129996
Sort this DT by year, month, and -value:
setorder(DT, year, month, -value)
will produce:
tag year month value
1: D 1 1 9.502077
2: B 1 1 6.155680
3: A 1 1 3.389578
4: C 1 1 2.815137
5: B 1 2 9.173870
6: C 1 2 9.085507
7: D 1 2 6.947180
8: A 1 2 4.349115
9: D 2 1 7.928573
10: C 2 1 7.183206
11: A 2 1 6.662026
12: B 2 1 2.853771
13: D 2 2 5.479293
14: C 2 2 4.456933
15: B 2 2 2.589011
16: A 2 2 1.556076
17: C 3 1 9.412347
18: A 3 1 7.458567
19: D 3 1 6.865064
20: B 3 1 4.420317
21: A 3 2 9.927155
22: B 3 2 7.997007
23: C 3 2 2.909283
24: D 3 2 2.129996
I would like the result to be like following:
tag year month value
1: D 1 1 9.502077
2: B 1 1 6.155680
3: B 1 2 9.173870
4: D 1 2 6.947180
5: D 2 1 7.928573
6: C 2 1 7.183206
7: D 2 2 5.479293
8: C 2 2 4.456933
9: C 3 1 9.412347
10: A 3 1 7.458567
11: A 3 2 9.927155
12: C 3 2 2.909283
The result DT would have the following property: within each year keep only the two tags that have larger value in month 1. For example, year 1, the two tags that have larger value are D and B, so keep D and B for the whole year 1. year 2 keeps D and C. At each month 1, I need to reselect the rows with the year.
We group by 'year', get the first two elements in 'tag' for 'month' 1 (as it is already ordered), create a logical index using %in% and subset the rows.
DT[, .SD[tag %in% head(tag[month ==1],2)], .(year)]
# year tag month value
# 1: 1 D 1 9.502077
# 2: 1 B 1 6.155680
# 3: 1 B 2 9.173870
# 4: 1 D 2 6.947180
# 5: 2 D 1 7.928573
# 6: 2 C 1 7.183206
# 7: 2 D 2 5.479293
# 8: 2 C 2 4.456933
# 9: 3 C 1 9.412347
#10: 3 A 1 7.458567
#11: 3 A 2 9.927155
#12: 3 C 2 2.909283

Resources