applying cumsum() from different starting points - r

I have data
library(data.table)
set.seed(42)
t <- data.table(time=1:1000, value=runif(100,0,1))
p <- data.table(id=1:10, cut=sample(1:100,5))
vals <- 1:5
> head(t)
time value
1: 1 0.9148060
2: 2 0.9370754
3: 3 0.2861395
4: 4 0.8304476
5: 5 0.6417455
6: 6 0.5190959
> head(p)
id cut
1: 1 63
2: 2 22
3: 3 99
4: 4 38
5: 5 91
6: 6 63
> vals
[1] 1 2 3 4 5
where t gives some vector of values associated with time points, and p gives for each person a cutoff in time.
I would like to get for each person the time units it takes to accumulate each of the values in vals.
My approach now is to use a for-loop that computes for each person a temporary vector of cumulative sums, starting at its specific cutoff in time. Next, I use findInterval() to obtain the positions at which cumsum reaches each of the levels in vals.
# Result matrix: one row per person in `p`, one column per level in `vals`
out <- matrix(NA, nrow = nrow(p), ncol = length(vals))
colnames(out) <- vals
# seq_len() yields an empty sequence when `p` has no rows, whereas
# 1:nrow(p) would iterate over c(1, 0)
for (i in seq_len(nrow(p))) {
  # Running total of `value` over the time points after this person's cutoff
  temp <- cumsum(t$value[t$time > p$cut[i]])
  temp <- temp[!is.na(temp)]
  # For each level, count how many running totals are still <= that level
  out[i, ] <- findInterval(vals, temp)
}
which should yield
1 2 3 4 5
[1,] 1 4 5 9 12
[2,] 1 2 5 6 7
[3,] 1 2 4 5 7
[4,] 1 3 5 7 8
[5,] 2 3 5 7 8
[6,] 1 4 5 9 12
[7,] 1 2 5 6 7
[8,] 1 2 4 5 7
[9,] 1 3 5 7 8
[10,] 2 3 5 7 8
This is of course heavily inefficient and doesn't do justice to the powers of R. Is there a way of speeding this up?

I'd do
# Precompute the running total of `value` over the full table; `:=` adds it
# to `t` as column `cs` by reference.
t[, cs := cumsum(value)]
# Work on the distinct cutoff times only: ids sharing a cutoff share the
# result, so each cutoff is computed once rather than once per id.
cuts = unique(p[, .(t_cut = cut)])
# For each cutoff, look up the running total at that time point (the `cs` of
# the row of `t` whose `time` equals `t_cut`) and store it as `v_cut`.
cuts[t, on=.(t_cut = time), v_cut := i.cs]
# Step 1: per cutoff, build the target running totals `pt` = total at the
# cutoff plus each level in `vals` (one row per cutoff/level combination).
# Step 2: find the time at which each target is reached.
cutres = cuts[, .(pt = vals + v_cut), by=t_cut][, .(
t_cut,
# `vals` is recycled down the rows; this lines up because step 1 emitted
# exactly length(vals) rows per cutoff, in `vals` order.
v = vals,
# Rolling join of the targets onto `t` by running total: roll=TRUE takes the
# last row of `t` whose `cs` does not exceed the target, `x.time` is that
# row's time, and subtracting `t_cut` expresses it as steps since the cutoff.
t_plus = t[.SD, on=.(cs = pt), roll=TRUE, x.time] - t_cut
)]
which gives
t_cut v t_plus
1: 63 1 1
2: 63 2 4
3: 63 3 5
4: 63 4 9
5: 63 5 12
6: 22 1 1
7: 22 2 2
8: 22 3 5
9: 22 4 6
10: 22 5 7
11: 99 1 1
12: 99 2 2
13: 99 3 4
14: 99 4 5
15: 99 5 7
16: 38 1 1
17: 38 2 3
18: 38 3 5
19: 38 4 7
20: 38 5 8
21: 91 1 2
22: 91 2 3
23: 91 3 5
24: 91 4 7
25: 91 5 8
t_cut v t_plus
If you want to map this back to id and get it as an id-by-vals table...
cutres[p, on=.(t_cut = cut), allow.cartesian=TRUE,
dcast(.SD, id ~ v, value.var = "t_plus")]
id 1 2 3 4 5
1: 1 1 4 5 9 12
2: 2 1 2 5 6 7
3: 3 1 2 4 5 7
4: 4 1 3 5 7 8
5: 5 2 3 5 7 8
6: 6 1 4 5 9 12
7: 7 1 2 5 6 7
8: 8 1 2 4 5 7
9: 9 1 3 5 7 8
10: 10 2 3 5 7 8
(Alternately, the key part can be done like t_plus = t[.SD, on=.(cs = pt), roll=TRUE, which=TRUE] - t_cut since t$time is the row number.)

Related

How to replace single occurrences with the previous status

I have a data table like below :
table=data.table(x=c(1:15),y=c(1,1,1,3,1,1,2,1,2,2,3,3,3,3,3),z=c(1:15)*3)
I have to clean this data table where there are single occurrences like a 3 in between the 1s and a 1 in between the 2s. It doesn't have to be a 3 but any number which occurs only once should be replaced by the previous number.
table=data.table(x=c(1:15),y=c(1,1,1,1,1,1,2,2,2,2,3,3,3,3,3),z=c(1:15)*3)
This is the expected data table.
Any help is appreciated.
Here's one way :
library(data.table)
# Count the number of rows in each run of consecutive equal `y` values
table[, N := .N, by = rleid(y)]
# Blank out `y` for runs of length one. A singleton in the very first row has
# no previous value to inherit, so it is left untouched.
table[, y := replace(y, N == 1L & seq_len(.N) > 1L, NA)]
# Carry the last non-NA value forward. na.rm = FALSE stops na.locf() from
# dropping leading NAs, which would return a vector shorter than the column.
table[, y := zoo::na.locf(y, na.rm = FALSE)][, N := NULL]
table
# x y z
# 1: 1 1 3
# 2: 2 1 6
# 3: 3 1 9
# 4: 4 1 12
# 5: 5 1 15
# 6: 6 1 18
# 7: 7 2 21
# 8: 8 2 24
# 9: 9 2 27
#10: 10 2 30
#11: 11 3 33
#12: 12 3 36
#13: 13 3 39
#14: 14 3 42
#15: 15 3 45
Here is a base R option
# Flag positions where y is strictly above or strictly below BOTH neighbours:
# the product of the backward difference (y[k] - y[k-1]) and the forward
# difference (y[k+1] - y[k]) is negative only at such a peak or dip.
# The vector is padded with its own first/last value, so the two end
# positions get a zero difference and are never flagged.
# NOTE(review): a run of length one that sits between a lower and a higher
# value (e.g. 1, 2, 3) has a positive product and is not flagged.
inds <- which(diff(c(head(table$y,1),table$y))*diff(c(table$y,tail(table$y,1)))<0)
# Overwrite each flagged position with the value immediately before it
# (the replacement values are read from the original vector).
table$y <- replace(table$y,inds,table$y[inds-1])
such that
> table
x y z
1: 1 1 3
2: 2 1 6
3: 3 1 9
4: 4 1 12
5: 5 1 15
6: 6 1 18
7: 7 2 21
8: 8 2 24
9: 9 2 27
10: 10 2 30
11: 11 3 33
12: 12 3 36
13: 13 3 39
14: 14 3 42
15: 15 3 45

R DataTable Solution Fast Reshape

data1=data.frame("StudentID"=c(1,2,3,4,5),
"a1cat"=c(9,10,2,0,10),
"a2cat"=c(0,2,8,6,7),
"a3cat"=c(4,2,1,6,5),
"a1dog"=c(8,4,4,5,8),
"a2dog"=c(1,9,10,5,7),
"a3dog"=c(9,3,2,7,7),
"q20fox"=c(2,8,6,1,9),
"q22fox"=c(8,10,9,6,6),
"q24fox"=c(5,0,2,9,7))
data2=data.frame("StudentID" = sort(rep(1:5,each=3)),
"timeX" = c(1,2,3,1,2,3,1,2,3,1,2,3,1,2,3),
"meow" = c(9,0,4,10,2,2,2,8,1,0,6,6,10,7,5),
"bark" = c(8,1,9,4,9,3,4,10,2,5,5,7,8,7,7),
"woof"=c(2,8,5,8,10,0,6,9,2,1,6,9,9,6,7))
I have 'data1' and wish to get 'data2' using data.table to reshape the data and give new names for each column.
data1x=data.frame("StudentID"=c(1,2,3,4,5),
"a1cat"=c(9,10,2,0,10),
"a2cat"=c(0,2,8,6,7),
"a3cat"=c(4,2,1,6,5),
"a1dog"=c(8,4,4,5,8),
"a2dog"=c(1,9,10,5,7),
"a3dog"=c(9,3,2,7,7),
"fox20"=c(2,8,6,1,9),
"fox22"=c(8,10,9,6,6),
"fox24"=c(5,0,2,9,7))
We can use melt with measure patterns
library(data.table)
melt(setDT(data1), measure = patterns("cat$", "dog$", "fox\\d*$"),
value.name = c("meow", "bark", "woof"),
variable.name = 'timeX')[order(StudentID)]
# StudentID timeX meow bark woof
# 1: 1 1 9 8 2
# 2: 1 2 0 1 8
# 3: 1 3 4 9 5
# 4: 2 1 10 4 8
# 5: 2 2 2 9 10
# 6: 2 3 2 3 0
# 7: 3 1 2 4 6
# 8: 3 2 8 10 9
# 9: 3 3 1 2 2
#10: 4 1 0 5 1
#11: 4 2 6 5 6
#12: 4 3 6 7 9
#13: 5 1 10 8 9
#14: 5 2 7 7 6
#15: 5 3 5 7 7

data.table manipulation and merging

I have data
dat1 <- data.table(id=1:8,
group=c(1,1,2,2,2,3,3,3),
value=c(5,6,10,11,12,20,21,22))
dat2 <- data.table(group=c(1,2,3),
value=c(3,6,13))
and I would like to subtract dat2$value from each of the dat1$value, based on group.
Is this possible using data.table or does it require additional packages?
With data.table, you could do:
library(data.table)
dat1[dat2, on = "group"][, new.value := value - i.value, by = "group"][]
Which returns:
id group value i.value new.value
1: 1 1 5 3 2
2: 2 1 6 3 3
3: 3 2 10 6 4
4: 4 2 11 6 5
5: 5 2 12 6 6
6: 6 3 20 13 7
7: 7 3 21 13 8
8: 8 3 22 13 9
Alternatively, you can do this in one step as akrun mentions:
dat1[dat2, newvalue := value - i.value, on = .(group)]
id group value newvalue
1: 1 1 5 2
2: 2 1 6 3
3: 3 2 10 4
4: 4 2 11 5
5: 5 2 12 6
6: 6 3 20 7
7: 7 3 21 8
8: 8 3 22 9

How do I select rows in a data frame before and after a condition is met?

I have been searching the web for a few days now and I can't find a solution to my (probably easy to solve) problem.
I have huge data frames with 4 variables and over a million observations each. Now I want to select the 100 rows before, all rows during, and the 1000 rows after a specific condition is met, and fill the rest with NAs. I tried it with a for loop and if/ifelse but it doesn't work so far. I think it shouldn't be a big thing, but at the moment I just can't get the hang of it.
I create the data using:
foo<-data.frame(t = 1:15, a = sample(1:15), b = c(1,1,1,1,1,4,4,4,4,1,1,1,1,1,1), c = sample(1:15))
My Data looks like this:
ID t a b c
1 1 4 1 7
2 2 7 1 10
3 3 10 1 6
4 4 2 1 4
5 5 13 1 9
6 6 15 4 3
7 7 8 4 15
8 8 3 4 1
9 9 9 4 2
10 10 14 1 8
11 11 5 1 11
12 12 11 1 13
13 13 12 1 5
14 14 6 1 14
15 15 1 1 12
What I want is to pick the value of a (in this example) 2 rows before, all rows while and 3 rows after the value of b is >1 and fill the rest with NA's. [Because this is just an example I guess you can imagine that after these 15 rows there are more rows with the value for b changing from 1 to 4 several times (I did not post it, so I won't spam the question with unnecessary data).]
So I want to get something like:
ID t a b c d
1 1 4 1 7 NA
2 2 7 1 10 NA
3 3 10 1 6 NA
4 4 2 1 4 2
5 5 13 1 9 13
6 6 15 4 3 15
7 7 8 4 15 8
8 8 3 4 1 3
9 9 9 4 2 9
10 10 14 1 8 14
11 11 5 1 11 5
12 12 11 1 13 11
13 13 12 1 5 NA
14 14 6 1 14 NA
15 15 1 1 12 NA
I'm thankful for any help.
Thank you.
Best regards,
Chris
here is the same attempt as missuse, but with data.table:
library(data.table)
foo <- data.frame(t = 1:11, a = sample(1:11), b = c(1,1,1,4,4,4,4,1,1,1,1), c = sample(1:11))
DT <- setDT(foo)
# Row numbers where the condition holds
hits <- DT[, which(b > 1)]
# Every row from 2 before to 3 after each hit (not just the two endpoints),
# so an isolated hit still gets its complete window
keep <- unique(as.vector(outer(hits, -2:3, "+")))
# Discard window positions that fall before the first or after the last row
keep <- keep[keep >= 1L & keep <= nrow(DT)]
# Copy `a` into `d` on the selected rows; all other rows of `d` stay NA
DT[keep, d := a]
t a b c d
1: 1 10 1 2 NA
2: 2 6 1 10 6
3: 3 5 1 7 5
4: 4 11 4 4 11
5: 5 4 4 9 4
6: 6 8 4 5 8
7: 7 2 4 8 2
8: 8 3 1 3 3
9: 9 7 1 6 7
10: 10 9 1 1 9
11: 11 1 1 11 NA
Here
unique(c(DT[,.I[b>1] ],DT[,.I[b>1]+3 ],DT[,.I[b>1]-2 ]))
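gives you the desired indices: the unique row indices where the condition holds, plus those same indices shifted by +3 and by -2. Note that this yields only the endpoints of each window rather than every row in between, so it covers the whole window only when the matching rows form a run long enough to fill the gaps, as they do in this example.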
Here is an attempt.
Get indexes that satisfy the condition b > 1
z <- which(foo$b > 1)
get indexes for (z - 2) : (z + 3)
# Collect, for every hit in z, the rows from 2 before to 3 after it
ind <- unique(unlist(lapply(z, function(x) {
  lo <- max(x - 2, 1)          # clamp at the first row when x - 2 < 1
  hi <- min(x + 3, nrow(foo))  # clamp at the last row when x + 3 > nrow(foo)
  lo:hi
})))
create d column filled with NA
foo$d <- NA
replace elements with appropriate indexes with foo$a
foo$d[ind] <- foo$a[ind]
library(dplyr)
library(purrr)
# example dataset
foo<-data.frame(t = 1:15,
a = sample(1:15),
b = c(1,1,1,1,1,4,4,4,4,1,1,1,1,1,1),
c = sample(1:15))
# function to get indices of interest
# for a given index x go 2 positions back and 3 forward
# keep only positive indices
# Window of positions around index x: two before through three after,
# restricted to the strictly positive ones.
GetIDsBeforeAfter <- function(x) {
  first_pos <- x - 2
  last_pos <- x + 3
  window <- first_pos:last_pos
  is_valid <- window > 0
  window[is_valid]
}
# Row positions to keep: the window around every row where b > 1.
# which() returns row positions directly, so nothing depends on column `t`
# happening to equal the row number.
ids_to_remain <- which(foo$b > 1) %>% # positions of rows where b > 1
  map(GetIDsBeforeAfter) %>%          # for each position apply the function
  unlist() %>%                        # unlist all sets of indices
  unique()                            # keep unique ones
# drop window positions that run past the last row
ids_to_remain <- ids_to_remain[ids_to_remain <= nrow(foo)]
# Start from an all-NA column and fill in only the selected rows; unlike
# negative indexing, this also gives an all-NA column when nothing matches.
foo$d <- NA
foo$d[ids_to_remain] <- foo$a[ids_to_remain] # copy column a (as asked) into d
foo
# t a b c d
# 1 1 5 1 8 NA
# 2 2 6 1 14 NA
# 3 3 4 1 10 NA
# 4 4 1 1 7 1
# 5 5 10 1 5 10
# 6 6 8 4 9 8
# 7 7 9 4 15 9
# 8 8 3 4 6 3
# 9 9 7 4 2 7
# 10 10 12 1 3 12
# 11 11 11 1 1 11
# 12 12 15 1 4 15
# 13 13 14 1 11 NA
# 14 14 13 1 13 NA
# 15 15 2 1 12 NA

Shifting row values by lag value in another column

I have a rather large dataset and I am interested in "marching" values forward through time based on values from another column. For example, if I have a Value = 3 at Time = 0 and a DesiredShift = 2, I want the 3 to shift down two rows to be at Time = 2. Here is a reproducible example.
Build reproducible fake data
library(data.table)
set.seed(1)
rowsPerID <- 8
dat <- CJ(1:2, 1:rowsPerID)
setnames(dat, c("ID","Time"))
dat[, Value := rpois(.N, 4)]
dat[, Shift := sample(0:2, size=.N, replace=TRUE)]
Fake Data
# ID Time Value Shift
# 1: 1 1 3 2
# 2: 1 2 3 2
# 3: 1 3 4 1
# 4: 1 4 7 2
# 5: 1 5 2 2
# 6: 1 6 7 0
# 7: 1 7 7 1
# 8: 1 8 5 0
# 9: 2 1 5 0
# 10: 2 2 1 1
# 11: 2 3 2 0
# 12: 2 4 2 1
# 13: 2 5 5 2
# 14: 2 6 3 1
# 15: 2 7 5 1
# 16: 2 8 4 1
I want each Value to shift forward according to the Shift column. So the
DesiredOutput column for row 3 will be equal to 3 since the value at Time=1 is
Value = 3 and Shift = 2.
Row 4 shows 3+4=7 since 3 shifts down 2 and 4 shifts down 1.
I would like to be able to do this by ID group and hopefully take advantage
of data.table since speed is of interest for this problem.
Desired Result
# ID Time Value Shift DesiredOutput
# 1: 1 1 3 2 NA
# 2: 1 2 3 2 NA
# 3: 1 3 4 1 3
# 4: 1 4 7 2 3+4 = 7
# 5: 1 5 2 2 NA
# 6: 1 6 7 0 7+7 = 14
# 7: 1 7 7 1 2
# 8: 1 8 5 0 7+5 = 12
# 9: 2 1 5 0 5
# 10: 2 2 1 1 NA
# 11: 2 3 2 0 1+2 = 3
# 12: 2 4 2 1 NA
# 13: 2 5 5 2 2
# 14: 2 6 3 1 NA
# 15: 2 7 5 1 3+5=8
# 16: 2 8 4 1 5
I was hoping to get this working using the data.table::shift function, but I am unsure how to make this work using multiple lag parameters.
Try this:
# Row each value should land on after moving down `Shift` rows
dat[, TargetIndex := .I + Shift]
# Last row of each ID, so that a value is never carried into the next ID's
# rows (relies on each ID's rows being contiguous, as CJ() produces them)
dat[, LastIndex := max(.I), by = ID]
# Sum all values landing on the same row, ignoring those shifted past the end
# of their own ID
toMerge <- dat[TargetIndex <= LastIndex, list(Out = sum(Value)), by = "TargetIndex"]
dat[, LastIndex := NULL]
# Reuse TargetIndex as the plain row number so the sums can be joined back on
dat[, TargetIndex := .I]
dat[toMerge, on = "TargetIndex", DesiredOutput := i.Out]
> dat
# ID Time Value Shift TargetIndex DesiredOutput
# 1: 1 1 3 2 1 NA
# 2: 1 2 3 2 2 NA
# 3: 1 3 4 1 3 3
# 4: 1 4 7 2 4 7
# 5: 1 5 2 2 5 NA
# 6: 1 6 7 0 6 14
# 7: 1 7 7 1 7 2
# 8: 1 8 5 0 8 12
# 9: 2 1 5 0 9 5
# 10: 2 2 1 1 10 NA
# 11: 2 3 2 0 11 3
# 12: 2 4 2 1 12 NA
# 13: 2 5 5 2 13 2
# 14: 2 6 3 1 14 NA
# 15: 2 7 5 1 15 8
# 16: 2 8 4 1 16 5

Resources