Group rows up to current row in r data.table - r

I have a dataset that looks like this:
library(data.table)
# Reproducible toy data: one row per consecutive day, each with a random
# subject (a-d) and a random response (1-3).
set.seed(10)
n_rows <- 50
data <- data.table(
  id = seq_len(n_rows),
  timestamp = Sys.Date() + as.difftime(seq_len(n_rows), units = "days"),
  subject = sample(letters[1:4], n_rows, replace = TRUE),
  response = sample(3, n_rows, replace = TRUE)
)
head(data, 10)
id timestamp subject response
1: 1 2016-05-17 c 2
2: 2 2016-05-18 b 3
3: 3 2016-05-19 b 1
4: 4 2016-05-20 c 2
5: 5 2016-05-21 a 1
6: 6 2016-05-22 a 2
7: 7 2016-05-23 b 2
8: 8 2016-05-24 b 2
9: 9 2016-05-25 c 2
10: 10 2016-05-26 b 2
I need to do some group-by operations that count the occurrences of each response, by subject, to date.
The below group by produces the nth_test column.
# Number each subject's rows 1, 2, ... in their current order (nth_test).
# seq_len(.N) is used instead of 1:.N so a zero-length group can never yield c(1, 0).
new_vars <- data[, .(id, timestamp, nth_test = seq_len(.N), response), by = .(subject)]
subject id timestamp nth_test response
1: c 1 2016-05-17 1 2
2: c 4 2016-05-20 2 2
3: c 9 2016-05-25 3 2
4: c 11 2016-05-27 4 1
5: c 12 2016-05-28 5 1
6: c 14 2016-05-30 6 2
7: c 22 2016-06-07 7 2
8: c 26 2016-06-11 8 2
9: c 31 2016-06-16 9 3
10: c 36 2016-06-21 10 1
But I don't know how to produce columns resp_1, resp_2 & resp_3 like below.
subject id timestamp nth_test response resp_1 resp_2 resp_3
1: c 1 2016-05-17 1 2 0 1 0
2: c 4 2016-05-20 2 2 0 2 0
3: c 9 2016-05-25 3 2 0 3 0
4: c 11 2016-05-27 4 1 1 3 0
5: c 12 2016-05-28 5 1 2 3 0
6: c 14 2016-05-30 6 2 2 4 0
7: c 22 2016-06-07 7 2 2 5 0
8: c 26 2016-06-11 8 2 2 6 0
9: c 31 2016-06-16 9 3 2 6 1
10: c 36 2016-06-21 10 1 3 6 1
Cheers

We can try
# Distinct response values in ascending order, so resp_1, resp_2, ... are
# created in that order.
Un1 <- sort(unique(data$response))
# Per subject, add by reference: the running test number and, for each response
# value, a running count of how often that value has occurred so far.
# Assumes the rows are already in timestamp order within each subject.
data[, c("nth_test", paste0("resp_", Un1)) := c(
  list(seq_len(.N)),
  lapply(Un1, function(x) cumsum(x == response))
), by = .(subject)]
data[order(subject, timestamp)][subject == "c"]
# id timestamp subject response nth_test resp_1 resp_2 resp_3
# 1: 1 2016-05-17 c 2 1 0 1 0
# 2: 4 2016-05-20 c 2 2 0 2 0
# 3: 9 2016-05-25 c 2 3 0 3 0
# 4: 11 2016-05-27 c 1 4 1 3 0
# 5: 12 2016-05-28 c 1 5 2 3 0
# 6: 14 2016-05-30 c 2 6 2 4 0
# 7: 22 2016-06-07 c 2 7 2 5 0
# 8: 26 2016-06-11 c 2 8 2 6 0
# 9: 31 2016-06-16 c 3 9 2 6 1
#10: 36 2016-06-21 c 1 10 3 6 1
#11: 39 2016-06-24 c 1 11 4 6 1
#12: 40 2016-06-25 c 1 12 5 6 1
#13: 44 2016-06-29 c 2 13 5 7 1

I wanted to see what this would look like if the cummax/cumsum was done while the data.table was in long form (might be more efficient in certain configurations):
> data[order(subject, timestamp)
+ ][, rCnt := 1:.N, .(subject, response)
+ ][, responseStr := sprintf('%s_%s', 'resp', response)
+ ][, dcast(.SD, id + timestamp + subject + response ~ responseStr, value.var='rCnt', fill=0)
+ ][, melt(.SD, id.vars=c('id', 'timestamp', 'subject', 'response'))
+ ][order(subject, timestamp)
+ ][, value := cummax(value), .(subject, variable)
+ ][, nth_test := 1:.N, .(subject, variable)
+ ][, dcast(.SD, id + timestamp + subject + response + nth_test ~ variable, value.var='value')
+ ][order(subject, timestamp)
+ ][subject == 'c'
+ ]
id timestamp subject response nth_test resp_1 resp_2 resp_3
1: 1 2016-05-17 c 2 1 0 1 0
2: 4 2016-05-20 c 2 2 0 2 0
3: 9 2016-05-25 c 2 3 0 3 0
4: 11 2016-05-27 c 1 4 1 3 0
5: 12 2016-05-28 c 1 5 2 3 0
6: 14 2016-05-30 c 2 6 2 4 0
7: 22 2016-06-07 c 2 7 2 5 0
8: 26 2016-06-11 c 2 8 2 6 0
9: 31 2016-06-16 c 3 9 2 6 1
10: 36 2016-06-21 c 1 10 3 6 1
11: 39 2016-06-24 c 1 11 4 6 1
12: 40 2016-06-25 c 1 12 5 6 1
13: 44 2016-06-29 c 2 13 5 7 1
>

Related

R subset a data.table only criteria is met, by group

I would like to subset a data.table where a certain criterion is met, but keep another subset of the data if the criterion is not met, by group.
E.g. in the following example data, I would like to subset every ID group where year = 2020, but if 2020 does not exist in that ID group, just keep the data where year = 0 only
library(data.table)
# dummy data: IDs A and B have 12 rows each, C has 6; only A has year 2020
dt <- data.table(
  ID = c(rep("A", 12), rep("B", 12), rep("C", 6)),
  year = c(rep(2020, 6), rep(0, 6), rep(2019, 6), rep(0, 12)),
  hour = rep(1:6, 5),
  N = sample(1:20, 30, replace = TRUE)
)
# desired result:
ID year hour N
1: A 2020 1 12
2: A 2020 2 18
3: A 2020 3 2
4: A 2020 4 7
5: A 2020 5 18
6: A 2020 6 8
7: B 0 1 18
8: B 0 2 17
9: B 0 3 2
10: B 0 4 3
11: B 0 5 9
12: B 0 6 5
13: C 0 1 17
14: C 0 2 19
15: C 0 3 5
16: C 0 4 11
17: C 0 5 20
18: C 0 6 15
I've only done this by subsetting twice and binding, e.g. ;
# All 2020 rows ...
dt_res1 <- dt[year == 2020]
# ... plus the year-0 rows of every ID that has no 2020 rows at all.
dt_res2 <- dt[!(ID %in% dt_res1[, ID])][year == 0]
rbindlist(list(dt_res1, dt_res2), use.names = TRUE)
This?
# Per ID: keep the 2020 rows when the group has any, otherwise keep its year-0 rows.
dt[, {
  keep_year <- if (2020 %in% year) 2020 else 0
  .SD[year == keep_year]
}, by = ID]
# ID year hour N
# <char> <num> <int> <int>
# 1: A 2020 1 10
# 2: A 2020 2 5
# 3: A 2020 3 7
# 4: A 2020 4 15
# 5: A 2020 5 1
# 6: A 2020 6 12
# 7: B 0 1 10
# 8: B 0 2 9
# 9: B 0 3 3
# 10: B 0 4 19
# 11: B 0 5 15
# 12: B 0 6 6
# 13: C 0 1 9
# 14: C 0 2 9
# 15: C 0 3 3
# 16: C 0 4 12
# 17: C 0 5 2
# 18: C 0 6 18

Subsetting panel observations

I have a data.table with firm information.
library(data.table)
DT <- fread("
iso Firm GDP year
A 1 1 1
A 2 1 1
A 3 1 1
A 4 1 1
A 5 3 2
A 6 3 2
A 7 3 2
A 8 3 2
B 9 2 1
B 10 2 1
B 11 2 1
B 12 2 1
B 13 4 1
B 14 4 1
B 15 4 1
B 16 4 1",
header = TRUE)
I want to calculate GDPgrowth (per country) from one year to the other and add it to the dataset ((N-O)/O). However, if I do:
DT <- DT[,GDPgrowth :=((GDP- shift(GDP))/shift(GDP)), by=iso]
the outcome will be zero because it subtracts the firm observations from each other.
How can I make sure it calculates for the whole group of firms belonging to the country together?
Desired output:
library(data.table)
DT <- fread("
iso Firm GDP GDPgrowth year
A 1 1 NA 1
A 2 1 NA 1
A 3 1 NA 1
A 4 1 NA 1
A 5 3 2 2
A 6 3 2 2
A 7 3 2 2
A 8 3 2 2
B 9 2 NA 1
B 10 2 NA 1
B 11 2 NA 1
B 12 2 NA 1
B 13 4 1 1
B 14 4 1 1
B 15 4 1 1
B 16 4 1 1",
header = TRUE)
Here is one way continuing from your current approach :
library(data.table)
# Collapse the firm rows to one row per distinct (iso, year, GDP) level, so that
# growth is measured between GDP levels rather than between neighbouring firms.
# Assumes the rows of DT are in chronological order within each country.
gdp_levels <- unique(DT[, .(iso, year, GDP)])
gdp_levels[, GDPgrowth := (GDP - shift(GDP)) / shift(GDP), by = iso]
# Update join: copy each level's growth back onto every firm row at that level.
# Unlike blanking every 0 to NA, this keeps a genuine zero growth rate as 0.
DT[gdp_levels, GDPgrowth := i.GDPgrowth, on = .(iso, year, GDP)]
DT
# iso Firm GDP year GDPgrowth
# 1: A 1 1 1 NA
# 2: A 2 1 1 NA
# 3: A 3 1 1 NA
# 4: A 4 1 1 NA
# 5: A 5 3 2 2
# 6: A 6 3 2 2
# 7: A 7 3 2 2
# 8: A 8 3 2 2
# 9: B 9 2 1 NA
#10: B 10 2 1 NA
#11: B 11 2 1 NA
#12: B 12 2 1 NA
#13: B 13 4 1 1
#14: B 14 4 1 1
#15: B 15 4 1 1
#16: B 16 4 1 1
Using dplyr and tidyr::fill it can be done as
# dplyr/tidyr pipeline: compute row-to-row GDP change within each country, blank
# the zeros produced by neighbouring firms sharing the same GDP, then fill the
# blanks within each (iso, year) group.
# NOTE(review): the `== 0` replacement also blanks a genuine zero growth rate.
library(dplyr)
DT %>%
group_by(iso) %>%
mutate(GDPgrowth = (GDP - lag(GDP))/lag(GDP),
GDPgrowth = replace(GDPgrowth, GDPgrowth == 0, NA)) %>%
# regroup so that values are only filled within the same country and year
group_by(iso, year) %>%
tidyr::fill(GDPgrowth)

When 0 in x is odd, how to assign id value between this zero and the next zero to the new variable ref

# Example input: a 0/1 vector plus a running row id.
x <- c(0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1)
aaa <- data.frame(x)
# seq_len() stays correct (empty) even if the data frame has zero rows.
aaa$id <- seq_len(nrow(aaa))
When the running count of zeros in x is odd (i.e. at the 1st, 3rd, 5th, ... zero), how can I copy the id values from that zero up to, but not including, the next zero into a new variable ref?
The results like:
# Running count of zeros seen so far; rows where that count is odd keep their
# id, all other rows get NA.
zeros_seen <- cumsum(aaa$x == 0)
aaa$ref <- ifelse(zeros_seen %% 2 == 1, aaa$id, NA)
aaa
# x id ref
# 1 0 1 1
# 2 0 2 NA
# 3 1 3 NA
# 4 1 4 NA
# 5 0 5 5
# 6 1 6 6
# 7 1 7 7
# 8 1 8 8
# 9 0 9 NA
# 10 1 10 NA
# 11 1 11 NA
# 12 0 12 12
# 13 1 13 13
# 14 1 14 14
An option using data.table
library(data.table)
# Convert aaa to a data.table by reference, label each run of equal x values
# with rleid(), and collect the row index of the last row of every run of zeros.
i1 <- setDT(aaa)[, grp := rleid(x)][, .I[seq_len(.N) == .N & x==0], grp]$V1
# Pair those indices off two at a time and expand each pair into the rows from
# the first index up to, but not including, the second.
# NOTE(review): an odd number of zero runs leaves the last pair with a single
# index, so x[2] would be NA - confirm inputs always pair up.
i2 <- unlist(lapply(split(i1, as.integer(gl(length(i1), 2,
length(i1)))), function(x) head(x[1]:x[2],-1)))
# Set ref = id on every row that is not in i2, drop the helper column, and print.
aaa[!i2, ref := id][, grp := NULL][]
# x id ref
# 1: 0 1 1
# 2: 0 2 NA
# 3: 1 3 NA
# 4: 1 4 NA
# 5: 0 5 5
# 6: 1 6 6
# 7: 1 7 7
# 8: 1 8 8
# 9: 0 9 NA
#10: 1 10 NA
#11: 1 11 NA
#12: 0 12 12
#13: 1 13 13
#14: 1 14 14

Generate a new group ID every time a variable changes its value [duplicate]

This question already has answers here:
Increase counter by 1 for each unique group of values
(2 answers)
Closed 4 years ago.
Have been searching for a while, but haven't found what I wanted. For the sake of exposition, suppose one has a data set like the following:
library(data.table)
set.seed(666)
# 20 rows whose value is randomly +1 or -1.
foo <- data.table(id = 1:20, value = sample(c(1, -1), 20, replace = TRUE))
id value
1: 1 -1
2: 2 1
3: 3 -1
4: 4 1
5: 5 1
6: 6 -1
7: 7 -1
8: 8 1
9: 9 1
10: 10 1
11: 11 -1
12: 12 1
13: 13 1
14: 14 1
15: 15 1
16: 16 -1
17: 17 1
18: 18 -1
19: 19 1
20: 20 1
I want to create a unique group id every time value changes, resulting in
id value grp
1: 1 -1 1
2: 2 1 2
3: 3 -1 3
4: 4 1 4
5: 5 1 4
6: 6 -1 5
7: 7 -1 5
8: 8 1 6
9: 9 1 6
10: 10 1 6
11: 11 -1 7
12: 12 1 8
13: 13 1 8
14: 14 1 8
15: 15 1 8
16: 16 -1 9
17: 17 1 10
18: 18 -1 11
19: 19 1 12
20: 20 1 12
I can do it in a loop
# Flag rows whose value equals the previous row's. The first row has no
# predecessor (the comparison is NA), so it is treated as a change.
foo[, cc := value == shift(value)][is.na(cc), cc := FALSE]
# Walk the rows, remembering in pp the row number at which the current run
# started, and tag every row with that run-start row number.
foo[, grp := NA_integer_]
pp <- NA_integer_
for (i in seq_len(nrow(foo))) {
  if (!foo$cc[i]) {
    pp <- i
  }
  foo[i, grp := pp]
}
# Compress the run-start row numbers into consecutive ids 1, 2, 3, ...
foo[, grp := as.numeric(as.factor(grp))]
Is there a smarter way of doing it?
We can use rleid
foo[, grp := rleid(value)]
foo
# id value grp
# 1: 1 -1 1
# 2: 2 1 2
# 3: 3 -1 3
# 4: 4 1 4
# 5: 5 1 4
# 6: 6 -1 5
# 7: 7 -1 5
# 8: 8 1 6
# 9: 9 1 6
#10: 10 1 6
#11: 11 -1 7
#12: 12 1 8
#13: 13 1 8
#14: 14 1 8
#15: 15 1 8
#16: 16 -1 9
#17: 17 1 10
#18: 18 -1 11
#19: 19 1 12
#20: 20 1 12

data.table: Select n specific rows before & after other rows meeting a condition

Given the following example data table:
library(data.table)
DT <- fread("grp y exclude
a 1 0
a 2 0
a 3 0
a 4 1
a 5 0
a 7 1
a 8 0
a 9 0
a 10 0
b 1 0
b 2 0
b 3 0
b 4 1
b 5 0
b 6 1
b 7 1
b 8 0
b 9 0
b 10 0
c 5 1
d 1 0")
I want to select
1. by group grp
2. all rows that have y==5
3. and up to two rows before and after each row from 2. within the grouping,
4. but for 3., only those rows that have exclude==0.
Assuming each group has max one row with y==5, this would yield the desired result for 1.-3.:
# Offsets relative to a match: two rows before, the match itself, two rows after.
idx <- -2:2
# Per group: shift every y == 5 position by each offset, drop positions that
# fall outside the group, and translate the rest to row numbers of DT via .I.
# The outer parentheses print the result as well as assigning it.
(row_numbers <- DT[, {
  window <- as.vector(outer(idx, which(y == 5), "+"))
  in_group <- window >= 1 & window <= .N
  .I[window[in_group]]
}, by = grp]$V1)
# [1] 3 4 5 6 7 12 13 14 15 16 20
DT[row_numbers]
# grp y exclude
# 1: a 3 0
# 2: a 4 1
# 3: a 5 0 # y==5 + two rows before and two rows after
# 4: a 7 1
# 5: a 8 0
# 6: b 3 0
# 7: b 4 1
# 8: b 5 0 # y==5 + two rows before and two rows after
# 9: b 6 1
# 10: b 7 1
# 11: c 5 1 # y==5 + nothing, because the group has only 1 element
However, how do I incorporate 4. so that I get
# grp y exclude
# 1: a 2 0
# 2: a 3 0
# 3: a 5 0
# 4: a 8 0
# 5: a 9 0
# 6: b 2 0
# 7: b 3 0
# 8: b 5 0
# 9: b 8 0
# 10: b 9 0
# 11: c 5 1
? Feels like I'm close, but I guess I looked too long at heads and whiches, now, so I'd be thankful for some fresh ideas.
A bit more simplified:
# Reading the chain from the inside out:
#   1. add rn, the row number within the full table, by reference (DT keeps
#      this extra column afterwards);
#   2. keep only the rows with exclude == 0 or y == 5;
#   3. within each grp of that subset, keep the rn of rows at most two
#      positions away from the y == 5 row;
#   4. use the collected rn values to index the full DT.
# NOTE(review): step 3 relies on at most one y == 5 row per group - confirm.
DT[DT[, rn := .I][exclude==0 | y==5][, rn[abs(.I - .I[y==5]) <= 2], by=grp]$V1]
# grp y exclude rn
#1: a 2 0 2
#2: a 3 0 3
#3: a 5 0 5
#4: a 8 0 7
#5: a 9 0 8
#6: b 2 0 11
#7: b 3 0 12
#8: b 5 0 14
#9: b 8 0 17
#10: b 9 0 18
#11: c 5 1 20
You are very close. This should do it:
# Restrict to candidate rows first (non-excluded rows plus the y == 5 matches),
# then take the window of offsets in idx around each match within that subset.
# With a subset in i, .I still holds row numbers of the full DT, so the result
# can index DT directly.
row_numbers <- DT[exclude == 0 | y == 5, {
  window <- as.vector(outer(idx, which(y == 5), "+"))
  .I[window[window >= 1 & window <= .N]]
}, by = grp]$V1
DT[row_numbers]

Resources