Dynamic Cumsum with condition

Dynamic Cumsum with condition - r

I have the following problem I am trying to solve. Here some example data:
library(tidyverse)
library(lubridate)
date <- data.frame(date=seq(ymd('2018-01-01'),ymd('2018-02-28'), by = '1 day'))
group <- data.frame(group=c("A","B"))
subgroup <- data.frame(subgroup=c("C","D"))
DF <- merge(merge(date,group,by=NULL),subgroup,by=NULL)
DF$group_value <- apply(DF, 1, function(x) sample(8:12,1))
DF$subgroup_value <- apply(DF, 1, function(x) sample(1:5,1))
DF <- DF %>%
arrange(date,group,subgroup)
I now want to calculate the following:
for every given day t, group and subgroup combination calculate the
number of days until the (backward) cumsum of subgroup_value is equal
or greater than the group value of day t.
I know how to do that by using some for loops and some dplyr functionality, but this is just terrible slow:
for(i in seq(1,nrow(date),1)) {
for(j in seq(1,nrow(group),1)) {
for(k in seq(1,nrow(subgroup),1)) {
tmp <- DF %>%
filter(date<=date[i] & group == group[j] & subgroup == subgroup[k]) %>%
arrange(desc(date))
tmp$helper <- 1
tmp <- tmp %>%
mutate(
cs_helper = cumsum(helper),
cs_subgroup_value = cumsum(subgroup_value),
nr_days = case_when (
cs_subgroup_value >= group_value ~ cs_helper,
TRUE ~ NA_real_)
)
#this is the final result for date[i], group[j], subgroup[k]
value <- min(tmp[,"nr_days"], na.rm=T)
}
}
}
Example
head(DF,10)
date group subgroup group_value subgroup_value result
1 2018-01-01 A C 12 2 NA
2 2018-01-02 A C 11 4 NA
3 2018-01-03 A C 11 4 NA
4 2018-01-04 A C 9 5 2
5 2018-01-05 A C 12 5 3
6 2018-01-06 A C 10 3 3
7 2018-01-07 A C 12 5 3
8 2018-01-08 A C 8 1 3
9 2018-01-09 A C 12 4 4
10 2018-01-10 A C 9 1 4
So for row 10, I need to sum the last 4 values of subgroup to be greater or equal to 9.
I am sure that this code can be highly optimized by using some vectorized version but I am struggle to find a good starting point for that (As you can see from the code above, I am a newbie in R)
My question is: How would you approach this problem in order to vectorize it for speed optimisation?
Thanks!
Stephan

Here's an attempt, take a copy of each group/subgroups data frame, and cross join to the data. This is then filtered to only find the days before. This allows us for each day to calculate all of the cumulative sums
DF %>%
group_by(group, subgroup) %>%
mutate(day = row_number(), J = TRUE) %>%
nest() %>%
arrange(group, subgroup) %>%
mutate(data = map(data, function(d) {
inner_join(d, transmute(d, x = day, v = subgroup_value, J), by = "J") %>%
filter(day >= x) %>%
mutate(x = day - x + 1) %>%
arrange(day, x) %>%
group_by(date, group_value, date, subgroup_value) %>%
mutate(vv = cumsum(v),
vv = ifelse(vv >= group_value, vv, NA),
xx = ifelse(!is.na(vv), x, NA)) %>%
group_by(date, group_value, day, subgroup_value) %>%
summarise(x = min(xx, na.rm = TRUE), v = min(vv, na.rm = TRUE))
})) %>%
unnest()

Related

Entering additional rows in dataframe as mid points between values of adjacent existing rows

Say you have the following time series dataset in r:
n <- 3
set.seed(1)
df <- data.frame(Day = rep("Mon", n),
Time = 1:n,
Temper = round(rnorm(n, 4, 2), 0))
print(df)
Day Time Temper
Mon 1 3
Mon 2 4
Mon 3 2
Now say you wish to add midpoints between adjacent existing values as extra rows in the data frame. For example, say you wish to add the exact midpoint between every pair of adjacent values, to produce the following new dataframe:
Day Time Temper
Mon 1 3
Mon 1.5 3.5
Mon 2 4
Mon 2.5 3
Mon 3 2
What would be some efficient R code that could accomplish this on much larger datasets?
It would be great if this code could also populate the dataframe with values which are not just the exact midpoints, for example the 'one-third' datapoint:
Day Time Temper
Mon 1 3
Mon 1.33 3.33
Mon 2 4
Mon 2.33 3.33
Mon 3 2

Another solution:
library(tidyverse)
df %>%
slice(rep(1:n(), each = 2)) %>%
mutate_at(c("Time", "Temper"), function(x) {
replace(x, seq(2, n(), 2),
(x + (1 / 3) * (lead(x) - lag(x)))[seq(2, n(), 2)])
}) %>%
mutate_at(c("Time", "Temper"), round, 2) %>%
slice(-n())
#> Day Time Temper
#> 1 Mon 1.00 3.00
#> 2 Mon 1.33 3.33
#> 3 Mon 2.00 4.00
#> 4 Mon 2.33 3.33
#> 5 Mon 3.00 2.00

Here is an idea using dplyr and purrr. We first rbind rows of NA and we then fill those NAs, i.e.
library(tidyverse)
df %>%
group_by(Day) %>%
map_dfr(rbind, NA) %>%
fill(Day) %>%
mutate_at(vars(c(2, 3)), funs(replace(., is.na(.), (1/2) * (lag(.) + lead(.))[is.na(.)] ))) %>%
na.omit()
which gives,
# A tibble: 5 x 3
Day Time Temper
<int> <dbl> <dbl>
1 1 1 3
2 1 1.5 3.5
3 1 2 4
4 1 2.5 3
5 1 3 2

You may want to benchmark (cf library microbenchmark) those 3 solutions, depending on the numbers of factors and numeric variables you have.
Using base R
n <- 3
set.seed(1)
dframe <- data.frame(Day = rep("Mon", n),
Time = 1:n,
Temper = round(rnorm(n, 4, 2), 0))
# --- convert factor to numeric
mframe <- as.data.frame(sapply(dframe, as.numeric))
# --- function to use on variables
pfun <- function(x, coef = 1/4){
# x <- mframe$Time ; coef <- .25 ;
newp <- x[1:(length(x)-1)] + diff(x, lag = 1) * coef
res <- c(rbind(x[1:(length(x) -1) ], newp) , x[length(x)] )
return( res )
}
# --- base R way
# pfun( mframe$Time )
# sapply(mframe, pfun, .5)
apply(mframe, 2, pfun)
dframe_final <- as.data.frame ( apply(mframe, 2, pfun) )
# str(dframe_final)
# --- get Day's or other factors back
for(col in names(dframe)[sapply(dframe, is.factor)]){
dframe_final[[col]] <- factor(dframe_final[[col]])
levels( dframe_final[[col]] ) <- levels(dframe[[col]])
}
dplyr
# --- dplyr way
library(dplyr)
library(purrr)
lfactors <- dframe %>%
map_if(is.factor, levels)
dframe2 <- dframe %>%
as_tibble %>%
map_dfr(as.numeric) %>%
map_dfr(pfun) %>%
mutate_at(.vars = names(dframe)[sapply(dframe, is.factor)], .funs = factor)
# --- get Day's or other factors back
for(col in names(dframe)[sapply(dframe, is.factor)]){
dframe2[[col]] <- factor(dframe2[[col]])
levels( dframe2[[col]] ) <- levels(dframe[[col]])
}
data.table
# --- data.table way
library(data.table)
dframe3 <- data.table(dframe)
dframe3 <- dframe3[ , lapply(.SD, as.numeric)]
dframe3 <- dframe3[ , lapply(.SD, pfun)]
# --- get Day's or other factors back
for(col in names(dframe)[sapply(dframe, is.factor)]){
dframe3[ , (col) := factor(get(col)) ]
levels( dframe3[[col]] ) <- levels(dframe[[col]])
}

"Rolling" Regression in R

Say I want to run regressions per group whereby I want to use the last 5 year data as input for that regression. Then, for each next year, I would like to "shift" the input for that regression by one year (i.e., 4 observations).
From those regressions I want to extract both the R2 and the fitted values/residuals, which I then need in subsequent regressions that follow similar notions.
I have some code working using loops, but it is not really elegant nor efficient for large datasets. I assume there must be a nice plyr way for resolving this issue.
# libraries #
library(dplyr)
library(broom)
# reproducible data #
df <- tibble(ID = as.factor(rep(c(1, 2), each = 40)),
YEAR = rep(rep(c(2001:2010), each = 4), 2),
QTR = rep(c(1:4), 20),
DV = rnorm(80),
IV = DV * rnorm(80))
# output vector #
output = tibble(ID = NA,
YEAR = NA,
R2 = NA)
# loop #
k = 1
for (i in levels(df$ID)){
n_row = df %>%
arrange(ID) %>%
filter(ID == i) %>%
nrow()
for (j in seq(1, (n_row - 19), by = 4)){
output[k, 1] = i
output[k, 2] = df %>%
filter(ID == i) %>%
slice((j + 19)) %>%
select(YEAR) %>%
unlist()
output[k, 3] = df %>%
filter(ID == i) %>%
slice(j:(j + 19)) %>%
do(model = lm(DV ~ IV, data = .)) %>%
glance(model) %>%
ungroup() %>%
select(r.squared) %>%
ungroup()
k = k + 1
}
}

Define a function which returns the year and R squared given a subset of rows of df (without ID) and then use rollapply with it.
library(dplyr)
library(zoo)
R2 <- function(x) {
x <- as.data.frame(x)
c(YEAR = tail(x$YEAR, 1), R2 = summary(lm(DV ~ IV, x))$r.squared)
}
df %>%
group_by(ID) %>%
do(data.frame(rollapply(.[-1], 20, by = 4, R2, by.column = FALSE))) %>%
ungroup
giving:
# A tibble: 12 x 3
ID YEAR R2
<fct> <dbl> <dbl>
1 1 2005 0.0133
2 1 2006 0.130
3 1 2007 0.0476
4 1 2008 0.0116
5 1 2009 0.00337
6 1 2010 0.00570
7 2 2005 0.0481
8 2 2006 0.00527
9 2 2007 0.0158
10 2 2008 0.0303
11 2 2009 0.235
12 2 2010 0.116

R Sum numbers within string

I have a question:
I have a dataset like this simple example:
df<-data.frame(ID=c("A","B","C","D"),
Score=c("15","16/18/19+2/6","3/+2","19/18/14"))
I want to end up with a dataset that has split the score numbers. I have a problem with the /+2 part. when it says "3/+2"it actually means: "3/3+2" which would finally give "3/5". So what I would like some help with, is to end up with a dataset like this:
ID Score
A 15
B 16/18/19/21/6
C 3/5
D 19/18/14
I already found out that I can then seperate the score by
df<-df %>%
mutate(Score = strsplit(as.character(ID), "/")) %>%
unnest(Score)
But I don't know how I can let the numbers duplicate and then sum when /+ occurs, could someone help me?

It could be probably solved in a more elegant way, but here is one possibility:
df %>%
mutate(Score = strsplit(as.character(Score), "/")) %>%
unnest() %>%
rowwise() %>%
mutate(Score = eval(parse(text = paste0(Score)))) %>%
group_by(ID) %>%
mutate(Score = paste0(Score, collapse = "/")) %>%
distinct()
ID Score
<fct> <chr>
1 A 15
2 B 16/18/21/6
3 C 3/5
4 D 19/18/14
Sample data:
df <- data.frame(ID=c("A","B","C","D"),
Score=c("15","16/18/19+2/6","3/3+2","19/18/14"))
It splits "Score" based on /, converts characters to expression by parse() and then transforms it back.
Using the data you provided and the pattern from #A. Suliman:
df %>%
mutate(Score = strsplit(gsub("(\\d+)/*\\+(\\d+)","\\1/\\1+\\2", Score), "/")) %>%
unnest() %>%
rowwise() %>%
mutate(Score = eval(parse(text = paste0(Score)))) %>%
group_by(ID) %>%
mutate(Score = paste0(Score, collapse = "/")) %>%
distinct()
ID Score
<fct> <chr>
1 A 15
2 B 16/18/19/21/6
3 C 3/5
4 D 19/18/14

We can use gsubfn to do this in a compact way
library(gsubfn)
library(tidyverse)
df %>%
mutate(Score = gsubfn("\\d+\\+\\d+", ~ eval(parse(text = x)), Score))
# ID Score
#1 A 15
#2 B 16/18/21/6
#3 C 3/5
#4 D 19/18/14
data
df <- data.frame(ID=c("A","B","C","D"),
Score=c("15","16/18/19+2/6","3/3+2","19/18/14"), stringsAsFactors = FALSE)

library(dplyr)
library(tidyr) #separate_rows, no need for unnest
df %>% rowwise()%>%
mutate(Score_upd=paste0(sapply(unlist(strsplit(gsub('(\\d+)/*\\+(\\d+)','\\1/\\1+\\2',Score),'/')),
function(x)eval(parse(text = x))),collapse = '/')) %>%
separate_rows(Score_upd,sep = '/')
#short version
df %>% mutate(Score=gsub('(\\d+)/*\\+(\\d+)','\\1/\\1+\\2',Score)) %>%
separate_rows(Score,sep='/') %>% rowwise() %>% mutate(Score=eval(parse(text=Score))) %>%
group_by(ID) %>% summarise(Score=paste0(Score,collapse = '/'))
# A tibble: 4 x 2
ID Score
<fct> <chr>
1 A 15
2 B 16/18/19/21/6
3 C 3/5
4 D 19/18/14
The main idea is using gsub to separate 2+3 correctly, e.g:
gsub('(\\d+)/*\\+(\\d+)','\\1/\\1+\\2','20/8/2+3') #/* means 0 or 1 occurence of / e.g, 19+2 and 3/+2.
[1] "20/8/2/2+3"
Then
valid_str <- gsub('(\\d+)/*\\+(\\d+)','\\1/\\1+\\2','20/8/2+3')
sapply(unlist(strsplit(valid_str,'/')),function(x) eval(parse(text=x)))
20 8 2 2+3
20 8 2 5
#OR
sapply(unlist(strsplit(valid_str,'/')),function(x) sum(as.numeric(unlist(strsplit(x,'\\+')))))
20 8 2 2+3
20 8 2 5

Find difference between maximum value of group and current row with R

Similar to this question here, I am trying to find the difference between the maximum value of a group and the value of the current row.
For instance, if I have the following dataset:
ID <- c(1,1,1,2,2,2,2,3,3)
Value <- c(2,3,5,2,5,8,17,3,5)
group <- data.frame(Subject=ID, pt=Value)
How would I go about creating a new column called "diff" that would be the difference between the value of the current row and the maximum value in that group?
Thank you for your help!

The OP has tried a data.table solution. Here, we benefit from grouping and updating by reference simultaneously.
library(data.table)
setDT(group)[, diff := max(pt) - pt, by = Subject][]
Subject pt diff
1: 1 2 3
2: 1 3 2
3: 1 5 0
4: 2 2 15
5: 2 5 12
6: 2 8 9
7: 2 17 0
8: 3 3 2
9: 3 5 0
Data
ID <- c(1,1,1,2,2,2,2,3,3)
Value <- c(2,3,5,2,5,8,17,3,5)
group <- data.frame(Subject=ID, pt=Value)
Benchmark
At the time of writing, 5 answers were posted, including Frank's comment on the efficiency of th data.table approach. So, I was wondering which of the five solutions were the fastest.
r2evans
mine
Frank
harelhan
JonMinton
Some solutions modify the data.frame in place. To ensure a fair comparison, In addition,
The OP has required to create a new column called "diff". For comparison, all results should return group with three columns. Some answers were modified accordingly. The answer of harelhan required substantial modifications to remove the errors.
As group is modified, all benchmark runs start with a fresh copy of group with two columns.
The benchmark is parameterized over the number of rows and the share of groups, i.e., the number of groups varies with the problem size in order to scale.
library(data.table)
library(dplyr)
library(bench)
bm <- press(
# n_row = c(1E2, 1E4, 1E5, 1E6),
n_row = c(1E2, 1E4, 1E5),
grp_share = c(0.01, 0.1, 0.5, 0.9),
{
n_grp <- grp_share * n_row
set.seed(1)
group0 <- data.frame(
Subject = sample(n_grp, n_row, TRUE),
pt = as.numeric(rpois(n_row, 100)))
mark(
r2Evans = {
group <- copy(group0)
group <- group %>%
group_by(Subject) %>%
mutate(diff = max(pt) - pt)
group
},
Uwe = {
group <- copy(group0)
setDT(group)[, diff := max(pt) - pt, by = Subject]
group
},
Frank = {
group <- copy(group0)
setDT(group)[, mx := max(pt), by=Subject][, diff := mx - pt][, mx := NULL]
group
},
harelhan = {
group <- copy(group0)
max_group <- group %>% group_by(Subject) %>% summarize(max_val = max(pt))
group <- left_join(group, max_group[, c("Subject", "max_val")], by = "Subject")
group$diff <- group$max_val - group$pt
group <- group %>% select(-max_val)
group
},
JonMinton = {
group <- copy(group0)
group <- group %>%
group_by(Subject) %>%
mutate(max_group_val = max(pt)) %>%
ungroup() %>%
mutate(diff = max_group_val - pt) %>%
select(-max_group_val)
group
}
)
}
)
ggplot2::autoplot(bm)

Using your example data and breaking the logic into smaller steps:
library(dplyr)
ID <- c(1,1,1,2,2,2,2,3,3)
Value <- c(2,3,5,2,5,8,17,3,5)
group <- data.frame(Subject=ID, pt=Value)
max_group <- group %>% group_by(ID) %>% summarize(max_val = max(Value))
group <- left_join(group, max_group[,c(ID, max_val)], by = ID)
group$diff <- group$max_val - group$Value
Hope this solves the problem.

Based on harelhan's answer, but with piping:
require(dplyr)
df <- data_frame(
id = c(1,1,1,2,2,2,2,3,3),
value = c(2,3,5,2,5,8,17,3,5)
)
df %>%
group_by(id) %>%
mutate(max_group_val = max(value)) %>%
ungroup() %>%
mutate(diff_frm_group_max = max_group_val - value)
A tibble: 9 x 4
id value max_group_val diff_frm_group_max
<dbl> <dbl> <dbl> <dbl>
1 1 2 5 3
2 1 3 5 2
3 1 5 5 0
4 2 2 17 15
5 2 5 17 12
6 2 8 17 9
7 2 17 17 0
8 3 3 5 2
9 3 5 5 0

How can I create a column that cumulatively adds the sum of two previous rows based on conditions?

I tried asking this question before but was it was poorly stated. This is a new attempt cause I haven't solved it yet.
I have a dataset with winners, losers, date, winner_points and loser_points.
For each row, I want two new columns, one for the winner and one for the loser that shows how many points they have scored so far (as both winners and losers).
Example data:
winner <- c(1,2,3,1,2,3,1,2,3)
loser <- c(3,1,1,2,1,1,3,1,2)
date <- c("2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05","2017-10-06","2017-10-07","2017-10-08","2017-10-09")
winner_points <- c(2,1,2,1,2,1,2,1,2)
loser_points <- c(1,0,1,0,1,0,1,0,1)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points)
I want the output to be:
winner_points_sum <- c(0, 0, 1, 3, 1, 3, 5, 3, 5)
loser_points_sum <- c(0, 2, 2, 1, 4, 5, 4, 7, 4)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points, winner_points_sum, loser_points_sum)
How I've solved it thus far is to do a for loop such as:
library(dplyr)
test_data$winner_points_sum_loop <- 0
test_data$loser_points_sum_loop <- 0
for(i in row.names(test_data)) {
test_data[i,]$winner_points_sum_loop <-
(
test_data %>%
dplyr::filter(winner == test_data[i,]$winner & date < test_data[i,]$date) %>%
dplyr::summarise(points = sum(winner_points, na.rm = TRUE))
+
test_data %>%
dplyr::filter(loser == test_data[i,]$winner & date < test_data[i,]$date) %>%
dplyr::summarise(points = sum(loser_points, na.rm = TRUE))
)
}
test_data$winner_points_sum_loop <- unlist(test_data$winner_points_sum_loop)
Any suggestions how to tackle this problem? The queries take quite some time when the row numbers add up. I've tried elaborating with the AVE function, I can do it for one column to sum a players point as winner but can't figure out how to add their points as loser.

winner <- c(1,2,3,1,2,3,1,2,3)
loser <- c(3,1,1,2,1,1,3,1,2)
date <- c("2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05","2017-10-06","2017-10-07","2017-10-08","2017-10-09")
winner_points <- c(2,1,2,1,2,1,2,1,2)
loser_points <- c(1,0,1,0,1,0,1,0,1)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points)
library(dplyr)
library(tidyr)
test_data %>%
unite(winner, winner, winner_points) %>% # unite winner columns
unite(loser, loser, loser_points) %>% # unite loser columns
gather(type, pl_pts, winner, loser, -date) %>% # reshape
separate(pl_pts, c("player","points"), convert = T) %>% # separate columns
arrange(date) %>% # order dates (in case it's not)
group_by(player) %>% # for each player
mutate(sum_points = cumsum(points) - points) %>% # get points up to that date
ungroup() %>% # forget the grouping
unite(pl_pts_sumpts, player, points, sum_points) %>% # unite columns
spread(type, pl_pts_sumpts) %>% # reshape
separate(loser, c("loser", "loser_points", "loser_points_sum"), convert = T) %>% # separate columns and give appropriate names
separate(winner, c("winner", "winner_points", "winner_points_sum"), convert = T) %>%
select(winner, loser, date, winner_points, loser_points, winner_points_sum, loser_points_sum) # select the order you prefer
# # A tibble: 9 x 7
# winner loser date winner_points loser_points winner_points_sum loser_points_sum
# * <int> <int> <date> <int> <int> <int> <int>
# 1 1 3 2017-10-01 2 1 0 0
# 2 2 1 2017-10-02 1 0 0 2
# 3 3 1 2017-10-03 2 1 1 2
# 4 1 2 2017-10-04 1 0 3 1
# 5 2 1 2017-10-05 2 1 1 4
# 6 3 1 2017-10-06 1 0 3 5
# 7 1 3 2017-10-07 2 1 5 4
# 8 2 1 2017-10-08 1 0 3 7
# 9 3 2 2017-10-09 2 1 5 4

I finally understood what you want. And I took an approach of getting cumulative points of each player at each point in time and then joining it to the original test_data data frame.
winner <- c(1,2,3,1,2,3,1,2,3)
loser <- c(3,1,1,2,1,1,3,1,2)
date <- c("2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05","2017-10-06","2017-10-07","2017-10-08","2017-10-09")
winner_points <- c(2,1,2,1,2,1,2,1,2)
loser_points <- c(1,0,1,0,1,0,1,0,1)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points)
library(dplyr)
library(tidyr)
cum_points <- test_data %>%
gather(end_game_status, player_id, winner, loser) %>%
gather(which_point, how_many_points, winner_points, loser_points) %>%
filter(
(end_game_status == "winner" & which_point == "winner_points") |
(end_game_status == "loser" & which_point == "loser_points")) %>%
arrange(date = as.Date(date)) %>%
group_by(player_id) %>%
mutate(cumulative_points = cumsum(how_many_points)) %>%
mutate(cumulative_points_sofar = lag(cumulative_points, default = 0))
select(player_id, date, cumulative_points)
output <- test_data %>%
left_join(cum_points, by = c('date', 'winner' = 'player_id')) %>%
rename(winner_points_sum = cumulative_points_sofar) %>%
left_join(cum_points, by = c('date', 'loser' = 'player_id')) %>%
rename(loser_points_sum = cumulative_points_sofar)
output

The difference to the previous question of the OP is that the OP is now asking for the cumulative sum of points each player has scored so far, i.e., before the actual date. Furthermore, the sample data set now contains a date column which uniquely identifies each row.
So, my previous approach can be used here as well, with some modifications. The solution below reshapes the data from wide to long format whereby two value variables are reshaped simultaneously, computes the cumulative sums for each player id , and finally reshapes from long back to wide format, again. In order to sum only points scored before the actual date, the rows are lagged by one.
It is important to note that the winner and loser columns contain the respective player ids.
library(data.table)
cols <- c("winner", "loser")
setDT(test_data)[
# reshape multiple value variables simultaneously from wide to long format
, melt(.SD, id.vars = "date",
measure.vars = list(cols, paste0(cols, "_points")),
value.name = c("id", "points"))][
# rename variable column
, variable := forcats::lvls_revalue(variable, cols)][
# order by date and cumulate the lagged points by id
order(date), points_sum := cumsum(shift(points, fill = 0)), by = id][
# reshape multiple value variables simultaneously from long to wide format
, dcast(.SD, date ~ variable, value.var = c("id", "points", "points_sum"))]
date id_winner id_loser points_winner points_loser points_sum_winner points_sum_loser
1: 2017-10-01 1 3 2 1 0 0
2: 2017-10-02 2 1 1 0 0 2
3: 2017-10-03 3 1 2 1 1 2
4: 2017-10-04 1 2 1 0 3 1
5: 2017-10-05 2 1 2 1 1 4
6: 2017-10-06 3 1 1 0 3 5
7: 2017-10-07 1 3 2 1 5 4
8: 2017-10-08 2 1 1 0 3 7
9: 2017-10-09 3 2 2 1 5 4

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Dynamic Cumsum with condition - r

Related

Entering additional rows in dataframe as mid points between values of adjacent existing rows

"Rolling" Regression in R

R Sum numbers within string

Find difference between maximum value of group and current row with R

How can I create a column that cumulatively adds the sum of two previous rows based on conditions?

Categories

Resources