Cumulative sum by ID with lag in R

I want to create a cumulative sum by id, but it should not include the value of the row for which it is being calculated.
I've already tried cumsum; however, I don't know how to tell it not to add the amount of the row where the sum is made. The result column I am looking for is the third column, called "sum".
For example, for id 1 the first row has sum = 0, because its own amount should not be added. For id 1, row 2, sum = 100 because the total amount for id 1 before row 2 was 100, and so on.
   id amount sum
1:  1    100   0
2:  1     20 100
3:  1    150 120
4:  2     60   0
5:  2    100  60
6:  1     30 270
7:  2     40 160
This is what I've tried:
df[,sum:=cumsum(amount),
by ="id"]
data: df <- data.table(id = c(1, 1, 1, 2, 2,1,2), amount = c(100, 20,
150,60,100,30,40),sum=c(0,100,120,0,60,270,160) ,stringsAsFactors =
FALSE)

You can do this without using lag:
df %>%
  group_by(id) %>%
  mutate(sum = cumsum(amount) - amount)
# A tibble: 7 x 3
# Groups:   id [2]
     id amount   sum
  <dbl>  <dbl> <dbl>
1     1    100     0
2     1     20   100
3     1    150   120
4     2     60     0
5     2    100    60
6     1     30   270
7     2     40   160
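Since the question started from data.table, the same no-lag idea ports over directly; here's a minimal sketch on the question's data:
library(data.table)
df <- data.table(id = c(1, 1, 1, 2, 2, 1, 2),
                 amount = c(100, 20, 150, 60, 100, 30, 40))
# running total within each id, minus each row's own amount
df[, sum := cumsum(amount) - amount, by = id]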

With dplyr:
df %>%
  group_by(id) %>%
  mutate(sum = lag(cumsum(amount), default = 0)) %>%
  ungroup()
# A tibble: 7 x 3
id amount sum
<dbl> <dbl> <dbl>
1 1 100 0
2 1 20 100
3 1 150 120
4 2 60 0
5 2 100 60
6 1 30 270
7 2 40 160
Thanks to @thelatemail, here's the data.table version:
df[, sum := cumsum(shift(amount, fill=0)), by=id]

Here is an option in base R
df$Sum <- with(df, ave(amount, id, FUN = cumsum) - amount)
df$Sum
#[1] 0 100 120 0 60 270 160
Or, remove each group's last observation, take the cumsum, and prepend a 0:
with(df, ave(amount, id, FUN = function(x) c(0, cumsum(x[-length(x)]))))
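A quick check that the two base R variants agree on this data (s1 and s2 are just throwaway names):
s1 <- with(df, ave(amount, id, FUN = cumsum) - amount)
s2 <- with(df, ave(amount, id, FUN = function(x) c(0, cumsum(x[-length(x)]))))
identical(s1, s2)
#[1] TRUE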

You can shift the values you're summing by using the lag function.
library(tidyverse)
df <- data.frame(id = c(1, 1, 1, 2, 2, 1, 2),
                 amount = c(100, 20, 150, 60, 100, 30, 40),
                 sum = c(0, 100, 120, 0, 60, 270, 160),
                 stringsAsFactors = FALSE)
df %>%
  group_by(id) %>%
  mutate(sum = cumsum(lag(amount, 1, default = 0)))
# A tibble: 7 x 3
# Groups: id [2]
id amount sum
<dbl> <dbl> <dbl>
1 1 100 0
2 1 20 100
3 1 150 120
4 2 60 0
5 2 100 60
6 1 30 270
7 2 40 160


Remove if unit only has one observation

I have a long form of clinical data that looks something like this:
patientid <- c(100,100,100,101,101,101,102,102,102,104,104,104)
outcome <- c(1,1,1,1,1,NA,1,NA,NA,NA,NA,NA)
time <- c(1,2,3,1,2,3,1,2,3,1,2,3)
Data <- data.frame(patientid=patientid, outcome=outcome, time=time)
A patient should be kept in the database only if they have 2 or 3 observations (so patients that have complete data for 0 or only 1 time point should be thrown out). So for this example my desired result is this:
patientid <- c(100,100,100,101,101,101)
outcome <- c(1,1,1,1,1,NA)
time <- c(1,2,3,1,2,3)
Data <- data.frame(patientid=patientid, outcome=outcome, time=time)
Hence patients 102 and 104 are thrown out of the database because they were missing the outcome variable for 2 or 3 of the time points.
We can count the non-NA elements per group by summing a logical vector, grouped by 'patientid', and keep only the patientids having more than one non-NA 'outcome':
library(dplyr)
Data %>%
  group_by(patientid) %>%
  filter(sum(!is.na(outcome)) > 1) %>%
  ungroup()
Output:
# A tibble: 6 x 3
# patientid outcome time
# <dbl> <dbl> <dbl>
#1 100 1 1
#2 100 1 2
#3 100 1 3
#4 101 1 1
#5 101 1 2
#6 101 NA 3
A base R option using subset + ave
subset(
  Data,
  ave(!is.na(outcome), patientid, FUN = sum) > 1
)
giving
patientid outcome time
1 100 1 1
2 100 1 2
3 100 1 3
4 101 1 1
5 101 1 2
6 101 NA 3
A data.table option
setDT(Data)[, Y := sum(!is.na(outcome)), patientid][Y > 1, ][, Y := NULL][]
Or a simpler one (thanks @akrun):
setDT(Data)[Data[, .I[sum(!is.na(outcome)) > 1], .(patientid)]$V1]
which gives
patientid outcome time
1: 100 1 1
2: 100 1 2
3: 100 1 3
4: 101 1 1
5: 101 1 2
6: 101 NA 3
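A note on how the .I idiom above works, for anyone new to it: within the grouped call, .I holds that group's row numbers in the original table, and subsetting it with a length-one logical keeps all of them when the condition is TRUE and none otherwise. Unpacked into steps (a sketch; idx is just an illustrative name):
library(data.table)
setDT(Data)
# V1 collects the surviving row numbers across all groups
idx <- Data[, .I[sum(!is.na(outcome)) > 1], by = patientid]$V1
Data[idx]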
library(dplyr)
Data %>%
  group_by(patientid) %>%
  mutate(observation = sum(!is.na(outcome))) %>% # count the non-NA observations per patient
  filter(observation >= 2) %>%
  ungroup()
output:
# A tibble: 6 x 4
patientid outcome time observation
<dbl> <dbl> <dbl> <dbl>
1 100 1 1 3
2 100 1 2 3
3 100 1 3 3
4 101 1 1 2
5 101 1 2 2
6 101 NA 3 2

dplyr solution for cumsum with a factor reset

I need a dplyr solution that creates a cumsum column.
# Input dataframe
df <- data.frame(OilChanged = c("No","No","Yes","No","No","No","No","No","No","No","No","Yes","No"),
                 Odometer = c(300,350,410,420,430,450,500,600,600,600,650,660,700))
# Create difference column - first row starting with zero
df <- df %>% dplyr::mutate(Odometer_delta = Odometer - lag(Odometer, default = Odometer[1]))
I'm trying to make the cumulative sum reset based on the factor column.
The result needs to be exactly like this.
# Wanted result dataframe
df <- data.frame(OilChanged = c("No","No","Yes","No","No","No","No","No","No","No","No","Yes","No"),
                 Odometer = c(300,350,410,420,430,450,500,600,600,600,650,660,700),
                 Diff = c(0,50,60,10,10,20,50,100,0,0,50,10,40),
                 CumSum = c(0,50,110,10,20,40,90,190,190,190,240,250,40))
You can create a new group every time OilChanged == 'Yes' (lagged, so that each 'Yes' row closes out the old group) and take the cumsum of Diff in each group.
library(dplyr)
df %>%
  group_by(grp = lag(cumsum(OilChanged == 'Yes'), default = 0)) %>%
  mutate(newcumsum = cumsum(Diff)) %>%
  ungroup() %>%
  select(-grp)
# OilChanged Odometer Diff CumSum newcumsum
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 No 300 0 0 0
# 2 No 350 50 50 50
# 3 Yes 410 60 110 110
# 4 No 420 10 10 10
# 5 No 430 10 20 20
# 6 No 450 20 40 40
# 7 No 500 50 90 90
# 8 No 600 100 190 190
# 9 No 600 0 190 190
#10 No 600 0 190 190
#11 No 650 50 240 240
#12 Yes 660 10 250 250
#13 No 700 40 40 40
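The same trick should carry over to data.table if that's your preference, with shift() playing the role of lag() (a hedged sketch; grp is a throwaway grouping expression and newcumsum an illustrative name):
library(data.table)
setDT(df) # the wanted-result df above, which already has the Diff column
df[, newcumsum := cumsum(Diff),
   by = .(grp = shift(cumsum(OilChanged == "Yes"), fill = 0))]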

Use `dplyr` to divide rows by group

In my attempt to learn dplyr, I want to divide each row by another row representing the corresponding group's total.
I generated test data with
library(dplyr)
# building test data
data("OrchardSprays")
totals <- OrchardSprays %>%
  group_by(treatment) %>%
  summarise(decrease = sum(decrease))
totals$decrease <- totals$decrease + seq(10, 80, 10)
totals$rowpos <- totals$colpos <- "total"
df <- rbind(OrchardSprays, totals)
Note the line totals$decrease <- totals$decrease + seq(10, 80, 10): for the sake of the question, I assumed there was an additional decrease for each treatment, which was not observed in the single lines of the data frame but only in the "total" lines for each group.
What I now want to do is add another column, treatment_decrease, to the data frame, where each line's decrease value is divided by the corresponding treatment group's total decrease value.
So, for head(df) I would expect output like this:
> head(df)
decrease rowpos colpos treatment treatment_decrease
1 57 1 1 D 0.178125
2 95 2 1 E 0.1711712
3 8 3 1 B 0.09876543
4 69 4 1 H 0.08603491
5 92 5 1 G 0.1488673
6 90 6 1 F 0.1470588
My real world example is a bit more complex (more group variables and also more levels), therefore I am looking for a suitable solution in dplyr.
Here's a total dplyr approach:
library(dplyr) #version >= 1.0.0
OrchardSprays %>%
group_by(treatment) %>%
summarise(decrease = sum(decrease)) %>%
mutate(decrease = decrease + seq(10, 80, 10),
rowpos = "total",
colpos = "total") %>%
bind_rows(mutate(OrchardSprays, across(rowpos:colpos, as.character))) %>%
group_by(treatment) %>%
mutate(treatment_decrease = decrease / decrease[rowpos == "total"])
# A tibble: 72 x 5
# Groups: treatment [8]
treatment decrease rowpos colpos treatment_decrease
<fct> <dbl> <chr> <chr> <dbl>
1 A 47 total total 1
2 B 81 total total 1
3 C 232 total total 1
4 D 320 total total 1
5 E 555 total total 1
6 F 612 total total 1
7 G 618 total total 1
8 H 802 total total 1
9 D 57 1 1 0.178
10 E 95 2 1 0.171
# … with 62 more rows
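If the totals already sit in your data frame as marker rows, as in the question's df, another option is to pull them out and join them back on (a sketch under that assumption; total is an illustrative column name):
library(dplyr)
df %>%
  left_join(df %>% filter(rowpos == "total") %>% select(treatment, total = decrease),
            by = "treatment") %>%
  mutate(treatment_decrease = decrease / total) %>%
  select(-total)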

R and dplyr: group by value ranges

Hi everyone, I have a dataframe like this:
value count
<dbl> <dbl>
1 1 10
2 2 20
3 3 30
4 4 40
5 5 50
6 6 60
I would like to divide my observations into intervals (of width 2, for example). The first and last intervals must also catch any observations that fall outside the range:
  interval count
  <???>    <dbl>
1 [<1, 2]     30
2 [3, 4]      70
3 [5, >6]    110
Is it possible to do this with dplyr?
You can use cut() to create a grouping variable with which to summarise count.
library(dplyr)
df %>%
  group_by(grp = cut(value, c(-Inf, 2, 4, Inf))) %>%
  summarise(count = sum(count))
# A tibble: 3 x 2
grp count
<fct> <int>
1 (-Inf,2] 30
2 (2,4] 70
3 (4, Inf] 110
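If you want labels closer to the ones sketched in the question, cut() also takes a labels argument (the label strings here are just illustrative):
df %>%
  group_by(grp = cut(value, c(-Inf, 2, 4, Inf),
                     labels = c("[<1, 2]", "[3, 4]", "[5, >6]"))) %>%
  summarise(count = sum(count))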

Calculate a team's lead at regular intervals based on scoring data

I have a set of scoring data from a bunch of hockey games and I'm stuck at a stage of my analysis. I'm trying to plot the home team's lead for every ten minutes of every game.
Here's an example of where I've gotten my dataset so far:
library(tidyverse)
# Generate example data ordered by gameid and event_ts
game <- tibble(event_type = "goal",
               event_ts = runif(n = 1000, min = 0, max = 60),
               team = sample(c("home", "away"), size = 1000, replace = TRUE, prob = c(0.55, 0.45)),
               gameid = sample(100:300, size = 1000, replace = TRUE)) %>%
  arrange(gameid, event_ts)
I know that I can get the final score of each game using summarise. Here's a quick example that assumes the home team scores at least one goal in every game:
game %>%
  group_by(gameid, team) %>%
  summarise(goals = n()) %>%
  spread(key = team, value = goals) %>%
  mutate(away = ifelse(is.na(away), 0, away)) # spread() fills gaps with NA, so test is.na(), not is.null()
I'd like to figure out the home team's lead (positive or negative) at ten minute intervals throughout the game. That requires summing all scoring that has happened up to that point. Here's an example of the structure I'd like to get:
finished_demo <- tibble(
  gameid = sort(rep_len(seq(100, 300, 1), 1206)),
  timestamp = rep(seq(10, 60, 10), 201),
  home_lead = round(runif(n = 1206, min = -5, max = 7))
) %>%
  arrange(gameid, timestamp)
Here's one way of accomplishing it using data.table, IIUC:
require(data.table)
setDT(game) # generated with the OP's code, but with set.seed(1L)
key <- CJ(gameid=unique(game$gameid), start=1L, end=(1:6)*10L)
ans <- game[key, on=.(gameid, event_ts >= start, event_ts <= end), # (1)
.(home_lead=sum(team == "home")-sum(team == "away")), # (2)
by=.EACHI] # (3)
head(ans)
# gameid event_ts event_ts home_lead
# 1: 100 1 10 NA
# 2: 100 1 20 1
# 3: 100 1 30 0
# 4: 100 1 40 0
# 5: 100 1 50 -1
# 6: 100 1 60 -2
You can rename the duplicate column names; a quick sketch follows at the end of this answer.
(1) searches for the row indices in game that match each row of key, under the conditions provided in the on argument.
(2) computes the lead of home team.
(3) .EACHI informs that the home team lead should be computed on matching rows of game for every row of key.
NA implies there were no matching events. If necessary, they can be replaced with 0 by doing:
ans[is.na(home_lead), home_lead := 0L]
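And for the duplicate column names mentioned above, a quick sketch of the rename (start and end are just descriptive names for what the two joined event_ts columns hold):
setnames(ans, 2:3, c("start", "end"))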
I'm fairly sure someone could rewrite this with the nested structure found in purrr; a sketch follows after the output. Note this gives a different nrow() from the results above (with the same data), so there's no guarantee the solution is right.
game %>%
  group_by(gameid) %>%
  do(data.frame(time = 10 * (1:(max(.$event_ts) %/% 10)))) %>%
  apply(1, function(x) {
    g <- unlist(x[1])
    t <- unlist(x[2])
    game %>%
      filter(gameid == g, event_ts < t) %>%
      group_by(gameid, team) %>%
      summarise(goals = n()) %>%
      mutate(time = t)
  }) %>%
  bind_rows() %>%
  spread(key = team, value = goals) %>%
  mutate_all(as.numeric) %>%
  mutate(away = ifelse(is.na(away), 0, away),
         home = ifelse(is.na(home), 0, home))
gameid time away home
<int> <dbl> <dbl> <dbl>
1 100 10 0 1
2 100 20 1 3
3 100 30 1 3
4 101 20 0 1
5 101 30 1 1
6 101 40 1 2
7 101 50 1 2
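For the record, here is roughly what that purrr rewrite might look like (a hedged sketch, not verified to match the output above; crossing() builds every gameid/time pair and map2_dbl() counts goals before each cutoff):
library(tidyverse)
crossing(gameid = unique(game$gameid), time = seq(10, 60, 10)) %>%
  mutate(home_lead = map2_dbl(gameid, time, function(g, t) {
    scored <- filter(game, gameid == g, event_ts < t)
    sum(scored$team == "home") - sum(scored$team == "away")
  }))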
How about this?
game %>%
  mutate(ten_min = event_ts %/% 10,
         homegoal = if_else(team == 'home', 1, -1)) %>%
  group_by(ten_min, gameid) %>%
  summarize(home_lead_interval = sum(homegoal)) %>%
  ungroup() %>%
  group_by(gameid) %>%
  mutate(home_lead = cumsum(home_lead_interval)) %>%
  arrange(gameid, ten_min)
# Source: local data frame [683 x 4]
# Groups: gameid [198]
#
# ten_min gameid home_lead_interval home_lead
# <dbl> <int> <dbl> <dbl>
# 1 0 100 0 0
# 2 1 100 -1 -1
# 3 2 100 -3 -4
# 4 3 100 -1 -5
# 5 4 100 2 -3
# 6 5 100 -1 -4
# 7 1 101 1 1
# 8 2 101 1 2
# 9 4 101 -2 0
# 10 0 102 1 1
# # ... with 673 more rows
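One caveat: intervals in which nobody scored simply get no row (note game 101 jumps from ten_min 2 to 4 above). If every interval must be present, tidyr::complete() can fill the gaps before the cumsum (a hedged sketch of that variation):
library(tidyverse)
game %>%
  mutate(ten_min = event_ts %/% 10,
         homegoal = if_else(team == 'home', 1, -1)) %>%
  count(gameid, ten_min, wt = homegoal, name = "home_lead_interval") %>%
  complete(gameid, ten_min = 0:5, fill = list(home_lead_interval = 0)) %>%
  group_by(gameid) %>%
  mutate(home_lead = cumsum(home_lead_interval)) %>%
  ungroup()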
My idea is to get the home and away score for each 10-minute interval. Then you can group the data.frame by gameid and create the result you want.
set.seed(123)
# Generate example data ordered by gameid and event_ts
game <- tibble(event_type = "goal",
               event_ts = runif(n = 1000, min = 0, max = 60),
               team = sample(c("home", "away"), size = 1000, replace = TRUE, prob = c(0.55, 0.45)),
               gameid = sample(100:300, size = 1000, replace = TRUE)) %>%
  arrange(gameid, event_ts)
# Bucket event_ts into 10-minute intervals and make sure every interval is present
library(reshape2) # needed for dcast()
hl <- game %>%
  mutate(event_ts = ceiling(event_ts / 10) * 10) %>%
  dcast(gameid + event_ts ~ team, length) %>%
  right_join(expand.grid(gameid = unique(game$gameid), event_ts = seq(10, 60, 10)))
hl$away[is.na(hl$away)] <- 0
hl$home[is.na(hl$home)] <- 0
# Get the home lead
hl <- hl %>%
  arrange(gameid, event_ts) %>%
  group_by(gameid) %>%
  mutate(away = cumsum(away),
         home = cumsum(home),
         home_lead = home - away)
# Check games 100 and 101
game %>% filter(gameid %in% 100:101)
# A tibble: 7 × 4
event_type event_ts team gameid
<chr> <dbl> <chr> <int>
1 goal 30.460972 home 100
2 goal 57.270219 home 100
3 goal 1.126093 home 101
4 goal 27.879957 home 101
5 goal 33.086101 home 101
6 goal 42.497419 away 101
7 goal 45.649418 home 101
hl %>% filter(gameid %in% 100:101)
Source: local data frame [12 x 5]
Groups: gameid [2]
gameid event_ts away home home_lead
<int> <dbl> <dbl> <dbl> <dbl>
1 100 10 0 0 0
2 100 20 0 0 0
3 100 30 0 0 0
4 100 40 0 1 1
5 100 50 0 1 1
6 100 60 0 2 2
7 101 10 0 1 1
8 101 20 0 1 1
9 101 30 0 2 2
10 101 40 0 3 3
11 101 50 1 4 3
12 101 60 1 4 3
