I have data on hospital admissions per patient. I am trying to add up the price of care for patients who were re-admitted to hospital within 5 days.
This is an example dataset:
(
  dt <- data.frame(
    id = c(1, 1, 2, 2, 3, 4),
    admit_date = c(1, 9, 5, 9, 10, 20),
    price = c(10, 20, 20, 30, 15, 16)
  )
)
# id admit_date price
# 1 1 1 10
# 2 1 9 20
# 3 2 5 20
# 4 2 9 30
# 5 3 10 15
# 6 4 20 16
And this is what I have tried so far:
library(dplyr)
# 5-day readmission:
dt %>%
  group_by(id) %>%
  arrange(id, admit_date) %>%
  mutate(
    duration = admit_date - lag(admit_date),
    readmit = ifelse(duration < 6, 1, 0)
  ) %>%
  group_by(id, readmit) %>% # this is where I get stuck
  summarize(sumprice = sum(price))
# # A tibble: 6 × 3
# # Groups: id [4]
# id readmit sumprice
# <dbl> <dbl> <dbl>
# 1 1 0 20
# 2 1 NA 10
# 3 2 1 30
# 4 2 NA 20
# 5 3 NA 15
# 6 4 NA 16
And this is what I would like to have:
# id sum_price
# 1 1 10
# 2 1 20
# 3 2 50
# 4 3 15
# 5 4 16
If the difference in days between adjacent visits is greater than 5, return TRUE; if not, return FALSE (the lag default is Inf, so the first visit's difference is -Inf, and -Inf > 5 is FALSE). After that, for each individual we take a cumulative sum of these logicals to label the groups. We finally summarize within each individual, using this cumsum as the grouping variable for by():
dt |>
  group_by(id) |>
  arrange(id, admit_date) |>
  summarise(
    sum_price = by(
      price,
      cumsum((admit_date - lag(admit_date, default = Inf)) > 5),
      sum
    )
  ) |>
  ungroup()
# # A tibble: 5 × 2
# id sum_price
# <dbl> <by>
# 1 1 10
# 2 1 20
# 3 2 50
# 4 3 15
# 5 4 16
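If you'd rather avoid the by object in the result (note the <by> column type above), here is a sketch of the same logic using only dplyr verbs; episode is just an assumed name for the helper column:
dt %>%
  arrange(id, admit_date) %>%
  group_by(id) %>%
  # TRUE starts a new care episode; cumsum() turns the flags into episode labels
  mutate(episode = cumsum(coalesce(admit_date - lag(admit_date), Inf) > 5)) %>%
  group_by(id, episode) %>%
  summarise(sum_price = sum(price), .groups = "drop") %>%
  select(-episode)
This returns sum_price as a plain numeric column rather than a <by> object.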
So, you want (at most) one row per patient in the final data frame, which means you should group on just id.
Then, for each patient, you calculate whether that patient has any row with readmit == 1.
Finally, you filter out any patient that wasn't readmitted from your summarized data frame.
Putting it all together, it might look like:
dt %>%
  group_by(id) %>%
  arrange(id, admit_date) %>%
  mutate(
    duration = admit_date - lag(admit_date),
    readmit = ifelse(duration < 6, 1, 0)
  ) %>%
  group_by(id) %>% # group by just 'id' to get one row per patient
  summarize(
    sumprice = sum(price, na.rm = TRUE),
    is_readmit = any(readmit == 1) # any 'readmit' row makes this a readmit patient
  ) %>%
  filter(is_readmit) %>% # filter out any non-readmit patients
  select(-is_readmit) # get rid of the `is_readmit` helper column
Which should result in:
# A tibble: 1 x 2
id sumprice
<dbl> <dbl>
1 2 50
I have a dataset with financial data. Sometimes, a product gets refunded, resulting in a negative count of the product (so the money gets returned). I want to conditionally filter these rows out of the dataset.
Example:
library(tidyverse)
set.seed(1)
df <- tibble(
  count = sample(c(-1, 1), 80, replace = TRUE, prob = c(.2, .8)),
  id = rep(1:4, 20)
)
df %>%
  group_by(id) %>%
  summarize(total = sum(count))
# A tibble: 4 x 2
id total
<int> <dbl>
1 1 10
2 2 14
3 3 16
4 4 10
id = 1 has 15 positive counts and 5 negative ones (15 - 5 = 10). I want to keep the 10 positive-value rows in df with id = 1.
id = 2 has 17 positive counts and 3 negative ones (17 - 3 = 14). I want to keep the 14 positive-value rows in df with id = 2.
In the end, this condition should be TRUE: nrow(df) == sum(df$count)
Unfortunately, a filtering join such as anti_join() removes all the rows, and I cannot think of another way to filter the tibble.
Thanks for helping me!
You can "uncount" using the total column to get the number of repeats of each row.
df %>%
  group_by(id) %>%
  summarize(total = sum(count)) %>%
  uncount(total) %>%
  mutate(count = 1)
#> # A tibble: 50 x 2
#> id count
#> <int> <dbl>
#> 1 1 1
#> 2 1 1
#> 3 1 1
#> 4 1 1
#> 5 1 1
#> 6 1 1
#> 7 1 1
#> 8 1 1
#> 9 1 1
#> 10 1 1
#> # ... with 40 more rows
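If you would rather keep the original positive rows than rebuild them, here is a sketch of one possible alternative (keep_n is an assumed helper column): for each id, keep only the first sum(count) positive rows, which drops one positive row per refund.
df %>%
  group_by(id) %>%
  mutate(keep_n = sum(count)) %>%   # net count per id, computed before filtering
  filter(count == 1) %>%            # keep only the positive rows
  slice(seq_len(first(keep_n))) %>% # keep net-count many of them per id
  select(-keep_n) %>%               # assumes each id has a positive net count
  ungroup()
Afterwards nrow(result) == sum(df$count) should hold.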
In Stata, these are the commands I use:
bysort ID : egen numberhead = total(relationship == 1)
assert numberhead == 1
* households with more than one head
list ID relationship if numberhead >= 2
* households without a head
list ID relationship if numberhead < 1
How can I achieve the same in R?
structure(list(ID = c("SS/CR/BIA/ABEYOONG/1/0001/05", "SS/CR/BIA/ABEYOONG/1/0001/03",
"SS/CR/BIA/ABEYOONG/1/0001/04", "SS/CR/BIA/ABEYOONG/1/0001/02",
"SS/CR/BIA/ABEYOONG/1/0001/01", "SS/CR/BIA/ABEYOONG/1/0002/01",
"SS/CR/BIA/ABEYOONG/1/0002/04", "SS/CR/BIA/ABEYOONG/1/0002/03",
"SS/CR/BIA/ABEYOONG/1/0002/05", "SS/CR/BIA/ABEYOONG/1/0002/02",
"SS/CR/BIA/ABEYOONG/1/0003/01", "SS/CR/BIA/ABEYOONG/1/0003/03",
"SS/CR/BIA/ABEYOONG/1/0003/05", "SS/CR/BIA/ABEYOONG/1/0003/04",
"SS/CR/BIA/ABEYOONG/1/0003/02", "SS/CR/BIA/ABEYOONG/1/0004/02",
"SS/CR/BIA/ABEYOONG/1/0004/07", "SS/CR/BIA/ABEYOONG/1/0004/06",
"SS/CR/BIA/ABEYOONG/1/0004/05", "SS/CR/BIA/ABEYOONG/1/0004/04",
"SS/CR/BIA/ABEYOONG/1/0004/03", "SS/CR/BIA/ABEYOONG/1/0004/01",
"SS/CR/BIA/ABEYOONG/1/0005/01"), relationship = c(3, 3, 3, 2,
1, 1, 10, 3, 11, 2, 1, 3, 3, 3, 3, 3, 11, 3, 3, 3, 3, 1, 1)), row.names = c(NA,
-23L), class = c("tbl_df", "tbl", "data.frame"))
Alright, I hope this is what you were looking for, Aquila:
# libraries
library(tidyverse)
# collect data
df <-
structure(
list(
ID = c(
"SS/CR/BIA/ABEYOONG/1/0001/05",
"SS/CR/BIA/ABEYOONG/1/0001/03",
"SS/CR/BIA/ABEYOONG/1/0001/04",
"SS/CR/BIA/ABEYOONG/1/0001/02",
"SS/CR/BIA/ABEYOONG/1/0001/01",
"SS/CR/BIA/ABEYOONG/1/0002/01",
"SS/CR/BIA/ABEYOONG/1/0002/04",
"SS/CR/BIA/ABEYOONG/1/0002/03",
"SS/CR/BIA/ABEYOONG/1/0002/05",
"SS/CR/BIA/ABEYOONG/1/0002/02",
"SS/CR/BIA/ABEYOONG/1/0003/01",
"SS/CR/BIA/ABEYOONG/1/0003/03",
"SS/CR/BIA/ABEYOONG/1/0003/05",
"SS/CR/BIA/ABEYOONG/1/0003/04",
"SS/CR/BIA/ABEYOONG/1/0003/02",
"SS/CR/BIA/ABEYOONG/1/0004/02",
"SS/CR/BIA/ABEYOONG/1/0004/07",
"SS/CR/BIA/ABEYOONG/1/0004/06",
"SS/CR/BIA/ABEYOONG/1/0004/05",
"SS/CR/BIA/ABEYOONG/1/0004/04",
"SS/CR/BIA/ABEYOONG/1/0004/03",
"SS/CR/BIA/ABEYOONG/1/0004/01",
"SS/CR/BIA/ABEYOONG/1/0005/01"
),
relationship = c(3, 3, 3, 2,
1, 1, 10, 3, 11, 2, 1, 3, 3, 3, 3, 3, 11, 3, 3, 3, 3, 1, 1)
),
row.names = c(NA,-23L),
class = c("tbl_df", "tbl", "data.frame")
)
#--------- by sort ID : egen numberhead = total(relationship == 1) ----------
# using data object named df
df %>%
  group_by(ID) %>%              # bysort
  filter(relationship == 1) %>% # keep only the head-of-household rows
  summarise(numberhead = n())   # create a new variable
# # A tibble: 5 × 2
# ID numberhead
# <chr> <int>
# 1 SS/CR/BIA/ABEYOONG/1/0001/01 1
# 2 SS/CR/BIA/ABEYOONG/1/0002/01 1
# 3 SS/CR/BIA/ABEYOONG/1/0003/01 1
# 4 SS/CR/BIA/ABEYOONG/1/0004/01 1
# 5 SS/CR/BIA/ABEYOONG/1/0005/01 1
# of the individual IDs that have relationship == 1,
# there is one observation of each
# stopifnot() is the closest thing to Stata's assert;
# to validate this result, you could check that there are
# 5 observations with relationship == 1:
df %>%
  filter(relationship == 1) %>%
  nrow() # number of rows
# [1] 5
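If you also want the script to stop on a violated assumption, a minimal assert-style sketch using the count validated above:
# stopifnot() throws an error when its condition is not all TRUE
stopifnot(sum(df$relationship == 1) == 5)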
#--------- List ID relationship if numberhead >= 2 ----------
# this one is simpler
df %>%
  filter(relationship >= 2)
# # A tibble: 18 × 2
# ID relationship
# <chr> <dbl>
# 1 SS/CR/BIA/ABEYOONG/1/0001/05 3
# 2 SS/CR/BIA/ABEYOONG/1/0001/03 3
# 3 SS/CR/BIA/ABEYOONG/1/0001/04 3
# 4 SS/CR/BIA/ABEYOONG/1/0001/02 2
# 5 SS/CR/BIA/ABEYOONG/1/0002/04 10
# 6 SS/CR/BIA/ABEYOONG/1/0002/03 3
# 7 SS/CR/BIA/ABEYOONG/1/0002/05 11
# 8 SS/CR/BIA/ABEYOONG/1/0002/02 2
# 9 SS/CR/BIA/ABEYOONG/1/0003/03 3
# 10 SS/CR/BIA/ABEYOONG/1/0003/05 3
# 11 SS/CR/BIA/ABEYOONG/1/0003/04 3
# 12 SS/CR/BIA/ABEYOONG/1/0003/02 3
# 13 SS/CR/BIA/ABEYOONG/1/0004/02 3
# 14 SS/CR/BIA/ABEYOONG/1/0004/07 11
# 15 SS/CR/BIA/ABEYOONG/1/0004/06 3
# 16 SS/CR/BIA/ABEYOONG/1/0004/05 3
# 17 SS/CR/BIA/ABEYOONG/1/0004/04 3
# 18 SS/CR/BIA/ABEYOONG/1/0004/03 3
# If you want to see only the unique IDs
df %>%
  filter(relationship >= 2) %>%
  select(ID) %>%
  distinct()
# however every ID is distinct in this data,
# so the results won't look different
#--------- List ID relationship if numberhead < 1 ----------
df %>%
  filter(relationship < 1)
# # A tibble: 0 × 2
# # … with 2 variables: ID <chr>, relationship <dbl>
# no results
#--------- see it all at one time? ----------
df %>%
  mutate(relates = cut(relationship,
                       c(0, 1, max(relationship)))) %>%
  group_by(relates, ID) %>%
  summarise(n()) %>%
  print(n = nrow(df)) # a tbl_df pretty-prints only a few rows in the console;
                      # this call lets you see it all
# # A tibble: 23 × 3
# # Groups: relates [2]
# relates ID `n()`
# <fct> <chr> <int>
# 1 (0,1] SS/CR/BIA/ABEYOONG/1/0001/01 1
# 2 (0,1] SS/CR/BIA/ABEYOONG/1/0002/01 1
# 3 (0,1] SS/CR/BIA/ABEYOONG/1/0003/01 1
# 4 (0,1] SS/CR/BIA/ABEYOONG/1/0004/01 1
# 5 (0,1] SS/CR/BIA/ABEYOONG/1/0005/01 1
# 6 (1,11] SS/CR/BIA/ABEYOONG/1/0001/02 1
# 7 (1,11] SS/CR/BIA/ABEYOONG/1/0001/03 1
# 8 (1,11] SS/CR/BIA/ABEYOONG/1/0001/04 1
# 9 (1,11] SS/CR/BIA/ABEYOONG/1/0001/05 1
# 10 (1,11] SS/CR/BIA/ABEYOONG/1/0002/02 1
# 11 (1,11] SS/CR/BIA/ABEYOONG/1/0002/03 1
# 12 (1,11] SS/CR/BIA/ABEYOONG/1/0002/04 1
# 13 (1,11] SS/CR/BIA/ABEYOONG/1/0002/05 1
# 14 (1,11] SS/CR/BIA/ABEYOONG/1/0003/02 1
# 15 (1,11] SS/CR/BIA/ABEYOONG/1/0003/03 1
# 16 (1,11] SS/CR/BIA/ABEYOONG/1/0003/04 1
# 17 (1,11] SS/CR/BIA/ABEYOONG/1/0003/05 1
# 18 (1,11] SS/CR/BIA/ABEYOONG/1/0004/02 1
# 19 (1,11] SS/CR/BIA/ABEYOONG/1/0004/03 1
# 20 (1,11] SS/CR/BIA/ABEYOONG/1/0004/04 1
# 21 (1,11] SS/CR/BIA/ABEYOONG/1/0004/05 1
# 22 (1,11] SS/CR/BIA/ABEYOONG/1/0004/06 1
# 23 (1,11] SS/CR/BIA/ABEYOONG/1/0004/07 1
Using the data you have provided, here is how to pull it directly into R. Note that I assume ID is column 1 and that what you are calling relationship is the column hhsize.
For collecting the data, you can pull it directly from your personal computer drive or directly from the web.
library(openxlsx)
# from your computer
df2 <- read.xlsx("/path/in/your/computer/file.xlsx")
# if there were more than one sheet, you would designate which sheet
# from the web
# for Dropbox, look in the URL for "dl=0"
# you have to change that to "dl=1" for a direct download
df3 <- read.xlsx("https://www.dropbox.com/scl/fi/73dw92bpcjio3m1k0w5vv/Round-11th-19-08-2020.xlsx?dl=1&rlkey=2xxtyge3rppi0aikkl8nlt6oc")
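For completeness, a hedged example of designating a sheet (the sheet number here is made up):
df2 <- read.xlsx("/path/in/your/computer/file.xlsx", sheet = 2)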
If you really wanted to rename the columns you can do that this way:
names(df2)[1] <- "ID"
Is this what you are looking for?
#----- perhaps looking for this ------
df3[, c(1, 17)] %>% # only look at IDs and household size
  distinct() %>% # ignore duplicates, when both fields match
  mutate(relates = cut(hhsize, # add factor for ranges
                       c(0, 1, 2, max(hhsize)),
                       include.lowest = TRUE)) %>%
  group_by(relates) %>% # group only by household-size ranges
  summarise(count = n()) # show the count per case
# # A tibble: 3 × 2
# relates count
# <fct> <int>
# 1 [0,1] 505
# 2 (1,2] 1736
# 3 (2,25] 15771
I'm working on a problem that basically consists of summing rows by their ID, adding up some specific variables to get a consolidated dataset to feed into another piece of work. There is an issue with the sum function here, and I'd appreciate an explanation of it.
Dataset:
teste <- data.frame(
  ID = c(1, 1, 2, 1, 3, 3, 2),
  VALUE = c(10, 10, 10, 10, 10, 10, 10),
  MOD = c(1, 1, 1, 1, 1, 1, 1)
)
ID VALUE MOD
1 1 10 1
2 1 10 1
3 2 10 1
4 1 10 1
5 3 10 1
6 3 10 1
7 2 10 1
Using + operator:
teste %>%
  group_by(ID) %>%
  summarise_all(sum, na.rm = TRUE) %>%
  mutate(CONS = VALUE + MOD)
# A tibble: 3 x 4
ID VALUE MOD CONS
<dbl> <dbl> <dbl> <dbl>
1 1 30 3 33
2 2 20 2 22
3 3 20 2 22
Using sum function:
teste %>%
  group_by(ID) %>%
  summarise_all(sum, na.rm = TRUE) %>%
  mutate(CONS = sum(VALUE, MOD))
# A tibble: 3 x 4
ID VALUE MOD CONS
<dbl> <dbl> <dbl> <dbl>
1 1 30 3 77
2 2 20 2 77
3 3 20 2 77
summarize_all removes one level of grouping, so its result here is ungrouped: sum(VALUE, MOD) is then computed over all rows at once (30 + 20 + 20 + 3 + 2 + 2 = 77) and recycled to every row. Re-group it:
teste %>%
  group_by(ID) %>%
  summarise_all(sum, na.rm = TRUE) %>%
  group_by(ID) %>% # <--------------------------
  mutate(CONS = sum(VALUE, MOD)) %>%
  ungroup()
giving:
# A tibble: 3 x 4
ID VALUE MOD CONS
<dbl> <dbl> <dbl> <dbl>
1 1 30 3 33
2 2 20 2 22
3 3 20 2 22
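As an aside, summarise_all() is superseded in current dplyr; here is a sketch of the same computation with across(), where the vectorised + sidesteps the regrouping question entirely:
teste %>%
  group_by(ID) %>%
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE))) %>%
  mutate(CONS = VALUE + MOD) # vectorised +, so no regrouping is needed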
library(tidyverse)
df <- tibble(a = as.factor(1:20), b = c(50, 20, 13, rep(2, 10), rep(1, 7)))
How do I make dplyr look at this data frame df and collapse all the occurrences of 2 into a single summed group, and all the occurrences of 1 into another single summed group, while keeping the rest of the data frame as it is?
Turn this:
# A tibble: 20 x 2
a b
<fct> <dbl>
1 1 50
2 2 20
3 3 13
4 4 2
5 5 2
6 6 2
7 7 2
8 8 2
9 9 2
10 10 2
11 11 2
12 12 2
13 13 2
14 14 1
15 15 1
16 16 1
17 17 1
18 18 1
19 19 1
20 20 1
into this:
# A tibble: 5 x 2
a b
<fct> <dbl>
1 1 50
2 2 20
3 3 13
4 grp2 20
5 grp1 7
[Edit] - I fixed the example data. Sorry about that.
We group by a manufactured sortkey to maintain sort order. Here we use the fact that b is in descending order in the input; if that is not the case in your actual data, replace sortkey = -b with the more general sortkey = data.table::rleid(b), or the longer sortkey = cumsum(coalesce(b != lag(b), FALSE)).
We also convert b to the group names, giving a new a. It wasn't clear which groups are to be converted to grp... form. Hard-coded 1 and 2? Any group with more than one row? Groups at the end with more than one row? At any rate, it would be easy enough to change the condition in the if_else once that were clarified.
Finally, we perform the summation and then remove the sortkey.
df %>%
  group_by(sortkey = -b, a = paste0(if_else(b %in% 1:2, "grp", ""), b)) %>%
  summarize(b = sum(b)) %>%
  ungroup() %>%
  select(-sortkey)
giving:
# A tibble: 5 x 2
a b
<chr> <dbl>
1 50 50
2 20 20
3 13 13
4 grp2 20
5 grp1 7
Here's a way. I have converted a from factor to character to make things easier. You can convert it back to factor if you want. Also your test data was a bit wrong.
df <- tibble(a = as.character(1:20), b = c(50, 20, 13, rep(2, 10), rep(1, 7)))
df %>%
  mutate(
    a = case_when(
      b == 1 ~ "grp1",
      b == 2 ~ "grp2",
      TRUE ~ a
    )
  ) %>%
  group_by(a) %>%
  summarise(b = sum(b))
# A tibble: 5 x 2
a b
<chr> <dbl>
1 1 50
2 2 20
3 3 13
4 grp1 7
5 grp2 20
This approach gives you the desired names for the groups and doesn't require you to decide in advance how many such groups you need (it would create grp3, grp4, ... depending on the values in b).
library(dplyr)
df %>%
  mutate(
    grp = as.numeric(lag(b) != b),
    grp = cumsum(ifelse(is.na(grp), 0, grp))
  ) %>%
  group_by(grp) %>%
  mutate(
    a = ifelse(n() > 1, paste0("grp", b), a),
    b = sum(b)
  ) %>%
  ungroup() %>%
  distinct(a, b)
Output:
a b
<chr> <dbl>
1 1 50
2 2 20
3 3 13
4 grp2 20
5 grp1 7
Note that the code could also be condensed, but that leads to a certain loss of readability in my opinion:
df %>%
  group_by(grp = cumsum(ifelse(is.na(as.numeric(lag(b) != b)), 0, as.numeric(lag(b) != b)))) %>%
  mutate(
    a = ifelse(n() > 1, paste0("grp", b), a),
    b = sum(b)
  ) %>%
  ungroup() %>%
  distinct(a, b)
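On dplyr 1.1 or later, consecutive_id() does this run labelling in one step; a sketch of the same idea under that assumption:
df %>%
  group_by(grp = consecutive_id(b)) %>% # one group per run of equal b values
  summarise(
    a = if (n() > 1) paste0("grp", first(b)) else as.character(first(a)),
    b = sum(b)
  ) %>%
  select(-grp)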
I have a set of scoring data from a bunch of hockey games and I'm stuck at a stage of my analysis. I'm trying to plot the home team's lead for every ten minutes of every game.
Here's an example of where I've gotten my dataset so far:
library(tidyverse)
# Generate example data ordered by gameid and event_ts
game <- tibble(
  event_type = "goal",
  event_ts = runif(n = 1000, min = 0, max = 60),
  team = sample(c("home", "away"), size = 1000, replace = TRUE, prob = c(0.55, 0.45)),
  gameid = sample(100:300, size = 1000, replace = TRUE)
) %>%
  arrange(gameid, event_ts)
I know that I can get the final score of each game using summarise. Here's a quick example that assumes both teams score at least one goal in every game:
game %>%
  group_by(gameid, team) %>%
  summarise(goals = n()) %>%
  spread(key = team, value = goals) %>%
  mutate(away = ifelse(is.na(away), 0, away)) # spread() fills missing combinations with NA, not NULL
I'd like to figure out the home team's lead (positive or negative) at ten minute intervals throughout the game. That requires summing all scoring that has happened up to that point. Here's an example of the structure I'd like to get:
finished_demo <- tibble(
  gameid = sort(rep_len(seq(100, 300, 1), 1206)),
  timestamp = rep(seq(10, 60, 10), 201),
  home_lead = round(runif(n = 1206, min = -5, max = 7))
) %>%
  arrange(gameid, timestamp)
Here's one way of accomplishing it using data.table, IIUC:
require(data.table)
setDT(game) # generated with op's code but with a seed(1L)
key <- CJ(gameid=unique(game$gameid), start=1L, end=(1:6)*10L)
ans <- game[key, on=.(gameid, event_ts >= start, event_ts <= end), # (1)
.(home_lead=sum(team == "home")-sum(team == "away")), # (2)
by=.EACHI] # (3)
head(ans)
# gameid event_ts event_ts home_lead
# 1: 100 1 10 NA
# 2: 100 1 20 1
# 3: 100 1 30 0
# 4: 100 1 40 0
# 5: 100 1 50 -1
# 6: 100 1 60 -2
You can rename the duplicate column names (I'll fix this when I get time to work on it).
(1) searches for row indices in game that matches for every row in key while matching on the conditions provided under on argument.
(2) computes the lead of home team.
(3) .EACHI informs that the home team lead should be computed on matching rows of game for every row of key.
NA implies there were no matching events; if necessary, they can be replaced with 0 by doing:
ans[is.na(home_lead), home_lead := 0L]
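For the renaming mentioned above, setnames() changes the names by reference; a small sketch (the new names are my assumption):
setnames(ans, c("gameid", "start", "end", "home_lead"))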
I'm fairly sure someone could rewrite this with the nested data structures found in purrr. It gives a different nrow() from the results above (with the same data), so there is no guarantee this solution is right.
game %>%
  group_by(gameid) %>%
  do(data.frame(time = 10 * (1:(max(.$event_ts) %/% 10)))) %>%
  apply(1, function(x) {
    g <- x[1]
    t <- x[2]
    game %>%
      filter(gameid == g, event_ts < t) %>%
      group_by(gameid, team) %>%
      summarise(goals = n()) %>%
      mutate(time = t)
  }) %>%
  bind_rows() %>%
  spread(key = team, value = goals) %>%
  mutate_all(as.numeric) %>%
  mutate(
    away = ifelse(is.na(away), 0, away),
    home = ifelse(is.na(home), 0, home)
  )
gameid time away home
<int> <dbl> <dbl> <dbl>
1 100 10 0 1
2 100 20 1 3
3 100 30 1 3
4 101 20 0 1
5 101 30 1 1
6 101 40 1 2
7 101 50 1 2
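For what it's worth, here is a hedged purrr-style sketch of the same brute-force idea (same game data assumed, with tidyverse loaded; not benchmarked against the version above):
game %>%
  distinct(gameid) %>%
  crossing(time = seq(10, 60, 10)) %>% # every game/interval pair
  mutate(home_lead = map2_dbl(
    gameid, time,
    # for each game and cutoff, count goals scored so far
    ~ with(filter(game, gameid == .x, event_ts <= .y),
           sum(team == "home") - sum(team == "away"))
  ))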
How about this?
game %>%
  mutate(
    ten_min = event_ts %/% 10,
    homegoal = if_else(team == 'home', 1, -1)
  ) %>%
  group_by(ten_min, gameid) %>%
  summarize(home_lead_interval = sum(homegoal)) %>%
  ungroup() %>%
  group_by(gameid) %>%
  mutate(home_lead = cumsum(home_lead_interval)) %>%
  arrange(gameid, ten_min)
# Source: local data frame [683 x 4]
# Groups: gameid [198]
#
# ten_min gameid home_lead_interval home_lead
# <dbl> <int> <dbl> <dbl>
# 1 0 100 0 0
# 2 1 100 -1 -1
# 3 2 100 -3 -4
# 4 3 100 -1 -5
# 5 4 100 2 -3
# 6 5 100 -1 -4
# 7 1 101 1 1
# 8 2 101 1 2
# 9 4 101 -2 0
# 10 0 102 1 1
# # ... with 673 more rows
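One caveat: intervals in which no goals were scored are simply absent from this output (hence 683 rows rather than six per game). If you need every interval, tidyr::complete() can fill the gaps; a sketch assuming the pipeline above was saved as leads:
leads %>%
  ungroup() %>%
  complete(gameid, ten_min = 0:5, fill = list(home_lead_interval = 0)) %>%
  group_by(gameid) %>%
  arrange(ten_min, .by_group = TRUE) %>%
  mutate(home_lead = cumsum(home_lead_interval)) # recompute the running lead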
My idea is to get the home and away score for each 10-minute interval. Then you can group the data frame by gameid and build the result you want.
library(reshape2) # for dcast(), which is not part of the tidyverse
set.seed(123)
# Generate example data ordered by gameid and event_ts
game <- tibble(
  event_type = "goal",
  event_ts = runif(n = 1000, min = 0, max = 60),
  team = sample(c("home", "away"), size = 1000, replace = TRUE, prob = c(0.55, 0.45)),
  gameid = sample(100:300, size = 1000, replace = TRUE)
) %>%
  arrange(gameid, event_ts)
# Round each event_ts up to its 10-minute mark and get all 10-minute intervals
hl <- game %>%
  mutate(event_ts = ceiling(event_ts / 10) * 10) %>%
  dcast(gameid + event_ts ~ team, length) %>%
  right_join(expand.grid(gameid = unique(game$gameid), event_ts = seq(10, 60, 10)))
hl$away[is.na(hl$away)] <- 0
hl$home[is.na(hl$home)] <- 0
# Get the home lead
hl <- hl %>%
  arrange(gameid, event_ts) %>%
  group_by(gameid) %>%
  mutate(
    away = cumsum(away),
    home = cumsum(home),
    home_lead = home - away
  )
# Check the game 100 and 101
game %>% filter(gameid %in% 100:101)
# A tibble: 7 × 4
event_type event_ts team gameid
<chr> <dbl> <chr> <int>
1 goal 30.460972 home 100
2 goal 57.270219 home 100
3 goal 1.126093 home 101
4 goal 27.879957 home 101
5 goal 33.086101 home 101
6 goal 42.497419 away 101
7 goal 45.649418 home 101
hl %>% filter(gameid %in% 100:101)
Source: local data frame [12 x 5]
Groups: gameid [2]
gameid event_ts away home home_lead
<int> <dbl> <dbl> <dbl> <dbl>
1 100 10 0 0 0
2 100 20 0 0 0
3 100 30 0 0 0
4 100 40 0 1 1
5 100 50 0 1 1
6 100 60 0 2 2
7 101 10 0 1 1
8 101 20 0 1 1
9 101 30 0 2 2
10 101 40 0 3 3
11 101 50 1 4 3
12 101 60 1 4 3