Create ranges by accumulating values in R

I have the data frame DF1:
KEY <- c(11,12,22,33,44,55,66,77,88,99,1010,1111,1212,1313,1414,1515,1616,1717,1818,1919,2020)
PRICE <- c(0,0,1,5,7,10,20,80,110,111,200,1000,2500,2799,3215,4999,7896,8968,58914,78422,96352)
DF1 <- data.frame(KEY,PRICE)
I want to group DF1 into ranges, accumulating the values of the two columns (count the KEY column and sum the PRICE column). This is the result I hope for:
INTERVAL <-c('0','UP_TO_10','UP_TO_100','UP_TO_1000','UP_TO_5000','UP_TO_10000','UP_TO_100000')
COUNT_KEY <-c(2,6,8,12,16,18,21)
SUM_PRICE <- c(0,23,123,1544,15057,31921,265609)
DF2 <- data.frame(INTERVAL,COUNT_KEY,SUM_PRICE)
How do I make this table?

If you have a vector of limits or thresholds, such as:
LIMITS <- c(0, 10, 100, 1000, 5000, 10000, 100000)
You could obtain a count of rows where PRICE is at or below each limit:
unlist(lapply(LIMITS, function(x) sum(DF1$PRICE <= x)))
[1] 2 6 8 12 16 18 21
And a sum of these prices as well:
unlist(lapply(LIMITS, function(x) sum(DF1$PRICE[DF1$PRICE <= x])))
[1] 0 23 123 1544 15057 31921 265609
Is this what you had in mind?
This is everything all together:
LIMITS <- c(0, 10, 100, 1000, 5000, 10000, 100000)
COUNT_KEY <- unlist(lapply(LIMITS, function(x) sum(DF1$PRICE <= x)))
SUM_PRICE <- unlist(lapply(LIMITS, function(x) sum(DF1$PRICE[DF1$PRICE <= x])))
data.frame(INTERVAL = c(0, paste("UP_TO", LIMITS[-1], sep="_")), COUNT_KEY, SUM_PRICE)
INTERVAL COUNT_KEY SUM_PRICE
1 0 2 0
2 UP_TO_10 6 23
3 UP_TO_100 8 123
4 UP_TO_1000 12 1544
5 UP_TO_5000 16 15057
6 UP_TO_10000 18 31921
7 UP_TO_100000 21 265609
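If DF1 is large, note that the lapply() approach scans PRICE once per limit. A single-pass alternative using base R's findInterval() is sketched below; it assumes the LIMITS vector above is sorted, and the intermediate names (idx, f) are mine:
# left.open = TRUE puts 0 in bucket 1, (0, 10] in bucket 2, and so on
LIMITS   <- c(0, 10, 100, 1000, 5000, 10000, 100000)
INTERVAL <- c("0", paste("UP_TO", LIMITS[-1], sep = "_"))
idx <- findInterval(DF1$PRICE, LIMITS, left.open = TRUE) + 1
f   <- factor(idx, levels = seq_along(INTERVAL))  # keeps empty buckets
data.frame(INTERVAL,
           COUNT_KEY = cumsum(tabulate(idx, nbins = length(INTERVAL))),
           SUM_PRICE = cumsum(unname(vapply(split(DF1$PRICE, f), sum, numeric(1)))))
This reproduces DF2 on the sample data.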

You first have to define your boundaries manually:
X <- c(-Inf, 0, 10, 100, 1000, 5000, 10000, 100000)
Then use cut() to assign entries to your labels (the INTERVAL vector from the question). We first summarise the counts and total price within each interval:
library(dplyr)
DF1 %>%
  mutate(LABELS = cut(PRICE, X, INTERVAL, include.lowest = TRUE)) %>%
  group_by(LABELS) %>%
  summarise(COUNT_KEY = n(), SUM_PRICE = sum(PRICE))
# A tibble: 7 x 3
LABELS COUNT_KEY SUM_PRICE
<fct> <int> <dbl>
1 0 2 0
2 UP_TO_10 4 23
3 UP_TO_100 2 100
4 UP_TO_1000 4 1421
5 UP_TO_5000 4 13513
6 UP_TO_10000 2 16864
7 UP_TO_100000 3 233688
This is close to what you want, except that COUNT_KEY and SUM_PRICE should be cumulative. That is achieved by adding mutate_if(is.numeric, cumsum):
DF1 %>%
  mutate(LABELS = cut(PRICE, X, INTERVAL, include.lowest = TRUE)) %>%
  group_by(LABELS) %>%
  summarise(COUNT_KEY = n(), SUM_PRICE = sum(PRICE)) %>%
  mutate_if(is.numeric, cumsum)
To give:
# A tibble: 7 x 3
LABELS COUNT_KEY SUM_PRICE
<fct> <int> <dbl>
1 0 2 0
2 UP_TO_10 6 23
3 UP_TO_100 8 123
4 UP_TO_1000 12 1544
5 UP_TO_5000 16 15057
6 UP_TO_10000 18 31921
7 UP_TO_100000 21 265609
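If you are on dplyr 1.0 or newer, mutate_if() is superseded; the same cumulative step can be written with across() instead (reusing the X breaks and INTERVAL labels from above):
DF1 %>%
  mutate(LABELS = cut(PRICE, X, INTERVAL, include.lowest = TRUE)) %>%
  group_by(LABELS) %>%
  summarise(COUNT_KEY = n(), SUM_PRICE = sum(PRICE)) %>%
  mutate(across(where(is.numeric), cumsum))  # same result as mutate_if()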

Okay, here's an all-in-one, tidy way to handle this using dplyr ;)
library(dplyr)
DF1 %>%
  mutate(
    INTERVAL =
      factor(
        case_when( # create discrete variable
          PRICE == 0 ~ '0',
          PRICE <= 10 ~ 'UP_TO_10',
          PRICE <= 100 ~ 'UP_TO_100',
          PRICE <= 1000 ~ 'UP_TO_1000',
          PRICE <= 5000 ~ 'UP_TO_5000',
          PRICE <= 10000 ~ 'UP_TO_10000',
          PRICE <= 100000 ~ 'UP_TO_100000'
        ),
        levels = # set the factor levels
          c(
            '0',
            'UP_TO_10',
            'UP_TO_100',
            'UP_TO_1000',
            'UP_TO_5000',
            'UP_TO_10000',
            'UP_TO_100000'
          )
      )
  ) %>%
  group_by(INTERVAL) %>% # create desired group
  summarise( # and summary variables
    COUNT_KEY = n(),
    SUM_PRICE = sum(PRICE)
  ) %>%
  mutate( # cumulative totals
    COUNT_KEY_CUM = cumsum(COUNT_KEY),
    SUM_PRICE_CUM = cumsum(SUM_PRICE)
  )
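To match DF2 exactly, you could drop the per-bucket columns in a final step; assuming the result of the pipeline above is stored in, say, res (a name of my choosing):
res %>%
  select(INTERVAL, COUNT_KEY = COUNT_KEY_CUM, SUM_PRICE = SUM_PRICE_CUM)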


lag() with group_by between current and last observation in R

Edit: I found a solution with zoo::na.locf().
library(dplyr)
library(zoo)  # for na.locf()
data <-
  data %>%
  group_by(country) %>%
  arrange(wave) %>%
  mutate(weight.io = na.locf(weight)) %>%
  mutate(lag_weight = weight - lag(weight.io))
I have a dataset below:
set.seed(42000)
data <- data_frame(
  country = sample(letters[1:20], size = 100, replace = TRUE),
  weight = round(runif(100, min = 48, max = 90)))
data <- data %>%
  group_by(country) %>%
  arrange(weight) %>%
  mutate(wave = seq_along(weight))
n_rows <- nrow(data)
perc_missing <- 10
data[sample(1:n_rows, sample(1:n_rows, round(perc_missing/100 * n_rows, 0))), c("weight")] <- NA
I would like to obtain the difference between one country's current "weight" and the last observed "weight" for each wave.
For country "a" wave 5, I want the value to be 69 - 65 (the last observed weight at wave < 5).
And for wave 8, 82 (weight at wave 8) - 69 (weight at wave 5).
My approach was the one below, but it didn't work.
data <-
  data %>%
  group_by(country) %>%
  arrange(wave) %>%
  mutate(lag_weight = weight - lag(weight, default = first(weight, na.rm = TRUE)))
Thank you!
I think this is a combination of diff (instead of lag, though that could work just as well) and, more importantly, tidyr::fill (or zoo::na.locf, not demonstrated):
BTW, na.rm= is not an argument of first(), so I've removed it.
library(dplyr)
# library(tidyr) # fill
data %>%
  group_by(country) %>%
  tidyr::fill(weight) %>%
  filter(country == "a") %>%
  mutate(lag_weight = weight - lag(weight, default = first(weight)))
# # A tibble: 10 x 4
# # Groups: country [1]
# country weight wave lag_weight
# <chr> <dbl> <int> <dbl>
# 1 a 54 1 0
# 2 a 55 2 1
# 3 a 65 3 10
# 4 a 65 4 0
# 5 a 69 5 4
# 6 a 69 6 0
# 7 a 69 7 0
# 8 a 82 8 13
# 9 a 82 9 0
# 10 a 85 10 3
The issue here is that weight is over-written with the LOCF (last-observation carried forward) value instead of preserving the NA values. If that's important, then you can make another weight variable for temporary use (and remove it):
data %>%
  mutate(tmpweight = weight) %>%
  group_by(country) %>%
  tidyr::fill(tmpweight) %>%
  filter(country == "a") %>%
  mutate(lag_weight = tmpweight - lag(tmpweight, default = first(tmpweight))) %>%
  select(-tmpweight)
# # A tibble: 10 x 4
# # Groups: country [1]
# country weight wave lag_weight
# <chr> <dbl> <int> <dbl>
# 1 a 54 1 0
# 2 a 55 2 1
# 3 a 65 3 10
# 4 a NA 4 0
# 5 a 69 5 4
# 6 a NA 6 0
# 7 a NA 7 0
# 8 a 82 8 13
# 9 a 82 9 0
# 10 a 85 10 3
FYI, you can use c(0, diff(weight)) instead of weight - lag(weight) for the same effect. Since diff() returns a vector one element shorter (it is the gap between consecutive values), we prepend a 0 here:
data %>%
  group_by(country) %>%
  tidyr::fill(weight) %>%
  filter(country == "a") %>%
  mutate(lag_weight = c(0, diff(weight)))
(The filter(country == "a") is purely for demonstration to match your example, not that it is required for this solution.)
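For completeness, here is a sketch of the zoo::na.locf() variant mentioned (but not demonstrated) above; na.rm = FALSE keeps any leading NA values so the row count is preserved, and weight_locf is a temporary name of my choosing:
library(dplyr)
library(zoo)
data %>%
  group_by(country) %>%
  mutate(weight_locf = na.locf(weight, na.rm = FALSE),  # fill NAs downward within country
         lag_weight  = weight_locf - lag(weight_locf, default = first(weight_locf))) %>%
  select(-weight_locf)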

R: Count number of times B follows A using dplyr

I have a data.frame of monthly averages of radon measured over a few months. I have labeled each value either "below" or "above" a threshold and would like to count the number of times the average value goes "below to above", "above to below", "above to above" or "below to below".
df <- data.frame(value = c(130, 200, 240, 230, 130),
                 level = c("below", "above", "above", "above", "below"))
A bit of digging into a Matlab answer on here suggests that we could use the Matrix package:
require(Matrix)
sparseMatrix(i=c(2,2,2,1), j=c(2,2,2))
This produces the following result, which I can't yet interpret:
[1,] | |
[2,] | .
Any thoughts about a tidyverse method?
Sure, just group by the transition and count the values:
library(dplyr)
df <- data.frame(value = c(130, 200, 240, 230, 130),
                 level = c("below", "above", "above", "above", "below"))
df %>%
  group_by(grp = paste(level, lead(level))) %>%
  summarise(n = n()) %>%
  # drop the observation that does not have a "next" value
  filter(!grepl(pattern = "NA", x = grp))
#> # A tibble: 3 × 2
#> grp n
#> <chr> <int>
#> 1 above above 2
#> 2 above below 1
#> 3 below above 1
You could use table from base R:
table(df$level[-1], df$level[-nrow(df)])

        above below
  above     2     1
  below     1     0
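If you want the base-R counts in the same long shape as the dplyr output, wrapping the table in as.data.frame() gets most of the way there (a small sketch of mine; the from/to names are assumptions):
transitions <- table(from = df$level[-nrow(df)], to = df$level[-1])
subset(as.data.frame(transitions), Freq > 0)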
EDIT in response to @HCAI's comment: applying table to multiple columns:
First, generate some data:
set.seed(1)
U = matrix(runif(4*20), nrow = 20)
dfU = data.frame(round(U))
library(plyr) # for mapvalues
df2 = data.frame(apply(dfU,
                       FUN = function(x) mapvalues(x, from = 0:1, to = c('below', 'above')),
                       MARGIN = 2))
so that df2 contains random 'above' and 'below':
X1 X2 X3 X4
1 below above above above
2 below below above below
3 above above above below
4 above below above below
5 below below above above
6 above below above below
7 above below below below
8 above below below above
9 above above above below
10 below below above above
11 below below below below
12 below above above above
13 above below below below
14 below below below below
15 above above below below
16 below above below above
17 above above below above
18 above below above below
19 below above above above
20 above below below above
Now apply table to each column and vectorize the output:
apply(df2,
      FUN = function(x) as.vector(table(x[-1],
                                        x[-nrow(df2)])),
      MARGIN = 2)
which gives us
X1 X2 X3 X4
[1,] 5 2 7 2
[2,] 5 6 4 6
[3,] 6 5 3 6
[4,] 3 6 5 5
All that's left is a bit of care in labeling the rows of the output. Maybe someone can come up with a clever way to merge/join the data frames resulting from apply(df2, FUN=function(x) melt(table(x[-1],x[-nrow(df2)])),2), which would maintain the row names. (I spent some time looking into it but couldn't work out how to do it easily.)
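Here is one way to do that merge (a sketch of mine, assuming reshape2's melt(); the to/from column names are my own): each melted table keeps its transition labels as columns, so merging on those labels preserves the row naming that the plain apply() call loses.
library(reshape2)  # melt
melted <- lapply(names(df2), function(nm) {
  x   <- df2[[nm]]
  out <- melt(table(x[-1], x[-nrow(df2)]))  # counts per (to, from) pair
  names(out) <- c("to", "from", nm)
  out
})
# merge all four data frames on the transition labels
Reduce(function(a, b) merge(a, b, by = c("to", "from")), melted)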
This was not run, so there may be a typo, but you get the idea. I'll leave it to you to deal with NA values and the first observation. It makes a single pass through the vector.
library(dplyr)
df %>%
  summarize(increase = sum(case_when(value > lag(value) ~ 1, TRUE ~ 0)),
            decrease = sum(case_when(value < lag(value) ~ 1, TRUE ~ 0)),
            constant = sum(case_when(value == lag(value) ~ 1, TRUE ~ 0)))
A slightly different version:
library(dplyr)
library(stringr)
df %>%
  group_by(level = str_c(level, lead(level), sep = " ")) %>%
  count(level) %>%
  na.omit()
level n
<chr> <int>
1 above above 2
2 above below 1
3 below above 1
Another possible solution, based on tidyverse:
library(tidyverse)
df <- data.frame(value = c(130, 200, 240, 230, 130),
                 level = c("below", "above", "above", "above", "below"))
df %>%
  mutate(changes = str_c(lag(level), level, sep = "_")) %>%
  count(changes) %>%
  drop_na(changes)
#> changes n
#> 1 above_above 2
#> 2 above_below 1
#> 3 below_above 1
Yet another solution, based on data.table:
library(data.table)
dt <- data.table(value = c(130, 200, 240, 230, 130),
                 level = c("below", "above", "above", "above", "below"))
dt[, changes := paste(shift(level), level, sep = "_")
   ][2:.N][, .(n = .N), keyby = .(changes)]
#> changes n
#> 1: above_above 2
#> 2: above_below 1
#> 3: below_above 1

Age groups into monthly buckets

I'm struggling to find a solution to the following problem. I have a df with ids/dobs and another monthbucket df, as follows:
set.seed(33)
df <- data.frame(dob = sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by = "day"), 10),
                 id = seq(1:10))
monthbucket <- data.frame(month = format(seq(as.Date("2010-01-01"), as.Date("2011-01-01"), by = "months"), '%Y-%m'),
                          startmonth = seq(as.Date("2010-01-01"), as.Date("2011-01-01"), by = "months"),
                          endmonth = seq(as.Date("2010-02-01"), as.Date("2011-02-01"), by = "months") - 1)
I want to get an output which gives me the count of members within age groups (<19, 19-64, >64) for each of my monthly buckets. The count obviously switches over the year when people have birthdays.
I got the age calculation working with something like this (it uses lubridate):
library(lubridate)
age.fct <- function(dob, bucketdate) {
  period <- as.period(interval(dob, bucketdate), unit = "year")
  period$year
}
I guess the general approach would be to calculate the age for each monthbucket, assign into one of the 3 age groups and count it up by month. Any suggestions?
EDIT 1.
Thanks for all the different approaches. I just ran a brief benchmark on the solutions to determine which answer to accept. Somehow the data.table solution didn't work on my test data set, but I will check it as soon as I have a few minutes in the next few days.
set.seed(33)
df <- data.frame(dob = sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by = "day"), 10000),
                 id = seq(1:10000))
monthbucket <- data.frame(month = format(seq(as.Date("2010-01-01"), as.Date("2011-01-01"), by = "months"), '%Y-%m'),
                          startmonth = seq(as.Date("2010-01-01"), as.Date("2011-01-01"), by = "months"),
                          endmonth = seq(as.Date("2010-02-01"), as.Date("2011-02-01"), by = "months") - 1)
birth_days <- df$dob
month_bucket <- monthbucket$startmonth
and the benchmark:
microbenchmark::microbenchmark(
  MM = monthbucket %>%
    group_by_all %>%
    expand(id = df$id) %>%
    left_join(., {df %>% mutate(birth_month = cut(dob, "month"))}, by = "id") %>%
    mutate(age = time_length(difftime(startmonth, birth_month), "years")) %>%
    mutate(age_cat = case_when(age < 19 ~ "<19", age > 64 ~ ">64", TRUE ~ "19-64")) %>%
    group_by(month) %>%
    count(age_cat) %>%
    gather(variable, count, n) %>%
    unite(variable, age_cat) %>%
    spread(variable, count),
  AkselA = {
    ages <- as.data.frame(t(unclass(outer(monthbucket$startmonth, df$dob, "-")/365.25)))
    ages <- do.call(data.frame, lapply(ages, cut, c(0, 19, 64, Inf), c("0-19", "19-64", "64+")))
    ages <- sapply(ages, table)
    colnames(ages) <- monthbucket$month
  },
  Cole1 = {
    t(table(apply(X = outer(month_bucket, birth_days, `-`) / 365.25, MARGIN = 2, FUN = cut, c(0, 19, 65, Inf)),
            rep(format(month_bucket, '%Y-%m'), length(birth_days))))
  },
  # Cole2 = {
  #   cast(CJ(month_bucket, birth_days)[, .N, by = .(month_bucket, cut(as.numeric(month_bucket - birth_days)/365.25, c(0, 19, 65, Inf)))],
  #        month_bucket ~ cut, value.var = 'N')
  # },
  Cole3 = {
    crossing(month_bucket, birth_days) %>%
      count(month_bucket, age_range = cut(as.numeric(month_bucket - birth_days) / 365.25, c(0, 19, 65, Inf))) %>%
      spread(age_range, n)
  },
  Cole4 = {
    all_combos <- expand.grid(month_bucket = month_bucket, birth_days = birth_days)
    all_combos$age <- as.numeric(all_combos$month_bucket - all_combos$birth_days) / 365.25
    all_combos$cut_r <- cut(all_combos$age, c(0, 19, 65, Inf))
    reshape(data = aggregate(all_combos$month_bucket,
                             by = list(bucket = all_combos$month_bucket, age_group = all_combos$cut_r),
                             FUN = length),
            timevar = 'age_group', idvar = 'bucket', direction = 'wide')
  },
  times = 1L
)
Unit: milliseconds
expr min lq mean median uq max neval
MM 4249.02810 4249.02810 4249.02810 4249.02810 4249.02810 4249.02810 1
AkselA 17.12697 17.12697 17.12697 17.12697 17.12697 17.12697 1
Cole1 3237.94534 3237.94534 3237.94534 3237.94534 3237.94534 3237.94534 1
Cole3 23.63945 23.63945 23.63945 23.63945 23.63945 23.63945 1
Cole4 877.92782 877.92782 877.92782 877.92782 877.92782 877.92782 1
Based on speed, AkselA's approach seems to be the fastest, but I get a different result for M-M's approach compared to all the others (once the 64 in AkselA's cut breaks, cut(..., c(0, 19, 64, Inf)), is changed to 65). I will accept an answer based on speed but will look into the differences in the results!
Not very sophisticated, but I joined the two tables (after expanding monthbucket on df$id) and then calculated the age (as you have the whole month, I just computed difftime between the first day of the birth month and startmonth). Then, for each month bucket, I counted the number in each age group and at the end converted from long to wide format for better illustration.
library(lubridate)
library(tidyverse)
monthbucket %>%
  group_by_all %>%
  expand(id = df$id) %>%
  left_join(., {df %>%
      mutate(birth_month = cut(dob, "month"))},
    by = "id") %>%
  mutate(age = time_length(difftime(startmonth, birth_month), "years")) %>%
  mutate(age_cat = case_when(age < 19 ~ "<19",
                             age > 64 ~ ">64",
                             TRUE ~ "19-64")) %>%
  group_by(month) %>%
  count(age_cat) %>%
  gather(variable, count, n) %>%
  unite(variable, age_cat) %>%
  spread(variable, count)
#> # A tibble: 13 x 4
#> # Groups: month [13]
#> month `<19` `>64` `19-64`
#> <fct> <int> <int> <int>
#> 1 2010-01 3 2 5
#> 2 2010-02 3 2 5
#> 3 2010-03 3 2 5
#> 4 2010-04 3 2 5
#> 5 2010-05 3 2 5
#> 6 2010-06 3 2 5
#> 7 2010-07 3 2 5
#> 8 2010-08 3 2 5
#> 9 2010-09 3 2 5
#> 10 2010-10 3 2 5
#> 11 2010-11 3 2 5
#> 12 2010-12 3 2 5
#> 13 2011-01 3 2 5
Created on 2019-07-03 by the reprex package (v0.3.0)
Assuming I understand your request.
ages <- as.data.frame(t(unclass(outer(monthbucket$startmonth, df$dob, "-")/365.25)))
ages <- do.call(data.frame,
                lapply(ages, cut, c(0, 19, 64, Inf), c("0-19", "19-64", "64+")))
ages <- sapply(ages, table)
colnames(ages) <- monthbucket$month
ages
# 2010-01 2010-02 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12 2011-01
# 0-19 2 2 2 2 2 2 2 2 2 2 2 2 2
# 19-64 7 7 7 7 7 7 7 7 7 7 7 7 7
# 64+ 1 1 1 1 1 1 1 1 1 1 1 1 1
#
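A side note of mine (not part of the answer): dividing a difftime by 365.25 only approximates age in years. If leap-year edge cases around bucket boundaries matter, a lubridate-based check along these lines could confirm the bucket assignments:
library(lubridate)
# exact fractional ages for each (bucket start, dob) pair
exact_age <- outer(monthbucket$startmonth, df$dob,
                   function(end, start) time_length(interval(start, end), "years"))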
There are some similarities to @AkselA's answer, as it depends on outer(), cut(), and table().
set.seed(33)
birth_days <- sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by="day"), 10)
month_bucket <- seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months")
t(
  table(
    apply(
      X = outer(month_bucket, birth_days, `-`) / 365.25,
      MARGIN = 2,
      FUN = cut, c(0, 19, 65, Inf)
    ),
    rep(format(month_bucket, '%Y-%m'), length(birth_days))
  )
)
(0,19] (19,65] (65,Inf]
2010-01 2 7 1
2010-02 2 7 1
2010-03 2 7 1
2010-04 2 7 1
2010-05 2 7 1
2010-06 2 7 1
2010-07 2 7 1
2010-08 2 7 1
2010-09 2 7 1
2010-10 2 7 1
2010-11 2 7 1
2010-12 2 7 1
2011-01 2 7 1
I felt weird having such a similar solution, so here is a data.table approach:
library(data.table)
dcast(
  CJ(month_bucket, birth_days)[
    , .N,
    by = .(month_bucket,
           cut(as.numeric(month_bucket - birth_days)/365.25, c(0, 19, 65, Inf)))],
  month_bucket ~ cut,
  value.var = 'N')
dplyr and tidyr:
library(dplyr)
library(tidyr)
crossing(month_bucket, birth_days) %>%
  count(month_bucket,
        age_range = cut(as.numeric(month_bucket - birth_days) / 365.25, c(0, 19, 65, Inf))) %>%
  spread(age_range, n)
And a similar approach in base R that I'm not completely happy with:
all_combos <- expand.grid(month_bucket = month_bucket, birth_days = birth_days)
all_combos$age <- as.numeric(all_combos$month_bucket - all_combos$birth_days) / 365.25
all_combos$cut_r <- cut(all_combos$age, c(0, 19, 65, Inf))
reshape(
  data = aggregate(
    all_combos$month_bucket,
    by = list(bucket = all_combos$month_bucket,
              age_group = all_combos$cut_r),
    FUN = length),
  timevar = 'age_group',
  idvar = 'bucket',
  direction = 'wide'
)

Apply Bins to Data Frame Groups without making subset Data Frames

I have a data frame containing fish population sampling data. I would like to create bins to count how many fish are in a given length group for each species.
The code below accomplishes this task for 2 species, but repeating it for every species in the data frame doesn't seem like the most elegant way to achieve this goal.
Plus I would like to apply this code to other lakes with different species. It would be great to find an "automated" way to apply these bins to each species group in the data frame.
The data frame looks like:
Species TL WT
BLG 75 6
BLG 118 27
LMB 200 98
LMB 315 369
RBS 112 23
RES 165 73
SPB 376 725
YEP 155 33
ss = read.csv("SS_West Point.csv", na.strings = ".", header = T)
blg = ss %>% subset(Species == "BLG")
lmb = ss %>% subset(Species == "LMB")
blgn = blg %>% summarise(n = n())
lmbn = lmb %>% summarise(n = n())
### 20mm Length Groups - BLG ###
blg20 = blg %>%
  group_by(gr = cut(TL, breaks = seq(0, 1000, by = 20))) %>%
  summarise(n = n()) %>%
  mutate(freq = n, percent = ((n/blgn$n)*100),
         cumfreq = cumsum(freq), cumpercent = cumsum(percent))
### 20mm Length Groups - LMB ###
lmb20 = lmb %>%
  group_by(gr = cut(TL, breaks = seq(0, 1000, by = 20))) %>%
  summarise(n = n()) %>%
  mutate(freq = n, percent = ((n/lmbn$n)*100),
         cumfreq = cumsum(freq), cumpercent = cumsum(percent))
I've successfully used do() to run linear models on this data frame but can't seem to get it to work on cut(). Here is how I used do() on lm():
ssl = ss %>% mutate(lTL = log10(TL), lWT = log10(WT)) %>% group_by(Species)
m = ssl %>% do(lm(lWT ~ lTL, data = .)) %>% mutate(wp = 10^(.fitted))
Does this do what you expect?
ss20 <- ss %>%
  add_count(Species) %>%
  rename(Species_count = n) %>%
  # I added Species_count to the grouping so it goes along for the ride in summarization
  group_by(Species, Species_count, gr = cut(TL, breaks = seq(0, 1000, by = 20))) %>%
  summarise(n = n()) %>%
  mutate(freq = n, percent = ((n/Species_count)*100),
         cumfreq = cumsum(freq), cumpercent = cumsum(percent)) %>%
  ungroup()
> ss20
# A tibble: 8 x 8
Species Species_count gr n freq percent cumfreq cumpercent
<chr> <int> <fct> <int> <int> <dbl> <int> <dbl>
1 BLG 2 (60,80] 1 1 50 1 50
2 BLG 2 (100,120] 1 1 50 2 100
3 LMB 2 (180,200] 1 1 50 1 50
4 LMB 2 (300,320] 1 1 50 2 100
5 RBS 1 (100,120] 1 1 100 1 100
6 RES 1 (160,180] 1 1 100 1 100
7 SPB 1 (360,380] 1 1 100 1 100
8 YEP 1 (140,160] 1 1 100 1 100
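A variant of the same idea without add_count(), letting sum(n) supply the per-species total (a sketch of mine under the same column assumptions; the .groups argument requires dplyr >= 1.0):
library(dplyr)
ss %>%
  group_by(Species, gr = cut(TL, breaks = seq(0, 1000, by = 20))) %>%
  summarise(n = n(), .groups = "drop_last") %>%  # keep the Species grouping
  mutate(freq = n,
         percent = n / sum(n) * 100,  # sum(n) is the per-species count
         cumfreq = cumsum(freq),
         cumpercent = cumsum(percent)) %>%
  ungroup()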

How can I create a column that cumulatively adds the sum of two previous rows based on conditions?

I tried asking this question before, but it was poorly stated. This is a new attempt because I haven't solved it yet.
I have a dataset with winners, losers, date, winner_points and loser_points.
For each row, I want two new columns, one for the winner and one for the loser that shows how many points they have scored so far (as both winners and losers).
Example data:
winner <- c(1,2,3,1,2,3,1,2,3)
loser <- c(3,1,1,2,1,1,3,1,2)
date <- c("2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05","2017-10-06","2017-10-07","2017-10-08","2017-10-09")
winner_points <- c(2,1,2,1,2,1,2,1,2)
loser_points <- c(1,0,1,0,1,0,1,0,1)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points)
I want the output to be:
winner_points_sum <- c(0, 0, 1, 3, 1, 3, 5, 3, 5)
loser_points_sum <- c(0, 2, 2, 1, 4, 5, 4, 7, 4)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points, winner_points_sum, loser_points_sum)
How I've solved it thus far is to do a for loop such as:
library(dplyr)
test_data$winner_points_sum_loop <- 0
test_data$loser_points_sum_loop <- 0
for (i in row.names(test_data)) {
  test_data[i, ]$winner_points_sum_loop <-
    (
      test_data %>%
        dplyr::filter(winner == test_data[i, ]$winner & date < test_data[i, ]$date) %>%
        dplyr::summarise(points = sum(winner_points, na.rm = TRUE))
      +
        test_data %>%
        dplyr::filter(loser == test_data[i, ]$winner & date < test_data[i, ]$date) %>%
        dplyr::summarise(points = sum(loser_points, na.rm = TRUE))
    )
}
test_data$winner_points_sum_loop <- unlist(test_data$winner_points_sum_loop)
Any suggestions on how to tackle this problem? The queries take quite some time when the row numbers add up. I've tried experimenting with the ave() function: I can do it for one column to sum a player's points as winner, but I can't figure out how to add their points as loser.
winner <- c(1,2,3,1,2,3,1,2,3)
loser <- c(3,1,1,2,1,1,3,1,2)
date <- c("2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05","2017-10-06","2017-10-07","2017-10-08","2017-10-09")
winner_points <- c(2,1,2,1,2,1,2,1,2)
loser_points <- c(1,0,1,0,1,0,1,0,1)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points)
library(dplyr)
library(tidyr)
test_data %>%
  unite(winner, winner, winner_points) %>%                  # unite winner columns
  unite(loser, loser, loser_points) %>%                     # unite loser columns
  gather(type, pl_pts, winner, loser, -date) %>%            # reshape
  separate(pl_pts, c("player", "points"), convert = T) %>%  # separate columns
  arrange(date) %>%                                         # order dates (in case it's not)
  group_by(player) %>%                                      # for each player
  mutate(sum_points = cumsum(points) - points) %>%          # get points up to that date
  ungroup() %>%                                             # forget the grouping
  unite(pl_pts_sumpts, player, points, sum_points) %>%      # unite columns
  spread(type, pl_pts_sumpts) %>%                           # reshape
  separate(loser, c("loser", "loser_points", "loser_points_sum"), convert = T) %>%  # separate columns and give appropriate names
  separate(winner, c("winner", "winner_points", "winner_points_sum"), convert = T) %>%
  select(winner, loser, date, winner_points, loser_points, winner_points_sum, loser_points_sum)  # select the order you prefer
# # A tibble: 9 x 7
# winner loser date winner_points loser_points winner_points_sum loser_points_sum
# * <int> <int> <date> <int> <int> <int> <int>
# 1 1 3 2017-10-01 2 1 0 0
# 2 2 1 2017-10-02 1 0 0 2
# 3 3 1 2017-10-03 2 1 1 2
# 4 1 2 2017-10-04 1 0 3 1
# 5 2 1 2017-10-05 2 1 1 4
# 6 3 1 2017-10-06 1 0 3 5
# 7 1 3 2017-10-07 2 1 5 4
# 8 2 1 2017-10-08 1 0 3 7
# 9 3 2 2017-10-09 2 1 5 4
I finally understood what you want, and I took the approach of getting the cumulative points of each player at each point in time and then joining them to the original test_data data frame.
winner <- c(1,2,3,1,2,3,1,2,3)
loser <- c(3,1,1,2,1,1,3,1,2)
date <- c("2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05","2017-10-06","2017-10-07","2017-10-08","2017-10-09")
winner_points <- c(2,1,2,1,2,1,2,1,2)
loser_points <- c(1,0,1,0,1,0,1,0,1)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points)
library(dplyr)
library(tidyr)
cum_points <- test_data %>%
  gather(end_game_status, player_id, winner, loser) %>%
  gather(which_point, how_many_points, winner_points, loser_points) %>%
  filter(
    (end_game_status == "winner" & which_point == "winner_points") |
    (end_game_status == "loser" & which_point == "loser_points")) %>%
  arrange(date = as.Date(date)) %>%
  group_by(player_id) %>%
  mutate(cumulative_points = cumsum(how_many_points)) %>%
  mutate(cumulative_points_sofar = lag(cumulative_points, default = 0)) %>%
  select(player_id, date, cumulative_points_sofar)
output <- test_data %>%
  left_join(cum_points, by = c('date', 'winner' = 'player_id')) %>%
  rename(winner_points_sum = cumulative_points_sofar) %>%
  left_join(cum_points, by = c('date', 'loser' = 'player_id')) %>%
  rename(loser_points_sum = cumulative_points_sofar)
output
The difference from the OP's previous question is that the OP is now asking for the cumulative sum of points each player has scored so far, i.e., before the actual date. Furthermore, the sample data set now contains a date column which uniquely identifies each row.
So my previous approach can be used here as well, with some modifications. The solution below reshapes the data from wide to long format, whereby two value variables are reshaped simultaneously, computes the cumulative sums for each player id, and finally reshapes from long back to wide format again. In order to sum only points scored before the actual date, the rows are lagged by one.
It is important to note that the winner and loser columns contain the respective player ids.
library(data.table)
cols <- c("winner", "loser")
setDT(test_data)[
  # reshape multiple value variables simultaneously from wide to long format
  , melt(.SD, id.vars = "date",
         measure.vars = list(cols, paste0(cols, "_points")),
         value.name = c("id", "points"))][
  # rename variable column
  , variable := forcats::lvls_revalue(variable, cols)][
  # order by date and cumulate the lagged points by id
  order(date), points_sum := cumsum(shift(points, fill = 0)), by = id][
  # reshape multiple value variables simultaneously from long to wide format
  , dcast(.SD, date ~ variable, value.var = c("id", "points", "points_sum"))]
date id_winner id_loser points_winner points_loser points_sum_winner points_sum_loser
1: 2017-10-01 1 3 2 1 0 0
2: 2017-10-02 2 1 1 0 0 2
3: 2017-10-03 3 1 2 1 1 2
4: 2017-10-04 1 2 1 0 3 1
5: 2017-10-05 2 1 2 1 1 4
6: 2017-10-06 3 1 1 0 3 5
7: 2017-10-07 1 3 2 1 5 4
8: 2017-10-08 2 1 1 0 3 7
9: 2017-10-09 3 2 2 1 5 4
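Since the original concern was speed, here is a single-pass base R sketch of my own (not from any answer above): it keeps a running points tally per player id, reading each tally before updating it, which reproduces the expected winner_points_sum and loser_points_sum in one loop over the rows. It assumes test_data is sorted by date and that the ids are small positive integers:
pts <- numeric(max(test_data$winner, test_data$loser))  # running tally per id
test_data$winner_points_sum <- NA_real_
test_data$loser_points_sum  <- NA_real_
for (i in seq_len(nrow(test_data))) {
  w <- test_data$winner[i]
  l <- test_data$loser[i]
  # read the tallies before this game...
  test_data$winner_points_sum[i] <- pts[w]
  test_data$loser_points_sum[i]  <- pts[l]
  # ...then add this game's points
  pts[w] <- pts[w] + test_data$winner_points[i]
  pts[l] <- pts[l] + test_data$loser_points[i]
}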
