Combining split() and cumsum() - r

I am trying to produce stats for cumulative goals by season by a particular soccer player. I have used the cut function to obtain the season from the game dates. I have data which corresponds to this dataframe
# Example data: 20 games, 5 per season, with goals scored in each game.
# gl(4, 5) builds the season factor: four levels repeated in blocks of five.
df.raw <- data.frame(
  Game = 1:20,
  Goals = c(1, 0, 0, 2, 1, 0, 3, 2, 0, 0, 0, 1, 0, 4, 1, 2, 0, 0, 0, 3),
  season = gl(4, 5, labels = c("2001", "2002", "2003", "2004"))
)
In real life, the number of games per season may not be constant
I want to end up with data that looks like this
# Desired result: same games, plus a within-season game counter and a
# per-season cumulative goal tally. rep(1:5) is recycled to 20 rows by
# data.frame().
df.seasoned <- data.frame(
  Game = 1:20,
  seasonGame = rep(1:5),
  Goals = c(1, 0, 0, 2, 1, 0, 3, 2, 0, 0, 0, 1, 0, 4, 1, 2, 0, 0, 0, 3),
  cumGoals = c(1, 1, 1, 3, 4, 0, 3, 5, 5, 5, 0, 1, 1, 5, 6, 2, 2, 2, 2, 5),
  season = gl(4, 5, labels = c("2001", "2002", "2003", "2004"))
)
With the goals cumulatively summed within year and a game number for the season

# Cumulative goals within each season: ave() applies cumsum() to Goals
# separately for every level of season.
df.raw$cumGoals <- with(df.raw, ave(Goals, season, FUN = cumsum))
# Within-season game counter. Use seq_along rather than seq: seq(x) on a
# length-1 numeric x returns 1:x, so a season containing a single game
# would produce a wrongly sized result (the question notes that seasons
# may have varying numbers of games).
df.raw$seasonGame <- with(df.raw, ave(Game, season, FUN = seq_along))
df.raw
Or with transform ... the original transform, that is:
# transform() version: add both per-season columns in a single call.
# seq_along avoids seq()'s length-1 surprise (seq(x) == 1:x for scalar x),
# which matters when a season contains exactly one game.
df.seas <- transform(df.raw,
                     seasonGame = ave(Game, season, FUN = seq_along),
                     cumGoals = ave(Goals, season, FUN = cumsum))
df.seas
Game Goals season seasonGame cumGoals
1 1 1 2001 1 1
2 2 0 2001 2 1
3 3 0 2001 3 1
4 4 2 2001 4 3
5 5 1 2001 5 4
6 6 0 2002 1 0
7 7 3 2002 2 3
8 8 2 2002 3 5
9 9 0 2002 4 5
10 10 0 2002 5 5
snipped

Another job for ddply and transform (from the plyr package):
# ddply splits df.raw by season and transform() adds the per-season columns.
# seq_along(Game) replaces 1:NROW(piece): `piece` was an undocumented plyr
# internal that is no longer available in current plyr versions, and
# seq_along is also safe for zero-row groups.
ddply(df.raw, .(season), transform,
      seasonGame = seq_along(Game),
      cumGoals = cumsum(Goals))
Game Goals season seasonGame cumGoals
1 1 1 2001 1 1
2 2 0 2001 2 1
3 3 0 2001 3 1
4 4 2 2001 4 3
5 5 1 2001 5 4
6 6 0 2002 1 0
7 7 3 2002 2 3
8 8 2 2002 3 5
9 9 0 2002 4 5
10 10 0 2002 5 5
11 11 0 2003 1 0
12 12 1 2003 2 1
13 13 0 2003 3 1
14 14 4 2003 4 5
15 15 1 2003 5 6
16 16 2 2004 1 2
17 17 0 2004 2 2
18 18 0 2004 3 2
19 19 0 2004 4 2
20 20 3 2004 5 5

Here is a solution using data.table which is very fast.
library(data.table)
df.raw.tab <- data.table(df.raw)
# seq_len(.N) is the idiomatic per-group row counter in data.table
# (.N = number of rows in the current group) and, unlike 1:NROW(Goals),
# is safe when a group is empty.
df.raw.tab[, list(seasonGame = seq_len(.N), cumGoals = cumsum(Goals)), "season"]

Related

How to create list of 2 from dataframe?

I have transportation data between UK cities:
from
to
Transit: if there is transit between these two cities = 1,
otherwise =0
weight: average number of passengers
Here is given sample from my data:
# Sample transportation data: one row per (from, to) city pair, with a
# transit flag and the average passenger count.
df2 <- data.frame(
  from = c("London", "London", "London", "London", "Liverpool",
           "Liverpool", "Liverpool", "Manchester", "Manchester", "Bristol"),
  to = c("Liverpool", "Manchester", "Bristol", "Derby", "Manchester",
         "Bristol", "Derby", "Bristol", "Derby", "Derby"),
  Transit = c(1, 0, 1, 1, 1, 1, 1, 1, 0, 1),
  ave.pas = c(10, 0, 11, 24, 40, 45, 12, 34, 9, 29)
)
output
from to Transit ave.pas
1 London Liverpool 1 10
2 London Manchester 0 0
3 London Bristol 1 11
4 London Derby 1 24
5 Liverpool Manchester 1 40
6 Liverpool Bristol 1 45
7 Liverpool Derby 1 12
8 Manchester Bristol 1 34
9 Manchester Derby 0 9
10 Bristol Derby 1 29
Now I want to convert it in the list of 2 to get data like this (this is different data but idea is to get the same from my df):
$nodes
# A tibble: 16 x 2
id label
<int> <chr>
1 1 France
2 2 Belgium
3 3 Germany
4 4 Danemark
5 5 Croatia
6 6 Slovenia
7 7 Hungary
8 8 Spain
9 9 Italy
10 10 Netherlands
11 11 UK
12 12 Austria
13 13 Poland
14 14 Switzerland
15 15 Czech republic
16 16 Slovania
$edges
# A tibble: 18 x 3
from to weight
<int> <int> <dbl>
1 1 3 9
2 2 1 4
3 1 8 3
4 1 9 4
5 1 10 2
6 1 11 3
7 3 12 2
8 3 13 2
9 2 3 3
10 3 14 2
11 3 15 2
12 3 10 2
13 4 3 2
14 5 3 2
15 5 16 2
16 5 7 2
17 6 3 2
18 7 16 2.5
In base R:
# Build node and edge lists in base R.
f2 <- c("from", "to")
# Every distinct city becomes a node; ids are simply the row positions.
nodes <- data.frame(label = unique(unlist(df2[f2])))
nodes$id <- seq_len(nrow(nodes))
# Edges: keep only rows with Transit == 1, then translate the city names
# in both endpoint columns to their node ids.
edges <- df2[df2$Transit == 1, c(f2, "ave.pas")]
edges[f2] <- lapply(edges[f2], function(city) match(city, nodes$label))
nodes
# label id
# 1 London 1
# 2 Liverpool 2
# 3 Manchester 3
# 4 Bristol 4
# 5 Derby 5
edges
# from to ave.pas
# 1 1 2 10
# 3 1 4 11
# 4 1 5 24
# 5 2 3 40
# 6 2 4 45
# 7 2 5 12
# 8 3 4 34
# 10 4 5 29
Create the dataframe of unique factor levels and create ids using as.numeric, then use match to replace the values with the id.
# Build the id/label lookup table. Compute the vector of unique city names
# once instead of repeating unique(unlist(df2[c(1, 2)])) four times; the
# ids are the integer codes of a factor whose levels follow first
# appearance, so they match the original result exactly.
labs <- unique(unlist(df2[c(1, 2)]))
df1 <- data.frame(id = as.numeric(factor(labs, levels = labs)),
                  label = factor(labs, levels = labs))
# id label
#1 1 London
#2 2 Liverpool
#3 3 Manchester
#4 4 Bristol
#5 5 Derby
# Replace the city names in both endpoint columns by their node ids,
# looked up by label in df1.
df2[c("from", "to")] <- lapply(df2[c("from", "to")],
                               function(city) df1$id[match(city, df1$label)])
# from to Transit ave.pas
#1 1 2 1 10
#2 1 3 0 0
#3 1 4 1 11
#4 1 5 1 24
#5 2 3 1 40
#6 2 4 1 45
#7 2 5 1 12
#8 3 4 1 34
#9 3 5 0 9
#10 4 5 1 29
Edit: you actually don't need to convert to factor (this comes then very close to #sindri_baldur's answer):
# No factor conversion needed: match against the unique labels directly.
un <- unique(unlist(df2[c(1, 2)]))
df1 <- data.frame(id = seq_along(un), label = un)
# lapply (not sapply) keeps the result a predictable list of columns;
# sapply's silent simplification to a matrix is a well-known type-stability
# footgun, even though the column assignment happens to work either way.
df2[c(1, 2)] <- lapply(df2[c(1, 2)], match, df1$label)

Cumulative sum for variables with similar names in R

# Toy data: per-row indicator flags keyed by month and year; the final
# row has NA month/year on purpose.
df_test <- data.frame(
  MONTH_NUM = c(7, 7, 8, 8, 8, 10, 11, 12, 1, 2, 3, 4, 4, 5, 5, 5, 5, NA),
  YEAR = c(2018, 2018, 2018, 2018, 2019, 2019, 2019, 2019, 2019,
           2018, 2018, 2019, 2018, 2018, 2018, 2018, 2018, NA),
  Sys_Indicator = c(1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1),
  lbl_Indicator = c(1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0),
  Pk_Indicator = c(1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1)
)
I want to find the cumulative sum of each indicator for each month+year combination. I'm currently using dplyr to achieve this, but I was wondering if there is an easier way to do it for all variables that have Indicator in their names. I want all my variables with Indicator in them to have a cumulative sum.
# Asker's original approach: one hand-written sum()/cumsum() pair per
# indicator column. Works, but must be edited whenever a new *_Indicator
# column appears, and the naming drifts (Sys_cum / Cum_lbl / Pk_sum).
df_test %>%
group_by(YEAR,MONTH_NUM) %>%
summarize(Sys_sum=sum(Sys_Indicator),lbl_Sum=sum(lbl_Indicator),Pk_Sum=sum(Pk_Indicator)) %>%
arrange(MONTH_NUM,YEAR) %>%
ungroup() %>%
mutate(Sys_cum=cumsum(Sys_sum),Cum_lbl=cumsum(lbl_Sum),Pk_sum=cumsum(Pk_Sum))
You could use the _at variants in dplyr to apply this for multiple columns :
library(dplyr)
# summarise_at()/mutate_at() are superseded; across() is the current dplyr
# idiom and applies the same function(s) to every column whose name ends in
# "Indicator". The named list(cs = cumsum) yields <column>_cs output names,
# matching the result shown below.
df_test %>%
  arrange(MONTH_NUM, YEAR) %>%
  group_by(YEAR, MONTH_NUM) %>%
  summarise(across(ends_with("Indicator"), sum)) %>%
  ungroup() %>%
  mutate(across(ends_with("Indicator"), list(cs = cumsum)))
# YEAR MONTH_NUM Sys_Indicator lbl_Indicator Pk_Indicator Sys_Indicator_cs lbl_Indicator_cs Pk_Indicator_cs
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2018 2 1 1 1 1 1 1
# 2 2018 3 0 0 0 1 1 1
# 3 2018 4 0 1 0 1 2 1
# 4 2018 5 4 4 1 5 6 2
# 5 2018 7 1 2 1 6 8 3
# 6 2018 8 1 2 2 7 10 5
# 7 2019 1 1 1 1 8 11 6
# 8 2019 4 1 1 1 9 12 7
# 9 2019 8 0 0 0 9 12 7
#10 2019 10 0 1 1 9 13 8
#11 2019 11 0 0 0 9 13 8
#12 2019 12 0 0 0 9 13 8
#13 NA NA 1 0 1 10 13 9
I think I understand what you want. Here is a data.table approach.
library(data.table)
# In-place (:=) addition of one "<name>_cumsum" column per *_Indicator
# column. LHS: the matched indicator names with "_cumsum" appended.
# RHS: cumsum() of each matching column of .SD (here .SD is the whole
# table, since there is no `by`), selected by the same grep() indices.
setDT(df_test)[ ,sapply(names(df_test)[grep("Indicator",names(df_test))],paste0,"_cumsum") := lapply(.SD[,grep("Indicator",names(df_test))],cumsum)]
df_test
MONTH_NUM YEAR Sys_Indicator lbl_Indicator Pk_Indicator Sys_Indicator_cumsum lbl_Indicator_cumsum Pk_Indicator_cumsum
1: 7 2018 1 1 1 1 1 1
2: 7 2018 0 1 0 1 2 1
3: 8 2018 0 1 1 1 3 2
4: 8 2018 1 1 1 2 4 3
5: 8 2019 0 0 0 2 4 3
6: 10 2019 0 1 1 2 5 4
7: 11 2019 0 0 0 2 5 4
8: 12 2019 0 0 0 2 5 4
9: 1 2019 1 1 1 3 6 5
10: 2 2018 1 1 1 4 7 6
11: 3 2018 0 0 0 4 7 6
12: 4 2019 1 1 1 5 8 7
13: 4 2018 0 1 0 5 9 7
14: 5 2018 1 1 0 6 10 7
15: 5 2018 1 1 0 7 11 7
16: 5 2018 1 1 0 8 12 7
17: 5 2018 1 1 1 9 13 8
18: NA NA 1 0 1 10 13 9

Summing and discarding grouped variables

I have a dataframe of this form
familyid memberid year contract months
1 1 2000 1 12
1 1 2001 1 12
1 1 2002 1 12
1 1 2003 1 12
2 3 2000 2 12
2 3 2001 2 12
2 3 2002 2 12
2 3 2003 2 12
3 2 2000 1 5
3 2 2000 2 5
3 2 2001 1 12
3 2 2002 1 12
3 2 2003 1 12
4 1 2000 2 12
4 1 2001 2 12
4 1 2002 2 12
4 1 2003 2 12
5 2 2000 1 8
5 2 2001 1 12
5 2 2002 1 12
5 2 2003 1 4
5 2 2003 1 6
I want back a dataframe like
familyid memberid year contract months
1 1 2000 1 12
1 1 2001 1 12
1 1 2002 1 12
1 1 2003 1 12
2 3 2000 2 12
2 3 2001 2 12
2 3 2002 2 12
2 3 2003 2 12
4 1 2000 2 12
4 1 2001 2 12
4 1 2002 2 12
4 1 2003 2 12
5 2 2000 1 8
5 2 2001 1 12
5 2 2002 1 12
**5 2 2003 1 10**
Basically I want to sum the variable months if the same familyid shows the same value for the variable "contract" (in my example I am summing 6 and 4 for familyid=5 in year=2003). However, I also want to discard familyids which show, during the same year, two different values for the variable contract (in my case I am discarding familyid=3 since it shows contract=1 and contract=2 in year=2000). For the other observations I want to keep things unchanged.
Does anybody know how to do this?
Thanks to anyone helping me.
Marco
You mentioned that you wanted to get the total months within one family's single contract in one year, but also to remove the families entirely with more than one contract in a year. Here's one approach:
library(dplyr)
# Sum months within each (familyid, memberid, year, contract). Note that
# after summarize() the result is still grouped by the first three of
# those variables, which is exactly why n() below counts the number of
# distinct contracts a family had in a given year.
df2 <- df %>%
group_by(familyid, memberid, year, contract) %>%
summarize(months = sum(months, na.rm = T)) %>%
# We need this to answer the second part. How many contracts did this family have this year?
mutate(contracts_this_yr = n()) %>%
ungroup() %>%
# Only include the families with no years of multiple contracts
group_by(familyid, memberid) %>%
filter(max(contracts_this_yr) < 2) %>%
ungroup()
Output
df2
# A tibble: 16 x 5
familyid memberid year contract months
<int> <int> <int> <int> <int>
1 1 1 2000 1 12
2 1 1 2001 1 12
3 1 1 2002 1 12
4 1 1 2003 1 12
5 2 3 2000 2 12
6 2 3 2001 2 12
7 2 3 2002 2 12
8 2 3 2003 2 12
9 4 1 2000 2 12
10 4 1 2001 2 12
11 4 1 2002 2 12
12 4 1 2003 2 12
13 5 2 2000 1 8
14 5 2 2001 1 12
15 5 2 2002 1 12
16 5 2 2003 1 10

apply lag or lead in increasing order for the dataframe

# NOTE(review): hard-coded absolute Windows path — assumes the CSV lives at
# this exact location; a relative path or file.path() would be portable.
df1 <- read.csv("C:/Users/uni/DS-project/df1.csv")
df1
year value
1 2000 1
2 2001 2
3 2002 3
4 2003 4
5 2004 5
6 2000 1
7 2001 2
8 2002 3
9 2003 4
10 2004 5
11 2000 1
12 2001 2
13 2002 3
14 2003 4
15 2004 5
16 2000 1
17 2001 2
18 2002 3
19 2003 4
20 2004 5
i want to apply lead so i can get the output in the below fashion.
We have a set of 5 yearly observations (2000-2004) repeated n times. In the output, the first block keeps all five years; for the second block we remove 2000 and its value; for the third block we remove 2000 and 2001; and so on, dropping one additional leading year from each successive block.
so that we can get the below output in below manner.
output:
year value
2000 1
2001 2
2002 3
2003 4
2004 5
2001 2
2002 3
2003 4
2004 5
2002 3
2003 4
2004 5
2003 4
2004 5
please help.
Just for fun, adding a vectorized solution using matrix sub-setting
# Vectorized solution via matrix subsetting: build a 5-row logical matrix
# (one column per 5-year block), blank out the upper triangle so column j
# loses its first j-1 entries, and use it as a logical row index.
# Uses df1 — the data frame defined in the question (the original answer
# referenced an undefined `df`).
m <- matrix(rep(TRUE, nrow(df1)), 5)
m[upper.tri(m)] <- FALSE
df1[m, ]
# year value
# 1 2000 1
# 2 2001 2
# 3 2002 3
# 4 2003 4
# 5 2004 5
# 7 2001 2
# 8 2002 3
# 9 2003 4
# 10 2004 5
# 13 2002 3
# 14 2003 4
# 15 2004 5
# 19 2003 4
# 20 2004 5
Below grp is 1 for each row of the first group, 2 for the second and so on. Seq is 1, 2, 3, ... for the successive rows of each grp. Now just pick out those rows for which Seq is at least as large as grp. This has the effect of removing the first i-1 rows from the ith group for i = 1, 2, ... .
# grp: 1 for the first block of rows, 2 for the second, and so on -- a new
# block starts at every row where year == 2000.
grp <- cumsum(df1$year == 2000)
# Seq: 1, 2, 3, ... restarting at each new value of grp.
Seq <- ave(grp, grp, FUN = seq_along)
# Keep a row only when its within-block position is at least the block
# number, i.e. drop the first i - 1 rows of block i.
subset(df1, Seq >= grp)
We could alternately write this in the less general form:
subset(df1, 1:5 >= rep(1:4, each = 5))
In any case the output from either subset statement is:
year value
1 2000 1
2 2001 2
3 2002 3
4 2003 4
5 2004 5
7 2001 2
8 2002 3
9 2003 4
10 2004 5
13 2002 3
14 2003 4
15 2004 5
19 2003 4
20 2004 5
library(dplyr)
# Group on a running counter g that increments at every year == 2000, then
# keep rows whose within-group position is >= the group number and drop
# the helper column. Uses df1 — the data frame defined in the question
# (the original answer referenced an undefined `df`).
df1 %>%
  group_by(g = cumsum(year == 2000)) %>%
  filter(row_number() >= g) %>%
  ungroup %>%
  select(-g)
# # A tibble: 14 x 2
# year value
# <int> <int>
# 1 2000 1
# 2 2001 2
# 3 2002 3
# 4 2003 4
# 5 2004 5
# 6 2001 2
# 7 2002 3
# 8 2003 4
# 9 2004 5
# 10 2002 3
# 11 2003 4
# 12 2004 5
# 13 2003 4
# 14 2004 5
Using lapply():
# Remove the unwanted rows by index: for block x (counting from the second
# block), drop its first x rows, i.e. indices seq_len(x) + 5 * x.
# seq_len(x) replaces the redundant seq(1:x), and df1 is the question's
# data frame (the original answer referenced an undefined `df`).
to <- nrow(df1) / 5 - 1
df1[-unlist(lapply(seq_len(to), function(x) seq_len(x) + 5 * x)), ]
year value
1 2000 1
2 2001 2
3 2002 3
4 2003 4
5 2004 5
7 2001 2
8 2002 3
9 2003 4
10 2004 5
13 2002 3
14 2003 4
15 2004 5
19 2003 4
20 2004 5
Where unlist(lapply(1:to, function(x) seq(1:x) + 5*x)) are the indices to skip:
[1] 6 11 12 16 17 18
Using sequence:
df[5-rev(sequence(2:5)-1),]
# year value
# 1 2000 1
# 2 2001 2
# 3 2002 3
# 4 2003 4
# 5 2004 5
# 2.1 2001 2
# 3.1 2002 3
# 4.1 2003 4
# 5.1 2004 5
# 3.2 2002 3
# 4.2 2003 4
# 5.2 2004 5
# 4.3 2003 4
# 5.3 2004 5
how it works:
5-rev(sequence(2:5)-1)
# [1] 1 2 3 4 5 2 3 4 5 3 4 5 4 5
rev(sequence(2:5)-1)
# [1] 4 3 2 1 0 3 2 1 0 2 1 0 1 0
sequence(2:5)-1
# [1] 0 1 0 1 2 0 1 2 3 0 1 2 3 4
sequence(2:5)
# [1] 1 2 1 2 3 1 2 3 4 1 2 3 4 5

reshape a dataframe R

I am facing a reshaping problem with a dataframe. It has many more rows and columns. Simplified, it structure looks like this:
rownames year x1 x2 x3
a 2000 2 6 11
b 2000 0 4 2
c 2000 0 3 5
a 2010 2 6 11
b 2010 0 0 0
c 2020 4 1 8
a 2020 10 1 7
b 2020 8 4 10
c 2020 22 1 16
I would like to come out with a dataframe that has one single row for the variable "year", copy the x1, x2, x3 values in subsequent columns, and rename the columns with a combination between the rowname and the x-variable. It should look like this:
year a_x1 a_x2 a_x3 b_x1 b_x2 b_x3 c_x1 c_x2 c_x3
2000 2 6 11 0 4 2 0 3 5
2010 2 6 11 0 0 0 4 1 8
2020 10 1 7 8 4 10 22 1 16
I thought to use subsequent cbind() calls, but since I have to do this for thousands of rows and hundreds of columns, I hope there is a more direct way with the reshape package (with which I am not yet so familiar).
Thanks in advance!
First, I hope that rownames is a data.frame column and not the data.frame's rownames. Otherwise you'll encounter problems due to the non-uniqueness of the values.
I think your main problem is, that your data.frame is not entirely molten:
library(reshape2)
# Melt to fully long form first: one row per (year, rownames, variable)
# with the x1/x2/x3 values stacked into a single `value` column.
# NOTE(review): `dt` is assumed to hold the question's data frame.
dt <- melt( dt, id.vars=c("year", "rownames") )
head(dt)
year rownames variable value
1 2000 a x1 2
2 2000 b x1 0
3 2000 c x1 0
4 2010 a x1 2
...
dcast( dt, year ~ rownames + variable )
year a_x1 a_x2 a_x3 b_x1 b_x2 b_x3 c_x1 c_x2 c_x3
1 2000 2 6 11 0 4 2 0 3 5
2 2010 2 6 11 0 0 0 4 1 8
3 2020 10 1 7 8 4 10 22 1 16
EDIT:
As #spdickson points out, there is also an error in your data preventing a simple aggregation: combinations of year and rowname have to be unique. Otherwise you need an aggregation function which determines the resulting value for each non-unique combination. So we assume that row 6 in your data should read c 2010 4 1 8.
You can try using reshape() from base R without having to melt your dataframe further:
# Rebuild the example data inline (row 6 corrected to year 2010 so every
# year/rownames combination is unique).
df1 <- read.table(text = "rownames year x1 x2 x3
a 2000 2 6 11
b 2000 0 4 2
c 2000 0 3 5
a 2010 2 6 11
b 2010 0 0 0
c 2010 4 1 8
a 2020 10 1 7
b 2020 8 4 10
c 2020 22 1 16", header = TRUE, as.is = TRUE)
# Base R reshape: wide format keyed by year, spreading each rownames value
# across the x1/x2/x3 columns (names like x1.a, x2.a, ...).
reshape(df1, direction = "wide", idvar = "year", timevar = "rownames")
# year x1.a x2.a x3.a x1.b x2.b x3.b x1.c x2.c x3.c
# 1 2000 2 6 11 0 4 2 0 3 5
# 4 2010 2 6 11 0 0 0 4 1 8
# 7 2020 10 1 7 8 4 10 22 1 16

Resources