Calculating cumulative sum for multiple columns in R

R newbie here. I'm trying to calculate cumulative sums grouped by year, month, group and subgroup, with multiple value columns to calculate.
Sample of the data:
df <- data.frame("Year" = 2020,
                 "Month" = c("Jan","Jan","Jan","Jan","Feb","Feb","Feb","Feb"),
                 "Group" = c("A","A","A","B","A","B","B","B"),
                 "SubGroup" = c("a","a","b","b","a","b","a","b"),
                 "V1" = c(10,10,20,20,50,50,10,10),
                 "V2" = c(0,1,2,2,0,5,1,1))
Year Month Group SubGroup V1 V2
1 2020 Jan A a 10 0
2 2020 Jan A a 10 1
3 2020 Jan A b 20 2
4 2020 Jan B b 20 2
5 2020 Feb A a 50 0
6 2020 Feb B b 50 5
7 2020 Feb B a 10 1
8 2020 Feb B b 10 1
Resulting Table wanted:
Year Month Group SubGroup V1 V2
1 2020 Jan A a 20 1
2 2020 Feb A a 70 1
3 2020 Jan A b 20 2
4 2020 Feb A b 20 2
5 2020 Jan B a 0 0
6 2020 Feb B a 10 1
7 2020 Jan B b 20 2
8 2020 Feb B b 80 8
From the sample table, in Jan 2020 the sum for Group 'A', SubGroup 'a' was 10 + 10 = 20. In Feb 2020 the value was 50, so the cumulative total is 20 from Jan + 50 = 70, and so on.
If a combination has no value for a month, it should count as 0.
I've tried a few approaches but none got even close to the output I need. I would really appreciate some tips for this problem.
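In other words, the expected values for Group 'A', SubGroup 'a' are just the cumulative sum of the monthly totals, which you can check by hand:
# expected cumulative values for Group A / SubGroup a
cumsum(c(Jan = 10 + 10, Feb = 50))
#> Jan Feb 
#>  20  70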

This is a simple group_by/mutate problem. The columns V1 and V2 are selected with across, and cumsum is applied to them.
df$Month <- factor(df$Month, levels = c("Jan", "Feb"))
df %>%
  group_by(Year, Group, SubGroup) %>%
  mutate(across(V1:V2, ~cumsum(.x))) %>%
  ungroup() %>%
  arrange(Year, Group, SubGroup, Month)
## A tibble: 8 x 6
# Year Month Group SubGroup V1 V2
# <chr> <fct> <chr> <chr> <dbl> <dbl>
#1 2020 Jan A a 10 0
#2 2020 Jan A a 20 1
#3 2020 Feb A a 70 1
#4 2020 Jan A b 20 2
#5 2020 Feb B a 10 1
#6 2020 Jan B b 20 2
#7 2020 Feb B b 70 7
#8 2020 Feb B b 80 8

If I understand what you are doing, you're taking the sum for each month, then taking the cumulative sums across the months. This is usually pretty easy in dplyr.
library(dplyr)
df %>%
  group_by(Year, Month, Group, SubGroup) %>%
  summarize(
    V1_sum = sum(V1),
    V2_sum = sum(V2)
  ) %>%
  group_by(Year, Group, SubGroup) %>%
  mutate(
    V1_cumsum = cumsum(V1_sum),
    V2_cumsum = cumsum(V2_sum)
  )
# A tibble: 6 x 8
# Groups: Year, Group, SubGroup [4]
# Year Month Group SubGroup V1_sum V2_sum V1_cumsum V2_cumsum
# <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 2020 Feb A a 50 0 50 0
# 2 2020 Feb B a 10 1 10 1
# 3 2020 Feb B b 60 6 60 6
# 4 2020 Jan A a 20 1 70 1
# 5 2020 Jan A b 20 2 20 2
# 6 2020 Jan B b 20 2 80 8
But you'll notice that the monthly cumulative sums run backwards (i.e. January comes after February), because the character Month column is sorted alphabetically when the groups are ordered. Also, you don't see the empty combinations, because dplyr doesn't fill them in.
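You can see the ordering problem directly by comparing a character vector with a factor:
# character months sort alphabetically, so "Feb" comes before "Jan"
sort(c("Jan", "Feb"))
#> [1] "Feb" "Jan"
# a factor with explicit levels keeps calendar order
sort(factor(c("Jan", "Feb"), levels = c("Jan", "Feb")))
#> [1] Jan Feb
#> Levels: Jan Feb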
To fix the order of the months, you can either make your months numeric (convert them to dates) or turn them into factors. To add back the 'missing' combinations of the grouping variables, use aggregate from base R with drop = FALSE instead of dplyr::summarize: aggregate then includes all combinations of the grouping factors, with the missing cells set to NA, which you can replace with 0 using tidyr::replace_na, for example.
library(dplyr)
library(tidyr)
df <- data.frame("Year" = 2020,
                 "Month" = c("Jan","Jan","Jan","Jan","Feb","Feb","Feb","Feb"),
                 "Group" = c("A","A","A","B","A","B","B","B"),
                 "SubGroup" = c("a","a","b","b","a","b","a","b"),
                 "V1" = c(10,10,20,20,50,50,10,10),
                 "V2" = c(0,1,2,2,0,5,1,1))
df$Month <- factor(df$Month, levels = c("Jan", "Feb"), ordered = TRUE)
# Get monthly sums
df1 <- with(df, aggregate(
  list(V1_sum = V1, V2_sum = V2),
  list(Year = Year, Month = Month, Group = Group, SubGroup = SubGroup),
  FUN = sum, drop = FALSE
))
df1 <- df1 %>%
  # Replace NA with 0
  mutate(
    V1_sum = replace_na(V1_sum, 0),
    V2_sum = replace_na(V2_sum, 0)
  ) %>%
  # Get cumulative sum across months
  group_by(Year, Group, SubGroup) %>%
  mutate(V1cumsum = cumsum(V1_sum),
         V2cumsum = cumsum(V2_sum)) %>%
  ungroup() %>%
  select(Year, Month, Group, SubGroup, V1 = V1cumsum, V2 = V2cumsum)
This gives the same result as your example:
# # A tibble: 8 x 6
# Year Month Group SubGroup V1 V2
# <dbl> <ord> <chr> <chr> <dbl> <dbl>
# 1 2020 Jan A a 20 1
# 2 2020 Feb A a 70 1
# 3 2020 Jan B a 0 0
# 4 2020 Feb B a 10 1
# 5 2020 Jan A b 20 2
# 6 2020 Feb A b 20 2
# 7 2020 Jan B b 20 2
# 8 2020 Feb B b 80 8
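If you prefer to stay entirely in dplyr/tidyr, tidyr::complete() can add the missing Month/Group/SubGroup combinations instead of aggregate(..., drop = FALSE); a sketch along the same lines, which should give the same table:
df %>%
  mutate(Month = factor(Month, levels = c("Jan", "Feb"))) %>%
  group_by(Year, Month, Group, SubGroup) %>%
  summarise(across(V1:V2, sum), .groups = "drop") %>%
  # add the missing Year/Month/Group/SubGroup combinations, filled with 0
  complete(Year, Month, Group, SubGroup, fill = list(V1 = 0, V2 = 0)) %>%
  arrange(Year, Group, SubGroup, Month) %>%
  group_by(Year, Group, SubGroup) %>%
  mutate(across(V1:V2, cumsum)) %>%
  ungroup()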

library(dplyr)
library(zoo)
df %>%
  arrange(as.yearmon(paste0(Year, '-', Month), '%Y-%b'), Group, SubGroup) %>%
  group_by(Year, Group, SubGroup) %>%
  mutate(
    V1 = cumsum(V1),
    V2 = cumsum(V2)
  ) %>%
  arrange(Year, Group, SubGroup, as.yearmon(paste0(Year, '-', Month), '%Y-%b')) # for desired output ordering
# A tibble: 8 x 6
# Groups: Year, Group, SubGroup [4]
# Year Month Group SubGroup V1 V2
# <chr> <chr> <chr> <chr> <dbl> <dbl>
# 1 2020 Jan A a 10 0
# 2 2020 Jan A a 20 1
# 3 2020 Feb A a 70 1
# 4 2020 Jan A b 20 2
# 5 2020 Feb B a 10 1
# 6 2020 Jan B b 20 2
# 7 2020 Feb B b 70 7
# 8 2020 Feb B b 80 8

Related

Reformat Dataframe to start a new row at a certain column

I have a dataframe that looks like this:
ID x.2019 x.2020
1 10 20
2 20 30
3 30 40
4 40 50
5 50 60
and I would like to reformat it to look like this:
ID time x
1 2019 10
1 2020 20
2 2019 20
2 2020 30
3 2019 30
3 2020 40
4 2019 40
4 2020 50
5 2019 50
5 2020 60
Any idea how to achieve this?
This is a rather simple task which you can probably find covered in other answers. Still, you can achieve what you want with data.table as follows:
library(data.table)
df = data.table( ID = 1:5,
x.2019 = seq(10, 50, by = 10),
x.2020 = seq(20, 60, by = 10)
)
# change column names conveniently
setnames(df, c("x.2019", "x.2020"), c("2019", "2020"))
# transform the dataset from wide to long format
out = melt(df, id.vars = "ID", variable.name = "time", value.name = "x", variable.factor = FALSE)
# cast time to integer
out[ , time := as.integer(time)]
# reorder by ID
setorder(out, ID)
out
#> ID time x
#> 1: 1 2019 10
#> 2: 1 2020 20
#> 3: 2 2019 20
#> 4: 2 2020 30
#> 5: 3 2019 30
#> 6: 3 2020 40
#> 7: 4 2019 40
#> 8: 4 2020 50
#> 9: 5 2019 50
#> 10: 5 2020 60
Created on 2022-01-20 by the reprex package (v2.0.1)
You can use pivot_longer:
library(dplyr)
library(tidyr)
df = data.frame(ID=1:5,
x.2019=c(10, 20, 30, 40, 50),
x.2020=c(20, 30, 40, 50, 60))
df %>%
pivot_longer(cols = c(2, 3), names_to = 'time', values_to = 'x') %>%
mutate(time = as.integer(stringr::str_replace(time, 'x.', '')))
Result:
# A tibble: 10 x 3
ID time x
<int> <int> <dbl>
1 1 2019 10
2 1 2020 20
3 2 2019 20
4 2 2020 30
5 3 2019 30
6 3 2020 40
7 4 2019 40
8 4 2020 50
9 5 2019 50
10 5 2020 60
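The prefix handling can also be done inside pivot_longer() itself via names_prefix and names_transform (assuming tidyr >= 1.1), which avoids the separate str_replace step; a variant of the same call:
df %>%
  pivot_longer(cols = starts_with("x."),
               names_to = "time",
               names_prefix = "x\\.",                     # strip the "x." prefix from the column names
               names_transform = list(time = as.integer), # cast the year to integer
               values_to = "x")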

How to calculate Quarter Over Quarter %change when the dataset is monthly

I have this df, whose observations are monthly:
library(dplyr)
library(lubridate)
Date <- seq(from = as_date("2019-11-01"), to = as_date("2020-10-01"), by = "month")
A <- (10:21)
df <- data.frame(Date, A)
view(df)
Date A
<date> <int>
1 2019-11-01 10
2 2019-12-01 11
3 2020-01-01 12
4 2020-02-01 13
5 2020-03-01 14
6 2020-04-01 15
7 2020-05-01 16
8 2020-06-01 17
9 2020-07-01 18
10 2020-08-01 19
11 2020-09-01 20
12 2020-10-01 21
Using lag() I know how to calculate the % change month over month (MoM), but I haven't been able to compare a quarter with the previous quarter, i.e. the sum of 3 months compared with the sum of the previous 3 months. I tried a loop approach but it didn't work, and there should be a more efficient approach.
I appreciate it if someone can help.
We can use as.yearqtr from zoo to convert the 'Date' column to quarters, do a grouped sum, and then take the difference between the current and next quarter (lead) or the current and previous quarter (lag).
library(dplyr)
library(zoo)
df %>%
group_by(Quarter = as.yearqtr(Date)) %>%
summarise(A = sum(A), .groups = 'drop') %>%
mutate(Diff = lead(A) - A)
-output
# A tibble: 5 x 3
# Quarter A Diff
# <yearqtr> <int> <int>
#1 2019 Q4 21 18
#2 2020 Q1 39 9
#3 2020 Q2 48 9
#4 2020 Q3 57 -36
#5 2020 Q4 21 NA
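Since the question asks for a % change rather than a raw difference, the same quarterly summary can be extended with lag(); a minimal sketch (the pct_change column name is just illustrative):
df %>%
  group_by(Quarter = as.yearqtr(Date)) %>%
  summarise(A = sum(A), .groups = 'drop') %>%
  mutate(pct_change = (A - lag(A)) / lag(A) * 100)  # % change vs the previous quarter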

Group by and summarise with condition

I have a data frame df. After group_by(id, Year, Month, new_used_ind) and summarise(n = n()) it looks like this:
id Year Month new_used_ind n
1 2001 apr N 3
1 2001 apr U 2
2 2002 mar N 5
3 2003 mar U 3
4 2004 july N 4
4 2004 july U 2
I want the total of n for each id, year and month, but I also want a total of the 'N' rows from new_used_ind in a new column.
Something like this
id Year Month Total_New total
1 2001 apr 3 5
2 2002 mar 5 8
4 2004 july 4 6
library(dplyr)
read.table(text= "id Year Month new_used_ind n
1 2001 apr N 3
1 2001 apr U 2
2 2002 mar N 5
3 2003 mar U 3
4 2004 july N 4
4 2004 july U 2", header = T) -> df
df %>%
group_by(id, Year, Month) %>%
mutate(total_New=sum(n*(new_used_ind=="N"))) %>%
mutate(total_n=sum(n)) %>%
summarise_at(c("total_New", "total_n"), mean)
#> # A tibble: 4 x 5
#> # Groups: id, Year [4]
#> id Year Month total_New total_n
#> <int> <int> <fct> <dbl> <dbl>
#> 1 1 2001 apr 3 5
#> 2 2 2002 mar 5 5
#> 3 3 2003 mar 0 3
#> 4 4 2004 july 4 6
Created on 2019-06-11 by the reprex package (v0.3.0)
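A more direct variant computes both totals in a single summarise() by subsetting n for the 'N' rows; a sketch that should give the same totals as above (the filter keeps only groups that contain at least one 'N' row, as in the desired output):
df %>%
  group_by(id, Year, Month) %>%
  summarise(Total_New = sum(n[new_used_ind == "N"]),
            total = sum(n),
            .groups = "drop") %>%
  filter(Total_New > 0)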

how do I identify rows where an element appears for the first time?

I have the following data frame of student records. What I want is to identify students who joined a certain program for the first time in 2014, when they were in 9th grade.
names.first<-c('a','a','b','b','c','d')
names.last<-c('c','c','z','z','f','h')
year<-c(2014,2013,2014,2015,2015,2014)
grade<-c(9,8,9,10,10,10)
df<-data.frame(names.first,names.last,year,grade)
df
To do this, I have used the following statement to flag students where the program year == 2014 and their grade == 9.
df$first.cohort<-ifelse(df$year==2014 & df$grade==9,1,0)
df
names.first names.last year grade first.cohort
1 a c 2014 9 1
2 a c 2013 8 0
3 b z 2014 9 1
4 b z 2015 10 0
5 c f 2015 10 0
6 d h 2014 10 0
However, as you can see, this also includes students who didn't enter the program in 2014, such as student a, who started in 2013. How do I create an ifelse statement that only captures students who are in 9th grade and started the program in 2014 for the first time, so that the df looks like this:
names.first names.last year grade first.cohort
1 a c 2014 9 0
2 a c 2013 8 0
3 b z 2014 9 1
4 b z 2015 10 0
5 c f 2015 10 0
6 d h 2014 10 0
We can use first after arranging by 'names' and 'year' to create the logical expression
library(dplyr)
df %>%
arrange(names, year) %>%
group_by(names) %>%
mutate(first.cohort = as.integer(grade == 9 & first(year) == 2014))
# A tibble: 6 x 4
# Groups: names [4]
# names year grade first.cohort
# <fct> <dbl> <dbl> <int>
#1 a 2013 8 0
#2 a 2014 9 0
#3 b 2014 9 1
#4 b 2015 10 0
#5 c 2015 10 0
#6 d 2014 10 0
To keep the same order as in the input dataset, we can create a row-sequence column first and then arrange on that column after the mutate
df %>%
mutate(rn = row_number()) %>%
arrange(names, year) %>%
group_by(names) %>%
mutate(first.cohort = as.integer(grade == 9 & first(year) == 2014)) %>%
ungroup %>%
arrange(rn) %>%
select(-rn)
Or use the same logic with data.table, which has the additional advantage of keeping the same order as in the input dataset
library(data.table)
setDT(df)[order(names, year), first.cohort := as.integer(grade == 9 &
first(year) == 2014), names]
Update
With the new example in the OP's post, we group by both of the 'names' columns
df %>%
arrange(names.first, names.last, year) %>%
group_by(names.first, names.last) %>%
mutate(first.cohort = as.integer(grade == 9 & first(year) == 2014))
# A tibble: 6 x 5
# Groups: names.first, names.last [4]
# names.first names.last year grade first.cohort
# <fct> <fct> <dbl> <dbl> <int>
#1 a c 2013 8 0
#2 a c 2014 9 0
#3 b z 2014 9 1
#4 b z 2015 10 0
#5 c f 2015 10 0
#6 d h 2014 10 0
Using dplyr
library(dplyr)
df %>%
  group_by(names) %>%
  dplyr::mutate(Fc = as.numeric((year == 2014 & grade == 9) & (min(year) == 2014)))
# A tibble: 6 x 4
# Groups: names [4]
names year grade Fc
<fctr> <dbl> <dbl> <dbl>
1 a 2014 9 0
2 a 2013 8 0
3 b 2014 9 1
4 b 2015 10 0
5 c 2015 10 0
6 d 2014 10 0

Data standardization for all groups in a data.frame in R

I have a dataset as below
Date <- rep(c("Jan", "Feb"), 3)[1:5]
Group <- c(rep(letters[1:2],each=2),"c")
value <- sample(1:10,5)
data <- data.frame(Date, Group, value)
> data
Date Group value
1 Jan a 2
2 Feb a 7
3 Jan b 3
4 Feb b 9
5 Jan c 1
As you can observe, group c has no data for Date = Feb.
How can I make a dataset such that
> DATA
Date Group value
1 Jan a 2
2 Feb a 7
3 Jan b 3
4 Feb b 9
5 Jan c 1
6 Feb c 0
I have added a last row so that the value for group c in Feb is 0.
Thanks
With base R you can use xtabs wrapped in as.data.frame:
as.data.frame(xtabs(formula = value ~ Date + Group, data = data))
# Date Group Freq
#1 Feb a 8
#2 Jan a 6
#3 Feb b 4
#4 Jan b 1
#5 Feb c 0
#6 Jan c 10
Using merge:
#get all combinations of 2 columns
all.comb <- expand.grid(unique(data$Date),unique(data$Group))
colnames(all.comb) <- c("Date","Group")
#merge with all.x=TRUE to keep nonmatched rows
res <- merge(all.comb,data,all.x=TRUE)
#convert NA to 0
res$value[is.na(res$value)] <- 0
#result
res
# Date Group value
# 1 Feb a 3
# 2 Feb b 4
# 3 Feb c 0
# 4 Jan a 5
# 5 Jan b 7
# 6 Jan c 10
Using reshape2
library(reshape2)
melt(dcast(data, Date~Group, value.var="value",fill=0), id.var="Date") #values differ as there was no set.seed()
# Date variable value
#1 Feb a 1
#2 Jan a 10
#3 Feb b 7
#4 Jan b 4
#5 Feb c 0
#6 Jan c 5
Or using dplyr
library(dplyr)
library(tidyr)
data%>%
spread(Group, value, fill=0) %>%
gather(Group, value, a:c)
# Date Group value
#1 Feb a 1
#2 Jan a 10
#3 Feb b 7
#4 Jan b 4
#5 Feb c 0
#6 Jan c 5
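With tidyr already loaded, complete() can also add the missing Date/Group combination in a single step; a sketch (the values again depend on the sample() draw, since there is no set.seed()):
data %>%
  complete(Date, Group, fill = list(value = 0))  # fill the missing Feb/c row with 0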
