Aggregate rows with specific shared value - r

I want to aggregate my data as follows:
Aggregate only for successive rows where status = 0
Keep age and sum up points
Example data:
da <- data.frame(userid = c(1,1,1,1,2,2,2,2), status = c(0,0,0,1,1,1,0,0), age = c(10,10,10,11,15,16,16,16), points = c(2,2,2,6,3,5,5,5))
da
userid status age points
1 1 0 10 2
2 1 0 10 2
3 1 0 10 2
4 1 1 11 6
5 2 1 15 3
6 2 1 16 5
7 2 0 16 5
8 2 0 16 5
I would like to have:
da2
userid status age points
1 1 0 10 6
2 1 1 11 6
3 2 1 15 3
4 2 1 16 5
5 2 0 16 10

# Build a helper key `grp`, then sum `points` within each distinct combination
# of all remaining columns (userid, status, age, grp).
da %>%
# rle() numbers each run of equal `status` values; adding cumsum(status != 0)
# bumps the key on every non-zero row, so each non-zero row gets its own key
# while consecutive zero rows keep sharing one.
mutate(grp = with(rle(status),
rep(seq_along(values), lengths)) + cumsum(status != 0)) %>%
# Group by every column except `points`.
group_by_at(vars(-points)) %>%
summarise(points = sum(points)) %>%
ungroup() %>%
# Drop the helper key from the result.
select(-grp)
## A tibble: 5 x 4
# userid status age points
# <dbl> <dbl> <dbl> <dbl>
#1 1 0 10 6
#2 1 1 11 6
#3 2 0 16 10
#4 2 1 15 3
#5 2 1 16 5

You can use group_by from dplyr:
# Group by user, by the running sum of `status`, and by `status` itself, then
# keep the max age and total points per group.
# The running sum increases on every row where status is 1, so (assuming
# status only takes the values 0/1, as in the example data) each status-1 row
# forms its own group and each run of consecutive zeros shares one group.
# The pipe must end the line: a line that starts with `%>%` is a parse error.
# `da$...` is kept inside group_by() so the column headers match the printed
# output below.
da %>%
  group_by(da$userid, cumsum(da$status), da$status) %>%
  summarise(age = max(age), points = sum(points))
Output:
`da$userid` `cumsum(da$status)` `da$status` age points
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 0 10 6
2 1 1 1 11 6
3 2 2 1 15 3
4 2 3 0 16 10
5 2 3 1 16 5

Exactly the same idea as above :
library(dplyr)

# Sum `points` per (userid, age, status), handling the status == 0 rows and
# the status != 0 rows separately, then stack the two results.
# The input is the example data frame `da`; the bare name `data` is not defined
# in this example and would resolve to the `utils::data` function.
# NOTE(review): grouping on userid/age/status merges all rows sharing those
# values, not only successive ones -- confirm this is acceptable for real data.
data1 <- da %>%
  filter(status == 0) %>%
  group_by(userid, age, status) %>%
  summarise(points = sum(points), .groups = "drop")

data2 <- da %>%
  filter(status != 0) %>%
  group_by(userid, age, status) %>%
  summarise(points = sum(points), .groups = "drop")

# Stack the zero-status aggregates on top of the non-zero ones.
da2 <- bind_rows(data1, data2)
We need to be more careful with your specification of status equal to 0. I think the code of Quang Hoang works only for your specific example.
I hope it will help.

Related

How to count groupings of elements in base R or dplyr using multiple conditions?

I am trying to count the number of elements by groupings, subject to the condition that each grouping code ("Group") is > 0. Suppose we start with the below output DF generated via the code immediately beneath:
Element Group reSeq
<chr> <dbl> <int>
1 R 0 1
2 R 0 1
3 X 0 1
4 X 1 2
5 X 1 2
6 X 0 1
7 X 0 1
8 X 0 1
9 B 0 1
10 R 0 1
11 R 2 2
12 R 2 2
13 X 3 3
14 X 3 3
15 X 3 3
library(dplyr)
myDF <- data.frame(
Element = c("R","R","X","X","X","X","X","X","B","R","R","R","X","X","X"),
Group = c(0,0,0,1,1,0,0,0,0,0,2,2,3,3,3)
)
myDF %>% group_by(Element) %>% mutate(reSeq = match(Group, unique(Group)))
Instead, I would like the reSeq column to calculate and output as shown below with explanations to the right:
Element Group reSeq reSeq explanation
<chr> <dbl> <int>
1 R 0 1 1st instance of R (ungrouped)(Group = 0 means not grouped)
2 R 0 2 2nd instance of R (ungrouped)(Group = 0 means not grouped)
3 X 0 1 1st instance of X (ungrouped)(Group = 0 means not grouped)
4 X 1 2 2nd instance of X (grouped by Group = 1)
5 X 1 2 2nd instance of X (grouped by Group = 1)
6 X 0 3 3rd instance of X (ungrouped)
7 X 0 4 4th instance of X (ungrouped)
8 X 0 5 5th instance of X (ungrouped)
9 B 0 1 1st instance of B (ungrouped)
10 R 0 3 3rd instance of R (ungrouped)
11 R 2 4 4th instance of R (grouped by Group = 2)
12 R 2 4 4th instance of R (grouped by Group = 2)
13 X 3 6 6th instance of X (grouped by Group = 3)
14 X 3 6 6th instance of X (grouped by Group = 3)
15 X 3 6 6th instance of X (grouped by Group = 3)
Any recommendations for doing this? If possible, starting with the dplyr code I use above because I am fairly familiar with it.
If we use rowid from data.table, we can skip a couple of steps:
library(dplyr)
library(data.table)
library(tidyr)
myDF %>%
mutate(reSeq = rowid(Element) * NA^!(Group == 0 |!duplicated(Group))) %>%
group_by(Element) %>%
fill(reSeq) %>%
mutate(reSeq = match(reSeq, unique(reSeq))) %>%
ungroup
-output
# A tibble: 15 × 3
Element Group reSeq
<chr> <dbl> <int>
1 R 0 1
2 R 0 2
3 X 0 1
4 X 1 2
5 X 1 2
6 X 0 3
7 X 0 4
8 X 0 5
9 B 0 1
10 R 0 3
11 R 2 4
12 R 2 4
13 X 3 6
14 X 3 6
15 X 3 6
Below is what I managed to cobble together. Maybe there's a cleaner solution? Here's the code:
library(dplyr)
library(tidyr)
myDF %>%
group_by(Element) %>%
mutate(eleCnt = row_number()) %>%
ungroup()%>%
mutate(reSeq = ifelse(Group == 0 | Group != lag(Group), eleCnt,0)) %>%
mutate(reSeq = na_if(reSeq, 0)) %>%
group_by(Element) %>%
fill(reSeq) %>%
mutate(reSeq = match(reSeq, unique(reSeq))) %>%
ungroup
And here's the output:
# A tibble: 15 x 4
Element Group eleCnt reSeq
<chr> <dbl> <int> <int>
1 R 0 1 1
2 R 0 2 2
3 X 0 1 1
4 X 1 2 2
5 X 1 3 2
6 X 0 4 3
7 X 0 5 4
8 X 0 6 5
9 B 0 1 1
10 R 0 3 3
11 R 2 4 4
12 R 2 5 4
13 X 3 7 6
14 X 3 8 6
15 X 3 9 6

How to count the cumulative number of subgroupings using dplyr?

I'm trying to compute the cumulative number of subgroupings using dplyr, as illustrated and explained in the image below. I am trying to solve for Flag2 in the image. Any recommendations for how to do this?
Beneath the image I also have the reproducible code that runs all columns up through Flag1 which works fine.
Reproducible code:
library(dplyr)
myData <-
data.frame(
Element = c("A","B","B","B","B","B","A","C","C","C","C","C"),
Group = c(0,0,1,1,2,2,0,3,3,0,0,0)
)
excelCopy <- myData %>%
group_by(Element) %>%
mutate(Element_Count = row_number()) %>%
mutate(Flag1 = case_when(Group > 0 ~ match(Group, unique(Group)),TRUE ~ Element_Count)) %>%
ungroup()
print.data.frame(excelCopy)
Using row_number and setting 0 values to NA
library(dplyr)
excelCopy |>
group_by(Element, Group) |>
mutate(Flag2 = ifelse(Group == 0, NA, row_number()))
Element Group Element_Count Flag1 Flag2
<chr> <dbl> <int> <int> <int>
1 A 0 1 1 NA
2 B 0 1 1 NA
3 B 1 2 2 1
4 B 1 3 2 2
5 B 2 4 3 1
6 B 2 5 3 2
7 A 0 2 2 NA
8 C 3 1 1 1
9 C 3 2 1 2
10 C 0 3 3 NA
11 C 0 4 4 NA
12 C 0 5 5 NA

Summarise Data using group_by across several columns

Let's say I have a dataframe where all columns are factors:
# Reproducible example data: 24 rows of survey-style answers.
set.seed(123)
# `gender` has 12 values; data.frame() recycles it to match the 24 rows below.
gender <- sample(1:2, 12, replace = TRUE)
country <- c(
  'FIN', 'FIN', 'EST', 'NIG', 'NIG', 'JAM', 'FIN', 'NIG', 'EST', 'NIG', 'NIG', 'JAM',
  'FIN', 'FIN', 'EST', 'NIG', 'NIG', 'JAM', 'FIN', 'NIG', 'EST', 'NIG', 'NIG', 'JAM'
)
# Spell out TRUE (T can be reassigned) and drop the stray empty argument.
a <- sample(1:5, 24, replace = TRUE)
b <- sample(1:5, 24, replace = TRUE)
c <- sample(1:5, 24, replace = TRUE)
d <- sample(1:5, 24, replace = TRUE)
# Join the variables to create a data frame
df <- data.frame(gender, country, a, b, c, d)
# Convert every column to a factor (across() supersedes mutate_at()).
df <- df %>% mutate(across(everything(), as.factor))
I want a table that will give a proportion percentage and n across every column a:d by Gender and Region so the end result would look something like this:
I've done this for a using the code (but I want to do this also for all the other columns b,c,d without replicating the code):
df %>%
group_by(gender,country,a) %>%
summarise(totaln=n()) %>%
group_by(country,gender) %>%
mutate(percentage=totaln/sum(totaln)*100)
so for a I get this:
gender country a totaln percentage
<fct> <fct> <fct> <int> <dbl>
1 1 EST 1 2 50
2 1 EST 3 1 25
3 1 EST 4 1 25
4 1 FIN 1 1 25
5 1 FIN 2 3 75
6 1 NIG 1 1 25
7 1 NIG 2 1 25
8 1 NIG 3 1 25
9 1 NIG 4 1 25
10 2 FIN 1 1 50
11 2 FIN 3 1 50
12 2 JAM 1 2 50
13 2 JAM 3 2 50
14 2 NIG 3 1 16.7
15 2 NIG 4 1 16.7
16 2 NIG 5 4 66.7
What you probably first want to do is to transform your data in a tidy-format by using pivot_longer on a:d, and then group also by this variable, from there it is almost the same code as you wrote:
df_summary <- df %>% pivot_longer(c(a, b, c, d), names_to = "abcd") %>%
group_by(gender, country, abcd, value) %>%
summarise(totaln = n()) %>%
group_by(country, gender, abcd) %>%
mutate(percentag = totaln / sum(totaln) * 100)
df_summary
#> # A tibble: 64 × 6
#> # Groups: country, gender, abcd [24]
#> gender country abcd value totaln percentag
#> <fct> <fct> <chr> <fct> <int> <dbl>
#> 1 1 EST a 1 1 25
#> 2 1 EST a 3 2 50
#> 3 1 EST a 4 1 25
#> 4 1 EST b 2 1 25
#> 5 1 EST b 3 1 25
#> 6 1 EST b 4 2 50
#> 7 1 EST c 2 1 25
#> 8 1 EST c 5 3 75
#> 9 1 EST d 1 1 25
#> 10 1 EST d 2 2 50
#> # … with 54 more rows
To create your desired output shape, you can do:
df_summary %>%
ungroup() %>%
mutate(summary = paste0(round(percentag), "% (n = ", totaln, ")")) %>%
select(gender, country, abcd, value, summary) %>%
pivot_wider(names_from = c(gender, country), values_from = summary, values_fill = "0")
Created on 2022-06-16 by the reprex package (v2.0.1)
Two options. Both retain the 0 combinations, that can easily be filtered out if needed.
Long format:
df %>%
tidyr::pivot_longer(-c(gender, country), names_to = "ltr", values_to = "val") %>%
group_by(gender, country, ltr) %>%
summarize(
totaln = sum(as.numeric(table(val))),
percentage = 100 * as.numeric(table(val)) / totaln
) %>%
ungroup()
# `summarise()` has grouped output by 'gender', 'country', 'ltr'. You can override using the `.groups` argument.
# # A tibble: 120 x 5
# gender country ltr totaln percentage
# <fct> <fct> <chr> <dbl> <dbl>
# 1 1 EST a 4 50
# 2 1 EST a 4 0
# 3 1 EST a 4 25
# 4 1 EST a 4 25
# 5 1 EST a 4 0
# 6 1 EST b 4 25
# 7 1 EST b 4 50
# 8 1 EST b 4 25
# 9 1 EST b 4 0
# 10 1 EST b 4 0
# # ... with 110 more rows
Wide format:
df %>%
group_by(gender, country) %>%
summarize(across(a:d,
list(
perc = ~ 100 * as.numeric(table(.)) / sum(as.numeric(table(.))),
total = ~ as.numeric(table(.)),
newval = ~ factor(names(table(.)), levels = levels(.))
)
)) %>%
ungroup()
# `summarise()` has grouped output by 'gender', 'country'. You can override using the `.groups` argument.
# # A tibble: 30 x 14
# gender country a_perc a_total a_newval b_perc b_total b_newval c_perc c_total c_newval d_perc d_total d_newval
# <fct> <fct> <dbl> <dbl> <fct> <dbl> <dbl> <fct> <dbl> <dbl> <fct> <dbl> <dbl> <fct>
# 1 1 EST 50 2 1 25 1 1 75 3 1 0 0 1
# 2 1 EST 0 0 2 50 2 2 0 0 2 50 2 2
# 3 1 EST 25 1 3 25 1 3 0 0 3 25 1 3
# 4 1 EST 25 1 4 0 0 4 0 0 4 25 1 4
# 5 1 EST 0 0 5 0 0 5 25 1 5 0 0 5
# 6 1 FIN 25 1 1 25 1 1 25 1 1 0 0 1
# 7 1 FIN 75 3 2 50 2 2 25 1 2 50 2 2
# 8 1 FIN 0 0 3 0 0 3 25 1 3 25 1 3
# 9 1 FIN 0 0 4 0 0 4 25 1 4 25 1 4
# 10 1 FIN 0 0 5 25 1 5 0 0 5 0 0 5
# # ... with 20 more rows

Code number of days elapsed since last activity

I want to code the number of days elapsed since the user's last activity for a churn analysis.
I have tried a code I have found in a related topic but it does not work:
# Attempt taken from a related topic (does not give the desired result).
# Grouping by `dayid` puts every row in its own group when each dayid is
# unique (as in the example data below), so diff() has nothing to compare and
# every row gets NA.
da <- da %>%
  arrange(dayid) %>%
  group_by(dayid) %>%
  mutate(dayssincelastactivity = c(NA, diff(dayid)))
Let's say this is the data. active indicates if the user was active on this day. I want to add the variable dayssincelastactivity, which indicates the number of days elapsed since a user's last active day.
da <- data.frame(dayid = c(1,2,3,4,5,6,7,8), active = c(1,1,0,0,0,1,1,1), dayssincelastactivity = c(1,1,2,3,4,1,1,1))
da
dayid active dayssincelastactivity
1 1 1 1
2 2 1 1
3 3 0 2
4 4 0 3
5 5 0 4
6 6 1 1
7 7 1 1
8 8 1 1
Create a grouping variable using cumsum and seq_along each group.
with(da, ave(dayid, cumsum(active == 1), FUN = seq_along))
#[1] 1 1 2 3 4 1 1 1
You can also translate this to dplyr
library(dplyr)
da %>%
group_by(group = cumsum(active == 1)) %>%
mutate(new_val = row_number()) %>%
ungroup() %>%
select(-group)
# dayid active dayssincelastactivity new_val
# <dbl> <dbl> <dbl> <int>
#1 1 1 1 1
#2 2 1 1 1
#3 3 0 2 2
#4 4 0 3 3
#5 5 0 4 4
#6 6 1 1 1
#7 7 1 1 1
#8 8 1 1 1

Filtering and summing rows in dplyr

I have data from which I want to first filter out some rows and then sum the remaining rows.
The filtering conditions as follows;
for gr==1 find the last occurrence of y_value==10 and keep the all rows before it (including the last occurrence of this value 10 row)!
for gr==2 find the first occurrence of y_value==10 and keep all the rows after it (including the first occurrence of this value 10 row)!
The data is like this;
df <- data.frame(gr=rep(c(1,2),c(8,7)),
y_value=c(c(2,10,10,8,10,6,0,0),c(0,0,10,10,6,8,10)))
gr y_value
1 1 2
2 1 10
3 1 10
4 1 8
5 1 10
6 1 6
7 1 0
8 1 0
9 2 0
10 2 0
11 2 10
12 2 10
13 2 6
14 2 8
15 2 10
I tried this in the light of summing-rows-based-on-conditional-in-groups;
df_temp <- df %>%
group_by(gr) %>%
mutate(rows_to_aggregate=cumsum(y_value==10)) %>%
filter(ifelse(gr==1, rows_to_aggregate !=0, ifelse(gr==2, rows_to_aggregate ==0 | y_value==10, rows_to_aggregate ==0))) %>%
filter(ifelse(gr==1, row_number(gr) != 1, ifelse(gr==2, row_number(gr) != n(), rows_to_aggregate ==0)))
but if I do rows_to_aggregate !=0 in gr==1 the rows of interest will be gone! Any guidance at this point will be appreciated!
# Keep, per `gr`, only the rows that should be aggregated.
df_to_aggregate <- df %>%
group_by(gr) %>%
# Running count of y_value == 10 seen so far within the group.
mutate(rows_to_aggregate = cumsum(y_value == 10)) %>%
# gr 1: drop rows where the count is already at its group maximum and the value
# is not 10, i.e. the rows after the last 10.
filter(!(gr == 1 & rows_to_aggregate == max(rows_to_aggregate) & y_value != 10)) %>%
# gr 2: drop rows where no 10 has been seen yet, i.e. the rows before the first 10.
filter(!(gr == 2 & rows_to_aggregate == 0)) %>%
# Remove the helper counter.
select(-rows_to_aggregate)
df_to_aggregate
# A tibble: 10 x 2
# Groups: gr [2]
gr y_value
<dbl> <dbl>
1 1 2
2 1 10
3 1 10
4 1 8
5 1 10
6 2 10
7 2 10
8 2 6
9 2 8
10 2 10
Do not know how to do it in dplyr, but this code seems to work
# Base R version: keep gr 1 up to and INCLUDING its last y_value == 10, and
# gr 2 from its first y_value == 10 (INCLUDED) to the end.
# Assumes each group contains at least one 10 -- otherwise `last` / `first`
# are empty and the subsetting fails.
gr1 <- df[df$gr == 1, ]
last <- tail(which(gr1$y_value == 10), 1)
# seq_len(last) keeps rows 1..last; 1:(last - 1) would drop the last 10.
gr1 <- gr1[seq_len(last), ]

gr2 <- df[df$gr == 2, ]
first <- head(which(gr2$y_value == 10), 1)
# Start at `first`, not `first + 1`, so the first 10 is kept.
gr2 <- gr2[first:nrow(gr2), ]

final <- rbind(gr1, gr2)
You can slice with a different slicing condition for each gr.
# Slice each `gr` group with its own row range.
df %>%
group_by(gr) %>%
# If the group contains gr == 1: rows 1 through the last position where
# y_value == 10. Otherwise: from the first position where y_value == 10
# through the group's last row, n().
slice(if(any(gr==1)) {1:max(which(y_value==10))} else {min(which(y_value==10)):n()})
gr y_value
1 1 2
2 1 10
3 1 10
4 1 8
5 1 10
6 2 10
7 2 10
8 2 6
9 2 8
10 2 10

Resources