How to flag the last row of a data frame group?

Suppose we start with the below dataframe df:
ID <- c(1, 1, 1, 5, 5)
Period <- c(1,2,3,1,2)
Value <- c(10,12,11,4,6)
df <- data.frame(ID, Period, Value)
ID Period Value
1 1 1 10
2 1 2 12
3 1 3 11
4 5 1 4
5 5 2 6
Now using dplyr I add a "Calculate" column that multiplies Period and Value of each row, giving me the following:
> df %>% mutate(Calculate = Period * Value)
ID Period Value Calculate
1 1 1 10 10
2 1 2 12 24
3 1 3 11 33
4 5 1 4 4
5 5 2 6 12
I'd like to modify the above "Calculate" to give a value of 0 when reaching the last row for a given ID, so that the data frame output looks like:
ID Period Value Calculate
1 1 1 10 10
2 1 2 12 24
3 1 3 11 0
4 5 1 4 4
5 5 2 6 0
I was going to use the lead() function to peer at the next row to see if the ID changes, but wasn't sure what happens when reaching the end of the data frame.
How could this be accomplished using dplyr?

You can group_by ID and replace the last row for each ID with 0.
library(dplyr)
df %>%
mutate(Calculate = Period * Value) %>%
group_by(ID) %>%
mutate(Calculate = replace(Calculate, n(), 0)) %>%
ungroup
# ID Period Value Calculate
# <dbl> <dbl> <dbl> <dbl>
#1 1 1 10 10
#2 1 2 12 24
#3 1 3 11 0
#4 5 1 4 4
#5 5 2 6 0
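As a side note on the lead() idea from the question: lead() behaves fine at the end of the data frame, returning its default value (NA unless you supply one), so comparing each row's ID with the next row's works. A minimal sketch, assuming rows are already ordered by ID:
df %>%
  mutate(Calculate = Period * Value,
         Calculate = ifelse(is.na(lead(ID)) | lead(ID) != ID, 0, Calculate))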

Yet another possibility:
library(tidyverse)
ID <- c(1, 1, 1, 5, 5)
Period <- c(1,2,3,1,2)
Value <- c(10,12,11,4,6)
df <- data.frame(ID, Period, Value)
df %>%
mutate(Calculate = Period * Value) %>%
group_by(ID) %>%
mutate(Calculate = if_else(row_number() == n(), 0, Calculate)) %>%
ungroup
#> # A tibble: 5 × 4
#> ID Period Value Calculate
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 10 10
#> 2 1 2 12 24
#> 3 1 3 11 0
#> 4 5 1 4 4
#> 5 5 2 6 0

ID <- c(1, 1, 1, 5, 5)
Period <- c(1,2,3,1,2)
Value <- c(10,12,11,4,6)
df <- data.frame(ID, Period, Value)
library(tidyverse)
# duplicated(ID, fromLast = TRUE) is FALSE only on the last row of each ID,
# so multiplying by it zeroes out exactly those rows:
df %>%
mutate(Calculate = Period * Value * duplicated(ID, fromLast = TRUE))
#> ID Period Value Calculate
#> 1 1 1 10 10
#> 2 1 2 12 24
#> 3 1 3 11 0
#> 4 5 1 4 4
#> 5 5 2 6 0
Created on 2022-01-09 by the reprex package (v2.0.1)

This should work. Since Period increases within each ID in this example, you could most likely also use Period in place of rownum (see the sketch after the output below).
ID <- c(1, 1, 1, 5, 5)
Period <- c(1,2,3,1,2)
Value <- c(10,12,11,4,6)
df <- data.frame(ID, Period, Value)
df = df %>% mutate(Calculate = Period * Value)
df$rownum = as.numeric(rownames(df)) # numeric, so max() below compares values rather than character strings
df = df %>%
group_by(ID) %>%
mutate(Calculate = ifelse(rownum == max(rownum), 0, Calculate)) %>%
ungroup()
A tibble: 5 × 5
ID Period Value Calculate rownum
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 10 10 1
2 1 2 12 24 2
3 1 3 11 0 3
4 5 1 4 4 4
5 5 2 6 0 5
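As mentioned above, Period can stand in for rownum. A minimal sketch of that variant, assuming Period strictly increases within each ID (as it does in the example):
df %>%
  mutate(Calculate = Period * Value) %>%
  group_by(ID) %>%
  mutate(Calculate = ifelse(Period == max(Period), 0, Calculate)) %>%
  ungroup()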

Related

How to calculate cumulative sum for each group in time?

For each unique id and rep, I want to calculate the cumulative number of babies at each age.
For instance, for id A and rep 1, the cumulative sum should look like 1, 3, 6.
I tried the following method
id <- c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","B")
rep <- c(1,1,1,2,2,2,1,1,1,1,2,2,2,2,2)
age <- c(0,1,2,0,1,2,0,1,2,3,0,1,2,3,4)
babies <- c(1,2,3,0,1,3,0,1,5,1,0,0,12,1,1)
df <- data.frame(id,rep,age,babies)
df$csum <- ave(df$babies, c(df$id,df$age, df$age), FUN=cumsum)
The result is that the cumulative sum is calculated over id alone, not by rep or age. Any suggestions?
How about this:
library(dplyr)
id <- c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","B")
rep <- c(1,1,1,2,2,2,1,1,1,1,2,2,2,2,2)
age <- c(0,1,2,0,1,2,0,1,2,3,0,1,2,3,4)
babies <- c(1,2,3,0,1,3,0,1,5,1,0,0,12,1,1)
df <- data.frame(id,rep,age,babies)
df %>%
group_by(id, rep) %>%
arrange(age, .by_group = TRUE) %>%
mutate(csum = cumsum(babies))
#> # A tibble: 15 × 5
#> # Groups: id, rep [4]
#> id rep age babies csum
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 0 1 1
#> 2 A 1 1 2 3
#> 3 A 1 2 3 6
#> 4 A 2 0 0 0
#> 5 A 2 1 1 1
#> 6 A 2 2 3 4
#> 7 B 1 0 0 0
#> 8 B 1 1 1 1
#> 9 B 1 2 5 6
#> 10 B 1 3 1 7
#> 11 B 2 0 0 0
#> 12 B 2 1 0 0
#> 13 B 2 2 12 12
#> 14 B 2 3 1 13
#> 15 B 2 4 1 14
Created on 2022-12-08 by the reprex package (v2.0.1)
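For what it's worth, the original ave() attempt fails because c(df$id, df$age, df$age) concatenates the vectors into one long grouping vector, while ave() expects each grouping variable as a separate argument. A base R sketch, assuming rows are already ordered by age within each id and rep:
df$csum <- ave(df$babies, df$id, df$rep, FUN = cumsum)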

Creating a binary variable if individual was observed in the previous year

Let's say I have an example dataframe in the following format:
df <- data.frame( c(1,2,3,1,2,3,1,2,3),
c(3,3,3,2,2,2,1,1,1),
c(23,23,34,134,134,NA,45,NA,NA)
)
colnames(df) <- c("id", "year", "fte_wage")
df <- df[is.na(df$fte_wage) == FALSE,]
I want to create a binary variable (let's say, a column named "obs") indicating whether or not the individual was observed in the previous year. I have tried the following:
library(dplyr)
df2 <-
df %>%
arrange(id, year) %>%
group_by(id) %>%
rowwise() %>%
mutate(obs = ifelse((lag(year) %in% df[df$id == id,]$year & year > lag(year)), 1, 0))
This generates a column of only 0 values. If I remove the second condition the code runs, but then the lag(year) call is misinterpreted, as it takes values from different individuals as well.
My desired output would be a dataframe in the following format:
id year fte_wage obs
1  1    23       0
1  2    23       1
1  3    43       1
2  1    54       0
2  2    32       1
3  1    56       0
You can just group_by(id) and then check whether row_number() > 1, which flags every observation after the first one for that id.
library(tidyverse)
df <- data.frame("id" = c(1,2,3,1,2,3,1,2,3),
"year" = c(3,3,3,2,2,2,1,1,1),
"fte_wage" = c(23,23,34,134,134,NA,45,NA,NA))
df %>%
drop_na(fte_wage) %>%
arrange(id, year) %>%
group_by(id) %>%
mutate(obs = as.numeric(row_number() > 1))
#> # A tibble: 6 × 4
#> # Groups: id [3]
#> id year fte_wage obs
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 45 0
#> 2 1 2 134 1
#> 3 1 3 23 1
#> 4 2 2 134 0
#> 5 2 3 23 1
#> 6 3 3 34 0
Created on 2022-11-21 with reprex v2.0.2
This is one approach using dplyr without grouping.
library(dplyr)
df %>%
na.omit() %>%
arrange(id, year) %>%
mutate(obs = (lag(id, default=F) == id) * 1)
id year fte_wage obs
1 1 1 45 0
2 1 2 134 1
3 1 3 23 1
4 2 2 134 0
5 2 3 23 1
6 3 3 34 0
You could use diff in the following way:
library(dplyr)
df %>%
group_by(id) %>%
arrange(id, year) %>%
mutate(obs = +(c(0, diff(year)) == 1L))
Output:
# A tibble: 6 x 4
# Groups: id [3]
id year fte_wage obs
<dbl> <dbl> <dbl> <dbl>
1 1 1 45 0
2 1 2 134 1
3 1 3 23 1
4 2 2 134 0
5 2 3 23 1
6 3 3 34 0
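If the requirement is strictly "was this id observed in the calendar year immediately before this one", rather than "is this not the first observation for the id", a set-membership check is another option. A sketch, with the tidyverse loaded as in the first answer:
df %>%
  drop_na(fte_wage) %>%
  group_by(id) %>%
  mutate(obs = as.numeric((year - 1) %in% year)) %>%
  ungroup()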

How to count data frame elements grouped by multiple conditions in dplyr?

I am trying to use dplyr to count elements grouped by multiple conditions (columns) in a data frame. In the example below, the data frame output is shown first (except that I manually inserted the 2 right-most columns to explain what I am trying to do) and the R code is underneath. I am trying to count the joint groupings of the Element and Group columns; my attempt at the multiple-condition grouping is eleGrpCnt. Any recommendations for the correct way to do this in dplyr? I thought that grouping by the combination (Element, Group) would work.
Element Group origOrder eleCnt eleGrpCnt eleGrpCnt(desired) explanation
<chr> <dbl> <int> <int> <int> <comment> <comment>
1 B 0 1 1 1 1 1st grouping of B where Group = 0
2 R 0 2 1 1 1 1st grouping of R where Group = 0
3 R 1 3 2 1 2 2nd grouping of R where Group = 1
4 R 1 4 3 2 2 2nd grouping of R where Group = 1
5 B 0 5 2 2 1 1st grouping of B where Group = 0
6 X 2 6 1 1 1 1st grouping of X where Group = 2
7 X 2 7 2 2 1 1st grouping of X where Group = 2
8 X 0 8 3 1 2 2nd grouping of X where Group = 0
9 X 0 9 4 2 2 2nd grouping of X where Group = 0
10 X -1 10 5 1 3 3rd grouping of X where Group = -1
library(dplyr)
myData6 <-
data.frame(
Element = c("B","R","R","R","B","X","X","X","X","X"),
Group = c(0,0,1,1,0,2,2,0,0,-1)
)
myData6 %>%
mutate(origOrder = row_number()) %>%
group_by(Element) %>%
mutate(eleCnt = row_number()) %>%
ungroup() %>%
group_by(Element, Group) %>%
mutate(eleGrpCnt = row_number())%>%
ungroup()
If you group by element then the numbers you are looking for are simply the matches of Group against the unique values of Group:
library(dplyr)
myData6 %>%
mutate(origOrder = row_number()) %>%
group_by(Element) %>%
mutate(eleCnt = row_number()) %>%
ungroup() %>%
group_by(Element) %>%
mutate(eleGrpCnt = match(Group, unique(Group)))
#> # A tibble: 10 x 5
#> # Groups: Element [3]
#> Element Group origOrder eleCnt eleGrpCnt
#> <chr> <dbl> <int> <int> <dbl>
#> 1 B 0 1 1 1
#> 2 R 0 2 1 1
#> 3 R 1 3 2 2
#> 4 R 1 4 3 2
#> 5 B 0 5 2 1
#> 6 X 2 6 1 1
#> 7 X 2 7 2 1
#> 8 X 0 8 3 2
#> 9 X 0 9 4 2
#> 10 X -1 10 5 3
Created on 2022-09-11 with reprex v2.0.2
Here's one approach. I'm sorting by Group value, so eleGrpCnt is numbered in sorted Group order; if you want the numbering to follow the original appearance order instead, see the variant sketched after the output below.
myData6 %>%
mutate(origOrder = row_number()) %>%
group_by(Element) %>%
mutate(eleCnt = row_number()) %>%
ungroup() %>%
arrange(Element, Group) %>%
group_by(Element) %>%
mutate(eleGrpCnt = cumsum(Group != lag(Group, default = -999))) %>%
ungroup() %>%
arrange(origOrder)
# A tibble: 10 × 5
Element Group origOrder eleCnt eleGrpCnt
<chr> <dbl> <int> <int> <int>
1 B 0 1 1 1
2 R 0 2 1 1
3 R 1 3 2 2
4 R 1 4 3 2
5 B 0 5 2 1
6 X 2 6 1 3
7 X 2 7 2 3
8 X 0 8 3 2
9 X 0 9 4 2
10 X -1 10 5 1
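If the numbering should instead follow each group's first appearance within Element (matching the desired output above, and equivalent to the match() idea in the earlier answer), one sketch is to count new Group values as they appear:
myData6 %>%
  mutate(origOrder = row_number()) %>%
  group_by(Element) %>%
  mutate(eleCnt = row_number(),
         eleGrpCnt = cumsum(!duplicated(Group))) %>%
  ungroup()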

grouping to aggregate values, but tripping up on NA's

I have long data, and I am trying to make a new variable (consistent) that is the value of a given column (VALUE), for each person (ID), at TIME = 2. I used the code below to do this, but I am getting tripped up on NA's. If the VALUE for TIME = 2 is NA, then I want it to grab the VALUE at TIME = 1 instead. That part I'm not sure how to do. So, in the example below, I want the new variable (consistent) to be 10 instead of NA.
ID = c("A", "A", "B", "B", "C", "C", "D", "D")
TIME = c(1, 2, 1, 2, 1, 2, 1, 2)
VALUE = c(8, 9, 10, NA, 12, 13, 14, 9)
df = data.frame(ID, TIME, VALUE)
df <- df %>%
group_by(ID) %>%
mutate(consistent = VALUE[TIME == 2]) %>% ungroup
df
If we want to keep the same approach, coalesce with the 'VALUE' where 'TIME' is 1 (assuming there is a single observation per 'TIME' for each 'ID')
library(dplyr)
df %>%
group_by(ID) %>%
mutate(consistent = coalesce(VALUE[TIME == 2], VALUE[TIME == 1])) %>%
ungroup
-output
# A tibble: 8 × 4
ID TIME VALUE consistent
<chr> <dbl> <dbl> <dbl>
1 A 1 8 9
2 A 2 9 9
3 B 1 10 10
4 B 2 NA 10
5 C 1 12 13
6 C 2 13 13
7 D 1 14 9
8 D 2 9 9
Or another option is to arrange before doing the group_by and get the first element of 'VALUE' (assuming 'TIME' is not replicated within an 'ID')
df %>%
arrange(ID, is.na(VALUE), desc(TIME)) %>%
group_by(ID) %>%
mutate(consistent = first(VALUE)) %>%
ungroup
-output
# A tibble: 8 × 4
ID TIME VALUE consistent
<chr> <dbl> <dbl> <dbl>
1 A 2 9 9
2 A 1 8 9
3 B 1 10 10
4 B 2 NA 10
5 C 2 13 13
6 C 1 12 13
7 D 2 9 9
8 D 1 14 9
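For clarity on the first option: coalesce() returns, element-wise, the first non-missing value among its arguments, which is what lets VALUE[TIME == 2] fall back to VALUE[TIME == 1]. A tiny made-up illustration (dplyr loaded as above):
coalesce(c(9, NA, 13), c(8, 10, 12))
#> [1]  9 10 13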
Another possible solution, using tidyr::fill:
library(tidyverse)
df %>%
group_by(ID) %>%
mutate(consistent = VALUE) %>% fill(consistent) %>% ungroup
#> # A tibble: 8 × 4
#> ID TIME VALUE consistent
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 1 8 8
#> 2 A 2 9 9
#> 3 B 1 10 10
#> 4 B 2 NA 10
#> 5 C 1 12 12
#> 6 C 2 13 13
#> 7 D 1 14 14
#> 8 D 2 9 9
You can also use ifelse with your condition. The lagged row is guaranteed to be the TIME 1 row in this scenario, since each group has exactly two members, with TIME 1 and 2.
df %>%
group_by(ID) %>%
arrange(TIME, .by_group=T) %>%
mutate(consistent=ifelse(is.na(VALUE)&TIME==2, lag(VALUE), VALUE)) %>%
ungroup()
# A tibble: 8 × 4
ID TIME VALUE consistent
<chr> <dbl> <dbl> <dbl>
1 A 1 8 8
2 A 2 9 9
3 B 1 10 10
4 B 2 NA 10
5 C 1 12 12
6 C 2 13 13
7 D 1 14 14
8 D 2 9 9

Aggregate rows with specific shared value

I want to aggregate my data as follows:
Aggregate only for successive rows where status = 0
Keep age and sum up points
Example data:
da <- data.frame(userid = c(1,1,1,1,2,2,2,2), status = c(0,0,0,1,1,1,0,0), age = c(10,10,10,11,15,16,16,16), points = c(2,2,2,6,3,5,5,5))
da
userid status age points
1 1 0 10 2
2 1 0 10 2
3 1 0 10 2
4 1 1 11 6
5 2 1 15 3
6 2 1 16 5
7 2 0 16 5
8 2 0 16 5
I would like to have:
da2
userid status age points
1 1 0 10 6
2 1 1 11 6
3 2 1 15 3
4 2 1 16 5
5 2 0 16 10
The idea: rle(status) assigns one id per run of equal status values, and adding cumsum(status != 0) forces every non-zero row into its own group, so only successive status == 0 rows get collapsed.
library(dplyr)
da %>%
mutate(grp = with(rle(status),
rep(seq_along(values), lengths)) + cumsum(status != 0)) %>%
group_by_at(vars(-points)) %>%
summarise(points = sum(points)) %>%
ungroup() %>%
select(-grp)
## A tibble: 5 x 4
# userid status age points
# <dbl> <dbl> <dbl> <dbl>
#1 1 0 10 6
#2 1 1 11 6
#3 2 0 16 10
#4 2 1 15 3
#5 2 1 16 5
You can use group_by from dplyr:
da %>%
group_by(da$userid, cumsum(da$status), da$status) %>%
summarise(age = max(age), points = sum(points))
Output:
`da$userid` `cumsum(da$status)` `da$status` age points
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 0 10 6
2 1 1 1 11 6
3 2 2 1 15 3
4 2 3 0 16 10
5 2 3 1 16 5
Exactly the same idea as above:
library(dplyr)
data1 <- da %>%
group_by(userid, age, status) %>%
filter(status == 0) %>%
summarise(points = sum(points))
data2 <- da %>%
group_by(userid, age, status) %>%
filter(status != 0) %>%
summarise(points = sum(points))
da2 <- rbind(data1, data2)
We need to be more careful with your specification of status equal to 0. I think the code of Quang Hoang works only for your specific example.
I hope it will help.
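To address that concern, here is a hedged sketch that builds a run id per userid so that only successive status == 0 rows are collapsed, while every non-zero row keeps its own line. It assumes rows are already in time order within each userid and that age is constant within a status == 0 run, as in the example:
library(dplyr)
da %>%
  group_by(userid) %>%
  mutate(run = cumsum(status != lag(status, default = first(status))) +
               cumsum(status != 0)) %>%
  group_by(userid, run, status, age) %>%
  summarise(points = sum(points), .groups = "drop") %>%
  select(-run)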
