how to replace age with previous value + 1 - r

I'm trying to replace missing age values in one wave by adding 1 to the value from the previous wave. So, for instance:
ID
Age
Wave
1
20
1
1
NA
2
2
61
1
2
NA
2
would become
ID
Age
Wave
1
20
1
1
21
2
2
61
1
2
62
2

library(tidyverse)
# Replace a missing Age with the previous wave's Age + 1.
# The lag is taken within each ID and ordered by Wave, so a missing value in
# an ID's first wave stays NA instead of borrowing the previous ID's age, and
# the input rows do not need to be pre-sorted.
# Only isolated gaps are filled: an NA that directly follows another NA stays
# NA, because its lagged value is itself missing.
df %>%
  group_by(ID) %>%
  mutate(Age = case_when(
    is.na(Age) ~ lag(Age, order_by = Wave) + 1,
    TRUE ~ Age
  )) %>%
  ungroup()
# A tibble: 4 x 3
ID Age Wave
<dbl> <dbl> <dbl>
1 1 20 1
2 1 21 2
3 2 61 1
4 2 62 2

Base R
# Fill each missing Age from the most recent observed Age of the same ID,
# adding one year per row elapsed since that observation. Observed ages are
# returned unchanged, and NAs with no earlier observation stay NA.
# Assumes rows are ordered by wave within each ID.
ave(df$Age, df$ID, FUN = function(x) {
  pos <- seq_along(x)
  # Index of the latest non-missing value at or before each position (0 = none).
  last_obs <- cummax(replace(pos, is.na(x), 0L))
  last_obs[last_obs == 0L] <- NA
  x[last_obs] + pos - last_obs
})
[1] 20 21 61 62

With tidyverse, assuming your data is in df dataframe:
library(tidyverse)
# Fill missing ages per ID, adding one year for every consecutive missing wave.
df %>%
# Sort by wave inside each ID so lag()/cumsum() below follow wave order.
group_by(ID) %>% arrange(ID, Wave) %>%
# Start a new run at every observed Age and at every switch between observed
# and missing, so each stretch of consecutive NAs gets its own run id.
mutate(missing_grp = cumsum( (is.na(Age)!=is.na(lag(Age))) | !is.na(Age) )) %>%
group_by(ID, missing_grp) %>%
# Within a run: 0 for an observed row, 1, 2, 3, ... along a stretch of NAs.
mutate(age_offset=cumsum(is.na(Age))) %>%
group_by(ID) %>%
# Carry the last observed Age forward, then add the per-row offset.
fill(Age, .direction='down') %>%
mutate(Age = Age + age_offset) %>%
# Drop the helper columns.
ungroup() %>% select(-missing_grp, -age_offset)
It also works with multiple consecutive missing ages.
For the following input:
# Example input: ID 2 has two consecutive missing ages (waves 2-3) and a
# further missing age (wave 5) that follows an observed value (wave 4).
df <- tribble(
~ID, ~Age, ~Wave,
1, 21, 1,
1, NA, 2,
2, 61, 1,
2, NA, 2,
2, NA, 3,
2, 70, 4,
2, NA, 5,
)
it returns:
# A tibble: 7 × 3
ID Age Wave
<dbl> <dbl> <dbl>
1 1 21 1
2 1 22 2
3 2 61 1
4 2 62 2
5 2 63 3
6 2 70 4
7 2 71 5

In base R
# Overwrite each missing Age with the value from the row directly above + 1.
within(df, {
  missing_rows <- which(is.na(Age))
  Age[missing_rows] <- Age[missing_rows - 1] + 1
  rm(missing_rows)  # keep the helper out of the returned data frame
})
#> ID Age Wave
#> 1 1 20 1
#> 2 1 21 2
#> 3 2 61 1
#> 4 2 62 2

If you have more than two waves, we could use the row number:
library(dplyr)
library(tidyverse)
# Fill missing ages per ID using row positions: each NA becomes the most recent
# observed Age plus the number of rows since that observation.
# Counting from the last observed row (rather than from the first row of the
# ID) leaves later observed ages untouched. NAs with no earlier observation
# stay NA. Assumes rows are ordered by wave within each ID.
df |>
  group_by(ID) |>
  mutate(
    # Row index of the latest non-missing Age so far (NA when there is none).
    last_obs = na_if(cummax(replace(row_number(), is.na(Age), 0L)), 0L),
    Age = Age[last_obs] + row_number() - last_obs
  ) |>
  ungroup() |>
  select(-last_obs)
Output:
# A tibble: 5 × 3
ID Age Wave
<dbl> <dbl> <dbl>
1 1 21 1
2 1 22 2
3 2 61 1
4 2 62 2
5 2 63 3

Related

Creating a binary variable if individual was observed in the previous year

Let's say I have an example dataframe in the following format:
# Example panel: three ids over three years; rows without a wage are dropped.
df <- data.frame(
  id = rep(c(1, 2, 3), times = 3),
  year = rep(c(3, 2, 1), each = 3),
  fte_wage = c(23, 23, 34, 134, 134, NA, 45, NA, NA)
)
df <- df[!is.na(df$fte_wage), ]
I want to create a binary variable (say, a column named "obs") indicating whether or not the individual was observed in the previous year. I have tried the following:
library(dplyr)
# Attempt from the question: flag a row with 1 when the preceding year is among
# the years recorded for the same id, else 0.
# NOTE(review): rowwise() presumably turns every row into its own group, which
# would make lag(year) NA on every row and the condition never TRUE - confirm.
df2 <-
df %>%
arrange(id, year) %>%
group_by(id) %>%
rowwise() %>%
mutate(obs = ifelse((lag(year) %in% df[df$id == id,]$year & year > lag(year)), 1, 0))
Which generates a column of only 0 values. If I remove the second condition the code works, but then it misinterprets the lag(year) command, as it takes values from different individuals as well.
My desired output would be a dataframe in the following format:
id
year
fte_wage
obs
1
1
23
0
1
2
23
1
1
3
43
1
2
1
54
0
2
2
32
1
3
1
56
0
You can just group_by(id) and then check whether row_number() is > 1, i.e. whether a row is the first observation for its id or a later one.
library(tidyverse)
df <- data.frame(
  id = rep(c(1, 2, 3), times = 3),
  year = rep(c(3, 2, 1), each = 3),
  fte_wage = c(23, 23, 34, 134, 134, NA, 45, NA, NA)
)
# Flag every row after an id's first one (in year order) with 1.
# NOTE: this equals "observed in the previous year" only when an id's
# remaining years have no gaps between them.
df %>%
  filter(!is.na(fte_wage)) %>%
  arrange(id, year) %>%
  group_by(id) %>%
  mutate(obs = as.numeric(row_number() != 1))
#> # A tibble: 6 × 4
#> # Groups: id [3]
#> id year fte_wage obs
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 45 0
#> 2 1 2 134 1
#> 3 1 3 23 1
#> 4 2 2 134 0
#> 5 2 3 23 1
#> 6 3 3 34 0
Created on 2022-11-21 with reprex v2.0.2
This is one approach using dplyr without grouping.
library(dplyr)
# Without grouping: after sorting by id and year, a row is flagged 1 when the
# row directly above belongs to the same id.
# The first row has no predecessor, so lag(id) is NA there; coalesce() turns
# that NA comparison into FALSE. This avoids the sentinel default (F, i.e. 0),
# which would wrongly match a real id of 0.
df %>%
  na.omit() %>%
  arrange(id, year) %>%
  mutate(obs = coalesce(lag(id) == id, FALSE) * 1)
id year fte_wage obs
1 1 1 45 0
2 1 2 134 1
3 1 3 23 1
4 2 2 134 0
5 2 3 23 1
6 3 3 34 0
You could use diff in the following way:
library(dplyr)
# Per id, in year order: 1 when the year is exactly one more than the previous
# row's year, 0 otherwise (the first row of each id is always 0).
df %>%
  arrange(id, year) %>%
  group_by(id) %>%
  mutate(obs = as.integer(c(FALSE, diff(year) == 1)))
Output:
# A tibble: 6 x 4
# Groups: id [3]
id year fte_wage obs
<dbl> <dbl> <dbl> <dbl>
1 1 1 45 0
2 1 2 134 1
3 1 3 23 1
4 2 2 134 0
5 2 3 23 1
6 3 3 34 0

How to count distinct non-missing rows in R using dplyr

library(dplyr)
# Example data: id is a character column in which "none" marks a missing id.
mydat <- data.frame(
  id = c("123", "111", "234", "none", "123", "384", "none"),
  id2 = c(1, 1, 1, 2, 2, 3, 4)
)
> mydat
id id2
1 123 1
2 111 1
3 234 1
4 none 2
5 123 2
6 384 3
7 none 4
I would like to count the number of unique ids for each id2 in mydat. However, I do not want to count the id that is none.
> mydat %>% group_by(id2) %>% summarise(count = n_distinct(id))
# A tibble: 4 × 2
id2 count
<dbl> <int>
1 1 3
2 2 2
3 3 1
4 4 1
Using this mistakenly counts none. The desired output should be
> mydat %>% group_by(id2) %>% summarise(count = n_distinct(id))
# A tibble: 4 × 2
id2 count
<dbl> <int>
1 1 3
2 2 1
3 3 1
4 4 0
# Per id2: `count` is the number of distinct ids including "none";
# `wanted` is the number of distinct ids once "none" is excluded.
mydat %>%
  group_by(id2) %>%
  summarise(
    count = n_distinct(id),
    wanted = n_distinct(id[!(id %in% "none")])
  )
# # A tibble: 4 × 3
# id2 count wanted
# <dbl> <int> <int>
# 1 1 3 3
# 2 2 2 1
# 3 3 1 1
# 4 4 1 0

New variable from grouped calculation in R

I have a dataset:
library(dplyr)
# Example data: three observations of age on each of three days.
my_df <- data.frame(
  day = rep(c(1, 2, 3), each = 3),
  age = c(18, 18, 18, 25, 18, 35, 76, 76, 15)
)
my_df
# day age
# 1 1 18
# 2 1 18
# 3 1 18
# 4 2 25
# 5 2 18
# 6 2 35
# 7 3 76
# 8 3 76
# 9 3 15
For each row, I want to know the frequency and percentage of age for a given value of day. For example, I can calculate this with a dplyr chain:
# Count the rows for every (day, age) pair, then express each count as a
# share of its day's total.
my_df %>%
group_by(day, age) %>%
summarize(n=n()) %>%
# Regroup by day so sum(n) is the per-day total.
group_by(day) %>%
mutate(pct = n/sum(n))
# day age n pct
# 1 1 18 3 1
# 2 2 18 1 0.333
# 3 2 25 1 0.333
# 4 2 35 1 0.333
# 5 3 15 1 0.333
# 6 3 76 2 0.667
How can I add the values of n back onto my original df? Desired output:
# day age n
# 1 1 18 3
# 2 1 18 3
# 3 1 18 3
# 4 2 25 1
# 5 2 18 1
# 6 2 35 1
# 7 3 76 2
# 8 3 76 2
# 9 3 15 1
For your desired output we could use add_count()
library(dplyr)
# Append a column n holding the number of rows that share each (day, age) pair.
add_count(my_df, day, age)
day age n
1 1 18 3
2 1 18 3
3 1 18 3
4 2 25 1
5 2 18 1
6 2 35 1
7 3 76 2
8 3 76 2
9 3 15 1
I would store this as a variable, as such:
# One row per (day, age): n is the row count, pct is n as a share of the
# day's total.
my_helper_df <- my_df %>%
  group_by(day, age) %>%
  summarise(n = n()) %>%
  group_by(day) %>%
  mutate(pct = prop.table(n))
Then left_join to the original df, as so:
# Attach n and pct to every original row; the left table must be my_df (the
# data the helper was built from), matched on both day and age.
final_df <- dplyr::left_join(my_df, my_helper_df, by = c("day", "age"))

How to flag the last row of a data frame group?

Suppose we start with the below dataframe df:
# Example data: ID 1 has three periods, ID 5 has two.
ID <- rep(c(1, 5), times = c(3, 2))
Period <- as.numeric(c(1:3, 1:2))
Value <- c(10, 12, 11, 4, 6)
df <- data.frame(ID = ID, Period = Period, Value = Value)
ID Period Value
1 1 1 10
2 1 2 12
3 1 3 11
4 5 1 4
5 5 2 6
Now using dplyr I add a "Calculate" column that multiplies Period and Value of each row, giving me the following:
> df %>% mutate(Calculate = Period * Value)
ID Period Value Calculate
1 1 1 10 10
2 1 2 12 24
3 1 3 11 33
4 5 1 4 4
5 5 2 6 12
I'd like to modify the above "Calculate" to give me a value of 0, when reaching the last row for a given ID, so that the data frame output looks like:
ID Period Value Calculate
1 1 1 10 10
2 1 2 12 24
3 1 3 11 0
4 5 1 4 4
5 5 2 6 0
I was going to use the lead() function to peek at the next row to see if the ID changes, but wasn't sure what happens when reaching the end of the data frame.
How could this be accomplished using dplyr?
You can group_by ID and replace the last row for each ID with 0.
library(dplyr)
# Compute Period * Value per row, then overwrite the value in the last row of
# each ID (position n() within the group) with 0.
df %>%
  group_by(ID) %>%
  mutate(Calculate = replace(Period * Value, n(), 0)) %>%
  ungroup()
# ID Period Value Calculate
# <dbl> <dbl> <dbl> <dbl>
#1 1 1 10 10
#2 1 2 12 24
#3 1 3 11 0
#4 5 1 4 4
#5 5 2 6 0
Yet another possibility:
library(tidyverse)
ID <- rep(c(1, 5), times = c(3, 2))
Period <- as.numeric(c(1:3, 1:2))
Value <- c(10, 12, 11, 4, 6)
df <- data.frame(ID = ID, Period = Period, Value = Value)
# Period * Value everywhere except the last row of each ID, which is set to 0.
df %>%
  mutate(Calculate = Period * Value) %>%
  group_by(ID) %>%
  mutate(Calculate = case_when(
    row_number() == n() ~ 0,
    TRUE ~ Calculate
  )) %>%
  ungroup()
#> # A tibble: 5 × 4
#> ID Period Value Calculate
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 10 10
#> 2 1 2 12 24
#> 3 1 3 11 0
#> 4 5 1 4 4
#> 5 5 2 6 0
ID <- rep(c(1, 5), times = c(3, 2))
Period <- as.numeric(c(1:3, 1:2))
Value <- c(10, 12, 11, 4, 6)
df <- data.frame(ID = ID, Period = Period, Value = Value)
library(tidyverse)
# duplicated(fromLast = TRUE) is FALSE only on the final occurrence of each ID,
# so multiplying by it zeroes the last row per ID without any grouping.
df %>%
  mutate(
    has_later_row = duplicated(ID, fromLast = TRUE),
    Calculate = Period * Value * has_later_row,
    has_later_row = NULL
  )
#> ID Period Value Calculate
#> 1 1 1 10 10
#> 2 1 2 12 24
#> 3 1 3 11 0
#> 4 5 1 4 4
#> 5 5 2 6 0
Created on 2022-01-09 by the reprex package (v2.0.1)
This should work. You can also replace rownum with Period (most likely)
ID <- c(1, 1, 1, 5, 5)
Period <- c(1, 2, 3, 1, 2)
Value <- c(10, 12, 11, 4, 6)
df <- data.frame(ID, Period, Value)
df <- df %>% mutate(Calculate = Period * Value)
# Row names are character strings; keep them as the rownum column but compare
# them as integers below, otherwise max() is lexicographic ("9" > "10") and
# picks the wrong last row once a group reaches row 10 or beyond.
df$rownum <- rownames(df)
df <- df %>%
  group_by(ID) %>%
  mutate(
    Calculate = ifelse(
      as.integer(rownum) == max(as.integer(rownum)),
      0,
      Calculate
    )
  ) %>%
  ungroup()
A tibble: 5 × 5
ID Period Value Calculate rownum
<dbl> <dbl> <dbl> <dbl> <chr>
1 1 1 10 10 1
2 1 2 12 24 2
3 1 3 11 0 3
4 5 1 4 4 4
5 5 2 6 0 5

r group lag sum

I have some data with groups for which I want to compute a summary (sum or mean) over a fixed number of periods. I'm trying to do this with a group_by followed by mutate and then operating with the variable and its dplyr::lag. Here is an example:
library(tidyverse)
df <- data.frame(
  group = rep(c("A", "B"), times = 5),
  x = c(1, 3, 4, 7, 9, 10, 17, 29, 30, 55)
)
# Per group: sum of the current x and its three previous values, with values
# before the start of the group counted as 0.
df %>%
  group_by(group) %>%
  mutate(
    cs = x +
      lag(x, n = 1, default = 0) +
      lag(x, n = 2, default = 0) +
      lag(x, n = 3, default = 0)
  ) %>%
  ungroup()
Which yields the desired result:
# A tibble: 10 x 3
group x cs
<fctr> <dbl> <dbl>
1 A 1 1
2 B 3 3
3 A 4 5
4 B 7 10
5 A 9 14
6 B 10 20
7 A 17 31
8 B 29 49
9 A 30 60
10 B 55 101
Is there a shorter way to accomplish this? (Here I calculated four values but I actually need twelve or more).
Perhaps you could use the purrr functions reduce and map included with the tidyverse:
library(tidyverse)
df <- data.frame(
  group = rep(c("A", "B"), times = 5),
  x = c(1, 3, 4, 7, 9, 10, 17, 29, 30, 55)
)
# Build the lag-0 to lag-3 copies of x (padded with 0) with map(), then add
# them element-wise with reduce(); widen the window by changing 0:3.
df %>%
  group_by(group) %>%
  mutate(
    cs = map(0:3, function(k) lag(x, n = k, default = 0)) %>%
      reduce(`+`)
  ) %>%
  ungroup()
#> # A tibble: 10 x 3
#> group x cs
#> <fctr> <dbl> <dbl>
#> 1 A 1 1
#> 2 B 3 3
#> 3 A 4 5
#> 4 B 7 10
#> 5 A 9 14
#> 6 B 10 20
#> 7 A 17 31
#> 8 B 29 49
#> 9 A 30 60
#> 10 B 55 101
To see what's happening here it's probably easier to see with a simpler example that doesn't require a group.
# Ungrouped illustration: a list holding v shifted by 0, 1, 2 and 3 positions,
# with the vacated leading slots filled by 0.
v <- 1:5
lagged_v <- map(0:3, function(k) lag(v, n = k, default = 0))
lagged_v
#> [[1]]
#> [1] 1 2 3 4 5
#>
#> [[2]]
#> [1] 0 1 2 3 4
#>
#> [[3]]
#> [1] 0 0 1 2 3
#>
#> [[4]]
#> [1] 0 0 0 1 2
reduce(lagged_v, `+`)
#> [1] 1 3 6 10 14

Resources