R Lag Variable And Skip Value Between - r

DATA = data.frame(STUDENT = c(1,1,1,2,2,2,3,3,4,4),
SCORE = c(6,4,8,10,9,0,2,3,3,7),
CLASS = c('A', 'B', 'C', 'A', 'B', 'C', 'B', 'C', 'A', 'B'),
WANT = c(NA, NA, 2, NA, NA, -10, NA, NA, NA, NA))
I have DATA and wish to create 'WANT' which is calculate by:
For each STUDENT, find the SCORE where SCORE equals to SCORE(CLASS = C) - SCORE(CLASS = A)
EX: SCORE(STUDENT = 1, CLASS = C) - SCORE(STUDENT = 1, CLASS = A) = 8-6=2

Assuming at most one 'C' and 'A' CLASS per each 'STUDENT', just subset the 'SCORE' where the CLASS value is 'C', 'A', do the subtraction and assign the value only to position where CLASS is 'C' by making all other positions to NA (after grouping by 'STUDENT')
library(dplyr)
DATA <- DATA %>%
group_by(STUDENT) %>%
mutate(WANT2 = (SCORE[CLASS == 'C'][1] - SCORE[CLASS == 'A'][1]) *
NA^(CLASS != "C")) %>%
ungroup
-output
# A tibble: 10 × 5
STUDENT SCORE CLASS WANT WANT2
<dbl> <dbl> <chr> <dbl> <dbl>
1 1 6 A NA NA
2 1 4 B NA NA
3 1 8 C 2 2
4 2 10 A NA NA
5 2 9 B NA NA
6 2 0 C -10 -10
7 3 2 B NA NA
8 3 3 C NA NA
9 4 3 A NA NA
10 4 7 B NA NA

Here is a solution with the data organized in a wider format first, then a longer format below. This solution works regardless of the order of the "CLASS" column (for instance, if there is one instance in which the CLASS order is CBA or BCA instead os ABC, this solution will work).
Solution
library(dplyr)
library(tidyr)
wider <- DATA %>% select(-WANT) %>%
pivot_wider( names_from = "CLASS", values_from = "SCORE") %>%
rowwise() %>%
mutate(WANT = C-A) %>%
ungroup()
output wider
# A tibble: 4 × 5
STUDENT A B C WANT
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 6 4 8 2
2 2 10 9 0 -10
3 3 NA 2 3 NA
4 4 3 7 NA NA
If you really want like your output example, then we can reorganize the wider data this way:
Reorganizing wider to long format
wider %>%
pivot_longer(A:C, values_to = "SCORE", names_to = "CLASS") %>%
relocate(WANT, .after = SCORE) %>%
mutate(WANT = if_else(CLASS == "C", WANT, NA_real_))
Final Output
# A tibble: 12 × 4
STUDENT CLASS SCORE WANT
<dbl> <chr> <dbl> <dbl>
1 1 A 6 NA
2 1 B 4 NA
3 1 C 8 2
4 2 A 10 NA
5 2 B 9 NA
6 2 C 0 -10
7 3 A NA NA
8 3 B 2 NA
9 3 C 3 NA
10 4 A 3 NA
11 4 B 7 NA
12 4 C NA NA

Related

fill NA values per group based on first value of a group

I am trying to fill NA values of my dataframe. However, I would like to fill them based on the first value of each group.
#> df = data.frame(
group = c(rep("A", 4), rep("B", 4)),
val = c(1, 2, NA, NA, 4, 3, NA, NA)
)
#> df
group val
1 A 1
2 A 2
3 A NA
4 A NA
5 B 4
6 B 3
7 B NA
8 B NA
#> fill(df, val, .direction = "down")
group val
1 A 1
2 A 2
3 A 2 # -> should be 1
4 A 2 # -> should be 1
5 B 4
6 B 3
7 B 3 # -> should be 4
8 B 3 # -> should be 4
Can I do this with tidyr::fill()? Or is there another (more or less elegant) way how to do this? I need to use this in a longer chain (%>%) operation.
Thank you very much!
Use tidyr::replace_na() and dplyr::first() (or val[[1]]) inside a grouped mutate():
library(dplyr)
library(tidyr)
df %>%
group_by(group) %>%
mutate(val = replace_na(val, first(val))) %>%
ungroup()
#> # A tibble: 8 × 2
#> group val
#> <chr> <dbl>
#> 1 A 1
#> 2 A 2
#> 3 A 1
#> 4 A 1
#> 5 B 4
#> 6 B 3
#> 7 B 4
#> 8 B 4
PS - #richarddmorey points out the case where the first value for a group is NA. The above code would keep all NA values as NA. If you'd like to instead replace with the first non-missing value per group, you could subset the vector using !is.na():
df %>%
group_by(group) %>%
mutate(val = replace_na(val, first(val[!is.na(val)]))) %>%
ungroup()
Created on 2022-11-17 with reprex v2.0.2
This should work, which uses dplyr's case_when
library(dplyr)
df %>%
group_by(group) %>%
mutate(val = case_when(
is.na(val) ~ val[1],
TRUE ~ val
))
Output:
group val
<chr> <dbl>
1 A 1
2 A 2
3 A 1
4 A 1
5 B 4
6 B 3
7 B 4
8 B 4

Create new column based on previous column by group; if missing, use NA

I am trying out to select a value by group from one column, and pass it as value in another column, extending for the whole group. This is similar to question asked here . BUt, some groups do not have this number: in that case, I need to fill the column with NAs. How to do this?
Dummy example:
dd1 <- data.frame(type = c(1,1,1),
grp = c('a', 'b', 'd'),
val = c(1,2,3))
dd2 <- data.frame(type = c(2,2),
grp = c('a', 'b'),
val = c(8,2))
dd3 <- data.frame(type = c(3,3),
grp = c('b', 'd'),
val = c(7,4))
dd <- rbind(dd1, dd2, dd3)
Create new column:
dd %>%
group_by(type) %>%
mutate(#val_a = ifelse(grp == 'a', val , NA),
val_a2 = val[grp == 'a'])
Expected outcome:
type grp val val_a # pass in `val_a` value of teh group 'a'
1 1 a 1 1
2 1 b 2 1
3 1 d 3 1
4 2 a 8 8
5 2 b 2 8
6 3 b 7 NA
7 3 d 4 NA # value for 'a' is missing from group 3
You were close with your first approach; use any to apply the condition to all observations in the group:
dd %>%
group_by(type) %>%
mutate(val_a = ifelse(any(grp == "a"), val[grp == "a"] , NA))
type grp val val_a
<dbl> <chr> <dbl> <dbl>
1 1 a 1 1
2 1 b 2 1
3 1 d 3 1
4 2 a 8 8
5 2 b 2 8
6 3 b 7 NA
7 3 d 4 NA
Try this:
dd %>%
group_by(type) %>%
mutate(val_a2 = val[which(c(grp == 'a'))[1]])
# # A tibble: 7 x 4
# # Groups: type [3]
# type grp val val_a2
# <dbl> <chr> <dbl> <dbl>
# 1 1 a 1 1
# 2 1 b 2 1
# 3 1 d 3 1
# 4 2 a 8 8
# 5 2 b 2 8
# 6 3 b 7 NA
# 7 3 d 4 NA
This also controls against the possibility that there could be more than one match, which may cause bad results (with or without a warning).

How do I pivot_wider a char column?

I'm trying to pivot_wider a tibble of random alpha strings
stri_rand_strings(252, 5, '[a-z]') %>%
sort() %>%
as_tibble() %>%
mutate(id = row_number(),
col = rep(letters[1:4], each = length(value) / 4)) %>%
pivot_wider(names_from = col, values_from = value)
I get three columns of NA in a tibble (252 x 5):
# A tibble: 252 × 5
id a b c d
<int> <chr> <chr> <chr> <chr>
1 1 aarup NA NA NA
2 2 abhir NA NA NA
3 3 afpgt NA NA NA
4 4 apjts NA NA NA
5 5 arlst NA NA NA
6 6 awkjn NA NA NA
7 7 babro NA NA NA
8 8 bbrpn NA NA NA
9 9 bbrzt NA NA NA
10 10 bedzs NA NA NA
# … with 242 more rows
instead of the desired 63 x 5.
your id-column is messing everything up. rownumbers are unique, so casting to wide does not make sense, since you have got unique identifiers.
try something like
stringi::stri_rand_strings(252, 5, '[a-z]') %>%
sort() %>%
as_tibble() %>%
mutate(id = rep(1:(length(value) / 4), 4), # !! <-- !!
col = rep(letters[1:4], each = length(value) / 4)) %>%
pivot_wider(names_from = col, values_from = value)
# A tibble: 63 x 5
id a b c d
<int> <chr> <chr> <chr> <chr>
1 1 ababk glynv mottj tqcbv
2 2 abysq gmfhc mujcw twjix
3 3 aerkp godcs mycak tzqny
4 4 agtoa gpler naetp ucuvg
5 5 ahebl grqgz nfali ufbqv
6 6 amdvv gswwu nhmnu ulgup
7 7 apgut gvkwh nkcks umwih
8 8 atgxy gynef nkklm uojxc
9 9 bcklx hcdup nngfz upfhx
10 10 bcnxz hcpzy nnvpd uqlgs
# ... with 53 more rows

A computation efficient way to find the IDs of the Type 1 rows just above and below each Type 2 rows?

I have the following data
df <- tibble(Type=c(1,2,2,1,1,2),ID=c(6,4,3,2,1,5))
Type ID
1 6
2 4
2 3
1 2
1 1
2 5
For each of the type 2 rows, I want to find the IDs of the type 1 rows just below and above them. For the above dataset, the output will be:
Type ID IDabove IDbelow
1 6 NA NA
2 4 6 2
2 3 6 2
1 2 NA NA
1 1 NA NA
2 5 1 NA
Naively, I can write a for loop to achieve this, but that would be too time consuming for the dataset I am dealing with.
One approach using dplyr lead,lag to get next and previous value respectively and data.table's rleid to create groups of consecutive Type values.
library(dplyr)
library(data.table)
df %>%
mutate(IDabove = ifelse(Type == 2, lag(ID), NA),
IDbelow = ifelse(Type == 2, lead(ID), NA),
grp = rleid(Type)) %>%
group_by(grp) %>%
mutate(IDabove = first(IDabove),
IDbelow = last(IDbelow)) %>%
ungroup() %>%
select(-grp)
# Type ID IDabove IDbelow
# <dbl> <dbl> <dbl> <dbl>
#1 1 6 NA NA
#2 2 4 6 2
#3 2 3 6 2
#4 1 2 NA NA
#5 1 1 NA NA
#6 2 5 1 NA
A dplyr only solution:
You could create your own rleid function then apply the logic provided by Ronak(Many thanks. Upvoted).
library(dplyr)
my_func <- function(x) {
x <- rle(x)$lengths
rep(seq_along(x), times=x)
}
# this part is the same as provided by Ronak.
df %>%
mutate(IDabove = ifelse(Type == 2, lag(ID), NA),
IDbelow = ifelse(Type == 2, lead(ID), NA),
grp = my_func(Type)) %>%
group_by(grp) %>%
mutate(IDabove = first(IDabove),
IDbelow = last(IDbelow)) %>%
ungroup() %>%
select(-grp)
Output:
Type ID IDabove IDbelow
<dbl> <dbl> <dbl> <dbl>
1 1 6 NA NA
2 2 4 6 2
3 2 3 6 2
4 1 2 NA NA
5 1 1 NA NA
6 2 5 1 NA

How to calculate row differences in r when it's not in sequence

I have a data frame like this:
name count
a 3
a 5
a 8
b 2
a 9
b 7
so I want to calculate the row differences group by name. so my code is:
data%>%group_by(Name)%>%mutate(last_count = lag(count),diff = count - last_count)
However, I get a result like the below table
name count last_count diff
a 3 NA NA
a 5 3 2
a 8 5 3
b 2 NA NA
a 9 8 1
b 7 2 5
But what I want should look like this:
name count last_count diff
a 3 NA NA
a 5 3 2
a 8 5 3
b 2 NA NA
a 9 NA NA
b 7 NA NA
Thanks in advance to whoever can help me fix it!
Does this work:
> library(dplyr)
> df %>% mutate(last_count = case_when(name == lag(name) ~ lag(count), TRUE ~ NA_real_),
diff = case_when(name == lag(name) ~ count - lag(count), TRUE ~ NA_real_))
# A tibble: 6 x 4
name count last_count diff
<chr> <dbl> <dbl> <dbl>
1 a 3 NA NA
2 a 5 3 2
3 a 8 5 3
4 b 2 NA NA
5 a 9 NA NA
6 b 7 NA NA
>
We could use rleid to create a grouping column based on the adjacent matching values in the 'name' column and then apply the diff
library(dplyr)
library(data.table)
data %>%
group_by(grp = rleid(name)) %>%
mutate(last_count = lag(count), diff = count - last_count) %>%
ungroup %>%
select(-grp)
-output
# A tibble: 6 x 4
# name count last_count diff
# <chr> <int> <int> <int>
#1 a 3 NA NA
#2 a 5 3 2
#3 a 8 5 3
#4 b 2 NA NA
#5 a 9 NA NA
#6 b 7 NA NA
Or using base R with ave and rle
data$diff <- with(data, ave(count, with(rle(name),
rep(seq_along(values), lengths)), FUN = function(x) c(NA, diff(x)))
data
data <- structure(list(name = c("a", "a", "a", "b", "a", "b"), count = c(3L,
5L, 8L, 2L, 9L, 7L)), class = "data.frame", row.names = c(NA,
-6L))

Resources