Reset a ranking when a variable exceeds a value using dplyr

Suppose I have the following data:
df <- tibble(ID = c(1,2,3,4,5,6,7,8,9,10),
             ID2 = c(1,1,1,1,2,2,2,3,4,4),
             VAR = c(25,10,120,60,85,90,20,40,60,150))
I want to add a new column with a ranking that would be reset either when the ID2 changes or when VAR is greater than 100.
The desired result is:
# A tibble: 10 x 4
ID ID2 VAR RANK
<dbl> <dbl> <dbl> <dbl>
1 1 1 25 1
2 2 1 10 2
3 3 1 120 1
4 4 1 60 2
5 5 2 85 1
6 6 2 90 2
7 7 2 20 3
8 8 3 40 1
9 9 4 60 1
10 10 4 150 1
I know how to add a new column with a ranking that would be reset only when the ID2 changes:
df %>%
  arrange(ID2) %>%
  group_by(ID2) %>%
  mutate(RANK = row_number())
... but treating both conditions at the same time is more difficult. How can I do this using dplyr?

You can group_by ID2 and cumsum(VAR > 100), i.e.:
library(dplyr)
df %>%
  group_by(ID2, cumVAR = cumsum(VAR > 100)) %>%
  mutate(RANK = row_number())
Output:
# A tibble: 10 x 5
# Groups: ID2, cumVAR [6]
ID ID2 VAR cumVAR RANK
<dbl> <dbl> <dbl> <int> <int>
1 1 1 25 0 1
2 2 1 10 0 2
3 3 1 120 1 1
4 4 1 60 1 2
5 5 2 85 1 1
6 6 2 90 1 2
7 7 2 20 1 3
8 8 3 40 1 1
9 9 4 60 1 1
10 10 4 150 2 1
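To see why this works: cumsum(VAR > 100) increases by one at every row where VAR exceeds 100, so each such row opens a new sub-group within ID2 and row_number() restarts there. A minimal sketch:
VAR <- c(25, 10, 120, 60)
cumsum(VAR > 100)
#> [1] 0 0 1 1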

rowid from data.table would be useful as well:
library(dplyr)
library(data.table)
df %>%
  mutate(RANK = rowid(ID2, cumsum(VAR > 100)))
Output:
# A tibble: 10 × 4
ID ID2 VAR RANK
<dbl> <dbl> <dbl> <int>
1 1 1 25 1
2 2 1 10 2
3 3 1 120 1
4 4 1 60 2
5 5 2 85 1
6 6 2 90 2
7 7 2 20 3
8 8 3 40 1
9 9 4 60 1
10 10 4 150 1
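rowid() generates a sequence along rows within each distinct combination of its arguments, so it behaves like row_number() after grouping, without leaving the result grouped. A minimal check:
library(data.table)
rowid(c(1, 1, 1), c(0, 0, 1))
#> [1] 1 2 1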

Related

Count changes in responses, treating different names as the same

I want to count the number of fluctuations in the response column per id. The responses "No", "no", and "DK" need to be treated as the same response, but only for counting the fluctuations; I don't want to change the responses permanently.
df <- data.frame(
  id = c(1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4),
  response = c("Yes","Yes","No","DK","no","No","No","no","No","Yes","Yes","DK","No","Yes","Yes","No","No","No","died","TO","Yes","No","Yes")
)
I am trying it with the following code:
library(tidyverse)
library(data.table) # for rleid()
df <- df %>%
  group_by(id) %>%
  fill(response) %>%
  mutate(new = rleid(response), rn = row_number()) %>%
  mutate(flactuation = case_when(rn > 2 & duplicated(new) ~ 'No', rn > 2 ~ 'Yes')) %>%
  mutate(numberofchange = sum(flactuation == "Yes", na.rm = TRUE)) %>%
  select(-rn, -flactuation)
Expected output:
id response new numberofchange
<dbl> <chr> <int> <int>
1 1 Yes 1 1
2 1 Yes 1 1
3 1 No 2 1
4 1 DK 2 1
5 1 no 2 1
6 2 No 1 1
7 2 No 1 1
8 2 no 1 1
9 2 No 1 1
10 2 Yes 2 1
11 2 Yes 2 1
12 3 DK 1 2
13 3 No 1 2
14 3 Yes 2 2
15 3 Yes 2 2
16 3 No 3 2
17 3 No 3 2
18 4 No 1 5
19 4 died 2 5
20 4 TO 3 5
21 4 Yes 4 5
22 4 No 5 5
23 4 Yes 6 5
You could use data.table::rleid() to get the run-length indices.
library(dplyr)
df %>%
  group_by(id) %>%
  mutate(new = data.table::rleid(replace(response, response %in% c('no', 'DK'), "No")),
         numberofchange = max(new) - 1) %>%
  ungroup()
# A tibble: 23 × 4
id response new numberofchange
<dbl> <chr> <int> <dbl>
1 1 Yes 1 1
2 1 Yes 1 1
3 1 No 2 1
4 1 DK 2 1
5 1 no 2 1
6 2 No 1 1
7 2 No 1 1
8 2 no 1 1
9 2 No 1 1
10 2 Yes 2 1
11 2 Yes 2 1
12 3 DK 1 2
13 3 No 1 2
14 3 Yes 2 2
15 3 Yes 2 2
16 3 No 3 2
17 3 No 3 2
18 4 No 1 5
19 4 died 2 5
20 4 TO 3 5
21 4 Yes 4 5
22 4 No 5 5
23 4 Yes 6 5
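rleid() increments its index at every change of value, even when a value seen earlier reappears, so max(new) - 1 counts the number of switches per id. A minimal sketch:
library(data.table)
rleid(c("No", "No", "Yes", "No"))
#> [1] 1 1 2 3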

dplyr - merging rows together using a column value as a conditional

I have a series of rows in a single dataframe. I'm trying to aggregate the first two rows for each ID, i.e. I want to combine events 1 and 2 for ID 1 into a single row, events 1 and 2 for ID 2 into a single row, etc., but leave event 3 completely untouched.
id <- c(1,1,1,2,2,2,3,3,3,4,4,4,5,5,5)
event <- c(1,2,3,1,2,3,1,2,3,1,2,3,1,2,3)
score <- c(3,NA,1,3,NA,2,6,NA,1,8,NA,2,4,NA,1)
score2 <- c(NA,4,1,NA,5,2,NA,0,3,NA,5,6,NA,8,7)
df <- tibble(id, event, score, score2)
# A tibble: 15 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 NA
2 1 2 NA 4
3 1 3 1 1
4 2 1 3 NA
5 2 2 NA 5
6 2 3 2 2
7 3 1 6 NA
8 3 2 NA 0
9 3 3 1 3
10 4 1 8 NA
11 4 2 NA 5
12 4 3 2 6
13 5 1 4 NA
14 5 2 NA 8
15 5 3 1 7
I've tried :
df_merged <- df %>%
  group_by(id) %>%
  summarise_all(funs(min(as.character(.), na.rm = TRUE)))
which aggregates these nicely, but then I struggle to merge them back into the original dataframe/tibble (there are really about 300 different "score" columns in the full dataset, so a right_join is a headache, with score.x, score.y, score2.x, score2.y all over the place...)
Ideally, the solution would need to be dplyr, as the rest of my code runs on it!
EDIT:
Ideally, my expected output would be:
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
3 1 3 1 1
4 2 1 3 5
6 2 3 2 2
7 3 1 6 0
9 3 3 1 3
10 4 1 8 5
12 4 3 2 6
13 5 1 4 8
15 5 3 1 7
We may change the order of the NA elements with replace:
library(dplyr)
df %>%
  group_by(id) %>%
  mutate(across(starts_with('score'),
                ~ replace(., 1:2, .[1:2][order(is.na(.[1:2]))]))) %>%
  ungroup() %>%
  filter(if_all(starts_with('score'), Negate(is.na)))
Output:
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
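The key step is order(is.na(.[1:2])): it sorts FALSE (non-NA) before TRUE (NA), so within the first two rows of each group the non-missing value moves to the front before the remaining all-NA rows are filtered out. A minimal sketch:
x <- c(NA, 4)
x[order(is.na(x))]
#> [1]  4 NA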
Here is an alternative way to achieve your task with fill from the tidyr package: fill the missing values in both directions within each id, then keep only the first and third rows of each group (events 1 and 3):
library(dplyr)
library(tidyr)
df %>%
  group_by(id) %>%
  fill(everything(), .direction = "down") %>%
  fill(everything(), .direction = "up") %>%
  slice(1, 3)
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
How about this?
library(dplyr)
df_e12 <- df %>%
  filter(event %in% c(1, 2)) %>%
  group_by(id) %>%
  mutate(across(starts_with("score"), ~ min(.x, na.rm = TRUE))) %>%
  ungroup() %>%
  distinct(id, .keep_all = TRUE)
df_e3 <- df %>%
  filter(event == 3)
df <- bind_rows(df_e12, df_e3) %>%
  arrange(id, event)
df
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
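Note that min(.x, na.rm = TRUE) works here because, within events 1 and 2 of each id, every score column holds exactly one non-missing value, so the "minimum" is simply that value:
min(c(3, NA), na.rm = TRUE)
#> [1] 3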

Optimize computation in dplyr mutate function

Assume the following table:
library(dplyr)
library(tibble)
library(purrr)
df <- tibble(
  client = c(1,1,1,1,2,2,2,2),
  prod_type = c(1,1,2,2,1,1,2,2),
  max_prod_type = c(2,2,2,2,2,2,2,2),
  value_1 = c(10,20,30,30,100,200,300,300),
  value_2 = c(1,2,3,3,1,2,3,3)
)
# A tibble: 8 x 5
client prod_type max_prod_type value_1 value_2
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 2 10 1
2 1 1 2 20 2
3 1 2 2 30 3
4 1 2 2 30 3
5 2 1 2 100 1
6 2 1 2 200 2
7 2 2 2 300 3
8 2 2 2 300 3
Column 'max_prod_type' here denotes the maximum value of 'prod_type' per 'client'. I need to compute a new column 'sum' that contains the total of 'value_1' plus 'value_2' over all rows where 'prod_type' == 'max_prod_type', per 'client'.
I have tried the following code:
df %>%
  mutate(
    sum = map2_dbl(
      client, max_prod_type,
      ~ case_when(
        prod_type == .y ~
          filter(df, client == .x, prod_type == .y) %>%
            mutate(sum = value_1 + value_2) %>%
            select(sum) %>%
            sum(),
        TRUE ~ NA_real_
      )
    )
  )
Desired output is following:
# A tibble: 8 x 6
client prod_type max_prod_type value_1 value_2 sum
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 2 10 1 NA
2 1 1 2 20 2 NA
3 1 2 2 30 3 66
4 1 2 2 30 3 66
5 2 1 2 100 1 NA
6 2 1 2 200 2 NA
7 2 2 2 300 3 606
8 2 2 2 300 3 606
But it throws an error:
Error: Problem with `mutate()` input `sum`.
x Result 1 must be a single double, not a double vector of length 6
i Input `sum` is `map2_dbl(...)`.
Moreover, this implementation seems rather slow to me. I'm wondering if there is a correct and more optimized solution to this problem.
Appreciate your help!
One option could be:
df %>%
  group_by(client) %>%
  mutate(res = row_number() == which(value_1 == max(value_1)),
         res = if_else(res, sum(value_1[res]) + sum(value_2[res]), NA_real_))
client prod_type max_prod_type value_1 value_2 res
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 2 10 1 NA
2 1 1 2 20 2 NA
3 1 2 2 30 3 66
4 1 2 2 30 3 66
5 2 1 2 100 1 NA
6 2 1 2 200 2 NA
7 2 2 2 300 3 606
8 2 2 2 300 3 606
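Since max_prod_type is already a column, the condition can also be written against it directly, avoiding the which()/row_number() step; a sketch (not benchmarked) that reproduces the desired output:
library(dplyr)
df %>%
  group_by(client) %>%
  mutate(sum = if_else(prod_type == max_prod_type,
                       sum((value_1 + value_2)[prod_type == max_prod_type]),
                       NA_real_)) %>%
  ungroup()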
If a row-wise sum (rather than the group total shown in your desired output) is enough, this is simpler:
df %>%
  mutate(sum = case_when(prod_type == max_prod_type ~ value_1 + value_2,
                         TRUE ~ NA_real_))
# A tibble: 8 x 6
client prod_type max_prod_type value_1 value_2 sum
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 2 10 1 NA
2 1 1 2 20 2 NA
3 1 2 2 30 3 33
4 1 2 2 30 3 33
5 2 1 2 100 1 NA
6 2 1 2 200 2 NA
7 2 2 2 300 3 303
8 2 2 2 300 3 303

Count number of new and lost friends between two data frames in R

I have two data frames of the same respondents, one from Time 1 and the next from Time 2. In each wave they nominated their friends, and I want to know:
1) how many friends are nominated in Time 2 but not in Time 1 (new friends)
2) how many friends are nominated in Time 1 but not in Time 2 (lost friends)
Sample data:
Time 1 DF
ID friend_1 friend_2 friend_3
1 4 12 7
2 8 6 7
3 9 NA NA
4 15 7 2
5 2 20 7
6 19 13 9
7 12 20 8
8 3 17 10
9 1 15 19
10 2 16 11
Time 2 DF
ID friend_1 friend_2 friend_3
1 4 12 3
2 8 6 14
3 9 NA NA
4 15 7 2
5 1 17 9
6 9 19 NA
7 NA NA NA
8 7 1 16
9 NA 10 12
10 7 11 9
So the desired DF would include these columns (EDIT: values filled in below):
ID num_newfriends num_lostfriends
1 1 1
2 1 1
3 0 0
4 0 0
5 3 3
6 0 1
7 0 3
8 3 3
9 2 3
10 2 2
EDIT2:
I've tried doing an anti join:
df3 <- anti_join(df1, df2)
But this method doesn't account for friend ID numbers that appear in a different column in Time 2 (for example, respondent #6 has friends 9 and 19 in both T1 and T2, but in different columns each time).
Another option:
library(tidyverse)
left_join(
  gather(df1, key, x, -ID),
  gather(df2, key, y, -ID),
  by = c("ID", "key")
) %>%
  group_by(ID) %>%
  summarise(
    num_newfriends = sum(!y[!is.na(y)] %in% x[!is.na(x)]),
    num_lostfriends = sum(!x[!is.na(x)] %in% y[!is.na(y)])
  )
Output:
# A tibble: 10 x 3
ID num_newfriends num_lostfriends
<int> <int> <int>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
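gather() is superseded in current tidyr; the same reshape can be written with pivot_longer(), keeping the join and summarise unchanged. A sketch:
library(tidyverse)
left_join(
  pivot_longer(df1, -ID, names_to = "key", values_to = "x"),
  pivot_longer(df2, -ID, names_to = "key", values_to = "y"),
  by = c("ID", "key")
) %>%
  group_by(ID) %>%
  summarise(
    num_newfriends = sum(!y[!is.na(y)] %in% x[!is.na(x)]),
    num_lostfriends = sum(!x[!is.na(x)] %in% y[!is.na(y)])
  )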
Simple comparisons would be an option:
library(tidyverse)
na_sums_old <- rowSums(is.na(time1))   # missing nominations per row in Time 1
na_sums_new <- rowSums(is.na(time2))   # missing nominations per row in Time 2
# friends kept: nominations in row .x of time1 that also appear in the same row of time2
kept_friends <- map_dbl(seq(nrow(time1)), ~ sum(time1[.x, -1] %in% time2[.x, -1]))
# %in% also matches NA against NA, so subtract those spurious "kept" NAs
kept_friends <- kept_friends - na_sums_old * (na_sums_new >= 1)
new_friends <- 3 - na_sums_new - kept_friends    # 3 friend columns per wave
lost_friends <- 3 - na_sums_old - kept_friends
tibble(ID = time1$ID, new_friends = new_friends, lost_friends = lost_friends)
# A tibble: 10 x 3
ID new_friends lost_friends
<int> <dbl> <dbl>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
You can make anti_join work by first pivoting to a "long" data frame.
df1 <- df1 %>%
  pivot_longer(starts_with("friend_"), values_to = "friend") %>%
  drop_na()
df2 <- df2 %>%
  pivot_longer(starts_with("friend_"), values_to = "friend") %>%
  drop_na()
head(df1)
#> # A tibble: 6 x 3
#> ID name friend
#> <int> <chr> <int>
#> 1 1 friend_1 4
#> 2 1 friend_2 12
#> 3 1 friend_3 7
#> 4 2 friend_1 8
#> 5 2 friend_2 6
#> 6 2 friend_3 7
lost_friends <- anti_join(df1, df2, by = c("ID", "friend"))
new_friends <- anti_join(df2, df1, by = c("ID", "friend"))
respondents <- distinct(df1, ID)
respondents %>%
  full_join(
    count(lost_friends, ID, name = "num_lost_friends")
  ) %>%
  full_join(
    count(new_friends, ID, name = "num_new_friends")
  ) %>%
  mutate_at(vars(starts_with("num_")), replace_na, 0)
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 3
#> ID num_lost_friends num_new_friends
#> <int> <dbl> <dbl>
#> 1 1 1 1
#> 2 2 1 1
#> 3 3 0 0
#> 4 4 0 0
#> 5 5 3 3
#> 6 6 1 0
#> 7 7 3 0
#> 8 8 3 3
#> 9 9 3 2
#> 10 10 2 2
Created on 2019-11-01 by the reprex package (v0.3.0)

Count positive and negative values in a column by group

I want to create two variables giving me the total number of positive and negative values by id, hopefully using dplyr.
Example data:
library(dplyr)
set.seed(42)
df <- data.frame(id = rep(1:10, each = 10),
                 ff = rnorm(100, 0, 14))
> head(df,20)
id ff
1 1 19.1934183
2 1 -7.9057744
3 1 5.0837978
4 1 8.8600765
5 1 5.6597565
6 1 -1.4857432
7 1 21.1613080
8 1 -1.3252265
9 1 28.2579320
10 1 -0.8779974
11 2 18.2681752
12 2 32.0130355
13 2 -19.4440498
14 2 -3.9030427
15 2 -1.8664987
16 2 8.9033056
17 2 -3.9795409
18 2 -37.1903759
19 2 -34.1665370
20 2 18.4815868
The resulting dataset should look like this:
> head(df,20)
id ff pos neg
1 1 19.1934183 6 4
2 1 -7.9057744 6 4
3 1 5.0837978 6 4
4 1 8.8600765 6 4
5 1 5.6597565 6 4
6 1 -1.4857432 6 4
7 1 21.1613080 6 4
8 1 -1.3252265 6 4
9 1 28.2579320 6 4
10 1 -0.8779974 6 4
11 2 18.2681752 4 6
12 2 32.0130355 4 6
13 2 -19.4440498 4 6
14 2 -3.9030427 4 6
15 2 -1.8664987 4 6
16 2 8.9033056 4 6
17 2 -3.9795409 4 6
18 2 -37.1903759 4 6
19 2 -34.1665370 4 6
20 2 18.4815868 4 6
I thought something like this would work:
df <- df %>% group_by(id) %>% mutate(pos = nrow(ff > 0)) %>% ungroup()
Any help would be great, thanks.
You need sum():
df %>%
  group_by(id) %>%
  mutate(pos = sum(ff > 0),
         neg = sum(ff < 0))
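This works because summing a logical vector coerces TRUE/FALSE to 1/0, so sum(ff > 0) counts the positive values:
sum(c(19.2, -7.9, 5.1) > 0)
#> [1] 2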
For a fun (and fast) solution, data.table can also be used:
library(data.table)
setDT(df)
df[, ":="(pos = sum(ff > 0), neg = sum(ff < 0)), by = id]
Here's an answer that adds the ifelse part of your question:
df <- df %>%
  group_by(id) %>%
  mutate(pos = sum(ff > 0), neg = sum(ff < 0)) %>%
  mutate(any_neg = ifelse(any(ff < 0), 1, 0))
Output:
> head(df, 20)
Source: local data frame [20 x 5]
Groups: id [2]
id ff pos neg any_neg
<int> <dbl> <int> <int> <dbl>
1 1 19.1934183 6 4 1
2 1 -7.9057744 6 4 1
3 1 5.0837978 6 4 1
4 1 8.8600765 6 4 1
5 1 5.6597565 6 4 1
6 1 -1.4857432 6 4 1
7 1 21.1613080 6 4 1
8 1 -1.3252265 6 4 1
9 1 28.2579320 6 4 1
10 1 -0.8779974 6 4 1
11 2 18.2681752 4 6 1
12 2 32.0130355 4 6 1
13 2 -19.4440498 4 6 1
14 2 -3.9030427 4 6 1
15 2 -1.8664987 4 6 1
16 2 8.9033056 4 6 1
17 2 -3.9795409 4 6 1
18 2 -37.1903759 4 6 1
19 2 -34.1665370 4 6 1
20 2 18.4815868 4 6 1
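Since neg is already computed, the same indicator can be derived from it directly, e.g. mutate(any_neg = as.numeric(neg > 0)).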
