I observe 12 responses of 2 survey participants.
data = data.frame(id = c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2), response = c(2,2,3,3,6,3,6,7,3,1,4,3,3,3,6,4,2,6,7,3,2,1,5,6))
data
id response
1 1 2
2 1 2
3 1 3
4 1 3
5 1 6
6 1 3
7 1 6
8 1 7
9 1 3
10 1 1
11 1 4
12 1 3
13 2 3
14 2 3
15 2 6
16 2 4
17 2 2
18 2 6
19 2 7
20 2 3
21 2 2
22 2 1
23 2 5
24 2 6
Now I want to add 2 things to the data of each survey participant:
a) The most frequent value of this survey participant
b) the relative frequency of the most frequent value
How can I add these things using dplyr:
data %>%
group_by(id) %>%
mutate(most_frequent_value = ?,
relative_frequency_of_most_frequent_value = ?)
I'd probably use a two step solution. First, create a data.frame of frequency/relative frequency. Then join to it. We use slice(which.max()), because it will return one row. Using slice_max may return multiple rows.
library(tidyverse)
# count by id, response, calculate rel frequency
# rename columns to make inner_join easier
freq_table <- dd %>%
count(id, response) %>%
group_by(id) %>%
mutate(rel_freq = n / sum(n)) %>%
select(id, most_frequent_response = response, rel_freq)
# inner join to sliced freq_table (grouping by id is preserved)
dd %>%
inner_join(freq_table %>% slice(which.max(rel_freq)))
# id response most_frequent_response rel_freq
# 1 1 2 3 0.4166667
# 2 1 2 3 0.4166667
# 3 1 3 3 0.4166667
# 4 1 3 3 0.4166667
# 5 1 6 3 0.4166667
# ...
You could try:
table(data$id, data$response) %>%
as.data.frame() %>%
setNames(c("id", "response", "n")) %>%
group_by(id) %>%
slice_max(n, 1) %>%
group_by(response) %>%
filter(n() > 1) %>%
mutate(ratio = c(n[1]/sum(n), n[2]/sum(n)))
#> # A tibble: 2 x 4
#> # Groups: response [1]
#> id response n ratio
#> <fct> <fct> <int> <dbl>
#> 1 1 3 5 0.625
#> 2 2 3 3 0.375
Does this work:
data %>% group_by(id, response) %>% mutate(n = n()) %>%
ungroup() %>% group_by(id) %>%
mutate(most_frequent_value = response[n == max(n)][1],
relative_frequency_of_most_frequent_value = max(n)/n())
# A tibble: 24 x 5
# Groups: id [2]
id response n most_frequent_value relative_frequency_of_most_frequent_value
<dbl> <dbl> <int> <dbl> <dbl>
1 1 2 2 3 0.417
2 1 2 2 3 0.417
3 1 3 5 3 0.417
4 1 3 5 3 0.417
5 1 6 2 3 0.417
6 1 3 5 3 0.417
7 1 6 2 3 0.417
8 1 7 1 3 0.417
9 1 3 5 3 0.417
10 1 1 1 3 0.417
11 1 4 1 3 0.417
12 1 3 5 3 0.417
13 2 3 3 3 0.25
14 2 3 3 3 0.25
15 2 6 3 3 0.25
16 2 4 1 3 0.25
17 2 2 2 3 0.25
18 2 6 3 3 0.25
19 2 7 1 3 0.25
20 2 3 3 3 0.25
21 2 2 2 3 0.25
22 2 1 1 3 0.25
23 2 5 1 3 0.25
24 2 6 3 3 0.25
>
Related
I have a data frame like this:
plazo
monto
20
2
50
3
I need to add a rows for values between 1 to the value of plazo and expand my dataframe like below;
plazo
monto
Semana
20
2
1
20
2
2
20
2
3
20
2
…
20
2
20
50
3
1
50
3
2
50
3
3
50
3
…
50
3
50
We can create a nested column with values from 1:plazo for each row and then unnest that column.
df1 <- data.frame(plazo = c(2, 5), monto = c(2,3))
library(tidyverse)
df1 %>%
rowwise() %>%
mutate(Semana = list(1:plazo)) %>%
unnest(Semana)
#> # A tibble: 7 x 3
#> plazo monto Semana
#> <dbl> <dbl> <int>
#> 1 2 2 1
#> 2 2 2 2
#> 3 5 3 1
#> 4 5 3 2
#> 5 5 3 3
#> 6 5 3 4
#> 7 5 3 5
We may use uncount
library(dplyr)
library(tidyr)
df1 %>%
uncount(plazo, .id = 'Semana', .remove = FALSE)
-output
plazo monto Semana
1 2 2 1
2 2 2 2
3 5 3 1
4 5 3 2
5 5 3 3
6 5 3 4
7 5 3 5
I have a series of rows in a single dataframe. I'm trying to aggregate the first two rows for each ID- i.e. - I want to combine events 1 and 2 for ID 1 into a single row, events 1 and 2 for ID 2 into a singlw row etc, but leave event 3 completely untouched.
id <- c(1,1,1,2,2,2,3,3,3,4,4,4,5,5,5)
event <- c(1,2,3,1,2,3,1,2,3,1,2,3,1,2,3)
score <- c(3,NA,1,3,NA,2,6,NA,1,8,NA,2,4,NA,1)
score2 <- c(NA,4,1,NA,5,2,NA,0,3,NA,5,6,NA,8,7)
df <- tibble(id, event, score, score2)
# A tibble: 15 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 NA
2 1 2 NA 4
3 1 3 1 1
4 2 1 3 NA
5 2 2 NA 5
6 2 3 2 2
7 3 1 6 NA
8 3 2 NA 0
9 3 3 1 3
10 4 1 8 NA
11 4 2 NA 5
12 4 3 2 6
13 5 1 4 NA
14 5 2 NA 8
15 5 3 1 7
I've tried :
df_merged<- df %>% group_by (id) %>% summarise_all(funs(min(as.character(.),na.rm=TRUE))),
which aggregates these nicely, but then I struggle to merge these back into the orignal dataframe/tibble (there are really about 300 different "score" columns in the full dataset, so a right_join is a headache with score.x, score.y, score2.x, score2.y all over the place...)
Ideally, the situation would need to be dplyr as the rest of my code runs on this!
EDIT:
Ideally, my expected output would be:
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
3 1 3 1 1
4 2 1 3 5
6 2 3 2 2
7 3 1 6 0
9 3 3 1 3
10 4 1 8 5
12 4 3 2 6
13 5 1 4 8
15 5 3 1 7
We may change the order of NA elements with replace
library(dplyr)
df %>%
group_by(id) %>%
mutate(across(starts_with('score'),
~replace(., 1:2, .[1:2][order(is.na(.[1:2]))]))) %>%
ungroup %>%
filter(if_all(starts_with('score'), Negate(is.na)))
-output
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
Here is an alternative way to achieve your task with fill from tidyr package:
library(dplyr)
library(tidyr)
df %>%
group_by(id) %>%
fill(everything(), .direction = "down") %>%
fill(everything(), .direction = "up") %>%
slice(1,3)
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
How about this?
library(dplyr)
df_e12 <- df %>%
filter(event %in% c(1, 2)) %>%
group_by(id) %>%
mutate(across(starts_with("score"), ~min(.x, na.rm = TRUE))) %>%
ungroup() %>%
distinct(id, .keep_all = TRUE)
df_e3 <- df %>%
filter(event == 3)
df <- bind_rows(df_e12, df_e3) %>%
arrange(id, event)
df
> df
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
I have two data frames of the same respondents, one from Time 1 and the next from Time 2. In each wave they nominated their friends, and I want to know:
1) how many friends are nominated in Time 2 but not in Time 1 (new friends)
2) how many friends are nominated in Time 1 but not in Time 2 (lost friends)
Sample data:
Time 1 DF
ID friend_1 friend_2 friend_3
1 4 12 7
2 8 6 7
3 9 NA NA
4 15 7 2
5 2 20 7
6 19 13 9
7 12 20 8
8 3 17 10
9 1 15 19
10 2 16 11
Time 2 DF
ID friend_1 friend_2 friend_3
1 4 12 3
2 8 6 14
3 9 NA NA
4 15 7 2
5 1 17 9
6 9 19 NA
7 NA NA NA
8 7 1 16
9 NA 10 12
10 7 11 9
So the desired DF would include these columns (EDIT filled in columns):
ID num_newfriends num_lostfriends
1 1 1
2 1 1
3 0 0
4 0 0
5 3 3
6 0 1
7 0 3
8 3 3
9 2 3
10 2 1
EDIT2:
I've tried doing an anti join
df3 <- anti_join(df1, df2)
But this method doesn't take into account friend id numbers that might appear in a different column in time 2 (For example respondent #6 friend 9 and 19 are in T1 and T2 but in different columns in each time)
Another option:
library(tidyverse)
left_join(
gather(df1, key, x, -ID),
gather(df2, key, y, -ID),
by = c("ID", "key")
) %>%
group_by(ID) %>%
summarise(
num_newfriends = sum(!y[!is.na(y)] %in% x[!is.na(x)]),
num_lostfriends = sum(!x[!is.na(x)] %in% y[!is.na(y)])
)
Output:
# A tibble: 10 x 3
ID num_newfriends num_lostfriends
<int> <int> <int>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
Simple comparisons would be an option
library(tidyverse)
na_sums_old <- rowSums(is.na(time1))
na_sums_new <- rowSums(is.na(time2))
kept_friends <- map_dbl(seq(nrow(time1)), ~ sum(time1[.x, -1] %in% time2[.x, -1]))
kept_friends <- kept_friends - na_sums_old * (na_sums_new >= 1)
new_friends <- 3 - na_sums_new - kept_friends
lost_friends <- 3 - na_sums_old - kept_friends
tibble(ID = time1$ID, new_friends = new_friends, lost_friends = lost_friends)
# A tibble: 10 x 3
ID new_friends lost_friends
<int> <dbl> <dbl>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
You can make anti_join work by first pivoting to a "long" data frame.
df1 <- df1 %>%
pivot_longer(starts_with("friend_"), values_to = "friend") %>%
drop_na()
df2 <- df2 %>%
pivot_longer(starts_with("friend_"), values_to = "friend") %>%
drop_na()
head(df1)
#> # A tibble: 6 x 3
#> ID name friend
#> <int> <chr> <int>
#> 1 1 friend_1 4
#> 2 1 friend_2 12
#> 3 1 friend_3 7
#> 4 2 friend_1 8
#> 5 2 friend_2 6
#> 6 2 friend_3 7
lost_friends <- anti_join(df1, df2, by = c("ID", "friend"))
new_fiends <- anti_join(df2, df1, by = c("ID", "friend"))
respondents <- distinct(df1, ID)
respondents %>%
full_join(
count(lost_friends, ID, name = "num_lost_friends")
) %>%
full_join(
count(new_fiends, ID, name = "num_new_friends")
) %>%
mutate_at(vars(starts_with("num_")), replace_na, 0)
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 3
#> ID num_lost_friends num_new_friends
#> <int> <dbl> <dbl>
#> 1 1 1 1
#> 2 2 1 1
#> 3 3 0 0
#> 4 4 0 0
#> 5 5 3 3
#> 6 6 1 0
#> 7 7 3 0
#> 8 8 3 3
#> 9 9 3 2
#> 10 10 2 2
Created on 2019-11-01 by the reprex package (v0.3.0)
data
data=data.frame("person"=c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2),
"score"=c(1,2,1,2,3,1,3,NA,4,2,1,NA,2,NA,3,1,2,4),
"want"=c(1,2,1,2,3,3,3,3,4,2,1,1,2,2,3,3,3,4))
attempt
library(dplyr)
data = data %>%
group_by(person) %>%
mutate(wantTEST = ifelse(score >= 3 | (row_number() >= which.max(score == 3)),
cummax(score), score),
wantTEST = replace(wantTEST, duplicated(wantTEST == 4) & wantTEST == 4, NA))
i am basically working to use the cummax function but only under specific circumstances. i want to keep any values (1-2-1-1) except if there is a 3 or 4 (1-2-1-3-2-1-4) should be (1-2-1-3-3-4). if there is NA value i want to carry forward previous value. thank you.
Here's one way with tidyverse. You may want to use fill() after group_by() but that's somewhat unclear.
data %>%
fill(score) %>%
group_by(person) %>%
mutate(
w = ifelse(cummax(score) > 2, cummax(score), score)
) %>%
ungroup()
# A tibble: 18 x 4
person score want w
<dbl> <dbl> <dbl> <dbl>
1 1 1 1 1
2 1 2 2 2
3 1 1 1 1
4 1 2 2 2
5 1 3 3 3
6 1 1 3 3
7 1 3 3 3
8 1 3 3 3
9 1 4 4 4
10 2 2 2 2
11 2 1 1 1
12 2 1 1 1
13 2 2 2 2
14 2 2 2 2
15 2 3 3 3
16 2 1 3 3
17 2 2 3 3
18 2 4 4 4
One way to do this is to first fill NA values and then for each row check if anytime the score of 3 or more is passed in the group. If the score of 3 is reached till that point we take the max score until that point or else return the same score.
library(tidyverse)
data %>%
fill(score) %>%
group_by(person) %>%
mutate(want1 = map_dbl(seq_len(n()), ~if(. >= which.max(score == 3))
max(score[seq_len(.)]) else score[.]))
# person score want want1
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 1 1
# 2 1 2 2 2
# 3 1 1 1 1
# 4 1 2 2 2
# 5 1 3 3 3
# 6 1 1 3 3
# 7 1 3 3 3
# 8 1 3 3 3
# 9 1 4 4 4
#10 2 2 2 2
#11 2 1 1 1
#12 2 1 1 1
#13 2 2 2 2
#14 2 2 2 2
#15 2 3 3 3
#16 2 1 3 3
#17 2 2 3 3
#18 2 4 4 4
Another way is to use accumulate from purrr. I use if_else_ from hablar for type stability:
library(tidyverse)
library(hablar)
data %>%
fill(score) %>%
group_by(person) %>%
mutate(wt = accumulate(score, ~if_else_(.x > 2, max(.x, .y), .y)))
I have the following data frame
d2
# A tibble: 10 x 2
ID Count
<int> <dbl>
1 1
2 1
3 1
4 1
5 1
6 2
7 2
8 2
9 3
10 3
Which states how many counts each person (ID) had.
I would like to calculate the cumulative percentage of each count: 1 - 50%, up to 2: 80%, up to 3: 100%.
I tried
> d2 %>% mutate(cum = cumsum(Count)/sum(Count))
# A tibble: 10 x 3
ID Count cum
<int> <dbl> <dbl>
1 1 0.05882353
2 1 0.11764706
3 1 0.17647059
4 1 0.23529412
5 1 0.29411765
6 2 0.41176471
7 2 0.52941176
8 2 0.64705882
9 3 0.82352941
10 3 1.00000000
but this result is obviously incorrect because I would expect that the count of 1 would correspond to 50% rather than 29.4%.
What is wrong here? How do I get the correct answer?
We get the count of 'Count', create the 'Cum' by taking the cumulative sum of 'n' and divide it by the sum of 'n', then right_join with the original data
d2 %>%
count(Count) %>%
mutate(Cum = cumsum(n)/sum(n)) %>%
select(-n) %>%
right_join(d2) %>%
select(names(d2), everything())
# A tibble: 10 x 3
# ID Count Cum
# <int> <int> <dbl>
# 1 1 1 0.500
# 2 2 1 0.500
# 3 3 1 0.500
# 4 4 1 0.500
# 5 5 1 0.500
# 6 6 2 0.800
# 7 7 2 0.800
# 8 8 2 0.800
# 9 9 3 1.00
#10 10 3 1.00
If we need the output as #LAP mentioned
d2 %>%
mutate(Cum = row_number()/n())
# ID Count Cum
#1 1 1 0.1
#2 2 1 0.2
#3 3 1 0.3
#4 4 1 0.4
#5 5 1 0.5
#6 6 2 0.6
#7 7 2 0.7
#8 8 2 0.8
#9 9 3 0.9
#10 10 3 1.0
This works:
d2 %>%
mutate(cum = cumsum(rep(1/n(), n())))
ID Count cum
1 1 1 0.1
2 2 1 0.2
3 3 1 0.3
4 4 1 0.4
5 5 1 0.5
6 6 2 0.6
7 7 2 0.7
8 8 2 0.8
9 9 3 0.9
10 10 3 1.0
One option could be as:
library(dplyr)
d2 %>%
group_by(Count) %>%
summarise(proportion = n()) %>%
mutate(Perc = cumsum(100*proportion/sum(proportion))) %>%
select(-proportion)
# # A tibble: 3 x 2
# Count Perc
# <int> <dbl>
# 1 1 50.0
# 2 2 80.0
# 3 3 100.0