I have a data frame like this:
| plazo | monto |
|-------|-------|
| 20    | 2     |
| 50    | 3     |
I need to add one row for each value from 1 up to the value of plazo, expanding my data frame as shown below:
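| plazo | monto | Semana |
|-------|-------|--------|
| 20    | 2     | 1      |
| 20    | 2     | 2      |
| 20    | 2     | 3      |
| 20    | 2     | …      |
| 20    | 2     | 20     |
| 50    | 3     | 1      |
| 50    | 3     | 2      |
| 50    | 3     | 3      |
| 50    | 3     | …      |
| 50    | 3     | 50     |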
We can create a nested column with values from 1:plazo for each row and then unnest that column.
df1 <- data.frame(plazo = c(2, 5), monto = c(2,3))
library(tidyverse)
# Expand each row into `plazo` rows, numbering them 1..plazo in `Semana`.
# seq_len() is used instead of 1:plazo so that plazo == 0 produces an empty
# sequence (and therefore no rows) rather than the backwards sequence c(1, 0).
df1 %>%
  rowwise() %>%
  mutate(Semana = list(seq_len(plazo))) %>%
  unnest(Semana)
#> # A tibble: 7 x 3
#> plazo monto Semana
#> <dbl> <dbl> <int>
#> 1 2 2 1
#> 2 2 2 2
#> 3 5 3 1
#> 4 5 3 2
#> 5 5 3 3
#> 6 5 3 4
#> 7 5 3 5
We may use uncount
library(dplyr)
library(tidyr)
# Repeat every row `plazo` times; `.id` records the running copy number in
# `Semana`, and `.remove = FALSE` keeps the `plazo` column in the result.
uncount(df1, weights = plazo, .id = "Semana", .remove = FALSE)
-output
plazo monto Semana
1 2 2 1
2 2 2 2
3 5 3 1
4 5 3 2
5 5 3 3
6 5 3 4
7 5 3 5
Related
So basically I have a data frame that looks like this:
| BX | BY |
|----|----|
| 1  | 12 |
| 1  | 12 |
| 1  | 12 |
| 2  | 14 |
| 2  | 14 |
| 3  | 5  |
I want to create another column, ID, which has the same number for rows that share the same values in BX and BY. The table would then look like this:
| BX | BY | ID |
|----|----|----|
| 1  | 12 | 1  |
| 1  | 12 | 1  |
| 1  | 12 | 1  |
| 2  | 14 | 2  |
| 2  | 14 | 2  |
| 3  | 5  | 3  |
Here is a base R way.
Subset the data.frame by the grouping columns, find the duplicated rows and use a standard cumsum trick.
# Build the example data directly from text. read.table(text = ) manages its
# own connection, so nothing is left open (an explicit textConnection() would
# have to be closed by hand).
df1 <- read.table(
  text = "BX BY
1 12
1 12
1 12
2 14
2 14
3 5",
  header = TRUE
)
# !duplicated() is TRUE at the first occurrence of each (BX, BY) pair, so the
# running sum steps up by one whenever a new pair starts.
# NOTE: this numbers groups correctly only when identical (BX, BY) rows are
# adjacent, as they are here; sort by the grouping columns first otherwise.
cumsum(!duplicated(df1[c("BX", "BY")]))
#> [1] 1 1 1 2 2 3
df1$ID <- cumsum(!duplicated(df1[c("BX", "BY")]))
df1
#> BX BY ID
#> 1 1 12 1
#> 2 1 12 1
#> 3 1 12 1
#> 4 2 14 2
#> 5 2 14 2
#> 6 3 5 3
Created on 2022-10-12 with reprex v2.0.2
You can do:
# Combine all columns of `dat` into a single factor and use its integer level
# codes as the ID. drop = TRUE discards combinations that never occur, so the
# codes are consecutive; lex.order = TRUE orders the levels by the first
# column, then the second. IDs therefore follow the sorted order of the
# values, not their order of appearance in `dat`.
transform(dat, ID = as.numeric(interaction(dat, drop = TRUE, lex.order = TRUE)))
BX BY ID
1 1 12 1
2 1 12 1
3 1 12 1
4 2 14 2
5 2 14 2
6 3 5 3
Or if you prefer dplyr:
library(dplyr)
# Group by every column so each distinct row combination forms one group,
# then label the rows with that group's index. `.cols` is given explicitly
# because calling across() with no column selection is deprecated in
# dplyr >= 1.1.
dat %>%
  group_by(across(everything())) %>%
  mutate(ID = cur_group_id()) %>%
  ungroup()
# A tibble: 6 × 3
BX BY ID
<dbl> <dbl> <int>
1 1 12 1
2 1 12 1
3 1 12 1
4 2 14 2
5 2 14 2
6 3 5 3
I have a series of rows in a single dataframe. I'm trying to aggregate the first two rows for each ID, i.e. I want to combine events 1 and 2 for ID 1 into a single row, events 1 and 2 for ID 2 into a single row, etc., but leave event 3 completely untouched.
# Example data: five ids with three events each. In events 1 and 2 exactly
# one of `score` / `score2` is present (the other is NA); event 3 has both.
id <- c(1,1,1,2,2,2,3,3,3,4,4,4,5,5,5)
event <- c(1,2,3,1,2,3,1,2,3,1,2,3,1,2,3)
score <- c(3,NA,1,3,NA,2,6,NA,1,8,NA,2,4,NA,1)
score2 <- c(NA,4,1,NA,5,2,NA,0,3,NA,5,6,NA,8,7)
# Assemble the four vectors into one tibble, one row per (id, event).
df <- tibble(id, event, score, score2)
# A tibble: 15 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 NA
2 1 2 NA 4
3 1 3 1 1
4 2 1 3 NA
5 2 2 NA 5
6 2 3 2 2
7 3 1 6 NA
8 3 2 NA 0
9 3 3 1 3
10 4 1 8 NA
11 4 2 NA 5
12 4 3 2 6
13 5 1 4 NA
14 5 2 NA 8
15 5 3 1 7
I've tried :
df_merged<- df %>% group_by (id) %>% summarise_all(funs(min(as.character(.),na.rm=TRUE))),
which aggregates these nicely, but then I struggle to merge these back into the original dataframe/tibble (there are really about 300 different "score" columns in the full dataset, so a right_join is a headache with score.x, score.y, score2.x, score2.y all over the place...)
Ideally, the situation would need to be dplyr as the rest of my code runs on this!
EDIT:
Ideally, my expected output would be:
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
3 1 3 1 1
4 2 1 3 5
6 2 3 2 2
7 3 1 6 0
9 3 3 1 3
10 4 1 8 5
12 4 3 2 6
13 5 1 4 8
15 5 3 1 7
We may change the order of NA elements with replace
library(dplyr)
# For every score column, reorder the first two events of each id so that
# the non-missing value comes first; rows that still contain an NA score
# afterwards are then dropped.
df %>%
  group_by(id) %>%
  mutate(across(starts_with('score'), function(col) {
    first_two <- col[1:2]
    # order(is.na(.)) sorts FALSE before TRUE, i.e. present before missing.
    replace(col, 1:2, first_two[order(is.na(first_two))])
  })) %>%
  ungroup() %>%
  filter(if_all(starts_with('score'), ~ !is.na(.x)))
-output
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
Here is an alternative way to achieve your task with fill from tidyr package:
library(dplyr)
library(tidyr)
# Within each id, pull the event-2 value up into event 1 wherever event 1 is
# missing, then keep event 1 (now the merged row) and event 3.
# Only an upward fill is used: a downward fill could copy an event-2 value
# into a missing event-3 cell, and event 3 has to stay untouched. The fill is
# limited to the score columns, and the result is ungrouped.
# Assumes the rows of each id are ordered by event (1, 2, 3).
df %>%
  group_by(id) %>%
  fill(starts_with("score"), .direction = "up") %>%
  slice(1, 3) %>%
  ungroup()
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
How about this?
library(dplyr)
# Events 1 and 2: collapse to one row per id, taking for each score column
# the smallest non-missing value of the pair.
df_e12 <- df %>%
  filter(event %in% c(1, 2)) %>%
  group_by(id) %>%
  mutate(across(
    starts_with("score"),
    # If both events are missing, keep a typed NA; min(na.rm = TRUE) would
    # return Inf and emit a warning.
    ~ if (all(is.na(.x))) .x[NA_integer_] else min(.x, na.rm = TRUE)
  )) %>%
  ungroup() %>%
  # Both rows of an id now hold the same scores; keep the first one.
  distinct(id, .keep_all = TRUE)
# Event 3 is carried over unchanged.
df_e3 <- df %>%
  filter(event == 3)
# Recombine and restore the original id/event ordering.
df <- bind_rows(df_e12, df_e3) %>%
  arrange(id, event)
df
> df
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
I have this file:
ID
1
1
1
3
3
3
7
7
7
And I need to assign two sets randomly, (1,2,3) and (5,15,25).
To do this I used this:
# Fix the RNG seed so the random assignment is reproducible.
set.seed(1109201)
# Within each ID, draw set1 and set2 without replacement. The two draws are
# independent of each other, so nothing ties set2 to the value of set1.
df %>%
group_by(ID) %>%
dplyr::mutate(set1=sample(c(1,2,3), size=n(), replace=F),set2=sample(c(5,15,25), size=n(), replace=F))
and I obtained this:
ID set1 set2
1 1 15
3 1 25
7 1 25
1 2 5
3 2 15
7 2 5
1 3 25
3 3 5
7 3 15
but I need different values for set2 in set1 and ID, like this:
ID set1 set2
1 1 15
3 1 25
7 1 5
1 2 5
3 2 15
7 2 25
1 3 25
3 3 5
7 3 15
A value of set2 must not be repeated within the same ID or within the same set1.
some suggestion to control these 2 sets?
Change your dplyr code to the following. Using a second `group_by()` step will make the second sampling occur only within each `set1` group.
# Fix the RNG seed so the random assignment is reproducible.
set.seed(1109201)
df %>%
  # Draw set1 without replacement inside each ID.
  group_by(ID) %>%
  dplyr::mutate(set1 = sample(c(1, 2, 3), size = n(), replace = FALSE)) %>%
  # Regroup by set1 so set2 is drawn without replacement inside each set1.
  # NOTE(review): this only prevents set2 repeating within a set1 value;
  # nothing here prevents a repeat within an ID -- check the result.
  group_by(set1) %>%
  mutate(set2 = sample(c(5, 15, 25), size = n(), replace = FALSE)) %>%
  ungroup()
# A tibble: 8 x 3
ID set1 set2
<dbl> <dbl> <dbl>
1 1 2 15
2 1 3 5
3 1 1 25
4 3 3 15
5 3 2 5
6 3 1 5
7 7 2 25
8 7 3 25
I observe 12 responses of 2 survey participants.
# Example data: 12 responses for each of two survey participants (id 1, 2).
data = data.frame(id = c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2), response = c(2,2,3,3,6,3,6,7,3,1,4,3,3,3,6,4,2,6,7,3,2,1,5,6))
data
id response
1 1 2
2 1 2
3 1 3
4 1 3
5 1 6
6 1 3
7 1 6
8 1 7
9 1 3
10 1 1
11 1 4
12 1 3
13 2 3
14 2 3
15 2 6
16 2 4
17 2 2
18 2 6
19 2 7
20 2 3
21 2 2
22 2 1
23 2 5
24 2 6
Now I want to add 2 things to the data of each survey participant:
a) The most frequent value of this survey participant
b) the relative frequency of the most frequent value
How can I add these things using dplyr:
data %>%
group_by(id) %>%
mutate(most_frequent_value = ?,
relative_frequency_of_most_frequent_value = ?)
I'd probably use a two step solution. First, create a data.frame of frequency/relative frequency. Then join to it. We use slice(which.max()), because it will return one row. Using slice_max may return multiple rows.
library(tidyverse)
# Count each (id, response) pair and turn the counts into relative
# frequencies within the id. The response column is renamed so it does not
# clash with `response` in the original data when joined back.
freq_table <- dd %>%
  count(id, response) %>%
  group_by(id) %>%
  mutate(rel_freq = n / sum(n)) %>%
  select(id, most_frequent_response = response, rel_freq)
# freq_table is still grouped by id, so slice(which.max()) keeps exactly one
# row per id (the first one on ties). The join key is given explicitly
# instead of relying on whichever column names the two tables share.
dd %>%
  inner_join(slice(freq_table, which.max(rel_freq)), by = "id")
# id response most_frequent_response rel_freq
# 1 1 2 3 0.4166667
# 2 1 2 3 0.4166667
# 3 1 3 3 0.4166667
# 4 1 3 3 0.4166667
# 5 1 6 3 0.4166667
# ...
You could try:
# Cross-tabulate id by response and reshape the counts to long form.
table(data$id, data$response) %>%
  as.data.frame() %>%
  setNames(c("id", "response", "n")) %>%
  # Keep the most frequent response(s) per id. The row count must be passed
  # as `n = 1`: an unnamed second argument is rejected by current dplyr.
  # Ties are kept (with_ties defaults to TRUE).
  group_by(id) %>%
  slice_max(n, n = 1) %>%
  # Keep only responses that are the top response for more than one id.
  group_by(response) %>%
  filter(n() > 1) %>%
  # Share of the pooled count contributed by each id. Vectorised so it works
  # for any number of ids per response, not just two.
  mutate(ratio = n / sum(n))
#> # A tibble: 2 x 4
#> # Groups: response [1]
#> id response n ratio
#> <fct> <fct> <int> <dbl>
#> 1 1 3 5 0.625
#> 2 2 3 3 0.375
Does this work:
# Attach to every row the number of times its (id, response) pair occurs,
# then, per id, pick the response with the highest count (first one on ties)
# and express that count as a share of the id's rows.
data %>%
  add_count(id, response) %>%
  group_by(id) %>%
  mutate(
    most_frequent_value = response[which.max(n)],
    relative_frequency_of_most_frequent_value = max(n) / n()
  )
# A tibble: 24 x 5
# Groups: id [2]
id response n most_frequent_value relative_frequency_of_most_frequent_value
<dbl> <dbl> <int> <dbl> <dbl>
1 1 2 2 3 0.417
2 1 2 2 3 0.417
3 1 3 5 3 0.417
4 1 3 5 3 0.417
5 1 6 2 3 0.417
6 1 3 5 3 0.417
7 1 6 2 3 0.417
8 1 7 1 3 0.417
9 1 3 5 3 0.417
10 1 1 1 3 0.417
11 1 4 1 3 0.417
12 1 3 5 3 0.417
13 2 3 3 3 0.25
14 2 3 3 3 0.25
15 2 6 3 3 0.25
16 2 4 1 3 0.25
17 2 2 2 3 0.25
18 2 6 3 3 0.25
19 2 7 1 3 0.25
20 2 3 3 3 0.25
21 2 2 2 3 0.25
22 2 1 1 3 0.25
23 2 5 1 3 0.25
24 2 6 3 3 0.25
>
I have two data frames of the same respondents, one from Time 1 and the next from Time 2. In each wave they nominated their friends, and I want to know:
1) how many friends are nominated in Time 2 but not in Time 1 (new friends)
2) how many friends are nominated in Time 1 but not in Time 2 (lost friends)
Sample data:
Time 1 DF
ID friend_1 friend_2 friend_3
1 4 12 7
2 8 6 7
3 9 NA NA
4 15 7 2
5 2 20 7
6 19 13 9
7 12 20 8
8 3 17 10
9 1 15 19
10 2 16 11
Time 2 DF
ID friend_1 friend_2 friend_3
1 4 12 3
2 8 6 14
3 9 NA NA
4 15 7 2
5 1 17 9
6 9 19 NA
7 NA NA NA
8 7 1 16
9 NA 10 12
10 7 11 9
So the desired DF would include these columns (EDIT filled in columns):
ID num_newfriends num_lostfriends
1 1 1
2 1 1
3 0 0
4 0 0
5 3 3
6 0 1
7 0 3
8 3 3
9 2 3
10 2 1
EDIT2:
I've tried doing an anti join
df3 <- anti_join(df1, df2)
But this method doesn't take into account friend id numbers that might appear in a different column in time 2 (For example respondent #6 friend 9 and 19 are in T1 and T2 but in different columns in each time)
Another option:
library(tidyverse)
# Reshape both waves to long form (one row per ID and friend slot) and line
# them up slot by slot. pivot_longer() replaces the superseded gather().
left_join(
  pivot_longer(df1, -ID, names_to = "key", values_to = "x"),
  pivot_longer(df2, -ID, names_to = "key", values_to = "y"),
  by = c("ID", "key")
) %>%
  group_by(ID) %>%
  summarise(
    # Friends named at Time 2 that are absent from Time 1 (NAs ignored).
    num_newfriends = sum(!y[!is.na(y)] %in% x[!is.na(x)]),
    # Friends named at Time 1 that are absent from Time 2 (NAs ignored).
    num_lostfriends = sum(!x[!is.na(x)] %in% y[!is.na(y)])
  )
Output:
# A tibble: 10 x 3
ID num_newfriends num_lostfriends
<int> <int> <int>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
Simple comparisons would be an option
library(tidyverse)
# Number of empty (NA) friend slots per respondent in each wave.
na_sums_old <- rowSums(is.na(time1))
na_sums_new <- rowSums(is.na(time2))
# Number of friend slots, i.e. every column except ID (3 in the sample data).
n_slots <- ncol(time1) - 1
# Per respondent, count Time 1 entries that also appear anywhere in Time 2.
# seq_len() keeps this safe for a zero-row input, where seq(0) is c(1, 0).
kept_friends <- map_dbl(
  seq_len(nrow(time1)),
  ~ sum(time1[.x, -1] %in% time2[.x, -1])
)
# NA matches NA under %in%, so when Time 2 has at least one NA every empty
# Time 1 slot was counted as "kept"; remove those spurious matches.
kept_friends <- kept_friends - na_sums_old * (na_sums_new >= 1)
# Named friends in a wave that were not kept are new (Time 2) or lost (Time 1).
new_friends <- n_slots - na_sums_new - kept_friends
lost_friends <- n_slots - na_sums_old - kept_friends
tibble(ID = time1$ID, new_friends = new_friends, lost_friends = lost_friends)
# A tibble: 10 x 3
ID new_friends lost_friends
<int> <dbl> <dbl>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
You can make anti_join work by first pivoting to a "long" data frame.
# Reshape each wave to one row per (ID, friend slot), then drop rows that
# contain an NA (the empty friend slots).
df1 <- drop_na(
  pivot_longer(df1, cols = starts_with("friend_"), values_to = "friend")
)
df2 <- drop_na(
  pivot_longer(df2, cols = starts_with("friend_"), values_to = "friend")
)
head(df1)
#> # A tibble: 6 x 3
#> ID name friend
#> <int> <chr> <int>
#> 1 1 friend_1 4
#> 2 1 friend_2 12
#> 3 1 friend_3 7
#> 4 2 friend_1 8
#> 5 2 friend_2 6
#> 6 2 friend_3 7
# Friends present in one wave with no matching (ID, friend) pair in the
# other wave, regardless of which friend_* slot they were listed in.
lost_friends <- anti_join(df1, df2, by = c("ID", "friend"))
new_friends <- anti_join(df2, df1, by = c("ID", "friend"))
# One row per respondent who named at least one friend at Time 1; the full
# joins below add any respondent that appears only in the counts.
respondents <- distinct(df1, ID)
respondents %>%
  full_join(
    count(lost_friends, ID, name = "num_lost_friends")
  ) %>%
  full_join(
    count(new_friends, ID, name = "num_new_friends")
  ) %>%
  # Respondents without lost/new friends have no count row; report them as 0.
  # across() replaces the superseded mutate_at().
  mutate(across(starts_with("num_"), ~ replace_na(.x, 0)))
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 3
#> ID num_lost_friends num_new_friends
#> <int> <dbl> <dbl>
#> 1 1 1 1
#> 2 2 1 1
#> 3 3 0 0
#> 4 4 0 0
#> 5 5 3 3
#> 6 6 1 0
#> 7 7 3 0
#> 8 8 3 3
#> 9 9 3 2
#> 10 10 2 2
Created on 2019-11-01 by the reprex package (v0.3.0)