Count variable until observation changes [duplicate] - r

Unfortunately, I can't wrap my head around this, but I'm sure there is a straightforward solution. I have a data.frame that looks like this:
set.seed(1)
mydf <- data.frame(group=sample(c("a", "b"), 20, replace=T))
I'd like to create a new variable that counts, from top to bottom, how many times the group has occurred in a row. For the example above it should look like this:
mydf$question <- c(1, 2, 1, 2, 1, 1, 2, 3, 4, 1, 2, 3, 1, 1, 1, 1, 1, 2, 1, 1)
> mydf[1:10,]
group question
1 a 1
2 a 2
3 b 1
4 b 2
5 a 1
6 b 1
7 b 2
8 b 3
9 b 4
10 a 1
Thanks for your help.

Using data.table::rleid and dplyr you could do:
set.seed(1)
mydf <- data.frame(group=sample(c("a", "b"), 20, replace=T))
library(dplyr)
library(data.table)
mydf %>%
  mutate(id = data.table::rleid(group)) %>%
  group_by(id) %>%
  mutate(question = row_number()) %>%
  ungroup()
#> # A tibble: 20 × 3
#> group id question
#> <chr> <int> <int>
#> 1 a 1 1
#> 2 b 2 1
#> 3 a 3 1
#> 4 a 3 2
#> 5 b 4 1
#> 6 a 5 1
#> 7 a 5 2
#> 8 a 5 3
#> 9 b 6 1
#> 10 b 6 2
#> 11 a 7 1
#> 12 a 7 2
#> 13 a 7 3
#> 14 a 7 4
#> 15 a 7 5
#> 16 b 8 1
#> 17 b 8 2
#> 18 b 8 3
#> 19 b 8 4
#> 20 a 9 1
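
If you are on dplyr 1.1.0 or later, consecutive_id() (inspired by rleid) gives you the same run ids without the data.table dependency. A minimal sketch of the same pipeline, assuming dplyr >= 1.1.0; the helper column name run is arbitrary:
library(dplyr)  # consecutive_id() requires dplyr >= 1.1.0

mydf %>%
  group_by(run = consecutive_id(group)) %>%  # new id each time group changes
  mutate(question = row_number()) %>%        # counter restarts within each run
  ungroup() %>%
  select(-run)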

Update: Mostly the same as stefan's answer, but without the data.table package:
library(dplyr)
mydf %>%
  mutate(myrleid = with(rle(group), rep(seq_along(lengths), lengths))) %>%
  group_by(myrleid) %>%
  mutate(question = row_number()) %>%
  ungroup()
# A tibble: 20 × 3
group myrleid question
<chr> <int> <int>
1 a 1 1
2 b 2 1
3 a 3 1
4 a 3 2
5 b 4 1
6 a 5 1
7 a 5 2
8 a 5 3
9 b 6 1
10 b 6 2
11 a 7 1
12 a 7 2
13 a 7 3
14 a 7 4
15 a 7 5
16 b 8 1
17 b 8 2
18 b 8 3
19 b 8 4
20 a 9 1
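
Both answers boil down to run-length encoding. For completeness, base R can produce the counter in a single line: rle() gives the run lengths and sequence() expands each length L into 1:L, which is exactly the per-run counter. A minimal sketch:
# as.character() guards against group being a factor in older R versions
mydf$question <- sequence(rle(as.character(mydf$group))$lengths)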

Related

Rearranging the rows based on sequential unique values

I have the following data set containing duplicate columns, and I would like to stack them in the following way. I can get the desired output with bind_rows, but I would like to try it with tidyr functions:
df <- tibble(
  runs = c(1, 2, 3, 4),
  col1 = c(3, 4, 5, 5),
  col2 = c(5, 3, 1, 4),
  col3 = c(6, 4, 9, 2),
  col1 = c(0, 2, 2, 1),
  col2 = c(2, 3, 1, 7),
  col3 = c(2, 4, 9, 9),
  col1 = c(3, 4, 5, 7),
  col2 = c(3, 3, 1, 4),
  col3 = c(3, 2, NA, NA),
  .name_repair = "minimal")
df %>%
  select(runs, 2:4) %>%
  bind_rows(df %>% select(runs, 5:7)) %>%
  bind_rows(df %>% select(runs, 8:10))
# A tibble: 12 x 4   # This is my desired output: the runs column repeats 1 to 4
runs col1 col2 col3
<dbl> <dbl> <dbl> <dbl>
1 1 3 5 6
2 2 4 3 4
3 3 5 1 9
4 4 5 4 2
5 1 0 2 2
6 2 2 3 4
7 3 2 1 9
8 4 1 7 9
9 1 3 3 3
10 2 4 3 2
11 3 5 1 NA
12 4 7 4 NA
However, when I use tidyr, runs is arranged differently, as follows:
df %>%
  pivot_longer(-runs) %>%
  group_by(name) %>%
  mutate(id = row_number()) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  select(-id)
# A tibble: 12 x 4
runs col1 col2 col3
<dbl> <dbl> <dbl> <dbl>
1 1 3 5 6
2 1 0 2 2
3 1 3 3 3
4 2 4 3 4
5 2 2 3 4
6 2 4 3 2
7 3 5 1 9
8 3 2 1 9
9 3 5 1 NA
10 4 5 4 2
11 4 1 7 9
12 4 7 4 NA
I would be grateful if you could let me know how to rearrange runs so that the numbers cycle 1 to 4 sequentially, rather than appearing as three 1s in a row, and so on.
Thank you very much in advance.
There may be a more elegant way to do this, but could you not simply group by runs and use the row numbers to arrange?
df %>%
  pivot_longer(cols = starts_with("col"),
               names_to = c(".value")) %>%
  group_by(runs) %>%
  mutate(grp_n = row_number()) %>%
  ungroup() %>%
  arrange(grp_n, runs)
# A tibble: 12 x 5
runs col1 col2 col3 grp_n
<dbl> <dbl> <dbl> <dbl> <int>
1 1 3 5 6 1
2 2 4 3 4 1
3 3 5 1 9 1
4 4 5 4 2 1
5 1 0 2 2 2
6 2 2 3 4 2
7 3 2 1 9 2
8 4 1 7 9 2
9 1 3 3 3 3
10 2 4 3 2 3
11 3 5 1 NA 3
12 4 7 4 NA 3
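Arranging by grp_n first and runs second is what restores the block order: grp_n records which copy of the columns a row came from, so all the first copies come before all the second copies, matching the bind_rows output. Drop the helper with select(-grp_n) if you don't want it in the result.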
A base R option using split.default:
data.frame(runs = df$runs,
           sapply(split.default(df[-1], names(df)[-1]), unlist),
           row.names = NULL)
# runs col1 col2 col3
#1 1 3 5 6
#2 2 4 3 4
#3 3 5 1 9
#4 4 5 4 2
#5 1 0 2 2
#6 2 2 3 4
#7 3 2 1 9
#8 4 1 7 9
#9 1 3 3 3
#10 2 4 3 2
#11 3 5 1 NA
#12 4 7 4 NA
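Why this works: split.default() splits the columns (not the rows) of df[-1] by their names, collecting the three col1 columns into one group, and so on; unlist() then stacks each group top to bottom, which is why runs cycles 1 to 4. A quick way to inspect the intermediate grouping (a sketch):
# returns a named list ($col1, $col2, $col3), each element a 4-row
# data frame holding all the like-named columns side by side
str(split.default(df[-1], names(df)[-1]))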

How can I distribute a vector of numbers by a vector of percentages, round the result, and always get the same total that I started with in R?

Question Summary
I want to multiply a vector of numbers (the Sum_By_Group column) by a vector of percentages (the Percent column) to distribute the total number for the group into each ID, round the result, and end up with the same total number that I started with. In other words, I want the Distribution_Post_Round column to be the same as the Sum_By_Group column.
Below is an example of the issue that I am running into. In Group A, I multiply Percent by Sum_By_Group and finish with 3 in ID 1, 3 in ID 2, and 1 in ID 5 for a total of 7. The Sum_By_Group column and Distribution_Post_Round column are the same for Group A and this is what I want. In Group B, I multiply Percent by Sum_By_Group and finish with 1 in ID 8 and 1 in ID 10 for a total of 2. I want the Distribution_Post_Round column to be 3 for Group B.
Is there a way to do this without using loops, subsetting data frames, and then rejoining the data frames together?
Example
library(dplyr)

df = data.frame('Group' = c(rep('A', 7), rep('B', 5)),
                'ID' = c(1:12),
                'Percent' = c(0.413797750, 0.385366840, 0.014417571, 0.060095668, 0.076399650,
                              0.019672573, 0.030249949, 0.381214519, 0.084121796, 0.438327886,
                              0.010665749, 0.085670050),
                'Sum_By_Group' = c(rep(7, 7), rep(3, 5)))

df$Distribute_By_ID = round(df$Percent * df$Sum_By_Group, 0)
df_round = aggregate(Distribute_By_ID ~ Group, data = df, sum)
names(df_round)[names(df_round) == 'Distribute_By_ID'] = 'Distribution_Post_Round'
df = left_join(df, df_round, by = 'Group')
df
Group ID Percent Sum_By_Group Distribute_By_ID Distribution_Post_Round
A 1 0.41379775 7 3 7
A 2 0.38536684 7 3 7
A 3 0.01441757 7 0 7
A 4 0.06009567 7 0 7
A 5 0.07639965 7 1 7
A 6 0.01967257 7 0 7
A 7 0.03024995 7 0 7
B 8 0.38121452 3 1 2
B 9 0.08412180 3 0 2
B 10 0.43832789 3 1 2
B 11 0.01066575 3 0 2
B 12 0.08567005 3 0 2
Thank you so much for your help. Please let me know if additional clarification is needed.
Wow, who knew someone had already written a package that includes a function to solve this... kudos to that team https://cran.r-project.org/web/packages/sfsmisc/index.html
Since you seem to be willing to use dplyr hopefully this additional package will be worth it because it certainly makes a solution elegant.
# https://stackoverflow.com/questions/61667720
library(dplyr)

df = data.frame('Group' = c(rep('A', 7), rep('B', 5)),
                'ID' = c(1:12),
                'Percent' = c(0.413797750, 0.385366840, 0.014417571, 0.060095668, 0.076399650,
                              0.019672573, 0.030249949, 0.381214519, 0.084121796, 0.438327886,
                              0.010665749, 0.085670050),
                'Sum_By_Group' = c(rep(7, 7), rep(3, 5)))
glimpse(df)
#> Rows: 12
#> Columns: 4
#> $ Group <chr> "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "…
#> $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
#> $ Percent <dbl> 0.41379775, 0.38536684, 0.01441757, 0.06009567, 0.076399…
#> $ Sum_By_Group <dbl> 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3, 3
df %>%
  group_by(Group) %>%
  mutate(Distribute_By_ID = sfsmisc::roundfixS(Percent * Sum_By_Group))
#> # A tibble: 12 x 5
#> # Groups: Group [2]
#> Group ID Percent Sum_By_Group Distribute_By_ID
#> <chr> <int> <dbl> <dbl> <dbl>
#> 1 A 1 0.414 7 3
#> 2 A 2 0.385 7 3
#> 3 A 3 0.0144 7 0
#> 4 A 4 0.0601 7 0
#> 5 A 5 0.0764 7 1
#> 6 A 6 0.0197 7 0
#> 7 A 7 0.0302 7 0
#> 8 B 8 0.381 3 1
#> 9 B 9 0.0841 3 0
#> 10 B 10 0.438 3 2
#> 11 B 11 0.0107 3 0
#> 12 B 12 0.0857 3 0
Created on 2020-05-07 by the reprex package (v0.3.0)
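For context: sfsmisc::roundfixS() rounds a numeric vector to integers while keeping the rounded values' sum fixed at the (integer) sum of the input, which is exactly the constraint the question asks for; applied per group, each group's total is preserved.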
df %>%
  mutate(dividend = floor(Percent*Sum_By_Group),           # whole "seats"
         remainder = Percent*Sum_By_Group - dividend) %>%  # fractional part
  group_by(Group) %>%
  arrange(desc(remainder), .by_group = TRUE) %>%
  mutate(delivered = sum(dividend),                        # seats already handed out
         rownumber = 1:n(),
         # the largest remainders each get one extra seat
         lastdelivery = if_else(rownumber <= Sum_By_Group - delivered, 1, 0),
         Final = dividend + lastdelivery) %>%
  ungroup()
# A tibble: 12 x 10
Group ID Percent Sum_By_Group dividend remainder delivered rownumber lastdelivery Final
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 A 1 0.414 7 2 0.897 4 1 1 3
2 A 2 0.385 7 2 0.698 4 2 1 3
3 A 5 0.0764 7 0 0.535 4 3 1 1
4 A 4 0.0601 7 0 0.421 4 4 0 0
5 A 7 0.0302 7 0 0.212 4 5 0 0
6 A 6 0.0197 7 0 0.138 4 6 0 0
7 A 3 0.0144 7 0 0.101 4 7 0 0
8 B 10 0.438 3 1 0.315 2 1 1 2
9 B 12 0.0857 3 0 0.257 2 2 0 0
10 B 9 0.0841 3 0 0.252 2 3 0 0
11 B 8 0.381 3 1 0.144 2 4 0 1
12 B 11 0.0107 3 0 0.0320 2 5 0 0
This is my solution, relying on the Hare quota (largest remainder method) without any other dependencies: I first distribute all the whole "seats" (the floor of each product), then distribute the remaining "seats" in order of decreasing remainder.
The "Final" column then sums to Sum_By_Group within each group.
Note: it seems to give the same results as the other solution that uses a package.
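
The same largest-remainder logic can be packaged as a small reusable function. A minimal sketch (the name round_lr is mine, not from any package):
# Round p * total to integers that sum exactly to `total`,
# giving the extra units to the largest fractional remainders
round_lr <- function(p, total) {
  raw   <- p * total
  base  <- floor(raw)
  extra <- total - sum(base)  # units still to hand out
  up    <- order(raw - base, decreasing = TRUE)[seq_len(extra)]
  base[up] <- base[up] + 1
  base
}

df %>%
  group_by(Group) %>%
  mutate(Distribute_By_ID = round_lr(Percent, first(Sum_By_Group))) %>%
  ungroup()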
Formulating this as an integer optimization problem:
library(CVXR)

# one indicator row per group, so A %*% x gives the within-group sums
A <- as.data.frame.matrix(t(model.matrix(~ 0 + Group, df)))
prop <- df$Percent * df$Sum_By_Group
x <- Variable(nrow(df), integer = TRUE)
sums <- df$Sum_By_Group[!duplicated(df$Group)]

# find the integers closest (in squared error) to the exact shares,
# subject to each group's total being preserved
p <- Problem(Minimize(sum_squares(x - prop)), list(A %*% x == sums))
result <- solve(p)
df$Distribute_By_ID <- as.integer(round(result$getValue(x)))
output:
   Group ID    Percent Sum_By_Group Distribute_By_ID
1      A  1 0.41379775            7                3
2      A  2 0.38536684            7                3
3      A  3 0.01441757            7                0
4      A  4 0.06009567            7                0
5      A  5 0.07639965            7                1
6      A  6 0.01967257            7                0
7      A  7 0.03024995            7                0
8      B  8 0.38121452            3                1
9      B  9 0.08412180            3                0
10     B 10 0.43832789            3                2
11     B 11 0.01066575            3                0
12     B 12 0.08567005            3                0

Unnest or unchop dataframe containing lists of different lengths

I have a dataframe with several columns containing list columns that I want to unnest (or unchop). BUT, they are different lengths, so the resulting error is Error: No common size for...
Here is a reprex to show what works and doesn't work.
library(tidyr)
library(vctrs)

# This works as expected
df_A <- tibble(
  ID = 1:3,
  A = as_list_of(list(c(9, 8, 5), c(7, 6), c(6, 9)))
)
unchop(df_A, cols = c(A))
# A tibble: 7 x 2
ID A
<int> <dbl>
1 1 9
2 1 8
3 1 5
4 2 7
5 2 6
6 3 6
7 3 9
# This works as expected as the lists are the same lengths
df_AB_1 <- tibble(
  ID = 1:3,
  A = as_list_of(list(c(9, 8, 5), c(7, 6), c(6, 9))),
  B = as_list_of(list(c(1, 2, 3), c(4, 5), c(7, 8)))
)
unchop(df_AB_1, cols = c(A, B))
# A tibble: 7 x 3
ID A B
<int> <dbl> <dbl>
1 1 9 1
2 1 8 2
3 1 5 3
4 2 7 4
5 2 6 5
6 3 6 7
7 3 9 8
# This does NOT work as the lists are different lengths
df_AB_2 <- tibble(
  ID = 1:3,
  A = as_list_of(list(c(9, 8, 5), c(7, 6), c(6, 9))),
  B = as_list_of(list(c(1, 2), c(4, 5, 6), c(7, 8, 9, 0)))
)
unchop(df_AB_2, cols = c(A, B))
#> Error: No common size for `A`, size 3, and `B`, size 2.
The output that I would like to achieve for df_AB_2 above is as follows where each list is unchopped and missing values are filled with NA:
# A tibble: 10 x 3
ID A B
<dbl> <dbl> <dbl>
1 1 9 1
2 1 8 2
3 1 5 NA
4 2 7 4
5 2 6 5
6 2 NA 6
7 3 6 7
8 3 9 8
9 3 NA 9
10 3 NA 0
I have referenced this issue on Github and StackOverflow here.
Any ideas how to achieve the result above?
Versions
> packageVersion("tidyr")
[1] ‘1.0.0’
> packageVersion("vctrs")
[1] ‘0.2.0.9001’
Here is an idea via dplyr that you can generalise to as many columns as you want,
library(tidyverse)

df_AB_2 %>%
  pivot_longer(c(A, B)) %>%
  # pad every vector to the longest length; `length<-` fills with NA
  mutate(value = lapply(value, `length<-`, max(lengths(value)))) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  unnest() %>%
  # drop rows where both A and B are padding
  filter(rowSums(is.na(.[-1])) != 2)
which gives,
# A tibble: 10 x 3
ID A B
<int> <dbl> <dbl>
1 1 9 1
2 1 8 2
3 1 5 NA
4 2 7 4
5 2 6 5
6 2 NA 6
7 3 6 7
8 3 9 8
9 3 NA 9
10 3 NA 0
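Note that with tidyr 1.0 and later, the bare unnest() call above triggers a warning asking for explicit columns; writing unnest(cols = c(A, B)) silences it without changing the result.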
Defining a helper function to update the lengths of the elements, then proceeding with dplyr:
foo <- function(x, len_vec) {
  lapply(
    seq_len(length(x)),
    function(i) {
      length(x[[i]]) <- len_vec[i]
      x[[i]]
    }
  )
}

df_AB_2 %>%
  mutate(maxl = pmax(lengths(A), lengths(B))) %>%
  mutate(A = foo(A, maxl), B = foo(B, maxl)) %>%
  unchop(cols = c(A, B)) %>%
  select(-maxl)
# A tibble: 10 x 3
ID A B
<int> <dbl> <dbl>
1 1 9 1
2 1 8 2
3 1 5 NA
4 2 7 4
5 2 6 5
6 2 NA 6
7 3 6 7
8 3 9 8
9 3 NA 9
10 3 NA 0
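The helper can also be replaced by purrr::map2 with `length<-`, which pads each vector with NA up to the per-row maximum. A minimal sketch of the same idea (assumes the tidyverse is loaded and dplyr >= 1.0 for across()):
df_AB_2 %>%
  mutate(maxl = pmax(lengths(A), lengths(B)),                 # per-row target length
         across(c(A, B), ~ map2(.x, maxl, `length<-`))) %>%   # pad with NA
  unchop(c(A, B)) %>%
  select(-maxl)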
Using data.table:
library(data.table)
setDT(df_AB_2)
df_AB_2[, maxl := pmax(lengths(A), lengths(B))]
df_AB_2[, .(unlist(A)[seq_len(maxl)], unlist(B)[seq_len(maxl)]), by = ID]
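As written, the data.table result names the unchopped columns V1 and V2 (the padding happens because indexing past the end of a vector returns NA); naming them inside .( ) keeps A and B:
df_AB_2[, .(A = unlist(A)[seq_len(maxl)], B = unlist(B)[seq_len(maxl)]), by = ID]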

Count number of new and lost friends between two data frames in R

I have two data frames of the same respondents, one from Time 1 and one from Time 2. In each wave they nominated their friends, and I want to know:
1) how many friends are nominated in Time 2 but not in Time 1 (new friends)
2) how many friends are nominated in Time 1 but not in Time 2 (lost friends)
Sample data:
Time 1 DF
ID friend_1 friend_2 friend_3
1 4 12 7
2 8 6 7
3 9 NA NA
4 15 7 2
5 2 20 7
6 19 13 9
7 12 20 8
8 3 17 10
9 1 15 19
10 2 16 11
Time 2 DF
ID friend_1 friend_2 friend_3
1 4 12 3
2 8 6 14
3 9 NA NA
4 15 7 2
5 1 17 9
6 9 19 NA
7 NA NA NA
8 7 1 16
9 NA 10 12
10 7 11 9
So the desired DF would include these columns (EDIT: filled in the columns):
ID num_newfriends num_lostfriends
1 1 1
2 1 1
3 0 0
4 0 0
5 3 3
6 0 1
7 0 3
8 3 3
9 2 3
10 2 1
EDIT 2:
I've tried an anti join:
df3 <- anti_join(df1, df2)
But this method doesn't take into account friend ID numbers that might appear in a different column in Time 2 (for example, respondent #6 nominates friends 9 and 19 in both T1 and T2, but in different columns each time).
Another option:
library(tidyverse)

left_join(
  gather(df1, key, x, -ID),
  gather(df2, key, y, -ID),
  by = c("ID", "key")
) %>%
  group_by(ID) %>%
  summarise(
    num_newfriends = sum(!y[!is.na(y)] %in% x[!is.na(x)]),
    num_lostfriends = sum(!x[!is.na(x)] %in% y[!is.na(y)])
  )
Output:
# A tibble: 10 x 3
ID num_newfriends num_lostfriends
<int> <int> <int>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
Simple comparisons would be an option:
library(tidyverse)

# NA counts per row in each wave
na_sums_old <- rowSums(is.na(time1))
na_sums_new <- rowSums(is.na(time2))

# friends from time1 still present anywhere in the same row of time2
kept_friends <- map_dbl(seq(nrow(time1)), ~ sum(time1[.x, -1] %in% time2[.x, -1]))
# NA %in% a vector containing NA counts as a match, so subtract those spurious "kept" NAs
kept_friends <- kept_friends - na_sums_old * (na_sums_new >= 1)

new_friends <- 3 - na_sums_new - kept_friends
lost_friends <- 3 - na_sums_old - kept_friends
tibble(ID = time1$ID, new_friends = new_friends, lost_friends = lost_friends)
# A tibble: 10 x 3
ID new_friends lost_friends
<int> <dbl> <dbl>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
You can make anti_join work by first pivoting to a "long" data frame.
df1 <- df1 %>%
  pivot_longer(starts_with("friend_"), values_to = "friend") %>%
  drop_na()

df2 <- df2 %>%
  pivot_longer(starts_with("friend_"), values_to = "friend") %>%
  drop_na()
head(df1)
#> # A tibble: 6 x 3
#> ID name friend
#> <int> <chr> <int>
#> 1 1 friend_1 4
#> 2 1 friend_2 12
#> 3 1 friend_3 7
#> 4 2 friend_1 8
#> 5 2 friend_2 6
#> 6 2 friend_3 7
lost_friends <- anti_join(df1, df2, by = c("ID", "friend"))
new_friends <- anti_join(df2, df1, by = c("ID", "friend"))

respondents <- distinct(df1, ID)
respondents %>%
  full_join(
    count(lost_friends, ID, name = "num_lost_friends")
  ) %>%
  full_join(
    count(new_friends, ID, name = "num_new_friends")
  ) %>%
  mutate_at(vars(starts_with("num_")), replace_na, 0)
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 3
#> ID num_lost_friends num_new_friends
#> <int> <dbl> <dbl>
#> 1 1 1 1
#> 2 2 1 1
#> 3 3 0 0
#> 4 4 0 0
#> 5 5 3 3
#> 6 6 1 0
#> 7 7 3 0
#> 8 8 3 3
#> 9 9 3 2
#> 10 10 2 2
Created on 2019-11-01 by the reprex package (v0.3.0)
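
A base R alternative using the same per-row set logic: setdiff() on the non-NA nominations gives the gained and lost sets directly. A minimal sketch (assumes time1/time2 as above and that each friend is nominated at most once per wave):
counts <- t(sapply(seq_len(nrow(time1)), function(i) {
  t1 <- na.omit(unlist(time1[i, -1]))  # Time 1 nominations, NAs dropped
  t2 <- na.omit(unlist(time2[i, -1]))  # Time 2 nominations
  c(num_newfriends = length(setdiff(t2, t1)),
    num_lostfriends = length(setdiff(t1, t2)))
}))
data.frame(ID = time1$ID, counts)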

Create new column based on condition from other column per group using tidy evaluation

Similar to this question but I want to use tidy evaluation instead.
df = data.frame(group = c(1, 1, 1, 2, 2, 2, 3, 3, 3),
                date = c(1, 2, 3, 4, 5, 6, 7, 8, 9),
                speed = c(3, 4, 3, 4, 5, 6, 6, 4, 9))
> df
group date speed
1 1 1 3
2 1 2 4
3 1 3 3
4 2 4 4
5 2 5 5
6 2 6 6
7 3 7 6
8 3 8 4
9 3 9 9
The task is to create a new column (newValue) whose values equal the values of the date column (per group) under one condition: speed == 4. Example: group 1 has a newValue of 2 because date[speed == 4] = 2.
group date speed newValue
1 1 1 3 2
2 1 2 4 2
3 1 3 3 2
4 2 4 4 4
5 2 5 5 4
6 2 6 6 4
7 3 7 6 8
8 3 8 4 8
9 3 9 9 8
It works without tidy evaluation:
df %>%
  group_by(group) %>%
  mutate(newValue = date[speed == 4L])
#> # A tibble: 9 x 4
#> # Groups: group [3]
#> group date speed newValue
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 3 2
#> 2 1 2 4 2
#> 3 1 3 3 2
#> 4 2 4 4 4
#> 5 2 5 5 4
#> 6 2 6 6 4
#> 7 3 7 6 8
#> 8 3 8 4 8
#> 9 3 9 9 8
But it fails with tidy evaluation:
my_fu <- function(df, filter_var){
  filter_var <- sym(filter_var)
  df <- df %>%
    group_by(group) %>%
    mutate(newValue = !!filter_var[speed == 4L])
}
my_fu(df, "date")
#> Error in quos(..., .named = TRUE): object 'speed' not found
Thanks in advance.
We can place the evaluation within parentheses. Otherwise, !! tries to evaluate the whole expression (filter_var[speed == 4L]) instead of filter_var alone, because [ binds more tightly than !!.
library(rlang)
library(dplyr)
my_fu <- function(df, filter_var){
  filter_var <- sym(filter_var)
  df %>%
    group_by(group) %>%
    mutate(newValue = (!!filter_var)[speed == 4L])
}
my_fu(df, "date")
# A tibble: 9 x 4
# Groups: group [3]
# group date speed newValue
# <dbl> <dbl> <dbl> <dbl>
#1 1 1 3 2
#2 1 2 4 2
#3 1 3 3 2
#4 2 4 4 4
#5 2 5 5 4
#6 2 6 6 4
#7 3 7 6 8
#8 3 8 4 8
#9 3 9 9 8
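A related option that avoids the precedence question entirely: index the .data pronoun with the column name string, so no sym()/!! is needed. A sketch (my_fu2 is a hypothetical variant of my_fu above; assumes dplyr with rlang 0.4+):
my_fu2 <- function(df, filter_var) {
  df %>%
    group_by(group) %>%
    # .data[[filter_var]] looks the column up by its string name
    mutate(newValue = .data[[filter_var]][speed == 4L]) %>%
    ungroup()
}
my_fu2(df, "date")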
Alternatively, you can use sqldf: join df against a subquery restricted to the rows where speed = 4:
library(sqldf)

df = data.frame(group = c(1, 1, 1, 2, 2, 2, 3, 3, 3),
                date = c(1, 2, 3, 4, 5, 6, 7, 8, 9),
                speed = c(3, 4, 3, 4, 5, 6, 6, 4, 9))

sqldf("SELECT df_origin.*, df4.`date` AS new_value
       FROM df df_origin
       JOIN (SELECT `group`, `date` FROM df WHERE speed = 4) df4
         ON df_origin.`group` = df4.`group`")
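One detail worth noting: group is a reserved word in SQL, so it must be backtick-quoted as an identifier in the query above; date is quoted the same way for safety.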
