Several column sums by grouping variables in R - r

I have a data frame of term frequencies and some other random demographic variables. I want to utilize two grouping variables, drop the ones I do not need, and sum the frequencies based on the grouping variables.
Here is similar to what I have
df <- data.frame(user= c(1:9),
Group1 = c("a", "a", "a", "b", "b","b","c", "c", "c"),
Group2 = c("d", "e", "d", "e", "d", "e", "e", "e", "e"),
term1 = c(0, 1, 1, 0, 1, 1, 0, 0, 0),
term2 = c(1, 0, 1, 1, 0, 1, 0, 1, 1),
term3 = c(0, 1, 0, 0, 0, 0, 1, 1, 0))
and here is what I am trying to get.
desired <- data.frame(Group1 = c("a", "a", "b", "b", "c", "c"),
Group2 = c("d", "e", "d", "e", "d", "e"),
term1 = c(1, 1, 1, 1, 0, 0),
term2 = c(2, 0, 0, 2, 0, 2),
term3 = c(0, 1, 0, 0, 0, 2))
My real frame has about 4000 term columns, so naming each one individually in a dplyr function does not seem feasible.
Thank you!

You can try aggregate + expand.grid + merge
merge(
with(df, expand.grid(Group1 = unique(Group1), Group2 = unique(Group2))),
aggregate(. ~ Group1 + Group2, df[-1], sum),
all = TRUE
)
which gives
Group1 Group2 term1 term2 term3
1 a d 1 2 0
2 a e 1 0 1
3 b d 1 0 0
4 b e 1 2 0
5 c d NA NA NA
6 c e 0 2 2
If you want to have NAs as 0, you can try
> res <- merge(
with(df, expand.grid(Group1 = unique(Group1), Group2 = unique(Group2))),
aggregate(. ~ Group1 + Group2, df[-1], sum),
all = TRUE
)
> replace(res, is.na(res), 0)
Group1 Group2 term1 term2 term3
1 a d 1 2 0
2 a e 1 0 1
3 b d 1 0 0
4 b e 1 2 0
5 c d 0 0 0
6 c e 0 2 2

We can group by 'Group1' and 'Group2', get the sum of the 'term' columns in summarise, and expand the data with complete to add the missing combinations
library(dplyr)
library(tidyr)
# Sum every term column within each Group1/Group2 pair, then add the
# Group1 x Group2 combinations that never occur in the data.
df %>%
  group_by(Group1, Group2) %>%
  summarise(across(starts_with('term'), sum), .groups = 'drop') %>%
  complete(Group1, Group2) %>%
  # Zero-fill by column pattern rather than listing each column by name, so
  # the same code works for thousands of term columns.
  mutate(across(starts_with('term'), ~ replace_na(.x, 0)))
-output
# A tibble: 6 x 5
Group1 Group2 term1 term2 term3
<chr> <chr> <dbl> <dbl> <dbl>
1 a d 1 2 0
2 a e 1 0 1
3 b d 1 0 0
4 b e 1 2 0
5 c d 0 0 0
6 c e 0 2 2

If you don't need to complete all group combinations, setDT(df)[,lapply(.SD[,-1], sum),.(Group1,Group2)] is enough. Otherwise, you can use complete from the tidyr package (as used in the previous answer) to fill in
the missing combinations.
library(data.table)
library(tidyr)
# Build the zero-fill list from the column names so that every term column is
# covered without typing each one out.
term_cols <- grep("^term", names(df), value = TRUE)
zero_fill <- setNames(rep(list(0), length(term_cols)), term_cols)
# .SD[,-1] drops the first non-grouping column (user) before summing.
setDT(df)[,lapply(.SD[,-1], sum),.(Group1,Group2)] %>%
  complete(Group1, Group2, fill = zero_fill)
#> # A tibble: 6 x 5
#> Group1 Group2 term1 term2 term3
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 a d 1 2 0
#> 2 a e 1 0 1
#> 3 b d 1 0 0
#> 4 b e 1 2 0
#> 5 c d 0 0 0
#> 6 c e 0 2 2

Related

Generate table with count of all combinations by group in a efficient way

I have the following dataset example:
df <- tibble(group = c(rep(1, 6), rep(2, 6)),
class1 = c("A", "A", "B", "B", "B", "C", "B", "B", "B", "C", "C", "C"),
class2 = c("A", "B", "B", "B", "C", "B", "B", "B", "A", "C", "A", "B"))
df
I would like to do a table of all combinations between class1 and class2, by group in a fast way.
I tried the code below, but it is painfully slow for my data (which is huge, > 10 million rows). It takes more than 30 minutes.
output <- df %>% table() %>% as.data.table()
output desired:
# Desired result: one row per (group, class1, class2) combination, 9 rows for
# group 1 followed by 9 rows for group 2.
output <- tibble(group = c(rep(1, 9), rep(2, 9)),
class1 = c(rep("A", 3), rep("B", 3), rep("C", 3),
rep("A", 3), rep("B", 3), rep("C", 3)),
class2 = rep(c("A", "B", "C"), 6),
N = c(1, 1, 0, 0, 2, 1, 0, 1, 0, 0, 0, 0, 1, 2, 0, 1, 1, 1))
output
Thanks for any help
Does this work:
library(dplyr)
library(tidyr)
# count() tallies how often each observed combination occurs (so repeated rows
# such as B/B are counted, not collapsed); complete() then adds the
# combinations that never occur, with a count of 0.
df %>%
  count(group, class1, class2, name = "N") %>%
  complete(group, class1, class2, fill = list(N = 0L))
# A tibble: 18 × 4
group class1 class2 N
<dbl> <chr> <chr> <int>
1 1 A A 1
2 1 A B 1
3 1 A C 0
4 1 B A 0
5 1 B B 2
6 1 B C 1
7 1 C A 0
8 1 C B 1
9 1 C C 0
10 2 A A 0
11 2 A B 0
12 2 A C 0
13 2 B A 1
14 2 B B 2
15 2 B C 0
16 2 C A 1
17 2 C B 1
18 2 C C 1
This can be a bit faster than table:
library(data.table)
df <- data.table(group = c(rep(1, 6), rep(2, 6)),
class1 = c("A", "A", "B", "B", "B", "C", "B", "B", "B", "C", "C", "C"),
class2 = c("A", "B", "B", "B", "C", "B", "B", "B", "A", "C", "A", "B"))
# u: for each column of df, its sorted unique values.
u <- lapply(df, function(x) sort(unique(x)))
# m: running products of the per-column level counts, taken from the last
# column backwards. m[1] is the total number of combinations and m[i + 1] is
# the stride of column i in a mixed-radix index (the last column has stride 1).
m <- rev(cumprod(c(1, rev(lengths(u)))))
# CJ() builds the table of all combinations of the unique values.
do.call(CJ, u)[
# Each row of df is mapped to a 1-based combination index:
#   sum over columns of (position of the value in u[[i]] - 1) * stride, plus 1.
# tabulate() then counts how many rows fall on each of the m[1] indices.
# NOTE(review): this relies on CJ() returning its rows ordered with the first
# column varying slowest, so the counts line up with the rows - confirm.
, N := tabulate(rowSums(mapply(function(i) (match(df[[i]], u[[i]]) - 1)*m[i + 1], 1:ncol(df))) + 1, m[1])
][]
#> group class1 class2 N
#> 1: 1 A A 1
#> 2: 1 A B 1
#> 3: 1 A C 0
#> 4: 1 B A 0
#> 5: 1 B B 2
#> 6: 1 B C 1
#> 7: 1 C A 0
#> 8: 1 C B 1
#> 9: 1 C C 0
#> 10: 2 A A 0
#> 11: 2 A B 0
#> 12: 2 A C 0
#> 13: 2 B A 1
#> 14: 2 B B 2
#> 15: 2 B C 0
#> 16: 2 C A 1
#> 17: 2 C B 1
#> 18: 2 C C 1
Timing a much larger data set:
library(stringi)
df <- data.table(
group = sample(20, 2e7, TRUE),
class1 = stri_rand_strings(2e7, 2, "[A-Za-z]"),
class2 = stri_rand_strings(2e7, 2, "[A-Za-z]")
)
system.time({
u <- lapply(df, function(x) sort(unique(x)))
m <- rev(cumprod(c(1, rev(lengths(u)))))
output <- do.call(CJ, u)[
, N := tabulate(rowSums(mapply(function(i) (match(df[[i]], u[[i]]) - 1)*m[i + 1], 1:ncol(df))) + 1, m[1])
]
})
#> user system elapsed
#> 3.98 0.68 4.41
Compared to table:
system.time({output <- setorder(as.data.table(table(df)))})
#> user system elapsed
#> 28.40 3.64 13.77
Even with 20M rows, table is finishing within seconds. My guess is the > 30 minute timing experienced by the OP is due to a large number of combinations of group, class1, and class2.
With data.table:
setDT(df)[CJ(group=unique(group),class1=unique(class1),class2=unique(class2))
,.(group,x.group,class1,class2),on=.(group,class1,class2)][
,.(N=sum(!is.na(x.group))),by=.(group,class1,class2)]
group class1 class2 N
<num> <char> <char> <int>
1: 1 A A 1
2: 1 A B 1
3: 1 A C 0
4: 1 B A 0
5: 1 B B 2
6: 1 B C 1
7: 1 C A 0
8: 1 C B 1
9: 1 C C 0
10: 2 A A 0
11: 2 A B 0
12: 2 A C 0
13: 2 B A 1
14: 2 B B 2
15: 2 B C 0
16: 2 C A 1
17: 2 C B 1
18: 2 C C 1
However, this is much slower than your initial solution:
microbenchmark::microbenchmark(table = {df %>% table() %>% as.data.table()},
data.table = setDT(df)[CJ(group=unique(group),class1=unique(class1),class2=unique(class2)),.(group,x.group,class1,class2),on=.(group,class1,class2)][
,.(N=sum(!is.na(x.group))),by=.(group,class1,class2)] )
Unit: microseconds
expr min lq mean median uq max neval
table 546.501 615.9015 737.100 697.6505 775.152 1619.901 100
data.table 4242.001 4495.0010 5038.249 4766.6005 5192.601 14618.100 100

how to recode dummy column with the column name?

I have the input dataset, and I'm looking for generating the output dataset by recoding 1 as the name of the columns and 0 as NA. I managed to do it manually see Not optional solution below. But I have a dataset with hundreds of columns, so I'm looking for a way to automatize this process.
Packages
library(tibble)
library(dplyr)
Input
input <- tibble( a = c(1, 0, 0, 1, 0),
b = c(0, 0, 0, 1, 1),
c = c(1, 1, 1, 1, 1),
d = c(0, 0, 0, 0, 0))
# # A tibble: 5 × 4
# a b c d
# <dbl> <dbl> <dbl> <dbl>
# 1 1 0 1 0
# 2 0 0 1 0
# 3 0 0 1 0
# 4 1 1 1 0
# 5 0 1 1 0
Output
# Desired result: every 1 in `input` becomes the column name, every 0 becomes NA.
# Row 5 of b is "b" because input$b[5] is 1.
output <- tibble( a = c("a", NA, NA, "a", NA),
b = c(NA, NA, NA, "b", "b"),
c = c("c", "c", "c", "c", "c"),
d = c(NA, NA, NA, NA, NA))
# # A tibble: 5 × 4
# a b c d
# <chr> <chr> <chr> <lgl>
# 1 a NA c NA
# 2 NA NA c NA
# 3 NA NA c NA
# 4 a b c NA
# 5 NA b c NA
Not optional solution
# Manual recode, one case_when() per column: 1 becomes the column name and
# anything else becomes NA. TRUE is spelled out because T can be reassigned.
input %>%
  mutate(a = case_when(a == 1 ~ "a",
                       TRUE ~ NA_character_),
         b = case_when(b == 1 ~ "b",
                       TRUE ~ NA_character_),
         c = case_when(c == 1 ~ "c",
                       TRUE ~ NA_character_),
         d = case_when(d == 1 ~ "d",
                       TRUE ~ NA_character_))
We could use across with an ifelse statement:
library(dplyr)
input %>%
mutate(across(everything(), ~ifelse(. == 1, cur_column(), NA)))
a b c d
<chr> <chr> <chr> <lgl>
1 a NA c NA
2 NA NA c NA
3 NA NA c NA
4 a b c NA
5 NA b c NA

return column differences when comparing rows in grouped data frame

I would like to do a pairwise comparison per group and return the rows with a mismatch and which columns are different. Below is a sample dataset to explain the problem; my actual data will have many more rows and columns.
data=structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20), Common_1 = c("A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B"), Common_2 = c("C", "C", "C", "C", "C", "D",
"D", "D", "D", "D", "C", "C", "C", "C", "C", "D", "D", "D", "D",
"D"), Common_3 = c("X", "X", "X", "X", "X", "X", "X", "X", "X",
"X", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"), G = c(0,
1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0), var_1 = c(1,
3, 3, 3, 3, 1, 3, 2, 4, 3, 5, 5, 3, 4, 5, 1, 3, 5, 1, 4), var_2 = c("lev1",
"lev1", "lev2", "lev2", "lev1", "lev2", "lev2", "lev1", "lev1",
"lev2", "lev2", "lev2", "lev2", "lev1", "lev1", "lev1", "lev1",
"lev1", "lev2", "lev2"), var_3 = c("on", "on", "on", "off", "off",
"on", "on", "on", "off", "off", "on", "on", "on", "off", "off",
"on", "on", "on", "off", "off"), var_4 = c("up", "up", "down",
"down", "up", "down", "up", "down", "up", "up", "up", "up", "down",
"down", "up", "up", "up", "up", "down", "down")), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
ID is a unique identifier; Common_1, Common_2, and Common_3 are grouping variables; G is the group that I want to make the comparisons between; and the rest of the columns, var_1:var_4, are the columns used to determine differences. The process would be to compare each row with G=0 to each row with G=1 and, if there is a difference in any of the var columns, return the ID combination of the mismatch and which columns differ.
Here are the desired results for Common_1=A,Common_2=C,Common_3=X it has the ID for rows G=0, all the grouping variables, the ID for the G=1 mismatch and indicator variables showing which columns differed.
results=structure(list(ID = c(1, 1, 3, 3, 4, 4), Common_1 = c("A", "A",
"A", "A", "A", "A"), Common_2 = c("C", "C", "C", "C", "C", "C"
), Common_3 = c("X", "X", "X", "X", "X", "X"), G = c(0, 0, 0,
0, 0, 0), var_1 = c(1, 1, 0, 0, 0, 0), var_2 = c(0, 0, 1, 1,
1, 1), var_3 = c(0, 1, 0, 1, 1, 0), var_4 = c(0, 0, 1, 1, 1,
1), ID_diff = c(2, 5, 2, 5, 2, 5)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
Update: Added explanation of results
I am doing a pairwise comparison of G=0 to G=1. The first two rows of results are derived like so:
Same overall group Common_1=A,Common_2=C,Common_3=X
now compare ID=1 to ID=2
var_1 is different so a 1 is placed in the var_1 column and rest are zero. ID_diff=2 because that is the ID that differs from ID=1
compare ID=1 to ID=5
var_1 and var_3 are different so a 1 is placed in each column and rest are zero. ID_diff=5 because that is the ID that differs from ID=1
I tried writing a function to loop through each case with G=0 and compare to each case where G=1 but got stuck extracting the mismatch info, any help is appreciated.
Results from Ronak Shah's solution which works but I am having trouble displaying the results correctly.
> var_col <- grep('^var', names(data))
>
> apply_fun <- function(tmp) {
+ df1 <- subset(tmp, G == 0)
+ df2 <- subset(tmp, G == 1)
+ lapply(seq(nrow(df1)), function(x) {
+ df3 <- df1[rep(x, nrow(df2)), ]
+ df3$ID_diff <- df2$ID
+ df3[var_col] <- +(df1[rep(x, nrow(df2)), var_col] != df2[var_col])
+ df3
+ })
+ }
>
>
> library(dplyr)
> data %>%
+ group_by(across(starts_with('Common'))) %>%
+ summarise(data = apply_fun(cur_data_all())) %>%
+ ungroup %>%
+ select(data) %>%
+ tidyr::unnest(data)
`summarise()` regrouping output by 'Common_1', 'Common_2', 'Common_3' (override with `.groups` argument)
# A tibble: 22 x 10
ID Common_1 Common_2 Common_3 G var_1[,1] [,2] [,3] [,4] var_2[,1] [,2] [,3] [,4] var_3[,1] [,2] [,3] [,4] var_4[,1] [,2]
<dbl> <chr> <chr> <chr> <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 A C X 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0
2 1 A C X 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0
3 3 A C X 0 0 1 0 1 0 1 0 1 0 1 0 1 0 1
4 3 A C X 0 0 1 1 1 0 1 1 1 0 1 1 1 0 1
5 4 A C X 0 0 1 1 1 0 1 1 1 0 1 1 1 0 1
6 4 A C X 0 0 1 0 1 0 1 0 1 0 1 0 1 0 1
7 7 A D X 0 1 0 0 1 1 0 0 1 1 0 0 1 1 0
8 8 A D X 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
9 9 A D X 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
10 10 A D X 0 1 0 1 1 1 0 1 1 1 0 1 1 1 0
# ... with 12 more rows, and 3 more variables: [,3] <int>, [,4] <int>, ID_diff <dbl>
Try using this :
We create a function which compares every row of G == 0 with every row of G == 1 and changes the values to 1/0 in var_col based on the comparison.
var_col <- grep('^var', names(data))
# Compare every G == 0 row of `tmp` with every G == 1 row of `tmp`.
#
# tmp: a data frame (or tibble) with columns `ID` and `G`, plus the comparison
#      columns indexed by the global `var_col` defined above.
#
# Returns a list with one data frame per G == 0 row. Each data frame repeats
# that row once per G == 1 row, stores the G == 1 row's ID in `ID_diff`, and
# replaces each comparison column with 1L where the two rows differ and 0L
# where they agree. The list is empty when `tmp` has no G == 0 rows.
apply_fun <- function(tmp) {
  df1 <- subset(tmp, G == 0)
  df2 <- subset(tmp, G == 1)
  # seq_len() gives an empty sequence for zero rows; seq(0) would give c(1, 0).
  lapply(seq_len(nrow(df1)), function(x) {
    df3 <- df1[rep(x, nrow(df2)), ]
    df3$ID_diff <- df2$ID
    # Assign column by column: assigning a whole comparison matrix to several
    # tibble columns at once can store the matrix inside each column.
    for (j in var_col) {
      df3[[j]] <- as.integer(df3[[j]] != df2[[j]])
    }
    df3
  })
}
Apply data by group :
library(dplyr)
data %>%
group_by(across(starts_with('Common'))) %>%
summarise(data = apply_fun(cur_data_all())) %>%
ungroup %>%
select(data) %>%
tidyr::unnest(data)
# A tibble: 22 x 10
# ID Common_1 Common_2 Common_3 G var_1 var_2 var_3 var_4 ID_diff
# <dbl> <chr> <chr> <chr> <dbl> <int> <int> <int> <int> <dbl>
# 1 1 A C X 0 1 0 0 0 2
# 2 1 A C X 0 1 0 1 0 5
# 3 3 A C X 0 0 1 0 1 2
# 4 3 A C X 0 0 1 1 1 5
# 5 4 A C X 0 0 1 1 1 2
# 6 4 A C X 0 0 1 0 1 5
# 7 7 A D X 0 1 0 0 1 6
# 8 8 A D X 0 1 1 0 0 6
# 9 9 A D X 0 1 1 1 1 6
#10 10 A D X 0 1 0 1 1 6
# … with 12 more rows
cur_data_all() and across() need a recent installation of dplyr. My packageVersion('dplyr') is '1.0.1'.

How to create dummy variables per group of another variable in tidyverse

I want to create (dummy) variables that show whether an observation is in a group of observations (identifiable by a common Group_ID) with a certain combination of characteristics across that group. The code example makes it clearer what exactly I mean.
I tried combinations of group_by and caret::dummyVars, but had no success. I am running out of ideas - any help would be appreciated very much.
library(tidyverse)
# Input data
# please note: in my case each value of the column Role will appear only once per Group_ID.
input_data <- tribble( ~Group_ID, ~Role, ~Income,
#--|--|----
1, "a", 3.6,
1, "b", 8.5,
2, "a", 7.6,
2, "c", 9.5,
2, "d", 9.7,
3, "a", 1.6,
3, "b", 4.5,
3, "c", 2.7,
3, "e", 7.7,
4, "b", 3.3,
4, "c", 6.2,
)
# desired output
output_data <- tribble( ~Group_ID, ~Role, ~Income, ~Role_A, ~Role_B, ~Role_C, ~Role_D, ~Role_E, ~All_roles,
#--|--|----
1, "a", 3.6, 1, 1, 0, 0, 0, "ab",
1, "b", 8.5, 1, 1, 0, 0, 0, "ab",
2, "a", 7.6, 1, 0, 1, 1, 0, "acd",
2, "c", 9.5, 1, 0, 1, 1, 0, "acd",
2, "d", 9.7, 1, 0, 1, 1, 0, "acd",
3, "a", 1.6, 1, 1, 1, 0, 1, "abce",
3, "b", 4.5, 1, 1, 1, 0, 1, "abce",
3, "c", 2.7, 1, 1, 1, 0, 1, "abce",
3, "e", 7.7, 1, 1, 1, 0, 1, "abce",
4, "b", 3.3, 0, 1, 1, 0, 0, "bc",
4, "c", 6.2, 0, 1, 1, 0, 0, "bc"
)
The following takes advantage of base R modeling functions to create the dummies.
First, create a model matrix with no intercept.
fit <- lm(Group_ID ~ 0 + Role, input_data)
m <- model.matrix(fit)
Now, process that matrix by noting that the dummies the question asks for are the sums by groups of Group_ID.
input_data %>%
bind_cols(m %>% as.data.frame()) %>%
group_by(Group_ID) %>%
mutate_at(vars(matches("Role[[:alpha:]]")), sum) %>%
mutate(all_roles = paste(Role, collapse = ""))
## A tibble: 11 x 9
## Groups: Group_ID [4]
# Group_ID Role Income Rolea Roleb Rolec Roled Rolee all_roles
# <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
# 1 1 a 3.6 1 1 0 0 0 ab
# 2 1 b 8.5 1 1 0 0 0 ab
# 3 2 a 7.6 1 0 1 1 0 acd
# 4 2 c 9.5 1 0 1 1 0 acd
# 5 2 d 9.7 1 0 1 1 0 acd
# 6 3 a 1.6 1 1 1 0 1 abce
# 7 3 b 4.5 1 1 1 0 1 abce
# 8 3 c 2.7 1 1 1 0 1 abce
# 9 3 e 7.7 1 1 1 0 1 abce
#10 4 b 3.3 0 1 1 0 0 bc
#11 4 c 6.2 0 1 1 0 0 bc
Using dplyr and cSplit_e from splitstackshape. For every Group_ID we paste the Role together and then separate them into new columns of binary value based on their presence and absence using cSplit_e.
library(splitstackshape)
library(dplyr)
input_data %>%
group_by(Group_ID) %>%
mutate(new_role = paste(Role, collapse = "")) %>%
ungroup() %>%
cSplit_e("new_role", sep = "", type = "character", fill = 0)
# Group_ID Role Income new_role new_role_a new_role_b new_role_c new_role_d new_role_e
#1 1 a 3.6 ab 1 1 0 0 0
#2 1 b 8.5 ab 1 1 0 0 0
#3 2 a 7.6 acd 1 0 1 1 0
#4 2 c 9.5 acd 1 0 1 1 0
#5 2 d 9.7 acd 1 0 1 1 0
#6 3 a 1.6 abce 1 1 1 0 1
#7 3 b 4.5 abce 1 1 1 0 1
#8 3 c 2.7 abce 1 1 1 0 1
#9 3 e 7.7 abce 1 1 1 0 1
#10 4 b 3.3 bc 0 1 1 0 0
#11 4 c 6.2 bc 0 1 1 0 0

Find if survey data is consistent across years by participant in dplyr in R

I have data which looks like this:
df <- data.frame(
ID = c(rep(c("ABC123", "BCD234", "CDE345", "DEF456", "EFG567", "FGH678", "GHI891", "HIJ910", "IJK101", "JKL011"),2)),
eth = c(1, 2, 2, 3, 1, 1, 4, 4, 3, 3, 1, 4, 1, 3, 1, 3, 4, 4, 3, 2),
nzdep = c(4, 3, 3, 2, 4, 4, 1, 1, 2, 2, 4, 3, 3, 4, 4, 2, 1, 1, 2, 3),
sex = c("M", "M", "F", "F", "M", "M", "F", "F", "M", "M", "F", "M", "M", "M", "M", "F", "F", "M", "F", "M"),
Year = c(rep("Y1", 10), rep("Y2", 10)))
This is survey data, for the same people, in different years. The ID is a unique ID per person, and the Year tells us which year the survey was completed. What I want to know, is whether the same ID answered the same question the same way in both years.
I have tried something like this:
dems <- df %>%
group_by(ID) %>%
mutate(dep_dif = ifelse(nzdep = nzdep, 1, 0),
sex_dif = ifelse(sex = sex, 1, 0),
eth_dif = ifelse(eth = eth, 1, 0))
This doesn't work, but I was thinking something along these lines.
My desired output would be:
dems <- data.frame(
ID = c(rep(c("ABC123", "BCD234", "CDE345", "DEF456", "EFG567", "FGH678", "GHI891", "HIJ910", "IJK101", "JKL011"),2)),
eth = c(1, 2, 2, 3, 1, 1, 4, 4, 3, 3, 1, 4, 1, 3, 1, 3, 4, 4, 3, 2),
nzdep = c(4, 3, 3, 2, 4, 4, 1, 1, 2, 2, 4, 3, 3, 4, 4, 2, 1, 1, 2, 3),
sex = c("M", "M", "F", "F", "M", "M", "F", "F", "M", "M", "F", "M", "M", "M", "M", "F", "F", "M", "F", "M"),
Year = c(rep("Y1", 10), rep("Y2", 10)),
eth_dif = c(rep(c(1, 0, 0, 1, 1, 0, 1, 1, 1, 0),2)),
dep_dif = c(rep(c(1, 1, 1, 0, 1, 0, 1, 1, 1, 0),2)),
sex_dif = c(rep(c(0, 1, 0, 0, 1, 0, 1, 0, 0, 1),2)))
Does anyone know how to do this?
Thanks
Seems like you need unique value equal to one
# A *_dif flag is 1 when the ID has a single distinct value for that variable
# across its rows (i.e. the same answer in every year), otherwise 0.
df %>%
  group_by(ID) %>%
  dplyr::mutate(
    dep_dif = as.numeric(length(unique(nzdep)) == 1),
    sex_dif = as.numeric(length(unique(sex)) == 1),
    eth_dif = as.numeric(length(unique(eth)) == 1)
  )
# A tibble: 20 x 8
# Groups: ID [10]
ID eth nzdep sex Year dep_dif sex_dif eth_dif
<fctr> <dbl> <dbl> <fctr> <fctr> <dbl> <dbl> <dbl>
1 ABC123 1 4 M Y1 1 0 1
2 BCD234 2 3 M Y1 1 1 0
3 CDE345 2 3 F Y1 1 0 0
4 DEF456 3 2 F Y1 0 0 1
5 EFG567 1 4 M Y1 1 1 1
6 FGH678 1 4 M Y1 0 0 0
7 GHI891 4 1 F Y1 1 1 1
8 HIJ910 4 1 F Y1 1 0 1
9 IJK101 3 2 M Y1 1 0 1
10 JKL011 3 2 M Y1 0 1 0
11 ABC123 1 4 F Y2 1 0 1
12 BCD234 4 3 M Y2 1 1 0
13 CDE345 1 3 M Y2 1 0 0
14 DEF456 3 4 M Y2 0 0 1
15 EFG567 1 4 M Y2 1 1 1
16 FGH678 3 2 F Y2 0 0 0
17 GHI891 4 1 F Y2 1 1 1
18 HIJ910 4 1 M Y2 1 0 1
19 IJK101 3 2 F Y2 1 0 1
20 JKL011 2 3 M Y2 0 1 0
We could do this with mutate_at
library(dplyr)
# For each ID, compare the Y1 value with the Y2 value of eth, nzdep and sex and
# store 1L/0L in a new <column>_dif column.
df %>%
  group_by(ID) %>%
  # Columns are selected by name so the selection does not depend on how the
  # grouping column is counted; list(name = ~ ...) replaces the deprecated funs().
  mutate_at(vars(eth, nzdep, sex),
            list(dif = ~ as.integer(.[Year == "Y1"] == .[Year == "Y2"])))
# A tibble: 20 x 8
# Groups: ID [10]
# ID eth nzdep sex Year eth_dif nzdep_dif sex_dif
# <fct> <dbl> <dbl> <fct> <fct> <int> <int> <int>
# 1 ABC123 1 4 M Y1 1 1 0
# 2 BCD234 2 3 M Y1 0 1 1
# 3 CDE345 2 3 F Y1 0 1 0
# 4 DEF456 3 2 F Y1 1 0 0
# 5 EFG567 1 4 M Y1 1 1 1
# 6 FGH678 1 4 M Y1 0 0 0
# 7 GHI891 4 1 F Y1 1 1 1
# 8 HIJ910 4 1 F Y1 1 1 0
# 9 IJK101 3 2 M Y1 1 1 0
#10 JKL011 3 2 M Y1 0 0 1
#11 ABC123 1 4 F Y2 1 1 0
#12 BCD234 4 3 M Y2 0 1 1
#13 CDE345 1 3 M Y2 0 1 0
#14 DEF456 3 4 M Y2 1 0 0
#15 EFG567 1 4 M Y2 1 1 1
#16 FGH678 3 2 F Y2 0 0 0
#17 GHI891 4 1 F Y2 1 1 1
#18 HIJ910 4 1 M Y2 1 1 0
#19 IJK101 3 2 F Y2 1 1 0
#20 JKL011 2 3 M Y2 0 0 1
If the 'ID' is already ordered a base R option would be
df[paste0(names(df)[2:4], "_dif")] <- +(Reduce(`==`, split(df[2:4], df$Year)))

Resources