I want to create (dummy) variables that show whether an observation belongs to a group of observations (identifiable by a common Group_ID) with a certain combination of characteristics across that group. The code example makes it clearer exactly what I mean.
I tried combinations of group_by and caret::dummyVars, but had no success. I am running out of ideas - any help would be appreciated very much.
# Input data
# please note: in my case each value of the column Role will appear only once per Group_ID.
# Example input: one row per (Group_ID, Role) pair with the associated Income.
input_data <- tribble(
  ~Group_ID, ~Role, ~Income,
  1, "a", 3.6,
  1, "b", 8.5,
  2, "a", 7.6,
  2, "c", 9.5,
  2, "d", 9.7,
  3, "a", 1.6,
  3, "b", 4.5,
  3, "c", 2.7,
  3, "e", 7.7,
  4, "b", 3.3,
  4, "c", 6.2
)
# desired output
# Desired result: the input columns plus one 0/1 indicator per role (1 when
# that role occurs anywhere in the row's Group_ID) and the concatenated roles.
output_data <- tribble(
  ~Group_ID, ~Role, ~Income, ~Role_A, ~Role_B, ~Role_C, ~Role_D, ~Role_E, ~All_roles,
  1, "a", 3.6, 1, 1, 0, 0, 0, "ab",
  1, "b", 8.5, 1, 1, 0, 0, 0, "ab",
  2, "a", 7.6, 1, 0, 1, 1, 0, "acd",
  2, "c", 9.5, 1, 0, 1, 1, 0, "acd",
  2, "d", 9.7, 1, 0, 1, 1, 0, "acd",
  3, "a", 1.6, 1, 1, 1, 0, 1, "abce",
  3, "b", 4.5, 1, 1, 1, 0, 1, "abce",
  3, "c", 2.7, 1, 1, 1, 0, 1, "abce",
  3, "e", 7.7, 1, 1, 1, 0, 1, "abce",
  4, "b", 3.3, 0, 1, 1, 0, 0, "bc",
  4, "c", 6.2, 0, 1, 1, 0, 0, "bc"
)
The following takes advantage of base R modeling functions to create the dummies.
First, create a model matrix with no intercept.
# Fit a model of Group_ID on Role without an intercept (the `0 +` term).
# The fit is used here only as a way to obtain its design matrix.
fit <- lm(Group_ID ~ 0 + Role, input_data)
# Design matrix of the fit: it supplies the per-role indicator columns
# (Rolea, Roleb, ...) that are bound to the data in the next step.
m <- model.matrix(fit)
Now, process that matrix by noting that the dummies the question asks for are the sums by groups of Group_ID.
# Attach the per-row role indicators, then, within each Group_ID, replace
# every indicator column by its group sum. Because each Role occurs at most
# once per Group_ID, that sum is 1 when the role is present in the group and
# 0 otherwise. `all_roles` concatenates the group's roles into one string.
# `across()` replaces the superseded `mutate_at()`; the result is still
# grouped by Group_ID.
input_data %>%
  bind_cols(as.data.frame(m)) %>%
  group_by(Group_ID) %>%
  mutate(
    across(matches("Role[[:alpha:]]"), sum),
    all_roles = paste(Role, collapse = "")
  )
## A tibble: 11 x 9
## Groups: Group_ID [4]
# Group_ID Role Income Rolea Roleb Rolec Roled Rolee all_roles
# <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
# 1 1 a 3.6 1 1 0 0 0 ab
# 2 1 b 8.5 1 1 0 0 0 ab
# 3 2 a 7.6 1 0 1 1 0 acd
# 4 2 c 9.5 1 0 1 1 0 acd
# 5 2 d 9.7 1 0 1 1 0 acd
# 6 3 a 1.6 1 1 1 0 1 abce
# 7 3 b 4.5 1 1 1 0 1 abce
# 8 3 c 2.7 1 1 1 0 1 abce
# 9 3 e 7.7 1 1 1 0 1 abce
#10 4 b 3.3 0 1 1 0 0 bc
#11 4 c 6.2 0 1 1 0 0 bc
Using dplyr and cSplit_e from splitstackshape. For every Group_ID we paste the Role together and then separate them into new columns of binary value based on their presence and absence using cSplit_e.
# Concatenate the roles of each Group_ID into `new_role`, drop the grouping,
# and let cSplit_e() expand that string into one indicator column per
# character (empty separator), writing 0 where a role is absent.
input_data %>%
  group_by(Group_ID) %>%
  mutate(new_role = paste0(Role, collapse = "")) %>%
  ungroup() %>%
  cSplit_e(
    "new_role",
    sep = "",
    type = "character",
    fill = 0
  )
# Group_ID Role Income new_role new_role_a new_role_b new_role_c new_role_d new_role_e
#1 1 a 3.6 ab 1 1 0 0 0
#2 1 b 8.5 ab 1 1 0 0 0
#3 2 a 7.6 acd 1 0 1 1 0
#4 2 c 9.5 acd 1 0 1 1 0
#5 2 d 9.7 acd 1 0 1 1 0
#6 3 a 1.6 abce 1 1 1 0 1
#7 3 b 4.5 abce 1 1 1 0 1
#8 3 c 2.7 abce 1 1 1 0 1
#9 3 e 7.7 abce 1 1 1 0 1
#10 4 b 3.3 bc 0 1 1 0 0
#11 4 c 6.2 bc 0 1 1 0 0
Could someone help me with this transformation in R? I would like to transform
this table
into this table
To create a table of like-against like
Count of ID
Any help would be appreciated. Thank you.
You can do:
# Split Count into one vector per Condition (A and B), cross-tabulate the two
# vectors, and return the counts in long form (columns A, B, Freq).
# Note: the vectors are paired by position, not by ID.
with(dat, split(Count, Condition)) |>
  table() |>
  as.data.frame()
A B Freq
1 0 0 1
2 1 0 2
3 0 1 2
4 1 1 3
# Example data: one row per (ID, Condition) pair with a 0/1 Count (16 rows).
dat <- structure(list(ID = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,
7, 8, 9), Condition = c("A", "B", "A", "B", "A", "B", "A", "B",
"A", "B", "A", "B", "A", "B", "A", "B"), Count = c(1, 0, 1, 1,
0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0)), class = "data.frame", row.names = c(NA,
-16L))
Here is a tidyverse solution. I filled missing values with 0; please note that this leads to a different count than in your table (did you mean to have 8, 8 as the last two IDs rather than 8, 9?):
# Example data read from inline text: one row per (ID, Condition) with a 0/1
# Count. IDs 1-7 have both conditions; ID 8 has only A and ID 9 has only B.
data <- read.table(text = "ID Condition Count
1 A 1
1 B 0
2 A 1
2 B 1
3 A 0
3 B 1
4 A 1
4 B 1
5 A 1
5 B 1
6 A 1
6 B 0
7 A 0
7 B 1
8 A 0
9 B 0", header = TRUE)
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> filter, lag
#> The following objects are masked from 'package:base':
#> intersect, setdiff, setequal, union
# Spread Condition into one column per condition (A, B) with one row per ID,
# filling ID/Condition pairs that do not occur with 0, then count how many
# IDs fall into each (A, B) combination.
data %>%
  pivot_wider(
    id_cols = ID,
    names_from = Condition,
    values_from = Count,
    values_fill = 0
  ) %>%
  count(A, B, name = "Count of ID")
#> # A tibble: 4 × 3
#> A B `Count of ID`
#> <int> <int> <int>
#> 1 0 0 2
#> 2 0 1 2
#> 3 1 0 2
#> 4 1 1 3
Created on 2023-01-20 by the reprex package (v1.0.0)
I have a data set that looks like this
ID Group 1 Group 2 Group 3 Group 4
1 1 0 1 0
2 0 1 1 1
3 1 1 0 0
100 0 1 0 1
I want to make another column, let's say Group 5, where if Group 1 is 1 then Group 5 would be 1. If Group 2 = 1, then Group 5 = 2. If Group 3 = 1, then Group 5 = 3, and if Group 4 = 1, then Group 5 = 4. How do I do this?
I tried these lines of code, but I seem to be missing something.
Group5 <- data.frame(Group1, Group2, Group3, Group4, stringsAsFactors=FALSE)
df$Group5 <- with(finalmerge, ifelse(Group1 %in% c("1", "0"),
"1", ""))
Any advice would be helpful, thanks in advance.
You could use which.max(), and apply this to each row.
# For every row, record the position (among the non-ID columns) of the first
# column holding the row maximum.
df["Group_5"] <- apply(X = as.matrix(df[, -1]), MARGIN = 1, FUN = which.max)
ID Group_1 Group_2 Group_3 Group_4 Group_5
1 1 0 0 0 1 4
2 2 0 1 0 0 2
3 3 0 0 1 0 3
4 4 1 0 0 0 1
# Example data: four IDs, each with exactly one of Group_1..Group_4 set to 1.
df <- structure(list(ID = c(1, 2, 3, 4), Group_1 = c(0, 0, 0, 1), Group_2 = c(0,
1, 0, 0), Group_3 = c(0, 0, 1, 0), Group_4 = c(1, 0, 0, 0)), class = "data.frame", row.names = c(NA,
-4L))
I have a data frame of term frequencies and some other random demographic variables. I want to utilize two grouping variables, drop the ones I do not need, and sum the frequencies based on the grouping variables.
Here is something similar to what I have
# Example data: nine users, two grouping columns and three 0/1 term columns.
df <- data.frame(
  user = 1:9,
  Group1 = rep(c("a", "b", "c"), each = 3),
  Group2 = c("d", "e", "d", "e", "d", "e", "e", "e", "e"),
  term1 = c(0, 1, 1, 0, 1, 1, 0, 0, 0),
  term2 = c(1, 0, 1, 1, 0, 1, 0, 1, 1),
  term3 = c(0, 1, 0, 0, 0, 0, 1, 1, 0)
)
and here is what I am trying to get.
# Expected result: term sums for every Group1 x Group2 combination.
# The (c, d) combination has no rows in df, so it appears with all-zero sums.
desired <- data.frame(Group1 = c("a", "a", "b", "b", "c", "c"),
Group2 = c("d", "e", "d", "e", "d", "e"),
term1 = c(1, 1, 1, 1, 0, 0),
term2 = c(2, 0, 0, 2, 0, 2),
term3 = c(0, 1, 0, 0, 0, 2))
My real data frame has about 4000 term columns, so naming each one individually in a dplyr function does not seem feasible.
Thank you!
You can try aggregate + expand.grid + merge
# Build every Group1 x Group2 combination with expand.grid(), sum all
# remaining columns (df without its first column) per observed group with
# aggregate(), and merge the two keeping all rows, so that combinations
# absent from df show up with NA sums.
merge(
  with(df, expand.grid(Group1 = unique(Group1), Group2 = unique(Group2))),
  aggregate(. ~ Group1 + Group2, df[-1], sum),
  all = TRUE
)
which gives
Group1 Group2 term1 term2 term3
1 a d 1 2 0
2 a e 1 0 1
3 b d 1 0 0
4 b e 1 2 0
5 c d NA NA NA
6 c e 0 2 2
If you want to have NAs as 0, you can try
# Same merge as above, stored in `res`, then every NA (i.e. the combinations
# that do not occur in df) is replaced by 0.
res <- merge(
  with(df, expand.grid(Group1 = unique(Group1), Group2 = unique(Group2))),
  aggregate(. ~ Group1 + Group2, df[-1], sum),
  all = TRUE
)
replace(res, is.na(res), 0)
Group1 Group2 term1 term2 term3
1 a d 1 2 0
2 a e 1 0 1
3 b d 1 0 0
4 b e 1 2 0
5 c d 0 0 0
6 c e 0 2 2
We can group by 'Group1' and 'Group2', get the sum of the 'term' columns in summarise, and expand the data with complete to add the missing combinations
# Names of all term columns, collected once so that the fill list below does
# not have to be typed out column by column (the real data has thousands).
term_cols <- grep("^term", names(df), value = TRUE)

# Sum every term column per (Group1, Group2), then add the combinations that
# do not occur in the data, filling their term sums with 0.
df %>%
  group_by(Group1, Group2) %>%
  summarise(across(starts_with('term'), sum), .groups = 'drop') %>%
  complete(
    Group1, Group2,
    fill = setNames(rep(list(0), length(term_cols)), term_cols)
  )
# A tibble: 6 x 5
Group1 Group2 term1 term2 term3
<chr> <chr> <dbl> <dbl> <dbl>
1 a d 1 2 0
2 a e 1 0 1
3 b d 1 0 0
4 b e 1 2 0
5 c d 0 0 0
6 c e 0 2 2
If you don't need to complete all combinations of the grouping variables, setDT(df)[,lapply(.SD[,-1], sum),.(Group1,Group2)] is enough. Otherwise, you can use complete from the tidyr package (as used in the first answer) to fill in
the missing combinations.
# Names of all term columns; used both to restrict .SD and to build the fill
# list, so neither depends on column positions or on a hand-written list.
term_cols <- grep("^term", names(df), value = TRUE)

# Sum the term columns per (Group1, Group2) with data.table (setDT converts
# df in place), then add the missing combinations with 0 as their sums.
setDT(df)[, lapply(.SD, sum), by = .(Group1, Group2), .SDcols = term_cols] %>%
  complete(
    Group1, Group2,
    fill = setNames(rep(list(0), length(term_cols)), term_cols)
  )
#> # A tibble: 6 x 5
#> Group1 Group2 term1 term2 term3
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 a d 1 2 0
#> 2 a e 1 0 1
#> 3 b d 1 0 0
#> 4 b e 1 2 0
#> 5 c d 0 0 0
#> 6 c e 0 2 2
I would like to do a pairwise comparison per group and return the rows with a mismatch, along with which columns are different. Below is a sample dataset to explain the problem; my actual data will have many more rows and columns.
# Sample data (20 rows, classed as a tibble):
#   ID            row identifier
#   Common_1..3   grouping variables
#   G             0/1 flag; rows with G = 0 are compared against rows with G = 1
#   var_1..var_4  the columns in which differences are looked for
data=structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20), Common_1 = c("A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B"), Common_2 = c("C", "C", "C", "C", "C", "D",
"D", "D", "D", "D", "C", "C", "C", "C", "C", "D", "D", "D", "D",
"D"), Common_3 = c("X", "X", "X", "X", "X", "X", "X", "X", "X",
"X", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"), G = c(0,
1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0), var_1 = c(1,
3, 3, 3, 3, 1, 3, 2, 4, 3, 5, 5, 3, 4, 5, 1, 3, 5, 1, 4), var_2 = c("lev1",
"lev1", "lev2", "lev2", "lev1", "lev2", "lev2", "lev1", "lev1",
"lev2", "lev2", "lev2", "lev2", "lev1", "lev1", "lev1", "lev1",
"lev1", "lev2", "lev2"), var_3 = c("on", "on", "on", "off", "off",
"on", "on", "on", "off", "off", "on", "on", "on", "off", "off",
"on", "on", "on", "off", "off"), var_4 = c("up", "up", "down",
"down", "up", "down", "up", "down", "up", "up", "up", "up", "down",
"down", "up", "up", "up", "up", "down", "down")), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
ID is a unique identifier; Common_1, Common_2 and Common_3 are grouping variables; G is the group that I want to make the comparisons between; and finally the rest of the columns, var_1:var_4, are the columns used to determine differences. The process would be to compare each row in G=0 to each row in G=1 and, if there is a difference in any of the var columns, return the ID combination of the mismatch and which columns differ.
Here are the desired results for Common_1=A,Common_2=C,Common_3=X it has the ID for rows G=0, all the grouping variables, the ID for the G=1 mismatch and indicator variables showing which columns differed.
# Desired output for the group Common_1 = A, Common_2 = C, Common_3 = X:
# one row per (G = 0 row, G = 1 row) pair. ID is the G = 0 row, ID_diff the
# G = 1 row it was compared with, and var_1..var_4 are 1 where the two rows
# differ in that column and 0 where they agree.
results=structure(list(ID = c(1, 1, 3, 3, 4, 4), Common_1 = c("A", "A",
"A", "A", "A", "A"), Common_2 = c("C", "C", "C", "C", "C", "C"
), Common_3 = c("X", "X", "X", "X", "X", "X"), G = c(0, 0, 0,
0, 0, 0), var_1 = c(1, 1, 0, 0, 0, 0), var_2 = c(0, 0, 1, 1,
1, 1), var_3 = c(0, 1, 0, 1, 1, 0), var_4 = c(0, 0, 1, 1, 1,
1), ID_diff = c(2, 5, 2, 5, 2, 5)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
Update: Added explanation of results
I am doing a pairwise comparison of G=0 to G=1. The first two rows of results are derived like so:
Same overall group Common_1=A,Common_2=C,Common_3=X
now compare ID=1 to ID=2
var_1 is different so a 1 is placed in the var_1 column and rest are zero. ID_diff=2 because that is the ID that differs from ID=1
compare ID=1 to ID=5
var_1 and var_3 are different so a 1 is placed in each column and rest are zero. ID_diff=5 because that is the ID that differs from ID=1
I tried writing a function to loop through each case with G=0 and compare to each case where G=1 but got stuck extracting the mismatch info, any help is appreciated.
Results from Ronak Shah's solution which works but I am having trouble displaying the results correctly.
> var_col <- grep('^var', names(data))
> apply_fun <- function(tmp) {
+ df1 <- subset(tmp, G == 0)
+ df2 <- subset(tmp, G == 1)
+ lapply(seq(nrow(df1)), function(x) {
+ df3 <- df1[rep(x, nrow(df2)), ]
+ df3$ID_diff <- df2$ID
+ df3[var_col] <- +(df1[rep(x, nrow(df2)), var_col] != df2[var_col])
+ df3
+ })
+ }
> library(dplyr)
> data %>%
+ group_by(across(starts_with('Common'))) %>%
+ summarise(data = apply_fun(cur_data_all())) %>%
+ ungroup %>%
+ select(data) %>%
+ tidyr::unnest(data)
`summarise()` regrouping output by 'Common_1', 'Common_2', 'Common_3' (override with `.groups` argument)
# A tibble: 22 x 10
ID Common_1 Common_2 Common_3 G var_1[,1] [,2] [,3] [,4] var_2[,1] [,2] [,3] [,4] var_3[,1] [,2] [,3] [,4] var_4[,1] [,2]
<dbl> <chr> <chr> <chr> <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 A C X 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0
2 1 A C X 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0
3 3 A C X 0 0 1 0 1 0 1 0 1 0 1 0 1 0 1
4 3 A C X 0 0 1 1 1 0 1 1 1 0 1 1 1 0 1
5 4 A C X 0 0 1 1 1 0 1 1 1 0 1 1 1 0 1
6 4 A C X 0 0 1 0 1 0 1 0 1 0 1 0 1 0 1
7 7 A D X 0 1 0 0 1 1 0 0 1 1 0 0 1 1 0
8 8 A D X 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
9 9 A D X 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
10 10 A D X 0 1 0 1 1 1 0 1 1 1 0 1 1 1 0
# ... with 12 more rows, and 3 more variables: [,3] <int>, [,4] <int>, ID_diff <dbl>
Try using this :
We create a function which compares every row of G == 0 with every row of G == 1 and changes the values to 1/0 in var_col based on the comparison.
# Positions of the comparison columns (names starting with "var") in `data`.
var_col <- grep('^var', names(data))

# Compare every G == 0 row of one group with every G == 1 row of that group.
#
# tmp: data frame for a single group; must contain the columns G and ID and
#      the comparison columns at the positions stored in `var_col`.
# Returns a list with one data frame per G == 0 row. Each data frame repeats
# that row once per G == 1 row, stores the ID of the compared G == 1 row in
# ID_diff, and overwrites the comparison columns with 1 (values differ) or
# 0 (values agree). The list is empty when the group has no G == 0 rows.
apply_fun <- function(tmp) {
  df1 <- subset(tmp, G == 0)
  df2 <- subset(tmp, G == 1)
  # seq_len() rather than seq(): with zero G == 0 rows, seq(0) would yield
  # c(1, 0) and index rows that do not exist.
  lapply(seq_len(nrow(df1)), function(x) {
    df3 <- df1[rep(x, nrow(df2)), ]
    df3$ID_diff <- df2$ID
    # Comparing two data frames yields a logical matrix; unary + turns it
    # into 0/1 integers.
    differs <- +(df1[rep(x, nrow(df2)), var_col] != df2[var_col])
    # Convert to a data frame before assigning so that each comparison column
    # receives its own vector. Assigning the matrix directly to several tibble
    # columns can store the whole matrix in every column.
    df3[var_col] <- as.data.frame(differs)
    df3
  })
}
Apply data by group :
# Run apply_fun() once per Common_* group. summarise() stores the returned
# list of data frames in the list-column `data`; only that column is kept and
# then unnested into one flat table of pairwise comparisons.
# (cur_data_all() and across() require dplyr >= 1.0.)
data %>%
  group_by(across(starts_with('Common'))) %>%
  summarise(data = apply_fun(cur_data_all())) %>%
  ungroup %>%
  select(data) %>%
  tidyr::unnest(data)
# A tibble: 22 x 10
# ID Common_1 Common_2 Common_3 G var_1 var_2 var_3 var_4 ID_diff
# <dbl> <chr> <chr> <chr> <dbl> <int> <int> <int> <int> <dbl>
# 1 1 A C X 0 1 0 0 0 2
# 2 1 A C X 0 1 0 1 0 5
# 3 3 A C X 0 0 1 0 1 2
# 4 3 A C X 0 0 1 1 1 5
# 5 4 A C X 0 0 1 1 1 2
# 6 4 A C X 0 0 1 0 1 5
# 7 7 A D X 0 1 0 0 1 6
# 8 8 A D X 0 1 1 0 0 6
# 9 9 A D X 0 1 1 1 1 6
#10 10 A D X 0 1 0 1 1 6
# … with 12 more rows
cur_data_all() and across() need a recent installation of dplyr. My packageVersion('dplyr') is '1.0.1'.
I have data which looks like this:
# Survey example: ten respondents, each answering in years Y1 and Y2
# (rows 1-10 are Y1, rows 11-20 are Y2, in the same ID order).
df <- data.frame(
  ID = rep(
    c("ABC123", "BCD234", "CDE345", "DEF456", "EFG567",
      "FGH678", "GHI891", "HIJ910", "IJK101", "JKL011"),
    times = 2
  ),
  eth = c(1, 2, 2, 3, 1, 1, 4, 4, 3, 3, 1, 4, 1, 3, 1, 3, 4, 4, 3, 2),
  nzdep = c(4, 3, 3, 2, 4, 4, 1, 1, 2, 2, 4, 3, 3, 4, 4, 2, 1, 1, 2, 3),
  sex = c("M", "M", "F", "F", "M", "M", "F", "F", "M", "M",
          "F", "M", "M", "M", "M", "F", "F", "M", "F", "M"),
  Year = rep(c("Y1", "Y2"), each = 10)
)
This is survey data, for the same people, in different years. The ID is a unique ID per person, and the Year tells us which year the survey was completed. What I want to know, is whether the same ID answered the same question the same way in both years.
I have tried something like this:
dems <- df %>%
group_by(ID) %>%
mutate(dep_dif = ifelse(nzdep = nzdep, 1, 0),
sex_dif = ifelse(sex = sex, 1, 0),
eth_dif = ifelse(eth = eth, 1, 0))
This doesn't work, but I was thinking something along these lines.
My desired output would be:
# Desired output: the survey data plus, per question, an indicator that is
# repeated on both of a respondent's rows (1 = same answer in Y1 and Y2,
# 0 = different answers).
dems <- data.frame(
ID = c(rep(c("ABC123", "BCD234", "CDE345", "DEF456", "EFG567", "FGH678", "GHI891", "HIJ910", "IJK101", "JKL011"),2)),
eth = c(1, 2, 2, 3, 1, 1, 4, 4, 3, 3, 1, 4, 1, 3, 1, 3, 4, 4, 3, 2),
nzdep = c(4, 3, 3, 2, 4, 4, 1, 1, 2, 2, 4, 3, 3, 4, 4, 2, 1, 1, 2, 3),
sex = c("M", "M", "F", "F", "M", "M", "F", "F", "M", "M", "F", "M", "M", "M", "M", "F", "F", "M", "F", "M"),
Year = c(rep("Y1", 10), rep("Y2", 10)),
eth_dif = c(rep(c(1, 0, 0, 1, 1, 0, 1, 1, 1, 0),2)),
dep_dif = c(rep(c(1, 1, 1, 0, 1, 0, 1, 1, 1, 0),2)),
sex_dif = c(rep(c(0, 1, 0, 0, 1, 0, 1, 0, 0, 1),2)))
Does anyone know how to do this?
It seems you need to check whether the number of unique values per ID equals one
# Per ID, flag each question with 1 when all of that respondent's answers are
# identical (exactly one distinct value) and 0 otherwise.
# The first flag is named dep_dif here; the printed output that follows was
# produced without that name, so its header shows the raw expression instead.
df %>%
  group_by(ID) %>%
  dplyr::mutate(
    dep_dif = as.numeric(dplyr::n_distinct(nzdep) == 1),
    sex_dif = as.numeric(dplyr::n_distinct(sex) == 1),
    eth_dif = as.numeric(dplyr::n_distinct(eth) == 1)
  )
# A tibble: 20 x 8
# Groups: ID [10]
ID eth nzdep sex Year `ifelse(length(unique(nzdep)) == 1, 1, 0)` sex_dif eth_dif
<fctr> <dbl> <dbl> <fctr> <fctr> <dbl> <dbl> <dbl>
1 ABC123 1 4 M Y1 1 0 1
2 BCD234 2 3 M Y1 1 1 0
3 CDE345 2 3 F Y1 1 0 0
4 DEF456 3 2 F Y1 0 0 1
5 EFG567 1 4 M Y1 1 1 1
6 FGH678 1 4 M Y1 0 0 0
7 GHI891 4 1 F Y1 1 1 1
8 HIJ910 4 1 F Y1 1 0 1
9 IJK101 3 2 M Y1 1 0 1
10 JKL011 3 2 M Y1 0 1 0
11 ABC123 1 4 F Y2 1 0 1
12 BCD234 4 3 M Y2 1 1 0
13 CDE345 1 3 M Y2 1 0 0
14 DEF456 3 4 M Y2 0 0 1
15 EFG567 1 4 M Y2 1 1 1
16 FGH678 3 2 F Y2 0 0 0
17 GHI891 4 1 F Y2 1 1 1
18 HIJ910 4 1 M Y2 1 0 1
19 IJK101 3 2 F Y2 1 0 1
20 JKL011 2 3 M Y2 0 1 0
We could do this with mutate_at
# Per ID, compare the Y1 answer with the Y2 answer for each question and
# store the result as an integer flag in <question>_dif (1 = equal,
# 0 = different). The single comparison value is recycled to both of the
# respondent's rows. Columns are selected by name rather than by position,
# because positions inside across() do not count the grouping column.
# across() replaces the superseded mutate_at() and the deprecated funs().
df %>%
  group_by(ID) %>%
  mutate(across(
    c(eth, nzdep, sex),
    ~ as.integer(.x[Year == "Y1"] == .x[Year == "Y2"]),
    .names = "{.col}_dif"
  ))
# A tibble: 20 x 8
# Groups: ID [10]
# ID eth nzdep sex Year eth_dif nzdep_dif sex_dif
# <fct> <dbl> <dbl> <fct> <fct> <int> <int> <int>
# 1 ABC123 1 4 M Y1 1 1 0
# 2 BCD234 2 3 M Y1 0 1 1
# 3 CDE345 2 3 F Y1 0 1 0
# 4 DEF456 3 2 F Y1 1 0 0
# 5 EFG567 1 4 M Y1 1 1 1
# 6 FGH678 1 4 M Y1 0 0 0
# 7 GHI891 4 1 F Y1 1 1 1
# 8 HIJ910 4 1 F Y1 1 1 0
# 9 IJK101 3 2 M Y1 1 1 0
#10 JKL011 3 2 M Y1 0 0 1
#11 ABC123 1 4 F Y2 1 1 0
#12 BCD234 4 3 M Y2 0 1 1
#13 CDE345 1 3 M Y2 0 1 0
#14 DEF456 3 4 M Y2 1 0 0
#15 EFG567 1 4 M Y2 1 1 1
#16 FGH678 3 2 F Y2 0 0 0
#17 GHI891 4 1 F Y2 1 1 1
#18 HIJ910 4 1 M Y2 1 1 0
#19 IJK101 3 2 F Y2 1 1 0
#20 JKL011 2 3 M Y2 0 0 1
If the 'ID' is already ordered a base R option would be
# Base R variant: split the three answer columns (positions 2 to 4) by Year,
# compare the resulting per-year tables element-wise, and store the 0/1
# result in new <question>_dif columns. Rows are matched by position, so the
# IDs must be in the same order within every Year.
answer_cols <- names(df)[2:4]
by_year <- split(df[answer_cols], df$Year)
df[paste0(answer_cols, "_dif")] <- +(Reduce(`==`, by_year))