Count combinations between columns in data frame - r

I have a dataframe like this
V1 V2 V3
1 A A A
2 B A A
3 A B C
4 C A A
With this code I get another data frame containing all the possible combinations of "A", "B" and "C".
library("gtools")
# Symbols that may appear in any cell of the data frame.
vars <- c("A", "B", "C")
# Unordered pairs with repetition (AA, AB, AC, BB, BC, CC); because order is
# ignored, "AB" and "BA" are represented only once.
combMatrix <- combinations(n = length(vars), r = 2, v = vars,
                           repeats.allowed = TRUE)
# Collapse each pair into a single two-character key.
combArray <- paste0(combMatrix[, 1], combMatrix[, 2])
# Cross every pair key with every symbol the remaining column can take.
combDf <- expand.grid(combArray, vars)
Then I want to count the combinations between a pair of columns in the first data frame (let's say V1 and V2) and the remaining column. It is important to treat the concatenated characters (V1+V2 in this case) such as "AB" and "BA" as the same combination.
The final data frame should look like this.
V1+V2 V3 Freq
AA A 1
AB A 1
AC A 1
BB A 0
BC A 0
CC A 0
AA B 0
AB B 0
AC B 0
BB B 0
BC B 0
CC B 0
AA C 0
AB C 1
AC C 0
BB C 0
BC C 0
CC C 0
Then I have to repeat the process for every combination of columns (V1+V2/V3, V1+V3/V2, V2+V3/V1).

You can try this tidyverse solution.
First calculate all possible 18 combinations considering AB==BA for two variables using some magic and not-so-elegant code including map, unite, sort and paste together with rowwise.
library(tidyverse)
# Every distinct symbol found anywhere in `d`, computed once and reused.
symbols <- unique(unlist(d))
# All (pair, third value) keys of the form "<two sorted symbols>_<symbol>".
all_combs <- expand.grid(symbols, symbols, symbols) %>%
  # expand.grid() may return factors; work on plain strings.
  mutate(across(everything(), as.character)) %>%
  rowwise() %>%
  # Sort the two symbols within each row so "AB" and "BA" give the same key.
  mutate(two = paste(sort(c(Var1, Var2)), collapse = "")) %>%
  ungroup() %>%
  unite(all, two, Var3) %>%
  select(all) %>%
  # Drop the duplicates created by treating the pair as unordered.
  distinct()
Then the rest
# Enumerate every pair of column positions of `d`; each pair gives the two
# columns whose values are merged into one unordered key.
combn(1:ncol(d),2, simplify = F) %>%
# Name each pair "i&j" so that bind_rows() below can record it in `id`.
set_names(map(.,~paste(., collapse = "&"))) %>%
# For each pair: move the two chosen columns to the front, renamed `a` and `b`.
map(~select(d,a =.[1], b=.[2], everything()) %>%
rowwise() %>%
mutate_all(as.character) %>%
# Sort the two values before pasting so "AB" and "BA" produce the same key.
mutate(two=paste(sort(c(a, b)), collapse="")) %>%
# Keep the key plus the columns whose name still contains "V" (the ones that
# were not renamed to a/b).
select(two, contains("V"), -a,-b) %>%
ungroup() %>%
# Build the `all` key that is joined against `all_combs`, then count each key.
unite(all, two, contains("V")) %>%
count(all)) %>%
# Add the keys from `all_combs` that never occur; their `n` is NA after the join.
map(~right_join(.,all_combs, by="all")) %>%
bind_rows(.id = "id") %>%
# Turn the missing counts into zeros.
mutate(n=ifelse(is.na(n), 0, n)) %>%
# One column per column pair, one row per key.
spread(id, n)
# A tibble: 18 x 4
all `1&2` `1&3` `2&3`
<chr> <dbl> <dbl> <dbl>
1 AA_A 1 1 1
2 AA_B 0 0 1
3 AA_C 0 0 1
4 AB_A 1 1 0
5 AB_B 0 0 0
6 AB_C 1 0 0
7 AC_A 1 1 0
8 AC_B 0 1 0
9 AC_C 0 0 0
10 BB_A 0 0 0
11 BB_B 0 0 0
12 BB_C 0 0 0
13 BC_A 0 0 1
14 BC_B 0 0 0
15 BC_C 0 0 0
16 CC_A 0 0 0
17 CC_B 0 0 0
18 CC_C 0 0 0

Related

r count values in rows after dcast

I want to sum all values in a row of a dataframe after performing a dcast operation from the reshape2 package. Problem is that all values are the same (10) and are the sum of all rows combined. Values should be 4,2,4
Example data with code:
df <- data.frame(x = as.factor(c("A","A","A","A","B","B","C","C","C","C")),
y = as.factor(c("AA","AB","AA","AC","BB","BA","CC","CC","CC","CD")),
z = c("var1","var1","var2","var1","var2","var1","var1","var2","var2","var1"))
df2 <- df %>%
group_by(x,y) %>%
summarise(num = n()) %>%
ungroup()
df3 <- dcast(df2,x~y, fill = 0 )
df3$total <- sum(df3$AA,df3$AB,df3$AC,df3$BA,df3$BB,df3$CC,df3$CD)
sum gives you 1 combined value and that value is repeated for all other rows.
sum(df3$AA,df3$AB,df3$AC,df3$BA,df3$BB,df3$CC,df3$CD)
#[1] 10
You need rowSums to get sum of each row separately.
df3$total <- rowSums(df3[-1])
Here is a simplified tidyverse approach starting from df :
library(dplyr)
library(tidyr)
df %>%
  # One row per (x, y) pair with its frequency in `num`.
  count(x, y, name = 'num') %>%
  # Spread the y values into columns; pairs that never occur get 0, not NA.
  pivot_wider(names_from = y, values_from = num, values_fill = 0) %>%
  # Sum every count column (everything except the key `x`) instead of naming
  # the first and last y value, so the total survives a change in y values.
  mutate(total = rowSums(select(., -x)))
# x AA AB AC BA BB CC CD total
# <fct> <int> <int> <int> <int> <int> <int> <int> <dbl>
#1 A 2 1 1 0 0 0 0 4
#2 B 0 0 0 1 1 0 0 2
#3 C 0 0 0 0 0 3 1 4
We can specify the values_fn in pivot_wider and also use adorn_totals from janitor
library(dplyr)
library(tidyr)
library(janitor)
df %>%
pivot_wider(names_from = y, values_from = z, values_fill = 0,
values_fn = length) %>%
adorn_totals("col")
-output
# x AA AB AC BB BA CC CD Total
# A 2 1 1 0 0 0 0 4
# B 0 0 0 1 1 0 0 2
# C 0 0 0 0 0 3 1 4
Or using base R with xtabs and addmargins
addmargins(xtabs(z ~ x + y, transform(df, z = 1)), 2)
# y
#x AA AB AC BA BB CC CD Sum
# A 2 1 1 0 0 0 0 4
# B 0 0 0 1 1 0 0 2
# C 0 0 0 0 0 3 1 4

Construct a variable name using a string and another variable in the same df and then update its value in R

I am trying to update variables in a df selected using a constructed name with a string and another variable in the same data frame.
Say my df is:
df
y index
1 4 1
2 8 5
3 4 3
4 6 2
to which I add five variables m.1 through m.5:
> df
y index m.1 m.2 m.3 m.4 m.5
1 4 1 0 0 0 0 0
2 8 5 0 0 0 0 0
3 4 3 0 0 0 0 0
4 6 2 0 0 0 0 0
using something like this:
# Add a zero-filled column named "m.<n>" to `df`.
createvars <- function(df, n) {
  mutate(df, "m.{n}" := 0)
}
# seq_len() yields an empty sequence when the maximum is 0, whereas 1:0 would
# count down and create m.1 and m.0.
for (i in seq_len(max(df$index))) {
  df <- createvars(df, n = i)
}
I want to update the variables m.1 through m.5 using something like this m.{n} := index based on conditions that look something like this {n} == index.
The result then should look like:
> df
y index m.1 m.2 m.3 m.4 m.5
1 4 1 1 0 0 0 0
2 8 5 0 0 0 0 5
3 4 3 0 0 3 0 0
4 6 2 0 2 0 0 0
Note, in the actual sample/application:
the number of variables m. will depend on the sample and can be several hundred,
the values in the variables m. will be functions of index and other variables in the df,
the df will ultimately be used in lm(y~) and this might not be the right way to proceed.
Any suggestion how to accomplish this?
Thanks tons for any suggestion!!!!
One dplyr and tibble solution could be:
# Names of the indicator columns, one per possible value of `index`.
m_cols <- paste0("m", seq_len(max(df$index)))
df %>%
  # Append the indicator columns, all initialised to 0.
  add_column(!!!setNames(rep(0, length(m_cols)), m_cols)) %>%
  # In the column whose name matches the row's index, store the index. The
  # comparison is TRUE/FALSE; unary + turns it into 1/0 before multiplying.
  # all_of(m_cols) restricts the update to the columns created above rather
  # than to every column that happens to start with "m".
  mutate(across(all_of(m_cols), ~ +(paste0("m", index) == cur_column()) * index))
y index m1 m2 m3 m4 m5
1 4 1 1 0 0 0 0
2 8 5 0 0 0 0 5
3 4 3 0 0 3 0 0
4 6 2 0 2 0 0 0
It is easier with row/column indexing in base R
# // create one 'm.' column per possible index value, filled with 0
# // (the count is taken from the data instead of being fixed at 5)
nm1 <- paste0('m.', seq_len(max(df$index)))
df[nm1] <- 0
# // assign the elements that correspond to row/column index with index:
# // each row of the two-column matrix is a (row, column) position in df[nm1]
df[nm1][cbind(seq_len(nrow(df)), df$index)] <- df$index
-output
df
# y index m.1 m.2 m.3 m.4 m.5
#1 4 1 1 0 0 0 0
#2 8 5 0 0 0 0 5
#3 4 3 0 0 3 0 0
#4 6 2 0 2 0 0 0
Or another option is model.matrix from base R
df[nm1] <- model.matrix(~ factor(df$index, levels = 1:5) - 1) * df$index
Or modify the function createvars and use a for loop
library(stringr)
# Add column "m.<n>" to `data`: it holds `index` on the rows where
# `index == n` and 0 everywhere else.
createvars <- function(data, n) {
  new_col <- str_c('m.', n)
  mutate(data, !!new_col := case_when(index == n ~ index, TRUE ~ 0L))
}
# Create m.1 ... m.<max index>, one column per pass.
for (i in seq_len(max(df$index))) {
  df <- createvars(df, i)
}
Or another option with rowwise and unnest
library(tidyr)
library(dplyr)
mx <- max(df$index)
df %>%
rowwise %>%
mutate(new = list(replace(numeric(mx), index, index))) %>%
ungroup %>%
unnest_wider(c(new)) %>%
rename_at(vars(starts_with("..")), ~ str_c('m.', seq_along(.)))
data
df <- structure(list(y = c(4L, 8L, 4L, 6L), index = c(1L, 5L, 3L, 2L
)), class = "data.frame", row.names = c("1", "2", "3", "4"))
You can get the data in wide format after creating a dummy column.
library(dplyr)
library(tidyr)
df %>%
mutate(row = row_number(),
col = paste0('m', index),
index1 = index) %>%
complete(col = paste0('m', 1:max(index))) %>%
pivot_wider(names_from = col, values_from = index1, values_fill = 0) %>%
filter(!is.na(row)) %>%
arrange(row) %>%
select(-row)
# y index m1 m2 m3 m4 m5
# <int> <int> <int> <int> <int> <int> <int>
#1 4 1 1 0 0 0 0
#2 8 5 0 0 0 0 5
#3 4 3 0 0 3 0 0
#4 6 2 0 2 0 0 0

R: adapt mutate call from handling three binary variables to n binary variables

I've got a dataframe with 3 binary variables that relate to time period 1 and three corresponding variables that relate to time 2.
df <- data.frame("user" = c("a","b","c","d","e"), "item_1_time_1" = c(1,0,0,0,NA), "item_2_time_1" = c(1,1,1,0,NA), "item_3_time_1" = c(0,0,1,0,0), "item_1_time_2" = c(1,0,0,0,NA), "item_2_time_2" = c(1,0,0,0,NA), "item_3_time_2" = c(0,0,1,0,1))
df
user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2
1 a 1 1 0 1 1 0
2 b 0 1 0 0 0 0
3 c 0 1 1 0 0 1
4 d 0 0 1 0 0 0
5 e NA NA 0 NA NA 1
I would like to know if an observation has a 1 for a given item during period 1 but not during period 2. Moreover, I would like to know if an observation has any instance in which an item is 1 during period 1 and not period 2.
So the ideal output would look like
df2 <- data.frame("user" = c("a","b","c","d","e"), "item_1_time_1" = c(1,0,0,0,NA), "item_2_time_1" = c(1,1,1,0,NA), "item_3_time_1" = c(0,0,1,1,0), "item_1_time_2" = c(1,0,0,0,NA), "item_2_time_2" = c(1,0,0,0,NA), "item_3_time_2" = c(0,0,1,0,1), "item_1_check" = c(1,1,1,1,1), "item_2_check" = c(1,0,0,1,1), "item_3_check" = c(1,1,1,0,1), item_check = c(1,0,0,0,1))
df2
user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2 item_1_check item_2_check item_3_check item_check
1 a 1 1 0 1 1 0 1 1 1 1
2 b 0 1 0 0 0 0 1 0 1 0
3 c 0 1 1 0 0 1 1 0 1 0
4 d 0 0 1 0 0 0 1 1 0 0
5 e NA NA 0 NA NA 1 1 1 1 1
So far I've tried
library(tidyverse)
df2 <- df %>%
mutate(across(ends_with('time_2'), replace_na, 0)) %>%
mutate(across(ends_with('time_1'), replace_na, 0)) %>%
mutate(item_1_check = if_else(item_1_time_1 == 1 & item_1_time_2 == 0, 0, 1),
item_2_check = if_else(item_2_time_1 == 1 & item_2_time_2 == 0, 0, 1),
item_3_check = if_else(item_3_time_1 == 1 & item_3_time_2 == 0, 0, 1)) %>%
mutate(item_check = pmin(item_1_check, item_2_check, item_3_check))
I would like to generalize the above mutate calls so that they can handle n many items rather than just 3. Is there a way that I can use ends_with('check') for the final mutate? The variable names don't vary but for the item number and time period.
An option would be to reshape to 'long' format and do this once
library(dplyr)
library(tidyr)
df %>%
  # One row per user and item, with the two periods side by side in the
  # columns `time_1` and `time_2`; `group` holds the item part of the name.
  pivot_longer(cols = -user, names_to = c('group', '.value'),
               names_sep = "_(?=time)") %>%
  # Treat a missing answer as 0 (lambda form; passing extra arguments through
  # across() is deprecated).
  mutate(across(starts_with('time'), ~ replace_na(.x, 0))) %>%
  group_by(user) %>%
  # An item fails when it is 1 at time 1 and 0 at time 2; min() over the
  # user's items is 1 only if no item fails.
  summarise(check = min(!(time_1 & !time_2)), .groups = 'drop') %>%
  # Attach the per-user flag to the original wide data.
  right_join(df, ., by = 'user') %>%
  select(all_of(names(df)), check)
# user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2 check
#1 a 1 1 0 1 1 0 1
#2 b 0 1 0 0 0 0 0
#3 c 0 1 1 0 0 1 0
#4 d 0 0 0 0 0 0 1
#5 e NA NA 0 NA NA 1 1
Or using base R
# Per row: 1 when no item is non-zero at time 1 and zero at time 2, else 0.
# - replace(...) turns the NAs in every column except the first (`user`) into 0
# - split.default(...) groups those columns by their name with the "time_<n>"
#   part removed, i.e. one group per item (assumes each group holds the
#   time 1 column first and the time 2 column second)
# - the function flags, per item, the rows that are NOT (time 1 set and
#   time 2 unset)
# - Reduce(`&`, ...) requires that flag for every item; unary + converts the
#   resulting TRUE/FALSE to 1/0
df$check <- +( Reduce(`&`, lapply(split.default(replace(df[-1],
is.na(df[-1]), 0), sub("time_\\d+", "", names(df)[-1])),
function(x) !(x[[1]] & !x[[2]]))))

Turn row values from multiple columns into column names in R?

I have a data frame that looks as follows:
state1 state1_pp state2 state2_pp state3 state3_pp
<chr> <chr> <chr> <chr> <chr> <chr>
1 0 0.995614 F 0.004386 NA 0
2 0 1 NA 0 NA 0
3 0 1 NA 0 NA 0
I want the values from each of the rows to be the column names and the numeric values to be the row values:
0 F NA
<chr> <chr> <chr>
1 0.995614 0.004386 0
2 1 0 0
3 1 0 0
How do I do this in R?
Or a more complex scenario:
state1 state1_pp state2 state2_pp state3 state3_pp
1 0 0.995614 F 0.004386 NA 0
2 A 1 B 0 C 0
3 D 0.7 B 0.3 NA 0
This is what I want:
0 A D F B C NA
1 0.995614 0 0 0.004386 0 0 0
2 0 1 0 0 0 0 0
3 0 0 0.7 0 0.3 0 0
First a warning, having column names that are numeric (like 1) or are reserved R keywords (like NA) can cause you all sorts of errors. But if you must do it, I suggest the following:
library(dplyr)
# names of the label columns (state1, state2, ...), however many there are;
# the matching "_pp" columns are not selected by this pattern
state_cols <- grep("^state\\d+$", names(df), value = TRUE)
# extract title row (the labels are taken from the first row only)
headers <- df %>%
  head(1) %>%
  select(all_of(state_cols)) %>%
  unlist(use.names = FALSE) %>%
  as.character()
# replace NA with "NA"
headers[is.na(headers)] <- "NA"
# drop columns that are not wanted
new_df <- df %>%
  select(-all_of(state_cols))
# replace column names
colnames(new_df) <- headers
In order to refer to your new columns you will probably need to use backticks: `
So with your new column names 0, F and NA you can call new_df$F but you cannot call new_df$NA or new_df$0. Instead you will have to call new_df$`0` and new_df$`NA`.
Here's an attempt using dplyr and tidyr :
library(dplyr)
library(tidyr)
df %>%
mutate(row = row_number()) %>%
mutate_all(as.character) %>%
pivot_longer(cols = -row) %>%
mutate(name = sub('\\d+', '', name)) %>%
group_by(name, row) %>%
mutate(row1 = row_number()) %>%
pivot_wider() %>%
group_by(state, row) %>%
mutate(row1 = row_number()) %>%
pivot_wider(names_from = state, values_from = state_pp,
values_fill = list(state_pp = 0)) %>%
ungroup() %>%
select(-row, -row1)
# A tibble: 3 x 7
# `0` F `NA` A B C D
# <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 0.995614 0.004386 0 0 0 0 0
#2 0 0 0 1 0 0 0
#3 0 0 0 0 0.3 0 0.7

Use mutate_at() to create multiple binary variables from the values of a single variable

I have some variables that contain the following support values {a, b, c, ... k} and I wanted to create multiple binary variables for each response. For example, var_a would be equivalent to as.numeric(variable name very long== "a"), var_b would be equivalent to as.numeric(variable name very long== "b") and so on. However, in some of the variables, they don't go neatly from a:k. Some might have skipped a letter or two.
I know how to use mutate_at when I have multiple variables that I want to change, but what if I only have one variable from which I want to create multiple variables all at once?
What I have been doing so far is this:
df <- df %>% mutate(var_a = as.numeric(`variable name very long` == "a"),
var_b = as.numeric(`variable name very long` == "b"),
...)
Except of course there are more than two variables that I want to create. Is there an easier way to do this? And I also use mutate as a way to shorten the variable name. I've also tried creating a function that might be able to do this for whatever variable and value I want it to be since I have to do this often, but I wasn't able to get it to work:
# Attempted helper: add a 0/1 column named by `newvar` that flags the rows
# where the column named by `var` equals `value`.
# NOTE(review): as written this does not do that, for three reasons:
#  * `newvar = ...` inside mutate() creates a column literally called
#    "newvar", not a column with the name passed in;
#  * `var` arrives as a character string and "value" is a quoted literal, so
#    the comparison tests that string against the text "value" instead of
#    testing a column against the argument `value` (unless `df` itself has a
#    column named `var`);
#  * `df <- ...` assigns to a variable local to the function, so the `df`
#    outside the function is left unchanged.
varname <- function(newvar, var, value){
df <- df %>% mutate(newvar = as.numeric(var == "value"))
}
varname("var_a", "`variable name very long`", "a")
Any suggestions are deeply appreciated. Thank you!
We could use map2 to loop over the unique elements in the column, along with the vector of new column names, transmute to create the column, and bind the output with the original data
library(dplyr)
library(purrr)
library(stringr)
un1 <- sort(as.character(unique(df[["variable name very long"]])))
un2 <- str_c('var_', un1)
map2_dfc(un1, un2, ~ df %>%
transmute(!! .y := +(`variable name very long` == .x))) %>%
bind_cols(df, .)
# A tibble: 20 x 7
# `variable name very long` val var_a var_b var_c var_d var_e
# * <chr> <dbl> <int> <int> <int> <int> <int>
# 1 c -0.710 0 0 1 0 0
# 2 b -1.04 0 1 0 0 0
# 3 c -0.798 0 0 1 0 0
# 4 e 0.319 0 0 0 0 1
# 5 b 1.87 0 1 0 0 0
# 6 b -0.317 0 1 0 0 0
# 7 a -0.773 1 0 0 0 0
# 8 d -1.44 0 0 0 1 0
# 9 a -0.348 1 0 0 0 0
#10 a -0.421 1 0 0 0 0
#11 e 1.06 0 0 0 0 1
#12 e 0.528 0 0 0 0 1
#13 a 3.13 1 0 0 0 0
#14 e -0.546 0 0 0 0 1
#15 e -1.05 0 0 0 0 1
#16 d -0.687 0 0 0 1 0
#17 e -1.13 0 0 0 0 1
#18 b -0.489 0 1 0 0 0
#19 a 1.85 1 0 0 0 0
#20 d -0.0376 0 0 0 1 0
Or another option is pivot_wider
library(tidyr)
df %>%
mutate(rn = row_number(), n = 1,
newcol = str_c('var_', `variable name very long`)) %>%
pivot_wider(names_from = newcol, values_from = n, values_fill = list(n = 0))
Or in base R with model.matrix
cbind(df, model.matrix(~ `variable name very long` -1, df))
data
set.seed(24)
df <- tibble(`variable name very long` = sample(letters[1:5],
20, replace = TRUE), val = rnorm(20))

Resources