tidyverse: all permutations of categories - r

Here's a problem: I have all possible combinations of M elements from a set of N elements (N choose M). Each combination has a value assigned.
An example for N = 5 and M = 3:
library(tidyverse)
df <- letters[1:5] %>% combn( m = 3 ) %>% t() %>%
as_tibble( .name_repair = function(x) {paste0('id', 1:length(x))} )
df$val <- runif( nrow(df) )
Which gives a set of 10 combinations:
# A tibble: 10 x 4
id1 id2 id3 val
<chr> <chr> <chr> <dbl>
1 a b c 0.713
2 a b d 0.314
3 a b e 0.831
4 a c d 0.555
5 a c e 0.915
6 a d e 0.954
7 b c d 0.131
8 b c e 0.0583
9 b d e 0.533
10 c d e 0.857
Now I would like to add the combinations such that the results represents selection of M elements without replacement (N!/(N-M)!), but conserving the values for each set of M elements.
So, staying with the example, the result should contain 543=60 rows. For the example, I can do it in a "manual" permutation of the columns:
# add missing combinations
df_perm <- df %>% bind_rows(
# 1, 3, 2
df %>% mutate( tmp = id2, id2 = id3, id3 = tmp ) %>%
select( -tmp )
) %>% bind_rows(
# 2, 1, 3
df %>% mutate( tmp = id1, id1 = id2, id2 = tmp ) %>%
select( -tmp )
) %>% bind_rows(
# 2, 3, 1
df %>% mutate( tmp = id1, id1 = id2, id2 = id3, id3 = tmp ) %>%
select( -tmp )
) %>% bind_rows(
# 3, 1, 2
df %>% mutate( tmp = id2, id2 = id1, id1 = id3, id3 = tmp ) %>%
select( -tmp )
) %>% bind_rows(
# 3, 2, 1
df %>% mutate( tmp = id3, id3 = id1, id1 = tmp ) %>%
select( -tmp )
)
However, this becomes unfeasible quickly for M>3.
What would be a more elegant way to achieve the same result?

As I read your question, it essentially seems that you have assigned a value to each possible combination of size M from a set of size N. You would then like to map the value for each combination to its permutations.
For example, if the combination a, b, d has a value of 0.4, then you would like a, b, d, a, d, b, b, a, d, b, d, a, d, b, a and d, a, b to have a value of 0.4.
First, get all possible permutations of the vector 1:M, where M is the number of elements per combination as defined above:
M <- 3
perm_mat <- gtools::permutations(M, M)
Then permute the columns of the df as per the above permutations:
perm_df <- purrr::map_df(1:nrow(perm_mat), function(i){
df_curr <- df[,c(perm_mat[i,], M+1)]
colnames(df_curr) <- colnames(df)
df_curr
})
This produces the following output (first twenty rows):
V1 V2 V3 val
<chr> <chr> <chr> <dbl>
1 a b c 0.0682
2 a b d 0.735
3 a b e 0.0336
4 a c d 0.965
5 a c e 0.889
6 a d e 0.796
7 b c d 0.792
8 b c e 0.508
9 b d e 0.606
10 c d e 0.623
11 a c b 0.0682
12 a d b 0.735
13 a e b 0.0336
14 a d c 0.965
15 a e c 0.889
16 a e d 0.796
17 b d c 0.792
18 b e c 0.508
19 b e d 0.606
20 c e d 0.623
Note that the numbers in the values column are different to the original post simply because I used a different seed before running runif.

Related

Merge data frames based on custom condition - string comparison

I'd like to merge rows of two data frames - df1 and df2 using column A:
#df1
A <- c('ab','ab','bc','bc','bc','cd')
B <- floor(runif(6, min=0, max=10))
C <- floor(runif(6, min=0, max=10))
D <- floor(runif(6, min=0, max=10))
E <- c('a, b, c','a, d, e','a, g, h','d, e, f','a, d, f','f, j')
df1 <- data.frame(A,B,C,D,E)
df1
A B C D E
1 ab 5 4 3 a, b, c
2 ab 9 4 0 a, d, e
3 bc 4 4 9 a, g, h
4 bc 5 5 6 d, e, f
5 bc 1 6 6 a, d, f
6 cd 1 2 0 f, j
#df2
A <- c('ab','bc','cd')
B <- floor(runif(3, min=0, max=10))
E <- c('a, d','d, f','n, m')
df2 <- data.frame(A,B,E)
df2
A B E
1 ab 4 a, d
2 bc 7 d, f
3 cd 1 n, m
I can do simply:
df3 <- merge(x=df1, y=df2, by='A', all.x = TRUE)
However there's condition of merging. Namely, I'd like to merge only rows from df2 to df1 when all substrings (column E) from df2 are present in df1, so the output should look like this:
df3
A B C D E A.y B.y E.y
1 ab 5 4 3 a, b, c NA NA NA
2 ab 9 4 0 a, d, e, ab 6 a, d
3 bc 4 4 9 a, g, h NA NA NA
4 bc 5 5 6 d, e, f bc 7 d, f
5 bc 1 6 6 a, d, f bc 7 d, f
6 cd 1 2 0 f, j NA NA NA
I know there's an option using %in% regarding vector comparison. However I have strings, should I first do some strsplit and unlist and then perform the comparison?
This is pretty messy but should do what you're looking for:
First, expand rows for both E values, then group by the key column to check if any values from RHS E are in LHS E. Then filter based on the lookup table.
library(tidyverse)
df3 <- merge(x=df1, y=df2, by='A', all.x = TRUE)
check_rows <- df3 %>%
separate_rows(E.y, sep = ',') %>%
separate_rows(E.x, sep = ',') %>%
mutate(E.x = trimws(E.x),
E.y = trimws(E.y)) %>%
group_by(A) %>%
mutate(check = E.y %in% E.x,
check = ifelse(any(check == TRUE), TRUE, FALSE)) %>%
select(A, check) %>%
unique() %>%
filter(check == TRUE)
df3 <- df3 %>%
filter(A %in% check_rows$A)

Extract rows where value appears in any of multiple columns

Let' say I have two data.frames
name_df = read.table(text = "player_name
a
b
c
d
e
f
g", header = T)
game_df = read.table(text = "game_id winner_name loser_name
1 a b
2 b a
3 a c
4 a d
5 b c
6 c d
7 d e
8 e f
9 f a
10 g f
11 g a
12 f e
13 a d", header = T)
name_df contains a unique list of all the winner_name or loser_name values in game_df. I want to create a new data.frame that has, for each person in the name_df a row if a given name (e.g. a) appears in either the winner_name or loser_name column
So I essentially want to merge game_df with name_df, but the key column (name) can appear in either winner_name or loser_name.
So, for just a and b the final output would look something like:
final_df = read.table(text = "player_name game_id winner_name loser_name
a 1 a b
a 2 b a
a 3 a c
a 4 a d
a 9 f a
a 11 g a
a 13 a d
b 1 a b
b 2 b a
b 5 b c", header = T)
We can loop over the elements in 'name_df' for 'player_name', filter the rows from 'game_df' for either the 'winner_name' or 'loser_name'
library(dplyr)
library(purrr)
map_dfr(setNames(name_df$player_name, name_df$player_name),
~ game_df %>%
filter(winner_name %in% .x|loser_name %in% .x), .id = 'player_name')
Or if there are many columns, use if_any
map_dfr(setNames(name_df$player_name, name_df$player_name),
~ {
nm1 <- .x
game_df %>%
filter(if_any(c(winner_name, loser_name), ~ . %in% nm1))
}, .id = 'player_name')
Dedicated to our teacher and mentor dear #akrun
I think we can also make use of the add_row() function you first taught me the other day. Unbelievable!!!
library(dplyr)
library(purrr)
library(tibble)
game_df %>%
rowwise() %>%
mutate(player_name = winner_name) %>%
group_split(game_id) %>%
map_dfr(~ add_row(.x, game_id = .x$game_id, winner_name = .x$winner_name,
loser_name = .x$loser_name, player_name = .x$loser_name)) %>%
arrange(player_name) %>%
relocate(player_name)
# A tibble: 26 x 4
player_name game_id winner_name loser_name
<chr> <int> <chr> <chr>
1 a 1 a b
2 a 2 b a
3 a 3 a c
4 a 4 a d
5 a 9 f a
6 a 11 g a
7 a 13 a d
8 b 1 a b
9 b 2 b a
10 b 5 b c
# ... with 16 more rows
This can be directly expressed in SQL:
library(sqldf)
sqldf("select *
from name_df
left join game_df on winner_name = player_name or loser_name = player_name")
Without using purrr. I think this is appropriate use case of tidyr::unite with argument remove = F where we can first unite the winners' and losers' names and then use tidyr::separate_rows to split new column into rows.
library(tidyr)
library(dplyr)
game_df %>% unite(Player_name, winner_name, loser_name, remove = F, sep = ', ') %>%
separate_rows(Player_name) %>%
relocate(Player_name) %>%
arrange(Player_name)
# A tibble: 26 x 4
Player_name game_id winner_name loser_name
<chr> <int> <chr> <chr>
1 a 1 a b
2 a 2 b a
3 a 3 a c
4 a 4 a d
5 a 9 f a
6 a 11 g a
7 a 13 a d
8 b 1 a b
9 b 2 b a
10 b 5 b c
# ... with 16 more rows
A Base R approach :
result <- do.call(rbind, lapply(name_df$player_name, function(x)
cbind(plaername = x,
subset(game_df, winner_name == x | loser_name == x))))
rownames(result) <- NULL
result
# playername game_id winner_name loser_name
#1 a 1 a b
#2 a 2 b a
#3 a 3 a c
#4 a 4 a d
#5 a 9 f a
#6 a 11 g a
#7 a 13 a d
#8 b 1 a b
#...
#...

correlation with a reference group in a dataframe

Consider noMissing data frame.
library(lubridate)
set.seed(123)
value <- rnorm(300)
value[sample(1:300,10)]<- NA
b <- rep(c("a","b", "c", "d","e", "f"), each=50)
b[sample(1:300,12)] <- NA
c <- rep(rep(as.character(1:2), each = 25) , 6)
c[sample(1:300,10)] <- NA
datee <- seq(lubridate::ymd("2012-01-01"),lubridate::ymd("2012-01-01") + 24 , by = "days")
datee <- rep(datee, 12)
datee[sample(1:300,20)] <- NA
dataframe <- cbind.data.frame( b, c, datee, value)
noMissing <- dataframe[complete.cases(dataframe),]
head(noMissing)
b c datee value
1 a 1 2012-01-01 -0.56047565
2 a 1 2012-01-02 -0.23017749
3 a 1 2012-01-03 1.55870831
4 a 1 2012-01-04 0.07050839
5 a 1 2012-01-05 0.12928774
6 a 1 2012-01-06 1.71506499
Now I want to group data by columns b, and c then calculate the correlation of each group with group a in b column which has the same dates in datee column as the other group.
For example correlation between b, 1 and the refrenced group a show in the following picture
My initial solution:
b_unique <- unique(noMissing$b)
c_unique <- unique(noMissing$c)
out <- list()
v <- 0
for (i in 1:length(b_unique)) {
v <- v + 1
group <- noMissing[noMissing$b==b_unique[i] & noMissing$c == c_unique[k],]
ref <- noMissing[noMissing$b=="a" & noMissing$c == c_unique[k] ,]
inter <-ymd("1970-01-01") + intersect(ref$datee, group$datee )
x <- cor(group$value[group$datee %in% inter],ref[ref$datee %in% inter , "value"])
out[[v]] <- list(b = b_unique[i], c = c_unique[k], cor = x)
}
}
dplyr::bind_rows(out)
b c cor
<fct> <fct> <dbl>
1 a 1 1.000
2 a 2 1
3 b 1 0.175
4 b 2 -0.247
5 c 1 0.216
6 c 2 0.101
7 d 1 0.159
8 d 2 -0.253
9 e 1 0.177
10 e 2 -0.528
11 f 1 0.179
12 f 2 -0.178
I am seeking good taste coding solutions
You can do the following:
library(data.table)
# convert the data shape to have datewise information across all groups
df <- dcast(data.table(noMissing), datee+c ~ b, value.var='value')
# rename c as c_1 column as there are multiple column with c name
setnames(df, old = 2, new = 'c_1')
# groupby 'c_1' and for each group calculate correlation between b-a, c-a, d-a, e-a and so on
df <- df[,
lapply(.SD[,-c('datee'), with=F], function(x) {
cols <- c('a','b','c','d','e','f')
vals <- vector(mode = 'numeric', length = 6)
for(i in seq(cols)) {vals[i] <- (cor(get(cols[i]), get(('a')), use = complete.obs'))}
return (vals)})
,c_1]
# finally reshape the table as you posted in solution above.
df <- melt(df, id.vars = c('c_1'))
colnames(df) <- c('c','b','cor')
c b cor
1: 1 a 1.00000000
2: 1 a -0.12499728
3: 1 a -0.13133257
4: 1 a 0.02573947
5: 1 a 0.07239559
6: 1 a -0.07421281

Adding new column names to use in spread()

Big picture: I'm trying to set up an export that has one route as a row and columns for each value.
This code: I'm trying to select the top three transfers for each route (using slice(1:3) because I need no more than three values. top_n() allows for ties). Then, I'm trying to spread() to create 6 columns: a name and a pct for each.
If I were to spread the data right now, the names would become columns, but I need to keep the names in the rows (see Desired Output). I want to create the column names as a key column to use to spread(). My approach is creating an error. I'm having trouble thinking of another strategy.
Data frame:
# A tibble: 7 x 3
route_shortname transfer_to pct
<chr> <chr> <dbl>
1 A D 0.5
2 A E 0.5
3 B F 0.667
4 B G 0.333
5 C D 0.111
6 C E 0.111
7 C G 0.111
Desired output:
# A tibble: 3 x 7
route_shortname transfer1 transfer1_pct transfer2 transfer2_pct transfer3 transfer3_pct
<chr> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 A D 0.5 E 0.5 NA NA
2 B F 0.667 G 0.333 NA NA
3 C D 0.111 E 0.111 G 0.111
Reprex:
library(tidyverse)
sample_data <- tibble::tribble(
~route_shortname, ~transfer_to, ~pct,
"A", "D", 0.5,
"A", "E", 0.5,
"B", "F", 0.666666666666667,
"B", "G", 0.333333333333333,
"C", "D", 0.111111111111111,
"C", "E", 0.111111111111111,
"C", "G", 0.111111111111111
)
transfer_to_table <- sample_data %>%
group_by(route_shortname) %>%
mutate(key = c("transfer1", "transfer2", "transfer3"))
#> Error in mutate_impl(.data, dots): Column `key` must be length 2 (the group size) or one, not 3
df = read.table(text = "
route_shortname transfer_to pct
1 A D 0.5
2 A E 0.5
3 B F 0.667
4 B G 0.333
5 C D 0.111
6 C E 0.111
7 C G 0.111
", header=T)
library(tidyverse)
df %>%
group_by(route_shortname) %>%
mutate(id = paste0("transfer", row_number())) %>%
ungroup() %>%
unite(v, transfer_to, pct) %>%
spread(id, v) %>%
separate(transfer1, c("transfer1","transfer1_pct"), sep = "_", convert = T) %>%
separate(transfer2, c("transfer2","transfer2_pct"), sep = "_", convert = T) %>%
separate(transfer3, c("transfer3","transfer3_pct"), sep = "_", convert = T)
# route_shortname transfer1 transfer1_pct transfer2 transfer2_pct transfer3 transfer3_pct
# <fct> <chr> <dbl> <chr> <dbl> <chr> <dbl>
# 1 A D 0.5 E 0.5 NA NA
# 2 B F 0.667 G 0.333 NA NA
# 3 C D 0.111 E 0.111 G 0.111
Though you tagged this question with tidyverse packages, here is an option using dcast from data.table which let's you do the reshaping in one (admittedly long) line.
library(data.table)
setDT(sample_data)
dcast(sample_data, route_shortname ~ rowid(route_shortname), value.var = c('transfer_to', 'pct'))
# route_shortname transfer_to_1 transfer_to_2 transfer_to_3 pct_1 pct_2 pct_3
#1: A D E <NA> 0.5000000 0.5000000 NA
#2: B F G <NA> 0.6666667 0.3333333 NA
#3: C D E G 0.1111111 0.1111111 0.1111111
You could also use reshape from base R
sample_data <- as.data.frame(sample_data) # does not work with tibbles for some reason
sample_data$idx <- with(sample_data,
ave(route_shortname, route_shortname, FUN = seq_along))
reshape(sample_data, idvar = "route_shortname", timevar = "idx", direction = "wide", sep = "_")
# route_shortname transfer_to_1 pct_1 transfer_to_2 pct_2 transfer_to_3 pct_3
#1 A D 0.5000000 E 0.5000000 <NA> NA
#3 B F 0.6666667 G 0.3333333 <NA> NA
#5 C D 0.1111111 E 0.1111111 G 0.1111111
In both cases you'd need to rename columns but I that shouldn't be too hard.

r create new data frame that matches in rows elements grouped by another column

I want to create a new data frame from the df one below. In the new data frame (df2), each element in df$name is placed in the first column and matched in its row with other element of df$name grouped by df$group.
df <- data.frame(group = rep(letters[1:2], each=3),
name = LETTERS[1:6])
> df
group name
1 a A
2 a B
3 a C
4 b D
5 b E
6 b F
In this example, "A", "B", and "C" in df$name belong to "a" in df$group, and I want to put them in the same row in a new data frame. The desired output looks like this:
> df2
V1 V2
1 A B
2 A C
3 B A
4 B C
5 C A
6 C B
7 D E
8 D F
9 E D
10 E F
11 F D
12 F E
We could do this in base R with merge
out <- setNames(subset(merge(df, df, by.x = 'group', by.y = 'group'),
name.x != name.y, select = -group), c("V1", "V2"))
row.names(out) <- NULL
out
# V1 V2
#1 A B
#2 A C
#3 B A
#4 B C
#5 C A
#6 C B
#7 D E
#8 D F
#9 E D
#10 E F
#11 F D
#12 F E
In my opinion its case of self-join. Using dplyr a solution can be as:
library(dplyr)
inner_join(df, df, by="group") %>%
filter(name.x != name.y) %>%
select(V1 = name.x, V2 = name.y)
# V1 V2
# 1 A B
# 2 A C
# 3 B A
# 4 B C
# 5 C A
# 6 C B
# 7 D E
# 8 D F
# 9 E D
# 10 E F
# 11 F D
# 12 F E
df <- data.frame(group = rep(letters[1:2], each=3),
name = LETTERS[1:6])
library(tidyverse)
df %>%
group_by(group) %>% # for every group
summarise(v = list(expand.grid(V1=name, V2=name))) %>% # create all combinations of names
select(v) %>% # keep only the combinations
unnest(v) %>% # unnest combinations
filter(V1 != V2) # exclude rows with same names
# # A tibble: 12 x 2
# V1 V2
# <fct> <fct>
# 1 B A
# 2 C A
# 3 A B
# 4 C B
# 5 A C
# 6 B C
# 7 E D
# 8 F D
# 9 D E
# 10 F E
# 11 D F
# 12 E F

Resources