I have two dataframes that I want to combine, for each possible combination.
Basically, I dataframes like this:
> table1 = data.frame(a1 = c("a","b"), a2 = c("c", "d"))
> table1
a1 a2
1 a c
2 b d
> table2 = data.frame(b1 = c("e", "f"), b2 = c("g", "h"))
> table2
b1 b2
1 e g
2 f h
and I want to get a result like this:
> combinedtable = data.frame(a1 = c("a","a", "b","b"), a2 = c("c", "c", "d", "d"), b1 = c("e", "f","e", "f"), b2 = c("g", "h","g", "h"))
> combinedtable
a1 a2 b1 b2
1 a c e g
2 a c f h
3 b d e g
4 b d f h
Is there a neat way to do this? What I eventually want to do is to run an lapply on the resulting table. Otherwise I need to write a function like:
for each row in X, apply this function for each row in Y.
Combining first seems more efficient.
base R
with(expand.grid(a=seq_len(nrow(table1)), b=seq_len(nrow(table2))),
cbind(table1[a,], table2[b,]))
# a1 a2 b1 b2
# 1 a c e g
# 2 b d e g
# 1.1 a c f h
# 2.1 b d f h
or
merge(table1, table2, by = NULL)
# a1 a2 b1 b2
# 1 a c e g
# 2 b d e g
# 3 a c f h
# 4 b d f h
dplyr
Similar to the by=NULL method, we can do
dplyr::full_join(table1, table2, by = character())
Here is my approach with purrr:
purrr::pmap_dfr(table1, ~ data.frame(..., table2))
Returning:
a1 a2 b1 b2
1 a c e g
2 a c f h
3 b d e g
4 b d f h
The other answers have it for each combination of a1 and a2. I read this question a little differently so just in case: if you want a table with every combination of all levels of all columns:
cbind(table1, table2) %>%
complete(a1, a2, nesting(b1, b2))
Related
I have example data as follows:
library(data.table)
dat <- fread("Survey Variable_codes_2022
D D1
A A1
B B1
B B3
B B2
E E1
B NA
E NA")
For the two rows that have Variable_codes_2022==NA, I would like to increment the variable code so that it becomes:
dat <- fread("Survey Variable_codes_2022
D D1
A A1
B B1
B B3
B B2
E E1
B B4
E E2"
Because the column Variable_codes_2022 is a string variable, the numbers are not in numerical order.
I have no idea where to start and I was wondering if someone could help me on the right track.
We could do it this way:
grouping
arranging and
mutate.
To keep the original order we could first create and id and then rearrange:
library(dplyr)
dat %>%
group_by(Survey) %>%
arrange(.by_group = TRUE) %>%
mutate(Variable_codes_2022 = paste0(Survey, row_number()))
Survey Variable_codes_2022
<chr> <chr>
1 A A1
2 B B1
3 B B2
4 B B3
5 B B4
6 D D1
7 E E1
8 E E2
data.table option using rleid like this:
library(data.table)
dat[, Variable_codes_2022 := paste0(Survey, rleid(Variable_codes_2022)), by = Survey]
dat
#> Survey Variable_codes_2022
#> 1: D D1
#> 2: A A1
#> 3: B B1
#> 4: B B2
#> 5: B B3
#> 6: E E1
#> 7: B B4
#> 8: E E2
Created on 2022-12-01 with reprex v2.0.2
dat <-
structure(list(survey = c("D", "A", "B", "B", "B", "E", "B",
"E", "B"), var_code = c("D1", "A1", "B1", "B3", "B2", "E1", NA,
NA, NA)), row.names = c(NA, -9L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x0000026db10f1ef0>)
library(dplyr)
library(stringr)
dat %>%
group_by(survey) %>%
mutate(
aux1 = as.numeric(stringr::str_remove(var_code,survey)),
aux2 = cumsum(is.na(var_code)),
var_code = paste0(survey,max(aux1,na.rm = TRUE)+aux2)
) %>%
ungroup() %>%
select(-aux1,-aux2)
# A tibble: 9 x 2
survey var_code
<chr> <chr>
1 D D1
2 A A1
3 B B3
4 B B3
5 B B3
6 E E1
7 B B4
8 E E2
9 B B5
This solution with rowid.
Added an extra element to the sample so it can be tested against multiple missings
library(data.table)
#> Warning: package 'data.table' was built under R version 4.2.2
dat <- fread("Survey Variable_codes_2022
D D1
A A1
B B1
B B3
B B2
E E1
B NA
E NA
E NA")
dat[, n := as.numeric(substr(
Variable_codes_2022, nchar(Survey)+1, nchar(Variable_codes_2022)))]
dat[is.na(n),
Variable_codes_2022 := paste0(Survey, rowid(Survey) +
dat[.SD[,.(Survey)], .(m=max(n, na.rm=T)), on = "Survey", by=.EACHI ][,m])]
dat
#> Survey Variable_codes_2022 n
#> 1: D D1 1
#> 2: A A1 1
#> 3: B B1 1
#> 4: B B3 3
#> 5: B B2 2
#> 6: E E1 1
#> 7: B B4 NA
#> 8: E E2 NA
#> 9: E E3 NA
how to make a combination of letters
label=c("A","B","C","D","E")
into a dataframe with 4 group (G1, G2, G3, G4) as follows
k2=data.frame(G1=c("AB","AC","AD","AE","BC","BD","BE","CD","CE","DE"),
G2=c("C","B","B","B","A","A","A","A","A","A"),
G3=c("D","D","C","C","D","C","C","B","B","B"),
G4=c("E","E","E","D","E","E","D","E","D","C"))
and if i want to make group into 3 (G1, G2, G3) and give condition so that "B" and "C" can't separate like below dataframe how to do?
k3=data.frame(G1=c("BCD","BCE","BCA","AE","AD","DE"),
G2=c("A","A","D","BC","BC","BC"),
G3=c("E","D","E","D","E","A"))
Thank you very much for the help
Here is one way to do what you want to do:
a <- t(combn(c("A", "B", "C", "D", "E"), 2))
a <- paste0(a[, 1], a[, 2])
b <- t(apply(a, 1, function(x) setdiff(c("A", "B", "C", "D", "E"), x)))
k2 <- data.frame(a, b)
colnames(k2) <- paste0("G", 1:4)
k2
# G1 G2 G3 G4
# 1 AB C D E
# 2 AC B D E
# 3 AD B C E
# 4 AE B C D
# 5 BC A D E
# 6 BD A C E
# 7 BE A C D
# 8 CD A B E
# 9 CE A B D
# 10 DE A B C
The simplest way to do the second version is to exclude "C" and add it at the end:
d <- t(combn(c("A", "B", "D", "E"), 2))
d <- paste0[d[, 1], d[, 2]]
e <- t(apply(d, 1, function(x) setdiff(c("A", "B", "D", "E"), x)))
k3 <- data.frame(d, e)
colnames(k3) <- paste0("G", 1:3)
k3 <- data.frame(sapply(g, function(x) gsub("B", "BC", x)))
k3
# G1 G2 G3
# 1 ABC D E
# 2 AD BC E
# 3 AE BC D
# 4 BCD A E
# 5 BCE A D
# 6 DE A BC
This does not match your k3 exactly, but it is more consistent with k2.
I have a grouped data set that looks like this:
data = data.frame(group = c(1,1,1,1,2,2,2,2),
c1 = c("A", "E", "A", "J", "L", "M", "L", "J"),
c2 = c("B", "F", "F", "K", "B", "F", "T", "E"),
c3 = c("C", "G", "C", "L", "C", "X", "C", "V"),
c4 = c("D", "H", "I", "M", "D", "T", "I", "W"))
And I need to calculate the number of values in each row that are not duplicated within each group. For example, something that looks like this:
group c1 c2 c3 c4 uniq.vals
1 1 A B C D 2
2 1 E F G H 3
3 1 A F C I 1
4 1 J K L M 4
5 2 L B C D 2
6 2 M F X T 3
7 2 L T C I 1
8 2 J E V W 4
The count for row 1 would be 2, because B and D do not show up in any of the other rows within group 1.
I am familiar with using group_by and summarize but I am having trouble extending that to this particular situation, which requires that each value be checked across multiple columns and rows. For example, n_distinct on its own would not work because I'm looking for non-duplicated values, not unique values.
Ideally the solution would also ignore NAs and not count them as duplicated or non-duplicated values.
Here is an option with tidyverse. Reshape to 'long' format with pivot_longer, grouped by 'group', replace all the duplicate 'value' to NA, then grouped by row number, summarise to get the counts with n_distinct (number of distinct elements), and bind with the original data
library(dplyr)
library(tidyr)
data %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = starts_with('c')) %>%
group_by(group) %>%
mutate(value = replace(value, duplicated(value)|duplicated(value,
fromLast = TRUE), NA)) %>%
group_by(rn) %>%
summarise(uniq.vals = n_distinct(value, na.rm = TRUE), .groups = 'drop') %>%
select(uniq.vals) %>%
bind_cols(data, .)
-output
# group c1 c2 c3 c4 uniq.vals
#1 1 A B C D 2
#2 1 E F G H 3
#3 1 A F C I 1
#4 1 J K L M 4
#5 2 L B C D 2
#6 2 M F X T 3
#7 2 L T C I 1
#8 2 J E V W 4
In base R you would do:
a <- tapply(unlist(data[-1]), data$group[row(data[-1])],table)
data$uniq.vals <- c(by(data, seq(nrow(data)),
function(x)sum(a[[x[,1]]][unlist(x[-1])]<2)))
group c1 c2 c3 c4 uniq.vals
1 1 A B C D 2
2 1 E F G H 3
3 1 A F C I 1
4 1 J K L M 4
5 2 L B C D 2
6 2 M F X T 3
7 2 L T C I 1
8 2 J E V W 4
Note that in your case, row 3 should have 1 since only I is the unique value
I got a nested list l with each item each self is a 2 level list. For example:
l1 = list("a", list("a1"= "a1v"))
l2 = list("b", list("b1" = "b1v", b2 = "b2v"))
l3 = list("c", list("c1" = c("c1v1", "c1v2", "c1v3")))
l = list(l1, l2, l3)
How do I tranform it to a data.frame like this:
df = data.frame(A = c("a", "b", "b", "c", "c", "c"), B= c("a1", "b1", "b2", "c1", "c1", "c1"), C=c("a1v", "b1v", "b2v", "c1v1", "c1v2", "c1v3"))
> df
A B C
1 a a1 a1v
2 b b1 b1v
3 b b2 b2v
4 c c1 c1v1
5 c c1 c1v2
6 c c1 c1v3
Tried with seperate_rows and map_df but both failed to deal with inconsistent dimension of .x[[2]] items.
Update 1:
#akrun's solution is not running for me:
We could use bind_rows with map
library(purrr)
library(dplyr)
library(tidyr)
map_dfr(l, ~bind_cols(.x) %>%
pivot_longer(cols = -1, names_to = 'B', values_to = 'C') %>%
rename_at(1, ~'A'))
# A tibble: 6 x 3
# A B C
#* <chr> <chr> <chr>
#1 a a1 a1v
#2 b b1 b1v
#3 b b2 b2v
#4 c c1 c1v1
#5 c c1 c1v2
#6 c c1 c1v3
If the sample data in your question accurately reflects your actual data, you can try one of the following:
library(data.table)
data.table(l)[, list(names(unlist(l)),
unlist(l, use.names = FALSE))][
, V3 := V2[1], cumsum(V1 == "")][V1 != ""]
## V1 V2 V3
## 1: a1 a1v a
## 2: b1 b1v b
## 3: b2 b2v b
## 4: c11 c1v1 c
## 5: c12 c1v2 c
## 6: c13 c1v3 c
reshape2::melt(setNames(lapply(l, "[[", -1), lapply(l, "[[", 1)))
## value L2 L1
## 1 a1v a1 a
## 2 b1v b1 b
## 3 b2v b2 b
## 4 c1v1 c1 c
## 5 c1v2 c1 c
## 6 c1v3 c1 c
Base R option :
do.call(rbind, lapply(l, function(x) {
data.frame(A = x[[1]], B = unlist(x[[2]]), C = names(x[[2]]))
}))
# A B C
#a1 a a1v a1
#b1 b b1v b1
#b2 b b2v b2
#c11 c c1v1 c1
#c12 c c1v2 c1
#c13 c c1v3 c1
Since this is also one of the solution, I will post it here as well. This one is the one I can relate to.
map_df(l, ~ tibble(A=.x[[1]], B=names(.x[[2]]), C= unlist(.x[[2]])))
Read:
Run through all elements of l and make a data.frame (map_df and ~ inside) from a sub-data.frame created by tibble where column A = ..., B = ..`, ...
Thanks go to:
#akrun for prompt answer, I could have used the solution, but was
too busy to figure out.
#A5C1D2H2I1M1N2O1R2T1 also provided a
performant answer.
#Ronak Shah provided a plain R base
solution that I can translate to this.
I have a dataframe:
df = read.table(text="group X1 X2 X3 X4 X5 X6 X7
P1 H H H H H H H
P1 C D C D B C C
P1 D C B A C D H
P1 D C B A C D D
P1 C D C D B C D
P2 C D B D C D C
P2 H H H H H H H
P2 D C C A B C D
P3 C D C D B C C
P3 H H H H H H H
P3 C D C D B C C
P3 D C B A C D D", header=T, stringsAsFactors=F)
I have another dataframe:
df2 = read.table(text="Group col R S
P1 'X2 X4 X7' 'C A D' 'D D C'
P2 'X2 X3 X4 X6' 'C C A C' 'D B D D'
P3 'X3 X5 X6 X7' 'B C D D' 'C B C C'", header=T, stringsAsFactors=F)
I would like to add a column named "assign" to hold the assignment which is based on df2. For example, if df$group=="P1", then only concatenate columns in df shown in df2$col "P1" row, if all columns have the same letter "H", then assign "H" to the "assign" column; if match the string in df2$R column, assign "R"; if match the string in df2$S column, assign "S"; if not match any three cases as mentioned, then assign "U".
I have tested my script in the group "P1", but I don't know how to return the assigned value to the df and go through the loop. Appreciate any helps.
I expect the result as:
df = read.table(text="group 1 2 3 4 5 6 7 assign
P1 H H H H H H H H
P1 C D C D B C C S
P1 D C B A C D D U
P1 D C B A C D D R
P1 C D C D B C D U
P2 C D B D C D C S
P2 H H H H H H H H
P2 D C C A B C D R
P3 C D C D B C C S
P3 H H H H H H H H
P3 C D C D B C C S
P3 D C B A C D D R
", header=T, stringsAsFactors=F)
You can use data.table and solve your problem in three steps:
merge the data.tables
this is the key step, build a pattern to match later, the cool thing is that we can use a flexible number of .SDcols across the by groups in the data.table
build the assign variable
Here is the code:
# data
require(data.table)
dt = data.table(df)
dt2 = data.table(df2)
# add col_int, a list(!) of col indices, to dt2 for each Group
dt3 = dt2[, list(col_name = strsplit(col, ' ')[[1]]), by = Group]
dt3 = dt3[, col_idx := match(col_name, names(dt))]
dt3 = dt3[, list(col_idx = list(col_idx)), by = Group]
dt2 = merge(dt2, dt3, by = 'Group')
# solution
dt = merge(x = dt,
y = dt2,
by = 'Group')
idx_matching_table = names(dt)
# a: using strings
dt[,
j = pattern := {
.SD[, do.call('paste', c(.SD)), .SDcols = strsplit(col, ' ')[[1]]]
},
by = list(Group, col)]
# b: using indices
dt[,
j = pattern_2 := {
# .SD has less cols (compared to dt), therefore find out what the integer index of col_idx in .SD is:
col_idx_sd = match(idx_matching_table[col_idx[[1]]], names(.SD))
.SD[, do.call('paste', c(.SD)), .SDcols = col_idx_sd]
},
by = list(Group, col)]
dt[, identical(pattern, pattern_2)] # TRUE
dt[, assign := 'U']
dt[pattern %like% '[H ]+H', assign := 'H']
dt[pattern == R, assign := 'R']
dt[pattern == S, assign := 'S']
EDIT I replaced apply(.SD, 1, paste, collapse = ' ') with do.call('paste', c(.SD)) to avoid coercion to matrix.