I have a list that tells you how many possible mates an individual has. It looks like this:
list:
$`A1`
[1] "D2" "E2" "F2" "H2"
$`B1`
[1] "G2" "I2" "J2" "K2" "L2"
$`C1`
[1] "J2" "M2" "N2" "O2" "P2"
[6] "Q2" "R2" "S2"
So for example, the possible mates of individual A1 are individual D2, individual E2, individual F2, and individual H2.
I want to turn this into a data frame that pairs up an individual to its possible mates. So I want something like this:
df:
Female ID Mate ID
A1 D2
A1 E2
A1 F2
A1 H2
B1 G2
B1 I2
B1 J2
B1 K2
B1 L2
C1 J2
C1 M2
C1 N2
C1 O2
C1 P2
C1 Q2
C1 R2
C1 S2
I like purrr::map_df for this problem:
Your data
L <- list(A1 = c("D2", "E2", "F2", "H2"),
          B1 = c("G2", "I2", "J2", "K2", "L2"),
          C1 = c("J2", "M2", "N2", "O2", "P2", "Q2", "R2", "S2"))
Solution
library(purrr)
map_df(L, ~data.frame("Mate.ID" = .x), .id="Female.ID")
# Female.ID Mate.ID
# 1 A1 D2
# 2 A1 E2
# 3 A1 F2
# 4 A1 H2
# 5 B1 G2
# etc
I like map_df because of the useful .id argument that attaches the name of the list-entry.
(This can also be written...)
map_df(L, function(i) data.frame("Mate.ID" = i), .id="Female.ID")
(...the ~ formula is purrr shorthand for an anonymous function, with .x standing for its argument, i.e. the current list element.)
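A side note, not from the original answer: in purrr 1.0.0 and later, map_df() is listed as superseded. Assuming you are on such a recent purrr, a rough sketch of the equivalent map() plus list_rbind() pipeline looks like this:
library(purrr)   # assuming purrr >= 1.0.0 for list_rbind()
library(tibble)
L %>%
  map(~ tibble(Mate.ID = .x)) %>%          # one small tibble per female
  list_rbind(names_to = "Female.ID")       # bind rows, keep the list names as a column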
One solution using just base R:
mylist <- list(A1 = c("D2", "E2", "F2", "H2"),
               B1 = c("G2", "I2", "J2", "K2", "L2"),
               C1 = c("J2", "M2", "N2", "O2", "P2", "Q2", "R2", "S2"))
mydf <- lapply(seq_along(mylist), function(i) {
  data.frame(`Female ID` = names(mylist)[i], `Mate ID` = mylist[[i]],
             stringsAsFactors = FALSE, check.names = FALSE)
})
mydf <- do.call(rbind, mydf)
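For comparison (my addition, not part of the answer above), the same table can be built in base R without the explicit lapply(), using rep() and lengths():
# Repeat each female ID once per mate, then flatten the mate vectors.
mydf2 <- data.frame(`Female ID` = rep(names(mylist), lengths(mylist)),
                    `Mate ID`   = unlist(mylist, use.names = FALSE),
                    stringsAsFactors = FALSE, check.names = FALSE)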
You can use stack from base R -
L <- list(A1 = c("D2", "E2", "F2", "H2"),
          B1 = c("G2", "I2", "J2", "K2", "L2"),
          C1 = c("J2", "M2", "N2", "O2", "P2", "Q2", "R2", "S2"))
result <- stack(L)
names(result) <- c("MaleID", "FemaleID")
result
MaleID FemaleID
1 D2 A1
2 E2 A1
3 F2 A1
4 H2 A1
5 G2 B1
6 I2 B1
7 J2 B1
8 K2 B1
9 L2 B1
10 J2 C1
11 M2 C1
12 N2 C1
13 O2 C1
14 P2 C1
15 Q2 C1
16 R2 C1
17 S2 C1
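Note that stack() always puts the values in the first column and the list names in the second. If you want the female first, as in the layout asked for, a small follow-up step (my addition) reorders the columns:
# Put the list-name (female) column first to match the requested layout.
result <- result[, c("FemaleID", "MaleID")]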
L <- list(A1 = c("D2", "E2", "F2", "H2"),
          B1 = c("G2", "I2", "J2", "K2", "L2"),
          C1 = c("J2", "M2", "N2", "O2", "P2", "Q2", "R2", "S2"))
library(tidyverse)
library(plyr)
ldply(L, data.frame) %>%
  set_names("Female ID", "Mate ID")
Female ID Mate ID
1 A1 D2
2 A1 E2
3 A1 F2
4 A1 H2
5 B1 G2
6 B1 I2
7 B1 J2
8 B1 K2
9 B1 L2
10 C1 J2
11 C1 M2
12 C1 N2
13 C1 O2
14 C1 P2
15 C1 Q2
16 C1 R2
17 C1 S2
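One caveat worth adding: attaching plyr after dplyr masks several dplyr verbs, so you may prefer a plyr-free variant. A sketch (my addition) using tibble::enframe() and tidyr::unnest() gives the same shape:
library(tibble)
library(tidyr)
# enframe() turns the named list into a two-column tibble (name + list-column),
# unnest() then expands the list-column into one row per mate.
enframe(L, name = "Female ID", value = "Mate ID") %>%
  unnest(`Mate ID`)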
I have this dataframe:
df <- structure(list(col1 = c("Z2", "A2", "B2", "C2", "A2", "E2", "F2", "G2"),
                     col2 = c("Z2", "Z2", "A2", "B2", "C2", "D2", "A2", "F2"),
                     col3 = c("A2", "B2", "C2", "D2", "E2", "F2", "G2", "Z2")),
                class = "data.frame", row.names = c(NA, -8L))
> df
col1 col2 col3
1 Z2 Z2 A2
2 A2 Z2 B2
3 B2 A2 C2
4 C2 B2 D2
5 A2 C2 E2
6 E2 D2 F2
7 F2 A2 G2
8 G2 F2 Z2
I would like to explicitly use filter, across and str_detect in a tidyverse setting to keep all rows that have a value starting with "A" in any of col1:col3.
Expected result:
col1 col2 col3
1 Z2 Z2 A2
2 A2 Z2 B2
3 B2 A2 C2
4 A2 C2 E2
5 F2 A2 G2
I have tried:
library(dplyr)
library(stringr)
df %>%
filter(across(c(col1, col2, col3), ~str_detect(., "^A")))
This gives:
[1] col1 col2 col3
<0 rows> (or 0-length row.names)
I want to learn why this code is not working using filter, across and str_detect!
We can use if_any here, because across combines the per-column results with an AND condition, i.e. every selected column would have to meet the condition for a row to be kept.
library(dplyr)
library(stringr)
df %>%
filter(if_any(everything(), ~str_detect(., "^A")))
-output
col1 col2 col3
1 Z2 Z2 A2
2 A2 Z2 B2
3 B2 A2 C2
4 A2 C2 E2
5 F2 A2 G2
According to ?across
if_any() and if_all() apply the same predicate function to a selection of columns and combine the results into a single logical vector: if_any() is TRUE when the predicate is TRUE for any of the selected columns, if_all() is TRUE when the predicate is TRUE for all selected columns.
across() supersedes the family of "scoped variants" like summarise_at(), summarise_if(), and summarise_all().
if_any()/if_all() are not part of those scoped variants.
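To see the contrast on this example (my addition), the if_all() version reproduces the empty result of the original across() attempt, because no row starts with "A" in every column:
# AND across all columns: no row satisfies this, hence 0 rows,
# which is exactly what filter(across(...)) returned above.
df %>%
  filter(if_all(everything(), ~str_detect(., "^A")))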
This question already has answers here:
How to create a consecutive group number
(13 answers)
Closed 1 year ago.
I have a dataframe for multiple products and different date ranges. I want to assign unique value to each date so that even if the starting dates are different for various products, I can group by the dates.
df
acc product date
a1 p1 d1
a1 p1 d2
a1 p1 d3
a1 p1 d4
a1 p2 d1
a1 p2 d2
a1 p2 d3
a1 p3 d3
a1 p3 d4
I want to arrange the dates so that there is a unique identifier each for d1, d2, d3 etc.
I used the following code to try this:
df <- df %>% group_by(acc, product) %>% mutate(t = row_number())
Output
df
acc product date t EXPECTED
a1 p1 d1 1 1
a1 p1 d2 2 2
a1 p1 d3 3 3
a1 p1 d4 4 4
a1 p2 d1 1 1
a1 p2 d2 2 2
a1 p2 d3 3 3
a1 p3 d3 1 3
a1 p3 d4 2 4
Any suggestions for this?
Use dplyr::dense_rank():
df %>% mutate(new = dense_rank(date))
acc product date new
1 a1 p1 d1 1
2 a1 p1 d2 2
3 a1 p1 d3 3
4 a1 p1 d4 4
5 a1 p2 d1 1
6 a1 p2 d2 2
7 a1 p2 d3 3
8 a1 p3 d3 3
9 a1 p3 d4 4
If, however, you want to restart the ranks for each acc, use group_by() before the mutate() statement, as sketched below.
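A sketch of that grouped variant (my addition):
df %>%
  group_by(acc) %>%                 # restart the ranking within each account
  mutate(new = dense_rank(date)) %>%
  ungroup()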
dput used
df <- structure(list(acc = c("a1", "a1", "a1", "a1", "a1", "a1", "a1", "a1", "a1"),
                     product = c("p1", "p1", "p1", "p1", "p2", "p2", "p2", "p3", "p3"),
                     date = c("d1", "d2", "d3", "d4", "d1", "d2", "d3", "d3", "d4")),
                class = "data.frame", row.names = c(NA, -9L))
To create the sample needed:
require(pacman)
p_load(data.table)
DT_start <- data.table(ID = c(1,1,1,2,2,2), valueA = c("a1","a2","a3","b1","b2","b3"), valueB = c("A1","A2","A3","B1","B2","B3"))
DT_end <- data.table(ID = c(1, 2),
                     T01_valueA = c("a1", "b1"),
                     T02_valueA = c("a2", "b2"),
                     T03_valueA = c("a3", "b3"),
                     T01_valueB = c("A1", "B1"),
                     T02_valueB = c("A2", "B2"),
                     T03_valueB = c("A3", "B3"))
setcolorder(DT_end, c("ID","T01_valueA","T01_valueB","T02_valueA","T02_valueB","T03_valueA","T03_valueB"))
I have:
> DT_start
ID valueA valueB
1: 1 a1 A1
2: 1 a2 A2
3: 1 a3 A3
4: 2 b1 B1
5: 2 b2 B2
6: 2 b3 B3
I need:
> DT_end
ID T01_valueA T01_valueB T02_valueA T02_valueB T03_valueA T03_valueB
1: 1 a1 A1 a2 A2 a3 A3
2: 2 b1 B1 b2 B2 b3 B3
How can I achieve this? Basically, transpose DT_start to DT_end with customized names: T01, T02, T03...
Using the input DT in the Note at the end, we create a within-ID sequence column s, melt it to long form, and then dcast it back to the desired wide form. (The dcast formula could alternately be written as ID ~ s + variable.)
library(data.table)
DT[, s := sprintf("T%02d", seq_along(.I)), ID]
m <- melt(DT, id.vars = c("ID", "s"))
dcast(m, ID ~ ...)
giving:
ID T01_valueA T01_valueB T02_valueA T02_valueB T03_valueA T03_valueB
1: 1 a1 A1 a2 A2 a3 A3
2: 2 b1 B1 b2 B2 b3 B3
Note:
Input used:
library(data.table)
DF <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L),
                     valueA = c("a1", "a2", "a3", "b1", "b2", "b3"),
                     valueB = c("A1", "A2", "A3", "B1", "B2", "B3")),
                class = "data.frame", row.names = c(NA, -6L))
DT <- as.data.table(DF)
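If you prefer the tidyverse, a rough equivalent (my addition, assuming tidyr >= 1.0 for pivot_wider() and its names_glue argument) is:
library(dplyr)
library(tidyr)
# Build the within-ID sequence, then widen both value columns at once.
# names_glue produces the "T01_valueA"-style names; note the columns come
# out grouped by value (all valueA first, then valueB), unlike setcolorder above.
DT %>%
  group_by(ID) %>%
  mutate(s = sprintf("T%02d", row_number())) %>%
  ungroup() %>%
  pivot_wider(names_from = s,
              values_from = c(valueA, valueB),
              names_glue = "{s}_{.value}")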
This question already has answers here:
How to join (merge) data frames (inner, outer, left, right)
(13 answers)
Simultaneously merge multiple data.frames in a list
(9 answers)
Closed 4 years ago.
I am trying to combine two data.tables in R based on a common ID but with varying columns, and I also want to collapse the duplicated ID rows. My data look like this:
dt1                  dt2
ID X1 Y1 Z1          ID X2 Y2 Z2
 1 a1 a2 a3           1 A1 A2 A3
 2 b1 b2 b3           2 B1 NA B3
 3 c1 c2 NA           3 C1 C2 C3
 4 d1 d2 d3           5 E1 E2 E3
 6 f1 f2 f3           6 F1 F2 F3
Using rbind(dt1, dt2, fill = TRUE) gives me:
dt_merged
ID X1 Y1 Z1 X2 Y2 Z2
1 a1 a2 a3 NA NA NA
1 NA NA NA A1 A2 A3
2 b1 b2 b3 NA NA NA
2 NA NA NA B1 NA B3
3 c1 c2 NA NA NA NA
3 NA NA NA C1 C2 C3
4 d1 d2 d3 NA NA NA
5 NA NA NA E1 E2 E3
6 f1 f2 f3 NA NA NA
6 NA NA NA F1 F2 F3
My problem is now that I don't know how to merge the duplicated ID rows and fill in the NAs with the corresponding data from the other row. My desired output data.table would be:
ID X1 Y1 Z1 X2 Y2 Z2
1 a1 a2 a3 A1 A2 A3
2 b1 b2 b3 B1 NA B3
3 c1 c2 NA C1 C2 C3
4 d1 d2 d3 NA NA NA
5 NA NA NA E1 E2 E3
6 f1 f2 f3 F1 F2 F3
I hope my description is good enough to give you an overview of my problem. Any kind of help would be highly appreciated, and excuse my foolish question; data.table wrangling sometimes gives me a very hard time.
Simply do a full join. It is very simple with the dplyr package (or with merge(), which works the same way in base R and data.table); both are shown below as dt3 and dt4.
library(dplyr)
dt1 <- data.frame("ID" = c(1,2,3,4,6),
"X1" = c("a1", "b1", "c1", "d1", "f1"),
"Y1" = c("a2", "b2", "c2", "d2", "f2"),
"Z1" = c("a3", "b3", NA, "d3", "f3")
)
dt2 <- data.frame("ID" = c(1,2,3,5,6),
"X2" = c("A1", "B1", "C1", "E1", "F1"),
"Y2" = c("A2", NA, "C2", "E2", "F2"),
"Z2" = c("A3", "B3", "C3", "E3", "F3")
)
dt3 <- full_join(x = dt1, y = dt2, by = "ID") %>%
arrange(ID)
dt4 <- merge(dt1, dt2, by = "ID", all = TRUE)
dt3
dt4
Updated:
If you ever need to join more tables (as per OP's comment), just chain them:
dt5 <- data.frame("ID" = c(1,3,4,5,7),
"X3" = c("A1", "C1", "D1", "E1","G1"),
"Y3" = c(NA, "C2", "D2", "E2", "G2"),
"Z3" = c("A3","C3", "D3", "E3", NA)
)
dt6 <- full_join(x = dt1, y = dt2, by = "ID") %>%
full_join( x = ., y = dt5, by = "ID") %>%
arrange(ID)
dt6
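If the number of tables keeps growing, a sketch (my addition) that folds full_join() over a whole list avoids repeating the chain by hand:
# Base R Reduce(): join every table in the list on the common ID key.
dt_all <- Reduce(function(x, y) full_join(x, y, by = "ID"),
                 list(dt1, dt2, dt5))
# Or, equivalently, with purrr:
# dt_all <- purrr::reduce(list(dt1, dt2, dt5), full_join, by = "ID")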
I have the list below.
Suppose I want 1 element from group1, 2 from group2, 3 from group3, and 1 each from groups 4-6. What's the most R-like way to get all the different element combinations if elements are not allowed to repeat?
So for instance:
(A1, B1, B2, C1, C2, C3, D1, E1, F1) is OK but (A1, B1, B1, C1, C2, C3, D1, E1, F1) is not.
itemNames <- list(group1 = c("A1", "A2", "A3", "A4", "A5", "A6"),
                  group2 = c("B1", "B2", "B3", "B4", "B5", "B6"),
                  group3 = c("C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12"),
                  group4 = c("D1", "D2", "D3", "D4", "D5", "D6"),
                  group5 = c("E1", "E2", "E3", "E4"),
                  group6 = c("F1", "F2", "F3", "F4"))
Obviously, I could do this with 9 nested for loops, but that would be a waste. I was playing with melt and reshape2 but haven't gotten anywhere. Thanks!
You can use the Map/Reduce functional combination: Map combn over the groups to obtain each group's combinations, then Reduce them with a version of expand.grid that does not flatten.
expand.grid.XY <- function(X, Y)
  cbind(X[rep(1:nrow(X), nrow(Y)), ], Y[rep(1:nrow(Y), each = nrow(X)), ])
combos <- function(items, reps)
  Reduce(expand.grid.XY, Map(function(...) t(combn(...)), items, reps))
dim(combos(itemNames,c(1,2,3,1,1,1)))
# [1] 1900800 9
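As a quick sanity check (my addition), that row count matches the product of the per-group binomial coefficients:
# choose(6,1) * choose(6,2) * choose(12,3) * choose(6,1) * choose(4,1) * choose(4,1)
prod(mapply(choose, lengths(itemNames), c(1, 2, 3, 1, 1, 1)))
# [1] 1900800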
Another iteration
ex <- c(1,2,3,1,1,1)
lst <- lapply(seq(itemNames), function(i)
  combn(itemNames[[i]], ex[i], toString))
out <- do.call("expand.grid", lst)
head(out)
# Var1 Var2 Var3 Var4 Var5 Var6
#1 A1 B1, B2 C1, C2, C3 D1 E1 F1
#2 A2 B1, B2 C1, C2, C3 D1 E1 F1
#3 A3 B1, B2 C1, C2, C3 D1 E1 F1
#4 A4 B1, B2 C1, C2, C3 D1 E1 F1
#5 A5 B1, B2 C1, C2, C3 D1 E1 F1
#6 A6 B1, B2 C1, C2, C3 D1 E1 F1
dim(out)
#[1] 1900800 6
prod(sapply(lst, length))
#[1] 1900800
itemNames <- list(group1 = c("A1", "A2", "A3", "A4", "A5", "A6"),
                  group2 = c("B1", "B2", "B3", "B4", "B5", "B6"),
                  group3 = c("C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12"),
                  group4 = c("D1", "D2", "D3", "D4", "D5", "D6"),
                  group5 = c("E1", "E2", "E3", "E4"),
                  group6 = c("F1", "F2", "F3", "F4"))
f <- function(x, n) {
  tmp <- t(combn(length(x), n))
  p <- function(...) paste(..., sep = ',')
  do.call('p', lapply(1:n, function(xx) as.matrix(x)[tmp[, xx]]))
}
tmp <- Map(f, itemNames, c(1,2,3,1,1,1))
Reduce(`*`, Map(choose, sapply(tmp, length), 1))
# [1] 1900800
dim(out <- expand.grid(tmp))
# [1] 1900800 6
format(object.size(out), units = 'Mb')
# [1] "43.5 Mb"
head(out)
# group1 group2 group3 group4 group5 group6
# 1 A1 B1B2 C1C2C3 D1 E1 F1
# 2 A2 B1B2 C1C2C3 D1 E1 F1
# 3 A3 B1B2 C1C2C3 D1 E1 F1
# 4 A4 B1B2 C1C2C3 D1 E1 F1
# 5 A5 B1B2 C1C2C3 D1 E1 F1
# 6 A6 B1B2 C1C2C3 D1 E1 F1
out <- apply(out, 1, paste0, collapse = ',')
(out <- strsplit(out, ','))[1:5]
# [[1]]
# [1] "A1" "B1" "B2" "C1" "C2" "C3" "D1" "E1" "F1"
#
# [[2]]
# [1] "A2" "B1" "B2" "C1" "C2" "C3" "D1" "E1" "F1"
#
# [[3]]
# [1] "A3" "B1" "B2" "C1" "C2" "C3" "D1" "E1" "F1"
#
# [[4]]
# [1] "A4" "B1" "B2" "C1" "C2" "C3" "D1" "E1" "F1"
#
# [[5]]
# [1] "A5" "B1" "B2" "C1" "C2" "C3" "D1" "E1" "F1"
No duplicates:
any(duplicated(out))
# [1] FALSE
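If a rectangular form is handier downstream (my addition), the list of picks can be bound into a character matrix:
# One row per combination, nine columns (the 1 + 2 + 3 + 1 + 1 + 1 picks).
mat <- do.call(rbind, out)
dim(mat)
# [1] 1900800 9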