I have two data tables in R which have the same columns (number, name and order) and an ID as follows:
library(data.table)
dt1 <- data.table(ids = c(1, 2, 5), col1 = c("A", "B", "F"), col2 = c("B", "F", "G"))
dt2 <- data.table(ids = c(2, 1, 6, 5), col1 = c("B", "A", "K", "L"), col2 = c("F", "G", "M", "G"))
> dt1
ids col1 col2
1: 1 A B
2: 2 B F
3: 5 F G
> dt2
ids col1 col2
1: 2 B F
2: 1 A G
3: 6 K M
4: 5 L G
I would like to know for every column how many (common) ids have the same value. For example for col1 we have: for ID1 both values are A, for ID2 both values are B and for ID5 the values differ, therefore the end result for this column is 2.
What I have is the following solution:
joint_dt <- merge(dt1, dt2, by = "ids", suffixes = c("", "_old"))
comp_res <- mapply(function(x, y) sum(x == y), joint_dt[, 2:ncol(dt1)], joint_dt[, (ncol(dt1) + 1):ncol(joint_dt)])
> comp_res
col1 col2
2 2
Is this the best way to do what I want or am I missing some package or function more designated for this?
Another method is to use inner join to achieve the result:
sapply(c(col1="col1",col2="col2"), function(x) dt1[dt2, on=c("ids", x), nomatch=0L, .N])
output:
col1 col2
2 2
here is a sample data if anyone is interested to time the codes (no tidyverse here to time)
library(data.table)
set.seed(0L)
nr <- 1e6L
nc <- 2L
nids <- nr/100
dt1 <- as.data.table(matrix(sample(nids, nr*nc, replace=TRUE), ncol=nc))[, ids := 1:nr]
setnames(dt1, names(dt1), gsub("^V", "col", names(dt1)))
dt2 <- as.data.table(matrix(sample(nids, nr*nc, replace=TRUE), ncol=nc))[, ids := 1:nr]
setnames(dt2, names(dt2), gsub("^V", "col", names(dt2)))
some timings for data.table solutions:
timing code:
library(microbenchmark)
microbenchmark(
mtd0={
cols <- structure(paste0("col", seq_len(nc)), names=paste0("col", seq_len(nc)))
sapply(cols, function(x) dt1[dt2, on=c("ids", x), nomatch=0L, .N])
},
mtd1=melt(dt1, id.vars = "ids")[ melt(dt2, id.vars = "ids"), ids2 := i.ids, on = .(variable, value)][
!is.na(ids2), .N, by = variable],
times=3L)
timings:
Unit: milliseconds
expr min lq mean median uq max neval cld
mtd0 179.4386 186.3906 195.6833 193.3425 203.8057 214.2689 3 a
mtd1 8306.7968 8373.2351 8467.4561 8439.6734 8547.7858 8655.8982 3 b
An approach using a join on molten data.tables
melt(dt1, id.vars = "ids")[ melt(dt2, id.vars = "ids"), ids2 := i.ids, on = .(variable, value)][!is.na(ids2), .N, by = variable][]
variable N
1: col1 2
2: col2 2
Another tidyverse approach:
library(tidyverse)
library(data.table)
dt1 <- data.table(ids = c(1, 2, 5), col1 = c("A", "B", "F"), col2 = c("B", "F", "G"))
dt2 <- data.table(ids = c(2, 1, 6, 5), col1 = c("B", "A", "K", "L"), col2 = c("F", "G", "M", "G"))
dt1 %>% gather(col,value1,-ids) %>% # reshape dt1
inner_join(dt2 %>% gather(col,value2,-ids), by=c("ids","col")) %>% # reshape dt2 and join
group_by(col) %>% # for each col value
summarise(res = sum(value1 == value2)) # count matches
# # A tibble: 2 x 2
# col res
# <chr> <int>
# 1 col1 2
# 2 col2 2
One tidyverse possibility could be:
dt2 %>%
inner_join(dt1, by = c("ids" = "ids")) %>%
gather(var, val, -ids) %>%
separate(var, c("var", "temp")) %>%
count(ids, var, val) %>%
group_by(var) %>%
summarise(n = length(n[n > 1])) %>%
ungroup()
var n
<chr> <int>
1 col1 2
2 col2 2
I think map from purrr is perfect for this in combination with the filtering join semi_join from dplyr that returns rows that exist in both df.
library(purrr)
library(dplyr)
map_dfc(c("col1", "col2"),
~dt1 %>%
semi_join(dt2 %>% select("ids", .x)) %>%
summarise(!!.x := n()))
Result
col1 col2
1 2 2
Related
I'd like to join (left_join) a tibble (df2) to another one (df1) only where the value of col2 in df1 is NA. I am currently using a code that is not very elegant. Any advice on how to shorten the code would be greatly appreciated!
library(tidyverse)
# df1 contains NAs that need to be replaced by values from df2, for relevant col1 values
df1 <- tibble(col1 = c("a", "b", "c", "d"), col2 = c(1, 2, NA, NA), col3 = c(10, 20, 30, 40))
df2 <- tibble(col1 = c("a", "b", "c", "d"), col2 = c(5, 6, 7, 8), col3 = c(50, 60, 70, 80))
# my current approach
df3 <- df1 %>%
filter(!is.na(col2))
df4 <- df1 %>%
filter(is.na(col2)) %>%
select(col1)%>%
left_join(df2)
# output tibble that is expected
df_final <- df3 %>%
bind_rows(df4)
Here's a small dplyr answer that works for me, although it might get slow if you have tons of rows:
df1 %>%
filter(is.na(col2)) %>%
select(col1) %>%
left_join(df2, by = "col1") %>%
bind_rows(df1, .) %>%
filter(!is.na(col2))
We can use data.table methods
library(data.table)
setDT(df1)[setDT(df2), col2 := fcoalesce(col2, i.col2), on = .(col1)]
-output
> df1
col1 col2 col3
1: a 1 10
2: b 2 20
3: c 7 30
4: d 8 40
Or an option with tidyverse
library(dplyr)
library(stringr)
df1 %>%
left_join(df2, by = c("col1")) %>%
transmute(col1, across(ends_with(".x"),
~ coalesce(., get(str_replace(cur_column(), ".x", ".y"))),
.names = "{str_remove(.col, '.x')}"))
-output
# A tibble: 4 x 3
col1 col2 col3
<chr> <dbl> <dbl>
1 a 1 10
2 b 2 20
3 c 7 30
4 d 8 40
I want to fill df2 with information from df1.
df1 as below
ID Mutation
1 A
2 B
2 C
3 A
df2 as below
ID A B C
1
2
3
For example, if mutation A is found in ID 1, then I want it in df2 it marked as "Y".
So the df2 result should be
ID A B C
1 Y
2 Y Y
3 Y
I have hundreds of IDs and more than 20 mutations. How can I efficiently achieve this in R? Thanks!
Using data.table you can try
setDT(df)
df2 <- dcast(df,formula = ID~Mutation )
df2[, c("A", "B", "C") := lapply(.SD, function(x) ifelse(is.na(x), " ", "Y")), ID]
df2
#Output
ID A B C
1: 1 Y
2: 2 Y Y
3: 3 Y
Create a new column with value 'Y' and cast the data in wide format.
library(dplyr)
library(tidyr)
df %>%
mutate(value = 'Y') %>%
pivot_wider(names_from = Mutation, values_from = value, values_fill = '')
# ID A B C
# <int> <chr> <chr> <chr>
#1 1 "Y" "" ""
#2 2 "" "Y" "Y"
#3 3 "Y" "" ""
data
df <- structure(list(ID = c(1L, 2L, 2L, 3L), Mutation = c("A", "B",
"C", "A")), class = "data.frame", row.names = c(NA, -4L))
I have 2 dataframes
DF x:
ID A B C
1 x y z
2 x y z
DF y:
ID A B C
1 NA d f
2 e NA NA
I want to join them in such way that the value of x gets overwritten by the value of y, but only if there is a value in y for the matching column in x.
Hence, the outcome of the above should be:
ID A B C
1 x d f
2 e y z
One option is coalesce
library(dplyr)
left_join(dfx, dfy, by = 'ID') %>%
transmute(ID, A = coalesce(A.y, A.x),
B = coalesce(B.y, B.x),
C = coalesce(C.y, C.x))
# ID A B C
#1 1 x d f
#2 2 e y z
Or if there are many columns, reshape it to 'long' format, do the coalesce and then reshape into 'wide' format
library(tidyr)
left_join(dfx, dfy, by = 'ID') %>%
pivot_longer(cols = -ID, names_to = c("group", ".value"), names_sep = "\\.") %>%
mutate(x = coalesce(y, x)) %>%
select(-y) %>%
pivot_wider(names_from = group, values_from = x)
Or another option is to bind_rows the two datasets and then do a group_by summarise (assuming one row per 'ID')
bind_rows(dfy, dfx) %>%
group_by(ID) %>%
summarise_at(vars(-group_cols()), ~ first(.[!is.na(.)]))
Or using a join on data.table
library(data.table)
nm1 <- names(dfx)[-1]
nm2 <- paste0("i.", nm1)
setDT(dfy)[dfx, (nm1) := Map(coalesce, mget(nm1), mget(nm2)), on = .(ID)]
dfy
data
dfx <- structure(list(ID = 1:2, A = c("x", "x"), B = c("y", "y"), C = c("z",
"z")), class = "data.frame", row.names = c(NA, -2L))
dfy <- structure(list(ID = 1:2, A = c(NA, "e"), B = c("d", NA), C = c("f",
NA)), class = "data.frame", row.names = c(NA, -2L))
Using base R, we can get the index of non-NA values in dfy and replace the corresponding values of dfx with it.
#Rearrange dfy if you have columns in different order than dfx
#dfy <- dfy[names(dfx)]
inds <- which(!is.na(dfy[-1]), arr.ind = TRUE)
dfx[-1][inds] <- dfy[-1][inds]
dfx
# ID A B C
#1 1 x d f
#2 2 e y z
This is a more complex version of a previous question where I had abstracted the actual problem too much to apply the answers.
R convert tidy hierarchical data frame to hierarchical list
I've converted a hierarchical data frame with two grouping levels into a hierarchical list-grid using a for loop.
Is there a more efficient base R, tidyverse or other approach to achieve this?
In the real dataset:
The grouping variables and description are multi word strings.
The description preface - d# - is in the MWE for ease of checking.
There are 14 associated variables variously of type: character, integer and double
Rules
Group 1 and Group 2 headings to be in description column
Group 1 headings to appear once only
Group 2 heading are children of group 1 heading, and only change when there is a new group 2 heading
Descriptions are children of group 2 headings
From this
g1 g2 desc var1 var2 var3
A a d1 KS3 0.0500 2 PLs
A a d2 CTI 0.0500 9 7O0
A b d3 b8x 0.580 5 he2
A b d4 XOf 0.180 12 XJE
A b d5 ygn 0.900 11 v48
A c d6 dGY 0.770 6 UcH
A d d7 jpG 0.600 4 P5M
B d d8 Z95 0.600 10 j6O
To this
desc var1 var2 var3
A
a
d1 KS3 0.0500 2 PLs
d2 CTI 0.0500 9 7O0
b
d3 b8x 0.580 5 he2
d4 XOf 0.180 12 XJE
d5 ygn 0.900 11 v48
c
d6 dGY 0.770 6 UcH
d
d7 jpG 0.600 4 P5M
B
d
Code
library(tidyverse)
library(stringi)
set.seed(2018)
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = paste0("d", 1:12, " ", stri_rand_strings(12, 3)),
var1 = round(runif(12), 2),
var2 = sample.int(12),
var3 = stri_rand_strings(12, 3))
tib
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = ncol(tib)-1)) %>%
rename(id = V1, desc = V2, var1 = V3, var2 = V4, var3 = V5) %>%
mutate(id = NA_character_,
desc = NA_character_,
var1 = NA_real_,
var2 = NA_integer_,
var3 = NA_character_)
# Loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$id[[output_row]] <- "g1"
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$id[[output_row]] <- "g2"
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data grid
output$desc[[output_row]] <- tib$desc[[i]]
output$var1[[output_row]] <- tib$var1[[i]]
output$var2[[output_row]] <- tib$var2[[i]]
output$var3[[output_row]] <- tib$var3[[i]]
output_row <- output_row + 1
}
output
Adapting the answer from tyluRp R convert tidy hierarchical data frame to hierarchical list I've hit on a solution.
library(tidyverse)
library(stringi)
set.seed(2018)
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = paste0("d", 1:12, " ", stri_rand_strings(12, 3)),
var1 = round(runif(12), 2),
var2 = sample.int(12),
var3 = stri_rand_strings(12, 3))
# add unique identifier for description and variable rows
tib <-
tib %>%
rowid_to_column() %>%
mutate(rowid = paste0("z_", rowid))
# separate tibble for variables associated with descriptions
tib_var <-
tib %>%
select(rowid, var1, var2, var3)
# code adapted from tyluRp to reorder the data and add description variables
tib <-
tib %>%
select(g1, g2, desc, rowid) %>%
mutate(g2 = paste(g1, g2, sep = "_")) %>%
transpose() %>%
unlist() %>%
stack() %>%
distinct(values, ind) %>%
mutate(detect_var = str_detect(values, "^z_"),
ind = lead(case_when(detect_var == TRUE ~ values)),
values = case_when(detect_var == TRUE ~ NA_character_,
TRUE ~ values))%>%
drop_na(values) %>%
select(values, ind) %>%
mutate(values = str_remove(values, "\\D_")) %>%
left_join(tib_var, by = c("ind" = "rowid")) %>%
select(-ind) %>%
replace_na(list(var1 = "", var2 = "", var3 = ""))
Let's say we have two data frames:
df1 <- data.frame(A = letters[1:3], B = letters[4:6], C = letters[7:9], stringsAsFactors = FALSE)
A B C
1 a d g
2 b e h
3 c f i
df2 <- data.frame(V1 = 1:3, V2 = 4:6, V3 = 7:9)
V1 V2 V3
1 1 4 7
2 2 5 8
3 3 6 9
I need to build a function that takes as input a single value or a vector containing elements from one of the data frames and returns the elements from the other data frame according to their positional indexes.
The function should work like this:
> matchdf(values = c("a", "e", "i"), dfin = df1, dfout = df2)
[1] 1 5 9
> matchdf(values = c(1, 5, 9), dfin = df2, dfout = df1)
[1] "a" "e" "i"
> matchdf(values = c(1, 1, 1), dfin = df2, dfout = df1)
[1] "a" "a" "a"
This is what I have tried so far:
requiere(dplyr)
toVec <- function(df) df %>% as.matrix %>% as.vector
matchdf <- function(values, dfin, dfout) toVec(dfout)[toVec(dfin) %in% values]
# But sometimes the output values aren't in correct order:
> matchdf(c("c", "i", "h"), dt1, dt2)
[1] 3 8 9
# should output 3 9 8
> matchdf(values = c("a", "a", "a"), dfin = dt1, dfout = dt2)
[1] 1
# Should output 1 1 1
Feel free to use data.table or/and dplyr if it eases the task. I would prefer a solution without for loops.
Assumptions:
elements from df1 are different from df2
dim(df1) = dim(df2)
matchdf <- function(values, dfin, dfout){
unlist(sapply(values,
function(val) dfout[dfin == val],
USE.NAMES = F)
)
}
matchdf(c("c", "i", "h"), df1, df2)
#should output 3 9 8
[1] 3 9 8
matchdf(values = c("a", "a", "a"), dfin = df1, dfout = df2)
#should output 1 1 1
[1] 1 1 1
matchdf(values = c("X", "Y", "a"), dfin = df1, dfout = df2)
#should output vector, not list
[1] 1