Join of column values for specific row values - r

I'd like to join (left_join) a tibble (df2) to another one (df1) only where the value of col2 in df1 is NA. I am currently using a code that is not very elegant. Any advice on how to shorten the code would be greatly appreciated!
library(tidyverse)
# df1 contains NAs that need to be replaced by values from df2, for relevant col1 values
df1 <- tibble(col1 = c("a", "b", "c", "d"), col2 = c(1, 2, NA, NA), col3 = c(10, 20, 30, 40))
df2 <- tibble(col1 = c("a", "b", "c", "d"), col2 = c(5, 6, 7, 8), col3 = c(50, 60, 70, 80))
# my current approach
# Step 1: keep the rows of df1 whose col2 is already filled in.
df3 <- df1 %>%
filter(!is.na(col2))
# Step 2: keep only the key (col1) of the rows whose col2 is missing and pull
# every other column from df2. No `by` is given, so left_join() joins on the
# columns both inputs share, which is only col1 after the select().
df4 <- df1 %>%
filter(is.na(col2)) %>%
select(col1)%>%
left_join(df2)
# output tibble that is expected
# Step 3: stack the untouched rows on top of the rows rebuilt from df2.
df_final <- df3 %>%
bind_rows(df4)

Here's a small dplyr answer that works for me, although it might get slow if you have tons of rows:
# Rebuild the rows of df1 that lack col2 from df2 (matched on col1), append
# them to df1, and finally drop the original incomplete rows.
bind_rows(
  df1,
  left_join(select(filter(df1, is.na(col2)), col1), df2, by = "col1")
) %>%
  filter(!is.na(col2))

We can use data.table methods
library(data.table)
# Convert both inputs to data.tables by reference.
setDT(df1)
setDT(df2)
# Update join on col1: keep df1's col2 where present, otherwise take df2's
# value (`i.col2` refers to col2 of the table given in `i`, i.e. df2).
df1[df2, on = "col1", col2 := fcoalesce(col2, i.col2)]
-output
> df1
col1 col2 col3
1: a 1 10
2: b 2 20
3: c 7 30
4: d 8 40
Or an option with tidyverse
library(dplyr)
library(stringr)
# After the join, columns present in both inputs carry the suffixes .x (df1)
# and .y (df2). For every .x column, fall back to its .y partner where the
# .x value is NA, and name the result after the un-suffixed column.
# The suffix is matched as a literal, anchored "[.]x$": a bare ".x" is a
# regex in which "." matches any character, so a name such as "max.x" would
# be rewritten in the wrong place.
df1 %>%
  left_join(df2, by = "col1") %>%
  transmute(
    col1,
    across(
      ends_with(".x"),
      ~ coalesce(., get(str_replace(cur_column(), "[.]x$", ".y"))),
      .names = "{str_remove(.col, '[.]x$')}"
    )
  )
-output
# A tibble: 4 x 3
col1 col2 col3
<chr> <dbl> <dbl>
1 a 1 10
2 b 2 20
3 c 7 30
4 d 8 40

Related

Replacing NA with 0 in columns that contain a substring in the column name

Let's say I had a data frame and I wanted to replace NA's with 0's only in columns that contained a specific string in the column name.
Let's call the dataframe df, the columns are numeric type, and the string "keyword".
i.e., a column name would be, "Column1keyword", etc.
How can I do this?
I've tried this and it didn't work:
# Asker's attempt, quoted as posted (reported as not working).
# NOTE(review): `colnames.is.numeric` and `colnames.x` are not defined anywhere
# in the visible text; presumably meant as a test on the column name - confirm.
df %>%
mutate(across(where(~ colnames.is.numeric(.x) && 'keyword' %in% colnames.x), replace_na, 0))
Update
If it is to select columns having 'keyword' as substring in the column names, use contains to select across those columns
library(dplyr)
library(tidyr)
# Replace NA with 0 only in columns whose *name* contains "keyword".
# The replacement value is supplied through a lambda because handing extra
# arguments to the function via across(...)'s `...` is deprecated.
# NOTE(review): recent tidyr releases cast the replacement to the column type,
# so a character column would need "0" rather than 0 - confirm column types.
df1 <- df1 %>%
  mutate(across(contains("keyword"), ~ replace_na(.x, 0)))
-output
df1
# A tibble: 5 × 4
col1 col2_keyword col3 col4
<int> <chr> <chr> <dbl>
1 1 a a 1
2 2 b b 3
3 3 0 c NA
4 4 c d 5
5 5 d <NA> 6
If instead the OP meant to replace NA only in columns that contain the specific element 'keyword' among their values, use where with a logical expression to select those columns, loop across them, and use replace_na to replace the NAs with 0
# Replace NA only in character columns that contain the value "keyword".
# where() restricts the selection to character columns, so the replacement is
# given as the string "0": current tidyr casts the replacement to the column
# type and refuses to turn the number 0 into a character value. A lambda is
# used because passing extra arguments through across()'s `...` is deprecated.
df <- df %>%
  mutate(
    across(
      where(~ is.character(.x) && "keyword" %in% .x),
      ~ replace_na(.x, "0")
    )
  )
-output
df
# A tibble: 5 × 4
col1 col2 col3 col4
<int> <chr> <chr> <dbl>
1 1 a a 1
2 2 b b 3
3 3 keyword c NA
4 4 0 d 5
5 5 c <NA> 6
data
# df: "keyword" occurs as a *value* in col2.
df <- tibble(
  col1 = 1:5,
  col2 = c("a", "b", "keyword", NA, "c"),
  col3 = c("a", "b", "c", "d", NA),
  col4 = c(1, 3, NA, 5, 6)
)
# df1: "keyword" occurs in a column *name* (col2_keyword).
df1 <- tibble(
  col1 = 1:5,
  col2_keyword = c("a", "b", NA, "c", "d"),
  col3 = c("a", "b", "c", "d", NA),
  col4 = c(1, 3, NA, 5, 6)
)

Merge data frames and divide rows by group

I would like to divide the values from df1 over the values from df2. In this reproducible example, I am able to sum these values. What about the division? Thanks in advance!
# Example inputs: one row per country, one numeric column per year.
df1 <- data.frame(country = c("a", "b", "c"), year1 = c(1, 2, 3), year2 = c(1, 2, 3))
df2 <- data.frame(country = c("a", "b", "d"), year1 = c(1, 2, NA), year2 = c(1, 2, 3))
# Asker's working *sum*: stack both frames, turn NA into 0 in the numeric
# columns, add the values up per country, then turn zeros back into NA.
df3 <- bind_rows(df1, df2) %>%
mutate_if(is.numeric, tidyr::replace_na, 0) %>%
group_by(country) %>%
summarise_all(., sum, na.rm = TRUE) %>%
na_if(., 0)
Expected result is:
# A tibble: 4 x 3
country year1 year2
<chr> <dbl> <dbl>
1 a 1 1
2 b 1 1
3 c NA NA
4 d NA NA
As there are groups with 2 rows and some with 1, use an if/else condition within summarise/across to divide the first element by the last if there are two elements or else return NA
library(dplyr) # version 1.0.4
library(tidyr)
# Stack both frames and replace NA with 0 in the numeric columns (via a lambda;
# passing `0` through across()'s `...` is deprecated in later dplyr versions).
# Per country: if there are exactly two rows (one from each frame), divide the
# first row's value by the last row's; otherwise return NA.
bind_rows(df1, df2) %>%
  mutate(across(where(is.numeric), ~ replace_na(.x, 0))) %>%
  group_by(country) %>%
  summarise(
    across(
      everything(),
      ~ if (n() == 2) first(.x) / last(.x) else NA_real_
    )
  )
-output
# A tibble: 4 x 3
# country year1 year2
#* <chr> <dbl> <dbl>
#1 a 1 1
#2 b 1 1
#3 c NA NA
#4 d NA NA
Here is a base R option using merge + split.default
# Full outer join on country; the year columns shared by both inputs come
# back as <name>.x (from df1) and <name>.y (from df2).
df <- merge(df1, df2, by = "country", all = TRUE)
cbind(
  df[1],
  list2DF(lapply(
    # Group the value columns by base name. Only a trailing ".x"/".y" suffix
    # is removed, so a dot followed by x or y elsewhere in a name is kept.
    split.default(df[-1], sub("\\.[xy]$", "", names(df)[-1])),
    # Each group holds the .x column followed by the .y column.
    function(v) v[[1]] / v[[2]]
  ))
)
which gives
country year1 year2
1 a 1 1
2 b 1 1
3 c NA NA
4 d NA NA

Add 2 dataframes with different lengths in R

I have the following 2 dataframes in R:
df1 = [a,2 df2 = [a,10
b,3] c,2]
I want to add those 2 df, so the output can be
df = [a, 12,
b, 3,
c, 2]
Any advice would be much appreciated, thanks!
We can rbind the two datasets and do a group by sum
# Stack the two data frames, then total col2 within each col1 value.
aggregate(col2 ~ col1, data = rbind(df1, df2), FUN = sum)
-output
# col1 col2
#1 a 12
#2 b 3
#3 c 2
Or in dplyr
library(dplyr)
# Stack the two data frames and add up col2 for every col1 key; the result
# is returned ungrouped.
summarise(
  group_by(bind_rows(df1, df2), col1),
  col2 = sum(col2),
  .groups = "drop"
)
-output
# A tibble: 3 x 2
# col1 col2
# <chr> <dbl>
#1 a 12
#2 b 3
#3 c 2
data
# Example inputs: the key "a" occurs in both frames, "b" and "c" in one each.
df2 <- data.frame(
  col1 = c("a", "c"),
  col2 = c(10, 2)
)
df1 <- data.frame(
  col1 = c("a", "b"),
  col2 = c(2, 3)
)

R group by column, count the combinations observed

I imagine this is already solved in many places, but I lack the right wordage to use to search for a solution. In R I have example data in long format like this:
# Example data in long format: the first (unnamed) column holds the number,
# the second (unnamed) column the letter observed for it.
A = tibble( c(1,2,3,1,2,4,5,5), c('a','b','c','a','f','-','b', 'f'))
and what I want returned is sort of a grouped result (something like a spread?) where I first collect the set of letters that match each number to get something like this.
1: 'a', 'a'
2: 'b', 'f'
3: 'c', 'c'
4: '-'
5: 'b', 'f'
and the actual final result I am looking for is the count of how many times each letter combination is observed:
'a','a': 1
'b','f': 2
'c','c': 1
'-': 1
I can do the last step with group_by() but I mention it here in case there is some magic sauce that does the whole thing.
We can do a group by 'a', then paste the second column while taking the number of distinct elements in 'b' and get the distinct rows
library(dplyr)
library(stringr)
# Collapse the letters of each `a` group into one string, then count how many
# groups produced each string. (n_distinct(b) would instead report the number
# of distinct letters inside a group, which equals the number of times the
# combination occurs only by coincidence in this example.)
A %>%
  group_by(a) %>%
  summarise(out = str_c(b, collapse = ","), .groups = "drop") %>%
  count(out, name = "n")
# A tibble: 4 x 2
# out n
# <chr> <int>
#1 - 1
#2 a,a 1
#3 b,f 2
#4 c 1
data
# Example data as a tibble-classed structure: column a holds the number,
# column b the letter observed for it (8 rows).
A <- structure(list(a = c(1, 2, 3, 1, 2, 4, 5, 5), b = c("a", "b",
"c", "a", "f", "-", "b", "f")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
This is close to what you are looking for:
library(tidyverse)
# Data: v1 is the number, v2 the letter observed for it.
A <- structure(
  list(
    v1 = c(1, 2, 3, 1, 2, 4, 5, 5),
    v2 = c("a", "b", "c", "a", "f", "-", "b", "f")
  ),
  row.names = c(NA, -8L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Code: build one comma-separated chain of letters per v1, then count how
# many v1 values share each chain.
A %>%
  group_by(v1) %>%
  summarise(chain = paste(v2, collapse = ",")) %>%
  count(chain, name = "N")
# A tibble: 4 x 2
chain N
<chr> <int>
1 - 1
2 a,a 1
3 b,f 2
4 c 1
Here is a base R option using nested aggregate
# Inner call: collapse the y values of each x into one string (e.g. "a, a").
# Outer call: count how many x values share each collapsed string.
aggregate(
  . ~ y,
  data = aggregate(y ~ ., data = A, FUN = toString),
  FUN = length
)
which gives
> aggregate(.~y,aggregate(y~.,A,toString),length)
y x
1 - 1
2 a, a 1
3 b, f 2
4 c 1
Data
# Example data: x is the number, y the letter observed for it.
A <- tibble(
  x = c(1, 2, 3, 1, 2, 4, 5, 5),
  y = c("a", "b", "c", "a", "f", "-", "b", "f")
)
Maybe you want to cast the data in wide format and then count the combinations. Try :
library(dplyr)
library(tidyr)
# Number the letters within each v1, spread them into col_1, col_2, ...
# (one row per v1), then count the distinct (col_1, col_2) combinations.
A %>%
  group_by(v1) %>%
  mutate(row = seq_len(n())) %>%
  pivot_wider(
    names_from = row,
    values_from = v2,
    names_prefix = "col_"
  ) %>%
  ungroup() %>%
  count(col_1, col_2)
# col_1 col_2 n
# <chr> <chr> <int>
#1 - NA 1
#2 a a 1
#3 b f 2
#4 c NA 1

Efficient way to compare all columns in data table R

I have two data tables in R which have the same columns (number, name and order) and an ID as follows:
library(data.table)
# Two example tables with the same columns (ids, col1, col2); the ids 1, 2
# and 5 occur in both, id 6 only in dt2.
dt1 <- data.table(ids = c(1, 2, 5), col1 = c("A", "B", "F"), col2 = c("B", "F", "G"))
dt2 <- data.table(ids = c(2, 1, 6, 5), col1 = c("B", "A", "K", "L"), col2 = c("F", "G", "M", "G"))
> dt1
ids col1 col2
1: 1 A B
2: 2 B F
3: 5 F G
> dt2
ids col1 col2
1: 2 B F
2: 1 A G
3: 6 K M
4: 5 L G
I would like to know for every column how many (common) ids have the same value. For example for col1 we have: for ID1 both values are A, for ID2 both values are B and for ID5 the values differ, therefore the end result for this column is 2.
What I have is the following solution:
# Join the tables on ids; dt1's columns keep their names, dt2's columns get
# the "_old" suffix.
joint_dt <- merge(dt1, dt2, by = "ids", suffixes = c("", "_old"))
# Pair each dt1 column with the "_old" column in the same position and count
# the rows where the two values are equal.
comp_res <- mapply(function(x, y) sum(x == y), joint_dt[, 2:ncol(dt1)], joint_dt[, (ncol(dt1) + 1):ncol(joint_dt)])
> comp_res
col1 col2
2 2
Is this the best way to do what I want or am I missing some package or function more designated for this?
Another method is to use inner join to achieve the result:
# For each column, inner-join dt1 to dt2 on (ids, <column>) and return the
# number of matching rows (.N); the names of the input vector label the result.
sapply(
  c(col1 = "col1", col2 = "col2"),
  function(x) {
    dt1[dt2, .N, on = c("ids", x), nomatch = 0L]
  }
)
output:
col1 col2
2 2
Here is some sample data if anyone is interested in timing the code (no tidyverse here to time)
library(data.table)
set.seed(0L)
nr <- 1e6L        # rows per table
nc <- 2L          # value columns per table
nids <- nr / 100  # values are sampled from 1..nids

# dt1: nr x nc sampled values (columns V1.. renamed to col1..), ids = row number.
dt1 <- as.data.table(matrix(sample(nids, nr * nc, replace = TRUE), ncol = nc))
dt1[, ids := seq_len(nr)]
setnames(dt1, sub("^V", "col", names(dt1)))

# dt2: built the same way from the next draws of the random stream.
dt2 <- as.data.table(matrix(sample(nids, nr * nc, replace = TRUE), ncol = nc))
dt2[, ids := seq_len(nr)]
setnames(dt2, sub("^V", "col", names(dt2)))
some timings for data.table solutions:
timing code:
library(microbenchmark)
microbenchmark(
  # mtd0: one inner join per column on (ids, <column>); .N is the match count.
  mtd0 = {
    cols <- setNames(nm = paste0("col", seq_len(nc)))
    sapply(cols, function(x) dt1[dt2, on = c("ids", x), nomatch = 0L, .N])
  },
  # mtd1: melt both tables and update-join them. `ids` is part of the join
  # key so that a value is only compared against the same id; joining on
  # (variable, value) alone also counts values found under a different id.
  # NOTE(review): timings taken with the (variable, value)-only join need
  # to be re-measured.
  mtd1 = melt(dt1, id.vars = "ids")[
    melt(dt2, id.vars = "ids"),
    ids2 := i.ids,
    on = .(ids, variable, value)
  ][!is.na(ids2), .N, by = variable],
  times = 3L
)
timings:
Unit: milliseconds
expr min lq mean median uq max neval cld
mtd0 179.4386 186.3906 195.6833 193.3425 203.8057 214.2689 3 a
mtd1 8306.7968 8373.2351 8467.4561 8439.6734 8547.7858 8655.8982 3 b
An approach using a join on molten data.tables
# Melt both tables to long form (ids, variable, value) and mark the dt1 cells
# for which dt2 has a cell with the same id, column and value; then count the
# marked cells per column. `ids` is part of the join key so that a value is
# only compared against the same id - joining on (variable, value) alone
# would also count a value that occurs under a different id in dt2.
melt(dt1, id.vars = "ids")[
  melt(dt2, id.vars = "ids"),
  ids2 := i.ids,
  on = .(ids, variable, value)
][!is.na(ids2), .N, by = variable][]
variable N
1: col1 2
2: col2 2
Another tidyverse approach:
library(tidyverse)
library(data.table)
dt1 <- data.table(ids = c(1, 2, 5), col1 = c("A", "B", "F"), col2 = c("B", "F", "G"))
dt2 <- data.table(ids = c(2, 1, 6, 5), col1 = c("B", "A", "K", "L"), col2 = c("F", "G", "M", "G"))
# pivot_longer() replaces the superseded gather(): each table becomes one row
# per (ids, col) with its value, the two long tables are matched on ids and
# col, and equal values are counted per column.
dt1 %>%
  pivot_longer(-ids, names_to = "col", values_to = "value1") %>% # reshape dt1
  inner_join(
    pivot_longer(dt2, -ids, names_to = "col", values_to = "value2"), # reshape dt2
    by = c("ids", "col")
  ) %>% # keep ids/col pairs present in both
  group_by(col) %>% # for each col value
  summarise(res = sum(value1 == value2)) # count matches
# # A tibble: 2 x 2
# col res
# <chr> <int>
# 1 col1 2
# 2 col2 2
One tidyverse possibility could be:
# Join on ids so every shared column appears twice (suffixes .x and .y).
dt2 %>%
inner_join(dt1, by = c("ids" = "ids")) %>%
# Long form: one row per id and suffixed column.
gather(var, val, -ids) %>%
# Split e.g. "col1.x" into var = "col1" and temp = "x".
separate(var, c("var", "temp")) %>%
# n is 2 when both tables hold the same value for that id and column.
count(ids, var, val) %>%
group_by(var) %>%
# Number of (id, value) pairs per column that occur more than once.
summarise(n = length(n[n > 1])) %>%
ungroup()
var n
<chr> <int>
1 col1 2
2 col2 2
I think map from purrr is perfect for this in combination with the filtering join semi_join from dplyr that returns rows that exist in both df.
library(purrr)
library(dplyr)
# For each column name, keep the dt1 rows that have a dt2 row with the same
# id and the same value in that column, and report the row count under the
# column's name. The join key is spelled out with `by` (instead of relying on
# the implicit "all shared columns" default and its message), and the column
# held in the string .x is selected with all_of().
map_dfc(
  c("col1", "col2"),
  ~ dt1 %>%
    semi_join(select(dt2, ids, all_of(.x)), by = c("ids", .x)) %>%
    summarise(!!.x := n())
)
Result
col1 col2
1 2 2

Resources