Joining two incomplete data.tables with the same column names - r

I have two incomplete data.tables with the same column names.
dt1 <- data.table(id = c(1, 2, 3), v1 = c("w", "x", NA), v2 = c("a", NA, "c"))
dt2 <- data.table(id = c(2, 3, 4), v1 = c(NA, "y", "z"), v2 = c("b", "c", NA))
They look like this:
dt1
id v1 v2
1: 1 w a
2: 2 x <NA>
3: 3 <NA> c
> dt2
id v1 v2
1: 2 <NA> b
2: 3 y c
3: 4 z <NA>
Is there a way to merge the two by filling in the missing info?
This is the result I'm after:
id v1 v2
1: 1 w a
2: 2 x b
3: 3 y c
4: 4 z <NA>
I've tried various data.table joins and merges, but I either get the columns repeated:
> merge(dt1,
+ dt2,
+ by = "id",
+ all = TRUE)
id v1.x v2.x v1.y v2.y
1: 1 w a <NA> <NA>
2: 2 x <NA> <NA> b
3: 3 <NA> c y c
4: 4 <NA> <NA> z <NA>
or the rows repeated:
> merge(dt1,
+ dt2,
+ by = names(dt1),
+ all = TRUE)
id v1 v2
1: 1 w a
2: 2 <NA> b
3: 2 x <NA>
4: 3 <NA> c
5: 3 y c
6: 4 z <NA>
Both data.tables have the same column names.

You can group by ID and get the unique values after omitting NAs, i.e.
library(data.table)
merge(dt1, dt2, all = TRUE)[,
lapply(.SD, function(i)na.omit(unique(i))),
by = id][]
# id v1 v2
#1: 1 w a
#2: 2 x b
#3: 3 y c
#4: 4 z <NA>

You could also start out with rbind():
rbind(dt1, dt2)[, lapply(.SD, \(x) unique(x[!is.na(x)])), by = id]
# id v1 v2
# <num> <char> <char>
# 1: 1 w a
# 2: 2 x b
# 3: 3 y c
# 4: 4 z <NA>

First do a full_join, then group_by id, fill the missing values within each group in both directions, and keep one row per id:
library(dplyr)
library(tidyr)
dt1 %>%
full_join(dt2, by = c("id", "v1", "v2")) %>%
group_by(id) %>%
fill(starts_with('v'), .direction = 'updown') %>%
slice(1) %>%
ungroup
Output:
# A tibble: 4 × 3
id v1 v2
<dbl> <chr> <chr>
1 1 w a
2 2 x b
3 3 y c
4 4 z NA

Related

New column containing string that appears the most in the row

I'm trying to create a column with the string that appears most often in each row, and another column with the number of times that most prevalent string appears.
To illustrate my question, this is what I'm trying to achieve:
My actual DF
What I'm trying to obtain:
most prevalent category and count
example df:
d
f <- data.frame(ID = 1:4,
V1 = c("A","B","C","D"),
V2 = c("A", "B","D","B"),
V3 = c("A","C","D","B"))
Here is another way:
count <- sapply(apply(f[, -1], 1, table), max)
count
# [1] 3 2 2 2
category <- names(sapply(apply(f[, -1], 1, table), which.max))
category
# [1] "A" "B" "D" "B"
f2 <- data.frame(f, category, count)
f2
# ID V1 V2 V3 category count
# 1 1 A A A A 3
# 2 2 B B C B 2
# 3 3 C D D D 2
# 4 4 D B B B 2
df <- data.frame(ID = 1:4,
V1 = c("A","B","C","D"),
V2 = c("A", "B","D","B"),
V3 = c("A","C","D","B"))
library(data.table)
setDT(df)
other <- melt(df, id.vars = "ID", measure.vars = c("V1", "V2", "V3"))
other <- other[, .N, by = .(ID, value)]
colnames(other) <- c("ID", "category", "count")
other <- other[, .SD[which.max(count)], by = .(ID)]
res <- merge(df, other, by = c("ID"))
res
We can use dplyr's rowwise() function to apply table() to the values of each row across columns V1:V3:
library(dplyr)
df |> rowwise() |>
mutate(category = names(table(c_across(V1:V3)))[which.max(table(c_across(V1:V3)))] ,
count = max(table(c_across(V1:V3))))
Output
# A tibble: 4 × 6
# Rowwise:
ID V1 V2 V3 category count
<int> <chr> <chr> <chr> <chr> <int>
1 1 A A A A 3
2 2 B B C B 2
3 3 C D D D 2
4 4 D B B B 2

How to merge two data.tables with complementary column data in one go?

I have two data.tables whose v2 columns are complementary:
set.seed(1234)
v1 <- sample(1:20, 5)
v2a <- c(1:2,NA,NA,NA)
v2b <- c(NA,NA,3:5)
id <- c(letters[1:5])
library(data.table)
dt1 <- data.table(id = id, v1=v1,v2=v2a)
dt2 <- data.table(id = id, v2=v2b)
dt1
id v1 v2
1: a 16 1
2: b 5 2
3: c 12 NA
4: d 15 NA
5: e 9 NA
dt2
id v2
1: a NA
2: b NA
3: c 3
4: d 4
5: e 5
The goal is to merge the two data.tables and have column v2 with the proper values without NA.
I managed to get the correct result either with:
dt <- rbindlist(list(dt1,dt2), use.names = T, fill = T)
dt <- dt[,v2:= sum(v2, na.rm = T), by = id]
dt <- dt[!is.na(v1)]
or:
dt <- merge(dt1, dt2, by = "id", all = T)
dt[, v2:=sum(v2.x, v2.y, na.rm = T), by = id][, v2.x := NULL][,v2.y := NULL]
both giving the correct desired result:
dt
id v1 v2
1: a 16 1
2: b 5 2
3: c 12 3
4: d 15 4
5: e 9 5
Is there an easier/one go way to do it?
The code below updates the values of dt1$v2 where is.na(dt1$v2) == TRUE with the values of dt2$v2, matched on id.
dt1[is.na(v2), v2 := dt2[ dt1[is.na(v2),], v2, on = .(id)] ][]
id v1 v2
1: a 16 1
2: b 5 2
3: c 12 3
4: d 15 4
5: e 9 5
There is another, less convoluted approach which uses the fcoalesce() function which was introduced with data.table v1.12.4 (on CRAN 03 Oct 2019):
dt1[dt2, on = .(id), v2 := fcoalesce(x.v2, i.v2)][]
id v1 v2
1: a 16 1
2: b 5 2
3: c 12 3
4: d 15 4
5: e 9 5
dt1[dt2, on = .(id), v2 := fcoalesce(v2, i.v2)][]
works as well because
dt1[dt2, on = .(id)]
returns
id v1 v2 i.v2
1: a 16 1 NA
2: b 5 2 NA
3: c 12 NA 3
4: d 15 NA 4
5: e 9 NA 5

Generate columns based on a particular column, but filled with the value of other column

I have a dataset with the structure shown below, and I need to generate new columns named after the values of v2 but filled with the corresponding values of v3. How can I get this done? The desired result is shown below.
df <- data_frame(v1 = c(3, 3, 2, 2, 3, 1, 1, 1, 0), v2 = c("a", "b", "a", "c", "c", "b", "c", "a", "a"), v3 = c("y", "y", "n","y", "n","y", "y",NA, "n"))
> df
# A tibble: 9 x 3
v1 v2 v3
<dbl> <chr> <chr>
1 3 a y
2 3 b y
3 2 a n
4 2 c y
5 3 c n
6 1 b y
7 1 c y
8 1 a NA
9 0 a n
The desired outcome: within each group of v1, take the value of v3 from the row where v2 == "a" and generate a column v_a with that value. Apply the same logic to the other values of v2.
# A tibble: 9 x 4
v1 v2 v3 v_a ...
<dbl> <chr> <chr> <chr>
1 3 a y y
2 3 b y y
3 2 a n n
4 2 c y n
5 3 c n y
6 1 b y NA
7 1 c y NA
8 1 a NA NA
9 0 a n n
We can reshape the data to wide format and then join it back to the original data:
library(dplyr)
df %>%
tidyr::pivot_wider(names_from = v2, values_from = v3, names_prefix = 'v_') %>%
left_join(df, by = 'v1')
# A tibble: 9 x 6
# v1 v_a v_b v_c v2 v3
# <dbl> <chr> <chr> <chr> <chr> <chr>
#1 3 y y n a y
#2 3 y y n b y
#3 3 y y n c n
#4 2 n NA y a n
#5 2 n NA y c y
#6 1 NA y y b y
#7 1 NA y y c y
#8 1 NA y y a NA
#9 0 n NA NA a n
To get the names with a suffix instead of a prefix (a_v rather than v_a), we can use:
cols<- unique(df$v2)
df %>%
tidyr::pivot_wider(names_from = v2, values_from = v3) %>%
left_join(df, by = 'v1') %>%
rename_at(vars(cols), ~paste0(., '_v'))
An option using data.table:
uv <- setDT(df)[, unique(v2)]
df[, paste0(uv, "_v") := lapply(uv, function(x)
if(any(v2==x)) v3[v2==x] else NA_character_), v1]
output:
v1 v2 v3 a_v b_v c_v
1: 3 a y y y n
2: 3 b y y y n
3: 2 a n n <NA> y
4: 2 c y n <NA> y
5: 3 c n y y n
6: 1 b y <NA> y y
7: 1 c y <NA> y y
8: 1 a <NA> <NA> y y
9: 0 a n n <NA> <NA>

unique rows in data frame based on if's

I have a large data table (millions of rows) that I need to trim down to one row per ID. The rule is: if an ID has any art other than "X",
the rows with art "X" should be deleted for that ID. But if an ID has no art other than "X", one "X" row should stay.
Test dataset:
dt <- data.table(
ID=c(1,1,1,2,2,3,4,4),
art=c("X", "Y", "X", "X", "X", "X", "Z", "X"),
redskb=c("a", "Y", "a", "b", "b", "c", "k", "n")
)
ID art redskb
1: 1 X a
2: 1 Y Y
3: 1 X a
4: 2 X b
5: 2 X b
6: 3 X c
7: 4 X k
8: 4 Z n
Required output:
ID art redskb
1: 1 Y Y
2: 2 X b
3: 3 X c
4: 4 Z n
I tried with
unique(dt, by = c("ID"))
but could not get it to work efficiently with if's.
I'd try something like this:
unique(dt)[, `:=`(flag, if (.N == 1) TRUE else art != "X"), ID][(flag)]
## ID art redskb flag
## 1: 1 Y Y TRUE
## 2: 2 X b TRUE
## 3: 3 X c TRUE
## 4: 4 Z k TRUE
data.table:
dt[order(ID,art=="X"),.SD[1],ID]
or @Frank's version:
unique(dt[order(ID,art == "X")], by="ID")
# ID art redskb
# 1: 1 Y Y
# 2: 2 X b
# 3: 3 X c
# 4: 4 Z k
dplyr:
dt %>% group_by(ID) %>% slice(which.max(art != "X"))
# # A tibble: 4 x 3
# # Groups: ID [4]
# ID art redskb
# <dbl> <fctr> <chr>
# 1 1 Y Y
# 2 2 X b
# 3 3 X c
# 4 4 Z k
We can do
dt[dt[, .I[if(uniqueN(art) >1 & any(art == "X")) art!="X" else seq_len(.N)==1], ID]$V1]
# ID art redskb
#1: 1 Y Y
#2: 2 X b
#3: 3 X c
#4: 4 Z k

Filling missing value in group

I have a data frame where some of the values are missing:
A 1
A NA
A NA
B NA
B 2
B NA
C NA
C NA
C NA
How can I fill in the missing values within each group that has at least one non-missing value?
You can also use fill from tidyr:
library(dplyr)
library(tidyr)
df1 %>%
group_by(ID) %>%
fill(v1) %>%
fill(v1, .direction = "up")
Result:
# A tibble: 9 x 2
# Groups: ID [3]
ID v1
<chr> <int>
1 A 1
2 A 1
3 A 1
4 B 2
5 B 2
6 B 2
7 C NA
8 C NA
9 C NA
Credits to @akrun for the dput data.
Alternative solution, though perhaps a bit flawed in how many assumptions it makes:
library(dplyr)
y %>%
group_by(V1) %>%
arrange(V2) %>%
mutate(V2 = V2[1])
# Source: local data frame [9 x 2]
# Groups: V1 [3]
# V1 V2
# (chr) (int)
# 1 A 1
# 2 A 1
# 3 A 1
# 4 B 2
# 5 B 2
# 6 B 2
# 7 C NA
# 8 C NA
# 9 C NA
We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)), grouped by 'ID', we assign (:=) the column 'v1' as the first non-NA value.
library(data.table)
setDT(df1)[, v1:= v1[!is.na(v1)][1L] , by = ID]
df1
# ID v1
#1: A 1
#2: A 1
#3: A 1
#4: B 2
#5: B 2
#6: B 2
#7: C NA
#8: C NA
#9: C NA
Or using only base R
with(df1, ave(v1, ID, FUN = function(x)
replace(x, is.na(x), x[!is.na(x)][1L])))
#[1] 1 1 1 2 2 2 NA NA NA
data
df1 <- structure(list(ID = c("A", "A", "A", "B", "B", "B", "C", "C",
"C"), v1 = c(1L, NA, NA, NA, 2L, NA, NA, NA, NA)), .Names = c("ID",
"v1"), class = "data.frame", row.names = c(NA, -9L))

Resources