unnest_auto and unnest_longer to unnest multiple columns - r

I have a nested dataframe that I'm trying to unnest. Here's a fake example of the structure.
# Reproducible example (dput output): `variable` is a list-column of
# data.frames whose row counts and column sets differ between elements
# (element 4 carries extra columns b_, i_, l_ and an empty data.frame e_).
df <- structure(list(`_id` = c("a", "b", "c", "d"),
variable = list(structure(list(type = c("u", "a", "u", "a", "u", "a", "a"),
m_ = c("m1",
"m2",
"m3",
"m4",
"m5",
"m6", "m7"), #omitted from original example by mistake
t_ = c("2015-07-21 4:13 PM",
"2016-04-21 7:25 PM",
"2017-10-04 9:49 PM",
"2018-12-04 12:29 PM",
"2019-04-20 20:20 AM",
"2016-05-20 12:00 AM",
"2016-06-20 12:00 AM"),
a_ = c(NA,
"",
NA,
"",
NA,
"C",
"C")),
class = "data.frame",
row.names = c(NA, 7L)),
structure(list(type = c("u", "a"),
m_ = c("m1",
"m2"),
t_ = c("2018-05-24 12:08 AM",
"2019-04-24 3:05 PM"),
a_ = c(NA, "")),
class = "data.frame",
row.names = 1:2),
structure(list(type = "u",
m_ = "m1",
t_ = "2018-02-17 3:14 PM"),
class = "data.frame",
row.names = 1L),
structure(list(type = "u",
m_ = "m1",
t_ = "2016-05-27 5:14 PM",
b_ = "b1",
i_ = "i1",
e_ = structure(list(),
.Names = character(0),
class = "data.frame",
row.names = c(NA, -1L)),
l_ = "l1"),
class = "data.frame",
row.names = 1L)),
myDate = structure(c(1521503311.992,
1521514011.161,
1551699584.65,
1553632693.94),
class = c("POSIXct", "POSIXt"))),
row.names = c(1L, 2L, 3L, 4L),
class = "data.frame")
# Inspect interactively (RStudio/IDE data viewer).
View(df)
variable is a list of dataframes that vary in length (max fields is 7 in this example, but can expand over time).
I tried using the development version of tidyr to take advantage of the new unnest_auto() function.
# Requires the development version of tidyr at the time of writing:
# devtools::install_github("tidyverse/tidyr")
# unnest_auto() expands the list-column, guessing longer vs wider for us.
df2 <- unnest_auto(df, variable)
View(df2)
If I use unnest_longer on the result and specify one column like type I get it to expand.
df3 <- unnest_longer(df2, type)
I don't see any arguments to unnest_longer() that handle multiple columns. Is there a better approach?

Here, since you're unnesting a two dimensional structure (i.e. you want to change both the rows and columns), you can just use unnest:
library(tidyr)
# Convert to a tibble for the compact list-column printing shown below.
df <- as_tibble(df)
df
#> # A tibble: 4 × 3
#> `_id` variable myDate
#> <chr> <list> <dttm>
#> 1 a <df [7 × 4]> 2018-03-19 18:48:31
#> 2 b <df [2 × 4]> 2018-03-19 21:46:51
#> 3 c <df [1 × 3]> 2019-03-04 05:39:44
#> 4 d <df [1 × 7]> 2019-03-26 15:38:13
# unnest() changes rows and columns in one step: each nested data.frame's
# columns become columns and its rows become rows of the result.
df |>
unnest(variable)
#> # A tibble: 11 × 10
#> `_id` type m_ t_ a_ b_ i_ e_ l_ myDate
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <df[> <chr> <dttm>
#> 1 a u m1 2015-07-… <NA> <NA> <NA> <NA> 2018-03-19 18:48:31
#> 2 a a m2 2016-04-… "" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 3 a u m3 2017-10-… <NA> <NA> <NA> <NA> 2018-03-19 18:48:31
#> 4 a a m4 2018-12-… "" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 5 a u m5 2019-04-… <NA> <NA> <NA> <NA> 2018-03-19 18:48:31
#> 6 a a m6 2016-05-… "C" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 7 a a m7 2016-06-… "C" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 8 b u m1 2018-05-… <NA> <NA> <NA> <NA> 2018-03-19 21:46:51
#> 9 b a m2 2019-04-… "" <NA> <NA> <NA> 2018-03-19 21:46:51
#> 10 c u m1 2018-02-… <NA> <NA> <NA> <NA> 2019-03-04 05:39:44
#> 11 d u m1 2016-05-… <NA> b1 i1 l1 2019-03-26 15:38:13
If you did want to do it in two steps, you could take advantage of the fact that unnest_longer() now takes a tidyselect specification:
# Two-step alternative: widen first, then lengthen the parallel
# list-columns together — unnest_longer() accepts a tidyselect spec
# (here the range type:a_).
df |>
unnest_wider(variable) |>
unnest_longer(type:a_)
#> # A tibble: 11 × 10
#> `_id` type m_ t_ a_ b_ i_ e_ l_ myDate
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <df[> <chr> <dttm>
#> 1 a u m1 2015-07-… <NA> <NA> <NA> <NA> 2018-03-19 18:48:31
#> 2 a a m2 2016-04-… "" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 3 a u m3 2017-10-… <NA> <NA> <NA> <NA> 2018-03-19 18:48:31
#> 4 a a m4 2018-12-… "" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 5 a u m5 2019-04-… <NA> <NA> <NA> <NA> 2018-03-19 18:48:31
#> 6 a a m6 2016-05-… "C" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 7 a a m7 2016-06-… "C" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 8 b u m1 2018-05-… <NA> <NA> <NA> <NA> 2018-03-19 21:46:51
#> 9 b a m2 2019-04-… "" <NA> <NA> <NA> 2018-03-19 21:46:51
#> 10 c u m1 2018-02-… <NA> <NA> <NA> <NA> 2019-03-04 05:39:44
#> 11 d u m1 2016-05-… <NA> b1 i1 l1 2019-03-26 15:38:13

This appears to work:
# NOTE(review): the warning below suggests cols = c(type, m_, t_, a_, e_),
# but the call that follows also adds l_ so that column is unnested too.
df %>% unnest_auto(variable) %>% unnest()
#Warning message:
#`cols` is now required.
#Please use `cols = c(type, m_, t_, a_, e_)`
df %>% unnest_auto(variable) %>% unnest(cols = c(type, m_, t_, a_, e_, l_))
# A tibble: 11 x 10
`_id` type m_ t_ a_ b_ i_ e_ l_ myDate
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <???> <chr> <dttm>
1 a u m1 2015-… NA NA NA NA NA 2018-03-20 02:48:31
2 a a m2 2016-… "" NA NA NA NA 2018-03-20 02:48:31
3 a u m3 2017-… NA NA NA NA NA 2018-03-20 02:48:31
4 a a m4 2018-… "" NA NA NA NA 2018-03-20 02:48:31
5 a u m5 2019-… NA NA NA NA NA 2018-03-20 02:48:31
6 a a m6 2016-… C NA NA NA NA 2018-03-20 02:48:31
7 a a m7 2016-… C NA NA NA NA 2018-03-20 02:48:31
8 b u m1 2018-… NA NA NA NA NA 2018-03-20 05:46:51
9 b a m2 2019-… "" NA NA NA NA 2018-03-20 05:46:51
10 c u m1 2018-… NA NA NA NA NA 2019-03-04 14:39:44
11 d u m1 2016-… NA b1 i1 NA l1 2019-03-26 23:38:13

Related

List of differing length, named vectors into data frame - R

I have a list of named vectors containing dates in R and I'd like to convert it into a data frame where columns will be named after vector names. Please note that due to differing vector lengths there will be NAs in the final data frame.
# Reproducible example (dput output): a list of named Date vectors of
# differing lengths; note element 2 contains the name event11 twice and
# element 5 repeats event10 — relevant for the answers below.
d5 <- list(structure(c(event13 = 1797, event17 = 7006, event3 = -6796
), class = "Date"), structure(c(event3 = 5984, event14 = 7175,
event11 = -4031, event20 = 8612, event2 = 5158, event5 = 1002,
event8 = -382, event15 = 4367, event11 = 2960), class = "Date"),
structure(c(event7 = 4394, event13 = -1389, event9 = -4407
), class = "Date"), structure(c(event5 = 7729), class = "Date"),
structure(c(event4 = -3384, event10 = 1288, event10 = 7502,
event5 = -5100, event9 = -3177, event8 = -4027, event11 = -3554,
event16 = 9484, event3 = 1386), class = "Date"), structure(c(event10 = -6906,
event3 = 6966, event8 = -975, event14 = -3286, event12 = 744,
event11 = 7111, event15 = 9576, event12 = 2223, event9 = 10771
), class = "Date"), structure(c(event16 = 8764), class = "Date"),
structure(c(event15 = 9795, event14 = 7681, event2 = -1728,
event14 = 10876), class = "Date"), structure(c(event13 = -1341,
event11 = 1202), class = "Date"), structure(c(event3 = -666,
event1 = -4192, event9 = 8808, event4 = -1765), class = "Date"))
> d5
[[1]]
event13 event17 event3
"1974-12-03" "1989-03-08" "1951-05-25"
[[2]]
event3 event14 event11 event20 event2 event5 event8 event15
"1986-05-21" "1989-08-24" "1958-12-19" "1993-07-31" "1984-02-15" "1972-09-29" "1968-12-15" "1981-12-16"
event11
"1978-02-08"
[[3]]
event7 event13 event9
"1982-01-12" "1966-03-14" "1957-12-08"
[[4]]
event5
"1991-03-01"
...
The output would look like this:
ID
event1
event2
event3
...
1
date1
NA
date3
...
Using do.call with rbind returns the following error:
as.data.frame(do.call(rbind, d5))
event3 event14 event11 event20 event2 event5 event8 event15 event11
1 1797 7006 -6796 1797 7006 -6796 1797 7006 -6796
2 5984 7175 -4031 8612 5158 1002 -382 4367 2960
3 4394 -1389 -4407 4394 -1389 -4407 4394 -1389 -4407
4 7729 7729 7729 7729 7729 7729 7729 7729 7729
5 -3384 1288 7502 -5100 -3177 -4027 -3554 9484 1386
6 -6906 6966 -975 -3286 744 7111 9576 2223 10771
7 8764 8764 8764 8764 8764 8764 8764 8764 8764
8 9795 7681 -1728 10876 9795 7681 -1728 10876 9795
9 -1341 1202 -1341 1202 -1341 1202 -1341 1202 -1341
10 -666 -4192 8808 -1765 -666 -4192 8808 -1765 -666
Warning message:
In (function (..., deparse.level = 1) :
number of columns of result is not a multiple of vector length (arg 8)
How to convert it efficiently (without equalising length of each vector by adding NAs and resorting them to match the columns)?
Does this satisfy your needs? Note that due to your second element, there are two columns event11.
# Turn each named Date vector into a one-row data.table (as.list() keeps
# the names as column names), then bind all rows with fill = TRUE so
# events absent from an element become NA. Duplicated names within one
# element (event11) produce duplicated columns in the result.
res <- data.table::rbindlist(lapply(d5, function(x) data.table::as.data.table(as.list(x))),
fill = TRUE,
use.names = TRUE)
res[1:5]
#> event13 event17 event3 event14 event11 event20 event2
#> 1: 1974-12-03 1989-03-08 1951-05-25 <NA> <NA> <NA> <NA>
#> 2: <NA> <NA> 1986-05-21 1989-08-24 1958-12-19 1993-07-31 1984-02-15
#> 3: 1966-03-14 <NA> <NA> <NA> <NA> <NA> <NA>
#> 4: <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 5: <NA> <NA> 1973-10-18 <NA> 1960-04-09 <NA> <NA>
#> event5 event8 event15 event11 event7 event9 event4
#> 1: <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 2: 1972-09-29 1968-12-15 1981-12-16 1978-02-08 <NA> <NA> <NA>
#> 3: <NA> <NA> <NA> <NA> 1982-01-12 1957-12-08 <NA>
#> 4: 1991-03-01 <NA> <NA> <NA> <NA> <NA> <NA>
#> 5: 1956-01-15 1958-12-23 <NA> <NA> <NA> 1961-04-21 1960-09-26
#> event10 event10 event16 event12 event12 event14 event1
#> 1: <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 2: <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 3: <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 4: <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 5: 1973-07-12 1990-07-17 1995-12-20 <NA> <NA> <NA> <NA>
library(tidyr)
library(dplyr)
# Make each element of the list a dataframe with two columns: x and event_name
# (as.data.frame() on a bare vector names the value column "x", which is
# why values_from = x is used below).
dataframed <- lapply(d5, function(x) {
out <- as.data.frame(x)
out$event_name <- names(x)
out
})
# Make a unique dataframe from this list of dataframes
unlisted <- do.call(rbind, dataframed)
row.names(unlisted) <- NULL
# pivot this dataframe to have event_name as column names
# (the per-event_name id gives pivot_wider unique (id, name) pairs, so
# repeated event names go to separate rows instead of list-columns)
unlisted |>
group_by(event_name) |>
mutate(id = seq_len(n())) |>
ungroup() |>
pivot_wider(names_from = event_name, values_from = x) |>
select(-id)
#> # A tibble: 5 × 17
#> event13 event17 event3 event14 event11 event20 event2
#> <date> <date> <date> <date> <date> <date> <date>
#> 1 1974-12-03 1989-03-08 1951-05-25 1989-08-24 1958-12-19 1993-07-31 1984-02-15
#> 2 1966-03-14 NA 1986-05-21 1961-01-02 1960-04-09 NA 1965-04-09
#> 3 1966-05-01 NA 1973-10-18 1991-01-12 1989-06-21 NA NA
#> 4 NA NA 1989-01-27 1999-10-12 1973-04-17 NA NA
#> 5 NA NA 1968-03-06 NA NA NA NA
#> # … with 10 more variables: event5 <date>, event8 <date>, event15 <date>,
#> # event12 <date>, event7 <date>, event9 <date>, event4 <date>,
#> # event10 <date>, event16 <date>, event1 <date>
Created on 2022-10-12 with reprex v2.0.2
With base R this is an option
Get all event names
nn <- unique(unlist(sapply(d5, function(x) names(x))))
Iterate through all event names and name the columns
# Index each element by the full name vector (names absent from an
# element yield NA), format the Dates as "YYYY-MM-DD" strings, transpose
# so list elements become rows, then restore the event names as colnames.
# NOTE(review): for names duplicated within one element (event11),
# x[nn] returns only the first match — confirm that is acceptable.
res <- data.frame(t(sapply(d5, function(x) strftime(x[nn]))))
colnames(res) <- nn
res
event13 event17 event3 event14 event11 event20 event2
1 1974-12-03 1989-03-08 1951-05-25 <NA> <NA> <NA> <NA>
2 <NA> <NA> 1986-05-21 1989-08-24 1958-12-19 1993-07-31 1984-02-15
3 1966-03-14 <NA> <NA> <NA> <NA> <NA> <NA>
4 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
5 <NA> <NA> 1973-10-18 <NA> 1960-04-09 <NA> <NA>
6 <NA> <NA> 1989-01-27 1961-01-02 1989-06-21 <NA> <NA>
7 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
8 <NA> <NA> <NA> 1991-01-12 <NA> <NA> 1965-04-09
9 1966-05-01 <NA> <NA> <NA> 1973-04-17 <NA> <NA>
10 <NA> <NA> 1968-03-06 <NA> <NA> <NA> <NA>
event5 event8 event15 event7 event9 event4 event10
1 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2 1972-09-29 1968-12-15 1981-12-16 <NA> <NA> <NA> <NA>
3 <NA> <NA> <NA> 1982-01-12 1957-12-08 <NA> <NA>
4 1991-03-01 <NA> <NA> <NA> <NA> <NA> <NA>
5 1956-01-15 1958-12-23 <NA> <NA> 1961-04-21 1960-09-26 1973-07-12
6 <NA> 1967-05-02 1996-03-21 <NA> 1999-06-29 <NA> 1951-02-04
7 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
8 <NA> <NA> 1996-10-26 <NA> <NA> <NA> <NA>
9 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
10 <NA> <NA> <NA> <NA> 1994-02-12 1965-03-03 <NA>
event16 event12 event1
1 <NA> <NA> <NA>
2 <NA> <NA> <NA>
3 <NA> <NA> <NA>
4 <NA> <NA> <NA>
5 1995-12-20 <NA> <NA>
6 <NA> 1972-01-15 <NA>
7 1993-12-30 <NA> <NA>
8 <NA> <NA> <NA>
9 <NA> <NA> <NA>
10 <NA> <NA> 1958-07-11
Assuming no duplicated events in each element of the list:
>dplyr::bind_rows(d5)
# A tibble: 10 × 17
event13 event17 event3 event14 event11 event20 event2 event5 event8
<date> <date> <date> <date> <date> <date> <date> <date> <date>
1 1974-12-03 1989-03-08 1951-05-25 NA NA NA NA NA NA
2 NA NA 1986-05-21 1989-08-24 1958-12-19 1993-07-31 1984-02-15 1972-09-29 1968-12-15
3 1966-03-14 NA NA NA NA NA NA NA NA
4 NA NA NA NA NA NA NA 1991-03-01 NA
5 NA NA 1973-10-18 NA 1960-04-09 NA NA 1956-01-15 1958-12-23
6 NA NA 1989-01-27 1961-01-02 1989-06-21 NA NA NA 1967-05-02
7 NA NA NA NA NA NA NA NA NA
8 NA NA NA 1991-01-12 NA NA 1965-04-09 NA NA
9 1966-05-01 NA NA NA 1973-04-17 NA NA NA NA
10 NA NA 1968-03-06 NA NA NA NA NA NA
# … with 8 more variables: event15 <date>, event7 <date>, event9 <date>, event4 <date>, event10 <date>,
# event16 <date>, event12 <date>, event1 <date>
Here, d5 is manually edited to remove duplicated entries:
# Same list, manually edited so no element repeats an event name,
# which lets dplyr::bind_rows(d5) work directly (see above).
d5 <- list(structure(c(event13 = 1797, event17 = 7006, event3 = -6796
), class = "Date"), structure(c(event3 = 5984, event14 = 7175,
event11 = -4031, event20 = 8612, event2 = 5158, event5 = 1002,
event8 = -382, event15 = 4367), class = "Date"), structure(c(event7 = 4394,
event13 = -1389, event9 = -4407), class = "Date"), structure(c(event5 = 7729), class = "Date"),
structure(c(event4 = -3384, event10 = 1288, event5 = -5100,
event9 = -3177, event8 = -4027, event11 = -3554, event16 = 9484,
event3 = 1386), class = "Date"), structure(c(event10 = -6906,
event3 = 6966, event8 = -975, event14 = -3286, event11 = 7111,
event15 = 9576, event12 = 2223, event9 = 10771), class = "Date"),
structure(c(event16 = 8764), class = "Date"), structure(c(event15 = 9795,
event14 = 7681, event2 = -1728), class = "Date"), structure(c(event13 = -1341,
event11 = 1202), class = "Date"), structure(c(event3 = -666,
event1 = -4192, event9 = 8808, event4 = -1765), class = "Date"))

data.table roll "nearest" left join for single best match (rest to NA)

I have two data.tables with different number of rows. I would like to left join by matching on a single column so that first dt dt1 keeps all rows. Only best nearest values from second dt2 should be joined.
Minimal data:
library(data.table)
set.seed(42)
# Ten sorted random timestamps shared by both tables.
timestamp <- sort(rnorm(10, mean = 1, sd = 1))
# Left table: one row per id; timestamp1 duplicates the join key so the
# original value survives the join.
dt1 <- data.table(
id = letters[1:10],
timestamp = timestamp,
timestamp1 = timestamp,
other1 = 1:10,
other2 = 11:20
)
# Right table: only 3 rows, each offset +0.1 from timestamps 3, 5 and 8.
dt2 <- data.table(
timestamp = timestamp[c(3, 5, 8)] + 0.1,
timestamp2 = timestamp[c(3, 5, 8)] + 0.1,
other3 = c("x", "y", "z"),
other4 = c(333, 444, 555)
)
What I tried:
dt2[dt1, roll = "nearest", on = "timestamp"]
#> timestamp timestamp2 other3 other4 id timestamp1 other1 other2
#> 1: 0.4353018 1.005341 x 333 a 0.4353018 1 11
#> 2: 0.8938755 1.005341 x 333 b 0.8938755 2 12
#> 3: 0.9053410 1.005341 x 333 c 0.9053410 3 13
#> 4: 0.9372859 1.005341 x 333 d 0.9372859 4 14
#> 5: 1.3631284 1.463128 y 444 e 1.3631284 5 15
#> 6: 1.4042683 1.463128 y 444 f 1.4042683 6 16
#> 7: 1.6328626 1.463128 y 444 g 1.6328626 7 17
#> 8: 2.3709584 2.470958 z 555 h 2.3709584 8 18
#> 9: 2.5115220 2.470958 z 555 i 2.5115220 9 19
#> 10: 3.0184237 2.470958 z 555 j 3.0184237 10 20
I am failing to understand how roll="nearest" works. I see that it indeed matches the nearest values, but it does it with all of them. I would like to merge only those 3 rows from dt2 that have the absolute nearest values.
Using joins based on tolerance (max_dist) would also give more than three matches, but in this case I found the value of max_dist that gives the best nearest for this tiny example data.
Desired output:
library(fuzzyjoin)
fuzzyjoin::difference_left_join(as.data.frame(dt1), as.data.frame(dt2), by = "timestamp", max_dist = 0.09)
#> id timestamp.x timestamp1 other1 other2 timestamp.y timestamp2 other3 other4
#> 1 a 0.4353018 0.4353018 1 11 NA NA <NA> NA
#> 2 b 0.8938755 0.8938755 2 12 NA NA <NA> NA
#> 3 c 0.9053410 0.9053410 3 13 NA NA <NA> NA
#> 4 d 0.9372859 0.9372859 4 14 1.005341 1.005341 x 333
#> 5 e 1.3631284 1.3631284 5 15 NA NA <NA> NA
#> 6 f 1.4042683 1.4042683 6 16 1.463128 1.463128 y 444
#> 7 g 1.6328626 1.6328626 7 17 NA NA <NA> NA
#> 8 h 2.3709584 2.3709584 8 18 NA NA <NA> NA
#> 9 i 2.5115220 2.5115220 9 19 2.470958 2.470958 z 555
#> 10 j 3.0184237 3.0184237 10 20 NA NA <NA> NA
Created on 2022-08-25 with reprex v2.0.2
You can try a proper left update join and assign the desired variables from dt2 explicitly:
library(data.table)
set.seed(42)
timestamp <- sort(rnorm(10, mean = 1, sd = 1))
dt1 <- data.table(
id = letters[1:10],
timestamp = timestamp,
timestamp1 = timestamp,
other1 = 1:10,
other2 = 11:20
)
dt2 <- data.table(
timestamp = timestamp[c(3, 5, 8)] + 0.1,
timestamp2 = timestamp[c(3, 5, 8)] + 0.1,
other3 = c("x", "y", "z"),
other4 = c(333, 444, 555)
)
# left join: leading table on the left
# With dt2 in i, roll = "nearest" matches each dt2 row to its single
# nearest dt1 timestamp, so only those dt1 rows receive values; all
# other dt1 rows keep NA for the new columns.
dt1[dt2,
roll = "nearest",
on = "timestamp",
# assign desired values explicitly (update join: modifies dt1 by reference)
`:=`(other3 = i.other3,
other4 = i.other4)]
# [] forces printing after a := call
dt1[]
#> id timestamp timestamp1 other1 other2 other3 other4
#> 1: a 0.4353018 0.4353018 1 11 <NA> NA
#> 2: b 0.8938755 0.8938755 2 12 <NA> NA
#> 3: c 0.9053410 0.9053410 3 13 <NA> NA
#> 4: d 0.9372859 0.9372859 4 14 x 333
#> 5: e 1.3631284 1.3631284 5 15 <NA> NA
#> 6: f 1.4042683 1.4042683 6 16 y 444
#> 7: g 1.6328626 1.6328626 7 17 <NA> NA
#> 8: h 2.3709584 2.3709584 8 18 <NA> NA
#> 9: i 2.5115220 2.5115220 9 19 z 555
#> 10: j 3.0184237 3.0184237 10 20 <NA> NA

Anti_join between df1 and df2 but how to change all mismatch in df2 to NA

Below are my two dataframes, df1 and df2
df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),response=c("y","y","y","n","n","y","y","n","n","y"))
id text response
1 632592651 asdf y
2 633322173 cat y
3 634703802 dog y
4 634927873 mouse n
5 635812953 elephant n
6 636004739 goose y
7 636101211 rat y
8 636157799 mice n
9 636263106 kitty n
10 636752420 kitten y
df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
volume=c("1234","432","324","333","2223","412346","7456","3456","2345","2345","6","345","23","2","4778","234","8675","3459","8","9"))
id text volume
1 632592651 asdf_xyz 1234
2 633322173 cat 432
3 634703802 dog 324
4 634927873 mouse 333
5 635812953 elephant 2223
6 636004739 goose 412346
7 636101211 rat 7456
8 636157799 mice 3456
9 636263106 kitty 2345
10 636752420 kitten 2345
11 636809222 tiger_xyz 6
12 2004722036 lion 345
13 2004894388 leopard 23
14 2005045755 ostrich 2
15 2005535472 kangaroo 4778
16 2005630542 platypus 234
17 2005788781 fish 8675
18 2005809679 reptile 3459
19 2005838317 mammals 8
20 2005866692 amphibians_xyz 9
How do I change the non-matching rows 11:20 of df2 to NA (i.e. all of them, since their ids have no match in df1), and the 'text' value of row 1 (i.e. asdf_xyz) to NA?
I have tried
library(dplyr)
# Keep only the rows of df2 whose id has no match in df1.
df3 <- df2 %>%
anti_join(df1, by=c("id"))
id text volume
1 636809222 tiger_xyz 6
2 2004722036 lion 345
3 2004894388 leopard 23
4 2005045755 ostrich 2
5 2005535472 kangaroo 4778
6 2005630542 platypus 234
7 2005788781 fish 8675
8 2005809679 reptile 3459
9 2005838317 mammals 8
10 2005866692 amphibians_xyz 9
# Overwrite every value with NA, one column at a time; the `!= 0` test is
# TRUE for any value present here, so each whole column is blanked.
# NOTE(review): df3[] <- NA would appear to do the same in one step — verify.
df3$id[df3$id != 0] <- NA
df3$text[df3$text != 0] <- NA
df3$volume[df3$volume != 0] <- NA
(Doing this one by one because I couldn't find solution how to change the entire value of the dataframe to NA)
id text volume
1 <NA> <NA> <NA>
2 <NA> <NA> <NA>
3 <NA> <NA> <NA>
4 <NA> <NA> <NA>
5 <NA> <NA> <NA>
6 <NA> <NA> <NA>
7 <NA> <NA> <NA>
8 <NA> <NA> <NA>
9 <NA> <NA> <NA>
10 <NA> <NA> <NA>
and df4 (solution from How to return row values that match column 'id' in both df1 and df2 but not column 'text' and return NA to the mismatch in column 'text'?)
# Keep only ids present in both tables, then NA out `text` wherever the
# two sources disagree (text.x / text.y suffixes come from the join).
inner_join(x = df1,
y = df2,
by = "id") %>%
mutate_if(is.factor, as.character) %>%
mutate(text = ifelse(test = text.x != text.y,
yes = NA,
no = text.x)) %>%
select(id, text, response, volume)
id text response volume
1 632592651 <NA> y 1234
2 633322173 cat y 432
3 634703802 dog y 324
4 634927873 mouse n 333
5 635812953 elephant n 2223
6 636004739 goose y 412346
7 636101211 rat y 7456
8 636157799 mice n 3456
9 636263106 kitty n 2345
10 636752420 kitten y 2345
but not sure how to replace df2 with df3 and df4. The desired output is shown below:
id text volume
1 632592651 NA 1234
2 633322173 cat 432
3 634703802 dog 324
4 634927873 mouse 333
5 635812953 elephant 2223
6 636004739 goose 412346
7 636101211 rat 7456
8 636157799 mice 3456
9 636263106 kitty 2345
10 636752420 kitten 2345
11 NA NA NA
12 NA NA NA
13 NA NA NA
14 NA NA NA
15 NA NA NA
16 NA NA NA
17 NA NA NA
18 NA NA NA
19 NA NA NA
20 NA NA NA
Can someone help please?
If possible, may I also know if there's a manual approach to select subset of df2 based on df3$id and change all values to NA?
Part 2:
For the second part of my request, I would like to create another dataframes from joined_df which appears only in df1 (call it found_in_df1). Example of output:
found_in_df1:
# id text volume
# 1: 632592651 <NA> 1234
# 2: 633322173 cat 432
# 3: 634703802 dog 324
# 4: 634927873 mouse 333
# 5: 635812953 elephant 2223
# 6: 636004739 goose 412346
# 7: 636101211 rat 7456
# 8: 636157799 mice 3456
# 9: 636263106 kitty 2345
#10: 636752420 kitten 2345
The solution is given in How to return row values that match column 'id' in both df1 and df2 but not column 'text' and return NA to the mismatch in column 'text'? but I'm looking for an alternative approach, i.e., is it possible to write a script to say retrieve from joined_df using df1 to give found_in_df1 since we have df1 and joined_df?
One potential solution for dealing with conflicts is to use the powerjoin package, e.g.
library(dplyr)
df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),
text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),
response=c("y","y","y","n","n","y","y","n","n","y"))
df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
volume=c(1234,432,324,333,2223,412346,7456,3456,2345,2345,6,345,23,2,4778,234,8675,3459,8,9))
expected_outcome <- data.frame(id = c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
text = c(NA, "cat", "dog", "mouse", "elephant", "goose",
"rat", "mice", "kitty", "kitten",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
volume = c(1234, 432, 324, 333, 2223, 412346, 7456,
3456, 2345, 2345, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA))
library(powerjoin)
# Full join; for conflicting columns (text), keep the value when both
# sides agree and NA it when they differ. The rw ~ prefix makes the
# conflict formula row-wise.
joined_df <- power_full_join(df1, df2, by = c("id"),
conflict = rw ~ ifelse(.x != .y,
NA_integer_,
.x))
# Rows with no df1 match have NA response; blank those rows entirely.
final_df <- joined_df %>%
mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
select(id, text, volume)
final_df
#> id text volume
#> 1 632592651 <NA> 1234
#> 2 633322173 cat 432
#> 3 634703802 dog 324
#> 4 634927873 mouse 333
#> 5 635812953 elephant 2223
#> 6 636004739 goose 412346
#> 7 636101211 rat 7456
#> 8 636157799 mice 3456
#> 9 636263106 kitty 2345
#> 10 636752420 kitten 2345
#> 11 <NA> <NA> NA
#> 12 <NA> <NA> NA
#> 13 <NA> <NA> NA
#> 14 <NA> <NA> NA
#> 15 <NA> <NA> NA
#> 16 <NA> <NA> NA
#> 17 <NA> <NA> NA
#> 18 <NA> <NA> NA
#> 19 <NA> <NA> NA
#> 20 <NA> <NA> NA
all_equal(final_df, expected_outcome)
#> [1] TRUE
# Part 2
# Part 2: a left join keeps only ids present in df1, with the same
# row-wise conflict rule, yielding found_in_df1 directly.
found_in_df1 <- power_left_join(df1, df2, by = c("id"),
conflict = rw ~ ifelse(.x != .y,
NA_integer_,
.x)) %>%
select(id, text, volume)
found_in_df1
#> id text volume
#> 1 632592651 <NA> 1234
#> 2 633322173 cat 432
#> 3 634703802 dog 324
#> 4 634927873 mouse 333
#> 5 635812953 elephant 2223
#> 6 636004739 goose 412346
#> 7 636101211 rat 7456
#> 8 636157799 mice 3456
#> 9 636263106 kitty 2345
#> 10 636752420 kitten 2345
Created on 2022-07-02 by the reprex package (v2.0.1)
Edit
Per the comment below from the creator of the powerjoin package (Mr. Mudskipper): these operations are vectorised, so you don't need to perform the command 'rowwise', i.e. you can remove "rw" to simplify and gain performance. There is no practical difference between including and excluding "rw" with df1 and df2, but if we use larger dataframes you can see a clear increase in performance, e.g.
library(dplyr)
library(powerjoin)
# define functions
# Row-wise benchmark variant: the conflict formula is prefixed with rw ~,
# so it is evaluated once per row. Takes two data.frames joined on id;
# returns the id/text/volume columns with mismatches set to NA.
power_full_join_func_rowwise <- function(df1, df2) {
joined_df <- power_full_join(df1, df2, by = c("id"),
conflict = rw ~ ifelse(.x != .y,
NA_integer_,
.x))
# Blank all columns for rows that have no df1 match (NA response).
final_df <- joined_df %>%
mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
select(id, text, volume)
return(final_df)
}
# Vectorised benchmark variant: same conflict logic without the rw
# prefix — ifelse() is already vectorised, so the formula runs once per
# conflicting column instead of once per row (faster, same result).
power_full_join_func_not_rowwise <- function(df1, df2) {
joined_df <- power_full_join(df1, df2, by = c("id"),
conflict = ~ifelse(.x != .y,
NA_integer_,
.x))
final_df <- joined_df %>%
mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
select(id, text, volume)
return(final_df)
}
library(microbenchmark)
library(purrr)
library(ggplot2)
# make larger dfs (copy df1 and df2 X100)
df3 <- map_dfr(seq_len(100), ~ df1)
df4 <- map_dfr(seq_len(100), ~ df2)
# benchmark performance on the larger dataframes
res <- microbenchmark(power_full_join_func_rowwise(df3, df4),
power_full_join_func_not_rowwise(df3, df4))
res
#> Unit: milliseconds
#> expr min lq mean
#> power_full_join_func_rowwise(df3, df4) 397.32661 426.08117 449.88787
#> power_full_join_func_not_rowwise(df3, df4) 71.85757 77.25344 90.36191
#> median uq max neval cld
#> 446.41715 472.47817 587.3301 100 b
#> 81.18239 93.95103 191.1248 100 a
autoplot(res)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Is the result the same?
all_equal(power_full_join_func_rowwise(df3, df4),
power_full_join_func_not_rowwise(df3, df4))
#> [1] TRUE
Created on 2022-11-24 by the reprex package (v2.0.1)
data.table version using an !antijoin, and overwriting := all columns/rows returned in df2 with an NA (recycled list .(NA) to all columns).
Then looping over all the common variables and overwriting any values which don't match by id:
library(data.table)
setDT(df1)
setDT(df2)
# Anti-join: rows of df2 whose id has no match in df1 get every column
# set to NA (the length-1 list .(NA) is recycled across names(df2)).
df2[!df1, on=.(id), names(df2) := .(NA)]
idvars <- "id"
# Columns present in both tables, excluding the join key.
compvars <- setdiff(intersect(names(df1), names(df2)), idvars)
# For each shared column, NA out df2 values that do not match df1 when
# joined on (id, column) jointly.
for (i in compvars) {
df2[!df1, on=c(idvars,i), (i) := NA]
}
# id text volume
# 1: 632592651 <NA> 1234
# 2: 633322173 cat 432
# 3: 634703802 dog 324
# 4: 634927873 mouse 333
# 5: 635812953 elephant 2223
# 6: 636004739 goose 412346
# 7: 636101211 rat 7456
# 8: 636157799 mice 3456
# 9: 636263106 kitty 2345
#10: 636752420 kitten 2345
#11: <NA> <NA> <NA>
#12: <NA> <NA> <NA>
#13: <NA> <NA> <NA>
#14: <NA> <NA> <NA>
#15: <NA> <NA> <NA>
#16: <NA> <NA> <NA>
#17: <NA> <NA> <NA>
#18: <NA> <NA> <NA>
#19: <NA> <NA> <NA>
#20: <NA> <NA> <NA>

Interpolate NA values with the mean for each row but only for one or two NA values between numerical values

I try to interpolate NA values for each row but I only want to interpolate the NA values if I have two or fewer NA values next to each other. So for example in row 3, there are three NAs next to each other, so I don't want to interpolate but in rows one and two there are two or fewer next to each other, so I aim to interpolate them linearly. Is there an efficient way to deal with it?
I have a dataset that looks like that:
df1:
ID string1 2018 2019 2020 2021 2022 string2
1: a1 x2 3 3 NA 4 4 si
2: a2 g3 5 5 NA NA 1 q2
3: a3 n2 11 NA NA NA 3 oq
4: a4 m3 3 NA 9 8 8 mx
5: a5 2w 9 1 NA 5 NA ix
6: a6 ps2 2 NA 7 4 4 p2
7: a7 kg2 6 NA NA NA 6 2q
For reproducibility:
# Reproducible example: the 2018-2022 columns are numeric with NA runs of
# varying length; ID/string1/string2 must pass through untouched.
# Uses `<-` for top-level assignment (R style convention) instead of `=`.
df1 <- data.table(
ID = c("a1", "a2", "a3", "a4", "a5", "a6", "a7"),
"string1" = c("x2", "g3", "n2", "m3", "2w", "ps2", "kg2"),
"2018" = c(3,5,11,3,9,2,6),
"2019" = c(3,5,NA,NA,1,NA,NA),
"2020" = c(NA,NA,NA,9,NA,7,NA),
"2021" = c(4,NA,NA,8,5,4,NA),
"2022" = c(4,1,3,8,NA,4,6),
"string2" = c("si", "q2", "oq", "mx", "ix", "p2", "2q"))
I try to get a data.table which looks like that:
ID string1 2018 2019 2020 2021 2022 string2
1: a1 x2 3 3.00 3.5 4 4 si
2: a2 g3 5 5.00 4.3 3 1 q2
3: a3 n2 11 NA NA NA 3 oq
4: a4 m3 3 8.25 9.0 8 8 mx
5: a5 2w 9 1.00 -0.3 5 17 ix
6: a6 ps2 2 8.00 7.0 4 4 p2
7: a7 kg2 6 NA NA NA 6 2q
Thanks for any suggestions!
Please find a solution (cf. reprex below) using data.table and imputeTS libraries.
Reprex
Code
library(data.table)
library(imputeTS)
# magrittr supplies %>%: neither data.table nor imputeTS attaches the
# pipe, so without this the chain fails with
# 'could not find function "%>%"' (the zoo answer below loads it too).
library(magrittr)
# Work row-wise by transposing: original rows become columns, each former
# row is interpolated, then the result is transposed back.
results <- df1 %>%
transpose(., keep.names = 'rn') %>%
# drop the first two transposed rows (ID, string1) and coerce to numeric;
# NOTE(review): string2 is excluded by 3:nrow(df1) only because df1 has
# 7 rows vs 8 columns here — confirm for differently-shaped data.
{.[3:nrow(df1), lapply(.SD, as.numeric),
][, lapply(.SD, na_interpolation, "spline", 2)]} %>%
round(., 2) %>%
transpose(., make.names = 'rn') %>%
cbind(.,df1[,c("ID", "string1", "string2")]) %>%
setcolorder(., names(df1))
Output
results
#> ID string1 2018 2019 2020 2021 2022 string2
#> <char> <char> <num> <num> <num> <num> <num> <char>
#> 1: a1 x2 3 3.00 3.50 4 4 si
#> 2: a2 g3 5 5.00 4.33 3 1 q2
#> 3: a3 n2 11 NA NA NA 3 oq
#> 4: a4 m3 3 8.25 9.00 8 8 mx
#> 5: a5 2w 9 1.00 -0.50 5 5 ix
#> 6: a6 ps2 2 8.00 7.00 4 4 p2
#> 7: a7 kg2 6 NA NA NA 6 2q
Created on 2021-12-02 by the reprex package (v2.0.1)
Probably a better solution (cf. reprex below) using data.table and zoo libraries. This solution gives exactly the result you want (i.e. forget my comment under your question!)
Reprex
Code
library(data.table)
library(zoo)
library(magrittr) # for the pipes!
# Transpose so original rows become columns, interpolate each former row,
# then transpose back. zoo::na.spline with maxgap = 2 leaves NA runs
# longer than two untouched, matching the requested output.
results <- df1 %>%
transpose(., keep.names = 'rn') %>%
# keep only the year rows (drop ID/string1) and coerce from character
{.[3:nrow(df1), lapply(.SD, as.numeric),
][, lapply(.SD, na.spline, maxgap = 2)]} %>%
round(., 2) %>%
transpose(., make.names = 'rn') %>%
# re-attach the untouched string columns and restore the column order
cbind(.,df1[,c("ID", "string1", "string2")]) %>%
setcolorder(., names(df1))
Output
results
#> ID string1 2018 2019 2020 2021 2022 string2
#> <char> <char> <num> <num> <num> <num> <num> <char>
#> 1: a1 x2 3 3.00 3.50 4 4 si
#> 2: a2 g3 5 5.00 4.33 3 1 q2
#> 3: a3 n2 11 NA NA NA 3 oq
#> 4: a4 m3 3 8.25 9.00 8 8 mx
#> 5: a5 2w 9 1.00 -0.33 5 17 ix
#> 6: a6 ps2 2 8.00 7.00 4 4 p2
#> 7: a7 kg2 6 NA NA NA 6 2q
Created on 2021-12-03 by the reprex package (v2.0.1)

If NA exists in column, replace with value in another column - sqldf

I have a dataframe below:
df
ColA ColB ColC
NA BN 6
JH NA 8
NA rewr 9
NA NA 10
Expected Output:
newdf
ColA ColB ColC New_Col
NA BN 6 BN
JH NA 8 JH
NA rewr 9 rewr
NA NA 10 NA
How do I do this using sqldf?
This was my attempt but it did not get the output I was looking for:
newdf<- sqldf("SELECT *, replace([ColA], NULL, [ColB]) [New_Col] from df")
Using coalesce
library(sqldf)
# SQL COALESCE returns the first non-NULL argument; R's NA maps to NULL.
sqldf("SELECT ColA, ColB, ColC, coalesce(ColA, ColB) as New_Col from df")
# ColA ColB ColC New_Col
#1 <NA> BN 6 BN
#2 JH <NA> 8 JH
#3 <NA> rewr 9 rewr
#4 <NA> <NA> 10 <NA>
Or with tidyverse
library(dplyr)
# dplyr::coalesce(): element-wise first non-NA value across ColA, ColB.
df %>%
mutate(New_Col = coalesce(ColA, ColB))
# ColA ColB ColC New_Col
#1 <NA> BN 6 BN
#2 JH <NA> 8 JH
#3 <NA> rewr 9 rewr
#4 <NA> <NA> 10 <NA>
data
# Example data (dput output): two character columns with NAs, one integer.
df <- structure(list(ColA = c(NA, "JH", NA, NA), ColB = c("BN", NA,
"rewr", NA), ColC = c(6L, 8L, 9L, 10L)), class = "data.frame", row.names = c(NA,
-4L))

Resources