Unnest a data frame and fill new rows with NAs - r

Let's say I have a nested df, and I want to unnest the columns:
df <- tibble::tribble(
~x, ~y, ~nestdf,
1, 2, tibble::tibble(a=1:2, b=3:4),
3, 4, tibble::tibble(a=3:5, b=5:7)
)
tidyr::unnest(df, nestdf)
# x y a b
# <dbl> <dbl> <int> <int>
#1 1 2 1 3
#2 1 2 2 4
#3 3 4 3 5
#4 3 4 4 6
#5 3 4 5 7
The result has the x and y columns extended to match the dimensions of nestdf, with the new rows using the existing values. However, I want the new rows to contain NA, like so:
# x y a b
# <dbl> <dbl> <int> <int>
#1 1 2 1 3
#2 NA NA 2 4
#3 3 4 3 5
#4 NA NA 4 6
#5 NA NA 5 7
Is it possible to do this with unnest? Either the first or last row for each group can be kept as non-NA, I don't mind.

Repeating rows, and binding with an unnest of the nested list column(s):
nr <- sapply(df$nestdf, nrow) - 1
cbind(
df[rep(rbind(seq_along(nr), NA), rbind(1, nr)), c("x","y")],
unnest(df["nestdf"], cols=everything())
)
# x y a b
#1 1 2 1 3
#2 NA NA 2 4
#3 3 4 3 5
#4 NA NA 4 6
#5 NA NA 5 7

One way would be to change the duplicates to NA.
df1 <- tidyr::unnest(df, nestdf)
cols <- c('x', 'y')
df1[duplicated(df1[cols]), cols] <- NA
df1
# x y a b
# <dbl> <dbl> <int> <int>
#1 1 2 1 3
#2 NA NA 2 4
#3 3 4 3 5
#4 NA NA 4 6
#5 NA NA 5 7
If the values in columns x and y can repeat you can create a row number to identify them uniquely -
library(dplyr)
library(tidyr)
df1 <- df %>% mutate(row = row_number()) %>% unnest(nestdf)
cols <- c('x', 'y', 'row')
df1[duplicated(df1[cols]), cols] <- NA
df1 <- select(df1, -row)

You could convert x and y to lists first:
library(tidyverse)
df <- tibble::tribble(
~x, ~y, ~nestdf,
1, 2, tibble::tibble(a=1:2, b=3:4),
3, 4, tibble::tibble(a=3:5, b=5:7)
)
df %>%
mutate_at(vars(x:y), ~map2(., nestdf, ~.x[seq(nrow(.y))])) %>%
unnest(everything())
# A tibble: 5 x 4
#x y a b
#<dbl> <dbl> <int> <int>
# 1 1 2 1 3
#2 NA NA 2 4
#3 3 4 3 5
#4 NA NA 4 6
#5 NA NA 5 7

Related

How to conditionally update a R tibble using multiple conditions of another tibble

I have two tables. I would like to update the first table using a second table using multiple conditions. In base R I would use if...else type constructs to do this but would like to know how to achieve this using dplyr.
The table to be updated (have a field added) looks like this:
> Intvs
# A tibble: 12 x 3
Group From To
<chr> <dbl> <dbl>
1 A 0 1
2 A 1 2
3 A 2 3
4 A 3 4
5 A 4 5
6 A 5 6
7 B 0 1
8 B 1 2
9 B 2 3
10 B 3 4
11 B 4 5
12 B 5 6
The tibble that I would like to use to make the update looks like this:
>Zns
# A tibble: 2 x 4
Group From To Zone
<chr> <chr> <dbl> <dbl>
1 A X 1 5
2 B Y 3 4
I would like to update the Intvs tibble with the Zns tibble using the fields == Group, >= From, and <= To to control the update. The expected output should look like this
> Intvs
# A tibble: 12 x 4
Group From To Zone
<chr> <dbl> <dbl> <chr>
1 A 0 1 NA
2 A 1 2 X
3 A 2 3 X
4 A 3 4 X
5 A 4 5 X
6 A 5 6 NA
7 B 0 1 NA
8 B 1 2 NA
9 B 2 3 NA
10 B 3 4 Y
11 B 4 5 NA
12 B 5 6 NA
What is the most efficient way to do this using dplyr?
The code below should make the dummy tables Intv and Zns
# load packages
require(tidyverse)
# Intervals table
a <- c(rep("A", 6), rep("B", 6))
b <- c(seq(0,5,1), seq(0,5,1) )
c <- c(seq(1,6,1), seq(1,6,1))
Intvs <- bind_cols(a, b, c)
names(Intvs) <- c("Group", "From", "To")
# Zones table
a <- c("A", "B")
b <- c("X", "Y")
c <- c(1, 3)
d <- c(5, 4)
Zns <- bind_cols(a, b, c, d)
names(Zns) <- c("Group", "From", "To", "Zone")
Using non-equi join from data.table
library(data.table)
setDT(Intvs)[Zns, Zone := Zone, on = .(Group, From >= From, To <= To)]
-output
> Intvs
Group From To Zone
<char> <num> <num> <char>
1: A 0 1 <NA>
2: A 1 2 X
3: A 2 3 X
4: A 3 4 X
5: A 4 5 X
6: A 5 6 <NA>
7: B 0 1 <NA>
8: B 1 2 <NA>
9: B 2 3 <NA>
10: B 3 4 Y
11: B 4 5 <NA>
12: B 5 6 <NA>
This is the closest I get. It is not giving the expected output:
library(dplyr)
left_join(Intvs, Zns, by="Group") %>%
group_by(Group) %>%
mutate(Zone1 = case_when(From.x <= Zone & From.x >= To.y ~ From.y)) %>%
select(Group, From=From.x, To=To.x, Zone = Zone1)
Group From To Zone
<chr> <dbl> <dbl> <chr>
1 A 0 1 NA
2 A 1 2 X
3 A 2 3 X
4 A 3 4 X
5 A 4 5 X
6 A 5 6 X
7 B 0 1 NA
8 B 1 2 NA
9 B 2 3 NA
10 B 3 4 Y
11 B 4 5 Y
12 B 5 6 NA
Not sure why your first row does not give NA, since 0 - 1 is not in the range of 1 - 5.
First left_join the two dataframes using the Group column. Here I assign the suffix "_Zns" to values from the Zns dataframe. Then use a single case_when or (ifelse) statement to assign NA to rows that do not fit the range. Finally, drop the columns that end with Zns.
library(dplyr)
left_join(Intvs, Zns, by = "Group", suffix = c("", "_Zns")) %>%
mutate(Zone = case_when(From >= From_Zns & To <= To_Zns ~ Zone,
TRUE ~ NA_character_)) %>%
select(-ends_with("Zns"))
# A tibble: 12 × 4
Group From To Zone
<chr> <dbl> <dbl> <chr>
1 A 0 1 NA
2 A 1 2 X
3 A 2 3 X
4 A 3 4 X
5 A 4 5 X
6 A 5 6 NA
7 B 0 1 NA
8 B 1 2 NA
9 B 2 3 NA
10 B 3 4 Y
11 B 4 5 NA
12 B 5 6 NA
Data
Note that I have changed your column name order in the Zns dataframe.
a <- c(rep("A", 6), rep("B", 6))
b <- c(seq(0,5,1), seq(0,5,1) )
c <- c(seq(1,6,1), seq(1,6,1))
Intvs <- bind_cols(a, b, c)
names(Intvs) <- c("Group", "From", "To")
# Zones table
a <- c("A", "B")
b <- c("X", "Y")
c <- c(1, 3)
d <- c(5, 4)
Zns <- bind_cols(a, b, c, d)
colnames(Zns) <- c("Group", "Zone", "From", "To")

R: Get row values based on column name in R

I have a dataframe like this:
ID Q4_1 Q4_2 Q4_3 Q4_4 Q4_5
1 4 3 1
2 1 3
3 3 1 2
4 5
Can I know how to rearrange the row values based on the column's prefix to have something like this?
ID Q4_1 Q4_2 Q4_3 Q4_4 Q4_5
1 1 3 4
2 1 3
3 1 2 3
4 5
Here is a base R approach. Note that I've replaced the empty space in your input with NA.
The core of this solution is to use lapply to go through all rows in the dataframe, then use ifelse to match the row values with the header_suffix I created, which contains the suffix of colnames of the dataframe.
df <- read.table(header = T,
text = "ID Q4_1 Q4_2 Q4_3 Q4_4 Q4_5
1 4 3 1 NA NA
2 1 3 NA NA NA
3 3 1 2 NA NA
4 5 NA NA NA NA")
header_suffix <- gsub("^.+_", "", grep("_", colnames(df), value = T))
header_suffix
#> [1] "1" "2" "3" "4" "5"
df2 <-
cbind(df[, 1],
as.data.frame(
do.call(
rbind,
lapply(1:nrow(df), function(x)
ifelse(header_suffix %in% df[x, 2:ncol(df)], header_suffix, NA))
)
)
)
colnames(df2) <- colnames(df)
df2
#> ID Q4_1 Q4_2 Q4_3 Q4_4 Q4_5
#> 1 1 1 <NA> 3 4 <NA>
#> 2 2 1 <NA> 3 <NA> <NA>
#> 3 3 1 2 3 <NA> <NA>
#> 4 4 <NA> <NA> <NA> <NA> 5
Created on 2022-03-31 by the reprex package (v2.0.1)
Here's a Tidyverse approach
df %>% pivot_longer(2:6, names_to = "Q", values_to = "no") %>%
mutate(new_Q = paste0("Q4_", no)) %>%
filter(new_Q != "Q4_NA") %>%
select(ID, new_Q, no ) %>%
pivot_wider(id_cols = ID, names_from = new_Q, values_from = no) %>%
select(ID, Q4_1, Q4_2, Q4_3, Q4_4, Q4_5 )
# A tibble: 4 x 6
ID Q4_1 Q4_2 Q4_3 Q4_4 Q4_5
<int> <int> <int> <int> <int> <int>
1 1 1 NA 3 4 NA
2 2 1 NA 3 NA NA
3 3 1 2 3 NA NA
4 4 NA NA NA NA 5

R - Purrr - Slicing using map(), or: How to slice list of tibbles of varying lengths based on NAs?

I'm trying to figure out how to slice or subset a list of tibbles of varying lengths based upon first appearance of a non-NA. All my tibbles are of varying dimensions with numerous NAs, but common for all is that they have some number of NAs in the first rows that I need to remove.
I am only interested in getting rid of the first number of rows while keeping all the following rows.
I've created a reprex that sort of illustrates my data:
tbl <- tibble(
first = c(NA, 1, 2, 3, NA),
second = c(NA, 1, NA, 3, NA),
third = c(NA, 1, 2, NA, NA)
)
lst <- list(
list1 = tbl,
list2 = tbl,
list3 = tbl
)
lst
$list1
# A tibble: 5 x 3
first second third
<dbl> <dbl> <dbl>
1 NA NA NA
2 1 1 1
3 2 NA 2
4 3 3 NA
5 NA NA NA
$list2
# A tibble: 5 x 3
first second third
<dbl> <dbl> <dbl>
1 NA NA NA
2 1 1 1
3 2 NA 2
4 3 3 NA
5 NA NA NA
$list3
# A tibble: 5 x 3
first second third
<dbl> <dbl> <dbl>
1 NA NA NA
2 1 1 1
3 2 NA 2
4 3 3 NA
5 NA NA NA
I've tried using map() in combination with which.min() and is.na() to try and slice based upon the first instance of non-NA, but cannot make it work.
sliced <- map(lst, slice, which.min(is.na):nrow())
All i get is the following error:
Error in which.min(is.na) :
cannot coerce type 'builtin' to vector of type 'double'
Is there a way to fix this?
is.na needs a vector. You may need to pass a particular column to it.
For example, using the first column you can do -
library(dplyr)
library(purrr)
map(lst, ~.x %>% slice(which.max(!is.na(.[[1]])) : n()))
#$list1
# A tibble: 4 x 3
# first second third
# <dbl> <dbl> <dbl>
#1 1 1 1
#2 2 NA 2
#3 3 3 NA
#4 NA NA NA
#$list2
# A tibble: 4 x 3
# first second third
# <dbl> <dbl> <dbl>
#1 1 1 1
#2 2 NA 2
#3 3 3 NA
#4 NA NA NA
#$list3
# A tibble: 4 x 3
# first second third
# <dbl> <dbl> <dbl>
#1 1 1 1
#2 2 NA 2
#3 3 3 NA
#4 NA NA NA

counting the number of observations row wise using dplyr

I have a dataset look like this -
sample <- tibble(x = c (1,2,3,NA), y = c (5, NA,2, NA))
sample
# A tibble: 4 x 2
x y
<dbl> <dbl>
1 1 5
2 2 NA
3 3 2
4 NA NA
Now I want create a new variable Z, which will count how many observations are in each row. For example for the sample dataset above the first value of new variable Z should be 2 because both x and y have values. Similarly, for 2nd row the value of Z is 1 as there is one missing value and for 4th row, the value is 0 as there is no observations in the row.
The expected dataset looks like this -
x y z
<dbl> <dbl> <dbl>
1 1 5 2
2 2 NA 1
3 3 2 2
4 NA NA 0
I want to do this on few number of variables, not the whole dataset.
Using base R. First line checks all columns, second one checks columns by name, third might not work as good if the number of columns is substantial.
sample$z1 <- rowSums(!is.na(sample))
sample$z2 <- rowSums(!is.na(sample[c("x", "y")]))
sample$z3 <- is.finite(sample$x) + is.finite(sample$y)
> sample
# A tibble: 4 x 5
x y z1 z2 z3
<dbl> <dbl> <dbl> <dbl> <int>
1 1 5 2 2 2
2 2 NA 1 1 1
3 3 2 2 2 2
4 NA NA 0 0 0
We can use
library(dplyr)
sample %>%
rowwise %>%
mutate(z = sum(!is.na(cur_data()))) %>%
ungroup
-output
# A tibble: 4 x 3
# x y z
# <dbl> <dbl> <int>
#1 1 5 2
#2 2 NA 1
#3 3 2 2
#4 NA NA 0
If it is select columns
sample %>%
rowwise %>%
mutate(z = sum(!is.na(select(cur_data(), x:y))))
Or with rowSums on a logical matrix
sample %>%
mutate(z = rowSums(!is.na(cur_data())))
-output
# A tibble: 4 x 3
# x y z
# <dbl> <dbl> <dbl>
#1 1 5 2
#2 2 NA 1
#3 3 2 2
#4 NA NA 0
apply function with selected columns example:
set.seed(7)
vals <- sample(c(1:20, NA, NA), 20)
sample <- matrix(vals, ncol = 5)
# Select columns 1, 3, 4
cols <- c(1, 3, 4)
rowcnts <- apply(sample[ , cols], 1, function(x) length(x[!is.na(x)]))
sample <- cbind(sample, rowcnts)
> sample
rowcnts
[1,] 10 15 16 NA 12 2
[2,] 19 8 14 18 9 3
[3,] 7 17 6 4 1 3
[4,] 2 3 13 NA 5 2

Remove trailing NA by group in a data.frame

I have a data.frame with a grouping variable, and some NAs in the value column.
df = data.frame(group=c(1,1,2,2,2,2,2,3,3), value1=1:9, value2=c(NA,4,9,6,2,NA,NA,1,NA))
I can use zoo::na.trim to remove NA at the end of a column: this will remove the last line of the data.frame:
library(zoo)
library(dplyr)
df %>% na.trim(sides="right")
Now I want to remove the trailing NAs by group; how can I achieve this using dplyr?
Expected output for value2 column: c(NA, 4,9,6,2,1)
You could write a little helper function that checks for trailing NAs of a vector and then use group_by and filter.
f <- function(x) { rev(cumsum(!is.na(rev(x)))) != 0 }
library(dplyr)
df %>%
group_by(group) %>%
filter(f(value2))
# A tibble: 6 x 3
# Groups: group [3]
group value1 value2
<dbl> <int> <dbl>
1 1 1 NA
2 1 2 4
3 2 3 9
4 2 4 6
5 2 5 2
6 3 8 1
edit
If we need to remove both leading and trailing zero we need to extend that function a bit.
f1 <- function(x) { cumsum(!is.na(x)) != 0 & rev(cumsum(!is.na(rev(x)))) != 0 }
Given df1
df1 = data.frame(group=c(1,1,2,2,2,2,2,3,3), value1=1:9, value2=c(NA,4,9,NA,2,NA,NA,1,NA))
df1
# group value1 value2
#1 1 1 NA
#2 1 2 4
#3 2 3 9
#4 2 4 NA
#5 2 5 2
#6 2 6 NA
#7 2 7 NA
#8 3 8 1
#9 3 9 NA
We get this result
df1 %>%
group_by(group) %>%
filter(f1(value2))
# A tibble: 5 x 3
# Groups: group [3]
group value1 value2
<dbl> <int> <dbl>
1 1 2 4
2 2 3 9
3 2 4 NA
4 2 5 2
5 3 8 1
Using lapply, loop through group:
do.call("rbind", lapply(split(df, df$group), na.trim, sides = "right"))
# group value1 value2
# 1.1 1 1 NA
# 1.2 1 2 4
# 2.3 2 3 9
# 2.4 2 4 6
# 2.5 2 5 2
# 3 3 8 1
Or using by, as mentioned by #Henrik:
do.call("rbind", by(df, df$group, na.trim, sides = "right"))

Resources