Variable/column selection in tidyr fill() - r

Suppose a df with some missing values like this:
ID col_A_1 col_A_2 col_B_1 col_B_2
1 1 1 NA NA a
2 1 2 NA 1 b
3 1 3 1 2 c
4 1 4 2 3 d
5 1 NA 3 4 e
6 2 NA 1 5 f
7 2 NA 2 6 g
8 2 1 3 7 h
9 2 2 4 8 <NA>
10 2 3 5 NA <NA>
I want to fill the missing values using tidyr fill(), however, only the missing values in columns containing A.
I was able to achieve it using:
library(dplyr)
library(tidyr)
df %>%
group_by(ID) %>%
fill(names(.)[grepl("A", names(.))], .direction = "up") %>%
fill(names(.)[grepl("A", names(.))], .direction = "down") %>%
ungroup()
ID col_A_1 col_A_2 col_B_1 col_B_2
<dbl> <int> <int> <int> <chr>
1 1 1 1 NA a
2 1 2 1 1 b
3 1 3 1 2 c
4 1 4 2 3 d
5 1 4 3 4 e
6 2 1 1 5 f
7 2 1 2 6 g
8 2 1 3 7 h
9 2 2 4 8 <NA>
10 2 3 5 NA <NA>
however, I'm looking for other variable/column selection possibilities inside tidyr fill().
Sample data:
df <- data.frame(ID = c(rep(1, 5), rep(2, 5)),
col_A_1 = c(1:4, NA, NA, NA, 1:3),
col_A_2 = c(NA, NA, 1:3, 1:5),
col_B_1 = c(NA, 1:8, NA),
col_B_2 = c(letters[1:8], NA, NA),
stringsAsFactors = FALSE)

The fill can take select_helpers
library(tidyverse)
df %>%
group_by(ID) %>%
fill(matches('A'), .direction = 'up') %>%
fill(matches('A'), .direction = 'down')
# A tibble: 10 x 5
# Groups: ID [2]
# ID col_A_1 col_A_2 col_B_1 col_B_2
# <dbl> <int> <int> <int> <chr>
# 1 1 1 1 NA a
# 2 1 2 1 1 b
# 3 1 3 1 2 c
# 4 1 4 2 3 d
# 5 1 4 3 4 e
# 6 2 1 1 5 f
# 7 2 1 2 6 g
# 8 2 1 3 7 h
# 9 2 2 4 8 <NA>
#10 2 3 5 NA <NA>

Related

Nested list to grouped rows in R

I have the following nested list called l (dput below):
> l
$A
$A$`1`
[1] 1 2 3
$A$`2`
[1] 3 2 1
$B
$B$`1`
[1] 2 2 2
$B$`2`
[1] 3 4 3
I would like to convert this to a grouped dataframe where A and B are the first group column and 1 and 2 are the subgroups with respective values. The desired output should look like this:
group subgroup values
1 A 1 1
2 A 1 2
3 A 1 3
4 A 2 3
5 A 2 2
6 A 2 1
7 B 1 2
8 B 1 2
9 B 1 2
10 B 2 3
11 B 2 4
12 B 2 3
As you can see A and B are the main group and 1 and 2 are the subgroups. Using purrr::flatten(l) or unnest doesn't work. So I was wondering if anyone knows how to convert a nested list to a grouped row dataframe?
dput of l:
l <- list(A = list(`1` = c(1, 2, 3), `2` = c(3, 2, 1)), B = list(`1` = c(2,
2, 2), `2` = c(3, 4, 3)))
Using stack and rowbind with id:
data.table::rbindlist(lapply(l, stack), idcol = "id")
# id values ind
# 1: A 1 1
# 2: A 2 1
# 3: A 3 1
# 4: A 3 2
# 5: A 2 2
# 6: A 1 2
# 7: B 2 1
# 8: B 2 1
# 9: B 2 1
# 10: B 3 2
# 11: B 4 2
# 12: B 3 2
You can use enframe() to convert the list into a data.frame, and unnest the value column twice.
library(tidyr)
tibble::enframe(l, name = "group") %>%
unnest_longer(value, indices_to = "subgroup") %>%
unnest(value)
# A tibble: 12 × 3
group value subgroup
<chr> <dbl> <chr>
1 A 1 1
2 A 2 1
3 A 3 1
4 A 3 2
5 A 2 2
6 A 1 2
7 B 2 1
8 B 2 1
9 B 2 1
10 B 3 2
11 B 4 2
12 B 3 2
Turn the list directly into a data frame, then pivot it into a long format and arrange to your desired order.
library(tidyverse)
lst %>%
as.data.frame() %>%
pivot_longer(everything(), names_to = c("group", "subgroup"),
values_to = "values",
names_pattern = "(.+?)\\.(.+?)") %>%
arrange(group, subgroup)
# A tibble: 12 × 3
group subgroup values
<chr> <chr> <dbl>
1 A 1 1
2 A 1 2
3 A 1 3
4 A 2 3
5 A 2 2
6 A 2 1
7 B 1 2
8 B 1 2
9 B 1 2
10 B 2 3
11 B 2 4
12 B 2 3
You can combine rrapply with unnest, which has the benefit to work in lists of arbitrary lengths:
library(rrapply)
library(tidyr)
rrapply(l, how = "melt") |>
unnest(value)
# A tibble: 12 × 3
L1 L2 value
<chr> <chr> <dbl>
1 A 1 1
2 A 1 2
3 A 1 3
4 A 2 3
5 A 2 2
6 A 2 1
7 B 1 2
8 B 1 2
9 B 1 2
10 B 2 3
11 B 2 4
12 B 2 3

Rearranging the rows based on a sequential unique values

I have the following data set containing duplicate columns and I would like to stack them but in the following way. I can get the desired output with bind_rows but I would like to try it with tidyr functions:
df <- tibble(
runs = c(1, 2, 3, 4),
col1 = c(3, 4, 5, 5),
col2 = c(5, 3, 1, 4),
col3 = c(6, 4, 9, 2),
col1 = c(0, 2, 2, 1),
col2 = c(2, 3, 1, 7),
col3 = c(2, 4, 9, 9),
col1 = c(3, 4, 5, 7),
col2 = c(3, 3, 1, 4),
col3 = c(3, 2, NA, NA), .name_repair = "minimal")
df %>%
select(runs, 2:4) %>%
bind_rows(df %>%
select(runs, 5:7)) %>%
bind_rows(df %>%
select(runs, 8:10))
# A tibble: 12 x 4 # This is my desired output in a way that column runs is a repeated number of 1 to 4
runs col1 col2 col3
<dbl> <dbl> <dbl> <dbl>
1 1 3 5 6
2 2 4 3 4
3 3 5 1 9
4 4 5 4 2
5 1 0 2 2
6 2 2 3 4
7 3 2 1 9
8 4 1 7 9
9 1 3 3 3
10 2 4 3 2
11 3 5 1 NA
12 4 7 4 NA
However when I use tidyr the runs is arranged differently in the following way.
df %>%
pivot_longer(-runs) %>%
group_by(name) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = name, values_from = value) %>%
select(-id)
# A tibble: 12 x 4
runs col1 col2 col3
<dbl> <dbl> <dbl> <dbl>
1 1 3 5 6
2 1 0 2 2
3 1 3 3 3
4 2 4 3 4
5 2 2 3 4
6 2 4 3 2
7 3 5 1 9
8 3 2 1 9
9 3 5 1 NA
10 4 5 4 2
11 4 1 7 9
12 4 7 4 NA
I would be grateful if you could let me know how I could rearrange runs so that the numbers are sequential and not like three 1 in a row and ...
Thank you very much in advance.
There may be a more elegant way to do this, but could you not simply group by runs and use the row numbers to arrange.
df %>%
pivot_longer(cols = starts_with("col"),
names_to = c(".value")) %>%
group_by(runs) %>%
mutate(grp_n = row_number()) %>%
ungroup() %>%
arrange(grp_n, runs)
# A tibble: 12 x 5
runs col1 col2 col3 grp_n
<dbl> <dbl> <dbl> <dbl> <int>
1 1 3 5 6 1
2 2 4 3 4 1
3 3 5 1 9 1
4 4 5 4 2 1
5 1 0 2 2 2
6 2 2 3 4 2
7 3 2 1 9 2
8 4 1 7 9 2
9 1 3 3 3 3
10 2 4 3 2 3
11 3 5 1 NA 3
12 4 7 4 NA 3
A base R option using split.default :
data.frame(runs = df$runs,
sapply(split.default(df[-1], names(df)[-1]), unlist),row.names = NULL)
# runs col1 col2 col3
#1 1 3 5 6
#2 2 4 3 4
#3 3 5 1 9
#4 4 5 4 2
#5 1 0 2 2
#6 2 2 3 4
#7 3 2 1 9
#8 4 1 7 9
#9 1 3 3 3
#10 2 4 3 2
#11 3 5 1 NA
#12 4 7 4 NA

Is there a way to get subdataframes with purrr in magrittr pipes workflow without using data.frame name?

That is, I was interested in doing the same as in the example, but with purrr functions.
tibble(a, b = a * 2, c = 1) %>%
{lapply(X = names(.), FUN = function(.x) select(., 1:.x))}
[[1]]
# A tibble: 5 x 1
a
<int>
1 1
2 2
3 3
4 4
5 5
[[2]]
# A tibble: 5 x 2
a b
<int> <dbl>
1 1 2
2 2 4
3 3 6
4 4 8
5 5 10
[[3]]
# A tibble: 5 x 3
a b c
<int> <dbl> <dbl>
1 1 2 1
2 2 4 1
3 3 6 1
4 4 8 1
5 5 10 1
I only could do it if I named foo <- tibble(a, b = a * 2, c = 1) and inside map I did select(foo, ...), but I wanted to avoid that, since I wanted to mutate the named dataframe in pipe workflow.
Thank you!
You can use map in the following way :
library(dplyr)
library(purrr)
tibble(a = 1:5, b = a * 2, c = 1) %>%
{map(names(.), function(.x) select(., 1:.x))}
Based on your actual use case you can also use imap which will pass column value (.x) along with it's name (.y).
tibble(a = 1:5, b = a * 2, c = 1) %>%
imap(function(.x, .y) select(., 1:.y))
#$a
# A tibble: 5 x 1
# a
# <int>
#1 1
#2 2
#3 3
#4 4
#5 5
#$b
# A tibble: 5 x 2
# a b
# <int> <dbl>
#1 1 2
#2 2 4
#3 3 6
#4 4 8
#5 5 10
#$c
# A tibble: 5 x 3
# a b c
# <int> <dbl> <dbl>
#1 1 2 1
#2 2 4 1
#3 3 6 1
#4 4 8 1
#5 5 10 1

Split information from two columns, R, tidyverse

i've got some data in two columns:
# A tibble: 16 x 2
code niveau
<chr> <dbl>
1 A 1
2 1 2
3 2 2
4 3 2
5 4 2
6 5 2
7 B 1
8 6 2
9 7 2
My desired output is:
A tibble: 16 x 3
code niveau cat
<chr> <dbl> <chr>
1 A 1 A
2 1 2 A
3 2 2 A
4 3 2 A
5 4 2 A
6 5 2 A
7 B 1 B
8 6 2 B
I there a tidy way to convert these data without looping through it?
Here some dummy data:
data<-tibble(code=c('A', 1,2,3,4,5,'B', 6,7,8,9,'C',10,11,12,13), niveau=c(1, 2,2,2,2,2,1,2,2,2,2,1,2,2,2,2))
desired_output<-tibble(code=c('A', 1,2,3,4,5,'B', 6,7,8,9,'C',10,11,12,13), niveau=c(1, 2,2,2,2,2,1,2,2,2,2,1,2,2,2,2),
cat=c(rep('A', 6),rep('B', 5), rep('C', 5)))
Nicolas
Probably, you can create a new column cat and replace code values with NA where there is a number. We can then use fill to replace missing values with previous non-NA value.
library(dplyr)
data %>% mutate(cat = replace(code, grepl('\\d', code), NA)) %>% tidyr::fill(cat)
# A tibble: 16 x 3
# code niveau cat
# <chr> <dbl> <chr>
# 1 A 1 A
# 2 1 2 A
# 3 2 2 A
# 4 3 2 A
# 5 4 2 A
# 6 5 2 A
# 7 B 1 B
# 8 6 2 B
# 9 7 2 B
#10 8 2 B
#11 9 2 B
#12 C 1 C
#13 10 2 C
#14 11 2 C
#15 12 2 C
#16 13 2 C
We can use str_detect from stringr
library(dplyr)
library(stringr)
library(tidyr)
data %>%
mutate(cat = replace(code, str_detect(code, '\\d'), NA)) %>%
fill(cat)

How to group_by(everything())

I want to count unique combinations in a dataframe using dplyr
I tried the following:
require(dplyr)
set.seed(314)
dat <- data.frame(a = sample(1:3, 100, replace = T),
b = sample(1:2, 100, replace = T),
c = sample(1:2, 100, replace = T))
dat %>% group_by(a,b,c) %>% summarise(n = n())
But to make this generic (unrelated to the names of the columns) I tried:
dat %>% group_by(everything()) %>% summarise(n = n())
Which results in:
a b c n
<int> <int> <int> <int>
1 1 1 1 6
2 1 1 2 8
3 1 2 1 13
4 1 2 2 8
5 2 1 1 7
6 2 1 2 12
7 2 2 1 14
8 2 2 2 10
9 3 1 1 3
10 3 1 2 4
11 3 2 1 7
12 3 2 2 8
Which gives the error
Error in mutate_impl(.data, dots) : `c(...)` must be a character vector
I fiddled around with different things but cannot get it to work. I know I could use names(dat) but the columns in the dataframe that need to be in the group_by() are depended on previous steps in the dplyr chain.
There is a function called group_by_all() (and in the same sense group_by_at and group_by_if )which does exactly that.
library(dplyr)
dat %>%
group_by_all() %>%
summarise(n = n())
which gives the same result,
# A tibble: 12 x 4
# Groups: a, b [?]
a b c n
<int> <int> <int> <int>
1 1 1 1 6
2 1 1 2 8
3 1 2 1 13
4 1 2 2 8
5 2 1 1 7
6 2 1 2 12
7 2 2 1 14
8 2 2 2 10
9 3 1 1 3
10 3 1 2 4
11 3 2 1 7
12 3 2 2 8
PS
packageVersion('dplyr')
#[1] ‘0.7.2’
We can use .dots
dat %>%
group_by(.dots = names(.)) %>%
summarise(n = n())
# A tibble: 12 x 4
# Groups: a, b [?]
# a b c n
# <int> <int> <int> <int>
#1 1 1 1 6
#2 1 1 2 8
#3 1 2 1 13
#4 1 2 2 8
#5 2 1 1 7
#6 2 1 2 12
#7 2 2 1 14
#8 2 2 2 10
#9 3 1 1 3
#10 3 1 2 4
#11 3 2 1 7
#12 3 2 2 8
Another option would be to use the unquote, sym approach
dat %>%
group_by(!!! rlang::syms(names(.))) %>%
summarise(n = n())
In dplyr version 1.0.0 and later, you would now use across().
library(dplyr)
dat %>%
group_by(across(everything())) %>%
summarise(n = n())
Package version:
> packageVersion("dplyr")
[1] ‘1.0.5’

Resources