Extract values from list of arbitrary depth - r

I have a messy, highly nested, list:
m <- list('form' = list('elements' = list('name' = 'Bob', 'code' = 12), 'name' = 'Mary', 'code' = 15))
> m
$form
$form$elements
$form$elements$name
[1] "Bob"
$form$elements$code
[1] 12
$form$name
[1] "Mary"
$form$code
[1] 15
How can I extract the name and code from the object m, regardless of how deeply nested name and code appear within the list?
Expected output:
# A tibble: 2 x 2
name code
<chr> <dbl>
1 Bob 12
2 Mary 15

1) rrapply Flatten m using rrapply, giving r, then group the name and code fields of r using tapply, drop the dimensions using c, convert to a data.frame and set the order of the columns.
Note that this is not hard coded to name and code and would work with other fields and numbers of fields.
library(rrapply)
r <- rrapply(m, f = c, how = "flatten")
nms <- names(r)
as.data.frame(c(tapply(unname(r), nms, unlist)))[unique(nms)]
giving:
name code
1 Bob 12
2 Mary 15
An alternative to the final two lines of code above would be:
out <- unstack(stack(r))
out[] <- lapply(out, type.convert, as.is = TRUE)
If there could be other fields in m in addition to name and code that we want ignored then use this in place of the statement that defines r above:
cond <- function(x, .xname) .xname %in% c("name", "code")
r <- rrapply(m, cond, c, how = "flatten")
2) Base R A base R solution is the following, which unlists m and then uses tapply as in (1), grouping by the suffixes of names(r). Like (1), this is a general approach that is not hard-coded to name and code. Note that tools ships with R, so this is still base R.
r <- unlist(m)
nms <- tools::file_ext(names(r))
as.data.frame(c(tapply(unname(r), nms, unlist)))[unique(nms)]
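One detail worth noting: unlist(m) coerces everything to character, so if the numeric code column of the expected output matters, add a type conversion step. A minimal sketch, an addition to the answer above, assuming type.convert with as.is = TRUE is acceptable:
out <- as.data.frame(c(tapply(unname(r), nms, unlist)))[unique(nms)]
out[] <- lapply(out, type.convert, as.is = TRUE)
out
#   name code
# 1  Bob   12
# 2 Mary   15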

This could help: format the list into a dataframe and then reshape it:
library(tidyverse)
#Process
y1 <- as.data.frame(lapply(m,unlist),stringsAsFactors = F)
y1$id <- rownames(y1)
rownames(y1)<-NULL
#Dplyr mutation
y1 %>% mutate(Var = ifelse(grepl('name', id), 'name',
                           ifelse(grepl('code', id), 'code', NA))) %>%
  select(-id) %>% group_by(Var) %>%
  mutate(i = 1:n()) %>% pivot_wider(names_from = Var, values_from = form) %>%
  select(-i) %>% mutate(code = as.numeric(code))
Output:
# A tibble: 2 x 2
name code
<chr> <dbl>
1 Bob 12
2 Mary 15


Use input of a function as variable name in R

I have a simple function in R to modify a dataframe
monthly_fun <- function(x){
x %>%
mutate(obstime = convert_dates(obstime)) %>%
select(obstime, x = obsvalue)
}
When applying the function to a dataframe df, i.e. monthly_fun(df), I would like df to become the name of obsvalue. In my current code the name is obviously "x"; how can I modify the select part to use the name of the supplied dataframe as the variable name instead?
Thanks a lot
EDIT: I want to apply this function to several dataframes using
result <- list( df1, df2, df3) %>%
lapply( monthly_fun )
You could extract the name of the input with deparse(substitute(x)) and use !!y := obsvalue in mutate().
monthly_fun <- function(x) {
y <- deparse(substitute(x))
x %>%
mutate(obstime = convert_dates(obstime),
!!y := obsvalue) %>%
select(obstime, y)
}
A simplified example:
fun <- function(x) {
y <- deparse(substitute(x))
x %>%
mutate(!!y := 1) %>%
select(y)
}
fun(df)
# df
# 1 1
# 2 1
# 3 1
# 4 1
# 5 1
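A side note, not part of the original answer: recent dplyr versions may print a deprecation note when select() is given an external string variable such as y; wrapping it in all_of() is the tidyselect-recommended form and behaves the same, e.g.
fun <- function(x) {
  y <- deparse(substitute(x))
  x %>%
    mutate(!!y := 1) %>%
    select(all_of(y))
}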
Update
If you want to apply it to several data frames stored in a list, you should design a 2-argument function, one argument for data and the other for new column names. Then use Map() to apply this function over each pair of data and names.
fun <- function(x, y) {
x %>%
mutate(!!y := 1) %>%
select(y)
}
Map(fun, list(df1, df2), c("name1", "name2"))
# [[1]]
# name1
# 1 1
# 2 1
# 3 1
# 4 1
# 5 1
#
# [[2]]
# name2
# 1 1
# 2 1
# 3 1
# 4 1
# 5 1
If you're familiar with purrr, the use of Map() can be replaced with map2() or imap(). (Notice the difference in inputs between the two functions.)
library(purrr)
# (1) map2(): Input data and names separately
map2(list(df1, df2), c("name1", "name2"), fun)
# (2) imap(): Input a named list
imap(list(name1 = df1, name2 = df2), fun)
Following the suggestion by Julien: create a variable with deparse(substitute(df)) and rename using it.
monthly_fun <- function(x) {
y = deparse(substitute(x))
x <- x %>%
mutate(obstime = obstime*5) %>%
select(obstime, obsvalue)
names(x)[names(x) == "obsvalue"] <- y
return(x)
}
see this site for more naming methods.
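One caveat worth flagging for the EDIT in the question (an observation of my own, not part of the answers above): deparse(substitute(x)) cannot recover the original object names when the function is called through lapply(), because lapply() passes the expression X[[i]] rather than df1, df2, ... A minimal sketch of the pitfall:
f <- function(x) deparse(substitute(x))
lapply(list(mtcars), f)
# [[1]]
# [1] "X[[i]]"
Passing the names explicitly, as in the Map()/map2()/imap() answer above, avoids this.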

Filter data.frame with another data.frame using select(contains)

I have 2 dataframes like the following:
df1
colA
A
B
C
D
df2
one two
x A
y A;B
z A;D;C
p E
q F
I want to filter df2 for entries contained in df1, i.e. rows where "two" contains values of colA, so that my output will be
one two
x A
y A;B
z A;D;C
I tried all these options, but none of them worked:
df2filtered = df2 %>% filter(two %in% df1$colA)
df2filtered = df2 %>% filter(two %in% str_detect(df1$colA))
df2filtered = df2 %>% select(two, contains(df1$colA))
str_detect works with a character value, but not when used with the data frames as above. What is the right solution?
Here's one way of obtaining the desired output, using map to create an extra column and then applying the filter.
library(tidyverse)
df2 %>%
# Use map to check if any string in df1$colA is found in
# df2$two; then use any to check if any entry is T
mutate(stay = map(two, function(x){
any(str_detect(x,df1$colA))
})) %>%
# Filter
filter(stay == T) %>%
# Remove extra column
select(-c(stay))
# one two
#1 x A
#2 y A;B
#3 z A;D;C
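A small variation of the same idea (my suggestion, not part of the original answer): map_lgl() returns a logical vector directly, so the helper column and the == T comparison can be dropped:
df2 %>%
  filter(map_lgl(two, ~ any(str_detect(.x, df1$colA))))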
Your data is not "tidy". I'd reshape it into a long format. Then, filtering becomes easy.
Below is an approach which makes use of a non-exported function from the eye package in order to split the column into an unknown number of columns (disclaimer: I am the author of this package; the function was inspired by and modified from this answer). Then pivot the result longer and filter by the presence in df1$colA. I'd leave the result in a tidy format, but you can of course reshape it back to your rather messy shape.
library(tidyverse)
df1 <- read.table(text = "colA
A
B
C
D", header = TRUE)
df2 <- read.table(text = "one two
x A
y A;B
z A;D;C
p E
q F ", header = TRUE)
#install.packages("eye")
eye:::split_mult(df2, "two", pattern = ";" ) %>%
pivot_longer(cols = starts_with("var"), names_to = "var", values_to = "val") %>%
drop_na(val)%>%
select(-var) %>%
group_by(one) %>%
filter(any(val %in% df1$colA))
#> # A tibble: 6 x 2
#> # Groups: one [3]
#> one val
#> <chr> <chr>
#> 1 x A
#> 2 y A
#> 3 y B
#> 4 z A
#> 5 z D
#> 6 z C
Created on 2021-07-14 by the reprex package (v2.0.0)
Because this function might change in the future, here it is for future reference:
split_mult <- function (x, col, pattern = "_", into = NULL, prefix = "var",
sep = "")
{
cols <- stringr::str_split_fixed(x[[col]], pattern, n = Inf)
cols[which(cols == "")] <- NA_character_
m <- dim(cols)[2]
if (length(into) == m) {
colnames(cols) <- into
}
else {
colnames(cols) <- paste(prefix, 1:m, sep = sep)
}
cbind(cols, x[names(x) != col])
}
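If relying on a non-exported helper is a concern, tidyr::separate_rows() covers the same splitting step; a sketch over the same data (my suggestion, equivalent in result for this example):
library(tidyverse)
df2 %>%
  separate_rows(two, sep = ";") %>%
  group_by(one) %>%
  filter(any(two %in% df1$colA))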
Another option using str_detect: you can collapse df1$colA so that str_detect searches for A or B or C or D, i.e. the pattern "A|B|C|D".
library(tidyverse)
df2 %>% filter(str_detect(two, paste(df1$colA, collapse = '|')))
#> one two
#> 1 x A
#> 2 y A;B
#> 3 z A;D;C
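One caveat, an assumption about real data rather than an issue with this example: the collapsed pattern matches substrings, so a colA value of "A" would also match an entry such as "AB". Anchoring the alternatives with word boundaries avoids that:
df2 %>% filter(str_detect(two, paste0("\\b(", paste(df1$colA, collapse = "|"), ")\\b")))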

extract identically named vectors from nested lists, where the list names vary? Using purrr?

I have to work with some data that is in recursive lists like this (simplified reproducible example below):
groups
#> $group1
#> $group1$countries
#> [1] "USA" "JPN"
#>
#>
#> $group2
#> $group2$countries
#> [1] "AUS" "GBR"
Code for data input below:
chars <- c("USA", "JPN")
chars2 <- c("AUS", "GBR")
group1 <- list(countries = chars)
group2 <- list(countries = chars2)
groups <- list(group1 = group1, group2 = group2)
groups
I'm trying to work out how to extract the vectors that are in the lists, without manually having to write a line of code for each group. The code below works, but my example has a large number of groups (and the number of groups will change), so it would be great to work out how to extract all of the vectors in a more efficient manner. This is the brute force way, that works:
countries1 <- groups$group1$countries
countries2 <- groups$group2$countries
In the example, the bottom level vector I'm trying to extract is always called countries, but the lists they're contained in change name, varying only by numbering.
Would there be an easy purrr solution? Or tidyverse solution? Or other solution?
Add some additional cases to your list
groups[["group3"]] <- list()
groups[["group4"]] <- list(foo = letters[1:2])
groups[["group5"]] <- list(foo = letters[1:2], countries = LETTERS[1:2])
Here's a function that maps any list to just the element named "countries"; it returns NULL if there is no such element
fun = function(x)
x[["countries"]]
Map your original list to contain just the elements you're interested in
interesting <- Map(fun, groups)
Then transform these into a data.frame using a combination of unlist() and rep()
df <- data.frame(
country = unlist(interesting, use.names = FALSE),
name = rep(names(interesting), lengths(interesting))
)
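With the extended groups list above, df should then contain (a quick sanity check of the unlist()/rep() step):
df
#> country name
#> 1 USA group1
#> 2 JPN group1
#> 3 AUS group2
#> 4 GBR group2
#> 5 A group5
#> 6 B group5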
Alternatively, use tidy syntax, e.g.,
interesting %>%
tibble(group = names(.), value = .) %>%
unnest("value")
The output is
# A tibble: 6 x 2
group value
<chr> <chr>
1 group1 USA
2 group1 JPN
3 group2 AUS
4 group2 GBR
5 group5 A
6 group5 B
If there are additional problems parsing individual elements of groups, then modify fun, e.g.,
fun = function(x)
as.character(x[["countries"]])
This will put the output in a list which will handle any number of groups
countries <- unlist(groups, recursive = FALSE)
names(countries) <- sub("^\\w+(\\d+)\\.(\\w+)", "\\2\\1", names(countries), perl = TRUE)
> countries
$countries1
[1] "USA" "JPN"
$countries2
[1] "AUS" "GBR"
You can simply transform your nested list to a data.frame and then unnest the country column.
library(dplyr)
library(tidyr)
groups %>%
tibble(group = names(groups),
country = .) %>%
unnest(country) %>%
unnest(country)
#> # A tibble: 4 x 2
#> group country
#> <chr> <chr>
#> 1 group1 USA
#> 2 group1 JPN
#> 3 group2 AUS
#> 4 group2 GBR
Created on 2020-01-15 by the reprex package (v0.3.0)
Since the countries are hidden 2 layers deep, you have to run unnest twice. Otherwise I think this is straightforward.
If you actually want to have each vector as an object in your global environment, a combination of purrr::map2()/walk() and list2env() will work. In order to make this work, we have to give the country entries in the list individual names first; otherwise list2env() just overwrites the same object over and over again.
library(purrr)
groups <-
map2(groups, 1:length(groups), ~setNames(.x, paste0(names(.x), .y)))
walk(groups, ~list2env(. , envir = .GlobalEnv))
This would create the exact same results you are describing in your question. I am not sure though, if it is the best solution for a smooth workflow, since I don't know where you are going with this.
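With the original two-group list from the question, the global environment should then contain countries1 and countries2 (a quick check):
countries1
#> [1] "USA" "JPN"
countries2
#> [1] "AUS" "GBR"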

Applying pmap list arguments to a function nested within another function

I need to perform some rowwise operations with the help of pmap variants, but I can't do so when it comes to passing the list of arguments (that is, the .l argument) to a function that's nested within another function.
I've tried various things, including passing the names of the arguments and the dot-dot-dot (...) syntax, all to no avail. I need to know if there's a way to do this, as I need to extend this to more complicated functions.
Let's say I have the following data frame and that I would like to paste the first two columns from each row. I can easily do that with the following code:
dff <- data_frame(
first = c("A", "B"),
second = c("X", "Y"),
third = c("L", "M")
)
df_easy <- dff %>%
mutate(joined_upper = pmap_chr(list(first, second), function(...) paste(..., sep = "&")))
df_easy
#> # A tibble: 2 x 4
#> first second third joined_upper
#> <chr> <chr> <chr> <chr>
#> 1 A X L A&X
#> 2 B Y M B&Y
However, when I try to expand this so that I can lower-case the first of the two columns prior to merging them, my attempt fails. I would like to see if I can get dff3.
# df_hard <- dff %>%
# mutate(joined_smaller = pmap_chr(list(first, second), function(...) paste(tolower(...), sep = "&")))
dff3 <- data.frame(
first = c("A", "B"),
second = c("X", "Y"),
third = c("L", "M"),
joined_smaller = c("a&X", "b&Y")
)
dff3
#> first second third joined_smaller
#> 1 A X L a&X
#> 2 B Y M b&Y
Here is one option. Note that paste and str_c are vectorized, i.e. they already operate on whole columns, so pmap is not strictly needed:
library(dplyr)
library(stringr)
dff %>%
mutate(joined_sma = str_c(tolower(first), second, sep="&"))
and assuming that this is an exercise just for pmap
library(purrr)
dff %>%
mutate(joined_sma = pmap_chr(list(first, second), ~ c(...) %>%
{str_c(tolower(first(.)), .[-1], sep="&")}
))
# A tibble: 2 x 4
# first second third joined_sma
# <chr> <chr> <chr> <chr>
#1 A X L a&X
#2 B Y M b&Y
Also, as there are only two columns, we can use the .x/.y convention to refer to them:
dff %>%
mutate(joined_sma = pmap_chr(list(first, second), ~
str_c(tolower(.x), .y, sep="&")
))
NOTE: Here, we are using str_c instead of paste as the two can behave differently when there are missing values (NA).
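A quick illustration of that NA difference (a sketch):
library(stringr)
paste("a", NA, sep = "&")
#> [1] "a&NA"
str_c("a", NA, sep = "&")
#> [1] NA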

Create R function using dplyr::filter problem

I've looked at other answers but cannot find a solution that makes the code below work. Basically, I'm creating a function that inner_joins the two data frames and filters based on a column passed to the function.
The problem is that the filter part of the function doesn't work. However, it works if I take filter out of the function and append it afterwards, like mydiff("a") %>% filter(a.x != a.y)
Any suggestion is helpful.
Note that I am passing the function input in quotes.
library(dplyr)
# fake data
df1<- tibble(id = seq(4,19,2),
a = c("a","b","c","d","e","f","g","h"),
b = c(rep("foo",3), rep("bar",5)))
df2<- tibble(id = seq(10, 20, 1),
a = c("d","a", "e","f","k","m","g","i","h", "a", "b"),
b = c(rep("bar", 7), rep("foo",4)))
# What I am trying to do
dplyr::inner_join(df1, df2, by = "id") %>% select(id, b.x, b.y) %>% filter(b.x!=b.y)
#> # A tibble: 1 x 3
#> id b.x b.y
#> <dbl> <chr> <chr>
#> 1 18 bar foo
# creating a function so that I can filter by difference in column if I have more columns
mydiff <- function(filteron, df_1 = df1, df_2 = df2){
require(dplyr, warn.conflicts = F)
col_1 = paste0(quo_name(filteron), "x")
col_2 = paste0(quo_name(filteron), "y")
my_df<- inner_join(df_1, df_2, by = "id", suffix = c("x", "y"))
my_df %>% select(id, col_1, col_2) %>% filter(col_1 != col_2)
}
# the filter part is not working as expected:
# there is no difference whether I pipe filter or leave it out
mydiff("a")
#> # A tibble: 5 x 3
#> id ax ay
#> <dbl> <chr> <chr>
#> 1 10 d d
#> 2 12 e e
#> 3 14 f k
#> 4 16 g g
#> 5 18 h h
The reason it did not work in your original function is that col_1 was a string, but dplyr::filter() expects an "unquoted" variable on the LHS. Thus, you need to first convert col_1 to a symbol using sym() and then unquote it inside filter using !! (bang-bang).
rlang has a really nice function, qq_show(), to show what actually happens with quoting/unquoting (see the output below).
See also this similar question
library(rlang)
library(dplyr)
# creating a function that can take either string or symbol as input
mydiff <- function(filteron, df_1 = df1, df_2 = df2) {
col_1 <- paste0(quo_name(enquo(filteron)), "x")
col_2 <- paste0(quo_name(enquo(filteron)), "y")
my_df <- inner_join(df_1, df_2, by = "id", suffix = c("x", "y"))
cat('\nwithout sym and unquote\n')
qq_show(col_1 != col_2)
cat('\nwith sym and unquote\n')
qq_show(!!sym(col_1) != !!sym(col_2))
cat('\n')
my_df %>%
select(id, col_1, col_2) %>%
filter(!!sym(col_1) != !!sym(col_2))
}
### testing: filteron as a string
mydiff("a")
#>
#> without sym and unquote
#> col_1 != col_2
#>
#> with sym and unquote
#> ax != ay
#>
#> # A tibble: 1 x 3
#> id ax ay
#> <dbl> <chr> <chr>
#> 1 14 f k
### testing: filteron as a symbol
mydiff(a)
#>
#> without sym and unquote
#> col_1 != col_2
#>
#> with sym and unquote
#> ax != ay
#>
#> # A tibble: 1 x 3
#> id ax ay
#> <dbl> <chr> <chr>
#> 1 14 f k
Created on 2018-09-28 by the reprex package (v0.2.1.9000)
From https://dplyr.tidyverse.org/articles/programming.html
Most dplyr functions use non-standard evaluation (NSE). This is a catch-all term that means they don't follow the usual R rules of evaluation.
This can sometimes create a few issues when attempting to wrap them in functions.
Here is a base version of the function you created.
mydiff<- function(filteron, df_1=df1, df_2 = df2){
col_1 = paste0(filteron,"x")
col_2 = paste0(filteron, "y")
my_df <- merge(df_1, df_2, by = "id", suffixes = c("x","y"))
my_df[my_df[, col_1] != my_df[, col_2], c("id", col_1, col_2)]
}
> mydiff("a")
id ax ay
3 14 f k
> mydiff("b")
id bx by
5 18 bar foo
This will solve your problem and will likely work as one expects, now and in the future. With fewer dependencies on outside packages, you reduce these kinds of issues and other quirks that may develop as the package authors evolve their work.
Seems an evaluation issue to me. Try this modified mydiff function, using the lazyeval package:
mydiff <- function(filteron, df_1 = df1, df_2 = df2){
require(dplyr, warn.conflicts = F)
col_1 <- paste0(quo_name(filteron), "x")
col_2 <- paste0(quo_name(filteron), "y")
criteria <- lazyeval::interp(~ x != y, .values = list(x = as.name(col_1), y = as.name(col_2)))
my_df <- inner_join(df_1, df_2, by = "id", suffix = c("x", "y"))
my_df %>% select(id, col_1, col_2) %>% filter_(criteria)
}
You can take a look at the Functions chapter from Hadley Wickham’s book Advanced R for more on that.
The advice of using base R for simple functions is good, however it does not scale to more complex tidyverse functions and you lose the portability to dplyr backends like databases. If you want to create functions around tidyverse pipelines, you'll have to learn a bit about R expressions and the unquoting operator !!. I recommend skimming over the first sections of https://tidyeval.tidyverse.org to get a rough idea of the concepts used here.
Since the function you'd like to create takes a bare column name and does not involve complex expressions (like you would pass to mutate() or summarise()), we don't need fancy stuff like quosures. We can work with symbols. To create a symbol, use as.name() or rlang::sym().
as.name("mycolumn")
#> mycolumn
rlang::sym("mycolumn")
#> mycolumn
The latter has the advantage of being part of a larger family of functions: ensym(), and the plural variants syms() and ensyms(). We are going to use ensym() to capture a column name, i.e. delay the execution of the column in order to pass it to dplyr after a few transformations. Delaying the execution is called "quoting".
I have made a few changes to the interface of your function:
Take the data frames first for consistency with dplyr functions
Don't provide defaults for the data frames. These defaults are making too many assumptions.
Make by and suffix user-configurable, with reasonable defaults.
Here is the code, with explanations inline:
mydiff <- function(df1, df2, var, by = "id", suffix = c(".x", ".y")) {
stopifnot(is.character(suffix), length(suffix) == 2)
# Let's start by the easy task, joining the data frames
df <- dplyr::inner_join(df1, df2, by = by, suffix = suffix)
# Now onto dealing with the diff variable. `ensym()` takes a column
# name and delays its execution:
var <- rlang::ensym(var)
# A delayed column name is not a string, it's a symbol. So we need
# to transform it to a string in order to work with paste() etc.
# `quo_name()` works in this case but is generally only for
# providing default names.
#
# Better use base::as.character() or rlang::as_string() (the latter
# works a bit better on Windows with foreign UTF-8 characters):
var_string <- rlang::as_string(var)
# Now let's add the suffix to the name:
col1_string <- paste0(var_string, suffix[[1]])
col2_string <- paste0(var_string, suffix[[2]])
# dplyr::select() supports column names as strings but it is an
# exception in the dplyr API. Generally, dplyr functions take bare
# column names, i.e. symbols. So let's transform the strings back to
# symbols:
col1 <- rlang::sym(col1_string)
col2 <- rlang::sym(col2_string)
# The delayed column names now need to be inserted back into the
# dplyr code. This is accomplished by unquoting with the !!
# operator:
df %>%
dplyr::select(id, !!col1, !!col2) %>%
dplyr::filter(!!col1 != !!col2)
}
mydiff(df1, df2, b)
#> # A tibble: 1 x 3
#> id b.x b.y
#> <dbl> <chr> <chr>
#> 1 18 bar foo
mydiff(df1, df2, "a")
#> # A tibble: 1 x 3
#> id a.x a.y
#> <dbl> <chr> <chr>
#> 1 14 f k
You can also simplify the function by taking strings instead of bare column names. In this version, I'll use syms() to create a list of symbols, and !!! to pass it all at once to select():
mydiff2 <- function(df1, df2, var, by = "id", suffix = c(".x", ".y")) {
stopifnot(
is.character(suffix), length(suffix) == 2,
is.character(var), length(var) == 1
)
# Create a list of symbols from a character vector:
cols <- rlang::syms(paste0(var, suffix))
df <- dplyr::inner_join(df1, df2, by = by, suffix = suffix)
# Unquote the whole list as once with the big bang !!!
df %>%
dplyr::select(id, !!!cols) %>%
dplyr::filter(!!cols[[1]] != !!cols[[2]])
}
mydiff2(df1, df2, "a")
#> # A tibble: 1 x 3
#> id a.x a.y
#> <dbl> <chr> <chr>
#> 1 14 f k
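For completeness, here is a sketch of a further simplification (my addition; the name mydiff3 is hypothetical): when the column names are already strings, the .data pronoun lets filter() index the data mask directly, so no symbol conversion or unquoting is needed.
library(dplyr)
mydiff3 <- function(df1, df2, var, by = "id", suffix = c(".x", ".y")) {
  cols <- paste0(var, suffix)
  # join, keep id plus the two suffixed columns, then compare them by name
  inner_join(df1, df2, by = by, suffix = suffix) %>%
    select(id, all_of(cols)) %>%
    filter(.data[[cols[[1]]]] != .data[[cols[[2]]]])
}
mydiff3(df1, df2, "a")
#> # A tibble: 1 x 3
#> id a.x a.y
#> <dbl> <chr> <chr>
#> 1 14 f k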
Finding the indices where col_1 != col_2 first might be enough for this problem.
mydiff <- function(filteron, df_1 = df1, df_2 = df2){
require(dplyr, warn.conflicts = F)
col_1 <- paste0(quo_name(filteron), "x")
col_2 <- paste0(quo_name(filteron), "y")
my_df <-
inner_join(df_1, df_2, by = "id", suffix = c("x", "y")) %>%
select(id, col_1, col_2)
# find the rows where the two columns differ ([[ ]] extracts plain vectors, which also works on tibbles)
differs <- my_df[[col_1]] != my_df[[col_2]]
# return those rows
my_df[differs, ]
}
mydiff("a")
#> # A tibble: 1 x 3
#> id ax ay
#> <dbl> <chr> <chr>
#> 1 14 f k
