Mutate All columns in a list of tibbles - r

Let's suppose I have the following list of tibbles:
a_list_of_tibbles <- list(
a = tibble(a = rnorm(10)),
b = tibble(a = runif(10)),
c = tibble(a = letters[1:10])
)
Now I want to map them all into a single dataframe/tibble, which is not possible due to the differing column types.
How would I go about this?
I have tried this, but I want to get rid of the for loop
# Convert every column of every tibble to character so the tibbles can later
# be row-bound. seq_along() is used instead of 1:length() so that an empty
# list is skipped instead of iterating over c(1, 0).
for (i in seq_along(a_list_of_tibbles)) {
  a_list_of_tibbles[[i]] <- a_list_of_tibbles[[i]] %>%
    mutate_all(as.character)
}
Then I run:
map_dfr(.x = a_list_of_tibbles, .f = as_tibble)

We could do the conversion inside the map call itself - use across() instead of the _all suffixed verbs (which are superseded) to apply the function to every column of each tibble
library(dplyr)
library(purrr)
# For each tibble, coerce all columns to character, then row-bind the results.
# The conversion is complete after across(); piping its result into
# as_tibble inside mutate() had no effect on the output, so it is dropped.
map_dfr(
  a_list_of_tibbles,
  ~ .x %>%
    mutate(across(everything(), as.character))
)
-output
# A tibble: 30 × 1
a
<chr>
1 0.735200825884485
2 1.4741501589461
3 1.39870958697574
4 -0.36046362308853
5 -0.893860999301402
6 -0.565468636033674
7 -0.075270267983768
8 2.33534260196058
9 0.69667906338348
10 1.54213170143702
# … with 20 more rows

Another alternative is to use:
library(tidyverse)
# Depth 2 reaches the individual columns of each tibble: coerce each one to
# character, then stack the resulting list elements into a single tibble.
a_list_of_tibbles %>%
  map_depth(2, as.character) %>%
  bind_rows()
#> # A tibble: 30 × 1
#> a
#> <chr>
#> 1 0.0894618169853206
#> 2 -1.50144637645091
#> 3 1.44795821718513
#> 4 0.0795342912030257
#> 5 -0.837985570593029
#> 6 -0.050845557103668
#> 7 0.031194556366589
#> 8 0.0989551909839589
#> 9 1.87007290229274
#> 10 0.67816212007413
#> # … with 20 more rows
Created on 2021-12-20 by the reprex package (v2.0.1)

Related

Accessing variable name in for loop in R?

I am trying to run a for loop where I randomly subsample a dataset using the sample_n command. I also want to name each new subsampled dataframe "df1", "df2", "df3", where the numbers correspond to i in the for loop. I know the way I wrote this code is wrong, and I understand why I am getting the error. How can I combine "df" and i in the for loop so that it reads as df1, df2, etc.? Happy to clarify if needed. Thanks!
for (i in 1:9){ print(get(paste("df", i, sep=""))) = sub %>%
group_by(dietAandB) %>%
sample_n(1) }
Error in print(get(paste("df", i, sep = ""))) = sub %>% group_by(dietAandB) %>% :
target of assignment expands to non-language object
Instead of using get you could use assign.
Using some fake example data:
library(dplyr, warn=FALSE)
sub <- data.frame(
dietAandB = LETTERS[1:2]
)
# Create the objects df1 and df2, each holding one randomly sampled row per
# diet group (ungrouped afterwards).
for (i in 1:2) {
  assign(
    paste0("df", i),
    sub %>%
      group_by(dietAandB) %>%
      sample_n(1) %>%
      ungroup()
  )
}
df1
#> # A tibble: 2 × 1
#> dietAandB
#> <chr>
#> 1 A
#> 2 B
df2
#> # A tibble: 2 × 1
#> dietAandB
#> <chr>
#> 1 A
#> 2 B
But the more R-ish way to do this would be to use a list instead of creating single objects:
# Pre-allocate the result list, then fill one sampled data frame per slot
# instead of growing the list inside the loop.
df <- vector("list", 2)
for (i in seq_len(2)) {
  # One randomly chosen row per diet group, ungrouped afterwards.
  df[[i]] <- sub %>%
    group_by(dietAandB) %>%
    sample_n(1) %>%
    ungroup()
}
df
#> [[1]]
#> # A tibble: 2 × 1
#> dietAandB
#> <chr>
#> 1 A
#> 2 B
#>
#> [[2]]
#> # A tibble: 2 × 1
#> dietAandB
#> <chr>
#> 1 A
#> 2 B
Or more concise to use lapply instead of a for loop
# Build the list of subsamples in one call; the index passed to the function
# is not used, it only controls how many subsamples are drawn.
df <- lapply(
  1:2,
  function(x) {
    sub %>%
      group_by(dietAandB) %>%
      sample_n(1) %>%
      ungroup()
  }
)
df
#> [[1]]
#> # A tibble: 2 × 1
#> dietAandB
#> <chr>
#> 1 A
#> 2 B
#>
#> [[2]]
#> # A tibble: 2 × 1
#> dietAandB
#> <chr>
#> 1 A
#> 2 B
It depends on the sample size, which is missing from your question. So, as an example, I used the mtcars dataset (32 rows) and drew three subsamples of size 20 from the data:
library(dplyr)
# Create df1, df2 and df3, each a random sample of 20 rows of mtcars.
for (i in 1:3) {
  assign(
    x = paste0("df", i),
    value = sample_n(mtcars, 20)
  )
}

counter for unique values found along the vector

first of all let me say that I have searched a lot for this basic question, but none of the answers found seems to do the job. If this specific question has already an answer, please excuse me.
I want to count the occurrence of behaviours in my data.
mydata <- data.frame(BH=c(
"sniff","explore","walking","explore","walking","trotting","sniff","explore","trotting","trotting","walking","walking","walking","watch","walking","trotting","watch","walking","walking","walking"))
and the output has to be like this
myoutput <- data.frame(
BH=c(
"sniff","explore","walking","explore","walking","trotting","sniff","explore","trotting","trotting","walking","walking","walking","watch","walking","trotting","watch","walking","walking","walking"),
mycount=c(
1,2,3,3,3,4,4,4,4,4,4,4,4,
5,5,5,5,5,5,5))
I have experimented using ave and n_distinct from dplyr package, but I only get the count of a given behaviour, not the cumulative count.
Any help or hint on how to solve this problem would be appreciated.
Stef
This is easy with a group-by operation and cumsum. I like using package data.table.
library(data.table)
# Convert mydata to a data.table by reference.
setDT(mydata)
# Within each behaviour, flag its first row with 1 and every later row with 0.
mydata[, mycount := c(1, rep(0, .N - 1)), by = BH] # first occurrences
# The running total of those flags is the number of distinct behaviours
# seen up to and including each row.
mydata[, mycount := cumsum(mycount)]
# Convert back to a data.frame and compare with the expected output.
all.equal(setDF(mydata), myoutput)
#[1] TRUE
Here is a solution with tidyverse - not as concise as Roland's solution, but it works.
library(tidyverse)
# Tag every observation with its row position so it can be re-identified
# after the reshaping steps below.
x <- mydata |>
mutate(rn = row_number())
x |>
# Give every behaviour a numeric group id.
group_by(BH) |>
mutate(id = cur_group_id()) |>
ungroup() |>
# One column per behaviour: the id on the rows where that behaviour occurs,
# 0 everywhere else.
pivot_wider(names_from = BH,
values_from = id,
values_fill = 0) |>
# For each behaviour column, the *_temp column is TRUE from the first row on
# which the behaviour occurs; summing those flags across a row gives the
# number of distinct behaviours seen so far.
mutate(across(
sniff:watch, ~ cumsum(.x) > 0, .names = "{.col}_temp"),
mycount = rowSums(across(ends_with('_temp')))
) |>
# Drop the helper *_temp columns.
dplyr::select(c(rn:watch, mycount)) |>
# Bring the original BH label back via the row position.
right_join(x, by = 'rn') |>
# Back to long form; the non-zero value marks the behaviour actually
# observed on that row, so filtering on it leaves one row per observation.
pivot_longer(-c(rn, mycount, BH)) |>
filter(value !=0) |>
dplyr::select(BH, mycount)
#> # A tibble: 20 × 2
#> BH mycount
#> <chr> <dbl>
#> 1 sniff 1
#> 2 explore 2
#> 3 walking 3
#> 4 explore 3
#> 5 walking 3
#> 6 trotting 4
#> 7 sniff 4
#> 8 explore 4
#> 9 trotting 4
#> 10 trotting 4
#> 11 walking 4
#> 12 walking 4
#> 13 walking 4
#> 14 watch 5
#> 15 walking 5
#> 16 trotting 5
#> 17 watch 5
#> 18 walking 5
#> 19 walking 5
#> 20 walking 5

Continuing a sequence into NAs using dplyr

I am trying to figure out a dplyr specific way of continuing a sequence of numbers when there are NAs in that column.
For example I have this dataframe:
library(tibble)
dat <- tribble(
~x, ~group,
1, "A",
2, "A",
NA_real_, "A",
NA_real_, "A",
1, "B",
NA_real_, "B",
3, "B"
)
dat
#> # A tibble: 7 × 2
#> x group
#> <dbl> <chr>
#> 1 1 A
#> 2 2 A
#> 3 NA A
#> 4 NA A
#> 5 1 B
#> 6 NA B
#> 7 3 B
I would like this one:
#> # A tibble: 7 × 2
#> x group
#> <dbl> <chr>
#> 1 1 A
#> 2 2 A
#> 3 3 A
#> 4 4 A
#> 5 1 B
#> 6 2 B
#> 7 3 B
When I try this I get a warning which makes me think I am probably approaching this incorrectly:
library(dplyr)
dat %>%
group_by(group) %>%
mutate(n = n()) %>%
mutate(new_seq = seq_len(n))
#> Warning in seq_len(n): first element used of 'length.out' argument
#> Warning in seq_len(n): first element used of 'length.out' argument
#> # A tibble: 7 × 4
#> # Groups: group [2]
#> x group n new_seq
#> <dbl> <chr> <int> <int>
#> 1 1 A 4 1
#> 2 2 A 4 2
#> 3 NA A 4 3
#> 4 NA A 4 4
#> 5 1 B 3 1
#> 6 NA B 3 2
#> 7 3 B 3 3
It's easier if you do it in one go. Your approach is not 'wrong', it is just that seq_len needs one integer, and you are giving a vector (n), so seq_len corrects it by using the first value.
dat %>%
group_by(group) %>%
mutate(x = seq_len(n()))
Note that row_number might be even easier here:
dat %>%
group_by(group) %>%
mutate(x = row_number())
We could use rowid directly if the intention is to create a sequence and group size is just intermediate column
library(data.table)
library(dplyr)
dat %>%
mutate(new_seq = rowid(group))
The issue with using the column after it is created is that n is no longer a single value but a vector with one entry per row, as shown in @Maël's post. If we need to do that, wrap it in first(), because seq_len is not vectorized and only needs a single length here
dat %>%
group_by(group) %>%
mutate(n = n()) %>%
mutate(new_seq = seq_len(first(n)))
A base R option using ave (which works in a similar way to group_by in dplyr)
> transform(dat, x = ave(x, group, FUN = seq_along))
x group
1 1 A
2 2 A
3 3 A
4 4 A
5 1 B
6 2 B
7 3 B

Unexpected dplyr::bind_rows() behavior

Short Version:
I'm encountering an error with dplyr::bind_rows() which I don't understand. I want to split my data based on some condition (e.g. a == 1), operate on one part (e.g. b = b * 10), and bind it back to the other part using dplyr::bind_rows() in a single pipe chain. It works fine if I provide the first input to the two parts explicitly, but if instead I pipe them in with . it complains about the data type of argument 2.
Here's a MRE of the issue:
library(tidyverse)
# sim data
d <- tibble(a = 1:4, b = 1:4)
# works when 'd' is supplied directly to bind_rows()
bind_rows(d %>% filter(a == 1),
d %>% filter(!a == 1) %>% mutate(b = b * 10))
#> # A tibble: 4 x 2
#> a b
#> <int> <dbl>
#> 1 1 1
#> 2 2 20
#> 3 3 30
#> 4 4 40
# fails when 'd' is piped in to bind_rows()
d %>%
bind_rows(. %>% filter(a == 1),
. %>% filter(!a == 1) %>% mutate(b = b * 10))
#> Error: Argument 2 must be a data frame or a named atomic vector.
Long Version:
If I capture what the bind_rows() call is getting as input as a list() instead, I can see that two unexpected (to me) things are happening.
Instead of evaluating the pipe chains I provided, it seems to just capture them as a functional sequence.
I can see that the input (.) is invisibly being provided in addition to the two explicit arguments, so I get 3 items instead of 2 in the list.
# capture intermediate values for diagnostics
d %>%
list(. %>% filter(a == 1),
. %>% filter(!a == 1) %>% mutate(b = b * 10))
#> [[1]]
#> # A tibble: 4 x 2
#> a b
#> <int> <int>
#> 1 1 1
#> 2 2 2
#> 3 3 3
#> 4 4 4
#>
#> [[2]]
#> Functional sequence with the following components:
#>
#> 1. filter(., a == 1)
#>
#> Use 'functions' to extract the individual functions.
#>
#> [[3]]
#> Functional sequence with the following components:
#>
#> 1. filter(., !a == 1)
#> 2. mutate(., b = b * 10)
#>
#> Use 'functions' to extract the individual functions.
This leads me to the following inelegant solution where I solve the first problem by piping to the inner function which seems to force evaluation correctly (for reasons I don't understand) and then solve the second problem by subsetting the list prior to performing the bind_rows() operation.
# hack solution to force eval and clean duplicated input
d %>%
list(filter(., a == 1),
filter(., !a == 1) %>% mutate(b = b * 10)) %>%
.[-1] %>%
bind_rows()
#> # A tibble: 4 x 2
#> a b
#> <int> <dbl>
#> 1 1 1
#> 2 2 20
#> 3 3 30
#> 4 4 40
Created on 2022-01-24 by the reprex package (v2.0.1)
It seems like it might be related to this issue, but I can't quite see how. It would be great to understand why this is happening and find a way to code this without the need to assign intermediate variables or do this weird hack to subset the intermediate list.
EDIT:
Knowing this was related to curly braces ({}) enabled me to find a few more helpful links:
1, 2, 3
If we want to use ., then wrap the right-hand side in curly braces ({}) so that the pipe does not also insert the input as the first argument
library(dplyr)
# The outer braces stop the pipe from also passing d as the first argument
# of bind_rows(); inside them, . is handed to filter() as an ordinary argument.
d %>%
  {
    bind_rows(
      filter(., a == 1),
      filter(., !a == 1) %>% mutate(b = b * 10)
    )
  }
-output
# A tibble: 4 × 2
a b
<int> <dbl>
1 1 1
2 2 20
3 3 30
4 4 40

Modify a vector based on a vector of regular expressions (regex) using (if possible) a functional approach

I have a dataframe with some columns that I want to modify depending on whether they match some patterns included in a vector with regular expressions
library(fuzzyjoin)
library(tidyverse)
(df <- tribble(~a,
"GUA-ABC",
"REF-CDE",
"ACC.S93",
"ACC.ATN"))
#> # A tibble: 4 x 1
#> a
#> <chr>
#> 1 GUA-ABC
#> 2 REF-CDE
#> 3 ACC.S93
#> 4 ACC.ATN
Depending on the pattern I want to paste a text, for example, for those that contain GUA- paste "GUA001" at the end of the chain joined by a point and for those that contain REF- paste "GUA002" in the same way, to be able to obtain the following:
# This is the resulting data.frame I need
#> # A tibble: 4 x 1
#> a
#> <chr>
#> 1 GUA-ABC.GUA001
#> 2 REF-CDE.GUA002
#> 3 ACC.S93
#> 4 ACC.ATN
I have thought of some approaches.
Approach # 1
# list of patterns to search
patterns <- c("\\b^GUA\\b", "\\b^REF\\b")
# Create a named list for recoding
model_key <- list("\\b^GUA\\b" = "GUA001",
"\\b^REF\\b" = "GUA002")
# Create a data.frame of regexs
(k <- tibble(regex = patterns))
#> # A tibble: 2 x 1
#> regex
#> <chr>
#> 1 "\\b^GUA\\b"
#> 2 "\\b^REF\\b"
# perform a regex_left_join to identify the pattern
df %>%
regex_left_join(k, by = c(a = "regex")) %>%
mutate(
across(regex, recode, !!!model_key),
a = case_when(
!is.na(regex) ~ str_c(a, regex, sep = "."),
TRUE ~ a)
) %>% select(-regex)
#> # A tibble: 4 x 1
#> a
#> <chr>
#> 1 GUA-ABC.GUA001
#> 2 REF-CDE.GUA002
#> 3 ACC.S93
#> 4 ACC.ATN
Why is this approach not optimal? The original data frame has millions of rows and fuzzyjoin::regex_left_join takes too long to do this.
Approach # 2
patron <- c("GUA001" = "\\b^GUA\\b", "GUA002" = "\\b^REF\\b")
newtex <- c("GUA001", "GUA002")
#' Append a suffix to the strings that match a pattern
#'
#' @param string Character vector to inspect.
#' @param pattern Pattern passed to `str_detect()`.
#' @param text_to_paste Text appended, after a ".", to the matching elements.
#' @return `string` with `text_to_paste` appended to the elements that match
#'   `pattern`; the other elements are returned unchanged.
pegar <- function(string, pattern, text_to_paste) {
  matched <- str_detect(string, pattern)
  suffixed <- str_c(string, text_to_paste, sep = ".")
  if_else(condition = matched, true = suffixed, false = string)
}
map2_dfr(.x = patron, .y = newtex, ~ pegar(string = df$a,
pattern = .x,
text_to_paste = .y))
#> # A tibble: 4 x 2
#> GUA001 GUA002
#> <chr> <chr>
#> 1 GUA-ABC.GUA001 GUA-ABC
#> 2 REF-CDE REF-CDE.GUA002
#> 3 ACC.S93 ACC.S93
#> 4 ACC.ATN ACC.ATN
Created on 2021-05-20 by the reprex package (v2.0.0)
With approach # 2 I can't get a single column.
As a side note, using str_replace_all and using a named vector to replace some of the values within the string has not seemed like a good alternative at the moment.
Is there a way to do this more optimally?
One option utilizing stringr and purrr could be:
# For each entry of model_key, .x is the suffix and .y is the entry's name
# (the regular expression): keep the rows of df that match the expression
# and append the suffix after a ".". The per-pattern results are row-bound.
imap_dfr(model_key,
~ df %>%
filter(str_detect(a, .y)) %>%
mutate(a = str_c(a, .x, sep = "."))) %>%
# Add back, unchanged, the rows that match none of the patterns.
# NOTE(review): rows come out grouped by pattern with the unmatched rows
# last, which need not equal the original row order, and a row matching
# more than one pattern would appear once per pattern - confirm this is
# acceptable for the real data.
bind_rows(df %>%
filter(str_detect(a, str_c(names(model_key), collapse = "|"), negate = TRUE)))
a
<chr>
1 GUA-ABC.GUA001
2 REF-CDE.GUA002
3 ACC.S93
4 ACC.ATN
What about a boring old loop?
## make df millions of rows: repeat row indices 1 to 4 one million times each
df <- df[rep(1:4,1e6),]
## time the whole replacement loop
system.time({
## val: regular expressions to look for; rpl: the suffix paired with each one
val <- c("GUA\\-", "REF\\-", "ACC\\.", "QQQ\\.")
rpl <- c("GUA001", "GUA002", "ACC001", "QQQ001")
for(i in seq_along(val)) {
## elements of df$a matching the i-th pattern (df$a is updated on every
## pass, so later patterns are tested against already-suffixed strings)
sel <- grepl(val[i], df$a)
## append the i-th suffix, separated by ".", to those elements only
df$a[sel] <- paste(df$a[sel], rpl[i], sep=".")
}
})
## user system elapsed
## 2.14 0.03 2.17
2 seconds to complete
df
## A tibble: 4,000,000 x 1
# a
# <chr>
# 1 GUA-ABC.GUA001
# 2 REF-CDE.GUA002
# 3 ACC.S93.ACC001
# 4 ACC.ATN.ACC001
# ...
If the functional approach is absolutely necessary, you can squish it into a Reduce function:
# Fold over the (pattern, suffix) pairs, starting from the original strings:
# each step appends the pair's suffix to the strings matching its pattern.
Reduce(
  f = function(strings, rule) {
    hit <- grepl(rule[1], strings)
    strings[hit] <- paste(strings[hit], rule[2], sep = ".")
    strings
  },
  x = Map(c, val, rpl),
  init = df$a
)

Resources