Convert list to string with conditions - r

I have a dataframe that looks like:
x <- tibble(
experiment_id = rep(c('1a','1b'),each=5),
keystroke = rep(c('a','SHIFT','b','SPACE','e'),2)
)
I know I can concatenate a list into a string using str_c or str_flatten and only keep certain values like below:
> y <- c('b','a','SPACE','d')
> y[y %in% letters]
[1] "b" "a" "d"
But when I try the same thing in a grouped pipe:
x_out <- x %>%
group_by(experiment_id) %>%
mutate(
grp = cumsum(lag(keystroke=='SPACE',default=0))) %>%
group_by(grp, .add=TRUE) %>%
mutate(within_keystrokes = list(keystroke),
within_word = within_keystrokes[within_keystrokes %in% letters]
) %>%
ungroup()
I get the error:
Error: Problem with `mutate()` input `within_word`.
x Input `within_word` can't be recycled to size 2.
ℹ Input `within_word` is `within_keystrokes[within_keystrokes %in% letters]`.
ℹ Input `within_word` must be size 2 or 1, not 0.
ℹ The error occurred in group 1: experiment_id = "1a", grp = 0.
I read this answer and tried using ifelse but still ran into errors.
Any insight into what I'm doing wrong?
EDIT: EXPECTED OUTPUT Sorry for not including this. I would expect the final df to look like:
x <- tibble(
experiment_id = rep(c('1a','1b'),each=5),
keystroke = rep(c('a','SHIFT','b','SPACE','e'),2),
within_keystrokes = list(list('a','SHIFT','b','SPACE'),
list('a','SHIFT','b','SPACE'),
list('a','SHIFT','b','SPACE'),
list('a','SHIFT','b','SPACE'),
'e',
list('a','SHIFT','b','SPACE'),
list('a','SHIFT','b','SPACE'),
list('a','SHIFT','b','SPACE'),
list('a','SHIFT','b','SPACE'),
'e'),
within_word = rep(list('ab','ab','ab','ab','e'),2)
)

You almost solved your issue. You could use
library(dplyr)
library(stringr)
# Start a new group after every SPACE keystroke (per experiment), then attach
# the group's keystrokes and the word formed by its letters to every row.
x %>%
  group_by(
    experiment_id,
    grp = cumsum(lag(keystroke == "SPACE", default = 0))
  ) %>%
  mutate(
    within_keystrokes = list(keystroke),
    # Keep only the keystrokes found in `letters` and collapse them into one
    # string; wrapping in list() yields one length-1 element per row.
    within_word = list(str_c(keystroke[keystroke %in% letters], collapse = ""))
  ) %>%
  # Drop the grouping and the helper column so the result is a plain
  # four-column tibble.
  ungroup() %>%
  select(-grp)
to get
# A tibble: 10 x 4
experiment_id keystroke within_keystrokes within_word
<chr> <chr> <list> <list>
1 1a a <list [4]> <chr [1]>
2 1a SHIFT <list [4]> <chr [1]>
3 1a b <list [4]> <chr [1]>
4 1a SPACE <list [4]> <chr [1]>
5 1a e <chr [1]> <chr [1]>
6 1b a <list [4]> <chr [1]>
7 1b SHIFT <list [4]> <chr [1]>
8 1b b <list [4]> <chr [1]>
9 1b SPACE <list [4]> <chr [1]>
10 1b e <chr [1]> <chr [1]>
If you don't want within_word to be a list, just remove the list() function.

Related

Remove empty lists from a tibble in R

I am trying to remove any list from my tibble that has "<chr [0]>"
library(tidyverse)
df <- tibble(x = 1:3, y = list(as.character()),
z=list(as.character("ATC"),as.character("TAC"), as.character()))
df
#> # A tibble: 3 × 3
#> x y z
#> <int> <list> <list>
#> 1 1 <chr [0]> <chr [1]>
#> 2 2 <chr [0]> <chr [1]>
#> 3 3 <chr [0]> <chr [0]>
Created on 2022-02-15 by the reprex package (v2.0.1)
I want my tibble to look like this
#> # A tibble: 3 × 3
#> x z
#> <int> <list>
#> 1 1 <chr [1]>
#> 2 2 <chr [1]>
#> 3 3 NA
any help is appreciated
You can do:
df %>%
select(where(~!all(lengths(.) == 0))) %>%
mutate(z = lapply(z, function(x) ifelse(length(x) == 0, NA, x)))
# A tibble: 3 x 2
x z
<int> <list>
1 1 <chr [1]>
2 2 <chr [1]>
3 3 <lgl [1]>
Note that the z column can't hold list elements in rows 1 and 2 and a bare logical NA in row 3. The whole column needs to be a list.
If all elements of z have only one element, you can add another line of code with mutate(z = unlist(z)).
The OP asked for a more dynamic solution that handles several columns.
Here is an example where I simply created another z2 variable. Generally, you can repeat the recoding for several columns using across.
library(tidyverse)
df <- tibble(x = 1:3, y = list(as.character()),
z=list(as.character("ATC"),as.character("TAC"), as.character()),
z2 = z)
df %>%
select(where(~!all(lengths(.) == 0))) %>%
mutate(across(starts_with('z'), ~ lapply(., function(x) ifelse(length(x) == 0, NA, x))))
Which gives:
# A tibble: 3 x 3
x z z2
<int> <list> <list>
1 1 <chr [1]> <chr [1]>
2 2 <chr [1]> <chr [1]>
3 3 <lgl [1]> <lgl [1]>
A two-step way using base R:
df <- tibble(x = 1:3, y = list(as.character()),
z=list(as.character("ATC"),as.character("TAC"), as.character()))
df <- df[apply(df, 2, function(x) any(lapply(x, length) > 0))] #Remove empty columns
df[apply(df, 2, function(x) lapply(x, length) == 0)] <- NA #Replace empty lists with NA
df
# A tibble: 3 x 2
x z
<int> <list>
1 1 <chr [1]>
2 2 <chr [1]>
3 3 <NULL>

Find differences in character column in R

I have a dataframe with ICPM codes before and after recoding of an operation.
df1 <- tibble::tribble(~ops, ~opsalt,
"8-915, 5-847.32", "5-847.32, 5-852.f3, 8-915",
"8-915, 5-781.30, 8-919, 5-807.4, 5-800.c1, 5-79b.81", "5-79b.81, 5-800.c1, 5-805.y, 5-807.4, 8-919, 5-781.30, 8-915",
"5-786.1, 5-808.a4, 5-784.1u, 5-783.2d, 5-788.5e", "5-788.5e, 5-783.2d, 5-780.4d, 5-784.7d, 5-784.1u, 5-808.a4, 5-786.1",
"8-915, 5-784.0v, 5-788.5f, 5-788.40, 5-808.b0, 5-786.k, 5-788.60, 5-788.00, 5-786.0, 5-783.2d", "5-788.00, 5-788.60, 5-786.0, 5-786.k, 5-788.40, 5-808.b0, 5-788.5f, 5-781.ad, 5-784.0v, 8-915")
I want to calculate two columns which contain the codes that differ between the two columns.
For the first row the difference between ops and opsalt would be character(0).
The difference between opsalt and ops would be 5-852.f3.
Tried:
df <– df %>% mutate(ops = strsplit(ops,",")) %>%
mutate(opsalt =strsplit(opsalt,","))
df <- df %>% rowwise() %>% mutate(neu_alt = list(setdiff(ops,opsalt))) %>% mutate(alt_neu = list(setdiff(opsalt,ops)))
This didn't work, because I want to compare parts of the respective strings and not the whole string.
It should work if you use ", " in strsplit and df1 in your first mutate call.
library(dplyr)
# Split both code columns into character vectors, then compute the set
# differences in each direction row by row.
df1 %>%
  # Select the columns explicitly: calling across() without `.cols` is
  # deprecated in recent dplyr releases.
  mutate(across(everything(), ~ strsplit(.x, ", "))) %>%
  rowwise() %>%
  mutate(
    neu_alt = list(setdiff(ops, opsalt)),
    alt_neu = list(setdiff(opsalt, ops))
  )
#> # A tibble: 4 x 4
#> # Rowwise:
#> ops opsalt neu_alt alt_neu
#> <list> <list> <list> <list>
#> 1 <chr [2]> <chr [3]> <chr [0]> <chr [1]>
#> 2 <chr [6]> <chr [7]> <chr [0]> <chr [1]>
#> 3 <chr [5]> <chr [7]> <chr [0]> <chr [2]>
#> 4 <chr [10]> <chr [10]> <chr [1]> <chr [1]>
Created on 2022-01-04 by the reprex package (v0.3.0)
If you want to keep them as strings, you can try this method. If you intend to do similar operations repeatedly, then I suggest retaining the list-columns (instead of repeatedly splitting them with strsplit).
df1 %>%
mutate(
d = mapply(function(...) toString(setdiff(...)),
strsplit(ops, "[ ,]+"), strsplit(opsalt, "[ ,]+"))
)
# # A tibble: 4 x 3
# ops opsalt d
# <chr> <chr> <chr>
# 1 8-915, 5-847.32 5-847.32, 5-852.f3, 8-915 ""
# 2 8-915, 5-781.30, 8-919, 5-807.4, 5-800.c1, 5-79b.81 5-79b.81, 5-800.c1, 5-805.y, 5-807.4, 8-919, 5-781.30, 8-915 ""
# 3 5-786.1, 5-808.a4, 5-784.1u, 5-783.2d, 5-788.5e 5-788.5e, 5-783.2d, 5-780.4d, 5-784.7d, 5-784.1u, 5-808.a4, 5-786.1 ""
# 4 8-915, 5-784.0v, 5-788.5f, 5-788.40, 5-808.b0, 5-786.k, 5-788.60, 5-788.00, 5-786.0, 5-783.2d 5-788.00, 5-788.60, 5-786.0, 5-786.k, 5-788.40, 5-808.b0, 5-788.5f, 5-781.ad, 5-784.0v, 8-915 "5-783.2d"
(I recommend using list-columns, though, as demonstrated in TimTeaFan's answer.)

Expand tibble of email dataset in R

I have a massive tibble of my email data which looks like the following:
library(dplyr)
emails <- tibble(
from = c('employee.1#xtra.co','employee.5#xtra.co','employee.1#xtra.co',
'employee.3#xtra.co','employee.1#xtra.co'),
to = list(
c('employee.5#xtra.co', 'employee.3xtra.co'),
c('employee.3#xtra.co', 'employee.1#xtra.co'),
c('employee.2#xtra.co'),
c('employee.1#xtra.co'),
c('employee.3#xtra.co','employee.5#xtra.co','employee.6#xtra.co')),
cc = list(
c('employee.2xtra.co', 'employee.4xtra.co', 'employee.6xtra.co'),
c('employee.1xtra.co', 'employee.8xtra.co', 'employee.6xtra.co'),
NA,
c('employee.2xtra.co', 'employee.4xtra.co'),
c('employee.2xtra.co', 'employee.6xtra.co'))
)
emails
# A tibble: 5 x 3
from to cc
<chr> <list> <list>
1 employee.1#xtra.co <chr [2]> <chr [3]>
2 employee.5#xtra.co <chr [2]> <chr [3]>
3 employee.1#xtra.co <chr [1]> <lgl [1]>
4 employee.3#xtra.co <chr [1]> <chr [2]>
5 employee.1#xtra.co <chr [3]> <chr [2]>
I need your help to be able to expand each record for each combination. For example, what I want to achieve for row 1 is:
from to cc
employee.1#xtra.co employee.5#xtra.co employee.2xtra.co
employee.1#xtra.co employee.5#xtra.co employee.4xtra.co
employee.1#xtra.co employee.5#xtra.co employee.6xtra.co
employee.1#xtra.co employee.3xtra.co employee.2xtra.co
employee.1#xtra.co employee.3xtra.co employee.4xtra.co
employee.1#xtra.co employee.3xtra.co employee.6xtra.co
Thank you very much for your time.
We can apply unnest twice.
library(dplyr)
library(tidyr)
emails2 <- emails %>%
unnest(cols = "to") %>%
unnest(cols = "cc")
head(emails2)
# # A tibble: 6 x 3
# from to cc
# <chr> <chr> <chr>
# 1 employee.1#xtra.co employee.5#xtra.co employee.2xtra.co
# 2 employee.1#xtra.co employee.5#xtra.co employee.4xtra.co
# 3 employee.1#xtra.co employee.5#xtra.co employee.6xtra.co
# 4 employee.1#xtra.co employee.3xtra.co employee.2xtra.co
# 5 employee.1#xtra.co employee.3xtra.co employee.4xtra.co
# 6 employee.1#xtra.co employee.3xtra.co employee.6xtra.co
If you have more than two columns to expand, below is one approach. First identify the list columns and store their names in names_target, then use a for loop to repeatedly apply the unnest function.
names_target <- emails %>%
select(where(is.list)) %>%
names()
temp <- emails
for (i in names_target){
temp <- temp %>% unnest(cols = all_of(i))
}
identical(temp, emails2)
# [1] TRUE

Unnesting a combination variable (combn) as a vector

With the following code, I manage to get a fine combination :
tibble(
x = list(c(1, 2, 3), c(4,5,6))
) %>%
mutate(
combination =
x %>%
map(
.f = combn
, 2
) %>%
map(.f = t)
) %>%
unnest(combination)
# A tibble: 6 x 2
x combination[,1] [,2]
<list> <dbl> <dbl>
1 <dbl [3]> 1 2
2 <dbl [3]> 1 3
3 <dbl [3]> 2 3
4 <dbl [3]> 4 5
5 <dbl [3]> 4 6
6 <dbl [3]> 5 6
However, when observed with the View() function, I get:
How can I proceed to get combination displayed as a vector? i.e. :
We can specify the simplify = FALSE in combn to return a list instead of coercing to matrix
library(purrr)
library(dplyr)
library(tidyr)
tbl1 <- tibble(
x = list(c(1, 2, 3), c(4,5,6))
) %>%
mutate(
combination =
x %>%
map(
.f = combn
, 2, simplify = FALSE
))
Now, do the unnest
out <- tbl1 %>%
unnest(combination)
out
# A tibble: 6 x 2
# x combination
# <list> <list>
#1 <dbl [3]> <dbl [2]>
#2 <dbl [3]> <dbl [2]>
#3 <dbl [3]> <dbl [2]>
#4 <dbl [3]> <dbl [2]>
#5 <dbl [3]> <dbl [2]>
#6 <dbl [3]> <dbl [2]>
check the View
Here is a data.table option that might help
library(data.table)
library(tidyr)
unnest(setDT(df)[, combination := lapply(x, function(v) combn(v, 2, simplify = FALSE))], combination)

Determine whether any value from vector in R is found in another vector

I have a data frame that has two columns, a and b. Both a and b have vectors of different lengths, and I want to create another column, called boolean, which should be true when at least one value from the vector in b is present in the vector in column a. However, I used the following logic and I'm getting a false for all rows, even though I should be getting true for rows 1, 3 and 4. A reproducible example of my attempt is as follows:
library(tidyverse)
library(tibble)
data_frame <-
tribble(
~a, ~b,
c("50", "50", "50", "3"), c("1", "2", "3"),
c("17", "50", "50", "50"), "19",
c("50", "28", "50", "3"), c("30", "28", "29"),
c("21", "19", "50", "50", "50"), c("19", "20", "18")
)
data_frame %>%
mutate(
boolean = if_else(
any(b %in% a),
"yes",
"no"
)
)
#> # A tibble: 4 x 3
#> a b boolean
#> <list> <list> <chr>
#> 1 <chr [4]> <chr [3]> no
#> 2 <chr [4]> <chr [1]> no
#> 3 <chr [4]> <chr [3]> no
#> 4 <chr [5]> <chr [3]> no
Is there something wrong with how I am using the any command? Thanks.
A base R option using Vectorize + intersect
within(data_frame, boolean <- ifelse(lengths(Vectorize(intersect)(a,b)),"yes","no"))
or Map + intersect
within(data_frame, boolean <- ifelse(lengths(Map(intersect,a,b)),"yes","no"))
which gives
# A tibble: 4 x 3
a b boolean
<list> <list> <chr>
1 <chr [4]> <chr [3]> yes
2 <chr [4]> <chr [1]> no
3 <chr [4]> <chr [3]> yes
4 <chr [5]> <chr [3]> yes
A purrr solution with map2():
library(dplyr)
library(purrr)
data_frame %>%
mutate(boolean = map2_chr(a, b, ~ if_else(any(.y %in% .x), "yes", "no")))
# # A tibble: 4 x 3
# a b boolean
# <list> <list> <chr>
# 1 <chr [4]> <chr [3]> yes
# 2 <chr [4]> <chr [1]> no
# 3 <chr [4]> <chr [3]> yes
# 4 <chr [5]> <chr [3]> yes
or with pmap():
data_frame %>%
mutate(boolean = pmap_chr(select(., a, b),
~ if_else(any(.y %in% .x), "yes", "no")))
Add rowwise to perform this operation on each row :
library(dplyr)
data_frame %>%
rowwise() %>%
mutate(boolean = if_else(any(b %in% a),"yes","no"))
# a b boolean
# <list> <list> <chr>
#1 <chr [4]> <chr [3]> yes
#2 <chr [4]> <chr [1]> no
#3 <chr [4]> <chr [3]> yes
#4 <chr [5]> <chr [3]> yes
Similar variations :
Without ifelse :
library(dplyr)
library(purrr)
data_frame %>%
rowwise() %>%
mutate(boolean = c('no', 'yes')[any(b %in% a) + 1])
Or with map2_lgl :
data_frame %>%
mutate(boolean = c('no', 'yes')[map2_lgl(b, a, ~any(.x %in% .y)) + 1])
Are you after this?
data_frame %>%
rowwise() %>%
mutate(boolean = if_else(any(b %in% a), "yes", "no"))
# A tibble: 4 x 3
# Rowwise:
a b boolean
<list> <list> <chr>
1 <chr [4]> <chr [3]> yes
2 <chr [4]> <chr [1]> no
3 <chr [4]> <chr [3]> yes
4 <chr [5]> <chr [3]> yes
We can use pmap
library(dplyr)
library(purrr)
data_frame %>%
mutate(boolean = pmap_chr(., ~ c("no", "yes")[1+any(..2 %in% ..1)]))
# A tibble: 4 x 3
# a b boolean
# <list> <list> <chr>
#1 <chr [4]> <chr [3]> yes
#2 <chr [4]> <chr [1]> no
#3 <chr [4]> <chr [3]> yes
#4 <chr [5]> <chr [3]> yes

Resources