How to use case_when to apply different functions in dplyr - r

What I am thinking might be naive. But I want to split the rows [1:3] of df based on the second "_", using tidyr::extract()
library(tidyr)
library(dplyr)
extract(col1, into = c("col1", "col2"), "^(.*?_.*?)_(.*)$")
and the rows of df [4:6] based on the first "_"
extract(col1, into = c("col1", "col2"), "^(.*?)_(.*)$")
I am thinking of something like
df %>%
mutate(n=row_number())
mutate(col2=case_when
(n<=3 ~ extract(col1, into = c("col1", "col2"), "^(.*?_.*?)_(.*)$"),
n>3 ~ extract(col1, into = c("col1", "col2"), "^(.*?)_(.*)$")
)
Of course, this is screamingly wrong but is it possible in some way?
Example data:
df=tibble(col1 = c("2397_A_run379_CTTGTACT_S119_L004_R1_001",
"3779_A_run535_TTATAGCC_S91_L003_R1_001",
"4958_BV_run685_GCGTACGT_S89_L005_R1_001",
"5126AA_S27_L004_R1_001",
"5126AF_S32_L004_R1_001",
"5126AL_S38_L004_R1_001"))
df
#> # A tibble: 6 × 1
#> col1
#> <chr>
#> 1 2397_A_run379_CTTGTACT_S119_L004_R1_001
#> 2 3779_A_run535_TTATAGCC_S91_L003_R1_001
#> 3 4958_BV_run685_GCGTACGT_S89_L005_R1_001
#> 4 5126AA_S27_L004_R1_001
#> 5 5126AF_S32_L004_R1_001
#> 6 5126AL_S38_L004_R1_001
Created on 2022-11-17 with reprex v2.0.2

If the pattern is to extract the substring by matching the _ the precedes one or more letters followed by digits,
library(dplyr)
library(stringr)
df %>%
mutate(col2 = str_extract(col1, "(?<=_)[A-Za-z]+\\d+.*"))
-output
# A tibble: 6 × 2
col1 col2
<chr> <chr>
1 2397_A_run379_CTTGTACT_S119_L004_R1_001 run379_CTTGTACT_S119_L004_R1_001
2 3779_A_run535_TTATAGCC_S91_L003_R1_001 run535_TTATAGCC_S91_L003_R1_001
3 4958_BV_run685_GCGTACGT_S89_L005_R1_001 run685_GCGTACGT_S89_L005_R1_001
4 5126AA_S27_L004_R1_001 S27_L004_R1_001
5 5126AF_S32_L004_R1_001 S32_L004_R1_001
6 5126AL_S38_L004_R1_001 S38_L004_R1_001
Or use separate
library(tidyr)
separate(df, col1, into = c("col1", "col2"),
sep = "(?<=[A-Z])_(?=[A-Za-z]+\\d+)", extra = "merge")
-output
# A tibble: 6 × 2
col1 col2
<chr> <chr>
1 2397_A run379_CTTGTACT_S119_L004_R1_001
2 3779_A run535_TTATAGCC_S91_L003_R1_001
3 4958_BV run685_GCGTACGT_S89_L005_R1_001
4 5126AA S27_L004_R1_001
5 5126AF S32_L004_R1_001
6 5126AL S38_L004_R1_001

tidyr::extract() takes and returns a dataframe, and will be tricky to use inside mutate(). I would instead use something like stringr::str_match():
library(dplyr)
library(stringr)
df %>%
mutate(
row = row_number(),
col2 = case_when(
row < 4 ~ str_match(col1, ".+?_.+?_(.+)")[, 2],
row < 7 ~ str_match(col1, ".+?_(.+)")[, 2]
)
)
# A tibble: 6 × 3
col1 row col2
<chr> <int> <chr>
1 2397_A_run379_CTTGTACT_S119_L004_R1_001 1 run379_CTTGTACT_S119_L004_R1_001
2 3779_A_run535_TTATAGCC_S91_L003_R1_001 2 run535_TTATAGCC_S91_L003_R1_001
3 4958_BV_run685_GCGTACGT_S89_L005_R1_001 3 run685_GCGTACGT_S89_L005_R1_001
4 5126AA_S27_L004_R1_001 4 S27_L004_R1_001
5 5126AF_S32_L004_R1_001 5 S32_L004_R1_001
6 5126AL_S38_L004_R1_001 6 S38_L004_R1_001

Related

How to filter out groups empty for 1 column in Tidyverse

tibble(
A = c("A","A","B","B"),
x = c(NA,NA,NA,1),
y = c(1,2,3,4),
) %>% group_by(A) -> df
desired output:
tibble(
A = c("B","B"),
x = c(NA,1)
y = c(3,4),
)
I want to find all groups for which all elements of x and x only are all NA, then remove those groups. "B" is filtered in because it has at least 1 non NA element.
I tried:
df %>%
filter(all(!is.na(x)))
but it seems that filters out if it finds at least 1 NA; I need the correct word, which is not all.
This will remove groups of column A if all elements of x are NA:
library(dplyr)
df %>%
group_by(A) %>%
filter(! all(is.na(x)))
# A tibble: 2 × 3
# Groups: A [1]
# A x y
# <chr> <dbl> <dbl>
#1 B NA 3
#2 B 1 4
Note that group "A" was removed because both cells in the column x are not defined.
We can use any with complete.cases
library(dplyr)
df %>%
group_by(A) %>%
filter(any(complete.cases(x))) %>%
ungroup
-output
# A tibble: 2 × 3
A x y
<chr> <dbl> <dbl>
1 B NA 3
2 B 1 4
In the devel version of dplyr, we could use .by in filter thus we don't need to group_by/ungroup
df %>%
filter(any(complete.cases(x)), .by = 'A')
# A tibble: 2 × 3
A x y
<chr> <dbl> <dbl>
1 B NA 3
2 B 1 4

Apply the same function with multiple columns as inputs to multiple columns in R with tidyverse

As an example, I have the following data frame:
df <- data.frame(a1=1,a2=2,a3=3,b1=1,b2=2,b3=3)
I have a function:
fn <- function(x,y,z) x^y+(z-x)^(y-x)
I want the following:
df <- df %>% mutate(a=fn(a1,a2,a3),b=fn(b1,b2,b3))
The problem is, I have tons of triplets in my dataset, so it is not ideal to write them out one by one.
Here are base R options using:
split.default + lapply + do.call
cbind(
df,
lapply(
split.default(df, gsub("\\d+", "", names(df))),
function(x) do.call(fn, unname(x))
)
)
reshape + lapply + do.call
cbind(
df,
lapply(
subset(
reshape(
setNames(df, gsub("(\\d+)$", "\\.\\1", names(df))),
direction = "long",
varying = 1:length(df)
),
select = -c(time, id)
),
function(x) do.call(fn, as.list(x))
)
)
Output
a1 a2 a3 b1 b2 b3 a b
1 1 2 3 1 2 3 3 3
I would convert df to long format then use lag to create 3 columns then apply fn() on them
library(tidyverse)
df_long <- df %>%
pivot_longer(everything(),
names_to = c(".value", "set"),
names_pattern = "(.)(.)")
df_longer <- df_long %>%
pivot_longer(-c(set),
names_to = "key",
values_to = "val") %>%
arrange(key)
df_longer
#> # A tibble: 6 x 3
#> set key val
#> <chr> <chr> <dbl>
#> 1 1 a 1
#> 2 2 a 2
#> 3 3 a 3
#> 4 1 b 1
#> 5 2 b 2
#> 6 3 b 3
lag then apply fn(), keep only non-NA val_fn
df_longer <- df_longer %>%
group_by(key) %>%
mutate(val_lag1 = lag(val, n = 1),
val_lag2 = lag(val, n = 2)) %>%
mutate(val_fn = fn(val_lag2, val_lag1, val)) %>%
filter(!is.na(val_fn))
df_longer
#> # A tibble: 2 x 6
#> # Groups: key [2]
#> set key val val_lag1 val_lag2 val_fn
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 3 a 3 2 1 3
#> 2 3 b 3 2 1 3
Created on 2020-12-03 by the reprex package (v0.3.0)
I think it would be easier/shorter to combine columns into their separate group and apply the function to each column.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = everything(),
names_to = '.value',
names_pattern = '([a-z]+)') %>%
summarise(across(.fns = ~do.call(fn, as.list(.)))) -> result
result
# a b
# <dbl> <dbl>
#1 3 3
You can bind the result to your original dataset if needed.
bind_cols(df, result)
# a1 a2 a3 b1 b2 b3 a b
#1 1 2 3 1 2 3 3 3

How do you use dplyr::pull to convert grouped a colum into vectors?

I have a tibble, df, I would like to take the tibble and group it and then use dplyr::pull to create vectors from the grouped dataframe. I have provided a reprex below.
df is the base tibble. My desired output is reflected by df2. I just don't know how to get there programmatically. I have tried to use pull to achieve this output but pull did not seem to recognize the group_by function and instead created a vector out of the whole column. Is what I'm trying to achieve possible with dplyr or base r. Note - new_col is supposed to be a vector created from the name column.
library(tidyverse)
library(reprex)
df <- tibble(group = c(1,1,1,1,2,2,2,3,3,3,3,3),
name = c('Jim','Deb','Bill','Ann','Joe','Jon','Jane','Jake','Sam','Gus','Trixy','Don'),
type = c(1,2,3,4,3,2,1,2,3,1,4,5))
df
#> # A tibble: 12 x 3
#> group name type
#> <dbl> <chr> <dbl>
#> 1 1 Jim 1
#> 2 1 Deb 2
#> 3 1 Bill 3
#> 4 1 Ann 4
#> 5 2 Joe 3
#> 6 2 Jon 2
#> 7 2 Jane 1
#> 8 3 Jake 2
#> 9 3 Sam 3
#> 10 3 Gus 1
#> 11 3 Trixy 4
#> 12 3 Don 5
# Desired Output - New Col is a column of vectors
df2 <- tibble(group=c(1,2,3),name=c("Jim","Jane","Gus"), type=c(1,1,1), new_col = c("'Jim','Deb','Bill','Ann'","'Joe','Jon','Jane'","'Jake','Sam','Gus','Trixy','Don'"))
df2
#> # A tibble: 3 x 4
#> group name type new_col
#> <dbl> <chr> <dbl> <chr>
#> 1 1 Jim 1 'Jim','Deb','Bill','Ann'
#> 2 2 Jane 1 'Joe','Jon','Jane'
#> 3 3 Gus 1 'Jake','Sam','Gus','Trixy','Don'
Created on 2020-11-14 by the reprex package (v0.3.0)
Maybe this is what you are looking for:
library(dplyr)
df <- tibble(group = c(1,1,1,1,2,2,2,3,3,3,3,3),
name = c('Jim','Deb','Bill','Ann','Joe','Jon','Jane','Jake','Sam','Gus','Trixy','Don'),
type = c(1,2,3,4,3,2,1,2,3,1,4,5))
df %>%
group_by(group) %>%
mutate(new_col = name, name = first(name, order_by = type), type = first(type, order_by = type)) %>%
group_by(name, type, .add = TRUE) %>%
summarise(new_col = paste(new_col, collapse = ","))
#> `summarise()` regrouping output by 'group', 'name' (override with `.groups` argument)
#> # A tibble: 3 x 4
#> # Groups: group, name [3]
#> group name type new_col
#> <dbl> <chr> <dbl> <chr>
#> 1 1 Jim 1 Jim,Deb,Bill,Ann
#> 2 2 Jane 1 Joe,Jon,Jane
#> 3 3 Gus 1 Jake,Sam,Gus,Trixy,Don
EDIT If new_col should be a list of vectors then you could do `summarise(new_col = list(c(new_col)))
df %>%
group_by(group) %>%
mutate(new_col = name, name = first(name, order_by = type), type = first(type, order_by = type)) %>%
group_by(name, type, .add = TRUE) %>%
summarise(new_col = list(c(new_col)))
Another option would be to use tidyr::nest:
df %>%
group_by(group) %>%
mutate(new_col = name, name = first(name, order_by = type), type = first(type, order_by = type)) %>%
nest(new_col = new_col)

Renaming columns in a dataframe based on a vector

I was able to do this, but was wondering if there was a more elegant way, possibly with dplyr rename?
# Create dataframe with three named columns
tb <- tibble(col1 = 1:3, col2 = 1:3, col3 = 1:3)
#> # A tibble: 3 x 3
#> col1 col2 col3
#> <int> <int> <int>
#> 1 1 1 1
#> 2 2 2 2
#> 3 3 3 3
# Named vector with replacement names
new_names <- c(col1 = "Column 1", col3 = "Col3")
#> col1 col3
#> "Column 1" "Col3"
# Rename columns within dataframe
tb <- new_names[colnames(tb)] %>%
coalesce(colnames(tb)) %>%
setNames(object = tb, nm = .)
#> # A tibble: 3 x 3
#> `Column 1` col2 Col3
#> <int> <int> <int>
#> 1 1 1 1
#> 2 2 2 2
#> 3 3 3 3
# loading dplyr
pacman::p_load(dplyr)
# rename() syntax demands:
# LHS - a new column name
# RHS - an existing column name
# can be either a named vector or a named list
c('Column 1' = 'col1', 'Col3' = 'col3') -> x
# the unquote-splice (!!!) operator unquotes and splices its argument
rename(tibble(col1 = 1:3, col2 = 1:3, col3 = 1:3), !!!x)
#> # A tibble: 3 x 3
#> `Column 1` col2 Col3
#> <int> <int> <int>
#> 1 1 1 1
#> 2 2 2 2
#> 3 3 3 3
You can find more about it here:
a good book
And here: pretty documentation
Pipe operators are kinda slow so you ought to try to avoid using them when not needed.

How to use dplyr `rowwise()` column numbers instead of column names

library(tidyverse)
df <- tibble(col1 = c(5, 2), col2 = c(6, 4), col3 = c(9, 9))
df %>% rowwise() %>% mutate(col4 = sd(c(col1, col3)))
# # A tibble: 2 x 4
# col1 col2 col3 col4
# <dbl> <dbl> <dbl> <dbl>
# 1 5 6 9 2.83
# 2 2 4 9 4.95
After asking a series of questions I can finally calculate standard deviation across rows. See my code above.
But I can't use column names in my production code, because the database I pull from likes to change the column names periodically. Lucky for me the relative column positions is always the same.
So I'll just use column numbers instead. And let's check to make sure I can just swap things in and out:
identical(df$col1, df[[1]])
# [1] TRUE
Yes, I can just swap df[[1]] in place of df$col1. I think I do it like this.
df %>% rowwise() %>% mutate(col4 = sd(c(.[[1]], .[[3]])))
# # A tibble: 2 x 4
# col1 col2 col3 col4
# <dbl> <dbl> <dbl> <dbl>
# 1 5 6 9 3.40
# 2 2 4 9 3.40
df %>% rowwise() %>% {mutate(col4 = sd(c(.[[1]], .[[3]])))}
# Error in mutate_(.data, .dots = compat_as_lazy_dots(...)) :
# argument ".data" is missing, with no default
Nope, it looks like these don't work because the results are different from my original. And I can't use apply, if you really need to know why I made a separate question.
df %>% mutate(col4 = apply(.[, c(1, 3)], 1, sd))
How do I apply dplyr rowwise() with column numbers instead of names?
The issue in using .[[1]] or .[[3]] after doing the rowwise (grouping by row - have only single row per group) is that it breaks the grouping structure and extracts the whole column. Inorder to avoid that, we can create a row_number() column before doing the rowwise and then subset the columns based on that index
library(dplyr)
df %>%
mutate(rn = row_number()) %>% # create a sequence of row index
rowwise %>%
mutate(col4 = sd(c(.[[1]][rn[1]], .[[3]][rn[1]]))) %>% #extract with index
select(-rn)
#Source: local data frame [2 x 4]
#Groups: <by row>
# A tibble: 2 x 4
# col1 col2 col3 col4
# <dbl> <dbl> <dbl> <dbl>
#1 5 6 9 2.83
#2 2 4 9 4.95
Or another option is map from purrr where we loop over the row_number() and do the subsetting of rows of dataset
library(purrr)
df %>%
mutate(col4 = map_dbl(row_number(), ~ sd(c(df[[1]][.x], df[[3]][.x]))))
# A tibble: 2 x 4
# col1 col2 col3 col4
# <dbl> <dbl> <dbl> <dbl>
#1 5 6 9 2.83
#2 2 4 9 4.95
Or another option is pmap (if we don't want to use row_number())
df %>%
mutate(col4 = pmap_dbl(.[c(1, 3)], ~ sd(c(...))))
# A tibble: 2 x 4
# col1 col2 col3 col4
# <dbl> <dbl> <dbl> <dbl>
#1 5 6 9 2.83
#2 2 4 9 4.95
Of course, the easiest way would be to use rowSds from matrixStats as described in the dupe tagged post here
NOTE: All of the above methods doesn't require any reshaping
Since you don't necessarily know the column names, but know the positions of the columns for which you need standard deviation, etc., I'd reshape into long data and add an ID column. You can gather by position instead of column name, either by giving the numbers of the column that should become the key, or the numbers of the columns to omit from the key. That way, you don't need to specify those values by column because you'll have them all in one column already. Then you can join those summary values back to your original wide-shaped data.
library(dplyr)
library(tidyr)
df <- tibble(col1 = c(5, 2), col2 = c(6, 4), col3 = c(9, 9)) %>%
mutate(id = row_number())
df %>%
mutate(id = row_number()) %>%
gather(key, value, 1, 3) %>%
group_by(id) %>%
summarise(sd = sd(value)) %>%
inner_join(df, by = "id")
#> # A tibble: 2 x 5
#> id sd col1 col2 col3
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2.83 5 6 9
#> 2 2 4.95 2 4 9
Rearrange columns by position as you need.
An approach transposing data, converting it to matrix, computing the standard deviation, transposing again and transforming into tibble.
df %>%
t %>%
rbind(col4 = c(sd(.[c(1, 3),1]), sd(.[c(1, 3),2]))) %>%
t %>%
as_tibble()

Resources