How to replace string in a tibble? - r

I want to replace apple with frui and pear with bord
df <- tibble(
word = c("apple", "apple","apple","banana", "pear","pear"),
i = seq_along(word)
)
Any idea?

A tidyverse option using str_replace_all.
library(tidyverse)
mutate(df, word = str_replace_all(word, c('apple' = 'frui', 'pear' = 'bord')))
# # A tibble: 6 x 2
# word i
# <chr> <int>
# 1 frui 1
# 2 frui 2
# 3 frui 3
# 4 banana 4
# 5 bord 5
# 6 bord 6

We could use recode
library(dplyr)
df <- df %>%
mutate(word = recode(word, apple = 'frui', pear = 'bord'))
-output
df
# A tibble: 6 × 2
word i
<chr> <int>
1 frui 1
2 frui 2
3 frui 3
4 banana 4
5 bord 5
6 bord 6

For the sake of completeness, here is another tidyverse option using case_when.
library(tidyverse)
df <- tibble(
word = c("apple", "apple","apple","banana", "pear","pear"),
i = seq_along(word)
)
df %>%
mutate(word = case_when(
TRUE ~ word,
word == "apple" ~ "frui",
word == "pear" ~ "bord"
))
#> # A tibble: 6 x 2
#> word i
#> <chr> <int>
#> 1 apple 1
#> 2 apple 2
#> 3 apple 3
#> 4 banana 4
#> 5 pear 5
#> 6 pear 6
Created on 2021-11-25 by the reprex package (v0.3.0)

Related

Convert any element that does not start with a specific string to NA

I have a large data frame that looks like df2.
I want to convert any element across columns code1, code2 that does not start with
AT to NA.
library(tidyverse)
df2 <- tibble(type=c("Jeep", "4x4", "convertible"),
code1=c("ATG1",NA, "ATG2"),
code2=c("random", "ATG3", "xyz"))
df2
#> # A tibble: 3 × 3
#> type code1 code2
#> <chr> <chr> <chr>
#> 1 Jeep ATG1 random
#> 2 4x4 <NA> ATG3
#> 3 convertible ATG2 xyz
Created on 2022-09-29 with reprex v2.0.2
I want my data to look like this
#> type code1 code2
#>
#> 1 Jeep ATG1 NA
#> 2 4x4 ATG3
#> 3 convertible ATG2 NA
You could do
df2 %>%
mutate(across(code1:code2, ~ifelse(substr(.x, 1, 2) == 'AT', .x, NA)))
#> # A tibble: 3 x 3
#> type code1 code2
#> <chr> <chr> <chr>
#> 1 Jeep ATG1 NA
#> 2 4x4 NA ATG3
#> 3 convertible ATG2 NA
With replace and grepl:
df2 %>%
mutate(across(starts_with("code"), ~ replace(.x, !grepl("^AT", .x), NA)))
Using case_when
library(dplyr)
library(stringr)
df2 %>%
mutate(across(starts_with('code'), ~ case_when(str_detect(.x, '^AT')~ .x)))
-output
# A tibble: 3 × 3
type code1 code2
<chr> <chr> <chr>
1 Jeep ATG1 <NA>
2 4x4 <NA> ATG3
3 convertible ATG2 <NA>

summarise based on multiple groups in R dplyr

I have a large data frame that looks like this
library(tidyverse)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,10,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df
#> # A tibble: 6 × 3
#> id counts fruit
#> <dbl> <dbl> <chr>
#> 1 1 10 apple
#> 2 1 20 banana
#> 3 2 15 cherry
#> 4 2 15 cherry
#> 5 2 10 ananas
#> 6 3 20 pear
Created on 2022-04-13 by the reprex package (v2.0.1)
For each id, I want to keep the fruit with the maximum counts and then I want to add the sum_counts of unique fruits per id in another column.
I want my data to look like this:
# A tibble: 3 × 4
id central_fruit fruits sum_counts
<dbl> <chr> <chr> <dbl>
1 1 banana banana, apple 30
2 2 cherry cherry, ananas 30
3 3 pear pear 20
This is what I have tried so far and I do not know why I fail miserably
library(tidyverse)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,15,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df %>%
group_by(id,fruit) %>%
add_count(fruit) %>%
ungroup() %>%
group_by(id) %>%
summarise(central_fruit=fruit[which.max(counts)],
fruits = toString(sort(unique(fruit), decreasing = TRUE)),
sum_counts = sum(unique(counts)))
#> # A tibble: 3 × 4
#> id central_fruit fruits sum_counts
#> <dbl> <chr> <chr> <dbl>
#> 1 1 banana banana, apple 30
#> 2 2 cherry cherry, ananas 15
#> 3 3 pear pear 20
Created on 2022-04-13 by the reprex package (v2.0.1)
Here's a dplyr approach.
library(dplyr)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,10,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df %>%
group_by(id) %>%
mutate(fruits = paste0(unique(fruit), collapse = ", "),
sum_counts = sum(unique(counts))) %>%
filter(counts == max(counts)) %>%
distinct() %>%
rename("central_fruit" = "fruit") %>%
select(-counts)
#> # A tibble: 3 × 4
#> # Groups: id [3]
#> id central_fruit fruits sum_counts
#> <dbl> <chr> <chr> <dbl>
#> 1 1 banana apple, banana 30
#> 2 2 cherry cherry, ananas 25
#> 3 3 pear pear 20
Created on 2022-04-13 by the reprex package (v2.0.1)
This should work:
df |>
group_by(id) |>
distinct(fruit, .keep_all = TRUE) |>
mutate(
is_central_fruit = counts == max(counts),
sum_counts = sum(counts),
fruits = paste(fruit, collapse = ", ")
) |>
filter(
is_central_fruit
) |>
select(
-is_central_fruit,
-counts,
central_fruit = fruit
)
# id central_fruit sum_counts fruits
# <dbl> <chr> <dbl> <chr>
# 1 1 banana 30 apple, banana
# 2 2 cherry 25 cherry, ananas
# 3 3 pear 20 pear
If you want to order the fruits column then I wouldn't store fruits as a character vector, but as a list of factors.
And another dplyr approach but preserving the fruits order (central_fruit is first in fruits):
df %>%
distinct() %>%
group_by(id) %>%
mutate(sum_counts = sum(counts)) %>%
arrange(id, desc(counts)) %>%
mutate(fruits = paste(fruit, collapse = ", ")) %>%
slice(1) %>%
select(id, central_fruit = fruit, fruits, sum_counts) %>%
ungroup()
This returns
# A tibble: 3 x 4
id central_fruit fruits sum_counts
<dbl> <chr> <chr> <dbl>
1 1 banana banana, apple 30
2 2 cherry cherry, ananas 25
3 3 pear pear 20

Select the row with the maximum value in each group based on multiple columns in R dplyr

My data frame looks like this one
library(tidyverse)
df1 <- tibble(col1= c("apple","apple","banana","banana"),
col2 = c("appl","aple","banan","bananb"),
count_col1=c(1,1,4,4), count_col2=c(3,4,1,1))
df1
#> # A tibble: 4 × 4
#> col1 col2 count_col1 count_col2
#> <chr> <chr> <dbl> <dbl>
#> 1 apple appl 1 3
#> 2 apple aple 1 4
#> 3 banana banan 4 1
#> 4 banana bananb 4 1
Created on 2022-02-17 by the reprex package (v2.0.1)
I want to select after grouping_by col1 the row that has the maximum value based on count_col1 and count_col2.
I want my data to look like this
col1 col2 count_col1 count_col2
apple aple 1 4
banana banan 4 1
banana bananb 4 1
for one column you can write something
df1 %>%
slice(which.max(count_col1))
but not for two
We may get rowwise max of the 'count' columns with pmax, grouped by 'col1', filter the rows where the max value of 'Max' column is.
library(dplyr)
df1 %>%
mutate(Max = pmax(count_col1, count_col2) ) %>%
group_by(col1) %>%
filter(Max == max(Max)) %>%
ungroup %>%
select(-Max)
-output
# A tibble: 3 × 4
col1 col2 count_col1 count_col2
<chr> <chr> <dbl> <dbl>
1 apple aple 1 4
2 banana banan 4 1
3 banana bananb 4 1
We may also use slice_max
library(purrr)
df1 %>%
group_by(col1) %>%
slice_max(invoke(pmax, across(starts_with("count")))) %>%
ungroup
# A tibble: 3 × 4
col1 col2 count_col1 count_col2
<chr> <chr> <dbl> <dbl>
1 apple aple 1 4
2 banana banan 4 1
3 banana bananb 4 1

How to add a row to each group and assign values

I have this tibble:
library(tibble)
library(dplyr)
df <- tibble(id = c("one", "two", "three"),
A = c(1,2,3),
B = c(4,5,6))
id A B
<chr> <dbl> <dbl>
1 one 1 4
2 two 2 5
3 three 3 6
I want to add a row to each group AND assign values to the new column BUT with a function (here the new row in each group should get A=4 B = the first group value of column B USING first(B)-> desired output:
id A B
<chr> <dbl> <dbl>
1 one 1 4
2 one 4 4
3 three 3 6
4 three 4 6
5 two 2 5
6 two 4 5
I have tried so far:
If I add a row in a ungrouped tibble with add_row -> this works perfect!
df %>%
add_row(A=4, B=4)
id A B
<chr> <dbl> <dbl>
1 one 1 4
2 two 2 5
3 three 3 6
4 NA 4 4
If I try to use add_row in a grouped tibble -> this works not:
df %>%
group_by(id) %>%
add_row(A=4, B=4)
Error: Can't add rows to grouped data frames.
Run `rlang::last_error()` to see where the error occurred.
According to this post Add row in each group using dplyr and add_row() we could use group_modify -> this works great:
df %>%
group_by(id) %>%
group_modify(~ add_row(A=4, B=4, .x))
id A B
<chr> <dbl> <dbl>
1 one 1 4
2 one 4 4
3 three 3 6
4 three 4 4
5 two 2 5
6 two 4 4
I want to assign to column B the first value of column B (or it can be any function min(B), max(B) etccc.) -> this does not work:
df %>%
group_by(id) %>%
group_modify(~ add_row(A=4, B=first(B), .x))
Error in h(simpleError(msg, call)) :
Fehler bei der Auswertung des Argumentes 'x' bei der Methodenauswahl für Funktion 'first': object 'B' not found
library(tidyverse)
df <- tibble(id = c("one", "two", "three"),
A = c(1,2,3),
B = c(4,5,6))
df %>%
group_by(id) %>%
summarise(add_row(cur_data(), A = 4, B = first(cur_data()$B)))
#> `summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
#> # A tibble: 6 × 3
#> # Groups: id [3]
#> id A B
#> <chr> <dbl> <dbl>
#> 1 one 1 4
#> 2 one 4 4
#> 3 three 3 6
#> 4 three 4 6
#> 5 two 2 5
#> 6 two 4 5
Or
df %>%
group_by(id) %>%
group_split() %>%
map_dfr(~ add_row(.,id = first(.$id), A = 4, B = first(.$B)))
#> # A tibble: 6 × 3
#> id A B
#> <chr> <dbl> <dbl>
#> 1 one 1 4
#> 2 one 4 4
#> 3 three 3 6
#> 4 three 4 6
#> 5 two 2 5
#> 6 two 4 5
Created on 2022-01-02 by the reprex package (v2.0.1)
Maybe this is an option
library(dplyr)
df %>%
group_by(id) %>%
summarise( A=c(A,4), B=c(B,first(B)) ) %>%
ungroup
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
# A tibble: 6 x 3
id A B
<chr> <dbl> <dbl>
1 one 1 4
2 one 4 4
3 three 3 6
4 three 4 6
5 two 2 5
6 two 4 5
According to the documentation of the function group_modify, if you use a formula, you must use ". or .x to refer to the subset of rows of .tbl for the given group;" that's why you used .x inside the add_row function. To be entirely consistent, you have to do it also within the first function.
df %>%
group_by(id) %>%
group_modify(~ add_row(A=4, B=first(.x$B), .x))
# A tibble: 6 x 3
# Groups: id [3]
id A B
<chr> <dbl> <dbl>
1 one 1 4
2 one 4 4
3 three 3 6
4 three 4 6
5 two 2 5
6 two 4 5
Using first(.$B) or first(df$B) will provide the same results.
A possible solution:
library(tidyverse)
df <- tibble(id = c("one", "two", "three"),
A = c(1,2,3),
B = c(4,5,6))
df %>%
group_by(id) %>%
slice(rep(1,2)) %>% mutate(A = if_else(row_number() > 1, first(df$B), A)) %>%
ungroup
#> # A tibble: 6 × 3
#> id A B
#> <chr> <dbl> <dbl>
#> 1 one 1 4
#> 2 one 4 4
#> 3 three 3 6
#> 4 three 4 6
#> 5 two 2 5
#> 6 two 4 5

Follow-up: Putting back a given missing column from a data.frame into a list of dta.frames

I'm following up on this question. My LIST of data.frames below is made from my data. However, this LIST is missing the paper column (the name(s) of the missing column(s) are always provided) which is available in the original data.
I was wondering how to put the missing paper column back into LIST to achieve my DESIRED_LIST below?
I tried the solution suggested in this answer (lapply(LIST, function(x)data[do.call(paste, data[names(x)]) %in% do.call(paste, x),])) but it doesn't produce my DESIRED_LIST.
A Base R or tidyverse solution is appreciated.
Reproducible data and code are below.
m2="
paper study sample comp ES bar
1 1 1 1 1 7
1 2 2 2 2 6
1 2 3 3 3 5
2 3 4 4 4 4
2 3 4 4 5 3
2 3 4 5 6 2
2 3 4 5 7 1"
data <- read.table(text=m2,h=T)
LIST <- list(data.frame(study=1 ,sample=1 ,comp=1),
data.frame(study=rep(3,4),sample=rep(4,4),comp=c(4,4,5,5)),
data.frame(study=c(2,2) ,sample=c(2,3) ,comp=c(2,3)))
DESIRED_LIST <- list(data.frame(paper=1 ,study=1 ,sample=1 ,comp=1),
data.frame(paper=rep(2,4),study=rep(3,4),sample=rep(4,4),comp=c(4,4,5,5)),
data.frame(paper=rep(1,2),study=c(2,2) ,sample=c(2,3) ,comp=c(2,3)))
Please find a solution with the package data.table. Is this what you were looking for?
Reprex 1
library(data.table)
cols_to_remove <- c("ES")
split(setDT(data)[, (cols_to_remove) := NULL], by = c("paper", "study"))
#> $`1.1`
#> paper study sample comp
#> 1: 1 1 1 1
#>
#> $`1.2`
#> paper study sample comp
#> 1: 1 2 2 2
#> 2: 1 2 3 3
#>
#> $`2.3`
#> paper study sample comp
#> 1: 2 3 4 4
#> 2: 2 3 4 4
#> 3: 2 3 4 5
#> 4: 2 3 4 5
Created on 2021-11-06 by the reprex package (v2.0.1)
EDIT
Please find solution 2 with the package dplyr
Reprex 2
library(dplyr)
drop.cols <- c("ES")
data %>%
group_by(paper, study) %>%
select(-drop.cols) %>%
group_split()
#> <list_of<
#> tbl_df<
#> paper : integer
#> study : integer
#> sample: integer
#> comp : integer
#> >
#> >[3]>
#> [[1]]
#> # A tibble: 1 x 4
#> paper study sample comp
#> <int> <int> <int> <int>
#> 1 1 1 1 1
#>
#> [[2]]
#> # A tibble: 2 x 4
#> paper study sample comp
#> <int> <int> <int> <int>
#> 1 1 2 2 2
#> 2 1 2 3 3
#>
#> [[3]]
#> # A tibble: 4 x 4
#> paper study sample comp
#> <int> <int> <int> <int>
#> 1 2 3 4 4
#> 2 2 3 4 4
#> 3 2 3 4 5
#> 4 2 3 4 5
Created on 2021-11-07 by the reprex package (v2.0.1)
Consider ave to create a grouping column (due to repeated rows) and then run an iterative merge.
DESIRED_LIST_SO <- lapply(
LIST,
function(df) merge(
transform(data, grp = ave(paper, paper, study, sample, comp, FUN=seq_along)),
transform(df, grp = ave(study, study, sample, comp, FUN=seq_along)),
by=c("study", "sample", "comp", "grp")
)[c("paper", "study", "sample", "comp")]
)
all.equal(DESIRED_LIST, DESIRED_LIST_SO)
[1] TRUE
(Consider keeping the unique identifiers, ES and bar in desired list to avoid the duplicates rows.)
A tidyverse solution. First, create a look-up table, data2, which contains the four target columns. mutate(across(.fns = as.numeric)) is to make column type consistent. It may not be needed. Second, use map to apply left_join to all data frames in LIST. LIST2 and DESIRED_LIST are completely the same.
data2 <- data %>%
distinct(paper, study, sample, comp) %>%
mutate(across(.fns = as.numeric))
LIST2 <- map(LIST, function(x){
x2 <- x %>%
left_join(data2, by = names(x)) %>%
select(all_of(names(data2)))
return(x2)
})
# Check if the results are the same
identical(DESIRED_LIST, LIST2)
# [1] TRUE

Resources