How to mutate a subset of columns with dplyr? - r

I have this tbl
data_frame(a_a = letters[1:10], a_b = letters[1:10], a = letters[1:10])
I am trying to replace every "d" in each column whose name starts with a_ with the string "new value".
I thought the below code would do the job, but it doesn't:
data_frame(a_a = letters[1:10], a_b = letters[1:10], a = letters[1:10]) %>%
mutate_each(vars(starts_with('a_'), funs(gsub('d', 'new value',.))))
instead it gives
Error: is.fun_list(calls) is not TRUE

Following the approach from a similar question, and taking dft as your input, you can try:
dft %>%
dplyr::mutate_each(funs(replace(., . == "d", "nval")), matches("a_"))
which gives:
## A tibble: 10 × 3
# a_a a_b a
# <chr> <chr> <chr>
#1 a a a
#2 b b b
#3 c c c
#4 nval nval d
#5 e e e
#6 f f f
#7 g g g
#8 h h h
#9 i i i
#10 j j j

Related

R Replace column values in dataframe base on matching indexing column in separate dataframe

I have a dataframe 'df1' that looks like:
Number
Variable1
Variable
Variable3
1
A
B
C
2
A
B
C
3
A
B
C
4
A
B
C
5
A
B
C
And I have a second dataframe 'df2' that looks like:
Number
Variable1
Variable
Variable3
1
D
E
F
2
G
H
I
3
J
K
L
4
M
N
O
15
P
Q
R
I want to update the three Variable columns in df1 with the data in the Variable columns in df2 based on matching values in Number so that df1 ends up looking like:
Number
Variable1
Variable
Variable3
1
D
E
F
2
G
H
I
3
J
K
L
4
M
N
O
5
A
B
C
You could use a power_left_join from powerjoin package with conflict = coalesce_yx like this:
library(powerjoin)
power_left_join(df1, df2, by = "Number", conflict = coalesce_yx)
#> Number Variable1 Variable Variable3
#> 1 1 D E F
#> 2 2 G H I
#> 3 3 J K L
#> 4 4 M N O
#> 5 5 A B C
Created on 2022-12-13 with reprex v2.0.2
Data:
df1 <- read.table(text = 'Number Variable1 Variable Variable3
1 A B C
2 A B C
3 A B C
4 A B C
5 A B C
', header = TRUE)
df2 <- read.table(text = 'Number Variable1 Variable Variable3
1 D E F
2 G H I
3 J K L
4 M N O
15 P Q R
', header = TRUE)
It would be helpful if you had shared your data with dput(df). I have created another dataset for replication:
df1<-cbind.data.frame(id=c(1:5),var1=rep("A",5),var2=rep("B",5),var3=rep("C",5))
df2<-cbind.data.frame(id=c(1:4,15),var1=LETTERS[7:11],var2=LETTERS[12:16],var3=LETTERS[16:20])
df1 %>%
left_join(df2, by = "id") %>%
mutate(var1 = coalesce(var1.y, var1.x),
var2 = coalesce(var2.y, var2.x),
var3 = coalesce(var3.y, var3.x)) %>%
select(-var1.y, -var1.x,
-var2.y, -var2.x,
-var3.y, -var3.x)
With dplyr, we can use rows_update
library(dplyr)
rows_update(df1, df2, by = 'Number', unmatched = "ignore")
-output
Number Variable1 Variable Variable3
1 1 D E F
2 2 G H I
3 3 J K L
4 4 M N O
5 5 A B C
You could update df1 while joining using data.table package and fcoalesce function:
library(data.table)
cols = c("Variable1", "Variable", "Variable3")
setDT(df1)[df2, (cols) := Map(fcoalesce, mget(paste0("i.", cols)), mget(cols)), on="Number"]
Number Variable1 Variable Variable3
<int> <char> <char> <char>
1: 1 D E F
2: 2 G H I
3: 3 J K L
4: 4 M N O
5: 5 A B C

find value furthest to the right in a table r

Let's say I've got some data:
data <- tibble(A = c("a", "b", "c", "d"),
B = c("e", "f", "g", NA_character_),
C = c("h", "i", NA_character_, NA_character_))
Which looks like this:
# A tibble: 4 x 3
A B C
<chr> <chr> <chr>
1 a e h
2 b f i
3 c g NA
4 d NA NA
What I'd like to do is get the value that's furthest to the right into a new column:
# A tibble: 4 x 4
A B C D
<chr> <chr> <chr> <chr>
1 a e h h
2 b f i i
3 c g NA g
4 d NA NA d
I know I could do it with case_when and a bunch of logical !is.na(A) ~ A, statements, but say I've got a load of columns and that's not feasible. I feel like there probably is an easy way that I just don't know about and haven't been able to find. Thanks
coalesce would be easier:
library(dplyr)
data %>%
mutate(D = coalesce(C, B, A))
-output
# A tibble: 4 x 4
# A B C D
# <chr> <chr> <chr> <chr>
#1 a e h h
#2 b f i i
#3 c g <NA> g
#4 d <NA> <NA> d
Or, if there are many columns, reverse the column names with rev, convert them to symbols, and splice them into coalesce with !!!:
data %>%
mutate(D = coalesce(!!! rlang::syms(rev(names(.)))))

Combine multiple columns into vector by row with dplyr

I am trying to combine multiple columns into a single cell for each row and then remove missing values.
Sample data:
df <- data.frame(a=c("a", "b", "c", "d"),
b=c(NA, "a", "b", "c"),
c=c("a", "b", "e", "g"))
Attempt:
df %>% rowwise() %>%
mutate(collapse=as.character(paste(a,b,c, collapse=",")),
collapse_nona=na.omit(collapse))
Output:
# A tibble: 4 x 5
a b c collapse collapse_nona
* <fct> <fct> <fct> <chr> <chr>
1 a NA a a NA a,b a b,c b e,d c… a NA a,b a b,c b e,d …
2 b a b a NA a,b a b,c b e,d c… a NA a,b a b,c b e,d …
3 c b e a NA a,b a b,c b e,d c… a NA a,b a b,c b e,d …
4 d c g a NA a,b a b,c b e,d c… a NA a,b a b,c b e,d …
1) I am not successfully creating cells with values for each row (the whole column appears in collapse).
2) Cells in the collapse column do not behave like a vector.
Desired output
a b c collapse collapse_nona
* <fct> <fct> <fct> <chr> <chr>
1 a NA a a NA a a a
2 b a b b a b b a b
3 c b e c b e c b e
4 d c g d c g d c g
Thank you
With unite, there is an na.rm option, which is FALSE by default:
library(tidyr)
library(dplyr)
df %>%
mutate_all(as.character) %>%
unite(collapse, a, b,c, remove = FALSE, sep=" ") %>%
unite(collapse_nona, a, b, c, remove = FALSE, sep=" ", na.rm = TRUE) %>%
select(names(df), everything())
# a b c collapse collapse_nona
#1 a <NA> a a NA a a a
#2 b a b b a b b a b
#3 c b e c b e c b e
#4 d c g d c g d c g
Or with paste and str_remove_all (from stringr) - Note that paste/str_c are vectorized, so there is no need to loop over each row with rowwise
df %>%
mutate(collapse = paste(a, b, c),
collapse_nona = str_remove_all(collapse, "\\sNA|NA\\s"))
# a b c collapse collapse_nona
#1 a <NA> a a NA a a a
#2 b a b b a b b a b
#3 c b e c b e c b e
#4 d c g d c g d c g
Another option is pmap to loop over each row, remove the NA elements with na.omit and then paste or str_c (from stringr)
library(dplyr)
library(stringr)
library(purrr)
df %>%
mutate_all(as.character) %>%
mutate(collapse_nona = pmap_chr(., ~ c(...) %>%
na.omit %>%
str_c(collapse=" ")))
# a b c collapse_nona
#1 a <NA> a a a
#2 b a b b a b
#3 c b e c b e
#4 d c g d c g
I think the core issue is that you don't want collapse, you want sep. Then the rowwise calculation is unnecessary. Also, paste turns NA into the string "NA", so you cannot remove it with na.omit:
df %>%
mutate(collapse = paste(a,b,c, sep = " "), collapse_nona = gsub("NA", "", collapse))
a b c collapse collapse_nona
1 a <NA> a a NA a a a
2 b a b b a b b a b
3 c b e c b e c b e
4 d c g d c g d c g
I think this does it. You could play around with the sep argument in str_c.
library(dplyr)
library(stringr)
df %>%
mutate(collapse = str_c(str_replace_na(a), str_replace_na(b), str_replace_na(c), sep = " "),
collapse_nona = str_c(str_replace_na(a, ""), str_replace_na(b, ""), str_replace_na(c,""), sep = " "))
a b c collapse collapse_nona
1 a <NA> a a NA a a a
2 b a b b a b b a b
3 c b e c b e c b e
4 d c g d c g d c g

r create new data frame that matches in rows elements grouped by another column

I want to create a new data frame from the data frame df below. In the new data frame (df2), each element of df$name is placed in the first column and paired, one row per pair, with each other element of df$name that belongs to the same df$group.
df <- data.frame(group = rep(letters[1:2], each=3),
name = LETTERS[1:6])
> df
group name
1 a A
2 a B
3 a C
4 b D
5 b E
6 b F
In this example, "A", "B", and "C" in df$name belong to "a" in df$group, and I want to put them in the same row in a new data frame. The desired output looks like this:
> df2
V1 V2
1 A B
2 A C
3 B A
4 B C
5 C A
6 C B
7 D E
8 D F
9 E D
10 E F
11 F D
12 F E
We could do this in base R with merge
out <- setNames(subset(merge(df, df, by.x = 'group', by.y = 'group'),
name.x != name.y, select = -group), c("V1", "V2"))
row.names(out) <- NULL
out
# V1 V2
#1 A B
#2 A C
#3 B A
#4 B C
#5 C A
#6 C B
#7 D E
#8 D F
#9 E D
#10 E F
#11 F D
#12 F E
In my opinion this is a case for a self-join. Using dplyr, a solution can be:
library(dplyr)
inner_join(df, df, by="group") %>%
filter(name.x != name.y) %>%
select(V1 = name.x, V2 = name.y)
# V1 V2
# 1 A B
# 2 A C
# 3 B A
# 4 B C
# 5 C A
# 6 C B
# 7 D E
# 8 D F
# 9 E D
# 10 E F
# 11 F D
# 12 F E
df <- data.frame(group = rep(letters[1:2], each=3),
name = LETTERS[1:6])
library(tidyverse)
df %>%
group_by(group) %>% # for every group
summarise(v = list(expand.grid(V1=name, V2=name))) %>% # create all combinations of names
select(v) %>% # keep only the combinations
unnest(v) %>% # unnest combinations
filter(V1 != V2) # exclude rows with same names
# # A tibble: 12 x 2
# V1 V2
# <fct> <fct>
# 1 B A
# 2 C A
# 3 A B
# 4 C B
# 5 A C
# 6 B C
# 7 E D
# 8 F D
# 9 D E
# 10 F E
# 11 D F
# 12 E F

How to mutate columns whose column names differ by a suffix?

In a dataset like
data_frame(a=letters, a_1=letters, b=letters, b_1=letters)
I would like to concatenate the columns that share a similar "root", namely a with a_1 and b with b_1. The output should look like
# A tibble: 26 x 2
a b
<chr> <chr>
1 a a a a
2 b b b b
3 c c c c
4 d d d d
5 e e e e
6 f f f f
7 g g g g
8 h h h h
9 i i i i
10 j j j j
# ... with 16 more rows
If you're looking for a tidyverse approach, you can do it using tidyr::unite_:
library(tidyr)
# get a list of column name groups
cols <- split(names(df), sub("_.*", "", names(df)))
# loop through list and unite columns
for(x in names(cols)) {
df <- unite_(df, x, cols[[x]], sep = " ")
}
Here is one way to go about it,
ind <- sub('_.*', '', names(df))
as.data.frame(sapply(unique(ind), function(i) do.call(paste, df[i == ind])))
# a b
#1 a a a a
#2 b b b b
#3 c c c c
#4 d d d d
#5 e e e e
#6 f f f f
#7 g g g g
#8 h h h h

Resources