how can I use mutate in dplyr to modify variable dynamically? - r

I wish to dynamically select variables and modify them in a tibble. I have included a sample problem below. Can someone please help with that?
Thanks
df <- tibble(
x = c(1, 2, 3),
y = c(4, 5, 6),
z = c(6, 7, 8))
variables = c("x", "y")
for (var in variables)
{
df <- df %>% mutate(var = var + 1)
}

We can use mutate with across
library(dplyr)
# Add 1 to every column whose name is listed in the character vector
# `variables`; columns that are not listed are left unchanged.
df %>%
mutate(across(all_of(variables), ~ . + 1))
-output
# A tibble: 3 x 3
# x y z
# <dbl> <dbl> <dbl>
#1 2 5 6
#2 3 6 7
#3 4 7 8
data
df <- tibble(
x = c(1, 2, 3),
y = c(4, 5, 6),
z = c(6, 7, 8))
variables = c("x", "y")

Try this. You can use !! and sym() (both are rlang tools that become available when dplyr is loaded) together with the := operator to get the evaluation you want. Here is the code:
library(dplyr)

# Example data: three numeric columns, of which only some are to be modified.
df <- tibble(x = c(1, 2, 3), y = c(4, 5, 6), z = c(6, 7, 8))

# Names of the columns to update, held as strings.
variables <- c("x", "y")

# For each name: turn the string into a symbol with sym(), then unquote it
# with !! on both sides of := so that mutate() overwrites that column.
for (col_name in variables) {
  col_sym <- sym(col_name)
  df <- mutate(df, !!col_sym := !!col_sym + 1)
}
Output:
# A tibble: 3 x 3
x y z
<dbl> <dbl> <dbl>
1 2 5 6
2 3 6 7
3 4 7 8

Related

How to automatically fill in a blank column

I am trying to get the list of sums of two columns from my original data set, from left to right
I have made a loop:
for (i in 1:ncol(df)) {
m = i
n = i + 1
if (i %% 2 != 0) {
df_cum$V1 <- sum(df[,m] + df[,n])
}
}
But, the way to add value to the new list is wrong:
df_cum$V1 <- sum(df[,m] + df[,n])
It would be really appreciated if anyone knows how to do that in R.
You can try split.default(), i.e.
# Group the columns by their letter prefix (the column name with its digits
# removed) and sum every value in each group. vapply() pins the result to one
# numeric total per group, so the return type cannot silently change the way
# sapply()'s can (e.g. to a list when there are no columns).
vapply(split.default(df, gsub('\\d+', '', names(df))), sum, numeric(1))
A B
17 12
A base R option using tapply -
# Label every cell with the index of the two-column group it belongs to:
# unlist(df) walks the data column by column, so each label is repeated for
# 2 * nrow(df) consecutive values. Then sum the cells within each label.
cells_per_pair <- nrow(df) * 2
pair_id <- rep(
  seq_len(ncol(df)),
  each = cells_per_pair,
  length.out = nrow(df) * ncol(df)
)
tapply(unlist(df), pair_id, sum)
# 1 2 3
#17 12 13
The logic here is to create groups of every 2 columns and sum them.
data
It is easier to help if you provide data in a reproducible format
df <- data.frame(A1 = c(0, 3, 2), A2 = c(2, 6, 4),
B1 = c(3, 0, 1), B2 = c(2, 3, 3),
C1 = c(7, 3, 2), C2 = c(1, 0, 0))
We can do this in tidyverse
library(dplyr)
library(tidyr)

# Split each column name between its letter prefix and its numeric suffix
# (A1 -> "A" / "1"). ".value" turns every prefix into its own column, so the
# values of A1 and A2 are stacked in a single column A, and likewise for the
# other prefixes. The numeric suffix ("grp") is not needed afterwards, so it
# is dropped before each column is summed.
df1 %>%
  pivot_longer(
    everything(),
    names_to = c(".value", "grp"),
    names_sep = "(?<=[A-Z])(?=[0-9])"
  ) %>%
  select(-grp) %>%
  # Passing na.rm through across()'s `...` is deprecated since dplyr 1.1.0, so
  # the argument is supplied inside a lambda. No `.groups` argument is needed
  # because the data is not grouped.
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE)))
-output
# A tibble: 1 x 3
A B C
<dbl> <dbl> <dbl>
1 17 12 13
Or using base R
# Stack the columns into long (values, ind) form, strip the digits from each
# column name so that e.g. A1 and A2 share the key "A", then sum the values
# for every key.
aggregate(values ~ ., transform(stack(df1),
ind = sub("\\d+", "", ind)), FUN = sum)
ind values
1 A 17
2 B 12
3 C 13
Or another option with rowsum from base R
# trimws() with a digit pattern strips the leading/trailing digits from the
# stacked column names; rowsum() then adds up the values for each remaining
# prefix.
with(stack(df1), rowsum(values, group = trimws(ind, whitespace = "\\d+")))
[,1]
A 17
B 12
C 13
Or another option is with colSums and rowsum
{
  # Total every column, then add together the totals whose names share a
  # prefix. The prefix is the column name with its trailing digits removed
  # (rather than just its first character), so multi-letter prefixes such as
  # "AB1"/"AB2" are grouped correctly as well.
  tmp <- colSums(df1)
  rowsum(tmp, group = sub("\\d+$", "", names(tmp)))
}
[,1]
A 17
B 12
C 13
data
df1 <- structure(list(A1 = c(0, 3, 2), A2 = c(2, 6, 4), B1 = c(3, 0,
1), B2 = c(2, 3, 3), C1 = c(7, 3, 2), C2 = c(1, 0, 0)),
class = "data.frame", row.names = c(NA,
-3L))

Concatenate tibbles with different columns

I want to concatenate tibbles,
there are few common columns
few columns with same name but different values
one different column
I have created an example below; can someone please show how to build the desired tibble?
Thanks
library(tidyverse)
# common columns in both tibble
x <- c(1, 2, 3)
y <- c(2, 3, 4)
# common column name and different value for each tibble
v <- c(15, 10, 20)
# specific column to tibble
t_a <- c(4, 5, 6)
tbl_a <- tibble(x, y, v, t_a)
# common column name and different value for each tibble
v <- c(7, 11, 13)
# specific column to tibble
t_b<- c(9, 14, 46)
tbl_b <- tibble(x, y, v, t_b)
# concatenate tbl such output looks like this
x <- c(1, 2, 3, 1, 2, 3)
y <- c(2, 3, 4, 2, 3, 4)
v <- c(15, 10, 20, 7, 11, 13)
t <- c(4, 5, 6, 9, 14, 46)
name <- c("a", "a", "a", "b", "b", "b")
# desired output
tbl <- tibble(x, y, v, t, name)
Here, we can bind the datasets together and use pivot_longer
library(dplyr)
library(tidyr)

# Stack the two tibbles (the shared columns x, y and v line up; t_a and t_b are
# filled with NA where a tibble lacks them), then split the names "t_a"/"t_b"
# at the underscore: ".value" makes the part before it ("t") the name of the
# value column, and the part after it ("a"/"b") goes into `name`.
# `values_to` is omitted because pivot_longer() ignores it whenever `names_to`
# contains ".value". values_drop_na removes the NA rows that bind_rows()
# introduced.
bind_rows(tbl_a, tbl_b) %>%
  pivot_longer(
    cols = c(t_a, t_b),
    names_to = c(".value", "name"),
    names_sep = "_",
    values_drop_na = TRUE
  )
-output
# A tibble: 6 x 5
# x y v name t
# <dbl> <dbl> <dbl> <chr> <dbl>
#1 1 2 15 a 4
#2 2 3 10 a 5
#3 3 4 20 a 6
#4 1 2 7 b 9
#5 2 3 11 b 14
#6 3 4 13 b 46

How to add new column and calculate recursive cum using dplyr and shift

I have a dataset: (actually I have more than 100 groups)
and I want to use dplyr to create a variable-y for each group, and fill first value of y to be 1,
Second y = 1* first x + 2*first y
The result would be:
I tried creating a column y filled with 1 and then using
df%>% group_by(group)%>% mutate(var=shift(x)+2*shift(y))%>% ungroup()
but then the formula for y always uses the initial value of y (1):
Second y = 1* first x + 2*1
Could someone give me some ideas about this? Thank you!
The dput of my result data is:
structure(list(group = c("a", "a", "a", "a", "a", "b", "b", "b" ), x =
c(1, 2, 3, 4, 5, 6, 7, 8), y = c(1, 3, 8, 19, 42, 1, 8, 23)),
row.names = c(NA, -8L), class = c("tbl_df", "tbl", "data.frame" ))
To perform such calculation we can use accumulate from purrr or Reduce in base R.
Since you are already using dplyr we can use accumulate :
library(dplyr)

# Within each group, build y recursively: start from 1 (.init) and obtain each
# following value as 2 * previous y + previous x. In the lambda, .x is the
# accumulated y and .y is the next element of x. The last x of a group is
# dropped (x[-n()]) because, together with .init, the result must have exactly
# n() elements.
df %>%
  group_by(group) %>%
  mutate(y1 = purrr::accumulate(x[-n()], ~ .x * 2 + .y, .init = 1)) %>%
  # Drop the grouping so that later steps do not silently operate per group.
  ungroup()
# group x y y1
# <chr> <dbl> <dbl> <dbl>
#1 a 1 1 1
#2 a 2 3 3
#3 a 3 8 8
#4 a 4 19 19
#5 a 5 42 42
#6 b 6 1 1
#7 b 7 8 8
#8 b 8 23 23

Filter only EQUAL values into columns with dplyr::filter_if

I have following data:
df <- data.frame(
x = c(1, 4, 3, 4, 4, 3),
y = c(2, 3, 4, 4, 2, 3)
)
I tried this code:
library(tidyverse)
df %>%
filter_if(~ is.numeric(.), all_vars(. %in% c('3', '4')))
x y
1 4 3
2 3 4
3 4 4
4 3 3
But, the expected result is:
x y
1 3 3
2 4 4
How can I do this?
A different approach:
# library() stops with an error if the package is missing, whereas require()
# only returns FALSE and lets the script fail later with a confusing message.
library(tidyverse)

df <- data.frame(
  x = c(1, 4, 3, 4, 4, 3),
  y = c(2, 3, 4, 4, 2, 3),
  z = letters[1:6]
)

# Keep the rows whose numeric columns all hold the same value.
# Only the numeric columns are handed to apply(): calling apply() on the whole
# data frame converts it to a character matrix in which each numeric column is
# formatted (and space-padded) separately, so equal numbers can end up as
# different strings (e.g. " 4" vs "4") and text containing digits would be
# counted as well.
equal_rows <- df %>%
  filter(
    apply(
      as.matrix(select(., where(is.numeric))),
      1,
      function(row_values) length(unique(row_values)) == 1
    )
  )
equal_rows
gives:
x y z
1 4 4 d
2 3 3 f
I have added a non-numeric column to the example data, to illustrate this solution.
Not a filter_if() possibility, but essentially following that logic:
# Keep rows in which every numeric column equals the row-wise maximum of the
# numeric columns, i.e. all numeric values in the row are identical.
df %>%
filter(rowMeans(select_if(., is.numeric) == pmax(!!!select_if(., is.numeric))) == 1)
x y z
1 4 4 d
2 3 3 f
Sample data:
df <- data.frame(x = c(1, 4, 3, 4, 4, 3),
y = c(2, 3, 4, 4, 2, 3),
z = letters[1:6],
stringsAsFactors = FALSE)

Calculating the delta between multiple variables grouped by user ids

How might I calculate the delta between multiple variables grouped by user ids in a "long" data frame?
Data format:
# Example data in "long" format: two rows per user id, one for each product.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can be
# reassigned, which would silently corrupt the `purchased` column.
d1 <- data.frame(
  id = rep(c(1, 2, 3, 4, 5), each = 2),
  purchased = c(rep(c(TRUE, FALSE), 3), FALSE, TRUE, TRUE, FALSE),
  product = rep(c("A", "B"), 5),
  grade = c(1, 2, 1, 2, 2, 3, 7, 5, 1, 2),
  rate = c(10, 12, 10, 12, 12, 14, 22, 18, 10, 12),
  fee = rep(c(1, 2), 5)
)
This is my roundabout solution:
# Split the long data into one table per product ...
dA <- d1 %>%
  filter(product == "A")
dB <- d1 %>%
  filter(product == "B")

# ... and put the two rows of each user side by side; columns that exist in
# both tables get the suffix ".A" or ".B".
d2 <- inner_join(dA, dB, by = "id", suffix = c(".A", ".B"))

# Label which product was purchased and compute the B - A differences.
d3 <- d2 %>%
  mutate(
    # purchased.A is already logical, so it is used directly as the condition
    # instead of being compared with the reassignable shortcut `T`.
    purchased = if_else(purchased.A, "A", "B"),
    dGrade = grade.B - grade.A,
    dRate = rate.B - rate.A,
    dFee = fee.B - fee.A
  ) %>%
  select(id, purchased:dFee)
All of this just seems terribly inefficient and complex. Is tidyr::spread or another dplyr/tidyr function appropriate here? (I couldn't get anything else to work)...
We can do this with gather/spread. Reshape the data from 'wide' to 'long' using gather, grouped by 'id', 'Var', we get the 'product' based on the logical column 'purchased', get the difference of 'Val' for 'product' that are 'B' and 'A', and spread it from 'long' to 'wide' format.
library(dplyr)
library(tidyr)
# Reshape grade, rate and fee into long form, with the variable name in `Var`
# and its value in `Val`.
# NOTE(review): gather()/spread() are superseded in tidyr; pivot_longer()/
# pivot_wider() are the current equivalents.
gather(d1, Var, Val, grade:fee) %>%
group_by(id, Var) %>%
# `purchased` is logical, so product[purchased] picks the product that was
# bought; Val becomes the difference B - A for the current variable.
summarise(purchased = product[purchased],
Val = Val[product == 'B'] - Val[product == 'A'])%>%
# Spread the per-variable differences back out into one column per variable.
spread(Var, Val)
# id purchased fee grade rate
# <dbl> <fctr> <dbl> <dbl> <dbl>
#1 1 A 1 1 2
#2 2 A 1 1 2
#3 3 A 1 1 2
#4 4 B 1 -2 -4
#5 5 A 1 1 2
The OP's output ('d3') is
d3
# id purchased dGrade dRate dFee
#1 1 A 1 2 1
#2 2 A 1 2 1
#3 3 A 1 2 1
#4 4 B -2 -4 1
#5 5 A 1 2 1

Resources