Specifying multiple column names inside mutate - r

How do I specify column names inside mutate when multiple columns are generated?
In this example:
set.seed(5)
data.frame(x2 = sample(1:10, 10),
x3 = sample(1:10, 10),
x1 = sample(1:10, 10),
y3 = sample(1:10, 10),
y2 = sample(1:10, 10),
y1 = sample(1:10, 10)) |>
mutate(z1 = x1 - y1,
z2 = x2 - y2,
z3 = x3 - y3) |>
mutate(zz = across(num_range(prefix = 'x',
range = 1:3)) - across(num_range(prefix = 'y',
range = 1:3)))
Resulting in:
x2 x3 x1 y3 y2 y1 z1 z2 z3 zz.x1 zz.x2 zz.x3
1 2 3 9 10 9 6 3 -7 -7 3 -7 -7
2 9 10 6 6 4 5 1 5 4 1 5 4
3 7 6 4 8 8 3 1 -1 -2 1 -1 -2
4 3 2 3 4 10 8 -5 -7 -2 -5 -7 -2
5 1 5 2 5 7 7 -5 -6 0 -5 -6 0
6 6 4 5 3 6 2 3 0 1 3 0 1
7 5 8 10 2 1 4 6 4 6 6 4 6
8 10 7 8 7 3 1 7 7 0 7 7 0
9 4 1 1 9 2 9 -8 2 -8 -8 2 -8
10 8 9 7 1 5 10 -3 3 8 -3 3 8
I want zz.x1 to be named zz1, zz.x2 to be named zz2, and so on.

Here's a dplyr-way:
mutate takes its name(s) from the first across and we can change this using the .names-argument. It accepts glue-style input that we can adapt to your needs using str_replace().
library(dplyr)
library(stringr)
# Subtract each y<i> column from the x<i> column with the same suffix.
# The result takes its column names from the first across(); `.names` is a
# glue specification in which `{.col}` stands for the current column name,
# so x1, x2, x3 become z1, z2, z3.
df |>
  mutate(
    across(
      num_range(prefix = "x", range = 1:3),
      .names = "{str_replace(.col, 'x', 'z')}"
    ) -
      across(num_range(prefix = "y", range = 1:3))
  )
Output:
x2 x3 x1 y3 y2 y1 z1 z2 z3
1 2 3 9 10 9 6 3 -7 -7
2 9 10 6 6 4 5 1 5 4
3 7 6 4 8 8 3 1 -1 -2
4 3 2 3 4 10 8 -5 -7 -2
5 1 5 2 5 7 7 -5 -6 0
6 6 4 5 3 6 2 3 0 1
7 5 8 10 2 1 4 6 4 6
8 10 7 8 7 3 1 7 7 0
9 4 1 1 9 2 9 -8 2 -8
10 8 9 7 1 5 10 -3 3 8
Data:
set.seed(5)
df <- data.frame(x2 = sample(1:10, 10),
x3 = sample(1:10, 10),
x1 = sample(1:10, 10),
y3 = sample(1:10, 10),
y2 = sample(1:10, 10),
y1 = sample(1:10, 10))
Update: Or similar to OP's desired output
# Same approach on df2 (which already holds z1..z3): the differences
# x<i> - y<i> are added as zz<i>. `{.col}` in the `.names` glue specification
# is the name of the current column of the first across().
df2 |>
  mutate(
    across(
      num_range(prefix = "x", range = 1:3),
      .names = "{str_replace(.col, 'x', 'zz')}"
    ) -
      across(num_range(prefix = "y", range = 1:3))
  )
Output:
x2 x3 x1 y3 y2 y1 z1 z2 z3 zz1 zz2 zz3
1 2 3 9 10 9 6 3 -7 -7 3 -7 -7
2 9 10 6 6 4 5 1 5 4 1 5 4
3 7 6 4 8 8 3 1 -1 -2 1 -1 -2
4 3 2 3 4 10 8 -5 -7 -2 -5 -7 -2
5 1 5 2 5 7 7 -5 -6 0 -5 -6 0
6 6 4 5 3 6 2 3 0 1 3 0 1
7 5 8 10 2 1 4 6 4 6 6 4 6
8 10 7 8 7 3 1 7 7 0 7 7 0
9 4 1 1 9 2 9 -8 2 -8 -8 2 -8
10 8 9 7 1 5 10 -3 3 8 -3 3 8
Data
set.seed(5)
df2 <- data.frame(x2 = sample(1:10, 10),
x3 = sample(1:10, 10),
x1 = sample(1:10, 10),
y3 = sample(1:10, 10),
y2 = sample(1:10, 10),
y1 = sample(1:10, 10)) |>
mutate(z1 = x1 - y1,
z2 = x2 - y2,
z3 = x3 - y3)

I don't know how to do this with dplyr but in base R it is pretty straightforward. This might partly answer also your previous question.
# hard-coded variable suffixes
suff <- 1:3
# OR suffixes extracted from data
suff <- sort(unique(sub('[a-z]*', '', names(df))))
# Build one zz<suffix> column per suffix as the difference x<suffix> - y<suffix>.
for (s in suff) {
  x_col <- paste0('x', s)
  y_col <- paste0('y', s)
  df[[paste0('zz', s)]] <- df[[x_col]] - df[[y_col]]
}
df
# x2 x3 x1 y3 y2 y1 zz1 zz2 zz3
# 1 2 3 9 10 9 6 3 -7 -7
# 2 9 10 6 6 4 5 1 5 4
# 3 7 6 4 8 8 3 1 -1 -2
# 4 3 2 3 4 10 8 -5 -7 -2
# 5 1 5 2 5 7 7 -5 -6 0
# 6 6 4 5 3 6 2 3 0 1
# 7 5 8 10 2 1 4 6 4 6
# 8 10 7 8 7 3 1 7 7 0
# 9 4 1 1 9 2 9 -8 2 -8
# 10 8 9 7 1 5 10 -3 3 8
A more efficient way which avoids the loop over suffixes would be like this:
# Subtract the whole set of y columns from the matching x columns in one step,
# label the resulting columns zz<suffix>, then append them to df.
zz <- setNames(
  df[paste0('x', suff)] - df[paste0('y', suff)],
  paste0('zz', suff)
)
df <- cbind(df, zz)
Data:
set.seed(5)
df <- data.frame(x2 = sample(1:10, 10),
x3 = sample(1:10, 10),
x1 = sample(1:10, 10),
y3 = sample(1:10, 10),
y2 = sample(1:10, 10),
y1 = sample(1:10, 10))

Related

Calculating the difference of all variables to all other variables

Suppose my Dataframe look like:
w <- sample(-10:10, size =10)
x <- sample(-10:10, size =10)
y <- sample(-10:10, size =10)
z <- sample(-10:10, size =10)
df <- data.frame(w,x,y,z)
Now I want to calculate the absolute difference (|w-x|) between each column and all the others. So I want it like the example below, but additionally with the columns w_z, x_y, x_z, y_z, and so on (my original df has more variables).
# Absolute difference between w and each of the other columns shown so far.
df$w_x <- abs(df$w - df$x)
# w_y compares w with y (not x, which would merely duplicate w_x).
df$w_y <- abs(df$w - df$y)
Alternatively, if there is an easier solution, it is fine to put the result in a new data frame or matrix. Thanks in advance.
Here is a method with combn
# Names for every unordered pair of columns, joined as "<first>_<second>".
nm1 <- combn(names(df), 2, FUN = paste, collapse = "_")
# For every pair of columns, compute the absolute difference of the two and
# store the results in df under the pair names built above.
df[nm1] <- combn(df, 2, FUN = function(x) abs(x[[1]]- x[[2]]))
-output
> df
w x y z w_x w_y w_z x_y x_z y_z
1 7 3 -2 6 4 9 1 5 3 8
2 -6 6 7 0 12 13 6 1 6 7
3 -8 0 -7 8 8 1 16 7 8 15
4 9 -9 -4 -2 18 13 11 5 7 2
5 5 -3 5 -4 8 0 9 8 1 9
6 2 -6 -8 -10 8 10 12 2 4 2
7 -5 7 2 -9 12 7 4 5 16 11
8 3 -2 4 -6 5 1 9 6 4 10
9 -10 -4 9 9 6 19 19 13 13 0
10 8 5 10 4 3 2 4 5 1 6

derive multiple columns from multiple columns in r

Suppose we have the data below and would like to derive the variables z1, z2 and z3 as x1*y1, x2*y2 and x3*y3.
Could you please help me with how I can achieve this in R?
x1 <- c(1,2,3,4,5,6)
x2 <- c(2,3,4,5,6,7)
x3 <- c(3,4,5,6,7,8)
x4 <- c('A','B','C','D','E','F')
y1 <- c(1,2,3,4,5,6)
y2 <- c(2,3,4,5,6,7)
y3 <- c(3,4,5,6,7,8)
testa <- data.frame(x1,x2,x3,x4,y1,y2,y3)
Assuming the integrity of your structure and naming conventions, you can select the x and y variables, multiply them together as a group, and then assign the result back to z.
var_i <- 1:3
testa[paste0("z", var_i)] <- testa[paste0("x", var_i)] * testa[paste0("y", var_i)]
x1 x2 x3 x4 y1 y2 y3 z1 z2 z3
1 1 2 3 A 1 2 3 1 4 9
2 2 3 4 B 2 3 4 4 9 16
3 3 4 5 C 3 4 5 9 16 25
4 4 5 6 D 4 5 6 16 25 36
5 5 6 7 E 5 6 7 25 36 49
6 6 7 8 F 6 7 8 36 49 64
If we want to do this automatically, a tidyverse option is
library(dplyr)
library(stringr)
# For each column in x1:x3, look up the column whose name has the first "x"
# replaced by "y" and multiply the two. `.names` writes each result to a new
# column whose name has the first 'x' replaced by 'z'.
testa <- testa %>%
mutate(across(x1:x3, ~ .x * get(str_replace(cur_column(), "x",
"y")), .names = "{str_replace(.col, 'x', 'z')}"))
-output
testa
x1 x2 x3 x4 y1 y2 y3 z1 z2 z3
1 1 2 3 A 1 2 3 1 4 9
2 2 3 4 B 2 3 4 4 9 16
3 3 4 5 C 3 4 5 9 16 25
4 4 5 6 D 4 5 6 16 25 36
5 5 6 7 E 5 6 7 25 36 49
6 6 7 8 F 6 7 8 36 49 64

conditional counting and grouping for the whole dataframe

I have this dataframe:
> df <- data.frame(Semester = sample(1:4, 20, replace=TRUE),
X1 = sample(c(1:7,NA), 20, replace =TRUE),
X2 = sample(c(1:7,NA), 20, replace =TRUE),
X3 = sample(c(1:7,NA), 20, replace =TRUE),
X4 = sample(c(1:7,NA), 20, replace =TRUE),
X5 = sample(c(1:7,NA), 20, replace =TRUE),
X6 = sample(c(1:7,NA), 20, replace =TRUE),
X7 = sample(c(1:7,NA), 20, replace =TRUE),
stringsAsFactors = FALSE)
> df
Semester X1 X2 X3 X4 X5 X6 X7
1 4 3 7 NA NA 1 2 7
2 3 NA 3 NA 4 3 2 6
3 1 2 5 3 4 7 NA 2
4 3 1 1 6 1 3 2 4
5 1 1 2 1 3 2 6 5
6 2 1 7 1 5 2 2 6
7 4 7 6 5 2 7 1 2
8 1 5 5 7 4 5 1 5
9 1 3 1 1 5 6 3 7
10 3 6 NA 1 1 5 NA 2
11 1 1 6 6 6 3 5 7
12 3 1 5 1 2 3 1 NA
13 4 1 4 1 1 5 6 1
14 1 5 4 4 NA 5 3 3
15 2 2 NA 4 1 1 5 4
16 3 6 7 6 7 3 3 7
17 1 1 2 4 5 4 5 3
18 4 4 7 7 6 NA 4 NA
19 3 4 2 3 4 4 3 5
20 2 1 NA 3 5 7 NA 6
I'm trying to get the output below, where each n_k column is the number of times the value k occurs across all X* variables. For example, n_7 for Semester == 1 counts how many X* values equal 7 in the rows where Semester is 1. (This output is only illustrative; the values are made up.)
Semester n_7 n_6 n_5 n_4 n_3 n_2 n_1
1 5 7 1 5 7 7 7
2 4 10 1 3 6 3 4
3 5 5 2 5 3 3 2
4 3 9 10 5 7 0 0
I tried by(), but it also counts the values of Semester. Is there another way to do this?
by(df, df$Semester,function(df){
count_if(eq(7), df)
count_if(eq(6), df)
count_if(eq(5), df)
count_if(eq(4), df)
count_if(eq(3), df)
count_if(eq(2), df)
count_if(eq(1), df)})
You could use a dcast() melt() approach.
library(data.table)
# Reshape to long form (one row per Semester/value pair), then count how often
# each value occurs within each Semester.
# - as.data.table(): data.table's melt()/dcast() methods are written for
#   data.tables, so the data.frame is converted explicitly.
# - na.rm = TRUE drops missing values while melting, so no NA count column is
#   produced and nothing has to be removed by position afterwards.
# - Argument names are spelled out in full rather than partially matched.
# - as.data.frame() returns a plain data.frame.
as.data.frame(
  dcast(
    melt(as.data.table(df), id.vars = "Semester", na.rm = TRUE),
    Semester ~ value,
    value.var = "value",
    fun.aggregate = length
  )
)
# Semester 1 2 3 4 5 6 7
# 1 1 5 8 10 2 7 8 4
# 2 2 8 6 7 2 5 2 5
# 3 3 2 1 4 3 2 4 5
# 4 4 1 1 3 4 7 2 8

Expanding a dataframe with condition

I would like to expand a dataframe with duplicates of its own elements, but with specific conditions.
Here is my example data:
x1 <- c(1, 2, 3, 4, 5)
x2 <- c(2, 2, 2, 2, 2)
y1 <- c(9, 9, 8, 9, 9)
y2 <- c(0, 0, 0, 1, 1)
df <- data.frame(x1, x2, y1, y2)
df
x1 x2 y1 y2
1 1 2 9 0
2 2 2 9 0
3 3 2 8 0
4 4 2 9 1
5 5 2 9 1
The condition: only duplicate if y1 = 9 and y2 = 0. Therefore the output should look like this:
x1 x2 y1 y2
1 1 2 9 0
2 2 2 9 0
3 3 2 8 0
4 4 2 9 1
5 5 2 9 1
6 1 2 9 0
7 2 2 9 0
Case 1 and 2 were duplicated and accordingly the dataframe was expanded (new rows 6 and 7). Case 3, 4 and 5 were ignored, the condition was not met.
I am grateful for any help.
We can get the row index of the rows which satisfies our condition using which and just rbind those rows together to the original data frame.
inds <- which(df$y1 == 9 & df$y2 == 0)
rbind(df, df[inds,])
# x1 x2 y1 y2
#1 1 2 9 0
#2 2 2 9 0
#3 3 2 8 0
#4 4 2 9 1
#5 5 2 9 1
#6 1 2 9 0
#7 2 2 9 0
Or using dplyr bind_rows
library(dplyr)
bind_rows(df,
df %>%
filter(y1 == 9 & y2 == 0))
If we want to change the values for the duplicated rows for y1 to 10, we can do
bind_rows(df,
df %>%
filter(y1 == 9 & y2 == 0) %>%
mutate(y1 = 10)
)
# x1 x2 y1 y2
#1 1 2 9 0
#2 2 2 9 0
#3 3 2 8 0
#4 4 2 9 1
#5 5 2 9 1
#6 1 2 10 0
#7 2 2 10 0

How to add row and column to a dataframe of different length?

I have two dataframes of different length:
Headers <- data.frame(x = paste0("x", 1:4), y = 1:4)
Dataset <- data.frame(H = c(20, 10, 11, 8, 10), W = c(30, 20, 30, 10, 6))
Headers
x y
1 x1 1
2 x2 2
3 x3 3
4 x4 4
Dataset
H W
1 20 30
2 10 20
3 11 30
4 8 10
5 10 6
I need to convert column 'x' from 'Headers' to header, and column 'y' to corresponding values, and then bind to 'Dataset':
H W x1 x2 x3 x4
20 30 1 2 3 4
10 20 1 2 3 4
11 30 1 2 3 4
8 10 1 2 3 4
10 6 1 2 3 4
Here is the code which I tried:
H <- t(Headers)
Dataset <- cbind(H, Dataset)
names(H) <- NULL
Dataset <- qpcR:::cbind.na(H, Dataset)
Any help will be appreciated. Thanks.
Transpose 'y' and repeat to the desired number of rows. Set column names to 'x'.
cbind(Dataset, `colnames<-`(t(Headers$y)[rep(1, nrow(Dataset)), ], Headers$x))
H W x1 x2 x3 x4
1 20 30 1 2 3 4
2 10 20 1 2 3 4
3 11 30 1 2 3 4
4 8 10 1 2 3 4
5 10 6 1 2 3 4
A data.table approach:
library(data.table)
cbind(Dataset, dcast(Headers, . ~ x, value.var = "y")[,-1])
Output:
H W x1 x2 x3 x4
1 20 30 1 2 3 4
2 10 20 1 2 3 4
3 11 30 1 2 3 4
4 8 10 1 2 3 4
5 10 6 1 2 3 4
A tidyverse approach:
library(tidyverse)
# Turn the x/y pairs into a single row with one column per header name, then
# bind that row to Dataset; cbind() recycles the single row across all rows of
# Dataset. pivot_wider() replaces the superseded spread() and the deprecated
# summarise_all(funs(...)) step.
Headers %>%
  pivot_wider(names_from = x, values_from = y) %>%
  cbind(Dataset, .)
Output:
H W x1 x2 x3 x4
1 20 30 1 2 3 4
2 10 20 1 2 3 4
3 11 30 1 2 3 4
4 8 10 1 2 3 4
5 10 6 1 2 3 4
You could also do this in base R:
# Repeat every y value once per row of Dataset and shape the result into a
# matrix that is filled column-wise, so column j holds Headers$y[j] in every
# row. The columns are named after Headers$x (instead of default names), and
# ncol is given explicitly so that an empty Dataset still yields the right
# number of columns.
cbind(
  Dataset,
  matrix(
    rep(Headers$y, each = nrow(Dataset)),
    nrow = nrow(Dataset),
    ncol = nrow(Headers),
    dimnames = list(NULL, as.character(Headers$x))
  )
)

Resources