Subtracting columns in dataframe [duplicate] - r

I have a dataframe. I'd like to subtract the 2nd column from all other columns. I can do it in a loop, but I'd like to do it in one call. Here's my working loop code:
df <- data.frame(x = 100:101, y = 2:3,z=3:4,a = -1:0,b=4:5)
for( i in 3:length(df) ) {
df[i] <- df[i] - df[2]
}

If you need to subtract the columns 3:ncol(df) from the second column
df[3:ncol(df)] <- df[3:ncol(df)]-df[,2]

Another solution using dplyr::mutate_at() function
# install.packages("dplyr", dependencies = TRUE)
library(dplyr)
df <- data.frame(x = 100:101, y = 2:3, z = 3:4, a = -1:0, b = 4:5)
df %>%
mutate_at(vars(-matches("y"), -matches("x")), list(dif = ~ . - y))
#> x y z a b z_dif a_dif b_dif
#> 1 100 2 3 -1 4 1 -3 2
#> 2 101 3 4 0 5 1 -3 2
Created on 2019-11-05 by the reprex package (v0.3.0)

This would also work - returns the 9 columns that you subtracted the second one from.
df = data.frame(matrix(rnorm(100,0,1),nrow = 10))
df[,-2] - df[,2]

Related

mutate(across) to generate multiple new columns in tidyverse

I usually have to perform equivalent calculations on a series of variables/columns that can be identified by their suffix (ranging, let's say from _a to _i) and save the result in new variables/columns. The calculations are equivalent, but vary between the variables used in the calculations. These again can be identified by the same suffix (_a to _i). So what I basically want to achieve is the following:
newvar_a = (oldvar1_a + oldvar2_a) - z
...
newvar_i = (oldvar1_i + oldvar2_i) - z
This is the farest I got:
mutate(across(c(oldvar1_a:oldvar1_i), ~ . - z, .names = "{col}_new"))
Thus, I'm able to "loop" over oldvar1_a to oldvar1_i, substract z from them and save the results in new columns named oldvar1_a_new to oldvar1_i_new. However, I'm not able to include oldvar2_a to oldvar2_i in the calculations as R won't loop over them. (Additionally, I'd still need to rename the new columns).
I found a way to achieve the result using a for-loop. However, this definitely doesn't look like the most efficient and straightforward way to do it:
for (i in letters[1:9]) {
oldvar1_x <- paste0("oldvar1_", i)
oldvar2_x <- paste0("oldvar2_", i)
newvar_x <- paste0("newvar_", i)
df <- df %>%
mutate(!!sym(newvar_x) := (!!sym(oldvar1_x) + !!sym(oldvar2_x)) - z)
}
Thus, I'd like to know whether/how to make mutate(across) loop over multiple columns that can be identified by suffixes (as in the example above)
In this case, you can use cur_data() and cur_column() to take advantage that we are wanting to sum together columns that have the same suffix but just need to swap out the numbers.
library(dplyr)
df <- data.frame(
oldvar1_a = 1:3,
oldvar2_a = 4:6,
oldvar1_i = 7:9,
oldvar2_i = 10:12,
z = c(1,10,20)
)
mutate(
df,
across(
starts_with("oldvar1"),
~ (.x + cur_data()[gsub("1", "2", cur_column())]) - z,
.names = "{col}_new"
)
)
#> oldvar1_a oldvar2_a oldvar1_i oldvar2_i z oldvar2_a oldvar2_i
#> 1 1 4 7 10 1 4 16
#> 2 2 5 8 11 10 -3 9
#> 3 3 6 9 12 20 -11 1
If you want to use with case_when, just make sure to index using [[, you can read more here.
df <- data.frame(
oldvar1_a = 1:3,
oldvar2_a = 4:6,
oldvar1_i = 7:9,
oldvar2_i = 10:12,
z = c(1,2,0)
)
mutate(
df,
across(
starts_with("oldvar1"),
~ case_when(
z == 1 ~ .x,
z == 2 ~ cur_data()[[gsub("1", "2", cur_column())]],
TRUE ~ NA_integer_
),
.names = "{col}_new"
)
)
#> oldvar1_a oldvar2_a oldvar1_i oldvar2_i z oldvar1_a_new oldvar1_i_new
#> 1 1 4 7 10 1 1 7
#> 2 2 5 8 11 2 5 11
#> 3 3 6 9 12 0 NA NA
There is a fairly straightforward way to do what I believe you are attempting to do.
# first lets create data
library(dplyr)
df <- data.frame(var1_a=runif(10, min = 128, max = 131),
var2_a=runif(10, min = 128, max = 131),
var1_b=runif(10, min = 128, max = 131),
var2_b=runif(10, min = 128, max = 131),
var1_c=runif(10, min = 128, max = 131),
var2_c=runif(10, min = 128, max = 131))
# taking a wild guess at what your z is
z <- 4
# initialize a list
fnl <- list()
# iterate over all your combo, put in list
for (i in letters[1:3])
{
dc <- df %>% select(ends_with(i))
i <- dc %>% mutate(a = rowSums(dc[1:ncol(dc)]) - z)
fnl <- append(fnl, i)
}
# convert to a dataframe/tibble
final <- bind_cols(fnl)
I left the column names sloppy assuming you had specific requirements here. You can convert this loop into a function and do the whole thin in a single step using purrr.

How to get the sum of the product of selected column in a data frame?

This is probably very simple but I couldn't think of a solution.
I have the following data frame, and I want to multiply column y with column z and sum the answer.
> df <- data.frame(x = c(1,2,3), y = c(2,4,6), z = c(2,3,4))
> df
x y z
1 1 2 2
2 2 4 3
3 3 6 4
The value found should be equal to 40.
with would be an option here if we don't want to repeat df$ or df[[ to extract the column
with(df, sum( y * z))
#[1] 40
Or %*%
c(df$y %*% df$z)
Additionally, you could use data table. The second row after the comma indicates columns (j). You don't need the spaces, they're just there to show how it works.
library(data.table)
a <- data.table(x = c(1,2,3), y = c(2,4,6), z = c(2,3,4))
#dt i j by
a[ , sum(y*z), ]

R. Create a column that is the min() value of my row [duplicate]

I'm try to calculate minimum across multiple columns (row-wise min) in a data frame, but the min function automatically returns the minimum across the whole of each column rather than for each row separately. I'm sure I'm missing something really simple here? Any ideas much appreciated.
x <- c(1,2,7)
y <- c(1,5,4)
minIwant <- c(1,2,4)
df <- data.frame(x, y, minIwant)
df$minIget <- min(df$x,df$y)
df
x y minIwant minIget
1 1 1 1 1
2 2 5 2 1
3 7 4 4 1
You can use apply to go through each row
apply(df, 1, FUN = min)
Where 1 means to apply FUN to each row of df, 2 would mean to apply FUN to columns.
To remove missing values, use:
apply(df, 1, FUN = min, na.rm = TRUE)
We could use pmin, which finds the parallel minima of sets of values. Since our df is technically a list, we will need to run it via do.call.
df$min <- do.call(pmin, df)
which gives
df
# x y min
# 1 1 1 1
# 2 2 5 2
# 3 7 4 4
Data:
df <- data.frame(x = c(1, 2, 7), y = c(1, 5, 4))
Furthermore, if na.rm = TRUE is needed, you can do
df$min <- do.call(pmin, c(df, na.rm = TRUE))
Just want to add on how you can also do this with dplyr.
library(dplyr)
x<-c(1,2,7)
y<-c(1,5,4)
df <- data.frame(x,y)
df %>% rowwise() %>% mutate(minIget = min(x, y))
# A tibble: 3 x 3
x y minIget
<dbl> <dbl> <dbl>
1 1. 1. 1.
2 2. 5. 2.
3 7. 4. 4.
We could also use rowMins from library(matrixStats)
library(matrixStats)
df$minIwant <- rowMins(as.matrix(df))

Sum observations from two columns, looping over many columns in R

I have searched high and low, but am stuck on how to approach this. I have two sets of columns that I want to sum, row by row, but which I want to loop over many columns. If I were to do this manually, I would want:
df1[1,1]+df2[1,1]
df1[2,1]+df2[2,1]
etc... I've found many helpful examples on how to do something like:
apply(df[,c("a","d")], 1, sum)
though I want to do this over lots of columns. Also, while it's not entirely relevant, I want to phrase my question as close to my reality as possible, so my example below includes NA's, since my actual data contains many missing values.
# make a data frame, df1, with three columns
a <- sample(1:100, 50, replace = T)
b <- sample(100:300, 50, replace = T)
c <- sample(2:50, 500, replace = T)
df1 <- cbind(a,b,c)
# make another data frame, df2, with three columns
x <- sample(1:100, 50, replace = T)
y <- sample(100:300, 50, replace = T)
z <- sample(2:50, 50, replace = T)
df2 <- cbind(x,y,z)
# make another data frame, df2, with three columns
x <- sample(1:100, 50, replace = T)
y <- sample(100:300, 50, replace = T)
z <- sample(2:50, 50, replace = T)
df2 <- cbind(x,y,z)
Make it possible to randomly throw a few NAs in, function from http://www.r-bloggers.com/function-to-generate-a-random-data-set/
NAins <- NAinsert <- function(df, prop = .1){
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop*n*m)
id <- sample(0:(m*n-1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x){
df[rows[x], cols[x]] <<- NA
}
)
return(df)
}
Add the NAs to the frames
NAins(df1, .2)
NAins(df2, .14)
Then, I tried to seq along the columns in each data frame, and used apply setting the index to 1, meaning to sum each row entry. This doesn't work.
for(i in seq_along(df1)){
for(j in seq_along(df2)){
apply(c(df1[,i], col2[j]), 1, function(x) sum(x, na.rm = T))}}
Thanks for any help!
You should be able to just replace NA with 0, and then add with "+":
replace(df1, is.na(df1), 0) + replace(df2, is.na(df2), 0)
# X Y Z
# 1 7 19 6
# 2 11 12 1
# 3 16 14 11
# 4 13 7 13
# 5 10 2 11
Alternatively, if you have more than just two data.frames, you can collect them in a list and use Reduce:
Reduce("+", lapply(mget(c("df1", "df2", "df3")), function(x) replace(x, is.na(x), 0)))
Here's some sample data (and what I think is an easier way to create it):
set.seed(1) ## Set a seed so others can reproduce your sample data
dfmaker <- function() {
setNames(
data.frame(
replicate(3, sample(c(NA, 1:10), 5, TRUE), FALSE)),
c("X", "Y", "Z"))
}
df1 <- dfmaker()
df1
# X Y Z
# 1 2 9 2
# 2 4 10 1
# 3 6 7 7
# 4 9 6 4
# 5 2 NA 8
df2 <- dfmaker()
df2
# X Y Z
# 1 5 10 4
# 2 7 2 NA
# 3 10 7 4
# 4 4 1 9
# 5 8 2 3
df3 <- dfmaker()
You can transform the data.frame to an array and sum them using apply function.
install.package('abind')
library(abind)
df <- abind(list(df1,df2), along = 3)
results <- apply(df, MARGIN = c(1,2), FUN = function(x) sum(x, na.rm = TRUE))
results

Subtract a column in a dataframe from many columns in R

I have a dataframe. I'd like to subtract the 2nd column from all other columns. I can do it in a loop, but I'd like to do it in one call. Here's my working loop code:
df <- data.frame(x = 100:101, y = 2:3,z=3:4,a = -1:0,b=4:5)
for( i in 3:length(df) ) {
df[i] <- df[i] - df[2]
}
If you need to subtract the columns 3:ncol(df) from the second column
df[3:ncol(df)] <- df[3:ncol(df)]-df[,2]
Another solution using dplyr::mutate_at() function
# install.packages("dplyr", dependencies = TRUE)
library(dplyr)
df <- data.frame(x = 100:101, y = 2:3, z = 3:4, a = -1:0, b = 4:5)
df %>%
mutate_at(vars(-matches("y"), -matches("x")), list(dif = ~ . - y))
#> x y z a b z_dif a_dif b_dif
#> 1 100 2 3 -1 4 1 -3 2
#> 2 101 3 4 0 5 1 -3 2
Created on 2019-11-05 by the reprex package (v0.3.0)
This would also work - returns the 9 columns that you subtracted the second one from.
df = data.frame(matrix(rnorm(100,0,1),nrow = 10))
df[,-2] - df[,2]

Resources