calculate row sum and product in data.frame - r

I would like to append a columns to my data.frame in R that contain row sums and products
Consider following data frame
x y z
1 2 3
2 3 4
5 1 2
I want to get the following
x y z sum prod
1 2 3 6 6
2 3 4 9 24
5 1 2 8 10
I have tried
sum = apply(ages,1,add)
but it gives me a row vector. Can some one please show me an efficient command to sum and product and append them to original data frame as shown above?

Try
transform(df, sum=rowSums(df), prod=x*y*z)
# x y z sum prod
#1 1 2 3 6 6
#2 2 3 4 9 24
#3 5 1 2 8 10
Or
transform(df, sum=rowSums(df), prod=Reduce(`*`, df))
# x y z sum prod
#1 1 2 3 6 6
#2 2 3 4 9 24
#3 5 1 2 8 10
Another option would be to use rowProds from matrixStats
library(matrixStats)
transform(df, sum=rowSums(df), prod=rowProds(as.matrix(df)))
If you are using apply
df[,c('sum', 'prod')] <- t(apply(df, 1, FUN=function(x) c(sum(x), prod(x))))
df
# x y z sum prod
#1 1 2 3 6 6
#2 2 3 4 9 24
#3 5 1 2 8 10

Another approach.
require(data.table)
# Create data
dt <- data.table(x = c(1,2,5), y = c(2,3,1), z = c(3,4,2))
# Create index
dt[, i := .I]
# Compute sum and prod
dt[, sum := sum(x, y, z), by = i]
dt[, prod := prod(x, y, z), by = i]
dt
# Compute sum and prod using .SD
dt[, c("sum", "prod") := NULL]
dt
dt[, sum := sum(.SD), by = i, .SDcols = c("x", "y", "z")]
dt[, prod := prod(.SD), by = i, .SDcols = c("x", "y", "z")]
dt
# Compute sum and prod using .SD and list
dt[, c("sum", "prod") := NULL]
dt
dt[, c("sum", "prod") := list(sum(.SD), prod(.SD)), by = i,
.SDcols = c("x", "y", "z")]
dt
# Compute sum and prod using .SD and lapply
dt[, c("sum", "prod") := NULL]
dt
dt[, c("sum", "prod") := lapply(list(sum, prod), do.call, .SD), by = i,
.SDcols = c("x", "y", "z")]
dt

Following can also be done but column names need to be entered:
ddf$sum = with(ddf, x+y+z)
ddf$prod = with(ddf, x*y*z)
ddf
x y z sum prod
1 1 2 3 6 6
2 2 3 4 9 24
3 5 1 2 8 10
With data.table, another form can be:
library(data.table)
cbind(dt, dt[,list(sum=x+y+z, product=x*y*z),])
x y z sum product
1: 1 2 3 6 6
2: 2 3 4 9 24
3: 5 1 2 8 10
A simpler version is suggested by #David Arenberg in comments:
dt[, ":="(sum = x+y+z, product = x*y*z)]

Only a partial answer, but if all values are greater than or equal to 0, rowSums/rowsum can be used to calculate products:
df <- data.frame(x = c(1, 2, 5), y = c(2, 3, 1), z = c(3, 4, 2))
# custom row-product-function
my_rowprod <- function(x) exp(rowSums(log(x)))
df$prod <- my_rowprod(df)
df
The generic version is (including negatives):
my_rowprod_2 <- function(x) {
sign <- ifelse((rowSums(x < 0) %% 2) == 1, -1, 1)
prod <- exp(rowSums(log(abs(x)))) * sign
prod
}
df$prod <- my_rowprod_2(df)
df

Related

function in R that creates a new column in a data.table, whose i-th row is the sum of product of two columns starting from i+1 row

Example:
data.table(x=1:3, y=4:6)
I want to insert a new column whose values would be:
z=(2*5+3*6, 3*6, NA)
I tried to create this function firstly but it doesnt work :
sumprod <- function(x, y){
z=vector()
for (i in 1:length(x)-1){
z=c(z, sum(shift(x, n=i+1, type="lag")*shift(y, n=i+1, type="lag"), na.rm=FALSE))
}
return(z)
}
We may do
library(data.table)
dt1[, z := rev(cumsum(rev(Reduce(`*`, shift(.SD, type = "lead",
fill = 0)))))]
dt1[z == 0, z := NA_real_]
-output
> dt1
x y z
1: 1 4 28
2: 2 5 18
3: 3 6 NA
Or with fcumsum
library(collapse)
dt1[, z := fcumsum(shift(do.call(`*`, .SD), type = "lead")[.N:1])[.N:1]]
data
dt1 <- data.table(x=1:3, y=4:6)
You could Reduce using right=T argument (from right to left):
dt[,z:=shift(Reduce('+',x*y,accumulate=T,right=T),-1)][]
x y z
<int> <int> <int>
1: 1 4 28
2: 2 5 18
3: 3 6 NA

apply two functions by row condition data.table

I have the following data.table
df <- data.table(
id = c(rep(1,6),rep(2,6),rep(3,6)),
grp = c(rep("x",6),rep("y",6),rep("z",6)),
val1 = 1:18,
val2 = 13:30
)
I want two apply two different functions by row condition
for example:
cols <- paste0("val",1:2)
df[id == 1,lapply(.SD, function (x) tail(x,2)),.SDcols = cols,by = list(id,grp)]
df[id != 1,lapply(.SD, function (x) tail(x,3)),.SDcols = cols,by = list(id,grp)]
I'm quite new to working with data.table so there is maybe a more efficient way than carrying out separate calculations then joining the two tables above
If the conditions are disjunct, i.e., id == 1 and id != 1, and if id is also one of the grouping variables (in the by = clause), two different functions can be applied by
df[, lapply(.SD, function (x) if (first(id) == 1) tail(x, 2) else tail(x, 3)),
.SDcols = cols, by = .(id, grp)]
id grp val1 val2
1: 1 x 5 17
2: 1 x 6 18
3: 2 y 10 22
4: 2 y 11 23
5: 2 y 12 24
6: 3 z 16 28
7: 3 z 17 29
8: 3 z 18 30
So, subsetting is not by row but by grouping variable and has been moved into the anonymous function definition within lapply(). This avoids to rbind() the subsets afterwards.
For the sake of completeness, in the particular case of the tail() function being called with different parameters we can write more concisely
df[, lapply(.SD, tail, n = fifelse(first(id) == 1, 2, 3)),
.SDcols = cols, by = .(id, grp)]
Here is another option:
df[.N:1L, ri := rowid(id, grp)]
rbindlist(list(
df[id == 1L & ri <= 2L], #for the first, df[id == 1L, tail(.SD, 2L), .(id, grp), .SDcols = cols]
df[id != 1L & ri <= 3L] #and for df[id != 2, tail(.SD, 3L), .(id,grp), .SDcols = cols]
))
output:
id grp val1 val2 ri
1: 1 x 5 17 2
2: 1 x 6 18 1
3: 2 y 10 22 3
4: 2 y 11 23 2
5: 2 y 12 24 1
6: 3 z 16 28 3
7: 3 z 17 29 2
8: 3 z 18 30 1
Would be interested to know the size of your dataset and the speedup.

Max by Group with Condition for a data.table

I have data like this:
library(data.table)
group <- c("a","a","a","b","b","b")
cond <- c("N","Y","N","Y","Y","N")
value <- c(2,1,3,4,2,5)
dt <- data.table(group, cond, value)
group cond value
a N 2
a Y 1
a N 3
b Y 4
b Y 2
b N 5
I would like to return max value when the cond is Y for the entire group. Something like this:
group cond value max
a N 2 1
a Y 1 1
a N 3 1
b Y 4 4
b Y 2 4
b N 5 4
I've tried adding an ifelse condition to a grouped max, however, I end up just returning the no condition of NA when the row doesn't meet the condition:
dt[, max := ifelse(cond=="Y", max(value), NA), by = group]
Assuming that for each 'group' we need to get the max of 'value' where the 'cond' is "Y", after grouping by 'group', subset the 'value' with the logical condition (cond == 'Y') and get the max value
dt[, max := max(value[cond == 'Y']), by = group]
dt
# group cond value max
#1: a N 2 1
#2: a Y 1 1
#3: a N 3 1
#4: b Y 4 4
#5: b Y 2 4
#6: b N 5 4
You could do...
dt[CJ(group = group, cond = "Y", unique=TRUE), on=.(group, cond),
.(mv = max(value))
, by=.EACHI]
# group cond mv
# 1: a Y 1
# 2: b Y 4
Using a join like this will eventually have optimization of the max calculation.
Another way (originally included in #akrun's answer):
dt[cond == "Y", mv := max(value), by=group]
From the prior link, we can see that this way is already optimized, except for the := part.

Ranking multiple columns by different orders using data table

Using my example below, how can I rank multiple columns using different orders, so for example rank y as descending and z as ascending?
require(data.table)
dt <- data.table(x = c(rep("a", 5), rep("b", 5)),
y = abs(rnorm(10)) * 10, z = abs(rnorm(10)) * 10)
cols <- c("y", "z")
dt[, paste0("rank_", cols) := lapply(.SD, function(x) frankv(x, ties.method = "min")), .SDcols = cols, by = .(x)]
data.table's frank() function has some useful features which aren't available in base R's rank() function (see ?frank). E.g., we can reverse the order of the ranking by prepending the variable with a minus sign:
library(data.table)
# create reproducible data
set.seed(1L)
dt <- data.table(x = c(rep("a", 5), rep("b", 5)),
y = abs(rnorm(10)) * 10, z = abs(rnorm(10)) * 10)
# rank y descending, z ascending
dt[, rank_y := frank(-y), x][, rank_z := frank(z), x][]
x y z rank_y rank_z
1: a 6.264538 15.1178117 3 4
2: a 1.836433 3.8984324 5 1
3: a 8.356286 6.2124058 2 2
4: a 15.952808 22.1469989 1 5
5: a 3.295078 11.2493092 4 3
6: b 8.204684 0.4493361 1 2
7: b 4.874291 0.1619026 4 1
8: b 7.383247 9.4383621 2 5
9: b 5.757814 8.2122120 3 4
10: b 3.053884 5.9390132 5 3
If there are many columns which are to be ranked individually, some descending, some ascending, we can do this in two steps
# first rank all columns in descending order
cols_desc <- c("y")
dt[, paste0("rank_", cols_desc) := lapply(.SD, frankv, ties.method = "min", order = -1L),
.SDcols = cols_desc, by = x][]
# then rank all columns in ascending order
cols_asc <- c("z")
dt[, paste0("rank_", cols_asc) := lapply(.SD, frankv, ties.method = "min", order = +1L),
.SDcols = cols_asc, by = x][]
x y z rank_y rank_z
1: a 6.264538 15.1178117 3 4
2: a 1.836433 3.8984324 5 1
3: a 8.356286 6.2124058 2 2
4: a 15.952808 22.1469989 1 5
5: a 3.295078 11.2493092 4 3
6: b 8.204684 0.4493361 1 2
7: b 4.874291 0.1619026 4 1
8: b 7.383247 9.4383621 2 5
9: b 5.757814 8.2122120 3 4
10: b 3.053884 5.9390132 5 3

how to compare and tabulate frequency of common elements between two vectors in r

I have two vectors having common and repetitive elements. I want a table comparing the frequency of common elements in both vectors. Here is subset
plyr::count(V1)
x freq
1 A*02:01 106
2 A*02:02 88
3 A*03:01 95
4 A*03:02 60
plyr::count(V2)
x freq
1 A*02:01 11
2 A*02:02 11
3 A*02:04 1
4 A*03:01 20
The Output I want is:
x freq.V1 freq.V2
1 A*02:01 106 11
2 A*02:02 88 11
3 A*03:01 60 20
I think merge seems a good choice here as the default is to keep observations common to both datasets. So the following should work
merge(plyr::count(V1), plyr::count(V2), by="x")
Worked example
plyr::count(mtcars$gear)
# x freq
# 1 3 15
# 2 4 12
# 3 5 5
plyr::count(mtcars$gear[1:10])
# x freq
# 1 3 4
# 2 4 6
merge(
plyr::count(mtcars$gear),
plyr::count(mtcars$gear[1:10]),
by="x")
# x freq.x freq.y
# 1 3 15 4
# 2 4 12 6
Just use table:
tbl1 <- table(V1[V1 %in% (int <- intersect(unique(V1), unique(V2)))])
tbl2 <- table(V2[V2 %in% int])
data.frame(x = names(tbl1), freq.V1 = as.vector(tbl1), freq.V2 = as.vector(tbl2))
Or my favorite, data.table:
library(data.table)
DT <- data.table(V1 = V1, V2 = V2)
DT[V1 %in% unique(V2), .(freq.V1 = .N), by = .(x = V1)
][DT[V2 %in% unique(V1), .N, by = .(x = V2)],
freq.V2 := i.N, on = "x", nomatch = 0L]
Of course both options look much simpler if you know beforehand that V1 and V2 consist of the same set of elements:
data.frame(x = names(tbl1 <- table(V1)), freq.V1 = as.vector(tbl1),
freq.V2 = as.vector(table(V2)))
and
DT[ , .(freq.V1 = .N), by = .(x = V1)
][DT[ , .(freq.V2 = .N), by = .(x = V2)], on = "x"]

Resources