Sum of a subset based on a second vector in R

I got two vectors:
a <- c(1,1,2,3,4,4,4,4,5,6)
b <- c(T,F,T,F,T,T,F,F,F,T)
I would like a result that tells me, for each unique value in a, how many TRUEs there are in b (the second column in the output below):
[,1] [,2]
[1,] 1 1
[2,] 2 1
[3,] 3 0
[4,] 4 2
[5,] 5 0
[6,] 6 1
The best I can come up with here is using sapply:
sapply(unique(a), FUN = function(uniqueA, a, b) sum(b[a == uniqueA]), a = a, b = b)
This works, but for larger vectors it is rather slow (I tried a few subsetting variants):
a <- sample(1:1000, 1e5, replace = TRUE)
b <- sample(c(T,F), 1e5, replace = TRUE)
microbenchmark::microbenchmark(
subset = sapply(unique(a), FUN = function(uniqueA, a, b) sum(b[a == uniqueA]), a = a, b = b)
, iN = sapply(unique(a), FUN = function(uniqueA, a, b) sum(a %in% uniqueA & b), a = a, b = b)
, equal = sapply(unique(a), FUN = function(uniqueA, a, b) sum(a == uniqueA & b), a = a, b = b)
, times = 5
)
Unit: milliseconds
expr min lq mean median uq max neval
subset 389.1995 390.6002 413.6969 393.0396 445.6553 449.9897 5
iN 2746.8407 2798.0462 2797.3155 2806.9477 2814.6317 2820.1110 5
equal 1080.3430 1089.2507 1111.0267 1096.8082 1135.1957 1153.5358 5
Does anyone have an idea how to do this faster?

You could use aggregate:
aggregate(b, list(a), sum)
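For reference, aggregate returns a data frame whose default column names are Group.1 and x; naming the grouping list gives nicer names (a small variation on the same call, using the data from the question):
aggregate(b, list(a = a), FUN = sum)
#  a x
#1 1 1
#2 2 1
#3 3 0
#4 4 2
#5 5 0
#6 6 1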
For the fastest performance, I'd suggest a data.table. It will take longer to set up, but the performance should be quite good for larger amounts of data.
library(data.table)
dt <- data.table(a = a, b = b)
dt[,sum(b), by = a]
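By default the summed column comes back as V1; a minor variation names it explicitly (n_true is just a name chosen here), returning one row per unique value of a:
dt[, .(n_true = sum(b)), by = a]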
Speed test comparing (1) aggregate, (2) sapply, (3) data.table, (4) tapply:
a <- sample(1:1000, 1e5, replace = TRUE)
b <- sample(c(T,F), 1e5, replace = TRUE)
summarize_dt <- function(x) {
dt <- data.table(a = a, b = b)
dt[,sum(b), by = a]
}
microbenchmark::microbenchmark(
aggregate = aggregate(b, list(a), sum),
sapply = sapply(unique(a), FUN = function(uniqueA, a, b) sum(b[a == uniqueA]), a = a, b = b),
datatable = summarize_dt(),
tapply = tapply(b, a, sum)
)
#expr min lq mean median uq max neval
#aggregate 130.995347 133.672041 141.404597 135.301762 137.199151 213.730345 100
#sapply 335.344866 357.387474 394.432339 411.994214 425.604144 486.548520 100
#datatable 1.540011 1.914712 2.430220 2.027578 2.239999 5.297593 100
#tapply 3.075646 3.627395 4.719595 4.089434 5.934675 8.758332 100
It looks like data.table is the fastest by a wide margin.

Here is one option using table in base R:
t <- table(a[b])
z <- as.numeric(names(t))
rbind(unname(cbind(z, t)), cbind(setdiff(unique(a),z),0))
# [,1] [,2]
# [1,] 1 1
# [2,] 2 1
# [3,] 4 2
# [4,] 6 1
# [5,] 3 0
# [6,] 5 0
If you only want the values with a non-zero number of TRUEs, table(a[b]) alone would suffice.
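A related sketch of the same idea: tabulating a factor whose levels are all unique values of a keeps the zero counts directly and avoids the rbind step:
table(factor(a[b], levels = sort(unique(a))))
#
#1 2 3 4 5 6
#1 1 0 2 0 1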

Or we can use the tidyverse:
library(tidyverse)
tibble(a, b) %>%
group_by(a) %>%
summarise(b = sum(b))
A base R option would be
rowsum(+b, a)
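rowsum returns a one-column matrix with the grouping values as row names, so if you want the exact two-column matrix from the question, a small follow-up step (a sketch) gets you there:
rs <- rowsum(+b, a)  # one column of counts, row names = unique values of a
unname(cbind(as.numeric(rownames(rs)), rs))
#      [,1] [,2]
# [1,]    1    1
# [2,]    2    1
# [3,]    3    0
# [4,]    4    2
# [5,]    5    0
# [6,]    6    1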


How to transform a factor into a numeric binary variable?

I have a column with different types of sites (factor):
Localisation
A
A
B
A
B
B
I would like to create a new column with binary values (numeric) that correspond to the Localisation column:
A = 1 and B = 0
Localisation Binom
A 1
A 1
B 0
A 1
B 0
B 0
Thanks!
dplyr approach, handy when there are more than two if-else conditions.
df <- read.table(stringsAsFactors = T, header = T, text = "Localisation
A
A
B
A
B
B")
library(dplyr)
df %>% mutate(Binom = case_when(Localisation == "A" ~ 1, #condition1
Localisation == "B" ~ 0) #condition2
)
Here are several alternatives.
library(dplyr)
library(microbenchmark)
mb <- microbenchmark(
NelsonGon = ifelse(df$Localisation %in% "A",1,0),
Edward = ifelse(df$Localisation == "A",1,0),
Edward2 = +(df$Localisation == "A"),
Rui = 2L - as.integer(df$Localisation),
massisenergy = df %>% mutate(Binom = case_when(Localisation == "A" ~ 1, #condition1
Localisation == "B" ~ 0) #condition2
)
)
print(mb, unit = "relative", order = "median")
#Unit: relative
# expr min lq mean median uq max neval cld
# Rui 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 100 a
# NelsonGon 4.107345 3.041659 2.490878 2.679642 2.341985 0.4714148 100 ab
# Edward2 4.358608 3.339862 2.834451 3.032853 2.741840 1.0814972 100 ab
# Edward 7.631876 5.320371 4.330419 4.575165 3.967027 1.4852678 100 b
# massisenergy 247.792745 161.000287 117.762537 131.729545 96.032138 22.5566734 100 c
Data.
Localisation <- scan(what = character(), text = '
A
A
B
A
B
B')
df <- data.frame(Localisation)
Another option is
+(df$Localisation != "B")
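For context on the Rui entry in the benchmark above: it relies on Localisation being stored as a factor with levels c("A", "B") (in R < 4.0 data.frame() does this by default; otherwise add stringsAsFactors = TRUE). Then as.integer() gives 1 for A and 2 for B, and subtracting from 2L yields the desired 1/0 coding:
as.integer(df$Localisation)       # 1 1 2 1 2 2  (A -> 1, B -> 2)
2L - as.integer(df$Localisation)  # 1 1 0 1 0 0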

Operations on a matrix with (i, j) values without for or while loops

I need to write a function in R that receives as input an integer n > 1 and generates an output matrix P, where P[i, j] = min(i, j) for i, j = 1, ..., n. The function must not use for or while loops.
So far I have tried with the following code.
mat <- function(n){
m <- matrix(0,nrow = n,ncol = n)
if(row(m) >= col(m)){
col(m)
}
else{
row(m)
}
}
I know that with if conditions, row(m) and col(m) I should be able to traverse the matrix; however, I don't know how to set up those conditions so that I get the minimum of row(m) and col(m) in position (i, j). I know I won't achieve that with the conditions I have above, but this is the closest I've got so far.
An example is the following.
If n=3, then the result should be:
[,1] [,2] [,3]
[1,] 1 1 1
[2,] 1 2 2
[3,] 1 2 3
Try pmin, row and col
f1 <- function(n = 3) {
mat <- matrix(nrow = n, ncol = n)
pmin(row(mat), col(mat))
}
f1()
# [,1] [,2] [,3]
#[1,] 1 1 1
#[2,] 1 2 2
#[3,] 1 2 3
Or use outer and pmin, which is more efficient:
f2 <- function(n = 3) {
idx <- sequence(n)
outer(idx, idx, pmin)
}
Benchmark
library(microbenchmark)
n <- 10000
b <- microbenchmark(
f1 = f1(n),
f2 = f2(n),
times = 10
)
library(ggplot2)
autoplot(b)
b
#Unit: seconds
# expr min lq mean median uq max neval cld
# f1 5.554471 5.908210 5.924173 5.950610 5.996274 6.058502 10 b
# f2 1.272793 1.298099 1.354428 1.309208 1.464950 1.495362 10 a

Repeating calculation based on conditions

What I am trying to do is pretty simple. However, I am new to R and have not learned much about loops and functions and am not sure what is the most efficient way to get the results. Basically, I want to count the number of rows that meet my conditions and do a division. Here is an example:
df1 <- data.frame(
Main = c(0.0089, -0.050667, -0.030379, 0.066484, 0.006439, -0.026076),
B = c(NA, 0.0345, -0.0683, -0.052774, 0.014661, -0.040537),
C = c(0.0181, 0, -0.056197, 0.040794, 0.03516, -0.022662),
D = c(-0.0127, -0.025995, -0.04293, 0.057816, 0.033458, -0.058382)
)
df1
# Main B C D
# 1 0.008900 NA 0.018100 -0.012700
# 2 -0.050667 0.034500 0.000000 -0.025995
# 3 -0.030379 -0.068300 -0.056197 -0.042930
# 4 0.066484 -0.052774 0.040794 0.057816
# 5 0.006439 0.014661 0.035160 0.033458
# 6 -0.026076 -0.040537 -0.022662 -0.058382
My criterion for the numerator is to count the number of B/C/D values that are > 0 when Main is > 0; for the denominator, count the number of B/C/D values that are != 0 when Main is != 0. I can use length(which(df1$Main > 0 & df1$B > 0)) / length(which(df1$Main != 0 & df1$B != 0)) to get the ratio for each column individually. But my data set has many more columns, and I am wondering if there is a way to get those ratios all at once, so that my result looks like:
# B C D
# 1 0.2 0.6 0.3
Use apply:
apply(df1[,-1], 2, function(x) length(which(df1$Main >0 & x>0)) / length(which(df1$Main !=0 & x !=0)))
You could do this vectorized (no apply or for needed):
tail(colSums(df1[df1$Main>0,]>0, na.rm = T) / colSums(df1[df1$Main!=0,]!=0, na.rm = T), -1)
# B C D
#0.2000000 0.6000000 0.3333333
One way to do this would be with a for loop that loops over the columns and applies the function that you wrote. Something like this:
ratio1<-vector()
for(i in 2:ncol(df1)){
ratio1[i-1] <- length(which(df1$Main >0 & df1[,i]>0)) / length(which(df1$Main !=0 & df1[,i] !=0))
}
Maybe there is a better way to do this with apply or data.table, but this is a simple solution I could come up with. It works on any number of columns. Use round() if you want the answer to one decimal place.
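One small addition (my suggestion, not part of the answer above): naming the result makes it line up with the column names:
names(ratio1) <- colnames(df1)[-1]
ratio1
#        B         C         D
#0.2000000 0.6000000 0.3333333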
criteria1 <- df1[which(df1$Main > 0), -1] > 0
criteria2 <- df1[which(df1$Main != 0), -1] != 0
colSums(criteria1, na.rm = T)/colSums(criteria2, na.rm = T)
## B C D
## 0.2000000 0.6000000 0.3333333
Edit: It appears Niek's method is the quickest for this specific data.
# Unit: microseconds
# expr min lq mean median uq max neval
# Jim(df1) 216.468 230.0585 255.3755 239.8920 263.6870 802.341 300
# emilliman5(df1) 120.109 135.5510 155.9018 142.4615 156.0135 1961.931 300
# Niek(df1) 97.118 107.6045 123.5204 111.1720 119.6155 1966.830 300
# nine89(df1) 211.683 222.6660 257.6510 232.2545 252.6570 2246.225 300
#[[1]]
# [,1] [,2] [,3] [,4]
#median 239.892 142.462 111.172 232.255
#ratio 1.000 0.594 0.463 0.968
#diff 0.000 -97.430 -128.720 -7.637
However, when there are many columns the vectorized approach is quicker.
Nrow <- 1000
Ncol <- 1000
mat <- matrix(runif(Nrow*Ncol),Nrow)
df1 <- data.frame(Main = sample(-2:2,Nrow,T), mat) #1001 columns
#Unit: milliseconds
# expr min lq mean median uq max
# Jim(df1) 46.75627 53.88500 66.93513 56.58143 62.04375 185.0460
#emilliman5(df1) 73.35257 91.87283 151.38991 178.53188 185.06860 292.5571
# Niek(df1) 68.17073 76.68351 89.51625 80.14190 86.45726 200.7119
# nine89(df1) 51.36117 56.79047 74.53088 60.07220 66.34270 191.8294
#[[1]]
# [,1] [,2] [,3] [,4]
#median 56.581 178.532 80.142 60.072
#ratio 1.000 3.155 1.416 1.062
#diff 0.000 121.950 23.560 3.491
Functions
Jim <- function(df1){
criteria1 <- df1[which(df1$Main > 0), -1] > 0
criteria2 <- df1[which(df1$Main != 0), -1] != 0
colSums(criteria1, na.rm = T)/colSums(criteria2, na.rm = T)
}
emilliman5 <- function(df1){
apply(df1[,-1], 2, function(x) length(which(df1$Main >0 & x>0)) / length(which(df1$Main !=0 & x !=0)))
}
Niek <- function(df1){
ratio1<-vector()
for(i in 2:ncol(df1)){
ratio1[i-1] <- length(which(df1$Main >0 & df1[,i]>0)) / length(which(df1$Main !=0 & df1[,i] !=0))
}
ratio1
}
nine89 <- function(df){
tail(colSums(df[df$Main>0,]>0, na.rm = T) / colSums(df[df$Main!=0,]!=0, na.rm = T), -1)
}

Count number of occurrences of vector in list

I have a list of vectors of variable length, for example:
q <- list(c(1,3,5), c(2,4), c(1,3,5), c(2,5), c(7), c(2,5))
I need to count the number of occurrences for each of the vectors in the list, for example (any other suitable data structure is acceptable):
list(list(c(1,3,5), 2), list(c(2,4), 1), list(c(2,5), 2), list(c(7), 1))
Is there an efficient way to do this? The actual list has tens of thousands of items so quadratic behaviour is not feasible.
match and unique accept and handle "list"s too (?match warns that it can be slow on "list"s). So, with:
match(q, unique(q))
#[1] 1 2 1 3 4 3
each element is mapped to a single integer. Then:
tabulate(match(q, unique(q)))
#[1] 2 1 2 1
And find a structure to present the results:
as.data.frame(cbind(vec = unique(q), n = tabulate(match(q, unique(q)))))
# vec n
#1 1, 3, 5 2
#2 2, 4 1
#3 2, 5 2
#4 7 1
As an alternative to the match(x, unique(x)) approach, we could map each element to a single value by deparse-ing it:
table(sapply(q, deparse))
#
# 7 c(1, 3, 5) c(2, 4) c(2, 5)
# 1 2 1 2
Also, since this is a case with unique integers, and assuming they are in a small range, we could map each element to a single integer after transforming it to a binary representation:
n = max(unlist(q))
pow2 = 2 ^ (0:(n - 1))
sapply(q, function(x) tabulate(x, nbins = n)) # 'binary' form
sapply(q, function(x) sum(tabulate(x, nbins = n) * pow2))
#[1] 21 10 21 18 64 18
and then tabulate as before.
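Spelled out, that last step is the same match/tabulate idiom as before (a sketch using the codes just computed):
v <- sapply(q, function(x) sum(tabulate(x, nbins = n) * pow2))
tabulate(match(v, unique(v)))
#[1] 2 1 2 1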
And just to compare the above alternatives:
f1 = function(x)
{
ux = unique(x)
i = match(x, ux)
cbind(vec = ux, n = tabulate(i))
}
f2 = function(x)
{
xc = sapply(x, deparse)
i = match(xc, unique(xc))
cbind(vec = x[!duplicated(i)], n = tabulate(i))
}
f3 = function(x)
{
n = max(unlist(x))
pow2 = 2 ^ (0:(n - 1))
v = sapply(x, function(X) sum(tabulate(X, nbins = n) * pow2))
i = match(v, unique(v))
cbind(vec = x[!duplicated(v)], n = tabulate(i))
}
q2 = rep_len(q, 1e3)
all.equal(f1(q2), f2(q2))
#[1] TRUE
all.equal(f2(q2), f3(q2))
#[1] TRUE
microbenchmark::microbenchmark(f1(q2), f2(q2), f3(q2))
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# f1(q2) 7.980041 8.161524 10.525946 8.291678 8.848133 178.96333 100 b
# f2(q2) 24.407143 24.964991 27.311056 25.514834 27.538643 45.25388 100 c
# f3(q2) 3.951567 4.127482 4.688778 4.261985 4.518463 10.25980 100 a
Another interesting alternative is based on ordering. R (>= 3.3.0) has a grouping function, built off data.table's ordering code, which, along with the ordering, provides some attributes for further manipulation:
Make all elements of equal length and "transpose" (probably the slowest operation in this case, though I'm not sure how else to feed grouping):
n = max(lengths(q))
qq = .mapply(c, lapply(q, "[", seq_len(n)), NULL)
Use ordering to group similar elements mapped to integers:
gr = do.call(grouping, qq)
e = attr(gr, "ends")
i = rep(seq_along(e), c(e[1], diff(e)))[order(gr)]
i
#[1] 1 2 1 3 4 3
then, tabulate as before.
To continue the comparisons:
f4 = function(x)
{
n = max(lengths(x))
x2 = .mapply(c, lapply(x, "[", seq_len(n)), NULL)
gr = do.call(grouping, x2)
e = attr(gr, "ends")
i = rep(seq_along(e), c(e[1], diff(e)))[order(gr)]
cbind(vec = x[!duplicated(i)], n = tabulate(i))
}
all.equal(f3(q2), f4(q2))
#[1] TRUE
microbenchmark::microbenchmark(f1(q2), f2(q2), f3(q2), f4(q2))
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# f1(q2) 7.956377 8.048250 8.792181 8.131771 8.270101 21.944331 100 b
# f2(q2) 24.228966 24.618728 28.043548 25.031807 26.188219 195.456203 100 c
# f3(q2) 3.963746 4.103295 4.801138 4.179508 4.360991 35.105431 100 a
# f4(q2) 2.874151 2.985512 3.219568 3.066248 3.186657 7.763236 100 a
In this comparison, q's elements are of small length to accommodate f3, but both f3 (because of the large exponentiation) and f4 (because of .mapply) will suffer in performance if "list"s with larger elements are used.
One way is to paste each vector, then unlist and table it, i.e.
table(unlist(lapply(q, paste, collapse = ',')))
#1,3,5 2,4 2,5 7
# 2 1 2 1

Compare each row with other rows of matrix

I am looking for an efficient solution for the following problem:
b <- matrix(c(0,0,0,1,1,0), nrow = 2, byrow = T)
weight <- c(1,1)
times <- 5
abc <- do.call(rbind, replicate(times, b, simplify=FALSE))
weight <- rep.int(weight,times)
sum1 <- as.numeric(rep.int(NA,nrow(abc)))
##Rprof()
for(j in 1:nrow(abc)){
a <- abc[j,]
sum1[j] <- sum(weight[rowSums(t(a == t(abc)) + 0) == ncol(abc)])
}
##Rprof(NULL)
##summaryRprof()
Is there a faster way to do this? Rprof shows that rowSums(), t(), == and + are quite slow. If nrow is 20,000, it takes about 21 seconds.
Thanks for helping!
Edit: I have a matrix abc and a vector weight whose length equals nrow(abc). The first value of weight corresponds to the first row of matrix abc, and so on. Now, I would like to determine which rows of matrix abc are equal. Then, I want to remember the positions of those rows in order to sum up the corresponding weights at the same positions, and store the resulting sum for each row.
Here is a way that looks valid and fast:
ff <- function(mat, weights)
{
rs <- apply(mat, 1, paste, collapse = ";")
unlist(lapply(unique(rs),
function(x)
sum(weights[match(rs, x, 0) > 0])))[match(rs, unique(rs))]
}
ff(abc, weight)
# [1] 5 5 5 5 5 5 5 5 5 5
And comparing with your function:
ffOP <- function(mat, weights)
{
sum1 <- as.numeric(rep.int(NA,nrow(mat)))
for(j in 1:nrow(mat)) {
a <- mat[j,]
sum1[j] <- sum(weights[rowSums(t(a == t(mat)) + 0) == ncol(mat)])
}
sum1
}
ffOP(abc, weight)
# [1] 5 5 5 5 5 5 5 5 5 5
library(microbenchmark)
m = do.call(rbind, replicate(1e3, matrix(0:11, 3, 4), simplify = F))
set.seed(101); w = runif(1e3*3)
all.equal(ffOP(m, w), ff(m, w))
#[1] TRUE
microbenchmark(ffOP(m, w), ff(m, w), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# ffOP(m, w) 969.83968 986.47941 996.68563 1015.53552 1051.23847 10
# ff(m, w) 20.42426 20.64002 21.36508 21.97182 22.59127 10
For the record, I also implemented your approach in C; here are the benchmarks:
#> microbenchmark(ffOP(m, w), ff(m, w), ffC(m, w), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# ffOP(m, w) 957.66691 967.09429 991.35232 1000.53070 1016.74100 10
# ff(m, w) 20.60243 20.85578 21.70578 22.13434 23.04924 10
# ffC(m, w) 36.24618 36.40940 37.18927 37.39877 38.83358 10
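A further variation, as a sketch of my own (ffRS is a hypothetical name, not from the answer above): once each row is reduced to a key, the per-group weight totals can also be computed with rowsum and mapped back onto the rows:
ffRS <- function(mat, weights)
{
  key <- apply(mat, 1, paste, collapse = ";")  # one key per row
  grp <- match(key, unique(key))               # integer group id per row
  totals <- rowsum(weights, grp)               # summed weight per group (sorted by group id)
  unname(totals[grp, 1])                       # map group totals back onto the rows
}
ffRS(abc, weight)
# [1] 5 5 5 5 5 5 5 5 5 5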
