Hi, I am new to vectorizing functions in R. I have code similar to the following.
library(truncnorm)
library(microbenchmark)
num_obs=10000
Observation=seq(1,num_obs)
Obs_Type=sample(1:4, num_obs, replace=T)
Upper_bound = runif(num_obs,0,1)
Lower_bound=runif(num_obs,2,4)
mean = runif(num_obs,10,15)
df1= data.frame(Observation,Obs_Type,Upper_bound,Lower_bound,mean)
df1$draw_value = 0
Trial_func=function(df1){
for (i in 1:nrow(df1)){
if (df1[i,"Obs_Type"] ==1){
#If Type == 1; then a=-Inf, b = Upper_Bound
df1[i,"draw_value"] = rtruncnorm(1,a=-Inf,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
} else if (df1[i,"Obs_Type"] ==2){
#If Type == 2; then a=-10, b = Upper_Bound
df1[i,"draw_value"] = rtruncnorm(1,a=-10,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
} else if(df1[i,"Obs_Type"] ==3){
#If Type == 3; then a=Lower_bound, b = Inf
df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=Inf,mean= df1[i,"mean"],sd=1)
} else {
#If Type == 4; then a=Lower_bound, b = 10
df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=10,mean= df1[i,"mean"],sd=1)
}
}
return(df1)
}
#Benchmarking
mbm=microbenchmark(Trial_func(df1=df1),times = 10)
summary(mbm)
#For obtaining the new data
New_data=Trial_func(df1=df1)
In the above I first create a dataframe called df1. I then create a function which takes a dataset (df1). Each observation in the dataset can be one of four types, given by df1$Obs_Type. Based on the Obs_Type, I want to draw values from a truncated normal distribution with given upper and lower bounds.
The rules are:
a) When Obs_Type =1; a=-Inf, b = Upper_bound value of observation i.
b) When Obs_Type =2; a=-10, b = Upper_bound value of observation i.
c) When Obs_Type =3; a=Lower_bound value of observation i, b = Inf.
d) When Obs_Type =4; a=Lower_bound value of observation i, b = 10.
Where a = lower bound, b = upper bound;
Additionally, mean of observation i is given by df1$mean and sd = 1.
I am not familiar with vectorizing and was wondering if someone could help me with this a bit. I tried looking at some other examples on SO (e.g. this) but could not figure out what to do when I have multiple conditions.
My original dataset has about 10 million observations and other additional conditions (e.g. instead of 4 types, my data has 16 types and the means change with each type), but I used a simpler example here.
Please let me know if any part of the question requires any additional clarification.
The case_when function in the dplyr package is handy for vectorizing this type of multiple if/else statement.
Instead of passing individual values to the if statements, one can pass the entire vector, for a very substantial performance improvement.
case_when also improves the readability of the script.
library(dplyr)
Trial_func <- function(df1) {
df1[,"draw_value"] <- case_when(
df1$Obs_Type == 1 ~ rtruncnorm(1,a=-Inf,b=df1[,"Upper_bound"],mean= df1[,"mean"], sd=1),
df1$Obs_Type == 2 ~ rtruncnorm(1,a=-10,b=df1[,"Upper_bound"],mean= df1[,"mean"],sd=1),
df1$Obs_Type == 3 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=Inf,mean= df1[,"mean"],sd=1),
df1$Obs_Type == 4 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=10,mean= df1[,"mean"],sd=1)
)
df1
}
Trial_func(df1)
Here is a vectorized way. It creates logical vectors i1, i2, i3 and i4 corresponding to the 4 conditions. Then it assigns the new values to the positions indexed by them.
Trial_func2 <- function(df1){
i1 <- df1[["Obs_Type"]] == 1
i2 <- df1[["Obs_Type"]] == 2
i3 <- df1[["Obs_Type"]] == 3
i4 <- df1[["Obs_Type"]] == 4
#If Type == 1; then a=-Inf, b = Upper_Bound
df1[i1, "draw_value"] <- rtruncnorm(sum(i1), a =-Inf,
b = df1[i1, "Upper_bound"],
mean = df1[i1, "mean"], sd = 1)
#If Type == 2; then a=-10, b = Upper_Bound
df1[i2, "draw_value"] <- rtruncnorm(sum(i2), a = -10,
b = df1[i2 , "Upper_bound"],
mean = df1[i2, "mean"], sd = 1)
#If Type == 3; then a=Lower_bound, b = Inf
df1[i3,"draw_value"] <- rtruncnorm(sum(i3),
a = df1[i3, "Lower_bound"],
b = Inf, mean = df1[i3, "mean"],
sd = 1)
#If Type == 4; then a=Lower_bound, b = 10
df1[i4, "draw_value"] <- rtruncnorm(sum(i4),
a = df1[i4, "Lower_bound"],
b = 10,
mean = df1[i4,"mean"],
sd = 1)
df1
}
In the speed test below, #Dave2e's answer is named Trial_func3.
mbm <- microbenchmark(
loop = Trial_func(df1 = df1),
vect = Trial_func2(df1 = df1),
cwhen = Trial_func3(df1 = df1),
times = 10)
print(mbm, order = "median")
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# vect 4.349444 4.371169 4.40920 4.401384 4.450024 4.487453 10 a
# cwhen 13.458946 13.484247 14.16045 13.528792 13.787951 19.363104 10 a
# loop 2125.665690 2138.792497 2211.20887 2157.185408 2201.391083 2453.658767 10 b
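As a side note on the 16-type case mentioned in the question: the same logical-index idea can be driven by a per-type lookup table, so no per-type code is needed. Below is a minimal sketch, not taken from either answer; the bounds4 table and its NA convention are made up for illustration, and it relies on rtruncnorm() recycling vector a, b and mean arguments (which the truncnorm package does).
# lookup table: one row per Obs_Type; NA means "use the observation's own bound"
bounds4 <- data.frame(
  Obs_Type = 1:4,
  a_fixed  = c(-Inf, -10, NA, NA),
  b_fixed  = c(NA,   NA, Inf, 10)
)
Trial_func_lookup <- function(df1, bounds) {
  idx <- match(df1$Obs_Type, bounds$Obs_Type)
  # fall back to the observation's own bounds where the table holds NA
  a <- ifelse(is.na(bounds$a_fixed[idx]), df1$Lower_bound, bounds$a_fixed[idx])
  b <- ifelse(is.na(bounds$b_fixed[idx]), df1$Upper_bound, bounds$b_fixed[idx])
  # one vectorized call; rtruncnorm recycles a, b and mean over the rows
  df1$draw_value <- rtruncnorm(nrow(df1), a = a, b = b, mean = df1$mean, sd = 1)
  df1
}
New_data_lookup <- Trial_func_lookup(df1, bounds4)
For 16 types you would only extend the table (and, if the means are type-specific, add a mean column to it and use that in place of df1$mean).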
library(truncnorm)
library(microbenchmark)
num_obs=10000
Observation=seq(1,num_obs)
Obs_Type=sample(1:4, num_obs, replace=T)
Upper_bound = runif(num_obs,0,1)
Lower_bound=runif(num_obs,2,4)
mean = runif(num_obs,10,15)
df1= data.frame(Observation,Obs_Type,Upper_bound,Lower_bound,mean)
df1$draw_value = 0
###########################
# Your example
###########################
Trial_func=function(df1, seed=NULL){
if (!is.null(seed)) set.seed(seed)
for (i in 1:nrow(df1)){
if (df1[i,"Obs_Type"] ==1){
#If Type == 1; then a=-Inf, b = Upper_Bound
df1[i,"draw_value"] = rtruncnorm(1,a=-Inf,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
} else if (df1[i,"Obs_Type"] ==2){
#If Type == 2; then a=-10, b = Upper_Bound
df1[i,"draw_value"] = rtruncnorm(1,a=-10,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
} else if(df1[i,"Obs_Type"] ==3){
#If Type == 3; then a=Lower_bound, b = Inf
df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=Inf,mean= df1[i,"mean"],sd=1)
} else {
#If Type == 4; then a=Lower_bound, b = 10
df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=10,mean= df1[i,"mean"],sd=1)
}
}
return(df1)
}
#############################
# Vectorized version
#############################
# define a function that handles the elements of a single row
truncated_normal <- function(obs_type, lower_bound, upper_bound, mean, sd) {
if (obs_type == 1) {
rtruncnorm(1, a=-Inf, b=upper_bound, mean=mean, sd=sd)
} else if (obs_type == 2){
rtruncnorm(1, a=-10, b=upper_bound, mean=mean, sd=sd)
} else if(obs_type == 3){
rtruncnorm(1, a=lower_bound, b=Inf, mean=mean, sd=sd)
} else {
rtruncnorm(1, a=lower_bound, b=10, mean=mean, sd=sd)
}
}
# vectorize it
truncated_normal <- Vectorize(truncated_normal)
Trial_func_vec <- function(df, res_col="draw_value", seed=NULL) {
if (!is.null(seed)) set.seed(seed)
df[, res_col] <- truncated_normal(
obs_type=df[, "Obs_Type"],
lower_bound=df[, "Lower_bound"],
upper_bound=df[, "Upper_bound"],
mean=df[,"mean"],
sd=1)
df
}
#Benchmarking
set.seed(1)
mbm=microbenchmark(Trial_func(df=df1),times = 10)
summary(mbm)
set.seed(1)
mbm_vec=microbenchmark(Trial_func_vec(df=df1),times = 10)
summary(mbm_vec)
## vectorization roughly 3x faster!
#For obtaining the new data
set.seed(1) # important so that randomization is reproducible
new_data=Trial_func(df=df1)
set.seed(1) # important so that randomization is reproducible
vec_data=Trial_func_vec(df=df1)
# since in both cases the random number generator is invoked
# exactly once per row, in the order of the rows,
# the resulting dfs should be absolutely identical.
all(new_data == vec_data) ## TRUE! They are absolutely identical.
# proving that your code does - in principle - exactly the same
# as my vectorized code
The benchmarking results:
# #Rui Barradas' function
Trial_func2 <- function(df1){
i1 <- df1[["Obs_Type"]] == 1
i2 <- df1[["Obs_Type"]] == 2
i3 <- df1[["Obs_Type"]] == 3
i4 <- df1[["Obs_Type"]] == 4
#If Type == 1; then a=-Inf, b = Upper_Bound
df1[i1, "draw_value"] <- rtruncnorm(sum(i1), a =-Inf,
b = df1[i1, "Upper_bound"],
mean = df1[i1, "mean"], sd = 1)
#If Type == 2; then a=-10, b = Upper_Bound
df1[i2, "draw_value"] <- rtruncnorm(sum(i2), a = -10,
b = df1[i2 , "Upper_bound"],
mean = df1[i2, "mean"], sd = 1)
#If Type == 3; then a=Lower_bound, b = Inf
df1[i3,"draw_value"] <- rtruncnorm(sum(i3),
a = df1[i3, "Lower_bound"],
b = Inf, mean = df1[i3, "mean"],
sd = 1)
#If Type == 4; then a=Lower_bound, b = 10
df1[i4, "draw_value"] <- rtruncnorm(sum(i4),
a = df1[i4, "Lower_bound"],
b = 10,
mean = df1[i4,"mean"],
sd = 1)
df1
}
# #Dave2e's function
library(dplyr)
Trial_func_dplyr <- function(df1) {
df1[,"draw_value"] <- case_when(
df1$Obs_Type == 1 ~ rtruncnorm(1,a=-Inf,b=df1[,"Upper_bound"],mean= df1[,"mean"], sd=1),
df1$Obs_Type == 2 ~ rtruncnorm(1,a=-10,b=df1[,"Upper_bound"],mean= df1[,"mean"],sd=1),
df1$Obs_Type == 3 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=Inf,mean= df1[,"mean"],sd=1),
df1$Obs_Type == 4 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=10,mean= df1[,"mean"],sd=1)
)
df1
}
#Benchmarking
set.seed(1)
mbm <- microbenchmark(
loop = Trial_func(df1=df1),
ruy_vect = Trial_func2(df1=df1),
my_vect = Trial_func_vec(df=df1),
cwhen = Trial_func_dplyr(df1=df1),
times=10)
print(mbm, order = "median")
# > print(mbm, order = "median")
# Unit: milliseconds
#      expr         min          lq       mean      median         uq        max neval cld
#  ruy_vect    7.583821    7.879766   11.59954    8.815835   10.33289   36.60468    10  a
#     cwhen   22.563190   23.103670   25.13804   23.965722   26.49628   30.63777    10  a
#   my_vect 1326.771297 1373.415302 1413.75328 1410.995177 1484.28449 1506.11063    10  b
#      loop 4149.424632 4269.475169 4486.41376 4423.527566 4742.96651 4911.31992    10  c
# #Rui's vectorized version wins by orders of magnitude!!
I am trying to extract random samples from 2 columns of my database (hours of work and the respective number of patients visited), and then I would like to calculate the mean progressively. By that I mean the mean of the first 2 samples, then the mean of that result and the third sample... and so on.
Is it possible? Is there a function for that?
Thank you all for the help.
L.
This is how I am extracting the samples.
library(dplyr)
set.seed(2020)
obs <- rnorm(10, mean = 0, sd = 1)
time <- rnorm(10, mean = 0.5, sd = 1)
rdf <- data.frame(obs, time)
sample_n(rdf, 1)
p <- replicate(100, expr = (sample_n(rdf, 1) + sample_n(rdf, 1))/2)
One option is to use a for loop and decide on the number of samples you would like. For example, if we want to take 5 samples and calculate the means progressively, we can write a loop which starts with the first sample and iteratively selects the next sample, then calculates the mean between the previous mean and the next sample:
set.seed(2020)
obs <- rnorm(10, mean = 0, sd = 1)
time <- rnorm(10, mean = 0.5, sd = 1)
rdf <- data.frame(obs, time)
nsamp <- 5 # number of samples
mean_vect <- numeric(nsamp) # create a vector to store the means
mean_vect[1] <- mean(sample_n(rdf, 1)$obs) # mean of first sample as starting point
# start calculations to fifth sample iteratively
for (i in 2:nsamp) {
# select the next sample
next_samp <- sample_n(rdf, 1)
# calculate the mean between the previous mean and the next sample
mean_vect[i] <- mean(c(mean_vect[i-1], next_samp$obs))
}
# print the means
print(mean_vect)
[1] -1.13040590 -0.20491620 0.04831609 0.08284144 0.40170747
You could define a recursive function (a function that calls itself).
f <- function(S, R, i=1, cm=NULL, res=NULL, ...) {
S <- rbind(cm, rdf[sample.int(nrow(rdf), 1), ])
cm <- colMeans(S)
res <- rbind(res, cm)
return(if (i < R) {
f(S, R=R, i=i + 1, cm=cm, res=res)
} else {
`rownames<-`(as.data.frame(res), NULL)
})
}
set.seed(42)
f(rdf[sample.int(nrow(rdf), 1), ], R=10)
# obs time
# 1 0.376972125 -0.35312282
# 2 -1.209781097 0.01180847
# 3 -0.416404486 -0.17065718
# 4 0.671363430 -0.97981606
# 5 0.394365109 -0.21075628
# 6 -0.368020398 -0.04117009
# 7 -0.033236012 0.68404454
# 8 0.042065388 0.62117402
# 9 0.209518756 0.13402560
# 10 -0.009929495 -1.20236950
You probably have to increase your C stack size.
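For reference, a minimal way to inspect the current limit from base R is shown below (a sketch; raising the limit is platform-dependent and is usually done from the shell before starting R, e.g. with ulimit -s on Linux/macOS).
Cstack_info()  # reports the C stack size and current usage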
But you could also use a for loop.
R <- 10
res1 <- matrix(nrow=0, ncol=2)
set.seed(42)
for (i in seq_len(R - 1)) {
if (nrow(res1) == 0) {
res1 <- rdf[sample.int(nrow(rdf), 1), ]
}
S <- rdf[sample.int(nrow(rdf), 1), ]
res1 <- rbind(res1, colMeans(rbind(res1[nrow(res1), ], S)))
}
res1
# obs time
# 1 0.376972125 -0.35312282
# 2 -1.209781097 0.01180847
# 3 -0.416404486 -0.17065718
# 4 0.671363430 -0.97981606
# 5 0.394365109 -0.21075628
# 6 -0.368020398 -0.04117009
# 7 -0.033236012 0.68404454
# 8 0.042065388 0.62117402
# 9 0.209518756 0.13402560
# 10 -0.009929495 -1.20236950
Here is a quick benchmark of both versions (R = 2000); recursion appears to be almost twice as fast.
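The exact benchmark call is not shown; here is a sketch of how it could be reproduced, wrapping both versions above in functions, with R = 2000 draws and times = 3 to match the neval column below.
recursive_version <- function(R) f(rdf[sample.int(nrow(rdf), 1), ], R = R)
loop_version <- function(R) {
  res1 <- matrix(nrow = 0, ncol = 2)
  for (i in seq_len(R - 1)) {
    if (nrow(res1) == 0) res1 <- rdf[sample.int(nrow(rdf), 1), ]
    S <- rdf[sample.int(nrow(rdf), 1), ]
    res1 <- rbind(res1, colMeans(rbind(res1[nrow(res1), ], S)))
  }
  res1
}
microbenchmark::microbenchmark(
  "recursive" = recursive_version(2000),
  "for-loop"  = loop_version(2000),
  times = 3L
)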
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# recursive 577.0595 582.0189 587.3052 586.9783 592.4281 597.8778 3 a
# for-loop 991.4360 993.7170 997.2436 995.9980 1000.1473 1004.2966 3 b
Data:
rdf <- structure(list(obs = c(0.376972124936433, 0.301548373935665,
-1.0980231706536, -1.13040590360378, -2.79653431987176, 0.720573498411587,
0.93912102300901, -0.229377746707471, 1.75913134696347, 0.117366786802848
), time = c(-0.353122822287008, 1.40925918161821, 1.69637295955276,
0.128416096258652, 0.376739766712564, 2.30004311672545, 2.20399587729432,
-2.53876460529759, -1.78897494991878, 0.558303494992923)), class = "data.frame", row.names = c(NA,
-10L))
another approach (with your example data rdf):
create a function mean_of_random_pair(xs) which draws two random items from a set xs and calculates their mean:
mean_of_random_pair <- function(xs){
xs |> sample(size = 2) |> mean(na.rm = TRUE)
}
create a function cumulative_mean which calculates the grand mean X as the mean of the existing X and a new item x:
cumulative_mean <- function(xs){
xs |> Reduce(f = \(X, x) mean(c(X, x)),
accumulate = TRUE
)
}
link the above functions into a pipeline and run it runs times on the set rdf$obs:
runs = 100
1:runs |>
Map(f = \(i) mean_of_random_pair(rdf$obs)) |>
cumulative_mean()
output (the sequence of iterative averaging):
[1] 1.1000858 0.8557774 0.3041130 0.4262881 -0.4658256
# ...
inspect output (for n = 5000 simulation runs); note that map() below comes from the purrr package:
library(purrr)
runs = 5e3
set.seed(4711)
densities <-
list(obs = 'obs', time = 'time') |>
map(\(var){
1:runs |>
Map(f = \(i) mean_of_random_pair(rdf[[var]])) |>
cumulative_mean() |>
density()
})
densities$time |> plot(col = 'blue', ylim = c(0, 1), xlim = c(-3, 3), main = 'foo')
densities$obs |> lines(col = 'red')
I have a matrix X:
one two three four
[1,] 1 3 2 4
[2,] 2 0 1 5
[3,] 3 2 1 4
[4,] 4 9 11 19
[5,] 4 3 2 1
I want to get a new matrix Y which only contains rows that are permutations of "1", "2", "3", "4". That is:
one two three four
[1,] 1 3 2 4
[3,] 3 2 1 4
[5,] 4 3 2 1
What function or command should I use?
mat <- rbind(
c(1, 3, 2, 4),
c(2, 0, 1, 5),
c(3, 2, 1, 4)
)
ok <- apply(mat, 1L, function(x) setequal(x, c(1, 2, 3, 4)))
mat[ok, ]
Your example matrix and target vector:
X <- structure(c(1, 2, 3, 4, 4, 3, 0, 2, 9, 3, 2, 1, 1, 11, 2, 4, 5, 4, 19, 1),
dim = 5:4)
v <- 1:4
But let's construct a more challenging one (thanks to user harre):
X <- rbind(X, 1, c(1, 2, 1, 2))
A fully vectorized approach (using package matrixStats)
rk <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
ct <- matrixStats::rowTabulates(rk, values = 1:length(v))
zo <- matrixStats::rowCounts(ct, value = 0L)
## all rows that are permutations of 'v'
X[zo == 0L, ]
# [,1] [,2] [,3] [,4]
#[1,] 1 3 2 4
#[2,] 3 2 1 4
#[3,] 4 3 2 1
## remove rows that are permutations of 'v'
X[zo > 0L, ]
Another fully vectorized method (base R)
This is a mathematical solution. For a nonlinear and asymmetric weight function w(x), the following weighted sum:
1 x w(1) + 2 x w(2) + 3 x w(3) + 4 x w(4)
is a unique score or identifier and is invariant to permutations. So for example, the following gives the same value:
2 x w(2) + 1 x w(1) + 3 x w(3) + 4 x w(4)
But anything else will give different values, like:
1 x w(1) + 3 x w(3) + 3 x w(3) + 4 x w(4)
0 x w(0) + 1 x w(1) + 0 x w(0) + 4 x w(4)
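To make this concrete, here is a tiny numeric illustration (not part of the answer's code), using the cosine weight w(x) = cos(pi * x / 5) that the implementation below uses for v = 1:4.
w <- function(x) cos(pi * x / 5)
sum(c(1, 3, 2, 4) * w(c(1, 3, 2, 4)))  # a permutation of 1:4
sum(c(4, 3, 2, 1) * w(c(4, 3, 2, 1)))  # another permutation: same score
sum(c(1, 1, 3, 4) * w(c(1, 1, 3, 4)))  # not a permutation: different score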
Here is an implementation using cosine weights. It works even if X and v are floating point numbers or characters.
## method = "tab" for tabulation method
## method = "cos" for cosine weights method
FindPerm <- function (X, v, method) {
## number of elements
n <- length(v)
if (ncol(X) != n) stop("inconformable 'X' and 'v'!")
if (anyDuplicated(v)) stop("'v' should not contain duplicated values!")
## convert everything to integers 0, 1, 2, ..., n
Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
vi <- 1:n
## branches
if (method == "tab") {
## row-wise tabulating
rtab <- matrixStats::rowTabulates(Xi, values = vi)
## the i-th value is TRUE if X[i, ] is a permutation of v
matrixStats::rowCounts(rtab, value = 0L) == 0L
} else if (method == "cos") {
## evaluate cosine weights for Xi and vi
w <- pi / (n + 1)
cos.Xi <- cos(w * Xi)
cos.vi <- cos(w * vi)
## weighted sum for Xi
wtsum.Xi <- rowSums(Xi * cos.Xi)
## weighted sum for vi
wtsum.vi <- sum(vi * cos.vi)
## the i-th value is TRUE if X[i, ] is a permutation of v
wtsum.Xi == wtsum.vi
} else {
stop("unknown method!")
}
}
X[FindPerm(X, v, "cos"), ]
# [,1] [,2] [,3] [,4]
#[1,] 1 3 2 4
#[2,] 3 2 1 4
#[3,] 4 3 2 1
Benchmark
Performance depends on the number of values in v. The tabulation method will slow down as v becomes long.
## a benchmark function, relying on package "microbenchmark"
## nr: number of matrix rows
## nc: number of elements in 'v'
bm <- function (nr, nc) {
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark("tab" = FindPerm(X, v, "tab"),
"cos" = FindPerm(X, v, "cos"),
check = "identical")
}
bm(2e+4, 4)
#Unit: milliseconds
# expr min lq mean median uq max
# tab 4.302674 4.324236 4.536260 4.336955 4.359814 7.039699
# cos 4.846893 4.872361 5.163209 4.882942 4.901288 7.837580
bm(2e+4, 20)
#Unit: milliseconds
# expr min lq mean median uq max
# tab 30.63438 30.70217 32.73508 30.77588 33.08046 135.64322
# cos 21.16669 21.26161 22.28298 21.37563 23.60574 26.31775
Update: since there's so much interest in this question, here's a method using indexing to give a speed boost on Zheyuan Li's excellent generalization of my original answer.
The idea is to index on a length(v)-dimensional array for small v, or to index on v*sin(w*v) using the results of match instead of calculating X*sin(w*X) when v is large:
library(RcppAlgos)
# simplified version of Zheyuan Li's function
f1 <- function(X, v) {
n <- length(v)
Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
vi <- 1:n
w <- pi/(n + 1)
rowSums(Xi*sin(Xi*w)) == sum(vi*sin(vi*w))
}
f2 <- function(X, v) {
n <- length(v)
if (n < 6) {
# index an n-dimensional array
m <- array(FALSE, rep(n + 1L, n))
m[permuteGeneral(n)] <- TRUE
X[] <- match(X, v, nomatch = length(v) + 1L)
m[X]
} else {
nn <- 1:n
u <- c(nn*sin(pi*nn/(n + 1L)), 0)
X[] <- u[match(X, v, nomatch = n + 1L)]
rowSums(X) == sum(u)
}
}
set.seed(123)
# using Zheyuan Li's test dataset
nr <- 2000; nc <- 4
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark(f1 = f1(X, v),
f2 = f2(X, v),
check = "identical")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 344.4 367.25 438.932 374.05 386.75 5960.6 100
#> f2 81.9 85.00 163.332 88.90 98.50 6924.4 100
# Zheyuan Li's larger test dataset
set.seed(123)
nr <- 2000; nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark(f1 = f1(X, v),
f2 = f2(X, v),
check = "identical")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 1569.2 1575.45 1653.510 1601.30 1683.6 3972.6 100
#> f2 355.2 359.90 431.705 366.85 408.6 2253.8 100
Original answer edited to use X + exp(1/X) (see comments).
This should work with positive integers:
Y <- X[rowSums(X + exp(1/X)) == sum(1:4 + exp(1/(1:4))),]
Benchmarking against the apply solution:
f1 <- function(x) x[apply(x, 1L, function(x) setequal(x, 1:4)),]
f2 <- function(x) x[rowSums(x + exp(1/x)) == sum(1:4 + exp(1/(1:4))),]
X <- matrix(sample(10, 4e5, TRUE), 1e5)
microbenchmark::microbenchmark(f1 = f1(X),
f2 = f2(X),
times = 10,
check = "equal")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 448.2680 450.8778 468.55179 461.62620 472.0022 542.0455 10
#> f2 28.5362 28.6889 31.50941 29.44845 30.2693 50.4402 10
This question is HOT. I am still learning, so I take this as a good opportunity. It is really difficult for me to come up with new solutions, but I found two things missing here:
there is no serious validation of these answers;
there is no benchmark for all of them.
I would like to convert each answer to a function that returns a TRUE/FALSE vector for flagging rows. I also want this function to work with any matrix and any vector.
Stéphane Laurent's answer, Zheyuan Li's answer and ThomasIsCoding's answer need minimal adaptation.
Mohamed Desouky's answer is also easy to adapt, by taking the function applied inside Filter() and apply()-ing it over matrix rows.
jblood94's answer is challenging. It was commented that for other matrices and vectors, a conversion is needed using match. I don't know what the appropriate way is, but I saw match in Zheyuan Li's answer, so I borrowed that part.
TarJae's answer is awful (sorry; don't take this as an insult). None of the variants seems to work. I don't see any comparison between matrix rows and vectors in the base R solution. For the other tidyverse code, I don't know what df_matrix is. I have asked TarJae to please revise the answer.
harre's answer uses the tidyverse and does not return TRUE/FALSE, so I have to exclude it from the benchmark (sorry).
Here are the functions for the benchmark.
S.Laurent <- function (X, v) apply(X, 1L, function(x) setequal(x, v))
Z.Li <- function (X, v, method) {
## number of elements
n <- length(v)
if (ncol(X) != n) stop("inconformable 'X' and 'v'!")
if (anyDuplicated(v)) stop("'v' should not contain duplicated values!")
## convert everything to integers 0, 1, 2, ..., n
Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
vi <- 1:n
## branches
if (method == "tab") {
## row-wise tabulating
rtab <- matrixStats::rowTabulates(Xi, values = vi)
## the i-th value is TRUE if X[i, ] is a permutation of v
matrixStats::rowCounts(rtab, value = 0L) == 0L
} else if (method == "cos") {
## evaluate cosine weights for Xi and vi
w <- pi / (n + 1)
cos.Xi <- cos(w * Xi)
cos.vi <- cos(w * vi)
## weighted sum for Xi
wtsum.Xi <- rowSums(Xi * cos.Xi)
## weighted sum for vi
wtsum.vi <- sum(vi * cos.vi)
## the i-th value is TRUE if X[i, ] is a permutation of v
wtsum.Xi == wtsum.vi
} else {
stop("unknown method!")
}
}
Thomas <- function (X, v) colSums(mapply(`%in%`, list(v), asplit(X, 1))) == ncol(X)
M.Desouky <- function (X, v) apply(X, 1, function (x) all((x %in% v) & length(unique(x)) == length(v)))
jblood94 <- function (X, v) {
Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
vi <- 1:length(v)
rowSums(Xi + exp(1/Xi)) == sum(vi + exp(1/vi))
}
For the benchmark, I followed the setup in Zheyuan Li's answer.
library(matrixStats)
library(microbenchmark); library(ggplot2)
nr <- 2000; nc <- 4
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("S.Laurent" = S.Laurent(X, v),
"Z.Li.tab" = Z.Li(X, v, "tab"),
"Z.Li.cos" = Z.Li(X, v, "cos"),
"Thomas" = Thomas(X, v),
"M.Desouky" = M.Desouky(X, v),
"jblood94" = jblood94(X, v),
check = "identical") -> bm1
autoplot(bm1)
nr <- 2000; nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("S.Laurent" = S.Laurent(X, v),
"Z.Li.tab" = Z.Li(X, v, "tab"),
"Z.Li.cos" = Z.Li(X, v, "cos"),
"Thomas" = Thomas(X, v),
"M.Desouky" = M.Desouky(X, v),
"jblood94" = jblood94(X, v),
check = "identical") -> bm2
autoplot(bm2)
I don't know how time is transformed for plotting, but the times are clearly not on the usual scale. The methods to the left are far faster than the plot suggests!
Conclusion: Zheyuan Li's "cos" method is the winner.
We can try this
> mat[colSums(mapply(`%in%`, list(1:4), asplit(mat, 1))) == ncol(mat), ]
[,1] [,2] [,3] [,4]
[1,] 1 3 2 4
[2,] 3 2 1 4
[3,] 4 3 2 1
Another option is using the Filter function:
t(Filter(\(x) all((x %in% 1:4) & length(unique(x)) == 4) ,
data.frame(t(X))))
Just for fun
Who can give me 4 distinct digits whose sum equals 6?
There is just {0,1,2,3},
so we can use the remainders modulo 4 via %%:
X[apply(X , 1 , \(x) sum(unique(x %% 4)) == 6 & length(unique(x)) == 4) , ]
Or, using a pure for loop:
ans <- data.frame(matrix(NA , ncol = ncol(X)))
r <- 1
for(i in 1:nrow(X)){
if(all((X[i,] %in% 1:4) & length(unique(X[i,])) == 4)){
ans[r,] <- X[i,]
r <- r + 1
}
}
ans <- as.matrix(ans)
output
X1 X2 X3 X4
1 1 3 2 4
2 3 2 1 4
3 4 3 2 1
data
X <- matrix(c(1, 2, 3, 4, 4, 3, 0, 2, 9, 3, 2, 1, 1, 11, 2, 4,
5, 4, 19, 1) , ncol = 4)
For the fun of tidyverse solutions, even though I think we'd rather work on the matrices directly, we could use rowwise() and c_across():
With set-operations (inspired by #Stéphane Laurent):
library(dplyr)
mat %>%
as_tibble() |>
rowwise() |>
filter(setequal(c_across(), c(1, 2, 3, 4))) |>
ungroup() |>
as.matrix()
Or without set-operations:
library(dplyr)
mat %>%
as_tibble() |>
rowwise() |>
filter(1 %in% c_across(everything()) &
2 %in% c_across(everything()) &
3 %in% c_across(everything()) &
4 %in% c_across(everything())
) |>
ungroup() |>
as.matrix()
Or inspired by #Mohamed Desouky:
mat %>%
as_tibble() |>
rowwise() |>
filter(all(c_across() %in% 1:4) & n_distinct(c_across()) == 4) |>
ungroup() |>
as.matrix()
And so on..
The algorithm library in C++ offers a function called std::is_permutation that does just the trick.
The workhorse function below uses Rcpp and is fairly straightforward.
#include <Rcpp.h>
// [[Rcpp::export]]
SEXP perm_idx_cpp(Rcpp::IntegerMatrix mat, const std::vector<int> &v) {
const int nRows = mat.nrow();
const int nCols = mat.ncol();
std::vector<int> test(nCols);
Rcpp::LogicalVector res(nRows);
for (int i = 0; i < nRows; ++i) {
for (int j = 0; j < nCols; ++j) {
test[j] = mat(i, j);
}
res[i] = std::is_permutation(
test.begin(), test.end(), v.begin()
);
}
return res;
}
And calling it in R we have (N.B. We use match to get integer indices as in #Zheyuan Li's answer, which is absolutely brilliant btw!):
get_perm_idx <- function(X, v) {
Xi <- matrix(match(X, v, nomatch = 0L), ncol = ncol(X))
perm_idx_cpp(Xi, seq_along(v))
}
It is very efficient as well. Here is a simple benchmark:
nr <- 2e4
nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("tab" = FindPerm(X, v, "tab"),
"cos" = FindPerm(X, v, "cos"),
"is_perm_cpp" = get_perm_idx(X, v),
check = "identical")
Unit: milliseconds
expr min lq mean median uq max neval
tab 33.641345 36.479660 39.00994 37.402306 39.560015 54.88057 100
cos 9.496309 12.887493 15.30122 13.306302 14.053643 132.24079 100
is_perm_cpp 3.232093 4.819553 6.08687 4.993367 5.248818 19.56919 100
You could probably squeeze out some extra efficiency, but it is tough to beat the simplicity here.
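One possible tweak along those lines (a hypothetical sketch, not from the answer): rows in which match() produced a 0 can never be permutations of v, so they could be filtered out in R before the C++ call. Whether this pays off depends on how common such rows are.
get_perm_idx2 <- function(X, v) {
  Xi <- matrix(match(X, v, nomatch = 0L), ncol = ncol(X))
  keep <- rowSums(Xi == 0L) == 0L        # candidate rows only
  out <- logical(nrow(Xi))               # non-candidates stay FALSE
  out[keep] <- perm_idx_cpp(Xi[keep, , drop = FALSE], seq_along(v))
  out
}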
Update2
The second set.seed(i) should be replaced with set.seed(i+1), or any other seed that starts a new random series. If not, s3 <- sum(data$gene == 0 & data$cancer == 1) will always be 0, since any number smaller than 0.08 is also smaller than 0.39.
I didn't correct my original question code because this matter is not related to the post's core question.
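A quick illustration of the point (a sketch, not from the original post): when both vectors come from the same seed they are identical, so a position with random2 below 0.08 always also has random1 below 0.39, and gene == 0 & cancer == 1 can never occur.
set.seed(1); r1 <- runif(500, min = 0, max = 1)
set.seed(1); r2 <- runif(500, min = 0, max = 1)  # identical to r1
identical(r1, r2)            # TRUE
sum(r1 >= 0.39 & r2 < 0.08)  # 0, which is why s3 is always 0 here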
Update
set.seed(i) is added twice since there are two random number generations, i.e., random1 and random2. However, the results still change between operations, which is strange.
Background:
The code below is about odds ratios, but the focus here is not statistics. Instead, I find that the results change (!) between operations which are supposedly the same (I suppose they are in fact not, but I cannot figure out why).
Code:
gene <- vector(length = 500, mode = "integer")
cancer <- vector(length = 500, mode = "integer")
data <- data.frame(gene, cancer)
odd_withMutate <- vector(length = 20, mode = "numeric")
odd_noMutate <- vector(length = 20, mode = "numeric")
result <- data.frame(odd_withMutate, odd_noMutate)
for (i in 1:20) {
# set.seed(12)
# set.seed(16)
set.seed(i)
random1 <- runif(500, min = 0, max = 1)
# set.seed(12)
# set.seed(16)
set.seed(i) # add this instruction
random2 <- runif(500, min = 0, max = 1)
for (j in 1:500) {
if (random1[j] < 0.39){
data[j,1] <- 1
}
if (random2[j] < 0.08){
data[j,2] <- 1
}
}
s1 <- sum(data$gene == 1 & data$cancer == 1) # has the mutated gene & has cancer
s2 <- sum(data$gene == 1 & data$cancer == 0)
s3 <- sum(data$gene == 0 & data$cancer == 1)
s4 <- sum(data$gene == 0 & data$cancer == 0)
result[i,]$odd_withMutate <- s1/s2
result[i,]$odd_noMutate <- s3/s4
}
Different operations:
Operation #1:
If I run the code above, the 12th row of odd_noMutate in result will be 0 and the 16th will be NaN. To see what happened, I used set.seed(12) or set.seed(16) to check (Operations #2 & #3). But the 0 and NaN disappeared! That is, in Operation #2, 0.1638418 0 is not 1.5075377 0. In Operation #3, 0.2830189 0 is not 2.4013605 NaN.
Operation #2:
The changed part of the code is:
set.seed(12) #odd_noMutate = 0
# set.seed(16) #odd_noMutate = NaN
# set.seed(i)
random1 <- runif(500, min = 0, max = 1)
set.seed(12)
# set.seed(16)
# set.seed(i) # add this instruction
random2 <- runif(500, min = 0, max = 1)
Operation #3:
# set.seed(12) #odd_noMutate = 0
set.seed(16) #odd_noMutate = NaN
# set.seed(i)
random1 <- runif(500, min = 0, max = 1)
# set.seed(12)
set.seed(16)
# set.seed(i) # add this instruction
Operation #4:
I find that even changing the range of i in my code makes the results totally different (shouldn't they be a subset of the original results?). This is Operation #4. Specifically, 0.3092105 0 is not 1.5075377 0; 0.7562724 0 is not 2.4013605 NaN.
for (i in 10:20) {
# set.seed(12) #odd_noMutate = 0
# set.seed(16) #odd_noMutate = NaN
set.seed(i)
random1 <- runif(500, min = 0, max = 1)
# set.seed(12)
# set.seed(16)
set.seed(i) # add this instruction
random2 <- runif(500, min = 0, max = 1)
The results among these operations are shown below:
The problem is that some previous values in data remain and are reused. Your problem may be solved by remaking data in every iteration of the for loop over i (putting data <- data.frame(gene, cancer) into the loop).
gene <- vector(length = 500, mode = "integer")
cancer <- vector(length = 500, mode = "integer")
# data <- data.frame(gene, cancer)
odd_withMutate <- vector(length = 20, mode = "numeric")
odd_noMutate <- vector(length = 20, mode = "numeric")
result <- data.frame(odd_withMutate, odd_noMutate)
for (i in 1:20) {
data <- data.frame(gene, cancer) # remaking data every time
# set.seed(12)
# set.seed(16)
set.seed(i)
random1 <- runif(500, min = 0, max = 1)
# set.seed(12)
# set.seed(16)
set.seed(i) # add this instruction
random2 <- runif(500, min = 0, max = 1)
for (j in 1:500) {
if (random1[j] < 0.39){
data[j,1] <- 1
}
if (random2[j] < 0.08){
data[j,2] <- 1
}
}
s1 <- sum(data$gene == 1 & data$cancer == 1) # has the mutated gene & has cancer
s2 <- sum(data$gene == 1 & data$cancer == 0)
s3 <- sum(data$gene == 0 & data$cancer == 1)
s4 <- sum(data$gene == 0 & data$cancer == 0)
result[i,]$odd_withMutate <- s1/s2
result[i,]$odd_noMutate <- s3/s4
}
[ADDITION]
A for loop doesn't have its own environment, unlike a function.
So assignments inside a for loop directly affect global-environment objects such as your data.
You partly overwrote data in the global environment via the if statement, and those values were carried over into the next loop iteration.
Here is my simple example:
data <- data.frame(gene = vector(length = 5, mode = "integer"))
keep_of_process <- list()
for(i in 1:2) {
set.seed(i)
random_val <- runif(5, 0, 1)
for(j in 1:5) {
if(random_val[j] < 0.39) {
data[j, 1] <- 1
}
keep_of_process[[i]] <- data.frame(random = random_val,
gene = data$gene)
}
}
do.call("cbind", keep_of_process) # just to merge process to show
# left is i = 1 and right is i = 2
random gene random gene
1 0.2655087 1 0.1848823 1
2 0.3721239 1 0.7023740 1
3 0.5728534 0 0.5733263 0
4 0.9082078 0 0.1680519 1
5 0.2016819 1 0.9438393 1
Please see row 2. In i = 2, random is 0.7023740 but gene is 1 (the previous result is retained).
So to do what you want (as I understand it), you need to remake data (my answer) or completely overwrite data with the if statement, such as:
if(random_val[j] < 0.39) {
data[j, 1] <- 1
} else {
data[j, 1] <- 0
}
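As an aside (a hedged sketch, not part of the answer above), the inner j loop can also be replaced by a whole-column vectorized assignment; every element is then overwritten on each iteration of i, which avoids the stale-value problem entirely and is faster.
# inside the loop over i, instead of looping over j:
data$gene   <- as.integer(random1 < 0.39)
data$cancer <- as.integer(random2 < 0.08)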
I would like to perform a two-sample t-test in R using separate groupings. The t-test must be "unbiased", meaning that for all transactions in the outer group (group 2 below), the t-test must be run for each inner group (group 1 below) as "inner group A" vs. "inner group not A". The for loop code shown below is probably clearer than a verbal explanation...
My current code is below. Does anyone know a faster/better way to do this? Open to using any package, but currently using data.table.
For context, I have ~1 million rows of transaction data. Group 1 indicates a person (if there are multiple rows they have multiple transactions) and contains ~30k unique values. Group 2 indicates a zip code and contains ~500 unique values
Thanks!
library(data.table)
# fake data
grp1 <- c('A','A','A','B','B','C','C','D','D','D','D','E','E','E','F','F')
grp2 <- c(1,1,1,1,1,1,1, 2,2,2,2,2,2,2, 2,2)
vals <- c(10,20,30, 40,15, 25,60, 70,100,200,300, 400,1000,2000, 3000,5000)
DT <- data.table(grp1 = grp1, grp2 = grp2, vals = vals)
# "two sample t.test" --------------------------------------------------
# non vectorized, in-place
# runtime is ~50 mins for real data
for (z in DT[,unique(grp2)]){
for (c in DT[grp2 == z, unique(grp1)]) {
res = t.test(
DT[grp2 == z & grp1 == c, vals],
DT[grp2 == z & grp1 != c, vals],
alternative = 'greater'
)
DT[grp2 == z & grp1 == c, pval := res$p.value]
DT[grp2 == z & grp1 == c, tstat := res$statistic]
}
}
# vectorized, creates new summarized data.table
# runtime is 1-2 mins on real data
vec <- DT[,{
grp2_vector = vals
.SD[,.(tstat = t.test(vals, setdiff(grp2_vector, vals), alternative = 'g')$statistic,
pval = t.test(vals, setdiff(grp2_vector, vals), alternative = 'g')$p.value), by=grp1]
} , by=grp2]
stats::t.test is general-purpose and does a number of checks. You can just calculate what you need, i.e. the t-statistic and p-value, and also make use of data.table's optimized group-by to calculate the length, mean and variance. Here is a possible approach:
#combinations of grp1 and grp2 and those not in grp1 for each grp2
comb <- unique(DT[, .(grp1, grp2)])[,
rbindlist(lapply(1:.N, function(n) .(g1=rep(grp1[n], .N-1L), notIn=grp1[-n]))),
.(g2=grp2)]
#this is optimized, switch on verbose to see the output
X <- DT[, .(nx=.N, mx=mean(vals), vx=var(vals)), .(grp1, grp2)] #, verbose=TRUE]
#calculate length, mean, var for values not in grp1
Y <- DT[comb, on=.(grp2=g2, grp1=notIn), allow.cartesian=TRUE][,
.(ny=.N, my=mean(vals), vy=var(vals)), by=.(grp1=g1, grp2=grp2)]
#calculate outputs based on stats:::t.test.default
ans <- X[Y, on=.(grp1, grp2)][, c("tstat", "pval") := {
stderrx <- sqrt(vx/nx)
stderry <- sqrt(vy/ny)
stderr <- sqrt(stderrx^2 + stderry^2)
df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 1))
tstat <- (mx - my)/stderr
.(tstat, pt(tstat, df, lower.tail = FALSE))
}, by=1:Y[,.N]]
output:
grp1 grp2 nx mx vx ny my vy tstat pval
1: C 1 2 42.500 612.50 5 23.0000 145.0000 1.06500150 0.22800432
2: B 1 2 27.500 312.50 5 29.0000 355.0000 -0.09950372 0.53511601
3: A 1 3 20.000 100.00 4 35.0000 383.3333 -1.31982404 0.87570431
4: F 2 2 4000.000 2000000.00 7 581.4286 489747.6190 3.30491342 0.08072148
5: E 2 3 1133.333 653333.33 6 1445.0000 4323350.0000 -0.32174451 0.62141500
6: D 2 4 167.500 10891.67 5 2280.0000 3292000.0000 -2.59809850 0.97016160
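As a quick sanity check (a sketch, not part of the original answer), one group's row can be compared against stats::t.test on the fake data; the values should match the first row of the output above up to rounding.
# reference result for grp1 == "C" within grp2 == 1
res <- t.test(DT[grp2 == 1 & grp1 == "C", vals],
              DT[grp2 == 1 & grp1 != "C", vals],
              alternative = "greater")
res$statistic  # ~1.065, matches tstat
res$p.value    # ~0.228, matches pval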
timing code:
library(data.table) #data.table_1.12.4
set.seed(0L)
np <- 4.2e5
nzc <- 4.2e3
DT <- data.table(grp1=rep(1:np, each=5), grp2=rep(1:nzc, each=np/nzc*5),
vals=abs(rnorm(np*5, 5000, 2000)), key=c("grp1", "grp2"))
mtd0 <- function() {
DT[, {
grp2_vector <- vals
.SD[,{
tres <- t.test(vals, setdiff(grp2_vector, vals), alternative = 'g')
.(tstat=tres$statistic, pval=tres$p.value)
}, by=grp1]
} , by=grp2]
}
mtd1 <- function() {
comb <- unique(DT[, .(grp1, grp2)])[,
rbindlist(lapply(1:.N, function(n) .(g1=rep(grp1[n], .N-1L), notIn=grp1[-n]))),
.(g2=grp2)]
X <- DT[, .(nx=.N, mx=mean(vals), vx=var(vals)), .(grp1, grp2)] #, verbose=TRUE]
Y <- DT[comb, on=.(grp2=g2, grp1=notIn), allow.cartesian=TRUE][,
.(ny=.N, my=mean(vals), vy=var(vals)), by=.(grp1=g1, grp2=grp2)]
ans <- X[Y, on=.(grp1, grp2)][, c("tstat", "pval") := {
stderrx <- sqrt(vx/nx)
stderry <- sqrt(vy/ny)
stderr <- sqrt(stderrx^2 + stderry^2)
df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 1))
tstat <- (mx - my)/stderr
.(tstat, pt(tstat, df, lower.tail = FALSE))
}, by=1:Y[,.N]]
}
microbenchmark::microbenchmark(mtd0(), mtd1(), times=1L)
timings:
Unit: seconds
expr min lq mean median uq max neval
mtd0() 65.76456 65.76456 65.76456 65.76456 65.76456 65.76456 1
mtd1() 18.29710 18.29710 18.29710 18.29710 18.29710 18.29710 1
I would suggest looking at the package Rfast. There are commands such as ttest1, ttest2 and ttests for one-sample, two-sample and many t-tests.
I have 34 rasters (nrow: 17735, ncol: 11328, ncell: 200902080) with values 0 and 1, of 4 MB each. I want the cumulative sum of those values, with a reset at each zero.
I tried several alternatives based on: Cumulative sum that resets when 0 is encountered
library(microbenchmark)
library(compiler)
library(dplyr)
library(data.table)
x <- c(0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0)
fun = function(x)
{ cs = cumsum(x)
cs - cummax((x == 0) * cs)
}
funC <- cmpfun(fun)
microbenchmark(
funcioEx = fun(x),
funComEx = funC(x),
lapplyEx = unname(unlist(lapply(split(x,cumsum(c(0,diff(x) != 0))), cumsum))),
dataTaEx = data.table(x)[, whatiwant := cumsum(x), by = rleid(x==0L)],
reduceEx = Reduce(function(x, y) if (y == 0) 0 else x+y, x, accumulate=TRUE)
)
I would like to optimize this procedure for my data, because with the second option (funComEx, the fastest) it takes about 3 hours.
Rcpp may help a little
library(Rcpp)
cppFunction(
"IntegerVector foo(NumericVector vect){
int N = vect.size();
IntegerVector ans(N);
ans[0] = vect[0];
for (int i = 1; i < N; i++){
if(vect[i] > 0){
ans[i] = ans[i-1] + vect[i];
} else {
ans[i] = 0;
}
}
return(ans);
}"
)
set.seed(42)
x = sample(0:1, 1e4, TRUE)
identical(foo(x), fun(x))
#[1] TRUE
microbenchmark(
funcioEx = fun(x),
funComEx = funC(x),
lapplyEx = unname(unlist(lapply(split(x,cumsum(c(0,diff(x) != 0))), cumsum))),
dataTaEx = data.table(x)[, whatiwant := cumsum(x), by = rleid(x==0L)],
reduceEx = Reduce(function(x, y) if (y == 0) 0 else x+y, x, accumulate=TRUE),
foo_RCPP = foo(x)
)
#Unit: microseconds
# expr min lq mean median uq max neval
# funcioEx 98.238 104.2495 118.81500 113.1935 121.1110 280.637 100
# funComEx 97.358 103.2230 113.81515 112.1670 118.1785 220.522 100
# lapplyEx 17810.638 18888.9055 20130.20765 19399.7415 20641.0550 28073.981 100
# dataTaEx 3435.387 3832.0025 4468.77932 4023.6395 4347.3840 17053.181 100
# reduceEx 7472.515 8174.4020 9614.23122 8634.7985 10177.1305 15719.788 100
# foo_RCPP 52.491 62.6085 80.65777 66.5670 72.4320 1102.315 100