R: getting rid of a for loop and speeding up code

I would like to speed up my calculations and obtain the result without using the loop in function m. Reproducible example:
N <- 2500
n <- 500
r <- replicate(1000, sample(N, n))
m <- function(r, N) {
  ic <- matrix(0, nrow = N, ncol = N)
  for (i in 1:ncol(r)) {
    p <- r[, i]
    ic[p, p] <- ic[p, p] + 1
  }
  ic
}
system.time(ic <- m(r, N))
# user system elapsed
# 6.25 0.51 6.76
isSymmetric(ic)
# [1] TRUE
In every iteration of the for loop we are dealing with a matrix, not a vector, so how can this be vectorized?
@joel.wilson The purpose of this function is to calculate pairwise frequencies of elements, so that afterwards we can estimate pairwise inclusion probabilities.
Thanks to @Khashaa and @alexis_laz. Benchmarks:
> require(rbenchmark)
> benchmark(m(r, N),
+           m1(r, N),
+           mvec(r, N),
+           alexis(r, N),
+           replications = 10, order = "elapsed")
          test replications elapsed relative user.self sys.self user.child sys.child
4 alexis(r, N)           10    4.73    1.000      4.63     0.11         NA        NA
3   mvec(r, N)           10    5.36    1.133      5.18     0.18         NA        NA
2     m1(r, N)           10    5.48    1.159      5.29     0.19         NA        NA
1      m(r, N)           10   61.41   12.983     60.43     0.90         NA        NA

This should be significantly faster, as it avoids the double-indexing assignment:
m1 <- function(r, N) {
  ic <- matrix(0, nrow = N, ncol = ncol(r))
  for (i in 1:ncol(r)) {
    p <- r[, i]
    ic[, i][p] <- 1
  }
  tcrossprod(ic)
}
system.time(ic1 <- m1(r, N))
# user system elapsed
# 0.53 0.01 0.55
all.equal(ic, ic1)
# [1] TRUE
Simple "counting/adding" operations can almost always be vectorized
mvec <- function(r, N) {
  ic <- matrix(0, nrow = N, ncol = ncol(r))
  i <- rep(1:ncol(r), each = nrow(r))
  ic[cbind(as.vector(r), i)] <- 1
  tcrossprod(ic)
}
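Here tcrossprod(ic) computes ic %*% t(ic), whose (a, b) entry counts the samples containing both a and b, which is exactly what the double-indexing loop accumulated. As a quick sanity check (a minimal sketch, assuming r, N, ic, and mvec from above are still in the workspace):
all.equal(ic, mvec(r, N))
# [1] TRUE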


How to iterate through parameters in for loop

I have a model written as a for loop that incorporates a number of parameters that I specify:
## functions needed to run the model
learn <- function(prior, sensi, speci, e){
  out <- ifelse(e == 1, (sensi*prior) / ((sensi*prior) + (1-speci)*(1-prior)),
                ((1-sensi)*prior) / (((1-sensi)*prior) + (speci*(1-prior))))
  out
}
feed <- function(vec){
  prior <- 0.5
  for (i in vec){
    res <- learn(prior, sensi, speci, i)
    prior <- res
  }
  return(prior)
}
## specify parameters
iterations <- 100
N <- 10
BR <- 0.66
sensi <- 0.75
speci <- 0.45
## initialize results object
res <- NULL
## loop for number of iterations
for (j in 1:iterations){
  X <- as.numeric(rbinom(1, 1, BR))
  if (X == 1){ # if X is 1...
    agents <- c(1:N)
    evidence <- vector("list", length(agents))
    for (i in agents) {
      n <- sample(10, 1, replace = TRUE)
      evidence[[i]] <- rbinom(n, 1, sensi)
    }
  } else { # if X is 0...
    agents <- c(1:N)
    evidence <- vector("list", length(agents))
    for (i in agents) {
      n <- sample(10, 1, replace = TRUE)
      evidence[[i]] <- rbinom(n, 1, sensi)
      evidence[[i]] <- ifelse(evidence[[i]]==1, 0, 1) # flip evidence
    }
  }
  # feed vectors of evidence through learn function
  t0 <- sapply(evidence, feed)
  # save dataframe
  df <- data.frame("i" = j,
                   "ID" = c(1:N),
                   "E" = t0,
                   "X" = X,
                   "N" = N,
                   "BR" = BR,
                   "sensi" = sensi,
                   "speci" = speci)
  res <- rbind(res, df)
}
This works fine for a single parameterisation, but I now want to automate the process of specifying different parameter values and re-running the model. So instead of defining each parameter as a single value, I define them as a vector of values and store all the possible parameterisations in a dataframe (paramspace) with each row holding the values for a single parameterisation that I want to run:
## set up for multiple parameterizations
iterations <- 100
N_vec <- c(10, 50)
BR_vec <- c(0.25, 0.50, 0.75)
sensi_vec <- c(0.45, 0.75)
speci_vec <- c(0.45, 0.75)
paramspace <- expand.grid(iterations = iterations, N = N_vec, BR = BR_vec, sensi = sensi_vec, speci = speci_vec)
> paramspace
iterations N BR sensi speci
1 100 10 0.25 0.45 0.45
2 100 50 0.25 0.45 0.45
3 100 10 0.50 0.45 0.45
4 100 50 0.50 0.45 0.45
5 100 10 0.75 0.45 0.45
6 100 50 0.75 0.45 0.45
7 100 10 0.25 0.75 0.45
8 100 50 0.25 0.75 0.45
9 100 10 0.50 0.75 0.45
10 100 50 0.50 0.75 0.45
11 100 10 0.75 0.75 0.45
12 100 50 0.75 0.75 0.45
13 100 10 0.25 0.45 0.75
14 100 50 0.25 0.45 0.75
15 100 10 0.50 0.45 0.75
16 100 50 0.50 0.45 0.75
17 100 10 0.75 0.45 0.75
18 100 50 0.75 0.45 0.75
19 100 10 0.25 0.75 0.75
20 100 50 0.25 0.75 0.75
21 100 10 0.50 0.75 0.75
22 100 50 0.50 0.75 0.75
23 100 10 0.75 0.75 0.75
24 100 50 0.75 0.75 0.75
How can I pass each row of parameter values to my model and automatically run through all the parameterisations stated in paramspace?
As suggested in the comments, you can create a function and then use apply to loop over the parameter combinations:
## functions needed to run the model
learn <- function(prior, sensi, speci, e){
  out <- ifelse(e == 1, (sensi*prior) / ((sensi*prior) + (1-speci)*(1-prior)),
                ((1-sensi)*prior) / (((1-sensi)*prior) + (speci*(1-prior))))
  out
}
feed <- function(vec, sensi, speci){
  prior <- 0.5
  for (i in vec){
    res <- learn(prior, sensi, speci, i)
    prior <- res
  }
  return(prior)
}
runModel <- function(iterations = 100,
                     N = 10,
                     BR = 0.66,
                     sensi = 0.75,
                     speci = 0.45) {
  ## initialize results object
  res <- NULL
  ## loop for number of iterations
  for (j in 1:iterations){
    X <- as.numeric(rbinom(1, 1, BR))
    if (X == 1){ # if X is 1...
      agents <- c(1:N)
      evidence <- vector("list", length(agents))
      for (i in agents) {
        n <- sample(10, 1, replace = TRUE)
        evidence[[i]] <- rbinom(n, 1, sensi)
      }
    } else { # if X is 0...
      agents <- c(1:N)
      evidence <- vector("list", length(agents))
      for (i in agents) {
        n <- sample(10, 1, replace = TRUE)
        evidence[[i]] <- rbinom(n, 1, sensi)
        evidence[[i]] <- ifelse(evidence[[i]]==1, 0, 1) # flip evidence
      }
    }
    # feed vectors of evidence through learn function,
    # passing sensi and speci explicitly
    # t0 <- sapply(evidence, feed)  # original call
    t0 <- sapply(evidence, function(e) feed(e, sensi, speci))
    # save dataframe ("i" = j, as in the original loop)
    df <- data.frame("i" = j,
                     "ID" = c(1:N),
                     "E" = t0,
                     "X" = X,
                     "N" = N,
                     "BR" = BR,
                     "sensi" = sensi,
                     "speci" = speci)
    res <- rbind(res, df)
  }
  res
}
# Define parameter space
iterations <- 100
N_vec <- c(10, 50)
BR_vec <- c(0.25, 0.50, 0.75)
sensi_vec <- c(0.45, 0.75)
speci_vec <- c(0.45, 0.75)
paramspace <- expand.grid(iterations = iterations, N = N_vec, BR = BR_vec, sensi = sensi_vec, speci = speci_vec)
# Loop over parameter space:
res <- apply(paramspace, 1, function(paramset) {
  iterations = paramset[1]
  N = paramset[2]
  BR = paramset[3]
  sensi = paramset[4]
  speci = paramset[5]
  runModel(iterations = iterations, N = N, BR = BR, sensi = sensi, speci = speci)
})
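Note that apply coerces paramspace to a matrix, which is harmless here because all columns are numeric. A sketch of an alternative that keeps each row as a typed data frame row, assuming runModel as defined above:
res <- lapply(seq_len(nrow(paramspace)), function(i) {
  # one-row data frame -> named list -> named arguments of runModel
  do.call(runModel, as.list(paramspace[i, ]))
})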
You can also use the foreach package which, combined with an appropriate backend, offers parallelization capabilities in case your task becomes more intensive. Here is a simple example to understand how it works:
foreach(a=1:3, b=4:6) %do% (a + b)
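The same expression can run in parallel once a backend is registered; a minimal sketch, assuming the doParallel package is installed:
library(doParallel)
registerDoParallel(cores = 2)  # register a parallel backend
foreach(a = 1:3, b = 4:6) %dopar% (a + b)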
Embedding your code into foreach then looks like this:
require(foreach)
## functions needed to run the model
learn <- function(prior, sensi, speci, e){
  out <- ifelse(e == 1, (sensi*prior) / ((sensi*prior) + (1-speci)*(1-prior)),
                ((1-sensi)*prior) / (((1-sensi)*prior) + (speci*(1-prior))))
  out
}
## feed takes sensi and speci as arguments so that it uses the values supplied
## by foreach rather than looking up globals that do not exist here
feed <- function(vec, sensi, speci){
  prior <- 0.5
  for (i in vec){
    res <- learn(prior, sensi, speci, i)
    prior <- res
  }
  return(prior)
}
## set up for multiple parameterizations
iterations <- 100
N_vec <- c(10, 50)
BR_vec <- c(0.25, 0.50, 0.75)
sensi_vec <- c(0.45, 0.75)
speci_vec <- c(0.45, 0.75)
paramspace <- expand.grid(iterations = iterations, N = N_vec, BR = BR_vec, sensi = sensi_vec, speci = speci_vec)
res <- foreach(iterations = paramspace$iterations,
               N = paramspace$N,
               BR = paramspace$BR,
               sensi = paramspace$sensi,
               speci = paramspace$speci) %do% {
  ## initialize results object
  res <- NULL
  ## loop for number of iterations
  for (j in 1:iterations){
    X <- as.numeric(rbinom(1, 1, BR))
    if (X == 1){ # if X is 1...
      agents <- c(1:N)
      evidence <- vector("list", length(agents))
      for (i in agents) {
        n <- sample(10, 1, replace = TRUE)
        evidence[[i]] <- rbinom(n, 1, sensi)
      }
    } else { # if X is 0...
      agents <- c(1:N)
      evidence <- vector("list", length(agents))
      for (i in agents) {
        n <- sample(10, 1, replace = TRUE)
        evidence[[i]] <- rbinom(n, 1, sensi)
        evidence[[i]] <- ifelse(evidence[[i]]==1, 0, 1) # flip evidence
      }
    }
    # feed vectors of evidence through learn function
    # (sensi and speci are passed explicitly so feed sees the current values)
    t0 <- sapply(evidence, function(e) feed(e, sensi, speci))
    # save dataframe
    df <- data.frame("i" = j,
                     "ID" = c(1:N),
                     "E" = t0,
                     "X" = X,
                     "N" = N,
                     "BR" = BR,
                     "sensi" = sensi,
                     "speci" = speci)
    res <- rbind(res, df)
  }
  res
}
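By default foreach returns a list with one element per parameter combination. If you would rather have a single stacked data frame, foreach accepts a .combine argument; only the changed lines are sketched here:
res <- foreach(iterations = paramspace$iterations,
               # ... other iteration variables as above ...
               .combine = rbind) %do% {
  # ... model body as above ...
}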
Another approach is to make a function and to use Map(...). The advantage of Map is that your paramspace will not be coerced into a matrix, which would force everything to the same type (i.e., numeric, character, etc.).
There were also some other changes I made in order to let R do the accounting for us. Primarily:
X is now a logical so we can simplify our if statements. Additionally, the allocation is made all at once instead of looping.
We change the feed() function to also generate the evidence. This allows us to...
Use replicate to repeat the loops.
learn2 <- function(prior, sensi, speci, e){
  out <- ifelse(e, (sensi*prior) / ((sensi*prior) + (1-speci)*(1-prior)),
                ((1-sensi)*prior) / (((1-sensi)*prior) + (speci*(1-prior))))
  out
}
feed2 = function(x, N, samp_n = 10L, sensi, speci) {
  evidence = rbinom(sample(samp_n, 1L, replace = TRUE),
                    1,
                    if (x) sensi else 1 - sensi)
  prior = 0.5
  for (i in evidence) {
    res = learn2(prior, sensi, speci, i)
    prior = res
  }
  return(prior)
}
runModel2 <- function(iterations = 2,
                      N = 10,
                      BR = 0.66,
                      sensi = 0.75,
                      speci = 0.45) {
  ## draw each agent's true state with base rate BR; prob must be passed by
  ## name here (sample(c(TRUE, FALSE), N, BR) would be read as replace = BR
  ## and silently ignore the base rate)
  X = sample(c(TRUE, FALSE), N, replace = TRUE, prob = c(BR, 1 - BR))
  ## this is done now so that the columns will be ordered nicer
  ans = list(ID = 1:N,
             N = N,
             BR = BR,
             sensi = sensi,
             speci = speci,
             X = X)
  t0s = replicate(iterations,
                  vapply(X, feed2, FUN.VALUE = 0, N, 10L, sensi, speci, USE.NAMES = FALSE),
                  simplify = FALSE)
  names(t0s) = paste0("E_", 1:iterations)
  return(as.data.frame(c(ans, t0s)))
}
runModel2()
#> ID N BR sensi speci X E_1 E_2
#> 1 1 10 0.66 0.75 0.45 TRUE 0.82967106 0.657648599
#> 2 2 10 0.66 0.75 0.45 FALSE 0.43103448 0.006827641
#> 3 3 10 0.66 0.75 0.45 TRUE 0.43103448 0.775671866
#> 4 4 10 0.66 0.75 0.45 TRUE 0.71716957 0.431034483
#> 5 5 10 0.66 0.75 0.45 FALSE 0.24176079 0.016593958
#> 6 6 10 0.66 0.75 0.45 FALSE 0.30303324 0.008992838
#> 7 7 10 0.66 0.75 0.45 TRUE 0.82967106 0.865405260
#> 8 8 10 0.66 0.75 0.45 FALSE 0.43103448 0.439027817
#> 9 9 10 0.66 0.75 0.45 FALSE 0.57692308 0.050262167
#> 10 10 10 0.66 0.75 0.45 FALSE 0.02178833 0.296208531
This output is a little wider than your original approach. We can always reshape the E_# columns but this may end up being better for your actual use case.
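If you do want the long format back, one base-R way is to stack the E_ columns manually (a minimal sketch; wide stands for the output of runModel2()):
wide <- runModel2()
e_cols <- grep("^E_", names(wide))  # positions of the E_ columns
long <- cbind(wide[rep(seq_len(nrow(wide)), length(e_cols)), -e_cols, drop = FALSE],
              i = rep(seq_along(e_cols), each = nrow(wide)),
              E = unlist(wide[e_cols], use.names = FALSE))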
Finally, here is Map() in action:
iterations <- 100
N_vec <- c(10, 50)
BR_vec <- c(0.25, 0.50, 0.75)
sensi_vec <- c(0.45, 0.75)
speci_vec <- c(0.45, 0.75)
paramspace <- expand.grid(iterations = iterations, N = N_vec, BR = BR_vec, sensi = sensi_vec, speci = speci_vec)
res = Map(runModel2, paramspace$iterations, paramspace$N, paramspace$BR, paramspace$sensi, paramspace$speci)
res[[24L]][1:10, 1:8] ## only first 10 rows for demonstration
## ID N BR sensi speci X E_1 E_2
##1 1 50 0.75 0.75 0.75 TRUE 0.500000000 0.500000000
##2 2 50 0.75 0.75 0.75 FALSE 0.001369863 0.035714286
##3 3 50 0.75 0.75 0.75 FALSE 0.250000000 0.900000000
##4 4 50 0.75 0.75 0.75 TRUE 0.750000000 0.250000000
##5 5 50 0.75 0.75 0.75 TRUE 0.987804878 0.500000000
##6 6 50 0.75 0.75 0.75 TRUE 0.964285714 0.250000000
##7 7 50 0.75 0.75 0.75 TRUE 0.750000000 0.750000000
##8 8 50 0.75 0.75 0.75 FALSE 0.012195122 0.035714286
##9 9 50 0.75 0.75 0.75 TRUE 0.750000000 0.500000000
##10 10 50 0.75 0.75 0.75 FALSE 0.250000000 0.001369863
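Since res is a list of data frames, one per row of paramspace, it can be stacked into a single data frame with an identifier for the parameterisation (a sketch; param_id is just an illustrative column name):
res_all <- do.call(rbind, Map(cbind, res, param_id = seq_along(res)))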

Grouping factors or integers into equivalence classes in R

I have a data frame representing equivalences between members from two sets:
print(x)
G S
1 g1 s2
2 g1 s1
3 g2 s3
4 g3 s3
5 g4 s3
Does someone know of a function or a useful data structure for grouping the objects into equivalence classes? In the example above, the result should be two equivalence classes
{g1, s1, s2}, {g2, g3, g4, s3}
An option is to use igraph to extract vertices from clusters:
library(igraph)
g <- graph_from_data_frame(x)
m <- clusters(g)$membership
tapply(names(m), m, sort)
output:
$`1`
[1] "g1" "s1" "s2"
$`2`
[1] "g2" "g3" "g4" "s3"
data:
x <- read.table(text="G S
g1 s2
g1 s1
g2 s3
g3 s3
g4 s3", header=TRUE, stringsAsFactors=FALSE)
You can test for equality using outer and combine the two comparisons with |. From this matrix, keep the unique rows and then use apply to return a list of the groups.
tt <- outer(x$G, x$G, "==") | outer(x$S, x$S, "==")
tt <- unique(tt)
apply(tt, 1, function(i) unique(unlist(x[i,])))
#[[1]]
#[1] "g1" "s2" "s1"
#
#[[2]]
#[1] "g2" "g3" "g4" "s3"
Another option, which loops over the vector instead of expanding it with outer, is:
y <- unique(x)
t1 <- tt1 <- y[1,1]
t2 <- tt2 <- y[1,2]
y <- y[-1,]
n <- 1
res <- list(0)
repeat {
  i <- y[,1] %in% tt1 | y[,2] %in% tt2
  tt <- y[i,]
  y <- y[!i,]
  tt1 <- unique(tt[!tt[,1] %in% tt1,1])
  tt2 <- unique(tt[!tt[,2] %in% tt2,2])
  if(length(tt1) + length(tt2) > 0) {
    t1 <- c(t1, tt1)
    t2 <- c(t2, tt2)
  } else {
    res[[n]] <- unique(c(t1, t2))
    if(nrow(y) == 0) break
    n <- n + 1
    t1 <- tt1 <- y[1,1]
    t2 <- tt2 <- y[1,2]
    y <- y[-1,]
  }
}
res
#[[1]]
#[1] "g1" "s2" "s1"
#
#[[2]]
#[1] "g2" "g3" "g4" "s3"
Data:
x <- structure(list(G = c("g1", "g1", "g2", "g3", "g4"), S = c("s2",
"s1", "s3", "s3", "s3")), class = "data.frame", row.names = c(NA, -5L))
You can apply the following code for grouping
# function to categorize incoming `v` within existing `lst`
grp <- function(lst, v) {
  if (length(lst) == 0) return(c(lst, list(v)))
  idx <- which(unlist(Map(function(x) any(!is.na(match(v, x))), lst)))
  if (length(idx) == 0) {
    lst <- c(lst, list(v))
  } else {
    lst[idx] <- list(union(unlist(lst[idx]), v))
  }
  return(unique(lst))
}
# generate grouping results
df <- unique(df)
res <- Reduce(function(lst, x) grp(lst, x),
              c(list(NULL), unname(Map(function(x) as.character(unlist(x)), split(df, seq(nrow(df)))))),
              accumulate = FALSE)
Application Examples
given input data df <- data.frame(G = c("g1","g1","g2","g3","g4"), S = c("s2","s1","s3","s3","s3"))
then
> df
G S
1 g1 s2
2 g1 s1
3 g2 s3
4 g3 s3
5 g4 s3
> res
[[1]]
[1] "g1" "s2" "s1"
[[2]]
[1] "g2" "s3" "g3" "g4"
given input data df <- data.frame(G = sprintf("g%i", c(2,3,4,2,2)), S = sprintf("s%i", c(3,3,2,4,3)))
then
> df
G S
1 g2 s3
2 g3 s3
3 g4 s2
4 g2 s4
> res
[[1]]
[1] "g2" "s3" "g3" "s4"
[[2]]
[1] "g4" "s2"
UPDATE: the solution above becomes rather slow when dealing with huge datasets. An improved solution is given below:
G2S <- function(df, g) {
  df[df$G %in% g,]$S
}
S2G <- function(df, s) {
  df[df$S %in% s,]$G
}
grpFun <- function(df, g) {
  repeat {
    gt <- S2G(df, (s <- G2S(df, g)))
    if (length(gt) == length(g)) return(list(G = gt, S = s))
    g <- gt
  }
}
res <- c()
Gpool <- x$G
repeat {
  if (length(Gpool) == 0) break
  grp <- grpFun(x, Gpool[1])
  Gpool <- setdiff(Gpool, grp$G)
  res <- c(res, list(union(unique(grp$G), unique(grp$S))))
}
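Applied to the example data from the question, this returns the same two classes (a quick check):
x <- data.frame(G = c("g1","g1","g2","g3","g4"),
                S = c("s2","s1","s3","s3","s3"), stringsAsFactors = FALSE)
# res
# [[1]]
# [1] "g1" "s2" "s1"
# [[2]]
# [1] "g2" "g3" "g4" "s3"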
To compare the runtime of the three answers by @GKi, @chinsoon12, and @ThomasisCoding, I created random sets of different size n and measured the runtime (as "elapsed" from proc.time).
From the results, I conclude that the method relying on igraph's connected-component decomposition is the fastest:
n chinsoon12 ThomasisCoding GKi
500 0.002 0.054 0.030
2500 0.010 0.203 0.416
5000 0.020 0.379 1.456
7500 0.033 0.670 3.351
10000 0.044 0.832 5.837
Edit (2019-11-19): Upon request of @GKi, here is the code I used for comparing the runtime of the three algorithms. Beware that all functions work on the global variable x, because passing x as an argument (R copies arguments by value) would add unwanted overhead to this runtime estimation:
library(igraph)
# solution by chinsoon12: CC decomposition from igraph
method.A <- function() {
  g <- graph_from_data_frame(x)
  m <- clusters(g)$membership
  res <- tapply(names(m), m, sort)
  return(res)
}
# solution by ThomasisCoding
method.B <- function() {
  # find 1-to-1 mappings
  r <- Reduce(intersect, lapply(names(x), function(v) split(x, x[v])))
  r1map <- unlist(Map(toString, Map(unlist, r)))
  # remove 1-to-1 mappings and find N-to-1 mappings
  if (length(r1map) > 0) {
    xx <- x[-as.numeric(rownames(Reduce(rbind, r))),]
  } else {
    xx <- x
  }
  rNmap <- c()
  if (nrow(xx) > 0) {
    rNmap <- sapply(names(xx),
                    function(v) {
                      z <- split(xx, xx[v])
                      u <- z[unlist(Map(nrow, z)) > 1]
                      ifelse(length(u) == 0, NA, toString(c(names(u), as.vector(u[[1]][, setdiff(names(xx), v)]))))
                    }, USE.NAMES = FALSE)
    rNmap <- rNmap[!is.na(rNmap)]
  }
  # combine both 1-to-1 and N-to-1 mappings
  res <- c(r1map, rNmap)
  return(res)
}
# solution by GKi: with outer product
method.C <- function() {
  tt <- outer(x$G, x$G, "==") | outer(x$S, x$S, "==")
  tt <- unique(tt)
  res <- apply(tt, 1, function(i) unique(unlist(x[i,])))
  return(res)
}
# runtime results
rt <- data.frame()
for (n in seq(500, 10000, by = 500)) {
  # this won't work because of ambiguous node ids (see the answer by GKi):
  # x <- data.frame(G = sample(1:n, n, replace = TRUE), S = sample(1:n, n, replace = TRUE))
  # therefore, make the node ids unique:
  x <- data.frame(G = sprintf("g%i", sample(1:n, n, replace = TRUE)), S = sprintf("s%i", sample(1:n, n, replace = TRUE)))
  t1 <- proc.time()
  method.A()
  tA <- proc.time() - t1
  t1 <- proc.time()
  method.B()
  tB <- proc.time() - t1
  t1 <- proc.time()
  method.C()
  tC <- proc.time() - t1
  rt <- rbind(rt, data.frame(n = n, t.A = tA[["elapsed"]], t.B = tB[["elapsed"]], t.C = tC[["elapsed"]]))
}
print(rt)
plot(rt$n, rt$t.C, xlab = "n", ylab = "run time [s]", ylim = c(min(rt$t.A), max(rt$t.C)), type = 'l')
lines(rt$n, rt$t.B, col = "red")
lines(rt$n, rt$t.A, col = "blue")
legend("topleft", c("GKi", "ThomasisCoding", "chinsoon12"), lty = c(1, 1, 1), col = c("black", "red", "blue"))
Comparison of the results of the methods:
method.A()
#$`1`
#[1] "1" "2" "3" "4"
method.A2()
#$`1`
#[1] "3" "1" "4" "2"
#
#$`2`
#[1] "2" "3"
method.B()
#[[1]]
#[1] 3 1 4 2
#
#[[2]]
#[1] 2 3
method.C()
#[[1]]
#[[1]]$All
#[1] 3 1 4 2
#
#[[1]]$G
#[1] 3 1
#
#[[1]]$S
#[1] 4 2 1
#
#
#[[2]]
#[[2]]$All
#[1] 2 3
#
#[[2]]$G
#[1] 2
#
#[[2]]$S
#[1] 3
Methods:
library(igraph)
method.A <- function() {
  g <- graph_from_data_frame(x)
  m <- clusters(g)$membership
  res <- tapply(names(m), m, sort)
  return(res)
}
method.A2 <- function() {
  g <- graph_from_data_frame(t(apply(x, 1, function(x) paste0(names(x), x))))
  m <- clusters(g)$membership
  res <- tapply(substring(names(m), 2), m, unique)
  return(res)
}
method.B <- function() {
  G2S <- function(df, g) {
    df[df$G %in% g,]$S
  }
  S2G <- function(df, s) {
    df[df$S %in% s,]$G
  }
  grpFun <- function(df, g) {
    repeat {
      gt <- S2G(df, (s <- G2S(df, g)))
      if (length(gt) == length(g)) return(list(G = gt, S = s))
      g <- gt
    }
  }
  res <- c()
  Gpool <- x$G
  repeat {
    if (length(Gpool) == 0) break
    grp <- grpFun(x, Gpool[1])
    Gpool <- setdiff(Gpool, grp$G)
    res <- c(res, list(union(unique(grp$G), unique(grp$S))))
  }
  return(res)
}
method.C <- function() {
  y <- unique(x)
  t1 <- tt1 <- y[1,1]
  t2 <- tt2 <- y[1,2]
  y <- y[-1,]
  n <- 1
  res <- list(0)
  repeat {
    i <- y[,1] %in% tt1 | y[,2] %in% tt2
    tt <- y[i,]
    y <- y[!i,]
    tt1 <- unique(tt[!tt[,1] %in% tt1,1])
    tt2 <- unique(tt[!tt[,2] %in% tt2,2])
    if(length(tt1) + length(tt2) > 0) {
      t1 <- c(t1, tt1)
      t2 <- c(t2, tt2)
    } else {
      res[[n]] <- list(All = unique(c(t1, t2)), G = unique(t1), S = unique(t2))
      if(nrow(y) == 0) break
      n <- n + 1
      t1 <- tt1 <- y[1,1]
      t2 <- tt2 <- y[1,2]
      y <- y[-1,]
    }
  }
  res
}
Data:
x <- data.frame(G = c(3,1,1,2,3), S=c(4,1,2,3,2))
x
# G S
#1 3 4
#2 1 1
#3 1 2
#4 2 3
#5 3 2
UPDATE: performance comparison based on the latest updates by @GKi, @chinsoon12, and @ThomasisCoding. The comparison code is the same as above: the method.A, method.B, and method.C definitions from the Methods section, timed with the same proc.time() loop over n = seq(500, 10000, by = 500).
Runtime of the three methods:
n t.A t.B t.C
1 500 0.00 0.16 0.26
2 1000 0.02 0.31 0.53
3 1500 0.02 0.51 1.11
4 2000 0.03 0.90 1.47
5 2500 0.03 1.35 2.17
6 3000 0.04 2.08 3.14
7 3500 0.04 2.66 3.97
8 4000 0.07 3.38 4.92
9 4500 0.07 4.38 6.35
10 5000 0.06 5.41 7.58
11 5500 0.08 6.79 9.55
12 6000 0.08 7.81 10.91
13 6500 0.10 9.03 12.06
14 7000 0.09 10.06 14.20
15 7500 0.11 11.76 15.65
16 8000 0.13 13.41 17.84
17 8500 0.11 14.87 20.67
18 9000 0.13 16.88 23.52
19 9500 0.14 18.38 25.57
20 10000 0.14 22.81 30.05
Visualization of the runtime (plot omitted; see the table above).
Additional (thanks to a comment by @GKi): when the dataset is kept as integers, the runtimes of the non-igraph grouping methods drop considerably:
n t.A t.B t.C
1 500 0.00 0.09 0.13
2 1000 0.01 0.15 0.23
3 1500 0.01 0.22 0.38
4 2000 0.03 0.31 0.50
5 2500 0.05 0.45 0.76
6 3000 0.07 0.51 0.77
7 3500 0.06 0.67 0.97
8 4000 0.07 0.85 1.20
9 4500 0.07 0.90 1.39
10 5000 0.09 1.23 1.55
11 5500 0.09 1.30 1.78
12 6000 0.09 1.51 1.94
13 6500 0.11 1.77 2.20
14 7000 0.13 2.18 2.55
15 7500 0.12 2.37 2.79
16 8000 0.13 2.56 2.96
17 8500 0.14 2.76 3.39
18 9000 0.15 3.03 3.54
19 9500 0.15 3.54 4.23
20 10000 0.16 3.76 4.32

Multiple range of rows deletion in R

Let's say I have
v <- matrix(seq(150), 50, 3)
k <- c(10, 40)
delta <- 5
How can I delete the 10-delta to 10+delta rows and 40-delta to 40+delta rows simultaneously?
I used vnew <- v[-((k-delta):(k+delta)),], but it seems that the command only deletes using the first element of k (which is 10) and does not delete rows 40-delta to 40+delta. Does anyone have an idea how to do this?
Oh and I will need to put this inside a loop where k is being updated in each iteration, so v[c(-{(10-delta):(10+delta)},-{(40-delta):(40+delta)}),] won't work.
If k is growing in each iteration and delta doesn't change, I would suggest the following:
d <- -delta:delta
for (...) {
  # ...
  vnew <- v[-(rep(k, each = length(d)) + d),]
  # ...
}
For your example:
d <- -5:5
k <- c(10, 40)
rep(k, each=length(d)) + d
# [1] 5 6 7 8 9 10 11 12 13 14 15 35 36 37 38 39 40 41 42 43 44 45
EDIT: a benchmark of both solutions:
library("rbenchmark")
idx1 <- function(k, delta) {
  d <- -delta:delta
  lapply(seq_along(k), function(i) {
    rep(k[1:i], each = length(d)) + d
  })
}
idx2 <- function(k, delta) {
  lapply(seq_along(k), function(i) {
    c(sapply(1:i, function(ii) {
      (k[ii]-delta):(k[ii]+delta)
    }))
  })
}
set.seed(1)
k <- sample(1e3, 1e2)
delta <- 5
all.equal(idx1(k, delta), idx2(k, delta))
# [1] TRUE
benchmark(idx1(k, delta), idx2(k, delta), order="relative", replications=100)
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 idx1(k, delta) 100 0.174 1.000 0.172 0 0 0
# 2 idx2(k, delta) 100 1.579 9.075 1.576 0 0 0
Richard Scriven's answer only returns the indexes 10-delta:10+delta and 40-delta:40+delta of the rows to be removed from v. To actually remove them, you must combine it with what you tried, like this:
v[-c(sapply(seq(k), function(i) (k[i]-delta):(k[i]+delta))), ]
or shorter but dirtier(?): v[-sapply(seq(k), function(i) (k[i]-delta):(k[i]+delta)), ]
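An equivalent way to build the index set is with outer(), which makes the intent a bit more explicit (a sketch using the example values):
idx <- as.vector(outer(-delta:delta, k, "+"))  # rows 5:15 and 35:45
vnew <- v[-idx, ]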

Efficiently replicate matrices in R

I have a matrix and look for an efficient way to replicate it n times (where n is the number of observations in the dataset). For example, if I have a matrix A
A <- matrix(1:15, nrow=3)
then I want an output of the form
rbind(A, A, A, ...) #n times.
Obviously, there are many ways to construct such a large matrix, for example using a for loop or apply or similar functions. However, the call to the "matrix-replication-function" takes place in the very core of my optimization algorithm, where it is called tens of thousands of times during one run of my program. Therefore, loops, apply-type functions, and anything similar are not efficient enough. (Such a solution would basically mean that a loop over n is performed tens of thousands of times, which is obviously inefficient.) I already tried to use the ordinary rep function, but haven't found a way to arrange the output of rep in a matrix of the desired format.
The solution
do.call("rbind", replicate(n, A, simplify=F))
is also too inefficient because rbind is used too often in this case. (Then, about 30% of the total runtime of my program are spent performing the rbinds.)
Does anyone know a better solution?
Two more solutions:
The first is a modification of the example in the question
do.call("rbind", rep(list(A), n))
The second involves unrolling the matrix, replicating it, and reassembling it.
matrix(rep(t(A),n), ncol=ncol(A), byrow=TRUE)
Since efficiency is what was requested, benchmarking is necessary
library("rbenchmark")
A <- matrix(1:15, nrow=3)
n <- 10
benchmark(rbind(A, A, A, A, A, A, A, A, A, A),
          do.call("rbind", replicate(n, A, simplify=FALSE)),
          do.call("rbind", rep(list(A), n)),
          apply(A, 2, rep, n),
          matrix(rep(t(A),n), ncol=ncol(A), byrow=TRUE),
          order="relative", replications=100000)
which gives:
test replications elapsed
1 rbind(A, A, A, A, A, A, A, A, A, A) 100000 0.91
3 do.call("rbind", rep(list(A), n)) 100000 1.42
5 matrix(rep(t(A), n), ncol = ncol(A), byrow = TRUE) 100000 2.20
2 do.call("rbind", replicate(n, A, simplify = FALSE)) 100000 3.03
4 apply(A, 2, rep, n) 100000 7.75
relative user.self sys.self user.child sys.child
1 1.000 0.91 0 NA NA
3 1.560 1.42 0 NA NA
5 2.418 2.19 0 NA NA
2 3.330 3.03 0 NA NA
4 8.516 7.73 0 NA NA
So the fastest is the raw rbind call, but that assumes n is fixed and known ahead of time. If n is not fixed, then the fastest is do.call("rbind", rep(list(A), n)). These were for a 3x5 matrix and 10 replications. Different sized matrices might give different orderings.
EDIT:
For n=600, the results are in a different order (leaving out the explicit rbind version):
A <- matrix(1:15, nrow=3)
n <- 600
benchmark(do.call("rbind", replicate(n, A, simplify=FALSE)),
          do.call("rbind", rep(list(A), n)),
          apply(A, 2, rep, n),
          matrix(rep(t(A),n), ncol=ncol(A), byrow=TRUE),
          order="relative", replications=10000)
giving
test replications elapsed
4 matrix(rep(t(A), n), ncol = ncol(A), byrow = TRUE) 10000 1.74
3 apply(A, 2, rep, n) 10000 2.57
2 do.call("rbind", rep(list(A), n)) 10000 2.79
1 do.call("rbind", replicate(n, A, simplify = FALSE)) 10000 6.68
relative user.self sys.self user.child sys.child
4 1.000 1.75 0 NA NA
3 1.477 2.54 0 NA NA
2 1.603 2.79 0 NA NA
1 3.839 6.65 0 NA NA
If you include the explicit rbind version, it is slightly faster than the do.call("rbind", rep(list(A), n)) version, but not by much, and slower than either the apply or matrix versions. So a generalization to arbitrary n does not require a loss of speed in this case.
Probably this is more efficient:
apply(A, 2, rep, n)
There's also this way, using the Kronecker product:
rep(1, n) %x% A
You can use indexing
A[rep(seq(nrow(A)), n), ]
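A quick check that the indexing approach reproduces the stacked matrix (a minimal sketch):
A <- matrix(1:15, nrow = 3)
identical(A[rep(seq(nrow(A)), 2), ], rbind(A, A))
# [1] TRUE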
I came here for the same reason as the original poster and ultimately updated @Brian Diggs's comparison to include all of the other posted answers. Hopefully I did this correctly:
#install.packages("rbenchmark")
library("rbenchmark")
A <- matrix(1:15, nrow=3)
n <- 600
benchmark(do.call("rbind", replicate(n, A, simplify=FALSE)),
          do.call("rbind", rep(list(A), n)),
          apply(A, 2, rep, n),
          matrix(rep(t(A),n), ncol=ncol(A), byrow=TRUE),
          A[rep(seq(nrow(A)), n), ],
          rep(1, n) %x% A,
          apply(A, 2, rep, n),
          matrix(rep(as.integer(t(A)),n), nrow=nrow(A)*n, byrow=TRUE),
          order="relative", replications=10000)
# test replications elapsed relative user.self sys.self user.child sys.child
#5 A[rep(seq(nrow(A)), n), ] 10000 0.32 1.000 0.33 0.00 NA NA
#8 matrix(rep(as.integer(t(A)), n), nrow = nrow(A) * n, byrow = TRUE) 10000 0.36 1.125 0.35 0.02 NA NA
#4 matrix(rep(t(A), n), ncol = ncol(A), byrow = TRUE) 10000 0.38 1.188 0.37 0.00 NA NA
#3 apply(A, 2, rep, n) 10000 0.59 1.844 0.56 0.03 NA NA
#7 apply(A, 2, rep, n) 10000 0.61 1.906 0.58 0.03 NA NA
#6 rep(1, n) %x% A 10000 1.44 4.500 1.42 0.02 NA NA
#2 do.call("rbind", rep(list(A), n)) 10000 1.67 5.219 1.67 0.00 NA NA
#1 do.call("rbind", replicate(n, A, simplify = FALSE)) 10000 5.03 15.719 5.02 0.01 NA NA
What about transforming the matrix into a vector, replicating the content, and creating a new matrix with the updated number of rows?
A <- matrix(...)
n <- 2 # just a test
a <- as.integer(A)
multi.a <- rep(a, n)
multi.A <- matrix(multi.a, nrow = nrow(A) * n, byrow = TRUE)

Finding the mean of all Duplicates

There is a nice explanation here describing how to eliminate duplicates in a data frame by picking the maximum variable.
I can also see how this can be applied to pick the duplicate with the minimum variable.
My question now is: how do I display the mean of all duplicates?
for example:
z <- data.frame(id=c(1,1,2,2,3,4),var=c(2,4,1,3,5,2))
# id var
# 1 2
# 1 4
# 2 1
# 2 3
# 3 5
# 4 2
I would like the output:
# id var
# 1 3 mean(2,4)
# 2 2 mean(1,3)
# 3 5
# 4 2
My current code is:
averages <- do.call(rbind, lapply(split(z, z$id), function(chunk) mean(chunk$var)))
z <- z[order(z$id),]
z <- z[!duplicated(z$id),]
z$var <- averages
My code runs very slowly and is takes about 10 times longer than the method for picking the maximum. How do I optimize this code?
Here is a faster solution using data.table
library(data.table)
z <- data.frame(id=sample(letters, 6e5, replace = TRUE),var = rnorm(6e5))
fn1 <- function(z){
  z$var <- ave(z$var, z$id, FUN=mean)
  return(unique(z))
}
fn2 <- function(z) {
  t(sapply(split(z, z$id), function(x) sapply(x, mean)))
}
fn3 <- function(z){
  data.table(z)[, list(var = mean(var)), 'id']
}
library(rbenchmark)
benchmark(f1 <- fn1(z), f2 <- fn2(z), f3 <- fn3(z), replications = 2)
          test replications elapsed relative user.self sys.self
1 f1 <- fn1(z)            2   3.619 8.455607     3.331    0.242
2 f2 <- fn2(z)            2   0.586 1.369159     0.365    0.220
3 f3 <- fn3(z)            2   0.428 1.000000     0.341    0.086
I think split() and unsplit() are one way.
dupMean <- function(x)
{
  result <- split(x[, 2], x[, 1])
  result <- lapply(result, mean)
  result <- unsplit(result, unique(x[, 1]))
  return(result)
}
Or, to save a line with plyr:
require(plyr)
dupMean <- function(x)
{
  result <- split(x[, 2], x[, 1])
  result <- laply(result, mean)
  return(result)
}
Update:
Just for curiosity, here is a comparison of the various functions suggested. Ramnath (fn3) looks to be the winner on my computer.
require(plyr)
require(data.table)
require(rbenchmark)
fn1 <- function(z){
  z$var <- ave(z$var, z$id, FUN=mean)
  return(unique(z))
}
fn2 <- function(z) {
  t(sapply(split(z, z$id), function(x) sapply(x, mean)))
}
fn3 <- function(z){
  data.table(z)[, list(var = mean(var)), 'id']
}
fn4 <- function(x)
{
  result <- t(sapply(split(x, x$id), function(y) sapply(y, mean)))
  return(result)
}
fn5 <- function(x)
{
  x$var <- ave(x$var, x$id, FUN=mean)
  x <- unique(x)
  return(x)
}
fn6 <- function(x)
{
  result <- do.call(rbind, lapply(split(x, x$id), function(chunk) mean(chunk$var)))
  return(data.frame(id = unique(x[, 1]), var = result))
}
fn7 <- function(x)
{
  result <- split(x[, 2], x[, 1])
  result <- lapply(result, mean)
  result <- unsplit(result, unique(x[, 1]))
  return(data.frame(id = unique(x[, 1]), var = result))
}
fn8 <- function(x)
{
  result <- split(x[, 2], x[, 1])
  result <- laply(result, mean)
  return(data.frame(id = unique(x[, 1]), var = result))
}
z <- data.frame(id = rep(c(1,1,2,2,3,4,5,6,6,7), 1e5), var = rnorm(1e6))
benchmark(f1 <- fn1(z), f2 <- fn2(z), f3 <- fn3(z), f4 <- fn4(z), f5 <- fn5(z), f6 <- fn6(z), f7 <- fn7(z), f8 <- fn8(z), replications = 2)
Result:
test replications elapsed relative user.self sys.self
1 f1 <- fn1(z) 2 13.45 20.692308 13.27 0.15
2 f2 <- fn2(z) 2 3.54 5.446154 3.43 0.09
3 f3 <- fn3(z) 2 0.65 1.000000 0.54 0.10
4 f4 <- fn4(z) 2 3.62 5.569231 3.50 0.09
5 f5 <- fn5(z) 2 13.57 20.876923 13.25 0.25
6 f6 <- fn6(z) 2 3.53 5.430769 3.36 0.14
7 f7 <- fn7(z) 2 3.34 5.138462 3.28 0.03
8 f8 <- fn8(z) 2 3.34 5.138462 3.26 0.03
I would use a combination of ave and unique:
z <- data.frame(id=rep(c(1,1,2,2,3,4),1e5),var=rnorm(6e5))
z$var <- ave(z$var, z$id, FUN=mean)
z <- unique(z)
UPDATE: after actually timing the solution, here's something that's a little faster.
z <- data.frame(id=rep(c(1,1,2,2,3,4),1e5),var=rnorm(6e5))
system.time({
  averages <- t(sapply(split(z, z$id), function(x) sapply(x, mean)))
})
#  user  system elapsed
#  1.32    0.00    1.33
system.time({
  z$var <- ave(z$var, z$id, FUN=mean)
  z <- unique(z)
})
#  user  system elapsed
#  4.33    0.02    4.37
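For completeness, base R's aggregate() expresses the same group-wise mean in one line, though it was not part of the timings above (a sketch):
aggregate(var ~ id, data = z, FUN = mean)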
