Grouping factors or integers into equivalence classes in R

I have a data frame representing equivalences between members from two sets:
print(x)
G S
1 g1 s2
2 g1 s1
3 g2 s3
4 g3 s3
5 g4 s3
Does anyone know of a function or a useful data structure for grouping the objects into equivalence classes? In the example above, the result should be two equivalence classes:
{g1, s1, s2}, {g2, g3, g4, s3}

An option is to use igraph to extract vertices from clusters:
library(igraph)
g <- graph_from_data_frame(x)
m <- clusters(g)$membership
tapply(names(m), m, sort)
output:
$`1`
[1] "g1" "s1" "s2"
$`2`
[1] "g2" "g3" "g4" "s3"
data:
x <- read.table(text="G S
g1 s2
g1 s1
g2 s3
g3 s3
g4 s3", header=TRUE, stringsAsFactors=FALSE)

You can test for equality using outer and combine the two comparisons with | (or). From this matrix take the unique rows, then use apply to return a list of the groups. (Note: a single pass suffices for this example, but classes that are linked only through longer chains may need the merge repeated.)
tt <- outer(x$G, x$G, "==") | outer(x$S, x$S, "==")
tt <- unique(tt)
apply(tt, 1, function(i) unique(unlist(x[i,])))
#[[1]]
#[1] "g1" "s2" "s1"
#
#[[2]]
#[1] "g2" "g3" "g4" "s3"
Another option, which loops over the vectors instead of expanding them with outer, is:
y <- unique(x)
t1 <- tt1 <- y[1,1]
t2 <- tt2 <- y[1,2]
y <- y[-1,]
n <- 1
res <- list(0)
repeat {
i <- y[,1] %in% tt1 | y[,2] %in% tt2
tt <- y[i,]
y <- y[!i,]
tt1 <- unique(tt[!tt[,1] %in% tt1,1])
tt2 <- unique(tt[!tt[,2] %in% tt2,2])
if(length(tt1) + length(tt2) > 0) {
t1 <- c(t1, tt1)
t2 <- c(t2, tt2)
} else {
res[[n]] <- unique(c(t1, t2))
if(nrow(y) == 0) break;
n <- n + 1
t1 <- tt1 <- y[1,1]
t2 <- tt2 <- y[1,2]
y <- y[-1,]
}
}
res
#[[1]]
#[1] "g1" "s2" "s1"
#
#[[2]]
#[1] "g2" "g3" "g4" "s3"
Data:
x <- structure(list(G = c("g1", "g1", "g2", "g3", "g4"), S = c("s2",
"s1", "s3", "s3", "s3")), class = "data.frame", row.names = c(NA, -5L))

You can use the following code for grouping:
# function to categorize incoming `v` within existing `lst`
grp <- function(lst, v) {
if (length(lst) == 0) return(c(lst,list(v)))
idx <- which(unlist(Map(function(x) any(!is.na(match(v,x))), lst)))
if (length(idx) == 0) {
lst <- c(lst,list(v))
} else {
lst[idx] <- list(union(unlist(lst[idx]),v))
}
return(unique(lst))
}
# generate grouping results: fold each row (as a character vector) into the list of classes
df <- unique(df)
res <- Reduce(grp,
c(list(NULL), unname(Map(function(r) as.character(unlist(r)), split(df, seq(nrow(df)))))),
accumulate = FALSE)
Application Examples
given input data df <- data.frame(G = c("g1","g1","g2","g3","g4"), S = c("s2","s1","s3","s3","s3"))
then
> df
G S
1 g1 s2
2 g1 s1
3 g2 s3
4 g3 s3
5 g4 s3
> res
[[1]]
[1] "g1" "s2" "s1"
[[2]]
[1] "g2" "s3" "g3" "g4"
given input data df <- data.frame(G = sprintf("g%i", c(2,3,4,2,2)), S = sprintf("s%i", c(3,3,2,4,3)))
then
> df
G S
1 g2 s3
2 g3 s3
3 g4 s2
4 g2 s4
> res
[[1]]
[1] "g2" "s3" "g3" "s4"
[[2]]
[1] "g4" "s2"
UPDATE: the solution above becomes rather slow when dealing with huge datasets. An improved solution is given below:
G2S <- function(df, g) { # all S values linked to the given G values
df[df$G %in% g,]$S
}
S2G <- function(df, s) { # all G values linked to the given S values
df[df$S %in% s,]$G
}
grpFun <- function(df, g) {
# alternate G -> S -> G expansions until the class stops growing
repeat {
gt <- S2G(df, (s <- G2S(df, g)))
if (length(gt) == length(g)) return(list(G = gt, S = s))
g <- gt
}
}
res <- c()
Gpool <- x$G
repeat {
if (length(Gpool) == 0) break
grp <- grpFun(x, Gpool[1]) # grow the class containing the next unclassified G
Gpool <- setdiff(Gpool, grp$G) # drop the G values that are now classified
res <- c(res, list(union(unique(grp$G), unique(grp$S))))
}
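Applied to the example data x from above, this returns the same two classes:
res
#[[1]]
#[1] "g1" "s2" "s1"
#
#[[2]]
#[1] "g2" "g3" "g4" "s3"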

To compare the runtime of the three answers by @GKi, @chinsoon12, and @ThomasisCoding, I created random sets of different size n and measured the runtime (as "elapsed" from proc.time).
From the results, I conclude that the method relying on igraph's connected-component decomposition is the fastest:
n chinsoon12 ThomasisCoding GKi
500 0.002 0.054 0.030
2500 0.010 0.203 0.416
5000 0.020 0.379 1.456
7500 0.033 0.670 3.351
10000 0.044 0.832 5.837
Edit (2019-11-19): At the request of @GKi, here is the code I used for comparing the runtime of the three algorithms. Beware that all functions work on the global variable x, because R only supports call-by-value, and passing x as an argument would add unwanted overhead to this runtime estimate:
library(igraph)
# solution by chinsoon12: connected-component decomposition from igraph
method.A <- function() {
g <- graph_from_data_frame(x)
m <- clusters(g)$membership
res <- tapply(names(m), m, sort)
return(res)
}
# solution by ThomasisCoding
method.B <- function() {
# find 1-to-1 mapping
r <- Reduce(intersect,lapply(names(x), function(v) split(x,x[v])))
r1map <- unlist(Map(toString,Map(unlist,r)))
# remove 1-to-1 mappings and find N-to-1 mappings
if (length(r1map) >0) {
xx <- x[-as.numeric(rownames(Reduce(rbind,r))),]
} else {
xx <- x
}
rNmap <- c()
if (nrow(xx)> 0) {
rNmap <- sapply(names(xx),
function(v) {
z <- split(xx,xx[v])
u <- z[unlist(Map(nrow,z))>1]
ifelse(length(u)==0, NA, toString(c(names(u),as.vector(u[[1]][,setdiff(names(xx),v)]))))
},USE.NAMES = F)
rNmap <- rNmap[!is.na(rNmap)]
}
# combine both 1-to-1 and n-to-1 mappings
res <- c(r1map,rNmap)
return(res)
}
# solution by GKi: with outer product
method.C <- function() {
tt <- outer(x$G, x$G, "==") | outer(x$S, x$S, "==")
tt <- unique(tt)
res <- apply(tt, 1, function(i) unique(unlist(x[i,])))
return(res)
}
# runtime results
rt <- data.frame()
for (n in seq(500,10000, by=500)) {
# this won't work because of ambiguous node ids (see the answer by GKi):
#x <- data.frame(G = sample(1:n,n,replace=TRUE), S = sample(1:n,n,replace=TRUE))
# therefore, make the node ids unique:
x <- data.frame(G = sprintf("g%i", sample(1:n,n,replace=TRUE)), S = sprintf("s%i", sample(1:n,n,replace=TRUE)))
t1 <- proc.time()
method.A()
tA <- proc.time() - t1
t1 <- proc.time()
method.B()
tB <- proc.time() - t1
t1 <- proc.time()
method.C()
tC <- proc.time() - t1
rt <- rbind(rt, data.frame(n=n, t.A=tA[["elapsed"]], t.B=tB[["elapsed"]], t.C=tC[["elapsed"]]))
}
print(rt)
plot(rt$n, rt$t.C, xlab="n", ylab="run time [s]", ylim=c(min(rt$t.A),max(rt$t.C)), type='l')
lines(rt$n, rt$t.B, col="red")
lines(rt$n, rt$t.A, col="blue")
legend("topleft", c("GKi", "ThomasisCoding", "chinsoon12"), lt=c(1,1,1), col=c("black", "red", "blue"))

Comparison of the results of the methods. With integer data, method.A merges the two classes into one, because a value such as 2 names the same graph vertex whether it comes from G or S; method.A2 avoids the collision by prefixing each value with its column name:
method.A()
#$`1`
#[1] "1" "2" "3" "4"
method.A2()
#$`1`
#[1] "3" "1" "4" "2"
#
#$`2`
#[1] "2" "3"
method.B()
#[[1]]
#[1] 3 1 4 2
#
#[[2]]
#[1] 2 3
method.C()
#[[1]]
#[[1]]$All
#[1] 3 1 4 2
#
#[[1]]$G
#[1] 3 1
#
#[[1]]$S
#[1] 4 2 1
#
#
#[[2]]
#[[2]]$All
#[1] 2 3
#
#[[2]]$G
#[1] 2
#
#[[2]]$S
#[1] 3
Methods:
library(igraph)
method.A <- function() {
g <- graph_from_data_frame(x)
m <- clusters(g)$membership
res <- tapply(names(m), m, sort)
return(res)
}
method.A2 <- function() {
g <- graph_from_data_frame(t(apply(x, 1, function(x) paste0(names(x), x))))
m <- clusters(g)$membership
res <- tapply(substring(names(m),2), m, unique)
return(res)
}
method.B <- function() {
G2S <- function(df,g) {
df[df$G %in% g,]$S
}
S2G <- function(df,s) {
df[df$S %in% s,]$G
}
grpFun <- function(df, g) {
repeat {
gt <- S2G(df, (s<-G2S(df, g)))
if (length(gt) == length(g)) return(list(G = gt, S = s))
g <- gt
}
}
res <- c()
Gpool <- x$G
repeat {
if (length(Gpool)==0) break
grp <- grpFun(x,Gpool[1])
Gpool <- setdiff(Gpool,grp$G)
res <- c(res, list(union(unique(grp$G),unique(grp$S))))
}
return(res)
}
method.C <- function() {
y <- unique(x)
t1 <- tt1 <- y[1,1]
t2 <- tt2 <- y[1,2]
y <- y[-1,]
n <- 1
res <- list(0)
repeat {
i <- y[,1] %in% tt1 | y[,2] %in% tt2
tt <- y[i,]
y <- y[!i,]
tt1 <- unique(tt[!tt[,1] %in% tt1,1])
tt2 <- unique(tt[!tt[,2] %in% tt2,2])
if(length(tt1) + length(tt2) > 0) {
t1 <- c(t1, tt1)
t2 <- c(t2, tt2)
} else {
res[[n]] <- list(All=unique(c(t1, t2)), G=unique(t1), S=unique(t2))
if(nrow(y) == 0) break;
n <- n + 1
t1 <- tt1 <- y[1,1]
t2 <- tt2 <- y[1,2]
y <- y[-1,]
}
}
res
}
Data:
x <- data.frame(G = c(3,1,1,2,3), S=c(4,1,2,3,2))
x
# G S
#1 3 4
#2 1 1
#3 1 2
#4 2 3
#5 3 2

UPDATE: performance comparison based on the latest updates by @GKi, @chinsoon12, and @ThomasisCoding
Code for comparison:
library(igraph)
method.A <- function() {
g <- graph_from_data_frame(x)
m <- clusters(g)$membership
res <- tapply(names(m), m, sort)
return(res)
}
method.B <- function() {
G2S <- function(df,g) {
df[df$G %in% g,]$S
}
S2G <- function(df,s) {
df[df$S %in% s,]$G
}
grpFun <- function(df, g) {
repeat {
gt <- S2G(df, (s<-G2S(df, g)))
if (length(gt) == length(g)) return(list(G = gt, S = s))
g <- gt
}
}
res <- c()
Gpool <- x$G
repeat {
if (length(Gpool)==0) break
grp <- grpFun(x,Gpool[1])
Gpool <- setdiff(Gpool,grp$G)
res <- c(res, list(union(unique(grp$G),unique(grp$S))))
}
return(res)
}
method.C <- function() {
y <- unique(x)
t1 <- tt1 <- y[1,1]
t2 <- tt2 <- y[1,2]
y <- y[-1,]
n <- 1
res <- list(0)
repeat {
i <- y[,1] %in% tt1 | y[,2] %in% tt2
tt <- y[i,]
y <- y[!i,]
tt1 <- unique(tt[!tt[,1] %in% tt1,1])
tt2 <- unique(tt[!tt[,2] %in% tt2,2])
if(length(tt1) + length(tt2) > 0) {
t1 <- c(t1, tt1)
t2 <- c(t2, tt2)
} else {
res[[n]] <- list(All=unique(c(t1, t2)), G=unique(t1), S=unique(t2))
if(nrow(y) == 0) break;
n <- n + 1
t1 <- tt1 <- y[1,1]
t2 <- tt2 <- y[1,2]
y <- y[-1,]
}
}
res
}
# runtime results
rt <- data.frame()
for (n in seq(500,10000, by=500)) {
# this won't work because of ambiguous node ids (see the answer by GKi):
#x <- data.frame(G = sample(1:n,n,replace=TRUE), S = sample(1:n,n,replace=TRUE))
# therefore, make the node ids unique:
x <- data.frame(G = sprintf("g%i", sample(1:n,n,replace=TRUE)), S = sprintf("s%i", sample(1:n,n,replace=TRUE)))
t1 <- proc.time()
method.A()
tA <- proc.time() - t1
t1 <- proc.time()
method.B()
tB <- proc.time() - t1
t1 <- proc.time()
method.C()
tC <- proc.time() - t1
rt <- rbind(rt, data.frame(n=n, t.A=tA[["elapsed"]], t.B=tB[["elapsed"]], t.C=tC[["elapsed"]]))
}
print(rt)
plot(rt$n, rt$t.C, xlab="n", ylab="run time [s]", ylim=c(min(rt$t.A),max(rt$t.C)), type='l')
lines(rt$n, rt$t.B, col="red")
lines(rt$n, rt$t.A, col="blue")
legend("topleft", c("GKi", "ThomasisCoding", "chinsoon12"), lt=c(1,1,1), col=c("black", "red", "blue"))
Runtime of the three methods:
n t.A t.B t.C
1 500 0.00 0.16 0.26
2 1000 0.02 0.31 0.53
3 1500 0.02 0.51 1.11
4 2000 0.03 0.90 1.47
5 2500 0.03 1.35 2.17
6 3000 0.04 2.08 3.14
7 3500 0.04 2.66 3.97
8 4000 0.07 3.38 4.92
9 4500 0.07 4.38 6.35
10 5000 0.06 5.41 7.58
11 5500 0.08 6.79 9.55
12 6000 0.08 7.81 10.91
13 6500 0.10 9.03 12.06
14 7000 0.09 10.06 14.20
15 7500 0.11 11.76 15.65
16 8000 0.13 13.41 17.84
17 8500 0.11 14.87 20.67
18 9000 0.13 16.88 23.52
19 9500 0.14 18.38 25.57
20 10000 0.14 22.81 30.05
(runtime visualization plot omitted)
Additional (thanks to the comment by @GKi): when the dataset is kept as integers, the runtimes of the non-igraph methods are greatly reduced:
n t.A t.B t.C
1 500 0.00 0.09 0.13
2 1000 0.01 0.15 0.23
3 1500 0.01 0.22 0.38
4 2000 0.03 0.31 0.50
5 2500 0.05 0.45 0.76
6 3000 0.07 0.51 0.77
7 3500 0.06 0.67 0.97
8 4000 0.07 0.85 1.20
9 4500 0.07 0.90 1.39
10 5000 0.09 1.23 1.55
11 5500 0.09 1.30 1.78
12 6000 0.09 1.51 1.94
13 6500 0.11 1.77 2.20
14 7000 0.13 2.18 2.55
15 7500 0.12 2.37 2.79
16 8000 0.13 2.56 2.96
17 8500 0.14 2.76 3.39
18 9000 0.15 3.03 3.54
19 9500 0.15 3.54 4.23
20 10000 0.16 3.76 4.32

Related

How to iterate through parameters in a for loop

I have a model written as a for loop that incorporates a number of parameters that I specify:
## functions needed to run the model
learn <- function(prior, sensi, speci, e){
out <- ifelse(e == 1, (sensi*prior) / ((sensi*prior) + (1-speci)*(1-prior)),
((1-sensi)*prior) / (((1-sensi)*prior) + (speci*(1-prior))))
out
}
feed <- function(vec){
prior <- 0.5
for (i in vec){
res <- learn(prior, sensi, speci, i)
prior <- res
}
return(prior)
}
## specify parameters
iterations <- 100
N <- 10
BR <- 0.66
sensi <- 0.75
speci <- 0.45
## initialize results object
res <- NULL
## loop for number of iterations
for (j in 1:iterations){
X <- as.numeric(rbinom(1, 1, BR))
if (X == 1){ # if X is 1...
agents <- c(1:N)
evidence <- vector("list", length(agents))
for (i in agents) {
n <- sample(10, 1, replace = TRUE)
evidence[[i]] <- rbinom(n, 1, sensi)
}
} else { # if X is 0...
agents <- c(1:N)
evidence <- vector("list", length(agents))
for (i in agents) {
n <- sample(10, 1, replace = TRUE)
evidence[[i]] <- rbinom(n, 1, sensi)
evidence[[i]] <- ifelse(evidence[[i]]==1, 0, 1) # flip evidence
}
}
# feed vectors of evidence through learn function
t0 <- sapply(evidence, feed)
# save dataframe
df <- data.frame("i" = j,
"ID" = c(1:N),
"E" = t0,
"X" = X,
"N" = N,
"BR" = BR,
"sensi" = sensi,
"speci" = speci)
res <- rbind(res, df)
}
This works fine for a single parameterisation, but I now want to automate the process of specifying different parameter values and re-running the model. So instead of defining each parameter as a single value, I define them as a vector of values and store all the possible parameterisations in a dataframe (paramspace) with each row holding the values for a single parameterisation that I want to run:
## set up for multiple parameterizations
iterations <- 100
N_vec <- c(10, 50)
BR_vec <- c(0.25, 0.50, 0.75)
sensi_vec <- c(0.45, 0.75)
speci_vec <- c(0.45, 0.75)
paramspace <- expand.grid(iterations = iterations, N = N_vec, BR = BR_vec, sensi = sensi_vec, speci = speci_vec)
> paramspace
iterations N BR sensi speci
1 100 10 0.25 0.45 0.45
2 100 50 0.25 0.45 0.45
3 100 10 0.50 0.45 0.45
4 100 50 0.50 0.45 0.45
5 100 10 0.75 0.45 0.45
6 100 50 0.75 0.45 0.45
7 100 10 0.25 0.75 0.45
8 100 50 0.25 0.75 0.45
9 100 10 0.50 0.75 0.45
10 100 50 0.50 0.75 0.45
11 100 10 0.75 0.75 0.45
12 100 50 0.75 0.75 0.45
13 100 10 0.25 0.45 0.75
14 100 50 0.25 0.45 0.75
15 100 10 0.50 0.45 0.75
16 100 50 0.50 0.45 0.75
17 100 10 0.75 0.45 0.75
18 100 50 0.75 0.45 0.75
19 100 10 0.25 0.75 0.75
20 100 50 0.25 0.75 0.75
21 100 10 0.50 0.75 0.75
22 100 50 0.50 0.75 0.75
23 100 10 0.75 0.75 0.75
24 100 50 0.75 0.75 0.75
How can I pass each row of parameter values to my model and automatically run through all the parameterisations stated in paramspace?
As suggested in the comments, you can create a function and then use apply to loop over the parameter combinations:
## functions needed to run the model
learn <- function(prior, sensi, speci, e){
out <- ifelse(e == 1, (sensi*prior) / ((sensi*prior) + (1-speci)*(1-prior)),
((1-sensi)*prior) / (((1-sensi)*prior) + (speci*(1-prior))))
out
}
feed <- function(vec,sensi,speci){
prior <- 0.5
for (i in vec){
res <- learn(prior, sensi, speci, i)
prior <- res
}
return(prior)
}
runModel <- function(iterations = 100,
N = 10,
BR = 0.66,
sensi = 0.75,
speci = 0.45 ) {
## initialize results object
res <- NULL
## loop for number of iterations
for (j in 1:iterations){
X <- as.numeric(rbinom(1, 1, BR))
if (X == 1){ # if X is 1...
agents <- c(1:N)
evidence <- vector("list", length(agents))
for (i in agents) {
n <- sample(10, 1, replace = TRUE)
evidence[[i]] <- rbinom(n, 1, sensi)
}
} else { # if X is 0...
agents <- c(1:N)
evidence <- vector("list", length(agents))
for (i in agents) {
n <- sample(10, 1, replace = TRUE)
evidence[[i]] <- rbinom(n, 1, sensi)
evidence[[i]] <- ifelse(evidence[[i]]==1, 0, 1) # flip evidence
}
}
# feed vectors of evidence through learn function
#t0 <- sapply(evidence, feed)
t0 <- sapply(evidence,function(e){feed(e,sensi,speci)})
# save dataframe
df <- list("i" = iterations,
"ID" = c(1:N),
"E" = t0,
"X" = X,
"N" = N,
"BR" = BR,
"sensi" = sensi,
"speci" = speci)
res <- rbind(res, df)
}
res
}
# Define parameter space
iterations <- 100
N_vec <- c(10, 50)
BR_vec <- c(0.25, 0.50, 0.75)
sensi_vec <- c(0.45, 0.75)
speci_vec <- c(0.45, 0.75)
paramspace <- expand.grid(iterations = iterations, N = N_vec, BR = BR_vec, sensi = sensi_vec, speci = speci_vec)
# Loop over parameter space :
res <- apply(paramspace,1,function(paramset) {
iterations = paramset[1]
N = paramset[2]
BR = paramset[3]
sensi = paramset[4]
speci = paramset[5]
runModel(iterations = iterations, N = N, BR = BR , sensi = sensi, speci = speci )
})
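A variant that avoids the manual indexing is to loop over row numbers and let do.call match the parameters by name (a sketch; unlike apply, this keeps the columns' original types):
res <- lapply(seq_len(nrow(paramspace)), function(i) {
do.call(runModel, as.list(paramspace[i, ]))
})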
You can also use the foreach package, which, used with an appropriate backend, offers parallelization capabilities in case your task becomes more intensive. Here is a simple example to understand how it works:
foreach(a=1:3, b=4:6) %do% (a + b)
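To actually run in parallel, you would register a backend and switch %do% to %dopar%; a minimal sketch, assuming the doParallel package is installed:
library(foreach)
library(doParallel)
cl <- makeCluster(2) # two worker processes
registerDoParallel(cl)
foreach(a = 1:3, b = 4:6) %dopar% (a + b)
stopCluster(cl)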
Then I tried to embed your code into foreach:
require(foreach)
## functions needed to run the model
learn <- function(prior, sensi, speci, e){
out <- ifelse(e == 1, (sensi*prior) / ((sensi*prior) + (1-speci)*(1-prior)),
((1-sensi)*prior) / (((1-sensi)*prior) + (speci*(1-prior))))
out
}
feed <- function(vec){
prior <- 0.5
for (i in vec){
res <- learn(prior, sensi, speci, i)
prior <- res
}
return(prior)
}
## set up for multiple parameterizations
iterations <- 100
N_vec <- c(10, 50)
BR_vec <- c(0.25, 0.50, 0.75)
sensi_vec <- c(0.45, 0.75)
speci_vec <- c(0.45, 0.75)
paramspace <- expand.grid(iterations = iterations, N = N_vec, BR = BR_vec, sensi = sensi_vec, speci = speci_vec)
res <- foreach(iterations = paramspace$iterations,
N = paramspace$N,
BR = paramspace$BR,
sensi = paramspace$sensi,
speci = paramspace$speci) %do% {
## initialize results object
res <- NULL
## loop for number of iterations
for (j in 1:iterations){
X <- as.numeric(rbinom(1, 1, BR))
if (X == 1){ # if X is 1...
agents <- c(1:N)
evidence <- vector("list", length(agents))
for (i in agents) {
n <- sample(10, 1, replace = TRUE)
evidence[[i]] <- rbinom(n, 1, sensi)
}
} else { # if X is 0...
agents <- c(1:N)
evidence <- vector("list", length(agents))
for (i in agents) {
n <- sample(10, 1, replace = TRUE)
evidence[[i]] <- rbinom(n, 1, sensi)
evidence[[i]] <- ifelse(evidence[[i]]==1, 0, 1) # flip evidence
}
}
# feed vectors of evidence through learn function
t0 <- sapply(evidence, feed)
# save dataframe
df <- data.frame("i" = j,
"ID" = c(1:N),
"E" = t0,
"X" = X,
"N" = N,
"BR" = BR,
"sensi" = sensi,
"speci" = speci)
res <- rbind(res, df)
}
res
}
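The result res is a list with one data frame per parameterisation; if you want a single data frame, something like the following should work:
res_all <- do.call(rbind, res)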
Another approach is to make a function and use Map(...). The advantage of Map is that your paramspace will not be coerced into a matrix, which would make everything the same type (i.e., numeric, character, etc.).
There were also some other changes I made in order to let R do the accounting for us. Primarily:
X is now a logical so we can simplify our if statements. Additionally, the allocation is made all at once instead of looping.
We change the feed() function to also generate the evidence. This allows us to...
Use replicate to repeat the loops.
learn2 <- function(prior, sensi, speci, e){
out <- ifelse(e, (sensi*prior) / ((sensi*prior) + (1-speci)*(1-prior)),
((1-sensi)*prior) / (((1-sensi)*prior) + (speci*(1-prior))))
out
}
feed2 = function(x, N, samp_n = 10L, sensi, speci) {
evidence = rbinom(sample(samp_n, 1L, replace = TRUE),
1,
if (x) sensi else 1 - sensi)
prior = 0.5
for (i in evidence) {
res = learn2(prior, sensi, speci, i)
prior = res
}
return(prior)
}
runModel2 <- function(iterations = 2,
N = 10,
BR = 0.66,
sensi = 0.75,
speci = 0.45 ) {
X = sample(c(TRUE, FALSE), N, replace = TRUE, prob = c(BR, 1 - BR)) # TRUE with probability BR
## this is done now so that the columns will be ordered nicer
ans = list(ID = 1:N,
N = N,
BR = BR,
sensi = sensi,
speci = speci,
X = X)
t0s = replicate(iterations,
vapply(X, feed2, FUN.VALUE = 0, N, 10L, sensi, speci, USE.NAMES = FALSE),
simplify = FALSE)
names(t0s) = paste0("E_", 1:iterations)
return(as.data.frame(c(ans, t0s)))
}
runModel2()
#> ID N BR sensi speci X E_1 E_2
#> 1 1 10 0.66 0.75 0.45 TRUE 0.82967106 0.657648599
#> 2 2 10 0.66 0.75 0.45 FALSE 0.43103448 0.006827641
#> 3 3 10 0.66 0.75 0.45 TRUE 0.43103448 0.775671866
#> 4 4 10 0.66 0.75 0.45 TRUE 0.71716957 0.431034483
#> 5 5 10 0.66 0.75 0.45 FALSE 0.24176079 0.016593958
#> 6 6 10 0.66 0.75 0.45 FALSE 0.30303324 0.008992838
#> 7 7 10 0.66 0.75 0.45 TRUE 0.82967106 0.865405260
#> 8 8 10 0.66 0.75 0.45 FALSE 0.43103448 0.439027817
#> 9 9 10 0.66 0.75 0.45 FALSE 0.57692308 0.050262167
#> 10 10 10 0.66 0.75 0.45 FALSE 0.02178833 0.296208531
This output is a little wider than your original approach. We can always reshape the E_# columns but this may end up being better for your actual use case.
Finally, here is Map() in action:
iterations <- 100
N_vec <- c(10, 50)
BR_vec <- c(0.25, 0.50, 0.75)
sensi_vec <- c(0.45, 0.75)
speci_vec <- c(0.45, 0.75)
paramspace <- expand.grid(iterations = iterations, N = N_vec, BR = BR_vec, sensi = sensi_vec, speci = speci_vec)
res = Map(runModel2, paramspace$iterations, paramspace$N, paramspace$BR, paramspace$sensi, paramspace$speci)
res[[24L]][1:10, 1:8] ## only first 10 rows for demonstration
## ID N BR sensi speci X E_1 E_2
##1 1 50 0.75 0.75 0.75 TRUE 0.500000000 0.500000000
##2 2 50 0.75 0.75 0.75 FALSE 0.001369863 0.035714286
##3 3 50 0.75 0.75 0.75 FALSE 0.250000000 0.900000000
##4 4 50 0.75 0.75 0.75 TRUE 0.750000000 0.250000000
##5 5 50 0.75 0.75 0.75 TRUE 0.987804878 0.500000000
##6 6 50 0.75 0.75 0.75 TRUE 0.964285714 0.250000000
##7 7 50 0.75 0.75 0.75 TRUE 0.750000000 0.750000000
##8 8 50 0.75 0.75 0.75 FALSE 0.012195122 0.035714286
##9 9 50 0.75 0.75 0.75 TRUE 0.750000000 0.500000000
##10 10 50 0.75 0.75 0.75 FALSE 0.250000000 0.001369863

R: cbind function in for loop

c <- readline(prompt="Enter an integer: ")
b <- readline(prompt="Enter an integer: ")
for(i in 1:c){
assign(paste("a", i, sep = ""), i)
}
This gives a1, a2 ... ac variables containing 1,2 ... c
How can I use cbind based on the value of b? For example, take the following:
# assume b = 3 and c = 12:
t1 <- cbind(a1,a2,a3)
t2 <- cbind(a4,a5,a6)
t3 <- cbind(a7,a8,a9)
t4 <- cbind(a10,a11,a12)
# assume b = 4 and c = 12:
t1 <- cbind(a1,a2,a3,a4)
t2 <- cbind(a5,a6,a7,a8)
t3 <- cbind(a9,a10,a11,a12)
Another example to clarify: assume b = 3, c=6
a1 <- c(3,5,2)
a2 <- c(4,7,3)
a3 <- c(3,5,2)
a4 <- c(4,5,3)
a5 <- c(5,5,5)
a6 <- c(4,3,1)
t1 <- cbind(a1,a2,a3)
t2 <- cbind(a4,a5,a6)
Expected value of t1:
3 4 3
5 7 5
2 3 2
I am making some assumptions about your data. I am assuming you have values assigned to the columns you are trying to cbind.
a <- 12
b <- 3
test <- NULL
index <- NULL
for(i in 1:a){
test[i] <- paste0("n_", i)
index[i] <- paste(i)
}
start <- seq(1,a-b+1, by=b)
end <- seq(b,a, by=b)
s = list()
k=1
for(k in 1:length(start)){
cbind_list <- start[k]:end[k]
s[[k]] <- rbind(test[seq(cbind_list[1],cbind_list[length(cbind_list)],by=1)])
}
list_cols <- do.call(rbind, s)
n_1 <- rep(1,4)
n_2 <- rep(2,4)
n_3 <- rep(3,4)
n_4 <- rep(4,4)
n_5 <- rep(5,4)
n_6 <- rep(6,4)
n_7 <- rep(7,4)
n_8 <- rep(8,4)
n_9 <- rep(9,4)
n_10 <- rep(10,4)
n_11 <- rep(11,4)
n_12 <- rep(12,4)
df <- data.frame(n_1,n_2,n_3,n_4,n_5,n_6,n_7,n_8,n_9,n_10,n_11,n_12)
t=list()
for(p in 1:nrow(list_cols)){
nam <- paste0("t",p)
assign(nam,cbind(df[,match(list_cols[p,], colnames(df))]))
}
OUTPUT:
> t1
n_1 n_2 n_3
1 1 2 3
2 1 2 3
3 1 2 3
4 1 2 3
UPDATED:
a <- 6
b <- 3
test <- NULL
index <- NULL
for(i in 1:a){
test[i] <- paste0("n_", i)
index[i] <- paste(i)
}
start <- seq(1,a-b+1, by=b)
end <- seq(b,a, by=b)
s = list()
k=1
for(k in 1:length(start)){
cbind_list <- start[k]:end[k]
s[[k]] <- rbind(test[seq(cbind_list[1],cbind_list[length(cbind_list)],by=1)])
}
list_cols <- do.call(rbind, s)
n_1 <- c(3,5,2)
n_2 <- c(4,7,3)
n_3 <- c(3,5,2)
n_4 <- c(4,5,3)
n_5 <- c(5,5,5)
n_6 <- c(4,3,1)
df <- data.frame(n_1,n_2,n_3,n_4,n_5,n_6)
t=list()
p=1
for(p in 1:nrow(list_cols)){
nam <- paste0("t",p)
assign(nam,cbind(df[,match(list_cols[p,], colnames(df))]))
}
OUTPUT:
> t1
n_1 n_2 n_3
1 3 4 3
2 5 7 5
3 2 3 2
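For reference, a sketch of a more idiomatic approach that avoids assign() entirely: collect the vectors in a list (here via mget(), assuming the a1 ... a6 objects from the example above exist), build a grouping index, and cbind each chunk:
b <- 3
a_list <- mget(paste0("a", 1:6)) # gather a1 ... a6 into a list
grp_idx <- ceiling(seq_along(a_list) / b) # 1 1 1 2 2 2
t_list <- lapply(split(a_list, grp_idx), function(chunk) do.call(cbind, chunk))
t_list[[1]] # corresponds to t1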

R: getting rid of a for loop and speeding up code

I would like to speed up my calculations and obtain results without using the loop in function m. Reproducible example:
N <- 2500
n <- 500
r <- replicate(1000, sample(N, n))
m <- function(r, N) {
ic <- matrix(0, nrow = N, ncol = N)
for (i in 1:ncol(r)) {
p <- r[, i]
ic[p, p] <- ic[p, p] + 1
}
ic
}
system.time(ic <- m(r, N))
# user system elapsed
# 6.25 0.51 6.76
isSymmetric(ic)
# [1] TRUE
In every iteration of the for loop we are dealing with a matrix, not a vector, so how can this be vectorized?
@joel.wilson The purpose of this function is to calculate pairwise frequencies of elements, so afterwards we could estimate pairwise inclusion probabilities.
Thanks to @Khashaa and @alexis_laz. Benchmarks:
> require(rbenchmark)
> benchmark(m(r, N),
+ m1(r, N),
+ mvec(r, N),
+ alexis(r, N),
+ replications = 10, order = "elapsed")
test replications elapsed relative user.self sys.self user.child sys.child
4 alexis(r, N) 10 4.73 1.000 4.63 0.11 NA NA
3 mvec(r, N) 10 5.36 1.133 5.18 0.18 NA NA
2 m1(r, N) 10 5.48 1.159 5.29 0.19 NA NA
1 m(r, N) 10 61.41 12.983 60.43 0.90 NA NA
This should be significantly faster, as it avoids repeated assignment through double indexing:
m1 <- function(r, N) {
ic <- matrix(0, nrow = N, ncol=ncol(r))
for (i in 1:ncol(r)) {
p <- r[, i]
ic[, i][p] <- 1
}
tcrossprod(ic)
}
system.time(ic1 <- m1(r, N))
# user system elapsed
# 0.53 0.01 0.55
all.equal(ic, ic1)
# [1] TRUE
Simple "counting/adding" operations can almost always be vectorized
mvec <- function(r, N) {
ic <- matrix(0, nrow = N, ncol=ncol(r))
i <- rep(1:ncol(r), each=nrow(r))
ic[cbind(as.vector(r), i)] <- 1
tcrossprod(ic)
}
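Both rewrites exploit the same identity: build the N x k incidence matrix ic (ic[j, i] = 1 if unit j appears in sample i), so that tcrossprod(ic) = ic %*% t(ic) counts, for every pair of units, how many samples contain both. A quick sanity check:
all.equal(ic, mvec(r, N))
# [1] TRUE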

Faster way of converting a variable length list into a data frame in R

I have a variable-length list which looks like this:
chr [1:249] "1" "29.12" "2" "20.78" "3" "12.09" ...
chr [1:200] "1" "20.45" "3" "10.56" "4" "12.34" ...
chr [1:213] "2" "12.20" "3" "19.93" "5" "23.05" ...
The values in odd positions ("1", "3", "4", etc.) identify variables with a specific meaning, while the values in even positions are the values of the variable named immediately before them. E.g. in the second element of the list, the variable "3" has the value "10.56".
I'm trying to convert this into a data frame with values like "10.56" going into the correct column of the data frame i.e. column "3". This is the code I am using
e <- unlist(d[[k]]) ## d is my list. k is the index for a for loop
pos_index <- seq(1, length(e), 2) ## gives positions for the variables
val_index <- seq(2, length(e), 2) ## gives positions for corresponding values
df_index <- as.numeric(e[pos_index])
## Populate a pre-defined data frame at calculated positions
CNNIBN_DF[k, df_index] <- as.numeric(e[val_index])
The data frame should look something like this
X1 X2 X3 X4 X5
1 29.12 20.78 12.09 NA NA
2 20.45 NA 10.56 12.34 NA
3 NA 12.20 19.93 NA 23.05
This works but takes a long time. system.time for 1000 entities gives this
user system elapsed
57.64 0.06 58.14
The list itself has 33k entities with each entity having 200+ elements. I have tried the same operation using just for loops but both tend to take about the same time.
Is there a faster way to do this? I'm using a win32 machine with 4GB of RAM running an Intel Core i3 M350 CPU @ 2.27 GHz.
Thanks in advance!
akrun has already posted some of the many possible alternatives; I'll just add a more explicit approach that does as little work as possible (using akrun's "lst"):
ulst = unlist(lst)
cols = seq(1, length(ulst), 2)
inds = cbind(row = rep(seq_along(lst), lengths(lst) %/% 2),
col = as.integer(ulst[cols]))
vals = as.numeric(ulst[-cols])
ans = matrix(, max(inds[, "row"]), max(inds[, "col"]))
ans[inds] = vals
# [,1] [,2] [,3] [,4] [,5]
#[1,] 29.12 20.78 12.09 NA NA
#[2,] 20.45 NA 10.56 12.34 NA
#[3,] NA 12.20 19.93 NA 23.05
From your goal, it seems that you shouldn't necessarily need a "data.frame", but the "matrix" is easily converted to one. Also, it might be worth looking into whether you can change how your data is built/fetched in order to avoid this awkward format.
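For the record, the conversion is a one-liner (a sketch):
ans_df <- setNames(as.data.frame(ans), paste0("X", seq_len(ncol(ans))))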
Try
lst1 <- lapply(lst, function(x) { x<- as.numeric(x)
indx <- c(TRUE, FALSE)
v1 <- tabulate(x[indx])
is.na(v1) <- v1==0
v1[!is.na(v1)] <- x[!indx]
v1 })
setNames(do.call(rbind.data.frame,lapply(lst1, `length<-`,
max(lengths(lst1)))), paste0('X', 1:5))
# X1 X2 X3 X4 X5
#1 29.12 20.78 12.09 NA NA
#2 20.45 NA 10.56 12.34 NA
#3 NA 12.20 19.93 NA 23.05
Or
m1 <- do.call(rbind,Map(function(x,y) cbind(x,matrix(as.numeric(y),
nrow=length(y)/2, byrow=TRUE)), seq_along(lst), lst))
m2 <- matrix(NA, ncol=max(m1[,2]), nrow=length(lst))
m2[m1[,-3]] <- m1[,3]
We can use sparseMatrix from Matrix
library(Matrix)
d1 <- setNames(as.data.frame(m1), c('Row', 'Col', 'Value'))
with(d1, sparseMatrix(Row, Col, x=Value))
#3 x 5 sparse Matrix of class "dgCMatrix"
#[1,] 29.12 20.78 12.09 . .
#[2,] 20.45 . 10.56 12.34 .
#[3,] . 12.20 19.93 . 23.05
which can be converted to matrix by as.matrix.
Or
library(tidyr)
library(dplyr)
d1 <- unnest(lst, group)
d2 <- bind_cols(slice(d1, seq(1, n(), by=2)), slice(d1, seq(2, n(), by=2))[2])
colnames(d2)[3] <- 'val'
spread(d2, x, val) %>%
select(-group)
# 1 2 3 4 5
#1 29.12 20.78 12.09 <NA> <NA>
#2 20.45 <NA> 10.56 12.34 <NA>
#3 <NA> 12.20 19.93 <NA> 23.05
Or
library(data.table)#v1.9.5+
library(reshape2)
dcast(setDT(melt(lst))[, list(indx= value[c(TRUE, FALSE)],
value=value[c(FALSE, TRUE)]) ,L1], L1~paste0('X', indx), value.var='value')
# L1 X1 X2 X3 X4 X5
#1: 1 29.12 20.78 12.09 NA NA
#2: 2 20.45 NA 10.56 12.34 NA
#3: 3 NA 12.20 19.93 NA 23.05
Benchmarks
For a 1000 entities list,
set.seed(42)
lst <- lapply(1:1000, function(i) {v1 <- sample(50:200)[1L]
v2 <- sample(1:200, v1, replace=FALSE)
as.character(c(rbind(v2, rnorm(v1))))})
system.time({
m1 <- do.call(rbind,Map(function(x,y) cbind(x,matrix(as.numeric(y),
nrow=length(y)/2, byrow=TRUE)), seq_along(lst), lst))
m2 <- matrix(NA, ncol=max(m1[,2]), nrow=length(lst))
m2[m1[,-3]] <- m1[,3]
})
# user system elapsed
# 0.064 0.004 0.067
system.time({
m1 <- do.call(rbind,Map(function(x,y) cbind(x,matrix(as.numeric(y),
nrow=length(y)/2, byrow=TRUE)), seq_along(lst), lst))
d1 <- setNames(as.data.frame(m1), c('Row', 'Col', 'Value'))
with(d1, sparseMatrix(Row, Col, x=Value))
})
# user system elapsed
# 0.068 0.003 0.070
system.time({d1 <- unnest(lst, group)
d2 <- bind_cols(slice(d1, seq(1, n(), by=2)),
slice(d1, seq(2, n(), by=2))[2])
colnames(d2)[3] <- 'val'
res <- spread(d2, x, val) %>%
select(-group)})
# user system elapsed
# 0.259 0.002 0.261
Using the first method is noticeably slower:
system.time({
lst1 <- lapply(lst, function(x) { x<- as.numeric(x)
indx <- c(TRUE, FALSE)
v1 <- tabulate(x[indx])
is.na(v1) <- v1==0
v1[!is.na(v1)] <- x[!indx]
v1 })
setNames(do.call(rbind.data.frame,lapply(lst1, `length<-`,
max(lengths(lst1)))), paste0('X', 1:5))
})
# user system elapsed
#1.459 0.004 1.463
On a 33000 list
set.seed(42)
lst <- lapply(1:33000, function(i) {v1 <- sample(50:200)[1L]
v2 <- sample(1:200, v1, replace=FALSE)
as.character(c(rbind(v2, rnorm(v1))))})
system.time({
m1 <- do.call(rbind,Map(function(x,y) cbind(x,matrix(as.numeric(y),
nrow=length(y)/2, byrow=TRUE)), seq_along(lst), lst))
m2 <- matrix(NA, ncol=max(m1[,2]), nrow=length(lst))
m2[m1[,-3]] <- m1[,3]
})
# user system elapsed
# 6.160 0.102 6.260
# @alexis_laz's method is faster
system.time({
ulst = unlist(lst)
cols = seq(1, length(ulst), 2)
inds = cbind(row = rep(seq_along(lst), lengths(lst) %/% 2),
col = as.integer(ulst[cols]))
vals = as.numeric(ulst[-cols])
ans = matrix(, max(inds[, "row"]), max(inds[, "col"]))
ans[inds] = vals
})
# user system elapsed
# 2.421 0.041 2.460
Data:
lst <- list(c('1', '29.12', '2', '20.78', '3', '12.09'), c('1', '20.45',
'3', '10.56', '4', '12.34'), c('2', '12.20', '3', '19.93', '5', '23.05'))

Finding the mean of all Duplicates

There is a nice explanation here describing how to eliminate duplicates in a data frame by picking the maximum variable.
I can also see how this can be applied to pick the duplicate with the minimum variable.
My question now is: how do I display the mean of all duplicates?
for example:
z <- data.frame(id=c(1,1,2,2,3,4),var=c(2,4,1,3,5,2))
# id var
# 1 2
# 1 4
# 2 1
# 2 3
# 3 5
# 4 2
I would like the output:
# id var
# 1 3 mean(2,4)
# 2 2 mean(1,3)
# 3 5
# 4 2
My current code is:
averages<-do.call(rbind,lapply(split(z,z$id),function(chunk) mean(chunk$var)))
z<-z[order(z$id),]
z<-z[!duplicated(z$id),]
z$var<-averages
My code runs very slowly and takes about 10 times longer than the method for picking the maximum. How do I optimize this code?
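For reference, base R's aggregate() produces the desired summary in one line (typically slower than the data.table solution below, but much simpler than the split/rbind approach above):
aggregate(var ~ id, data = z, FUN = mean)
# id var
#1 1 3
#2 2 2
#3 3 5
#4 4 2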
Here is a faster solution using data.table
library(data.table)
z <- data.frame(id=sample(letters, 6e5, replace = TRUE),var = rnorm(6e5))
fn1 <- function(z){
z$var <- ave(z$var, z$id, FUN=mean)
return(unique(z))
}
fn2 <- function(z) {
t(sapply(split(z,z$id), function(x) sapply(x,mean)))
}
fn3 <- function(z){
data.table(z)[,list(var = mean(var)), 'id']
}
library(rbenchmark)
benchmark(f1 <- fn1(z), f2 <- fn2(z), f3 <- fn3(z), replications = 2)
test replications elapsed relative user.self sys.self
1 f1 <- fn1(z) 2 3.619 8.455607 3.331 0.242
2 f2 <- fn2(z) 2 0.586 1.369159 0.365 0.220
3 f3 <- fn3(z) 2 0.428 1.000000 0.341 0.086
I think split() and unsplit() are one way.
dupMean <- function(x)
{
result <- split(x[, 2], x[, 1])
result <- lapply(result, mean)
result <- unsplit(result, unique(x[, 1]))
return(result)
}
Or, to save a line with plyr:
require(plyr)
dupMean <- function(x)
{
result <- split(x[, 2], x[, 1])
result <- laply(result, mean)
return(result)
}
Update:
Just out of curiosity, here is a comparison of the various functions suggested. Ramnath's fn3 looks to be the winner on my computer.
require(plyr)
require(data.table)
require(rbenchmark)
fn1 <- function(z){
z$var <- ave(z$var, z$id, FUN=mean)
return(unique(z))
}
fn2 <- function(z) {
t(sapply(split(z,z$id), function(x) sapply(x,mean)))
}
fn3 <- function(z){
data.table(z)[,list(var = mean(var)), 'id']
}
fn4 <- function(x)
{
result <- t(sapply(split(x,x$id), function(y) sapply(y,mean)))
return(result)
}
fn5 <- function(x)
{
x$var <- ave(x$var, x$id, FUN=mean)
x <- unique(x)
return(x)
}
fn6 <- function(x)
{
result <- do.call(rbind,lapply(split(x,x$id),function(chunk) mean(chunk$var)))
return(data.frame(id = unique(x[, 1]), var = result))
}
fn7 <- function(x)
{
result <- split(x[, 2], x[, 1])
result <- lapply(result, mean)
result <- unsplit(result, unique(x[, 1]))
return(data.frame(id = unique(x[, 1]), var = result))
}
fn8 <- function(x)
{
result <- split(x[, 2], x[, 1])
result <- laply(result, mean)
return(data.frame(id = unique(x[, 1]), var = result))
}
z <- data.frame(id = rep(c(1,1,2,2,3,4,5,6,6,7), 1e5), var = rnorm(1e6))
benchmark(f1 <- fn1(z), f2 <- fn2(z), f3 <- fn3(z), f4 <- fn4(z), f5 <- fn5(z), f6 <- fn6(z), f7 <- fn7(z), f8 <- fn8(z), replications = 2)
Result:
test replications elapsed relative user.self sys.self
1 f1 <- fn1(z) 2 13.45 20.692308 13.27 0.15
2 f2 <- fn2(z) 2 3.54 5.446154 3.43 0.09
3 f3 <- fn3(z) 2 0.65 1.000000 0.54 0.10
4 f4 <- fn4(z) 2 3.62 5.569231 3.50 0.09
5 f5 <- fn5(z) 2 13.57 20.876923 13.25 0.25
6 f6 <- fn6(z) 2 3.53 5.430769 3.36 0.14
7 f7 <- fn7(z) 2 3.34 5.138462 3.28 0.03
8 f8 <- fn8(z) 2 3.34 5.138462 3.26 0.03
I would use a combination of ave and unique:
z <- data.frame(id=rep(c(1,1,2,2,3,4),1e5),var=rnorm(6e5))
z$var <- ave(z$var, z$id, FUN=mean)
z <- unique(z)
UPDATE: after actually timing the solution, here's something that's a little faster.
z <- data.frame(id=rep(c(1,1,2,2,3,4),1e5),var=rnorm(6e5))
system.time({
averages <- t(sapply(split(z,z$id), function(x) sapply(x,mean)))
})
# user system elapsed
# 1.32 0.00 1.33
system.time({
z$var <- ave(z$var, z$id, FUN=mean)
z <- unique(z)
})
# user system elapsed
# 4.33 0.02 4.37
