How to iterate through parameters in a for loop in R

I have a model written as a for loop that incorporates a number of parameters that I specify:
## functions needed to run the model
learn <- function(prior, sensi, speci, e){
  out <- ifelse(e == 1,
                (sensi*prior) / ((sensi*prior) + (1-speci)*(1-prior)),
                ((1-sensi)*prior) / (((1-sensi)*prior) + (speci*(1-prior))))
  out
}
feed <- function(vec){
  prior <- 0.5
  for (i in vec){
    res <- learn(prior, sensi, speci, i)
    prior <- res
  }
  return(prior)
}
## specify parameters
iterations <- 100
N <- 10
BR <- 0.66
sensi <- 0.75
speci <- 0.45
## initialize results object
res <- NULL
## loop for number of iterations
for (j in 1:iterations){
  X <- as.numeric(rbinom(1, 1, BR))
  if (X == 1){ # if X is 1...
    agents <- c(1:N)
    evidence <- vector("list", length(agents))
    for (i in agents) {
      n <- sample(10, 1, replace = TRUE)
      evidence[[i]] <- rbinom(n, 1, sensi)
    }
  } else { # if X is 0...
    agents <- c(1:N)
    evidence <- vector("list", length(agents))
    for (i in agents) {
      n <- sample(10, 1, replace = TRUE)
      evidence[[i]] <- rbinom(n, 1, sensi)
      evidence[[i]] <- ifelse(evidence[[i]]==1, 0, 1) # flip evidence
    }
  }
  # feed vectors of evidence through learn function
  t0 <- sapply(evidence, feed)
  # save dataframe
  df <- data.frame("i" = j,
                   "ID" = c(1:N),
                   "E" = t0,
                   "X" = X,
                   "N" = N,
                   "BR" = BR,
                   "sensi" = sensi,
                   "speci" = speci)
  res <- rbind(res, df)
}
This works fine for a single parameterisation, but I now want to automate the process of specifying different parameter values and re-running the model. So instead of defining each parameter as a single value, I define each as a vector of values and store all possible parameterisations in a dataframe (paramspace), with each row holding the values for one parameterisation that I want to run:
## set up for multiple parameterizations
iterations <- 100
N_vec <- c(10, 50)
BR_vec <- c(0.25, 0.50, 0.75)
sensi_vec <- c(0.45, 0.75)
speci_vec <- c(0.45, 0.75)
paramspace <- expand.grid(iterations = iterations, N = N_vec, BR = BR_vec, sensi = sensi_vec, speci = speci_vec)
> paramspace
iterations N BR sensi speci
1 100 10 0.25 0.45 0.45
2 100 50 0.25 0.45 0.45
3 100 10 0.50 0.45 0.45
4 100 50 0.50 0.45 0.45
5 100 10 0.75 0.45 0.45
6 100 50 0.75 0.45 0.45
7 100 10 0.25 0.75 0.45
8 100 50 0.25 0.75 0.45
9 100 10 0.50 0.75 0.45
10 100 50 0.50 0.75 0.45
11 100 10 0.75 0.75 0.45
12 100 50 0.75 0.75 0.45
13 100 10 0.25 0.45 0.75
14 100 50 0.25 0.45 0.75
15 100 10 0.50 0.45 0.75
16 100 50 0.50 0.45 0.75
17 100 10 0.75 0.45 0.75
18 100 50 0.75 0.45 0.75
19 100 10 0.25 0.75 0.75
20 100 50 0.25 0.75 0.75
21 100 10 0.50 0.75 0.75
22 100 50 0.50 0.75 0.75
23 100 10 0.75 0.75 0.75
24 100 50 0.75 0.75 0.75
How can I pass each row of parameter values to my model and automatically run through all the parameterisations stated in paramspace?

As suggested in the comments, you can wrap the model in a function and then use apply to loop over the parameter combinations. Note that feed now takes sensi and speci as arguments instead of reading them from the global environment:
## functions needed to run the model
learn <- function(prior, sensi, speci, e){
  out <- ifelse(e == 1,
                (sensi*prior) / ((sensi*prior) + (1-speci)*(1-prior)),
                ((1-sensi)*prior) / (((1-sensi)*prior) + (speci*(1-prior))))
  out
}
feed <- function(vec, sensi, speci){
  prior <- 0.5
  for (i in vec){
    res <- learn(prior, sensi, speci, i)
    prior <- res
  }
  return(prior)
}
runModel <- function(iterations = 100,
                     N = 10,
                     BR = 0.66,
                     sensi = 0.75,
                     speci = 0.45) {
  ## initialize results object
  res <- NULL
  ## loop for number of iterations
  for (j in 1:iterations){
    X <- as.numeric(rbinom(1, 1, BR))
    if (X == 1){ # if X is 1...
      agents <- c(1:N)
      evidence <- vector("list", length(agents))
      for (i in agents) {
        n <- sample(10, 1, replace = TRUE)
        evidence[[i]] <- rbinom(n, 1, sensi)
      }
    } else { # if X is 0...
      agents <- c(1:N)
      evidence <- vector("list", length(agents))
      for (i in agents) {
        n <- sample(10, 1, replace = TRUE)
        evidence[[i]] <- rbinom(n, 1, sensi)
        evidence[[i]] <- ifelse(evidence[[i]]==1, 0, 1) # flip evidence
      }
    }
    # feed vectors of evidence through learn function,
    # passing sensi and speci explicitly
    t0 <- sapply(evidence, function(e){ feed(e, sensi, speci) })
    # save dataframe
    df <- data.frame("i" = j,
                     "ID" = c(1:N),
                     "E" = t0,
                     "X" = X,
                     "N" = N,
                     "BR" = BR,
                     "sensi" = sensi,
                     "speci" = speci)
    res <- rbind(res, df)
  }
  res
}
# Define parameter space
iterations <- 100
N_vec <- c(10, 50)
BR_vec <- c(0.25, 0.50, 0.75)
sensi_vec <- c(0.45, 0.75)
speci_vec <- c(0.45, 0.75)
paramspace <- expand.grid(iterations = iterations, N = N_vec, BR = BR_vec, sensi = sensi_vec, speci = speci_vec)
# Loop over parameter space :
res <- apply(paramspace, 1, function(paramset) {
  iterations <- paramset[1]
  N     <- paramset[2]
  BR    <- paramset[3]
  sensi <- paramset[4]
  speci <- paramset[5]
  runModel(iterations = iterations, N = N, BR = BR, sensi = sensi, speci = speci)
})
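Note that apply() coerces the data frame to a matrix, so with mixed column types every value would become character. A sketch of an alternative that avoids the coercion by indexing the rows of paramspace directly (same runModel() as above):
## iterate over row indices so each column keeps its own type
res <- lapply(seq_len(nrow(paramspace)), function(i) {
  runModel(iterations = paramspace$iterations[i],
           N     = paramspace$N[i],
           BR    = paramspace$BR[i],
           sensi = paramspace$sensi[i],
           speci = paramspace$speci[i])
})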

You can also use the foreach package, which, used with an appropriate backend, offers parallelization capabilities in case your task becomes more intensive. Here is a simple example to show how it works.
foreach(a=1:3, b=4:6) %do% (a + b)
Then I tried to embed your code into foreach (passing sensi and speci to feed explicitly, since the loop variables are not visible from feed's defining environment):
require(foreach)
## functions needed to run the model
learn <- function(prior, sensi, speci, e){
  out <- ifelse(e == 1,
                (sensi*prior) / ((sensi*prior) + (1-speci)*(1-prior)),
                ((1-sensi)*prior) / (((1-sensi)*prior) + (speci*(1-prior))))
  out
}
feed <- function(vec, sensi, speci){
  prior <- 0.5
  for (i in vec){
    res <- learn(prior, sensi, speci, i)
    prior <- res
  }
  return(prior)
}
## set up for multiple parameterizations
iterations <- 100
N_vec <- c(10, 50)
BR_vec <- c(0.25, 0.50, 0.75)
sensi_vec <- c(0.45, 0.75)
speci_vec <- c(0.45, 0.75)
paramspace <- expand.grid(iterations = iterations, N = N_vec, BR = BR_vec, sensi = sensi_vec, speci = speci_vec)
res <- foreach(iterations = paramspace$iterations,
               N = paramspace$N,
               BR = paramspace$BR,
               sensi = paramspace$sensi,
               speci = paramspace$speci) %do% {
  ## initialize results object
  res <- NULL
  ## loop for number of iterations
  for (j in 1:iterations){
    X <- as.numeric(rbinom(1, 1, BR))
    if (X == 1){ # if X is 1...
      agents <- c(1:N)
      evidence <- vector("list", length(agents))
      for (i in agents) {
        n <- sample(10, 1, replace = TRUE)
        evidence[[i]] <- rbinom(n, 1, sensi)
      }
    } else { # if X is 0...
      agents <- c(1:N)
      evidence <- vector("list", length(agents))
      for (i in agents) {
        n <- sample(10, 1, replace = TRUE)
        evidence[[i]] <- rbinom(n, 1, sensi)
        evidence[[i]] <- ifelse(evidence[[i]]==1, 0, 1) # flip evidence
      }
    }
    # feed vectors of evidence through learn function
    t0 <- sapply(evidence, feed, sensi, speci)
    # save dataframe
    df <- data.frame("i" = j,
                     "ID" = c(1:N),
                     "E" = t0,
                     "X" = X,
                     "N" = N,
                     "BR" = BR,
                     "sensi" = sensi,
                     "speci" = speci)
    res <- rbind(res, df)
  }
  res
}
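If the task grows, the same loop can be run in parallel by registering a backend and replacing %do% with %dopar%; a minimal sketch with the doParallel package, shown on the small a + b example from above:
## minimal sketch: the same foreach pattern run in parallel via doParallel
library(foreach)
library(doParallel)
cl <- makeCluster(2)   # two worker processes
registerDoParallel(cl)
res <- foreach(a = 1:3, b = 4:6, .combine = c) %dopar% (a + b)
stopCluster(cl)
res
#> [1] 5 7 9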

Another approach is to make a function and to use Map(...). The advantage of Map is that your paramspace will not be coerced into a matrix, which would force every value to the same type (i.e., numeric, character, etc.).
There were also some other changes made in order to let R do the accounting for us. Primarily:
X is now a logical, so we can simplify our if statements. Additionally, the allocation is made all at once instead of in a loop.
We change the feed() function to also generate the evidence. This allows us to...
...use replicate to repeat the loops.
learn2 <- function(prior, sensi, speci, e){
  out <- ifelse(e,
                (sensi*prior) / ((sensi*prior) + (1-speci)*(1-prior)),
                ((1-sensi)*prior) / (((1-sensi)*prior) + (speci*(1-prior))))
  out
}
feed2 = function(x, N, samp_n = 10L, sensi, speci) {
  evidence = rbinom(sample(samp_n, 1L, replace = TRUE),
                    1,
                    if (x) sensi else 1 - sensi)
  prior = 0.5
  for (i in evidence) {
    res = learn2(prior, sensi, speci, i)
    prior = res
  }
  return(prior)
}
runModel2 <- function(iterations = 2,
                      N = 10,
                      BR = 0.66,
                      sensi = 0.75,
                      speci = 0.45) {
  ## each agent is TRUE with probability BR
  ## (sample()'s third positional argument is `replace`, so the
  ## probability has to be supplied via `prob`)
  X = sample(c(TRUE, FALSE), N, replace = TRUE, prob = c(BR, 1 - BR))
  ## this is done now so that the columns will be ordered nicer
  ans = list(ID = 1:N,
             N = N,
             BR = BR,
             sensi = sensi,
             speci = speci,
             X = X)
  t0s = replicate(iterations,
                  vapply(X, feed2, FUN.VALUE = 0, N, 10L, sensi, speci, USE.NAMES = FALSE),
                  simplify = FALSE)
  names(t0s) = paste0("E_", 1:iterations)
  return(as.data.frame(c(ans, t0s)))
}
runModel2()
#> ID N BR sensi speci X E_1 E_2
#> 1 1 10 0.66 0.75 0.45 TRUE 0.82967106 0.657648599
#> 2 2 10 0.66 0.75 0.45 FALSE 0.43103448 0.006827641
#> 3 3 10 0.66 0.75 0.45 TRUE 0.43103448 0.775671866
#> 4 4 10 0.66 0.75 0.45 TRUE 0.71716957 0.431034483
#> 5 5 10 0.66 0.75 0.45 FALSE 0.24176079 0.016593958
#> 6 6 10 0.66 0.75 0.45 FALSE 0.30303324 0.008992838
#> 7 7 10 0.66 0.75 0.45 TRUE 0.82967106 0.865405260
#> 8 8 10 0.66 0.75 0.45 FALSE 0.43103448 0.439027817
#> 9 9 10 0.66 0.75 0.45 FALSE 0.57692308 0.050262167
#> 10 10 10 0.66 0.75 0.45 FALSE 0.02178833 0.296208531
This output is a little wider than your original approach. We can always reshape the E_# columns to long format, but the wide form may end up being better for your actual use case.
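For instance, a sketch that lengthens the E_# columns with base R's reshape(), using the runModel2() output above (the column names chosen here are illustrative):
wide <- runModel2()
long <- reshape(wide,
                varying = list(grep("^E_", names(wide), value = TRUE)),
                v.names = "E", timevar = "iteration",
                direction = "long")
head(long)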
Finally, here is Map() in action:
iterations <- 100
N_vec <- c(10, 50)
BR_vec <- c(0.25, 0.50, 0.75)
sensi_vec <- c(0.45, 0.75)
speci_vec <- c(0.45, 0.75)
paramspace <- expand.grid(iterations = iterations, N = N_vec, BR = BR_vec, sensi = sensi_vec, speci = speci_vec)
res = Map(runModel2, paramspace$iterations, paramspace$N, paramspace$BR, paramspace$sensi, paramspace$speci)
res[[24L]][1:10, 1:8] ## only first 10 rows for demonstration
## ID N BR sensi speci X E_1 E_2
##1 1 50 0.75 0.75 0.75 TRUE 0.500000000 0.500000000
##2 2 50 0.75 0.75 0.75 FALSE 0.001369863 0.035714286
##3 3 50 0.75 0.75 0.75 FALSE 0.250000000 0.900000000
##4 4 50 0.75 0.75 0.75 TRUE 0.750000000 0.250000000
##5 5 50 0.75 0.75 0.75 TRUE 0.987804878 0.500000000
##6 6 50 0.75 0.75 0.75 TRUE 0.964285714 0.250000000
##7 7 50 0.75 0.75 0.75 TRUE 0.750000000 0.750000000
##8 8 50 0.75 0.75 0.75 FALSE 0.012195122 0.035714286
##9 9 50 0.75 0.75 0.75 TRUE 0.750000000 0.500000000
##10 10 50 0.75 0.75 0.75 FALSE 0.250000000 0.001369863
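Since Map() returns one data frame per parameterisation, and every run here shares the same columns (iterations is constant across the rows of paramspace), the list can be stacked into a single data frame afterwards:
## stack the per-parameterisation results into one data frame
all_res <- do.call(rbind, res)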

Related

Computation with different combinations of parameters using for loop

I am trying to implement a for loop in R to fill a data frame with combinations of learning rates and decays used in machine learning. The idea is to try several learning rates and decays, calculate error metrics for each combination, and save them in a dataset, so I can point out which combination is best.
Below are the code and my result. I don't understand why I get this result.
learning_rate = c(0.01, 0.02)
decay = c(0, 1e-1)
combinations = length(learning_rate) * length(decay)
df <- data.frame(Combination=character(combinations),
                 lr=double(combinations),
                 decay=double(combinations),
                 result=character(combinations),
                 stringsAsFactors=FALSE)
for (i in 1:combinations) {
  for (lr in learning_rate) {
    for (dc in decay) {
      df[i, 1] = i
      df[i, 2] = lr
      df[i, 3] = dc
      df[i, 4] = 10*lr + dc*4 # Here I'd do some machine learning; this easy equation is just an example
    }
  }
}
The result I get is below. It seems that only the Combination column came out right. What did I do wrong?
Combination lr decay result
1 0.02 0.1 0.6
2 0.02 0.1 0.6
3 0.02 0.1 0.6
4 0.02 0.1 0.6
I expected this result
Combination lr decay result
1 0.01 0 0.1
2 0.01 1e-1 0.5
3 0.02 0 0.2
4 0.02 1e-1 0.6
For every value of i, the two inner loops run to completion and write all four combinations into the same row i, so each row ends up holding only the last combination (lr = 0.02, dc = 0.1). Build the rows inside the parameter loops instead. Tuning with a for-loop:
df <- data.frame()
for (lr in learning_rate) {
  for (dc in decay) {
    df <- rbind(df, data.frame(
      lr = lr,
      decay = dc,
      result = 10*lr + dc*4
    ))
  }
}
df
# lr decay result
# 1 0.01 0.0 0.1
# 2 0.01 0.1 0.5
# 3 0.02 0.0 0.2
# 4 0.02 0.1 0.6
Tuning with mapply():
df <- expand.grid(lr = learning_rate, decay = decay)
ML.fun <- function(lr, dc) 10*lr + dc*4
df$result <- mapply(ML.fun, lr = df$lr, dc = df$decay)
df
# lr decay result
# 1 0.01 0.0 0.1
# 2 0.02 0.0 0.2
# 3 0.01 0.1 0.5
# 4 0.02 0.1 0.6
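With the results in a single data frame, picking the best combination is a one-liner (assuming here that a lower result is better):
df[which.min(df$result), ]
#     lr decay result
# 1 0.01     0    0.1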

Grouping factors or integers into equivalence classes in R

I have a data frame representing equivalences between members from two sets:
print(x)
G S
1 g1 s2
2 g1 s1
3 g2 s3
4 g3 s3
5 g4 s3
Does someone know of a function or a useful data structure for grouping the objects into equivalence classes? In the example above, the result should be two equivalence classes
{g1, s1, s2}, {g2, g3, g4, s3}
An option is to use igraph to extract vertices from clusters:
library(igraph)
g <- graph_from_data_frame(x)
m <- clusters(g)$membership
tapply(names(m), m, sort)
output:
$`1`
[1] "g1" "s1" "s2"
$`2`
[1] "g2" "g3" "g4" "s3"
data:
x <- read.table(text="G S
g1 s2
g1 s1
g2 s3
g3 s3
g4 s3", header=TRUE, stringsAsFactors=FALSE)
You can test for equality using outer and combine the two relations with | (or). From this matrix take the unique rows, then use apply to return a list of the groups.
tt <- outer(x$G, x$G, "==") | outer(x$S, x$S, "==")
tt <- unique(tt)
apply(tt, 1, function(i) unique(unlist(x[i,])))
#[[1]]
#[1] "g1" "s2" "s1"
#
#[[2]]
#[1] "g2" "g3" "g4" "s3"
Another option is to loop over the vector instead of expanding it the way outer does:
y <- unique(x)
t1 <- tt1 <- y[1,1]
t2 <- tt2 <- y[1,2]
y <- y[-1,]
n <- 1
res <- list(0)
repeat {
  i <- y[,1] %in% tt1 | y[,2] %in% tt2
  tt <- y[i,]
  y <- y[!i,]
  tt1 <- unique(tt[!tt[,1] %in% tt1, 1])
  tt2 <- unique(tt[!tt[,2] %in% tt2, 2])
  if (length(tt1) + length(tt2) > 0) {
    t1 <- c(t1, tt1)
    t2 <- c(t2, tt2)
  } else {
    res[[n]] <- unique(c(t1, t2))
    if (nrow(y) == 0) break
    n <- n + 1
    t1 <- tt1 <- y[1,1]
    t2 <- tt2 <- y[1,2]
    y <- y[-1,]
  }
}
res
#[[1]]
#[1] "g1" "s2" "s1"
#
#[[2]]
#[1] "g2" "g3" "g4" "s3"
Data:
x <- structure(list(G = c("g1", "g1", "g2", "g3", "g4"), S = c("s2",
"s1", "s3", "s3", "s3")), class = "data.frame", row.names = c(NA, -5L))
You can apply the following code for grouping:
# function to categorize incoming `v` within existing `lst`
grp <- function(lst, v) {
  if (length(lst) == 0) return(c(lst, list(v)))
  idx <- which(unlist(Map(function(x) any(!is.na(match(v, x))), lst)))
  if (length(idx) == 0) {
    lst <- c(lst, list(v))
  } else {
    lst[idx] <- list(union(unlist(lst[idx]), v))
  }
  return(unique(lst))
}
# generate grouping results
df <- unique(df)
res <- Reduce(function(lst, x) grp(lst, x),
              c(list(NULL), unname(Map(function(x) as.character(unlist(x)), split(df, seq(nrow(df)))))),
              accumulate = FALSE)
Application Examples
given input data df <- data.frame(G = c("g1","g1","g2","g3","g4"), S = c("s2","s1","s3","s3","s3"))
then
> df
G S
1 g1 s2
2 g1 s1
3 g2 s3
4 g3 s3
5 g4 s3
> res
[[1]]
[1] "g1" "s2" "s1"
[[2]]
[1] "g2" "s3" "g3" "g4"
given input data df <- data.frame(G = sprintf("g%i", c(2,3,4,2,2)), S = sprintf("s%i", c(3,3,2,4,3)))
then
> df
G S
1 g2 s3
2 g3 s3
3 g4 s2
4 g2 s4
> res
[[1]]
[1] "g2" "s3" "g3" "s4"
[[2]]
[1] "g4" "s2"
UPDATE: the solution above becomes rather slow on huge datasets. An improved solution is given below:
G2S <- function(df, g) {
  df[df$G %in% g, ]$S
}
S2G <- function(df, s) {
  df[df$S %in% s, ]$G
}
grpFun <- function(df, g) {
  repeat {
    gt <- S2G(df, (s <- G2S(df, g)))
    if (length(gt) == length(g)) return(list(G = gt, S = s))
    g <- gt
  }
}
res <- c()
Gpool <- x$G
repeat {
  if (length(Gpool) == 0) break
  grp <- grpFun(x, Gpool[1])
  Gpool <- setdiff(Gpool, grp$G)
  res <- c(res, list(union(unique(grp$G), unique(grp$S))))
}
To compare the runtime of the three answers by @GKi, @chinsoon12, and @ThomasisCoding, I created random sets of different size n and measured the runtime (as "elapsed" from proc.time).
From the results, I conclude that the method relying on igraph's connected-component decomposition is the fastest:
n chinsoon12 ThomasisCoding GKi
500 0.002 0.054 0.030
2500 0.010 0.203 0.416
5000 0.020 0.379 1.456
7500 0.033 0.670 3.351
10000 0.044 0.832 5.837
Edit (2019-11-19): At the request of @GKi, here is the code I used for comparing the runtime of the three algorithms. Beware that all functions work on the global variable x, because R's call-by-value semantics would otherwise copy x on every call and add unwanted overhead to this runtime estimate:
library(igraph)
# solution by chinsoon12: CC decomposition from igraph
method.A <- function() {
  g <- graph_from_data_frame(x)
  m <- clusters(g)$membership
  res <- tapply(names(m), m, sort)
  return(res)
}
# solution by ThomasisCoding
method.B <- function() {
  # find 1-to-1 mappings
  r <- Reduce(intersect, lapply(names(x), function(v) split(x, x[v])))
  r1map <- unlist(Map(toString, Map(unlist, r)))
  # remove one-to-one mappings and find N-to-1 mappings
  if (length(r1map) > 0) {
    xx <- x[-as.numeric(rownames(Reduce(rbind, r))), ]
  } else {
    xx <- x
  }
  rNmap <- c()
  if (nrow(xx) > 0) {
    rNmap <- sapply(names(xx),
                    function(v) {
                      z <- split(xx, xx[v])
                      u <- z[unlist(Map(nrow, z)) > 1]
                      ifelse(length(u) == 0, NA, toString(c(names(u), as.vector(u[[1]][, setdiff(names(xx), v)]))))
                    }, USE.NAMES = FALSE)
    rNmap <- rNmap[!is.na(rNmap)]
  }
  # combine both 1-to-1 and N-to-1 mappings
  res <- c(r1map, rNmap)
  return(res)
}
# solution by GKi: with outer product
method.C <- function() {
  tt <- outer(x$G, x$G, "==") | outer(x$S, x$S, "==")
  tt <- unique(tt)
  res <- apply(tt, 1, function(i) unique(unlist(x[i, ])))
  return(res)
}
# runtime results
rt <- data.frame()
for (n in seq(500, 10000, by = 500)) {
  # this won't work because of ambiguous node ids (see the answer by GKi):
  #x <- data.frame(G = sample(1:n,n,replace=TRUE), S = sample(1:n,n,replace=TRUE))
  # therefore, make the node ids unique:
  x <- data.frame(G = sprintf("g%i", sample(1:n, n, replace = TRUE)), S = sprintf("s%i", sample(1:n, n, replace = TRUE)))
  t1 <- proc.time()
  method.A()
  tA <- proc.time() - t1
  t1 <- proc.time()
  method.B()
  tB <- proc.time() - t1
  t1 <- proc.time()
  method.C()
  tC <- proc.time() - t1
  rt <- rbind(rt, data.frame(n = n, t.A = tA[["elapsed"]], t.B = tB[["elapsed"]], t.C = tC[["elapsed"]]))
}
print(rt)
plot(rt$n, rt$t.C, xlab = "n", ylab = "run time [s]", ylim = c(min(rt$t.A), max(rt$t.C)), type = 'l')
lines(rt$n, rt$t.B, col = "red")
lines(rt$n, rt$t.A, col = "blue")
legend("topleft", c("GKi", "ThomasisCoding", "chinsoon12"), lty = c(1, 1, 1), col = c("black", "red", "blue"))
Comparison of the results of the methods:
method.A()
#$`1`
#[1] "1" "2" "3" "4"
method.A2()
#$`1`
#[1] "3" "1" "4" "2"
#
#$`2`
#[1] "2" "3"
method.B()
#[[1]]
#[1] 3 1 4 2
#
#[[2]]
#[1] 2 3
method.C()
#[[1]]
#[[1]]$All
#[1] 3 1 4 2
#
#[[1]]$G
#[1] 3 1
#
#[[1]]$S
#[1] 4 2 1
#
#
#[[2]]
#[[2]]$All
#[1] 2 3
#
#[[2]]$G
#[1] 2
#
#[[2]]$S
#[1] 3
Methods:
library(igraph)
method.A <- function() {
  g <- graph_from_data_frame(x)
  m <- clusters(g)$membership
  res <- tapply(names(m), m, sort)
  return(res)
}
method.A2 <- function() {
  g <- graph_from_data_frame(t(apply(x, 1, function(x) paste0(names(x), x))))
  m <- clusters(g)$membership
  res <- tapply(substring(names(m), 2), m, unique)
  return(res)
}
method.B <- function() {
  G2S <- function(df, g) {
    df[df$G %in% g, ]$S
  }
  S2G <- function(df, s) {
    df[df$S %in% s, ]$G
  }
  grpFun <- function(df, g) {
    repeat {
      gt <- S2G(df, (s <- G2S(df, g)))
      if (length(gt) == length(g)) return(list(G = gt, S = s))
      g <- gt
    }
  }
  res <- c()
  Gpool <- x$G
  repeat {
    if (length(Gpool) == 0) break
    grp <- grpFun(x, Gpool[1])
    Gpool <- setdiff(Gpool, grp$G)
    res <- c(res, list(union(unique(grp$G), unique(grp$S))))
  }
  return(res)
}
method.C <- function() {
  y <- unique(x)
  t1 <- tt1 <- y[1,1]
  t2 <- tt2 <- y[1,2]
  y <- y[-1,]
  n <- 1
  res <- list(0)
  repeat {
    i <- y[,1] %in% tt1 | y[,2] %in% tt2
    tt <- y[i,]
    y <- y[!i,]
    tt1 <- unique(tt[!tt[,1] %in% tt1, 1])
    tt2 <- unique(tt[!tt[,2] %in% tt2, 2])
    if (length(tt1) + length(tt2) > 0) {
      t1 <- c(t1, tt1)
      t2 <- c(t2, tt2)
    } else {
      res[[n]] <- list(All = unique(c(t1, t2)), G = unique(t1), S = unique(t2))
      if (nrow(y) == 0) break
      n <- n + 1
      t1 <- tt1 <- y[1,1]
      t2 <- tt2 <- y[1,2]
      y <- y[-1,]
    }
  }
  res
}
Data:
x <- data.frame(G = c(3,1,1,2,3), S=c(4,1,2,3,2))
x
# G S
#1 3 4
#2 1 1
#3 1 2
#4 2 3
#5 3 2
UPDATE: performance comparison based on the latest updates by @GKi, @chinsoon12, and @ThomasisCoding.
The comparison code is identical to the benchmark shown earlier: method.A, method.B, and method.C as defined in the listing above (the igraph, G2S/S2G, and repeat-loop implementations, respectively), timed with the same proc.time() loop over n = 500, 1000, ..., 10000.
runtime of three methods:
n t.A t.B t.C
1 500 0.00 0.16 0.26
2 1000 0.02 0.31 0.53
3 1500 0.02 0.51 1.11
4 2000 0.03 0.90 1.47
5 2500 0.03 1.35 2.17
6 3000 0.04 2.08 3.14
7 3500 0.04 2.66 3.97
8 4000 0.07 3.38 4.92
9 4500 0.07 4.38 6.35
10 5000 0.06 5.41 7.58
11 5500 0.08 6.79 9.55
12 6000 0.08 7.81 10.91
13 6500 0.10 9.03 12.06
14 7000 0.09 10.06 14.20
15 7500 0.11 11.76 15.65
16 8000 0.13 13.41 17.84
17 8500 0.11 14.87 20.67
18 9000 0.13 16.88 23.52
19 9500 0.14 18.38 25.57
20 10000 0.14 22.81 30.05
(figure: visualization of the runtimes of the three methods over n)
Additional (thanks to a comment by @GKi): when the dataset is kept as integers, the runtimes of the non-igraph grouping methods drop considerably:
n t.A t.B t.C
1 500 0.00 0.09 0.13
2 1000 0.01 0.15 0.23
3 1500 0.01 0.22 0.38
4 2000 0.03 0.31 0.50
5 2500 0.05 0.45 0.76
6 3000 0.07 0.51 0.77
7 3500 0.06 0.67 0.97
8 4000 0.07 0.85 1.20
9 4500 0.07 0.90 1.39
10 5000 0.09 1.23 1.55
11 5500 0.09 1.30 1.78
12 6000 0.09 1.51 1.94
13 6500 0.11 1.77 2.20
14 7000 0.13 2.18 2.55
15 7500 0.12 2.37 2.79
16 8000 0.13 2.56 2.96
17 8500 0.14 2.76 3.39
18 9000 0.15 3.03 3.54
19 9500 0.15 3.54 4.23
20 10000 0.16 3.76 4.32

R: getting rid of for loop and speeding code

I would like to speed up my calculations and obtain the result without using the loop in function m. Reproducible example:
N <- 2500
n <- 500
r <- replicate(1000, sample(N, n))
m <- function(r, N) {
  ic <- matrix(0, nrow = N, ncol = N)
  for (i in 1:ncol(r)) {
    p <- r[, i]
    ic[p, p] <- ic[p, p] + 1
  }
  ic
}
system.time(ic <- m(r, N))
# user system elapsed
# 6.25 0.51 6.76
isSymmetric(ic)
# [1] TRUE
In every iteration of the for loop we are dealing with a matrix, not a vector, so how can this be vectorized?
@joel.wilson The purpose of this function is to calculate pairwise frequencies of elements, so that afterwards we can estimate pairwise inclusion probabilities.
Thanks to @Khashaa and @alexis_laz. Benchmarks:
> require(rbenchmark)
> benchmark(m(r, N),
+ m1(r, N),
+ mvec(r, N),
+ alexis(r, N),
+ replications = 10, order = "elapsed")
test replications elapsed relative user.self sys.self user.child sys.child
4 alexis(r, N) 10 4.73 1.000 4.63 0.11 NA NA
3 mvec(r, N) 10 5.36 1.133 5.18 0.18 NA NA
2 m1(r, N) 10 5.48 1.159 5.29 0.19 NA NA
1 m(r, N) 10 61.41 12.983 60.43 0.90 NA NA
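The alexis(r, N) entry in the benchmark is not defined in this excerpt; a plausible sketch in the same spirit (a reconstruction, not the original code) fills the indicator matrix in one vectorised step via linear indexing and then takes the cross product:
alexis <- function(r, N) {
  ic <- matrix(0L, nrow = N, ncol = ncol(r))
  ic[r + (col(r) - 1L) * N] <- 1L # linear index of row r[i,j] within column j
  tcrossprod(ic)
}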
This should be significantly faster, as it avoids repeated read-modify-write operations on a doubly indexed submatrix:
m1 <- function(r, N) {
  ic <- matrix(0, nrow = N, ncol = ncol(r))
  for (i in 1:ncol(r)) {
    p <- r[, i]
    ic[, i][p] <- 1
  }
  tcrossprod(ic)
}
system.time(ic1 <- m1(r, N))
# user system elapsed
# 0.53 0.01 0.55
all.equal(ic, ic1)
# [1] TRUE
Simple "counting/adding" operations can almost always be vectorized
mvec <- function(r, N) {
  ic <- matrix(0, nrow = N, ncol = ncol(r))
  i <- rep(1:ncol(r), each = nrow(r))
  ic[cbind(as.vector(r), i)] <- 1
  tcrossprod(ic)
}

In R, how do I make an iterative calculation without using a loop?

Here is a simple example of one type of iterative calc:
vals <- data.frame( "x"=c( 14, 15, 12, 10, 17 ), "ema"=0 )
vals$ema[1] <- vals$x[1]
K <- 0.90
for (jj in 2:nrow(vals))
  vals$ema[jj] <- K * vals$ema[jj-1] + (1-K) * vals$x[jj]
vals
x ema
1 14 14.0000
2 15 14.1000
3 12 13.8900
4 10 13.5010
5 17 13.8509
The more involved examples use if...else to determine the next value:
for (jj in 2:nrow(vals)) {
  if (K * vals$ema[jj-1] + (1-K) * vals$x[jj] < 5.0)
    vals$ema[jj] <- 5.0
  else if (K * vals$ema[jj-1] + (1-K) * vals$x[jj] > 15.0)
    vals$ema[jj] <- 15.0
  else
    vals$ema[jj] <- K * vals$ema[jj-1] + (1-K) * vals$x[jj]
}
I am not sure if it would be more involved or not, but the decision can be based on the previous value as well:
K1 <- 0.999
K2 <- 0.95
K3 <- 0.90
for (jj in 2:nrow(vals)) {
  if (vals$ema[jj-1] < 0.0)
    vals$ema[jj] <- K1 * vals$ema[jj-1] + (1-K1) * vals$x[jj]
  else if (vals$ema[jj-1] > 100.0)
    vals$ema[jj] <- K3 * vals$ema[jj-1] + (1-K3) * vals$x[jj]
  else
    vals$ema[jj] <- K2 * vals$ema[jj-1] + (1-K2) * vals$x[jj]
}
This answer by WaltS to a similar question I had about recursive calculations provides two potential solutions. Adapting one of them to your question:
vals$ema.Reduce <- Reduce(function(myema, x) K * myema + (1-K) * x,
x = tail(vals$x, -1), init = 14, accumulate = TRUE)
vals
# x ema ema.Reduce
#1 14 14.0000 14.0000
#2 15 14.1000 14.1000
#3 12 13.8900 13.8900
#4 10 13.5010 13.5010
#5 17 13.8509 13.8509
Explanation of the function:
Reduce() calculates ema for the current row jj, with myema holding the previous value (row jj-1), starting from init. The x vector required by Reduce consists of vals$x for the rows you want to calculate, i.e. row 2 to the last row: x = tail(vals$x, -1). The accumulate = TRUE option returns the whole vector instead of just the final value. (Note that the x argument of Reduce is a generic name and is not the same as vals$x in the example data; for calculations that do not require the additional vals$x term, a vector of 0's would work, as in the linked answer.)
Adding if/else conditions to Reduce (note: init is changed in these examples to illustrate the conditional statements):
Reduce(function(myema, x) {
  if (myema < 5) {
    5
  } else if (myema > 15) {
    15
  } else {
    K * myema + (1-K) * x
  }
}, x = tail(vals$x, -1), init = 16, accumulate = TRUE)
#[1] 16.000 15.000 14.700 14.230 14.507
Reduce(function(myema, x) {
  if (myema < 0) {
    K1 * myema + (1-K1) * x
  } else if (myema > 100) {
    K3 * myema + (1-K3) * x
  } else {
    K2 * myema + (1-K2) * x
  }
}, x = tail(vals$x, -1), init = 110, accumulate = TRUE)
#[1] 110.00000 100.50000 91.65000 87.56750 84.03912
K3*110 + (1-K3)*vals$x[2] #100.5
K3*100.5 + (1-K3)*vals$x[3] #91.65
K2*91.65 + (1-K2)*vals$x[4] #87.5675
K2*87.5675 + (1-K2)*vals$x[5] #84.03912
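For the plain (unconditional) recursion, base R's stats::filter() with method = "recursive" computes the same values without an explicit loop; a minimal sketch using the vals and K defined above:
## y[t] = K * y[t-1] + (1-K) * x[t], seeded with the first observation
ema.filter <- stats::filter((1 - K) * vals$x[-1], filter = K,
                            method = "recursive", init = vals$x[1])
c(vals$x[1], ema.filter)
#[1] 14.0000 14.1000 13.8900 13.5010 13.8509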
Seems this succeeds:
vals$ema2 <- c(vals$ema[1], K*vals$ema[1:4] +(1-K)*vals$x[2:5] )
> vals
x ema ema2
1 14 14.0000 14.0000
2 15 14.1000 14.1000
3 12 13.8900 13.8900
4 10 13.5010 13.5010
5 17 13.8509 13.8509
Sometimes it is best to work with the time series and data munging libraries. In this case, lag.zoo from the zoo library handles lagged values for you.
library(dplyr)
library(zoo)
vals <- data.frame( "x"=c( 14, 15, 12, 10, 17 ) )
K <- 0.90
vals %>% mutate(ema = (1-K)*vals$x + K*(lag(vals$x,1)))
For this particular problem, the weight for each value is some function of k and i (as in the i-th value). We can write a function for the weights and vectorize it:
weights <- function(i, k) {
  q <- 1 - k
  qs <- q^(1:i)
  rev(qs) * c(1, rep(k, (i-1)))
}
v_weights <- Vectorize(weights)
An example:
> v_weights(1:3, .1)
[[1]]
[1] 0.9
[[2]]
[1] 0.81 0.09
[[3]]
[1] 0.729 0.081 0.090
where these are the weights of the "preceding" x values. We proceed with some matrix algebra. I write a function to make the weights (above) into a matrix:
weight_matrix <- function(j, k) {
  w <- v_weights(1:j, k = k)
  Ws <- matrix(0, j+1, j+1)
  Ws[row(Ws) + col(Ws) < (j+2)] <- unlist(rev(w))
  Ws <- t(Ws)
  Ws[row(Ws) + col(Ws) == (j+2)] <- k
  Ws[(j+1), 1] <- 1
  Ws
}
Example:
> weight_matrix(3, .1)
[,1] [,2] [,3] [,4]
[1,] 0.729 0.081 0.09 0.1
[2,] 0.810 0.090 0.10 0.0
[3,] 0.900 0.100 0.00 0.0
[4,] 1.000 0.000 0.00 0.0
Then multiply this with the vector of xs:
ema <- function(x, k) rev(weight_matrix(length(x) - 1, k) %*% x)
To get the dataframe above (I "flipped" the k so it's 0.1 instead of 0.9):
> x <- c(14, 15, 12, 10, 17)
> k <- .1
> vals <- data.frame("x"=x, "ema"=ema(x, k))
> vals
x ema
1 14 14.0000
2 15 14.1000
3 12 13.8900
4 10 13.5010
5 17 13.8509
@shayaa's answer is 99% correct. dplyr implements lag just fine, and apart from a typo in that answer (one value of x should be ema), extraneous references to column names, and a missing default value (without which it puts NA in the first row), it works perfectly well.
library(dplyr)
vals %>% mutate(ema = K*lag(ema, 1, default=ema[1]) + (1-K)*x)
#> x ema
#> 1 14 14.0000
#> 2 15 14.1000
#> 3 12 13.8900
#> 4 10 13.5010
#> 5 17 13.8509

Combine same-name columns and apply Johansen test in R

I have two data sets (multicolumn data before and after treatment):
Before treatment
Data1<-read.csv("before.csv")
X1 X2 X3
1 0.21 0.32 0.42
2 0.34 0.23 0.33
3 0.42 0.14 0.11
4 0.35 0.25 0.35
5 0.25 0.41 0.44
After treatment
Data2<-read.csv("after.csv")
X1 X2 X3
1 0.33 0.43 0.7
2 0.28 0.51 0.78
3 0.11 0.78 0.34
4 0.54 0.34 0.34
5 0.42 0.64 0.22
I would like to pair the data by column name (i.e., X1 in Data1 with X1 in Data2, X2 in Data1 with X2 in Data2, and so on) and perform the Johansen cointegration test for each pair.
What I tried:
library("urca")
x1 <- cbind(Data1$X1, Data2$X1)
Jo1 <- ca.jo(x1, type="trace", K=2, ecdet="none", spec="longrun")
summary(Jo1)
x2 <- cbind(Data1$X2, Data2$X2)
Jo2 <- ca.jo(x2, type="trace", K=2, ecdet="none", spec="longrun")
summary(Jo2)
This gives me what I want, but I would like to automate the process, i.e. instead of combining the data manually, to run through all pairwise combinations automatically.
Based on krishna's answer below, but with the loop corrected to cover every column:
for (i in 1:ncol(Data1)) {
  col <- paste0("X", as.character(i))
  data <- cbind(Data1[, col], Data2[, col])
  colnames(data) <- c(paste0("Data1_", col), paste0("Data2_", col)) # add column names
  Jo <- ca.jo(data, type="trace", K=2, ecdet="none", spec="longrun")
  print(summary(Jo)) # print the summary to the console
}
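A variant of the same idea that keeps each test in a named list, so individual summaries can be pulled out later (a sketch; assumes Data1 and Data2 share their column names):
common <- intersect(colnames(Data1), colnames(Data2))
tests <- lapply(common, function(col) {
  pair <- cbind(Data1[, col], Data2[, col])
  colnames(pair) <- paste(c("Data1", "Data2"), col, sep = "_")
  ca.jo(pair, type="trace", K=2, ecdet="none", spec="longrun")
})
names(tests) <- common
summary(tests$X1)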
You can loop through the column names and compute the Johansen cointegration test as follows:
# Create sample data frames
Data1 <- data.frame(X1 = rnorm(10, 0, 1), X2 = rnorm(10, 0, 1), X3 = rnorm(10, 0, 1))
Data2 <- data.frame(X1 = rnorm(10, 0, 1), X2 = rnorm(10, 0, 1), X3 = rnorm(10, 0, 1))
library("urca")
# loop through the column indices
# (note: `ncol(Data1)` is a single number, so this visits only the last column;
# use `1:ncol(Data1)` as in the answer above to visit every column)
for (i in ncol(Data1)) {
  col <- paste0("X", as.character(i)) # find the column name
  data <- cbind(Data1[, col], Data2[, col]) # all rows of column `col` from Data1 and Data2
  # Your method for finding ca.jo ...
  Jo <- ca.jo(data, type="trace", K=2, ecdet="none", spec="longrun")
  summary(Jo)
}
You can also loop over colnames directly:
for (col in colnames(Data1)) {
  print(col)
  data <- cbind(Data1[, col], Data2[, col])
  print(data)
  #Jo <- ca.jo(data, type="trace", K=2, ecdet="none", spec="longrun")
  #summary(Jo)
}
Hope this will help you.
