How to simplify FOR loops in R?

for (i in 1:100) {
  for (j in 1:100) {
    hdist <- rgeos::gDistance(xySpatialLines[[i]], xySpatialLines[[j]], byid = FALSE, hausdorff = TRUE)
    distances[i, j] <- hdist
  }
}
Is there any way to simplify the j loop to get something like this:
for (i in 1:100) {
  distances[i, ] <- lapply(???)  # or sapply?
}
UPDATE:
The data stored in xySpatialLines[[i]] are SpatialLines objects:
library(sp)
xySpatialLines <- vector(mode = "list", length = 2)
x1 <- c(1,4,5,3,2)
y1 <- c(2,5,3,6,7)
x2 <- c(4,4,6,3,2)
y2 <- c(8,5,2,6,1)
xy1 <- cbind(x1,y1)
xy2 <- cbind(x2,y2)
xy1.sp = sp::SpatialPoints(xy1)
xy2.sp = sp::SpatialPoints(xy2)
spl1 <- sp::SpatialLines(list(Lines(Line(xy1.sp), ID="a")))
spl2 <- sp::SpatialLines(list(Lines(Line(xy2.sp), ID="b")))
xySpatialLines[[1]] = spl1
xySpatialLines[[2]] = spl2

You can use outer. Since gDistance works on a single pair of geometries rather than on vectors, wrap it with Vectorize so that outer can apply it elementwise:
distances <- outer(xySpatialLines, xySpatialLines,
                   FUN = Vectorize(function(a, b) rgeos::gDistance(a, b, byid = FALSE, hausdorff = TRUE)))
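If outer feels opaque, a minimal equivalent sketch with nested sapply over the list indices (using the same toy objects as above) builds the same matrix:

# Equivalent alternative: nested sapply instead of the double for loop
n <- length(xySpatialLines)
distances <- sapply(seq_len(n), function(j)
  sapply(seq_len(n), function(i)
    rgeos::gDistance(xySpatialLines[[i]], xySpatialLines[[j]],
                     byid = FALSE, hausdorff = TRUE)))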

Related

How to see the distribution of variables after clustering with Gower distance in R. How can I see the distribution of variables across the clusters?

I have calculated dendrograms of my dataset with the divisive and agglomerative methods:
library(cluster)
library(fpc)
gower.dist <- daisy(data.cluster, metric=c("gower"))
divisive.clust <- diana(as.matrix(gower.dist),
                        diss = TRUE, keep.diss = TRUE)
plot(divisive.clust, main = "Divisive")
aggl.clust.c <- hclust(gower.dist, method = "complete")
plot(aggl.clust.c,
     main = "Agglomerative, complete linkages")
I also have the results in a table with the number of cases in each cluster, etc.
cstats.table <- function(dist, tree, k) {
  clust.assess <- c("cluster.number", "n", "within.cluster.ss", "average.within",
                    "average.between", "wb.ratio", "dunn2", "avg.silwidth")
  clust.size <- c("cluster.size")
  stats.names <- c()
  row.clust <- c()
  output.stats <- matrix(ncol = k, nrow = length(clust.assess))
  cluster.sizes <- matrix(ncol = k, nrow = k)
  for (i in c(1:k)) {
    row.clust[i] <- paste("Cluster-", i, " size")
  }
  for (i in c(2:k)) {
    stats.names[i] <- paste("Test", i - 1)
    for (j in seq_along(clust.assess)) {
      output.stats[j, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.assess])[j]
    }
    for (d in 1:k) {
      cluster.sizes[d, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.size])[d]
      dim(cluster.sizes[d, i]) <- c(length(cluster.sizes[i]), 1)
      cluster.sizes[d, i]
    }
  }
  output.stats.df <- data.frame(output.stats)
  cluster.sizes <- data.frame(cluster.sizes)
  cluster.sizes[is.na(cluster.sizes)] <- 0
  rows.all <- c(clust.assess, row.clust)
  # rownames(output.stats.df) <- clust.assess
  output <- rbind(output.stats.df, cluster.sizes)[, -1]
  colnames(output) <- stats.names[2:k]
  rownames(output) <- rows.all
  is.num <- sapply(output, is.numeric)
  output[is.num] <- lapply(output[is.num], round, 2)
  output
}
stats.df.divisive <- cstats.table(gower.dist, divisive.clust, 7)
stats.df.divisive
stats.df.aggl <- cstats.table(gower.dist, aggl.clust.c, 7)
# complete linkages looks like the most balanced approach
stats.df.aggl
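A minimal sketch of one way to inspect the per-cluster distributions (assuming data.cluster is the data frame used to build gower.dist and k = 7 clusters, as above): cut the tree into clusters, then summarise and plot each variable within each cluster.

# Sketch: attach cluster labels, then summarise / plot each variable by cluster
k <- 7
clusters <- cutree(aggl.clust.c, k = k)         # one label per observation
lapply(split(data.cluster, clusters), summary)  # summary of every variable per cluster
# visual check: one boxplot per numeric variable, grouped by cluster
for (v in names(data.cluster)) {
  if (is.numeric(data.cluster[[v]])) {
    boxplot(data.cluster[[v]] ~ factor(clusters), main = v,
            xlab = "cluster", ylab = v)
  }
}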

Optimising nested for loops in R

I tried to speed up the code below, but without success.
I read about the Rfast package but also failed to implement it.
Is there any way to optimise the following code in R?
library(MASS)  # needed for rlm()

RI <- function(y, x, a, mu, R = 500, t = 500) {
  x <- as.matrix(x)
  dm <- dim(x)
  n <- dm[1]
  bias1 <- bias2 <- bias3 <- numeric(t)
  b1 <- b2 <- b3 <- numeric(R)
  ### Outliers in Y ######
  for (j in 1:t) {
    for (i in 1:R) {
      id <- sample(n, a * n)
      z <- y
      z[id] <- rnorm(id, mu)
      b1[i] <- var(coef(lm(z ~ ., data = as.data.frame(x))))
      b2[i] <- var(coef(rlm(z ~ ., data = data.frame(x), maxit = 2000, method = "MM")))
      b3[i] <- var(coef(rlm(z ~ ., data = data.frame(x), psi = psi.huber, maxit = 300)))
    }
    bias1[j] <- sum(b1); bias2[j] <- sum(b2); bias3[j] <- sum(b3)
  }
  bias <- cbind("lm" = bias1, "MM-rlm" = bias2, "H-rlm" = bias3)
  colMeans(bias)
}
#######################################
p <- 5
n <- 200
x <- matrix(rnorm(n * p), ncol = p)
y <- rnorm(n)
a <- 0.2
mu <- 10
#######################################
RI(y, x, a, mu)
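One place the loops can be cut down without changing the results is the lm() call: the design matrix never changes across iterations, so its QR decomposition can be computed once outside both loops and reused, replacing R * t calls to lm() with cheap qr.coef() calls. Below is a minimal sketch for the lm part only, using a hypothetical RI_fast_lm(); the rlm() calls cannot be reduced this way, but they could be spread over cores with parallel::mclapply.

# Sketch: factorise the fixed design matrix once, reuse it for every simulated z
RI_fast_lm <- function(y, x, a, mu, R = 500, t = 500) {
  X  <- cbind(Intercept = 1, as.matrix(x))   # same model matrix lm(z ~ .) would build
  qx <- qr(X)                                # computed once, outside all loops
  n  <- nrow(X)
  bias1 <- numeric(t)
  for (j in 1:t) {
    b1 <- numeric(R)
    for (i in 1:R) {
      id <- sample(n, a * n)
      z  <- y
      z[id] <- rnorm(length(id), mu)
      b1[i] <- var(qr.coef(qx, z))           # same coefficients as coef(lm(z ~ ., ...))
    }
    bias1[j] <- sum(b1)
  }
  mean(bias1)                                # corresponds to the "lm" column of the original
}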

Filling empty vector inside while loop - R

Can somebody please help me with my cluster analysis?
I am trying to fill an empty vector inside a while loop and cannot manage it.
I search for the minimum value of my distance matrix and want to append that value to a vector in every step of the while loop.
You can find the relevant part of the code at the variable "meltLevel" in the middle of the code.
Thanks in advance.
standardMatrix <- matrix (runif(25, max = 50, min = 1), nrow = 5, ncol = 5)
print(standardMatrix)
dist <- function(x1, x2) sqrt(sum((x1 - x2)^2))
distanceMatrix <- matrix(, nrow=5, ncol=5)
for (x in 1:4) {
  for (y in (x + 1):5) {
    vector1 <- standardMatrix[x, ]
    vector2 <- standardMatrix[y, ]
    distanceMatrix[x, y] <- dist(vector1, vector2)
  }
}
distanceMatrix <- t(distanceMatrix)
print(distanceMatrix)
while (dim(distanceMatrix)[1] > 2) {
  distanceMatrix <- rbind(distanceMatrix, NA)
  distanceMatrix <- cbind(distanceMatrix, NA)
  newInd <- dim(distanceMatrix)[1]
  endInd <- dim(distanceMatrix)[1] - 1
  meltLevel <- numeric(0)
  meltLevel <- c(meltLevel, min(distanceMatrix, na.rm = TRUE))
  print(meltLevel)
  minVal <- which(distanceMatrix == min(distanceMatrix, na.rm = TRUE), arr.ind = TRUE)
  print(minVal)
  endInd <- dim(distanceMatrix)[1] - 1
  for (col in 1:endInd) {
    if (!(col == minVal[1] || col == minVal[2])) {
      v <- c(distanceMatrix[minVal[1], col], distanceMatrix[col, minVal[1]],
             distanceMatrix[col, minVal[2]], distanceMatrix[minVal[2], col])
      minV <- min(v, na.rm = TRUE)
      distanceMatrix[newInd, col] <- minV
    }
  }
  maxMinVal <- max(minVal)
  minMinVal <- min(minVal)
  distanceMatrix <- distanceMatrix[-maxMinVal, ]
  distanceMatrix <- distanceMatrix[-minMinVal, ]
  distanceMatrix <- distanceMatrix[, -maxMinVal]
  distanceMatrix <- distanceMatrix[, -minMinVal]
  print(distanceMatrix)
}
print(meltLevel)
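The likely reason only one value survives is that meltLevel is reset to numeric(0) at the top of every pass, so each iteration throws the previous minima away. A minimal, self-contained sketch of the pattern (the matrix-shrinking line below is only a stand-in for the merging logic above): initialise the collector once before the while loop and only append inside it.

# Pattern sketch: initialise once outside the loop, append with c() inside it
values <- numeric(0)                          # collector created before the loop
m <- matrix(runif(25, min = 1, max = 50), nrow = 5)
while (nrow(m) > 2) {
  values <- c(values, min(m, na.rm = TRUE))   # append the current minimum
  m <- m[-1, -1, drop = FALSE]                # stand-in for the cluster-merge step
}
print(values)                                 # one entry per iteration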

Avoiding Loops to Generate a Complex Dataframe with Nested Lists

Here is the kind of data frame I have to generate to store simulation data:
nbSimul <- 100
nbSampleSizes <- 4
nbCensoredRates <- 4
sampleSize <- c(100, 50, 30, 10)
censoredRate <- c(0.1, 0.3, 0.5, 0.8)
df.sampled <- data.frame(cas = numeric(),
                         distribution = character(),
                         simul = numeric(),
                         sampleSize = numeric(),
                         censoredRate = numeric(),
                         dta = I(list()),
                         quantileLD = I(list()),
                         stringsAsFactors = FALSE)
v <- 0  # Scenario indicator
for (k in 1:nbCensoredRates) {
  for (j in 1:nbSampleSizes) {
    for (i in 1:nbSimul) {
      # Scenario Id + Other info
      v <- v + 1
      df.sampled[v, "cas"] <- v
      df.sampled[v, "distribution"] <- "logNormal"
      df.sampled[v, "simul"] <- i
      df.sampled[v, "sampleSize"] <- sampleSize[j]
      df.sampled[v, "censoredRate"] <- censoredRate[k]
      X <- rlnorm(sampleSize[j], meanlog = 0, sdlog = 1)
      estimatedLD <- array(9)
      for (w in 1:9) {
        estimatedLD[w] <- quantile(X, probs = censoredRate[k], type = w)[[1]]
      }
      df.sampled$dta[v] <- list(X)
      df.sampled$quantileLD[v] <- list(estimatedLD[1:9])
    }
  }
}
This is quite difficult to read.
I would like to find a way to avoid the loops and to easily reference scenarios (v) and the variables attached to them.
Any idea?
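A minimal sketch of one loop-free approach (not the original code; the name scenarios is a placeholder): build the scenario grid with expand.grid() so every row is directly addressable, then fill the list-columns with lapply()/Map().

# Sketch: one row per (simul, sampleSize, censoredRate) combination
scenarios <- expand.grid(simul = seq_len(nbSimul),
                         sampleSize = sampleSize,
                         censoredRate = censoredRate)
scenarios$cas <- seq_len(nrow(scenarios))   # scenario id
scenarios$distribution <- "logNormal"
# one simulated sample per scenario, stored as a list-column
scenarios$dta <- lapply(scenarios$sampleSize,
                        function(n) rlnorm(n, meanlog = 0, sdlog = 1))
# nine quantile estimates (types 1..9) per scenario
scenarios$quantileLD <- Map(function(x, p) sapply(1:9, function(w)
                              quantile(x, probs = p, type = w)[[1]]),
                            scenarios$dta, scenarios$censoredRate)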

prediction.strength in Package fpc

I am using the function prediction.strength in the R package fpc with the k-medoids algorithm.
Here is my code:
prediction.strength(data,2,6,M=10,clustermethod=pamkCBI,DIST,krange=2:6,diss=TRUE,usepam=TRUE)
Somehow I get the error message:
Error in switch(method, kmeans = kmeans(xdata[indvec[[l]][[i]], ], k, :
EXPR must be a length 1 vector
Does anybody have experience with this R command? There are simple examples like
iriss <- iris[sample(150,20),-5]
prediction.strength(iriss,2,3,M=3,method="pam")
but my problem is that I am using a dissimilarity matrix instead of the data itself for the k-medoids algorithm. I don't know how I should correct my code in this case.
Please note that the package help states the following for prediction.strength:
xdata: data (something that can be coerced into a matrix). Note that this can currently not be a dissimilarity matrix.
I'm afraid you'll have to hack the function to get it to handle a distance matrix. I'm using the following:
pred <- function(distance, Gmin = 2, Gmax = 10, M = 50,
                 classification = "centroid", cutoff = 0.8, nnk = 1, ...) {
  require(cluster)
  require(class)
  require(fpc)   # classifdist()
  xdata <- as.matrix(distance)
  n <- nrow(xdata)
  nf <- c(floor(n/2), n - floor(n/2))
  indvec <- clcenters <- clusterings <- jclusterings <- classifications <- list()
  prederr <- list()
  dist <- as.matrix(distance)
  for (k in Gmin:Gmax) {
    prederr[[k]] <- numeric(0)
    for (l in 1:M) {
      nperm <- sample(n, n)
      indvec[[l]] <- list()
      indvec[[l]][[1]] <- nperm[1:nf[1]]
      indvec[[l]][[2]] <- nperm[(nf[1] + 1):n]
      for (i in 1:2) {
        clusterings[[i]] <- as.vector(pam(as.dist(dist[indvec[[l]][[i]], indvec[[l]][[i]]]), k, diss = TRUE))
        jclusterings[[i]] <- rep(-1, n)
        jclusterings[[i]][indvec[[l]][[i]]] <- clusterings[[i]]$clustering
        centroids <- clusterings[[i]]$medoids
        j <- 3 - i
        classifications[[j]] <- classifdist(as.dist(dist), jclusterings[[i]],
                                            method = classification, centroids = centroids,
                                            nnk = nnk)[indvec[[l]][[j]]]
      }
      ps <- matrix(0, nrow = 2, ncol = k)
      for (i in 1:2) {
        for (kk in 1:k) {
          nik <- sum(clusterings[[i]]$clustering == kk)
          if (nik > 1) {
            for (j1 in (1:(nf[i] - 1))[clusterings[[i]]$clustering[1:(nf[i] - 1)] == kk]) {
              for (j2 in (j1 + 1):nf[i])
                if (clusterings[[i]]$clustering[j2] == kk)
                  ps[i, kk] <- ps[i, kk] + (classifications[[i]][j1] == classifications[[i]][j2])
            }
            ps[i, kk] <- 2 * ps[i, kk] / (nik * (nik - 1))
          }
        }
      }
      prederr[[k]][l] <- mean(c(min(ps[1, ]), min(ps[2, ])))
    }
  }
  mean.pred <- numeric(0)
  if (Gmin > 1)
    mean.pred <- c(1)
  if (Gmin > 2)
    mean.pred <- c(mean.pred, rep(NA, Gmin - 2))
  for (k in Gmin:Gmax) mean.pred <- c(mean.pred, mean(prederr[[k]]))
  optimalk <- max(which(mean.pred > cutoff))
  out <- list(predcorr = prederr, mean.pred = mean.pred, optimalk = optimalk,
              cutoff = cutoff, method = clusterings[[1]]$clustermethod,
              Gmax = Gmax, M = M)
  class(out) <- "predstr"
  out
}
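A hypothetical usage sketch (not part of the original answer): pass an actual dissimilarity object to the hacked function, for example one built from the iris subset used above.

# Assumed usage: the function takes a distance/dissimilarity object directly
library(fpc)      # classifdist()
library(cluster)  # pam(), daisy()
iriss <- iris[sample(150, 20), -5]
d <- daisy(iriss, metric = "gower")   # or dist(iriss)
ps <- pred(d, Gmin = 2, Gmax = 3, M = 3)
ps$mean.pred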
