optimise algorithm for building a graph based on node weights - r

I am trying to improve a function that builds a network based on a score calculated from some node attributes. The function tries to find the best subnetwork of a graph by maximizing the product of the nodes' attributes.
The function starts at a random node and searches among its first-degree neighbours; if there are neighbours whose node score satisfies a threshold, the neighbour(s) are added to the first node, and the process continues until no more nodes are added (i.e., adding a neighbour no longer produces the desired increase in the score). If no first-degree neighbour yields an increase in the score, the function then looks at the second-degree neighbours. In that situation it is very likely that there are several paths connecting the node (the 2nd-degree neighbour); in this specific case, the chosen path is the shortest one with the highest weight (one of the node attributes).
I could parallelize some of the code, although I don't know how to implement that for this type of function.
The function is the following:
build_network <- function(G, seed, d = 2) {
  net <- G

  score.fun <- function(g) {
    Za <- sum(V(g)$weight * V(g)$RWRNodeweight) / sqrt(sum(V(g)$RWRNodeweight^2))
    k <- vcount(g)
    # genesets.length.null.stat is a list with the median of Za and sd of Za
    # calculated for 1000 replicates of networks of size k
    tmp <- genesets.length.null.stat[[as.character(k)]]
    Sa <- (Za - tmp[1]) / tmp[2]
  }

  best.fun <- function(in.nodes, out.nodes) {
    score <- -Inf; best <- character()
    for (node in out.nodes) {
      subG.update <- induced.subgraph(net, c(in.nodes, node))
      if (score.fun(subG.update) > score) {
        score <- score.fun(subG.update)
        best <- node
      }
    }
    list("node" = best, "score" = score)
  }

  subG <- induced.subgraph(net, seed)
  if (!is.connected(subG)) { # the seed must be connected
    stop("Input seeds are disjoint")
  }

  while (TRUE) {
    in.nodes <- V(subG)$name
    node_num <- vcount(subG)
    subsum <- score.fun(subG)
    #subx <- V(subG)$name
    for (rad in 1:d) {
      tmp.neigh <- unlist(neighborhood(net, order = rad, nodes = V(subG)$name))
      pot.nodes <- V(net)[tmp.neigh]$name
      out.nodes <- setdiff(pot.nodes, in.nodes)
      if (length(out.nodes) == 0) break
      best_node <- best.fun(in.nodes, out.nodes)
      new_score <- best_node$score
      best_node <- best_node$node
      if (new_score > subsum + 0.01) {
        # node2treePath is a function to retrieve the shortest path with the highest node weights
        tmp <- unlist(lapply(best_node, function(x) node2treePath(net, V(subG)$name, x)))
        in.nodes <- c(tmp, V(subG)$name)
        subG <- induced.subgraph(net, in.nodes)
        break
      }
    }
    if (node_num == vcount(subG)) break
  }
  return(subG)
}
I am trying to apply this function to a graph of ~10,000 nodes. Here is an approximation of the code for running the function:
### generate some example data
library(igraph)
my_graph <- erdos.renyi.game(10000, 0.0003)
V(my_graph)$name <- 1:vcount(my_graph)
V(my_graph)$weight <- rnorm(10000)
V(my_graph)$RWRNodeweight <- runif(10000, min=0, max=0.05)
### Run the function
sublist <- list()
for (node in V(my_graph)$name) {
  subnet <- build_network(my_graph, node, d = 2)
  sublist[[node]] <- subnet
}
EDIT: here is the dput of head(genesets.length.null.stat)
structure(list(`1` = c(1.01397367504035, 1.18858228819048), `2` = c(1.61970348041337, 1.30189433386605), `3` = c(2.11767222957028, 1.36222065695878), `4` = c(2.47710421934929, 1.36968129959296), `5` = c(2.776011866622, 1.36318885187196), `6` = c(3.16885126246671, 1.42577861995897)), .Names = c("1", "2", "3", "4", "5", "6"))
Here is the node2treePath function:
node2treePath <- function(G, Tnodes, node) {
  tmp.path <- get.all.shortest.paths(G, node, Tnodes)$res
  tmp.l <- unlist(lapply(tmp.path, length))
  index <- which(tmp.l == min(tmp.l))
  tmp.path <- tmp.path[index]
  tmp.sum <- unlist(lapply(tmp.path, function(x) sum(V(G)[x]$weight)))
  index <- which(tmp.sum == max(tmp.sum))
  selected.path <- tmp.path[index]
  collect <- unlist(lapply(selected.path, function(x) V(G)[x]$name))
  return(collect)
}

For the logic you want (and I imagine you may wish to change it in a way incompatible with the above answers), the following code is about 30% faster. I used Rprof and profr and recoded some slow bits in trivial ways, e.g. returning an anonymous pair instead of a named list pair from one of your functions. The numerically named list of value pairs for genesets.length.null.stat is very inefficient; I replaced it with two numeric vectors. You also call the 'V' function a lot, which was a big time consumer: as you can see, you can call it once and then query the result as needed.
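For example, the genesets.length.null.stat list can be flattened once into the two numeric vectors used below (a small sketch, assuming the list contains an entry for every subnetwork size k that occurs):
# split the list of c(median, sd) pairs into two plain numeric vectors indexed by k
genesets_jack_a <- vapply(genesets.length.null.stat, `[`, numeric(1), 1)  # medians of Za
genesets_jack_b <- vapply(genesets.length.null.stat, `[`, numeric(1), 2)  # sds of Za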
# node2treePath is a function to retrieve the shortest path with the highest node weights
node2treePath_jw <- function(G, Tnodes, node){
tmp.path <- get.all.shortest.paths(G, node, Tnodes)$res
tmp.l <- vapply(tmp.path, length, integer(1))
index <- which(tmp.l == min(tmp.l))
tmp.path = tmp.path[index]
Vg <- V(G)
tmp.sum <- vapply(tmp.path, function(x) sum(Vg[x]$weight), numeric(1))
index <- which(tmp.sum == max(tmp.sum))
selected.path = tmp.path[index]
sapply(selected.path, function(x) Vg[x]$name)
}
build_network_jw <- function(net, seed, d= 2){
score.fun <- function(Vg, k){
Za <- sum(Vg$weight * Vg$RWRNodeweight) / sqrt(sum(Vg$RWRNodeweight^2))
(Za - genesets_jack_a[k]) / genesets_jack_b[k]
}
best.fun_jw <- function(in.nodes, out.nodes) {
score <- (-Inf)
best <- character()
for (node in out.nodes) {
subG.update <- induced.subgraph(net, c(in.nodes,node))
Vsgu <- V(subG.update)
Vsgu_count <- vcount(subG.update)
sf <- score.fun(Vsgu, Vsgu_count)
if (sf > score) {
score <- sf
best <- node
}
}
list(best, score)
}
subG <- induced.subgraph(net, seed)
if (!is.connected(subG)) { #the seed must be connected
stop("Input seeds are disjoint")
}
while (TRUE) {
VsubG <- V(subG)
Vnet <- V(net)
in.nodes <- VsubG$name
node_num <- vcount(subG)
subsum <- score.fun(VsubG, node_num)
for (rad in 1:d) { # d = 2
tmp.neigh <- unlist(neighborhood(net, order = rad, nodes = VsubG$name))
pot.nodes <- Vnet[tmp.neigh]$name
out.nodes <- setdiff(pot.nodes, in.nodes)
if (length(out.nodes) == 0) break
best_node <- best.fun_jw(in.nodes, out.nodes)
new_score <- best_node[[2]]
best_node <- best_node[[1]]
if (new_score > subsum + 0.01) {
tmp <- sapply(best_node, function(x) node2treePath_jw(net, VsubG$name, x))
in.nodes <- c(tmp, VsubG$name)
subG <- induced.subgraph(net, in.nodes)
break
}
}
if (node_num == vcount(subG)) break
}
subG
}
node2treePath <- function (G, Tnodes, node){
tmp.path <- get.all.shortest.paths(G, node, Tnodes)$res
tmp.l <- unlist(lapply(tmp.path, length))
index <- which(tmp.l == min(tmp.l))
tmp.path = tmp.path[index]
tmp.sum <- unlist(lapply(tmp.path, function(x)return(sum(V(G)[x]$weight))))
index <- which(tmp.sum == max(tmp.sum))
selected.path = tmp.path[index]
collect <- unlist(lapply(selected.path, function(x)return(V(G)[x]$name)))
return(collect)
}
build_network <- function (net, seed, d= 2){
#genesets.length.null.stat <- structure(list(`1` = c(1.01397367504035, 1.18858228819048), `2` = c(1.61970348041337, 1.30189433386605), `3` = c(2.11767222957028, 1.36222065695878), `4` = c(2.47710421934929, 1.36968129959296), `5` = c(2.776011866622, 1.36318885187196), `6` = c(3.16885126246671, 1.42577861995897)), .Names = c("1", "2", "3", "4", "5", "6"))
genesets.length.null.stat <- lapply(1:500, function(x) c(runif(1)+x, runif(1)+x))
names(genesets.length.null.stat) <- 1:500
score.fun<-function(g){
Za <- sum(V(g)$weight*V(g)$RWRNodeweight)/sqrt(sum(V(g)$RWRNodeweight^2))
k <- vcount(g)
tmp <- genesets.length.null.stat[[as.character(k)]] # genesets.length.null.stat is a list with the median of Za and sd of Za calculated for 1000 replicates of networks of size k
Sa <- (Za-tmp[1])/tmp[2]
}
best.fun <- function(in.nodes,out.nodes) {
score<-(-Inf); best<-character()
for (node in out.nodes){
subG.update<-induced.subgraph(net, c(in.nodes,node))
if (score.fun(subG.update) > score) {
score<-score.fun(subG.update)
best<-node
}
}
list("node"=best,"score"=score)
}
subG <- induced.subgraph(net, seed)
if (!is.connected(subG)) { #the seed must be connected
stop("Input seeds are disjoint")
}
while (TRUE) {
in.nodes <- V(subG)$name
node_num <- vcount(subG)
subsum <- score.fun(subG)
#subx <- V(subG)$name
for (rad in 1:d) {
tmp.neigh <- unlist(neighborhood(net, order = rad, nodes = V(subG)$name))
pot.nodes <- V(net)[tmp.neigh]$name
out.nodes <- setdiff(pot.nodes, in.nodes)
if (length(out.nodes) == 0) break
#message("length in.nodes = ", length(in.nodes))
#message("length out.nodes = ", length(out.nodes))
best_node<-best.fun(in.nodes, out.nodes)
new_score<-best_node$score
best_node<-best_node$node
if (new_score > subsum + 0.01) {
tmp <- unlist(lapply(best_node, function(x) node2treePath(net,V(subG)$name, x))) # node2treePath is a function to retrieve the shortest path with the highest node weights
in.nodes <- c(tmp, V(subG)$name)
subG <- induced.subgraph(net, in.nodes)
break
}
}
if (node_num == vcount(subG)) break
}
subG
}
library(igraph)
library(profr)
#genesets.length.null.stat <- lapply(1:500, function(x) c(runif(1)+x, runif(1)+x))
#names(genesets.length.null.stat) <- 1:500
set.seed(1)
genesets_jack_a = runif(500) + 1:500
genesets_jack_b = runif(500) + 1:500
do_it_jw <- function(n = 1000){
my_graph <- erdos.renyi.game(n, 0.0003)
V(my_graph)$name <- 1:vcount(my_graph)
V(my_graph)$weight <- rnorm(n)
V(my_graph)$RWRNodeweight <- runif(n, min = 0, max = 0.05)
### Run the function
sublist = list()
Vmg <- V(my_graph)
for (node in Vmg$name) {
#message(node)
subnet <- build_network_jw(my_graph, node, 2)
sublist[[node]] <- subnet }
}
do_it <- function(n = 1000){
my_graph <- erdos.renyi.game(n, 0.0003)
V(my_graph)$name <- 1:vcount(my_graph)
V(my_graph)$weight <- rnorm(n)
V(my_graph)$RWRNodeweight <- runif(n, min = 0, max = 0.05)
### Run the function
sublist = list()
Vmg <- V(my_graph)
for (node in Vmg$name) {
#message(node)
subnet <- build_network(my_graph, node, 2)
sublist[[node]] <- subnet }
}
library(microbenchmark)
mb <- microbenchmark(do_it(1000), do_it_jw(1000), times = 5)
print(mb)

Since your score function only depends on node attributes and not edge attributes, the solution is not unique; you might want to search for a best tree instead. If you restructure your problem so that your nodes are edges and vice versa, you can probably just use e.g. Dijkstra's algorithm to find the best one. That is already in the igraph package as shortest.paths().
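As a rough illustration of that idea (a sketch, not the poster's method): one simple way to let shortest.paths() take node weights into account is to push each node's weight onto its incident edges. Note that Dijkstra's algorithm requires non-negative weights, so the weights below are drawn from runif rather than rnorm.
library(igraph)
g <- erdos.renyi.game(100, 0.05)
V(g)$weight <- runif(100)
# charge every edge the weight of its second endpoint, so a path's cost tracks the node weights along it
ep <- ends(g, E(g))
E(g)$weight <- V(g)$weight[ep[, 2]]
shortest.paths(g, v = 1, weights = E(g)$weight)  # weighted distances from node 1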

I can't read the R code, but based on your description: If the score threshold is constant, then this is easy to do in O(|V|+|E|+|C|^2) time, where |C| is the number of "good" components (this will be further explained shortly).
In a first pass, delete all nodes with score below the threshold. Then find all connected components in this new graph (this can be done in O(|V|+|E|) time by starting a DFS at each as-yet-unvisited node), calculate their scores by multiplying together all vertex weights in the component, and label each vertex with its component ID. This already tells you the "good" components -- the ones that don't require any 2nd-degree connections.
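A minimal igraph sketch of that first pass, assuming a constant threshold and a vertex attribute weight holding the score (the function name good_components is just for illustration):
library(igraph)
good_components <- function(G, threshold) {
  keep <- V(G)[V(G)$weight >= threshold]                  # pass 1: delete nodes below the threshold
  Gp   <- induced_subgraph(G, keep)
  comp <- components(Gp)                                  # connected components in O(|V|+|E|)
  scores <- tapply(V(Gp)$weight, comp$membership, prod)   # product of vertex weights per component
  list(membership = comp$membership, scores = scores)
}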
Suppose this produces |C| components. Create an empty hashtable H which has component-ID pairs for keys, and (length, weight) pairs for values. Now go back through each vertex v you deleted in the first pass: for each one, look at all its neighbours and record the shortest edge to each distinct component (this can be done using a length-|C| array to store the shortest edge to each component seen so far).
After examining all of v's neighbours, count the number k of distinct components they fall into: if k >= 2, then v potentially should be used to connect some of these k(k-1)/2 pairs of components. For every pair of distinct components i and j that could be connected by v, update H with the weight and distance of this 2-edge connection as necessary: that is, if i and j are not yet joined together, then record that v joins them; otherwise, if they are already joined by some vertex u, only update H if v can do better (i.e., if it uses less total length and greater weight than u would).
This step can be thought of as building a minimum spanning tree in a "component graph" derived from the original, pruned graph. The scores for each new "combined" component can easily be calculated as you go just by multiplying together the scores of the two constituent components.
Finally, simply return the component whose product is maximum.

Related

Removing certain percentage of data from a list

I have many clusters of 3D points. Each cluster has an id. For every cluster id (for example 1), I am trying to remove the points that are farthest, in terms of distance, from the highest point in that cluster. For that I take the highest point in the cluster, get its x and y coordinates, and then run a loop to compare the distance between the other points in the cluster and that highest point. I store the calculated distance and the x, y coordinates in a
nested list and sort them. From the tail of the list I remove a certain percentage, and I keep repeating this for all the clusters. This is working for the most part, but for some clusters it removes the whole cluster instead of only a certain percentage, and I cannot find why.
The code:
library("lidR")
library("raster")
library('rlist')
library(Rcpp)
library(RMCC)
las <- readLAS("las.las")
st_crs(las) <- 32633
i = 1
ids = unique(las$treeID)
idl <- length(ids)
finallist <- list()
while (i <= idl){
las_dist <- filter_poi(las,treeID == i)
templist <- list()
las_x <- las_dist@data$X
xidl <- length(las_x)
las_y <- las_dist@data$Y
las_z <- las_dist@data$Z
zidx <- which.max(las_z)
j <- 1
while (j <= xidl){
x1 <- las_x[zidx]
y1 <- las_y[zidx]
z1<- las_z[zidx]
x2 <- las_x[j]
y2 <- las_y[j]
z2<- las_z[j]
euclidean <- function(x1,x2,y1,y2) sqrt((x1 - x2)^2 + (y1-y2)^2)
distance <- euclidean(x1,x2,y1,y2)
templist[[j]] <- list(x2,y2,distance)
j <- j + 1
}
thirdelt <- sapply(templist, function(x)x[[3]])
thirdelt_order <- order(thirdelt)
templist <- templist[thirdelt_order]
temp_length = length(templist)
removed_data = tail(templist,round(temp_length * 0.15))
finallist <- append(finallist,removed_data)
i = i + 1
}
xlist <- lapply(finallist, "[[", 1)
finallas <- filter_poi(las,!X %in% xlist)

search for the best number of k that minimizes the absolute value of the error between the mean of datas and the mean of frequency table

I wrote this function, which gives me as output a frequency table divided into k classes, the mean of the data, and the mean of the frequency table.
Data_Frame <- function(x, k) {
if(k<=1) {
print("insert a number greater than 1") }
else {
data_range <- range(x)
interval_width <- (max(data_range)-min(data_range))/k
cutting_values <- seq(from = min(data_range),
to = max(data_range),
by = interval_width)
lower_bounds <- cutting_values[1:k]
upper_bounds <- cutting_values[2:(k+1)]
counts <- numeric(length = k)
for (k in seq_along(counts)) {
counts[k] <- length(
x[which((x>=cutting_values[k]) & (x<=cutting_values[(k+1)]))])
}
DF <- data.frame(low.bounds = lower_bounds,
up.bounds = upper_bounds,
freq = counts)
Data_m <- mean(x)
DF_m <- sum((DF$low.bounds+DF$up.bounds)/2*DF$freq)/
sum(DF$freq)
result <- list(DF, Data_m = Data_m, DF_m = DF_m)
return(result)
}
}
### Code for using functions
set.seed(4321)
x <- rnorm(1000, 10, 2)
k <- 5L
result_function_1 <- Data_Frame(x, k)
print(result_function_1)
I have to write a second function which must search for the best number of classes k that minimizes the absolute value of the error between the mean of the data (Data_m) and the mean of the frequency table (DF_m). Starting from k = 2 up to k = max_K, I have to return the optimal k.
Could someone help me out?
thanks
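For what it's worth, a minimal sketch of that search, assuming the Data_Frame() function above (best_k and max_K are just illustrative names):
best_k <- function(x, max_K) {
  ks   <- 2:max_K
  errs <- sapply(ks, function(k) {
    res <- Data_Frame(x, k)
    abs(res$Data_m - res$DF_m)   # absolute error between the data mean and the frequency-table mean
  })
  ks[which.min(errs)]            # the k with the smallest error
}
best_k(x, 20)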

Optimize network range function

I found a blog post by Richard Paquin Morel on computing a network range function in R (Burt 1981; Reagans and McEvily 2003). The function assigns a value to each network node based on the number of its contacts and the interconnectedness of these nodes. The network range can be computed across subgroups of nodes (e.g. female and male nodes), which are stored as vertex attributes.
The author's example is very illustrative, but it is based on a relatively small network (about 100 nodes). I have a network with about 200,000 nodes, which means the performance of the function is not suited to my analyses.
I present an example where the size of a random Erdos-Renyi graph can be varied to time the performance of the function.
I am unfamiliar with optimizing R code, but I think the adjacency matrix needs to be stored more efficiently (e.g., as a sparse matrix). My attempts so far have not produced a working function.
rm(list=ls())
library(igraph)
library(statnet)
library(intergraph)
library(tictoc)
set.seed(42)
## Source: https://ramorel.github.io/network-range/
## Function to find network range for each node in a network
## Arguments:
## net = adjacency matrix, igraph graph, or network object
## attr = Vector of attributes associated with each node in net
## directed = boolean indicated if the network is directed or not
netrange <- function(net, attr, directed = TRUE){
require(reshape2)
if (class(net) == "igraph") {
net <- as_adjacency_matrix(net, sparse = F)
}
else {
if(class(net) == "network") {
net <- as.matrix.network(net)
}
else {
net <- as.matrix(net)
}
}
if(nrow(net) != length(attr)) {
stop("Number of nodes must match length of attributes vector")
}
else {
if (directed == TRUE){
ns <- colnames(net)
el <- melt(net, varnames=c("ego", "alter"), value.name = "weight")
df <- cbind(rownames(net), attr)
el$ego_grp <- df[match(el[,1], df[,1]), 2]
el$alter_grp <- df[match(el[,2], df[,1]), 2]
#FINDING p_k, the strength of ties within each group
# z_iq = sum of strength of ties from nodes in group _k_ to all other alters
# z_ij = sum of strength of ties from nodes in group _k_ to alters in group _k_
z_iq <- sapply(unique(attr), function(x) {
sum(el[which(el$ego_grp==x), "weight"])
})
z_ij <- sapply(unique(attr), function(x) {
sum(el[which(el$ego_grp==x & el$alter_grp==x), "weight"])
})
p_k <- z_ij / z_iq
p_k[is.na(p_k)] <- 0
#FINDING p_ik, the strength of connection from person i to group k
# x_iq = sum of strength of ties for _i_ to alters in group _k_
# x_ij = sum of strength of ties for _i_ to all alters
x_ij <- sapply(colnames(net), function(x) {
sum(el[which(el$ego==x), "weight"])
}
)
x_iq <- list(NULL)
for(i in colnames(net)) {
x_iq[[i]] <- sapply(unique(attr), function(x) {
sum(el[which(el$ego==i & el$alter_grp==x), "weight"])
}
)
}
x_iq <- x_iq[-c(1)] #x_iq is now a list where each elements is a vector of node _i_ summed strength of tie to group _k_
p_ik <- lapply(1:length(x_iq),
function(x) x_iq[[x]] / x_ij[x])
# FINDING nd_i, the network diversity score for node _i_
nd_i <- sapply(1:length(p_ik),
function(x) 1 - sum(p_k*p_ik[[x]]^2, na.rm = F)
)
}
else {
ns <- colnames(net)
el <- melt(net, varnames=c("ego", "alter"), value.name = "weight")
dup <- data.frame(t(apply(el[,1:2],1,sort)))
el <- el[!duplicated(dup),]
df <- cbind(rownames(net), attr)
el$ego_grp <- df[match(el[,1], df[,1]), 2]
el$alter_grp <- df[match(el[,2], df[,1]), 2]
#FINDING p_k, the strength of ties within each group
# z_iq = sum of strength of ties from nodes in group _k_ to all other alters
# z_ij = sum of strength of ties from nodes in group _k_ to alters in group _k_
z_iq <- sapply(unique(attr), function(x) {
sum(el[which(el$ego_grp==x | el$alter_grp==x), "weight"])
})
z_ij <- sapply(unique(attr), function(x) {
sum(el[which(el$ego_grp==x & el$alter_grp==x), "weight"])
})
p_k <- z_ij / z_iq
p_k[is.na(p_k)] <- 0
#FINDING p_ik, the strength of connection from person i to group k
# x_iq = sum of strength of ties for _i_ to alters in group _k_
# x_ij = sum of strength of ties for _i_ to all alters
x_ij <- sapply(colnames(net), function(x) {
sum(el[which(el$ego==x | el$alter==x), "weight"])
}
)
x_iq <- list(NULL)
for(i in colnames(net)) {
x_iq[[i]] <- sapply(unique(attr), function(x) {
sum(el[which(el$ego==i & el$alter_grp==x), "weight"],
el[which(el$alter==i & el$ego_grp==x), "weight"])
}
)
}
x_iq <- x_iq[-c(1)] #x_iq is now a list where each elements is a vector of node _i_ summed strength of tie to group _k_
p_ik <- lapply(1:length(x_iq),
function(x) x_iq[[x]] / x_ij[x])
# FINDING nd_i, the network diversity score for node _i_
nd_i <- sapply(1:length(p_ik),
function(x) 1 - sum(p_k*p_ik[[x]]^2, na.rm = F)
)
}
return(nd_i)
}
}
# Generate exemplary network
g <- igraph::erdos.renyi.game(1000, 150, type = "gnm")
## Add categorical (binary) vertex feature: female
V(g)$female <- sample(c(0,1), replace=TRUE, size=length(V(g)))
V(g)$female
## transform igraph to statnet
net <- intergraph::asNetwork(g)
## Apply network function
tic()
range_female <- netrange(net,
net %v% "female",
directed = T)
seq_time <- toc()
The best thing for optimizing is profiling, using profvis, which shows where your bottlenecks are. I only looked at part of your function, though.
There we saw that x_ij <- sapply(colnames(net), function(x) { sum(el[which(el$ego==x | el$alter==x), "weight"]) }) takes terribly long.
I prefer data.table myself, so I just provide the code here to speed up that part; it takes only a few seconds. Try the same for creating x_iq.
library(data.table)
el <- as.data.table(el)   # the melted edge list built inside netrange()
xx_ij <- el[, .(xxij = lapply(.SD, sum)), by = c("ego"), .SDcols = c("weight")]
xx_ij2 <- xx_ij$xxij
names(xx_ij2) <- xx_ij$ego
identical(x_ij, unlist(xx_ij2))
# TRUE
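Along the same lines, a hedged sketch of the analogous aggregation for x_iq (it assumes el already has the ego_grp and alter_grp columns added inside netrange()):
# summed tie strength from each ego to each alter group, in one grouped aggregation
xx_iq <- el[, .(w = sum(weight)), by = .(ego, alter_grp)]
head(xx_iq)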
Profiling:
library(profvis)
p <- profvis({
  ## code here
})
f <- paste0("profile_", Sys.Date(), ".html")
htmlwidgets::saveWidget(p, f)
browseURL(f)

How to implement the jaccard distance in kproto function

I am trying to implement the Jaccard distance in the kproto function (package clustMixType in R), but without any success. The aim is to do a cluster analysis of my dataset.
The Jaccard distance I want to use is the complement of the Jaccard similarity coefficient, so
Jaccard distance = 1-[a/(a+b+c)] = [(b+c)/(a+b+c)], or
Jaccard distance = 1-[M11/(M01+M10+M11)] = [(M01+M10)/(M01+M10+M11)].
The source code of the kproto function is presented below. The variable d1 is the Euclidean distance for the numeric variables and the variable d2 is the distance from the simple matching coefficient for the categorical variables (as factors).
It computes the distances between the observations and the prototypes, not between observations. Prototypes are calculated, and are not themselves observations of the data set.
So my two questions are:
1) d2 is what I want to modify, but how?
2) should d1 be the sqrt of what is being calculated?
Thank you for all your help. It will be much appreciated.
Here is an excerpt of the dataset I'm working on, where V1 to V4 are factor (binary) variables (to use the Jaccard distance) and V5 to V8 are numeric variables (to use the Euclidean distance):
V1;V2;V3;V4;V5;V6;V7;V8
1;1;0;0;6;2;3;3
0;1;0;1;3;5;2;1
1;1;0;0;1;3;2;1
1;1;0;0;4;3;3;1
1;1;1;0;1;4;1;1
1;0;1;0;4;3;1;1
1;1;0;0;2;4;2;1
1;1;0;0;2;4;2;1
1;1;0;0;6;2;1;1
1;1;0;0;6;2;2;1
1;1;0;0;5;2;3;1
1;1;0;0;4;3;3;1
1;1;0;0;4;4;2;1
1;1;0;0;4;3;3;1
1;1;0;0;4;3;3;1
1;1;0;0;3;4;2;1
1;1;0;0;4;3;2;1
1;1;0;0;5;2;3;1
1;1;0;0;4;3;4;1
1;1;0;0;4;3;2;1
1;1;0;0;4;3;2;1
1;1;0;0;3;3;2;1
1;1;0;0;3;3;3;1
1;1;0;0;5;2;3;1
1;1;0;0;5;2;3;1
1;1;0;0;5;2;2;1
1;1;0;0;3;3;2;1
1;1;0;0;5;2;3;1
1;1;0;0;5;2;2;1
1;0;0;0;3;4;2;1
1;1;0;0;7;2;4;1
1;1;0;0;7;2;2;1
1;1;0;0;5;2;4;1
1;1;0;0;5;3;4;1
1;1;0;0;5;3;2;1
1;1;0;0;5;3;4;1
1;0;0;0;3;5;3;1
0;1;0;0;6;2;4;1
1;1;0;0;6;2;3;1
1;1;0;0;6;2;4;1
Let's take the first two observations from the dataset provided above as an example:
V1;V2;V3;V4;V5;V6;V7;V8
1;1;0;0;6;2;3;3
0;1;0;1;3;5;2;1
The algorithm first selects the k prototypes from the data set randomly, so let's suppose that the second observation is one of the initial prototypes. As I understood it, the algorithm creates a data frame called "protos", initially with k random observations from the data set provided, so the second observation would be one of the rows of the "protos" data frame.
The combined distance used to cluster the observations is d = d1 + lambda*d2. Lambda can also be a vector of individual weights for each variable. d is the distance between the observations in the data set provided and the "protos" matrix created initially with k random observations.
In this case, considering the first two observations presented, the calculated distances between the observation (yi) and the prototype (yk) would be as follows:
Euclidean, for the numeric variables (V5 to V8):
d1 = sum[(y_ij - y_kj)^2]^0.5
where,
k = 1 to k clusters
i = 1 to n observations
j = 5th to 8th variable
d1 = [[(6-3)^2]+[(2-5)^2]+[(3-2)^2]+[(3-1)^2]]^0.5
d1 = [9+9+1+4]^0.5
d1 = 4.796
Jaccard, for the set of binary variables (V1 to V4):
d2=[(b+c)/(a+b+c)]
where,
a=1
b=1
c=1
are the agreement/mismatch counts between the observation and the prototype for variables V1 to V4 (a = both 1; b and c = the two kinds of mismatch; joint 0-0 matches are ignored).
d2=[(1+1)/(1+1+1)]
d2=2/3
d2=0.667
So the combined distance between this specific observation and the initial prototype of that cluster is:
d=d1+d2
d=4.796+0.667
d=5.463
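For what it's worth, the worked example can be reproduced with a few lines of R (a small sketch; the jaccard_dist helper is only for illustration):
# observation and prototype from the example above
obs   <- c(V1 = 1, V2 = 1, V3 = 0, V4 = 0, V5 = 6, V6 = 2, V7 = 3, V8 = 3)
proto <- c(V1 = 0, V2 = 1, V3 = 0, V4 = 1, V5 = 3, V6 = 5, V7 = 2, V8 = 1)
# Euclidean distance over the numeric variables V5 to V8
d1 <- sqrt(sum((obs[5:8] - proto[5:8])^2))          # 4.796
# Jaccard distance over the binary variables V1 to V4 (joint 0-0 matches are ignored)
jaccard_dist <- function(x, y) {
  a  <- sum(x == 1 & y == 1)   # 1-1 agreements
  bc <- sum(x != y)            # mismatches (b + c)
  bc / (a + bc)
}
d2 <- jaccard_dist(obs[1:4], proto[1:4])            # 0.667
d1 + d2                                             # 5.463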
The results, as I understood, are then stored row by row in a matrix called "d", of size [number of rows = number of observations, number of columns = number of clusters k].
I'm expecting to correctly calculate the Euclidean and Jaccard distances by modifying the kproto function, while maintaining the steps and results provided by the original function.
NOTE: the final function should work on any number of observations, variables and prototypes, and not only on my specific dataset.
I've also tried to mix the codes from kproto (clustMixType package) and dist.binary (ade4 package), but they work in different ways.
#K-Prototypes algorithm
kproto.default <- function(x, k, lambda = NULL, iter.max = 100, nstart = 1, na.rm = TRUE, keep.data = TRUE, verbose = TRUE, ...){
# initial error checks
if(!is.data.frame(x)) stop("x should be a data frame!")
if(ncol(x) < 2) stop("For clustering x should contain at least two variables!")
if(iter.max < 1 | nstart < 1) stop("iter.max and nstart must not be specified < 1!")
if(!is.null(lambda)){
if(any(lambda < 0)) stop("lambda must be specified >= 0!")
if(!any(lambda > 0)) stop("lambda must be specified > 0 for at least one variable!")
}
# check for numeric and factor variables
numvars <- sapply(x, is.numeric)
anynum <- any(numvars)
catvars <- sapply(x, is.factor)
anyfact <- any(catvars)
if(!anynum) stop("\n No numeric variables in x! Try using kmodes() from package klaR...\n\n")
if(!anyfact) stop("\n No factor variables in x! Try using kmeans()...\n\n")
# treatment of missings
NAcount <- apply(x, 2, function(z) sum(is.na(z)))
if(verbose){
cat("# NAs in variables:\n")
print(NAcount)
}
if(any(NAcount == nrow(x))) stop(paste("Variable(s) have only NAs please remove them:",names(NAcount)[NAcount == nrow(x)],"!"))
if(na.rm) {
miss <- apply(x, 1, function(z) any(is.na(z)))
if(verbose){
cat(sum(miss), "observation(s) with NAs.\n")
if(sum(miss) > 0) message("Observations with NAs are removed.\n")
cat("\n")
}
x <- x[!miss,]
} # remove missings
if(!na.rm){
allNAs <- apply(x,1,function(z) all(is.na(z)))
if(sum(allNAs) > 0){
if(verbose) cat(sum(allNAs), "observation(s) where all variables NA.\n")
warning("No meaningful cluster assignment possible for observations where all variables NA.\n")
if(verbose) cat("\n")
}
}
if(nrow(x) == 1) stop("Only one observation clustering not meaningful.")
k_input <- k # store input k for nstart > 1 as clusters can be merged
# initialize prototypes
if(!is.data.frame(k)){
if (length(k) == 1){
if(as.integer(k) != k){k <- as.integer(k); warning(paste("k has been set to", k,"!"))}
if(nrow(x) < k) stop("Data frame has less observations than clusters!")
ids <- sample(nrow(x), k)
protos <- x[ids,]
}
if (length(k) > 1){
if(nrow(x) < length(k)) stop("Data frame has less observations than clusters!")
ids <- k
k <- length(ids)
if(length(unique(ids)) != length(ids)) stop("If k is specified as a vector it should contain different indices!")
if(any(ids<1)|any(ids>nrow(x))) stop("If k is specified as a vector all elements must be valid indices of x!")
#check for integer
protos <- x[ids,]
}
rm(ids)
}
if(is.data.frame(k)){
if(nrow(x) < nrow(k)) stop("Data frame has less observations than clusters!")
if(length(names(k)) != length(names(x))) stop("k and x have different numbers of columns!")
if(any(names(k) != names(x))) stop("k and x have different column names!")
if(anynum) {if( any(sapply(k, is.numeric) != numvars)) stop("Numeric variables of k and x do not match!")}
if(anyfact) {if( any(sapply(k, is.factor) != catvars)) stop("Factor variables of k and x do not match!")}
protos <- k
k <- nrow(protos)
}
if(k < 1) stop("Number of clusters k must not be smaller than 1!")
# automatic calculation of lambda
if(length(lambda) > 1) {if(length(lambda) != sum(c(numvars,catvars))) stop("If lambda is a vector, its length should be the sum of numeric and factor variables in the data frame!")}
if(is.null(lambda)){
if(anynum & anyfact){
vnum <- mean(sapply(x[,numvars, drop = FALSE], var, na.rm = TRUE))
vcat <- mean(sapply(x[,catvars, drop = FALSE], function(z) return(1-sum((table(z)/sum(!is.na(z)))^2))))
if (vnum == 0){
if(verbose) warning("All numerical variables have zero variance.")
anynum <- FALSE
}
if (vcat == 0){
if(verbose) warning("All categorical variables have zero variance.")
anyfact <- FALSE
}
if(anynum & anyfact){
lambda <- vnum/vcat
if(verbose) cat("Estimated lambda:", lambda, "\n\n")
}else{
lambda <- 1
}
}
}
# initialize clusters
clusters <- numeric(nrow(x))
tot.dists <- NULL
moved <- NULL
iter <- 1
# check for any equal prototypes and reduce cluster number in case of occurence
if(k > 1){
keep.protos <- rep(TRUE,k)
for(l in 1:(k-1)){
for(m in (l+1):k){
d1 <- sum((protos[l,numvars, drop = FALSE]-protos[m,numvars, drop = FALSE])^2) # euclidean for numerics
d2 <- sum(protos[l,catvars, drop = FALSE] != protos[m,catvars, drop = FALSE]) # wtd simple matching for categorics
if((d1+d2) == 0) keep.protos[m] <- FALSE
}
}
if(!all(keep.protos)){
protos <- protos[keep.protos,]
k <- sum(keep.protos)
if(verbose) message("Equal prototypes merged. Cluster number reduced to:", k, "\n\n")
}
}
# special case only one cluster
if(k == 1){clusters <- rep(1, nrow(x)); size <- table(clusters); iter <- iter.max} # REM: named vector size is needed later...
# start iterations for standard case (i.e. k > 1)
while(iter < iter.max){
# compute distances
nrows <- nrow(x)
dists <- matrix(NA, nrow=nrows, ncol = k)
for(i in 1:k){
#a0 <- proc.time()[3]
#d1 <- apply(x[,numvars],1, function(z) sum((z-protos[i,numvars])^2)) # euclidean for numerics
d1 <- (x[,numvars, drop = FALSE] - matrix(rep(as.numeric(protos[i, numvars, drop = FALSE]), nrows), nrow=nrows, byrow=T))^2
if(length(lambda) == 1) d1 <- rowSums(d1, na.rm = TRUE)
if(length(lambda) > 1) d1 <- as.matrix(d1) %*% lambda[numvars]
#a1 <- proc.time()[3]
#d2 <- lambda * apply(x[,catvars],1, function(z) sum((z != protos[i,catvars]))) # wtd simple matching for categorics
d2 <- sapply(which(catvars), function(j) return(x[,j] != rep(protos[i,j], nrows)) )
d2[is.na(d2)] <- FALSE
if(length(lambda) == 1) d2 <- lambda * rowSums(d2)
if(length(lambda) > 1) d2 <- as.matrix(d2) %*% lambda[catvars]
#a2 <- proc.time()[3]
dists[,i] <- d1 + d2
#cat(a1-a0, a2-a1, "\n")
}
# assign clusters
old.clusters <- clusters
# clusters <- apply(dists, 1, function(z) which.min(z))
clusters <- apply(dists, 1, function(z) {a <- which.min(z); if (length(a)>1) a <- sample(a,1); return(a)}) # sample in case of multiple minima
size <- table(clusters)
min.dists <- apply(cbind(clusters, dists), 1, function(z) z[z[1]+1])
within <- as.numeric(by(min.dists, clusters, sum))
tot.within <- sum(within)
# prevent from empty classes
#tot.within <- numeric(k)
#totw.list <- by(min.dists, clusters, sum)
#tot.within[names(totw.list)] <- as.numeric(totw.list)
# ...check for empty clusters and eventually reduce number of prototypes
if (length(size) < k){
k <- length(size)
protos <- protos[1:length(size),]
if(verbose) cat("Empty clusters occur. Cluster number reduced to:", k, "\n\n")
}
# trace
tot.dists <- c(tot.dists, sum(tot.within))
moved <- c(moved, sum(clusters != old.clusters))
# compute new prototypes
remids <- as.integer(names(size))
for(i in remids){
protos[which(remids == i), numvars] <- sapply(x[clusters==i, numvars, drop = FALSE], mean, na.rm = TRUE)
protos[which(remids == i), catvars] <- sapply(x[clusters==i, catvars, drop = FALSE], function(z) levels(z)[which.max(table(z))])
}
if(k == 1){clusters <- rep(1, length(clusters)); size <- table(clusters); iter <- iter.max; break}
# check for any equal prototypes and reduce cluster number in case of occurence
if(iter == (iter.max-1)){ # REM: for last iteration equal prototypes are allowed. otherwise less prototypes than assigned clusters.
keep.protos <- rep(TRUE,k)
for(l in 1:(k-1)){
for(m in (l+1):k){
d1 <- sum((protos[l,numvars, drop = FALSE]-protos[m,numvars, drop = FALSE])^2) # euclidean for numerics
d2 <- sum(protos[l,catvars, drop = FALSE] != protos[m,catvars, drop = FALSE]) # wtd simple matching for categorics
if((d1+d2) == 0) keep.protos[m] <- FALSE
}
}
if(!all(keep.protos)){
protos <- protos[keep.protos,]
k <- sum(keep.protos)
if(verbose) cat("Equal prototypes merged. Cluster number reduced to:", k, "\n\n")
}
}
# add stopping rules
if(moved[length(moved)] == 0) break
if(k == 1){clusters <- rep(1, length(clusters)); size <- table(clusters); iter <- iter.max; break}
#cat("iter", iter, "moved", moved[length(moved)], "tot.dists",tot.dists[length(tot.dists)],"\n" )
iter <- iter+1
}
### Final update of prototypes and dists
if(iter == iter.max){ # otherwise there have been no moves anymore and prototypes correspond to cluster assignments
# compute new prototypes
remids <- as.integer(names(size))
for(i in remids){
protos[which(remids == i), numvars] <- sapply(x[clusters==i, numvars, drop = FALSE], mean, na.rm = TRUE)
protos[which(remids == i), catvars] <- sapply(x[clusters==i, catvars, drop = FALSE], function(z) levels(z)[which.max(table(z))])
}
# compute distances
nrows <- nrow(x)
dists <- matrix(NA, nrow=nrows, ncol = k)
for(i in 1:k){
d1 <- (x[,numvars, drop = FALSE] - matrix(rep(as.numeric(protos[i, numvars, drop = FALSE]), nrows), nrow=nrows, byrow=T))^2
if(length(lambda) == 1) d1 <- rowSums(d1, na.rm = TRUE)
if(length(lambda) > 1) d1 <- as.matrix(d1) %*% lambda[numvars]
d2 <- sapply(which(catvars), function(j) return(x[,j] != rep(protos[i,j], nrows)) )
d2[is.na(d2)] <- FALSE
if(length(lambda) == 1) d2 <- lambda * rowSums(d2)
if(length(lambda) > 1) d2 <- as.matrix(d2) %*% lambda[catvars]
dists[,i] <- d1 + d2
}
size <- table(clusters)
min.dists <- apply(cbind(clusters, dists), 1, function(z) z[z[1]+1])
within <- as.numeric(by(min.dists, clusters, sum))
tot.within <- sum(within)
}
names(clusters) <- row.names(dists) <- row.names(x)
rownames(protos) <- NULL
# create result:
res <- list(cluster = clusters,
centers = protos,
lambda = lambda,
size = size,
withinss = within,
tot.withinss = tot.within,
dists = dists,
iter = iter,
trace = list(tot.dists = tot.dists, moved = moved))
# loop: if nstart > 1:
if(nstart > 1)
for(j in 2:nstart){
res.new <- kproto(x=x, k=k_input, lambda = lambda, iter.max = iter.max, nstart=1, verbose=verbose)
if(res.new$tot.withinss < res$tot.withinss) res <- res.new
}
if(keep.data) res$data = x
class(res) <- "kproto"
return(res)
}
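Regarding question 1, one hedged way to proceed is to replace the simple-matching block inside the per-prototype distance loop (the lines that build d2 above) with a Jaccard computation. This is only a sketch, not the package's own code, and it assumes every factor variable is binary with levels "0" and "1". Note also that, as the code above shows, d1 is the sum of squared differences without a square root (question 2), so taking the sqrt is a separate choice.
# hypothetical replacement for the d2 block inside kproto.default()'s distance loop
x_bin <- sapply(which(catvars), function(j) x[, j] == "1")        # nrows x (number of factor variables)
p_bin <- sapply(which(catvars), function(j) protos[i, j] == "1")  # one logical per factor variable
p_bin <- matrix(p_bin, nrow = nrows, ncol = length(p_bin), byrow = TRUE)
a  <- rowSums(x_bin & p_bin, na.rm = TRUE)                 # 1-1 agreements
bc <- rowSums(x_bin != p_bin, na.rm = TRUE)                # mismatches (b + c)
d2 <- ifelse(a + bc == 0, 0, bc / (a + bc))                # Jaccard distance per observation
if (length(lambda) == 1) d2 <- lambda * d2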
I've managed to modify the functions to accept a variety of similarity measures and uploaded the R file at http://dx.doi.org/10.17632/63nyn9tjcd.1, in case someone needs it.

k-means clustering with constraint based on the node values

Maybe I'm missing something as this seems to be a simple problem, but I looked this up online and haven't found anything in the literature.
Basically what I need to do is to do a clustering of a set of destination cities based on their location (so latitude/longitude as features of each node, Euclidean distances for the similarity metric), with fixed number of clusters. All seems good and a k-means would do the trick. However, I have the following constraint for each cluster: Every city (node) has a corresponding value assigned to it, and the sum of these values in each cluster should not exceed a fixed threshold (same threshold for all clusters). Is there an easy way to do so?
You have 2 options:
- You could instead use rpart for clustering, with weights and the minbucket option. However, the clusters that predict() will give you will be rectangular.
- You could have a look at the source code, which I found at
https://searchcode.com/codesearch/view/18689414/ :
kmeans <-
function(x, centers, iter.max = 10, nstart = 1,
algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"))
{
do_one <- function(nmeth) {
Z <-
switch(nmeth,
{ # 1
Z <- .Fortran(C_kmns, x, m, p,
centers = centers,
as.integer(k), c1 = integer(m), integer(m),
nc = integer(k), double(k), double(k), integer(k),
double(m), integer(k), integer(k),
as.integer(iter.max), wss = double(k),
ifault = 0L)
switch(Z$ifault,
stop("empty cluster: try a better set of initial centers",
call.=FALSE),
warning(gettextf("did not converge in %d iterations",
iter.max), call.=FALSE, domain =NA),
stop("number of cluster centres must lie between 1 and nrow(x)",
call.=FALSE)
)
Z
},
{ # 2
Z <- .C(C_kmeans_Lloyd, x, m, p,
centers = centers, as.integer(k),
c1 = integer(m), iter = as.integer(iter.max),
nc = integer(k), wss = double(k))
if(Z$iter > iter.max)
warning("did not converge in ",
iter.max, " iterations", call.=FALSE)
if(any(Z$nc == 0))
warning("empty cluster: try a better set of initial centers", call.=FALSE)
Z
},
{ # 3
Z <- .C(C_kmeans_MacQueen, x, m, p,
centers = as.double(centers), as.integer(k),
c1 = integer(m), iter = as.integer(iter.max),
nc = integer(k), wss = double(k))
if(Z$iter > iter.max)
warning("did not converge in ",
iter.max, " iterations", call.=FALSE)
if(any(Z$nc == 0))
warning("empty cluster: try a better set of initial centers", call.=FALSE)
Z
})
Z
}
x <- as.matrix(x)
m <- as.integer(nrow(x))
if(is.na(m)) stop("invalid nrow(x)")
p <- as.integer(ncol(x))
if(is.na(p)) stop("invalid ncol(x)")
if(missing(centers))
stop("'centers' must be a number or a matrix")
nmeth <- switch(match.arg(algorithm),
"Hartigan-Wong" = 1,
"Lloyd" = 2, "Forgy" = 2,
"MacQueen" = 3)
if(length(centers) == 1L) {
if (centers == 1) nmeth <- 3
k <- centers
## we need to avoid duplicates here
if(nstart == 1)
centers <- x[sample.int(m, k), , drop = FALSE]
if(nstart >= 2 || any(duplicated(centers))) {
cn <- unique(x)
mm <- nrow(cn)
if(mm < k)
stop("more cluster centers than distinct data points.")
centers <- cn[sample.int(mm, k), , drop=FALSE]
}
} else {
centers <- as.matrix(centers)
if(any(duplicated(centers)))
stop("initial centers are not distinct")
cn <- NULL
k <- nrow(centers)
if(m < k)
stop("more cluster centers than data points")
}
if(iter.max < 1) stop("'iter.max' must be positive")
if(ncol(x) != ncol(centers))
stop("must have same number of columns in 'x' and 'centers'")
if(!is.double(x)) storage.mode(x) <- "double"
if(!is.double(centers)) storage.mode(centers) <- "double"
Z <- do_one(nmeth)
best <- sum(Z$wss)
if(nstart >= 2 && !is.null(cn))
for(i in 2:nstart) {
centers <- cn[sample.int(mm, k), , drop=FALSE]
ZZ <- do_one(nmeth)
if((z <- sum(ZZ$wss)) < best) {
Z <- ZZ
best <- z
}
}
centers <- matrix(Z$centers, k)
dimnames(centers) <- list(1L:k, dimnames(x)[[2L]])
cluster <- Z$c1
if(!is.null(rn <- rownames(x)))
names(cluster) <- rn
totss <- sum(scale(x, scale = FALSE)^2)
structure(list(cluster = cluster, centers = centers, totss = totss,
withinss = Z$wss, tot.withinss = best,
betweenss = totss - best, size = Z$nc),
class = "kmeans")
}
## modelled on print methods in the cluster package
print.kmeans <- function(x, ...)
{
cat("K-means clustering with ", length(x$size), " clusters of sizes ",
paste(x$size, collapse=", "), "\n", sep="")
cat("\nCluster means:\n")
print(x$centers, ...)
cat("\nClustering vector:\n")
print(x$cluster, ...)
cat("\nWithin cluster sum of squares by cluster:\n")
print(x$withinss, ...)
cat(sprintf(" (between_SS / total_SS = %5.1f %%)\n",
100 * x$betweenss/x$totss),
"Available components:\n", sep="\n")
print(names(x))
invisible(x)
}
fitted.kmeans <- function(object, method = c("centers", "classes"), ...)
{
method <- match.arg(method)
if (method == "centers") object$centers[object$cl, , drop=FALSE]
else object$cl
}
Please note that the code checks whether an improvement happened with these lines:
if((z <- sum(ZZ$wss)) < best) {
Z <- ZZ
best <- z
}
Here you can add your constraint.
You can use the same principle as k-means. Iterate steps 2-3 until convergence:
Assign cities to clusters (randomly)
Compute the centroids of clusters
Assign points to centroids such that:
The sum of distances to points to the assigned centroids are minimized
The threshold constraints are respected
In standard k-means there are no constraints, so the assignment step is performed easily by assigning each point to the closest centroid. Here you will have to solve an optimization problem in the assignment step instead.
It is probably faster if you just model it as an integer programming problem. OR Tools has facilities for solving integer programming problems.
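As a rough illustration of that integer-programming route, here is a hedged R sketch using the lpSolve package (all the names here -- coords, values, centroids, threshold -- are made up for the example): it minimizes total point-to-centroid distance subject to each city being assigned exactly once and each cluster's summed value staying under the threshold.
library(lpSolve)
# constrained assignment step: cities (rows of coords) to given centroids
assign_constrained <- function(coords, values, centroids, threshold) {
  n <- nrow(coords); K <- nrow(centroids)
  # n x K matrix of Euclidean distances from every city to every centroid
  d <- as.matrix(dist(rbind(coords, centroids)))[1:n, (n + 1):(n + K)]
  # binary variable z[i, k] = 1 if city i goes to cluster k, stored column-major
  nv <- n * K
  # each city is assigned to exactly one cluster
  A_assign <- t(sapply(1:n, function(i) { v <- numeric(nv); v[(0:(K - 1)) * n + i] <- 1; v }))
  # the summed value in each cluster must stay below the threshold
  A_cap <- t(sapply(1:K, function(k) { v <- numeric(nv); v[((k - 1) * n + 1):(k * n)] <- values; v }))
  sol <- lp(direction = "min",
            objective.in = as.vector(d),          # matches the column-major variable order
            const.mat = rbind(A_assign, A_cap),
            const.dir = c(rep("=", n), rep("<=", K)),
            const.rhs = c(rep(1, n), rep(threshold, K)),
            all.bin = TRUE)
  max.col(matrix(sol$solution, nrow = n))          # cluster index of each city
}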
Here is a python implementation that does K-means clustering with different constraints, including one with a maximum on total weight of instances in a cluster.
