CGAN shape problem (RuntimeError: shape '[64, 3, 64, 64]' is invalid for input of size 2952192)

This is the original GitHub source:
https://github.com/otepencelik/GAN-Artwork-Generation
I want to rerun this model (CGAN) and use it in another area. I changed the network to generate multiple labels and to improve performance, but it didn't work well. I think it is a reshape problem, but I can't figure out how to fix it. I need somebody to help me, thanks! The error information follows at the end.
Input attributes (total number of images: 81474):
# Root directory for dataset
dataroot = "***"
# Number of workers for dataloader
workers = 2
# Batch size during training
batch_size = 64
# Spatial size of training images. All images will be resized to this
# size using a transformer.
image_size = 64
# number of class labels
n_class = 27
# Number of channels in the training images. For color images this is 3
nc = 3
# Size of z latent vector (i.e. size of generator input)
nz = 100
# Size of feature maps in generator (output)
ngf = 64
# Size of feature maps in discriminator
ndf = 64
# Number of training epochs
num_epochs = 5
# Learning rate for optimizers
lr = 0.0002
# Beta1 hyperparam for Adam optimizers
beta1 = 0.5
# Number of GPUs available. Use 0 for CPU mode.
ngpu = 1
Generator model design:
class Generator(nn.Module):
    def __init__(self, ngpu=1):
        super(Generator, self).__init__()
        self.label_emb = nn.Embedding(n_class, n_class)
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is Z (plus the label embedding), going into a convolution
            nn.ConvTranspose2d(nz + n_class, ngf * 16, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 16),
            nn.ReLU(True),
            # state size. (ngf*16) x 4 x 4
            nn.ConvTranspose2d(ngf * 16, ngf * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            # state size. (ngf*8) x 8 x 8
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # state size. (ngf*4) x 16 x 16
            nn.ConvTranspose2d(ngf * 4, ngf, 4, 4, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # state size. (ngf) x 32 x 32
            nn.ConvTranspose2d(ngf, nc, 4, 2, 1, bias=False),
            nn.Tanh()
            # state size. (nc) x 64 x 64
        )

    def forward(self, noise_input, labels):
        # Concatenate label embedding and noise to produce the generator input
        # print(self.label_emb(labels).unsqueeze(2).unsqueeze(3).shape, noise_input.shape, labels.shape)
        gen_input = torch.cat((self.label_emb(labels).unsqueeze(2).unsqueeze(3), noise_input), 1)
        img = self.main(gen_input)
        img = img.view(img.size(0), *(nc, image_size, image_size))
        return img
netG = Generator(ngpu).to(device)
Training model:
# Training Loop
# Lists to keep track of progress
img_list = []
G_losses = []
D_losses = []
iters = 0

print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    # For each batch in the dataloader
    for i, (data, real_style_labels) in enumerate(dataloader, 0):
        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        # Format batch
        real_cpu = data.to(device)
        b_size = real_cpu.size(0)
        label = torch.full((b_size,), real_label, dtype=torch.float, device=device)
        # style labels
        real_style_labels = real_style_labels.to(device)
        fake_style_labels = torch.tensor(np.random.choice(n_class, size=b_size)).type(torch.LongTensor).to(device)
        # Forward pass real batch through D
        output = netD(gaussian(real_cpu, mean=0, stddev=0.5 * 0.01 ** (epoch / num_epochs)), real_style_labels).view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, nz, 1, 1, device=device)
        # Generate fake image batch with G
        fake = netG(noise, fake_style_labels)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output = netD(fake.detach(), fake_style_labels).view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        # Calculate the gradients for this batch
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Add the gradients from the all-real and all-fake batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of the all-fake batch through D
        output = netD(fake, fake_style_labels).view(-1)
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 200 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(dataloader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))
        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())
        # Check how the generator is doing by saving G's output on fixed_noise
        # if (iters % 2000 == 0) or ((epoch == num_epochs-1) and (i == len(dataloader)-1)):
        if (i == len(dataloader) - 1):
            with torch.no_grad():
                fake = netG(fixed_noise, fixed_label).detach().cpu()
            img_list.append(vutils.make_grid(fake, nrow=example_size, padding=2, normalize=True))
            save_image(fake.data, img_save_path + '/%d-%d.png' % (epoch, iters), nrow=example_size, normalize=True)
        iters += 1
Error information:
RuntimeError Traceback (most recent call last)
<ipython-input-65-50e58bbfe414> in <module>
37 noise = torch.randn(b_size, nz, 1, 1, device=device)
38 # Generate fake image batch with G
---> 39 fake = netG(noise, fake_style_labels)
40 label.fill_(fake_label)
41 # Classify all fake batch with D
~/opt/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
<ipython-input-61-cb81d45887cf> in forward(self, noise_input, labels)
31 gen_input = torch.cat((self.label_emb(labels).unsqueeze(2).unsqueeze(3), noise_input), 1)
32 img = self.main(gen_input)
---> 33 img = img.view(img.size(0), *(nc, image_size, image_size))
34 return img
35
RuntimeError: shape '[64, 3, 64, 64]' is invalid for input of size 2952192
I spent a few days trying to figure out the logic but failed. I hope somebody can help me out! Thanks.
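For late readers, a note that is not part of the original question: the mismatch can be traced by hand. For ConvTranspose2d (with dilation 1 and no output padding), out = (in - 1) * stride - 2 * padding + kernel, so the posted layers produce spatial sizes 1 -> 4 -> 8 -> 16 -> 62 -> 124, and 64 * 3 * 124 * 124 = 2952192, exactly the size in the error message. A minimal arithmetic check:
# Trace the spatial size through the posted generator (pure arithmetic, no torch needed)
def convt_out(size, kernel, stride, padding):
    # PyTorch ConvTranspose2d output size with dilation = 1, output_padding = 0
    return (size - 1) * stride - 2 * padding + kernel

size = 1
for kernel, stride, padding in [(4, 1, 0), (4, 2, 1), (4, 2, 1), (4, 4, 1), (4, 2, 1)]:
    size = convt_out(size, kernel, stride, padding)
    print(size)  # 4, 8, 16, 62, 124

print(64 * 3 * 124 * 124)  # 2952192, which matches the RuntimeError
The stride-4 layer overshoots: changing its stride to 2 (keeping padding 1) gives 4 -> 8 -> 16 -> 32 -> 64, matching the intended (nc) x 64 x 64 output.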

Related

optimize R code for min() and sample() by group

I generate a network with npeople (= 80), ncomp (= 4) components, and I want each component to have density equal to dens (= 0.2).
I want to optimize the two lines of the code which take most of the time (especially if I want to have 5k people in the network).
The two lines are:
# adjust probability to keep density
nodes[,p:= as.numeric(min(c(1, p * (1/(mean(nodes$p) / c.dens))))), by = c("ID","ALTERID")]
# simulate edges
nodes[, edge := sample(c(0,1),1, prob = c(1-p,p)), by = c("ID","ALTERID")]
I have tried using the lapply() function, but the execution time increased; see the line of code below:
nodes[,lapply(.SD, function(p) min(c(1, p * (1/(mean(nodes$p) / c.dens))))), by = c("ID","ALTERID")]
rm(list=ls())
library(data.table)
library(intergraph)
library(igraph)
library(Matrix)
library(profvis)
library(ggplot2)
draw.var <- function(n, var1, rho, mean){
  C <- matrix(rho, nrow = 2, ncol = 2)
  diag(C) <- 1
  C <- chol(C)
  S <- rnorm(n, mean = mean)
  S <- cbind(scale(var1)[1:n], S)
  ZS <- S %*% C
  return(ZS[, 2])
}
set.seed(1123)
profvis({
  # create empty list to store data
  dt.list <- list()
  npeople <- 500
  dens <- .2
  OC.impact <- FALSE
  cor_iv_si <- .6
  cor_iv_uc <- 0
  cor_uc_oc <- 0.6
  ncomp <- 4
  beta_oc <- 2 # observed characteristics
  beta_uc <- 2 # unobserved characteristics
  beta_si <- 1
  # create data.table
  dt.people <- data.table(ego = 1:npeople)
  # draw observed characteristics
  dt.people[, OC := abs(rt(npeople, 2))]
  # draw unobserved variable
  dt.people[, UC := draw.var(npeople, dt.people$OC, rho = cor_uc_oc, mean = 5)]
  # set component identifier
  dt.people$group <- cut_number(dt.people$UC, ncomp, labels = F)
  for(q in 1:ncomp){
    # subset component
    dt.sub <- dt.people[group == q]
    # create undirected graph
    nodes <- as.data.table(t(combn(dt.sub$ego, 2)))
    setnames(nodes, c("ID", "ALTERID"))
    # add attributes
    nodes <- merge(nodes, dt.people[, list(ID = ego, ID.UC = UC, ID.OC = OC)], by = "ID")
    nodes <- merge(nodes, dt.people[, list(ALTERID = ego, ALTERID.UC = UC, ALTERID.OC = OC)], by = "ALTERID")
    # calculate distance
    nodes[, d := abs(ID.UC - ALTERID.UC)]
    # estimate the appropriate density per component
    n.edges <- (dens * (npeople * (npeople - 1))) / ncomp
    n.nodes <- npeople / ncomp
    c.dens <- n.edges / (n.nodes * (n.nodes - 1))
    # estimate initial probability of tie based on distance
    coefficient <- log(c.dens / (1 - c.dens))
    alpha <- coefficient / mean(nodes$d)
    nodes[, p := exp(alpha * d) / (1 + exp(alpha * d))]
    # adjust probability to keep density
    nodes[, p := as.numeric(min(c(1, p * (1 / (mean(nodes$p) / c.dens))))), by = c("ID", "ALTERID")]
    # simulate edges
    nodes[, edge := sample(c(0, 1), 1, prob = c(1 - p, p)), by = c("ID", "ALTERID")]
    # keep the edges
    nodes <- nodes[edge == 1, list(ID, ALTERID)]
    # bind the networks
    if(q == 1){
      net <- copy(nodes)
    } else{
      net <- rbind(net, nodes)
    }
  }
  # create opposite direction
  net <- rbind(net, net[, list(ID = ALTERID, ALTERID = ID)])
})
This incorporates @BenBolker's and @DavidArenburg's suggestions and also uses some of data.table's tools.
Non-Equi joins
The OP's code loops through each group. One part of the code also uses combn and multiple joins to get the data in the right format. Using non-equi joins, we can combine all of those steps in one data.table call:
dt_non_sub <- dt.people[dt.people,
                        on = .(ego < ego, group = group),
                        allow.cartesian = T,
                        nomatch = 0L,
                        .(group,
                          ALTERID = i.ego, ID = x.ego,
                          ID.UC = UC, ID.OC = OC,
                          ALTERID.OC = i.OC, ALTERID.UC = i.UC,
                          d = abs(UC - i.UC)) # added to be more efficient
                        ]
# dt_non_sub[, d := abs(ID.UC - ALTERID.UC)]
Vectorization
The original code was mostly slow because of the two calls with by groupings. Since each call split the data.table into around 8,000 individual groups, there were 8,000 function calls each time. This version eliminates those by using pmin as suggested by @DavidArenburg, and then uses runif(.N) < p as suggested by @BenBolker. My addition was that, since your final result doesn't seem to care about p, I only assigned edge, using {} so that only the last thing calculated in the call is returned.
# alpha <- coefficient / mean(nodes$d)
dt_non_sub[,
           edge := {
             alpha = coefficient / mean(d)
             p = exp(alpha * d) / (1 + exp(alpha * d))
             p_mean = mean(p)
             p = pmin(1, p * (1 / (p_mean / c.dens)))
             as.numeric(runif(.N) < p)
           }
           , by = .(group)]
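As a tiny illustration of the pmin() swap (a toy example of mine, not from the original data): pmin() is vectorized, so the per-group min(c(1, ...)) calls collapse into a single call over the whole column.
# pmin() computes the element-wise minimum in one vectorized call
p <- c(0.30, 0.80, 0.95)
k <- 1.5
pmin(1, p * k)                           # 0.45 1.00 1.00
sapply(p, function(x) min(c(1, x * k)))  # same result, but one call per element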
net2 <- rbindlist(list(dt_non_sub[edge == 1, .(group, ALTERID, ID)],
                       dt_non_sub[edge == 1, .(group, ID = ALTERID, ALTERID = ID)]))
One thing to note is that the vectorization is not 100% identical. Your code was recursive: each split updated mean(nodes$p) for the next ID, ALTERID group. If you need that recursive part of the call, there's not much that can be done to make it faster.
In the end, the modified code runs in 20 ms vs. the 810 ms of your original function. The results, while different, are somewhat similar in the total number of edges:
Original:
net
ID ALTERID
1: 5 10
2: 10 14
3: 5 25
4: 10 25
5: 14 25
---
48646: 498 458
48647: 498 477
48648: 498 486
48649: 498 487
48650: 498 493
Modified
net2
group ALTERID ID
1: 2 4 3
2: 2 6 4
3: 4 7 1
4: 4 8 7
5: 2 9 4
---
49512: 3 460 500
49513: 3 465 500
49514: 3 478 500
49515: 3 482 500
49516: 3 497 500

Lift curve is swapped

For the lift curve example I run
library(caret)
set.seed(1)
simulated <- data.frame(obs = factor(rep(letters[1:2], each = 100)),
                        perfect = sort(runif(200), decreasing = TRUE),
                        random = runif(200))
lift2 <- lift(obs ~ random + perfect, data = simulated)
xyplot(lift2, plot = "lift", auto.key = list(columns = 2))
and get the resulting plot (image omitted here). I expected the image to be swapped horizontally. What am I doing wrong?
By the way: this is a lift chart, not a cumulative gains chart.
Update:
The plot that I expected is now produced by my own code:
mylift <- caret::lift(Class ~ cforest_prob + perfect_prob + guess_prob, data = data_test)
ggplot(mylift$data) +
geom_line(aes(CumTestedPct, lift, color = liftModelVar))
I noticed that the data.frame mylift$data contains the following columns:
names(mylift$data)
[1] "liftModelVar" "cuts" "events" "n" "Sn" "Sp" "EventPct"
[8] "CumEventPct" "lift" "CumTestedPct"
So I produced the following plot
ggplot(mylift$data) +
geom_line(aes(cuts, lift, color = liftModelVar))
So I guess that the different plots are just different ways of examining lift? I wasn't aware that there are different lift charts - I thought it was standardized across the industry.
Edit by the question author, for late readers: I accepted this answer in large part because of the helpful discussion in the comments on it. Please consider reading that discussion!
Let's reproduce the graph and find the baseline. Let
cutoffs <- seq(0, 1, length = 1000)
be our cutoffs. Now the main computations are done by
aux <- sapply(cutoffs, function(ct) {
  perf <- simulated$obs[simulated$perfect > ct]
  rand <- simulated$obs[simulated$random > ct]
  c(mean(perf == "a"), mean(rand == "a"))
})
where we go over the vector of cutoffs and do the following. Take the perfect case. We say that whenever perfect > ct, we are going to predict "a". Then simulated$obs[simulated$perfect > ct] are the true values, while mean(perf == "a") is our accuracy with a given cutoff. The same happens with random.
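As a quick sanity check of this construction (my addition, using the simulated data from the question): at the cutoff 0.5, perfect should be almost perfectly accurate while random should sit near the base rate.
ct <- 0.5
mean(simulated$obs[simulated$perfect > ct] == "a")  # close to 1
mean(simulated$obs[simulated$random > ct] == "a")   # close to 0.5, the base rate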
As for the baseline, it is just a constant defined by the share of "a" in the sample:
baseline <- mean(simulated$obs == "a")
When plotting the lifts, we divide our accuracy by that of the baseline method and get the same graph along with the baseline curve:
plot(x = cutoffs, y = aux[1, ] / baseline, type = 'l', ylim = c(0, 2), xlab = "Cutoff", ylab = "Lift")
lines(x = cutoffs, y = aux[2, ] / baseline, col = 'blue')
abline(a = baseline / baseline, b = 0, col = 'magenta')
Update:
Here's an illustration that, at least when plotted manually, the lift curve of the "expected" type can be manipulated and gives non-unique results.
Your example graph is from here, which also has this data:
# contacted response
# 1 10000 6000
# 2 20000 10000
# 3 30000 13000
# 4 40000 15800
# 5 50000 17000
# 6 60000 18000
# 7 70000 18800
# 8 80000 19400
# 9 90000 19800
# 10 100000 20000
Now suppose that we do not know this evolution, but only the 10 individual blocks:
# contacted response
# 1 10000 6000
# 2 10000 4000
# 3 10000 3000
# 4 10000 2800
# 5 10000 1200
# 6 10000 1000
# 7 10000 800
# 8 10000 600
# 9 10000 400
# 10 10000 200
In that case it depends on how we order the observations when putting "% Contacted" on the x-axis:
set.seed(1)
# df reconstructed from the ten blocks listed above
df <- data.frame(contacted = rep(10000, 10),
                 response = c(6000, 4000, 3000, 2800, 1200, 1000, 800, 600, 400, 200))
baseline <- sum(df$response) / sum(df$contacted) * cumsum(df$contacted)
lift1 <- cumsum(df$response)
lift2 <- cumsum(sample(df$response))
x <- 1:10 * 10
plot(x = x, y = lift1 / baseline, col = 'red', type = 'l', ylim = c(0, 3), xlab = "% Customers contacted", ylab = "Lift")
lines(x = x, y = lift2 / baseline, col = 'blue')
abline(a = baseline / baseline, b = 0, col = 'magenta')
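A concrete check of the first decile (my arithmetic, derived from the blocks above): the overall response rate is 20000/100000 = 0.2, so 10000 random contacts yield about 2000 responses, while the ordered campaign yields 6000, i.e. a lift of 3 at x = 10%, which is where the ordered curve starts.
sum(df$response) / sum(df$contacted)      # 0.2, the overall response rate
df$response[1] / (0.2 * df$contacted[1])  # 3, the lift at the first decile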

mcmcglmm loop to create many chains

Following up from this question (see it for a reproducible data frame), I want to run MCMCglmm n times, where n is the number of randomisations. I have tried to construct a loop which runs all the chains and saves them (to retrieve the posterior distributions of the randomised variable later), but I am encountering problems.
This is what the data frame looks like (when n = 5, hence R1-R5). A is the response variable, L and V are random effect variables, B is a fixed effect, and R1-R5 are random assignments of L with the structure of V maintained:
ID L B V A R1 R2 R3 R4 R5
1 1_1_1 1 1 1 11.1 6 19 21 1 31
2 1_1_1 1 1 1 6.9 6 19 21 1 31
3 1_1_4 1 1 4 7.7 2 24 8 22 22
4 1_1_4 1 1 4 10.5 2 24 8 22 22
5 1_1_5 1 1 5 8.5 11 27 14 17 22
6 1_1_7 1 1 7 11.2 5 24 13 18 25
I can create the names I want to assign to my chains, and the names of the variable that changes with each run of the MCMC chain (R1-Rn):
n = 5
Rs = as.vector(rep(NA, n))
for(i in 1:n){
  Rs[i] = paste("R", i, sep = "")
}
Rs
Rs
Output:
> Rs
[1] "R1" "R2" "R3" "R4" "R5"
I then tried this loop to produce 5 chains:
for(i in 1:n){
  chains[i] = MCMCglmm(A ~ 1 + B,
                       random = as.formula(paste0("~", Rs[i], " + Vial")),
                       rcov = ~units,
                       nitt = 500,
                       thin = 2,
                       burnin = 50,
                       prior = prior2,
                       family = "gaussian",
                       start = list(QUASI = FALSE),
                       data = df)
}
Thanks @Roland for helping to get the random effect to be called properly; previously I was getting the error "Error in buildZ(rmodel.terms[r] ...: object Rs[i] not found", which was fixed by as.formula.
But this stores all of the data in chains, and seemingly only the $Sol components; I need to be able to access the values within the VCV, specifically the posterior distributions of the R variables (e.g. summary(chainR1$VCV)).
In summary: it seems I am making a mistake in how I assign the chain names. Does anyone have a suggestion of how to do this and save the posterior distributions, or even the whole chain?
Using assign was a key point:
n = 10 # Number of chains to run
chainVCVdf = matrix(rep(NA, times = ((nitt - burnin) / thin) * n), ncol = n)
colnames(chainVCVdf) = c(rep("X", times = n))
for(i in 1:n){
  assign("chainX", paste0("chain", Rs[i]))
  chainX = MCMCglmm(A ~ 1 + B,
                    random = as.formula(paste0("~", Rs[i], " + V")),
                    rcov = ~units,
                    nitt = nitt,
                    thin = thin,
                    burnin = burnin,
                    prior = prior1,
                    family = "gaussian",
                    start = list(QUASI = FALSE),
                    data = df)
  assign("chainVCV", chainX$VCV[, 1])
  chainVCVdf[, i] = (chainVCV)
  colnames(chainVCVdf)[i] = colnames(chainX$VCV)[1]
}
It then became possible to build a matrix of the VCV component that I am interested in (namely the randomised L assignment in columns R1-Rn).
It seems as though you want to run a number of different MCMCglmm formulas in a loop. @Roland has helped you find the solution to this (although I personally would create the formulas prior to the loop). @Roland also points out that in order to save the results of each model, you should save them in a list rather than in a chain as you are currently doing. You could also save each model as an .RData file, as seen at the end of the question. To formalize an answer to this question, I would perform it in the following way:
Rs = paste0("~R", 1:5, " + V") ## Create all model formulae
chainNames = paste0("chainR", 1:5) ## Names for each model
chains = list() ## Initialize list
## Loop over models
for(i in 1:length(Rs)){
  chains[[i]] = MCMCglmm(A ~ 1 + B,
                         random = formula(Rs[i]),
                         rcov = ~units,
                         nitt = 500,
                         thin = 2,
                         burnin = 50,
                         prior = prior2,
                         family = "gaussian",
                         start = list(QUASI = FALSE),
                         data = df)
}
names(chains) = chainNames ## Name each model
save(chains, file = "chainsR1-R5.Rdata") ## Save all model output
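Each fitted model can then be retrieved from the list by name, which gives access to the VCV posteriors the question asks about; for example (my illustration, assuming the loop above has run):
summary(chains[["chainR1"]]$VCV)  # posterior of the variance components for model R1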
A side note: paste0(...) is the same as paste(...) but with the argument sep = "" by default.
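For example:
paste("chain", "R", 1)   # "chain R 1"
paste0("chain", "R", 1)  # "chainR1"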

R warning - longer object length is not a multiple of shorter object length [duplicate]

This question already has answers here: Why do I get "warning longer object length is not a multiple of shorter object length"? (4 answers)
Closed 6 years ago.
I'm trying to perform clustering on spatial data based on distance while constraining the cluster sizes. I found this article online (Spatial Clustering With Equal Sizes), and it works on a small list of data with 3 clusters.
However, when I tried to run a larger list and cluster it into 30 clusters, it doesn't work as expected; the clusters it returns are uneven again, as below.
I tried the smaller data with 30 clusters and also the example dataset, and they both worked out evenly. So I guess it's something wrong with my data, but I'm not sure how to fix it.
table( cl_constrain$cluster )
Cluster    1    2    3    4    5    6    7    8    9   10
Size     151   63   67   88   65   89   92   82   72   84
Cluster   11   12   13   14   15   16   17   18   19   20
Size      60   61   44   46   60   51   65  216   56  188
Cluster   21   22   23   24   25   26   27   28   29   30
Size     229   78  101   75  196   70  222   62  102  271
My data set looks like this (screenshot omitted).
I'm new to R and not sure what's going wrong with it; could anyone help me out please? Thanks a lot!
Here's the source code from the article.
# Convert to radians
as_radians = function(theta = 0){
  return(theta * pi / 180)
}

calc_dist = function(fr, to) {
  lat1 = as_radians(fr$lat)
  lon1 = as_radians(fr$lon)
  lat2 = as_radians(to$lat)
  lon2 = as_radians(to$lon)
  a = 3963.191
  b = 3949.903
  numerator = ( a^2 * cos(lat2) )^2 + ( b^2 * sin(lat2) )^2
  denominator = ( a * cos(lat2) )^2 + ( b * sin(lat2) )^2
  radiusofearth = sqrt(numerator / denominator) # Accounts for the ellipticity of the earth.
  d = radiusofearth * acos( sin(lat1) * sin(lat2) + cos(lat1) * cos(lat2) * cos(lon2 - lon1) )
  d.return = list(distance_miles = d)
  return(d.return)
}
raw.og = read.csv("http://statistical-research.com/wp-content/uploads/2013/11/sample_geo.txt", header=T, sep="\t")
orig.data = raw.og[,1:3]
dirichletClusters_constrained = function(orig.data, k = 5, max.iter = 50, tolerance = 1, plot.iter = TRUE) {
  fr = to = NULL
  r.k.start = sample(seq(1:k))
  n = nrow( orig.data )
  k.size = ceiling(n / k)
  initial.clusters = rep(r.k.start, k.size)
  if(n %% length(initial.clusters) != 0){
    exclude.k = length(initial.clusters) - n %% length(initial.clusters)
  } else {
    exclude.k = 0
  }
  orig.data$cluster = initial.clusters[1:(length(initial.clusters) - exclude.k)]
  orig.data$cluster_original = orig.data$cluster

  ## Calc centers and merge
  mu = cbind( by(orig.data$Latitude, orig.data$cluster, mean), by(orig.data$Longitude, orig.data$cluster, mean), seq(1:k) )
  tmp1 = matrix( match(orig.data$cluster, mu[,3]) )
  orig.data.centers = cbind(as.matrix(orig.data), mu[tmp1,])[,c(1:2,4:6)]

  ## Calc initial distance from centers
  fr$lat = orig.data.centers[,3]; fr$lon = orig.data.centers[,4]
  to$lat = orig.data.centers[,1]; to$lon = orig.data.centers[,2]
  orig.data$distance.from.center = calc_dist(fr, to)$distance_miles
  orig.data$distance.from.center_original = orig.data$distance.from.center

  ## Set some initial configuration values
  is.converged = FALSE
  iteration = 0
  error.old = Inf
  error.curr = Inf

  while ( !is.converged && iteration < max.iter ) { # Iterate until threshold or maximum iterations
    if(plot.iter == TRUE){
      plot(orig.data$Longitude, orig.data$Latitude, col = orig.data$cluster, pch = 16, cex = .6,
           xlab = "Longitude", ylab = "Latitude")
    }
    iteration = iteration + 1
    start.time = as.numeric(Sys.time())
    cat("Iteration ", iteration, sep = "")
    for( i in 1:n ) {
      # Iterate over each observation and measure the distance of each observation from its mean center.
      # Produces an exchange: it takes the observation closest to its mean and in return it gives the observation
      # closest to the giver, k, mean
      fr = to = distances = NULL
      for( j in 1:k ){
        # Determine the distance from each k group
        fr$lat = orig.data$Latitude[i]; fr$lon = orig.data$Longitude[i]
        to$lat = mu[j,1]; to$lon = mu[j,2]
        distances[j] = as.numeric( calc_dist(fr, to) )
      }
      # Which k cluster is the observation closest to?
      which.min.distance = which(distances == min(distances), arr.ind = TRUE)
      previous.cluster = orig.data$cluster[i]
      orig.data$cluster[i] = which.min.distance # Replace cluster with closest cluster
      # Trade an observation that is closest to the giving cluster
      if(previous.cluster != which.min.distance){
        new.cluster.group = orig.data[orig.data$cluster == which.min.distance,]
        fr$lat = mu[previous.cluster,1]; fr$lon = mu[previous.cluster,2]
        to$lat = new.cluster.group$Latitude; to$lon = new.cluster.group$Longitude
        new.cluster.group$tmp.dist = calc_dist(fr, to)$distance_miles
        take.out.new.cluster.group = which(new.cluster.group$tmp.dist == min(new.cluster.group$tmp.dist), arr.ind = TRUE)
        LocationID = new.cluster.group$LocationID[take.out.new.cluster.group]
        orig.data$cluster[orig.data$LocationID == LocationID] = previous.cluster
      }
    }
    # Calculate new cluster means
    mu = cbind( by(orig.data$Latitude, orig.data$cluster, mean), by(orig.data$Longitude, orig.data$cluster, mean), seq(1:k) )
    tmp1 = matrix( match(orig.data$cluster, mu[,3]) )
    orig.data.centers = cbind(as.matrix(orig.data), mu[tmp1,])[,c(1:2,4:6)]
    mu = cbind( by(orig.data$Latitude, orig.data$cluster, mean), by(orig.data$Longitude, orig.data$cluster, mean), seq(1:k) )
    ## Calc distance from centers
    fr$lat = orig.data.centers[,3]; fr$lon = orig.data.centers[,4]
    to$lat = orig.data.centers[,1]; to$lon = orig.data.centers[,2]
    orig.data$distance.from.center = calc_dist(fr, to)$distance_miles
    # Test for convergence: is the previous distance within the threshold of the current total distance from center?
    error.curr = sum(orig.data$distance.from.center)
    error.diff = abs( error.old - error.curr )
    error.old = error.curr
    if( !is.nan( error.diff ) && error.diff < tolerance ) {
      is.converged = TRUE
    }
    # Set a timer to see how long the process will take to go through all iterations
    stop.time = as.numeric(Sys.time())
    hour.diff = (((stop.time - start.time) * (max.iter - iteration)) / 60) / 60
    cat("\n Error ", error.diff, " Hours remain from iterations ", hour.diff, "\n")
    # Write out iterations. Can later be used as a starting point if iterations need to pause
    write.table(orig.data, paste("C:\\optimize_iteration_", iteration, "_instore_data.csv", sep = ""), sep = ",", row.names = F)
  }

  centers = data.frame(mu)
  ret.val = list("centers" = centers, "cluster" = factor(orig.data$cluster), "LocationID" = orig.data$LocationID,
                 "Latitude" = orig.data$Latitude, "Longitude" = orig.data$Longitude,
                 "k" = k, "iterations" = iteration, "error.diff" = error.diff)
  return(ret.val)
}
# Constrained clustering
cl_constrain = dirichletClusters_constrained(orig.data, k=4, max.iter=5, tolerance=.0001, plot.iter=TRUE)
table( cl_constrain$cluster )
plot(cl_constrain$Longitude, cl_constrain$Latitude, col=cl_constrain$cluster, pch=16, cex=.6,
xlab="Longitude",ylab="Latitude")
library(maps)
map("state", add=T)
points(cl_constrain$centers[,c(2,1)], pch=4, cex=2, col='orange', lwd=4)
There is a same-size k-means variation in ELKI. It is explained in detail in this tutorial. I have seen a lot of people ask for such a clustering algorithm, but I do not think it is well supported by theory.
For your use case, you also have the problem of geographic coordinates: k-means uses the mean, but the mean may be inconsistent with your distance function. Consider two points at longitude -179° and +178°. K-means would use their mean, -0.5°, as the cluster center; a more sensible choice would be +179.5°, on the very opposite side of the earth. If your data is constrained to a reasonably small area, it may still work.
To get better quality, you may want to map your data into an appropriate UTM zone. Within one UTM zone, Euclidean distance is a reasonable approximation of true distance.
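A minimal sketch of that reprojection with the sf package (my addition, not from the original answer; EPSG:32617, UTM zone 17N, is an assumption for eastern-US data like the sample above, so pick the zone that actually covers your points):
library(sf)
# lon/lat points in WGS84, using the columns from orig.data above
pts <- st_as_sf(orig.data, coords = c("Longitude", "Latitude"), crs = 4326)
pts_utm <- st_transform(pts, 32617)  # assumed zone: UTM 17N
xy <- st_coordinates(pts_utm)        # coordinates in metres; Euclidean distance now approximates ground distance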

view values used by function boot to bootstrap estimates

I have written the code below to obtain a bootstrap estimate of a mean. My objective is to view the numbers selected from the data set by the function boot in the boot package, ideally in the order they are selected.
The data set only contains three numbers: 1, 10, and 100 and I am only using two bootstrap samples.
The estimated mean is 23.5, and the R code below indicates that the six numbers comprised one '1', four '10's and one '100'. However, there are 30 possible combinations of those numbers that would have resulted in a mean of 23.5.
Is there a way for me to determine which of those 30 possible combinations is the combination that actually appeared in the two bootstrap samples?
library(boot)
set.seed(1234)
dat <- c(1, 10, 100)
av <- function(dat, i) { sum(dat[i])/length(dat[i]) }
av.boot <- boot(dat, av, R = 2)
av.boot
#
# ORDINARY NONPARAMETRIC BOOTSTRAP
#
#
# Call:
# boot(data = dat, statistic = av, R = 2)
#
#
# Bootstrap Statistics :
# original bias std. error
# t1* 37 -13.5 19.09188
#
mean(dat) + -13.5
# [1] 23.5
# The two samples must have contained one '1', four '10' and one '100',
# but there are 30 possibilities.
# Which of these 30 possible sequences actual occurred?
# This code shows there must have been one '1', four '10' and one '100'
# and shows the 30 possible combinations
my.combos <- expand.grid(V1 = c(1, 10, 100),
                         V2 = c(1, 10, 100),
                         V3 = c(1, 10, 100),
                         V4 = c(1, 10, 100),
                         V5 = c(1, 10, 100),
                         V6 = c(1, 10, 100))
my.means <- apply(my.combos, 1, function(x) {( (x[1] + x[2] + x[3])/3 + (x[4] + x[5] + x[6])/3 ) / 2 })
possible.samples <- my.combos[my.means == 23.5,]
dim(possible.samples)
n.1 <- rowSums(possible.samples == 1)
n.10 <- rowSums(possible.samples == 10)
n.100 <- rowSums(possible.samples == 100)
n.1[1]
n.10[1]
n.100[1]
length(unique(n.1)) == 1
length(unique(n.10)) == 1
length(unique(n.100)) == 1
I think you can determine the numbers sampled, and the order in which they are sampled, with the code below. You have to extract the function ordinary.array from the boot package and paste it into your R code. Then specify the values for n, R and strata, where n is the number of observations in the data set and R is the number of replicate samples you want.
I do not know how general this approach is, but it worked with a couple of simple examples I tried, including the example below.
library(boot)
set.seed(1234)
dat <- c(1, 10, 100, 1000)
av <- function(dat, i) { sum(dat[i])/length(dat[i]) }
av.boot <- boot(dat, av, R = 3)
av.boot
#
# ORDINARY NONPARAMETRIC BOOTSTRAP
#
#
# Call:
# boot(data = dat, statistic = av, R = 3)
#
#
# Bootstrap Statistics :
# original bias std. error
# t1* 277.75 -127.5 132.2405
#
#
mean(dat) + -127.5
# [1] 150.25
# boot:::ordinary.array
ordinary.array <- function (n, R, strata)
{
  inds <- as.integer(names(table(strata)))
  if (length(inds) == 1L) {
    output <- sample.int(n, n * R, replace = TRUE)
    dim(output) <- c(R, n)
  }
  else {
    output <- matrix(as.integer(0L), R, n)
    for (is in inds) {
      gp <- seq_len(n)[strata == is]
      output[, gp] <- if (length(gp) == 1)
        rep(gp, R)
      else bsample(gp, R * length(gp))
    }
  }
  output
}
# I think the function ordinary.array determines which elements
# of the data are sampled in each of the R samples
set.seed(1234)
ordinary.array(n=4,R=3,1)
# [,1] [,2] [,3] [,4]
# [1,] 1 3 1 3
# [2,] 3 4 1 3
# [3,] 3 3 3 3
#
# which equals:
((1+100+1+100) / 4 + (100+1000+1+100) / 4 + (100+100+100+100) / 4) / 3
# [1] 150.25
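A simpler alternative (my suggestion, not part of the answer above): boot passes the resampled index vector straight to the statistic, so you can record it there.
library(boot)
sampled <- list()
av <- function(dat, i) {
  sampled[[length(sampled) + 1]] <<- i  # record the indices boot chose
  sum(dat[i]) / length(dat[i])
}
set.seed(1234)
dat <- c(1, 10, 100)
av.boot <- boot(dat, av, R = 2)
sampled  # the first entry is the t0 call on the original data; the rest are the two replicates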
