Run DBSCAN against grouped coordinates

I'm attempting to run DBSCAN against some grouped coordinates in order to get sub-clusters. I've clustered some spatial data and I'd now like to further divide these regions according to the density of points within them. I think DBSCAN is probably the best way to do this.
My issue is that I can't figure out how to run DBSCAN against each cluster separately and then output the cluster assignment as a new column. Here's some sample data:
library(dplyr)
library(dbscan)
# Create sample data
df <- data.frame(
"ID"=1:200,
"X"=c(1.0083,1.3166,1.3072,1.1311,1.2984,1.2842,1.1856,1.3451,1.1932,1.0926,1.2464,1.3197,1.2331,1.2996,1.3482,
1.1944,1.2800,1.3051,1.4471,0.9068,1.3150,1.1846,1.0232,1.0005,1.0640,1.3177,1.1015,0.9598,1.0354,1.2203,
0.8388,0.8655,1.3387,1.0133,1.0106,1.1753,1.3200,1.0139,1.1511,1.3508,1.2747,1.3681,1.1074,1.2735,1.2245,
0.9695,1.3250,1.0537,1.2020,1.3093,0.9268,1.3244,1.2626,1.3123,1.2819,1.1063,0.8759,1.0063,1.0173,1.0187,
1.2396,1.0241,1.2619,1.2682,1.0008,1.0827,1.3639,1.3099,1.0004,0.8886,1.2359,1.1370,1.2783,1.0803,1.1918,
1.1156,1.3313,1.1205,1.0776,1.3895,1.3559,0.8518,1.1315,1.3521,1.2281,1.2589,0.9974,1.1487,1.4204,0.9998,
1.0154,1.0098,0.8851,1.0252,0.9331,1.2197,1.0084,1.2303,1.2808,1.3125,0.5500,0.6694,0.3301,0.3787,0.6492,
0.6568,0.6773,0.3769,0.6237,0.7265,0.5509,0.3579,0.7201,0.2631,0.3881,0.7596,0.3343,0.7049,0.3430,0.2951,
0.5483,0.7699,0.3806,0.6555,0.2524,0.4030,0.6329,0.5006,0.2701,0.0822,0.5442,0.5233,0.7105,0.5660,0.3962,
0.3187,0.3143,0.5673,0.3731,0.7310,0.6376,0.4864,0.8865,0.3352,0.7540,0.0690,0.7983,0.6990,0.4090,0.5658,
0.5636,0.5420,0.7223,0.6146,0.5648,0.2711,0.3422,0.7214,0.2196,0.2848,0.6496,0.7907,0.7418,0.7825,0.4550,
0.4361,0.7417,0.2661,0.8978,0.7875,0.2343,0.3853,0.6874,0.7761,0.2905,0.6092,0.5329,0.6189,0.0684,0.5726,
0.5740,0.7060,0.4609,0.3568,0.7037,0.2874,0.6200,0.7149,0.5100,0.7059,0.2520,0.3105,0.6870,0.7888,0.3674,
0.6514,0.7271,0.6679,0.3752,0.7067),
"Y"=c(-1.2547,-1.1499,-1.1803,-1.0626,-1.2877,-1.1151,-1.0958,-1.1339,-1.0808,-1.5461,-1.0775,-1.1431,-1.0499,
-1.1521,-1.1675,-1.0963,-1.1407,-1.1916,-1.1229,-1.2297,-1.1308,-1.0341,-1.3071,-1.2370,-1.5043,-1.1154,
-1.5452,-1.0349,-1.5412,-1.0348,-1.3620,-1.3776,-1.1830,-1.2552,-1.2354,-1.0838,-1.1214,-1.2396,-1.4937,
-1.0793,-1.1857,-1.0679,-1.5425,-1.1633,-1.1620,-1.0838,-1.0750,-1.3493,-1.4155,-1.1354,-1.0615,-1.1494,
-1.1620,-1.1582,-1.1800,-1.5230,-1.3019,-1.2484,-1.5490,-1.2435,-1.0487,-1.2330,-1.1234,-1.0924,-1.0702,
-1.0446,-1.1077,-1.1144,-1.2170,-1.2715,-1.1537,-1.5077,-1.1305,-1.3396,-1.2107,-1.5458,-1.1482,-1.1224,
-1.3690,-1.2058,-1.1685,-1.3400,-1.5033,-1.2152,-1.3805,-1.1439,-1.5183,-1.4288,-1.1252,-1.2330,-1.2511,
-1.5429,-1.3333,-1.1851,-1.1367,-1.3952,-1.1240,-1.2113,-1.1632,-1.1965,-0.9917,-0.7416,-0.7729,-1.1279,
-0.9323,-0.9372,-0.7013,-1.1746,-0.9191,-0.9356,-0.7873,-1.1957,-0.9838,-0.5825,-1.0738,-0.9302,-0.7713,
-0.9407,-0.7774,-0.8160,-0.9861,-1.0440,-0.9896,-0.6478,-0.8865,-1.0601,-1.0640,-0.9898,-0.5989,-0.7375,
-0.7689,-0.9799,-0.9147,-1.1048,-0.9735,-0.8591,-0.7913,-1.0085,-0.7231,-0.9688,-0.9272,-0.9395,-0.9494,
-0.7859,-1.0817,-0.7262,-0.9915,-0.9329,-1.0953,-1.0425,-1.0806,-1.0132,-0.8514,-1.0785,-1.1109,-0.8542,
-1.0849,-0.9665,-0.5940,-0.6145,-0.7830,-0.9601,-0.8996,-0.7717,-0.7447,-1.0406,-1.0067,-0.5710,-0.9839,
-1.0594,-0.7069,-1.1202,-0.9705,-1.0100,-0.6377,-1.0632,-0.9450,-0.9163,-0.7865,-1.0090,-1.1005,-1.0049,
-0.8042,-1.0781,-0.6829,-0.5962,-1.0759,-0.7918,-0.9732,-0.7353,-0.5615,-1.2002,-0.9295,-0.9944,-1.1570,
-0.9524,-0.9257,-0.9360,-1.1328,-0.7661),
"cluster"=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2))
# How do you run DBSCAN against the points within each cluster?
I first thought I'd try to use the group_by function in dplyr but DBSCAN requires a data matrix input and group_by doesn't work for matrices.
matrix <- as.matrix(df[, -1])
set.seed(1234)
db = matrix %>%
group_by(cluster) %>%
dbscan(matrix, 0.4, 4)
#Error in UseMethod("group_by_") :
# no applicable method for 'group_by_' applied to an object of class "c('matrix', 'double', 'numeric')"
I've also tried using by() but get duplicate results for each cluster grouping, which isn't right:
by(data = df, INDICES = df$cluster, FUN = function(x) {
out <- dbscan(as.matrix(df[, c(2:3)]),eps=.0215,minPts=4)
})
#df$cluster: 1
#DBSCAN clustering for 200 objects.
#Parameters: eps = 0.0215, minPts = 4
#The clustering contains 10 cluster(s) and 138 noise points.
#
# 0 1 2 3 4 5 6 7 8 9 10
#138 11 12 4 5 8 2 4 8 4 4
#
#Available fields: cluster, eps, minPts
#--------------------------------------------------------------------------
#df$cluster: 2
#DBSCAN clustering for 200 objects.
#Parameters: eps = 0.0215, minPts = 4
#The clustering contains 10 cluster(s) and 138 noise points.
#
# 0 1 2 3 4 5 6 7 8 9 10
#138 11 12 4 5 8 2 4 8 4 4
#
#Available fields: cluster, eps, minPts
Can anyone point me in the right direction?

To be clear, dbscan::dbscan works fine on data.frame objects; you do not need to convert to a matrix. It returns an object that includes a cluster vector of the same length as the number of records in your input. The issue is that dplyr exposes variables to other functions as individual vectors, rather than as data.frame or matrix objects. You are free to do something like:
df %>%
  group_by(cluster) %>%
  mutate(
    dbscan_cluster = dbscan::dbscan(
      data.frame(X, Y),
      eps = 0.0215,
      minPts = 4
    )[["cluster"]]
  )
dplyr is not necessary; by() also works, you just need to supply a generic function rather than one that references the source object directly. Note that your data must already be ordered by cluster so the unlisted assignments line up with the rows.
df$dbscan_cluster <- unlist(
  by(
    df,
    INDICES = df$cluster,
    function(x) dbscan::dbscan(x[, c(2, 3)], eps = 0.0215, minPts = 4)[["cluster"]]
  )
)
However, you can still get garbage results if you don't have a good way to pick your epsilon. You might consider using dbscan::optics instead.
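For choosing epsilon, one common heuristic is to plot the sorted k-nearest-neighbour distances (with k = minPts) and look for a knee. A minimal sketch using dbscan::kNNdistplot on one group, with the eps used above drawn in for comparison:
library(dbscan)
# sorted 4-NN distances for group 1; a sharp bend suggests a workable eps
kNNdistplot(df[df$cluster == 1, c("X", "Y")], k = 4)
abline(h = 0.0215, lty = 2)  # the eps used above, for comparison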

Related

Obtain values from simulated mppm in spatstat

I have obtained an mppm object by fitting a model on several independent datasets using the mppm function from the R package spatstat. How can I generate simulated realisations of this model and obtain the x, y, and marks attributes of the simulations?
I fitted my model as such:
data <- listof(NMJ1,NMJ2,NMJ3)
data <- hyperframe(X=1:3, Points=data)
model <- mppm(Points ~marks*sqrt(x^2+y^2), data)
where NMJ1, NMJ2, and NMJ3 are marked ppp and are independent realisations of the same experiment.
sim <- simulate(model) allows me to generate simulated realisations of this model, and plot(sim, axes = TRUE) to plot them. sim itself is a hyperframe object:
> sim
Hyperframe:
Sim1
1 (ppp)
2 (ppp)
3 (ppp)
How can I access the values (x, y, and marks) in this hyperframe? My goal is to generate a large number of independent realisations from my model and to use the simulated values for another task. Is there a practical way to obtain, retrieve, and save these values?
Since you say you want to simulate this many times, the code below runs two simulations (rather than one as in the question):
library(spatstat)
data <- list(amacrine, amacrine, amacrine)
data <- hyperframe(X=1:3, Points=data)
model <- mppm(Points ~marks*sqrt(x^2+y^2), data)
sim <- simulate(model, nsim = 2)
#> Generating simulated realisations of 3 models..
#> 1, 2, 3.
Now sim is a hyperframe with 2 columns (one for each simulation). Each
column is a list of 3 point patterns. To get the three sets of coordinates
and marks for the first simulation use as.data.frame on each point pattern:
co1 <- lapply(sim$Sim1, as.data.frame)
Then co1 is a list of length 3, and we can print out the first few
coordinates with the head() command, e.g. the coordinates of the third
point pattern:
head(co1[[3]])
#> x y marks
#> 1 0.4942587 0.7889985 off
#> 2 0.6987270 0.7637359 on
#> 3 0.3926415 0.6819965 on
#> 4 0.7982686 0.9060733 off
#> 5 1.3507722 0.9731363 on
#> 6 0.6450985 0.6924126 on
We can extract the coordinates and marks for each simulation by another lapply that
runs over every simulation (in this case 2):
co <- lapply(sim, function(x) lapply(x, as.data.frame))
Now co is a list with 2 elements, and each element is a list of 3 sets of
coordinates:
length(co)
#> [1] 2
length(co[[2]])
#> [1] 3
head(co[[2]][[3]])
#> x y marks
#> 1 0.1660580 0.04180501 on
#> 2 0.7840025 0.71727782 on
#> 3 1.2011733 0.17109112 on
#> 4 1.0429867 0.49284639 on
#> 5 1.1411869 0.86711072 off
#> 6 1.0375942 0.06427601 on
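If the goal is to retrieve and save all simulated values, one option (a base-R sketch; the file name is just an example) is to row-bind everything into a single data frame, tagging each row with its simulation and pattern index:
all_sims <- do.call(rbind, lapply(seq_along(co), function(i) {
  # bind the 3 patterns of simulation i, recording their indices
  do.call(rbind, lapply(seq_along(co[[i]]), function(j) {
    cbind(co[[i]][[j]], sim = i, pattern = j)
  }))
}))
write.csv(all_sims, "simulated_points.csv", row.names = FALSE)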

How to get CI 95% for coefficients of linear model using simpleboot package

I'm fitting a linear model (a basic linear regression with 4 predictors) with lm(). This all works fine.
What I want to do now is bootstrap the model. After some quick research on Google I found the package simpleboot, which seemed quite easy to understand.
I can easily bootstrap the lm.object using something like this:
boot_mod <- lm.boot(mod,R=100,rows=TRUE)
and afterwards print the object boot_mod.
I can also access the list that contains the coefficients for each bootstrap sample, among other metrics such as RSS, R², and so on.
Can anyone tell me how I can save all coefficients from the boot list in a list or dataframe?
The result would look like this at best:
boot_coef
sample coef 1 coef 2 coef 3...
1 1,1 1,4 ...
2 1,2 1,5 ...
3 1,3 1,6 ...
library(tidyverse)
library(simpleboot)
### Some Dummy-Data in a dataframe
a <- c(3,4,5,6,7,9,13,12)
b <- c(5,9,14,22,12,5,12,18)
c <- c(7,2,8,7,12,5,3,1)
df <- as_data_frame(list(x1=a,x2=b,y=c))
### Linear model
mod <- lm(y~x1+x2,data=df)
### Bootstrap
boot_mod <- lm.boot(mod,R=10,rows = TRUE)
You can also use the function samples() from the same package simpleboot: given the output from either lm.boot or loess.boot, you can specify what kind of information you want to extract:
samples(object, name = c("fitted", "coef", "rsquare", "rss"))
It outputs either a vector or matrix depending on the entity extracted.
Source:
https://rdrr.io/cran/simpleboot/man/samples.html
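To get the 95% CI asked about in the title, a minimal sketch (assuming the coefficients come back with one column per bootstrap replicate, as the help page describes; check dim() on your object first) takes percentile quantiles per coefficient:
boot_coefs <- samples(boot_mod, name = "coef")
# percentile 95% interval for each coefficient (coefficients in rows)
apply(boot_coefs, 1, quantile, probs = c(0.025, 0.975))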
Here is a tidyverse option to save all coefficients from the boot.list:
library(tidyverse)
as.data.frame(boot_mod$boot.list) %>%
  select(ends_with("coef")) %>%         # select coefficients
  t(.) %>% as.data.frame(.) %>%         # one model per row
  rownames_to_column("Sample") %>%      # set sample column
  mutate(Sample = parse_number(Sample))
# output
Sample (Intercept) x1 x2
1 1 5.562417 -0.2806786 0.12219191
2 2 8.261905 -0.8333333 0.54761905
3 3 9.406171 -0.5863124 0.07783740
4 4 8.996784 -0.6040479 0.06737891
5 5 10.908036 -0.7249561 -0.03091908
6 6 8.914262 -0.5094340 0.05549390
7 7 7.947724 -0.2501127 -0.08607481
8 8 6.255539 -0.2033771 0.07463971
9 9 5.676581 -0.2668020 0.08236743
10 10 10.118126 -0.4955047 0.01233728

Creation prediction function for kmean in R

I want to create a predict function which predicts which cluster an observation belongs to:
data(iris)
mydata=iris
m=mydata[1:4]
train=head(m,100)
xNew=head(m,10)
rownames(train)<-1:nrow(train)
norm_eucl=function(train)
train/apply(train,1,function(x)sum(x^2)^.5)
m_norm=norm_eucl(train)
result=kmeans(m_norm,3,30)
predict.kmean <- function(cluster, newdata)
{
simMat <- m_norm(rbind(cluster, newdata),
sel=(1:nrow(newdata)) + nrow(cluster))[1:nrow(cluster), ]
unname(apply(simMat, 2, which.max))
}
## assign new data samples to exemplars
predict.kmean(m_norm, x[result$cluster, ], xNew)
After that I get the error:
Error in predict.kmean(m_norm, x[result$cluster, ], xNew) :
unused argument (xNew)
I understand that I'm doing something wrong in the function, since I'm just learning this, but I can't see where exactly.
Indeed, I want to adapt a similar function from apcluster (I had seen a similar topic, but for apcluster):
predict.apcluster <- function(s, exemplars, newdata)
{
simMat <- s(rbind(exemplars, newdata),
sel=(1:nrow(newdata)) + nrow(exemplars))[1:nrow(exemplars), ]
unname(apply(simMat, 2, which.max))
}
## assign new data samples to exemplars
predict.apcluster(negDistMat(r=2), x[apres@exemplars, ], xNew)
how to do it?
Rather than trying to replicate something, let's come up with our own function. For a given observation, we want to assign a cluster using some prior k-means output. Given how the k-means algorithm works, what we want is to find which cluster's centre is closest to the observation. That can be done as:
predict.kmeans <- function(x, newdata)
  apply(newdata, 1, function(r) which.min(colSums((t(x$centers) - r)^2)))
That is, we go over newdata row by row and compute the corresponding row's distance to each of the centers and find the minimal one. Then, e.g.,
head(predict(result, train / sqrt(rowSums(train^2))), 3)
# 1 2 3
# 2 2 2
all.equal(predict(result, train / sqrt(rowSums(train^2))), result$cluster)
# [1] TRUE
which confirms that our predicting function assigned all the same clusters to the training observations. Then also
predict(result, xNew / sqrt(rowSums(xNew^2)))
# 1 2 3 4 5 6 7 8 9 10
# 2 2 2 2 2 2 2 2 2 2
Notice also that I'm simply calling predict rather than predict.kmeans. That is because result is of class kmeans and the right method is chosen automatically. Also notice how I normalize the data in a vectorized manner, without using apply.
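As a quick sanity check on that last point (a small sketch; both forms divide each row by its Euclidean norm), you can verify the vectorized normalisation matches the question's norm_eucl():
# should return TRUE: rowSums-based normalisation equals norm_eucl(train)
all.equal(train / sqrt(rowSums(train^2)), norm_eucl(train))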

Error in generating a poisson point pattern with calculated lambda

I have a two-dimensional point pattern (no marks) and I am trying to test for clustering in the presence of spatial inhomogeneity using envelopes and the inhomogenous pair correlation function. I am estimating an inhomogenous intensity function for the data using the density.ppp function. Here is some sample data:
x y
1 533.03 411.58
2 468.39 622.92
3 402.86 530.94
4 427.13 616.81
5 495.20 680.62
6 566.61 598.99
7 799.03 585.16
8 1060.09 544.23
9 144.66 747.40
10 138.14 752.92
11 449.49 839.15
12 756.45 713.72
13 741.01 728.41
14 760.22 740.28
15 802.34 756.21
16 799.04 764.89
17 773.81 771.97
18 768.41 720.07
19 746.14 754.11
20 815.40 765.14
There are ~1700 data points overall
Here is my code:
library("spatstat")
WT <- read.csv("Test.csv")
colnames(WT) <- c("x","y")
#determine bounding window
win <- ripras(WT)
unitname(win) <- c("micrometer")
#convert to ppp data class
WT.ppp <- as.ppp(WT, win)
plot(WT.ppp)
#estimate intensity function using cross validation
I <- density.ppp(WT.ppp,sigma=bw.diggle(WT.ppp),adjust=0.3,kernal="epanechnikov")
plot(I)
#predetermined r values for PCF
radius <- seq(from = 0, to = 50, by = 0.5)
#use envelopes to test the null hypothesis (ie. inhomogenous poisson process)
PCF_envelopes <- envelope(WT.ppp,divisor="d", pcfinhom,r = radius,nsim=10,simulate=expression(rpoispp(I)) )
When I run rpoispp(I), I get the following error:
Error in sample.int(npix, size = ni, replace = TRUE, prob = lpix) :
negative probability
I can't seem to figure out what the issue is... any suggestions?
Thanks for your help!
This is happening because the image I contains some negative values, probably very small values but negative. You can check that by computing range(I) or min(I) or any(I < 0).
The help for density.ppp says that the result may contain negative values (very small ones) due to numerical error. To remove these, you need to set positive=TRUE in the call to density.ppp.
By the way, the argument kernel has been mis-spelt in the code above. Also the vector r is too coarsely spaced; you would be better off leaving this argument unspecified. And you don't need to type density.ppp, just density.
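Putting those fixes together, the relevant calls would look like this (a sketch reusing the question's own parameters):
# corrected kernel spelling, positive = TRUE, plain density(), r left unspecified
I <- density(WT.ppp, sigma = bw.diggle(WT.ppp), adjust = 0.3,
             kernel = "epanechnikov", positive = TRUE)
PCF_envelopes <- envelope(WT.ppp, pcfinhom, divisor = "d",
                          nsim = 10, simulate = expression(rpoispp(I)))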

Calculate furthest distance in given time or best time for given distance

I have imported data from my GPS tracker and I am trying to figure out how best to calculate the furthest distance run in a given time (e.g. 12 minutes) or the best time for a given distance (e.g. 5 miles). Given that the observations are taken at unequal intervals and my speed is not constant, I will have data like the table below:
x <- read.table(header=T, sep="", stringsAsFactors = FALSE,text="
time dist
4 3
5 4
5 6
3 2
5 5
4 5
4 3
4 2
5 6")
My best attempt so far is to generate a new dataset where time goes by one unit per row. It is then relatively easy to calculate the furthest distance in a given time. The downside of this is that a) I would need to repeat the same logic for best time (generate data with unit distance), and b) it seems to be a quite sub-optimal solution for data with thousands of data points.
# Generate data frame where each row represents one unit of time
z_adj <- data.frame(
time = unlist(sapply(x$time, function(s) rep(s, each = s))),
dist = unlist(sapply(seq_along(x$dist), function(s) rep(x$dist[s], each = x$time[s])))
)
z_adj$seq_time <- seq_along(z_adj$time)
z_adj$time_dist <- z_adj$dist / z_adj$time
# Furthest distance given time
# Time 10
z_adj$in_t10 <- sapply(z_adj$seq_time, function(s) sum(z_adj$dist[s:(s+9)]))
z_adj$in_t10[which(z_adj$in_t10 == max(z_adj$in_t10, na.rm = T))]
# Fastest time given distance
# ... would need to do the above again with constant distance :/
Is there a more straightforward approach to accomplish this?
You could use something like this:
x <- read.table(header=T, sep="", stringsAsFactors = FALSE,text="
time dist
4 3
5 4
5 6
3 2
5 5
4 5
4 3
4 2
5 6")
# Add starting point and cumulative time/distance
x <- rbind(c(0,0), x)
x$total_time <- cumsum(x$time)
x$total_dist <- cumsum(x$dist)
# function to interpolate and calculate lagging differences
foo <- function(x, y, n) {
interpolation <- approx(x, y, xout = seq(min(x), max(x)))
diff(interpolation$y, lag = n)
}
# Max distance in ten units of time
max(foo(x$total_time, x$total_dist, 10))
# Min time for ten units of distance
min(foo(x$total_dist, x$total_time, 10))
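To recover not just the optimum but where it occurs, which.max/which.min give the window's starting index on the interpolated grid (a small sketch):
# starting time unit of the furthest 10-time-unit window
which.max(foo(x$total_time, x$total_dist, 10))
# starting distance unit of the fastest 10-distance-unit stretch
which.min(foo(x$total_dist, x$total_time, 10))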
BTW, in your code you should sum over z_adj$time_dist instead of z_adj$dist to get the correct result.
