KNN outlier detection in R

I am trying to run a script I was given to perform outlier detection using a weighted KNN outlier score, but keep getting the following error:
Error in apply(kNNdist(x = dat, k = k), 1, mean) :
dim(X) must have a positive length
The full script is below. It runs as a single block, but I have added a comment directly above the section that causes the error, which is this line:
WKNN_Outlier <- apply(kNNdist(x=dat, k = k), 1, mean)
If anyone has any better or simpler ideas for unsupervised outlier detection, I am all ears (so to speak...)
library(dbscan)
library(ggplot2)
set.seed(0)
x11 <- rnorm(n = 100, mean = 10, sd = 1) # Cluster 1 (x1 coordinate)
x21 <- rnorm(n = 100, mean = 10, sd = 1) # Cluster 1 (x2 coordinate)
x12 <- rnorm(n = 100, mean = 20, sd = 1) # Cluster 2 (x1 coordinate)
x22 <- rnorm(n = 100, mean = 10, sd = 1) # Cluster 2 (x2 coordinate)
x13 <- rnorm(n = 100, mean = 15, sd = 3) # Cluster 3 (x1 coordinate)
x23 <- rnorm(n = 100, mean = 25, sd = 3) # Cluster 3 (x2 coordinate)
x14 <- rnorm(n = 50, mean = 25, sd = 1) # Cluster 4 (x1 coordinate)
x24 <- rnorm(n = 50, mean = 25, sd = 1) # Cluster 4 (x2 coordinate)
dat <- data.frame(x1 = c(x11,x12,x13,x14), x2 = c(x21,x22,x23,x24))
( g0a <- ggplot() + geom_point(data=dat, mapping=aes(x=x1, y=x2), shape = 19) )
k <- 4 # KNN parameter
top_n <- 20 # No. of top outliers to be displayed
KNN_Outlier <- kNNdist(x=dat, k = k)
rank_KNN_Outlier <- order(x=KNN_Outlier, decreasing = TRUE) # Sorting (descending)
KNN_Result <- data.frame(ID = rank_KNN_Outlier, score = KNN_Outlier[rank_KNN_Outlier])
head(KNN_Result, top_n)
graph <- g0a +
  geom_point(data = dat[rank_KNN_Outlier[1:top_n], ],
             mapping = aes(x = x1, y = x2), shape = 19, color = "red", size = 2) +
  geom_text(data = dat[rank_KNN_Outlier[1:top_n], ],
            mapping = aes(x = (x1 - 0.5), y = x2, label = rank_KNN_Outlier[1:top_n]),
            size = 2.5)
graph
## Use kNNdist() to calculate the weighted KNN outlier score
k <- 4 # KNN parameter
top_n <- 20 # No. of top outliers to be displayed
The WKNN_Outlier line below is what causes the error. From what I can gather, apply() shouldn't have any issues, since the data (dat) has been converted into a data.frame, which I thought would prevent the error, but it doesn't.
WKNN_Outlier <- apply(kNNdist(x=dat, k = k), 1, mean) # Weighted KNN outlier score (mean)
rank_WKNN_Outlier <- order(x=WKNN_Outlier, decreasing = TRUE)
WKNN_Result <- data.frame(ID = rank_WKNN_Outlier, score = WKNN_Outlier[rank_WKNN_Outlier])
head(WKNN_Result, top_n)
ge1 <- g0a +
  geom_point(data = dat[rank_WKNN_Outlier[1:top_n], ],
             mapping = aes(x = x1, y = x2), shape = 19, color = "red", size = 2) +
  geom_text(data = dat[rank_WKNN_Outlier[1:top_n], ],
            mapping = aes(x = (x1 - 0.5), y = x2, label = rank_WKNN_Outlier[1:top_n]),
            size = 2.5)
ge1

The function kNNdist(x = dat, k = k) produces a vector, not a matrix, which is why apply() fails with dim(X) must have a positive length (a vector has a NULL dim).
Try:
WKNN_Outlier <- apply(kNNdist(x = dat, k = k, all = TRUE), 1, mean)
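With all = TRUE, kNNdist() returns an n-by-k matrix (one column per nearest neighbour) rather than just the distance to the k-th neighbour, so apply(..., 1, mean) can average across rows. A quick sanity check:
m <- kNNdist(x = dat, k = k, all = TRUE)
dim(m)                  # 350 x 4: one row per point, one column per neighbour
head(apply(m, 1, mean)) # mean distance to the 4 nearest neighbours
As for simpler ideas: dbscan also provides lof() (local outlier factor), which returns a per-point outlier score directly; a minimal sketch:
lof_scores <- lof(dat, minPts = k + 1)            # scores well above 1 suggest outliers
head(order(lof_scores, decreasing = TRUE), top_n) # IDs of the top outliers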

Related

How can I find the maximum output of a function

If I have a GLM, is there any way I can efficiently find the maximum output by changing one covariate while holding the others constant?
Using my simulated data:
# FUNCTIONS ====================================================================
library(ggplot2) # for ggplot() in plot_beta_duration()
library(dplyr)   # for %>%, rowwise() and mutate()
logit <- function(p){
  log(p/(1-p))
}
sigmoid <- function(x){
  1/(1 + exp(-x))
}
beta_duration <- function(D, select){
  logit(
    switch(select,
           0.05 + 0.9 / (1 + exp(-2*D + 25)),
           0.9 * exp(-exp(-0.5 * (D - 11))),
           0.9 * exp(-exp(-(D - 11))),
           0.9 * exp(-2 * exp(-(D - 9))),
           sigmoid(0.847 + 0.210 * (D - 10)),
           0.7 + 0.0015 * (D - 10) ^ 2,
           0.7 - 0.0015 * (D - 10) ^ 2 + 0.03 * (D - 10)
    )
  )
}
beta_sex <- function(sex, OR = 1){
  ifelse(sex == "Female", -0.5 * log(OR), 0.5 * log(OR))
}
plot_beta_duration <- function(select){
  x <- seq(10, 20, by = 0.01)
  y <- beta_duration(x, select)
  data.frame(x = x, y = y) %>%
    ggplot(aes(x = x, y = y)) +
    geom_line() +
    ylim(0, 1)
}
# DATA SIMULATION ==============================================================
duration <- c(10, 12, 14, 18, 20)
sex <- factor(c("Female", "Male"))
eta <- function(duration, sex, duration_select, sex_OR, noise_sd){
  beta_sex(sex, sex_OR) + beta_duration(duration, duration_select) +
    rnorm(length(duration), 0, noise_sd)
}
sim_data <- function(durations_type, sex_OR, noise_sd, p_female, n, seed){
  set.seed(seed)
  data.frame(
    duration = sample(duration, n, TRUE),
    sex = sample(sex, n, TRUE, c(p_female, 1 - p_female))
  ) %>%
    rowwise() %>%
    mutate(eta = eta(duration, sex, durations_type, sex_OR, noise_sd),
           p = sigmoid(eta),
           cured = sample(0:1, 1, prob = c(1 - p, p)))
}
# DATA SIM PARAMETERS
durations_type <- 4 # See beta_duration for functions
sex_OR <- 3 # Odds of cure for male vs female (ref)
noise_sd <- 1
p_female <- 0.7 # proportion of females in the sample
n <- 500
data <- sim_data(durations_type = 1, # See beta_duration for functions
                 sex_OR = 3,         # Odds of cure for male vs female (ref)
                 noise_sd = 1,
                 p_female = 0.7,     # proportion of females in the sample
                 n = 500,
                 seed = 21874564)
I am fitting a fractional polynomial GLM:
library(mfp)
model1 <- mfp(cured ~ fp(duration) + sex,
              family = binomial(link = "logit"),
              data = data)
summary(model1)
Given that I am holding sex constant, is there any way to find the value of duration within a certain range that gives me the highest predicted value? Something more efficient than this brute-force grid search:
range <- seq(10, 20, by = 1e-4)
range[which.max(predict(model1, type = "response", newdata = data.frame(duration = range, sex = "Male")))]
You can use optimize here. Just create a function which returns a prediction based on the value of duration:
f <- function(x) predict(model1, list(sex = 'Male', duration = x))
And we can find the value of duration which produces the maximum log odds within the range 0-20 by doing:
optimise(f, c(0, 20), maximum = TRUE)$maximum
#> [1] 17.95679
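Since the logistic link is monotone, the duration that maximises the log odds also maximises the predicted probability, so (as a quick check, reusing f and model1 from above) you can confirm on the response scale:
best <- optimise(f, c(0, 20), maximum = TRUE)$maximum
predict(model1, type = 'response',
        newdata = data.frame(sex = 'Male', duration = best))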

Find value of covariate given a probability in R

Given a fractional polynomial GLM, I am looking to find the value of a covariate that gives me an output of a given probability.
My data is simulated using:
# FUNCTIONS ====================================================================
library(ggplot2) # for ggplot() in plot_beta_duration()
library(dplyr)   # for %>%, rowwise() and mutate()
logit <- function(p){
  log(p/(1-p))
}
sigmoid <- function(x){
  1/(1 + exp(-x))
}
beta_duration <- function(D, select){
  logit(
    switch(select,
           0.05 + 0.9 / (1 + exp(-2*D + 25)),
           0.9 * exp(-exp(-0.5 * (D - 11))),
           0.9 * exp(-exp(-(D - 11))),
           0.9 * exp(-2 * exp(-(D - 9))),
           sigmoid(0.847 + 0.210 * (D - 10)),
           0.7 + 0.0015 * (D - 10) ^ 2,
           0.7 - 0.0015 * (D - 10) ^ 2 + 0.03 * (D - 10)
    )
  )
}
beta_sex <- function(sex, OR = 1){
  ifelse(sex == "Female", -0.5 * log(OR), 0.5 * log(OR))
}
plot_beta_duration <- function(select){
  x <- seq(10, 20, by = 0.01)
  y <- beta_duration(x, select)
  data.frame(x = x, y = y) %>%
    ggplot(aes(x = x, y = y)) +
    geom_line() +
    ylim(0, 1)
}
# DATA SIMULATION ==============================================================
duration <- c(10, 12, 14, 18, 20)
sex <- factor(c("Female", "Male"))
eta <- function(duration, sex, duration_select, sex_OR, noise_sd){
  beta_sex(sex, sex_OR) + beta_duration(duration, duration_select) +
    rnorm(length(duration), 0, noise_sd)
}
sim_data <- function(durations_type, sex_OR, noise_sd, p_female, n, seed){
  set.seed(seed)
  data.frame(
    duration = sample(duration, n, TRUE),
    sex = sample(sex, n, TRUE, c(p_female, 1 - p_female))
  ) %>%
    rowwise() %>%
    mutate(eta = eta(duration, sex, durations_type, sex_OR, noise_sd),
           p = sigmoid(eta),
           cured = sample(0:1, 1, prob = c(1 - p, p)))
}
# DATA SIM PARAMETERS
durations_type <- 4 # See beta_duration for functions
sex_OR <- 3 # Odds of cure for male vs female (ref)
noise_sd <- 1
p_female <- 0.7 # proportion of females in the sample
n <- 500
data <- sim_data(durations_type = 1, # See beta_duration for functions
                 sex_OR = 3,         # Odds of cure for male vs female (ref)
                 noise_sd = 1,
                 p_female = 0.7,     # proportion of females in the sample
                 n = 500,
                 seed = 21874564)
And my model is fitted by:
library(mfp)
model1 <- mfp(cured ~ fp(duration) + sex,
              family = binomial(link = "logit"),
              data = data)
summary(model1)
For each level of sex (i.e. "Male" or "Female"), I want to find the value of duration that gives me a probability equal to some value frontier <- 0.8.
So far, I can only think of approximating the answer with a dense vector of candidate values:
pred_duration <- seq(10, 20, by = 0.1)
newd <- expand.grid(duration = pred_duration, sex = sex)
pred <- data.frame(newd,
                   p = predict(model1, newdata = newd, type = "response"))
pred[which(pred$p > 0.8), ] %>%
  group_by(sex) %>%
  summarize(min(duration))
But I am really after an exact solution.
The function uniroot finds the point at which the output of a function equals 0. If you create a function that takes duration as input, calculates the predicted probability for that duration, and subtracts the desired probability, then that function equals 0 at the desired value of duration, and uniroot will find it for you. Wrapping this process in a small helper makes it easy to use:
find_prob <- function(p) {
  f <- function(v) {
    predict(model1, type = 'response',
            newdata = data.frame(duration = v, sex = 'Male')) - p
  }
  uniroot(f, interval = range(data$duration), tol = 1e-9)$root
}
So, for example, to find the duration that gives an 80% probability, we just do:
find_prob(0.8)
#> [1] 12.86089
To prove that this is the correct value, we can feed it straight back into predict to see the predicted probability given sex = 'Male' and duration = 12.86089:
predict(model1, type = 'response',
        newdata = data.frame(sex = 'Male', duration = find_prob(0.8)))
#> 1
#> 0.8
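The question asks about each level of sex, so the helper can be generalised (a sketch with a hypothetical name, find_prob_sex; note that uniroot needs the predicted probability to actually cross p somewhere in the interval, otherwise it throws an error):
find_prob_sex <- function(p, sex) {
  f <- function(v) {
    predict(model1, type = 'response',
            newdata = data.frame(duration = v, sex = sex)) - p
  }
  uniroot(f, interval = range(data$duration), tol = 1e-9)$root
}
find_prob_sex(0.8, 'Male') # and likewise with sex = 'Female'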

Monte Carlo simulations for VAR models

I've been trying to estimate VAR models using Monte Carlo simulation. I have 3 endogenous variables and need some guidance.
First of all, I want to add outliers as a percentage of the sample size.
Second (a second simulation for the same model), instead of outliers I want to draw the innovations from a multivariate contaminated normal distribution, 0.9·N(0, I) + 0.1·N((0,0,0)', diag(100, 100, 100)).
Could you tell me how to do these? Thank you.
library(tsDyn)       # for VECM()
library(MonteCarlo)  # for MonteCarlo() and MakeTable()
RR <- function(n, out){
  # n is the number of observations
  k <- 3 # Number of endogenous variables
  p <- 2 # Number of lags
  # NOTE: this only inflates the sample size by 'out'; it does not inject outliers yet
  n <- n + out
  # Generate coefficient matrices
  B1 <- matrix(c(.1, .3, .4, .1, -.2, -.3, .03, .1, .1), k) # Coefficient matrix of lag 1
  B2 <- matrix(c(0, .2, .1, .07, -.4, -.1, .5, 0, -.1), k)  # Coefficient matrix of lag 2
  M <- cbind(B1, B2) # Stacked coefficient matrices
  # Generate series
  DT <- matrix(0, k, n + 2*p) # Raw series filled with zeros
  for (i in (p + 1):(n + 2*p)){ # Generate series with e ~ N(0,1)
    DT[, i] <- B1 %*% DT[, i-1] + B2 %*% DT[, i-2] + rnorm(k, 0, 1)
  }
  DT <- ts(t(DT[, -(1:p)])) # Convert to time series format
  colnames(DT) <- c("Y1", "Y2", "Y3") # Rename variables
  #plot.ts(DT) # Plot the series
  # Estimate VECMs with cointegration ranks 2 and 1
  vecm1 <- VECM(DT, lag = 2, r = 2, include = "const", estim = "ML")
  vecm2 <- VECM(DT, lag = 2, r = 1, include = "const", estim = "ML")
  # Mean squared error of the residuals
  mse1 <- mean(vecm1$residuals^2)
  mse2 <- mean(vecm2$residuals^2)
  return(list("mse1" = mse1, "mse2" = mse2)) # mse3 was never defined, so it is dropped
}
# Define the parameter grids (the parameter ranges we want to run our function with)
n_grid <- c(50, 80, 200, 400)
out_grid <- c(0, 5, 10)
# Collect the parameter grids in a list (to pass to the Monte Carlo function)
prml <- list("n" = n_grid, "out" = out_grid)
# run simulation
RRS <- MonteCarlo(func = RR, nrep = 1000, param_list = prml)
summary(RRS)
# make table:
rows = "n"
cols = "out"
MakeTable(output = RRS, rows = rows, cols = cols)
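For the contaminated-normal part, here is a minimal sketch of mixture innovations for the series loop inside RR(), assuming the 100s in the mixture are variances (so sd = 10):
# Draw one k-dimensional innovation from 0.9*N(0, I) + 0.1*N(0, 100*I)
rcontam <- function(k, p_contam = 0.1, sd_contam = 10) {
  if (runif(1) < p_contam) rnorm(k, 0, sd_contam) else rnorm(k, 0, 1)
}
# Inside the loop, replace rnorm(k, 0, 1) with rcontam(k).
# For outliers as a percentage 'out' of the sample size, one could instead
# shift a random subset of time points in the raw k x T matrix, e.g.
# idx <- sample(ncol(DT), size = floor(out/100 * ncol(DT)))
# DT[, idx] <- DT[, idx] + 10 # before the ts(t(...)) conversion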

Silhouette value of each cluster using clustMixType

Using the clustMixType package, I'm trying to get the silhouette values for the following dataset, following the example on page 13 here:
https://cran.r-project.org/web/packages/clustMixType/clustMixType.pdf
n <- 10
prb <- 0.99
muk <- 2.5
x1 <- sample(c("A","B"), 2*n, replace = TRUE, prob = c(prb, 1-prb))
x1 <- c(x1, sample(c("A","B"), 2*n, replace = TRUE, prob = c(1-prb, prb)))
x1 <- as.factor(x1)
x2 <- sample(c("A","B"), 2*n, replace = TRUE, prob = c(prb, 1-prb))
x2 <- c(x2, sample(c("A","B"), 2*n, replace = TRUE, prob = c(1-prb, prb)))
x2 <- as.factor(x2)
x3 <- c(rnorm(n, mean = -muk), rnorm(n, mean = muk), rnorm(n, mean = -muk), rnorm(n, mean = muk))
x4 <- c(rnorm(n, mean = -muk), rnorm(n, mean = muk), rnorm(n, mean = -muk), rnorm(n, mean = muk))
x <- data.frame(x1,x2,x3,x4)
This seems to work for obtaining the silhouette value for several numbers of clusters k:
library(clustMixType)
Essil <- rep(NA_real_, 6) # k = 1 is not evaluated, so leave it as NA
for(i in 2:6){
  kpres <- kproto(x, k = i, na.rm = FALSE)
  val_sil <- validation_kproto(method = "silhouette", object = kpres)
  Essil[i] <- val_sil
}
plot(1:6, Essil, type = "b", ylab = "Silhouette", xlab = "Number of clusters")
Essil
As shown in the plot, the best number of clusters would therefore be 4.
But validation_kproto gives the mean silhouette value for each k: perhaps 4 clusters gives the best mean silhouette while some individual clusters have a silhouette below 0.5, in which case 5 clusters might be a better solution.
It would therefore be important to know the silhouette value of each cluster. Would that be possible?
I've tried setting kp_obj = 'all', but it does not work.
That is correct: the validation_kproto function only outputs the silhouette index for the entire cluster partition. It is currently not possible to get the indices per cluster.
A possible alternative in the current state: use val <- validation_kproto(data = x, k = 2:6, kp_obj = "all") (instead of the for(i in 2:6){...} loop) to get all examined kproto objects, where you can compare the within-cluster distances (val$kp_obj$withinss).
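If per-cluster values are needed right away, one approximate workaround (a sketch; daisy's Gower dissimilarity is not identical to the k-prototypes distance) is to compute silhouette widths with the cluster package and average them per cluster:
library(cluster)
d <- daisy(x, metric = "gower")     # mixed-type dissimilarity matrix
sil <- silhouette(kpres$cluster, d) # kpres: whichever kproto fit you want to inspect
aggregate(sil[, "sil_width"],
          by = list(cluster = sil[, "cluster"]), FUN = mean)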
Many greetings Rabea

How to simulate PCA Data?

I am trying to simulate PCA data as follows:
library(mvtnorm) # for rmvnorm()
q <- 5 # no. of PCs
p <- 20 # no. of variables
n <- 2000 # no. of individuals
eps <- 0.05 # error standard deviation
# Eigenvalues
Sig <- seq(3, 1, length.out = q)^2
Lambda <- diag(Sig)
# Matrix of principal components
H <- rmvnorm(n = n, mean = rep(0, q), sigma = Lambda)
# Add Gaussian noise
E <- matrix(rnorm(n*p, sd = sqrt(eps)), ncol = p)
# Data matrix
Y <- H %*% t(Amat) + E
# Perform PCA
summary(m1 <- prcomp(Y, scale. = TRUE)) # and so on...
However, I have no idea how to create the loadings matrix Amat in a meaningful way. Thanks for any help!
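One simple construction (a sketch; orthonormal loadings are just one of many reasonable choices) is to draw a random p x q matrix and orthonormalize its columns via QR. With orthonormal columns, cov(Y) = Amat %*% Lambda %*% t(Amat) + eps * I, so the leading principal axes of Y recover the columns of Amat up to sign:
set.seed(1)
Amat <- qr.Q(qr(matrix(rnorm(p * q), nrow = p, ncol = q))) # p x q, orthonormal columns
round(crossprod(Amat), 10) # t(Amat) %*% Amat should be the q x q identity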
This does not use the same structure as the OP, but it simulates a PCA with 4 different groups (which could be species), each with 3 "traits" (each trait has a different mean and sd, based for example on biological data found in the literature).
set.seed(123) # setting this so the random results will be repeatable
library(MASS)
# Simulating 3 traits for 4 different species
n = 200 # number of "individuals"
# Generate the groups
Amat1 = MASS::mvrnorm(n, mu = c(11.2,11.8,9.91), Sigma = diag(c(1.31,1.01,1.02)))
Amat2 = MASS::mvrnorm(n, mu = c(7.16,8.54,6.82), Sigma = diag(c(0.445,0.546,0.350)))
Amat3 = MASS::mvrnorm(n, mu = c(15.6,14.6,13.5), Sigma = diag(c(1.43,0.885,0.990)))
Amat4 = MASS::mvrnorm(n, mu = c(8.65,14.1,8.24), Sigma = diag(c(0.535,0.844,0.426)))
# Combine the data
Amat = rbind(Amat1,Amat2,Amat3,Amat4)
# Make group data
Amat.gr = cbind(Amat, gl(4,k=n,labels = c(1,2,3,4)))
# Calculate the covariance matrix for each group
by(Amat.gr[,1:3],INDICES = Amat.gr[,4],FUN = cov) # calculate covariance matrix for all groups
# Plot the result
summary(m1 <- prcomp(Amat, scale. = TRUE))
# biplot(m1, xlabs=rep(".", nrow(Amat)), cex = 2)
plot(vegan::scores(m1), asp = 1, pch = 19, col = gl(4,k=n,labels = c(1,2,3,4)))
plot(Amat[,1],Amat[,2], pch = 19, col = gl(4,k=n,labels = c(1,2,3,4)))
The plot on the left shows the PCA and on the right the raw data.
I added a toy example with data to show the algorithm for computing a PCA in R, from Legendre and Legendre (2012).
# Generate vectors (example from Legendre and Legendre 2012)
v1 = c(2,3,5,7,9)
v2 = c(1,4,0,6,2)
# If you want to play with sample size
# n = 100
# v1 = rnorm(n = n, mean = mean(v1), sd = sd(v1))
# v2 = rnorm(n = n, mean = mean(v2), sd = sd(v2))
# Get the y matrix
y = cbind(v1,v2)
# Centered y matrix
yc = apply(y, 2, FUN = function(x) x-mean(x))
# Dispersion matrix
s = 1/(nrow(y)-1)*t(yc) %*% yc
# Compute the singular value decomposition to get the eigenvectors
ev = svd(s)$v
# get the principal components
f = yc %*% ev
# This gives the identity matrix (the eigenvectors are orthonormal)
round(t(svd(s)$v) %*% svd(s)$v, 2)
# These are the eigenvalues
svd(s)$d
# The same eigenvectors, up to sign, from the SVD of the centred data itself (p. 104)
-svd(yc)$v
plot(f, pch = 19); abline(h=0,v=0, lty = 3)
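As a quick cross-check (an addition to the toy example), prcomp on the same data should reproduce these scores up to sign flips of the components:
p1 <- prcomp(y, center = TRUE, scale. = FALSE)
round(abs(p1$x) - abs(f), 10) # differences should all be ~0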
