I want to draw a multiple sequence alignment result in a single overall view, so I drew a picture with the following code:
library(ggmsa)
library(Biostrings)
library(dplyr)    # for %>% and select()
library(ggplot2)
aln_data <- system.file("extdata", "seedSample.fa", package ="ggmsa")
df <- tidy_msa(aln_data)
### df looks like this
#           name position character
# 1         Homo        1         -
# 2 fascicularis        1         G
# 3      mulatta        1         -
# 4         Homo        2         -
# 5 fascicularis        2         T
# 6      mulatta        2         -
max_pos <- df %>% select(position) %>% max()
height_n <- length(unique(df$name))
df$y<- rep(seq(1,height_n,1),times=max_pos)
# calculate the most frequent character at each position
fre_list<-list()
for (i in 1:length(unique(df$position))){
single_pos <- df[df$position == i,]
most_fre <- names(sort(table(single_pos$character),decreasing = T)[1])
fre_list[[i]] <- most_fre
}
df$fre <- rep(unlist(fre_list),each=height_n)
df$y <- ifelse(df$character == df$fre,0,df$y)
df$y0 <- ifelse(df$y ==0,0,df$y-1)
ggplot(df)+geom_segment(aes(x=position,xend=position,y=y0,yend=y),color='grey',alpha=0.8)+
theme_bw()+ggplot2::theme(panel.grid=element_blank(),
axis.text.x = element_blank(),axis.text.y = element_blank(),
axis.ticks=element_blank(),axis.title.x = element_blank(),
axis.title.y = element_blank())+
scale_y_continuous(expand=c(0,0))+
scale_x_continuous(expand=c(0,0))
It works well. Then I wanted to turn it into a new geom, like this:
StatMsawind <- ggplot2::ggproto('StatMsawind', ggplot2::Stat,
  required_aes = c('x', 'y'),
  compute_group = function(data, scales) {
    View(data)
    pos_number <- data %>% select(x) %>% max()
    print(pos_number)
    height_n <- nrow(data) / pos_number
    print(height_n)
    out <- data.frame('y' = rep(seq(1, height_n, 1), times = pos_number))
    out$cha <- data$y
    fre_list <- list()
    for (i in 1:pos_number) {
      single_pos <- data[data$x == i, ]
      most_fre <- names(sort(table(single_pos$y), decreasing = TRUE)[1])
      fre_list[[i]] <- most_fre
    }
    out$fre <- rep(unlist(fre_list), each = height_n)
    out$y <- ifelse(out$cha == out$fre, 0, out$y)
    out$ystart <- ifelse(out$y == 0, 0, out$y - 1)
    print(out)
    out
  })
stat_msawind <- function(data=NULL,mapping =NULL,geom = 'msawind',position='identity',
inherit.aes=TRUE,...){
ggplot2::layer(stat = StatMsawind, data = data, mapping = mapping, geom = geom,
position = position,inherit.aes = inherit.aes,
params = list(...))
}
GeomMsawind <- ggplot2::ggproto('GeomMsawind',ggplot2::Geom,
required_aes=c('x','y','ystart'),
default_aes=aes(color='grey',alpha=0.8),
draw_key = draw_key_abline,
draw_panel = function(data,panel_scales,coord){
coords <- coord$transform(data,panel_scales)
print(coords)
grid::segmentsGrob(x0=coords$x,x1=coords$x,y0=coords$ystart,y1=coords$y,
gp=grid::gpar(col=ggplot2::alpha(coords$color,coords$alpha)))
})
geom_msawind <- function(data=NULL,mapping =NULL,stat='msawind',position='identity',inherit.aes=TRUE,...){
ggplot2::layer(data = data,mapping = mapping,stat = stat, geom=GeomMsawind,position=position,inherit.aes = inherit.aes,
params = list(color='grey',alpha=0.8,...))
}
ggplot(df,aes(position,character))+geom_msawind()
I get this error:
Warning message:
Computation failed in `stat_msawind()`:
wrong sign in 'by' argument
When I check the data inside the StatMsawind function, I find that its shape is not equal to the data frame df that I passed in. I can't figure out why the stat doesn't compute correctly. What should I do to make it work?
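For reference, here is a minimal throwaway stat one could use to inspect exactly what compute_group receives (a debugging sketch only; StatInspect is a made-up name, not part of ggmsa or ggplot2):
# Debugging sketch: StatInspect draws nothing; it only prints the data frame
# that ggplot2 hands to compute_group. At this point discrete aesthetics have
# already been mapped to integer positions and the data has been split by group,
# so compute_group is called once per group, not once with the whole df.
StatInspect <- ggplot2::ggproto("StatInspect", ggplot2::Stat,
  compute_group = function(data, scales) {
    str(data)   # show exactly what arrives: columns, types, number of rows
    data        # pass it through unchanged; geom "blank" draws nothing
  }
)
ggplot(df, aes(position, character)) +
  ggplot2::layer(stat = StatInspect, geom = "blank", data = NULL, mapping = NULL,
                 position = "identity", inherit.aes = TRUE, params = list())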
Thanks very much.
I am trying to estimate a mixed-mixed multinomial logit (MM-MNL) model using the gmnl package. It works perfectly when alternative-specific constants (ASCs) are not included, but it produces a weird error when they are incorporated. The code below was taken (and adapted) from the original article published on the package.
Data preparation
options(digits = 3)
library("gmnl")
library("mlogit")
data("Electricity", package = "mlogit")
Electr <- mlogit.data(Electricity,
id.var = "id",
choice = "choice",
varying = 3:26,
shape = "wide",
sep = "")
####Alternative Specific Constants
Electr$asc2 <- as.numeric(Electr$alt == 2)
Electr$asc3 <- as.numeric(Electr$alt == 3)
Electr$asc4 <- as.numeric(Electr$alt == 4)
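# Optional sanity check (sketch): each dummy should equal 1 exactly when the
# row belongs to its own alternative, e.g.
table(Electr$alt, Electr$asc2)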
Latent Class Models (with ASC)
The code below works perfectly, whether the ASCs are included in the second part of the formula (LC_ASC_in_formula) or explicitly among the regressors (LC_ASC_in_variables).
LC_ASC_in_formula <- gmnl(choice ~ pf + cl + loc + wk + tod + seas | 1 | 0 | 0 | 1,
data = Electr,
subset = 1:3000,
model = "lc",
panel = TRUE,
Q = 2)
summary(LC_ASC_in_formula)
LC_ASC_in_variables <- gmnl( choice ~ pf + cl + loc + wk + tod + seas +asc2 +asc3 +asc4 | 0 | 0 | 0 | 1,
data = Electr,
subset = 1:3000,
model = "lc",
panel = TRUE,
Q = 2)
summary(LC_ASC_in_variables)
## Are they the same?
logLik(LC_ASC_in_variables) == logLik(LC_ASC_in_formula)
## [1] TRUE
Mixed-mixed MNL model
This model is basically a latent class model, but within each class the parameters are random (they follow a pre-specified parametric distribution).
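To fix notation (my own summary; the symbols are mine, not taken from the gmnl article), the likelihood contribution of individual i mixes Q classes, each with its own continuously distributed coefficients:

P_i \;=\; \sum_{q=1}^{Q} w_{iq} \int \prod_{t}\prod_{j} \left[\frac{\exp(x_{itj}'\beta)}{\sum_{k}\exp(x_{itk}'\beta)}\right]^{y_{itj}} f_q(\beta)\, d\beta

where w_{iq} are the class shares and f_q is the class-q mixing density (normal here, as declared in ranp).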
Mixed-mixed MNL model WITHOUT ASC
The model works just fine when the ASCs are omitted.
MM_no_ASC <- gmnl(choice ~ pf + cl + loc + wk + tod + seas | 0 | 0 | 0 | 1,
data = Electr,
subset = 1:3000,
model = "mm",
R = 5,
panel = TRUE,
ranp = c(pf = "n",cl = "n",loc = "n",wk = "n", tod = "n",seas= "n"),
Q = 2,
iterlim = 500)
However, it fails to estimate the model when the ASCs are included:
As part of the variables in the model:
MM_ASC_in_variables <- gmnl( choice ~ pf + cl + loc + wk + tod + seas +
asc2 +asc3 +asc4 | 0 | 0 | 0 | 1 ,
data = Electr,
subset = 1:3000,
model = "mm",
R = 5,
panel = TRUE,
ranp = c(pf = "n",cl = "n",loc = "n",wk = "n", tod = "n",seas= "n"),
Q = 2,
iterlim = 500)
> Error in if (distr == "n") { : missing value where TRUE/FALSE needed
and when including them in the second part of the formula:
MM_ASC_in_formula <- gmnl( choice ~ pf + cl + loc + wk + tod + seas | 1 | 0 | 0 | 1 ,
data = Electr,
subset = 1:3000,
model = "mm",
R = 5,
panel = TRUE,
ranp = c(pf = "n",cl = "n",loc = "n",wk = "n", tod = "n",seas= "n"),
Q = 2,
iterlim = 500)
> Error in if (distr == "n") { : missing value where TRUE/FALSE needed
However, both ways of including the ASC parameters fail to initialize the model estimation. Hopefully someone can help me solve this issue. Thank you in advance.
Bonus 1: Traceback of the error
I reduced the number of observations included in the estimation (subset = 1:20) to make the traceback() output below easier to read, but I couldn't spot the problem myself.
MM_ASC_in_formula <- gmnl( choice ~ pf + cl + loc + wk + tod + seas | 1 | 0 | 0 | 1 ,
data = Electr,
subset = 1:20,
model = "mm",
R = 5,
panel = TRUE,
ranp = c(pf = "n",cl = "n",loc = "n",wk = "n", tod = "n",seas= "n"),
Q = 2,
iterlim = 500)
# Error in if (distr == "n") { : missing value where TRUE/FALSE needed
traceback()
# Estimating MM-MNL model
# Error in if (distr == "n") { : missing value where TRUE/FALSE needed
# > traceback()
# 14: Makeh.rcoef(beta[, q], stds[, q], ranp, Omega[, ((i - 1) * R +
# 1):(i * R), drop = FALSE], correlation, Pi = NULL, Slist = NULL,
# mvar = NULL)
# 13: fnOrig(theta, ...)
# 12: logLikFunc(theta, fnOrig = function (theta, y, X, H, Q, id = NULL,
# ranp, R, correlation, weights = NULL, haltons = NULL, seed = 12345,
# gradient = TRUE, get.bi = FALSE)
# {
# K <- ncol(X[[1]])
# J <- length(X)
# N <- nrow(X[[1]])
# panel <- !is.null(id)
# if (panel) {
# n <- length(unique(id))
# if (length(weights) == 1)
# weights <- rep(weights, N)
# }
# beta <- matrix(theta[1L:(K * Q)], nrow = K, ncol = Q)
# nstds <- if (!correlation)
# K * Q
# else (0.5 * K * (K + 1)) * Q
# stds <- matrix(theta[(K * Q + 1):(K * Q + nstds)], ncol = Q)
# rownames(beta) <- colnames(X[[1]])
# colnames(beta) <- colnames(stds) <- paste("class", 1:Q, sep = ":")
# gamma <- theta[-c(1L:(K * Q + nstds))]
# ew <- lapply(H, function(x) exp(crossprod(t(x), gamma)))
# sew <- suml(ew)
# Wnq <- lapply(ew, function(x) {
# v <- x/sew
# v[is.na(v)] <- 0
# as.vector(v)
# })
# Wnq <- Reduce(cbind, Wnq)
# set.seed(seed)
# Omega <- make.draws(R * ifelse(panel, n, N), K, haltons)
# XBr <- vector(mode = "list", length = J)
# for (j in 1:J) XBr[[j]] <- array(NA, dim = c(N, R, Q))
# nind <- ifelse(panel, n, N)
# if (panel)
# theIds <- unique(id)
# if (get.bi)
# bi <- array(NA, dim = c(nind, R, Q, K), dimnames = list(NULL,
# NULL, NULL, colnames(X[[1]])))
# for (i in 1:nind) {
# if (panel) {
# anid <- theIds[i]
# theRows <- which(id == anid)
# }
# else theRows <- i
# for (q in 1:Q) {
# bq <- Makeh.rcoef(beta[, q], stds[, q], ranp, Omega[,
# ((i - 1) * R + 1):(i * R), drop = FALSE], correlation,
# Pi = NULL, Slist = NULL, mvar = NULL)
# for (j in 1:J) {
# XBr[[j]][theRows, , q] <- crossprod(t(X[[j]][theRows,
# , drop = FALSE]), bq$br)
# }
# if (get.bi)
# bi[i, , q, ] <- t(bq$br)
# }
# }
# EXB <- lapply(XBr, function(x) exp(x))
# SEXB <- suml.array(EXB)
# Pntirq <- lapply(EXB, function(x) x/SEXB)
# Pnrq <- suml.array(mapply("*", Pntirq, y, SIMPLIFY = FALSE))
# if (panel)
# Pnrq <- apply(Pnrq, c(2, 3), tapply, id, prod)
# Pnq <- apply(Pnrq, c(1, 3), mean)
# WPnq <- Wnq * Pnq
# Ln <- apply(WPnq, 1, sum)
# if (get.bi)
# Qir <- list(wnq = Wnq, Ln = Ln, Pnrq = Pnrq)
# lnL <- if (panel)
# sum(log(Ln) * weights[!duplicated(id)])
# else sum(log(Ln) * weights)
# if (gradient) {
# lambda <- mapply(function(y, p) y - p, y, Pntirq, SIMPLIFY = FALSE)
# Wnq.mod <- aperm(repmat(Wnq/Ln, dimen = c(1, 1, R)),
# c(1, 3, 2))
# Qnq.mod <- Wnq.mod * Pnrq
# if (panel)
# Qnq.mod <- Qnq.mod[id, , ]
# eta <- lapply(lambda, function(x) x * Qnq.mod)
# dUdb <- dUds <- vector(mode = "list", length = J)
# for (j in 1:J) {
# dUdb[[j]] <- array(NA, dim = c(N, K, Q))
# dUds[[j]] <- array(NA, dim = c(N, nrow(stds), Q))
# }
# for (i in 1:nind) {
# if (panel) {
# anid <- theIds[i]
# theRows <- which(id == anid)
# }
# else theRows <- i
# for (q in 1:Q) {
# bq <- Makeh.rcoef(beta[, q], stds[, q], ranp,
# Omega[, ((i - 1) * R + 1):(i * R), drop = FALSE],
# correlation, Pi = NULL, Slist = NULL, mvar = NULL)
# for (j in 1:J) {
# dUdb[[j]][theRows, , q] <- tcrossprod(eta[[j]][theRows,
# , q, drop = TRUE], bq$d.mu)
# dUds[[j]][theRows, , q] <- tcrossprod(eta[[j]][theRows,
# , q, drop = TRUE], bq$d.sigma)
# }
# }
# }
# if (correlation) {
# vecX <- c()
# for (i in 1:K) {
# vecX <- c(vecX, i:K)
# }
# Xac <- lapply(X, function(x) x[, vecX])
# }
# else {
# Xac <- X
# }
# Xr <- lapply(X, function(x) x[, rep(1:K, Q)])
# Xacr <- lapply(Xac, function(x) x[, rep(1:ncol(Xac[[1]]),
# Q)])
# dUdb <- lapply(dUdb, function(x) matrix(x, nrow = N))
# dUds <- lapply(dUds, function(x) matrix(x, nrow = N))
# grad.beta <- suml(mapply("*", Xr, dUdb, SIMPLIFY = FALSE))/R
# grad.stds <- suml(mapply("*", Xacr, dUds, SIMPLIFY = FALSE))/R
# Qnq <- WPnq/Ln
# if (panel) {
# Wnq <- Wnq[id, ]
# H <- lapply(H, function(x) x[id, ])
# Qnq <- Qnq[id, ]
# }
# Wg <- vector(mode = "list", length = Q)
# IQ <- diag(Q)
# for (q in 1:Q) Wg[[q]] <- rowSums(Qnq * (repRows(IQ[q,
# ], N) - repCols(Wnq[, q], Q)))
# grad.gamma <- suml(mapply("*", H, Wg, SIMPLIFY = FALSE))
# gari <- cbind(grad.beta, grad.stds, grad.gamma)
# colnames(gari) <- names(theta)
# attr(lnL, "gradient") <- gari * weights
# }
# if (get.bi) {
# Pnjq <- lapply(Pntirq, function(x) apply(x, c(1, 3),
# mean))
# if (panel)
# Wnq <- Wnq[id, ]
# Pw <- lapply(Pnjq, function(x) x * Wnq)
# attr(lnL, "prob.alt") <- sapply(Pw, function(x) apply(x,
# 1, sum))
# attr(lnL, "prob.ind") <- Ln
# attr(lnL, "bi") <- bi
# attr(lnL, "Qir") <- Qir
# attr(lnL, "Wnq") <- Wnq
# }
# lnL
# },# weights = 1, R = 5, seed = 12345, ranp = c(pf = "n", cl = "n",
# loc = "n", wk = "n", tod = "n", seas = "n"), id = structure(c(1L,
# 1L, 1L, 1L, 1L), .Label = "1", class = "factor"), H = list(
# `1` = structure(0, .Dim = c(1L, 1L), .Dimnames = list(
# "1", "(class)2")), `2` = structure(1, .Dim = c(1L,
# 1L), .Dimnames = list("2", "(class)2"))), correlation = FALSE,
# haltons = NA, Q = 2)
# 11: eval(f, sys.frame(sys.parent()))
# 10: eval(f, sys.frame(sys.parent()))
# 9: callWithoutArgs(theta, fName = fName, args = names(formals(sumt)),
# ...)
# 8: (function (theta, fName, ...)
#
# 7: do.call(callWithoutSumt, argList)
# 6: maxOptim(fn = fn, grad = grad, hess = hess, start = start, method = "BFGS",
# fixed = fixed, constraints = constraints, finalHessian = finalHessian,
# parscale = parscale, control = mControl, ...)
# 5: maxRoutine(fn = logLik, grad = grad, hess = hess, start = start,
# constraints = constraints, ...)
# 4: maxLik(method = "bfgs", iterlim = 500, start = c(`class.1.2:(intercept)` = -4.85114128700713,
# `class.1.3:(intercept)` = -7.69322200825539, `class.1.4:(intercept)` = 5.01582959989182,
# class.1.pf = -1.60963678008691, class.1.cl = 0.109892050051351,
# class.1.loc = 18.3461318629584, class.1.wk = 5.01552145983325,
# class.1.tod = 6.12905713997904, class.1.seas = -4.37562129235275,
# `class.2.2:(intercept)` = -4.81114128700713, `class.2.3:(intercept)` = -7.6532220082554,
# `class.2.4:(intercept)` = 5.05582959989182, class.2.pf = -1.56963678008691,
# class.2.cl = 0.149892050051351, class.2.loc = 18.3861318629584,
# class.2.wk = 5.05552145983325, class.2.tod = 6.16905713997903,
# class.2.seas = -4.33562129235275, class.1.sd.pf = 0.08, class.1.sd.cl = 0.08,
# class.1.sd.loc = 0.08, class.1.sd.wk = 0.08, class.1.sd.tod = 0.08,
# class.1.sd.seas = 0.08, class.2.sd.pf = 0.12, class.2.sd.cl = 0.12,
# class.2.sd.loc = 0.12, class.2.sd.wk = 0.12, class.2.sd.tod = 0.12,
# class.2.sd.seas = 0.12, `(class)2` = 0), X = Xl, y = yl, gradient = gradient,
# weights = weights, logLik = ll.mnlogit, R = R, seed = seed,
# ranp = ranp, id = id, H = Hl, correlation = correlation,
# haltons = haltons, Q = Q)
# 3: eval(opt, sys.frame(which = nframe))
# 2: eval(opt, sys.frame(which = nframe))
# 1: gmnl(choice ~ pf + cl + loc + wk + tod + seas | 1 | 0 | 0 | 1,
# data = Electr, subset = 1:20, model = "mm", R = 5, panel = TRUE,
# ranp = c(pf = "n", cl = "n", loc = "n", wk = "n", tod = "n",
# seas = "n"), Q = 2, iterlim = 500)
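For what it's worth, the traceback dies inside Makeh.rcoef(), which receives ranp together with the per-class coefficients, and the start vector shows class intercepts (class.1.2:(intercept), etc.) that have no matching entry in ranp. I am not sure this is the real cause, but one quick experiment (sketch only, untested) would be to declare the manually created ASC dummies in ranp as well and see whether the error changes:
# Sketch of an experiment, not a confirmed fix: give the ASC dummies an entry
# in ranp so that every coefficient name has a declared distribution.
MM_ASC_ranp <- gmnl(choice ~ pf + cl + loc + wk + tod + seas +
                      asc2 + asc3 + asc4 | 0 | 0 | 0 | 1,
                    data = Electr,
                    subset = 1:3000,
                    model = "mm",
                    R = 5,
                    panel = TRUE,
                    ranp = c(pf = "n", cl = "n", loc = "n", wk = "n",
                             tod = "n", seas = "n",
                             asc2 = "n", asc3 = "n", asc4 = "n"),
                    Q = 2,
                    iterlim = 500)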
Bonus 2: sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19044)
Matrix products: default
attached base packages:
[1] grid stats graphics grDevices utils datasets
[7] methods base
other attached packages:
[1] here_1.0.1 strucchange_1.5-2 sandwich_3.0-1
[4] zoo_1.8-9 partykit_1.2-15 mvtnorm_1.1-3
[7] libcoin_1.0-9 mlogit_1.1-1 dfidx_0.0-4
[10] gmnl_1.1-3.2 Formula_1.2-4 maxLik_1.5-2
[13] miscTools_0.6-26 dplyr_1.0.7 nnet_7.3-17
Thank you in advance.
I have a function that generates a figure of a table:
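# Packages this function assumes are already attached (not shown here): estimatr
# (lm_robust), data.table (rbindlist, setnames), dplyr, ggplot2, and haven or
# forcats for as_factor().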
plot_covariate_means_by_ntile <- function(.df, .ntile = "ntile", n_top = 10, directory) {
.df <- as.data.frame(.df)
covariate_names <- covariate_names
#.df[, .ntile] <- as.factor(.df[, .ntile])
.df[, .ntile] <- as_factor(.df[, .ntile], levels = "both")
# Regress each covariate on ntile/subgroup assignment to means p
cov_means <- lapply(covariate_names, function(covariate) {
lm_robust(as.formula(paste0(covariate, " ~ 0 + ", .ntile)), data = .df, se_type = "stata")
})
# Extract the mean and standard deviation of each covariate per ntile/subgroup
cov_table <- lapply(cov_means, function(cov_mean) {
means <- as.data.frame(t(coef(summary(cov_mean))[,c("Estimate", "Std. Error")]))
means
})
# Preparation to color the chart
temp_standardized <- sapply(seq_along(covariate_names), function(j) {
covariate_name <- covariate_names[j]
.mean <- mean(.df[, covariate_name], na.rm = TRUE)
.sd <- sd(.df[, covariate_name], na.rm = TRUE)
m <- as.matrix(round(signif(cov_table[[j]], digits=4), 3))
.standardized <- (m["Estimate",] - .mean) / .sd
.standardized
})
colnames(temp_standardized) <- covariate_names
ordering <- order(apply(temp_standardized, MARGIN = 2, function(x) {.range <- range(x); abs(.range[2] - .range[1])}), decreasing = TRUE)
# fwrite(tibble::rownames_to_column(as.data.frame(t(temp_standardized)[ordering,])),
# paste0(directory$data, "/covariate_standardized_means_by_", .ntile, ".csv"))
color_scale <- max(abs(c(max(temp_standardized, na.rm = TRUE), min(temp_standardized, na.rm = TRUE))))
color_scale <- color_scale * c(-1,1)
max_std_dev <- floor(max(color_scale))
breaks <- -max_std_dev:max_std_dev
labels <- c(" ", breaks, " ")
breaks <- c(min(color_scale), breaks, max(color_scale))
# Little trick to display the standard errors
table <- lapply(seq_along(covariate_names), function(j) {
covariate_name <- covariate_names[j]
.mean <- mean(.df[, covariate_name], na.rm = TRUE)
.sd <- sd(.df[, covariate_name], na.rm = TRUE)
m <- as.matrix(round(signif(cov_table[[j]], digits=4), 3))
.standardized <- (m["Estimate",] - .mean) / .sd
return(data.frame(covariate = covariate_name,
group = c(1,2,5) ,
estimate = m["Estimate",], std.error = m["Std. Error",],
standardized = .standardized))
})
# table <- do.call(rbind, table)
table <- rbindlist(table)
setnames(table, "group", .ntile)
table[, covariate := factor(covariate, levels = rev(covariate_names[ordering]), ordered = TRUE)]
table[covariate %in% head(covariate_names[ordering], n_top)] %>%
mutate(info = paste0(estimate, "\n(", std.error, ")")) %>%
ggplot(aes_string(x = .ntile, y = "covariate")) +
# Add coloring
geom_raster(aes(fill = standardized)
, alpha = 0.9
) +
scale_fill_distiller(palette = "RdBu",
direction = 1,
breaks = breaks,
labels = labels,
limits = color_scale,
name = "Standard\nDeviation on\nNormalized\nDistribution"
) +
# add numerics
geom_text(aes(label = info), size=2.1) +
# reformat
labs(title = paste0("Covariate averages within ", ifelse(tolower(.ntile) == "leaf", .ntile, "Assigned Group")),
y = "within covariate") +
scale_x_continuous(position = "top") #+
#cowplot::theme_minimal_hgrid(16)
}
But the output shows all 5 columns; I want it to show only groups 1, 2, and 5.
I can adjust the line
groups = 1:ncol(m)
But then the groups are labeled incorrectly: the third column shown is actually group 5.
Is there any way to adjust the function so that it presents only the desired columns, with the correct labels for them?
Maybe you could use facet_wrap as a workaround?
library(tidyverse)
data.frame(X = rep(1:5, each = 25),
Y = rep(factor(rev(LETTERS[-26]),
levels = rev(LETTERS[-26])), 5),
Z = rnorm(125, 5, 1)) %>%
mutate(X = ifelse(X %in% c(1,2,5), X, NA)) %>%
na.omit() %>%
ggplot(aes(x = X, y = Y, fill = Z)) +
geom_raster() +
facet_wrap(~X, ncol=3, scales="free_x") +
theme_minimal() +
theme(axis.text.x = element_blank())
I tried to figure out a solution using scale_x_discrete (e.g. something like scale_x_discrete(limits = c("1", "2", "5"), breaks = c("1", "2", "5"))), and it 'feels' like it could work, but I gave up. Maybe it's something worth pursuing.
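Another possibility along those lines (a sketch on the same toy data; I haven't tried it inside plot_covariate_means_by_ntile()): filter to the wanted groups and turn the x variable into a factor, so a discrete scale shows only the labels 1, 2 and 5 in a single panel:
library(tidyverse)
# Sketch: keep only groups 1, 2 and 5 and plot them on a discrete axis,
# so the tick labels stay "1", "2", "5" rather than a continuous 1-5 scale.
data.frame(X = rep(1:5, each = 25),
           Y = rep(factor(rev(LETTERS[-26]),
                          levels = rev(LETTERS[-26])), 5),
           Z = rnorm(125, 5, 1)) %>%
  filter(X %in% c(1, 2, 5)) %>%
  mutate(X = factor(X, levels = c(1, 2, 5))) %>%
  ggplot(aes(x = X, y = Y, fill = Z)) +
  geom_raster() +
  scale_x_discrete(position = "top") +
  theme_minimal()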
I put together a function to identify outliers. It takes a data frame and then shows plots of the data with lines indicating potential outliers. It'll also give a table with the outliers marked.
But it is SLOOOW; the plots take a really long time to render.
I was curious if you might have advice on how to speed this up.
Related: is base R's default plotting system faster than ggplot2?
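For context, a rough timing comparison of the two systems on a scatter of about the same size as my data (a sketch only; absolute numbers depend on the graphics device):
library(ggplot2)
# Rough comparison sketch: time drawing one scatterplot of 400 points with base
# graphics versus building and printing the equivalent ggplot object.
x <- seq_len(400)
y <- rnorm(400)
system.time(plot(x, y))
system.time(print(ggplot(data.frame(x, y), aes(x, y)) + geom_point()))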
I'll start with the dependencies
#These next four functions are not mine. They're used in GetOutliers()
ExtractDetails <- function(x, down, up){
outClass <- rep("N", length(x))
indexLo <- which(x < down)
indexHi <- which(x > up)
outClass[indexLo] <- "L"
outClass[indexHi] <- "U"
index <- union(indexLo, indexHi)
values <- x[index]
outClass <- outClass[index]
nOut <- length(index)
maxNom <- max(x[which(x <= up)])
minNom <- min(x[which(x >= down)])
outList <- list(nOut = nOut, lowLim = down,
upLim = up, minNom = minNom,
maxNom = maxNom, index = index,
values = values,
outClass = outClass)
return(outList)
}
Hampel <- function(x, t = 3){
#
mu <- median(x, na.rm = TRUE)
sig <- mad(x, na.rm = TRUE)
if (sig == 0){
message("Hampel identifer implosion: MAD scale estimate is zero")
}
up<-mu+t*sig
down<-mu-t*sig
out <- list(up = up, down = down)
return(out)
}
ThreeSigma <- function(x, t = 3){
#
mu <- mean(x, na.rm = TRUE)
sig <- sd(x, na.rm = TRUE)
if (sig == 0){
message("All non-missing x-values are identical")
}
up<-mu+t* sig
down<-mu-t * sig
out <- list(up = up, down = down)
return(out)
}
BoxplotRule <- function(x, t = 1.5){
#
xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
Q<-xU-xL
if(Q==0){
message("Boxplot rule implosion: interquartile distance is zero")
}
up <- xU + t*Q
down <- xL - t*Q  # lower fence is based on the lower quartile
out <- list(up = up, down = down)
return(out)
}
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
threeLims <- ThreeSigma(x, t = t3)
HampLims <- Hampel(x, t = tH)
boxLims <- BoxplotRule(x, t = tb)
n <- length(x)
nMiss <- length(which(is.na(x)))
threeList <- ExtractDetails(x, threeLims$down, threeLims$up)
HampList <- ExtractDetails(x, HampLims$down, HampLims$up)
boxList <- ExtractDetails(x, boxLims$down, boxLims$up)
sumFrame <- data.frame(method = "ThreeSigma", n = n,
nMiss = nMiss, nOut = threeList$nOut,
lowLim = threeList$lowLim,
upLim = threeList$upLim,
minNom = threeList$minNom,
maxNom = threeList$maxNom)
upFrame <- data.frame(method = "Hampel", n = n,
nMiss = nMiss, nOut = HampList$nOut,
lowLim = HampList$lowLim,
upLim = HampList$upLim,
minNom = HampList$minNom,
maxNom = HampList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
upFrame <- data.frame(method = "BoxplotRule", n = n,
nMiss = nMiss, nOut = boxList$nOut,
lowLim = boxList$lowLim,
upLim = boxList$upLim,
minNom = boxList$minNom,
maxNom = boxList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
threeFrame <- data.frame(index = threeList$index,
values = threeList$values,
type = threeList$outClass)
HampFrame <- data.frame(index = HampList$index,
values = HampList$values,
type = HampList$outClass)
boxFrame <- data.frame(index = boxList$index,
values = boxList$values,
type = boxList$outClass)
outList <- list(summary = sumFrame, threeSigma = threeFrame,
Hampel = HampFrame, boxplotRule = boxFrame)
return(outList)
}
#strip non-numeric variables out of a dataframe
num_vars <- function(df){
X <- which(sapply(df, is.numeric))
num_vars <- df[names(X)]
return(num_vars)
}
This is the function
GetOutliers <- function(df){
library('dplyr')
library('ggplot2')
#strip out the non-numeric columns
df_out <- num_vars(df)
#initialize the data frame
df_out$Hampel <- NA
df_out$threeSigma <- NA
df_out$boxplotRule <- NA
df_out_id <- df_out
#identify outliers for each column
for (i in 1:length(names(num_vars(df)))){
#find the outliers
Outs <- FindOutliers(df_out[[i]])
OutsSum <- Outs$summary
#re-enter the outlier status
df_out$Hampel <- NA
df_out$threeSigma <- NA
df_out$boxplotRule <- NA
ifelse(is.na(Outs$Hampel), print(), df_out[unlist(Outs$Hampel[1]),]$Hampel <- TRUE)
ifelse(is.na(Outs$threeSigma), print(), df_out[unlist(Outs$threeSigma[1]),]$threeSigma <- TRUE)
ifelse(is.na(Outs$boxplotRule), print(), df_out[unlist(Outs$boxplotRule[1]),]$boxplotRule <- TRUE)
#visualize the outliers and print outlier information
Temp <- df_out
A <- colnames(Temp)[i]
AA <- paste(A,"Index")
colnames(Temp)[i] <- 'curr_column'
#table with outlier status
X <- arrange(subset(Temp,Hampel == TRUE | boxplotRule == TRUE | threeSigma == TRUE), desc(curr_column))
#scatterplot with labels
Y <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
geom_text(aes(40,OutsSum$lowLim[1],label="ThreeSigma Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$lowLim[2],label="Hampel Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$lowLim[3],label="Boxplot Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[1],label="ThreeSigma Upper",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[2],label="Hampel Upper",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[3],label="Boxplot Upper",vjust=-1)) +
xlab(AA) + ylab(A)
#scatterplot without labels
Z <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
xlab(AA) + ylab(A)
U <- ggplot(Temp,aes(curr_column)) + geom_density() + xlab(A)
print(A)
print(X)
print(OutsSum)
print(Z)
print(Y)
print(U)
#mark the extreme outliers, the rest are reasonable outliers
A <- colnames(df_out_id[i])
Q <- as.numeric(readline(prompt="Enter the index for final Extreme value on the upper limit (if none, enter 0): "))
W <- as.numeric(readline(prompt="Enter the index for first Extreme value on the lower limit (if none, enter 0): "))
col <- df_out_id[i]
df_out_id[i] <- sapply(col[[1]], function(x){
if(Q>1 & x %in% X$curr_column[1:Q]) return('Extreme')
if(W>1 & x %in% X$curr_column[W:length(X$curr_column)]) return('Extreme')
else if (x %in% X$curr_column[Q+1:length(X$curr_column)]) return('Reasonable')
else return('Non-Outlier')
})
}
#return a dataframe with outlier status, excluding the outlier ID columns
summary(df_out_id)
return(df_out_id[1:(length(names(df_out_id))-3)])
}
Example
library('ISLR')
data(Carseats)
GetOutliers(Carseats)
It'll show you the outliers for each numeric variable.
It'll plot the variable density and then a scatterplot with identifier lines.
It will also accept input so you can mark some outliers as reasonable and others as extreme.
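For reference, one simplification I have been wondering about (a sketch only, using the OutsSum, Temp, A and AA objects from inside the loop; I have not measured whether it actually speeds anything up): build the six dashed limit lines from a small data frame instead of six separate geom_hline() calls.
# Sketch: collect the three lower and three upper limits into one data frame
# and draw them with a single geom_hline() layer.
lims <- data.frame(method = rep(OutsSum$method, 2),
                   limit  = c(OutsSum$lowLim, OutsSum$upLim))
Z <- ggplot(Temp, aes(seq_along(curr_column), curr_column)) +
  geom_point() +
  geom_hline(data = lims, aes(yintercept = limit), linetype = 'dashed') +
  xlab(AA) + ylab(A)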
I am applying spectral clustering to a dataset with 4200 rows and 2 columns.
spec <- specClust(df1, centers=7, nn = 7, method = "symmetric")
I get the error below.
Error in .Call("R_igraph_arpack", func, extra, options, env, sym, PACKAGE = "igraph") :
At arpack.c:944 : ARPACK error, Maximum number of iterations reached
In addition: Warning message:
In .Call("R_igraph_arpack", func, extra, options, env, sym, PACKAGE = "igraph") :
At arpack.c:776 :ARPACK solver failed to converge (1001 iterations, 0/7 eigenvectors converged)
How do I increase the number of ARPACK iterations? This doesn't work:
spec <- specClust(df1, centers=7, nn = 7, method = "symmetric",iter.max=301000)
Digging into specClust, the ... argument does not pass anything on to the arpack() call.
The simplest thing to do, I think, is to copy the specClust code, add maxiter to the arpack() call (exposed below as a max.iter argument), and source the modified function in your script.
specClust2 <- function (data, centers = NULL, nn = 7, method = "symmetric",
gmax = NULL, max.iter = 10000, ...)
{
call = match.call()
if (is.data.frame(data))
data = as.matrix(data)
da = apply(data, 1, paste, collapse = "#")
indUnique = which(!duplicated(da))
indAll = match(da, da[indUnique])
data2 = data
data = data[indUnique, ]
n <- nrow(data)
data = scale(data, FALSE, TRUE)
if (is.null(gmax)) {
if (!is.null(centers))
gmax = centers - 1L
else gmax = 1L
}
test = TRUE
while (test) {
DC = mydist(data, nn)
sif <- rbind(1:n, as.vector(DC[[2]]))
g <- graph(sif, directed = FALSE)
g <- decompose(g, min.vertices = 4)
if (length(g) > 1) {
if (length(g) >= gmax)
nn = nn + 2
else test = FALSE
}
else test = FALSE
}
W <- DC[[1]]
n <- nrow(data)
wi <- W[, nn]
SC <- matrix(1, nrow(W), nn)
SC[] <- wi[DC[[2]]] * wi
W = W^2/SC
alpha = 1/(2 * (nn + 1))
qua = abs(qnorm(alpha))
W = W * qua
W = dnorm(W, sd = 1)
DC[[1]] = W
L = Laplacian(DC, nn, method)
f <- function(x, extra) as.vector(extra %*% x)
if (is.null(centers))
kmax = 25
else kmax = max(centers)
###
#add the maxiter parameter to the arpack call, below
###
U <- arpack(f, extra = L, options = list(n = n, which = "SM",
nev = kmax, ncv = 2 * kmax, mode = 1, maxiter=max.iter), sym = TRUE)
ind <- order(U[[1]])
U[[2]] = U[[2]][indAll, ind]
U[[1]] = U[[1]][ind]
if (is.null(centers)) {
tmp = which.max(diff(U[[1]])) + 1
centers = which.min(AUC(U[[1]][1:tmp]))
}
if (method == "symmetric") {
rs = sqrt(rowSums(U[[2]]^2))
U[[2]] = U[[2]]/rs
}
result = kmeans(U[[2]], centers = centers, nstart = 20, ...)
archeType = getClosest(U[[2]][indAll, ], result$centers)
result$eigenvalue = U[[1]]
result$eigenvector = U[[2]]
result$data = data2
result$indAll = indAll
result$indUnique = indUnique
result$L = L
result$archetype = archeType
result$call = call
class(result) = c("specClust", "kmeans")
result
}
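Then the call from the question becomes (a usage sketch; I'm assuming specClust comes from the kknn package, whose unexported helpers mydist, Laplacian, AUC and getClosest the copied function still relies on, so they need to be reachable, e.g. via :::):
# Usage sketch. Assumption: specClust lives in the kknn package, so the copied
# function's internal helpers are aliased from kknn's namespace with ':::'.
library(igraph)
library(kknn)
mydist     <- kknn:::mydist
Laplacian  <- kknn:::Laplacian
AUC        <- kknn:::AUC
getClosest <- kknn:::getClosest

spec <- specClust2(df1, centers = 7, nn = 7, method = "symmetric",
                   max.iter = 301000)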