Fixed fill for different sections of a density plot with ggplot - R

Given draws from rnorm() and a cutoff c, I want my plot to use the following colors:
Red for the section that is to the left of -c
Blue for the section in between -c and c
and Green for the section that is to the right of c
For example, if my data is:
set.seed(9782)
mydata <- rnorm(1000, 0, 2)
c <- 1
I want to plot something like this:
But if my data is all to the right of c, the whole plot should be green. Similarly, if all of it is between -c and c, or all to the left of -c, the plot should be entirely blue or entirely red, respectively.
This is the code I wrote:
MinD <- min(mydata)
MaxD <- max(mydata)
df.plot <- data.frame(density = mydata)
if(c==0){
  case <- dplyr::case_when((MinD < 0 & MaxD > 0) ~ "L_and_R",
                           (MinD > 0) ~ "R",
                           (MaxD < 0) ~ "L")
}else{
  case <- dplyr::case_when((MinD < -c & MaxD > c) ~ "ALL",
                           (MinD > -c & MaxD > c) ~ "Center_and_R",
                           (MinD > -c & MaxD < c) ~ "Center",
                           (MinD < -c & MaxD < c) ~ "Center_and_L",
                           MaxD < -c ~ "L",
                           MaxD > c ~ "R")
}
# Draw the Center
if(case %in% c("ALL", "Center_and_R", "Center", "Center_and_L")){
  ds <- density(df.plot$density, from = -c, to = c)
  ds_data_Center <- data.frame(x = ds$x, y = ds$y, section = "Center")
} else{
  ds_data_Center <- data.frame(x = NA, y = NA, section = "Center")
}
# Draw L
if(case %in% c("ALL", "Center_and_L", "L", "L_and_R")){
  ds <- density(df.plot$density, from = MinD, to = -c)
  ds_data_L <- data.frame(x = ds$x, y = ds$y, section = "L")
} else{
  ds_data_L <- data.frame(x = NA, y = NA, section = "L")
}
# Draw R
if(case %in% c("ALL", "Center_and_R", "R", "L_and_R")){
  ds <- density(df.plot$density, from = c, to = MaxD)
  ds_data_R <- data.frame(x = ds$x, y = ds$y, section = "R")
} else{
  ds_data_R <- data.frame(x = NA, y = NA, section = "R")
}
L_Pr <- round(mean(mydata < -c), 2)
Center_Pr <- round(mean(mydata > -c & mydata < c), 2)
R_Pr <- round(mean(mydata > c), 2)
filldf <- data.frame(section = c("L", "Center", "R"),
                     Pr = c(L_Pr, Center_Pr, R_Pr),
                     fill = c("red", "blue", "green")) %>%
  dplyr::mutate(section = as.character(section))
if(c==0){
  ds_data <- suppressWarnings(dplyr::bind_rows(ds_data_L, ds_data_R)) %>%
    dplyr::full_join(filldf, by = "section") %>%
    mutate(section = ordered(section, levels=c("L","R")))
  ds_data <- ds_data[order(ds_data$section), ] %>%
    filter(Pr!=0) %>%
    mutate(Pr=scales::percent(Pr))
}else{
  ds_data <- suppressWarnings(dplyr::bind_rows(ds_data_Center, ds_data_L, ds_data_R)) %>%
    dplyr::full_join(filldf, by = "section") %>%
    mutate(section = ordered(section, levels=c("L","Center","R")))
  ds_data <- ds_data[order(ds_data$section), ] %>%
    filter(Pr!=0) %>%
    mutate(Pr=scales::percent(Pr))
}
fillScale <- scale_fill_manual(name = paste0("c = ", c, ":"),
                               values = as.character(unique(ds_data$fill)))
p <- ggplot(data = ds_data, aes(x=x, y=y, fill=Pr)) +
  geom_area() + fillScale
Alas, I cannot figure out how to assign the colors to the different sections while keeping the percentages as labels for the colors.

We use the density function to create the data frame we'll actually plot. Then we use the cut function to create groups based on ranges of the x-values. Finally, we calculate the probability mass for each group and use those values as the legend labels.
We also create a labeled vector of colors to ensure that the same color always goes with a given range of x-values, regardless of whether the data contains any values within that range.
The code below packages all this into a function.
library(tidyverse)
library(gridExtra)
fill_density = function(x, cc=1, adj=1, drop_levs=FALSE) {
  # Calculate density values for input data
  dens = data.frame(density(x, n=2^10, adjust=adj)[c("x","y")]) %>%
    mutate(section = cut(x, breaks=c(-Inf, -cc, cc, Inf))) %>%
    group_by(section) %>%
    mutate(prob = paste0(round(sum(y)*mean(diff(x))*100), "%"))
  # Get probability mass for each level of section
  # We'll use these as the label values in scale_fill_manual
  sp = dens %>%
    group_by(section, prob) %>%
    summarise() %>%
    ungroup()
  if(!drop_levs) {
    sp = sp %>% complete(section, fill=list(prob="0%"))
  }
  # Assign colors to each level of section
  col = setNames(c("red","blue","green"), levels(dens$section))
  ggplot(dens, aes(x, y, fill=section)) +
    geom_area() +
    scale_fill_manual(labels=sp$prob, values=col, drop=drop_levs) +
    labs(fill="")
}
Now let's run the function on several different data distributions:
set.seed(3)
dat2 = rnorm(1000)
grid.arrange(fill_density(mydata), fill_density(mydata[mydata > 0]),
             fill_density(mydata[mydata > 2], drop_levs=TRUE),
             fill_density(mydata[mydata > 2], drop_levs=FALSE),
             fill_density(mydata[mydata < -5 | mydata > 5], adj=0.3),
             fill_density(dat2),
             ncol=2)
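As a quick sanity check on the probability-mass calculation (a sketch, assuming mydata and c = 1 from the question), the Riemann-sum estimate used for the legend labels should roughly match the empirical proportion:
# Riemann-sum probability mass over the density grid vs. the empirical share;
# with c = 1, the two should agree closely for the left tail (x < -1).
d <- density(mydata, n = 2^10)
sum(d$y[d$x < -1]) * mean(diff(d$x))
mean(mydata < -1)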

Related

How to remove E notation from label cut intervals and replace with shorthand labels?

Looking to pass either label_number_si() or something like it to label cut intervals with SI units ("k" for thousand, etc.).
library(scales)
library(tidyverse)
dat <- data.frame(test_x=1:1e5, test_y=1) %>%
mutate(label_cut=cut_width(test_x,1e4, boundary = 0))
ggplot(dat) +
geom_col(aes(test_x,label_cut)) +
scale_x_continuous(labels = label_number_si())
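For reference, with dig.lab left at its default of 3, the labels that cut_width() produces here fall back to scientific notation, which is what motivates the question (a quick check, assuming ggplot2's cut_width()):
# Default interval labels from cut_width(); values of 1e4 and above are
# rendered in E notation, e.g. "[0,1e+04]", "(1e+04,2e+04]", ...
library(ggplot2)
head(levels(cut_width(1:1e5, 1e4, boundary = 0)))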
Had to write up some custom functions to break up the interval:
library(scales)
library(tidyverse)
SI_format <- function(x) {
  case_when(
    x < 1e3  ~ as.character(x),
    x < 1e6  ~ paste0(as.character(x/1e3), "K"),
    x < 1e9  ~ paste0(as.character(x/1e6), "M"),
    x < 1e12 ~ paste0(as.character(x/1e9), "B"),
    x < 1e15 ~ paste0(as.character(x/1e12), "T"),
    TRUE ~ "..."
  )
}
pretty_labels <- function(x) {
  first_bracket <- str_extract(x, "^.")
  last_bracket <- str_extract(x, ".$")
  first_number <- SI_format(as.numeric(str_extract(x, "(\\d+)")))
  last_number <- SI_format(as.numeric(str_extract(x, "(\\d+)(?!.*\\d)")))
  new_label <- paste0(first_bracket, first_number, ",", last_number, last_bracket)
  return(new_label)
}
dat <- data.frame(test_x = c(0, 1, 10, 1e2, 1e3, 1e4, 1e5, 1e6), test_y = 1) %>%
  mutate(label_cut = cut_width(test_x, 1e3, boundary = 0, dig.lab = 10))
levels(dat$label_cut) <- pretty_labels(levels(dat$label_cut))
ggplot(dat) +
  geom_col(aes(test_x, label_cut)) +
  scale_x_continuous(labels = label_number_si())
Here's one way with my santoku package:
library(santoku)
dat$label_cut <- chop_width(
dat$test_x,
1e4,
labels = lbl_intervals(fmt = scales::label_number_si())
)
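One caveat worth flagging as an assumption about your package versions: label_number_si() is deprecated in recent releases of scales (1.2.0 and later, if I recall correctly). If it warns on your setup, the modern equivalent builds the same "K"/"M"-style labels with label_number() and scale_cut, for example:
library(scales)
# Replacement for the deprecated label_number_si(): short-scale suffixes
# ("K", "M", "B", "T") via cut_short_scale().
si_labels <- label_number(scale_cut = cut_short_scale())
si_labels(c(1e3, 2.5e4, 1e6))
The resulting labeller function should also work as the fmt argument of lbl_intervals() in the santoku call above.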

Subsetting within R function

I have a function that generates a figure of a table:
plot_covariate_means_by_ntile <- function(.df, .ntile = "ntile", n_top = 10, directory) {
.df <- as.data.frame(.df)
covariate_names <- covariate_names
#.df[, .ntile] <- as.factor(.df[, .ntile])
.df[, .ntile] <- as_factor(.df[, .ntile], levels = "both")
# Regress each covariate on ntile/subgroup assignment to means p
cov_means <- lapply(covariate_names, function(covariate) {
lm_robust(as.formula(paste0(covariate, " ~ 0 + ", .ntile)), data = .df, se_type = "stata")
})
# Extract the mean and standard deviation of each covariate per ntile/subgroup
cov_table <- lapply(cov_means, function(cov_mean) {
means <- as.data.frame(t(coef(summary(cov_mean))[,c("Estimate", "Std. Error")]))
means
})
# Preparation to color the chart
temp_standardized <- sapply(seq_along(covariate_names), function(j) {
covariate_name <- covariate_names[j]
.mean <- mean(.df[, covariate_name], na.rm = TRUE)
.sd <- sd(.df[, covariate_name], na.rm = TRUE)
m <- as.matrix(round(signif(cov_table[[j]], digits=4), 3))
.standardized <- (m["Estimate",] - .mean) / .sd
.standardized
})
colnames(temp_standardized) <- covariate_names
ordering <- order(apply(temp_standardized, MARGIN = 2, function(x) {.range <- range(x); abs(.range[2] - .range[1])}), decreasing = TRUE)
# fwrite(tibble::rownames_to_column(as.data.frame(t(temp_standardized)[ordering,])),
# paste0(directory$data, "/covariate_standardized_means_by_", .ntile, ".csv"))
color_scale <- max(abs(c(max(temp_standardized, na.rm = TRUE), min(temp_standardized, na.rm = TRUE))))
color_scale <- color_scale * c(-1,1)
max_std_dev <- floor(max(color_scale))
breaks <- -max_std_dev:max_std_dev
labels <- c(" ", breaks, " ")
breaks <- c(min(color_scale), breaks, max(color_scale))
# Little trick to display the standard errors
table <- lapply(seq_along(covariate_names), function(j) {
covariate_name <- covariate_names[j]
.mean <- mean(.df[, covariate_name], na.rm = TRUE)
.sd <- sd(.df[, covariate_name], na.rm = TRUE)
m <- as.matrix(round(signif(cov_table[[j]], digits=4), 3))
.standardized <- (m["Estimate",] - .mean) / .sd
return(data.frame(covariate = covariate_name,
group = c(1,2,5) ,
estimate = m["Estimate",], std.error = m["Std. Error",],
standardized = .standardized))
})
# table <- do.call(rbind, table)
table <- rbindlist(table)
setnames(table, "group", .ntile)
table[, covariate := factor(covariate, levels = rev(covariate_names[ordering]), ordered = TRUE)]
table[covariate %in% head(covariate_names[ordering], n_top)] %>%
mutate(info = paste0(estimate, "\n(", std.error, ")")) %>%
ggplot(aes_string(x = .ntile, y = "covariate")) +
# Add coloring
geom_raster(aes(fill = standardized)
, alpha = 0.9
) +
scale_fill_distiller(palette = "RdBu",
direction = 1,
breaks = breaks,
labels = labels,
limits = color_scale,
name = "Standard\nDeviation on\nNormalized\nDistribution"
) +
# add numerics
geom_text(aes(label = info), size=2.1) +
# reformat
labs(title = paste0("Covariate averages within ", ifelse(tolower(.ntile) == "leaf", .ntile, "Assigned Group")),
y = "within covariate") +
scale_x_continuous(position = "top") #+
#cowplot::theme_minimal_hgrid(16)
}
But the output shows all 5 columns; I want it to show only 1, 2 and 5.
I can adjust the line
groups = 1:ncol(m)
But then that incorrectly labels the groups; the third column is actually group 5.
Is there any way to adjust the function to present the correct columns and the correct labels for them?
Maybe you could use facet_wrap as a workaround?
library(tidyverse)
data.frame(X = rep(1:5, each = 25),
           Y = rep(factor(rev(LETTERS[-26]),
                          levels = rev(LETTERS[-26])), 5),
           Z = rnorm(125, 5, 1)) %>%
  mutate(X = ifelse(X %in% c(1, 2, 5), X, NA)) %>%
  na.omit() %>%
  ggplot(aes(x = X, y = Y, fill = Z)) +
  geom_raster() +
  facet_wrap(~X, ncol = 3, scales = "free_x") +
  theme_minimal() +
  theme(axis.text.x = element_blank())
I also tried to figure out a solution using scale_x_discrete (e.g. something like scale_x_discrete(limits = c("1", "2", "5"), breaks = c("1", "2", "5"))) and it feels like it could work, but I gave up; it may be worth pursuing, and a rough sketch of that idea follows.
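A sketch of that discrete-axis idea, using the same toy data (with geom_tile swapped in for geom_raster): keeping only groups 1, 2 and 5 and mapping X as a factor makes the axis show exactly those three groups with their true labels.
# Keep groups 1, 2 and 5 and treat X as discrete so the axis labels stay correct.
library(tidyverse)
data.frame(X = rep(1:5, each = 25),
           Y = rep(factor(rev(LETTERS[-26]), levels = rev(LETTERS[-26])), 5),
           Z = rnorm(125, 5, 1)) %>%
  filter(X %in% c(1, 2, 5)) %>%
  mutate(X = factor(X, levels = c(1, 2, 5))) %>%
  ggplot(aes(x = X, y = Y, fill = Z)) +
  geom_tile() +
  scale_x_discrete(position = "top")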

R Plotly show string on contour plots

I have overlaid two contour plots:
library(plotly)
cluster_count <- 5
volcan <- plot_ly(z = ~volcano,
type = "contour",
contours = list(
coloring= "fill",
showlines = F
))
cluster_matrix <- volcano
cluster_matrix[cluster_matrix < 100] <- 1
cluster_matrix[cluster_matrix <= 120 & cluster_matrix >= 100] <- 2
cluster_matrix[cluster_matrix < 140 & cluster_matrix >= 120] <- 3
cluster_matrix[cluster_matrix <= 160 & cluster_matrix >= 140] <- 4
cluster_matrix[cluster_matrix > 160] <- 5
cluster_name_matrix <- cluster_matrix
cluster_name_matrix[cluster_matrix ==1] <- "Eins"
cluster_name_matrix[cluster_matrix ==2] <- "Zwei"
cluster_name_matrix[cluster_matrix ==3] <- "Drei"
cluster_name_matrix[cluster_matrix ==4] <- "Vier"
cluster_name_matrix[cluster_matrix ==5] <- "Funf"
volcan %>% add_contour(cluster_matrix,
                       type = "contour",
                       opacity = 1,
                       text = cluster_name_matrix,
                       hovertemplate = 'Cluster: %{text}<extra></extra>',
                       autocontour = F,
                       line = list(color = "orange"),
                       contours = list(
                         start = 1,
                         showlabels = T,
                         coloring = "lines",
                         end = cluster_count,
                         size = 1,
                         showlines = T
                       ))
Is it possible to get a plot with the cluster names shown as text labels on the contours, like I did for the hovering text? Thanks for tips and suggestions in advance!
What you've been looking for is the add_annotations() function. In the code below, I write a function that retrieves a random coordinate pair for each level and then passes the corresponding coordinates to the add_annotations() function. Note that I stored your contour plot in the variable p:
library(purrr)
# Custom function
find_rand_annotation_index <- function(name_matrix, string){
  d <- which(name_matrix == string, arr.ind = TRUE)
  d2 <- as.data.frame(d[sample(nrow(d), size = 1), , drop = FALSE])
  cbind(d2, string)
}
# Get 5 random coordinates to plot the labels
text_coords <- purrr::map_dfr(c("Eins", "Zwei", "Drei", "Vier", "Funf"),
                              ~ find_rand_annotation_index(cluster_name_matrix, .x))
# Plot the annotations on the contour plot
p %>%
  add_annotations(
    x = text_coords$col,
    y = text_coords$row,
    text = text_coords$string,
    font = list(color = "IndianRed"),
    showarrow = F
  )
The positioning of the labels may not be to your liking (the coordinates are chosen randomly), but you can tweak that part in your own code.
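If the random placement bothers you, a hypothetical variant of the helper (find_central_annotation_index() is not part of the answer above, just a sketch) picks the cell of each cluster closest to the cluster's centroid, so labels tend to land near the middle of each region:
# Hypothetical alternative to find_rand_annotation_index(): choose the cluster
# cell nearest the cluster's centroid instead of a random one.
find_central_annotation_index <- function(name_matrix, string) {
  d <- which(name_matrix == string, arr.ind = TRUE)
  center <- colMeans(d)                          # mean row/col of the cluster
  idx <- which.min((d[, "row"] - center["row"])^2 +
                   (d[, "col"] - center["col"])^2)
  cbind(as.data.frame(d[idx, , drop = FALSE]), string)
}
It can be dropped into the purrr::map_dfr() call above in place of the random version.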

Plotly (R) Legend Won't Appear?

I'm trying to create a plot showing the CDFs of two different categories of data, with a legend to show which color corresponds to which (Plotly version 4.9.2.1). For some reason, it's a royal pain in the rear to get the legend to show. Below is a toy example with three of my attempts--only the last one works, but it's obnoxiously contrived and makes the resulting data appear misleadingly dense in the plot. Can anyone explain how to do this right?
library(plotly)
library(magrittr)
color.dat <- runif(30)
x.mat <- matrix(0, nrow=500, ncol=30)
for (i in 1:30){
x.mat[,i] <- rnorm(500, 0, color.dat[i])
}
### Attempt 1, no legend appears at all ###
p <- plot_ly(showlegend=TRUE)
for (i in 1:30){
tmp.cdf <- ecdf(x.mat[,i])
p <- p %>%
add_lines(x=sort(x.mat[,i]), y=tmp.cdf(sort(x.mat[,i])),
name=ifelse(color.dat[i] > 0.5, 'A', 'B'),
showlegend=FALSE,
line=list(color=ifelse(color.dat[i] > 0.5, 'blue', 'orange')))
}
p <- p %>%
add_lines(x=c(0,1), y=c(0,0), name='A',
line=list(color='blue'),
showlegend=TRUE, visible=FALSE) %>%
add_lines(x=c(0,1), y=c(0,0), name='B',
line=list(color='orange'),
showlegend=TRUE, visible=FALSE)
### Attempt 2, legend entry appears only for class B (doesn't appear without invisible traces added at end) ###
p <- plot_ly(showlegend=TRUE)
a.bool <- TRUE
b.bool <- TRUE
for (i in 1:30){
tmp.cdf <- ecdf(x.mat[,i])
if (color.dat[i] > 0.5 && a.bool){
class.bool <- TRUE
a.bool <- FALSE
} else {
class.bool <- FALSE
}
if (color.dat[i] < 0.5 && b.bool){
class.bool <- TRUE
b.bool <- FALSE
} else {
class.bool <- FALSE
}
p <- p %>%
add_lines(x=sort(x.mat[,i]), y=tmp.cdf(sort(x.mat[,i])),
name=ifelse(color.dat[i] > 0.5, 'A', 'B'),
showlegend=class.bool,
line=list(color=ifelse(color.dat[i] > 0.5, 'blue', 'orange')))
}
p <- p %>%
add_lines(x=c(0,1), y=c(0,0), name='A',
line=list(color='blue'),
showlegend=TRUE, visible=FALSE) %>%
add_lines(x=c(0,1), y=c(0,0), name='B',
line=list(color='orange'),
showlegend=TRUE, visible=FALSE)
### Attempt 3, both legend entries appear, but plot is misleading and obscures a lot of detail ###
p <- plot_ly(showlegend=TRUE)
flat.mat.a <- c()
flat.mat.b <- c()
flat.cdf.a <- c()
flat.cdf.b <- c()
for (i in 1:30){
tmp.cdf <- ecdf(x.mat[,i])
if (color.dat[i] > 0.5){
flat.mat.a <- c(flat.mat.a, sort(x.mat[,i]))
flat.cdf.a <- c(flat.cdf.a, tmp.cdf(sort(x.mat[,i])))
} else {
flat.mat.b <- c(flat.mat.b, sort(x.mat[,i]))
flat.cdf.b <- c(flat.cdf.b, tmp.cdf(sort(x.mat[,i])))
}
}
p <- p %>%
add_lines(x=flat.mat.a, y=flat.cdf.a,
showlegend=TRUE, name='A',
line=list(color='blue')) %>%
add_lines(x=flat.mat.b, y=flat.cdf.b,
showlegend=TRUE, name='B',
line=list(color='orange'))
My preferred approach to plotting with plotly is to put the data in a data frame.
After the data preparation steps, it takes just two lines of code to get the plot and the legend.
library(plotly)
library(tidyr)
library(dplyr)
set.seed(42)
color.dat <- runif(30)
x.mat <- matrix(0, nrow=500, ncol=30)
for (i in 1:30){
x.mat[,i] <- rnorm(500, 0, color.dat[i])
}
# Put the data in a dataframe
dfx <- data.frame(x.mat) %>%
  tidyr::pivot_longer(everything()) %>%
  arrange(name, value) %>%
  mutate(id = as.integer(gsub("^X", "", name)),
         color = color.dat[id],
         color = ifelse(color > 0.5, 'blue', 'orange')) %>%
  group_by(name) %>%
  mutate(cdf = ecdf(value)(value)) %>%
  ungroup()
p <- dfx %>%
  group_by(name) %>%
  plot_ly(showlegend = TRUE) %>%
  add_lines(x = ~value, y = ~cdf, color = ~color,
            colors = c(blue = "blue", orange = "orange"))
p
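With the color column mapped directly, the legend entries read "blue" and "orange". If you would rather see "A" and "B", one option (a sketch reusing dfx from above) is to map a class label instead and match the colors by name:
# Map an A/B class label rather than the raw color string; the named values in
# `colors` keep the blue/orange assignment while the legend shows A and B.
dfx %>%
  mutate(class = ifelse(color == "blue", "A", "B")) %>%
  group_by(name) %>%
  plot_ly(showlegend = TRUE) %>%
  add_lines(x = ~value, y = ~cdf, color = ~class,
            colors = c(A = "blue", B = "orange"))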

How do I speed up my function, specifically the ggplot commands?

I put together a function to identify outliers. It takes a dataframe and then shows plots of the data with lines to indicate potential outliers. It'll give a table with outliers marked, too.
But, it is SLOOOW. The problem is it takes a really long time for the plots to load.
I was curious if you might have advice on how to speed this up.
Related: Is the default plotting system faster than ggplot?
I'll start with the dependencies
#These next four functions are not mine. They're used in GetOutliers()
ExtractDetails <- function(x, down, up){
outClass <- rep("N", length(x))
indexLo <- which(x < down)
indexHi <- which(x > up)
outClass[indexLo] <- "L"
outClass[indexHi] <- "U"
index <- union(indexLo, indexHi)
values <- x[index]
outClass <- outClass[index]
nOut <- length(index)
maxNom <- max(x[which(x <= up)])
minNom <- min(x[which(x >= down)])
outList <- list(nOut = nOut, lowLim = down,
upLim = up, minNom = minNom,
maxNom = maxNom, index = index,
values = values,
outClass = outClass)
return(outList)
}
Hampel <- function(x, t = 3){
#
mu <- median(x, na.rm = TRUE)
sig <- mad(x, na.rm = TRUE)
if (sig == 0){
message("Hampel identifer implosion: MAD scale estimate is zero")
}
up<-mu+t*sig
down<-mu-t*sig
out <- list(up = up, down = down)
return(out)
}
ThreeSigma <- function(x, t = 3){
#
mu <- mean(x, na.rm = TRUE)
sig <- sd(x, na.rm = TRUE)
if (sig == 0){
message("All non-missing x-values are identical")
}
up<-mu+t* sig
down<-mu-t * sig
out <- list(up = up, down = down)
return(out)
}
BoxplotRule <- function(x, t = 1.5){
#
xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
Q<-xU-xL
if(Q==0){
message("Boxplot rule implosion: interquartile distance is zero")
}
up<-xU+t*Q
down<-xL-t*Q
out <- list(up = up, down = down)
return(out)
}
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
threeLims <- ThreeSigma(x, t = t3)
HampLims <- Hampel(x, t = tH)
boxLims <- BoxplotRule(x, t = tb)
n <- length(x)
nMiss <- length(which(is.na(x)))
threeList <- ExtractDetails(x, threeLims$down, threeLims$up)
HampList <- ExtractDetails(x, HampLims$down, HampLims$up)
boxList <- ExtractDetails(x, boxLims$down, boxLims$up)
sumFrame <- data.frame(method = "ThreeSigma", n = n,
nMiss = nMiss, nOut = threeList$nOut,
lowLim = threeList$lowLim,
upLim = threeList$upLim,
minNom = threeList$minNom,
maxNom = threeList$maxNom)
upFrame <- data.frame(method = "Hampel", n = n,
nMiss = nMiss, nOut = HampList$nOut,
lowLim = HampList$lowLim,
upLim = HampList$upLim,
minNom = HampList$minNom,
maxNom = HampList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
upFrame <- data.frame(method = "BoxplotRule", n = n,
nMiss = nMiss, nOut = boxList$nOut,
lowLim = boxList$lowLim,
upLim = boxList$upLim,
minNom = boxList$minNom,
maxNom = boxList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
threeFrame <- data.frame(index = threeList$index,
values = threeList$values,
type = threeList$outClass)
HampFrame <- data.frame(index = HampList$index,
values = HampList$values,
type = HampList$outClass)
boxFrame <- data.frame(index = boxList$index,
values = boxList$values,
type = boxList$outClass)
outList <- list(summary = sumFrame, threeSigma = threeFrame,
Hampel = HampFrame, boxplotRule = boxFrame)
return(outList)
}
#strip non-numeric variables out of a dataframe
num_vars <- function(df){
X <- which(sapply(df, is.numeric))
num_vars <- df[names(X)]
return(num_vars)
}
This is the function
GetOutliers <- function(df){
library('dplyr')
library('ggplot2')
#strip out the non-numeric columns
df_out <- num_vars(df)
#initialize the data frame
df_out$Hampel <- NA
df_out$threeSigma <- NA
df_out$boxplotRule <- NA
df_out_id <- df_out
#identify outliers for each column
for (i in 1:length(names(num_vars(df)))){
#find the outliers
Outs <- FindOutliers(df_out[[i]])
OutsSum <- Outs$summary
#re-enter the outlier status
df_out$Hampel <- NA
df_out$threeSigma <- NA
df_out$boxplotRule <- NA
ifelse(is.na(Outs$Hampel), print(), df_out[unlist(Outs$Hampel[1]),]$Hampel <- TRUE)
ifelse(is.na(Outs$threeSigma), print(), df_out[unlist(Outs$threeSigma[1]),]$threeSigma <- TRUE)
ifelse(is.na(Outs$boxplotRule), print(), df_out[unlist(Outs$boxplotRule[1]),]$boxplotRule <- TRUE)
#visualize the outliers and print outlier information
Temp <- df_out
A <- colnames(Temp)[i]
AA <- paste(A,"Index")
colnames(Temp)[i] <- 'curr_column'
#table with outlier status
X <- arrange(subset(Temp,Hampel == TRUE | boxplotRule == TRUE | threeSigma == TRUE), desc(curr_column))
#scatterplot with labels
Y <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
geom_text(aes(40,OutsSum$lowLim[1],label="ThreeSigma Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$lowLim[2],label="Hampel Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$lowLim[3],label="Boxplot Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[1],label="ThreeSigma Upper",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[2],label="Hampel Upper",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[3],label="Boxplot Upper",vjust=-1)) +
xlab(AA) + ylab(A)
#scatterplot without labels
Z <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
xlab(AA) + ylab(A)
U <- ggplot(Temp,aes(curr_column)) + geom_density() + xlab(A)
print(A)
print(X)
print(OutsSum)
print(Z)
print(Y)
print(U)
#mark the extreme outliers, the rest are reasonable outliers
A <- colnames(df_out_id[i])
Q <- as.numeric(readline(prompt="Enter the index for final Extreme value on the upper limit (if none, enter 0): "))
W <- as.numeric(readline(prompt="Enter the index for first Extreme value on the lower limit (if none, enter 0): "))
col <- df_out_id[i]
df_out_id[i] <- sapply(col[[1]], function(x){
if(Q>1 & x %in% X$curr_column[1:Q]) return('Extreme')
if(W>1 & x %in% X$curr_column[W:length(X$curr_column)]) return('Extreme')
else if (x %in% X$curr_column[(Q+1):length(X$curr_column)]) return('Reasonable')
else return('Non-Outlier')
})
}
#return a dataframe with outlier status, excluding the outlier ID columns
summary(df_out_id)
return(df_out_id[1:(length(names(df_out_id))-3)])
}
Example
library('ISLR')
data(Carseats)
GetOutliers(Carseats)
It'll show you the outliers for each numeric variable.
It'll plot the variable density and then a scatterplot with identifier lines.
It will also accept input so you can mark some outliers as reasonable and others as extreme.
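One observation about the plotting code in GetOutliers(), offered as a sketch of a possible speed-up rather than a tested fix: constants placed inside aes() in geom_text() are recycled across every row of Temp, so each of the six limit labels is drawn once per data point, while annotate() draws a single text grob per label. A minimal illustration, with Temp and y0 as stand-ins for one column and one limit:
# Sketch: draw the limit label once instead of once per row of Temp.
library(ggplot2)
Temp <- data.frame(curr_column = rnorm(5000))   # stand-in for one numeric column
y0 <- 3                                         # stand-in for OutsSum$lowLim[1]
ggplot(Temp, aes(seq_along(curr_column), curr_column)) +
  geom_point() +
  geom_hline(yintercept = y0, linetype = "dashed") +
  annotate("text", x = 40, y = y0, label = "ThreeSigma Lower", vjust = -1)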
