I have a set of points such as this, and I am trying to group or cluster them into four groups of equal size based on their distance to their mean.
df <- data.frame(x = rnorm(50, 0, 1),
y = rnorm(50, -0, 0.7))
ggplot(data = df, aes(x = x, y = y)) +
geom_point() +
geom_abline(slope = 0.7, intercept = 0) +
geom_abline(slope = -0.7, intercept = 0)
In this case, I would want each point to be assigned a group based on how far they are with respect to the average (here it would be (0,0)) and whether this distance trends more up, down, left or right. Thanks for the help.
df <- data.frame(x = rnorm(50, 0, 1),
y = rnorm(50, -0, 0.7),
center = rep(0,50)) # not really needed
df$v1 <- -df$x^2 # vector one 0(center) - df$x2
df$v2 <- -df$y^2 # vector two 0(center) - df$y2
df$dist = df$v1 + df$v2
df$len = sqrt(abs(df$dist)) #lenght of vector betwen center and point
df$group = cut(df$len, 10,labels = paste("Level",LETTERS[1:10])) # grouping into 10 bins
df<-df %>% mutate(posit= case_when(x < mean(x) & y < mean(y) ~ "A",
x > mean(x) & y > mean(y) ~ "B",
x <mean(x) & y >mean(y) ~ "C",
x >mean(x) & y < mean(y) ~ "D"))
ggplot(data = df ) +
geom_point(aes(x = x, y = y,col= group, shape= posit, size=2)) +
geom_vline(xintercept =mean(df$x))+
geom_abline(slope = -0.0, intercept =mean(df$y))
Related
I'd like to plot a discontinuous function without connecting a jump. For example, in the following plot, I'd like to delete the line connecting (0.5, 0.5) and (0.5, 1.5).
f <- function(x){
(x < .5) * (x) + (x >= .5) * (x + 1)
}
ggplot()+
geom_function(fun = f)
Edit: I'm looking for a solution that works even if the discountinuous point is not a round number, say pi/10.
You could write a little wrapper function which finds discontinuities in the given function and plots them as separate groups:
plot_fun <- function(fun, from = 0, to = 1, by = 0.001) {
x <- seq(from, to, by)
groups <- cut(x, c(-Inf, x[which(abs(diff(fun(x))) > 0.1)], Inf))
df <- data.frame(x, groups, y = fun(x))
ggplot(df, aes(x, y, group = groups)) +
geom_line()
}
This allows
plot_fun(f)
plot_fun(floor, 0, 10)
This answer is based on Allan Cameron's answer, but depicts the jump using open and closed circles. Whether the function is right or left continuous is controlled by an argument.
library("ggplot2")
plot_fun <- function(fun, from = 0, to = 1, by = 0.001, right_continuous = TRUE) {
x <- seq(from, to, by)
tol_vertical <- 0.1
y <- fun(x)
idx_break <- which(abs(diff(y)) > tol_vertical)
x_break <- x[idx_break]
y_break_l <- y[idx_break]
y_break_r <- y[idx_break + 1]
groups <- cut(x, c(-Inf, x_break, Inf))
df <- data.frame(x, groups, y = fun(x))
plot_ <- ggplot(df, aes(x, y, group = groups)) +
geom_line()
# add open and closed points showing jump
dataf_l <- data.frame(x = x_break, y = y_break_l)
dataf_r <- data.frame(x = x_break, y = y_break_r)
shape_open_circle <- 1
# this is the default of shape, but might as well specify.
shape_closed_circle <- 19
shape_size <- 4
if (right_continuous) {
shape_l <- shape_open_circle
shape_r <- shape_closed_circle
} else {
shape_l <- shape_closed_circle
shape_r <- shape_open_circle
}
plot_ <- plot_ +
geom_point(data = dataf_l, aes(x = x, y = y), group = NA, shape = shape_l, size = shape_size) +
geom_point(data = dataf_r, aes(x = x, y = y), group = NA, shape = shape_r, size = shape_size)
return(plot_)
}
Here's the OP's original example:
f <- function(x){
(x < .5) * (x) + (x >= .5) * (x + 1)
}
plot_fun(f)
Here's Allan's additional example using floor, which shows multiple discontinuities:
plot_fun(floor, from = 0, to = 10)
And here's an example showing that the function does not need to be piecewise linear:
f_curved <- function(x) ifelse(x > 0, yes = 0.5*(2-exp(-x)), no = 0)
plot_fun(f_curved, from = -1, to = 5)
You can insert everything inside an ifelse:
f <- function(x){
ifelse(x==0.5,
NA,
(x < .5) * (x) + (x >= .5) * (x + 1))
}
ggplot()+
geom_function(fun = f)
I'd like to ggplot 3 pairs (factor x0) of 2 conditions (factor cond0), using boxplots with specific quantile limits.
The 2 problems are:
There are 3 groups in condition A (red), but only 2 groups in condition B (blue). Since group 1B is missing, the boxplot of group 1A occupies its space on the graph (twice as large). I would like its width to be as narrow as the others, and the space of the missing group 1B to be maintained even if it is empty.
Since the group 3B has only one value (and therefore no outlier), the outliers of group 3A are located in the middle of the pair instead of being aligned with boxplot 3A.
Would there be a solution to these problems?
Thanks for help
library(dplyr)
library(ggplot2)
# dataframe
x1 <- rep(1:3, each=60)
y1 <- rnorm(180, rep(c(20,35,50), each=60), 10)
cond1 <- rep("A", each=180)
dat1 <- data.frame(x1, y1, cond1)
dat1$x1 <- as.factor(dat1$x1)
dat1$cond1 <- as.factor(dat1$cond1)
dat1 <- dat1 %>% rename(x0 = x1, y0 = y1, cond0 = cond1)
x2 <- rep(2:3, each = 179, len = 180) ; y2
y2 <- rnorm(180, rep(c(30,60), each=90), 7) ; x2
cond2 <- rep("B", each=180)
dat2 <- data.frame(x2, y2, cond2)
dat2$x2 <- as.factor(dat2$x2)
dat2$cond2 <- as.factor(dat2$cond2)
dat2 <- dat2 %>% rename(x0 = x2, y0 = y2, cond0 = cond2)
dat <- rbind(dat1,dat2)
# define boxplots limits
dat_boxlim <- function(x) {
r <- quantile(x, probs = c(0.1, 0.4, 0.5, 0.8, 0.9))
names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
r
}
# define outliers limits
dat_boxout <- function(x) {
subset(x, x < quantile(x, 0.1) | x > quantile(x, 0.9))
}
# figure
ggplot(dat, aes(x0, y0, group=interaction(cond0, x0), fill = cond0))+
stat_summary(fun.data = dat_boxlim, geom = "boxplot", position = position_dodge(0.7), width = 0.5, show.legend = TRUE) +
stat_summary(fun = dat_boxout, geom = "point", size=2, position = position_dodge(0.7), show.legend = FALSE)
The first problem is solved by using "position = position_dodge2(preserve = "single")" in stat_summary geom="boxplot".
The second problem is solved using the new formula below.
The whole appropriate code is:
library(dplyr)
library(ggplot2)
# dataframe
x1 <- rep(1:3, each=60)
y1 <- rnorm(180, rep(c(20,35,50), each=60), 10)
cond1 <- rep("A", each=180)
dat1 <- data.frame(x1, y1, cond1)
dat1$x1 <- as.factor(dat1$x1)
dat1$cond1 <- as.factor(dat1$cond1)
dat1 <- dat1 %>% rename(x0 = x1, y0 = y1, cond0 = cond1)
x2 <- rep(2:3, each = 179, len = 180) ; y2
y2 <- rnorm(180, rep(c(30,60), each=90), 7) ; x2
cond2 <- rep("B", each=180)
dat2 <- data.frame(x2, y2, cond2)
dat2$x2 <- as.factor(dat2$x2)
dat2$cond2 <- as.factor(dat2$cond2)
dat2 <- dat2 %>% rename(x0 = x2, y0 = y2, cond0 = cond2)
dat <- rbind(dat1,dat2)
# define boxplots limits
dat_boxlim <- function(x) {
r <- quantile(x, probs = c(0.1, 0.4, 0.5, 0.8, 0.9))
names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
r
}
# define outliers limits
dat_boxout <- function(x) {
if (length(x) > 1) { # or other length if needed (e.g. > 7)
return(subset(x, x < quantile(x, 0.1) | x > quantile(x, 0.9))) # only for low outliers
} else {
return(NA)
}
}
# figure
ggplot(dat, aes(x0, y0, group=interaction(cond0, x0), fill = cond0))+
stat_summary(fun.data = dat_boxlim, geom = "boxplot", position = position_dodge2(preserve = "single", 0.7, padding = 0.1), width = 0.5, show.legend = TRUE) +
stat_summary(fun = dat_boxout, geom = "point", size=2, position = position_dodge(preserve = "total", 0.5), show.legend = FALSE)
I have line plots y vs x. y is sigmoid and varies from 0 to 1.
determine the value of x where y = 0.5 or very close by interpolation.
draw vertical line at x where y = 0.5
library(tidyverse)
# continuous variables
x <- seq(-5, 5, 0.1)
# compute y1
error_term <- runif(1, min = -2, max = 2)
y1 <- 1/(1 + exp(-x + error_term))
# compute y2
error_term <- runif(1, min = -2, max = 2)
y2 <- 1/(1 + exp(-x + error_term))
# merge y
y <- c(y1, y2)
x <- c(x, x)
# categorical variable
a <- c(rep(0, 101), rep(1, 101))
tbl <- tibble(x, a, y)
# TASK
# 1. determine values of x at which y = 0.5 for all categories and store them in variable x0
# 2. Use x0 to draw vertical lines in plots at x where y is 0.5
# ggplot
ggplot(data = tbl,
aes(x = x,
y = y)) +
geom_line() +
theme_bw() +
facet_grid(a ~ .)
This really isn't something built in to ggplot so you'll need to summarize the data yourself prior to plotting. You can write a helper function and then create the data you need for the lines
find_intersect <- function(x,y, target=0.5) {
optimize(function(z) (approxfun(x,y)(z)-target)^2, x)$minimum
}
line_data <- tbl %>%
group_by(a) %>%
summarize(xint=find_intersect(x,y))
Then plot with
ggplot(data = tbl,
aes(x = x,
y = y)) +
geom_line() +
theme_bw() +
geom_vline(aes(xintercept=xint), data=line_data) +
facet_grid(a ~ .)
So I'm trying to plot a couple of curves using ggplot(), and I would like to have each curve sitting in its own plot in a facet_grid. All of this works fine.
The problem is that I'd also like to annotate the curve with the x value corresponding to the peak y value. I tried using geom_text(), and I tried implementing it as shown below, but it doesn't seem to quite work. It's clearly printing something onto the plot, but not the way I hoped it would; i.e., each plot has its corresponding x value printed on it at the location (x, max(y)).
I suspect I've not implemented the ifelse() correctly, but I'm not experienced enough with R to figure out what exactly the problem is.
Any suggestions on where I'm going wrong?
Output:
Data + code:
library('ggplot2')
x <- seq(5, 15, length=1000)
y <- dnorm(x, mean=10, sd=1)
z <- rep_len("z", length.out = 1000)
x1 <- seq(5, 15, length=1000)
y1 <- dnorm(x1, mean=10, sd=2)
z1 <- rep_len("z1", length.out = 1000)
x <- c(x, x1)
y <- c(y, y1)
z <- c(z, z1)
df <- data.frame(x, y, z)
ggplot(data = df, aes(x, y)) + geom_line() + facet_grid(.~z) + geom_text(data = df, aes(x, y, label = ifelse(y == max(y), as.numeric(x), '')), inherit.aes = FALSE, hjust = 0, vjust = 0)
Edit: the output I'm expecting is something like this:
You need to fix two things.
(1) calculate max per z
(2) avoid duplicate y_values
The following code should fix both:
library(dplyr)
df2 <- df %>%
distinct(y, .keep_all = TRUE) %>%
group_by(z) %>%
mutate(y_label = ifelse(y == max(y), as.numeric(x), ''))
as.data.frame(df2)
ggplot(data = df2, aes(x, y)) + geom_line() + facet_grid(.~z) + geom_text(aes(label = y_label), hjust = 0, vjust = 0)
You need to provide geom_text a data.frame with data for z and z1.
x y z
z 9.994995 0.3989373 z
z1 9.994995 0.1994705 z1
How to get that? Well, here's one way.
df.split <- split(df, f = df$z)
df.max <- sapply(df.split, FUN = function(x) which.max(x$y))
df.max <- mapply(function(x1, x2) x1[x2, ], x1 = df.split, x2 = df.max, SIMPLIFY = FALSE)
df.max <- do.call(rbind, df.max)
which you can then plot
ggplot(data = df, aes(x, y)) +
geom_line() +
geom_text(data = df.max, aes(x = x, y = y, label = round(y, 2))) +
facet_grid(. ~ z)
Get the means and maxes for each z:
Ys <- df %>% group_by(z) %>% summarise(maxY = max(y))
Xs <- df %>% group_by(z) %>% summarise(meanX = mean(x))
Plot with the geom_text
ggplot(data = df, aes(x, y)) +
geom_line() +
geom_text(data = left_join(Xs,Ys), aes(meanX, maxY, label = meanX)) +
facet_grid(.~z)
Or more succinctly
ggplot(data = df, aes(x, y)) +
geom_line() +
geom_text(data =
df %>%
group_by(z) %>%
summarise(maxY = max(y), meanX = mean(x)),
aes(meanX, maxY, label = meanX)) +
facet_grid(.~z)
Not sure about how to tackle this - I have a data distribution where data selection based on standard deviation does not include all data points (data is more variable on one end than on the other). However, when plotting a density plot I can see that all data outside the 8th blue ring are what I want to select.
Example code:
x <- sort(rnorm(1300, mean = 0, sd = 1))
y <- rnorm(1300, mean = 0, sd = 1)
x <- c(x, rnorm(300, mean = 4, sd = 2), rnorm(600, mean = -2, sd = 2))
y <- c(y, rnorm(300, mean = 3, sd = 4), rnorm(600, mean = -2, sd = 2))
mydata <- data.frame(x,y)
ggplot(data = mydata, aes(x = x, y = y)) +
geom_point(cex = 0.5) +
geom_density_2d()
I adapted this from http://slowkow.com/notes/ggplot2-color-by-density/.
Under the hood, geom_density_2d uses the MASS::kde2d function, so we can also apply it to the underlying data to subset by density.
set.seed(42)
x <- sort(rnorm(1300, mean = 0, sd = 1))
y <- rnorm(1300, mean = 0, sd = 1)
x <- c(x, rnorm(300, mean = 4, sd = 2), rnorm(600, mean = -2, sd = 2))
y <- c(y, rnorm(300, mean = 3, sd = 4), rnorm(600, mean = -2, sd = 2))
mydata <- data.frame(x,y)
# Copied from http://slowkow.com/notes/ggplot2-color-by-density/
get_density <- function(x, y, n = 100) {
dens <- MASS::kde2d(x = x, y = y, n = n)
ix <- findInterval(x, dens$x)
iy <- findInterval(y, dens$y)
ii <- cbind(ix, iy)
return(dens$z[ii])
}
mydata$density <- get_density(mydata$x, mydata$y)
Select points based on arbitrary contour
EDIT: Changed to allow selection based on contour levels
# First create plot with geom_density
gg <- ggplot(data = mydata, aes(x = x, y = y)) +
geom_point(cex = 0.5) +
geom_density_2d(size = 1, n = 100)
gg
# Extract levels denoted by contours by going into the
# ggplot build object. I found these coordinates by
# examining the object in RStudio; Note, the coordinates
# would change if the layer order were altered.
gb <- ggplot_build(gg)
contour_levels <- unique(gb[["data"]][[2]][["level"]])
# contour_levels
# [1] 0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08
# Add layer that relies on given contour level
gg2 <- gg +
geom_point(data = mydata %>%
filter(density <= contour_levels[1]),
color = "red", size = 0.5)
gg2