geom_vlines multiple vlines per plot - r

How can I get ggplot to produce something similar like
library(ggplot2)
library(reshape2)
library(ecp)
synthetic_control.data <- read.table("/path/synthetic_control.data.txt", quote="\"", comment.char="")
n <- 2
s <- sample(1:100, n)
idx <- c(s, 100+s, 200+s, 300+s, 400+s, 500+s)
sample2 <- synthetic_control.data[idx,]
df = as.data.frame(t(as.matrix(sample2)))
#calculate the change points
changeP <- e.divisive(as.matrix(df[1]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = data.frame(changeP,variable=colnames(df)[1])
for(series in 2:ncol(df)){
changeP <- e.divisive(as.matrix(df[series]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = rbind(changePoints, data.frame(changeP,variable=colnames(df)[2]))
}
this is the interesting part about the plot:
df$id = 1:nrow(df)
dfMelt <- reshape2::melt(df, id.vars = "id")
p = ggplot(dfMelt,aes(x=id,y=value))+geom_line(color = "steelblue")+ facet_grid(variable ~ ., scales = 'free_y')
p + geom_vline(aes(xintercept=changeP), data=changePoints, linetype='dashed')
So far my result is: https://www.dropbox.com/s/mysadkruo946oox/changePoint.pdf which means that there is something wrong with my array passed to the geom_vlines.
Could you point me in the right direction why I only get vlines in the first 2 plots?

This is the solution:
library(ggplot2)
library(reshape2)
library(ecp)
synthetic_control.data <- read.table("/Users/geoHeil/Dropbox/6.Semester/BachelorThesis/rResearch/data/synthetic_control.data.txt", quote="\"", comment.char="")
n <- 2
s <- sample(1:100, n)
idx <- c(s, 100+s, 200+s, 300+s, 400+s, 500+s)
sample2 <- synthetic_control.data[idx,]
df = as.data.frame(t(as.matrix(sample2)))
#calculate the change points
changeP <- e.divisive(as.matrix(df[1]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = data.frame(changeP,variable=colnames(df)[1])
for(series in 2:ncol(df)){
changeP <- e.divisive(as.matrix(df[series]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = rbind(changePoints, data.frame(changeP,variable=colnames(df)[series]))
}
# plot
df$id = 1:nrow(df)
dfMelt <- reshape2::melt(df, id.vars = "id")
p = ggplot(dfMelt,aes(x=id,y=value))+geom_line(color = "steelblue")+ facet_grid(variable ~ ., scales = 'free_y')
p + geom_vline(aes(xintercept=changeP), data=changePoints, linetype='dashed', colour='darkgreen')

Related

Binning two columns of data frame together in R

I would like to bin two columns of a dataset simultaneously to create one common binned column. The simple code is as follows
x <- sample(100)
y <- sample(100)
data <- data.frame(x, y)
xbin <- seq(from = 0, to = 100, by = 10)
ybin <- seq(from = 0, to = 100, by = 10)
Any help is appreciated!
Not sure if this is what you are looking for
library(tidyverse)
x <- sample(100)
y <- sample(100)
data <- data.frame(x, y)
xbin <- seq(from = 0, to = 100, by = 10)
ybin <- seq(from = 0, to = 100, by = 10)
data <- data%>%
dplyr::mutate(
x_binned = cut(x, breaks = seq(0,100,10)),
y_binned = cut(y, breaks = seq(0,100,10))
)
data %>%
ggplot() +
geom_bin_2d(
aes(x = x_binned, y = y_binned), binwidth = c(10,10), colour = "red") +
theme_minimal()
After asking in the comments I am still not quite shure, what the desired answer would look like but I hope, that one of the two answers in the below code will work for you:
x <- sample(100)
y <- sample(100)
data <- data.frame(x, y)
xbin <- seq(from = 0, to = 100, by = 10)
ybin <- seq(from = 0, to = 100, by = 10)
data$xbin <- cut(data$x, breaks = xbin, ordered = TRUE)
data$ybin <- cut(data$y, breaks = ybin, ordered = TRUE)
data$commonbin1 <- paste0(data$xbin, data$ybin)
data$commonbin2 <- paste0("(",as.numeric(data$xbin),";", as.numeric(data$ybin),")")
head(data, 20)
This will construct a common binning variable commonbin1 that includes the bin-limits in the names of the bins and commonbin2 which will be easier to compare to the plot mentioned in the comment.

Problem with stat_summary using group interaction and geom boxplot

I'd like to ggplot 3 pairs (factor x0) of 2 conditions (factor cond0), using boxplots with specific quantile limits.
The 2 problems are:
There are 3 groups in condition A (red), but only 2 groups in condition B (blue). Since group 1B is missing, the boxplot of group 1A occupies its space on the graph (twice as large). I would like its width to be as narrow as the others, and the space of the missing group 1B to be maintained even if it is empty.
Since the group 3B has only one value (and therefore no outlier), the outliers of group 3A are located in the middle of the pair instead of being aligned with boxplot 3A.
Would there be a solution to these problems?
Thanks for help
library(dplyr)
library(ggplot2)
# dataframe
x1 <- rep(1:3, each=60)
y1 <- rnorm(180, rep(c(20,35,50), each=60), 10)
cond1 <- rep("A", each=180)
dat1 <- data.frame(x1, y1, cond1)
dat1$x1 <- as.factor(dat1$x1)
dat1$cond1 <- as.factor(dat1$cond1)
dat1 <- dat1 %>% rename(x0 = x1, y0 = y1, cond0 = cond1)
x2 <- rep(2:3, each = 179, len = 180) ; y2
y2 <- rnorm(180, rep(c(30,60), each=90), 7) ; x2
cond2 <- rep("B", each=180)
dat2 <- data.frame(x2, y2, cond2)
dat2$x2 <- as.factor(dat2$x2)
dat2$cond2 <- as.factor(dat2$cond2)
dat2 <- dat2 %>% rename(x0 = x2, y0 = y2, cond0 = cond2)
dat <- rbind(dat1,dat2)
# define boxplots limits
dat_boxlim <- function(x) {
r <- quantile(x, probs = c(0.1, 0.4, 0.5, 0.8, 0.9))
names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
r
}
# define outliers limits
dat_boxout <- function(x) {
subset(x, x < quantile(x, 0.1) | x > quantile(x, 0.9))
}
# figure
ggplot(dat, aes(x0, y0, group=interaction(cond0, x0), fill = cond0))+
stat_summary(fun.data = dat_boxlim, geom = "boxplot", position = position_dodge(0.7), width = 0.5, show.legend = TRUE) +
stat_summary(fun = dat_boxout, geom = "point", size=2, position = position_dodge(0.7), show.legend = FALSE)
The first problem is solved by using "position = position_dodge2(preserve = "single")" in stat_summary geom="boxplot".
The second problem is solved using the new formula below.
The whole appropriate code is:
library(dplyr)
library(ggplot2)
# dataframe
x1 <- rep(1:3, each=60)
y1 <- rnorm(180, rep(c(20,35,50), each=60), 10)
cond1 <- rep("A", each=180)
dat1 <- data.frame(x1, y1, cond1)
dat1$x1 <- as.factor(dat1$x1)
dat1$cond1 <- as.factor(dat1$cond1)
dat1 <- dat1 %>% rename(x0 = x1, y0 = y1, cond0 = cond1)
x2 <- rep(2:3, each = 179, len = 180) ; y2
y2 <- rnorm(180, rep(c(30,60), each=90), 7) ; x2
cond2 <- rep("B", each=180)
dat2 <- data.frame(x2, y2, cond2)
dat2$x2 <- as.factor(dat2$x2)
dat2$cond2 <- as.factor(dat2$cond2)
dat2 <- dat2 %>% rename(x0 = x2, y0 = y2, cond0 = cond2)
dat <- rbind(dat1,dat2)
# define boxplots limits
dat_boxlim <- function(x) {
r <- quantile(x, probs = c(0.1, 0.4, 0.5, 0.8, 0.9))
names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
r
}
# define outliers limits
dat_boxout <- function(x) {
if (length(x) > 1) { # or other length if needed (e.g. > 7)
return(subset(x, x < quantile(x, 0.1) | x > quantile(x, 0.9))) # only for low outliers
} else {
return(NA)
}
}
# figure
ggplot(dat, aes(x0, y0, group=interaction(cond0, x0), fill = cond0))+
stat_summary(fun.data = dat_boxlim, geom = "boxplot", position = position_dodge2(preserve = "single", 0.7, padding = 0.1), width = 0.5, show.legend = TRUE) +
stat_summary(fun = dat_boxout, geom = "point", size=2, position = position_dodge(preserve = "total", 0.5), show.legend = FALSE)

Manually assigning colors with scale_fill_manual only works for certain hexagon sizes

I am trying to create a scatterplot that is summarized by hexagon bins of counts. I would like the user to be able to define the count breaks for the color scale. I have this working, using scale_fill_manual(). Oddly, however, it only works sometimes. In the MWE below, using the given seed value, if xbins=10, there are issues resulting in a plot as follows:
However, if xbins=20 or 40, for example, the plot doesn't seem to have problems:
My MWE is as follows:
library(ggplot2)
library(hexbin)
library(RColorBrewer)
set.seed(1)
xbins <- 20
x <- abs(rnorm(10000))
y <- abs(rnorm(10000))
minVal <- min(x, y)
maxVal <- max(x, y)
maxRange <- c(minVal, maxVal)
buffer <- (maxRange[2] - maxRange[1]) / (xbins / 2)
h <- hexbin(x = x, y = y, xbins = xbins, shape = 1, IDs = TRUE,
xbnds = maxRange, ybnds = maxRange)
hexdf <- data.frame (hcell2xy(h), hexID = h#cell, counts = h#count)
my_breaks <- c(2, 4, 6, 8, 20, 1000)
clrs <- brewer.pal(length(my_breaks) + 3, "Blues")
clrs <- clrs[3:length(clrs)]
hexdf$countColor <- cut(hexdf$counts, breaks = c(0, my_breaks, Inf),
labels = rev(clrs))
ggplot(hexdf, aes(x = x, y = y, hexID = hexID, fill = countColor)) +
scale_fill_manual(values = levels(hexdf$countColor)) +
geom_hex(stat = "identity") +
geom_abline(intercept = 0, color = "red", size = 0.25) +
coord_fixed(xlim = c(-0.5, (maxRange[2] + buffer)),
ylim = c(-0.5, (maxRange[2] + buffer))) +
theme(aspect.ratio=1)
My goal is to tweak this code so that the plot does not have problems (where suddenly certain hexagons are different sizes and shapes than the rest) regardless of the value assigned to xbins. However, I am puzzled what may be causing this problem for certain xbins values. Any advice would be greatly appreciated.
EDIT:
I am updating the example code after taking into account comments by #bdemarest and #Axeman. I followed the most popular answer in the link #Axeman recommends, and believe it is more useful when you are working with scale_fill_continuous() on an integer vector. Here, I am working on scale_fill_manual() on a factor vector. As a result, I am still unable to get this goal to work. Thank you.
library(ggplot2)
library(hexbin)
library(RColorBrewer)
set.seed(1)
xbins <- 10
x <- abs(rnorm(10000))
y <- abs(rnorm(10000))
minVal <- min(x, y)
maxVal <- max(x, y)
maxRange <- c(minVal, maxVal)
buffer <- (maxRange[2] - maxRange[1]) / (xbins / 2)
bindata = data.frame(x=x,y=y,factor=as.factor(1))
h <- hexbin(bindata, xbins = xbins, IDs = TRUE, xbnds = maxRange, ybnds = maxRange)
counts <- hexTapply (h, bindata$factor, table)
counts <- t (simplify2array (counts))
counts <- melt (counts)
colnames (counts) <- c ("factor", "ID", "counts")
counts$factor =as.factor(counts$factor)
hexdf <- data.frame (hcell2xy (h), ID = h#cell)
hexdf <- merge (counts, hexdf)
my_breaks <- c(2, 4, 6, 8, 20, 1000)
clrs <- brewer.pal(length(my_breaks) + 3, "Blues")
clrs <- clrs[3:length(clrs)]
hexdf$countColor <- cut(hexdf$counts, breaks = c(0, my_breaks, Inf), labels = rev(clrs))
ggplot(hexdf, aes(x = x, y = y, fill = countColor)) +
scale_fill_manual(values = levels(hexdf$countColor)) +
geom_hex(stat = "identity") +
geom_abline(intercept = 0, color = "red", size = 0.25) +
coord_cartesian(xlim = c(-0.5, maxRange[2]+buffer), ylim = c(-0.5, maxRange[2]+ buffer)) + theme(aspect.ratio=1)
you can define colors in 'geom' instead of 'scale' that modifies the scale of plot:
ggplot(hexdf, aes(x = x, y = y)) +
geom_hex(stat = "identity",fill =hexdf$countColor)

How to retrieve isocontour coordinates?

I have data interpolated on a grid and I need to retrieve the iso-contour coordinates:
require(akima)
require(pracma)
require(ggplot2)
require(RColorBrewer)
r <- seq(0.1, 1, length.out = 20)
theta <- seq(0, 90)
my.df <- expand.grid(r = r, theta = theta)
my.df$value <- 1/my.df$r^2 * sin(deg2rad(my.df$theta))
# Interpolating data on rectangular grid
data.interp <-
interp(
x = my.df$r * cos(deg2rad(my.df$theta)),
y = my.df$r * sin(deg2rad(my.df$theta)),
z = my.df$value,
nx = 200,
ny = 200,
duplicate = "strip"
)
data.xyz <- as.data.frame(interp2xyz(data.interp))
data.xyz <- setNames(data.xyz, c("x", "y", "value"))
data.xyz <- na.omit(data.xyz)
my.breaks <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
brks <- cut(data.xyz$value,
breaks = my.breaks,
ordered_result = TRUE)
levels(brks) <- gsub(",", " - ", levels(brks), fixed = TRUE)
levels(brks) <- gsub("\\(|\\]","",levels(brks))
data.xyz$brks <- brks
ggplot(data.xyz, aes(x = x, y = y, fill = brks)) +
geom_tile() +
scale_fill_manual("Value",
values = rev(colorRampPalette(brewer.pal(11, "Spectral"))(length(my.breaks))))
Here is what the result looks like:
What I need is to retrieve the coordinates of my iso-contours.
The purpose of to create a 3D model of those contours assuming the data is axisymmetric. But before I do that, I need to find the coordinates of the line separating the colors.
Using contourLines, here is how to do this:
r <- seq(0.1, 1, length.out = 20)
theta <- seq(0, 90)
my.df <- expand.grid(r = r, theta = theta)
my.df$value <- 1/my.df$r^2 * sin(deg2rad(my.df$theta))
my.matrix <- acast(my.df, r ~ theta, value.var = "value")
contour.lines <- contourLines(x = r,
y = theta,
z = my.matrix,
levels = seq(0, 100, by = 10))
contour.df <- data.frame()
for(level in contour.lines) {
contour.df <- rbind(contour.df, data.frame(x = level$x * cos(deg2rad(level$y)),
y = level$x * sin(deg2rad(level$y)),
level = as.factor(level$level)))
}
ggplot(contour.df, aes(x, y, color = level)) + geom_path() + scale_x_continuous(limits = c(0, 1)) + scale_y_continuous(limits = c(0, 1))

Multiple density graphs different groups (based on factor level) using plyr

I am trying to output multiple density plot from a function, by dividing the dataframe into pieces such that separate density for each level of a factor for corresponding yvar.
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(gen, yvar)
minyvar <- min(yvar)
maxyvar <- max(yvar)
par(mfrow = c(length(levels(mydf$gen)),1))
plotdensity <- function (xf, minyvar, maxyvar){
plot(density(xf), xlim=c(minyvar, maxyvar), main = paste (names(xf),
"distribution", sep = ""))
dens <- density(xf)
x1 <- min(which(dens$x >= quantile(xf, .80)))
x2 <- max(which(dens$x < max(dens$x)))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="blu4"))
abline(v= mean(xf), col = "black", lty = 1, lwd =2)
}
require(plyr)
ddply(mydf, .(mydf$gen), plotdensity, yvar, minyvar, maxyvar)
Error in .fun(piece, ...) : unused argument(s) (111.544494112914)
My specific expectation are each plot is named by name of level for example Aa, Bb, Cc, Dd
Arrangement of the graphs see the parameter set, so that we compare density changes and means. compact - Low space between the graphs.
Help appreciated.
Edits:
The following graphs are individually produced, although I want to develop a function that can be applicable to x level for a factor.
I see that #Andrie just beat me to most of this. I'm still going to post my answer, since filling only certain quantiles of the distribution requires a slightly different approach.
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(grp = gen,x = c(Aa,Bb,Cc,Dd))
#Calculate the densities and an indicator for the desire quantile
# for later use in subsetting
mydf <- ddply(mydf,.(grp),.fun = function(x){
tmp <- density(x$x)
x1 <- tmp$x
y1 <- tmp$y
q80 <- x1 >= quantile(x$x,0.8)
data.frame(x=x1,y=y1,q80=q80)
})
#Separate data frame for the means
mydfMean <- ddply(mydf,.(grp),summarise,mn = mean(x))
ggplot(mydf,aes(x = x)) +
facet_wrap(~grp) +
geom_line(aes(y = y)) +
geom_ribbon(data = subset(mydf,q80),aes(ymax = y),ymin = 0, fill = "black") +
geom_vline(data = mydfMean,aes(xintercept = mn),colour = "black")
Here is a way of doing it in ggplot:
set.seed(1234)
mydf <- rbind(
data.frame(gen="Aa", yvar= rnorm(40000, 50, 10)),
data.frame(gen="Bb", yvar=rnorm(4000, 70, 10)),
data.frame(gen="Cc", yvar=rnorm(400, 75, 10)),
data.frame(gen="Dd", yvar=rnorm(40, 80, 10))
)
labels <- ddply(mydf, .(gen), nrow)
means <- ddply(mydf, .(gen), summarize, mean=mean(yvar))
ggplot(mydf, aes(x=yvar)) +
stat_density(fill="blue") +
facet_grid(gen~.) +
theme_bw() +
geom_vline(data=means, aes(xintercept=mean), colour="red") +
geom_text(data=labels, aes(label=paste("n =", V1)), x=5, y=0,
hjust=0, vjust=0) +
opts(title="Distribution")
With sincere thanks to joran and Andrie, the following is just compilation of my favorite from above two posts, just some of readers might want to see.
require(ggplot2)
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(grp = gen,x = c(Aa,Bb,Cc,Dd))
mydf1 <- mydf
#Calculate the densities and an indicator for the desire quantile
# for later use in subsetting
mydf <- ddply(mydf,.(grp),.fun = function(x){
tmp <- density(x$x)
x1 <- tmp$x
y1 <- tmp$y
q80 <- x1 >= quantile(x$x,0.8)
data.frame(x=x1,y=y1,q80=q80)
})
#Separate data frame for the means
mydfMean <- ddply(mydf,.(grp),summarise,mn = mean(x))
labels <- ddply(mydf1, .(grp), nrow)
ggplot(mydf,aes(x = x)) +
facet_grid(grp~.) +
geom_line(aes(y = y)) +
geom_ribbon(data = subset(mydf,q80),aes(ymax = y),ymin = 0,
fill = "black") +
geom_vline(data = mydfMean,aes(xintercept = mn),
colour = "black") + geom_text(data=labels,
aes(label=paste("n =", labels$V1)), x=5, y=0,
hjust=0, vjust=0) +
opts(title="Distribution") + theme_bw()

Resources