Multiple density graphs different groups (based on factor level) using plyr

Multiple density graphs different groups (based on factor level) using plyr - r

I am trying to output multiple density plot from a function, by dividing the dataframe into pieces such that separate density for each level of a factor for corresponding yvar.
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(gen, yvar)
minyvar <- min(yvar)
maxyvar <- max(yvar)
par(mfrow = c(length(levels(mydf$gen)),1))
plotdensity <- function (xf, minyvar, maxyvar){
plot(density(xf), xlim=c(minyvar, maxyvar), main = paste (names(xf),
"distribution", sep = ""))
dens <- density(xf)
x1 <- min(which(dens$x >= quantile(xf, .80)))
x2 <- max(which(dens$x < max(dens$x)))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="blu4"))
abline(v= mean(xf), col = "black", lty = 1, lwd =2)
}
require(plyr)
ddply(mydf, .(mydf$gen), plotdensity, yvar, minyvar, maxyvar)
Error in .fun(piece, ...) : unused argument(s) (111.544494112914)
My specific expectation are each plot is named by name of level for example Aa, Bb, Cc, Dd
Arrangement of the graphs see the parameter set, so that we compare density changes and means. compact - Low space between the graphs.
Help appreciated.
Edits:
The following graphs are individually produced, although I want to develop a function that can be applicable to x level for a factor.

I see that #Andrie just beat me to most of this. I'm still going to post my answer, since filling only certain quantiles of the distribution requires a slightly different approach.
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(grp = gen,x = c(Aa,Bb,Cc,Dd))
#Calculate the densities and an indicator for the desire quantile
# for later use in subsetting
mydf <- ddply(mydf,.(grp),.fun = function(x){
tmp <- density(x$x)
x1 <- tmp$x
y1 <- tmp$y
q80 <- x1 >= quantile(x$x,0.8)
data.frame(x=x1,y=y1,q80=q80)
})
#Separate data frame for the means
mydfMean <- ddply(mydf,.(grp),summarise,mn = mean(x))
ggplot(mydf,aes(x = x)) +
facet_wrap(~grp) +
geom_line(aes(y = y)) +
geom_ribbon(data = subset(mydf,q80),aes(ymax = y),ymin = 0, fill = "black") +
geom_vline(data = mydfMean,aes(xintercept = mn),colour = "black")

Here is a way of doing it in ggplot:
set.seed(1234)
mydf <- rbind(
data.frame(gen="Aa", yvar= rnorm(40000, 50, 10)),
data.frame(gen="Bb", yvar=rnorm(4000, 70, 10)),
data.frame(gen="Cc", yvar=rnorm(400, 75, 10)),
data.frame(gen="Dd", yvar=rnorm(40, 80, 10))
)
labels <- ddply(mydf, .(gen), nrow)
means <- ddply(mydf, .(gen), summarize, mean=mean(yvar))
ggplot(mydf, aes(x=yvar)) +
stat_density(fill="blue") +
facet_grid(gen~.) +
theme_bw() +
geom_vline(data=means, aes(xintercept=mean), colour="red") +
geom_text(data=labels, aes(label=paste("n =", V1)), x=5, y=0,
hjust=0, vjust=0) +
opts(title="Distribution")

With sincere thanks to joran and Andrie, the following is just compilation of my favorite from above two posts, just some of readers might want to see.
require(ggplot2)
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(grp = gen,x = c(Aa,Bb,Cc,Dd))
mydf1 <- mydf
#Calculate the densities and an indicator for the desire quantile
# for later use in subsetting
mydf <- ddply(mydf,.(grp),.fun = function(x){
tmp <- density(x$x)
x1 <- tmp$x
y1 <- tmp$y
q80 <- x1 >= quantile(x$x,0.8)
data.frame(x=x1,y=y1,q80=q80)
})
#Separate data frame for the means
mydfMean <- ddply(mydf,.(grp),summarise,mn = mean(x))
labels <- ddply(mydf1, .(grp), nrow)
ggplot(mydf,aes(x = x)) +
facet_grid(grp~.) +
geom_line(aes(y = y)) +
geom_ribbon(data = subset(mydf,q80),aes(ymax = y),ymin = 0,
fill = "black") +
geom_vline(data = mydfMean,aes(xintercept = mn),
colour = "black") + geom_text(data=labels,
aes(label=paste("n =", labels$V1)), x=5, y=0,
hjust=0, vjust=0) +
opts(title="Distribution") + theme_bw()

Related

Labelling points on a geom_point() based on the equation of geom_abline()

I have a fun problem for you today. Any help would be amazing.
I have geom_plot()
set.seed(1)
list1 <- sample(10, 100, replace = T)
list2 <- sample(7, 100, replace = T)
df <- data.frame(list1, list2)
df
ggplot(data=df, aes(x=list1, y=list2)) + geom_point()
x3 <- c(10,6)
y3 <- c(1,7)
slope3 <- diff(y3)/diff(x3)
intercept3 <- y3[1] - slope*x3[1]
ggplot(data = df, aes(x=list1, y=list2)) +
geom_point() +
geom_abline(data = NULL, intercept = intercept3, slope = slope3, col="red")
What I would ideally like to do is label all the points that lay on, or to the right of the geom_abline() I have plotted I wanted to use the geom_label_repel() function to make this look tidy, but when I tried this it just labelled every poitn! If possible, I would also like to further subset the data points on or to the right of the plotted geom_abline() in the future based on other criteria.
Thank you in advance!

You can try this:
set.seed(1)
list1 <- sample(10, 100, replace = T)
list2 <- sample(7, 100, replace = T)
df <- data.frame(list1, list2)
df
ggplot(data=df, aes(x=list1, y=list2)) + geom_point()
x3 <- c(10,6)
y3 <- c(1,7)
slope3 <- diff(y3)/diff(x3)
intercept3 <- y3[1] - slope3*x3[1]
#Mutate
df$prod <- intercept3+slope3*df$list1
df$label <- ifelse(df$list2>df$prod,'text',NA)
ggplot(data = df, aes(x=list1, y=list2,label=label)) +
geom_point() +
geom_abline(data = NULL, intercept = intercept3, slope = slope3, col="red")+
geom_text(vjust=-0.5)

Labelling R2 and p value in ggplot?

I am trying to add lm model coefs of two parallel modelling results onto the same ggplot plot. Here is my working example:
library(ggplot2)
set.seed(100)
dat <- data.frame(
x <- rnorm(100, 1),
y <- rnorm(100, 10),
lev <- gl(n = 2, k = 50, labels = letters[1:2])
)
mod1 <- lm(y~x, dat = dat[lev %in% "a", ])
r1 <- paste("R^2==", round(summary(mod1)[[9]], 3))
p1<- paste("p==", round(summary(mod1)[[4]][2, 4], 3), sep= "")
lab1 <- paste(r1, p1, sep =",")
mod2 <- lm(y~x, dat = dat[lev %in% "b", ])
r2 <- paste("R^2==", round(summary(mod2)[[9]], 3))
p2 <- paste("p==", round(summary(mod2)[[4]][2, 4], 3), sep= "")
lab2 <- paste(r2, p2, sep =",")
ggplot(dat, aes(x = x, y = y, col = lev)) + geom_jitter() + geom_smooth(method = "lm") + annotate("text", x = 2, y = 12, label = lab1, parse = T) + annotate("text", x = 10, y = 8, label = lab2, parse = T)
Here is the promot shows:
Error in parse(text = text[[i]]) : <text>:1:12: unexpected ','
1: R^2== 0.008,
Now the problem is that I could label either R2 or p value seperately, but not both of them together. How could I do to put the two results into one single line on the figure?
BTW, any other efficienty way of doing the same thing as my code? I have nine subplots that I want to put into one full plot, and I don't want to add them one by one.
++++++++++++++++++++++++++ Some update ++++++++++++++++++++++++++++++++++
Following #G. Grothendieck 's kind suggestion and idea, I tried to wrap the most repeatative part of the codes into a function, so I could finish all the plot with a few lines. Now the problem is that, whatever I changed the input variables, the output plot are basically the same, except the axis labels. Can anyone explain why? The following is the working code I used:
library(ggplot2)
library(ggpubr)
set.seed(100)
dat <- data.frame(
x = rnorm(100, 1),
y = rnorm(100, 10),
z = rnorm(100, 25),
lev = gl(n = 2, k = 50, labels = letters[1:2])
)
test <- function(dat, x, y){
fmt <- "%s: Adj ~ R^2 == %.3f * ',' ~ {p == %.3f}"
mod1 <- lm(y ~ x, dat, subset = lev == "a")
sum1 <- summary(mod1)
lab1 <- sprintf(fmt, "a", sum1$adj.r.squared, coef(sum1)[2, 4])
mod2 <- lm(y ~ x, dat, subset = lev == "b")
sum2 <- summary(mod2)
lab2 <- sprintf(fmt, "b", sum2$adj.r.squared, coef(sum2)[2, 4])
colors <- 1:2
p <- ggplot(dat, aes(x = x, y = y, col = lev)) +
geom_jitter() +
geom_smooth(method = "lm") +
annotate("text", x = 2, y = c(12, 8), label = c(lab1, lab2),
parse = TRUE, hjust = 0, color = colors) +
scale_color_manual(values = colors)
return(p)
}
ggarrange(test(dat, x, z), test(dat, y, z))

There are several problems here:
x, y and lev are arguments to data.frame so they must be specified using = rather than <-
make use of the subset= argument in lm
use sprintf instead of paste to simplify the specification of labels
label the text strings a and b and make them the same color as the corresponding lines to identify which is which
the formula syntax needs to be corrected. See fmt below.
it would be clearer to use component names and accessor functions of the summary objects where available
use TRUE rather than T because the latter can be overridden if there is a variable called T but TRUE can never be overridden.
use hjust=0 and adjust the x= and y= in annotate to align the two text strings
combine the annotate statements
place the individual terms of the ggplot statement on separate lines for improved readability
This gives:
library(ggplot2)
set.seed(100)
dat <- data.frame(
x = rnorm(100, 1),
y = rnorm(100, 10),
lev = gl(n = 2, k = 50, labels = letters[1:2])
)
fmt <- "%s: Adj ~ R^2 == %.3f * ',' ~ {p == %.3f}"
mod1 <- lm(y ~ x, dat, subset = lev == "a")
sum1 <- summary(mod1)
lab1 <- sprintf(fmt, "a", sum1$adj.r.squared, coef(sum1)[2, 4])
mod2 <- lm(y ~ x, dat, subset = lev == "b")
sum2 <- summary(mod2)
lab2 <- sprintf(fmt, "b", sum2$adj.r.squared, coef(sum2)[2, 4])
colors <- 1:2
ggplot(dat, aes(x = x, y = y, col = lev)) +
geom_jitter() +
geom_smooth(method = "lm") +
annotate("text", x = 2, y = c(12, 8), label = c(lab1, lab2),
parse = TRUE, hjust = 0, color = colors) +
scale_color_manual(values = colors)

Unless I'm misunderstanding your question, the problem's with the parse = T arguments to your annotate calls. I don't think your strings need to be parsed. Try parse = F instead, or just drop the parameter, as the default value seems to be FALSE anyway

How to retrieve isocontour coordinates?

I have data interpolated on a grid and I need to retrieve the iso-contour coordinates:
require(akima)
require(pracma)
require(ggplot2)
require(RColorBrewer)
r <- seq(0.1, 1, length.out = 20)
theta <- seq(0, 90)
my.df <- expand.grid(r = r, theta = theta)
my.df$value <- 1/my.df$r^2 * sin(deg2rad(my.df$theta))
# Interpolating data on rectangular grid
data.interp <-
interp(
x = my.df$r * cos(deg2rad(my.df$theta)),
y = my.df$r * sin(deg2rad(my.df$theta)),
z = my.df$value,
nx = 200,
ny = 200,
duplicate = "strip"
)
data.xyz <- as.data.frame(interp2xyz(data.interp))
data.xyz <- setNames(data.xyz, c("x", "y", "value"))
data.xyz <- na.omit(data.xyz)
my.breaks <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
brks <- cut(data.xyz$value,
breaks = my.breaks,
ordered_result = TRUE)
levels(brks) <- gsub(",", " - ", levels(brks), fixed = TRUE)
levels(brks) <- gsub("\\(|\\]","",levels(brks))
data.xyz$brks <- brks
ggplot(data.xyz, aes(x = x, y = y, fill = brks)) +
geom_tile() +
scale_fill_manual("Value",
values = rev(colorRampPalette(brewer.pal(11, "Spectral"))(length(my.breaks))))
Here is what the result looks like:
What I need is to retrieve the coordinates of my iso-contours.
The purpose of to create a 3D model of those contours assuming the data is axisymmetric. But before I do that, I need to find the coordinates of the line separating the colors.

Using contourLines, here is how to do this:
r <- seq(0.1, 1, length.out = 20)
theta <- seq(0, 90)
my.df <- expand.grid(r = r, theta = theta)
my.df$value <- 1/my.df$r^2 * sin(deg2rad(my.df$theta))
my.matrix <- acast(my.df, r ~ theta, value.var = "value")
contour.lines <- contourLines(x = r,
y = theta,
z = my.matrix,
levels = seq(0, 100, by = 10))
contour.df <- data.frame()
for(level in contour.lines) {
contour.df <- rbind(contour.df, data.frame(x = level$x * cos(deg2rad(level$y)),
y = level$x * sin(deg2rad(level$y)),
level = as.factor(level$level)))
}
ggplot(contour.df, aes(x, y, color = level)) + geom_path() + scale_x_continuous(limits = c(0, 1)) + scale_y_continuous(limits = c(0, 1))

ggplot, plotting in multiple pages with different rows

I wanted to make a graph using facet_wrap and plot it in different pages in a pdf file. I've read son many options, and this works:
R + ggplot: plotting over multiple pages
but only when you have the same rows in each page.
I have this demo data to try explain my case:
A <- data.frame(TIME = rep(c(0, 5, 10, 15, 30, 45, 60), 5))
A$C <- (1 - exp(-0.2*A$TIME))
A$ID <- rep(1:5, each = 7)
A$R <- rnorm(35, mean = 1, sd = 0.01)
A$C2 <- A$C*A$R
Pages <- 5
A2 <- A[c(1,4:8,10:22,24:35),]
So, I have ID with different number of observations. I tried to make a vector with the number of observation in each ID (I want an ID per page), but it doesn't work.
nrws <- ddply(A2, .(ID), "nrow")
nsamp <- nrws[,2]
pdf("Test.pdf")
for (i in seq(Pages))
{
slice = seq(((i-1)*nsamp[i]),(i*nsamp[i]))
slice2 = slice[!(slice > nrow(A2))]
A3 = A2[slice2,]
p1 <- ggplot(A3, aes(x = TIME, y = C2)) +
geom_line(size = 0.5) +
geom_point(size = 1) +
facet_wrap(~ID)
print(p1)
}
dev.off()
Could you help me?
Thanks in advances,
Nacho

I think you were overthinking trying to calculate your "slices". Maybe you want this?
Not entirely sure. If you only want one ID per page you don't need facet_wrap, and you will probably need to set the scale explicitly to keep it the same from page to page.
library(plyr)
A <- data.frame(TIME = rep(c(0, 5, 10, 15, 30, 45, 60), 5))
A$C <- (1 - exp(-0.2*A$TIME))
A$ID <- rep(1:5, each = 7)
A$R <- rnorm(35, mean = 1, sd = 0.01)
A$C2 <- A$C*A$R
Pages <- 5
A2 <- A[c(1,4:8,10:22,24:35),]
nrws <- ddply(A2, .(ID), "nrow")
nsamp <- nrws[,2]
pdf("Test.pdf")
for (i in seq(Pages))
{
# slice = seq(((i-1)*nsamp[i]),(i*nsamp[i]))
# slice2 = slice[!(slice > nrow(A2))]
# A3 = A2[slice2,]
A3 = A2[A2$ID==i,]
p1 <- ggplot(A3, aes(x = TIME, y = C2)) +
geom_line(size = 0.5) +
geom_point(size = 1) +
facet_wrap(~ID)
print(p1)
}
dev.off()

geom_vlines multiple vlines per plot

How can I get ggplot to produce something similar like
library(ggplot2)
library(reshape2)
library(ecp)
synthetic_control.data <- read.table("/path/synthetic_control.data.txt", quote="\"", comment.char="")
n <- 2
s <- sample(1:100, n)
idx <- c(s, 100+s, 200+s, 300+s, 400+s, 500+s)
sample2 <- synthetic_control.data[idx,]
df = as.data.frame(t(as.matrix(sample2)))
#calculate the change points
changeP <- e.divisive(as.matrix(df[1]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = data.frame(changeP,variable=colnames(df)[1])
for(series in 2:ncol(df)){
changeP <- e.divisive(as.matrix(df[series]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = rbind(changePoints, data.frame(changeP,variable=colnames(df)[2]))
}
this is the interesting part about the plot:
df$id = 1:nrow(df)
dfMelt <- reshape2::melt(df, id.vars = "id")
p = ggplot(dfMelt,aes(x=id,y=value))+geom_line(color = "steelblue")+ facet_grid(variable ~ ., scales = 'free_y')
p + geom_vline(aes(xintercept=changeP), data=changePoints, linetype='dashed')
So far my result is: https://www.dropbox.com/s/mysadkruo946oox/changePoint.pdf which means that there is something wrong with my array passed to the geom_vlines.
Could you point me in the right direction why I only get vlines in the first 2 plots?

This is the solution:
library(ggplot2)
library(reshape2)
library(ecp)
synthetic_control.data <- read.table("/Users/geoHeil/Dropbox/6.Semester/BachelorThesis/rResearch/data/synthetic_control.data.txt", quote="\"", comment.char="")
n <- 2
s <- sample(1:100, n)
idx <- c(s, 100+s, 200+s, 300+s, 400+s, 500+s)
sample2 <- synthetic_control.data[idx,]
df = as.data.frame(t(as.matrix(sample2)))
#calculate the change points
changeP <- e.divisive(as.matrix(df[1]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = data.frame(changeP,variable=colnames(df)[1])
for(series in 2:ncol(df)){
changeP <- e.divisive(as.matrix(df[series]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = rbind(changePoints, data.frame(changeP,variable=colnames(df)[series]))
}
# plot
df$id = 1:nrow(df)
dfMelt <- reshape2::melt(df, id.vars = "id")
p = ggplot(dfMelt,aes(x=id,y=value))+geom_line(color = "steelblue")+ facet_grid(variable ~ ., scales = 'free_y')
p + geom_vline(aes(xintercept=changeP), data=changePoints, linetype='dashed', colour='darkgreen')