dt<- fread("Book2.txt")
names(dt)
Book2.txt <- data.frame(table('d$sample','d$expression', 'd$SD'))
names(dt) <- c("sample","expression", "SD")
ticks <- c('wt','clone16','clone14','clone20')
SD <-c(0.001793815,0.000869683,0.000106077,6.04489E-05)
gg <- ggplot(data=dt, aes(x=sample, y=expression)) + geom_bar(stat="identity")
gg <- gg + scale_x_discrete(limits=ticks, labels = ticks) +
geom_errorbar(aes(ymin=expression-SD, ymax=expression+SD), color= "grey", width=.1)
Please, Could anyone help me out in solving this problem.The graph A is obtained using excel from the same set of values, while the graph B is obtained using ggplot2. The SD deviation in the geom_bar graph is not actual and assigned to the graph according to the height of the graph.
My plots:
Related
I want to create a (time-series) plot out of 40 million data points in order to show two regression lines with two specific events on each of it (first occurrence of an optimum in time-series).
Currently, I draw the regression lines and add a geom_vline to it to indicate the event.
As I want to be independent from colours in the plot, it would be beneficial if I could just plot the marker geom_vline as a point on the regression line.
Do you have any idea how to solve this using ggplot2?
My current approach is this here (replaced data points with test data):
library(ggplot2)
# Generate data
m1 <- "method 1"
m2 <- "method 2"
data1 <- data.frame(Time=seq(100), Value=sample(1000, size=100), Type=rep(as.factor(m1), 100))
data2 <- data.frame(Time=seq(100), Value=sample(1000, size=100), Type=rep(as.factor(m2), 100))
df <- rbind(data1, data2)
rm(data1, data2)
# Calculate first minima for each Type
m1_intercept <- df[which(df$Type == m1), ][which.min(df[which(df$Type == m1), ]$Value),]
m2_intercept <- df[which(df$Type == m2), ][which.min(df[which(df$Type == m2), ]$Value),]
# Plot regression and vertical lines
p1 <- ggplot(df, aes(x=Time, y=Value, group=Type, colour=Type), linetype=Type) +
geom_smooth(se=F) +
geom_vline(aes(xintercept=m1_intercept$Time, linetype=m1_intercept$Type)) +
geom_vline(aes(xintercept=m2_intercept$Time, linetype=m2_intercept$Type)) +
scale_linetype_manual(name="", values=c("dotted", "dashed")) +
guides(colour=guide_legend(title="Regression"), linetype=guide_legend(title="First occurrence of optimum")) +
theme(legend.position="bottom")
ggsave("regression.png", plot=p1, height=5, width=7)
which generates this plot:
My desired plot would be something like this:
So my questions are
Does it make sense to indicate a minimum value on a regression line? The values y-axis position would be in fact wrong but just to indicate the timepoint?
If yes, how can I achieve such a behaviour?
If no, what would you think could be better?
Thank you very much in advance!
Robin
If you first run your ggplot() call with only geom_smooth(), you can access plotted values through ggplot_build(), which we then can use to plot points on the two fitted lines. Example:
# Create initial plot
p1<-ggplot(df, aes(x=Time, y=Value, colour=Type)) +
geom_smooth(se=F)
# Now we can access the fitted values
smooths <- ggplot_build(p1)$data[[1]]
smooths_1 <- smooths[smooths$group==1,] # First group (method 1)
smooths_2 <- smooths[smooths$group==2,] # Second group (method 2)
# Then we find the closest plotted values to the minima
smooth_1_x <- smooths_1$x[which.min(abs(smooths_1$x - m1_intercept$Time))]
smooth_2_x <- smooths_2$x[which.min(abs(smooths_2$x - m2_intercept$Time))]
# Subset the previously defined datasets for respective closest values
point_data1 <- smooths_1[smooths_1$x==smooth_1_x,]
point_data2 <- smooths_1[smooths_2$x==smooth_2_x,]
Now we use point_data1 and point_data2 to place the points on your plot:
ggplot(df, aes(x=Time, y=Value, colour=Type)) +
geom_smooth(se=F) +
geom_point(data=point_data1, aes(x=x, y=y), colour = "red",size = 5) +
geom_point(data=point_data2, aes(x=x, y=y), colour = "red", size = 5)
To reproduce this plot, you can use set.seed(42) for your data generation step.
I have scatterplots of 2D data from two categories. I want to add density lines for each dimension -- not outside the plot (cf. Scatterplot with marginal histograms in ggplot2) but right on the plotting surface. I can get this for the x-axis dimension, like this:
set.seed(123)
dim1 <- c(rnorm(100, mean=1), rnorm(100, mean=4))
dim2 <- rnorm(200, mean=1)
cat <- factor(c(rep("a", 100), rep("b", 100)))
mydf <- data.frame(cbind(dim2, dim1, cat))
ggplot(data=mydf, aes(x=dim1, y=dim2, colour=as.factor(cat))) +
geom_point() +
stat_density(aes(x=dim1, y=(-2+(..scaled..))),
position="identity", geom="line")
It looks like this:
But I want an analogous pair of density curves running vertically, showing the distribution of points in the y-dimension. I tried
stat_density(aes(y=dim2, x=0+(..scaled..))), position="identity", geom="line)
but receive the error "stat_density requires the following missing aesthetics: x".
Any ideas? thanks
You can get the densities of the dim2 variables. Then, flip the axes and store them in a new data.frame. After that it is simply plotting them on top of the other graph.
p <- ggplot(data=mydf, aes(x=dim1, y=dim2, colour=as.factor(cat))) +
geom_point() +
stat_density(aes(x=dim1, y=(-2+(..scaled..))),
position="identity", geom="line")
stuff <- ggplot_build(p)
xrange <- stuff[[2]]$ranges[[1]]$x.range # extract the x range, to make the new densities align with y-axis
## Get densities of dim2
ds <- do.call(rbind, lapply(unique(mydf$cat), function(lev) {
dens <- with(mydf, density(dim2[cat==lev]))
data.frame(x=dens$y+xrange[1], y=dens$x, cat=lev)
}))
p + geom_path(data=ds, aes(x=x, y=y, color=factor(cat)))
So far I can produce:
distrib_horiz <- stat_density(aes(x=dim1, y=(-2+(..scaled..))),
position="identity", geom="line")
ggplot(data=mydf, aes(x=dim1, y=dim2, colour=as.factor(cat))) +
geom_point() + distrib_horiz
And:
distrib_vert <- stat_density(data=mydf, aes(x=dim2, y=(-2+(..scaled..))),
position="identity", geom="line")
ggplot(data=mydf, aes(x=dim2, y=dim1, colour=as.factor(cat))) +
geom_point() + distrib_vert + coord_flip()
But combining them is proving tricky.
So far I have only a partial solution since I didn't manage to obtain a vertical stat_density line for each individual category, only for the total set. Maybe this can nevertheless help as a starting point for finding a better solution. My suggestion is to try with the ggMarginal() function from the ggExtra package.
p <- ggplot(data=mydf, aes(x=dim1, y=dim2, colour=as.factor(cat))) +
geom_point() + stat_density(aes(x=dim1, y=(-2+(..scaled..))),
position="identity", geom="line")
library(ggExtra)
ggMarginal(p,type = "density", margins = "y", size = 4)
This is what I obtain:
I know it's not perfect, but maybe it's a step in a helpful direction. At least I hope so. Looking forward to seeing other answers.
I'm trying to create a forest plot in R from meta-analysis results. However, I'm having difficulties adjusting the line thickness & the center points as well as getting rid of the automatic legend and creating my own legend.
#d is a data frame with 4 columns
#d$x gives variable names
#d$y gives center point
#d$ylo gives lower limits
#d$yhi gives upper limits
#data
d <- data.frame(x = toupper(letters[1:10]),
y = rnorm(10, 0, 0.1))
d <- transform(d, ylo = y-1/10, yhi=y+1/10)
d$x <- factor(d$x, levels=rev(d$x)) #Reverse ordering in the way that it's is in the
#function
credplot.gg <- function(d){
require(ggplot2)
p <- ggplot(d, aes(x=x, y=y, ymin=ylo, ymax=yhi,group=x,colour=x))+
geom_pointrange()+ theme_bw()+ coord_flip()+
guides(color=guide_legend(title="Cohort"))+
geom_hline(aes(x=0),colour = 'red', lty=1)+
xlab('Cohort') + ylab('Beta') + ggtitle('rs6467890_CACNA2D1')
return(p)
}
credplot.gg(d)
The issues that I'm having are:
when insert "size" into ggplot(d, aes(x=x, y=y, ymin=ylo, ymax=yhi, group=x,colour=x), size=1.5) the line and points are extremely large
How do I get rid of the legend that is automatically generated with the plot and how do I create my own legend?
I'm fairly new to r so and any help is gladly appreciated
Is there any way to plot the cumulative probability from a frequency table? I mean a "smooth" version of it, similar to the way geom_density() plots.
So far, I managed to plot the individually calculated probabilities as points joined by lines, but it doesn't look very good.
I generate some test data:
set.seed(1)
x <- sort(sample(1:100, 20))
p <- runif(x); p <- cumsum(p)/sum(p)
table <- data.frame(x=x, prob=p)
You can use geom_smooth from the ggplot2 package.
require("ggplot2")
qplot(x=x, y=p, data=table, aes(ymin=0, ymax=1)) + ylab("ecf") +
geom_smooth(se=F, stat="smooth", method="loess", fullrange=T, fill="lightgrey", size=1)
As an alternative, an easy way to specifiy smoothing by a parameter try DeconCdf from the decon package:
require("decon")
plot(DeconCdf(x, sig=1))
If you want to use ggplot, you first have to transform the Decon function object in a data.frame.
f <- DeconCdf(x, sig=1)
m <- ggplot(data=data.frame(x=f$x, p=f$y), aes(x=x, y=p, ymin=0, ymax=1)) + ylab("ecf")
m + geom_line(size=1)
Use the sig-Parameter as your smoothing parameter:
f <- DeconCdf(x, sig=0.3)
m <- ggplot(data=data.frame(x=f$x, p=f$y), aes(x=x, y=p, ymin=0, ymax=1)) + ylab("ecf")
m + geom_line(size=1)
This version plots a histogram with a smoothed line from geom_density:
# Generate some data:
set.seed(28986)
x2 <- rweibull(100, 1, 1/2)
# Plot the points:
library(ggplot2)
library(scales)
ggplot(data.frame(x=x2),aes(x=x, y=1-cumsum(..count..)/sum(..count..))) +
geom_histogram(aes(fill=..count..)) +
geom_density(fill=NA, color="black", adjust=1/2) +
scale_y_continuous("Percent of units\n(equal to or larger than x)",labels=percent) +
theme_grey(base_size=18)
Note that I've used 1 - "cumulative probability" due to individual preference (I think it looks better and I'm accustomed to dealing with "reliability" metrics), but obviously that's just a preference that you could ignore by removing the 1- part in the aes.
I'm using ggplot2 to create panels of histograms, and I'd like to be able to add a vertical line at the mean of each group. But geom_vline() uses the same intercept for each panel (i.e. the global mean):
require("ggplot2")
# setup some sample data
N <- 1000
cat1 <- sample(c("a","b","c"), N, replace=T)
cat2 <- sample(c("x","y","z"), N, replace=T)
val <- rnorm(N) + as.numeric(factor(cat1)) + as.numeric(factor(cat2))
df <- data.frame(cat1, cat2, val)
# draws a single histogram with vline at mean
qplot(val, data=df, geom="histogram", binwidth=0.2) +
geom_vline(xintercept=mean(val), color="red")
# draws panel of histograms with vlines at global mean
qplot(val, data=df, geom="histogram", binwidth=0.2, facets=cat1~cat2) +
geom_vline(xintercept=mean(val), color="red")
How can I get it to use each panel's group mean as the x-intercept? (Bonus points if you can also add a text label by the line with the value of the mean.)
I guess this is a reworking of #eduardo's really, but in one line.
ggplot(df) + geom_histogram(mapping=aes(x=val))
+ geom_vline(data=aggregate(df[3], df[c(1,2)], mean),
mapping=aes(xintercept=val), color="red")
+ facet_grid(cat1~cat2)
alt text http://www.imagechicken.com/uploads/1264782634003683000.png
or using plyr (require(plyr) a package by the author of ggplot, Hadley):
ggplot(df) + geom_histogram(mapping=aes(x=val))
+ geom_vline(data=ddply(df, cat1~cat2, numcolwise(mean)),
mapping=aes(xintercept=val), color="red")
+ facet_grid(cat1~cat2)
It seems unsatisfying that vline isn't cut on the facets, I'm not sure why.
One way is to construct the data.frame with the mean values before hand.
library(reshape)
dfs <- recast(data.frame(cat1, cat2, val), cat1+cat2~variable, fun.aggregate=mean)
qplot(val, data=df, geom="histogram", binwidth=0.2, facets=cat1~cat2) + geom_vline(data=dfs, aes(xintercept=val), colour="red") + geom_text(data=dfs, aes(x=val+1, y=1, label=round(val,1)), size=4, colour="red")