geom_scatterpie with non-numeric axes - r

I want to make a species x sample scatterplot (both axes are strings/factors) with pie charts instead of points. The size of each point should reflect the abundance of the species in that sample.
This can be easily done with just points as this:
# Toy data: one row per Tax/Sample pair, two abundance columns (A, B) and a
# point size.
library(ggplot2)

d <- data.frame(
  Tax = c("A", "B", "C"),
  Sample = c("01", "02", "03")
)
d$A <- abs(rnorm(3, sd = 1))
d$B <- abs(rnorm(3, sd = 2))
d$size <- c(0.1, 0.2, 0.3)

# Discrete x/y positions with point size mapped to `size`.
ggplot(d, aes(x = Tax, y = Sample, size = size)) +
  geom_point()
Replacing the points with pie charts can be achieved with geom_scatterpie from the scatterpie package (available on CRAN).
However, this does not work with factors in the x/y aesthetics:
library(scatterpie)
# Tax and Sample are character columns of `d`; this call reproduces the
# "non-finite values (stat_pie)" warning and draws an empty panel.
ggplot() + geom_scatterpie(aes(x=Tax, y=Sample, r=size), data=d, cols=c("A", "B"))
Warning:
Removed 6 rows containing non-finite values (stat_pie).
The panel is drawn, but stays empty. Note that scatterpie works well with numeric x/y aesthetics:
# Same toy data, but with numeric coordinates instead of labels.
d <- data.frame(x = c(1, 2, 3), y = c(1, 2, 3))
d$A <- abs(rnorm(3, sd = 1))
d$B <- abs(rnorm(3, sd = 2))
d$size <- c(0.1, 0.2, 0.3)

ggplot() +
  geom_scatterpie(
    data = d,
    mapping = aes(x = x, y = y, r = size),
    cols = c("A", "B")
  ) +
  coord_fixed()
How can I change geom_scatterpie to accept non-numeric axes?

This should work:
# `d` is the data frame with the Tax/Sample label columns.
# Map each discrete axis onto integer positions so geom_scatterpie() receives
# numeric coordinates, then label the breaks with the original levels. Breaks
# and labels are derived from the data, so they stay correct when the set of
# taxa or samples changes. Base R only: no dplyr needs to be attached.
tax_levels <- levels(as.factor(d$Tax))
sample_levels <- levels(as.factor(d$Sample))

d2 <- d
d2$tax_num <- as.numeric(factor(d$Tax, levels = tax_levels))
d2$sample_num <- as.numeric(factor(d$Sample, levels = sample_levels))

ggplot() +
  geom_scatterpie(
    data = d2,
    mapping = aes(x = tax_num, y = sample_num, r = size),
    cols = c("A", "B")
  ) +
  scale_x_continuous(breaks = seq_along(tax_levels), labels = tax_levels) +
  scale_y_continuous(breaks = seq_along(sample_levels), labels = sample_levels) +
  labs(x = "Tax", y = "Sample") +
  coord_fixed()

Related

Why pies are flat in geom_scatterpie in R?

Why are the pies flat?
# Var1/Var2 are the pie positions; columns A-D are the slice values.
df <- data.frame(
  Day = 1:6,
  Var1 = c(172, 186, 191, 201, 205, 208),
  Var2 = c(109, 483, 64010, 161992, 801775, 2505264),
  A = c(10, 2, 3, 4.5, 16.5, 39.6),
  B = c(10, 3, 0, 1.4, 4.8, 11.9),
  C = c(2, 5, 2, 0.1, 0.5, 1.2),
  D = c(0, 0, 0, 0, 0.1, 0.2)
)

ggplot() +
  geom_scatterpie(
    data = df,
    mapping = aes(x = Var1, y = Var2, group = Var1),
    cols = c("A", "B", "C", "D")
  )
I have tried using coord_fixed() and does not work either.
The problem seems to be the scales of the x- and y-axes. If you rescale both of them to have zero mean and unit variance, the plot works. So, one thing you could do is plot the rescaled values, but transform the axis labels back into the original scale. To do this, you would have to do the following:
Make the data:
# Example data: Var1/Var2 are the coordinates, A-D the pie slice values.
df <- data.frame(
  Day = 1:6,
  Var1 = c(172, 186, 191, 201, 205, 208),
  Var2 = c(109, 483, 64010, 161992, 801775, 2505264),
  A = c(10, 2, 3, 4.5, 16.5, 39.6),
  B = c(10, 3, 0, 1.4, 4.8, 11.9),
  C = c(2, 5, 2, 0.1, 0.5, 1.2),
  D = c(0, 0, 0, 0, 0.1, 0.2)
)
Rescale the variables
# Standardise both coordinates (zero mean, unit variance) so that the very
# different axis ranges no longer squash the pies. as.numeric() drops the
# matrix shape and scaling attributes that scale() attaches.
# Base R only: no dplyr needs to be attached for this step.
df$x <- as.numeric(scale(df$Var1))
df$y <- as.numeric(scale(df$Var2))
Find the linear map that transforms the rescaled values back into their original values. Then, you can use the coefficients from the model to make a function that will transform the rescaled values back into the original ones.
# Fit the linear maps from the rescaled values back to the original values.
m1 <- lm(Var1 ~ x, data = df)
m2 <- lm(Var2 ~ y, data = df)

# Label helpers: convert rescaled axis values to rounded original units using
# the fitted intercept and slope.
trans_x <- function(x) {
  beta <- coef(m1)
  round(beta[1] + beta[2] * x)
}
trans_y <- function(x) {
  beta <- coef(m2)
  round(beta[1] + beta[2] * x)
}
Make the plot, using the transformation functions as the call to labels in the scale_[xy]_continuous() functions
# Plot in rescaled coordinates; the label functions print original units.
ggplot() +
  geom_scatterpie(
    data = df,
    mapping = aes(x = x, y = y),
    cols = c("A", "B", "C", "D")
  ) +
  scale_x_continuous(labels = trans_x) +
  scale_y_continuous(labels = trans_y) +
  coord_fixed()
There may be an easier way than this, but it wasn't apparent to me.
The range on the y-axis is so large it's compressing the disks to lines. Change the y-axis to a log scale, and you can see the shapes. Adding coord_fixed() to keep the pies circular:
# Log-scale the y-axis so its huge range no longer flattens the pies;
# coord_fixed() keeps them circular.
ggplot() +
  geom_scatterpie(
    data = df,
    mapping = aes(x = Var1, y = Var2, group = Var1),
    cols = c("A", "B", "C", "D")
  ) +
  scale_y_log10() +
  coord_fixed()

Can I use Annotate in ggplot2 to fill my violin graph? [duplicate]

ggplot2 can create a very attractive filled violin plot:
# Single filled violin of 10^5 standard-normal draws.
ggplot() +
  geom_violin(
    data = data.frame(x = 1, y = rnorm(10^5)),
    mapping = aes(x = x, y = y),
    fill = "gray90",
    color = "black"
  ) +
  theme_classic()
I'd like to restrict the fill to the central 95% of the distribution if possible, leaving the outline intact. Does anyone have suggestions on how to accomplish this?
Does this do what you want? It requires some data-processing and the drawing of two violins.
set.seed(1)
dat <- data.frame(x = 1, y = rnorm(10^5))

# Flag each observation as inside (TRUE) or outside the central 95% interval.
dat_q <- quantile(dat$y, probs = c(0.025, 0.975))
dat$central <- dat$y > dat_q[1] & dat$y < dat_q[2]

# Two layers: a filled violin built from the central observations only, and an
# outline-only violin built from all observations.
p1 <- ggplot(data = dat, aes(x = x, y = y)) +
  geom_violin(data = dat[dat$central, ], color = "transparent", fill = "gray90") +
  geom_violin(color = "black", fill = "transparent") +
  theme_classic()
# Display the plot; assigning it alone draws nothing.
p1
Edit: the rounded edges bothered me, so here is a second approach. If I were doing this, I would want straight lines. So I did some playing with the density (which is what violin plots are based on)
# Build the violin outline by hand from the kernel density estimate of dat$y.
d_y <- density(dat$y)

# Density height goes on x and the data value on y, so no coord_flip() is
# needed later.
right_side <- data.frame(x = d_y$y, y = d_y$x)
right_side$central <- right_side$y > dat_q[1] & right_side$y < dat_q[2]

# Left half: same points in reversed row order (needed for the path and the
# polygon) with x made negative.
left_side <- right_side[rev(seq_len(nrow(right_side))), ]
left_side$x <- 0 - left_side$x
density_dat <- rbind(right_side, left_side)

# Fill only the central rows; draw the outline from all rows.
p2 <- ggplot(density_dat, aes(x = x, y = y)) +
  geom_polygon(data = density_dat[density_dat$central, ], fill = "red") +
  geom_path()
p2
Just make a selection first. Proof of concept:
df1 <- data.frame(x = 1, y = rnorm(10^5))

# Keep only the observations strictly between the 2.5% and 97.5% quantiles.
df2 <- local({
  q <- quantile(df1$y, c(0.025, 0.975))
  df1[df1$y > q[1] & df1$y < q[2], ]
})

# One violin for all data (no outline), one for the central subset (outlined);
# the constant fill labels drive the grey fill scale and its legend.
ggplot(mapping = aes(x = x, y = y)) +
  geom_violin(data = df1, mapping = aes(fill = "100%"), color = NA) +
  geom_violin(data = df2, mapping = aes(fill = "95%"), color = "black") +
  theme_classic() +
  scale_fill_grey(name = "level")
# Heroka gave a great answer. Here is a more general function, based on that answer, that fills the violin plot according to arbitrary ranges (not just quantiles).
#' Draw a violin-shaped density outline with selected value ranges filled
#'
#' @param x Numeric vector whose kernel density is drawn.
#' @param from,to Lower (exclusive) and upper (inclusive) bounds of the ranges
#'   to fill; range i covers (from[i], to[i]].
#' @param col Fill colours, one per range.
#' @return A ggplot object.
violincol <- function(x, from = -Inf, to = Inf, col = "grey") {
  d <- density(x)
  # Density height goes on x and the data value on y, so no coord_flip() is
  # needed later.
  right <- data.frame(x = d$y, y = d$x)
  ranges <- cbind(from, to)

  # Colour of the first range containing each y value, NA when none does.
  # vapply() guarantees a character vector even if ranges overlap.
  right$col <- vapply(right$y, function(y) {
    hit <- which(y > ranges[, 1] & y <= ranges[, 2])
    if (length(hit) == 0) NA_character_ else as.character(col[hit[1]])
  }, character(1))

  # Left half: same points in reversed row order with x made negative.
  left <- right[rev(seq_len(nrow(right))), ]
  left$x <- 0 - left$x
  dat <- rbind(right, left)

  # Name the palette by its own values so each band gets exactly the colour
  # requested for it; an unnamed vector would be matched to the sorted fill
  # levels instead and could assign colours to the wrong bands.
  fill_values <- unique(as.character(col[!is.na(col)]))

  ggplot(dat, aes(x = x, y = y)) +
    geom_polygon(aes(fill = col), show.legend = FALSE) +
    geom_path() +
    scale_fill_manual(values = setNames(fill_values, fill_values))
}
# Example calls: default fill, two bands split at zero, and rainbow bands of
# width 0.5.
x <- rnorm(10^5)
violincol(x = x)
violincol(
  x = x,
  from = c(-Inf, 0),
  to = c(0, Inf),
  col = c("green", "red")
)
r <- seq(-5, 5, by = 0.5)
violincol(x = x, from = r, to = r + 0.5, col = rainbow(length(r)))

How to plot a (sophisticated) stacked barplot in ggplot2, without complicated manual data aggregation

I want to plot a (facetted) stacked barplot where the X-Axis is in percent. Also the Frequency labels are displayed within the bars.
After quite some work and viewing many different questions on stackoverflow, I found a solution on how to solve this with ggplot2. However, I don't do it directly with ggplot2, I manually aggregate my data with a table call. And I do this manual aggregation in a complicated way and also calculate the percent values manually with temp variables (see source code comment "manually aggregate data").
How can I do the same plot, but in a nicer way without the manual and complicated data aggregation?
library(ggplot2)
library(scales)
library(gridExtra)
library(plyr)
##
## Random data: two independent five-level factors
##
fact1 <- factor(
  floor(runif(1000, 1, 6)),
  labels = c("A", "B", "C", "D", "E")
)
fact2 <- factor(
  floor(runif(1000, 1, 6)),
  labels = c("g1", "g2", "g3", "g4", "g5")
)

##
## STACKED BAR PLOT that scales x-axis to 100%
##
## manually aggregate data: counts, per-case totals and percentages
##
mytable <- as.data.frame(table(fact1, fact2))
colnames(mytable) <- c("caseStudyID", "Group", "Freq")
# ave() returns the group total for every row in one vectorised call.
mytable$total <- ave(mytable$Freq, mytable$caseStudyID, FUN = sum)
mytable$percent <- round((mytable$Freq / mytable$total) * 100, 2)
# `pos` (cumulative mid-points in table order) is kept in mytable2 for
# reference, but the labels below no longer depend on it.
mytable2 <- ddply(
  mytable, .(caseStudyID), transform,
  pos = cumsum(percent) - 0.5 * percent
)

## all case studies in one plot (SCALED TO 100%)
# position_stack(vjust = 0.5) centres each label inside its own segment
# whatever order ggplot2 stacks the groups in, which hand-computed cumulative
# positions cannot guarantee. Zero counts become NA so that no "0" label is
# printed within a bar.
p1 <- ggplot(mytable2, aes(x = caseStudyID, y = percent, fill = Group)) +
  geom_bar(stat = "identity") +
  theme(legend.key.size = unit(0.4, "cm")) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  geom_text(
    aes(label = ifelse(Freq > 0, Freq, NA)),
    position = position_stack(vjust = 0.5),
    size = 3
  )
print(p1)
..
After you make the data:
# Two independent five-level factors, combined into one data frame.
fact1 <- factor(
  floor(runif(1000, 1, 6)),
  labels = c("A", "B", "C", "D", "E")
)
fact2 <- factor(
  floor(runif(1000, 1, 6)),
  labels = c("g1", "g2", "g3", "g4", "g5")
)
dat <- data.frame(caseStudyID = fact1, Group = fact2)
You can automate making an unlabeled graph of the kind that you want with position_fill:
# One bar per case study, filled by Group, using the "fill" position.
ggplot(dat, aes(x = caseStudyID, fill = Group)) +
  geom_bar(position = "fill")
I don't know if there's a way to generate the text labels automatically. The positions and counts from the stacked graph are accessible with ggplot_build, if you want to use what ggplot calculates instead of doing it separately.
# Keep the plot object so its computed layer data can be inspected.
p <- ggplot(dat, aes(x = caseStudyID, fill = Group)) +
  geom_bar(position = "fill")
# Data computed for the first (bar) layer.
ggplot_build(p)$data[[1]]
That will return a dataframe with (among other things), count, x, y, ymin, and ymax variables that can be used to create positioned labels.
If you want the labels vertically centered in each category, first make a column with values halfway between ymin and ymax.
# Computed bar data, plus the vertical mid-point of every bar segment.
freq <- ggplot_build(p)$data[[1]]
freq$y_pos <- with(freq, (ymin + ymax) / 2)
Then add the labels to the graph with annotate.
# Write each segment's count at its mid-point.
p +
  annotate(
    geom = "text",
    x = freq$x,
    y = freq$y_pos,
    label = freq$count,
    size = 3
  )
If you have the distribution of case study IDs in each group as a single vector, you could use the sjp.stackfrq function from the sjPlot package.
# sjp.stackfrq() comes from sjPlot, which has to be attached first.
# NOTE(review): newer sjPlot releases may expose this function under a
# different name and argument spelling; confirm against the installed version.
library(sjPlot)

# One column per case study; every value is a group code from 1 to 5.
A <- floor(runif(1000, 1, 6))
B <- floor(runif(1000, 1, 6))
C <- floor(runif(1000, 1, 6))
D <- floor(runif(1000, 1, 6))
E <- floor(runif(1000, 1, 6))
mydf <- data.frame(A, B, C, D, E)

sjp.stackfrq(mydf, legendLabels = c("g1", "g2", "g3", "g4", "g5"))
The function offers many parameters to easily customize plot appearance (labelling, size and colors etc.).

R ggplot: text & graphics below a plot with facet

I've created a plot of categorical data using facet in ggplot.
Example script here:
# Script to produce the plot with dummy data.
# The workspace is deliberately not cleared here: wiping every object the user
# has defined is a side effect a plotting example should not have.
library(ggplot2)
library(gridExtra) # library() fails loudly if the package is missing

# Put dummy data in a data frame; the group labels are repeated explicitly to
# cover all 30 rows instead of relying on silent recycling.
dummy_data <- data.frame(
  experiment_number = c(rep("exp_1", 15), rep("exp_2", 15)),
  group = rep(c("A", "B", "C"), 10),
  yvalue = runif(30, 0.0, 0.05)
)

# Make the plot: jittered points per group, one panel per experiment.
plot1 <- ggplot(data = dummy_data) +
  geom_point(
    aes(x = group, y = yvalue, colour = group, shape = group),
    size = 3.5,
    position = position_jitter(width = 0.2)
  ) +
  facet_wrap(~experiment_number) +
  ylab("yvalue") +
  xlab("")

# Show the plot.
plot1
I now want to add text and bars below the plot to show the p-values from a statistical test between the groups. An example that I drew by hand is attached (the p-values are made up).
Note that the p-values will be different in the two panels. I've played around with annotate and custom annotations but can't seem to get it to work. Any ideas?
thanks v much
Here's a totally ridiculous way of doing something similar to what you are asking for. I used geom_errorbar for the bars, so I had to flip the coordinate system. Anyway, you should be able to customize this to do what you need.
# The workspace is deliberately not cleared here: wiping every object the user
# has defined is a side effect a plotting example should not have.
library(ggplot2)

# Put dummy data in a data frame; the group labels are repeated explicitly to
# cover all 30 rows instead of relying on silent recycling.
dummy_data <- data.frame(
  experiment_number = c(rep("exp_1", 15), rep("exp_2", 15)),
  group = rep(c("A", "B", "C"), 10),
  yvalue = runif(30, 0.0, 0.05)
)

# Make the plot with x and y swapped (and the jitter moved from width to
# height), then flip the coordinate system so geom_errorbar() can be used for
# the bars.
plot1 <- ggplot(data = dummy_data) +
  geom_point(
    aes(y = group, x = yvalue, colour = group, shape = group),
    size = 3.5,
    position = position_jitter(height = 0.2)
  ) +
  facet_wrap(~experiment_number) +
  xlab("yvalue") +
  ylab("") +
  coord_flip()

# Range of the plotted values; the bars and labels are placed below it.
rng <- range(dummy_data$yvalue)

# Data for geom_errorbar(): one bar per pair of groups, each on its own level.
df.lines <- data.frame(
  ymin = LETTERS[1:3],
  ymax = LETTERS[c(2, 3, 1)],
  x = rng[1] - diff(rng) * 1:3 / 12
)

# Data for geom_text(): the three positions are recycled across the two
# experiments, while the labels differ per experiment.
df.txt <- data.frame(
  y = c("AB", "BC", "B"),
  x = rng[1] - diff(rng) * (1:3 + .5) / 12,
  label = c(
    "p=0.003", "p=0.05", "p=0.6",
    "p=0.2", "p=0.1", "p=0.05"
  ),
  experiment_number = rep(c("exp_1", "exp_2"), each = 3)
)

# Add some space, then geom_errorbar() and geom_text(). The discrete limits
# include "AB" and "BC" so labels can sit between two groups. Text
# justification is set with hjust; geom_text() has no xjust parameter.
plot2 <- plot1 +
  scale_x_continuous(limits = c(rng[1] - diff(rng) / 3, rng[2] + diff(rng) / 5)) +
  geom_errorbar(data = df.lines, aes(x = x, ymin = ymin, ymax = ymax)) +
  scale_y_discrete(breaks = LETTERS[1:3], limits = c("A", "AB", "B", "BC", "C")) +
  geom_text(data = df.txt, aes(x = x, y = y, label = label), hjust = 0.5)
plot2

ggplot2 Scatter Plot Labels

I'm trying to use ggplot2 to create and label a scatterplot. The variables that I am plotting are both scaled such that the horizontal and the vertical axes are plotted in units of standard deviation (1, 2, 3, 4, etc. from the mean). What I would like to be able to do is label ONLY those elements that are beyond a certain number of standard deviations from the mean. Ideally, this labeling would be based on another column of data.
Is there a way to do this?
I've looked through the online manual, but I haven't been able to find anything about defining labels for plotted data.
Help is appreciated!
Thanks!
BEB
Use subsetting:
library(ggplot2)
x <- data.frame(a = 1:10, b = rnorm(10))
x$lab <- head(letters, 10)

# Points use every row; the text layer only receives rows with |b| > 0.2.
ggplot(data = x, mapping = aes(x = a, y = b, label = lab)) +
  geom_point() +
  geom_text(data = x[abs(x$b) > 0.2, ], vjust = 0)
The labeling can be done in the following way:
library("ggplot2")
x <- data.frame(a = 1:10, b = rnorm(10))
# Start with empty labels everywhere, then fill in the rows to be labelled.
x$lab <- character(10)
x$lab[c(1, 3:5)] <- LETTERS[1:4]

ggplot(data = x, mapping = aes(x = a, y = b, label = lab)) +
  geom_point() +
  geom_text(vjust = 0)
Subsetting outside of the ggplot function:
library(ggplot2)
set.seed(1)
x <- data.frame(a = 1:10, b = rnorm(10))
x$lab <- head(letters, 10)
# Blank out the label of every point whose |b| does not exceed 0.5.
x$lab[abs(x$b) <= 0.5] <- NA

ggplot(data = x, mapping = aes(x = a, y = b, label = lab)) +
  geom_point() +
  geom_text(vjust = 0)
Using qplot:
# Same points-plus-labels plot through the qplot() shortcut.
qplot(
  a, b,
  data = x,
  label = lab,
  geom = c("point", "text")
)

Resources