I am trying to plot different types of plots (line plot and bar charts) beneath one another, they all have the same axis:
c1 <- ggplot(data, aes(date, TotalMutObs)) + stat_smooth(se = FALSE) +
geom_point() +
opts(axis.title.x = theme_blank()) +
ylab("Cumulative number of new mutations")
c2 <- ggplot(data, aes(date, distance)) + stat_smooth(se = FALSE) +
geom_point() +
opts(axis.title.x = theme_blank()) +
ylab("Cumulative mean pairwise distance")
c3 <- ggplot(data, aes(x = date, y = NbOfHorses)) +
geom_bar(stat = "identity") +
opts(axis.title.x = theme_blank()) +
ylab("Number of horses sampled")
grid.arrange(c1, c2,c3)
However, the dates on the x-axis are not lining up for the different plots.
Here is some data to try it out:
date<-c("2003-03-13","2003-03-25","2003-03-26","2003-03-27","2003-03-28","2003-03-31","2003-04-01","2003-04-02","2003-04-04","2003-04-08","2003-04-09","2003-04-10","2003-04-11","2003-04-14","2003-04-15","2003-04-17","2003-04-19","2003-04-21","2003-04-22","2003-04-28","2003-05-08");
NbOfHorses<-c("1","2","1","3","4","5","4","3","3","3","3","4","2","4","1","2","4","1","2","1","2");
TotalMutObs<-c("20","30","58","72","140","165","204","230","250","286","302","327","346","388","393","414","443","444","462","467","485");
distance<-c("0.000693202","0.00073544","0.000855432","0.000506876","0.000720193","0.000708047","0.000835468","0.000812401","0.000803149","0.000839117","0.000842048","0.000856393","0.000879973","0.000962382","0.000990666","0.001104861","0.001137515","0.001143838","0.00121874","0.001213737","0.001201379");
data<-as.data.frame(cbind(date,NbOfHorses,TotalMutObs,distance));
Cheers,
Joseph
The way to solve this problem is to work within ggplot2 and get creative about stacking copies of your data and then sending subsets to each geom that you need.
#A version of your data cleaned up
dat <- data.frame(date = as.Date(date),NbOfHorses = as.numeric(NbOfHorses),
TotalMutObs = as.numeric(TotalMutObs),distance = as.numeric(distance))
#Create three copies, one for each panel
# Use informative titles for grp to be panel titles
fullDat <- rbind(dat,dat,dat)
fullDat$grp <- rep(c('Cumulative number of new mutations',
'Cumulative mean pairwise distance',
'Number of horses sampled'),each = nrow(dat))
ggplot(fullDat,aes(x = date)) +
facet_wrap(~grp,nrow = 3,scale = "free_y") +
geom_point(data = subset(fullDat,grp == 'Cumulative number of new mutations'),
aes(y = TotalMutObs)) +
stat_smooth(data = subset(fullDat,grp == 'Cumulative number of new mutations'),
aes(y = TotalMutObs),se = FALSE) +
geom_point(data = subset(fullDat,grp == 'Cumulative mean pairwise distance'),
aes(y = distance)) +
stat_smooth(data = subset(fullDat,grp == 'Cumulative mean pairwise distance'),
aes(y = distance),se = FALSE) +
geom_bar(data = subset(fullDat,grp == 'Number of horses sampled'),
aes(y = NbOfHorses),stat = "identity") +
labs(x = NULL,y = NULL)
Related
I am currently working with a dataset of "world bank islands". In that, I am trying to plot the population Vs country graph for each year. Below is the code that I have done.
library(ggplot2)
options(scipen = 999)
bank <- read.csv("C:/Users/True Gamer/OneDrive/Desktop/world_bank_international_arrivals_islands.csv")
bank[bank == "" | bank == "."] <- NA
bank$country <- as.numeric(bank$country)
bank$year <- as.numeric(bank$year)
bank$areakm2 <- as.numeric(bank$areakm2)
bank$pop <- as.numeric(bank$pop)
bank$gdpnom <- as.numeric(bank$gdpnom)
bank$flights...WB <- as.numeric(bank$flights...WB)
bank$hotels <- as.numeric(bank$hotels)
bank$hotrooms <- as.numeric(bank$hotrooms)
bank$receipt <- as.numeric(bank$receipt)
bank$ovnarriv <- as.numeric(bank$ovnarriv)
bank$dayvisit <- as.numeric(bank$dayvisit)
bank$arram <- as.numeric(bank$arram)
bank$arreur <- as.numeric(bank$arreur)
bank$arraus <- as.numeric(bank$arraus)
str(bank)
plot1 <- ggplot(bank, aes(x=country,y=pop)) + geom_bar(stat = "identity",aes(fill=year)) + ggtitle("Population of each country yearwise") + xlab("Countries") + ylab("Population")
plot1
However, when I do this, the y values shown on the graph are different from the actual y values. This is the link to the dataset
The problem is that you are stacking the bars (this is default behaviour). Also, geom_bar(stat = "identity") is just a long way of writing geom_col. One further point to note is that since all your columns are numeric, the single line:
bank <- as.data.frame(lapply(bank, as.numeric))
replaces all your individual numeric conversions.
The plot you are trying to create would be something like this:
ggplot(bank, aes(x = country, y = pop)) +
geom_col(aes(fill = factor(year)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Countries") +
ylab("Population") +
labs(fill = "Year") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = 1:27)
However, it would probably be best to present your data in a different way. Perhaps, if you are comparing population growth, something like this would be better:
ggplot(bank, aes(x = year, y = pop)) +
geom_line(aes(color = factor(country)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Year") +
ylab("Population") +
facet_wrap(.~country, scales = "free_y", nrow = 6) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = c(0, 5, 10)) +
theme_minimal() +
theme(legend.position = "none")
Or with bars:
ggplot(bank, aes(x = year, y = pop)) +
geom_col(aes(fill = factor(country)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Year") +
ylab("Population") +
facet_wrap(.~country, scales = "free_y", nrow = 6) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = c(0, 5, 10)) +
theme_minimal() +
theme(legend.position = "none")
I have the following code
library(ggplot2)
library(dplyr)
# create data
time <- as.numeric(rep(seq(1,7),each=7)) # x Axis
value <- runif(49, 10, 100) # y Axis
group <- rep(LETTERS[1:7],times=7) # group, one shape per group
data <- data.frame(time, value, group)
# stacked area chart
ggplot(data, aes(x=time, y=value, fill=group)) +
geom_area()+
geom_text(data = data %>% filter(time == last(time)), aes(label = group,
x = time + 0.5,
y = value,
color = group)) +
guides(color = FALSE) + theme_bw() +
scale_x_continuous(breaks = scales::pretty_breaks(10))
Where i get
But i am aiming for link
Is there any solution for stacked area plot?
The question code is plotting the text labels in the value's of the last time, when in fact the areas are cumulative. And in reverse order.
Also, the following graph plots data created with the same code but with
set.seed(1234)
Then the data creation code is the same as in the question.
# stacked area chart
ggplot(data, aes(x=time, y=value, fill=group)) +
geom_area()+
geom_text(data = data %>%
filter(time == last(time)) %>%
mutate(value = cumsum(rev(value))),
aes(label = rev(group),
x = time + 0.5,
y = value,
color = rev(group))) +
guides(color = FALSE) + theme_bw() +
scale_x_continuous(breaks = scales::pretty_breaks(10))
Edit.
Following the discussion in the comments to this answer, I have decided to post code based on the comment by user Jake Kaupp.
ggplot(data, aes(x = time, y = value, fill = group)) +
geom_area()+
geom_text(data = data %>% filter(time == last(time)),
aes(x = time + 0.5, y = value,
label = rev(group), color = rev(group)),
position = position_stack(vjust = 0.5)) +
guides(color = FALSE) +
theme_bw() +
scale_x_continuous(breaks = scales::pretty_breaks(10))
You can use the text function to put text wherever you want. For example:
text(7.2, 350, "B", col="brown")
Here we go
time <- as.numeric(rep(seq(1,7),each=8)) # x Axis
value <- runif(56, 10, 100) # y Axis
group <- rep(LETTERS[1:8],times=7) # group, one shape per group
data <- data.frame(time, value, group)
round_df <- function(x, digits) {
# round all numeric variables
# x: data frame
# digits: number of digits to round
numeric_columns <- sapply(x, mode) == 'numeric'
x[numeric_columns] <- round(x[numeric_columns], digits)
x
}
data$value<- round_df(data$value, 2)
# stacked area chart
ggplot(data, aes(x=time, y=value, fill=group)) +
geom_area()+
geom_text(aes(x = time + 0.5, y = value, label=ifelse(time == max(time), group, NA)),position = position_stack(vjust = 0.5),check_overlap = TRUE)+
guides(color = FALSE) + theme_bw()+
scale_x_continuous(breaks = scales::pretty_breaks(10)) +
geom_text(aes(label=ifelse(time != min(time) & time != max(time),value, NA)),position = position_stack(vjust = 0.5),check_overlap = TRUE)+
geom_text(aes(x = time + 0.18,label=ifelse(time == min(time),value, NA)),position = position_stack(vjust = 0.5),check_overlap = TRUE)+
geom_text(aes(x = time - 0.18,label=ifelse(time == max(time),value, NA)),position = position_stack(vjust = 0.5),check_overlap = TRUE)
And get
Factor levels but why not letters? That is the next step :)
UPDATED
just converted factor to char data$group <- as.character(data$group)
I'm looking to label the "flow" portion of Alluvial / Sankey chart on R.
The stratums (columns) can easily be labelled, but not the flows connecting them. All my attempts on reading the documentations and experimenting were to no avail.
In the sample below, "freq" is expected to be labelled on the flow connection part.
library(ggplot2)
library(ggalluvial)
data(vaccinations)
levels(vaccinations$response) <- rev(levels(vaccinations$response))
ggplot(vaccinations,
aes(x = survey, stratum = response, alluvium = subject,
y = freq,
fill = response, label = freq)) +
scale_x_discrete(expand = c(.1, .1)) +
geom_flow() +
geom_stratum(alpha = .5) +
geom_text(stat = "stratum", size = 3) +
theme(legend.position = "bottom") +
ggtitle("vaccination survey responses at three points in time")
There is an option to take the raw numbers and use these as labels for the flow part:
ggplot(vaccinations,
aes(x = survey, stratum = response, alluvium = subject,
y = freq,
fill = response, label = freq)) +
scale_x_discrete(expand = c(.1, .1)) +
geom_flow() +
geom_stratum(alpha = .5) +
geom_text(stat = "stratum", size = 3) +
geom_text(stat = "flow", nudge_x = 0.2) +
theme(legend.position = "bottom") +
ggtitle("vaccination survey responses at three points in time")
If you want more control over how to label these points, you can extract the layer data and do computations on that. For example we can compute the fractions for only the starting positions as follows:
# Assume 'g' is the previous plot object saved under a variable
newdat <- layer_data(g)
newdat <- newdat[newdat$side == "start", ]
split <- split(newdat, interaction(newdat$stratum, newdat$x))
split <- lapply(split, function(dat) {
dat$label <- dat$label / sum(dat$label)
dat
})
newdat <- do.call(rbind, split)
ggplot(vaccinations,
aes(x = survey, stratum = response, alluvium = subject,
y = freq,
fill = response, label = freq)) +
scale_x_discrete(expand = c(.1, .1)) +
geom_flow() +
geom_stratum(alpha = .5) +
geom_text(stat = "stratum", size = 3) +
geom_text(data = newdat, aes(x = xmin + 0.4, y = y, label = format(label, digits = 1)),
inherit.aes = FALSE) +
theme(legend.position = "bottom") +
ggtitle("vaccination survey responses at three points in time")
It still is kind of a judgement call about where exactly you want to place the labels. Doing it at the start is the easy way, but if you want these labels to be approximately in the middle and dodging oneanother it would require some processing.
I am trying to make an overlapping histogram like this:
ggplot(histogram, aes = (x), mapping = aes(x = value)) +
geom_histogram(data = melt(tpm_18_L_SD), breaks = seq(1,10,by = 1),
aes(y = 100*(..count../sum(..count..))), alpha=0.2) +
geom_histogram(data = melt(tpm_18_S_SD), breaks = seq(1,10,by = 1),
aes(y = 100*(..count../sum(..count..))), alpha=0.2) +
geom_histogram(data = melt(tpm_18_N_SD), breaks = seq(1,10,by = 1),
aes(y = 100*(..count../sum(..count..))), alpha=0.2) +
facet_wrap(~variable, scales = 'free_x') +
ylim(0, 20) +
ylab("Percentage of Genes") +
xlab("Standard Deviation")
My code can only make them plot side by side and I would like to also make them overlap. Thank you! I based mine off of the original post where this came from but it did not work for me. It was originally 3 separate graphs which I combined with grid and ggarrange. It looks like this right now.
Here is the code of the three separate graphs.
SD_18_L <- ggplot(data = melt(tpm_18_L_SD), mapping = aes(x = value)) +
geom_histogram(aes(y = 100*(..count../sum(..count..))), breaks = seq(1, 10, by = 1)) +
facet_wrap(~variable, scales = 'free_x') +
ylim(0, 20) +
ylab("Percentage of Genes") +
xlab("Standard Deviation")
SD_18_S <- ggplot(data = melt(tpm_18_S_SD), mapping = aes(x = value)) +
geom_histogram(aes(y = 100*(..count../sum(..count..))), breaks = seq(1, 10, by = 1)) +
facet_wrap(~variable, scales = 'free_x') +
ylim(0, 20) +
ylab("Percentage of Genes") +
xlab("Standard Deviation")
SD_18_N <- ggplot(data = melt(tpm_18_N_SD), mapping = aes(x = value)) +
geom_histogram(aes(y = 100*(..count../sum(..count..))), breaks = seq(1, 10, by = 1)) +
facet_wrap(~variable, scales = 'free_x') +
ylim(0, 20) +
ylab("Percentage of Genes") +
xlab("Standard Deviation")
What my graphs look like now:
ggplot expects dataframes in a long format. I'm not sure what your data looks like, but you shouldn't have to call geom_histogram for each category. Instead, get all your data into a single dataframe (you can use rbind for this) in long format (what you're doing already with melt) first, then feed it into ggplot and map fill to whatever your categorical variable is.
Your call to facet_wrap is what puts them in 3 different plots. If you want them all on the same plot, take that line out.
An example using the iris data:
ggplot(iris, aes(x = Sepal.Length, fill = Species)) +
geom_histogram(alpha = 0.6, position = "identity")
I decreased alpha in geom_histogram so you can see where colors overlap, and added position = "identity" so observations aren't being stacked. Hope that helps!
I am having difficulties adding a legend to my error bar plot. I tried several command that I've seen in other subject, but unfortunately it doesn't work (I am sure I'm missing something but I can't figure out what)
library(ggplot2)
errors=matrix(c(-3.800904,-3.803444,-3.805985,-3.731204,-3.743969,
-3.756735,-3.742510,-3.764961,-3.787413,-3.731204,-3.743969,-3.756735,
-3.711420,-3.721589,-3.731758,-3.731204,-3.743969,-3.756735,-3.636346,
-3.675159,-3.713971,-3.731204,-3.743969,-3.756735),nrow=4,byrow=TRUE)
modelName=c("model 1","model 2","model 3","model 0")
boxdata=data.frame(errors,modelName)
colnames(boxdata)=c("icp","pred","icm","icp_obs","obs","icm_obs","model")
qplot(boxdata$model,boxdata$pred,
main = paste("confidance level 95% for age ", age_bp + start_age - 1,sep="")) +
geom_errorbar(aes(x=boxdata$model, ymin=boxdata$icm, ymax=boxdata$icp), width=0.20,col='deepskyblue') +
geom_point(aes(x=boxdata$model,y=boxdata$obs),shape=4,col="orange") +
geom_errorbar(aes(x=boxdata$model, ymin=boxdata$icm_obs, ymax=boxdata$icp_obs), width=0.20,col='red') +
scale_shape_manual(name="legend", values=c(19,4)) +
scale_color_manual(name="legend", values = c("black","orange")) +
xlab("models") +
ylab("confidence level")
The problem is that you are using wide form data rather than long form data. You need to convert the data from wide to long before plotting if you want to get a legend.
library(ggplot2)
errors=matrix(c(-3.800904,-3.803444,-3.805985,-3.731204,-3.743969,
-3.756735,-3.742510,-3.764961,-3.787413,-3.731204,-3.743969,-3.756735,
-3.711420,-3.721589,-3.731758,-3.731204,-3.743969,-3.756735,-3.636346,
-3.675159,-3.713971,-3.731204,-3.743969,-3.756735),nrow=4,byrow=TRUE)
errors = rbind(errors[, 1:3], errors[,4:6]) # manually reshaping the data
modelName=c("model 1","model 2","model 3","model 0")
type = rep(c("model", "obs"), each = 4)
boxdata=data.frame(errors,modelName, type)
colnames(boxdata)=c("icp","pred","icm","model", "type")
ggplot(boxdata, aes(x = model, y = pred, ymax = icp, ymin = icm,
group = type, colour = type, shape = type)) +
geom_errorbar(width=0.20) +
geom_point() +
scale_shape_manual(values=c(19, 4)) +
scale_color_manual(values = c("black","orange")) +
xlab("models") +
ylab("confidence level")
The output looks closer to your output can be generated by:
ggplot(boxdata, aes(x = model, y = pred, ymax = icp, ymin = icm,
group = type, colour = type, shape = type)) +
geom_errorbar(width=0.20) +
geom_point(colour = rep(c("black","orange"), each = 4)) +
scale_shape_manual(values=c(19, 4)) +
scale_color_manual(values = c("deepskyblue", "red")) +
xlab("models") +
ylab("confidence level")