R: Stacked area chart does not stack - r

I have data which I want to plot as a stacked area plot. On the x-axis I have data which is continuous and on the y axis I have continuous data which I prepare to be cumulative. This is the code I am using with some dummy data:
library(data.table)
library(ggplot2)
set.seed(1)
dt <- data.table(var=sample(1:6,1000,replace=TRUE),xdata=runif(1000),ydata=runif(1000))
setorder(dt, var, xdata)
dt$cumydata <- dt[,
cumsum(ydata),
by = .(var)]$V1/sum(dt$ydata)
ggplot(dt, aes(x = xdata, y = cumydata, fill = as.factor(var))) +
geom_area(position = "stack")
Here is the output plot:
My issue is, that the data does not stack correctly. I guess this could be because of the continuity of the data?

For a stacked area chart the x values as well as the number of occurences must be the same. So changing your sample data to this will give you the expected output:
set.seed(1)
dt <- data.table(ydata=runif(1002))
dt$var <- rep(1:6, each=167)
dt$xdata <- rep(runif(167), 6)
setorder(dt, var, xdata)
dt$cumydata <- dt[,
cumsum(ydata),
by = .(var)]$V1/sum(dt$ydata)
ggplot(dt,aes(x = xdata, y = cumydata, fill = as.factor(var))) +
geom_area(position = "stack")

So this is finally how I solved it, based on Jimbou's information. It is just a bit of preprocessing. I also made the whole thing logarithmic.
library(data.table)
library(ggplot2)
set.seed(1)
dtt <- data.table(var=sample(1:6,1000,replace=TRUE),xdata=runif(1000),ydata=runif(1000))
setorder(dtt, var, xdata)
log.min.xdata <- log(min(dtt$xdata))
log.max.xdata <- log(max(dtt$xdata))
nbreaks <- 101
temp <- hist(log(dtt$xdata[dtt$var==1]),
breaks = seq(log.min.xdata, log.max.xdata, length=nbreaks),
plot = FALSE)
dt <- data.table(var = unlist(lapply(sort(unique(dtt$var)),
function(x){rep(x,nbreaks-1)})),
bin = rep(1:(nbreaks-1),length(unique(dtt$var))),
mid = rep(temp$mids))
dt$count <- dt[,
hist(log(dtt$xdata[dtt$var==var]),
breaks = seq(log.min.xdata, log.max.xdata, length=nbreaks),
plot = FALSE)$counts,
by = .(var)]$V1
dt$cumcount <- dt[,
cumsum(count),
by = .(var)]$V1
pp <- ggplot(dt, aes(x = exp(mid), y = cumcount, fill = as.factor(var))) +
geom_area(position = "stack") +
scale_x_log10() +
theme_bw() +
theme(legend.position = c(0.1, 0.70),
legend.background = element_rect(fill="lightgrey",
size=0.5, linetype="solid")) +
labs(title = "y",
fill = " var",
x = "xdata",
y = "cumcount") +
theme(title = element_text(face = "bold"),
axis.title = element_text(face = "bold"),
legend.title = element_text(face = "bold"),
legend.text = element_text(face = "bold"))

Related

How can I create bubble grid charts in ggplot?

I want to create bubble grid charts with ggplot.
somthing like this:
I couldnt find any code or exampe online.
Thanks
Using geom_point with discrete x and y scales will get you started. Here's an example with some quick toy data:
library(tidyverse)
offenses <- c("robbery", "violence", "drugs")
actions <- c("formal", "informal", "considered")
counts <- sample(10:100, 9, replace = TRUE)
data <- expand.grid(offenses = offenses, actions = actions) %>% bind_cols(counts = counts)
ggplot(data,
aes(x = str_to_title(offenses),
y = str_to_title(actions),
colour = str_to_title(offenses),
size = counts)) +
geom_point() +
geom_text(aes(label = counts),
colour = "white",
size = 3) +
scale_x_discrete(position = "top") +
scale_size_continuous(range = c(10, 30)) + # Adjust as required.
scale_color_brewer(palette = "Set2") +
labs(x = NULL, y = NULL) +
theme(legend.position = "none",
panel.background = element_blank(),
panel.grid = element_blank(),
axis.ticks = element_blank())
Play around with the range parameter of scale_size_continouous as needed to get bubbles of a reasonable size for your data set.
Oh, I also tried meanwhile. It looks very similar ...
require(ggplot2)
require(ggsci) # for the scale_fill_npg()
grid.bubble.plot <- function(df,
axis_labels_size=16,
aspect_ratio=1/1,
values_text_size=3,
values_text_color="white",
x_axis_position="top", # or "bottom",
bubble_size_range=c(5, 24),
bubble_alpha=0.5,
bubble_shape=21,
bubble_edge_stroke=0) {
col_names <- colnames(df)
row_names <- rownames(df)
values <- as.vector(as.matrix(df))
values_x <- as.vector(sapply(col_names, function(i) rep(i, nrow(df))))
values_y <- as.vector(rep(row_names, dim(df)[2]))
res_df <- data.frame(values = values, values_x = values_x, values_y)
gg <- ggplot(res_df, aes(x=values_x, y=values_y, size = values, fill=factor(values_x))) +
geom_point(alpha=bubble_alpha, shape=bubble_shape, stroke=bubble_edge_stroke) +
scale_size(range = bubble_size_range) +
scale_fill_npg() +
scale_x_discrete(position = x_axis_position) +
geom_text(aes(label=values), size=values_text_size, color=values_text_color) +
theme(line=element_blank(),
panel.background=element_blank(),
legend.position="none",
axis.title=element_blank(),
axis.text=element_text(size=axis_labels_size),
aspect.ratio=aspect_ratio)
gg
}
grid.bubble.plot(df)
Play around with the values.
e.g. you can make also the text size vary with the values:
# thanks to #MSR for example dataset
require(tidyverse)
offenses <- c("robbery", "violence", "drugs")
actions <- c("formal", "informal", "considered")
counts <- sample(10:100, 9, replace = TRUE)
df <- expand.grid(offenses = offenses, actions = actions) %>% bind_cols(counts = counts)
grid.bubble.plot(df, values_text_size=as.vector(as.matrix(df)))
For color, you can try other scale_fill_ variants like:
_aaas() _lancet() _jco() _tron()

Automate tick max and min in faceted ggplot

I am trying to just mark the max and min of each x-axis in a faceted ggplot. I have several facets with different x scales and the same y scale, and the x axis tick labels overlap each other. Rather than having to manually determine the limits and breaks for each facet x axis, I am looking for a way to just label the min and max values for each.
Code using example data of the CO2 dataset (see ?CO2):
CO2$num <- 1:nrow(CO2)
library(reshape2)
CO2.melt <- melt(CO2,
id.var=c("Type",
"Plant",
"Treatment",
"num"))
CO2.melt <- CO2.melt[order(CO2.melt$num),]
library(ggplot2)
ggplot(CO2.melt,
aes(x = value,
y = num)) +
geom_path(aes(color = Treatment)) +
facet_wrap( ~ variable, scales = "free_x",nrow=1)
Purpose is to replicate well log displays such as this one.
When you want to implemented this for the tick-labels, the use of scales = "free_x" in a faceted plot makes this hard to automate this. However, with a bit of tinkering and the help of several other packages, you could also use the following approach:
1) Summarise the data in order to get an idea which tick-labels / breaks you need on the x-axis:
library(data.table)
minmax <- melt(setDT(CO2.melt)[, .(min.val = min(value), max.val = max(value),
floor.end = 10*ceiling(min(value)/10),
ceil.end = 10*floor((max(value)-1)/10)),
variable][],
measure.vars = patterns('.val','.end'),
variable.name = 'var',
value.name = c('minmax','ends'))
which gives:
> minmax
variable var minmax ends
1: conc 1 95.0 100
2: uptake 1 7.7 10
3: conc 2 1000.0 990
4: uptake 2 45.5 40
2) Create break vecors for each facet:
brks1 <- c(95,250,500,750,1000)
brks2 <- c(7.7,10,20,30,40,45.5)
3) Create the facets:
p1 <- ggplot(CO2.melt[CO2.melt$variable=="conc",],
aes(x = value, y = num, colour = Treatment)) +
geom_path() +
scale_x_continuous(breaks = brks1) +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(colour = c('red','black')[c(1,2,2,2,1)],
face = c('bold','plain')[c(1,2,2,2,1)]),
axis.title = element_blank(),
panel.grid.major = element_line(colour = "grey60"),
panel.grid.minor = element_blank())
p2 <- ggplot(CO2.melt[CO2.melt$variable=="uptake",],
aes(x = value, y = num, colour = Treatment)) +
geom_path() +
scale_x_continuous(breaks = brks2) +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(colour = c('red','black')[c(1,2,2,2,2,1)],
face = c('bold','plain')[c(1,2,2,2,2,1)]),
axis.title = element_blank(),
panel.grid.major = element_line(colour = "grey60"),
panel.grid.minor = element_blank())
4) Extract the legend into a separate object:
library(grid)
library(gtable)
fill.legend <- gtable_filter(ggplot_gtable(ggplot_build(p2)), "guide-box")
legGrob <- grobTree(fill.legend)
5) Create the final plot:
library(gridExtra)
grid.arrange(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
legGrob, ncol=3, widths = c(4,4,1))
which results in:
A possible alternative solution to do this automatically, is either use geom_text or geom_label. An example to show how you can achieve this:
# create a summary
library(dplyr)
library(tidyr)
minmax <- CO2.melt %>%
group_by(variable) %>%
summarise(minx = min(value), maxx = max(value)) %>%
gather(lbl, val, -1)
# create the plot
ggplot(CO2.melt, aes(x = value, y = num, color = Treatment)) +
geom_path() +
geom_text(data = minmax,
aes(x = val, y = -3, label = val),
colour = "red", fontface = "bold", size = 5) +
facet_wrap( ~ variable, scales = "free_x", nrow=1) +
theme_minimal()
which gives:
You can also get the minimum and maximum values on the fly inside ggplot (credit to #eipi10). Another example using geom_label:
ggplot(CO2.melt, aes(x = value, y = num, color = Treatment)) +
geom_path() +
geom_label(data = CO2.melt %>%
group_by(variable) %>%
summarise(minx = min(value), maxx = max(value)) %>%
gather(lbl, val, -1),
aes(x = val, y = -3, label = val),
colour = "red", fontface = "bold", size = 5) +
facet_wrap( ~ variable, scales = "free_x", nrow=1) +
theme_minimal()
which gives:
Edit Updating to ggplot2 ver 3.0.0
This approach modifies the labels in the ggplot build data (i.e., ggplot_build(plot)). I've removed the x-axis expansions so that the maximum and minimum values fall on the panel boundaries.
# Packages
library(grid)
library(ggplot2)
library(reshape2)
# Data
CO2$num <- 1:nrow(CO2)
library(reshape2)
CO2.melt <- melt(CO2,
id.var=c("Type",
"Plant",
"Treatment",
"num"))
CO2.melt <- CO2.melt[order(CO2.melt$num),]
# Plot
(p <- ggplot(CO2.melt,
aes(x = value,
y = num)) +
scale_x_continuous(expand = c(0, 0)) +
geom_path(aes(color = Treatment)) +
facet_wrap( ~ variable, scales = "free_x", nrow=1))
# Get the build data
gb <- ggplot_build(p)
# Get number of panels
panels = length(gb$layout$panel_params)
# Get x tick mark labels
x.labels = lapply(1:panels, function(N) gb$layout$panel_params[[N]]$x.labels)
# Get range of x values
x.range = lapply(1:panels, function(N) gb$layout$panel_params[[N]]$x.range)
# Get position of x tick mark labels
x.pos = lapply(1:panels, function(N) gb$layout$panel_params[[N]]$x.major)
# Get new x tick mark labels - includes max and min
new.labels = lapply(1:panels, function(N) as.character(sort(unique(c(as.numeric(x.labels[[N]]), x.range[[N]])))))
# Tag min and max values with "min" and "max"
new.labelsC = new.labels
minmax = c("min", "max")
new.labelsC = lapply(1:panels, function(N) {
x = c(new.labelsC[[N]][1], new.labelsC[[N]][length(new.labels[[N]])])
x = paste0(x, "\n", minmax)
c(x[1], new.labelsC[[N]][2:(length(new.labels[[N]])-1)], x[2])
} )
# # Get position of new labels
new.pos = lapply(1:panels, function(N) (as.numeric(new.labels[[N]]) - x.range[[N]][1])/(x.range[[N]][2] - x.range[[N]][1]))
# Put them back into the build data
for(i in 1:panels) {
gb$layout$panel_params[[i]]$x.labels = new.labelsC[[i]]
gb$layout$panel_params[[i]]$x.major_source = as.numeric(new.labels[[i]])
gb$layout$panel_params[[i]]$x.major = new.pos[[i]]
}
# Get the ggplot grob
gp = ggplot_gtable(gb)
# Add some additional space between the panels
pos = gp$layout$l[grep("panel", gp$layout$name)] # Positions of the panels
for(i in 1:(panels-1)) gp$widths[[pos[i]+1]] = unit(1, "cm")
# Colour the min and max labels using `grid` editing functions
for(i in 1:panels) {
gp = editGrob(grid.force(gp), gPath(paste0("axis-b-", i), "axis", "axis", "GRID.text"),
grep = TRUE, gp = gpar(col = c("red", rep("black", length(new.labels[[i]])-2), "red")))
}
# Draw it
grid.newpage()
grid.draw(gp)

Change y axis limits for each row of a facet plot in ggplot2

I have a 3 rows by 5 columns facet plot. Each row show data which spread over different ranges. To properly display my data so everything is shown, I don't set a y axis limit.
Here's my code:
require(reshape2)
library(ggplot2)
library(RColorBrewer)
fileName = paste("./data_test.csv", sep = "")
## data available here: https://dl.dropboxusercontent.com/u/73950/data_test.csv
mydata = read.csv(fileName,sep=",", header=TRUE)
dataM = melt(mydata,c("id"))
dataM = cbind(dataM,
colsplit(dataM$variable,
pattern = "_",
names = c("Network_model", "order", "category")))
dataM$variable <- NULL
dataM <- dcast(dataM, ... ~ category, value.var = "value")
dataM$minCut <- NULL
dataM$nbr_communities <- NULL
dataM$mean_community_size <- NULL
dataM$density <- NULL
my_palette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
dataM = melt(dataM, id.vars = c("Network_model", "order", "nodesRemoved", "id"))
my_palette = c(brewer.pal(5, "Blues")[c(4)], brewer.pal(5, "Set1")[c(3)])
ggplot(dataM, aes(x= nodesRemoved ,y= value, group= order, color= order)) +
geom_point(size = .6,alpha = .15,position="jitter") + ## increased size
stat_smooth(se = FALSE, size = .5, alpha = .1, method = "loess") +
scale_color_manual(values=my_palette) +
theme_bw() +
theme(plot.background = element_blank(),
axis.line = element_blank(),
legend.key = element_blank(),
legend.title = element_blank(),
axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8)
) +
scale_y_continuous("Value") +
scale_x_continuous("Time", limits=c(0, 100)) +
facet_grid(variable ~ Network_model,scales="free")
Which produces this:
Now, I'd like to selectively set limits for each of the three rows, so that the first row is limits=c(1.9, 3), the second is limits=c(0, 1) and the third is limits=c(.3, .7).
How can I achieve this in ggplot2 of faceting?
I think your best option will be to trim the data before plotting it, e.g. with dplyr,
library(dplyr)
limits <- data.frame(variable = levels(dataM$variable),
min = c(1.9,0,0.3),
max = c(3,1,0.7))
dataC <- inner_join(dataM, limits) %>% filter(value > min, value < max)
last_plot() %+% dataC
(I initially made the points bigger to see the culprits more clearly)

How to plot a barplot/barchart with continuous x axis

I would like to plot a barplot but I have dates on the x axis and I want those dates to be correctly spaced (as it is NON categorical)
set.seed(1)
m = matrix(abs(rnorm(6)),3,2)
rownames(m) = as.Date(c('2011-01-01','2011-01-03','2011-01-10'))
barplot(t(m),beside=T,col=c('red','blue'),las=2)
On this example I would like 14984 to be offset on the right.
I'd rather a graphics solution but ggplot2 is fine too
Would you mind to use ´ggplot´ instead?
library(ggplot2)
set.seed(1)
df <- data.frame(y=abs(rnorm(6)),
x=rep(as.Date(c('2011-01-01','2011-01-03','2011-01-10')),
times = 2),
g = factor(rep(c(1,2), each = 3)))
ggplot(aes(x=x, y=y, group = g, fill = g), data = df) +
geom_bar(stat = 'identity', position = 'dodge')
You can improve axis formatting with `scale_x_date´
library(scales)
ggplot(aes(x=x, y=y, group = g, fill = g), data = df) +
geom_bar(stat = 'identity', position = 'dodge') +
scale_x_date(breaks = '1 day') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
And customize it to your purpose
ggplot(aes(x=x, y=y, group = g, fill = g), data = df) +
geom_bar(stat = 'identity', position = 'dodge') +
scale_x_date(breaks = '1 day') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
scale_fill_manual('My\nclasses', values = c('1'='red', '2' = 'blue')) +
labs(list(title = 'Barplot\n', x = ('Date'), y = 'Values'))
With graphics, you probably have to prepare the data appropriately (with missing values for dates you don't consider) in order to do this. Then you can use barplot.
# matrix definition
set.seed(1)
m = matrix(abs(rnorm(6)),3,2)
rownames(m) = as.Date(c('2011-01-01','2011-01-03','2011-01-10'))
# get all dates in between
dts <- do.call(":", as.list(range(rownames(m))))
dts <- dts[!dts%in%rownames(m)]
mat <- matrix(NA, nrow=length(dts), ncol=2, dimnames=list(dts, NULL))
# combine with original matrix
m <- rbind(m, mat)
m <- m[order(rownames(m)), ]
which(!is.na(m[,1]))
# plot
barplot(t(m), beside=T, col=c('red','blue'),las=2, axes=FALSE, axisnames=FALSE)
axis(2)
axis(1, at=3*which(!is.na(m[,1]))-1, labels=rownames(m[!is.na(m[,1]),]))

ggplot2: A mean row in heatmaps

Say I created a heatmap using the function geom_raster() (from ggplot2).
What's a smart way to add a row at the bottom of the table showing (in my case) the 'Mean return' for each month on the period considered ?
It would be nice there is some space left between the 1985-2013 period and the row for the average, and maybe police color and 'cases' could be customized.
The core of my code is as follows (the object molten contains the my data, originally a matrix passed through the melt() function of reshape2.
hm <- ggplot(data = molten, aes(x = factor(Var2, levels = month.abb), y=Var1, fillll=value)) + geom_raster()
hm <- hm + scale_fill_gradient2(low=LtoM(100), mid=Mid, high=MtoH(100))
hm <- hm + labs(fill='% Return')
hm <- hm + geom_text(aes(label=paste(sprintf("%.1f %%", value))), size = 4)
hm <- hm + scale_y_continuous(breaks = 1985:2013)
hm <- hm + xlab(label = NULL) + ylab(label = NULL)
hm <- hm + theme_bw()
hm <- hm + theme(axis.text.x = element_text(size = 10, hjust = 0, vjust = 0.4, angle=90))
It's not very concise, but I think this should do what you need.
You didn't provide a data set, so I just made some up. Also, the LtoM and MtoH functions are not included in any R package I could find, so I did a quick Google search and found them here
The following code produces a plot hm2 with facets to make the "Mean Return" row at the bottom:
require(reshape2)
require(ggplot2)
# Random data
set.seed(100)
casted = data.frame(Var1 = rep(1985:2013, times=12), Var2 = rep(month.abb, each=29), return = rnorm(12*29, 0, 9))
molten = melt(casted, id.vars = c("Var1", "Var2"))
LtoM <-colorRampPalette(c('red', 'yellow' ))
Mid <- "snow3"
MtoH <-colorRampPalette(c('lightgreen', 'darkgreen'))
# Averages
monthly.avg = cbind(Var1 = rep("Mean", 12), dcast(molten, Var2 ~ ., mean))
colnames(monthly.avg)[3] = "Mean"
molten2 = merge(molten, melt(monthly.avg), all.x = TRUE, all.y = TRUE)
# New plot
hm2 =
ggplot(data = molten2, aes(x = factor(Var2, levels = month.abb), y=Var1, fill=value)) +
geom_raster() +
scale_fill_gradient2(low=LtoM(100), mid=Mid, high=MtoH(100)) +
labs(fill='% Return') +
geom_text(aes(label=paste(sprintf("%.1f %%", value))), size = 4) +
xlab(label = NULL) + ylab(label = NULL) +
theme_bw() +
theme(axis.text.x = element_text(size = 10, hjust = 0, vjust = 0.4, angle=90)) +
facet_grid(variable ~ ., scales = "free_y", space = "free_y") + # grid layout
theme(strip.background = element_rect(colour = 'NA', fill = 'NA'), strip.text.y = element_text(colour = 'white')) # remove facet labels
which gives the following plot:
How about this:
I created a grid to mock up your data
Main changes, are to precalculate the aggregate and "spacer" data rows, and add to molten,
then add scale_y_discrete so you can label the rows,
then make sure the format works for the grey spacer bar with no % label (comments in code)
Easier in future if you include the data (or a sample) in the question
require(ggplot2)
molten<-expand.grid(c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"),1985:2013,0)
colnames(molten)<-c("Var2","Var1","value")
molten$value=(runif(nrow(molten))*60)-30
#create means
means<-aggregate(molten[,c(1,3)], by=list(molten$Var2),FUN=mean, na.rm=TRUE)
colnames(means)<-c("Var2","Var1","value")
means$Var1<-"MEANS"
#create spacer bar
spacer<-means
spacer$Var1<-" "
spacer$value<-NA
#append them to the data
molten<-rbind(molten,spacer,means)
hm <- ggplot(data = molten, aes(x = Var2, y=Var1, fill=value)) +
geom_raster() +
# replaced your functions for ease of use
scale_fill_gradient2(low="red", mid="yellow", high="green",na.value="grey") +
labs(fill='% Return') +
# don't format the NA vals with %, return blank
geom_text(aes(label=ifelse((is.na(value)),"",paste(sprintf("%.1f %%", value)))), size = 4) +
# make the scale discrete to add labels and enforce order (use a blank space for the spacer)
scale_y_discrete(limits = c("MEANS"," ",1985:2013)) +
xlab(label = NULL) + ylab(label = NULL) +
theme_bw() +
theme(axis.text.x = element_text(size = 10, hjust = 0, vjust = 0.4, angle=90))
hm

Resources