ggplot specific thick line - r

How would one plot one line thicker than the other? I tried using geom_line(size=X), but this increases the thickness of both lines. Let's say I would like to increase the thickness of only the line for the first column; how would one approach this?
a <- (cbind(rnorm(100),rnorm(100))) #nav[,1:10]
sa <- stack(as.data.frame(a))
sa$x <- rep(seq_len(nrow(a)), ncol(a))
require("ggplot2")
p<-qplot(x, values, data = sa, group = ind, colour = ind, geom = "line")
p + theme(legend.position = "none")+ylab("Millions")+xlab("Age")+
geom_line( size = 1.5)

You need to map line thickness to the variable:
p + geom_line(aes(size = ind))
To control the thickness use scale_size_manual():
p + geom_line(aes(size = ind)) +
scale_size_manual(values = c(0.1, 1))

Related

R: Changing the Color of Overlapping Points

I am working with the R programming language. I made the following graph that shows a scatterplot between points of two different colors :
library(ggplot2)
a = rnorm(10000,10,10)
b = rnorm(10000, 10, 10)
c = as.factor("red")
data_1 = data.frame(a,b,c)
a = rnorm(10000,7,5)
b = rnorm(10000, 7, 5)
c = as.factor("blue")
data_2 = data.frame(a,b,c)
final = rbind(data_1, data_2)
my_plot = ggplot(final, aes(x=a, y=b, col = c)) + geom_point() + theme(legend.position="top") + ggtitle("My Plot")
My Question: Is there a way to "change the colors of overlapping points"?
Here is what I tried so far:
1) I found the following question (Visualizing two or more data points where they overlap (ggplot R)) and tried the strategy suggested:
linecolors <- c("#714C02", "#01587A", "#024E37")
fillcolors <- c("#9D6C06", "#077DAA", "#026D4E")
# partially transparent points by setting `alpha = 0.5`
ggplot(final, aes(a,b, colour = c, fill = c)) +
geom_point(alpha = 0.5) +
scale_color_manual(values=linecolors) +
scale_fill_manual(values=fillcolors) +
theme_bw()
This shows the two different colors along with the overlap, but it is quite dark and still not clear. Is there a way to pick better colors/resolutions for this?
2) I found the following link which shows how to make color gradients for continuous variables : https://drsimonj.svbtle.com/pretty-scatter-plots-with-ggplot2 - but I have discrete colors and I do not know how to apply this
3) I found this question over here (Any way to make plot points in scatterplot more transparent in R?) which shows to do this with the base R plot, but not with ggplot2:
# Add an alpha (transparency) channel to one or more colours.
#
# color: colour specification(s) accepted by col2rgb() (names, hex strings, ...).
# trans: alpha value(s) between 0 and 255; 0 is fully transparent and 255 is
#        fully opaque.
# `color` and `trans` must have the same length, or one of them must have
# length 1, in which case it is recycled to the length of the other.
# Returns a character vector of "#RRGGBBAA" hex colours, one per input colour.
# Stops with "Vector lengths not correct" when the lengths are incompatible.
addTrans <- function(color, trans) {
  n_color <- length(color)
  n_trans <- length(trans)

  # These are scalar tests, so use the short-circuit operators && rather than
  # the elementwise &.
  if (n_color != n_trans && n_color != 1 && n_trans != 1) {
    stop("Vector lengths not correct")
  }
  if (n_color == 1 && n_trans > 1) color <- rep(color, n_trans)
  if (n_trans == 1 && n_color > 1) trans <- rep(trans, n_color)

  hex_digits <- unlist(strsplit("0123456789ABCDEF", split = ""))

  # Convert values in 0..255 to two-character upper-case hexadecimal strings.
  num2hex <- function(x) {
    paste0(hex_digits[(x - x %% 16) / 16 + 1], hex_digits[x %% 16 + 1])
  }

  # One column per colour; the rows are red, green, blue and alpha.
  # (Named `channels` so that grDevices::rgb() is not shadowed.)
  channels <- rbind(col2rgb(color), trans)
  paste0("#", apply(apply(channels, 2, num2hex), 2, paste, collapse = ""))
}
cols <- sample(c("red","green","pink"),100,TRUE)
# Very transparant:
plot(final$a , final$b ,col=addTrans(cols,100),pch=16,cex=1)
But this is also not able to differentiate between the two color classes that I have.
Problem: Can someone please suggest how to fix the problem with overlapping points, such that the overlap appear more visible?
Thanks!
I would use a density heatmap
ggplot(final, aes(x=a, y=b, col = c))+
stat_density_2d(aes(fill = stat(density)), geom = 'raster', contour = FALSE) +
scale_fill_viridis_c() +
coord_cartesian(expand = FALSE) +
geom_point(shape = '.', col = 'white')
or
ggplot(final, aes(x=a, y=b, col = c))+
stat_density_2d(aes(fill = stat(level)), geom = 'polygon') +
scale_fill_viridis_c(name = "density") +
geom_point(shape = '.')
or
ggplot(final, aes(x=a, y=b, col = c))+
geom_point(alpha = 0.1) +
geom_rug(alpha = 0.01)

ggplot make default point size larger when size is already determined by another variable

I am trying to display data that includes non-detects. For the ND I want to have a circular outline at different sizes so that the lines do not overlap each other. I pretty much have what I want, but for the parameter cis-DCE the circular outline just makes the point look bigger instead of being a distinct outline. How do I attribute size to the parameter and also make the starting size larger?
I will include all of the code I am using for the graphing, but I am specifically working on this bit right now.
geom_point(aes(x= date, y = lrl, group = parm_nmShort, size = parm_nmShort), shape = 1) + #marking lower limit
I also know that I could use facet_wraps and I've done that previously, but historically this data has been shown in one graph, but without identifying the NDs and I do not want to drastically alter the display of the data and confuse anyone.
{
#graphing
# folder where you want the graphs to be saved:
results <- 'C:/Users/cbuckley/OneDrive - DOI/Documents/Projects/New Haven/Data/Graphs/'
{
# Draw one plot per site found in `df` and print each plot.
#
# df:    data frame; the body reads the columns parm_nm, site_nm, date,
#        result and lrl.
# na.rm: accepted but not referenced anywhere in the body.
# ...:   accepted but not forwarded to any call.
#
# The function is called for its side effect (print(plot) inside the loop).
#
# NOTE(review): theme_article() and date_format() are not defined in this
# snippet; presumably they come from the egg and scales packages, which would
# need to be attached together with ggplot2 -- confirm.
VOC.graph <- function(df, na.rm = TRUE, ...){
# Recode the long parameter names to short display labels; any parm_nm value
# that is not one of the three listed levels becomes NA.
df$parm_nmShort <- factor(df$parm_nm, levels = c("cis.1.2.Dichloroethene_77093",
"Trichloroethene_34485",
"Tetrachloroethene_34475"),
labels = c("cis-DCE", "TCE", "PCE"))
# distinct sites in the data to loop over
site_list <- unique(df$site_nm)
# produce one ggplot2 graph per site
for (i in seq_along(site_list)) {
# build the plot from the rows belonging to the current site
plot <-
ggplot(subset(df, df$site_nm==site_list[i]),
aes(x = date, y = result,
group = parm_nmShort,
color = parm_nmShort)) +
geom_point() + # measured results as points
geom_line() + # connect the results of each parameter with a line
#geom_point(aes(y = lrl, group = parm_nmShort, shape = parm_nmShort)) +
# open circles (shape = 1) at y = lrl, with point size mapped to the parameter
geom_point(aes(x= date, y = lrl, group = parm_nmShort, size = parm_nmShort), shape = 1) + #marking lower limit
#scale_shape_manual(values = c("23","24","25")) + #create outlier shapes
#facet_wrap(~parm_nmShort) +
ggtitle(site_list[i]) + # title each graph with its site name
# theme(legend.position="none") + #removed legend
labs(x = "Year", y = expression(paste("Value, ug/L"))) + #add x and y label titles
theme_article() + #remove grey boxes, outline graph in black
theme(legend.title = element_blank()) + #removes legend title
# same x-axis limits (2000-01-01 to 2021-01-01) on every graph, two-digit year labels
scale_x_date(labels = date_format("%y"),
limits = as.Date(c("2000-01-01","2021-01-01"))) #+ # set x axis for all graphs
# The plot expression ends on the line above; everything below, up to
# print(plot), is commented-out optional code.
# geom_hline(yintercept = 5) #+ #add 5ug/L contaminant limit horizontal line
# theme(axis.text.x = element_text(angle = 45, size = 12, vjust = 1)) + #angles x axis titles 45 deg
# theme(aspect.ratio = 1) +
# scale_color_hue(labels = c("cic-DCE", "PCE", "TCE")) + #change label names
# scale_fill_discrete(breaks = c("PCE", "TCE", "cic-DCE"))
# Code below will let you block out below the resolution limit
# geom_ribbon(aes(ymin = 0, ymax = ###LRL###), fill ="white", color ="grey3") +
# geom_line(color ="black", lwd = 1)
#ggsave(plot,
# file=paste(results, "", site_list[i], ".png", sep=''),
# scale=1)
# print plots to screen
print(plot)
}
}
#run graphing function with long data set
VOC.graph(data)
}}
Well after a lot of playing around, I figured out the answer to my own question. I figured I'd leave the question up because none of the solutions I found online worked for me but this code did.
geom_point(aes(x= date, y = lrl, group = parm_nmShort, shape = parm_nmShort, size = parm_nmShort)) + #identify non detects
scale_shape_manual(values = c(1,1,1)) +
scale_size_manual(values = c(3,5,7)) +
I'm not very good at R, but for some reason, when I didn't include both group and shape in the aes as parm_nmShort, I couldn't manually change the values. I don't know if it's because I have more than one geom_point in my whole script, so maybe it didn't know which one to change.

Set the width and gap in geom_bar in a large dataset with a lot of unique values

I have the dataframe below:
res<-sample.int(2187, 2187)
freq<-floor(runif(2187, 95,105))
t<-data.frame(res,freq)
and I'm trying to create a bar chart based on this, but despite the fact that I use the width and color arguments, I still cannot create space between the bars, which are black instead of the selected fill.
library(ggplot2)
require(scales)
ggplot(t,width=0.1)+
geom_bar(aes(x=res,y=freq ,fill = (t$res==101)),
color = "black",stat = "identity") +
scale_fill_manual(values=c("darkblue", "lightblue"), guide = F) +
theme_classic(base_size = 16)+ theme(legend.position = "none")+
scale_x_discrete(breaks = seq(80, 115, 5))+ scale_y_continuous(labels = comma)
Note that this code works nice for a dataset with much fewer unique values like:
fac<-factor(rep(c(80,85,100,100.5,100.7,101,101.5,110,105),2000000))
res<-data.frame(fac)
new<-data.frame(table(res))
require(scales)
ggplot(new,width=0.1)+
geom_bar(aes(x=res,y=Freq ,fill = (new$res==101)),
color = "black",stat = "identity") +
scale_fill_manual(values=c("darkblue", "lightblue"), guide = F) +
theme_classic(base_size = 16)+ theme(legend.position = "none")+
scale_x_discrete(breaks = seq(80, 115, 5))+ scale_y_continuous(labels = comma)
Maybe I am completely wrong, but if I understand correctly, the OP wants to reproduce the second chart from scratch using a sample of random numbers instead of already tabulated counts.
To create a histogram / bar chart, we only need a vector of random numbers (wrapped in a data.frame for ggplot) and let geom_bar() do the counting. In addition, a particular bar will be highlighted.
By using floor(), the random numbers are already binned but are still considered as continuous by ggplot(). Therefore, they need to be turned into factor.
# create data
set.seed(123L) # ensure random data are reproducible
t <- data.frame(res = floor(runif(2187, 95, 105)))
library(ggplot2)
ggplot(t) +
aes(x = as.factor(res), fill = res == 101) +
geom_bar() +
theme_classic(base_size = 16) +
scale_fill_manual(values = c("darkblue", "lightblue"), guide = FALSE) +
xlab("res") +
ylab("freq")
Edit: geom_histogram()
There is an alternative approach using geom_histogram().
geom_histogram() does all steps in one go: The binning (no need to use floor()) as well as counting and plotting:
set.seed(123L) # ensure random data are reproducible
t2 <- data.frame(res = runif(2187, 95,105)) # floor() omitted here
ggplot(t2) +
aes(x = res, fill = floor(res) == 101) +
geom_histogram(breaks = seq(95, 105, 1), closed = "left") +
theme_classic(base_size = 16) +
scale_fill_manual(values = c("darkblue", "lightblue"), guide = FALSE) +
xlab("res") +
ylab("freq")
Here, the breaks parameter was used to specify the bin boundaries explicitly. Alternatively, the number of bins or the width of the bins can be specified. This gives flexibility to play around with the parameters.
Edit 2
The OP has asked about the case where the random numbers are uniformly distributed between 100 and 1015. With an adjustment to the sequence of breaks,
set.seed(123L) # ensure random data are reproducible
t3 <- data.frame(res = runif(2187, 100, 1015))
ggplot(t3) +
aes(x = res, fill = floor(res) == 101) +
geom_histogram(breaks = seq(100, 1015, 1), closed = "left") +
theme_classic(base_size = 16) +
scale_fill_manual(values = c("darkblue", "lightblue"), guide = FALSE) +
xlab("res") +
ylab("freq")
returns
This chart contains over 900 bars for each bin of width 1 which aren't all visible depending on the screen resolution as already explained by Jon Spring.
Therefore, it might be more suitable to reduce the number of bins, e.g., to 100 bins:
ggplot(t3) +
aes(x = res, fill = floor(res) == 101) +
geom_histogram(bins = 100L) +
theme_classic(base_size = 16) +
scale_fill_manual(values = c("darkblue", "lightblue"), guide = FALSE) +
xlab("res") +
ylab("freq")
Please note that 101 is still highlighted in the lower left corner.
Edit -- added alternate solutions at bottom.
If you have over 2,000 bars, and each one has a black outline 1 pixel wide on each side, that'll take something on the order of 6,000 horizontal pixels (ignoring anti-aliasing) to see one with a different fill. Most screens have much lower resolution than that.
If you must use bars, and must show every value, one option would be to drop the outline with color = NA and set width = 1 (as a term in the geom_col/geom_bar call) so there's no distracting blank space between bars. Even then, the different color at res == 101 is only visible at certain resolutions. (That might vary on device settings and anti-aliasing.)
# Draw every value as a borderless, full-width bar and highlight res == 101.
# color = NA drops the outline and width = 1 removes the gaps between bars.
ggplot(t) +
  geom_col(aes(x = res, y = freq, fill = (res == 101)),
           color = NA, width = 1) +
  # guide = "none" suppresses the legend; the abbreviation `F` can be
  # reassigned, and logical `guide` values are deprecated in ggplot2.
  scale_fill_manual(values = c("darkblue", "lightblue"), guide = "none") +
  theme_classic(base_size = 16) +
  # Add an axis break at 101 in addition to the multiples of 500.
  scale_x_continuous(breaks = c(500 * 0:4, 101))
If you must show all 2000 points, but want to highlight one, it might make sense to use a different geom that spreads the data out to use more of the available space.
For instance, we might use geom_point or geom_jitter to plot all the coordinates in 2d space. Here, I highlight the element with res == 101. I use arrange to make sure the special dot gets plotted last so that it doesn't get occluded.
library(dplyr)
ggplot(t %>% arrange(res == 101),
aes(x = res, y = freq,
fill = res == 101,
size = res == 101)) +
geom_jitter(shape = 21, stroke = 0.1)
Or we might plot the data as a line, highlighting the special dot on its own:
ggplot(t, aes(res, freq)) +
geom_line(color = "gray70") +
geom_point(data = subset(t, res == 101)) +
expand_limits(y=0)

How to include number of observations in each quartile of boxplot using ggplot2 in R?

I am plotting a box-plot to see the distribution of the variable. I am also interested in seeing the number of observations in each quartile. Is there any way to add the number of observations in each quartile to the boxplot along with the values of quartiles?
I included some code below which can generate box-plot with the values of quartiles.
df <- datasets::iris
boxplot <- ggplot(df, aes(x = "", y = Sepal.Length)) +
geom_boxplot(width=0.1, position = "dodge", fill = "red") +
stat_boxplot(geom = "errorbar", width = 0.1) +
stat_summary(geom = "label_repel", fun.y = quantile, aes(label = ..y..),
position = position_nudge(x = -0.1), size = 3) +
ggtitle("") +
xlab("") +
ylab('Sepal.Length')
I expect the values of quartiles on the left-hand side of the plot and the number of observations on the right-hand side of the plot if possible.
This would be one possibility. I always prefer to keep my additional data in a separate data frame, because this gives me more control over what is calculated and how.
Counting made with some inspiration from https://stackoverflow.com/a/54451575
# Count the observations in each quartile interval of `x` and compute the
# position at which each count label should be drawn.
#
# x: numeric vector without NA values (quantile() errors on NA by default).
# Returns a data frame with one row per quartile interval and the columns
#   label.Var1 - the interval, as labelled by cut()
#   label.Freq - number of observations falling in that interval
#   label_pos  - midpoint of the interval
# The quartile cut points must be distinct, because cut() rejects duplicated
# breaks.
quantile_counts <- function(x) {
  quartiles <- quantile(x)
  # include.lowest = TRUE closes the first interval on the left, so the
  # minimum of `x` is counted instead of becoming NA and being dropped.
  counts <- table(cut(x, quartiles, include.lowest = TRUE))
  data.frame(
    label = counts,
    label_pos = diff(quartiles) / 2 + quartiles[1:4]
  )
}
df_quantile_counts=quantile_counts(df$Sepal.Length)
boxplot <- ggplot(df, aes(x = "", y = Sepal.Length)) +
geom_boxplot(width=0.1, position = "dodge", fill = "red") +
stat_boxplot(geom = "errorbar", width = 0.1) +
stat_summary(geom = "label", fun.y = quantile, aes(label = ..y..),
position = position_nudge(x = -0.1), size = 3) +
geom_text(data=df_quantile_counts,aes(x="",y=label_pos,label = label.Freq),
position = position_nudge(x = +0.1), size = 3) +
ggtitle("") +
xlab("") +
ylab('Sepal.Length')
HTH, Tobi
@TobiO's answer is correct. However, my data was somewhat skewed and some cut points were identical (for example, the first and second cut points were the same), so I needed to take the unique values to calculate the number of observations in each quartile. Another point relates to the cut function, which does not include the starting point by default: its intervals have the form (low bound, high bound]. In order to include the starting point, I used the cut2 function from the Hmisc package. I also included a label_pos_extension line to prevent the label text from overlapping for quartiles whose cut points are very close to each other; geom_text_repel did not work for preventing the overlaps.
# Count the observations in each quartile group of `x` and compute label
# positions, tolerating quartile cut points that coincide.
#
# x: numeric vector.
# Returns a data frame built from table(cut2(x, g = 4)) (columns prefixed
# with "label.") plus a `label_pos` column.
#
# cut2() is not defined here; per the accompanying text it comes from the
# Hmisc package, which must be attached.
quantile_counts2 <- function(x) {
  # Manual offsets added to the label positions to keep labels of nearly
  # identical cut points from overlapping.
  label_pos_extension <- c(0, 3, 4, 0)

  # Compute the quartiles once instead of on every use.
  quartiles <- quantile(x)
  distinct_quartiles <- unique(quartiles)
  n_distinct <- length(distinct_quartiles)

  if (n_distinct < 5) {
    # Some cut points coincide: take midpoints from the distinct cut points.
    # seq_len(n_distinct - 1) selects the same elements the original
    # `1:n - 1` index did (its leading 0 was silently ignored).
    label_pos <- c(
      0,
      diff(distinct_quartiles) / 2 + quartiles[seq_len(n_distinct - 1)]
    ) + label_pos_extension[seq_len(n_distinct)]
  } else {
    label_pos <- diff(quartiles) / 2 + quartiles[1:4] + label_pos_extension
  }

  # The original put `return(df)` on the same line as the closing brace of
  # the else branch, which does not parse.
  data.frame(label = table(cut2(x, g = 4)), label_pos = label_pos)
}
PS. I tried to put my edited function in comment but, it did not work.

Overlaying histograms with ggplot2 in R

I am new to R and am trying to plot 3 histograms onto the same graph.
Everything worked fine, but my problem is that you don't see where 2 histograms overlap - they look rather cut off.
When I make density plots, it looks perfect: each curve is surrounded by a black frame line, and colours look different where curves overlap.
Can someone tell me if something similar can be achieved with the histograms in the 1st picture? This is the code I'm using:
lowf0 <-read.csv (....)
mediumf0 <-read.csv (....)
highf0 <-read.csv(....)
lowf0$utt<-'low f0'
mediumf0$utt<-'medium f0'
highf0$utt<-'high f0'
histogram<-rbind(lowf0,mediumf0,highf0)
ggplot(histogram, aes(f0, fill = utt)) + geom_histogram(alpha = 0.2)
Using @joran's sample data,
ggplot(dat, aes(x=xx, fill=yy)) + geom_histogram(alpha=0.2, position="identity")
note that the default position of geom_histogram is "stack."
see "position adjustment" of this page:
geom_histogram documentation
Your current code:
ggplot(histogram, aes(f0, fill = utt)) + geom_histogram(alpha = 0.2)
is telling ggplot to construct one histogram using all the values in f0 and then color the bars of this single histogram according to the variable utt.
What you want instead is to create three separate histograms, with alpha blending so that they are visible through each other. So you probably want to use three separate calls to geom_histogram, where each one gets its own data frame and fill:
# One semi-transparent histogram layer per data frame, each with its own fill.
# The last layer must not end with `+`, otherwise the expression is incomplete.
ggplot(histogram, aes(f0)) +
  geom_histogram(data = lowf0, fill = "red", alpha = 0.2) +
  geom_histogram(data = mediumf0, fill = "blue", alpha = 0.2) +
  geom_histogram(data = highf0, fill = "green", alpha = 0.2)
Here's a concrete example with some output:
dat <- data.frame(xx = c(runif(100,20,50),runif(100,40,80),runif(100,0,30)),yy = rep(letters[1:3],each = 100))
ggplot(dat,aes(x=xx)) +
geom_histogram(data=subset(dat,yy == 'a'),fill = "red", alpha = 0.2) +
geom_histogram(data=subset(dat,yy == 'b'),fill = "blue", alpha = 0.2) +
geom_histogram(data=subset(dat,yy == 'c'),fill = "green", alpha = 0.2)
which produces something like this:
Edited to fix typos; you wanted fill, not colour.
While only a few lines are required to plot multiple/overlapping histograms in ggplot2, the results aren't always satisfactory. There needs to be proper use of borders and coloring to ensure the eye can differentiate between histograms.
The following functions balance border colors, opacities, and superimposed density plots to enable the viewer to differentiate among distributions.
Single histogram:
# Plot a density-scaled histogram of one column, overlaid with a density
# curve and a dashed vertical line at the column mean, then print the plot.
#
# df:      data frame holding the data.
# feature: name of the column to plot, given as a string.
plot_histogram <- function(df, feature) {
  # .data[[feature]] looks the column up by name inside aes(). Unlike
  # eval(parse(text = feature)) it never evaluates the string as R code and
  # it also works for column names that are not syntactic R names.
  plt <- ggplot(df, aes(x = .data[[feature]])) +
    geom_histogram(aes(y = ..density..), alpha = 0.7, fill = "#33AADE", color = "black") +
    geom_density(alpha = 0.3, fill = "red") +
    geom_vline(aes(xintercept = mean(.data[[feature]])),
               color = "black", linetype = "dashed", size = 1) +
    labs(x = feature, y = "Density")
  print(plt)
}
Multiple histogram:
# Plot overlapping density-scaled histograms and density curves of one
# column, filled by a category column, with a dashed vertical line at the
# overall mean of the plotted column.
#
# df:           data frame holding the data.
# feature:      name of the column to plot, given as a string.
# label_column: name of the column with the category labels, as a string;
#               it is also used as the legend title.
# Returns the ggplot object.
plot_multi_histogram <- function(df, feature, label_column) {
  # .data[[name]] looks columns up by name inside aes(). Unlike
  # eval(parse(text = name)) it never evaluates the string as R code and it
  # also works for column names that are not syntactic R names.
  plt <- ggplot(df, aes(x = .data[[feature]], fill = .data[[label_column]])) +
    geom_histogram(alpha = 0.7, position = "identity", aes(y = ..density..), color = "black") +
    geom_density(alpha = 0.7) +
    geom_vline(aes(xintercept = mean(.data[[feature]])),
               color = "black", linetype = "dashed", size = 1) +
    labs(x = feature, y = "Density")
  plt + guides(fill = guide_legend(title = label_column))
}
Usage:
Simply pass your data frame into the above functions along with desired arguments:
plot_histogram(iris, 'Sepal.Width')
plot_multi_histogram(iris, 'Sepal.Width', 'Species')
The extra parameter in plot_multi_histogram is the name of the column containing the category labels.
We can see this more dramatically by creating a dataframe with many different distribution means:
a <-data.frame(n=rnorm(1000, mean = 1), category=rep('A', 1000))
b <-data.frame(n=rnorm(1000, mean = 2), category=rep('B', 1000))
c <-data.frame(n=rnorm(1000, mean = 3), category=rep('C', 1000))
d <-data.frame(n=rnorm(1000, mean = 4), category=rep('D', 1000))
e <-data.frame(n=rnorm(1000, mean = 5), category=rep('E', 1000))
f <-data.frame(n=rnorm(1000, mean = 6), category=rep('F', 1000))
many_distros <- do.call('rbind', list(a,b,c,d,e,f))
Passing data frame in as before (and widening chart using options):
options(repr.plot.width = 20, repr.plot.height = 8)
plot_multi_histogram(many_distros, 'n', 'category')
To add a separate vertical line for each distribution:
# Plot overlapping density-scaled histograms and density curves of one
# column, filled by a category column, with one dashed vertical line per
# value in `means`.
#
# df:           data frame holding the data.
# feature:      name of the column to plot, given as a string.
# label_column: name of the column with the category labels, as a string;
#               it is also used as the legend title.
# means:        numeric vector of x positions for the vertical lines.
# Returns the ggplot object.
plot_multi_histogram <- function(df, feature, label_column, means) {
  # .data[[name]] looks columns up by name inside aes(). Unlike
  # eval(parse(text = name)) it never evaluates the string as R code and it
  # also works for column names that are not syntactic R names.
  plt <- ggplot(df, aes(x = .data[[feature]], fill = .data[[label_column]])) +
    geom_histogram(alpha = 0.7, position = "identity", aes(y = ..density..), color = "black") +
    geom_density(alpha = 0.7) +
    # The trailing `+` is required: without it labs() below is evaluated as
    # a separate statement and the axis labels are never applied.
    geom_vline(xintercept = means, color = "black", linetype = "dashed", size = 1) +
    labs(x = feature, y = "Density")
  plt + guides(fill = guide_legend(title = label_column))
}
The only change over the previous plot_multi_histogram function is the addition of means to the parameters, and changing the geom_vline line to accept multiple values.
Usage:
options(repr.plot.width = 20, repr.plot.height = 8)
plot_multi_histogram(many_distros, "n", 'category', c(1, 2, 3, 4, 5, 6))
Result:
Since I set the means explicitly in many_distros I can simply pass them in. Alternatively you can simply calculate these inside the function and use that way.

Resources