I have the following data set gwas_data
Running
head -n 23 gwas_data gives me the following table.
gwas_data <-
data.frame(
stringsAsFactors = FALSE,
udi = c("A","B","C","D","E",
"F","G","H","I","J","K","A","B","C","D","E",
"F","G","H","I","J","K"),
snp = c("rs71628639_A",
"rs71628639_A","rs71628639_A","rs71628639_A","rs71628639_A",
"rs71628639_A","rs71628639_A","rs71628639_A",
"rs71628639_A","rs71628639_A","rs71628639_A","rs12726330_A",
"rs12726330_A","rs12726330_A","rs12726330_A",
"rs12726330_A","rs12726330_A","rs12726330_A","rs12726330_A",
"rs12726330_A","rs12726330_A","rs12726330_A"),
chr = c(1L,1L,1L,1L,1L,1L,1L,
1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,
1L),
bp = c(154988255L,154988255L,
154988255L,154988255L,154988255L,154988255L,154988255L,
154988255L,154988255L,154988255L,154988255L,
155108167L,155108167L,155108167L,155108167L,155108167L,
155108167L,155108167L,155108167L,155108167L,
155108167L,155108167L),
p = c(0.580621191,0.356577427,
0.494774059,0.984005886,0.492034614,0.581479389,
0.24820214,0.202720896,0.295462221,0.845848783,
0.954714162,0.343101621,0.740942238,0.929127071,0.717965027,
0.335111376,0.857154424,0.480087195,0.980307843,
0.521114038,0.583150471,0.925783695),
beta = c(0.000852277,0.003943912,
0.001091986,-3.18e-05,0.000564413,0.000120028,
0.026156467,0.000303135,0.069146449,-2.96e-07,-2.11e-05,
0.001274261,-0.001232397,0.000123948,-0.000498507,
-0.000689988,-3.41e-50,-0.013934416,5.12e-06,
-0.03696031,-7.28e-07,-3.01e-05),
bp_cum = c(1.154988255,1.154988255,
1.154988255,1.154988255,1.154988255,1.154988255,
1.154988255,1.154988255,1.154988255,1.154988255,
1.154988255,1.155108167,1.155108167,1.155108167,
1.155108167,1.155108167,1.155108167,1.155108167,1.155108167,
1.155108167,1.155108167,1.155108167)
)
I would like to make a manhattan plot, the X-axis should have chromosomal numbers from 1:22, I want each entry to be on the x-axis according to the BP position. The id should act as colour and the y-axis would be -log10(p).
I have rewritten the r command as follows, but my graph doesn't look correct.
library(plyr)
library(dplyr)
library(purrr)
library(tidyverse)
library(ggtext)
library(stringr)
gwas_data <- read.table("gwas_data", header=T)
sig <- 5e-8
manhplot <- ggplot(gwas_data, aes(x = bp_cum, y = -log10(p), color = udi)) +
geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") +
geom_point(aes(color=as.factor(udi)), alpha=0.8, size=2) +
scale_x_continuous(label = axis_set$chr, breaks = axis_set$center) +
scale_y_continuous(expand = c(0,0), limits = c(0, ylim)) +
#scale_color_manual(values = rep(c("#276FBF", "#183059"), (length(axis_set$chr)))) +
scale_size_continuous(range = c(0.5,3)) +
theme_minimal()
print(manhplot)
I would also like to add the name of the ID and SNP if they are above the significant threshold.
My axis_set looks as follows with test data which goes from chromosome 1:4
chr center
1 179641307
2 354697451
3 553030055
4 558565909
My final graph looks as follows:
I am new to R and have not been able to correct the following graph.
Xb_exp, it should have blue dots.
Xb_dw, solid red line.
Xb_f, dotted line.
Xb_s, longdash line.
The legend expression should be as shown with the subscript.
I have not been able to correct it.
Is there a way to do this?
enter image description here
my data
CA <- c(3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30)
Xb_exp <- c(0.0231,0.0519,0.0839,0.1197,0.1595,0.1996,0.2384,0.2772,0.3153,0.3520,0.3887,0.4254,0.4615,0.4978,0.5339,0.5685,0.6000,0.6279,0.6528,0.6762,0.6974,0.7166,0.7346,0.7516,0.7669,0.7810,0.7940,0.8059)
Xb_dw <- c(0.0160,0.0516,0.0886,0.1259,0.1633,0.2006,0.2377,0.2749,0.3122,0.3496,0.3870,0.4245,0.4617,0.4984,0.5339,0.5678,0.5996,0.6288,0.6551,0.6786,0.6994,0.7179,0.7346,0.7499,0.7641,0.7774,0.7899,0.8018)
Xb_f <- c(0.0021,0.0031,0.0046,0.0067,0.0095,0.0131,0.0177,0.0234,0,0387,0.0483,0.0591,0.0709,0.0832,0.0955,0.1073,0.1181,0.1272,0.1345,0.1398,0.1443,0.1456,0.1468,0.1474,0.1476,0.1477,0.1477,0.1477,0.1477)
Xb_s <- c(0.0139,0.0484,0.0839,0.1192,0.1538,0.1874,0.2200,0.2515,0.2818,0.3108,0.3387,0.3653,0.3908,0.4151,0.4383,0.4604,0.4815,0.5015,0.5206,0.5387,0.5559,0.5722,0.5877,0.6024,0.6164,0.6264,0.6421,0.6040)
dat <- c(CA, Xb_exp, Xb_dw, Xb_f, Xb_s)
my code
labels = c(expression(X[b_exp]),expression(X[b_dw]),expression(X[b_f]),expression(X[b_s]))
color4 <- c("Xb_exp"="#3C5488FF", "Xb_dw"="#DC0000FF", "Xb_f"="#00A087FF", "Xb_s"="#4DBBD5FF")
Xb_D1 <- ggplot(data = dat) +
theme_bw() +
labs(x="Crank position (ºCA)", y= bquote('Burn fraction ('~X[b]~')')) +
geom_point(aes(x=CA, y=Xb_exp, colour="Xb_exp"), size=3) +
geom_line(aes(x=CA, y=Xb_dw,colour="Xb_dw"), size=1,linetype="solid") +
geom_line(aes(x=CA, y=Xb_f,colour="Xb_f"), size=1,linetype="dotted") +
geom_line(aes(x=CA, y=Xb_s,colour="Xb_s"), size=1,linetype="longdash") +
scale_colour_manual(values=color4, labels=labels) +
theme(legend.title = element_blank(),legend.position = c(0.8, 0.5),
legend.text = element_text(size = 12)) +
scale_x_continuous(limits = c(2,80))
plot(Xb_D1)
ggplot() requires a dataframe not a vector. If you modify your code with:
dat <- data.frame(CA, Xb_exp, Xb_dw, Xb_f, Xb_s)
and fix the typo in your Xb_f vector
Xb_f <- c(0.0021,0.0031,0.0046,0.0067,0.0095,0.0131,0.0177,0.0234,0.0387,0.0483,0.0591,0.0709,0.0832,0.0955,0.1073,0.1181,0.1272,0.1345,0.1398,0.1443,0.1456,0.1468,0.1474,0.1476,0.1477,0.1477,0.1477,0.1477)
Your remaining code will work as but could be achieved more simply using the tidyverse approach below. Use pivot_longer to stack the y variables against your x variable.
dat %>%
pivot_longer(Xb_exp:Xb_s) %>%
ggplot(aes(x = CA, y = value, colour = name)) +
geom_point() +
geom_line() +
scale_colour_manual(values=color4, labels=labels) +
theme_bw() +
theme(legend.title = element_blank(),legend.position = c(0.8, 0.5),
legend.text = element_text(size = 12)) +
scale_x_continuous(limits = c(2,80)) +
labs(x="Crank position (ºCA)", y= bquote('Burn fraction ('~X[b]~')')) ```
Ironically, setting this up with conventional ploting is rather simple:
Given all the data above:
linetypes4 <- c( Xb_exp=NA, Xb_dw="solid", Xb_f="dotted", Xb_s="longdash" )
plot(
NA, type="n", xlim=c(0,30), ylim=c(0,0.8),
xlab = "Crank position (ºCA)", ylab = bquote('Burn fraction ('~X[b]~')'),
panel.first = grid()
)
with( dat, {
points( x=CA, y=Xb_exp, pch=19, col=color4["Xb_exp"], size=3 )
for( n in c("Xb_dw", "Xb_f", "Xb_s")) {
lines( x=CA, y=get(n), lty=linetypes[n], col=color4[n], lwd=2 )
}
})
legend(
x = "right",
legend = labels,
col = color4,
lty = linetypes4,
pch = c(19,NA,NA,NA),
box.lwd = 0,
inset = .02
)
There are some errors in your code suggesting you didn't try what you pasted.
0,0387, in your data should likely be 0.0387, otherwise nothing is right (no data measures several hundreds in there)
c(CA, ... ) should likely be data.frame( CA, ... )
Now, the first problem is you are doing all the heavy lifting yourself, while ggplot sits there with nothing left to do. It was designed to set up colors and line types by group. You however need to transform the data first to take full advantage of that:
library(tidyr)
CA <- c(3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30)
Xb_exp <- c(0.0231,0.0519,0.0839,0.1197,0.1595,0.1996,0.2384,0.2772,0.3153,0.3520,0.3887,0.4254,0.4615,0.4978,0.5339,0.5685,0.6000,0.6279,0.6528,0.6762,0.6974,0.7166,0.7346,0.7516,0.7669,0.7810,0.7940,0.8059)
Xb_dw <- c(0.0160,0.0516,0.0886,0.1259,0.1633,0.2006,0.2377,0.2749,0.3122,0.3496,0.3870,0.4245,0.4617,0.4984,0.5339,0.5678,0.5996,0.6288,0.6551,0.6786,0.6994,0.7179,0.7346,0.7499,0.7641,0.7774,0.7899,0.8018)
Xb_f <- c(0.0021,0.0031,0.0046,0.0067,0.0095,0.0131,0.0177,0.0234,0.0387,0.0483,0.0591,0.0709,0.0832,0.0955,0.1073,0.1181,0.1272,0.1345,0.1398,0.1443,0.1456,0.1468,0.1474,0.1476,0.1477,0.1477,0.1477,0.1477)
Xb_s <- c(0.0139,0.0484,0.0839,0.1192,0.1538,0.1874,0.2200,0.2515,0.2818,0.3108,0.3387,0.3653,0.3908,0.4151,0.4383,0.4604,0.4815,0.5015,0.5206,0.5387,0.5559,0.5722,0.5877,0.6024,0.6164,0.6264,0.6421,0.6040)
dat <- data.frame(CA, Xb_exp, Xb_dw, Xb_f, Xb_s)
color4 <- c("Xb_exp"="#3C5488FF", "Xb_dw"="#DC0000FF", "Xb_f"="#00A087FF", "Xb_s"="#4DBBD5FF")
linetypes <- c( Xb_dw="solid", Xb_f="dotted", Xb_s="longdash" )
dat2 <- pivot_longer( dat, cols=starts_with("Xb_") )
dat2.line <- dat2 %>% filter( name != "Xb_exp" )
dat2.point <- dat2 %>% filter( name == "Xb_exp" )
dat2 is now a long data set, with data category as a variable, not with a separate column for each data series. This is how ggplot likes it:
dat2
# A tibble: 112 x 3
CA name value
<dbl> <fct> <dbl>
1 3 Xb_exp 0.0231
2 3 Xb_dw 0.016
3 3 Xb_f 0.0021
4 3 Xb_s 0.0139
5 4 Xb_exp 0.0519
6 4 Xb_dw 0.0516
7 4 Xb_f 0.0031
8 4 Xb_s 0.0484
9 5 Xb_exp 0.0839
10 5 Xb_dw 0.0886
# … with 102 more rows
I then split the data on what later goes to points and what goes ot lines, just not to make the plot code uglier than it has to be:
Xb_D1 <- ggplot(data = dat2.line, aes(x=CA,y=value,color=name)) +
theme_bw() +
labs(x="Crank position (ºCA)", y= bquote('Burn fraction ('~X[b]~')')) +
geom_point( data = dat2.point, size=3) +
geom_line( aes(col=name,lty=name), size=1 ) +
scale_colour_manual(values=color4) +
scale_linetype_manual( values=linetypes, guide=FALSE ) +
guides(
color = guide_legend( override.aes=list( shape=c(NA,19,NA,NA), linetype=c("solid","solid","dashed","dotted") ) )
) +
theme(legend.title = element_blank(),legend.position = c(0.8, 0.5),
legend.text.align = 0,
legend.text = element_text(size = 12)) +
scale_x_continuous(limits = c(2,30))
print(Xb_D1)
no need to supply labels
use line type as you would use color with ggplot, its just one more channel that can carry information (or aesthetic as they like to call it over there)
align the legends left, looks nicer that way
more sophisticated is the use of override.aes to take away the points from the legend categories who shouldn't have them.
Now, I was unable to change the order of the data series in the labels, that can be a hazzle. Is it still ok for you the order they are?
I was thinking of doing this in R but am new to it and would appreciate any help
I have a dataset (pitches) of baseball pitches identified by
'pitchNumber' and 'outcome' e.g S = swinging strike, B = ball, H= hit
etc.
e.g.
1 B ;
2 H ;
3 S ;
4 S ;
5 X ;
6 H; etc.
All I want to do is have a graph that plots them in a line cf BHSSXB
but replacing the letter with a small bar colored to represent the letter, with a legend, and optionally having the pitch number above the color . Somewhat like a sparkline.
Any suggestion on how to implement this much appreciated
And the same graph using ggplot.
Data courtesy of #GavinSimpson.
ggplot(baseball, aes(x=pitchNumber, y=1, ymin=0, ymax=1, colour=outcome)) +
geom_point() +
geom_linerange() +
ylab(NULL) +
xlab(NULL) +
scale_y_continuous(breaks=c(0, 1)) +
opts(
panel.background=theme_blank(),
panel.grid.minor=theme_blank(),
axis.text.y = theme_blank()
)
Here is a base graphics idea from which to work. First some dummy data:
set.seed(1)
baseball <- data.frame(pitchNumber = seq_len(50),
outcome = factor(sample(c("B","H","S","S","X","H"),
50, replace = TRUE)))
> head(baseball)
pitchNumber outcome
1 1 H
2 2 S
3 3 S
4 4 H
5 5 H
6 6 H
Next we define the colours we want:
## better colours - like ggplot for the cool kids
##cols <- c("red","green","blue","yellow")
cols <- head(hcl(seq(from = 0, to = 360,
length.out = nlevels(with(baseball, outcome)) + 1),
l = 65, c = 100), -1)
then plot the pitchNumber as a height 1 histogram-like bar (type = "h"), suppressing the normal axes, and we add on points to the tops of the bars to help visualisation:
with(baseball, plot(pitchNumber, y = rep(1, length(pitchNumber)), type = "h",
ylim = c(0, 1.2), col = cols[outcome],
ylab = "", xlab = "Pitch", axes = FALSE, lwd = 2))
with(baseball, points(pitchNumber, y = rep(1, length(pitchNumber)), pch = 16,
col = cols[outcome]))
Add on the x-axis and the plot frame, plus a legend:
axis(side = 1)
box()
## note: this assumes that the levels are in alphabetical order B,H,S,X...
legend("topleft", legend = c("Ball","Hit","Swinging Strike","X??"), lty = 1,
pch = 16, col = cols, bty = "n", ncol = 2, lwd = 2)
Gives this:
This is in response to your last comment on #Gavin's answer. I'm going to build off of the data provided by #Gavin and the ggplot2 plot by #Andrie. ggplot() supports the concept of faceting by a variable or variables. Here you want to facet by pitcher and at the pitch limit of 50 per row. We'll create a new variable that corresponds to each row we want to plot separately. The equivalent code in base graphics would entail adjusting mfrow or mfcol in par() and calling separate plots for each group of data.
#150 pitches represents a somewhat typical 9 inning game.
#Thanks to Gavin for sample data.
longGame <- rbind(baseball, baseball, baseball)
#Starter goes 95 pitches, middle relief throws 35, closer comes in for 20 and the glory
longGame$pitcher <- c(rep("S", 95), rep("M", 35), rep("C",20))
#Adjust pitchNumber accordingly
longGame$pitchNumber <- c(1:95, 1:35, 1:20)
#We want to show 50 pitches at a time, so will combine the pitcher name
#with which set of pitches this is
longGame$facet <- with(longGame, paste(pitcher, ceiling(pitchNumber / 50), sep = ""))
#Create the x-axis in increments of 1-50, by pitcher
longGame <- ddply(longGame, "facet", transform, pitchFacet = rep(1:50, 5)[1:length(facet)])
#Convert facet to factor in the right order
longGame$facet <- factor(longGame$facet, levels = c("S1", "S2", "M1", "C1"))
#Thanks to Andrie for ggplot2 function. I change the x-axis and add a facet_wrap
ggplot(longGame, aes(x=pitchFacet, y=1, ymin=0, ymax=1, colour=outcome)) +
geom_point() +
geom_linerange() +
facet_wrap(~facet, ncol = 1) +
ylab(NULL) +
xlab(NULL) +
scale_y_continuous(breaks=c(0, 1)) +
opts(
panel.background=theme_blank(),
panel.grid.minor=theme_blank(),
axis.text.y = theme_blank()
)
You can obviously change the labels for the facet variable, but the above code will produce: