I have the following dataset:
HIU,0.0833333333,0,0.35,0.0208333333,0.40625,0,0.21875,0.125,0.078125,0.0104166667,1,0.53125,0.4375
TTHY,0,0,0.8,0,0.5,0,0.7083333333,0.2708333333,0,0.6597222222,0,0.1435185185,0
Full,0.0554986339,0.1034836066,0.4620901639,0.0683060109,0.4961577869,0.0696721311,0.222079918,0.1465163934,0.2085040984,0.0476007514,0.893613388,0.396943306,0.4223872951
I made a grouped bar plot from the HIU and TTHY rows (figure 1). I also want to add a line for the "Full" row, as in the second image (figure 2).
Figure 1:
Figure 2:
How can I do it with R? This is my current code:
# Read the CSV without a header row. The first column (row labels) is
# dropped via the "NULL" class; the 13 value columns keep their inferred type.
df <- read.csv(
  "TTR-HIU/resultados.csv",
  header = FALSE,
  colClasses = c("NULL", rep(NA, 13))
)
# Grouped bars built from every row except the last one. barplot() returns
# the bar midpoints, which are kept in df.bar.
df.bar <- barplot(
  as.matrix(df[-nrow(df), ]),
  beside = TRUE,
  col = c("darkblue", "red")
)
Using ggplot2, you could try something like this:
# Put the three series into a data frame, one column per series:
df <- data.frame(
  HIU = c(
    0.0833333333, 0, 0.35, 0.0208333333, 0.40625, 0, 0.21875, 0.125,
    0.078125, 0.0104166667, 1, 0.53125, 0.4375
  ),
  TTHY = c(
    0, 0, 0.8, 0, 0.5, 0, 0.7083333333, 0.2708333333, 0, 0.6597222222, 0,
    0.1435185185, 0
  ),
  Full = c(
    0.0554986339, 0.1034836066, 0.4620901639, 0.0683060109, 0.4961577869,
    0.0696721311, 0.222079918, 0.1465163934, 0.2085040984, 0.0476007514,
    0.893613388, 0.396943306, 0.4223872951
  )
)

library(ggplot2)
library(tidyr) # pivot_longer() reshapes the data to long format

# Categorical x position, one per observation:
df$x <- as.factor(seq_len(nrow(df)))

# Long format for ggplot2: one row per (x, series) pair. pivot_longer()
# replaces the superseded gather().
df_long <- pivot_longer(df, cols = -x, names_to = "key", values_to = "value")

ggplot() +
  # Dodged bars for the HIU and TTHY series:
  geom_col(
    data = subset(df_long, key %in% c("HIU", "TTHY")),
    mapping = aes(x = x, y = value, fill = key),
    position = position_dodge()
  ) +
  # Line for the Full series. group = key joins the points across the
  # discrete x axis; linewidth replaces the deprecated size for lines.
  geom_line(
    data = subset(df_long, key == "Full"),
    mapping = aes(x = x, y = value, group = key, color = key),
    linewidth = 2
  ) +
  # Make the plot look a little like the desired output:
  scale_color_manual(values = c("Full" = "yellow")) +
  scale_fill_manual(values = c("HIU" = "blue", "TTHY" = "red")) +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    legend.title = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )
However, you might have to put your data into a data frame shaped like the one in this example. If you need further help, use dput() to show exactly what your data looks like.
Related
Suppose I have data in the following form:
library(ggplot2)
# Example data: one row per interval, with the owning ID, a colour label and
# the interval's start and end dates written as "YYYY/MM/DD" text.
Data <- data.frame(
  ID = c(
    "ABC111", "ABC111", "ABC111", "ABC111", "ABC112", "ABC112", "ABC112",
    "ABC113", "ABC113", "ABC114", "ABC115"
  ),
  color = c(
    "red", "red", "red", "red", "blue", "blue", "blue", "green", "green",
    "black", "yellow"
  ),
  start_date = c(
    "2005/01/01", "2006/01/01", "2007/01/01", "2008/01/01", "2009/01/01",
    "2010/01/01", "2011/01/01", "2012/01/01", "2013/01/01", "2014/01/01",
    "2015/01/01"
  ),
  end_date = c(
    "2005/09/01", "2006/06/01", "2007/04/01", "2008/05/07", "2009/06/01",
    "2010/10/01", "2011/12/12", "2013/05/01", "2013/06/08", "2015/01/01",
    "2016/08/09"
  )
)
# Turn the two labelling columns into factors.
Data[c("ID", "color")] <- lapply(Data[c("ID", "color")], as.factor)
Now what I want to do is for each row, plot the start_date and the end_date ... and then connect them with a straight line. I believe this can be done with geom_line() in ggplot2.
I want something that looks like this:
I tried using the following code:
# Plot start_date against end_date. Both columns hold dates stored as text,
# not Date values, so ggplot2 treats both axes as discrete.
q <- qplot(start_date, end_date, data=Data)
# Add a line layer that joins the points sharing the same ID.
q <- q + geom_line(aes(group = ID))
# Print the plot.
q
But the graph looks completely different than what I expected.
Can anyone please show me what I am doing wrong?
Thanks
Does the following work for you?
# Points coloured by ID, with a line joining the points that share an ID.
ggplot(Data, aes(x = start_date, y = end_date, colour = ID)) +
  geom_line(aes(group = ID)) +
  geom_point()
or maybe geom_segment ?
# Draw one horizontal segment per row, running from its start date to its
# end date. The "YYYY/MM/DD" strings are parsed with as.Date() so that x is a
# continuous date scale; the original mapped the end_date text onto the
# numeric y axis and mixed two date formats on x.
Data$x <- as.Date(Data$start_date)
Data$xend <- as.Date(Data$end_date)
# The row number is used as the (otherwise meaningless) vertical position.
Data$y <- seq_len(nrow(Data))
ggplot(data = Data, aes(x = x, y = y, colour = ID)) +
  geom_segment(aes(xend = xend, yend = y))
Here's a solution using the tidyverse package. I used the number of each row in the original data as the y-axis values in the plot. As these values are meaningless, I removed the y-axis title, labels and ticks from the plot.
library(tidyverse)
Data %>%
  # Tag every row with its position in the original data, stored in a new
  # column named "order"
  rowid_to_column(var = "order") %>%
  # Reshape from wide to long: one row per (order, date_type) with its date
  pivot_longer(
    cols = c(start_date, end_date),
    names_to = "date_type",
    values_to = "date"
  ) %>%
  # date on x, order on y, colour by ID; grouping by order joins the start
  # and end points of each original row
  ggplot(aes(x = date, y = order, colour = ID, group = order)) +
  # Points first, then the connecting lines
  geom_point() +
  geom_line() +
  theme(
    # The y values are only row numbers, so hide the y-axis title, text
    # and ticks
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    # Vertical x-axis labels, which may be easier to read
    axis.text.x = element_text(angle = 90, vjust = 0.5)
  )
Here is an example from the geom_boxplot man page:
# Box plots of hwy per class, with separate coloured boxes for each drv value.
p <- ggplot(mpg, aes(x = class, y = hwy))
p + geom_boxplot(aes(colour = drv))
which looks like this:
I would like to make a very similar plot, but with (yearmon formatted) dates where the class variable is in the example, and a factor variable where drv is in the example.
Here is some sample data:
# Sample data with 10000 rows: a random month between 2013-01 and 2016-08
# (as yearmon), a random Source label and a normally distributed Value.
df_box <- data_frame(
  Date = sample(
    as.yearmon(
      seq(as.Date("2013-01-01"), as.Date("2016-08-01"), by = "month")
    ),
    size = 10000,
    replace = TRUE
  ),
  Source = sample(c("Inside", "Outside"), size = 10000, replace = TRUE),
  Value = rnorm(10000)
)
I have tried a bunch of different things:
Put an as.factor around the date variable, then I no longer have the nicely spaced out date scale for the x-axis:
# Attempt 1: treat every month as a discrete category (no explicit group),
# so Source splits the boxes, but the continuous yearmon axis is lost.
ggplot(df_box, aes(x = as.factor(Date), y = Value, colour = Source)) +
  geom_boxplot(outlier.shape = NA) +
  labs(x = "Month Year") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 50, hjust = 1))
On the other hand, if I use Date as an additional group variable as suggested here, adding color no longer has any additional impact:
# Attempt 2: keep Date itself on the x axis and group the boxes by Date.
df_box %>%
ggplot(aes(
x = Date,
y = Value,
# Grouping by Date alone yields one box per month, so the colour mapping
# to Source no longer separates the boxes.
group = Date,
color = Source
)) +
geom_boxplot() +
theme_bw()
Any ideas on how to achieve the output of #1 while still keeping a yearmon-scaled x-axis?
Since you need separate boxes for each combination of Date and Source, use interaction(Source, Date) as the group aesthetic:
# One box per (Source, Date) combination, coloured by Source, drawn on the
# original Date axis.
ggplot(
  data = df_box,
  mapping = aes(
    x = Date,
    y = Value,
    group = interaction(Source, Date),
    colour = Source
  )
) +
  geom_boxplot()
I have a data.frame that looks something like this:
HSP90AA1 SSH2 ACTB TotalTranscripts
ESC_11_TTCGCCAAATCC 8.053308 12.038484 10.557234 33367.23
ESC_10_TTGAGCTGCACT 9.430003 10.687959 10.437068 30285.41
ESC_11_GCCGCGTTATAA 7.953726 9.918988 10.078192 30133.94
ESC_11_GCATTCTGGCTC 11.184402 11.056144 8.316846 24857.07
ESC_11_GTTACATTTCAC 11.943733 11.004500 9.240883 23629.00
ESC_11_CCGTTGCCCCTC 7.441695 9.774733 7.566619 22792.18
The TotalTranscripts column is sorted in descending order. What I'd like to do is generate three bar graphs using ggplot2, one for each column of the data.frame except TotalTranscripts. I'd like the bars in each graph to be ordered by TotalTranscripts, just like the rows of the data.frame. It would be ideal to have these bar graphs on one plot using a facet wrap.
Any help would be greatly appreciated! Thank you!
EDIT: Here is my current code using barplot().
cells <- "ESC"
genes <- c("HSP90AA1", "SSH2", "ACTB")
# Columns of `data` whose names match the cell type (computed once and reused).
# NOTE(review): `data` is defined outside this snippet; it is indexed with
# gene names as rows and cell names as columns.
cell_cols <- grep(cells, colnames(data))
# Transpose so that cells become rows and genes become columns, then append
# each cell's column total from `data` as the last column.
g <- data[genes, cell_cols]
g <- data.frame(t(g), colSums(data)[cell_cols])
colnames(g)[ncol(g)] <- "TotalTranscripts"
# Sort the cells by total transcripts, largest first.
g <- g[order(g$TotalTranscripts, decreasing = TRUE), , drop = FALSE]
# Bar plot of the first gene column; each bar is labelled "cell (total)".
barplot(
  as.matrix(g[1]),
  beside = TRUE,
  names.arg = paste0(rownames(g), " (", g$TotalTranscripts, ")"),
  las = 2,
  col = "light blue",
  cex.names = 0.3,
  main = paste0(
    colnames(g)[1],
    "\nCells sorted by total number of transcripts (colSums)"
  )
)
This will generate a plot that looks like this.
Again, the problem I seem to be having here is how to have multiple of these plots on the same image. I would like to add 20+ columns to this data.frame but I've cut this down to 3 for the sake of simplicity.
EDIT: Current code incorporating the answer below
cells <- "ESC"
# NOTE(review): `x` and `data` are defined outside this snippet.
genes <- rownames(data[x, ])[1:8]
# genes <- c("HSP90AA1", "SSH2", "ACTB")
# Cells as rows, genes as columns, plus each cell's column total.
g <- data[genes, grep(cells, colnames(data))]
g <- data.frame(t(g), colSums(data)[grep(cells, colnames(data))])
colnames(g)[ncol(g)] <- "TotalTranscripts"
g <- g[order(g$TotalTranscripts, decreasing = TRUE), , drop = FALSE]
g$rowz <- row.names(g)
# Order the factor levels by decreasing TotalTranscripts. Negating the totals
# does not depend on g having been sorted beforehand.
g$Cells <- reorder(g$rowz, -g$TotalTranscripts)
# Wide to long: one row per (cell, gene) pair.
df1 <- melt(g, id.vars = c("Cells", "TotalTranscripts"), measure.vars = genes)
# theme_bw() must come before the theme() tweaks: a complete theme added
# afterwards replaces them. The x title is hidden and the x labels rotated.
ggplot(df1, aes(x = Cells, y = value)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ variable, scales = "free") +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 90)
  )
Here is the example data for anybody else:
# Example data: six cells (row names) with three gene columns and the
# per-cell TotalTranscripts, listed in decreasing TotalTranscripts order.
df <- data.frame(
  HSP90AA1 = c(8.053308, 9.430003, 7.953726, 11.184402, 11.943733, 7.441695),
  SSH2 = c(12.038484, 10.687959, 9.918988, 11.056144, 11.0045, 9.774733),
  ACTB = c(10.557234, 10.437068, 10.078192, 8.316846, 9.240883, 7.566619),
  TotalTranscripts = c(
    33367.23, 30285.41, 30133.94, 24857.07, 23629, 22792.18
  ),
  row.names = c(
    "ESC_11_TTCGCCAAATCC", "ESC_10_TTGAGCTGCACT", "ESC_11_GCCGCGTTATAA",
    "ESC_11_GCATTCTGGCTC", "ESC_11_GTTACATTTCAC", "ESC_11_CCGTTGCCCCTC"
  )
)
And here is a solution:
library(reshape2)
library(ggplot2)

# New column holding the row names so they can be used as x-axis elements
df$rowz <- row.names(df)
# Explicitly order the rows by decreasing TotalTranscripts (see the Kohske
# link). Negating the totals gives the intended order even when df has not
# been sorted beforehand; reversing them only works on pre-sorted data.
df$rowz1 <- reorder(df$rowz, -df$TotalTranscripts)
# Melt the data from wide to long: one row per (cell, gene) pair
df1 <- melt(
  df,
  id.vars = c("rowz1", "TotalTranscripts"),
  measure.vars = c("HSP90AA1", "SSH2", "ACTB")
)
# One facet per gene, each with its own free scales
gp <- ggplot(df1, aes(x = rowz1, y = value)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ variable, scales = "free") +
  theme_bw()
# Rotate the x-axis labels so the long cell names stay readable
gp + theme(axis.text.x = element_text(angle = 90))
This example by Kohske is a constant reference for me on ordering elements in ggplot2.
If you have many columns, but the same six ESC complexes, you can switch the groupings, i.e. x = variable and facet_wrap(~ rowz1), but this fundamentally changes how you are visualizing/comparing your data. Also, consider facet_grid(row ~ column) if you can organize the columns by 2 components (Columns being the data that are melted into 'variable' and 'value').
And this additional SO solution isn't related to your question, but it is an elegant way to reorder elements in each facet by their values (for future reference).
Finally, the method that will give you the finest control is to plot each graph separately and combine the grobs. Baptiste's packages like gridExtra and gtable are useful for these tasks.
**EDIT in response to new information from OP**
The OP has subsequently asked how to visualize the data, especially when there are more ESC categorical variables (up to 600+).
Here are some examples, with the big caveat that with many categorical variables, they should be grouped or converted to a continuous variable somehow.
# Fill mapped to a handful of discrete, categorical values
gp +
  aes(fill = rowz1) +
  labs(x = NULL, fill = "Cell", title = "Discrete categorical variables") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

# Fill mapped to a continuous scale.
# Ultimately not appropriate for this example (shown for reference only);
# fill = TotalTranscripts would be the more appropriate choice.
gp +
  aes(fill = as.numeric(rowz1)) +
  scale_fill_gradient2(
    name = "Cell",
    breaks = as.numeric(df1$rowz1),
    labels = df1$rowz1,
    midpoint = median(as.numeric(df1$rowz1))
  ) +
  labs(
    x = NULL,
    title = "Continuous variables (legend won't work for many values)"
  ) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

# Continuous x, with colour mapped to the categorical variable.
# The same caveats as above apply.
gp1 <- ggplot(
  df1,
  aes(x = TotalTranscripts / 1000, y = value, colour = rowz1)
) +
  geom_point(size = 3) +
  facet_wrap(~ variable, scales = "free") +
  labs(
    title = "X is an actual continuous variable",
    x = bquote("Total Transcripts," ~ 10^3),
    colour = "Cell"
  ) +
  theme_bw()
gp1
I'm trying to use ggplot to create sequence plots, for the sake of keeping the same visual style within my paper using sequence analysis. I do:
library(ggplot2)
library(TraMineR)
library(dplyr)
library(tidyr)
# Load the mvad example data set.
data(mvad)
# Build a state-sequence object from column 15 through the last column.
mvad_seq<-seqdef(mvad,15:length(mvad))
# Substitution-cost matrix computed with the "TRATE" method.
mvad_trate<-seqsubm(mvad_seq,method="TRATE")
# Pairwise sequence distances with the "OM" method, using those costs.
mvad_dist<-seqdist(mvad_seq,method="OM",sm=mvad_trate)
# Ward (ward.D2) hierarchical clustering on the distances, cut into 6 clusters.
cluster<-cutree(hclust(d=as.dist(mvad_dist),method="ward.D2"),k=6)
# Store each row's cluster membership on the original data.
mvad$cluster<-cluster
# Reshape to long format: keep id and the dotted column names (excluding the
# N.Eastern and S.Eastern columns), then gather the listed monthly columns
# into Month/state pairs.
mvad_long<-gather(select(mvad,id,contains("."),-matches("N.Eastern"),-matches("S.Eastern")),
key="Month",value="state",
Jul.93, Aug.93, Sep.93, Oct.93, Nov.93, Dec.93, Jan.94, Feb.94, Mar.94,
Apr.94, May.94, Jun.94, Jul.94, Aug.94, Sep.94, Oct.94, Nov.94, Dec.94, Jan.95,
Feb.95, Mar.95, Apr.95, May.95, Jun.95, Jul.95, Aug.95, Sep.95, Oct.95, Nov.95,
Dec.95, Jan.96, Feb.96, Mar.96, Apr.96, May.96, Jun.96, Jul.96, Aug.96, Sep.96,
Oct.96, Nov.96, Dec.96, Jan.97, Feb.97, Mar.97, Apr.97, May.97, Jun.97, Jul.97,
Aug.97, Sep.97, Oct.97, Nov.97, Dec.97, Jan.98, Feb.98, Mar.98, Apr.98, May.98,
Jun.98, Jul.98, Aug.98, Sep.98, Oct.98, Nov.98, Dec.98, Jan.99, Feb.99, Mar.99,
Apr.99, May.99, Jun.99)
# Attach the cluster to every long-format row. No `by` is given, so the join
# uses the columns the two tables have in common.
mvad_long<-left_join(mvad_long,select(mvad,id,cluster))
# Tile plot with one facet per cluster. All facets share the same y (id)
# scale, which is what leaves gaps for ids belonging to other clusters.
ggplot(data=mvad_long,aes(x=Month,y=id,fill=state))+geom_tile()+facet_wrap(~cluster)
I try to plot the sequences by cluster, and this gives me the following plot:
As you can see, there are gaps for the ids that don't belong to the cluster represented by each facet. I would like to get rid of these gaps, so that the sequences show up stacked just as with the seqIplot() function of TraMineR as in the next figure:
Any suggestions of how to proceed?
Two small changes:
# Make id categorical so that each facet can show only its own ids
mvad_long[["id"]] <- as.factor(mvad_long[["id"]])
# Same tile plot, now with an independent y scale in every cluster facet
ggplot(mvad_long, aes(x = Month, y = id, fill = state)) +
  geom_tile() +
  facet_wrap(~cluster, scales = "free_y")
ggplot was treating id as a numeric variable rather than a factor, and the facets were all sharing one fixed y scale instead of each having its own.
An update: I needed to convert the month into a date for it to work. The full solution follows:
library(ggplot2)
library(TraMineR)
library(dplyr)
library(tidyr)
library(lubridate)
# Load the mvad example data and build the state sequences from column 15
# through the last column.
data(mvad)
mvad_seq <- seqdef(mvad, 15:length(mvad))
# "TRATE" substitution costs, "OM" distances, then Ward clustering cut into
# 6 clusters.
mvad_trate <- seqsubm(mvad_seq, method = "TRATE")
mvad_dist <- seqdist(mvad_seq, method = "OM", sm = mvad_trate)
cluster <- cutree(hclust(d = as.dist(mvad_dist), method = "ward.D2"), k = 6)
mvad$cluster <- cluster

mvad_long <- mvad %>%
  # Keep id plus the monthly columns, whose names end in ".<two digits>"
  select(id, matches("\\.\\d\\d")) %>%
  # Long format: one row per (id, month). pivot_longer() replaces the
  # superseded gather().
  pivot_longer(cols = -id, names_to = "month", values_to = "state") %>%
  # Attach each id's cluster
  inner_join(select(mvad, id, cluster), by = "id") %>%
  mutate(
    # A factor id lets every facet use its own y values
    id = factor(id),
    # "Jul.93" + "01" is parsed as month-year-day, giving a real date axis
    date = myd(paste0(month, "01"))
  )

mvad_long %>%
  # colour = state draws each tile's border in its fill colour
  ggplot(aes(x = date, y = id, fill = state, color = state)) +
  geom_tile() +
  facet_wrap(~cluster, scales = "free_y", ncol = 2) +
  theme_bw() +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid = element_blank()
  ) +
  scale_fill_brewer(palette = "Accent") +
  scale_colour_brewer(palette = "Accent") +
  labs(x = "", y = "")
The graph I'm currently trying to make falls a little between two stools. I want to make a histogram that is composed of stacked and labelled boxes. Here's an example of exactly the sort of thing I'm talking about, taken from a recent article in the New York Times:
http://farm8.staticflickr.com/7109/7026409819_1d2aaacd0a.jpg
Is it possible to achieve this using ggplot2?
To amplify the question somewhat, so far what I have is:
# Toy data: the 26 capital letters as names, each with a random percent
# drawn from a normal distribution with mean 15.
dfr <- data.frame(
name = LETTERS[1:26],
percent = rnorm(26, mean=15)
)
# Attempt: a bar layer on percent filled by name, plus a stat_bin text layer
# labelled by name.
ggplot(dfr, aes(x=percent, fill=name)) + geom_bar() +
stat_bin(geom="text", aes(label=name))
...which I'm clearly doing all wrong. Ultimately what I'd ideally like is something along the lines of the manually-modified graph below, with (say) letters A to M filled one shade and N to Z filled another.
http://farm8.staticflickr.com/7116/7026536711_4df9a1aa12.jpg
Here you go!
set.seed(3421)
# `type` is added to mimic which candidate each name supports
dfr <- data.frame(
  name = LETTERS[1:26],
  percent = rnorm(26, mean = 15),
  type = sample(c("A", "B"), 26, replace = TRUE)
)

# It is easier to prepare the data in advance:
# 1. assign every row to one of five equal-width histogram bins
# 2. per bin, store the bin count (freq) and stack the rows as unit-height
#    boxes, pos being the vertical centre of each row's box
# Base ave() is used so that no extra package has to be loaded.
dfr$perc_bin <- cut(dfr$percent, 5)
dfr$freq <- ave(seq_along(dfr$name), dfr$perc_bin, FUN = length)
dfr$pos <- ave(seq_along(dfr$name), dfr$perc_bin, FUN = seq_along) - 0.5

# Plotting. The key steps are:
# 1. draw one unit-height box per name at (perc_bin, pos), filled by type,
#    so each bar's total height equals its bin count and every label sits
#    on its own box
# 2. write the name at the centre of each box
# 3. keep empty bins on the x axis, as a histogram would
# 4. drop the grid, border, ticks and y-axis text
ggplot(dfr, aes(x = perc_bin, y = pos)) +
  geom_tile(
    aes(fill = type),
    colour = "gray",
    width = 0.9,
    height = 1,
    show.legend = FALSE
  ) +
  geom_text(aes(label = name), colour = "white") +
  scale_x_discrete(drop = FALSE) +
  scale_fill_manual(values = c("red", "orange")) +
  theme_bw() +
  xlab("") +
  ylab("") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_blank(),
    axis.text.y = element_blank()
  )