Related
I would be very grateful for any help. I have created a heatmap using Pheatmap. My measures are binary and I would like the annotation row colours (5 categories) to be the same as the data points. Currently I have one colour across the 5 categories. I have attached the chart produced by my code. I am not sure how to do this. Thanks in advance!
Here is my code and sample data:
library(pheatmap)
library(dplyr)
*Arrange cluster
spells2=spells%>%arrange(PAM_complete)
*Df for wheeze columns
whz=spells2%>%dplyr::select(2:6)
*Create separate df for cluster
c5=spells2$PAM_complete
c5=as.data.frame(c5)
*Wheeze and cluster need the same row names (id)
rownames(whz)=spells2$id
rownames(c5)=spells2$id
c5$c5=as.factor(c5$c5)
col=c("white", "darkblue")
pheatmap(whz,legend_breaks = 0:1, legend_labels = c("No wheeze", "Wheeze"), fontsize = 10,
show_rownames=FALSE, cluster_rows = FALSE, color=col,
cluster_cols=FALSE , annotation_row=c5, )
> dput(head(spells2, 50))
structure(list(id = c("10003A", "1001", "10012A", "10013A", "10016A",
"10019A", "1001A", "10023A", "1002A", "10037A", "1004", "10042A",
"10045A", "1005", "10051A", "10054A", "1006", "10064A", "10065A",
"10075A", "10076A", "10082A", "10087A", "10094A", "10095A", "10097A",
"10098A", "100A", "10103A", "10104A", "10106A", "10121A", "10124A",
"10126A", "10132A", "1013A", "10144A", "10146A", "1014A", "1015",
"10153A", "10156A", "10159A", "10161A", "1017", "10171A", "10175A",
"10178A", "1018", "10186A"), whz1 = c(0, 1, 0, 0, 0, 0, 0, 1,
0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0
), whz2 = c(1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0), whz3 = c(0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
0, 0, 0), whz4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), whz5 = c(0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
0, 0, 0, 0, 0, 0), PAM_complete = c("ETW", "ETW", "NWZ", "NWZ",
"LOW", "NWZ", "NWZ", "INT", "NWZ", "ETW", "NWZ", "PEW", "ETW",
"INT", "NWZ", "INT", "ETW", "NWZ", "ETW", "ETW", "NWZ", "ETW",
"ETW", "NWZ", "NWZ", "NWZ", "NWZ", "NWZ", "NWZ", "PEW", "NWZ",
"ETW", "NWZ", "INT", "NWZ", "INT", "NWZ", "INT", "NWZ", "LOW",
"PEW", "NWZ", "NWZ", "INT", "ETW", "NWZ", "ETW", "NWZ", "ETW",
"NWZ")), row.names = c(NA, -50L), class = c("tbl_df", "tbl",
"data.frame"))
>
If I understand you correctly, you have plot "B" below, but you want plot "A" (without the little gaps in between the plots). This is not a straightforward task using the pheatmap package. The approach I used to create plot "A" below might be suitable with some tweaking (basically, plot each group separately then paste them all together in a column). Otherwise, a simpler 'ggplot' method is included below.
library(tidyverse)
library(pheatmap)
library(cowplot)
spells2 <- as.data.frame(spells) %>%
arrange(PAM_complete)
#Df for wheeze columns
whz <- spells2 %>%
dplyr::select(2:6)
#Create separate df for cluster
c5 <- spells2$PAM_complete %>%
as.data.frame()
colnames(c5) <- "names"
#Wheeze and cluster need the same row names (id)
rownames(whz) <- spells2$id
rownames(c5) <- spells2$id
c5$names <- as.factor(c5$names)
combined <- cbind(c5, whz)
# To get the 'default' pheatmap colour scheme
gg_color_hue <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 75, c = 100)[1:n]
}
scales::show_col(gg_color_hue(5))
# Specify colours for each group
ann_colors = list(
names = c(ETW = "#FF9289", INT = "#FF8AFF",
LOW = "#00DB98", NWZ = "#00CBFF",
PEW = "#BEC100"))
# Generate the plots
col = c("grey95", "darkblue")
p <- pheatmap(whz, legend_breaks = 0:1,
legend_labels = c("No wheeze", "Wheeze"),
fontsize = 10, show_rownames = FALSE,
cluster_rows = FALSE, color = col,
cluster_cols = FALSE, annotation_row = c5)
col_1 <- c("grey95", "#FF9289")
p1 <- pheatmap(combined %>% filter(names == "ETW") %>% select(-c(names)),
show_rownames = FALSE, show_colnames = FALSE,
cluster_rows = FALSE, cluster_cols = FALSE,
legend = FALSE, annotation_legend = FALSE,
color = col_1, annotation_names_row = FALSE,
annotation_colors = ann_colors,
annotation_row = combined %>% filter(names == "ETW") %>% select(names))
col_2 <- c("grey95", "#FF8AFF")
p2 <- pheatmap(combined %>% filter(names == "INT") %>% select(-c(names)),
show_rownames = FALSE, show_colnames = FALSE,
cluster_rows = FALSE, cluster_cols = FALSE,
legend = FALSE, annotation_legend = FALSE,
color = col_2, annotation_names_row = FALSE,
annotation_colors = ann_colors, cellheight = 7,
annotation_row = combined %>% filter(names == "INT") %>% select(names))
col_3 <- c("grey95", "#00DB98")
p3 <- pheatmap(combined %>% filter(names == "LOW") %>% select(-c(names)),
show_rownames = FALSE, show_colnames = FALSE,
cluster_rows = FALSE, cluster_cols = FALSE,
legend = FALSE, annotation_legend = FALSE,
color = col_3, annotation_names_row = FALSE,
annotation_colors = ann_colors,
annotation_row = combined %>% filter(names == "LOW") %>% select(names))
# Because all whz values = 0 for NWZ,
# you need to change one value to '1'
# in order for pheatmap to generate a plot
combined[23,2] <- 1
col_4 <- c("grey95", "grey95")
p4 <- pheatmap(combined %>% filter(names == "NWZ") %>% select(-c(names)),
show_rownames = FALSE, show_colnames = FALSE,
cluster_rows = FALSE, cluster_cols = FALSE,
legend = FALSE, annotation_legend = FALSE,
color = col_4, annotation_names_row = FALSE,
annotation_colors = ann_colors,
annotation_row = combined %>% filter(names == "NWZ") %>% select(names))
col_5 <- c("grey95", "#BEC100")
p5 <- pheatmap(combined %>% filter(names == "PEW") %>% select(-c(names)),
show_rownames = FALSE,
cluster_rows = FALSE, cluster_cols = FALSE,
legend = FALSE, annotation_legend = FALSE,
color = col_5,
annotation_colors = ann_colors,
annotation_row = combined %>% filter(names == "PEW") %>% select(names))
heatmaps <- cowplot::plot_grid(p1[[4]], p2[[4]], p3[[4]],
p4[[4]], p5[[4]], ncol = 1,
rel_heights = c(1.3, 0.7, 0.3, 2.4, 0.8))
cowplot::plot_grid(heatmaps, p$gtable, ncol = 2, rel_widths = c(0.7, 1), labels = "AUTO")
EDIT
If you don't necessarily want to use pheatmap, ggplot2 geom_tile() would be a lot easier, e.g.
library(tidyverse)
my_levels <- rownames(combined)
my_colours <- c("#FF9289", "#FF8AFF", "#00DB98", "#00CBFF", "#BEC100")
combined %>%
rownames_to_column(var = "IDs") %>%
pivot_longer(cols = -c(IDs, names),
names_to = "Trial",
values_to = "Wheeze") %>%
rename(Group = names) %>%
mutate(IDs = factor(IDs, levels = my_levels)) %>%
ggplot() +
geom_tile(aes(y = rev(IDs),
x = Trial,
fill = Group,
alpha = Wheeze),
color = "black") +
scale_alpha_continuous(breaks = c(0, 1),
labels = c("No", "Yes")) +
scale_fill_manual(values = my_colours) +
theme_minimal() +
theme(panel.grid = element_blank())
EDIT 2
To include an 'annotation' bar before the plot, you can use this:
combined %>%
rownames_to_column(var = "IDs") %>%
pivot_longer(cols = -c(IDs, names),
names_to = "Trial",
values_to = "Wheeze") %>%
rename(Group = names) %>%
mutate(IDs = factor(IDs, levels = my_levels)) %>%
ggplot() +
geom_tile(aes(y = rev(IDs),
x = Trial,
fill = Group,
alpha = Wheeze),
color = "black") +
geom_tile(aes(x = -0.1, y = rev(IDs), fill = Group),
show.legend = FALSE) +
coord_cartesian(c(0.8, 5)) +
scale_fill_manual(values = my_colours) +
scale_alpha_continuous(breaks = c(0, 1),
labels = c("No", "Yes")) +
theme(plot.margin=unit(c(1,0,0,0), units="lines"))
I wasn't able to label it as "Groups", but I imagine it's possible if you tinker with it.
I have the following dataframe (df). I want to make a graph that shows the same positions of the cells in the dataframe and having each cell with a color depending on the value (-1:1). if the value is zero then the color is blue, if the value is 1 or -1 then the value is red.
df<- structure(list(`0` = 1:8, `1` = c(0.885, 0.695, 0.828, 0.888,
0.823, 0.231, 0.599, 0.153), `2` = c(0.834, 0.68, 0.857, 0.802,
0.734, 0.205, 0.62, 0.044), `3` = c(0.854, 0.66, 0.83, 0.829,
0.729, 0.159, 0.559, 0.081), `4` = c(0.87, 0.583, 0.778, 0.853,
0.75, 0.087, 0.515, -0.011), `5` = c(0.922, 0.739, 0.787, 0.805,
0.635, -0.017, 0.498, -0.204), `6` = c(0.815, 0.535, 0.833, 0.784,
0.803, 0.092, 0.502, -0.419), `7` = c(0.859, 0.517, 0.8, 0.829,
0.557, 0.22, 0.368, -0.42), `8` = c(0.86, 0.701, 0.701, 0.786,
0.567, 0.414, 0.324, -0.396), `9` = c(0.774, 0.781, 0.805, 0.862,
0.405, 0.852, 0.1, -0.448), `10` = c(0.869, 0.788, 0.837, 0.838,
0.481, 0, -0.072, -0.48), `11` = c(0.816, 0.795, 0.807, 0.744,
0.217, 0, 0.096, -0.346), `12` = c(0.829, 0.792, 0.774, 0.778,
0.003, 0, 0, 0), `13` = c(0.799, 0.84, 0.775, 0.66, -0.024, 0,
0, 0), `14` = c(0.842, 0.765, 0.852, 0.679, 0, 0, 0, 0), `15` = c(0.804,
0.811, 0.818, 0.468, 0, 0, 0, 0), `16` = c(0.801, 0.757, 0.715,
0.091, 0, 0, 0, 0), `17` = c(0.807, 0.786, 0.799, -0.042, 0,
0, 0, 0), `18` = c(0.595, 0.795, 0.73, 0, 0, 0, 0, 0), `19` = c(0.822,
0.789, 0.623, 0, 0, 0, 0, 0), `20` = c(0.829, 0.822, 0.048, 0,
0, 0, 0, 0), `21` = c(0.805, 0.788, -0.205, 0, 0, 0, 0, 0), `22` = c(0.788,
0.791, -0.065, 0, 0, 0, 0, 0), `23` = c(0.839, 0.786, -0.217,
0, 0, 0, 0, 0), `24` = c(0.804, 0.815, 0, 0, 0, 0, 0, 0), `25` = c(0.789,
0.784, 0, 0, 0, 0, 0, 0), `26` = c(0.754, 0.787, 0, 0, 0, 0,
0, 0), `27` = c(0.832, 0.741, 0, 0, 0, 0, 0, 0), `28` = c(0.846,
0.778, 0, 0, 0, 0, 0, 0), `29` = c(0.797, 0.69, 0, 0, 0, 0, 0,
0), `30` = c(0.843, 0.644, 0, 0, 0, 0, 0, 0), `31` = c(0.825,
0.622, 0, 0, 0, 0, 0, 0), `32` = c(0.824, 0.726, 0, 0, 0, 0,
0, 0), `33` = c(0.749, 0.493, 0, 0, 0, 0, 0, 0), `34` = c(0.774,
-0.082, 0, 0, 0, 0, 0, 0), `35` = c(0.652, -0.255, 0, 0, 0, 0,
0, 0), `36` = c(0.833, 0, 0, 0, 0, 0, 0, 0), `37` = c(0.795,
0, 0, 0, 0, 0, 0, 0), `38` = c(0.864, 0, 0, 0, 0, 0, 0, 0), `39` = c(0.226,
0, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -8L), class = "data.frame")
You don't have any cells that are -1 or 1, so there aren't any that would be red. You didn't mention what color the cells should be if they are neither -1, 0 or 1, so I have left these white:
library(ggplot2)
library(tidyr)
library(dplyr)
pivot_longer(df, -1) %>%
mutate(name = factor(as.numeric(name),
sort(unique(as.numeric(name))))) %>%
ggplot(aes(name, `0`, fill = ifelse(value == 0, "blue",
ifelse(abs(value) == 1, "red", "white")))) +
geom_tile(color = "black") +
scale_y_reverse(breaks = seq(nrow(df)), expand = c(0, 0)) +
scale_x_discrete(position = "top") +
scale_fill_identity() +
coord_equal() +
theme_classic() +
theme(axis.line = element_blank(),
axis.title = element_blank(),
legend.position = "none")
Edit
Update based on comment from OP:
library(ggplot2)
library(tidyr)
library(dplyr)
pivot_longer(df, -1) %>%
mutate(name = factor(as.numeric(name),
sort(unique(as.numeric(name))))) %>%
ggplot(aes(name, `0`, fill = value)) +
geom_tile(color = "black") +
scale_y_reverse(breaks = seq(nrow(df)), expand = c(0, 0)) +
scale_x_discrete(position = "top") +
scale_fill_gradient2(low = "red", mid = "blue", high = "red",
breaks = c(-1, 0, 1), limits = c(-1, 1)) +
coord_equal() +
theme_classic() +
theme(axis.line = element_blank(),
axis.title = element_blank())
legend.position = "none")
I have a dataframe that I want to plot a heatmap of:
dput(df)
structure(list(`0` = c(6.08, 7.91, 5.14, 2.23, 0.72, 0.19, 0.04,
0.01, 0, 0, 0), `1` = c(9.12, 11.86, 7.71, 3.34, 1.09, 0.28,
0.06, 0.01, 0, 0, 0), `2` = c(6.84, 8.89, 5.78, 2.5, 0.81, 0.21,
0.05, 0.01, 0, 0, 0), `3` = c(3.42, 4.45, 2.89, 1.25, 0.41, 0.11,
0.02, 0, 0, 0, 0), `4` = c(1.28, 1.67, 1.08, 0.47, 0.15, 0.04,
0.01, 0, 0, 0, 0), `5` = c(0.38, 0.5, 0.33, 0.14, 0.05, 0.01,
0, 0, 0, 0, 0), `6` = c(0.1, 0.13, 0.08, 0.04, 0.01, 0, 0, 0,
0, 0, 0), `7` = c(0.02, 0.03, 0.02, 0.01, 0, 0, 0, 0, 0, 0, 0
), `8` = c(0, 0.01, 0, 0, 0, 0, 0, 0, 0, 0, 0), `9` = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), `10 or more` = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0)), row.names = c("0", "1", "2", "3", "4", "5",
"6", "7", "8", "9", "10 or more"), class = "data.frame")
Now to plot the heatmap using ggplot2 this is how I approach the solution:
df %>%
as_tibble(rownames = "homeScore") %>%
pivot_longer(cols = -homeScore, names_to = "awayScore", values_to = "level") %>%
ggplot() +geom_tile(aes(x=homeScore, y=awayScore, fill = level))
The problem I face is that the columns and rows are being sorted on (0,1,10+,2,..) instead of (0,1,2,...10+). Here's the example:
How do I sort the values such that 10+ is the last for row and column, instead of the third?
As #Nate already mentioned you have to convert your vars to factors and put the levels in the right order. Instead of converting via factor(as.numeric(.)) (which converts "10 or more" to NA) I would recommend to make use of forcats::fct_relevel which allows you to change the order of the levels, e.g. forcats::fct_relevel(homeScore, "10 or more", after = 10) will change the order of the levels such that 10 or more becomes the last level. Try this:
library(ggplot2)
library(tidyr)
library(dplyr)
library(forcats)
df %>%
as_tibble(rownames = "homeScore") %>%
pivot_longer(cols = -homeScore, names_to = "awayScore", values_to = "level") %>%
mutate_at(vars(homeScore, awayScore), ~forcats::fct_relevel(.x, "10 or more", after = 10)) %>%
ggplot() +
geom_tile(aes(x=homeScore, y=awayScore, fill = level))
I have a list (lst3, subset below) and would like to do some calculations on it, e.g.:
lst4 <-lapply(lst3, function(x) aggregate(x[,5:ncol(x)], x[c(4)], FUN = mean)) #column means
lst5<-lapply(lst4,function(x) apply(x[,-c(1)],1,mean)) # get row mean
However, I am unable to get row mean without ignoring "Site".
I would like my final list to look like this:
lst5<-
[[1]]
Site x
G116 1.864233
[[2]]
Site x
GG16 2.064567
The essence is that the final list should have the above structure so that I can write my data to working directory using:
lapply(lst5,function(x)write.table(x,file=paste(getwd(),"summer",paste0(unique(x$Site),".csv"),
sep="/"),row.names=FALSE,quote=FALSE)) ### create a folder called "summer" and write files to directory###
Thanks,
AZ.
list(structure(list(Year = c(2005L, 2005L, 2005L), Month = c(8L,
8L, 8L), Day = 29:31, Site = structure(c(1L, 1L, 1L), .Label = "G116", class = "factor"),
Sim001 = c(8.4, 17.72, 6.03), Sim002 = c(0.27, 0, 0), Sim003 = c(2.83,
0.14, 0.1), Sim004 = c(0, 0, 0), Sim005 = c(0, 0.77, 0.28
), Sim006 = c(0, 0, 0), Sim007 = c(0, 0, 0), Sim008 = c(10.94,
4.77, 0), Sim009 = c(0, 0, 0), Sim010 = c(3.43, 2.74, 0.65
), Sim011 = c(0.36, 0, 2.75), Sim012 = c(26.91, 0, 2.16),
Sim013 = c(0.88, 1.33, 0.87), Sim014 = c(0, 0.86, 9.42),
Sim015 = c(0, 0.17, 1.15), Sim016 = c(0, 0, 0), Sim017 = c(0.13,
0, 0), Sim018 = c(0, 0, 6.72), Sim019 = c(8.45, 12.99, 23.72
), Sim020 = c(1.76, 0, 0), Sim021 = c(0, 0, 2.34), Sim022 = c(0,
0, 0), Sim023 = c(1.2, 0, 0.26), Sim024 = c(0.85, 0, 0),
Sim025 = c(0, 0, 0), Sim026 = c(2.05, 0.76, 5.03), Sim027 = c(0.78,
0, 0), Sim028 = c(1.2, 0, 0), Sim029 = c(22, 0.19, 0), Sim030 = c(0.12,
0, 0), Sim031 = c(3.1, 13.67, 0), Sim032 = c(0, 0, 17.88),
Sim033 = c(0, 0, 0), Sim034 = c(1.11, 0, 0), Sim035 = c(1.17,
1.41, 23.35), Sim036 = c(0, 0.48, 1.71), Sim037 = c(1.51,
11.1, 7.98), Sim038 = c(0, 0, 0), Sim039 = c(0, 0, 5.46),
Sim040 = c(5.21, 0, 0), Sim041 = c(0.1, 0.11, 0), Sim042 = c(0,
0.15, 5.23), Sim043 = c(0, 0, 0), Sim044 = c(0, 0.1, 0),
Sim045 = c(0, 0, 0), Sim046 = c(0, 0, 0), Sim047 = c(0, 0,
0.11), Sim048 = c(0, 0, 0), Sim049 = c(0, 0, 4.05), Sim050 = c(0,
0, 0), Sim051 = c(0, 0.12, 0), Sim052 = c(0.24, 2.58, 0),
Sim053 = c(3.63, 0, 0.17), Sim054 = c(10.94, 2.69, 0), Sim055 = c(0,
0, 0), Sim056 = c(0.24, 0.44, 8.27), Sim057 = c(0, 0, 0),
Sim058 = c(0, 0, 3.75), Sim059 = c(0.19, 11.06, 0), Sim060 = c(0,
0, 1.65), Sim061 = c(0, 4.95, 0), Sim062 = c(0.15, 0, 4.73
), Sim063 = c(2.99, 0.12, 1.28), Sim064 = c(0, 0, 0), Sim065 = c(0,
0, 0), Sim066 = c(0, 0, 0), Sim067 = c(0.11, 0.62, 0.56),
Sim068 = c(2.84, 0, 0), Sim069 = c(0, 0, 0), Sim070 = c(17.91,
0.11, 4.78), Sim071 = c(0, 0, 1.68), Sim072 = c(0, 0, 1.38
), Sim073 = c(1.68, 0, 0), Sim074 = c(0.53, 0, 2.87), Sim075 = c(0,
0, 0), Sim076 = c(2.58, 0.27, 0.11), Sim077 = c(0, 0, 0),
Sim078 = c(9.07, 3.13, 8.62), Sim079 = c(0.98, 0, 2.38),
Sim080 = c(3.4, 0, 0), Sim081 = c(0, 0, 4.57), Sim082 = c(1.87,
2.86, 0), Sim083 = c(21.76, 2.24, 0), Sim084 = c(0.45, 4.03,
0.39), Sim085 = c(0, 0, 0), Sim086 = c(0, 0, 0), Sim087 = c(0,
0, 17.12), Sim088 = c(5.05, 0, 0), Sim089 = c(0, 0, 1.4),
Sim090 = c(0.1, 0, 0), Sim091 = c(1.96, 0, 1.38), Sim092 = c(0,
0, 0), Sim093 = c(0, 0, 0), Sim094 = c(0, 0, 1.81), Sim095 = c(2.72,
7.16, 1.7), Sim096 = c(6.37, 0, 0), Sim097 = c(0, 1.12, 25.7
), Sim098 = c(0, 0, 0), Sim099 = c(0, 0, 0), Sim100 = c(6.77,
10.87, 2.6)), .Names = c("Year", "Month", "Day", "Site",
"Sim001", "Sim002", "Sim003", "Sim004", "Sim005", "Sim006", "Sim007",
"Sim008", "Sim009", "Sim010", "Sim011", "Sim012", "Sim013", "Sim014",
"Sim015", "Sim016", "Sim017", "Sim018", "Sim019", "Sim020", "Sim021",
"Sim022", "Sim023", "Sim024", "Sim025", "Sim026", "Sim027", "Sim028",
"Sim029", "Sim030", "Sim031", "Sim032", "Sim033", "Sim034", "Sim035",
"Sim036", "Sim037", "Sim038", "Sim039", "Sim040", "Sim041", "Sim042",
"Sim043", "Sim044", "Sim045", "Sim046", "Sim047", "Sim048", "Sim049",
"Sim050", "Sim051", "Sim052", "Sim053", "Sim054", "Sim055", "Sim056",
"Sim057", "Sim058", "Sim059", "Sim060", "Sim061", "Sim062", "Sim063",
"Sim064", "Sim065", "Sim066", "Sim067", "Sim068", "Sim069", "Sim070",
"Sim071", "Sim072", "Sim073", "Sim074", "Sim075", "Sim076", "Sim077",
"Sim078", "Sim079", "Sim080", "Sim081", "Sim082", "Sim083", "Sim084",
"Sim085", "Sim086", "Sim087", "Sim088", "Sim089", "Sim090", "Sim091",
"Sim092", "Sim093", "Sim094", "Sim095", "Sim096", "Sim097", "Sim098",
"Sim099", "Sim100"), row.names = 15947:15949, class = "data.frame"),
structure(list(Year = c(2005L, 2005L, 2005L), Month = c(8L,
8L, 8L), Day = 29:31, Site = structure(c(1L, 1L, 1L), .Label = "GG16", class = "factor"),
Sim001 = c(18.36, 0.33, 0.14), Sim002 = c(0, 10.92, 0
), Sim003 = c(0, 0, 0), Sim004 = c(0, 0, 1.7), Sim005 = c(0,
0, 0), Sim006 = c(0.91, 4.24, 0), Sim007 = c(0, 0, 0.22
), Sim008 = c(0.63, 2.9, 2.24), Sim009 = c(0, 0, 0),
Sim010 = c(0, 0, 6.91), Sim011 = c(0, 3.28, 10.18), Sim012 = c(8.39,
14.58, 45.62), Sim013 = c(2.87, 0.53, 0.11), Sim014 = c(9.15,
21.1, 0.66), Sim015 = c(0, 1.75, 2.2), Sim016 = c(0,
7.86, 0), Sim017 = c(0, 0, 0), Sim018 = c(0, 0, 0), Sim019 = c(0,
0, 0), Sim020 = c(0.39, 0, 0), Sim021 = c(0.13, 0, 1.05
), Sim022 = c(0, 0, 10.91), Sim023 = c(0.23, 0, 0), Sim024 = c(0.12,
0.83, 5.35), Sim025 = c(0, 0, 0), Sim026 = c(7.75, 0,
4.82), Sim027 = c(20.04, 0, 0), Sim028 = c(12.41, 0,
5.3), Sim029 = c(0, 0, 0), Sim030 = c(0, 0, 0), Sim031 = c(0,
8.06, 0), Sim032 = c(0, 0, 0), Sim033 = c(0, 0, 0), Sim034 = c(0.1,
0, 3.34), Sim035 = c(0, 4.34, 3.53), Sim036 = c(2.89,
0.27, 0), Sim037 = c(0, 0, 0), Sim038 = c(0, 0, 0), Sim039 = c(0,
0.11, 0), Sim040 = c(9.83, 1.55, 9.09), Sim041 = c(3.6,
0, 0), Sim042 = c(0, 0, 1.37), Sim043 = c(0, 0, 0), Sim044 = c(0,
0, 0), Sim045 = c(0, 0, 0), Sim046 = c(0, 0, 0), Sim047 = c(0,
20.52, 0.65), Sim048 = c(1.77, 0.67, 0), Sim049 = c(0,
0, 0), Sim050 = c(0, 0, 0), Sim051 = c(0, 4.9, 0), Sim052 = c(0.71,
11.34, 0), Sim053 = c(3.46, 2.59, 1.5), Sim054 = c(0,
23.63, 0), Sim055 = c(0, 16.48, 4.99), Sim056 = c(0,
0, 0), Sim057 = c(0, 0, 0), Sim058 = c(0, 0, 0), Sim059 = c(0,
0, 0), Sim060 = c(16.87, 0, 0), Sim061 = c(0, 3.43, 0
), Sim062 = c(0.45, 0, 0), Sim063 = c(0, 11.14, 7.22),
Sim064 = c(0, 0, 0), Sim065 = c(0, 0, 0), Sim066 = c(0,
16.08, 1.87), Sim067 = c(0, 0, 0), Sim068 = c(5.16, 0.88,
0.1), Sim069 = c(0, 0, 3.91), Sim070 = c(0, 0, 0), Sim071 = c(0.17,
0, 5.22), Sim072 = c(0, 0, 6.95), Sim073 = c(0, 0, 0),
Sim074 = c(0.14, 0, 0), Sim075 = c(0, 0, 0), Sim076 = c(0,
9.62, 0), Sim077 = c(0, 0, 0), Sim078 = c(1.65, 0, 0),
Sim079 = c(0.23, 8.41, 0.28), Sim080 = c(0.78, 0, 0),
Sim081 = c(0, 0, 0), Sim082 = c(0.11, 2.75, 0), Sim083 = c(0.26,
7.34, 5.92), Sim084 = c(0, 0, 4.27), Sim085 = c(0, 0,
0), Sim086 = c(0, 0, 0.1), Sim087 = c(27.18, 0.72, 28.29
), Sim088 = c(0, 0, 4.2), Sim089 = c(0, 9.37, 6.59),
Sim090 = c(0.21, 2.57, 0), Sim091 = c(0.45, 0, 0), Sim092 = c(0,
4.97, 0), Sim093 = c(1.43, 0, 0), Sim094 = c(0, 0, 2.15
), Sim095 = c(6, 0, 1.63), Sim096 = c(7.21, 0, 0), Sim097 = c(0,
0.39, 1.92), Sim098 = c(0, 0, 0), Sim099 = c(4.38, 0,
0), Sim100 = c(0, 0, 0)), .Names = c("Year", "Month",
"Day", "Site", "Sim001", "Sim002", "Sim003", "Sim004", "Sim005",
"Sim006", "Sim007", "Sim008", "Sim009", "Sim010", "Sim011",
"Sim012", "Sim013", "Sim014", "Sim015", "Sim016", "Sim017",
"Sim018", "Sim019", "Sim020", "Sim021", "Sim022", "Sim023",
"Sim024", "Sim025", "Sim026", "Sim027", "Sim028", "Sim029",
"Sim030", "Sim031", "Sim032", "Sim033", "Sim034", "Sim035",
"Sim036", "Sim037", "Sim038", "Sim039", "Sim040", "Sim041",
"Sim042", "Sim043", "Sim044", "Sim045", "Sim046", "Sim047",
"Sim048", "Sim049", "Sim050", "Sim051", "Sim052", "Sim053",
"Sim054", "Sim055", "Sim056", "Sim057", "Sim058", "Sim059",
"Sim060", "Sim061", "Sim062", "Sim063", "Sim064", "Sim065",
"Sim066", "Sim067", "Sim068", "Sim069", "Sim070", "Sim071",
"Sim072", "Sim073", "Sim074", "Sim075", "Sim076", "Sim077",
"Sim078", "Sim079", "Sim080", "Sim081", "Sim082", "Sim083",
"Sim084", "Sim085", "Sim086", "Sim087", "Sim088", "Sim089",
"Sim090", "Sim091", "Sim092", "Sim093", "Sim094", "Sim095",
"Sim096", "Sim097", "Sim098", "Sim099", "Sim100"), row.names = 15947:15949, class = "data.frame"))
You can go from lst3 directly to lst5 without the intermediate aggregate step:
lapply(lst3, function(df){
data.frame(Site = df$Site[1], x = mean(unlist(df[-c(1:4)])))
})
#[[1]]
# Site x
#1 G116 1.864233
#
#[[2]]
# Site x
#1 GG16 2.064567
Since you're calculating the mean of all columns except the first 4 columns and over all the rows of the other columns, it's quite easy to unlist the data, creating a single vector, and then using standard mean on it. Also, by skipping the lst4 step, this most likely be noticeably faster.
Or, as commented by Richard, a variation could be:
lapply(lst3, function(df){
data.frame(Site = df$Site[1], x = mean(colMeans(df[-c(1:4)])))
})
Benchmark:
library(microbenchmark)
microbenchmark(
f1 = {lapply(lst3, function(df){
data.frame(Site = df$Site[1], x = mean(unlist(df[-c(1:4)])))
})},
f2 = {lapply(lst3, function(df){
data.frame(Site = df$Site[1], x = mean(colMeans(df[-c(1:4)])))
})},
unit = "relative"
)
Unit: relative
expr min lq median uq max neval
f1 1.00000 1.000000 1.000000 1.000000 1.000000 100
f2 2.91545 2.937272 2.927799 2.894704 3.486007 100
Here's another option for your consideration:
library(reshape2)
x <- melt(lst3)
aggregate(value ~ Site, x[grepl("^Sim.*", x$variable),], FUN = mean)
# Site value
#1 G116 1.864233
#2 GG16 2.064567
Or the same concept but using dplyr:
library(dplyr)
filter(x, grepl("^Sim.*", variable)) %>% group_by(Site) %>% summarise(x = mean(value))
#Source: local data frame [2 x 2]
#
# Site x
#1 G116 1.864233
#2 GG16 2.064567
Of course, this could also be done using data.table, for example like this (there are probably several even slightly more efficient ways to do this in data.table):
library(data.table)
setDT(x)[grepl("^Sim.*", variable), list(x = mean(value)), by = Site]
# Site x
#1: G116 1.864233
#2: GG16 2.064567
I would like to ask you for the suggestions how I can edit my plot function to make my graph more clear ?
Here I show you the code which I use for plotting:
# open the pdf file
pdf(file='LSF1_PWD_GWD.pdf')
a <- c('LSF1', 'PWD', 'GWD')
rowsToPlot<-c(1066,2269,109)
matplot(as.matrix(t(tbl_alles[rowsToPlot,])),type=rep("l", length(rowsToPlot)), col=rainbow(length(rowsToPlot)),xlab = 'Fraction Size', ylab = 'Intensity')
legend('topright',a,lty=1, bty='n', cex=.75, col = rainbow(length(rowsToPlot)))
# close the pdf file
dev.off()
and that's how the graph looks like:
It's just a basic plot because I have no idea how to edit it. The arrow indicates three lines on one position which you can't see because they overlap... and that's the most important part of this graph for me. Maybe I shouldn't use dotted line ? How to change it ?
Data:
tbl_alles <-
structure(list("10" = c(0, 0, 0, 0, 0, 0),
"20" = c(0, 0, 0, 0, 0, 0),
"52.5" = c(0, 0, 0, 0, 0, 0),
"81" = c(0, 0, 1, 0, 0, 0),
"110" = c(0, 0, 0, 0, 0, 0),
"140.5" = c(0, 0, 0, 0, 0, 0),
"189" = c(0, 0, 0, 0, 0, 0),
"222.5" = c(0, 0, 0, 0, 0, 0 ),
"278" = c(0, 0, 0, 0, 0, 0),
"340" = c(0, 0, 0, 0, 0, 0),
"397" = c(0, 1, 0, 0, 0, 0),
"453.5" = c(0, 0.66069369, 0, 0, 0, 1),
"529" = c(0, 0.521435654, 0, 0, 1, 0),
"580" = c(0, 0.437291195, 0, 0, 1, 0),
"630.5" = c(0, 0.52204783, 0, 0, 0, 0),
"683.5" = c(0, 0.52429838, 0, 0, 0, 0),
"735.5" = c(1, 0.3768651, 0, 1, 0, 0),
"784" = c(0, 0, 0, 0, 0, 0),
"832" = c(0, 0, 0, 0, 0, 0),
"882.5" = c(0, 0, 0, 0, 0, 0),
"926.5" = c(0, 0, 0, 0, 0, 0),
"973" = c(0, 0, 0, 0, 0, 0),
"1108" = c(0, 0, 0, 0, 0, 0),
"1200" = c(0, 0, 0, 0, 0, 0)),
.Names = c("10", "20", "52.5", "81",
"110", "140.5","189", "222.5",
"278", "340", "397", "453.5",
"529", "580", "630.5", "683.5",
"735.5", "784", "832", "882.5",
"926.5", "973", "1108", "1200"),
row.names = c("at1g01050.1", "at1g01080.1",
"at1g01090.1","at1g01220.1",
"at1g01420.1", "at1g01470.1"),
class = "data.frame")
RowsToPlot:
> dput(tbl_alles[rowsToPlot,])
structure(list(`10` = c(0, 0, 0), `20` = c(0, 0, 0), `52.5` = c(0,
0, 0), `81` = c(0, 0, 0), `110` = c(0, 0, 0), `140.5` = c(0,
0, 0), `189` = c(0, 0, 0), `222.5` = c(0, 0, 0), `278` = c(0,
0, 0), `340` = c(0, 0, 0), `397` = c(0, 0, 0), `453.5` = c(0,
0, 0), `529` = c(0, 0, 0), `580` = c(0, 0, 0), `630.5` = c(0,
0, 0), `683.5` = c(0, 0, 0.57073483), `735.5` = c(0, 1, 0.85691826
), `784` = c(0, 0, 0.90706982), `832` = c(1, 1, 1), `882.5` = c(0,
0, 0), `926.5` = c(0, 0, 0), `973` = c(0, 0, 0), `1108` = c(0,
0, 0), `1200` = c(0, 0, 0)), .Names = c("10", "20", "52.5", "81",
"110", "140.5", "189", "222.5", "278", "340", "397", "453.5",
"529", "580", "630.5", "683.5", "735.5", "784", "832", "882.5",
"926.5", "973", "1108", "1200"), row.names = c("at3g01510.1",
"at5g26570.1", "at1g10760.1"), class = "data.frame")
Okay, here's a way to distinguish the lines clearly, while keeping everything on one plot. I use non solid linetypes and different sizes to 'make room' for the overlayed lines.
library(reshape2)
library(ggplot2)
dat <- as.data.frame(as.matrix(t(tbl_alles)))
dat$x <- as.numeric(row.names(dat))
ggplot(melt(dat, id.vars='x'), aes(x=x, y=value, group=variable)) +
geom_line(aes(color=variable, linetype=variable, size=variable)) +
scale_linetype_manual(values=c('solid', 'dotted', 'dashed')) +
scale_size_manual(values=c(1,3,1)) +
scale_color_manual(values=c('black', 'red', 'white')) +
theme(axis.text = element_text(color='black'),
panel.background = element_rect('grey'),
legend.key = element_rect('grey'),
panel.grid = element_blank()) +
labs(title='This is not a pretty chart, but you can make out the lines')
I took as a starting point your data from the dput you pasted above:
tbl_alles <- structure(list(`10` = c(0, 0, 0), `20` = c(0, 0, 0), `52.5` = c(0, 0, 0), `81` = c(0, 0, 0), `110` = c(0, 0, 0), `140.5` = c(0, 0, 0), `189` = c(0, 0, 0), `222.5` = c(0, 0, 0), `278` = c(0, 0, 0), `340` = c(0, 0, 0), `397` = c(0, 0, 0), `453.5` = c(0, 0, 0), `529` = c(0, 0, 0), `580` = c(0, 0, 0), `630.5` = c(0, 0, 0), `683.5` = c(0, 0, 0.57073483), `735.5` = c(0, 1, 0.85691826), `784` = c(0, 0, 0.90706982), `832` = c(1, 1, 1), `882.5` = c(0, 0, 0), `926.5` = c(0, 0, 0), `973` = c(0, 0, 0), `1108` = c(0, 0, 0), `1200` = c(0, 0, 0)), .Names = c("10", "20", "52.5", "81", "110", "140.5", "189", "222.5", "278", "340", "397", "453.5", "529", "580", "630.5", "683.5", "735.5", "784", "832", "882.5", "926.5", "973", "1108", "1200"), row.names = c("at3g01510.1", "at5g26570.1", "at1g10760.1"), class = "data.frame")
This is most certainly not what you need, but perhaps it can give you another idea.
X=structure(list(`10` = c(0, 0, 0), `20` = c(0, 0, 0), `52.5` = c(0,
0, 0), `81` = c(0, 0, 0), `110` = c(0, 0, 0), `140.5` = c(0,
0, 0), `189` = c(0, 0, 0), `222.5` = c(0, 0, 0), `278` = c(0,
0, 0), `340` = c(0, 0, 0), `397` = c(0, 0, 0), `453.5` = c(0,
0, 0), `529` = c(0, 0, 0), `580` = c(0, 0, 0), `630.5` = c(0,
0, 0), `683.5` = c(0, 0, 0.57073483), `735.5` = c(0, 1, 0.85691826
), `784` = c(0, 0, 0.90706982), `832` = c(1, 1, 1), `882.5` = c(0,
0, 0), `926.5` = c(0, 0, 0), `973` = c(0, 0, 0), `1108` = c(0,
0, 0), `1200` = c(0, 0, 0)), .Names = c("10", "20", "52.5", "81",
"110", "140.5", "189", "222.5", "278", "340", "397", "453.5",
"529", "580", "630.5", "683.5", "735.5", "784", "832", "882.5",
"926.5", "973", "1108", "1200"), row.names = c("at3g01510.1",
"at5g26570.1", "at1g10760.1"), class = "data.frame");
library(ggplot2)
library(reshape2)
library(data.table)
X.dt<-as.data.table(t(X))
X.dt[,X:=1:dim(X.dt)[1]]
X.dt<-melt(X.dt, id='X')
ggplot(X.dt,aes(X, value,group=variable,color=variable))+
geom_line()+
facet_wrap(~variable, nrow=3)+
guides(color=FALSE)+labs(x="X",y="Intensity")
Since you have a discrete number of x values, I suggest using a barplot instead. This will make the categories easier to distinguish and highlight the aspect you are most interested in.
First put the data in long format
dat <- structure(list(`10` = c(0, 0, 0), `20` = c(0, 0, 0), `52.5` = c(0, 0, 0),
`81` = c(0, 0, 0), `110` = c(0, 0, 0), `140.5` = c(0, 0, 0),
`189` = c(0, 0, 0), `222.5` = c(0, 0, 0), `278` = c(0, 0, 0),
`340` = c(0, 0, 0), `397` = c(0, 0, 0), `453.5` = c(0, 0, 0),
`529` = c(0, 0, 0), `580` = c(0, 0, 0), `630.5` = c(0, 0, 0),
`683.5` = c(0, 0, 0.57073483), `735.5` = c(0, 1, 0.85691826),
`784` = c(0, 0, 0.90706982), `832` = c(1, 1, 1),
`882.5` = c(0, 0, 0), `926.5` = c(0, 0, 0), `973` = c(0, 0, 0),
`1108` = c(0, 0, 0), `1200` = c(0, 0, 0)),
.Names = c("10", "20", "52.5", "81", "110", "140.5", "189",
"222.5", "278", "340", "397", "453.5", "529", "580",
"630.5", "683.5", "735.5", "784", "832", "882.5",
"926.5", "973", "1108", "1200"),
row.names = c("at3g01510.1", "at5g26570.1", "at1g10760.1"),
class = "data.frame")
library(tidyr)
dat$rowname <- rownames(dat)
ggdat <- gather(dat, key = "colname", value = "Intensity", -rowname)
Then create the barplot using ggplot2
library(RColorBrewer)
library(ggplot2)
colors <- brewer.pal(nrow(dat), "Dark2")
ggplot(data = ggdat, aes(x = colname, y = Intensity, fill = rowname)) +
geom_bar(aes(color = rowname), stat = "identity",
position = position_dodge(), width = 0.75) +
scale_fill_manual(values = colors) +
scale_color_manual(values = colors) +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
legend.position = "bottom")
The code could be used for more than 3 rows, although the bars will get harder to distinguish with more categories. If this is a problem, you could consider dropping/binning x values, or perhaps splitting the plot into two:
ggdat$group <- factor(ggdat$colname %in% colnames(dat)[1:12],
levels = c(TRUE, FALSE), labels = c("Low x", "High x"))
ggplot(data = ggdat, aes(x = colname, y = Intensity, fill = rowname)) +
geom_bar(aes(color = rowname), stat = "identity",
position = position_dodge(), width = 0.75) +
scale_fill_manual(values = colors) +
scale_color_manual(values = colors) +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
legend.position = "bottom") +
facet_wrap(~ group, ncol = 1, scales = "free_x")
How many records does the dataset have? It seems you are dealing with an overplotting issue. Follow #Nikos method to tidy the data.
Use size and alpha to change the size and transparency of the line.
ggplot(data = X.dt, aes(x = X, y = value, group = variable, color = variable)) +
geom_line(data = X.dt, aes(x = X, y = value, group = variable, color = variable),
size = 3, alpha = .25)
The color of the line changes as they overlap. However this will only work for smaller datasets. My only other suggestion is to overlay geom_line() with geom_point() that will plot points over the lines. You can use position = position_jitter() to slightly augment the position of the points, that way if they overlap you can see where they overlap.
ggplot(data = X.dt, aes(x = X, y = value, group = variable, color = variable)) +
geom_point(position = position_jitter(w = 0.001, h = 0.02), size = 3, alpha = .5) +
geom_line(data = X.dt, aes(x = X, y = value, group = variable, color = variable), size = 1, alpha = .25)
You can try to play with the line types but this can become really difficult if you have too much lines to see : is 3 the maximum you'll have ? Else, you may consider another way to draw your data.
Here is an example with your data, when I plot it, I can see the 3 lines :
matplot(as.matrix(t(tbl_alles[rowsToPlot,])),type="l",lwd=2,lty=c("solid","48","36"), col=rainbow(length(rowsToPlot)),xlab = 'Fraction Size', ylab = 'Intensity')
legend('topright',c('LSF1', 'PWD', 'GWD'),lty=c("solid","48","36"),lwd=2, bty='n', cex=.75, col = rainbow(length(rowsToPlot)))
the 3 line types :
solid: this is the default type, as you already know...
48: first 4 units of line then a blank of 8 units
36: first 3 units of line then a blank of 6 units.
I also changed the width of the line with lwd=2.
There is another parameter to play with : transparency.
If (keeping the different lty) you change the colors to c("#FF000030","#0000FF50","#00FF0080") for example, it will be easier to see every lines (the two last characters of each hexadecimal code specify the transparency).
If you use transparency, then you can even specify a unique color and ovelapping lines will appear darker : for example, col=#00000044".