Complexheatmap with multiple files plotting - r

I would like to use Complexheatmap for multiple files for plotting individual data frame or files .
So far I was able to do this as for small subset of files.
Reading files as list
list_of_files <- list.files('Model_hmap/',pattern = '\\.txt$', full.names = TRUE)
#Further arguments to read.csv can be passed in ...
#all_csv <- lapply(list_of_files,read_delim,delim = "\t", escape_double = FALSE,trim_ws = TRUE)
all_csv <- lapply(list_of_files,read.table,strip.white = FALSE,check.names = FALSE,header=TRUE,row.names=1)
#my_names = c("gene","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","UP_DOWN")
my_names = c("Symbol","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","UP_DOWN")
#my_names = c['X2']
#my_names = c("Peak","annotation","ENSEMBL","log2FoldChange","padj","UP_DOWN")
result_abd = lapply(all_csv, FUN = function(x) subset(x, select=-c(1:7,155)))
names(result_abd) <- gsub(".txt","",
list.files("Model_hmap/",full.names = FALSE),
fixed = TRUE)
Then Scaling the data
fun <- function(result_abd) {
p <- t(scale(t(result_abd[,1:ncol(result_abd)])))
}
p2 <- mapply(fun, result_abd, SIMPLIFY = FALSE)
Next step was to use the metadata which i would like to annotate my heat-map
My metadata is as such
dput(head(metadata))
structure(list(patient = c("TCGA-AB-2856", "TCGA-AB-2849", "TCGA-AB-2971",
"TCGA-AB-2930", "TCGA-AB-2891", "TCGA-AB-2872"), prior_malignancy = c("no",
"no", "no", "no", "no", "no"), FAB = c("M4", "M0", "M4", "M2",
"M1", "M3"), Risk_Cyto = c("Intermediate", "Poor", "Intermediate",
"Intermediate", "Poor", "Good")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
To read the above metadata I'm doing this below Im not sure if its the right way or approach.
list_of_files1 <- list.files('Model_hmap_meta/',pattern = '\\.txt$', full.names = TRUE)
#Further arguments to read.csv can be passed in ...
meta1 <- lapply(list_of_files1,read.table, row.names = 1,sep = "\t",header = TRUE)
Now I'm stuck at the above step Im not sure how do I pass the argument as list which i have done for the dataframe of my gene expression which I had calculated the zscore which is a list. So I think the metadata should be the same class if I have to use this .
For single file This is how I used to annotation into my final plot
metadata = read_delim("Model_hmap_meta/FAB_table.txt",delim = "\t", escape_double = FALSE,
trim_ws = TRUE)
head(metadata)
dim(metadata)
ann <- data.frame(metadata$FAB, metadata$Risk_Cyto)
colnames(ann) <- c('FAB', 'Risk_Cyto')
colours <- list('FAB' = c('M0' = 'red2', 'M1' = 'royalblue', 'M2'='gold','M3'='forestgreen','M4'='chocolate','M5'='Purple'),
'Risk_Cyto' = c('Good' = 'limegreen', 'Intermediate' = 'navy' , 'N.D.' ='magenta','Poor'='black'))
colAnn <- HeatmapAnnotation(df = ann,
which = 'col',
col = colours,
annotation_width = unit(c(1, 4), 'cm'),
gap = unit(1, 'mm'))
Now this is what I need to pass it to the list if I understand which I'm not able to do
My plotting function.
This is the code I use to plot.
hm1 <- Heatmap(heat,
col= colorRamp2(c(-2.6,-1,0,1,2.6),c("blue","skyblue","white","lightcoral","red")),
#heatmap_legend_param=list(at=c(-2.6,-1,0,1,2.6),color_bar="continuous",
# legend_direction="vertical", legend_width=unit(5,"cm"),
# title_position="topcenter", title_gp=gpar(fontsize=10, fontface="bold")),
name = "Z-score",
#Row annotation configurations
cluster_rows=T,
show_row_dend=FALSE,
row_title_side="right",
row_title_gp=gpar(fontsize=8),
show_row_names=FALSE,
row_names_side="left",
#Column annotation configuratiions
cluster_columns=T,
show_column_dend=T,
column_title="DE genes",
column_title_side="top",
column_title_gp=gpar(fontsize=15, fontface="bold"),
show_column_names = FALSE,
column_names_gp = gpar(fontsize = 12, fontface="bold"),
#Dendrogram configurations: columns
clustering_distance_columns="euclidean",
clustering_method_columns="complete",
column_dend_height=unit(10,"mm"),
#Dendrogram configurations: rows
clustering_distance_rows="euclidean",
clustering_method_rows="complete",
row_dend_width=unit(4,"cm"),
row_dend_side = "left",
row_dend_reorder = TRUE,
#Splits
border=T,
row_km = 1,
column_km = 1,
#plot params
#width = unit(5, "inch"),
#height = unit(4, "inch"),
#height = unit(0.4, "cm")*nrow(mat),
#Annotations
top_annotation = colAnn)
# plot heatmap
draw(hm1, annotation_legend_side = "right", heatmap_legend_side="right")
Objective
How do I wrap all the above into a small function where I can take input multiple files and plot them.
UPDATE
Data files
My data files my metadafile

Using the code you provided I made the following function (make_heatmap). Some of the read in statements are altered to match what I was working with on my machine. I also only used 2 of your files but it should work with all 4 that you're using.
This function will allow you to pass the counts matrix (which you normalize and set up before passing to the function). The assumption is that you're using the same metadata/annotation for each file you're passing. If you have different annotation files you could set up the heatmap annotation before the function and then pass that to the function. This is a bit more tedious though.
Usually the way that I set up my heatmap analyzes is that I have a script containing all of my functions (one for each type of heatmap I have to make) and then every time I need to make a new heatmap I have another script where I read in/prepare (ie median center) my counts matrix and then call the heatmap function I need.
list_of_files <- dir(pattern = 'MAP', full.names = TRUE)
#Further arguments to read.csv can be passed in ...
#all_csv <- lapply(list_of_files,read_delim,delim = "\t", escape_double = FALSE,trim_ws = TRUE)
all_csv <- lapply(list_of_files,read.table,strip.white = FALSE,check.names = FALSE,header=TRUE,row.names=1)
#my_names = c("gene","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","UP_DOWN")
my_names = c("Symbol","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","UP_DOWN")
#my_names = c['X2']
#my_names = c("Peak","annotation","ENSEMBL","log2FoldChange","padj","UP_DOWN")
result_abd = lapply(all_csv, FUN = function(x) subset(x, select=-c(1:7,155)))
names(result_abd) <- gsub(".txt","",
list.files("Model_hmap/",full.names = FALSE),
fixed = TRUE)
fun <- function(result_abd) {
p <- t(scale(t(result_abd[,1:ncol(result_abd)])))
}
p2 <- mapply(fun, result_abd, SIMPLIFY = FALSE)
# list_of_files1 <- list.files('Model_hmap_meta/',pattern = '\\.txt$', full.names = TRUE)
# #Further arguments to read.csv can be passed in ...
# meta1 <- lapply(list_of_files1,read.table, row.names = 1,sep = "\t",header = TRUE)
make_heatmap<-function(counts_matrix){
metadata = read.table("FAB_table.txt",sep = "\t", header=1)
head(metadata)
dim(metadata)
ann <- data.frame(metadata$FAB, metadata$Risk_Cyto)
colnames(ann) <- c('FAB', 'Risk_Cyto')
colours <- list('FAB' = c('M0' = 'red2', 'M1' = 'royalblue', 'M2'='gold','M3'='forestgreen','M4'='chocolate','M5'='Purple'),
'Risk_Cyto' = c('Good' = 'limegreen', 'Intermediate' = 'navy' , 'N.D.' ='magenta','Poor'='black'))
colAnn <- HeatmapAnnotation(df = ann,
which = 'col',
col = colours,
annotation_width = unit(c(1, 4), 'cm'),
gap = unit(1, 'mm'))
hm1 <- Heatmap(counts_matrix,
col= colorRamp2(c(-2.6,-1,0,1,2.6),c("blue","skyblue","white","lightcoral","red")),
#heatmap_legend_param=list(at=c(-2.6,-1,0,1,2.6),color_bar="continuous",
# legend_direction="vertical", legend_width=unit(5,"cm"),
# title_position="topcenter", title_gp=gpar(fontsize=10, fontface="bold")),
name = "Z-score",
#Row annotation configurations
cluster_rows=T,
show_row_dend=FALSE,
row_title_side="right",
row_title_gp=gpar(fontsize=8),
show_row_names=FALSE,
row_names_side="left",
#Column annotation configuratiions
cluster_columns=T,
show_column_dend=T,
column_title="DE genes",
column_title_side="top",
column_title_gp=gpar(fontsize=15, fontface="bold"),
show_column_names = FALSE,
column_names_gp = gpar(fontsize = 12, fontface="bold"),
#Dendrogram configurations: columns
clustering_distance_columns="euclidean",
clustering_method_columns="complete",
column_dend_height=unit(10,"mm"),
#Dendrogram configurations: rows
clustering_distance_rows="euclidean",
clustering_method_rows="complete",
row_dend_width=unit(4,"cm"),
row_dend_side = "left",
row_dend_reorder = TRUE,
#Splits
border=T,
row_km = 1,
column_km = 1,
#plot params
#width = unit(5, "inch"),
#height = unit(4, "inch"),
#height = unit(0.4, "cm")*nrow(mat),
#Annotations
top_annotation = colAnn)
# plot heatmap
draw(hm1, annotation_legend_side = "right", heatmap_legend_side="right")
}
make_heatmap(as.matrix(p2[[1]])) #just call the function with the counts matrix
make_heatmap(as.matrix(p2[[2]]))
If you need to output the heatmap to a pdf or something, you can do that before calling the function or you can put that command inside of the heatmap function (just make sure to call dev.off() inside the function too in that case).

Related

Extension to answered question: R - Defining a function which recognises arguments not as objects, but as being part of the call

I've followed the excellent answer, as described here: https://stackoverflow.com/a/59987272/7493594
But how can I make it work with ggadjustedcurves?
myfun <- function(TimeVar, EventVar, CoxVar, CoxData){
TimeVar <- as.name(TimeVar)
EventVar <- as.name(EventVar)
CoxVar <- as.name(CoxVar)
CoxModel <- eval(bquote(coxph(Surv(.(TimeVar), .(EventVar)) ~.(CoxVar), data = .(CoxData))))
ggadjustedcurves(CoxModel,
variable = CoxVar,
xlab = "Years",
ylab = "Survival",
ggtheme = theme_survminer(),
size = 2, palette = "lancet",
data = CoxData)
I guess you're after something like this?
library(survminer)
library(survival)
myfun <- function(TimeVar, EventVar, CoxVar, CoxData) {
TimeVar <- as.name(TimeVar)
EventVar <- as.name(EventVar)
CoxVar_char <- CoxVar # Need to store `CoxVar` as string
CoxVar <- as.name(CoxVar)
CoxData <- as.name(CoxData)
CoxModel <- eval(bquote(
coxph(Surv(.(TimeVar), .(EventVar)) ~.(CoxVar), data = .(CoxData))))
ggadjustedcurves(
CoxModel,
variable = CoxVar_char, # `variable` needs to be a string
xlab = "Years",
ylab = "Survival",
ggtheme = theme_survminer(),
size = 2, palette = "lancet",
data = eval(CoxData)) # Eval `CoxData` as symbol
}
myfun("stop", "event", "size", "bladder")

HeatMap: how to cluster only the rows and keep order of the heatmap's column labels as same as in the df?

I wanna plot a heatmap and cluster only the rows (i.e. genes in this tydf1).
Also, wanna keep order of the heatmap's column labels as same as in the df (i.e. tydf1)?
Sample data
df1 <- structure(list(Gene = c("AA", "PQ", "XY", "UBQ"), X_T0_R1 = c(1.46559502, 0.220140568, 0.304127515, 1.098842127), X_T0_R2 = c(1.087642983, 0.237500819, 0.319844338, 1.256624804), X_T0_R3 = c(1.424945196, 0.21066267, 0.256496284, 1.467120048), X_T1_R1 = c(1.289943948, 0.207778662, 0.277942721, 1.238400358), X_T1_R2 = c(1.376535013, 0.488774258, 0.362562315, 0.671502431), X_T1_R3 = c(1.833390311, 0.182798731, 0.332856558, 1.448757569), X_T2_R1 = c(1.450753714, 0.247576125, 0.274415259, 1.035410946), X_T2_R2 = c(1.3094609, 0.390028842, 0.352460646, 0.946426593), X_T2_R3 = c(0.5953716, 1.007079177, 1.912258811, 0.827119776), X_T3_R1 = c(0.7906009, 0.730242116, 1.235644748, 0.832287694), X_T3_R2 = c(1.215333041, 1.012914813, 1.086362205, 1.00918082), X_T3_R3 = c(1.069312467, 0.780421013, 1.002313082, 1.031761442), Y_T0_R1 = c(0.053317766, 3.316414959, 3.617213894, 0.788193798), Y_T0_R2 = c(0.506623748, 3.599442788, 1.734075583, 1.179462912), Y_T0_R3 = c(0.713670106, 2.516735845, 1.236204882, 1.075393433), Y_T1_R1 = c(0.740998252, 1.444496448, 1.077023349, 0.869258744), Y_T1_R2 = c(0.648231834, 0.097957459, 0.791438659, 0.428805547), Y_T1_R3 = c(0.780499252, 0.187840968, 0.820430227, 0.51636582), Y_T2_R1 = c(0.35344654, 1.190274584, 0.401845911, 1.223534348), Y_T2_R2 = c(0.220223951, 1.367784148, 0.362815405, 1.102117612), Y_T2_R3 = c(0.432856978, 1.403057729, 0.10802472, 1.304233845), Y_T3_R1 = c(0.234963735, 1.232129062, 0.072433381, 1.203096462), Y_T3_R2 = c(0.353770497, 0.885122768, 0.011662112, 1.188149743), Y_T3_R3 = c(0.396091395, 1.333921747, 0.192594116, 1.838029829), Z_T0_R1 = c(0.398000559, 1.286528398, 0.129147097, 1.452769794), Z_T0_R2 = c(0.384759325, 1.122251177, 0.119475721, 1.385513609), Z_T0_R3 = c(1.582230097, 0.697419716, 2.406671502, 0.477415567), Z_T1_R1 = c(1.136843842, 0.804552001, 2.13213228, 0.989075996), Z_T1_R2 = c(1.275683837, 1.227821594, 0.31900326, 0.835941568), Z_T1_R3 = c(0.963349308, 0.968589683, 1.706670339, 0.807060135), Z_T2_R1 = c(3.765036263, 0.477443352, 1.712841882, 0.469173869), Z_T2_R2 = c(1.901023385, 0.832736132, 2.223429427, 0.593558769), Z_T2_R3 = c(1.407713024, 0.911920317, 2.011259223, 0.692553388), Z_T3_R1 = c(0.988333629, 1.095130142, 1.648598854, 0.629915612), Z_T3_R2 = c(0.618606729, 0.497458337, 0.549147265, 1.249492088), Z_T3_R3 = c(0.429823986, 0.471389536, 0.977124788, 1.136635484)), row.names = c(NA, -4L ), class = c("data.table", "data.frame"))
Scripts used
library(dplyr)
library(stringr)
library(tidyr)
gdf1 <- gather(df1, "group", "Expression", -Gene)
gdf1$tgroup <- apply(str_split_fixed(gdf1$group, "_", 3)[, c(1, 2)],
1, paste, collapse ="_")
library(dplyr)
tydf1 <- gdf1 %>%
group_by(Gene, tgroup) %>%
summarize(expression_mean = mean(Expression)) %>%
spread(., tgroup, expression_mean)
#1 heatmap script is being used
library(tidyverse)
tydf1 <- tydf1 %>%
as.data.frame() %>%
column_to_rownames(var=colnames(tydf1)[1])
library(gplots)
library(vegan)
randup.m <- as.matrix(tydf1)
scaleRYG <- colorRampPalette(c("red","yellow","darkgreen"),
space = "rgb")(30)
data.dist <- vegdist(randup.m, method = "euclidean")
row.clus <- hclust(data.dist, "aver")
heatmap.2(randup.m, Rowv = as.dendrogram(row.clus),
dendrogram = "row", col = scaleRYG, margins = c(7,10),
density.info = "none", trace = "none", lhei = c(2,6),
colsep = 1:3, sepcolor = "black", sepwidth = c(0.001,0.0001),
xlab = "Identifier", ylab = "Rows")
#2 heatmap script is being used
df2 <- as.matrix(tydf1[, -1])
heatmap(df2)
Also, I want to add a color key.
It is still unclear to me, what the desired output is. There are some notes:
You don't need to use vegdist() to calculate distance matrix for your hclust() call. Because if you check all(vegdist(randup.m, method = "euclidian") == dist(randup.m)) it returns TRUE;
Specifying Colv = F in your heatmap.2() call will prevent reordering of the columns (default is TRUE);
Maybe it is better to scale your data by row (see the uncommented row);
Your call of heatmap.2() returns the heatmap with color key.
So summing it up - in your first script you just miss the Colv = F argument, and after a little adjustment it looks like this:
heatmap.2(randup.m,
Rowv = as.dendrogram(row.clus),
Colv = F,
dendrogram = "row",
#scale = "row",
col = scaleRYG,
density.info = "none",
trace = "none",
srtCol = -45,
adjCol = c(.1, .5),
xlab = "Identifier",
ylab = "Rows"
)
However I am still not sure - is it what you need?

'use.edge.length = FALSE' doesn't seem to work when using plotBranchbyTrait() in phytools

I'm trying to create a phylogeny where the branch lengths that I've coded are represented by colour rather than length. So I want the branch lengths to be equal.
Here is my code:
plotBranchbyTrait(tree.scaled, tree.scaled$edge.length, mode=c("edges"),palette="rainbow", use.edge.length = FALSE, node.depth = 2)
It's my understanding that use.edge.length = FALSE should make the branch lengths equal, and it does this if I code the tree using plot.phylo(). But the tree still shows up with the branch lengths when I use plotBranchbyTrait(). Anyone know how to get around this?
Unfortunately, optional arguments (...) are not directly passed to plot.phylo in the plotBranchbyTrait function. One non-elegant way to fix that is to modify the body directly in R to add a hard coded use.edge.length = FALSE option.
You can do this by creating a new function and modify it using body(foo)[[line_of_interest]] <- substitute(my_new_line <- that_does_something). The following example should work:
## Back up the function
plotBranchbyTrait_no_edge_length <- phytools::plotBranchbyTrait
## The line to modify:
body(plotBranchbyTrait_no_edge_length)[[34]]
# xx <- plot.phylo(tree, type = type, show.tip.label = show.tip.label,
# show.node.label = show.node.label, edge.color = colors, edge.width = edge.width,
# edge.lty = edge.lty, font = font, cex = cex, adj = adj, srt = srt,
# no.margin = no.margin, root.edge = root.edge, label.offset = label.offset,
# underscore = underscore, x.lim = x.lim, y.lim = y.lim, direction = direction,
# lab4ut = lab4ut, tip.color = tip.color, plot = plot, rotate.tree = rotate.tree,
# open.angle = open.angle, lend = 2, new = FALSE)
## Modify the line 34 by adding `use.edge.length = FALSE`
body(plotBranchbyTrait_no_edge_length)[[34]] <- substitute( xx <- plot.phylo(use.edge.length = FALSE, tree, type = type, show.tip.label = show.tip.label, show.node.label = show.node.label, edge.color = colors, edge.width = edge.width, edge.lty = edge.lty, font = font, cex = cex, adj = adj, srt = srt, no.margin = no.margin, root.edge = root.edge, label.offset = label.offset, underscore = underscore, x.lim = x.lim, y.lim = y.lim, direction = direction, lab4ut = lab4ut, tip.color = tip.color, plot = plot, rotate.tree = rotate.tree, open.angle = open.angle, lend = 2, new = FALSE) )
## Testing whether it worked
library(phytools)
tree <- pbtree(n=50)
x <- fastBM(tree)
## With use.edge.length = TRUE (default)
plotBranchbyTrait(tree, x, mode = "tips", edge.width = 4, prompt = FALSE)
## With use.edge.length = FALSE
plotBranchbyTrait_no_edge_length(tree, x, mode = "tips", edge.width = 4, prompt = FALSE)
You can find more on how to modify functions here.

plot_generic() function in plot_roc_components not found in R

I am using plot_roc_components function from rmda package. The definition of it has plot_generic() function. But, I am not able to find definition of this function. Why is it so?
The reason for it to see if there is an option for legend.size(). plot_roc_components gives me figure, however, I want to change the legend size. There is an option for legend.position, but not for its font size.
Could you please explain?
Thanks!
https://github.com/mdbrown/rmda/blob/57553a4cf5b6972176a0603b412260e367147619/R/plot_functions_sub.R
You were looking in one file but it was defined in another file.
plot_generic<- function(xx, predictors, value, plotNew,
standardize, confidence.intervals,
cost.benefit.axis = TRUE, cost.benefits, n.cost.benefits,
cost.benefit.xlab, xlab, ylab,
col, lty, lwd,
xlim, ylim, legend.position,
lty.fpr = 2, lty.tpr = 1,
tpr.fpr.legend = FALSE,
impact.legend = FALSE,
impact.legend.2 = FALSE,
population.size = 1000,
policy = policy, ...){
## xx is output from get_DecisionCurve,
## others are directly from the function call
#save old par parameters and reset them once the function exits.
old.par<- par("mar"); on.exit(par(mar = old.par))
xx.wide <- reshape::cast(xx, thresholds~model, value = value, add.missing = TRUE, fill = NA)
xx.wide$thresholds <- as.numeric(as.character(xx.wide$thresholds))
if(is.numeric(confidence.intervals)){
val_lower <- paste(value, "lower", sep = "_")
val_upper <- paste(value, "upper", sep = "_")
xx.lower <- cast(xx, thresholds~model, value = val_lower, add.missing = TRUE, fill = NA)
xx.upper <- cast(xx, thresholds~model, value = val_upper, add.missing = TRUE, fill = NA)
xx.lower$thresholds <- as.numeric(as.character(xx.lower$thresholds))
xx.upper$thresholds <- as.numeric(as.character(xx.upper$thresholds))
}
# adjust margins to add extra x-axis
if(cost.benefit.axis) par(mar = c(7.5, 4, 3, 2) + 0.1)
#set default ylim if not provided
#initial call to plot and add gridlines

Wind rose with ggplot (R)?

I am looking for good R code (or package) that uses ggplot2 to create wind roses that show the frequency, magnitude and direction of winds.
I'm particularly interested in ggplot2 as building the plot that way gives me the chance to leverage the rest of the functionality in there.
Test data
Download a year of weather data from the 80-m level on the National Wind Technology's "M2" tower. This link will create a .csv file that is automatically downloaded. You need to find that file (it's called "20130101.csv"), and read it in.
# read in a data file
data.in <- read.csv(file = "A:/drive/somehwere/20130101.csv",
col.names = c("date","hr","ws.80","wd.80"),
stringsAsFactors = FALSE))
This would work with any .csv file and will overwrite the column names.
Sample data
If you don't want to download that data, here are 10 data points that we will use to demo the process:
data.in <- structure(list(date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "1/1/2013", class = "factor"), hr = 1:9, ws.80 = c(5,
7, 7, 51.9, 11, 12, 9, 11, 17), wd.80 = c(30, 30, 30, 180, 180,
180, 269, 270, 271)), .Names = c("date", "hr", "ws.80", "wd.80"
), row.names = c(NA, -9L), class = "data.frame")
For sake of argument we'll assume that we are using the data.in data frame, which has two data columns and some kind of date / time information. We'll ignore the date and time information initially.
The ggplot function
I've coded the function below. I'm interested in other people's experience or suggestions on how to improve this.
# WindRose.R
require(ggplot2)
require(RColorBrewer)
plot.windrose <- function(data,
spd,
dir,
spdres = 2,
dirres = 30,
spdmin = 2,
spdmax = 20,
spdseq = NULL,
palette = "YlGnBu",
countmax = NA,
debug = 0){
# Look to see what data was passed in to the function
if (is.numeric(spd) & is.numeric(dir)){
# assume that we've been given vectors of the speed and direction vectors
data <- data.frame(spd = spd,
dir = dir)
spd = "spd"
dir = "dir"
} else if (exists("data")){
# Assume that we've been given a data frame, and the name of the speed
# and direction columns. This is the format we want for later use.
}
# Tidy up input data ----
n.in <- NROW(data)
dnu <- (is.na(data[[spd]]) | is.na(data[[dir]]))
data[[spd]][dnu] <- NA
data[[dir]][dnu] <- NA
# figure out the wind speed bins ----
if (missing(spdseq)){
spdseq <- seq(spdmin,spdmax,spdres)
} else {
if (debug >0){
cat("Using custom speed bins \n")
}
}
# get some information about the number of bins, etc.
n.spd.seq <- length(spdseq)
n.colors.in.range <- n.spd.seq - 1
# create the color map
spd.colors <- colorRampPalette(brewer.pal(min(max(3,
n.colors.in.range),
min(9,
n.colors.in.range)),
palette))(n.colors.in.range)
if (max(data[[spd]],na.rm = TRUE) > spdmax){
spd.breaks <- c(spdseq,
max(data[[spd]],na.rm = TRUE))
spd.labels <- c(paste(c(spdseq[1:n.spd.seq-1]),
'-',
c(spdseq[2:n.spd.seq])),
paste(spdmax,
"-",
max(data[[spd]],na.rm = TRUE)))
spd.colors <- c(spd.colors, "grey50")
} else{
spd.breaks <- spdseq
spd.labels <- paste(c(spdseq[1:n.spd.seq-1]),
'-',
c(spdseq[2:n.spd.seq]))
}
data$spd.binned <- cut(x = data[[spd]],
breaks = spd.breaks,
labels = spd.labels,
ordered_result = TRUE)
# clean up the data
data. <- na.omit(data)
# figure out the wind direction bins
dir.breaks <- c(-dirres/2,
seq(dirres/2, 360-dirres/2, by = dirres),
360+dirres/2)
dir.labels <- c(paste(360-dirres/2,"-",dirres/2),
paste(seq(dirres/2, 360-3*dirres/2, by = dirres),
"-",
seq(3*dirres/2, 360-dirres/2, by = dirres)),
paste(360-dirres/2,"-",dirres/2))
# assign each wind direction to a bin
dir.binned <- cut(data[[dir]],
breaks = dir.breaks,
ordered_result = TRUE)
levels(dir.binned) <- dir.labels
data$dir.binned <- dir.binned
# Run debug if required ----
if (debug>0){
cat(dir.breaks,"\n")
cat(dir.labels,"\n")
cat(levels(dir.binned),"\n")
}
# deal with change in ordering introduced somewhere around version 2.2
if(packageVersion("ggplot2") > "2.2"){
cat("Hadley broke my code\n")
data$spd.binned = with(data, factor(spd.binned, levels = rev(levels(spd.binned))))
spd.colors = rev(spd.colors)
}
# create the plot ----
p.windrose <- ggplot(data = data,
aes(x = dir.binned,
fill = spd.binned)) +
geom_bar() +
scale_x_discrete(drop = FALSE,
labels = waiver()) +
coord_polar(start = -((dirres/2)/360) * 2*pi) +
scale_fill_manual(name = "Wind Speed (m/s)",
values = spd.colors,
drop = FALSE) +
theme(axis.title.x = element_blank())
# adjust axes if required
if (!is.na(countmax)){
p.windrose <- p.windrose +
ylim(c(0,countmax))
}
# print the plot
print(p.windrose)
# return the handle to the wind rose
return(p.windrose)
}
Proof of Concept and Logic
We'll now check that the code does what we expect. For this, we'll use the simple set of demo data.
# try the default settings
p0 <- plot.windrose(spd = data.in$ws.80,
dir = data.in$wd.80)
This gives us this plot:
So: we've correctly binned the data by direction and wind speed, and have coded up our out-of-range data as expected. Looks good!
Using this function
Now we load the real data. We can load this from the URL:
data.in <- read.csv(file = "http://midcdmz.nrel.gov/apps/plot.pl?site=NWTC&start=20010824&edy=26&emo=3&eyr=2062&year=2013&month=1&day=1&endyear=2013&endmonth=12&endday=31&time=0&inst=21&inst=39&type=data&wrlevel=2&preset=0&first=3&math=0&second=-1&value=0.0&user=0&axis=1",
col.names = c("date","hr","ws.80","wd.80"))
or from file:
data.in <- read.csv(file = "A:/blah/20130101.csv",
col.names = c("date","hr","ws.80","wd.80"))
The quick way
The simple way to use this with the M2 data is to just pass in separate vectors for spd and dir (speed and direction):
# try the default settings
p1 <- plot.windrose(spd = data.in$ws.80,
dir = data.in$wd.80)
Which gives us this plot:
And if we want custom bins, we can add those as arguments:
p2 <- plot.windrose(spd = data.in$ws.80,
dir = data.in$wd.80,
spdseq = c(0,3,6,12,20))
Using a data frame and the names of columns
To make the plots more compatible with ggplot(), you can also pass in a data frame and the name of the speed and direction variables:
p.wr2 <- plot.windrose(data = data.in,
spd = "ws.80",
dir = "wd.80")
Faceting by another variable
We can also plot the data by month or year using ggplot's faceting capability. Let's start by getting the time stamp from the date and hour information in data.in, and converting to month and year:
# first create a true POSIXCT timestamp from the date and hour columns
data.in$timestamp <- as.POSIXct(paste0(data.in$date, " ", data.in$hr),
tz = "GMT",
format = "%m/%d/%Y %H:%M")
# Convert the time stamp to years and months
data.in$Year <- as.numeric(format(data.in$timestamp, "%Y"))
data.in$month <- factor(format(data.in$timestamp, "%B"),
levels = month.name)
Then you can apply faceting to show how the wind rose varies by month:
# recreate p.wr2, so that includes the new data
p.wr2 <- plot.windrose(data = data.in,
spd = "ws.80",
dir = "wd.80")
# now generate the faceting
p.wr3 <- p.wr2 + facet_wrap(~month,
ncol = 3)
# and remove labels for clarity
p.wr3 <- p.wr3 + theme(axis.text.x = element_blank(),
axis.title.x = element_blank())
Comments
Some things to note about the function and how it can be used:
The inputs are:
vectors of speed (spd) and direction (dir) or the name of the data frame and the names of the columns that contain the speed and direction data.
optional values of the bin size for wind speed (spdres) and direction (dirres).
palette is the name of a colorbrewer sequential palette,
countmax sets the range of the wind rose.
debug is a switch (0,1,2) to enable different levels of debugging.
I wanted to be able to set the maximum speed (spdmax) and the count (countmax) for the plots so that I can compare windroses from different data sets
If there are wind speeds that exceed (spdmax), those are added as a grey region (see the figure). I should probably code something like spdmin as well, and color-code regions where the wind speeds are less than that.
Following a request, I implemented a method to use custom wind speed bins. They can be added using the spdseq = c(1,3,5,12) argument.
You can remove the degree bin labels using the usual ggplot commands to clear the x axis: p.wr3 + theme(axis.text.x = element_blank(),axis.title.x = element_blank()).
At some point recently ggplot2 changed the ordering of bins, so that the plots didn't work. I think this was version 2.2. But, if your plots look a bit weird, change the code so that test for "2.2" is maybe "2.1", or "2.0".
Here is my version of the code. I added labels for directions (N, NNE, NE, ENE, E....) and made the y label to show frequency in percent instead of counts.
Click here to see figure of wind Rose with directions and frequency (%)
# WindRose.R
require(ggplot2)
require(RColorBrewer)
require(scales)
plot.windrose <- function(data,
spd,
dir,
spdres = 2,
dirres = 22.5,
spdmin = 2,
spdmax = 20,
spdseq = NULL,
palette = "YlGnBu",
countmax = NA,
debug = 0){
# Look to see what data was passed in to the function
if (is.numeric(spd) & is.numeric(dir)){
# assume that we've been given vectors of the speed and direction vectors
data <- data.frame(spd = spd,
dir = dir)
spd = "spd"
dir = "dir"
} else if (exists("data")){
# Assume that we've been given a data frame, and the name of the speed
# and direction columns. This is the format we want for later use.
}
# Tidy up input data ----
n.in <- NROW(data)
dnu <- (is.na(data[[spd]]) | is.na(data[[dir]]))
data[[spd]][dnu] <- NA
data[[dir]][dnu] <- NA
# figure out the wind speed bins ----
if (missing(spdseq)){
spdseq <- seq(spdmin,spdmax,spdres)
} else {
if (debug >0){
cat("Using custom speed bins \n")
}
}
# get some information about the number of bins, etc.
n.spd.seq <- length(spdseq)
n.colors.in.range <- n.spd.seq - 1
# create the color map
spd.colors <- colorRampPalette(brewer.pal(min(max(3,
n.colors.in.range),
min(9,
n.colors.in.range)),
palette))(n.colors.in.range)
if (max(data[[spd]],na.rm = TRUE) > spdmax){
spd.breaks <- c(spdseq,
max(data[[spd]],na.rm = TRUE))
spd.labels <- c(paste(c(spdseq[1:n.spd.seq-1]),
'-',
c(spdseq[2:n.spd.seq])),
paste(spdmax,
"-",
max(data[[spd]],na.rm = TRUE)))
spd.colors <- c(spd.colors, "grey50")
} else{
spd.breaks <- spdseq
spd.labels <- paste(c(spdseq[1:n.spd.seq-1]),
'-',
c(spdseq[2:n.spd.seq]))
}
data$spd.binned <- cut(x = data[[spd]],
breaks = spd.breaks,
labels = spd.labels,
ordered_result = TRUE)
# figure out the wind direction bins
dir.breaks <- c(-dirres/2,
seq(dirres/2, 360-dirres/2, by = dirres),
360+dirres/2)
dir.labels <- c(paste(360-dirres/2,"-",dirres/2),
paste(seq(dirres/2, 360-3*dirres/2, by = dirres),
"-",
seq(3*dirres/2, 360-dirres/2, by = dirres)),
paste(360-dirres/2,"-",dirres/2))
# assign each wind direction to a bin
dir.binned <- cut(data[[dir]],
breaks = dir.breaks,
ordered_result = TRUE)
levels(dir.binned) <- dir.labels
data$dir.binned <- dir.binned
# Run debug if required ----
if (debug>0){
cat(dir.breaks,"\n")
cat(dir.labels,"\n")
cat(levels(dir.binned),"\n")
}
# create the plot ----
p.windrose <- ggplot(data = data,
aes(x = dir.binned,
fill = spd.binned
,y = (..count..)/sum(..count..)
))+
geom_bar() +
scale_x_discrete(drop = FALSE,
labels = c("N","NNE","NE","ENE", "E",
"ESE", "SE","SSE",
"S","SSW", "SW","WSW", "W",
"WNW","NW","NNW")) +
coord_polar(start = -((dirres/2)/360) * 2*pi) +
scale_fill_manual(name = "Wind Speed (m/s)",
values = spd.colors,
drop = FALSE) +
theme(axis.title.x = element_blank()) +
scale_y_continuous(labels = percent) +
ylab("Frequencia")
# adjust axes if required
if (!is.na(countmax)){
p.windrose <- p.windrose +
ylim(c(0,countmax))
}
# print the plot
print(p.windrose)
# return the handle to the wind rose
return(p.windrose)
}
Have you ever tried windRose function from Openair package? It's very easy and you can set intervals, statistics and etc.
windRose(mydata, ws = "ws", wd = "wd", ws2 = NA, wd2 = NA,
ws.int = 2, angle = 30, type = "default", bias.corr = TRUE, cols
= "default", grid.line = NULL, width = 1, seg = NULL, auto.text
= TRUE, breaks = 4, offset = 10, normalise = FALSE, max.freq =
NULL, paddle = TRUE, key.header = NULL, key.footer = "(m/s)",
key.position = "bottom", key = TRUE, dig.lab = 5, statistic =
"prop.count", pollutant = NULL, annotate = TRUE, angle.scale =
315, border = NA, ...)
pollutionRose(mydata, pollutant = "nox", key.footer = pollutant,
key.position = "right", key = TRUE, breaks = 6, paddle = FALSE,
seg = 0.9, normalise = FALSE, ...)

Resources