Is it possible to subset facets in a polyfreq in GGplot? - r
I was wondering if it was possible to use subset on a geom_polyfreq()?
I am running a topic model and in order to report the facets properly i want to remove 4 out of 10 facets.
My code is as follows:
ggplot(data = dat,
aes(x = date,
fill = Topics)) +
geom_freqpoly(binwidth = 3) +
labs(x = "",
y = "Topic Count",
title = "Mentions of Topic On a Monthly Basis")+
scale_x_date(date_breaks = "months", date_labels="%b")+
theme(text = element_text(size=8)) +
theme(axis.text.x = element_text(angle = 45))+
facet_wrap(Topics ~ ., scales = "free")
> ggplot(subset(dat, Topics %in% c(3, 4, 5, 7, 8, 9)),
aes(x = date,
fill = topic)) +
geom_freqpoly(binwidth = 3) +
labs(x = "",
y = "Topic Count",
title = "Mentions of Topic On a Monthly Basis")+
scale_x_date(date_breaks = "months", date_labels="%b")+
theme(text = element_text(size=9)) +
theme(axis.text.x = element_text(angle = 45))+
facet_wrap(Topics ~ ., scales = "free")
However, when I try to subset the data, I get an error that says:
Fejl: Faceting variables must have at least one value
Does anybody know what the issue is?
I hope this makes sense.
The full code is down below.
article.data <- article.data[!is.na(article.data$fulltext), ]
## Get date
article.data$date <- as.Date(article.data$date, "%Y-%m-%d")
#all of 2018
dat <- article.data[article.data$date > as.Date("2018-01-01", "%Y-%m-%d") &
article.data$date < as.Date("2018-12-01", "%Y-%m-%d"), ]
## 'tokenize' fulltext
quanteda_options("language_stemmer" = "danish")
texts <- gsub(":", " ", dat$fulltext, fixed = T)
texts <- tokens(texts, what = "word",
remove_numbers = T,
remove_punct = T,
remove_symbols = T,
remove_separators = T,
remove_hyphens = T,
remove_url = T,
verbose = T)
texts <- tokens_tolower(texts)
texts <- tokens_remove(texts, stopwords("danish"))
texts <- tokens_wordstem(texts)
texts <- tokens_remove(texts, stopwords("danish"))
# get actual dfm from tokens
txt.mat <- dfm(texts)
#remove frequent words with no substance
txt.mat <- txt.mat %>% dfm_remove(c("ad",
"af","aldrig","alene","alle",
"allerede","alligevel","alt",
"altid","anden","andet","andre",
"at","bag","bare", "bedre", "begge","bl.a.",
"blandt", "blev", "blevet", "blive","bliver",
"burde", "bør","ca.", "com", "da",
"dag", "dansk", "danske", "de",
"dem", "den", "denne","dens",
"der","derefter","deres","derfor",
"derfra","deri","dermed","derpå",
"derved","det","dette","dig",
"din","dine","disse","dit",
"dog","du","efter","egen",
"ej","eller","ellers","en",
"end","endnu","ene","eneste","enhver","ens",
"enten","er","et","f.eks.","far","fem",
"fik","fire","flere","flest",
"fleste","for", "foran",
"fordi","forrige","fra", "fx",
"få","får","før","først",
"gennem","gjorde","gjort","god",
"godt","gør","gøre","gørende",
"ham","han","hans","har",
"havde","have","hej","hel",
"heller","helt","hen","hende",
"hendes","henover","her",
"herefter","heri","hermed",
"herpå","hos","hun","hvad",
"hvem","hver","hvilke","hvilken",
"hvilkes","hvis",
"hvor", "hvordan","hvorefter","hvorfor",
"hvorfra","hvorhen","hvori","hvorimod",
"hvornår","hvorved","i", "ifølge", "igen",
"igennem","ikke","imellem","imens",
"imod","ind","indtil","ingen",
"intet","ja","jeg","jer","jeres",
"jo","kan","kom","komme",
"kommer", "kroner", "kun","kunne","lad",
"langs", "lang", "langt", "lav","lave","lavet",
"lidt","lige","ligesom","lille",
"længere","man","mand","mange",
"med","meget","mellem","men", "mener",
"mens","mere","mest","mig",
"min","mindre","mindst","mine",
"mit","mod","må","måske",
"ned","nej","nemlig","ni",
"nogen","nogensinde","noget",
"nogle","nok","nu","ny", "nye",
"nyt","når","nær","næste",
"næsten","og","også","okay",
"om","omkring","op","os",
"otte","over","overalt","pga.", "partier",
"partiets", "partiers", "politiske",
"procent", "på", "ritzau", "samme",
"sammen","se","seks","selv","selvom",
"senere","ser","ses","siden","sig",
"sige", "siger", "sin","sine","sit",
"skal","skulle","som","stadig",
"stor","store","synes","syntes",
"syv","så","sådan","således",
"tag","tage","temmelig","thi",
"ti","tidligere","til","tilbage",
"tit","to","tre","ud","uden",
"udover","under","undtagen","var",
"ved","vi","via","vil","ville", "viser",
"vor","vore","vores","vær","være",
"været","øvrigt","facebook","http", "https",
"www","millioner", "frem", "lars", "lars_løkke",
"rasmussen", "løkke_rasmussen", "statsminister", "politik",
"formand", "partiet", "år", "tid", "and", "fler",
"sid", "regeringen", "giv", "politisk", "folketing", "mer",
"ifølg"))
############################################################
## FEATURE SELECTION
############################################################
# check out top-appearing features in dfm
topfeatures(txt.mat)
# keep features (words) appearing in >2 documents
txt.mat <- dfm_trim(txt.mat, min_termfreq = 4)
# filter out one-character words
txt.mat <- txt.mat[, str_length(colnames(txt.mat)) > 2]
# filter out some html trash features
#txt.mat <- txt.mat[, !grepl("[[:digit:]]+px", colnames(txt.mat))]
#txt.mat <- txt.mat[, !grepl(".", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("_", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("#", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("#", colnames(txt.mat), fixed = T)]
############################################################
## SELECT FEATURES BY TF-IDF
############################################################
# Create tf_idf-weighted dfm
ti <- dfm_tfidf(txt.mat)
# Select from main dfm using its top features
txt.mat <- dfm_keep(txt.mat, names(topfeatures(ti, n = 1000)))
############################################################
## RUN TOPIC MODEL
############################################################
# convert quanteda dfm to tm 'dtm'
dtm <- convert(txt.mat, to = "topicmodels")
# run lda with 8 topics
lda <- LDA(dtm, k = 8)
# review terms by topic
terms(lda, 10)
############################################################
## LOOK FOR 'OPTIMAL' k
############################################################
# randomly sample test data
set.seed(61218)
select <- sample(1:nrow(dtm), size = 100)
test <- dtm[select, ]
train <- dtm[!(1:nrow(dtm) %in% select), ]
n.tops <- 3:14
metrics <- data.frame(topics = n.tops,
perplexity = NA)
for(i in n.tops) { # NB: takes awhile to run
print(i)
est <- LDA(train, k = i)
metrics[(i - 1), "perplexity"] <- perplexity(est, newdata = test)
}
save(metrics, file = "lda_perplexity2018.RData")
qplot(data = metrics, x = topics, y = perplexity, geom = "line",
xlab = "Number of topics",
ylab = "Perplexity on test data") + theme_bw()
#We found that 8 topics was one of those of lowest perplexity but
#also the ones which made the most sense
############################################################
## RERUN WITH BETTER CHOICE OF k
############################################################
# run lda with 10 topics
lda <- LDA(dtm, k = 10)
save(lda, file = "dr_ft_keep2018.RData")
# examine output
terms(lda, 20)
# put topics into original data
dat$topic <- topics(lda)
# add labels
#dat$date <- factor(dat$date,
#levels = 1:12,
#labels = c("januar","februar", "marts","april", "maj", "juni", "juli", "august", "september", "oktober", "november", "decemeber"))
dat$Topics <- factor(dat$topic,
levels = 1:10,
labels = c("Topc 1", "Topic 2", "Integration", "Taxation", "Burka Prohibition",
"Topic 6", "Justice", "Foreign Affairs", "Housing", "Topic 10"))
# frequency
qplot(data = dat, x = Topics,
geom = "bar", xlab = "",
ylab = "Topic Frequency", fill=Topics, main = "Figure 1: Main Topics in 2018 - DR") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90))
#Make visualization showing topics over time
ggplot(data = dat,
aes(x = date,
fill = Topics[1])) +
geom_freqpoly(binwidth = 30) +
facet_wrap(Topics ~ ., scales = "free")+
theme_classic() +
scale_x_date(breaks = as.Date(c( "2018-02-01", "2018-04-01", "2018-06-01", "2018-08-01", "2018-10-01", "2018-12-01", date_labels="%B"))) +
theme(axis.text.x = element_text(angle = 90))
ggplot(data = dat,
aes(x = date,
fill = Topics)) +
geom_freqpoly(binwidth = 3) +
labs(x = "",
y = "Topic Count",
title = "Mentions of Topic On a Monthly Basis")+
scale_x_date(date_breaks = "months", date_labels="%b")+
theme(text = element_text(size=8)) +
theme(axis.text.x = element_text(angle = 45))+
facet_wrap(Topics ~ ., scales = "free")
It's best practice on this forum to make your question reproducible, so that others can try it and test their solutions to confirm they work. It's also good if you can make it minimal, both to respect potential answerers' time and to help clarify your own understanding of the problem.
How to make a great R reproducible example
In this case, the error message suggests that your subsetting is removing all your data, which breaks the faceting. It can't plot any facets if the faceting variable has no values.
It looks like dat$Topics is a factor, but your loop is referring to Topics like they're numeric with Topics %in% c(3, 4, 5, 7, 8, 9). For example, I could define a factor vector with the same levels as your Topics variable:
Topics <- factor(1:10, levels = 1:10,
labels = c("Topc 1", "Topic 2", "Integration", "Taxation", "Burka Prohibition",
"Topic 6", "Justice", "Foreign Affairs", "Housing", "Topic 10"))
Compare the output of these three lines:
Topics %in% c(1, 2)
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
as.numeric(Topics) %in% c(1, 2)
# [1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Topics %in% c("Topc 1", "Topic 2")
# [1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
In the top case, none of the data matches the test, so using that to subset the data would give you an empty data set and seems like a plausible cause for the error you got.
To refer to the Topics by their underlying level, we can refer to as.numeric(Topics) %in% c(1, 2). If we want to refer to the Topics by their labels, I could use Topics %in% c("Topc 1", "Topic 2").
Since I don't have your data, I can't confirm this exact syntax will work for you, but I hope something along these lines will.
For more on how to work with factors in R, I recommend: https://r4ds.had.co.nz/factors.html
Related
'ts' object must have one or more observations
the error is shown above. I am trying to plot a graph that show the amount of tweet within each month of 2016. My question is how can I am able to found out the amount of tweet for each month in order for me to plot a graph to see which month tweeted the most. library(ggplot2) library(RColorBrewer) library(rstudioapi) current_path = rstudioapi::getActiveDocumentContext()$path setwd(dirname(current_path )) print( getwd() ) donaldtrump <- read.csv("random_poll_tweets.csv", stringsAsFactors = FALSE) print(str(donaldtrump)) time8_ts <- ts(random$time8, start = c(2016,8), frequency = 12) time7_ts <- ts(random$time7, start = c(2016,7), frequency = 12) time6_ts <- ts(random$time6, start = c(2016,6), frequency = 12) time5_ts <- ts(random$time5, start = c(2016,5), frequency = 12) time4_ts <- ts(random$time4, start = c(2016,4), frequency = 12) time3_ts <- ts(random$time3, start = c(2016,3), frequency = 12) time2_ts <- ts(random$time2, start = c(2016,2), frequency = 12) time1_ts <- ts(random$time1, start = c(2016,1), frequency = 12) browser_mts <- cbind(time8_ts, time7_ts,time6_ts,time5_ts,time4_ts,time3_ts,time2_ts,time1_ts) dimnames(browser_mts)[[2]] <- c("8","7","6","5","4","3","2","1") pdf(file="fig_browser_tweet_R.pdf",width = 11,height = 8.5) ts.plot(browser_mts, ylab = "Amount of Tweet", xlab = "Month", plot.type = "single", col = 1:5) legend("topright", colnames(browser_mts), col = 1:5, lty = 1, cex=1.75)
library(lubridate) library(dplyr) donaldtrump$created_at <- donaldtrump$created_at |> mdy_hm() |> floor_date(unit = "month") donaldtrump |> count(created_at) Just because you are looking at a time series doesn't mean that you must use a time series object. If you want a plot: library(ggplot2) donaldtrump |> count(created_at) |> ggplot(aes(created_at, n)) + geom_col() + labs(x = "Amount of Tweet", y = "Month")
Not enough Y-observations for t-test with spatial RNA-sequencing data in R?
I am trying to perform differential gene expression using a t-test on spatial RNA-sequencing data. There are a couple of different annotations/groups indicating different structures (ANN2 in code): AML area, Taggregate, immatureTLS, matureTLS, and microcluster. ANN1 relates to one of the 3 different patients. The error I get: Error in h(simpleError(msg, call)) : error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': not enough 'y' observations I don’t understand how my data has not enough y-observations, and how I could overcome this error. I have searched google and other blogs, but I wasn’t able to resolve it. The code I use (all the code I used before is shown on this website: https://bioconductor.org/packages/devel/workflows/vignettes/GeoMxWorkflows/inst/doc/GeomxTools_RNA-NGS_Analysis.html ) 7.1 Differential Expression plots<-list() tables<-list() labels<-list() test<-"ttest" mtc<-"BY" #options: "holm" "hochberg" "hommel" "bonferroni" "BH" "BY" "fdr" counter=1 comps_df<-data.frame(comp='',val='') for (active_group1 in unique(ann$segment)) { for (active_group2 in unique(ann$segment)) { #supress reduncant compares if(active_group1==active_group2) {next} comp<-paste(sort(c(active_group1,active_group2)),collapse = "_") if(comp %in% comps_df$comp) {next} temp_df<-data.frame(comp=comp ,val=1) comps_df<-rbind(comps_df,temp_df) labels[[counter]]<-paste(active_group1," vs ", active_group2) group1<-log_q[,rownames(ann)[ann$segment==active_group1]] group2<-log_q[,rownames(ann)[ann$segment==active_group2]] #run t_tests results<-as.data.frame ( apply(log_q, 1, function(x) t.test(x[colnames(group1)],x[colnames(group2)])$p.value) ) colnames(results)<-"raw_p_value" #multiple_testing_correction adj_p_value<- p.adjust(results$raw_p_value,method=mtc) results<-cbind(results,adj_p_value) #calc_fdr FDR<- p.adjust(results$raw_p_value,method="fdr") results<-cbind(results,FDR) #fold_changes #as base data is already log transformed, means need to be subtracted to get FC in log space fchanges<-as.data.frame( apply(log_q, 1, function(x) (mean(x[colnames(group1)]) - mean(x[colnames(group2)]) ) ) ) colnames(fchanges)<-"FC" #paste("FC",active_group1," / ",active_group2) results<-cbind(results,fchanges) #add genenames results$Gene<-rownames(results) #set categories based on P-value & FDR for plotting results$Color <- "NS or FC < 0.5" results$Color[results$adj_p_value < 0.05] <- "P < 0.05" results$Color[results$FDR < 0.05] <- "FDR < 0.05" results$Color[results$FDR < 0.001] <- "FDR < 0.001" results$Color[abs(results$FC) < 1] <- "NS or FC < 1" results$Color <- factor(results$Color, levels = c("NS or FC < 1", "P < 0.05", "FDR < 0.05", "FDR < 0.001")) #vulcanoplot # pick top genes for either side of volcano to label # order genes for convenience: results$invert_P <- (-log10(results$adj_p_value)) * sign(results$FC) top_g <- c() top_g <- c(top_g, results[ind, 'Gene'][ order(results[ind, 'invert_P'], decreasing = TRUE)[1:15]], results[ind, 'Gene'][order(results[ind, 'invert_P'], decreasing = FALSE)[1:15]]) top_g<- unique(top_g) results <- results[, -1*ncol(results)] # remove invert_P from matrix # Graph results plots[[counter]]<- ggplot(results, aes(x = FC, y = -log10(adj_p_value), color = Color, label = Gene)) + geom_vline(xintercept = c(1, -1), lty = "dashed") + geom_hline(yintercept = -log10(0.05), lty = "dashed") + geom_point() + labs(x = paste("Enriched in", active_group2," <- log2(FC) -> Enriched in", active_group1), y = "Significance, -log10(P)", color = "Significance") + scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue", `FDR < 0.05` = "lightblue", `P < 0.05` = "orange2", `NS or FC < 0.5` = "gray"), guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + geom_text_repel(data = subset(results, FDR<0.001 & (-1>FC| FC>1)), point.padding = 0.15, color = "black", size=3.5, min.segment.length = .1, box.padding = .2, lwd = 2, max.overlaps = 50) + theme_bw(base_size = 20) + theme(legend.position = "bottom") + ggtitle(paste(test, mtc,"multitest corr")) #store tables for display later tables[[counter]]<-results counter = counter+1 #datatable(subset(results, Gene %in% GOI), rownames=FALSE,caption = paste("DE results ", active_group1," vs ", active_group2)) } } grid.arrange(grobs=plots,ncol=2) #strangly does not appear in html output?? for (c in (2:counter-1)) { #Gene %in% GOI print(datatable( subset(tables[[c]], Color == "FDR < 0.001" ), rownames=FALSE, extensions = 'Buttons', options = list ( dom = 'Bftrip', buttons = c('copy', 'csv', 'excel', 'pdf', 'print') ), caption = paste("DE results ", labels[[1]]),filter='top') %>% formatRound(columns=c("raw_p_value","adj_p_value","FDR","FC"), digits=3)) cat('\n\n<!-- -->\n\n') } [normalised data example][1]Data type which is used as input: [1]: https://i.stack.imgur.com/Yt0DJ.png Any help would be greatly appreciated! Thanks
How to add additional statistics on top of a combined ggplot2 graph that uses a multi-variable object or two separate objects
I have a ggplot2 graph which plots two separate violin plots onto one graph, given by this example (thanks to #jared_mamrot for providing it): library(tidyverse) data("Puromycin") head(Puromycin) dat1 <- Puromycin %>% filter(state == "treated") dat2 <- Puromycin %>% filter(state == "untreated") mycp <- ggplot() + geom_violin(data = dat1, aes(x= state, y = conc, colour = "Puromycin (Treatment1)")) + geom_violin(data = dat2, aes(x= state, y = conc, colour = "Puromycin (Treatment2)")) mycp I would like to add a boxplot or other summary statistics such as those in http://www.sthda.com/english/wiki/ggplot2-violin-plot-quick-start-guide-r-software-and-data-visualization and https://www.maths.usyd.edu.au/u/UG/SM/STAT3022/r/current/Misc/data-visualization-2.1.pdf, but trying the code suggested in those places does not change the original plot. mycp + geom_boxplot() Thanks for reading and hopefully this makes sense! UPDATE ========================================================================== So the above example does not reflect exactly my situation I realize now. Essentially, I want to apply statistics onto a combined ggplot2 graph that uses two separate objects as its variables (here TNBC_List1 and ER_List1) Here is an example that does (sorry for the longer example, I will admit I am having trouble creating a simpler reproducible example and I am very new to coding in general): # Libraries ------------------------------------------------------------- library(BiocManager) library(GEOquery) library(plyr) library(dplyr) library(Matrix) library(devtools) library(Seurat) library(ggplot2) library(cowplot) library(SAVER) library(metap) library(multtest) # Loading Raw Data into RStudio ---------------------------------- filePaths = getGEOSuppFiles("GSE75688") tarF <- list.files(path = "./GSE75688/", pattern = "*.tar", full.names = TRUE) tarF untar(tarF, exdir = "./GSE75688/") gzipF <- list.files(path = "./GSE75688/", pattern = "*.gz", full.names = TRUE) ldply(.data = gzipF, .fun = gunzip) list.files(path = "./GSE75688/", full.names = TRUE) list.files(path = "./GSE75688/", pattern = "\\.txt$",full.names = TRUE) # full matrix ---------------------------------------------------------- fullmat <- read.table(file = './GSE75688//GSE75688_GEO_processed_Breast_Cancer_raw_TPM_matrix.txt', sep = '\t', header = FALSE, stringsAsFactors = FALSE) fullmat <- data.frame(fullmat[,-1], row.names=fullmat[,1]) colnames(fullmat) <- as.character(fullmat[1, ]) fullmat <- fullmat[-1,] fullmat <- as.matrix(fullmat) # BC01 ER+ matrix ----------------------------------------------------------- BC01mat <- grep(pattern =c("^BC01") , x = colnames(fullmat), value = TRUE) BC01mat = fullmat[,grepl(c("^BC01"),colnames(fullmat))] BC01mat = BC01mat[,!grepl("^BC01_Pooled",colnames(BC01mat))] BC01mat = BC01mat[,!grepl("^BC01_Tumor",colnames(BC01mat))] BC01pdat <- data.frame("samples" = colnames(BC01mat), "treatment" = "ER+") # BC07 TNBC matrix ----------------------------------------------------------- BC07mat <- grep(pattern =c("^BC07") , x = colnames(fullmat), value = TRUE) BC07mat <- fullmat[,grepl(c("^BC07"),colnames(fullmat))] BC07mat <- BC07mat[,!grepl("^BC07_Pooled",colnames(BC07mat))] BC07mat <- BC07mat[,!grepl("^BC07_Tumor",colnames(BC07mat))] BC07mat <- BC07mat[,!grepl("^BC07LN_Pooled",colnames(BC07mat))] BC07mat <- BC07mat[,!grepl("^BC07LN",colnames(BC07mat))] BC07pdat <- data.frame("samples" = colnames(BC07mat), "treatment" = "TNBC") #merge samples together ========================================================================= joined <- cbind(BC01mat, BC07mat) pdat_joined <- rbind(BC01pdat, BC07pdat) #fdat ___________________________________________________________________________________ fdat <- grep(pattern =c("gene_name|gene_type") , x = colnames(fullmat), value = TRUE) fdat <- fullmat[,grepl(c("gene_name|gene_type"),colnames(fullmat))] fdat <- as.data.frame(fdat, stringsAsFactors = FALSE) fdat <- setNames(cbind(rownames(fdat), fdat, row.names = NULL), c("ensembl_id", "gene_short_name", "gene_type")) rownames(pdat_joined) <- pdat_joined$samples rownames(fdat) = make.names(fdat$gene_short_name, unique=TRUE) rownames(joined) <- rownames(fdat) # Create Seurat Object __________________________________________________________________ joined <- as.data.frame(joined) sobj_pre <- CreateSeuratObject(counts = joined) sobj_pre <-AddMetaData(sobj_pre,metadata=pdat_joined) head(sobj_pre#meta.data) #gene name input sobj_pre[["RNA"]]#meta.features<-fdat head(sobj_pre[["RNA"]]#meta.features) #Downstream analysis ------------------------------------------------------- sobj <- sobj_pre sobj <- FindVariableFeatures(object = sobj, mean.function = ExpMean, dispersion.function = LogVMR, nfeatures = 2000) sobj <- ScaleData(object = sobj, features = rownames(sobj), block.size = 2000) sobj <- RunPCA(sobj, npcs = 100, ndims.print = 1:10, nfeatures.print = 5) sobj <- FindNeighbors(sobj, reduction = "pca", dims = 1:4, nn.eps = 0.5) sobj <- FindClusters(sobj, resolution = 1, n.start = 10) umap.method = 'umap-learn' metric = 'correlation' sobj <- RunUMAP(object = sobj, reduction = "pca", dims = 1:4,min.dist = 0.5, seed.use = 123) p0 <- DimPlot(sobj, reduction = "umap", pt.size = 0.1,label=TRUE) + ggtitle(label = "Title") p0 # ER+ score computation ------------------- ERlist <- list(c("CPB1", "RP11-53O19.1", "TFF1", "MB", "ANKRD30B", "LINC00173", "DSCAM-AS1", "IGHG1", "SERPINA5", "ESR1", "ILRP2", "IGLC3", "CA12", "RP11-64B16.2", "SLC7A2", "AFF3", "IGFBP4", "GSTM3", "ANKRD30A", "GSTT1", "GSTM1", "AC026806.2", "C19ORF33", "STC2", "HSPB8", "RPL29P11", "FBP1", "AGR3", "TCEAL1", "CYP4B1", "SYT1", "COX6C", "MT1E", "SYTL2", "THSD4", "IFI6", "K1AA1467", "SLC39A6", "ABCD3", "SERPINA3", "DEGS2", "ERLIN2", "HEBP1", "BCL2", "TCEAL3", "PPT1", "SLC7A8", "RP11-96D1.10", "H4C8", "PI15", "PLPP5", "PLAAT4", "GALNT6", "IL6ST", "MYC", "BST2", "RP11-658F2.8", "MRPS30", "MAPT", "AMFR", "TCEAL4", "MED13L", "ISG15", "NDUFC2", "TIMP3", "RP13-39P12.3", "PARD68")) sobj <- AddModuleScore(object = sobj, features = ERlist, name = "ER_List") #TNBC computation ------------------- tnbclist <- list(c("FABP7", "TSPAN8", "CYP4Z1", "HOXA10", "CLDN1", "TMSB15A", "C10ORF10", "TRPV6", "HOXA9", "ATP13A4", "GLYATL2", "RP11-48O20.4", "DYRK3", "MUCL1", "ID4", "FGFR2", "SHOX2", "Z83851.1", "CD82", "COL6A1", "KRT23", "GCHFR", "PRICKLE1", "GCNT2", "KHDRBS3", "SIPA1L2", "LMO4", "TFAP2B", "SLC43A3", "FURIN", "ELF5", "C1ORF116", "ADD3", "EFNA3", "EFCAB4A", "LTF", "LRRC31", "ARL4C", "GPNMB", "VIM", "SDR16C5", "RHOV", "PXDC1", "MALL", "YAP1", "A2ML1", "RP1-257A7.5", "RP11-353N4.6", "ZBTB18", "CTD-2314B22.3", "GALNT3", "BCL11A", "CXADR", "SSFA2", "ADM", "GUCY1A3", "GSTP1", "ADCK3", "SLC25A37", "SFRP1", "PRNP", "DEGS1", "RP11-110G21.2", "AL589743.1", "ATF3", "SIVA1", "TACSTD2", "HEBP2")) sobj <- AddModuleScore(object = sobj, features = tnbclist, name = "TNBC_List") #ggplot2 issue ---------------------------------------------------------------------------- sobj[["ClusterName"]] <- Idents(object = sobj) sobjlists <- FetchData(object = sobj, vars = c("ER_List1", "TNBC_List1", "ClusterName")) library(reshape2) melt(sobjlists, id.vars = c("ER_List1", "TNBC_List1", "ClusterName")) p <- ggplot() + geom_violin(data = sobjlists, aes(x= ClusterName, y = ER_List1, fill = ER_List1, colour = "ER+ Signature"))+ geom_violin(data = sobjlists, aes(x= ClusterName, y = TNBC_List1, fill = TNBC_List1, colour="TNBC Signature")) Extension ====================================================================== If you want to do this but with two objects (sobjlists1 and sobjlists2, for example) instead of what my example showed (two variables but one object), rbind the two and then do what #StupidWolf says library(reshape2) sobjlists1= melt(sobjlists1, id.vars = "treatment") sobjlists2= melt(sobjlists2, id.vars = "treatment") combosobjlists <- rbind(sobjlists1, sobjlists2) and then continue on with their code using combosobjlists: ggplot(combosobjlists,aes(x= ClusterName, y = value)) + geom_violin(aes(fill=variable)) + geom_boxplot(aes(col=variable), width = 0.2,position=position_dodge(0.9)) Hope this thread helps!
Try to include just the minimum code to show your problem. Like in your example, there's no need to start with the whole seurat processing. You can just provide the data.frame with dput() and we can see the issue with ggplot2 , see this post. Create some example data: library(Seurat) library(ggplot2) genes = c(unlist(c(ERlist,tnbclist))) mat = matrix(rnbinom(500*length(genes),mu=500,size=1),ncol=500) rownames(mat) = genes colnames(mat) = paste0("cell",1:500) sobj = CreateSeuratObject(mat) sobj = NormalizeData(sobj) Add some made-up cluster: sobj$ClusterName = factor(sample(0:1,ncol(sobj),replace=TRUE)) Add your module score: sobj = AddModuleScore(object = sobj, features = tnbclist, name = "TNBC_List",ctrl=5) sobj = AddModuleScore(object = sobj, features = ERlist, name = "ER_List",ctrl=5) We get the data, what you need to do is to pivot it long correctly. Plotting it twice with ggplot2 is going to cause all kinds of problem: sobjlists = FetchData(object = sobj, vars = c("ER_List1", "TNBC_List1", "ClusterName")) head(sobjlists) ER_List1 TNBC_List1 ClusterName cell1 -0.05391108 -0.008736057 1 cell2 0.07074816 -0.039064126 1 cell3 0.08688374 -0.066967324 1 cell4 -0.12503649 0.120665057 0 cell5 0.05356685 -0.072293651 0 cell6 -0.20053804 0.178977042 1 Should look like this: library(reshape2) sobjlists = melt(sobjlists, id.vars = "ClusterName") ClusterName variable value 1 1 ER_List1 -0.05391108 2 1 ER_List1 0.07074816 3 1 ER_List1 0.08688374 4 0 ER_List1 -0.12503649 5 0 ER_List1 0.05356685 6 1 ER_List1 -0.20053804 Now we plot: ggplot(sobjlists,aes(x= ClusterName, y = value)) + geom_violin(aes(fill=variable)) + geom_boxplot(aes(col=variable), width = 0.2,position=position_dodge(0.9))
for you to be able to use the data within a plot without specifying it (like geom_boxplot() ), you need to put the data in the ggplot() function call. Then the following functions are able to inherit them. You also do not need an extra violin plot per color library(tidyverse) data("Puromycin") head(Puromycin) mycp <- ggplot(Puromycin,aes(x= state, y = conc, colour=state))+geom_violin() mycp + geom_boxplot(width=0.1, color= "black") + scale_color_discrete( labels= c("Puromycin (Treatment1)","Puromycin (Treatment2)") ) Result:
How to edit the labels of a facet_wrap/grid if there are two variables?
In ggplot I have faceted by two variables (tau and z) but can only change the label of the first: df<-data.frame(x=runif(1e3),y=runif(1e3),tau=rep(c("A","aBc"),each=500),z=rep(c("DDD","EEE"),each=500)) tauNames <- c( `A` = "10% load", `aBc` = "40% load" ) df%>% ggplot(aes(x=x,y=y))+ geom_point(alpha=0.4)+ xlab(label = "Time[s]")+ ylab(label = "Dose")+ facet_grid(tau~z,labeller = as_labeller(tauNames))+ ggpubr::theme_pubclean() As you can see I can change one of the labels but not both. Any thoughts are much appreciated
In the documentation of ?as_labeller you can find in the examples how you get the labels for multiple faceting variables. library(tidyverse) df<-data.frame(x=runif(1e3),y=runif(1e3),tau=rep(c("A","aBc"),each=500),z=rep(c("DDD","EEE"),each=500)) tauNames <- c( `A` = "10% load", `aBc` = "40% load" ) df%>% ggplot(aes(x=x,y=y))+ geom_point(alpha=0.4)+ xlab(label = "Time[s]")+ ylab(label = "Dose")+ facet_grid(tau~z,labeller = labeller(tau = tauNames, z = c("DDD" = "D", "EEE" = "E")))+ ggpubr::theme_pubclean()
Wind rose with ggplot (R)?
I am looking for good R code (or package) that uses ggplot2 to create wind roses that show the frequency, magnitude and direction of winds. I'm particularly interested in ggplot2 as building the plot that way gives me the chance to leverage the rest of the functionality in there. Test data Download a year of weather data from the 80-m level on the National Wind Technology's "M2" tower. This link will create a .csv file that is automatically downloaded. You need to find that file (it's called "20130101.csv"), and read it in. # read in a data file data.in <- read.csv(file = "A:/drive/somehwere/20130101.csv", col.names = c("date","hr","ws.80","wd.80"), stringsAsFactors = FALSE)) This would work with any .csv file and will overwrite the column names. Sample data If you don't want to download that data, here are 10 data points that we will use to demo the process: data.in <- structure(list(date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "1/1/2013", class = "factor"), hr = 1:9, ws.80 = c(5, 7, 7, 51.9, 11, 12, 9, 11, 17), wd.80 = c(30, 30, 30, 180, 180, 180, 269, 270, 271)), .Names = c("date", "hr", "ws.80", "wd.80" ), row.names = c(NA, -9L), class = "data.frame")
For sake of argument we'll assume that we are using the data.in data frame, which has two data columns and some kind of date / time information. We'll ignore the date and time information initially. The ggplot function I've coded the function below. I'm interested in other people's experience or suggestions on how to improve this. # WindRose.R require(ggplot2) require(RColorBrewer) plot.windrose <- function(data, spd, dir, spdres = 2, dirres = 30, spdmin = 2, spdmax = 20, spdseq = NULL, palette = "YlGnBu", countmax = NA, debug = 0){ # Look to see what data was passed in to the function if (is.numeric(spd) & is.numeric(dir)){ # assume that we've been given vectors of the speed and direction vectors data <- data.frame(spd = spd, dir = dir) spd = "spd" dir = "dir" } else if (exists("data")){ # Assume that we've been given a data frame, and the name of the speed # and direction columns. This is the format we want for later use. } # Tidy up input data ---- n.in <- NROW(data) dnu <- (is.na(data[[spd]]) | is.na(data[[dir]])) data[[spd]][dnu] <- NA data[[dir]][dnu] <- NA # figure out the wind speed bins ---- if (missing(spdseq)){ spdseq <- seq(spdmin,spdmax,spdres) } else { if (debug >0){ cat("Using custom speed bins \n") } } # get some information about the number of bins, etc. n.spd.seq <- length(spdseq) n.colors.in.range <- n.spd.seq - 1 # create the color map spd.colors <- colorRampPalette(brewer.pal(min(max(3, n.colors.in.range), min(9, n.colors.in.range)), palette))(n.colors.in.range) if (max(data[[spd]],na.rm = TRUE) > spdmax){ spd.breaks <- c(spdseq, max(data[[spd]],na.rm = TRUE)) spd.labels <- c(paste(c(spdseq[1:n.spd.seq-1]), '-', c(spdseq[2:n.spd.seq])), paste(spdmax, "-", max(data[[spd]],na.rm = TRUE))) spd.colors <- c(spd.colors, "grey50") } else{ spd.breaks <- spdseq spd.labels <- paste(c(spdseq[1:n.spd.seq-1]), '-', c(spdseq[2:n.spd.seq])) } data$spd.binned <- cut(x = data[[spd]], breaks = spd.breaks, labels = spd.labels, ordered_result = TRUE) # clean up the data data. <- na.omit(data) # figure out the wind direction bins dir.breaks <- c(-dirres/2, seq(dirres/2, 360-dirres/2, by = dirres), 360+dirres/2) dir.labels <- c(paste(360-dirres/2,"-",dirres/2), paste(seq(dirres/2, 360-3*dirres/2, by = dirres), "-", seq(3*dirres/2, 360-dirres/2, by = dirres)), paste(360-dirres/2,"-",dirres/2)) # assign each wind direction to a bin dir.binned <- cut(data[[dir]], breaks = dir.breaks, ordered_result = TRUE) levels(dir.binned) <- dir.labels data$dir.binned <- dir.binned # Run debug if required ---- if (debug>0){ cat(dir.breaks,"\n") cat(dir.labels,"\n") cat(levels(dir.binned),"\n") } # deal with change in ordering introduced somewhere around version 2.2 if(packageVersion("ggplot2") > "2.2"){ cat("Hadley broke my code\n") data$spd.binned = with(data, factor(spd.binned, levels = rev(levels(spd.binned)))) spd.colors = rev(spd.colors) } # create the plot ---- p.windrose <- ggplot(data = data, aes(x = dir.binned, fill = spd.binned)) + geom_bar() + scale_x_discrete(drop = FALSE, labels = waiver()) + coord_polar(start = -((dirres/2)/360) * 2*pi) + scale_fill_manual(name = "Wind Speed (m/s)", values = spd.colors, drop = FALSE) + theme(axis.title.x = element_blank()) # adjust axes if required if (!is.na(countmax)){ p.windrose <- p.windrose + ylim(c(0,countmax)) } # print the plot print(p.windrose) # return the handle to the wind rose return(p.windrose) } Proof of Concept and Logic We'll now check that the code does what we expect. For this, we'll use the simple set of demo data. # try the default settings p0 <- plot.windrose(spd = data.in$ws.80, dir = data.in$wd.80) This gives us this plot: So: we've correctly binned the data by direction and wind speed, and have coded up our out-of-range data as expected. Looks good! Using this function Now we load the real data. We can load this from the URL: data.in <- read.csv(file = "http://midcdmz.nrel.gov/apps/plot.pl?site=NWTC&start=20010824&edy=26&emo=3&eyr=2062&year=2013&month=1&day=1&endyear=2013&endmonth=12&endday=31&time=0&inst=21&inst=39&type=data&wrlevel=2&preset=0&first=3&math=0&second=-1&value=0.0&user=0&axis=1", col.names = c("date","hr","ws.80","wd.80")) or from file: data.in <- read.csv(file = "A:/blah/20130101.csv", col.names = c("date","hr","ws.80","wd.80")) The quick way The simple way to use this with the M2 data is to just pass in separate vectors for spd and dir (speed and direction): # try the default settings p1 <- plot.windrose(spd = data.in$ws.80, dir = data.in$wd.80) Which gives us this plot: And if we want custom bins, we can add those as arguments: p2 <- plot.windrose(spd = data.in$ws.80, dir = data.in$wd.80, spdseq = c(0,3,6,12,20)) Using a data frame and the names of columns To make the plots more compatible with ggplot(), you can also pass in a data frame and the name of the speed and direction variables: p.wr2 <- plot.windrose(data = data.in, spd = "ws.80", dir = "wd.80") Faceting by another variable We can also plot the data by month or year using ggplot's faceting capability. Let's start by getting the time stamp from the date and hour information in data.in, and converting to month and year: # first create a true POSIXCT timestamp from the date and hour columns data.in$timestamp <- as.POSIXct(paste0(data.in$date, " ", data.in$hr), tz = "GMT", format = "%m/%d/%Y %H:%M") # Convert the time stamp to years and months data.in$Year <- as.numeric(format(data.in$timestamp, "%Y")) data.in$month <- factor(format(data.in$timestamp, "%B"), levels = month.name) Then you can apply faceting to show how the wind rose varies by month: # recreate p.wr2, so that includes the new data p.wr2 <- plot.windrose(data = data.in, spd = "ws.80", dir = "wd.80") # now generate the faceting p.wr3 <- p.wr2 + facet_wrap(~month, ncol = 3) # and remove labels for clarity p.wr3 <- p.wr3 + theme(axis.text.x = element_blank(), axis.title.x = element_blank()) Comments Some things to note about the function and how it can be used: The inputs are: vectors of speed (spd) and direction (dir) or the name of the data frame and the names of the columns that contain the speed and direction data. optional values of the bin size for wind speed (spdres) and direction (dirres). palette is the name of a colorbrewer sequential palette, countmax sets the range of the wind rose. debug is a switch (0,1,2) to enable different levels of debugging. I wanted to be able to set the maximum speed (spdmax) and the count (countmax) for the plots so that I can compare windroses from different data sets If there are wind speeds that exceed (spdmax), those are added as a grey region (see the figure). I should probably code something like spdmin as well, and color-code regions where the wind speeds are less than that. Following a request, I implemented a method to use custom wind speed bins. They can be added using the spdseq = c(1,3,5,12) argument. You can remove the degree bin labels using the usual ggplot commands to clear the x axis: p.wr3 + theme(axis.text.x = element_blank(),axis.title.x = element_blank()). At some point recently ggplot2 changed the ordering of bins, so that the plots didn't work. I think this was version 2.2. But, if your plots look a bit weird, change the code so that test for "2.2" is maybe "2.1", or "2.0".
Here is my version of the code. I added labels for directions (N, NNE, NE, ENE, E....) and made the y label to show frequency in percent instead of counts. Click here to see figure of wind Rose with directions and frequency (%) # WindRose.R require(ggplot2) require(RColorBrewer) require(scales) plot.windrose <- function(data, spd, dir, spdres = 2, dirres = 22.5, spdmin = 2, spdmax = 20, spdseq = NULL, palette = "YlGnBu", countmax = NA, debug = 0){ # Look to see what data was passed in to the function if (is.numeric(spd) & is.numeric(dir)){ # assume that we've been given vectors of the speed and direction vectors data <- data.frame(spd = spd, dir = dir) spd = "spd" dir = "dir" } else if (exists("data")){ # Assume that we've been given a data frame, and the name of the speed # and direction columns. This is the format we want for later use. } # Tidy up input data ---- n.in <- NROW(data) dnu <- (is.na(data[[spd]]) | is.na(data[[dir]])) data[[spd]][dnu] <- NA data[[dir]][dnu] <- NA # figure out the wind speed bins ---- if (missing(spdseq)){ spdseq <- seq(spdmin,spdmax,spdres) } else { if (debug >0){ cat("Using custom speed bins \n") } } # get some information about the number of bins, etc. n.spd.seq <- length(spdseq) n.colors.in.range <- n.spd.seq - 1 # create the color map spd.colors <- colorRampPalette(brewer.pal(min(max(3, n.colors.in.range), min(9, n.colors.in.range)), palette))(n.colors.in.range) if (max(data[[spd]],na.rm = TRUE) > spdmax){ spd.breaks <- c(spdseq, max(data[[spd]],na.rm = TRUE)) spd.labels <- c(paste(c(spdseq[1:n.spd.seq-1]), '-', c(spdseq[2:n.spd.seq])), paste(spdmax, "-", max(data[[spd]],na.rm = TRUE))) spd.colors <- c(spd.colors, "grey50") } else{ spd.breaks <- spdseq spd.labels <- paste(c(spdseq[1:n.spd.seq-1]), '-', c(spdseq[2:n.spd.seq])) } data$spd.binned <- cut(x = data[[spd]], breaks = spd.breaks, labels = spd.labels, ordered_result = TRUE) # figure out the wind direction bins dir.breaks <- c(-dirres/2, seq(dirres/2, 360-dirres/2, by = dirres), 360+dirres/2) dir.labels <- c(paste(360-dirres/2,"-",dirres/2), paste(seq(dirres/2, 360-3*dirres/2, by = dirres), "-", seq(3*dirres/2, 360-dirres/2, by = dirres)), paste(360-dirres/2,"-",dirres/2)) # assign each wind direction to a bin dir.binned <- cut(data[[dir]], breaks = dir.breaks, ordered_result = TRUE) levels(dir.binned) <- dir.labels data$dir.binned <- dir.binned # Run debug if required ---- if (debug>0){ cat(dir.breaks,"\n") cat(dir.labels,"\n") cat(levels(dir.binned),"\n") } # create the plot ---- p.windrose <- ggplot(data = data, aes(x = dir.binned, fill = spd.binned ,y = (..count..)/sum(..count..) ))+ geom_bar() + scale_x_discrete(drop = FALSE, labels = c("N","NNE","NE","ENE", "E", "ESE", "SE","SSE", "S","SSW", "SW","WSW", "W", "WNW","NW","NNW")) + coord_polar(start = -((dirres/2)/360) * 2*pi) + scale_fill_manual(name = "Wind Speed (m/s)", values = spd.colors, drop = FALSE) + theme(axis.title.x = element_blank()) + scale_y_continuous(labels = percent) + ylab("Frequencia") # adjust axes if required if (!is.na(countmax)){ p.windrose <- p.windrose + ylim(c(0,countmax)) } # print the plot print(p.windrose) # return the handle to the wind rose return(p.windrose) }
Have you ever tried windRose function from Openair package? It's very easy and you can set intervals, statistics and etc. windRose(mydata, ws = "ws", wd = "wd", ws2 = NA, wd2 = NA, ws.int = 2, angle = 30, type = "default", bias.corr = TRUE, cols = "default", grid.line = NULL, width = 1, seg = NULL, auto.text = TRUE, breaks = 4, offset = 10, normalise = FALSE, max.freq = NULL, paddle = TRUE, key.header = NULL, key.footer = "(m/s)", key.position = "bottom", key = TRUE, dig.lab = 5, statistic = "prop.count", pollutant = NULL, annotate = TRUE, angle.scale = 315, border = NA, ...) pollutionRose(mydata, pollutant = "nox", key.footer = pollutant, key.position = "right", key = TRUE, breaks = 6, paddle = FALSE, seg = 0.9, normalise = FALSE, ...)