how to change p-value label in ggpubr/ggplot2 - r

I'm using ggplot2 to make violin plots of module scores from Seurat, and am wanting to add statistics to it. I made the following violin plot, and I'm wanting to change the bracket labels so that it says "p < 0.13" instead of just 0.13 like it is now (thanks to #StupidWolf for the example!).
library(Seurat)
library(SeuratObject)
library(ggplot2)
library(ggpubr)
library(reshape2)
#add Seurat's module scores and create Seurat object from them =====================
ERlist <- list(c("CPB1", "RP11-53O19.1", "TFF1", "MB", "ANKRD30B",
"LINC00173", "DSCAM-AS1", "IGHG1", "SERPINA5", "ESR1",
"ILRP2", "IGLC3", "CA12", "RP11-64B16.2", "SLC7A2",
"AFF3", "IGFBP4", "GSTM3", "ANKRD30A", "GSTT1", "GSTM1",
"AC026806.2", "C19ORF33", "STC2", "HSPB8", "RPL29P11",
"FBP1", "AGR3", "TCEAL1", "CYP4B1", "SYT1", "COX6C",
"MT1E", "SYTL2", "THSD4", "IFI6", "K1AA1467", "SLC39A6",
"ABCD3", "SERPINA3", "DEGS2", "ERLIN2", "HEBP1", "BCL2",
"TCEAL3", "PPT1", "SLC7A8", "RP11-96D1.10", "H4C8",
"PI15", "PLPP5", "PLAAT4", "GALNT6", "IL6ST", "MYC",
"BST2", "RP11-658F2.8", "MRPS30", "MAPT", "AMFR", "TCEAL4",
"MED13L", "ISG15", "NDUFC2", "TIMP3", "RP13-39P12.3", "PARD68"))
tnbclist <- list(c("FABP7", "TSPAN8", "CYP4Z1", "HOXA10", "CLDN1",
"TMSB15A", "C10ORF10", "TRPV6", "HOXA9", "ATP13A4",
"GLYATL2", "RP11-48O20.4", "DYRK3", "MUCL1", "ID4", "FGFR2",
"SHOX2", "Z83851.1", "CD82", "COL6A1", "KRT23", "GCHFR",
"PRICKLE1", "GCNT2", "KHDRBS3", "SIPA1L2", "LMO4", "TFAP2B",
"SLC43A3", "FURIN", "ELF5", "C1ORF116", "ADD3", "EFNA3",
"EFCAB4A", "LTF", "LRRC31", "ARL4C", "GPNMB", "VIM",
"SDR16C5", "RHOV", "PXDC1", "MALL", "YAP1", "A2ML1",
"RP1-257A7.5", "RP11-353N4.6", "ZBTB18", "CTD-2314B22.3", "GALNT3",
"BCL11A", "CXADR", "SSFA2", "ADM", "GUCY1A3", "GSTP1",
"ADCK3", "SLC25A37", "SFRP1", "PRNP", "DEGS1", "RP11-110G21.2",
"AL589743.1", "ATF3", "SIVA1", "TACSTD2", "HEBP2"))
genes = c(unlist(c(ERlist,tnbclist)))
mat = matrix(rnbinom(500*length(genes),mu=500,size=1),ncol=500)
rownames(mat) = genes
colnames(mat) = paste0("cell",1:500)
sobj = CreateSeuratObject(mat)
sobj = NormalizeData(sobj)
sobj$ClusterName = factor(sample(0:1,ncol(sobj),replace=TRUE))
sobj = AddModuleScore(object = sobj, features = tnbclist,
name = "TNBC_List",ctrl=5)
sobj = AddModuleScore(object = sobj, features = ERlist,
name = "ER_List",ctrl=5)
sobjlists = FetchData(object = sobj, vars = c("ER_List1", "TNBC_List1", "ClusterName"))
#violin plot =======================================================
my_comparisons <- list( c("0", "1") )
ggplot(sobjlists,aes(x= ClusterName, y = ER_List1)) +
geom_violin(aes(fill=ClusterName)) +
geom_boxplot(width=0.1) + labs(y= "ER+ Signature", x = "ClusterName") + ggtitle(label = "Object") +
theme(plot.title = element_text(hjust = 0.5)) + stat_compare_means(comparisons = my_comparisons, method = "wilcox.test")+ # Add pairwise comparisons p-value
stat_compare_means(label.y = 0.75)
The closest I've found is ggpubr: Show significance levels (*** or n.s.) instead of p-value in the label and https://github.com/kassambara/ggpubr/issues/327 but I am not sure how to implement it with how I created the plot.
Thanks for reading!

Related

How to Keep Statistics with Reordered Combined ggplot2 Graph

I'm using ggplot2 to make violin plots of module scores from Seurat, and am wanting to add statistics to it. I made the following violin plot, but I want to switch the violins around from reading "0" and "1" from left to right, to "1" and "0". (Thanks to #StupidWolf for the example!)
library(Seurat)
library(SeuratObject)
library(ggplot2)
library(ggpubr)
library(reshape2)
#add Seurat's module scores and create Seurat object from them =====================
ERlist <- list(c("CPB1", "RP11-53O19.1", "TFF1", "MB", "ANKRD30B",
"LINC00173", "DSCAM-AS1", "IGHG1", "SERPINA5", "ESR1",
"ILRP2", "IGLC3", "CA12", "RP11-64B16.2", "SLC7A2",
"AFF3", "IGFBP4", "GSTM3", "ANKRD30A", "GSTT1", "GSTM1",
"AC026806.2", "C19ORF33", "STC2", "HSPB8", "RPL29P11",
"FBP1", "AGR3", "TCEAL1", "CYP4B1", "SYT1", "COX6C",
"MT1E", "SYTL2", "THSD4", "IFI6", "K1AA1467", "SLC39A6",
"ABCD3", "SERPINA3", "DEGS2", "ERLIN2", "HEBP1", "BCL2",
"TCEAL3", "PPT1", "SLC7A8", "RP11-96D1.10", "H4C8",
"PI15", "PLPP5", "PLAAT4", "GALNT6", "IL6ST", "MYC",
"BST2", "RP11-658F2.8", "MRPS30", "MAPT", "AMFR", "TCEAL4",
"MED13L", "ISG15", "NDUFC2", "TIMP3", "RP13-39P12.3", "PARD68"))
tnbclist <- list(c("FABP7", "TSPAN8", "CYP4Z1", "HOXA10", "CLDN1",
"TMSB15A", "C10ORF10", "TRPV6", "HOXA9", "ATP13A4",
"GLYATL2", "RP11-48O20.4", "DYRK3", "MUCL1", "ID4", "FGFR2",
"SHOX2", "Z83851.1", "CD82", "COL6A1", "KRT23", "GCHFR",
"PRICKLE1", "GCNT2", "KHDRBS3", "SIPA1L2", "LMO4", "TFAP2B",
"SLC43A3", "FURIN", "ELF5", "C1ORF116", "ADD3", "EFNA3",
"EFCAB4A", "LTF", "LRRC31", "ARL4C", "GPNMB", "VIM",
"SDR16C5", "RHOV", "PXDC1", "MALL", "YAP1", "A2ML1",
"RP1-257A7.5", "RP11-353N4.6", "ZBTB18", "CTD-2314B22.3", "GALNT3",
"BCL11A", "CXADR", "SSFA2", "ADM", "GUCY1A3", "GSTP1",
"ADCK3", "SLC25A37", "SFRP1", "PRNP", "DEGS1", "RP11-110G21.2",
"AL589743.1", "ATF3", "SIVA1", "TACSTD2", "HEBP2"))
genes = c(unlist(c(ERlist,tnbclist)))
mat = matrix(rnbinom(500*length(genes),mu=500,size=1),ncol=500)
rownames(mat) = genes
colnames(mat) = paste0("cell",1:500)
sobj = CreateSeuratObject(mat)
sobj = NormalizeData(sobj)
sobj$ClusterName = factor(sample(0:1,ncol(sobj),replace=TRUE))
sobj = AddModuleScore(object = sobj, features = tnbclist,
name = "TNBC_List",ctrl=5)
sobj = AddModuleScore(object = sobj, features = ERlist,
name = "ER_List",ctrl=5)
sobjlists = FetchData(object = sobj, vars = c("ER_List1", "TNBC_List1", "ClusterName"))
#violin plot =======================================================
my_comparisons <- list( c("0", "1") )
ggplot(sobjlists,aes(x= ClusterName, y = ER_List1)) +
geom_violin(aes(fill=ClusterName)) +
geom_boxplot(width=0.1) + labs(y= "ER+ Signature", x = "ClusterName") + ggtitle(label = "Object") +
theme(plot.title = element_text(hjust = 0.5)) + stat_compare_means(comparisons = my_comparisons, method = "wilcox.test")+ # Add pairwise comparisons p-value
stat_compare_means(label.y = 0.75)
Using the solution from How to reorder plots in combined ggplot2 graph?, I can reorder the plots by adding
+ scale_x_discrete(limits = c("1", "0"))
to the very end of the code I have now. However, doing that, the "Wilcoxon, p = 0.13" disappears, and gives the following error:
ggplot(sobjlists,aes(x= ClusterName, y = ER_List1)) +
geom_violin(aes(fill=ClusterName)) +
geom_boxplot(width=0.1) + labs(y= "ER+ Signature", x = "ClusterName") + ggtitle(label = "Object") +
theme(plot.title = element_text(hjust = 0.5)) + stat_compare_means(comparisons = my_comparisons, method = "wilcox.test")+ # Add pairwise comparisons p-value
stat_compare_means(label.y = 0.75) + scale_x_discrete(limits = c("1", "0"))
Warning messages:
1: Unknown or uninitialised column: `p`.
2: Computation failed in `stat_compare_means()`:
argument "x" is missing, with no default
For a TL;DR, how do I keep the top statistic in and reorder the violins?
Thanks for reading!
Here is what worked for me (From https://www.datanovia.com/en/blog/how-to-change-ggplot-legend-order/)
sobjlists$ClusterName <- factor(sobjlists$ClusterName, levels = c("1", "0"))

How to add additional statistics on top of a combined ggplot2 graph that uses a multi-variable object or two separate objects

I have a ggplot2 graph which plots two separate violin plots onto one graph, given by this example (thanks to #jared_mamrot for providing it):
library(tidyverse)
data("Puromycin")
head(Puromycin)
dat1 <- Puromycin %>%
filter(state == "treated")
dat2 <- Puromycin %>%
filter(state == "untreated")
mycp <- ggplot() +
geom_violin(data = dat1, aes(x= state, y = conc, colour = "Puromycin (Treatment1)")) +
geom_violin(data = dat2, aes(x= state, y = conc, colour = "Puromycin (Treatment2)"))
mycp
I would like to add a boxplot or other summary statistics such as those in http://www.sthda.com/english/wiki/ggplot2-violin-plot-quick-start-guide-r-software-and-data-visualization and https://www.maths.usyd.edu.au/u/UG/SM/STAT3022/r/current/Misc/data-visualization-2.1.pdf, but trying the code suggested in those places does not change the original plot.
mycp + geom_boxplot()
Thanks for reading and hopefully this makes sense!
UPDATE ==========================================================================
So the above example does not reflect exactly my situation I realize now. Essentially, I want to apply statistics onto a combined ggplot2 graph that uses two separate objects as its variables (here TNBC_List1 and ER_List1) Here is an example that does (sorry for the longer example, I will admit I am having trouble creating a simpler reproducible example and I am very new to coding in general):
# Libraries -------------------------------------------------------------
library(BiocManager)
library(GEOquery)
library(plyr)
library(dplyr)
library(Matrix)
library(devtools)
library(Seurat)
library(ggplot2)
library(cowplot)
library(SAVER)
library(metap)
library(multtest)
# Loading Raw Data into RStudio ----------------------------------
filePaths = getGEOSuppFiles("GSE75688")
tarF <- list.files(path = "./GSE75688/", pattern = "*.tar", full.names = TRUE)
tarF
untar(tarF, exdir = "./GSE75688/")
gzipF <- list.files(path = "./GSE75688/", pattern = "*.gz", full.names = TRUE)
ldply(.data = gzipF, .fun = gunzip)
list.files(path = "./GSE75688/", full.names = TRUE)
list.files(path = "./GSE75688/", pattern = "\\.txt$",full.names = TRUE)
# full matrix ----------------------------------------------------------
fullmat <- read.table(file = './GSE75688//GSE75688_GEO_processed_Breast_Cancer_raw_TPM_matrix.txt',
sep = '\t', header = FALSE, stringsAsFactors = FALSE)
fullmat <- data.frame(fullmat[,-1], row.names=fullmat[,1])
colnames(fullmat) <- as.character(fullmat[1, ])
fullmat <- fullmat[-1,]
fullmat <- as.matrix(fullmat)
# BC01 ER+ matrix -----------------------------------------------------------
BC01mat <- grep(pattern =c("^BC01") , x = colnames(fullmat), value = TRUE)
BC01mat = fullmat[,grepl(c("^BC01"),colnames(fullmat))]
BC01mat = BC01mat[,!grepl("^BC01_Pooled",colnames(BC01mat))]
BC01mat = BC01mat[,!grepl("^BC01_Tumor",colnames(BC01mat))]
BC01pdat <- data.frame("samples" = colnames(BC01mat), "treatment" = "ER+")
# BC07 TNBC matrix -----------------------------------------------------------
BC07mat <- grep(pattern =c("^BC07") , x = colnames(fullmat), value = TRUE)
BC07mat <- fullmat[,grepl(c("^BC07"),colnames(fullmat))]
BC07mat <- BC07mat[,!grepl("^BC07_Pooled",colnames(BC07mat))]
BC07mat <- BC07mat[,!grepl("^BC07_Tumor",colnames(BC07mat))]
BC07mat <- BC07mat[,!grepl("^BC07LN_Pooled",colnames(BC07mat))]
BC07mat <- BC07mat[,!grepl("^BC07LN",colnames(BC07mat))]
BC07pdat <- data.frame("samples" = colnames(BC07mat), "treatment" = "TNBC")
#merge samples together =========================================================================
joined <- cbind(BC01mat, BC07mat)
pdat_joined <- rbind(BC01pdat, BC07pdat)
#fdat ___________________________________________________________________________________
fdat <- grep(pattern =c("gene_name|gene_type") , x = colnames(fullmat), value = TRUE)
fdat <- fullmat[,grepl(c("gene_name|gene_type"),colnames(fullmat))]
fdat <- as.data.frame(fdat, stringsAsFactors = FALSE)
fdat <- setNames(cbind(rownames(fdat), fdat, row.names = NULL),
c("ensembl_id", "gene_short_name", "gene_type"))
rownames(pdat_joined) <- pdat_joined$samples
rownames(fdat) = make.names(fdat$gene_short_name, unique=TRUE)
rownames(joined) <- rownames(fdat)
# Create Seurat Object __________________________________________________________________
joined <- as.data.frame(joined)
sobj_pre <- CreateSeuratObject(counts = joined)
sobj_pre <-AddMetaData(sobj_pre,metadata=pdat_joined)
head(sobj_pre#meta.data)
#gene name input
sobj_pre[["RNA"]]#meta.features<-fdat
head(sobj_pre[["RNA"]]#meta.features)
#Downstream analysis -------------------------------------------------------
sobj <- sobj_pre
sobj <- FindVariableFeatures(object = sobj, mean.function = ExpMean, dispersion.function = LogVMR, nfeatures = 2000)
sobj <- ScaleData(object = sobj, features = rownames(sobj), block.size = 2000)
sobj <- RunPCA(sobj, npcs = 100, ndims.print = 1:10, nfeatures.print = 5)
sobj <- FindNeighbors(sobj, reduction = "pca", dims = 1:4, nn.eps = 0.5)
sobj <- FindClusters(sobj, resolution = 1, n.start = 10)
umap.method = 'umap-learn'
metric = 'correlation'
sobj <- RunUMAP(object = sobj, reduction = "pca", dims = 1:4,min.dist = 0.5, seed.use = 123)
p0 <- DimPlot(sobj, reduction = "umap", pt.size = 0.1,label=TRUE) + ggtitle(label = "Title")
p0
# ER+ score computation -------------------
ERlist <- list(c("CPB1", "RP11-53O19.1", "TFF1", "MB", "ANKRD30B",
"LINC00173", "DSCAM-AS1", "IGHG1", "SERPINA5", "ESR1",
"ILRP2", "IGLC3", "CA12", "RP11-64B16.2", "SLC7A2",
"AFF3", "IGFBP4", "GSTM3", "ANKRD30A", "GSTT1", "GSTM1",
"AC026806.2", "C19ORF33", "STC2", "HSPB8", "RPL29P11",
"FBP1", "AGR3", "TCEAL1", "CYP4B1", "SYT1", "COX6C",
"MT1E", "SYTL2", "THSD4", "IFI6", "K1AA1467", "SLC39A6",
"ABCD3", "SERPINA3", "DEGS2", "ERLIN2", "HEBP1", "BCL2",
"TCEAL3", "PPT1", "SLC7A8", "RP11-96D1.10", "H4C8",
"PI15", "PLPP5", "PLAAT4", "GALNT6", "IL6ST", "MYC",
"BST2", "RP11-658F2.8", "MRPS30", "MAPT", "AMFR", "TCEAL4",
"MED13L", "ISG15", "NDUFC2", "TIMP3", "RP13-39P12.3", "PARD68"))
sobj <- AddModuleScore(object = sobj, features = ERlist, name = "ER_List")
#TNBC computation -------------------
tnbclist <- list(c("FABP7", "TSPAN8", "CYP4Z1", "HOXA10", "CLDN1",
"TMSB15A", "C10ORF10", "TRPV6", "HOXA9", "ATP13A4",
"GLYATL2", "RP11-48O20.4", "DYRK3", "MUCL1", "ID4", "FGFR2",
"SHOX2", "Z83851.1", "CD82", "COL6A1", "KRT23", "GCHFR",
"PRICKLE1", "GCNT2", "KHDRBS3", "SIPA1L2", "LMO4", "TFAP2B",
"SLC43A3", "FURIN", "ELF5", "C1ORF116", "ADD3", "EFNA3",
"EFCAB4A", "LTF", "LRRC31", "ARL4C", "GPNMB", "VIM",
"SDR16C5", "RHOV", "PXDC1", "MALL", "YAP1", "A2ML1",
"RP1-257A7.5", "RP11-353N4.6", "ZBTB18", "CTD-2314B22.3", "GALNT3",
"BCL11A", "CXADR", "SSFA2", "ADM", "GUCY1A3", "GSTP1",
"ADCK3", "SLC25A37", "SFRP1", "PRNP", "DEGS1", "RP11-110G21.2",
"AL589743.1", "ATF3", "SIVA1", "TACSTD2", "HEBP2"))
sobj <- AddModuleScore(object = sobj, features = tnbclist, name = "TNBC_List")
#ggplot2 issue ----------------------------------------------------------------------------
sobj[["ClusterName"]] <- Idents(object = sobj)
sobjlists <- FetchData(object = sobj, vars = c("ER_List1", "TNBC_List1", "ClusterName"))
library(reshape2)
melt(sobjlists, id.vars = c("ER_List1", "TNBC_List1", "ClusterName"))
p <- ggplot() + geom_violin(data = sobjlists, aes(x= ClusterName, y = ER_List1, fill = ER_List1, colour = "ER+ Signature"))+ geom_violin(data = sobjlists, aes(x= ClusterName, y = TNBC_List1, fill = TNBC_List1, colour="TNBC Signature"))
Extension ======================================================================
If you want to do this but with two objects (sobjlists1 and sobjlists2, for example) instead of what my example showed (two variables but one object), rbind the two and then do what #StupidWolf says
library(reshape2)
sobjlists1= melt(sobjlists1, id.vars = "treatment")
sobjlists2= melt(sobjlists2, id.vars = "treatment")
combosobjlists <- rbind(sobjlists1, sobjlists2)
and then continue on with their code using combosobjlists:
ggplot(combosobjlists,aes(x= ClusterName, y = value)) +
geom_violin(aes(fill=variable)) +
geom_boxplot(aes(col=variable),
width = 0.2,position=position_dodge(0.9))
Hope this thread helps!
Try to include just the minimum code to show your problem. Like in your example, there's no need to start with the whole seurat processing. You can just provide the data.frame with dput() and we can see the issue with ggplot2 , see this post.
Create some example data:
library(Seurat)
library(ggplot2)
genes = c(unlist(c(ERlist,tnbclist)))
mat = matrix(rnbinom(500*length(genes),mu=500,size=1),ncol=500)
rownames(mat) = genes
colnames(mat) = paste0("cell",1:500)
sobj = CreateSeuratObject(mat)
sobj = NormalizeData(sobj)
Add some made-up cluster:
sobj$ClusterName = factor(sample(0:1,ncol(sobj),replace=TRUE))
Add your module score:
sobj = AddModuleScore(object = sobj, features = tnbclist,
name = "TNBC_List",ctrl=5)
sobj = AddModuleScore(object = sobj, features = ERlist,
name = "ER_List",ctrl=5)
We get the data, what you need to do is to pivot it long correctly. Plotting it twice with ggplot2 is going to cause all kinds of problem:
sobjlists = FetchData(object = sobj, vars = c("ER_List1", "TNBC_List1", "ClusterName"))
head(sobjlists)
ER_List1 TNBC_List1 ClusterName
cell1 -0.05391108 -0.008736057 1
cell2 0.07074816 -0.039064126 1
cell3 0.08688374 -0.066967324 1
cell4 -0.12503649 0.120665057 0
cell5 0.05356685 -0.072293651 0
cell6 -0.20053804 0.178977042 1
Should look like this:
library(reshape2)
sobjlists = melt(sobjlists, id.vars = "ClusterName")
ClusterName variable value
1 1 ER_List1 -0.05391108
2 1 ER_List1 0.07074816
3 1 ER_List1 0.08688374
4 0 ER_List1 -0.12503649
5 0 ER_List1 0.05356685
6 1 ER_List1 -0.20053804
Now we plot:
ggplot(sobjlists,aes(x= ClusterName, y = value)) +
geom_violin(aes(fill=variable)) +
geom_boxplot(aes(col=variable),
width = 0.2,position=position_dodge(0.9))
for you to be able to use the data within a plot without specifying it (like geom_boxplot() ), you need to put the data in the ggplot() function call. Then the following functions are able to inherit them.
You also do not need an extra violin plot per color
library(tidyverse)
data("Puromycin")
head(Puromycin)
mycp <- ggplot(Puromycin,aes(x= state, y = conc, colour=state))+geom_violin()
mycp + geom_boxplot(width=0.1, color= "black") +
scale_color_discrete(
labels= c("Puromycin (Treatment1)","Puromycin (Treatment2)")
)
Result:

Is it possible to subset facets in a polyfreq in GGplot?

I was wondering if it was possible to use subset on a geom_polyfreq()?
I am running a topic model and in order to report the facets properly i want to remove 4 out of 10 facets.
My code is as follows:
ggplot(data = dat,
aes(x = date,
fill = Topics)) +
geom_freqpoly(binwidth = 3) +
labs(x = "",
y = "Topic Count",
title = "Mentions of Topic On a Monthly Basis")+
scale_x_date(date_breaks = "months", date_labels="%b")+
theme(text = element_text(size=8)) +
theme(axis.text.x = element_text(angle = 45))+
facet_wrap(Topics ~ ., scales = "free")
> ggplot(subset(dat, Topics %in% c(3, 4, 5, 7, 8, 9)),
aes(x = date,
fill = topic)) +
geom_freqpoly(binwidth = 3) +
labs(x = "",
y = "Topic Count",
title = "Mentions of Topic On a Monthly Basis")+
scale_x_date(date_breaks = "months", date_labels="%b")+
theme(text = element_text(size=9)) +
theme(axis.text.x = element_text(angle = 45))+
facet_wrap(Topics ~ ., scales = "free")
However, when I try to subset the data, I get an error that says:
Fejl: Faceting variables must have at least one value
Does anybody know what the issue is?
I hope this makes sense.
The full code is down below.
article.data <- article.data[!is.na(article.data$fulltext), ]
## Get date
article.data$date <- as.Date(article.data$date, "%Y-%m-%d")
#all of 2018
dat <- article.data[article.data$date > as.Date("2018-01-01", "%Y-%m-%d") &
article.data$date < as.Date("2018-12-01", "%Y-%m-%d"), ]
## 'tokenize' fulltext
quanteda_options("language_stemmer" = "danish")
texts <- gsub(":", " ", dat$fulltext, fixed = T)
texts <- tokens(texts, what = "word",
remove_numbers = T,
remove_punct = T,
remove_symbols = T,
remove_separators = T,
remove_hyphens = T,
remove_url = T,
verbose = T)
texts <- tokens_tolower(texts)
texts <- tokens_remove(texts, stopwords("danish"))
texts <- tokens_wordstem(texts)
texts <- tokens_remove(texts, stopwords("danish"))
# get actual dfm from tokens
txt.mat <- dfm(texts)
#remove frequent words with no substance
txt.mat <- txt.mat %>% dfm_remove(c("ad",
"af","aldrig","alene","alle",
"allerede","alligevel","alt",
"altid","anden","andet","andre",
"at","bag","bare", "bedre", "begge","bl.a.",
"blandt", "blev", "blevet", "blive","bliver",
"burde", "bør","ca.", "com", "da",
"dag", "dansk", "danske", "de",
"dem", "den", "denne","dens",
"der","derefter","deres","derfor",
"derfra","deri","dermed","derpå",
"derved","det","dette","dig",
"din","dine","disse","dit",
"dog","du","efter","egen",
"ej","eller","ellers","en",
"end","endnu","ene","eneste","enhver","ens",
"enten","er","et","f.eks.","far","fem",
"fik","fire","flere","flest",
"fleste","for", "foran",
"fordi","forrige","fra", "fx",
"få","får","før","først",
"gennem","gjorde","gjort","god",
"godt","gør","gøre","gørende",
"ham","han","hans","har",
"havde","have","hej","hel",
"heller","helt","hen","hende",
"hendes","henover","her",
"herefter","heri","hermed",
"herpå","hos","hun","hvad",
"hvem","hver","hvilke","hvilken",
"hvilkes","hvis",
"hvor", "hvordan","hvorefter","hvorfor",
"hvorfra","hvorhen","hvori","hvorimod",
"hvornår","hvorved","i", "ifølge", "igen",
"igennem","ikke","imellem","imens",
"imod","ind","indtil","ingen",
"intet","ja","jeg","jer","jeres",
"jo","kan","kom","komme",
"kommer", "kroner", "kun","kunne","lad",
"langs", "lang", "langt", "lav","lave","lavet",
"lidt","lige","ligesom","lille",
"længere","man","mand","mange",
"med","meget","mellem","men", "mener",
"mens","mere","mest","mig",
"min","mindre","mindst","mine",
"mit","mod","må","måske",
"ned","nej","nemlig","ni",
"nogen","nogensinde","noget",
"nogle","nok","nu","ny", "nye",
"nyt","når","nær","næste",
"næsten","og","også","okay",
"om","omkring","op","os",
"otte","over","overalt","pga.", "partier",
"partiets", "partiers", "politiske",
"procent", "på", "ritzau", "samme",
"sammen","se","seks","selv","selvom",
"senere","ser","ses","siden","sig",
"sige", "siger", "sin","sine","sit",
"skal","skulle","som","stadig",
"stor","store","synes","syntes",
"syv","så","sådan","således",
"tag","tage","temmelig","thi",
"ti","tidligere","til","tilbage",
"tit","to","tre","ud","uden",
"udover","under","undtagen","var",
"ved","vi","via","vil","ville", "viser",
"vor","vore","vores","vær","være",
"været","øvrigt","facebook","http", "https",
"www","millioner", "frem", "lars", "lars_løkke",
"rasmussen", "løkke_rasmussen", "statsminister", "politik",
"formand", "partiet", "år", "tid", "and", "fler",
"sid", "regeringen", "giv", "politisk", "folketing", "mer",
"ifølg"))
############################################################
## FEATURE SELECTION
############################################################
# check out top-appearing features in dfm
topfeatures(txt.mat)
# keep features (words) appearing in >2 documents
txt.mat <- dfm_trim(txt.mat, min_termfreq = 4)
# filter out one-character words
txt.mat <- txt.mat[, str_length(colnames(txt.mat)) > 2]
# filter out some html trash features
#txt.mat <- txt.mat[, !grepl("[[:digit:]]+px", colnames(txt.mat))]
#txt.mat <- txt.mat[, !grepl(".", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("_", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("#", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("#", colnames(txt.mat), fixed = T)]
############################################################
## SELECT FEATURES BY TF-IDF
############################################################
# Create tf_idf-weighted dfm
ti <- dfm_tfidf(txt.mat)
# Select from main dfm using its top features
txt.mat <- dfm_keep(txt.mat, names(topfeatures(ti, n = 1000)))
############################################################
## RUN TOPIC MODEL
############################################################
# convert quanteda dfm to tm 'dtm'
dtm <- convert(txt.mat, to = "topicmodels")
# run lda with 8 topics
lda <- LDA(dtm, k = 8)
# review terms by topic
terms(lda, 10)
############################################################
## LOOK FOR 'OPTIMAL' k
############################################################
# randomly sample test data
set.seed(61218)
select <- sample(1:nrow(dtm), size = 100)
test <- dtm[select, ]
train <- dtm[!(1:nrow(dtm) %in% select), ]
n.tops <- 3:14
metrics <- data.frame(topics = n.tops,
perplexity = NA)
for(i in n.tops) { # NB: takes awhile to run
print(i)
est <- LDA(train, k = i)
metrics[(i - 1), "perplexity"] <- perplexity(est, newdata = test)
}
save(metrics, file = "lda_perplexity2018.RData")
qplot(data = metrics, x = topics, y = perplexity, geom = "line",
xlab = "Number of topics",
ylab = "Perplexity on test data") + theme_bw()
#We found that 8 topics was one of those of lowest perplexity but
#also the ones which made the most sense
############################################################
## RERUN WITH BETTER CHOICE OF k
############################################################
# run lda with 10 topics
lda <- LDA(dtm, k = 10)
save(lda, file = "dr_ft_keep2018.RData")
# examine output
terms(lda, 20)
# put topics into original data
dat$topic <- topics(lda)
# add labels
#dat$date <- factor(dat$date,
#levels = 1:12,
#labels = c("januar","februar", "marts","april", "maj", "juni", "juli", "august", "september", "oktober", "november", "decemeber"))
dat$Topics <- factor(dat$topic,
levels = 1:10,
labels = c("Topc 1", "Topic 2", "Integration", "Taxation", "Burka Prohibition",
"Topic 6", "Justice", "Foreign Affairs", "Housing", "Topic 10"))
# frequency
qplot(data = dat, x = Topics,
geom = "bar", xlab = "",
ylab = "Topic Frequency", fill=Topics, main = "Figure 1: Main Topics in 2018 - DR") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90))
#Make visualization showing topics over time
ggplot(data = dat,
aes(x = date,
fill = Topics[1])) +
geom_freqpoly(binwidth = 30) +
facet_wrap(Topics ~ ., scales = "free")+
theme_classic() +
scale_x_date(breaks = as.Date(c( "2018-02-01", "2018-04-01", "2018-06-01", "2018-08-01", "2018-10-01", "2018-12-01", date_labels="%B"))) +
theme(axis.text.x = element_text(angle = 90))
ggplot(data = dat,
aes(x = date,
fill = Topics)) +
geom_freqpoly(binwidth = 3) +
labs(x = "",
y = "Topic Count",
title = "Mentions of Topic On a Monthly Basis")+
scale_x_date(date_breaks = "months", date_labels="%b")+
theme(text = element_text(size=8)) +
theme(axis.text.x = element_text(angle = 45))+
facet_wrap(Topics ~ ., scales = "free")
It's best practice on this forum to make your question reproducible, so that others can try it and test their solutions to confirm they work. It's also good if you can make it minimal, both to respect potential answerers' time and to help clarify your own understanding of the problem.
How to make a great R reproducible example
In this case, the error message suggests that your subsetting is removing all your data, which breaks the faceting. It can't plot any facets if the faceting variable has no values.
It looks like dat$Topics is a factor, but your loop is referring to Topics like they're numeric with Topics %in% c(3, 4, 5, 7, 8, 9). For example, I could define a factor vector with the same levels as your Topics variable:
Topics <- factor(1:10, levels = 1:10,
labels = c("Topc 1", "Topic 2", "Integration", "Taxation", "Burka Prohibition",
"Topic 6", "Justice", "Foreign Affairs", "Housing", "Topic 10"))
Compare the output of these three lines:
Topics %in% c(1, 2)
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
as.numeric(Topics) %in% c(1, 2)
# [1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Topics %in% c("Topc 1", "Topic 2")
# [1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
In the top case, none of the data matches the test, so using that to subset the data would give you an empty data set and seems like a plausible cause for the error you got.
To refer to the Topics by their underlying level, we can refer to as.numeric(Topics) %in% c(1, 2). If we want to refer to the Topics by their labels, I could use Topics %in% c("Topc 1", "Topic 2").
Since I don't have your data, I can't confirm this exact syntax will work for you, but I hope something along these lines will.
For more on how to work with factors in R, I recommend: https://r4ds.had.co.nz/factors.html

How do I sort a faceted bar plot on the values in one facet?

I'm trying to order the bars of a vertically facetted bar plot (3 facets) based on the values of one of the facets. Specifically, I want bars in the leftmost panel sorted by decreasing value. My data is based on multi- period fund returns. A period length is a facet; the list of ticker symbols spans all facets. I've found many similar examples, but I can't get the reordering of the x axis labels to work correctly. I'm using the code below on a much longer list of stock symbols, but this should be a working example:
require('ggplot2')
require('reshape2')
require('tseries')
symbs = c("XLE", "XLV", "XLK", "XLB", "SPY")
desc = c("Energy", "HealthCare", "Tech", "Materials", "S&P")
data = cbind(symbs, desc)
indexTickers = data[,1]
indexNames = data[,2]
latestDate =Sys.Date()
dailyPrices = lapply(indexTickers, function(ticker) get.hist.quote(instrument= ticker, start = "2012-01-01",
end = latestDate, quote="Close", provider = "yahoo", origin="1970-01-01", compression = "d", retclass="zoo"))
perf5Day = lapply(dailyPrices, function(x){(x-lag(x,k=-5))/lag(x,k=-5)})
perf20Day = lapply(dailyPrices, function(x){(x-lag(x,k=-20))/lag(x,k=-20)})
perf60Day = lapply(dailyPrices, function(x){(x-lag(x,k=-60))/lag(x,k=-60)})
names(perf5Day) = indexTickers
names(perf20Day) = indexTickers
names(perf60Day) = indexTickers
perfsMerged = lapply(indexTickers, function(spdr){merge(perf5Day[[spdr]],perf20Day[[spdr]],perf60Day[[spdr]])})
perfNames = c('1Week','1Month','3Month')
perfsMerged = lapply(perfsMerged, function(x){
names(x)=perfNames
return(x)
})
latestDataPoints = t(sapply(perfsMerged, function(x){return(x[nrow(x)])}))
namesAndTickers = paste(indexNames, paste(' (',indexTickers,')',sep=''), sep='')
latestDataPoints = data.frame(cbind(namesAndTickers,latestDataPoints))
latestDataPoints[2:4] <- lapply( latestDataPoints[2:4], function(x)
as.numeric(as.character(x)))
names(latestDataPoints) = c('Ticker','5 Day','20 Day','60 Day')
drm = melt(latestDataPoints, id.vars=c('Ticker'))
posNeg = ifelse(drm$value >= 0, 'pos','neg')
pcts = paste(round(100*drm$value, 2), "%", sep="")
drm = cbind(drm, posNeg, pcts)
names(drm) = c('Ticker','Period','Value','Sign','Pct')
ordered = drm[drm$Period=="5 Day",]
ordered = ordered[order(-ordered$Value),]
orderedSymbs = ordered$Ticker
orderedSymbs = c(orderedSymbs, orderedSymbs, orderedSymbs)
p = ggplot(drm, aes(x=reorder(Ticker, orderedSymbs),y=Value,fill=Sign)) + geom_bar(stat='identity') + facet_grid(. ~ Period,scales='free_y') +
coord_flip() + scale_fill_manual(values=c('pos'='darkgreen','neg'='red'),guide=FALSE) +
ggtitle("Performances of Top Etfs by Trading Volume")
p = p + theme(plot.title = element_text(vjust = 2, size=rel(1.8),
face="bold"), axis.text.x=element_text(color='orangered4',size=rel(1.2)),
axis.text.y=element_text(color='orangered4',size=rel(1.2)), axis.title.x = element_blank(),
axis.title.y = element_blank(), strip.text.x = element_text(size=rel(1.2)))
p
Here is the working version.
Besides some cosmetic changes to the code (that I usually use), the only major change was related to rearranging the factors before going into the ggplot.
Hope this helps
require('ggplot2')
require('reshape2')
require('tseries')
symbs = c("XLE", "XLV", "XLK", "XLB", "SPY")
desc = c("Energy", "HealthCare", "Tech", "Materials", "S&P")
data = cbind(symbs, desc)
indexTickers = data[,1]
indexNames = data[,2]
latestDate =Sys.Date()
dailyPrices = lapply(indexTickers, function(ticker) get.hist.quote(instrument= ticker, start = "2012-01-01",
end = latestDate, quote="Close", provider = "yahoo", origin="1970-01-01", compression = "d", retclass="zoo"))
perf5Day = lapply(dailyPrices, function(x){(x-lag(x,k=-5))/lag(x,k=-5)})
perf20Day = lapply(dailyPrices, function(x){(x-lag(x,k=-20))/lag(x,k=-20)})
perf60Day = lapply(dailyPrices, function(x){(x-lag(x,k=-60))/lag(x,k=-60)})
names(perf5Day) = indexTickers
names(perf20Day) = indexTickers
names(perf60Day) = indexTickers
perfsMerged = lapply(indexTickers, function(spdr){merge(perf5Day[[spdr]],perf20Day[[spdr]],perf60Day[[spdr]])})
perfNames = c('1Week','1Month','3Month')
perfsMerged = lapply(perfsMerged, function(x){
names(x)=perfNames
return(x)
})
latestDataPoints = t(sapply(perfsMerged, function(x){return(x[nrow(x)])}))
namesAndTickers = paste(indexNames, paste(' (',indexTickers,')',sep=''), sep='')
latestDataPoints = data.frame(cbind(namesAndTickers,latestDataPoints))
latestDataPoints[2:4] <- lapply( latestDataPoints[2:4], function(x)
as.numeric(as.character(x)))
names(latestDataPoints) = c('Ticker','5 Day','20 Day','60 Day')
drm = melt(latestDataPoints, id.vars=c('Ticker'))
posNeg = ifelse(drm$value >= 0, 'pos','neg')
pcts = paste(round(100*drm$value, 2), "%", sep="")
drm = cbind(drm, posNeg, pcts)
names(drm) = c('Ticker','Period','Value','Sign','Pct')
ordered = drm[drm$Period=="5 Day",]
ordered = ordered[order(-ordered$Value),]
orderedSymbs = ordered$Ticker
Here is the change
# commented out this
# orderedSymbs = c(orderedSymbs, orderedSymbs, orderedSymbs)
# added this line, see http://www.r-bloggers.com/reorder-factor-levels-2/ for details
drm$Ticker = factor(drm$Ticker, levels(drm$Ticker)[as.numeric(orderedSymbs)])
And some minor modifications here
p = ggplot(drm,
aes(x=Ticker,
y=Value,fill=Sign)
)
p = p + geom_bar(stat='identity') + facet_grid(. ~ Period,scales='free_y')
p = p + coord_flip() + scale_fill_manual(values=c('pos'='darkgreen','neg'='red'),guide=FALSE)
p = p + ggtitle("Performances of Top Etfs by Trading Volume")
p = p + theme(plot.title = element_text(vjust = 2, size=rel(1.8), face="bold"), axis.text.x=element_text(color='orangered4',size=rel(1.2)), axis.text.y=element_text(color='orangered4',size=rel(1.2)), axis.title.x = element_blank(), axis.title.y = element_blank(), strip.text.x = element_text(size=rel(1.2)))
print(p)

Plot fitted data from data frame as side-by-side barplot

I have a data frame that comes from a lm subset composed of the intercept (ordenada) and the slope (velocidad1) calculated for each subject.
A
UT1 UT2 UT3 UT4
ordenada 1213.8 2634.8 3.760000e+02 -11080.8
velocidad1 1.5 -2.5 6.615954e-14 20.0
UT5 UT6 UT7
ordenada 1711.8 1.739000e+03 1.800000e+01
velocidad1 -2.5 5.039544e-14 -9.154345e-16
UT8 UT9 UT10 UT11 UT12
ordenada 5659.2 -2791 3422.6 418.2 2802.2
velocidad1 -6.0 5 -1.0 -0.5 -1.5
UT13 UT14 TR1 TR2
ordenada 2.832000e+03 -411.2 -15722.0 -1105.4
velocidad1 1.405114e-13 3.5 25.5 25.0
TR3 TR4 TR5 TR6
ordenada 1.14600e+03 299.6 1943.4 6.840000e+02
velocidad1 -5.11402e-14 2.0 -2.5 6.479414e-14
TR7 TR8 TR9 TR10
ordenada 354.8 1.317000e+03 33284.6 -3742.6
velocidad1 1.0 -3.475548e-14 -52.0 8.0
TR11 TR12 TR13
ordenada 7.400000e+02 2205.4 -4542.6
velocidad1 -8.018585e-14 -2.5 8.0
TR14
ordenada 5.880000e+02
velocidad1 -4.406498e-14
dput(A)
structure(list(UT1 = c(1213.79999999971, 1.50000000000047), UT2 = c(2634.80000000021,
-2.50000000000033), UT3 = c(375.999999999959, 6.61595351840473e-14
), UT4 = c(-11080.8000000008, 20.0000000000013), UT5 = c(1711.80000000007,
-2.50000000000012), UT6 = c(1738.99999999997, 5.03954433109254e-14
), UT7 = c(18.0000000000006, -9.15434469010036e-16), UT8 = c(5659.20000000026,
-6.00000000000041), UT9 = c(-2791.00000000024, 5.00000000000039
), UT10 = c(3422.59999999968, -0.99999999999948), UT11 = c(418.199999999958,
-0.499999999999932), UT12 = c(2802.20000000017, -1.50000000000028
), UT13 = c(2831.99999999991, 1.40511433073812e-13), UT14 = c(-411.200000000294,
3.50000000000048), TR1 = c(-15722.0000000017, 25.5000000000028
), TR2 = c(-1105.40000000264, 25.0000000000043), TR3 = c(1146.00000000003,
-5.11402035568996e-14), TR4 = c(299.599999999803, 2.00000000000032
), TR5 = c(1943.40000000013, -2.50000000000021), TR6 = c(683.99999999996,
6.47941413997612e-14), TR7 = c(354.800000000011, 0.999999999999982
), TR8 = c(1317.00000000002, -3.47554781454658e-14), TR9 = c(33284.6000000025,
-52.000000000004), TR10 = c(-3742.60000000058, 8.00000000000094
), TR11 = c(740.00000000005, -8.0185853149896e-14), TR12 = c(2205.40000000021,
-2.50000000000034), TR13 = c(-4542.60000000042, 8.00000000000067
), TR14 = c(588.000000000027, -4.40649812201441e-14)), .Names = c("UT1",
"UT2", "UT3", "UT4", "UT5", "UT6", "UT7", "UT8", "UT9", "UT10",
"UT11", "UT12", "UT13", "UT14", "TR1", "TR2", "TR3", "TR4", "TR5",
"TR6", "TR7", "TR8", "TR9", "TR10", "TR11", "TR12", "TR13", "TR14"
), row.names = c("ordenada", "velocidad1"), class = "data.frame")
My goal is to get a barplot of the data in second row ( A[2,] ) splitting by group (UT which contains UT1,UT2... and TR) in the same graph. I am trying to do some ggplot but keep failing over and over again. I get no layers in plot error or margin error in base graphics.
The output should look like this
I KNOW the answer is in the reshape package but I wish there's another way to do that.
Thank you in advance.
Using base graphics:
# convert the one-row data frame to a two-row matrix
m <- matrix(unlist(df[2, ]), nrow = 2, byrow = TRUE)
# plot
barplot(m, beside = TRUE, col = c("blue", "red"), names.arg = seq_len(ncol(m)))
Possibly add a legend:
legend("topright", legend = c("UT", "TR"), fill = c("blue", "red"))
EDIT: Not using reshape per request in comments
library(ggplot2)
plot_data <- data.frame(names(A), t(A[2,]))
names(plot_data) <- c("variable", "value")
plot_data$group <- grepl("^TR", plot_data$variable)
plot_data$variable <- gsub("[^0-9]", "", as.character(plot_data$variable))
plot_data$variable <- factor(plot_data$variable,
unique(sort(as.numeric(plot_data$variable))))
p <- ggplot(aes(y = value, x = variable, fill = group), data = plot_data)
p + geom_bar(stat = "identity", position = "dodge")
Here is another option that incorporates your complete dataset. Not sure if this is usefull for you.
I've used reshape2, it's actually easier. You just have to melt(yourdataframe), for your particular case there is no need to specify anything else in the melt function arguments.
require("ggplot2")
require("reshape2")
A <- df
df1 <- melt(df[1,])
df1$origen <- "ORDENADA"
df2 <- melt(df[2,])
df2$origen <- "VELOCIDAD"
identical(df1$variable,df2$variable)
df3 <- rbind(df1,df2)
df3$group <- ifelse(grepl("^TR", df3$variable) == TRUE, "TR", "UT")
df3$vble <- gsub("[^0-9]", "", as.character(df3$variable))
df3$vble <- factor(df3$vble, levels = as.numeric(unique(df3$vble)))
ggplot(aes(y = value, x = vble, fill = group), data = df3) +
geom_bar(stat = "identity", position = "dodge") +
facet_grid(origen ~ ., scales = "free")
Using Functions
prepare <- function(data){
data1 <- melt(data[1,])
data1$origen <- "ORDENADA"
data2 <- melt(data[2,])
data2$origen <- "VELOCIDAD"
identical(data1$variable,data2$variable)
data3 <- rbind(data1,data2)
data3$group <- ifelse(grepl("^TR", data3$variable) == TRUE, "TR", "UT")
data3$vble <- gsub("[^0-9]", "", as.character(data3$variable))
data3$vble <- factor(data3$vble, levels = as.numeric(unique(data3$vble)))
return(data3)
}
prepare(df)
#This would work, but is a bit manual for many plots:
ggplot(aes(y = value, x = vble, fill = group), data = prepare(df)) +
geom_bar(stat = "identity", position = "dodge") +
facet_grid(origen ~ ., scales = "free")
plot_fun <- function(data){
p <- ggplot(data, aes_string(x = "vble", y = "value", fill = "group"))
p <- p + geom_bar(stat = "identity", position = "dodge")
p <- p + facet_grid(origen ~ ., scales = "free")
suppressWarnings(print(p))
}
plot_fun(prepare(df))
I guess you could loop in order to plot several data frames using the same plot function.
I guess you could probably addapt it more to your needs, but this can get you started

Resources