why does geom_text_repel() ignore size aesthetic? - r

I am using geom_text_repel. Here is the code
xx <- structure(list(SYMBOL = c("HSPB1", "DSP", "COPA", "SEMA3B", "RFC1",
"FAM120A", "MRPL24", "ENO1", "TAF15", "MRPL47", "MYO5A", "TTC37",
"RBMX", "LEMD2", "GNL3", "TRMT2A", "EIF3H", "PSPC1", "MIOS",
"POLR2A", "SCAMP3", "EEF2", "AGO3", "SASH1", "GTPBP4", "PARP1",
"RPS3", "ATP2A2", "PPAN", "KPNB1", "SPATA5", "RPL18", "ARAP1",
"VDAC2", "SF1", "RCN1", "NSUN5", "CSNK1A1", "PHGDH", "NAV1",
"EIF4G1", "RSL1D1", "DHX29", "TCP1", "HSD17B12", "PLOD1", "AKAP8L",
"STT3A", "DIMT1", "CCT5"), interactor = c(FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE), log2FC_OEIPtoOEC = c(-1.188, -1.071, -0.305, -1.196,
-2.242, -1.083, -2.027, 0.101, -0.613, -1.195, -1.632, -0.934,
-0.604, -1.714, -1.011, -1.301, -1.327, 0.447, -0.563, 0.061,
0.159, -0.354, -0.696, 2.679, -2.407, -1.133, -1.174, -0.023,
-1.739, 0.372, -1.84, -1.755, 2.496, 2.48, -0.13, -0.043, -1.436,
-2.089, 0.601, -1.868, -0.587, -0.656, -0.689, 0.489, -0.389,
-0.214, -0.682, -0.463, -2.124, -0.175), adj.pvalue_OEIPtoOEC = c(3.11820459104672e-06,
0.000375326754978968, 0.000452267099086636, 2.29371074079822e-06,
3.16069740512121e-08, 1.71482977276262e-09, 3.16164967556097e-09,
0.803071498733443, 0.000260890847563963, 7.84919749378136e-08,
2.81074102369284e-07, 0.000650750327476979, 1.76645936204616e-05,
4.64727351507494e-07, 5.97431538613294e-08, 2.06414050372876e-06,
1.60474719971609e-07, 1.06148995172679e-06, 0.000424624322200997,
0.0822930476250642, 0.0669857879144555, 0.0497856508138053, 0.0010128186235129,
6.79027171829534e-10, 7.08618519862231e-09, 5.31048502842373e-07,
9.14238499256634e-06, 0.65661095803186, 0.000109268795075108,
0.000896331917103447, 0.000154543207846778, 4.94297345832319e-06,
1.74956073133704e-06, 6.14100266559054e-10, 0.0165759045939973,
0.609835745607611, 3.22563881688616e-05, 2.36186291346976e-09,
1.96827534532196e-06, 1.23996382790666e-06, 0.000563501819276639,
6.07830082570168e-05, 0.000461083456429919, 0.0127703802958071,
0.000553587826763665, 0.017917136749192, 1.08862548100969e-07,
0.000666364393616902, 4.57980386379692e-08, 0.255896208213884
)), row.names = c(130L, 169L, 307L, 401L, 79L, 24L, 321L, 123L,
69L, 506L, 59L, 402L, 14L, 309L, 362L, 531L, 481L, 181L, 499L,
54L, 275L, 250L, 335L, 200L, 522L, 437L, 489L, 208L, 462L, 125L,
367L, 536L, 331L, 161L, 344L, 294L, 508L, 249L, 220L, 412L, 82L,
315L, 403L, 37L, 246L, 297L, 333L, 411L, 318L, 357L), class = "data.frame")
Here is the code for the plot
ggplot(xx, aes (log2FC_OEIPtoOEC, -log10(adj.pvalue_OEIPtoOEC), label = ifelse (interactor, SYMBOL, ""), colour = interactor)) +
geom_point() +
geom_text_repel (aes(size = 100)) +
geom_hline(yintercept= -log10 (0.05), linetype="dashed") +
geom_vline(xintercept= 1, linetype="dashed") +
xlab ("Log2 fold change of IP to control") +
ylab ("-Log10 of adjusted p-value") + theme_classic()
ggplot(xx, aes (log2FC_OEIPtoOEC, -log10(adj.pvalue_OEIPtoOEC), label = ifelse (interactor, SYMBOL, ""), colour = interactor)) +
geom_point() +
geom_text_repel (aes(size = 0.01)) +
geom_hline(yintercept= -log10 (0.05), linetype="dashed") +
geom_vline(xintercept= 1, linetype="dashed") +
xlab ("Log2 fold change of IP to control") +
ylab ("-Log10 of adjusted p-value") + theme_classic()
in the two plots, the aes for geom_text_repel specifies size as 100 and 0.01 respectively. However, this parameter is basically ignored and the text is always the same size, and a "size" box is added to the legend for some reason

To specify a literal size, rather than a variable that should be mapped to it, you remove the aes to get:
ggplot(xx, aes (log2FC_OEIPtoOEC, -log10(adj.pvalue_OEIPtoOEC), label = ifelse (interactor, SYMBOL, ""), colour = interactor)) +
geom_point() +
geom_text_repel (size = 100) +
geom_hline(yintercept= -log10 (0.05), linetype="dashed") +
geom_vline(xintercept= 1, linetype="dashed") +
xlab ("Log2 fold change of IP to control") +
ylab ("-Log10 of adjusted p-value") + theme_classic()
Alternatively a relationship between the variable used and the size that the text appears can be specified through one of the scale_size_ functions. scale_size_identity() would work here.

Related

How do I scatterplot LD1 vs LD2 in lda analysis?

I'm very very new to R, so thanks in advance for the help
I did the lda analysis on my dataset (tme.lda), in the console I get all my results with LD1, LD2, LD3, LD4, LD5 and LD6 but when I try to plot it I tried a lot of different methods but I get every kind of error: Error LD1 object not found - Error in fortify - Error in as.data.frame just to say a few.
This is my dataset:
dput(head(tme.lda))
structure(list(Word = structure(1:6, levels = c("bene", "bile",
"casa", "come", "posso", "tutto", "vero"), class = "factor"),
f0min = c(184L, 193L, 189L, 199L, 175L, 144L), f0max = c(229L,
226L, 198L, 225L, 192L, 188L), F1 = c(600L, 347L, 980L, 531L,
550L, 432L), F2 = c(2406L, 2695L, 1759L, 997L, 996L, 1901L
), F4 = c(4125L, 4403L, 3837L, 3988L, 3909L, 4171L), max_F0 = c(143L,
130L, 124L, 133L, 123L, 120L)), row.names = c(NA, 6L), class = "data.frame")
And this is the code I wrote, how can I get from here the scatterplot LD1 vs LD2?
View(tme.lda)
#lDFA analysis with "WORD" as grouping factor
tme.lda<-cbind(tme[,5],tme.lda[,1:6])
names(tme.lda)
#> [1] "tme[, 5]" "f0min" "f0max" "F1" "F2" "F4" "max_F0"
names(tme.lda)=c("Word","f0min","f0max","F1","F2","F4","max_F0")
names(tme.lda)
#> [1] "Word" "f0min" "f0max" "F1" "F2" "F4" "max_F0"
library(MASS)
lda(Word~f0min+f0max+F1+F2+F4+max_F0,data = tme.lda)
I tried this:
plot(Word, panel = tme.lda, abbrev = FALSE, xlab = "LD1", ylab = "LD2")
plot(x, panel = panel.lda, cex = 0.7, dimen=2, abbrev = FALSE, xlab = "LD1", ylab = "LD2")
ggplot(Word, panel = tme.lda, cex = 0.7, dimen=2, xlab = "LD1", ylab = "LD2")
ggplot2::aes(LD1,LD2) (Word, panel = tme.lda, cex = 0.7, dimen=2, xlab = "LD1", ylab = "LD2")
plot.lda<-lda(Word~f0min+f0max+F1+F2+F4+max_F0,data = tme.lda)
ggp <- ggplot(plot.lda, aes(x = LD1, y=LD2)) +
geom_point(mapping = aes(colour=Word)) +
ggtitle("LD1 Vs. LD2")
ggp <- ggplot(plot.lda, aes(x = LD1, y=LD2))
Just to say a few things I tried

plotting a 3D bubble graph on R

I am currently trying to plot a 3D bubble graph with 2 (then later i will try with 3) axes, as in excel, but on R (here is an example of the 3D bubble plot i am trying to plot) :
https://fr.extendoffice.com/documents/excel/2017-excel-create-bubble-chart.html
library(ggplot2)
library(scales)
p <- ggplot(plot_3D, aes(x = var_2020_2021_valeur, y = var_2020_2021_CA)) +
geom_point(aes(color = Specialite, size = CA.annee.N), alpha = 0.5) +
scale_color_manual(values = c("#00AFBB", "#E7B800", "#FC4E07",
"#FFB5C5", "#BF87B3", "#7F5AA2", "#3F2D91", "#000080",
"#2468A0", "#a32cc4", "#9073db", "#c51f5d",
"#5800ff", "#4455ff", "#48ff50")) +
scale_size(range = c(0.5, 12)) + # Réglage de la plage de tailles des points
ylim(-100, 100) +
xlim (-100, 100) +
geom_hline(yintercept=0) +
geom_vline(xintercept=0) +
scale_y_continuous(labels = percent) +
scale_x_continuous(labels = percent)
I get the following message of error :
"Scale for 'y' is already present. Adding another scale for 'y',
which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x',
which
will replace the existing scale."
Even when i abandon the two last lines of code, it doesn't work...
Here is the structure of my dataset plot_3D :
structure(list(Specialite = c("ANESTHESIE REANIMATION", "Autres",
"CHIRURGIE GENERALE ET VISCERALE", "CHIRURGIE PLASTIQUE", "GASTRO ENTEROLOGIE",
"GYNECOLOGIE OBSTETRIQUE", "IMAGERIE", "MAXILLO STOMATO", "MEDECINE GENERALE et
INTERNE",
"OPHTALMOLOGIE", "ORL", "ORTHOPEDIE", "PNEUMOLOGIE", "URGENTISTE",
"UROLOGIE"), CA.annee.N = c(64310L, 25298L, 1205537L, 42020L,
3694964L, 344370L, 3454L, 588033L, 228439L, 1849804L, 27358L,
2739286L, 0L, 916L, 432907L), Nombre.de.sejours.annee.N = c(171L,
34L, 1504L, 56L, 9224L, 682L, 9L, 1103L, 888L, 2276L, 57L, 4068L,
0L, 2L, 720L), CA.annee.N.1 = c(50135L, 454L, 790559L, 15531L,
2644858L, 304242L, 3026L, 402195L, 459813L, 1308933L, 20597L,
2269691L, 0L, 3901L, 318352L), Nombre.de.sejours.annee.N.1 = c(150L,
1L, 1067L, 25L, 7276L, 627L, 9L, 802L, 1918L, 1693L, 43L, 3519L,
0L, 7L, 547L), CA.annee.N.2 = c(48583L, 453L, 941610L, 16675L,
3140507L, 385813L, 2950L, 642017L, 691982L, 1704005L, 51602L,
2261368L, 7145L, 4648L, 308169L), Nombre.de.sejours.annee.N.2 = c(154L,
1L, 1264L, 28L, 8317L, 831L, 8L, 1286L, 3231L, 2269L, 127L, 3610L,
26L, 10L, 551L), CA_par_sejour_N = c(376.081871345029, 744.058823529412,
801.553856382979, 750.357142857143, 400.581526452732, 504.941348973607,
383.777777777778, 533.121486854034, 257.251126126126, 812.743409490334,
479.964912280702, 673.374139626352, NaN, 458, 601.259722222222
), CA_par_sejour_N1 = c(334.233333333333, 454, 740.917525773196,
621.24, 363.504398020891, 485.234449760766, 336.222222222222,
501.490024937656, 239.735662148071, 773.14412285883, 479, 644.981813015061,
NaN, 557.285714285714, 581.99634369287), CA_par_sejour_N2 = c(315.474025974026,
453, 744.944620253165, 595.535714285714, 377.600937838163, 464.275571600481,
368.75, 499.235614307932, 214.169606932838, 750.993829881005,
406.314960629921, 626.417728531856, 274.807692307692, 464.8,
559.290381125227), var_2020_2021_valeur = c(0.125207553640259,
0.638896087069189, 0.0818395145215454, 0.207837780659878, 0.101999119223065,
0.0406131494220115, 0.141440846001322, 0.063074957314078, 0.0730615704860669,
0.051218505658529, 0.00201443064864667, 0.0440203522616658, NaN,
-0.178159446295822, 0.0330987964754596), var_2020_2021_CA = c(0.282736611149895,
54.7224669603524, 0.524917178856986, 1.70555662867813, 0.397036816343259,
0.131895004634469, 0.141440846001322, 0.462059448774848, -0.503191514811456,
0.413215191304673, 0.328251687138904, 0.206898207729598, NaN,
-0.765188413227378, 0.35983753832236)), class = "data.frame", row.names = c(NA,
-15L))
Could anyone help ?
You should remove xlim and ylim because they can be used when you don't specify anything else on your axis. So your could add the limits to both scale_*_continuous in the limits arguments like this:
library(ggplot2)
library(scales)
p <- ggplot(plot_3D, aes(x = var_2020_2021_valeur, y = var_2020_2021_CA)) +
geom_point(aes(color = Specialite, size = CA.annee.N), alpha = 0.5) +
scale_color_manual(values = c("#00AFBB", "#E7B800", "#FC4E07",
"#FFB5C5", "#BF87B3", "#7F5AA2", "#3F2D91", "#000080",
"#2468A0", "#a32cc4", "#9073db", "#c51f5d",
"#5800ff", "#4455ff", "#48ff50")) +
scale_size(range = c(0.5, 12)) + # Réglage de la plage de tailles des points
geom_hline(yintercept=0) +
geom_vline(xintercept=0) +
scale_y_continuous(labels = percent, limits = c(-100, 100)) +
scale_x_continuous(labels = percent, limits = c(-100, 100))
p
#> Warning: Removed 1 rows containing missing values (geom_point).
Created on 2022-07-12 by the reprex package (v2.0.1)

Forest plot with table ggplot coding

I am trying to get a table side by side with my forest plot but I am having a lot of trouble doing so.
I am able to make a forest plot with the following code:
###dataframe
###dataframe
library(ggplot2)
library(tidyr)
library(grid)
library(gridExtra)
library(forcats)
forestdf <- structure(list(labels = structure(1:36, .Label = c("Age*", "Sex – male vs. female",
"Body-mass index*,1 ", "Systolic blood pressure*", "Race - vs. white",
"Asian", "Black", "Townsend deprivation index", "Social habit",
"Smoking - vs. never", "Previous", "Current", "Alcohol use - vs. never",
"Once or twice a week", "Three or four times a week", "Daily or almost daily",
"Comorbidity", "Cancer", "Diabetes", "Chronic obstructive pulmonary disease2",
"Asthma", "Ischemic heart disease3", "Hypothyroidism", "Hypercholesterolemia",
"Allergic rhinitis", "Depression", "Serology", "White blood cell count",
"Red blood cell count", "Hemoglobin concentration", "Mean corpuscular volume",
"Mean corpuscular hemoglobin concentration", "Platelet count",
"Lymphocyte count", "Monocyte count", "Neutrophil count"), class = "factor"),
rr = c(1.18, 1.45, 1.76, 0.98, NA, 2.16, 2.65, 1.09, NA,
NA, 1.35, 1.15, NA, 0.73, 0.63, 0.63, NA, 1.23, 1.34, 1.51,
1.12, 1.46, 0.96, 1.1, 1.18, 1.38, NA, 1.03, 0.87, 0.93,
1, 0.94, 1, 1.03, 1.17, 1.06), rrhigh = c(1.08, 1.28, 1.57,
0.95, NA, 1.63, 2.03, 1.07, NA, NA, 1.18, 0.94, NA, 0.58,
0.49, 0.5, NA, 0.99, 1.08, 1.09, 0.93, 1.15, 0.71, 0.92,
0.91, 1.1, NA, 1.02, 0.73, 0.87, 0.99, 0.88, 1, 1.01, 1.03,
1.01), rrlow = c(1.28, 1.64, 1.97, 1.02, NA, 2.86, 3.44,
1.11, NA, NA, 1.55, 1.42, NA, 0.9, 0.79, 0.81, NA, 1.53,
1.66, 2.09, 1.34, 1.85, 1.3, 1.31, 1.52, 1.74, NA, 1.04,
1.03, 0.98, 1.01, 1.01, 1, 1.05, 1.32, 1.1)), class = "data.frame", row.names = c(NA,
-36L))
forestdf$labels <- factor(forestdf$labels,levels = forestdf$labels)
levels(forestdf$labels) 1.52, 1.74, NA, 1.04, 1.03, 0.98, 1.01, 1.01, 1, 1.05, 1.32,
#forestplot
p <- ggplot(forestdf, aes(x=rr, y=labels, xmin=rrlow, xmax=rrhigh))+
geom_pointrange(shape=22, fill="black")+
geom_vline(xintercept = 1, linetype=3)+
xlab("Variable")+ylab("Adjusted Relative Risk with 95% Confidence Interval")+theme_classic()+scale_y_discrete(limits = rev(labels))+
scale_x_log10(limits = c(0.25, 4), breaks = c(0.25, 0.5, 1, 2, 4), labels=c("0.25", "0.5", "1", "2", "4"), expand = c(0,0))
p
However, I cannot get the left panel with labels to work:
#dataframe for table
fplottable <- structure(list(labels = structure(c(1L, 30L, 7L, 33L, 27L, 4L,
6L, 35L, 32L, 31L, 26L, 11L, 2L, 24L, 34L, 12L, 10L, 8L, 14L,
9L, 5L, 18L, 17L, 16L, 3L, 13L, 29L, 36L, 28L, 15L, 21L, 20L,
25L, 19L, 22L, 23L), .Label = c("Age*", "Alcohol use - vs. never",
"Allergic rhinitis", "Asian", "Asthma", "Black", "Body-mass index*,1 ",
"Cancer", "Chronic obstructive pulmonary disease2", "Comorbidity",
"Current", "Daily or almost daily", "Depression", "Diabetes",
"Hemoglobin concentration", "Hypercholesterolemia", "Hypothyroidism",
"Ischemic heart disease3", "Lymphocyte count", "Mean corpuscular hemoglobin concentration",
"Mean corpuscular volume", "Monocyte count", "Neutrophil count",
"Once or twice a week", "Platelet count", "Previous", "Race - vs. white",
"Red blood cell count", "Serology", "Sex – male vs. female",
"Smoking - vs. never", "Social habit", "Systolic blood pressure*",
"Three or four times a week", "Townsend deprivation index", "White blood cell count"
), class = "factor"), No..of.Events = c(1073L, 581L, 1061L, 1031L,
NA, 57L, 68L, 1072L, NA, NA, 442L, 117L, NA, 262L, 191L, 172L,
NA, 96L, 107L, 41L, 146L, 86L, 52L, 170L, 66L, 84L, NA, 1009L,
1009L, 1009L, 1009L, 1009L, 1009L, 1005L, 1005L, 1005L), ARR..95..CI. = c("1.18 (1.08-1.28)",
"1.45 (1.28-1.64)", "1.76 (1.57-1.97)", "0.98 (0.95-1.02)", "",
"2.16 (1.63-2.86)", "2.65 (2.03-3.44)", "1.09 (1.07-1.11)", "",
"", "1.35 (1.18-1.55)", "1.15 (0.94-1.42)", "", "0.73 (0.58-0.90)",
"0.63 (0.49-0.79)", "0.63 (0.50-0.81)", "", "1.23 (0.99-1.53)",
"1.34 (1.08-1.66)", "1.51 (1.09-2.09)", "1.12 (0.93-1.34)", "1.46 (1.15-1.85)",
"0.96 (0.71-1.30)", "1.10 (0.92-1.31)", "1.18 (0.91-1.52)", "1.38 (1.10-1.74)",
"", "1.03 (1.02-1.04)", "0.87 (0.73-1.03)", "0.93 (0.87-0.98)",
"1.00 (0.99-1.01)", "0.94 (0.88-1.01)", "1.00 (1.00-1.00)", "1.03 (1.01-1.05)",
"1.17 (1.03-1.32)", "1.06 (1.01-1.10)")), class = "data.frame", row.names = c(NA,
-36L))
###NOT WORKING CODE THAT TRIES TO MAKE TABLE LEFT OF FOREST PLOT
data_table <- geom_text(data=fplottable,aes(y=labels)) +
geom_text(label=eventnum) +
geom_text(label=arr)
data_table
grid.arrange(data_table,p, ncol=2)
I am drawing inspiration from:
Reproduce table and plot from journal and trying to get something similar to what is shown in the forest plot with the pink boxes
There were a few issues as #efz pointed out. In addition, you need to refactor the labels in your second column to allow them to match up with those in your first. It's probably going to look messy with the y axis labels and title alongside the table, so these could be removed too.
That leaves you something like:
forestdf$colour <- rep(c("white", "gray95"), 18)
p <- ggplot(forestdf, aes(x = rr, y = labels, xmin = rrlow, xmax = rrhigh)) +
geom_hline(aes(yintercept = labels, colour = colour), size = 7) +
geom_pointrange(shape = 22, fill = "black") +
geom_vline(xintercept = 1, linetype = 3) +
xlab("Variable") +
ylab("Adjusted Relative Risk with 95% Confidence Interval") +
theme_classic() +
scale_colour_identity() +
scale_y_discrete(limits = rev(forestdf$labels)) +
scale_x_log10(limits = c(0.25, 4),
breaks = c(0.25, 0.5, 1, 2, 4),
labels = c("0.25", "0.5", "1", "2", "4"), expand = c(0,0)) +
theme(axis.text.y = element_blank(), axis.title.y = element_blank())
names(fplottable) <- c("labels", "eventnum", "arr")
fplottable$labels <- factor(fplottable$labels, rev(levels(forestdf$labels)))
fplottable$colour <- rep(c("white", "gray95"), 18)
data_table <- ggplot(data = fplottable, aes(y = labels)) +
geom_hline(aes(yintercept = labels, colour = colour), size = 7) +
geom_text(aes(x = 0, label = labels), hjust = 0) +
geom_text(aes(x = 5, label = eventnum)) +
geom_text(aes(x = 7, label = arr), hjust = 1) +
scale_colour_identity() +
theme_void() +
theme(plot.margin = margin(5, 0, 35, 0))
grid.arrange(data_table,p, ncol = 2)
You can simplify further by merging the two dataframes as fdf <- full_join(forestdf, fplottable, by = "labels") and running your p on fdf. Then p + geom_text(aes(x=22, label=paste(" ", arr," ",eventum, sep=' '))) will give the following output: output
Obviously, limits need to be expanded to 100 to include the table, and the full code is below:
p <- ggplot(fdf, aes(x=rr, y=labels, xmin=rrlow, xmax=rrhigh))+
geom_pointrange(shape=22, fill="black") +
geom_vline(xintercept = 1, linetype=3) +
xlab("Variable")+ylab("Adjusted Relative Risk with 95% Confidence Interval") +
theme_bw() +
#scale_y_discrete(limits = rev(labels))+
scale_x_log10(limits = c(0.25, 100),
breaks = c(0.25, 0.5, 1, 2, 4, 100),
labels=c("0.25", "0.5", "1", "2", "4", ""),
expand = c(0,0)
)+
geom_text(aes(x=22, label=paste(" ", arr," ",eventum, sep=' ')))
p
supposing
names(fplottable)<-c('labels','eventum','arr')
then there are a few issues with the code for data_table. If I understood correctly you meant something like:
data_table <- ggplot(data=fplottable)+geom_text(aes(x= 1, y=labels, label=arr))+geom_text(aes(x= 1.5, y=labels, label=eventum)).
You can play with the value of x and have only one geom_text where label=paste(arr, eventum, sep=' ')
in this case the command grid.arrange(data_table,p, ncol=2) seems to work fine. You can define the space of each panel with width.

Add p-values from own formula to ggplot2

I would like to add different p-values from an specific formula in a plot. I need different p-values from each of the subjects. Here is the code I used, which did not work:
formula <- lme(scale(Inactive.freq)~ scale(Time.point), random=~ 1|Subject, data=Freq_df, method='ML')
gggplot(Freq_df, aes(x=Time.point, y=Inactive.freq, group=Subject,colour=Subject)) +
geom_line(size=2)+
theme_minimal()+
geom_point()+
stat_smooth(method=lm, se = FALSE,linetype ="dashed")+
geom_smooth(method = "lm", formula = formula)+
stat_poly_eq(aes(label = paste(stat(eq.label),
stat(adj.rr.label), sep = "~~~~")), formula = formula, parse = TRUE) +
stat_fit_glance(label.x.npc = "right", label.y.npc = "bottom", geom = "text",
aes(label = paste("P-value = ", signif(..p.value.., digits = 3), sep = "")))
I would appreciate any help. Thank you!
UPDATE
My data:
structure(list(Subject = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label =
c("Caesar",
"DL", "Kyosti", "Paul", "Richards", "Taylor"), class = "factor"),
Time.point = c(1, 3, 4, 5, 6, 7), Pacing.freq = c(0.644444444444444,
0.562962962962963, 0.411111111111111, 0.122222222222222,
0, 0), Affiliative.freq = c(0.0703125, 0.138576779026217,
0.00760456273764259, 0.00617283950617284, 0.0634920634920635,
0.0629370629370629), Inactive.freq = c(0, 0, 0.174904942965779,
0.518518518518518, 0.290322580645161, 0.172661870503597),
Not.alert.alone.freq = c(0, 0, 0.174904942965779, 0.518518518518518,
0.279569892473118, 0.165467625899281), Not.alert.with.cagemate.freq = c(0,
0, 0, 0, 0.0108695652173913, 0.00719424460431655), Alert.with.cagemate.freq = c(0.06640625,
0.0262172284644195, 0, 0, 0, 0.00719424460431655), Non_visible = c(15L,
3L, 7L, 18L, 84L, 131L), Visible = c(255L, 267L, 263L, 162L,
186L, 139L)), row.names = c(NA, 6L), class = "data.frame")
This can be done using another layer with the "stat_fit_glance" method provided with the package ggpmisc (which you are already using, I believe...). It's a great package with lot more capabilities for annotating ggplot2.
The solution would be:
The modified data
Freq_df <- structure(list(Subject = as.factor(c(rep("Caesar", 3), rep("DL", 3))),
Time.point = c(1, 3, 4, 5, 6, 7),
Pacing.freq = c(0.644444444444444, 0.562962962962963,
0.411111111111111, 0.122222222222222, 0, 0),
Affiliative.freq = c(0.0703125, 0.138576779026217, 0.00760456273764259,
0.00617283950617284, 0.0634920634920635, 0.0629370629370629),
Inactive.freq = c(0, 0, 0.174904942965779, 0.518518518518518,
0.290322580645161, 0.172661870503597),
Not.alert.alone.freq = c(0, 0, 0.174904942965779, 0.518518518518518,
0.279569892473118, 0.165467625899281),
Not.alert.with.cagemate.freq = c(0, 0, 0, 0,
0.0108695652173913, 0.00719424460431655),
Alert.with.cagemate.freq = c(0.06640625, 0.0262172284644195, 0, 0, 0,
0.00719424460431655),
Non_visible = c(15L, 3L, 7L, 18L, 84L, 131L),
Visible = c(255L, 267L, 263L, 162L, 186L, 139L)),
row.names = c(NA, 6L), class = "data.frame")
The data needed to be changed, as a line cannot be fitted unless at least two data points are there, whereas you provided one data point per subject. So I limited it to two subjects with three points per subject. But you get the idea :)
The plotting code
ggplot(Freq_df, aes(x = Time.point, y = Pacing.freq)) + ylim(-0.5, 1.5) +
geom_line(size=2, alpha = 0.5) + geom_point(aes(group = "Subject"), size = 3) +
geom_smooth(method = "lm", formula = formula) + facet_wrap('Subject') +
stat_poly_eq(aes(label = paste(stat(eq.label), stat(adj.rr.label),
sep = "~~~~")), formula = formula, parse = TRUE) +
stat_fit_glance(label.x.npc = "right", label.y.npc = "bottom", geom = "text",
aes(label = paste("P-value = ", signif(..p.value.., digits = 15),
sep = "")))
EDIT 1:
#another way to use `stat_fit_glance` (not shown in the graph here)
stat_fit_glance(label.x = "right", label.y = "bottom",
aes(label = sprintf('r^2~"="~%.3f~~italic(p)~"="~%.2f',
stat(r.squared), stat(p.value))), parse = T)
`Facet-wrap' will do the trick if you need seperate p-values (seperate line-fitting) per group (and also not too many groups I believe... there must be a limit to number of facets allowed, which I don't know!).
OUTPUT
Play with the options to get desired output, e.g. if you use label.x.npc = "left" & label.y.npc = "bottom", then the regression equation & the p value labels might overlap.

How do I add additional dimensions to a map of the United States in R?

I have data with the following columns:
state: abbreviated state names (AK, NY, NJ...etc.)
subs: Number of magazine subscribers per state
income: Average state income for that state
party: Political party - (1=democrat, 0=republican, 0.5=unknown)
Sample data is available here https://raw.githubusercontent.com/vindication09/sample_data/master/sample_data.csv
I plotted political affiliation using the following code and packages. full_join3 is the name of my dataset.
library(ggplot2)
library(usmap)
usmap::plot_usmap(data = full_join3, values = "party", lines = "red") +
scale_fill_discrete(
name = "Political Affiliation Based on 2016 election",
label = scales::comma
) +
theme(legend.position = "right")
I want to add the number of subscribers and the average income if possible for each state as a label. How would I go about doing that?
Alright, here's my best stab at achieving this, though I don't think there is a good way to get all these numbers onto the map while keeping it somewhat readable. I would suggest using a different visualisation to convey all of this information at once.
The approach is basically to ignore usmap since I cannot add parameters to plot_usmap without editing the source code. Instead, we make a geometry with inset Alaska and Hawaii using the data from the fiftystater package, and join it onto the data provided using a reference table of state names and abbreviations.
Then, plotting is a matter of using geom_sf (currently in the development version of ggplot2) and geom_label_repel from the ggrepel package. We pass a preconstructed dataframe that has all of the labels for the states.
Again though, I would prefer an alternative visualisation that skips the map and instead just more clearly shows the relationships between the variables. This makes it much more obvious that high sub states are more Republican across income levels. Although, I would check the original data (Hawaii is republican? Oklahoma is democrat?)
library(tidyverse)
library(sf)
#> Linking to GEOS 3.6.1, GDAL 2.2.3, proj.4 4.9.3
library(fiftystater)
library(ggrepel)
tbl <- structure(list(state = c("AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY", "DC"), subs = c(43L, 373L, 604L, 431L, 157L, 524L, 682L, 178L, 594L, 395L, 76L, 492L, 597L, 686L, 282L, 27L, 560L, 528L, 309L, 306L, 101L, 139L, 414L, 280L, 22L, 548L, 82L, 675L, 684L, 598L, 66L, 653L, 541L, 636L, 530L, 332L, 21L, 469L, 341L, 456L, 278L, 153L, 499L, 700L, 223L, 222L, 305L, 20L, 321L, 232L, 107L), income = c(81360L, 36595L, 51963L, 47673L, 56776L, 61959L, 37456L, 64224L, 56211L, 25183L, 44677L, 78116L, 35134L, 85910L, 81341L, 52409L, 75060L, 55098L, 56239L, 84138L, 37589L, 50006L, 88730L, 71527L, 34506L, 76364L, 89672L, 79442L, 42263L, 73869L, 65454L, 80625L, 60519L, 35125L, 60869L, 64727L, 86541L, 75562L, 50824L, 44414L, 26103L, 32962L, 61337L, 48314L, 25417L, 35721L, 34247L, 86608L, 64030L, 61089L, 37934L), party = c(0, 0, 1, 0.5, 1, 0, 0, 0.5, 0, 1, 0, 0.5, 0.5, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0.5, 0, 0.5, 0.5, 1, 0, 0.5, 0, 1, 0, 1, 0, 0.5, 1, 1, 0, 0.5, 0.5, 1, 0.5, 0, 0.5, 0.5, 0, 0, 0, 1, 1, 0.5)), row.names = c(NA, -51L), class = c("tbl_df", "tbl", "data.frame"), spec = structure(list(cols = list(state = structure(list(), class = c("collector_character", "collector")), subs = structure(list(), class = c("collector_integer", "collector")), income = structure(list(), class = c("collector_integer", "collector")), party = structure(list(), class = c("collector_double", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))
names_abbs <- tibble(state.abb, state.name) %>%
mutate(state.name = str_to_lower(state.name)) %>%
add_row(state.abb = "DC", state.name = "district of columbia")
sf_fifty <- st_as_sf(fifty_states, coords = c("long", "lat")) %>%
group_by(id, piece) %>%
summarize(do_union = FALSE) %>%
st_cast("POLYGON") %>%
ungroup() %>%
left_join(names_abbs, by = c("id" = "state.name")) %>%
left_join(tbl, by = c("state.abb" = "state")) %>%
mutate(
party = factor(party, labels = c("Republican", "Unknown", "Democrat")),
lon = map_dbl(geometry, ~st_centroid(.x)[[1]]),
lat = map_dbl(geometry, ~st_centroid(.x)[[2]])
)
labels <- sf_fifty %>%
mutate(area = st_area(geometry)) %>%
group_by(state.abb) %>%
top_n(1, area) %>%
mutate(text = str_c(str_to_title(id), "\n", subs, " subs, $", income))
ggplot(sf_fifty) +
theme_minimal() +
geom_sf(aes(fill = party)) +
coord_sf(datum = NA) +
geom_label_repel(
data = labels,
mapping = aes(label = text, x = lon, y = lat),
size = 2
) +
scale_fill_brewer(
type = "diverging",
palette = "RdYlBu",
name = "Political Affiliation Based on 2016 Election"
) +
theme(
legend.position = "bottom",
axis.title = element_blank()
)
ggplot(labels, aes(x = income, y = subs)) +
theme_minimal() +
geom_point(aes(colour = party), size = 3) +
scale_colour_discrete(name = "Political Affiliation Based on 2016 Election") +
geom_text_repel(aes(label = state.abb)) +
theme(legend.position = "bottom")
Created on 2018-07-02 by the reprex package (v0.2.0).

Resources