Make facet_matrix show density plot for variable with missing values - r

I want to plot a facet_matrix showing scatter plots and autodensity plots on the diagonal. However, for some reason it does not show the density plot for a certain variable (gini_eurostat). I assume this is because there are some missing values for gini_eurostat. How can I make it show the density plot, even though there are some missing values?
This is the code I used:
ggplot(df_Q2, aes(x = .panel_x, y = .panel_y)) +
geom_autodensity() +
geom_point(alpha = 1, shape = 16, size = 0.5) +
facet_matrix(vars(c(intraEU_trade_bymemberstate_pct, gini_eurostat, exports_currentUSD)),
layer.upper = 2, layer.diag=1, layer.lower = 2) +
theme_few()
The data frame looks like this:
head(df_Q2[,c("intraEU_trade_bymemberstate_pct", "gini_eurostat", "exports_currentUSD")])
# A tibble: 6 × 3
# intraEU_trade_bymemberstate_pct gini_eurostat exports_currentUSD
# <dbl> <dbl> <dbl>
# 1 8.6 NA 96701496330.
# 2 8.8 27.4 116638893905.
# 3 8.8 25.8 141025428359.
# 4 8.4 26.3 153625979356.
# 5 8 25.3 170827273868.
# 6 8.1 26.2 204299603066.

Related

R - Plot 2-way Anova Results on longitudinal data using GGplot2

I am looking to display the results of a two-way Anova analysis over several time points. This is preliminary data, and I am interested in getting an understanding of the potential relationship between time and sex on tumor burden.
My data:
ID Sex Tumor.Burden Time.Point
Cage3 female 1270800 1
Cage3 female 1237600 2
Cage3 female 1288760 3
Cage3 female 775220 4
Cage4 female 1768400 1
Cage4 female 1630200 2
Cage4 female 1606900 3
Cage4 female 1134220 4
Cage5 male 1441500 1
Cage5 male 3000750 2
Cage5 male 5930500 3
Cage5 male 6944225 4
Cage6 male 2063640 1
Cage6 male 7067600 2
Cage6 male 10460400 3
Cage6 male 18764800 4
This is the plot I am using. I'd like to point out that this wasn't made with the data I just listed, but rather with similar data. However, I plan on using the same approach here.
ggplot(Data, aes(x = Time.Point, y = Tumor.Burden, color = Sex)) +
geom_line() +
theme_minimal() +
labs(title = "Weekly Follow-up of Tumor-Bearing Mice", x = "Time points (weeks)", y="Log(Tumor Burden)") +
theme(plot.title = element_text(size = 10, hjust = 0.5))
What is the best approach to add the significance for each time point above the corresponding time point on the plot? I.e., is there a statistically significant difference between males and females at each of time points 1:5?
Currently, I am following this: https://www.datanovia.com/en/lessons/repeated-measures-anova-in-r/#two-way-repeated-measures-anova. However, I am getting an error at the end and it seems to be related to my ID variable getting flagged as NA when I run
Data %>%
group_by(Time.Point) %>%
anova_test(dv = Tumor.Burden, wid = ID, within = Sex)
Thanks!
To calculate the p values you can use anova_test(Tumor.Burden ~ Sex) and use these output p values in a geom_text to show them in your plot like this:
library(ggplot2)
library(ggpubr)
library(rstatix)
library(dplyr)
# Run the ANOVA Tumor.Burden ~ Sex separately within each Time.Point group;
# the result holds one row of test statistics (including p) per time point.
p_values <- anova_test(
  group_by(Data, Time.Point),
  Tumor.Burden ~ Sex
)
#> Coefficient covariances computed by hccm()
#> Coefficient covariances computed by hccm()
#> Coefficient covariances computed by hccm()
#> Coefficient covariances computed by hccm()
p_values
#> # A tibble: 4 × 8
#> Time.Point Effect DFn DFd F p `p<.05` ges
#> * <int> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
#> 1 1 Sex 1 2 0.342 0.618 "" 0.146
#> 2 2 Sex 1 2 3.11 0.22 "" 0.608
#> 3 3 Sex 1 2 8.83 0.097 "" 0.815
#> 4 4 Sex 1 2 4.05 0.182 "" 0.669
# Plot tumor burden over time and print each time point's ANOVA p value.
# `group = ID` draws one line per cage; with only `color = Sex` the cages that
# share a sex would be joined into a single zig-zag path.
ggplot() +
  geom_line(
    data = Data,
    mapping = aes(x = Time.Point, y = Tumor.Burden, color = Sex, group = ID)
  ) +
  # The labels come from the p_values table and are placed at a fixed height
  # (y = 15000000) above their time point.
  geom_text(
    data = p_values,
    mapping = aes(x = Time.Point, y = 15000000, label = p),
    size = 3
  ) +
  theme_minimal() +
  labs(
    title = "Weekly Follow-up of Tumor-Bearing Mice",
    x = "Time points (weeks)",
    y = "Log(Tumor Burden)"
  ) +
  theme(plot.title = element_text(size = 10, hjust = 0.5))
Created on 2022-11-16 with reprex v2.0.2

ggplot boxplot with mean and confidence interval by group

I'd like to make a boxplot with the mean instead of the median. Moreover, I would like the line to stop at the 5% (lower) and 95% (upper) quantiles. Here is the code:
ggplot(data, aes(x=Cement, y=Mean_Gap, fill=Material)) +
geom_boxplot(fatten = NULL,aes(fill=Material), position=position_dodge(.9)) +
xlab("Cement") + ylab("Mean cement layer thickness") +
stat_summary(fun=mean, geom="point", aes(group=Material), position=position_dodge(.9),color="black")
I'd like to change geom to errorbar, but this doesn't work. I tried middle = mean(Mean_Gap), but this doesn't work either. I tried ymin = quantile(y,0.05), but nothing was changing. Can anyone help me?
The standard boxplot using ggplot. fill is Material:
Here is how you can create the boxplot using custom parameters for the box and whiskers. It's the solution shown by @lukeA in stackoverflow.com/a/34529614/6288065, but this one will also show you how to make several boxes by groups.
The R built-in data set called "ToothGrowth" is similar to your data structure so I will use that as an example. We will plot the length of tooth growth (len) for each vitamin C supplement group (supp), separated/filled by dosage level (dose).
# "ToothGrowth" at a glance
head(ToothGrowth)
# len supp dose
#1 4.2 VC 0.5
#2 11.5 VC 0.5
#3 7.3 VC 0.5
#4 5.8 VC 0.5
#5 6.4 VC 0.5
#6 10.0 VC 0.5
library(dplyr)
# recreate the data structure with specific "len" coordinates to plot for each group
# For every (supp, dose) group, compute the five values the box is drawn from:
# 5% / 95% quantiles for the whisker ends, 25% / 75% quantiles for the hinges,
# and the mean (rather than the median) for the middle line.
df <- summarise(
  group_by(ToothGrowth, supp, dose),
  y0   = quantile(len, probs = 0.05),
  y25  = quantile(len, probs = 0.25),
  y50  = mean(len),
  y75  = quantile(len, probs = 0.75),
  y100 = quantile(len, probs = 0.95)
)
df
## A tibble: 6 x 7
## Groups: supp [2]
# supp dose y0 y25 y50 y75 y100
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 OJ 0.5 8.74 9.7 13.2 16.2 19.7
#2 OJ 1 16.8 20.3 22.7 25.6 26.9
#3 OJ 2 22.7 24.6 26.1 27.1 30.2
#4 VC 0.5 4.65 5.95 7.98 10.9 11.4
#5 VC 1 14.0 15.3 16.8 17.3 20.8
#6 VC 2 19.8 23.4 26.1 28.8 33.3
# boxplot using the mean for the middle and 95% quantiles for the whiskers
# The box statistics are already in df, so they are mapped directly onto the
# boxplot aesthetics and drawn with stat = "identity".
ggplot(data = df, mapping = aes(x = supp, fill = as.factor(dose))) +
  geom_boxplot(
    mapping = aes(
      ymin = y0,
      lower = y25,
      middle = y50,
      upper = y75,
      ymax = y100
    ),
    stat = "identity"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "len", title = "Boxplot with Mean Middle Line")
In the figure above, the boxplot on the left is the standard boxplot with regular median line and regular min/max whiskers. The boxplot on the right uses the mean middle line and 5%/95% quantile whiskers.

Visualize multiple box plots by selecting different rows of a data frame

I am developing an EDA (Estimation of Distribution Algorithm). I am collecting all the metrics of the Pareto front's solutions under distinct configurations.
I have a structure with all values:
> metrics20
# A tibble: 320 x 6
File Hypervolume `Modified Hypervolume` Spread Spacing Time
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 001-unif-0.csv 25771 26294. 391. 30.1 16.8
2 002-unif-0.csv 27481 28416. 534. 41.1 16.5
3 003-unif-0.csv 26394 26842. 356. 29.6 16.5
4 004-unif-0.csv 30828 31696 418. 38.0 16.5
5 005-unif-0.csv 28146 28727 444. 34.2 16.6
6 006-unif-0.csv 30176 31006 451. 50.1 16.6
7 007-unif-0.csv 29374 30216 537. 35.8 16.5
8 008-unif-0.csv 27434 28156. 439. 31.4 16.5
9 009-unif-0.csv 28944 29426 471. 33.7 16.4
10 010-unif-0.csv 28339 29302. 576. 44.3 16.4
I want to visualize the values in the following way. Taking the Hypervolume column as an example, I split the data by the distribution encoded in the File column (-unif-, -sat-, -eff- and -prod-) and, for each distribution, show the values for -0.csv, -0.25.csv, -0.5.csv and -0.75.csv on the x axis.
Reproducible example:
library(readr)
metrics20 <- read_csv("./metrics20.csv")
Data: Link
Hopefully this is a step towards what you're looking for:
library(readr)
library(dplyr)
library(ggplot2)
metrics20 <- read_csv("metrics20.csv")
# Derive the distribution name and the level from each file name, then draw
# one box per level with one panel per distribution.
metrics20 %>%
  mutate(
    # Second capture group: the token between the leading digits and the next
    # "-". The factor levels fix the panel order.
    # `File` is referenced as a column (data masking) instead of `.$File`,
    # which only works with the magrittr pipe and ignores any grouping.
    tag = factor(
      gsub("(^\\d+-)(\\w+)(-.*$)", "\\2", File),
      levels = c("unif", "sat", "eff", "prod")
    ),
    # Second capture group: everything between that token and ".csv".
    level = gsub("(^\\d+-\\w+-)(.*)(\\.csv$)", "\\2", File)
  ) %>%
  ggplot(aes(x = level, y = Hypervolume)) +
  geom_boxplot() +
  facet_wrap(~tag, nrow = 1) +
  theme_minimal() +
  theme(
    panel.border = element_rect(colour = "black", fill = NA),
    panel.grid = element_blank()
  )
From here there may be other things you want to tweak if you need to adjust it to be more like the example plot. You should be able to find all next steps in the help for the functions used.

Plotting Cumulative Gains Curve Plot R

I am trying to generate a cumulative gain plot using ggplot2 in R. Basically I want to replicate following using ggplot2.
My Data is this
df
# A tibble: 10 x 6
Decile resp Cumresp Gain Cumlift
<int> <dbl> <dbl> <dbl> <dbl>
1 8301 8301 57.7 5.77
2 2449 10750 74.8 3.74
3 1337 12087 84.0 2.80
4 751 12838 89.3 2.23
5 462 13300 92.5 1.85
6 374 13674 95.1 1.58
7 252 13926 96.8 1.38
8 195 14121 98.2 1.23
9 136 14257 99.1 1.10
10 124 14381 100 1
## Cumulative Gains Plot
ggplot(df, aes(Decile, Gain)) +
geom_point() +
geom_line() +
geom_abline(intercept = 52.3 , slope = 4.77)
scale_y_continuous(breaks = seq(0, 100, by = 20)) +
scale_x_continuous(breaks = c(1:10)) +
labs(title = "Cumulative Gains Plot",
y = "Cumulative Gain %")
However, I am not able to get the diagonal line, even though I tried geom_abline, and my y-axis is not right either: I could not make it run from 0 to 100.
I would really appreciate if someone can get me the plot as in picture using ggplot2.
Thanks in advance
library(dplyr); library(ggplot2)
# Add an origin row (Decile 0, Gain 0) and sort by Decile so the curve starts
# at zero.
df2 <- arrange(add_row(df, Decile = 0, Gain = 0), Decile)

ggplot(data = df2, mapping = aes(x = Decile, y = Gain)) +
  geom_point() +
  geom_line() +
  # Second line layer that is given only the first and last rows of df2,
  # which yields the straight reference line between those two points.
  geom_line(data = slice(df2, 1, n())) +
  scale_x_continuous(breaks = 1:10) +
  scale_y_continuous(breaks = seq(0, 100, by = 20), limits = c(0, 100)) +
  labs(title = "Cumulative Gains Plot", y = "Cumulative Gain %")

How can I plot using 2 y-axes using a single data frame with 7 variables having a wide range of values?

I have 7 variables (density of plankton functional groups) in a time series which I want to place in a single plot to compare their trends over time. I used ggplot, geom_point and geom_line. Since each of the variables varies in range, those with smaller values end up as almost flat lines when plotted against those with larger values. Since I am only after the trends, not the density, I would prefer to see all lines in one plot. I considered using the sec.axis function, but could not figure out how to assign the variables to either of the y-axes.
Below is my sample data:
seq=1:6
fgrp:
Cop<-c(4.166667,4.722222,3.055556,4.444444,2.777778,2.222222)
Cyan<-c(7.222222,3.888889,1.388889,0.555556,6.944444,3.611111)
Dia<-c(96.66667,43.88889,34.44444,111.8056,163.0556,94.16667)
Dino<-c(126.9444,71.11111,50,55.97222,65,38.33333)
Naup<-c(271.9444,225.5556,207.7778,229.8611,139.7222,92.5)
OT<-c(22.5,19.16667,10.27778,18.61111,18.88889,8.055556)
Prot<-c(141.9444,108.8889,99.16667,113.8889,84.44444,71.94444)
And the ggplot script without the sec.axis since I could not make it work yet:
ggplot(data=df,aes(x=seq,y=mean,shape=fgrp,linetype=fgrp))+geom_point(size=2.5)+geom_line(size=0.5)+scale_shape_manual(values=c(16,17,15,18,8,1,0),
guide=guide_legend(title="Functional\nGroups"))+scale_linetype_manual(values=c("solid","longdash","dotted","dotdash","dashed","twodash","12345678"),guide=F)+scale_y_continuous(sec.axis = sec_axis(~./3)) +geom_errorbar(mapping=aes(ymax=mean+se,ymin=mean-se), width=0.04,linetype="longdash",color="gray30")+theme_minimal()+labs(list(title="Control",x="time",y="density"),size=12)+theme(plot.title = element_text(size = 12,hjust = 0.5 ))
The lines do not look terrible, as is, but here's an example that leverages facet_wrap with scales = "free_y" that should get you going in the right direction:
library(tidyverse)
seq <- 1:6
Cop <- c(4.166667,4.722222,3.055556,4.444444,2.777778,2.222222)
Cyan <- c(7.222222,3.888889,1.388889,0.555556,6.944444,3.611111)
Dia <- c(96.66667,43.88889,34.44444,111.8056,163.0556,94.16667)
Dino <- c(126.9444,71.11111,50,55.97222,65,38.33333)
Naup <- c(271.9444,225.5556,207.7778,229.8611,139.7222,92.5)
OT <- c(22.5,19.16667,10.27778,18.61111,18.88889,8.055556)
Prot <- c(141.9444,108.8889,99.16667,113.8889,84.44444,71.94444)
df <- tibble(
seq = seq,
cop = Cop,
cyan = Cyan,
dia = Dia,
dino = Dino,
naup = Naup,
ot = OT,
prot = Prot
)
df
#> # A tibble: 6 x 8
#> seq cop cyan dia dino naup ot prot
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 4.17 7.22 96.7 127. 272. 22.5 142.
#> 2 2 4.72 3.89 43.9 71.1 226. 19.2 109.
#> 3 3 3.06 1.39 34.4 50 208. 10.3 99.2
#> 4 4 4.44 0.556 112. 56.0 230. 18.6 114.
#> 5 5 2.78 6.94 163. 65 140. 18.9 84.4
#> 6 6 2.22 3.61 94.2 38.3 92.5 8.06 71.9
# Reshape from one column per functional group to one row per (seq, grp) pair.
# pivot_longer() replaces the superseded gather(); arrange() then orders the
# rows group by group (all of cop, then cyan, ...), as gather() did.
df_tidy <- df %>%
  pivot_longer(-seq, names_to = "grp", values_to = "value") %>%
  arrange(grp, seq)
df_tidy
#> # A tibble: 42 x 3
#> seq grp value
#> <int> <chr> <dbl>
#> 1 1 cop 4.17
#> 2 2 cop 4.72
#> 3 3 cop 3.06
#> 4 4 cop 4.44
#> 5 5 cop 2.78
#> 6 6 cop 2.22
#> 7 1 cyan 7.22
#> 8 2 cyan 3.89
#> 9 3 cyan 1.39
#> 10 4 cyan 0.556
#> # ... with 32 more rows
# All groups drawn in a single panel on a shared y axis.
ggplot(data = df_tidy, mapping = aes(x = seq, y = value, color = grp)) +
  geom_line()

# The same lines split into one panel per group, each panel with its own
# y scale (scales = "free_y").
ggplot(data = df_tidy, mapping = aes(x = seq, y = value, color = grp)) +
  geom_line() +
  facet_wrap(vars(grp), scales = "free_y")

Resources