How to remove outliers from nonlinear regression curve? with 3-sigma limits? - r

I tried to remove outliers from nonlinear regression curve, and get the updated formula and error criteria. Someone suggested me to use 3-sigma limits, a statistical calculation where the data are within three standard deviations from a mean. But I don't know how to realize it in my case.
Here is the original data.
ISIDOR <- structure(list(Pos_heliaphen = c("W30", "X41", "Y27", "Z24",
"Y27", "W30", "W30", "X41", "Y27", "W30", "X41", "Z40", "Z99"
), traitement = c("WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW",
"WW", "WW", "WW", "WW", "WW"), Variete = c("Isidor", "Isidor",
"Isidor", "Isidor", "Isidor", "Isidor", "Isidor", "Isidor", "Isidor",
"Isidor", "Isidor", "Isidor", "Cali"), FTSW_apres_arros = c(0.462837958498518,
0.400045032939416, 0.352560790392534, 0.377856799586057, 0.170933345859364,
0.315689846065931, 0.116825600914318, 0.0332444780173884, 0.00966070114456602,
0.0871102539376406, 0.0107280083093036, 0.195548432729584, 1),
NLE = c(0.903498791068124, 0.954670066942938, 0.970762905436272,
0.873838605282389, 0.647875257025359, 0.53056603773585, 0.0384548155916796,
0.0470924009989314, 0.00403163281128882, 0.193696514297641,
0.0718450645564359, 0.295346695941639, 1)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -13L))
Here is my original code.
pred_df <- data.frame(FTSW_apres_arros = seq(min(ISIDOR$FTSW_apres_arros),
max(ISIDOR$FTSW_apres_arros),
length.out = 100))
pred_df$NLE <- predict(mod, newdata = pred_df)
mod = nls(NLE ~ 2/(1+exp(a*FTSW_apres_arros))-1,start = list(a=1),data = ISIDOR)
ISIDOR$pred = predict(mod,ISIDOR)
a = coef(mod)
RMSE = rmse(ISIDOR$NLE, ISIDOR$pred)
MSE = mse(ISIDOR$NLE, ISIDOR$pred)
Rsquared = summary(lm(ISIDOR$NLE~ ISIDOR$pred))$r.squared
ggplot(ISIDOR, aes(FTSW_apres_arros, NLE)) +
geom_point(aes(color = Variete), pch = 19, cex = 3) +
geom_line(data = pred_df) +
scale_color_manual(values = c("red3","blue3"))+
scale_y_continuous(limits = c(0, 1.0)) +
scale_x_continuous(limits = c(0, 1)) +
labs(title = "Isidor",
y = "Expansion folliaire totale relative",
x = "FTSW",
subtitle = paste0("y = 2/(1 + exp(", round(a, 3), "* x)) -1)","\n",
"R^2 = ", round(Rsquared, 3)," RMSE = ",
round(RMSE, 3), " MSE = ", round(MSE, 3)))+
theme(plot.title = element_text(hjust = 0, size = 14, face = "bold",
colour = "black"),
plot.subtitle = element_text(hjust = 0,size=10, face = "italic",
colour = "black"),
legend.position = "none")
Here is the picture I got. I also want to get the updated formula and error criteria (circled in red).
If 3-sigma limits doesn't work for my case, could anyone recommend me other ways to deal with outliers?

Related

change second y axis color in base R

Change secondary line axis color changes send color for ggplot, but I chose to go with base R, and would like to be able to select the second y axis color.
I have the following data:
df = structure(list(A = c("Q4-17", "Q1-18", "Q2-18", "Q3-18", "Q4-18",
"Q1-19", "Q2-19", "Q3-19", "Q4-19", "Q1-20", "Q2-20", "Q3-20",
"Q4-20", "Q1-21", "Q2-21", "Q3-21", "Q4-21", "Q1-22", "Q2-22",
"Q3-22"), B = c(69.45, 71.1, 74.94, 73.87, 93.61, 91.83,
95.38, 109.8, 133.75, 125.26, 118.22, 145.65, 144.9757185, 155.3464032,
184.367033, 179.8121721, 187.235487, 189.1684376, 184.3864519,
161.5300056), C = c(70.73, 71.73, 74.33, 73.27,
95.94, 94.38, 95.38, 109.8, 115.32, 116.92, 115.9, 113.87, 106.108147,
96.84273563, 111.5150869, 110.1228567, 110.7448835, 194.9684376,
187.7241152, 167.7665553), D = c(260.3, 216.02, 203.72,
203.52, 300.96, 320.77, 330.5, 413.52, 436.7, 474.96, 463.6,
501.87, 493.8865461, 497.1760767, 514.9903459, 503.7601267, 510.8362938,
614.9915546, 603.5761107, 593.660831), E = c(NA,
NA, NA, NA, NA, NA, NA, NA, 39.237, 35.621, 32.964, NA, 152.137,
140.743023, 167.809, 170.877, 117.517, 102.691723, 88.8, 76.2445528
)), class = "data.frame", row.names = c(NA, -20L))
df = df %>%
rowwise() %>%
mutate(sums = sum(D,E, na.rm = TRUE))
df = df[8:nrow(df),]
and this to generate my plot
x <- seq(1,nrow(df),1)
y1 <- df$B
y2 <- df$D
par(mar = c(5, 4, 4, 4) + 0.3)
plot(x, y1, col = "#000000",
type = "l",
main = "title",
ylim = c(0, max(df[,2:3])),
ylab = "Y1",
xlab = "",
xaxt = "n")
axis(1,
at = seq(from = 13, by = -4, length.out = 4),
labels = df$A[seq(from = 13, by = -4, length.out = 4)])
lines(x, df$C, lty = "dashed", col = "#adadad", lwd = 2)
par(new = TRUE)
plot(x, df$sums, col = "#ffa500",
axes = FALSE, xlab = "", ylab = "", type = "l")
axis(side = 4, at = pretty(range(y2)),
ylim = c(0,max(df[,3:5], na.rm = TRUE)),
col = "#00aa00") # Add colour selection of 2nd axis
par(new = TRUE)
plot(x, df$D , col = "#0000ff",
axes = FALSE, xlab = "", ylab = "", type = "l", lwd = 1)
mtext("y2", side = 4, line = 3)
but this does not colour my complete second y axis, nor labels, nor title
does any one have any suggestions to be able to set entire y2 axis to be #00AA00 - ticks, labels, and title?

Is there a way to report prevalence ratio using gtsummary?

How would I report PR instead of OR?
library(gtsummary)
library(dplyr)
set.seed(2022)
trial_data <- tibble(
"outcome" = factor(rbinom(1000, 1, 0.20),
labels = c("No", "Yes")),
"var1" = factor(rbinom(1000, 2, 0.25),
labels = c("Low", "Middle", "High")),
"var2" = factor(rbinom(1000, 2, 0.20),
labels = c("Primary", "Secondary", "Tertiary")),
"var3" = factor(rbinom(1000, 1, 0.10),
labels = c("No", "Yes")),
"var4" = round(rnorm(1000, 20, 5)),
)
trial_data %>% count(var3)
# Logistic regression with odds ratio
trial_data %>%
tbl_uvregression(
method = glm,
y = "outcome",
method.args = list(family = binomial),
exponentiate = TRUE,
pvalue_fun = function(x) style_pvalue(x, digits = 2)
)

How can I combine color and shape identity with ggplot2?

I get 2 different legends one for shape and one for color. I've read: Combine legends for color and shape into a single legend already but I have no clue how to combine a color/shape identity together.
My Data:
It calls Vergleich2
My code:
Vergleich2 <- data.frame(
list(
RH = c(4.4, 70.81, 89.74, 98.21, 99.45, 100.3, 101.16, 101.83, 103.46, 103.65, 103.9, 33.37, 32.26, 50.39, 75.65, 81.54, 86.58, 91.88, 94.1, 96.41, 98.52, 99.93, 101.45, 77.09, 84.51, 92.15, 94.61, 96.22, 97.36, 98.85, 98.95, 98.74, 99.34, 100.07, 101.06, 102.45, 103.04),
max = c(0, 0.0262005491707849,
0.0960002914076637, 0.26123554979527, 0.299421329851762, 0.362190635901956, 0.452267730725373, 0.60803295055093, 0.958096790371026, 0.995440372287362, 1, 0.0191985206504361, 0, 0.0444427600652313, 0.0676200802520531, 0.0922989569990268, 0.112052964622176, 0.182215180712429, 0.248659241123121, 0.327097853193048, 0.496708233033155, 0.627302705113058, 1, 0.515522981617377, 0.585402158506993, 0.762678213109326, 0.920738889764711, 0.836214001953324, 0.871654063266438, 0.908503395902539, 0.825786584233689, 0.875522664077668, 0.831158954459146, 0.831533205795933, 0.893700430247523, 1.00803637031109,
1), letzte = c(0, 0.0171373807524096, 0.0818334694345387, 0.239981280241844, 0.280068579568638, 0.345939316999413, 0.434432925347285, 0.611502937955804, 0.964279264750348, 1.00834862405373, 1, 0.00678220086610785, 0, -0.00307024455552525, 0.0255053593718935, 0.0748980985479396, 0.0890155980480638, 0.153017148428967, 0.187262260262659, 0.306449913424004,
0.454599256084893, 0.614943073105356, 1, 0.527873986434174, 0.593334258062775, 0.768834444991388, 1.21440714508987, 0.847592976104216, 0.892496700707447, 0.917439391188656, 0.834935302471757, 0.840806889095709, 0.823590477107656,
0.834511976098586, 0.912778381850167, 1.00642363306524, 1)))
Region1_plot <- ggplot()+
geom_point(data=Vergleich2[c(1:11),], mapping=aes(x=RH,y=max,col="red",shape=19))+
geom_point(data=Vergleich2[c(1:11),], mapping=aes(x=RH,y=letzte, col="blue",shape=19))+
geom_point(data=Vergleich2[c(12:13),], mapping=aes(x=RH,y=max,col="red",shape=3))+
geom_point(data=Vergleich2[c(12:13),], mapping=aes(x=RH,y=letzte, col="blue",shape=3))+
geom_point(data=Vergleich2[c(14:23),], mapping=aes(x=RH,y=max,col="red",shape=6))+
geom_point(data=Vergleich2[c(14:23),], mapping=aes(x=RH,y=letzte, col="blue",shape=6))+
scale_color_identity("", guide="legend", breaks=c("red","blue"),labels=c("Incr.1","Decr.1"))+
scale_shape_identity("", guide = "legend", breaks=c(19,3,6), labels=c("Incr.1","Decr.1", "Incr.2"))+
labs(title = "Relative Wasseraufnahme Isopren SOA #10 (RH)", title.position="center")+
ylab("Norm. Optical Density [-]")+
xlab("RH [%]")+
coord_cartesian(xlim = c(0, 100))+
scale_x_continuous(breaks=c(seq(0,120, 4)))+
theme(axis.text = element_text(size = 15),
axis.title = element_text(size=15),
plot.title = element_text(hjust = 0.5),
legend.title = element_text(size=0),
legend.text= element_text(size=15),
legend.background = element_rect(),
legend.position = c(0.095,0.9),
title = element_text(size=20))
print(Region1_plot)
Thx for your help!!
Per my experience it makes to shape data so that you can keep ggplot calls as simple as possible. The various geom_points hint at a problem with your input data. Here's a proposal how to add a column that contains a combination of the attributes you want to show:
library(tidyverse)
Vergleich2 <- data.frame(
list(
RH = c(4.4, 70.81, 89.74, 98.21, 99.45, 100.3, 101.16, 101.83, 103.46, 103.65, 103.9, 33.37, 32.26, 50.39, 75.65, 81.54, 86.58, 91.88, 94.1, 96.41, 98.52, 99.93, 101.45, 77.09, 84.51, 92.15, 94.61, 96.22, 97.36, 98.85, 98.95, 98.74, 99.34, 100.07, 101.06, 102.45, 103.04),
max = c(0, 0.0262005491707849,
0.0960002914076637, 0.26123554979527, 0.299421329851762, 0.362190635901956, 0.452267730725373, 0.60803295055093, 0.958096790371026, 0.995440372287362, 1, 0.0191985206504361, 0, 0.0444427600652313, 0.0676200802520531, 0.0922989569990268, 0.112052964622176, 0.182215180712429, 0.248659241123121, 0.327097853193048, 0.496708233033155, 0.627302705113058, 1, 0.515522981617377, 0.585402158506993, 0.762678213109326, 0.920738889764711, 0.836214001953324, 0.871654063266438, 0.908503395902539, 0.825786584233689, 0.875522664077668, 0.831158954459146, 0.831533205795933, 0.893700430247523, 1.00803637031109,
1), letzte = c(0, 0.0171373807524096, 0.0818334694345387, 0.239981280241844, 0.280068579568638, 0.345939316999413, 0.434432925347285, 0.611502937955804, 0.964279264750348, 1.00834862405373, 1, 0.00678220086610785, 0, -0.00307024455552525, 0.0255053593718935, 0.0748980985479396, 0.0890155980480638, 0.153017148428967, 0.187262260262659, 0.306449913424004,
0.454599256084893, 0.614943073105356, 1, 0.527873986434174, 0.593334258062775, 0.768834444991388, 1.21440714508987, 0.847592976104216, 0.892496700707447, 0.917439391188656, 0.834935302471757, 0.840806889095709, 0.823590477107656,
0.834511976098586, 0.912778381850167, 1.00642363306524, 1)))
plot_df <- Vergleich2[1:23,] ## above you plot a subset of the data - that's why I'm choosing columns 1:23
plot_df <- plot_df %>%
mutate(shapes = c(rep("Incr.1", 11), rep("Decr.1", 2), rep("Incr.2", 10))) %>% ## adding the attribute for shapes
pivot_longer(cols = c("max", "letzte"), names_to = "colrs") %>% ## tidying data (a format that is ggplot-friendly)
mutate(combined = paste(shapes, colrs)) ## and combining the columns so that I can use one column for both shape and colour
ggplot(plot_df, aes(x = RH, y = value, shape = combined, colour = combined))+
geom_point() +
scale_color_manual("", values = c("red", "blue", "red", "blue", "red", "blue"))+
scale_shape_manual("", values = c(19, 3, 6, 19, 3, 6))+
labs(title = "Relative Wasseraufnahme Isopren SOA #10 (RH)", title.position="center")+
ylab("Norm. Optical Density [-]")+
xlab("RH [%]")+
coord_cartesian(xlim = c(0, 100))+
scale_x_continuous(breaks=c(seq(0,120, 4)))+
theme(axis.text = element_text(size = 15),
axis.title = element_text(size=15),
plot.title = element_text(hjust = 0.5),
legend.title = element_text(size=0),
legend.text= element_text(size=15),
legend.background = element_rect(),
legend.position = c(0.095,0.9),
title = element_text(size=20))
See Hadley Wickham's book on "[tidy data][1]"

Not able to print forestplot in high resolution format in R

I need to create a forestplot of high resolution. I used the forestplot() function from library(forestplot) to create my plot, and then attempted to use the tiff() function to create a high resolution image for publication. However, my image turned blank.
It works if I export directly from R but not as high resolution as it was supposed to.
library(forestplot)
df <- structure(list(
mean = c(NA, 0.22, 0.20, 0.27),
lower = c(NA, 0.05, 0.04, 0.01),
upper = c(NA, 0.95, 1.08, 9.12)),
.Names = c("mean", "lower", "upper"),
row.names = c(NA, -4L),
class = "data.frame")
tabletext <- cbind(
c("", "Pooled", "Group 1", "Group 2"),
c("N", "4334", "3354", "980"),
c("HR (95% CI)", "0.22 (0.05, 0.95)", "0.20 (0.04, 1.08)", "0.27 (0.01, 9.12)"),
c("p-value", "0.042", "0.061", "0.467")
)
ggfp <- forestplot(tabletext,
df,
new_page = TRUE,
is.summary = c(TRUE, rep(FALSE, 3)),
clip = c(0, 2),
colgap = unit(5, "mm"),
line.margin = unit(2, "mm"),
lineheight = unit(1, "in"),
txt_gp = fpTxtGp(label = gpar(cex = 1),
ticks = gpar(cex = 1)),
align = c("l", "c", "c", "c"),
boxsize = 0.2,
xticks = seq(0, 2.0, 0.5),
zero = 1,
col = fpColors(box = "royalblue",
line = "darkblue"),
mar = unit(c(-1, 0.5, -2, 0.5), "in"))
tiff("forestplot.tiff", units = "in", width = 9, height = 7, res = 300)
ggfp
dev.off()
The file was created but it was a blank page
This works for me (output file is 17MB):
library(forestplot)
setwd("/path/to/directory/for/plot")
df <- structure(list(
mean = c(NA, 0.22, 0.20, 0.27),
lower = c(NA, 0.05, 0.04, 0.01),
upper = c(NA, 0.95, 1.08, 9.12)),
.Names = c("mean", "lower", "upper"),
row.names = c(NA, -4L),
class = "data.frame")
tabletext <- cbind(
c("", "Pooled", "Group 1", "Group 2"),
c("N", "4334", "3354", "980"),
c("HR (95% CI)", "0.22 (0.05, 0.95)", "0.20 (0.04, 1.08)", "0.27 (0.01, 9.12)"),
c("p-value", "0.042", "0.061", "0.467")
)
tiff("forestplot.tiff", units = "in", width = 9, height = 7, res = 300)
forestplot(tabletext,
df,
new_page = TRUE,
is.summary = c(TRUE, rep(FALSE, 3)),
clip = c(0, 2),
colgap = unit(5, "mm"),
line.margin = unit(2, "mm"),
lineheight = unit(1, "in"),
txt_gp = fpTxtGp(label = gpar(cex = 1),
ticks = gpar(cex = 1)),
align = c("l", "c", "c", "c"),
boxsize = 0.2,
xticks = seq(0, 2.0, 0.5),
zero = 1,
col = fpColors(box = "royalblue",
line = "darkblue"),
mar = unit(c(-1, 0.5, -2, 0.5), "in"))
dev.off()

ggplot - legend as y-axis label

I have the following graph
Is it possible to add the legend labels (HPD and Quantile) under the respective boxplots? Also can i get rid of the white bar in the middle?
My code isthe following:
p <- ggplot(Results.Baseline,aes(x=Inference, y=Results, fill=Method)) +
scale_y_continuous(limits = c(0, 1))+
geom_boxplot()+facet_wrap(~Method)+ facet_wrap(~Model)+
geom_hline(yintercept=0.95, linetype="dashed", color = "red")
I basically want something like this just under all boxplots:
Here is my data:
data <- structure(list(Results = c(0.234375, 0.203125, 0.234375, 0.203125,
0.21875, 0.203125), Model = c("Baseline 1", "Baseline 1", "Baseline 1",
"Baseline 1", "Baseline 1", "Baseline 1"), Method = c("Quantile",
"Quantile", "Quantile", "Quantile", "Quantile", "Quantile"),
Inference = c("HMDM", "HMDM", "HMDM", "HMDM", "HMDM", "HMDM"
)), .Names = c("Results", "Model", "Method", "Inference"), row.names = c("1:nrow(transitions)",
"V2", "V3", "V4", "V5", "V6"), class = "data.frame")
I added more data so that I can replicate your graph better. You can
use geom_text to add the Method labels to the graph. You have to
only keep one label per box plot which is why I created the datalabs
dataframe. Also you did not need two facet_wraps in your plot. Does
this help answer your question?
data <- structure(list(Results = c(0.234375, 0.203125, 0.234375, 0.203125,
0.21875, 0.203125), Model = c("Baseline 1", "Baseline 1", "Baseline
1",
"Baseline 1", "Baseline 1", "Baseline 1"), Method = c("Quantile",
"Quantile", "Quantile", "Quantile", "Quantile", "Quantile"),
Inference = c("HMDM", "HMDM", "HMDM", "HMDM", "HMDM", "HMDM"
)), .Names = c("Results", "Model", "Method", "Inference"), row.names = c("1:nrow(transitions)",
"V2", "V3", "V4", "V5", "V6"), class = "data.frame")
data2 <- structure(list(Results = c(0.234375, 0.203125, 0.234375, 0.203125,
0.21875, 0.203125), Model = c("Baseline 2", "Baseline 2", "Baseline 2",
"Baseline 2", "Baseline 2", "Baseline 2"), Method = c("HPD",
"HPD", "HPD", "HPD", "HPD", "HPD"),
Inference = c("Eco. Inf.", "Eco. Inf.", "Eco. Inf.", "Eco. Inf.",
"Eco. Inf.", "Eco. Inf."
)), .Names = c("Results", "Model", "Method", "Inference"), row.names = c("1:nrow(transitions)",
"V2", "V3", "V4", "V5", "V6"), class = "data.frame")
data3 <- rbind(data,data2)
data4 <- mutate(data3, Method = ifelse(Method == "Quantile",
"HPD","Quantile"),
Inference = ifelse(Inference == "HMDM","Eco. Inf.",
"HMDM"))
data5 <- rbind(data3,data4)
datalabs <- data5 %>%
group_by(Method,Model) %>%
arrange(Method,Model) %>%
filter(row_number()==1)
ggplot(data5,aes(x=Inference, y=Results, fill=Method)) +
scale_y_continuous(limits = c(0, 1))+
geom_boxplot()+
facet_wrap(~Model)+
geom_hline(yintercept=0.95, linetype="dashed", color = "red")+
geom_text(data = datalabs, aes(label=Method) ,
nudge_y = -.1)+
theme_bw() +
theme(panel.grid = element_blank()) +
theme(panel.spacing = unit(0, "lines"),
strip.background = element_blank(),
panel.border = element_rect(fill = NA, color="white"))

Resources