Colour a Q-Q plot comparing two distributions by quantiles in R

I am trying to construct a Q-Q plot comparing two distributions, with the points above the 99th percentile coloured as in the following example:
However, I am not sure how to achieve this. Here is a subset of my data:
dfw <- structure(list(Date.Time = structure(c(848502000, 848509200,
848512800, 848520000, 848523600, 848530800, 848534400, 848541600,
848545200, 848552400, 848556000, 848563200, 848566800, 848574000,
848577600, 848588400, 848595600, 848599200, 848606400, 848610000,
848617200, 848620800, 848628000, 848631600, 848638800, 848642400,
848649600, 848653200, 848660400, 848664000, 848674800, 848682000,
848685600, 848692800, 848696400, 848703600, 848707200, 848714400,
848718000, 848725200, 848728800, 848736000, 848739600, 848746800,
848750400, 848761200, 848768400, 848772000, 848779200, 848782800,
848790000, 848793600, 848800800, 848804400, 848811600, 848815200,
848822400, 848826000, 848833200, 848847600, 848854800, 848858400,
848865600, 848869200, 848876400, 848880000, 848887200, 848890800,
848898000, 848901600, 848908800, 848912400, 848919600, 848923200,
848934000, 848941200, 848944800, 848952000, 848955600, 848962800,
848966400, 848973600, 848977200, 848984400, 848988000, 848995200,
848998800, 849006000, 849009600, 853682400, 853686000, 853714800,
853718400, 853725600, 853729200, 853736400, 853750800, 853758000,
853761600, 853768800), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
Hs.Mod = c(1.5960001, 1.5600001, 1.5480001, 1.552, 1.552,
1.534, 1.5180001, 1.462, 1.4260001, 1.3740001, 1.36, 1.3340001,
1.3080001, 1.2720001, 1.256, 1.218, 1.212, 1.21, 1.2060001,
1.2160001, 1.248, 1.25, 1.25, 1.264, 1.3560001, 1.394, 1.3700001,
1.332, 1.2900001, 1.2800001, 1.268, 1.2800001, 1.2800001,
1.3240001, 1.3240001, 1.286, 1.2540001, 1.19, 1.172, 1.1700001,
1.1860001, 1.2340001, 1.274, 1.3640001, 1.4120001, 1.58,
1.6580001, 1.6660001, 1.682, 1.748, 1.9280001, 1.9800001,
2.026, 2.052, 2.214, 2.328, 2.4320002, 2.39, 2.2180002, 1.9080001,
1.792, 1.7400001, 1.7140001, 1.7140001, 1.692, 1.6680001,
1.608, 1.58, 1.536, 1.524, 1.5640001, 1.5760001, 1.6100001,
1.6240001, 1.6120001, 1.5920001, 1.58, 1.542, 1.5200001,
1.48, 1.4640001, 1.4260001, 1.406, 1.386, 1.3820001, 1.34,
1.312, 1.268, 1.248, 1.8080001, 1.7960001, 1.644, 1.6420001,
1.6600001, 1.6880001, 1.7820001, 2.138, 2.2740002, 2.2940001,
2.252), Hs.Obs = c(1.741, 1.524, 1.618, 1.658, 1.697, 1.822,
1.792, 1.463, 1.433, 1.376, 1.208, 1.299, 1.255, 1.304, 1.328,
1.182, 1.282, 1.293, 1.228, 1.281, 1.45, 1.356, 1.501, 1.5,
1.356, 1.477, 1.408, 1.544, 1.497, 1.768, 2.04, 2.074, 2.042,
2.147, 2.224, 2.022, 2.017, 2.047, 2.353, 2.597, 2.838, 2.67,
2.762, 2.687, 2.734, 2.738, 2.938, 2.795, 2.549, 2.669, 2.447,
2.676, 2.577, 2.383, 2.362, 2.284, 2.341, 2.33, 2.397, 2.498,
2.317, 2.373, 2.377, 2.362, 2.218, 2.226, 1.97, 2.087, 1.874,
2.116, 2.022, 1.886, 2.046, 1.879, 1.638, 1.677, 1.638, 1.647,
1.551, 1.596, 1.591, 1.384, 1.345, 1.522, 1.469, 1.503, 1.459,
1.327, 1.453, 2.448, 2.235, 2.104, 1.958, 2.118, 2.209, 2.034,
2.229, 2.505, 2.163, 2.372)), row.names = c(NA, 100L), class = "data.frame")
Code to make the Q-Q plot:
ggplot(data = dfw, aes(x = sort(Hs.Obs), y = sort(Hs.Mod))) +
  geom_point(shape = 1, size = 2) +
  xlab('Obs') + ylab('Model') +
  theme_bw() +
  geom_abline(linetype = 2)
Code attempting to colour the 99th percentile:
ggplot(data = dfw,
       aes(x = sort(Hs.Obs), y = sort(Hs.Mod),
           col = cut(Hs.Mod, quantile(Hs.Mod, probs = .99)))) +
  geom_point(shape = 1, size = 2) +
  xlab('Obs') + ylab('Model') +
  theme_bw() +
  geom_abline(linetype = 2)
Resulting plot:
I'm looking for some help to sort this out, as the attempts I have tried aren't working.
Thanks in advance!

Try this (it needs dplyr and ggplot2 loaded):
library(dplyr)
library(ggplot2)

dfw %>%
  mutate(qq = quantile(Hs.Mod, probs = 0.99),
         qq_gt99 = ifelse(qq <= Hs.Mod, 1, 0)) %>%
  ggplot(aes(x = Hs.Mod, y = Hs.Obs, col = as.factor(qq_gt99))) +
  geom_point()
If you order the observations first (so the axes show sorted quantiles, as in your plot):
dfw %>%
  mutate(mod_ordered = sort(Hs.Mod),
         obs_ordered = sort(Hs.Obs),
         qq = quantile(mod_ordered, probs = 0.99),
         qq_gt99 = ifelse(qq <= mod_ordered, 1, 0)) %>%
  ggplot(aes(x = mod_ordered, y = obs_ordered, col = as.factor(qq_gt99))) +
  geom_point()
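A compact variant of this approach (a sketch of my own, not part of the original answer): sort both series once, then flag everything at or above the 99th percentile of the modelled values, the same cut-off used in the question.
library(dplyr)
library(ggplot2)

# Sort both series, then label the top 1% of the modelled values
qq_df <- dfw %>%
  transmute(obs = sort(Hs.Obs),
            mod = sort(Hs.Mod)) %>%
  mutate(pct = ifelse(mod >= quantile(mod, 0.99),
                      "99th percentile", "1st-98th percentile"))

ggplot(qq_df, aes(obs, mod, colour = pct)) +
  geom_point(shape = 1, size = 2) +
  geom_abline(linetype = 2) +
  labs(x = "Obs", y = "Model", colour = NULL) +
  theme_bw()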

I'm using the dplyr library to filter() the data by quantile().
The code:
library(dplyr)
library(ggplot2)

ggplot() +
  geom_point(data = filter(dfw, Hs.Obs > quantile(dfw$Hs.Mod, .99)),
             aes(x = sort(Hs.Obs), y = sort(Hs.Mod), col = "Quantile 99%")) +
  geom_point(data = filter(dfw, Hs.Obs < quantile(dfw$Hs.Mod, .99)),
             aes(x = sort(Hs.Obs), y = sort(Hs.Mod), col = "Quantile 1-98%")) +
  geom_abline(linetype = 2) +
  xlab('Obs') + ylab('Model') +
  theme_bw()

Related

Change distance between ticks on axis with DateTime variable

I have made a plot with ggplot where the x-axis is a DateTime variable. I would like to increase the distance between the ticks on the x axis, essentially making the plot longer without changing the actual limits (the limits are two dates which appear correctly on the plot). For visibility, I need more distance between each point/tick on the x axis so that the plotted change is easier to follow.
plot2 <- ggplot(Data, aes(DateTime, group = 1)) +
geom_line(aes(y = Change, colour = "Change"))
I have tried using a function I found online that makes ggplot display all of the ticks over a specified interval. However, this did not expand the actual distance between the ticks; it just added more ticks. The function used is shown below.
breaks = function(x)
  seq.Date(from = min(x),
           to = max(x),
           by = "10 days")

minor_breaks = function(x)
  seq.Date(from = min(x),
           to = max(x),
           by = "1 day")

plot2 +
  scale_x_datetime(breaks = "15 days",
                   minor_breaks = "1 days")
Any ideas how I can expand the distance between ticks rather than simply adding more ticks or expanding the limits of the plot?
Here is the data:
structure(list(DateTime = structure(c(1670415374, 1670506704,
1670511629, 1670517043, 1670523367, 1670528144, 1670584731, 1670587219,
1670594506, 1670598044, 1670602687, 1670612571, 1670615016, 1670669321,
1670674548, 1670685813, 1670696903, 1670700710, 1670765570, 1670773501,
1670783869, 1670789931, 1670841137, 1670846470, 1670847141, 1670852707,
1670862620, 1670868387, 1670873869, 1670928407, 1670933500, 1670939207,
1670943626, 1670950125, 1670955461, 1670966455, 1671019197, 1671043928,
1671116424, 1671127891, 1671193896, 1671193896, 1671205681, 1671219303,
1671284825, 1671290071, 1671312145, 1671373499, 1671379886, 1671387493,
1671462033, 1671474322, 1671542373, 1671563513, 1671624638, 1671649144,
1671725712, 1671732040, 1671827734, 1671888249, 1671893712, 1671915669,
1671915669, 1671915669, 1671994322, 1671994322, 1672076098, 1672130014,
1672152035, 1672167136, 1672255279, 1672333309, 1672337893, 1672404995,
1672413431, 1672424101, 1672488057, 1672523537, 1672589460, 1672596553,
1672662188, 1672671457, 1672691857, 1672748172, 1672760508, 1672772744,
1672935802, 1672945396, 1673007435, 1673019876, 1673028304, 1673092863,
1673108795, 1673120023, 1673179769, 1673190069, 1673201163, 1673265620,
1673274863, 1673287229, 1673356126, 1673365774, 1673387010, 1673442864,
1673462481, 1673538703, 1673549051, 1673620368, 1673622111, 1673634853,
1673699438, 1673719261, 1673786770, 1673794936, 1673807902, 1673876509,
1673884159, 1673895512, 1673957880, 1673967545, 1673968363, 1673978875,
1674052099, 1674065378, 1674144571, 1674151393, 1674217307, 1674227431,
1674248909, 1674308920, 1674318010, 1674325431, 1674398675, 1674400192,
1674410504, 1674475288, 1674486319, 1674497762, 1674563116, 1674583639,
1674648195, 1674671450, 1674902477, 1674902477, 1674908963, 1674912648,
1674920521, 1674923572, 1674929410, 1674934232, 1674989081, 1674993673,
1675001115, 1675024579, 1675076219, 1675076219, 1675086360, 1675095923,
1675096891, 1675101964, 1675107017, 1675161022, 1675166442, 1675172546,
1675178387, 1675183108, 1675188015, 1675195910, 1675247464, 1675252883,
1675263629, 1675269012, 1675278293, 1675288565, 1675334193, 1675339936,
1675347483, 1675350039, 1675355417, 1675362687, 1675368937), class = c("POSIXct",
"POSIXt"), tzone = ""), Change = c(0.677, 0.522,
0.252, 0.759, 0.733, 0.331, 0.658, 0.661, 0.245, 0.5, 0.5, 0.679,
0.703, 0.5, 0.5, 0.391, 0.688, 0.702, 0.824, 0.718, 0.5, 0.778,
0.295, 0.263, 0.249, 0.297, 0.737, 0.76, 0.755, 0.704, 0.492,
0.333, 0.5, 0.774, 0.899, 0.5, 0.5, 0.822, 0.649, 0.684, 0.72,
0.72, 0.694, 0.813, 0.318, 0.257, 0.739, 0.74, 0.691, 0.786,
0.735, 0.5, 0.5, 0.834, 0.5, 0.452, 0.706, 0.735, 0.733, 0.74,
0.778, 0.887, 0.887, 0.887, 0.688, 0.688, 0.728, 0.726, 0.803,
0.922, 0.788, 0.764, 0.736, 0.724, 0.5, 0.763, 0.286, 0.831,
0.838, 0.743, 0.701, 0.5, 0.316, 0.5, 0.5, 0.774, 0.741, 0.732,
0.902, 0.5, 0.749, 0.592, 0.699, 0.72, 0.785, 0.702, 0.764, 0.74,
0.329, 0.181, 0.902, 0.5, 0.654, 0.79, 0.705, 0.729, 0.766, 0.26,
0.5, 0.728, 0.767, 0.673, 0.832, 0.804, 0.88, 0.274, 0.792, 0.732,
0.741, 0.834, 0.272, 0.745, 0.726, 0.76, 0.305, 0.83, 0.741,
0.284, 0.691, 0.771, 0.755, 0.669, 0.662, 0.696, 0.684, 0.721,
0.5, 0.775, 0.5, 0.66, 0.699, 0.711, 0.743, 0.743, 0.654, 0.717,
0.767, 0.714, 0.724, 0.683, 0.68, 0.771, 0.699, 0.733, 0.21,
0.21, 0.717, 0.739, 0.774, 0.823, 0.73, 0.5, 0.508, 0.5, 0.5,
0.5, 0.785, 0.469, 0.887, 0.732, 0.5, 0.702, 0.696, 0.862, 0.142,
0.815, 0.5, 0.5, 0.694, 0.775, 0.883)), row.names = c(NA, -181L
), class = c("tbl_df", "tbl", "data.frame"))
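The in-panel spacing between ticks is set by the physical size of the graphics device, not by scale_x_datetime(), so one way to get more space between ticks without changing the limits is to render or save the plot on a wider device. A minimal sketch (the file name and dimensions below are placeholders, not from the question):
library(ggplot2)

# Saving the plot wider stretches the x axis physically while keeping
# the same limits and the same breaks
ggsave("change_over_time.png", plot = plot2,
       width = 40, height = 12, units = "cm", dpi = 300)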

How to adjust the limit of primary and secondary y-axis?

I want to adjust the y-axis limits for both the primary and secondary axis, following the examples in How to limit primary y-axis and secondary y-axis? and ggplot with 2 y axes on each side and different scales.
I want the primary y-axis to run from 0-50 and the secondary y-axis from 0-500, and I don't want to show any values above those limits (even though some of the data exceed them).
I don't get any error with my code, but the limits I have set are not being applied to the graph.
Here is my example of data:
df2 <- structure(list(startdate = structure(c(17903, 17910, 17917, 17924,
17931, 17938, 17945, 17952, 17959, 17966, 17982, 17987, 18001,
18003, 18015, 18022, 18029, 18031, 18036, 18043, 18050, 18057,
18064, 18072, 18079, 18085, 18099, 18106, 18113, 18120, 18127,
18134, 18141, 18148, 18155, 18162, 18169, 18183, 18197, 18204,
18211, 18218, 18225, 18227, 18232, 18234, 18239, 18246, 18253,
18267, 18274, 18281, 18288, 18295, 18302, 18309, 18316, 18323,
18330, 18337, 18344, 18351, 18358, 18365, 18373, 18379, 18386,
18393, 18400, 18407, 18414, 18421, 18428, 18430, 18435, 18442,
18449, 18456, 18463, 18472, 18477, 18484, 18491, 18498, 18505,
18514, 18519, 18526, 18533, 18540, 18547, 18554, 18561, 18568,
18575, 18583, 18589, 18596, 18603, 18610, 18617, 18624), class = "Date"),
Al = c(24.744, 19.272, 15.245, 21.497, 26.086, 5.867, 23.722,
30.269, 25.666, 17.106, 53.07, NA, 226.995, 70.341, 108.865,
18.15, 445.203, 393.528, 11.151, 52.329, 37.737, 16.68, 124.039,
22.667, 19.125, 82.391, 87.85, 19.041, 77.098, 34.27, 10.912,
116.28, 42.9, 9.282, 35.504, NA, 133.95, 94.311, 124.97,
63.374, 99.062, 54.366, 38.925, 66.56, 19.525, 221.973, 140.54,
68.699, 117.965, 456, 13, 44.5, 46.6, 69.4, NA, 12.3, 6.81,
NA, 20.6, 19.6, NA, 24.2, 71.6, 566, 219, 158, 58.2, 217,
351, 13.6, 38.3, 91.5, 90.2, 23.8, 23.4, 21.4, 42.9, 13.8,
NA, 35.8, 24, 9.11, 32.6, 24.6, 286, NA, 28.9, 10, NA, 331,
101, 6.58, 83.9, 2230, 1100, NA, NA, 638, 622, 143, 96, 28.3
), Fe = c(9.627, 12.429, 10.115, 9.498, 14.555, 4.39, 12.201,
12.888, 12.318, 9.889, 19.607, 11.202, 51.294, 21.877, 43.531,
9.539, 131.812, 123.998, 7.991, 21.365, 18.732, 8.378, 42.805,
5.886, 10.994, 29.268, NA, 7.832, 15.377, 12.558, 4.829,
42.002, 16.464, 5.545, 17.778, NA, 67.634, 37.384, 49.764,
28.589, 37.174, 21.271, 16.639, 29.878, 11.689, 90.459, 36.085,
15.883, 34.31, 210, 7.55, 21.8, 23.4, 32.2, 8.5, 5.76, 4.83,
1.85, 10.9, 10.5, 2.16, 12.4, 34.1, 212, 106, 65.1, 26.9,
93.1, 163, 6.41, 15.4, 34.7, 36, 10.1, 14.7, 11, 23, 5.36,
1.72, 23.3, 20.2, 6.64, 20.1, 14.3, 129, NA, 13.9, 6.6, NA,
193, 42.1, 4.29, 37.7, 1260, 585, NA, NA, 288, 289, 64.6,
43, 14.1), Mn = c(0.184, 0.377, 0.334, 0.163, 0.416, 0.101,
0.351, 0.359, 0.302, 0.406, 0.393, 0.277, 2.624, 0.656, 0.822,
0.205, 2.401, 2.403, 0.161, 0.415, NA, 0.155, 1.416, 0.134,
0.212, NA, NA, 0.337, 0.898, 0.217, NA, 1.027, 0.264, NA,
0.284, NA, 1.176, 0.599, 0.808, 0.462, 0.826, 0.487, 0.293,
0.518, 0.242, 1.848, 1.083, 0.483, 0.732, 4.22, 0.227, 0.564,
0.446, 0.624, 0.178, 0.198, 0.25, 0.054, 0.245, 0.296, 0.071,
0.304, 0.739, 4.4, 1.62, 0.987, 0.405, 1.45, 3.04, 0.121,
0.447, 0.756, 0.559, 0.201, 0.3, 0.136, 0.431, 0.885, NA,
0.456, 0.366, 0.217, 0.257, 0.208, 3.59, NA, 0.208, 0.091,
NA, 4.91, 0.685, 0.076, 0.7, 22.3, 11.1, NA, NA, 4.21, 5.26,
1.08, 0.722, 0.269), Ti = c(1.032, 0.763, 0.795, 0.861, 1.263,
0.426, 1.168, 1.284, 1.257, 0.706, 1.566, 0.965, 3.978, 1.939,
4.109, 0.787, 11.025, 12.884, 0.691, 1.58, 1.541, 0.788,
4.588, 0.45, 0.873, 2.115, NA, 1.636, 1.195, 0.971, 0.37,
3.132, 1.351, 0.328, 1.222, NA, 4.251, 2.502, 3.157, 2.044,
2.627, 1.698, 1.34, 1.879, 0.77, 4.539, 2.46, 1.17, 2.2,
8.52, 0.492, 1.41, 1.65, 2.1, 0.652, 0.368, 0.277, NA, 0.762,
0.712, NA, 0.787, 2.2, 11.6, 5.29, 4.65, 2.16, 5.38, 13.7,
0.555, 1.41, 2.78, 3.15, 0.88, 0.955, 0.853, 1.65, 0.379,
NA, 1.05, 1.06, 0.574, 1.45, 1.02, 7.83, NA, 1.06, 0.501,
NA, 8.52, 2.96, 0.339, 2.68, 47.2, 23.2, NA, NA, 13.2, 12.9,
4.11, 2.14, 1.1)), row.names = c(NA, -102L), class = c("tbl_df",
"tbl", "data.frame"))
And here is the code:
library(ggplot2)
library(dplyr)
library(tidyr)
library(lubridate)
library(scales)
label_y1 = expression(bold(Mn,Ti~(ng/m^{3})))
label_y2 = expression(bold(Al,Fe~(ng/m^{3})))
#Determine certain date for shading
shade <- df2 %>% transmute(year = year(startdate)) %>% unique() %>%
mutate( from = as.Date(paste0(year, "-02-14")), to = as.Date(paste0(year, "-05-07")))
# Function factory for secondary-axis transforms
train_sec <- function(primary, secondary) {
  from <- range(secondary)
  to <- range(primary)
  # Forward transform for the data
  forward <- function(x) {
    rescale(x, from = from, to = to)
  }
  # Reverse transform for the secondary axis
  reverse <- function(x) {
    rescale(x, from = to, to = from)
  }
  list(fwd = forward, rev = reverse)
}

# Set the limits of both y-axes
sec <- train_sec(c(0, 50), c(0, 500))
#Plotting data
ggplot(df2) +
geom_line( aes(x=startdate, y=Mn, color='Mn')) +
geom_line( aes(x=startdate, y=Ti, color='Ti')) +
geom_line( aes(x=startdate, y= sec$fwd(Al), color = 'Al')) +
geom_line( aes(x=startdate, y= sec$fwd(Fe), color = 'Fe')) +
geom_rect(data = shade, aes(xmin = from, xmax = to, ymin = -Inf, ymax = Inf), fill = 'red',alpha=0.1) +
scale_y_continuous(name = label_y1,
                   sec.axis = sec_axis(~sec$rev(.), name = label_y2)) +
  ggtitle("a)") +
  theme_bw() +
  theme(legend.position = c(0.1, 0.9),
        legend.direction = "horizontal",
        axis.text.x = element_text(face = "bold", size = 10),
        axis.text.y = element_text(face = "bold", size = 10),
        axis.title = element_text(size = 10),
        plot.title = element_text(size = 10, face = "bold", hjust = 0.05, vjust = -12),
        legend.spacing.y = unit(0, "mm"),
        axis.text = element_text(colour = 1),
        legend.background = element_blank(),
        legend.box.background = element_blank(),
        legend.key = element_blank(),
        legend.justification = "left") +
  labs(color = NULL, fill = NULL, x = NULL) +
  guides(colour = guide_legend(override.aes = list(size = 1)))
I'm not really sure why the y-axis limits are not being applied. If anybody knows the reason and how to fix this, please let me know. I really appreciate it.
Thank you so much. Best regards.
I added limits directly into scale_y_continuous() and I think this works:
ggplot(df2) +
geom_line( aes(x=startdate, y=Mn, color='Mn')) +
geom_line( aes(x=startdate, y=Ti, color='Ti')) +
geom_line( aes(x=startdate, y= sec$fwd(Al), color = 'Al')) +
geom_line( aes(x=startdate, y= sec$fwd(Fe), color = 'Fe')) +
geom_rect(data = shade, aes(xmin = from, xmax = to, ymin = -Inf, ymax = Inf), fill = 'red',alpha=0.1) +
scale_y_continuous(
limits = c(0, 50),
name = label_y1, sec.axis = sec_axis(~sec$rev(.), name = label_y2)
)+
ggtitle ("a)") +
theme_bw() +
theme(
legend.position = c(0.1, 0.9),
legend.direction="horizontal",
axis.text.x = element_text(face="bold", size=10),
axis.text.y = element_text(face="bold", size=10),
axis.title = element_text(size = 10),
plot.title = element_text(size=10, face="bold", hjust=0.05,vjust = - 12),
legend.spacing.y = unit(0, "mm"),
axis.text = element_text(colour = 1),
legend.background = element_blank(),
legend.box.background = element_blank(),
legend.key = element_blank(),
legend.justification = "left"
) +
labs(color = NULL, fill = NULL, x=NULL) +
guides(colour = guide_legend(override.aes = list(size=1)))
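A design note, not part of the original answer: limits inside scale_y_continuous() drops out-of-range observations entirely (they become NA, so lines break at the cut-off), which matches the requirement of not showing values above the limit. If you only wanted to zoom without discarding data, coord_cartesian() would be the alternative, for example:
# Zoom the primary axis to 0-50 without removing the out-of-range
# points from the underlying data (they are merely clipped from view)
ggplot(df2) +
  geom_line(aes(x = startdate, y = Mn, color = 'Mn')) +
  geom_line(aes(x = startdate, y = Ti, color = 'Ti')) +
  coord_cartesian(ylim = c(0, 50))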

How to implement k-fold cross-validation while forcing linear regression of predicted to real values to 1:1 line

I'm trying to train y as a polynomial function of x so that when the predicted y values are linearly regressed against the real y values, the relationship lies on the 1:1 line (see the diagram: the image on the right uses geom_smooth(method="lm") for demonstration, but with SMA from the lmodel2() function the regression line is 1:1). I'm a bit of a stats amateur, so I'm aware there might be problems with this, but without forcing it the model tends to overestimate low values and underestimate high values. My question is: how do I introduce k-fold cross-validation using an existing package like caret or cvms? They seem to need a model object to be returned, and I can't figure out how to code my problem that way. Is there some way I can train the model by minimising my custom metric and still return a model object with ypred and use it in k-fold CV?
This is my code for calculating the coefficients without k-fold CV:
data <- data.frame(
x = c(1.514, 1.514, 1.825, 1.281, 1.118, 1.279, 1.835, 1.819, 0.462, 1.53, 1.004, 1.19, 1.275, 0.428, 0.313, 0.909, 0.995, 0.995, 0.706, 0.563, 0.827, 0.65, 0.747, 1.013, 1.013, 1.163, 1.091, 1.163, 1.091, 0.955, 0.955, 2.044, 2.044, 1.777, 1.777, 1.434, 1.393, 1.324, 0.981, 0.845, 1.595, 1.595, 1.517, 1.517, 1.403, 1.403, 0.793, 0.793, 1.016, 0.901, 0.847, 1.054, 0.877, 1.639, 1.639, 1.268, 1.268, 0.842, 0.842, 0.827, 0.777, 1.024, 1.238, 1.238, 1.702, 1.702, 0.673, 0.673, 1.256, 1.256, 0.898, 0.898, 0.66, 0.933, 0.827, 0.836, 1.122, 1.5, 1.5, 1.44, 1.44, 0.671, 0.671, 0.486, 0.486, 1.051, 1.051, 0.971, 0.538, 0.971, 0.538, 1.012, 1.012, 0.776, 0.776, 0.854, 0.854, 0.74, 0.989, 0.989),
y = c(0.19, 0.18, 0.816, 2.568, 0.885, 0.521, 0.268, 0.885, 4.781, 1.648, 0.989, 1.614, 1.492, 0.679, 2.256, 3.17, 1.926, 1.631, 0.462, 2.48, 0.658, 0.355, 0.373, 2.31, 3.263, 1.374, 1.374, 2.637, 2.637, 2.073, 2.298, 0.257, 0.292, 0.359, 0.329, 1.329, 1.272, 3.752, 1.784, 0.76, 0.458, 0.488, 0.387, 0.387, 3.401, 1.458, 8.945, 9.12, 0.308, 0.386, 0.405, 6.444, 3.17, 0.458, 0.47, 0.572, 0.589, 1.961, 1.909, 0.636, 0.32, 1.664, 0.756, 0.851, 0.403, 0.232, 23.112, 22.042, 0.745, 0.477, 2.349, 3.01, 0.39, 0.246, 0.43, 1.407, 1.358, 0.235, 0.215, 0.595, 0.685, 2.539, 2.128, 8.097, 5.372, 0.644, 0.626, 17.715, 17.715, 6.851, 6.851, 2.146, 1.842, 3.147, 2.95, 1.127, 1.019, 8.954, 0.796, 0.758),
stringsAsFactors = FALSE)
library(dplyr)  # needed for the %>% pipe below

optim_results <- optim(
  par = c(a0 = 0.3, a1 = -3.8, a2 = -1, a3 = 1, a4 = 1),
  fn = function(params, x, y) {
    params <- as.list(params)
    ypred <- with(params, a0 + (a1*x) + (a2*x^2) + (a3*x^3) + (a4*x^4))
    mod <- suppressMessages(lmodel2::lmodel2(ypred ~ y))$regression.results[3, ]
    line <- mod$Slope * y + mod$Intercept
    return(sum((y - line)^2))
  },
  x = log10(data$x),
  y = log10(data$y))
cf <- as.numeric(optim_results$par)
data <- data %>%
  dplyr::mutate(ypred = 10^(cf[1] + cf[2]*log10(x) + cf[3]*log10(x)^2 +
                              cf[4]*log10(x)^3 + cf[5]*log10(x)^4))
str(data)
Great question!
cvms::cross_validate_fn() allows you to cross-validate custom functions. You just have to wrap your code in a model function and a predict function as so:
EDIT: Added extraction of model parameters from the optim() output. optim() returns a list, which we convert to a class and then tell coef() how to extract the coefficients for that class.
library(dplyr)
library(groupdata2)
library(cvms)
# Set seed for reproducibility
set.seed(2)
data <- data.frame(
x = c(1.514, 1.514, 1.825, 1.281, 1.118, 1.279, 1.835, 1.819, 0.462, 1.53, 1.004, 1.19, 1.275, 0.428, 0.313, 0.909, 0.995, 0.995, 0.706, 0.563, 0.827, 0.65, 0.747, 1.013, 1.013, 1.163, 1.091, 1.163, 1.091, 0.955, 0.955, 2.044, 2.044, 1.777, 1.777, 1.434, 1.393, 1.324, 0.981, 0.845, 1.595, 1.595, 1.517, 1.517, 1.403, 1.403, 0.793, 0.793, 1.016, 0.901, 0.847, 1.054, 0.877, 1.639, 1.639, 1.268, 1.268, 0.842, 0.842, 0.827, 0.777, 1.024, 1.238, 1.238, 1.702, 1.702, 0.673, 0.673, 1.256, 1.256, 0.898, 0.898, 0.66, 0.933, 0.827, 0.836, 1.122, 1.5, 1.5, 1.44, 1.44, 0.671, 0.671, 0.486, 0.486, 1.051, 1.051, 0.971, 0.538, 0.971, 0.538, 1.012, 1.012, 0.776, 0.776, 0.854, 0.854, 0.74, 0.989, 0.989),
y = c(0.19, 0.18, 0.816, 2.568, 0.885, 0.521, 0.268, 0.885, 4.781, 1.648, 0.989, 1.614, 1.492, 0.679, 2.256, 3.17, 1.926, 1.631, 0.462, 2.48, 0.658, 0.355, 0.373, 2.31, 3.263, 1.374, 1.374, 2.637, 2.637, 2.073, 2.298, 0.257, 0.292, 0.359, 0.329, 1.329, 1.272, 3.752, 1.784, 0.76, 0.458, 0.488, 0.387, 0.387, 3.401, 1.458, 8.945, 9.12, 0.308, 0.386, 0.405, 6.444, 3.17, 0.458, 0.47, 0.572, 0.589, 1.961, 1.909, 0.636, 0.32, 1.664, 0.756, 0.851, 0.403, 0.232, 23.112, 22.042, 0.745, 0.477, 2.349, 3.01, 0.39, 0.246, 0.43, 1.407, 1.358, 0.235, 0.215, 0.595, 0.685, 2.539, 2.128, 8.097, 5.372, 0.644, 0.626, 17.715, 17.715, 6.851, 6.851, 2.146, 1.842, 3.147, 2.95, 1.127, 1.019, 8.954, 0.796, 0.758),
stringsAsFactors = FALSE)
# Fold data
# Will do 10-fold repeated cross-validation (10 reps)
data <- fold(
data = data,
k = 10, # Num folds
num_fold_cols = 10 # Num repetitions
)
# Write a model function from your code
# This ignores the formula and hyperparameters but
# you could pass values through those if you wanted
# to try different formulas or hyperparameter values
model_fn <- function(train_data, formula, hyperparameters){
out <- optim(par = c(a0 = 0.3, a1 = -3.8, a2 = -1, a3 = 1, a4 = 1),
fn = function (params, x, y) {
params <- as.list(params)
ypred <- with(params, (a0 + (a1*x) + (a2*x^2) + (a3*x^3) + (a4*x^4)))
mod <- suppressMessages(lmodel2::lmodel2(ypred ~ y))$regression.results[3,]
line <- mod$Slope * y + mod$Intercept
return(sum((y - line)^2))},
x = log10(train_data$x),
y = log10(train_data$y))
# Convert output to an S3 class
# so we can extract parameters with coef()
class(out) <- "OptimModel"
out
}
# Tell coef() how to extract the parameters
# This can be modified if you need more info from the optim() output
# Just return a named list
coef.OptimModel <- function(object) {
object$par
}
# Write a predict function from your code
predict_fn <- function(test_data, model, formula, hyperparameters, train_data){
cf <- as.numeric(model$par)
test_data %>%
dplyr::mutate(
ypred = 10^(cf[1] + cf[2]*log10(x) + cf[3]*log10(x)^2 + cf[4]*log10(x)^3 + cf[5]*log10(x)^4)
) %>%
.[["ypred"]]
}
# Cross-validate the model
cv <- cross_validate_fn(
data = data,
model_fn = model_fn,
predict_fn = predict_fn,
formulas = c("y ~ x"), # Not currently used by the model function
fold_cols = paste0('.folds_', seq_len(10)),
type = 'gaussian'
)
#> Will cross-validate 1 models. This requires fitting 100 model instances.
# Check output
cv
# A tibble: 1 × 17
Fixed RMSE MAE NRMSE(I…¹ RRSE RAE RMSLE Predic…² Results Coeffi…³ Folds
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list> <list> <int>
1 x 4.00 2.31 2.66 1.47 1.17 0.662 <tibble> <tibble> <tibble> 100
# … with 6 more variables: `Fold Columns` <int>, `Convergence Warnings` <int>,
# `Other Warnings` <int>, `Warnings and Messages` <list>, Process <list>,
# Dependent <chr>, and abbreviated variable names ¹​`NRMSE(IQR)`,
# ²​Predictions, ³​Coefficients
# ℹ Use `colnames()` to see all variable names
Created on 2022-10-15 with reprex v2.0.2
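A small follow-up, not part of the original answer: the nested tibbles in the output can be unpacked if you want to inspect fold-level predictions. This assumes the Predictions list column shown in the printed output above.
library(tidyr)

# Unnest the per-observation predictions pooled across all folds
preds <- cv %>%
  dplyr::select(Predictions) %>%
  tidyr::unnest(cols = Predictions)
head(preds)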

Find point of systematic decrease in R

I have the following data frame:
df <- structure(list(x = c(1059.6, 1061.4, 1063.4, 1064.9, 1066.3,
1068, 1069.8, 1071.4, 1072.9, 1074.4, 1075.9, 1077.5, 1079.1,
1080.5, 1082.1, 1083.8, 1085.1, 1086.7, 1088.1, 1089.5, 1091.6,
1093.1, 1094.5, 1095.8, 1097.1, 1098.4, 1099.8, 1101.1, 1102.5,
1103.9, 1105.3, 1106.6, 1108, 1109.4, 1110.8, 1112.2, 1113.7,
1115.2, 1116.5, 1117.9, 1119.1, 1120.4, 1121.8, 1123.1, 1124.8,
1126.2, 1127.4, 1128.8, 1130.2, 1131.8, 1133.3, 1134.6, 1138.5,
1141.2, 1142.4, 1143.6, 1144.8, 1146.8, 1148.2, 1149.6, 1150.9,
1152.2, 1153.4, 1154.7, 1155.9, 1157.1, 1158.3, 1159.5, 1161.9,
1163.4, 1164.7, 1166, 1167.2, 1169, 1170.3, 1171.5, 1172.8, 1173.9,
1175.1, 1176.8, 1178, 1179.2, 1180.3, 1181.6, 1182.8, 1184.1,
1185.8, 1187, 1188.2, 1189.4, 1190.5, 1191.8, 1193, 1194.3, 1195.5,
1205.8, 1206.9, 1208, 1209, 1210.2, 1211.3, 1212.4, 1213.6, 1214.7,
1217.2, 1218.6, 1222.3, 1223.6, 1224.7, 1225.9, 1227.1, 1228.2,
1229.3, 1230.4, 1231.6, 1232.7, 1233.6, 1234.6, 1235.7, 1236.9,
1238.4, 1239.5, 1240.6, 1241.6, 1242.7, 1243.7, 1244.8, 1245.9,
1247, 1248.1, 1249.2, 1250.3, 1251.3, 1252.6, 1253.7, 1254.8,
1255.8, 1256.8, 1257.8, 1258.8, 1261.4, 1262.5, 1263.5, 1264.5,
1265.6, 1266.6, 1267.8, 1268.8, 1270.1, 1271.1, 1272.1, 1273.2,
1274.1, 1275.2, 1276.3, 1279, 1280, 1281, 1282.1, 1283.1, 1284.1,
1285, 1286, 1287, 1288, 1289, 1290, 1291.1, 1292.3, 1293.3, 1294.4,
1298.6, 1299.6, 1300.5, 1301.5, 1302.5, 1303.5, 1304.6, 1305.5,
1306.4, 1307.6, 1308.6, 1309.7, 1310.7, 1311.7, 1312.7, 1315.2,
1316.3, 1317.3, 1318.3, 1319.3, 1320.3, 1321.3, 1322.3, 1323.2,
1326.8, 1327.8, 1329, 1330, 1331, 1332, 1333, 1333.9, 1335, 1336,
1337.3, 1338.3, 1339.3, 1340.5, 1341.6, 1342.7, 1343.8, 1344.9,
1345.9, 1346.8, 1347.8, 1348.8, 1350, 1351.1, 1352, 1353.3, 1354.3,
1355.3, 1356.2, 1357.1, 1358, 1359.2, 1360.2, 1364.4, 1365.5,
1366.6, 1367.6, 1368.7, 1369.8, 1371, 1372, 1373, 1374.1, 1375,
1376, 1376.9, 1377.8, 1378.7, 1379.6, 1380.5, 1381.4, 1382.3,
1383.3, 1384.2, 1385.2, 1387.6, 1388.5, 1389.5, 1390.4, 1391.4,
1392.5, 1393.6, 1394.6, 1395.6, 1397, 1397.9, 1398.8, 1399.8,
1400.6, 1401.6, 1402.5, 1403.4, 1404.2, 1405.1, 1407.4, 1408.3,
1409.2, 1410.1, 1411.2, 1412.2, 1413.2, 1414.2, 1415.6, 1416.7,
1417.8, 1418.9, 1420.2, 1421.5, 1424.6, 1425.7, 1427, 1428.1,
1429.3, 1430.7, 1431.9, 1433.1, 1434.5, 1435.7, 1436.8, 1438,
1439.4, 1440.6, 1441.9, 1443, 1444.4, 1445.6, 1447.3, 1448.5,
1449.7, 1450.9, 1452.1, 1453.2, 1454.5, 1455.6, 1456.8, 1458.1,
1459.3, 1460.3, 1461.4, 1462.4, 1463.9, 1465.1, 1466.3, 1469.8,
1471.1, 1472.6, 1473.8, 1475, 1476.2, 1477.5, 1479.1, 1480.7,
1482, 1483.2, 1484.9, 1486.2, 1487.5, 1488.8, 1490, 1491.3, 1492.4,
1503, 1504.3, 1506.3, 1507.5, 1508.8, 1510.2, 1511.4, 1512.5,
1513.8, 1515.6, 1517.1, 1520.1, 1523.9, 1526.5, 1527.9, 1529.8,
1531.2, 1532.4, 1533.7, 1536, 1537.4, 1538.8, 1540.2, 1541.5,
1542.9, 1544.2, 1545.6, 1546.9, 1548.3, 1549.7, 1551.1, 1552.7,
1554.1, 1556.4, 1557.8, 1559.2, 1560.6, 1562, 1563.4, 1564.7,
1566.2, 1567.5, 1568.9, 1570.2, 1571.4, 1573.9, 1576.7, 1581.5,
1582.8, 1584.7, 1586.2, 1587.7, 1589.3, 1591, 1592.8, 1594.7,
1596.4, 1598.5, 1600.6, 1602.4, 1604.6, 1606.9, 1609, 1611, 1612.6,
1614.4, 1616.3, 1618.6, 1620.6, 1622.4, 1624.5, 1627.2, 1629.3,
1631.4, 1635, 1636.9, 1638.6, 1640.5, 1642.1, 1643.7, 1645.5,
1647.1, 1648.7, 1650.9, 1653, 1655.2, 1657.1, 1659.1, 1661.5,
1663.6, 1665.9, 1668.1, 1671.7, 1674, 1676.2, 1678.1, 1679.7,
1681.6, 1683.6, 1685.7, 1688, 1693.7, 1695.7, 1697.6, 1699.7,
1701.7, 1704.1), y = c(1.876, 2.027, 2.087, 2.231, 2.18, 1.922,
1.921, 1.851, 1.961, 2.035, 2.043, 2.043, 1.838, 2.032, 2.112,
1.976, 2.046, 2.117, 2.062, 2.07, 1.748, 1.917, 2.092, 2.283,
2.158, 2.119, 2.023, 1.971, 1.882, 2.058, 2.141, 2.241, 2.079,
1.946, 1.959, 2.117, 1.923, 2.015, 2.066, 1.98, 2.091, 1.929,
1.987, 1.852, 1.935, 2.127, 1.982, 2.182, 2.099, 2.03, 1.912,
1.998, 2.491, 2.359, 2.188, 1.965, 1.906, 1.772, 1.927, 2.077,
2.381, 2.191, 2.089, 2.086, 2.017, 2.028, 1.832, 1.88, 2.053,
2.177, 1.995, 2.045, 2.116, 1.961, 1.99, 2.227, 2.235, 2.208,
2.249, 1.992, 2.045, 2.152, 2.237, 2.239, 2.247, 2.114, 1.956,
2.042, 1.926, 2.396, 2.184, 2.208, 2.016, 2.177, 2.29, 2.469,
2.502, 2.115, 2.081, 2.091, 2.188, 2.118, 2.179, 2.067, 1.962,
2.181, 2.246, 2.526, 2.145, 1.961, 2.299, 2.306, 2.34, 2.133,
1.974, 1.997, 2.47, 2.24, 2.247, 2.137, 1.965, 2.232, 2.225,
2.417, 2.362, 2.155, 2.034, 2.151, 2.176, 2.183, 2.372, 2.145,
2.284, 1.967, 2.299, 2.299, 2.183, 2.292, 2.193, 2.249, 2.32,
2.333, 2.286, 2.216, 2.233, 2.453, 2.373, 2.284, 2.074, 2.014,
2.153, 2.353, 2.465, 2.373, 2.181, 2.424, 2.334, 2.349, 2.39,
2.513, 2.526, 2.268, 2.098, 2.326, 2.385, 2.306, 2.378, 2.126,
2.191, 2.363, 2.222, 2.723, 2.686, 2.4, 2.251, 2.121, 2.104,
2.16, 2.333, 2.151, 2.116, 2.136, 2.293, 2.281, 2.313, 2.374,
2.585, 2.521, 2.656, 2.66, 2.399, 2.442, 2.413, 2.528, 2.212,
2.58, 2.667, 2.153, 2.736, 2.486, 2.406, 2.39, 2.403, 2.504,
2.502, 2.158, 2.617, 2.434, 2.364, 2.497, 2.456, 2.263, 2.432,
2.562, 2.453, 2.249, 2.18, 2.141, 2.324, 2.176, 2.184, 2.153,
2.332, 2.202, 2.332, 2.125, 2.156, 2.189, 2.71, 2.458, 2.502,
2.285, 2.527, 2.437, 2.418, 2.507, 2.087, 2.321, 2.701, 2.486,
2.389, 2.335, 2.26, 2.108, 2.164, 2.286, 2.103, 2.257, 2.137,
2.076, 2.378, 2.637, 2.446, 2.448, 2.539, 2.253, 2.099, 2.59,
2.405, 2.219, 2.542, 2.532, 2.507, 2.439, 2.463, 2.342, 2.329,
2.436, 2.511, 2.557, 2.603, 2.5, 2.428, 2.204, 2.307, 2.174,
2.193, 1.793, 2.116, 2.107, 2.209, 1.967, 1.834, 2.713, 2.647,
2.379, 2.229, 2.11, 1.964, 1.985, 2.162, 1.996, 2.074, 1.994,
1.839, 1.838, 1.743, 1.668, 1.91, 1.735, 1.714, 1.421, 1.767,
1.816, 1.755, 1.755, 1.698, 1.608, 1.556, 1.511, 1.394, 1.425,
1.579, 1.495, 1.627, 1.305, 1.471, 1.469, 1.67, 1.697, 1.42,
1.483, 1.274, 1.341, 1.235, 1.295, 1.401, 1.463, 1.313, 1.176,
1.333, 1.373, 1.299, 1.086, 1.139, 1.237, 1.303, 1.143, 1.13,
1.114, 1.096, 1.248, 1.302, 1.19, 1.069, 1.1, 1.027, 0.897, 1.09,
0.922, 1.116, 0.963, 1.011, 1.053, 1.025, 0.985, 0.981, 1.025,
1.117, 1.141, 1.135, 1.068, 0.982, 1.028, 1.06, 1.004, 1.112,
1.108, 1.04, 0.857, 0.91, 0.98, 1.081, 1.025, 0.996, 0.931, 1,
1.074, 0.987, 0.996, 1.125, 0.9, 0.607, 1.17, 1.08, 1, 0.909,
0.841, 0.924, 0.818, 0.846, 0.732, 1.006, 0.717, 0.594, 0.786,
0.685, 0.619, 0.684, 0.69, 0.633, 0.564, 0.689, 0.555, 0.445,
0.696, 0.677, 0.729, 0.541, 0.362, 0.312, 0.568, 0.711, 0.515,
0.622, 0.583, 0.631, 0.645, 0.696, 0.535, 0.424, 0.469, 0.519,
0.511, 0.485, 0.436, 0.412, 0.351, 0.556, 0.255, 0.519, 0.399,
0.497, 0.477, 0.564, 0.462, 0.433, 0.616, 0.547, 0.42, 0.499,
0.415, 0.368)), row.names = c(NA, -443L), class = c("tbl_df",
"tbl", "data.frame"), .Names = c("x", "y"))
Plot:
And I need to find the point at which y starts to systematically decrease.
I know that the real point is x == 1405. However, is there a way to detect it automatically?
I am not expecting to find the exact x point. A really good approximation would do the job.
I already tried to perform a breakpoint analysis with the segmented package, but without much success. The best number I could get was x == 1363, and I am looking for a closer approximation.
Here's how to get a fitted smooth of the data using loess. When you say "starts to systematically decrease," I think you mean something like "when the slope becomes more negative than some threshold," since visually the data peak and start to decline around the 1350s. I could manually push the peak later by smoothing more than the default, using span = 0.4.
library(broom)
fit <- loess(y ~ x, df, span = 0.4)
df_aug <- augment(fit)
Using that model, the peak looks to be around the 1370's.
library(dplyr); library(ggplot2)
df_aug %>% filter(.fitted == max(.fitted))
# # A tibble: 1 x 5
# y x .fitted .se.fit .resid
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2.09 1373 2.39 0.0181 -0.307
I presume you could get a better result if you can more definitively describe what model should be used to define "systematically decrease."
You might alternatively extract the slope and acceleration from the loess curve, but it's not clear that would get you much closer to your expected result:
# Extract slope & acceleration of the fitted curve
df_aug_slope <- df_aug %>%
  mutate(slope = (.fitted - lag(.fitted)) / (x - lag(x)),
         curve = (slope - lag(slope)) / (x - lag(x)))
ggplot(df_aug_slope, aes(x)) +
geom_point(aes(y=y)) +
geom_line(aes(y=.fitted), color ="red") +
geom_line(aes(y= slope * 100), color = "blue") +
geom_line(aes(y= curve * 1000), color = "green") +
geom_vline(xintercept = 1405, lty = "dashed") +
theme_minimal()
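Building on df_aug_slope above, one way to operationalise "systematically decreasing" is to report the first x from which the loess slope stays negative for a run of consecutive points. This is a sketch of my own, and the run length of 20 is an arbitrary threshold, not something from the question:
run_len <- 20  # how many consecutive negative slopes count as "systematic" (arbitrary)

neg <- df_aug_slope$slope < 0
neg[is.na(neg)] <- FALSE  # the first slope is NA because of lag()

# rle() collapses the logical vector into runs; find the first TRUE run
# that is at least run_len long and return the x where it starts
runs   <- rle(neg)
ends   <- cumsum(runs$lengths)
starts <- ends - runs$lengths + 1
idx    <- which(runs$values & runs$lengths >= run_len)[1]

if (!is.na(idx)) df_aug_slope$x[starts[idx]]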

Remove margin inside plot of ggplot2

This is my script and the associated plot:
library(ggplot2)
library(reshape)
df <- structure(list(ID = structure(1:19, .Label = c("2818/22/0834",
"2818/22/0851", "2818/22/0853", "2818/22/0886", "B0F", "B12T",
"B1T", "B21T", "B22F", "B26T", "B33F", "B4F", "P1", "P21", "P24",
"P25", "P27", "P28", "P29"), class = "factor"), K = c(0.089,
0.094, 0.096, 0.274, 0.09, 0.312, 0.33, 0.178, 0.05, 0.154, 0.083,
0.098, 0.035, 0.084, 0.053, 0.061, 0.043, 0.094, 0.101), Na = c(2.606,
3.822, 4.977, 2.522, 15.835, 83.108, 52.041, 41.448, 11.849,
40.531, 5.854, 10.151, 3.52, 8.445, 5.273, 7.246, 6.177, 14.813,
15.569), Cl = c(3.546, 6.181, 8.422, 3.733, 14.685, 96.911, 65.518,
79.01, 10.349, 53.361, 6.12, 10.832, 2.313, 10.312, 5.641, 8.708,
6.138, 12.302, 20.078), Mg = c(1.487, 1.773, 1.992, 1.143, 2.991,
1.678, 2.23, 3.288, 1.148, 2.428, 3.428, 2.729, 0.777, 2.554,
2.374, 4.075, 1.993, 1.881, 3.034), Ca = c(5.529, 6.205, 6.59,
4.099, 10.631, 4.564, 6.652, 13.374, 4.332, 10.542, 11.194, 10.053,
2.969, 7.73, 8.163, 11.539, 6.166, 5.968, 9.299), SO4 = c(0.663,
0.831, 0.607, 0.882, 9.013, 0.896, 0.652, 0.021, 1.446, 0.012,
8.832, 6.665, 1.003, 2.575, 3.685, 7.121, 3.64, 5.648, 2.397),
HCO3 = c(7.522, 5.498, 6.15, 5.242, 8.582, 4.067, 5.65, 9.364,
5.435, 8.068, 9.054, 8.326, 4.805, 7.235, 7.488, 9.234, 6.352,
6.98, 8.34)), .Names = c("ID", "K", "Na", "Cl", "Mg", "Ca",
"SO4", "HCO3"), class = "data.frame", row.names = c(NA, -19L))
df_melted<-melt(df, na.rm=T)
ggplot(df_melted, aes(variable, value, group=ID, color=ID))+
geom_point(size=2) +
geom_line() +
theme(legend.position="none") +
scale_y_log10(breaks=seq(0, 100, 10))
Is there a way to remove the spaces at the beginning and at the end of the plot? I tried xlim, but the problem is that the x variable is not numerical, so something like xlim(c("K", "HCO3")) doesn't work.
This is a discrete scale, but you can still use the expand argument, as follows. Whether the output looks acceptable is another matter. Play with the c(0,0) values until you find something that suits; using 0.1 for the second value gives a slightly better plot, in my view.
ggplot(df_melted, aes(variable, value, group=ID, color=ID))+
geom_point(size=2) +
geom_line() +
theme(legend.position="none") +
scale_y_log10(breaks=seq(0, 100, 10)) +
scale_x_discrete(expand = c(0,0))
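As an aside (my addition, assuming ggplot2 3.3.0 or later), the expansion() helper makes the padding on each side explicit; for example, no padding on the left and a small amount on the right:
ggplot(df_melted, aes(variable, value, group = ID, color = ID)) +
  geom_point(size = 2) +
  geom_line() +
  theme(legend.position = "none") +
  scale_y_log10(breaks = seq(0, 100, 10)) +
  scale_x_discrete(expand = expansion(mult = c(0, 0.05)))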
