mgcv: plotting factor 'by' smooths - r

I want to plot spline effect of a parameter called "NO2" on birthweight, but I want 4 graphs for four quartiles. My current code gives only one graph, could you please help me to figure out the problem? You can see the code at the end, model_1_F1_spline is adjusted for different parameters, but my question is about F1_quartile. When I adjust NO2 by F1_quartile, it includes results for four quartiles, but I don't know how to extract those results and draw 4 graphs.
Here is a reproducible example:
structure(list(coefficients = structure(c(2779.15322482481, 11.6029323631846,
-109.637722127332, -70.5777182211836, -33.2026137282293, 1.34507275289371,
-104.16616170941, -84.3138020433217, 17.079775791272, 49.2699120523702,
65.7993773354024, 73.9523088264003, 62.1308005103464, 11.8305504033343,
17.2509811135892, 34.167485824927, 37.5379409075558, 39.4891005510156,
2.08045456267659, 95.0617726758795, 159.185162814325, 216.767405256274,
30.4053773772453, 67.9509936017346, 75.9715680793893, 76.0634702947319,
197.304475883704, 346.536371507916, 452.520999581153, 582.904282791219,
646.972345369266, -13.117918823958, -21.2577276011179, -36.4775602045112,
-2.53495678184362, 4.25561833400684, -4.24061504987865, 1.22183358211853,
-17.6781972182122, -13.9465039223737, -24.9221422877004, -26.5305128528655,
2.72740931108257, 17.3508955652218, -4.33132009995294, -11.4103790176564,
48.1115836583216, -23.8853869176324, -11.9906695483978, 0.159117077270929,
3.1823388043623, -30.2233558177321, 22.9158634128136, 1.86241593993877,
-7.46279510854093, -17.7265172939209, 15.6908002520418, 10.7367940888643,
11.9368630460758, 48.0464522543244, -10.5383667390476, 8.84142833076189,
38.6344171322845, -4.18823289724547, 20.9039579936433, -27.1572322476693,
-23.3055121479652, -10.125234127069, -2.3505578660444, -5.59801575548779,
21.0487614265911, -0.113655733751338, 1.4592300415459, -0.395003023852113,
-1.33572259818002, -0.195697887437374, -1.22245366980104, 0.161927450428184,
-8.83284987935688, -11.7655241486702, 10.0814083754381, 4.95053998927621,
0.0512729497898481, -2.47612645668306, -0.324705343736638, -2.73702305143146,
0.367899109531455, -17.8006136959884, -20.7138572162521, 1.66439599003613,
0.991339450831016, -0.094477049206764, -0.333359963322134, -0.0535341357101135,
-0.166135609567417, 0.0263694684353763, -0.790300658406237, -7.88088655871398,
2.30124665956728, 0.526763779856579, -0.729268724581621, -1.64502812073609,
0.245438533444878, -1.68875200672467, 0.471404077584143, -12.0519624220913,
-8.61178665100117), .Names = c("(Intercept)", "M_ethni_cat3FB White",
"M_ethni_cat3USB Black", "M_ethni_cat3FB Black", "M_ethni_cat3USB Hispanic",
"M_ethni_cat3FB Hispanic", "M_ethni_cat3USB Asian", "M_ethni_cat3FB Asian",
"M_Age_Cat1", "M_Age_Cat2", "M_Age_Cat3", "M_Age_Cat4", "M_Age_Cat5",
"M_EDU_Cat1", "M_EDU_Cat2", "M_EDU_Cat3", "M_EDU_Cat4", "M_EDU_Cat5",
"MEDICAID1", "prepregBMI_4cat1", "prepregBMI_4cat2", "prepregBMI_4cat3",
"PNC_RECEIVED1", "Parity_Cat1", "Parity_Cat2", "Parity_Cat3",
"gest_clin38", "gest_clin39", "gest_clin40", "gest_clin41", "gest_clin42",
"concept_year2008", "concept_year2009", "concept_year2010", "conc_season_num2",
"conc_season_num3", "conc_season_num4", "s(UHF34).1", "s(UHF34).2",
"s(UHF34).3", "s(UHF34).4", "s(UHF34).5", "s(UHF34).6", "s(UHF34).7",
"s(UHF34).8", "s(UHF34).9", "s(UHF34).10", "s(UHF34).11", "s(UHF34).12",
"s(UHF34).13", "s(UHF34).14", "s(UHF34).15", "s(UHF34).16", "s(UHF34).17",
"s(UHF34).18", "s(UHF34).19", "s(UHF34).20", "s(UHF34).21", "s(UHF34).22",
"s(UHF34).23", "s(UHF34).24", "s(UHF34).25", "s(UHF34).26", "s(UHF34).27",
"s(UHF34).28", "s(UHF34).29", "s(UHF34).30", "s(UHF34).31", "s(UHF34).32",
"s(UHF34).33", "s(UHF34).34", "s(NO2300_mean_total):F1_quartile1.1",
"s(NO2300_mean_total):F1_quartile1.2", "s(NO2300_mean_total):F1_quartile1.3",
"s(NO2300_mean_total):F1_quartile1.4", "s(NO2300_mean_total):F1_quartile1.5",
"s(NO2300_mean_total):F1_quartile1.6", "s(NO2300_mean_total):F1_quartile1.7",
"s(NO2300_mean_total):F1_quartile1.8", "s(NO2300_mean_total):F1_quartile1.9",
"s(NO2300_mean_total):F1_quartile2.1", "s(NO2300_mean_total):F1_quartile2.2",
"s(NO2300_mean_total):F1_quartile2.3", "s(NO2300_mean_total):F1_quartile2.4",
"s(NO2300_mean_total):F1_quartile2.5", "s(NO2300_mean_total):F1_quartile2.6",
"s(NO2300_mean_total):F1_quartile2.7", "s(NO2300_mean_total):F1_quartile2.8",
"s(NO2300_mean_total):F1_quartile2.9", "s(NO2300_mean_total):F1_quartile3.1",
"s(NO2300_mean_total):F1_quartile3.2", "s(NO2300_mean_total):F1_quartile3.3",
"s(NO2300_mean_total):F1_quartile3.4", "s(NO2300_mean_total):F1_quartile3.5",
"s(NO2300_mean_total):F1_quartile3.6", "s(NO2300_mean_total):F1_quartile3.7",
"s(NO2300_mean_total):F1_quartile3.8", "s(NO2300_mean_total):F1_quartile3.9",
"s(NO2300_mean_total):F1_quartile4.1", "s(NO2300_mean_total):F1_quartile4.2",
"s(NO2300_mean_total):F1_quartile4.3", "s(NO2300_mean_total):F1_quartile4.4",
"s(NO2300_mean_total):F1_quartile4.5", "s(NO2300_mean_total):F1_quartile4.6",
"s(NO2300_mean_total):F1_quartile4.7", "s(NO2300_mean_total):F1_quartile4.8",
"s(NO2300_mean_total):F1_quartile4.9"))), .Names = "coefficients")
Here is how I do:
model_1_F1_spline <- gam(BWGT~ s(UHF34,bs="re") + s(NO2300_mean_total, by=F1_quartile)+M_ethni_cat3 + M_Age_Cat + M_EDU_Cat + MEDICAID +
prepregBMI_4cat + PNC_RECEIVED + Parity_Cat + gest_clin + concept_year + conc_season_num, data=births_stressors, method="REML")
png(filename="plot_factor1_spline.png")
plot(model_1_F1_spline, ylab="Change in birth weight (g)", xlab="NO2")
dev.off()

From your provide coefficient vector of your fitted GAM, I could infer that F1_quartile is a factor by variable, with levels 1, 2, 3, 4, so that you have smooth functions s(NO2300_mean_total):F1_quartile1, s(NO2300_mean_total):F1_quartile2, s(NO2300_mean_total):F1_quartile3 and s(NO2300_mean_total):F1_quartile4.
In this situation, calling predict.gam should return you 5 plots, one being a Q-Q plot of your 34-level random intercept s(UHF34, bs = 're'), and 4 plots for the by smooths.
Your question is mainly regarding the by smooths, so consider the following minimal reproducible example.
dat <- data.frame(y = rnorm(40), x = runif(40), f = gl(4, 10))
library(mgcv)
fit <- gam(y ~ f + s(x, k = 5, by = f))
Note that you need to put by as a covariate, too, as factor by smooth is subject to centering constraint (if unclear of this, skip it).
Now if you call plot.gam(fit, page = 1), you will see 4 plots: a smooth s(x) for each level of f.
Note that plot.gam can invisibly return data generating the plots. If you do
oo <- plot.gam(fit, page = 1)
you will see that oo is a list of 4. For each element, say oo[[1]], $x and $fit gives respectively the x-coordinate and y-coordinate of the plot, while se gives standard error. $xlab gives variable name, $ylab gives smooth function name. These data are sufficient for you to reconstruct the plots by plot.gam.

Related

Plot Lines instead of points in scatterplot

I want to replace the points in my graph with a line like in the first picture, the second picture is what I have.
but its not quite what im looking for, I want a smooth line without the points
I think I have to use predict for the 1/x curve but I am not sure how,
Assuming f(1/x) fits the data well. One can use the lm() function to fix the desired function y= a/x + b and then use the predict() function to estimate the desired points.
If a more complicated nonlinear function is required to fit the data then the nls() maybe required
x<- c(176.01685819061, 21.6704613594849, 19.007554742708, 50.1865574864131, 17.6174002411188, 40.2758022496774, 11.0963214407251, 1249.94375253114, 694.894678288085, 339.786950220117, 42.1452961176151, 220.352895161601, 19.6303352674776, 9.10350287678884, 10.6222946396451, 44.1984352318898, 21.8069112975004, 42.1237630342764, 22.7551891190248, 12.9587850506626, 12.0207189111152, 20.2704921282476, 13.3441156357956, 9.13092569988769, 1781.08346869568, 71.2690023512206, 80.2376892286713, 344.114362037227, 208.830841645638, 91.1778810401913, 2220.0120768657, 41.4820962277111, 16.5730025748281, 32.30173229022, 108.703930214512, 51.6770035143256, 709.071405759588, 87.9618878732223, 10.4198968123037, 34.4951840238729, 57.8603720445067, 72.3289197551429, 30.2366643066749, 23.8696161364716, 270.014690419247, 13.8170113452005, 39.5159584479013, 27.764841260433, 18.0311836472615, 40.5709477295999, 33.1888820958952, 9.03112843931787, 4.63738971549635, 12.7591169313099, 4.7998894219979, 8.93458248803248, 7.33904760386628, 12.0940344070925, 7.17364602165948, 6.514191844409, 9.69911157978057, 6.57874454980745, 7.90556524435596)
y<- c(0.02840637, 0.230728821, 0.2630533, 0.099628272, 0.28381032, 0.12414402, 0.45059978, 0.00400018, 0.00719533500000001, 0.014715103086687, 0.118637201789886, 0.022690875, 0.254707825, 0.54923913, 0.470708088, 0.113126176837872, 0.22928510745, 0.118697847481752, 0.219730100850697, 0.38583864, 0.4159485, 0.24666396693114, 0.374696992776912, 0.547589605297248, 0.00280728, 0.070156727820596, 0.062314855376136, 0.01453005323695, 0.02394282358199, 0.0548378613646, 0.00225224, 0.120533928, 0.301695482, 0.15479046, 0.045996497, 0.096754836, 0.00705147600000001, 0.0568428, 0.47985120103071, 0.14494777, 0.08641493, 0.069128642, 0.165362156, 0.20947132, 0.018517511, 0.36187275779699, 0.126531158458224, 0.180083867690804, 0.277297380904852, 0.1232408972382, 0.15065285976048, 0.55364067, 1.07819275643191, 0.39187665, 1.04169066418176, 0.55962324, 0.68128731, 0.41342697, 0.69699564, 0.76755492, 0.515511133042674, 0.760023430328564, 0.632465844687028)
#data frame for prediction
df <- data.frame(x=sort(x))
# fit model y= a/x + b
model <-lm( y ~ I(1/x))
#summary(model)
#plot model
plot(df$x, predict(model, df), type="l", col="blue")
#optional
points(x, y)
Update - response to comments
x is sorted in the data frame, so that points are plotted in order. If not the line could go from x=1 to x=100, back to x=10 etc. thus making a mess. Try removing the sort and see what happens.
The I(1/x) term is to signal lm to perform the inverse transform on x first and then perform the least squares regression.
The predict() function is on the axis since that is the variable used in the plot function. To change this just assign the output from the predict function to a better variable name and plot that. Or use the "ylab= " option.
For smoothing, you can fit a linear model as foolws:
m <- lm(AM_cost_resorb~I(1/AM_leafP), data=data)
Then extract the predictied values on a new data set that covers the range of the exposure variable.
newx <- seq(min(data$AM_leafP), max(data$AM_leafP), by=0.01)
pr <- predict(m, newdata=data.frame(AM_leafP=newx))
And visualize:
plot(AM_cost_resorb~AM_leafP, data=data, type="p", pch= 15, col="red",ylab="Cost of reabsorbtion (kg C m^-2 yr^-1)", xlab="leaf P before senescence (g P/m2)", ylim=c(0,500), las=1)
lines(newx, y=pr, col="blue", lwd=2)
Data:
data <- structure(list(AM_cost_resorb = c(176.01685819061, 21.6704613594849,
19.007554742708, 50.1865574864131, 17.6174002411188, 40.2758022496774,
11.0963214407251, 1249.94375253114, 694.894678288085, 339.786950220117,
42.1452961176151, 220.352895161601, 19.6303352674776, 9.10350287678884,
10.6222946396451, 44.1984352318898, 21.8069112975004, 42.1237630342764,
22.7551891190248, 12.9587850506626, 12.0207189111152, 20.2704921282476,
13.3441156357956, 9.13092569988769, 1781.08346869568, 71.2690023512206,
80.2376892286713, 344.114362037227, 208.830841645638, 91.1778810401913,
2220.0120768657, 41.4820962277111, 16.5730025748281, 32.30173229022,
108.703930214512, 51.6770035143256, 709.071405759588, 87.9618878732223,
10.4198968123037, 34.4951840238729, 57.8603720445067, 72.3289197551429,
30.2366643066749, 23.8696161364716, 270.014690419247, 13.8170113452005,
39.5159584479013, 27.764841260433, 18.0311836472615, 40.5709477295999,
33.1888820958952, 9.03112843931787, 4.63738971549635, 12.7591169313099,
4.7998894219979, 8.93458248803248, 7.33904760386628, 12.0940344070925,
7.17364602165948, 6.514191844409, 9.69911157978057, 6.57874454980745,
7.90556524435596), AM_leafP = c(0.02840637, 0.230728821, 0.2630533,
0.099628272, 0.28381032, 0.12414402, 0.45059978, 0.00400018,
0.00719533500000001, 0.014715103086687, 0.118637201789886, 0.022690875,
0.254707825, 0.54923913, 0.470708088, 0.113126176837872, 0.22928510745,
0.118697847481752, 0.219730100850697, 0.38583864, 0.4159485,
0.24666396693114, 0.374696992776912, 0.547589605297248, 0.00280728,
0.070156727820596, 0.062314855376136, 0.01453005323695, 0.02394282358199,
0.0548378613646, 0.00225224, 0.120533928, 0.301695482, 0.15479046,
0.045996497, 0.096754836, 0.00705147600000001, 0.0568428, 0.47985120103071,
0.14494777, 0.08641493, 0.069128642, 0.165362156, 0.20947132,
0.018517511, 0.36187275779699, 0.126531158458224, 0.180083867690804,
0.277297380904852, 0.1232408972382, 0.15065285976048, 0.55364067,
1.07819275643191, 0.39187665, 1.04169066418176, 0.55962324, 0.68128731,
0.41342697, 0.69699564, 0.76755492, 0.515511133042674, 0.760023430328564,
0.632465844687028)), class = "data.frame", row.names = c(NA,
-63L))

Logistic Regression in Sigmoid Data R

Overview
Hello I am working on a project with displaying a "best fit line" over raw data. I have very little statistical experience, so I am unsure what methodologies & functions to pursue. I am also unsure what the general output should be.
I am working with sigmoidal data, which can be noisy at times. I was informed that I will end up using logistical regression over linear regression.
Goal
-Plot the approximated logistic regression over the raw data using ggplot.
Sample dput() Data
structure(list(Temperature = c(0.35937, 0.3623, 0.88796, 1.38134,
1.89773, 2.40185, 2.90063, 3.40432, 3.92358, 4.40969, 4.91506,
5.42822, 5.93337, 6.43823, 6.95019, 7.46044, 7.95995, 8.45434,
8.98095, 9.48974, 10.00073, 10.5122, 11.00073, 11.51513, 12.03613,
12.54614, 13.04028, 13.5476, 14.04397, 14.58032, 15.07253, 15.58715,
16.09963, 16.60449, 17.11501, 17.60693, 18.12231, 18.63134, 19.14575,
19.63745, 20.16479, 20.65478, 21.15478, 21.64843, 22.15872, 22.65649,
23.1575, 23.67309, 24.17651, 24.67065, 25.19387, 25.69558, 26.19238,
26.7019, 27.20193, 27.70242, 28.19778, 28.70629, 29.19799, 29.69409,
30.20312, 30.70898, 31.21337, 31.71975, 32.21874, 32.7351, 33.22045,
33.74001, 34.24926, 34.73901, 35.26269, 35.75146, 36.26806, 36.76562,
37.28637, 37.77514, 38.29202, 38.78686, 39.2954, 39.80761, 40.31689,
40.81985, 41.31371, 41.8225, 42.3291, 42.85546, 43.3562, 43.87304,
44.37011, 44.88256, 45.38891, 45.89919, 46.40942, 46.92089, 47.42651,
47.94579, 48.479, 48.96218, 49.47411, 49.9851, 50.49438, 51.02368,
51.52905, 52.04907, 52.55493, 53.05493, 53.57543, 54.07836, 54.59548,
55.12451, 55.6206, 56.12866, 56.64379, 57.14745, 57.65945, 58.17553,
58.68432, 59.18408, 59.70019, 60.22167, 60.71703, 61.24246, 61.77538,
62.26391, 62.77612, 63.29614, 63.77807, 64.30053, 64.81689, 65.33279,
65.85131, 66.35229, 66.86694, 67.3933, 67.91723, 68.41577, 68.9436,
69.44677, 69.95141, 70.46655, 71.01635, 71.49514, 72.00906, 72.51269,
73.03542, 73.5498, 74.07055, 74.5747, 75.1018, 75.63061, 76.15283,
76.67504, 77.17822, 77.68456, 78.19848, 78.69775, 79.2124, 79.70727,
80.22656, 80.76611, 81.26049, 81.78369, 82.29101, 82.81469, 83.33544,
83.87496, 84.32372, 84.85815, 85.45971, 85.89111, 86.3623, 86.93578
), Absorbance = c(1.81071, 1.81388, 1.81683, 1.81888, 1.82262,
1.82458, 1.82688, 1.82958, 1.83234, 1.83512, 1.83743, 1.84024,
1.84237, 1.8451, 1.84772, 1.85036, 1.85254, 1.85495, 1.85805,
1.86069, 1.86304, 1.86508, 1.86808, 1.87077, 1.87352, 1.87564,
1.87863, 1.88164, 1.88402, 1.88598, 1.88886, 1.89159, 1.89392,
1.8968, 1.8995, 1.90179, 1.90508, 1.90725, 1.9098, 1.91265, 1.91516,
1.9173, 1.92062, 1.92298, 1.92563, 1.92855, 1.9307, 1.93383,
1.93642, 1.93903, 1.94168, 1.94381, 1.9462, 1.94994, 1.95289,
1.95581, 1.95902, 1.96158, 1.96398, 1.96661, 1.96978, 1.97321,
1.97583, 1.97916, 1.98271, 1.98456, 1.98892, 1.99297, 1.99605,
1.99921, 2.0035, 2.00686, 2.01138, 2.01495, 2.0189, 2.02396,
2.0282, 2.03317, 2.03781, 2.04254, 2.0479, 2.05363, 2.05974,
2.06564, 2.07107, 2.07914, 2.08561, 2.09258, 2.1002, 2.10902,
2.11876, 2.12582, 2.13495, 2.14506, 2.15465, 2.16517, 2.17522,
2.18627, 2.19739, 2.20907, 2.22094, 2.23388, 2.24563, 2.25891,
2.27144, 2.28452, 2.29779, 2.31205, 2.32543, 2.33695, 2.3501,
2.36332, 2.37649, 2.39207, 2.40574, 2.42009, 2.43282, 2.44392,
2.45723, 2.46878, 2.47973, 2.49073, 2.49976, 2.51041, 2.51965,
2.52679, 2.53644, 2.54241, 2.54962, 2.55618, 2.56106, 2.56637,
2.57346, 2.57632, 2.58174, 2.58477, 2.58925, 2.5937, 2.59516,
2.59829, 2.60149, 2.60401, 2.6065, 2.61033, 2.6111, 2.61375,
2.61648, 2.61617, 2.62002, 2.62089, 2.62385, 2.62798, 2.62696,
2.63116, 2.63123, 2.63459, 2.63557, 2.64139, 2.64367, 2.64472,
2.64471, 2.65139, 2.64948, 2.6567, 2.65765, 2.65911, 2.65614,
2.66194, 2.66976, 2.66926, 2.67418, 2.6769)), class = "data.frame", row.names = c(NA,
-172L))
Sample Data
library(ggplot2)
df = "insert dput() code"
#plot sigmoidal curve
ggplot(df, aes(x = Temperature, y = Absorbance, color = "red")) +
geom_point() +
theme_classic()
If there are any R methods or statistical functions that I can implement, feel free to drop suggestions!
Unfortunately it doesn't look as though a logistic model fits your data very well (a logistic flattens out as x → ± infinity, while your curve looks linear at the extremes). We can do a little better though ...
Fit with self-starting four-parameter logistic (SSfpl(), built-in)
fit <- nls(Absorbance ~ SSfpl(Temperature, left, right, midpt, scale),
data = df)
(at this point I drew the picture as below with just this fit and saw that it was inadequate ...)
Refit with a new model, which is the SSfpl model plus a linear term:
use the previous starting values + (slope = zero)
fit2 <- nls(Absorbance ~ left+(right-left)/(1+exp((midpt-Temperature)/scale))
+ (Temperature-midpt)*slope,
start = c(as.list(coef(fit)), slope = 0),
data = df)
Set up a data frames with predictions:
pred <- data.frame(Temperature = df$Temperature,
Absorbance = predict(fit),
Absorbance2 = predict(fit2))
Draw the picture:
ggplot(df, aes(x = Temperature, y = Absorbance)) +
geom_point(color = "red") +
geom_line(data=pred, lwd = 2) +
geom_line(data=pred, aes(y=Absorbance2), colour = "blue") +
theme_classic()
The extended fit (blue) is very good for temperatures below 30 (linear), then slightly off for the rest of the range (worst near 60).

Fit a dataset that presents an elbow/knee bent using in nlsLM and "force" the coefficients to be near a threshold

Due to the necessity of fitting a dataset that is related to a two dimensional diffusion process D2 process with a sestak berggren model (derived from logistic model) I needed to understand how to use the nlsLM
when in presence of a elbow/knee because the following "easy way did not work"
x=c(1.000000e-05, 1.070144e-05, 1.208082e-05, 1.456624e-05, 1.861581e-05, 2.490437e-05, 3.407681e-05, 4.696710e-05,
6.474653e-05, 8.870800e-05, 1.206194e-04, 1.624442e-04, 2.172716e-04, 2.882747e-04, 3.794489e-04, 4.956619e-04,
6.427156e-04, 8.275095e-04, 1.058201e-03, 1.344372e-03, 1.697222e-03, 2.129762e-03, 2.657035e-03, 3.296215e-03,
4.067301e-03, 4.992831e-03, 6.098367e-03, 7.412836e-03, 8.968747e-03, 1.080251e-02, 1.295471e-02, 1.547045e-02,
1.839960e-02, 2.179713e-02, 2.572334e-02, 3.024414e-02, 3.543131e-02, 4.136262e-02, 4.812205e-02, 5.579985e-02,
6.449256e-02, 7.430297e-02, 8.533991e-02, 9.771803e-02, 1.115573e-01, 1.269824e-01, 1.441219e-01, 1.631074e-01,
1.840718e-01, 2.071477e-01, 2.324656e-01, 2.601509e-01, 2.903210e-01, 3.230812e-01, 3.585200e-01, 3.967033e-01,
4.376671e-01, 4.814084e-01, 5.278744e-01, 5.769469e-01, 6.284244e-01, 6.819947e-01, 7.371982e-01, 7.933704e-01,
8.495444e-01, 9.042616e-01)
ynorm=c(
1.000000e+00, 8.350558e-01, 6.531870e-01, 4.910995e-01, 3.581158e-01, 2.553070e-01, 1.814526e-01, 1.290639e-01,
9.219591e-02, 6.623776e-02, 4.817180e-02, 3.543117e-02, 2.624901e-02, 1.961542e-02, 1.478284e-02, 1.123060e-02,
8.597996e-03, 6.631400e-03, 5.151026e-03, 4.028428e-03, 3.171096e-03, 2.511600e-03, 2.001394e-03, 1.604211e-03,
1.292900e-03, 1.047529e-03, 8.530624e-04, 6.981015e-04, 5.739778e-04, 4.740553e-04, 3.932255e-04, 3.275345e-04,
2.739059e-04, 2.299339e-04, 1.937278e-04, 1.637946e-04, 1.389500e-04, 1.182504e-04, 1.009406e-04, 8.641380e-05,
7.418032e-05, 6.384353e-05, 5.508090e-05, 4.762920e-05, 4.127282e-05, 3.583451e-05, 3.116813e-05, 2.715264e-05,
2.368759e-05, 2.068935e-05, 1.808802e-05, 1.582499e-05, 1.385102e-05, 1.212452e-05, 1.061032e-05, 9.278534e-06,
8.103650e-06, 7.063789e-06, 6.140038e-06, 5.315870e-06, 4.576585e-06, 3.908678e-06, 3.298963e-06, 2.732866e-06,
2.189810e-06, 1.614149e-06)
dfxy <- data.frame(x[1:length(ynorm)],ynorm)
fn=funSel <-"co*((1-x)^m)*(x^n)"
mod_fit <- nlsLM(ynorm~eval(parse(text=fn)),start=c(co=0.5,m=-1,n=0.5),data=dfxy)
plot(dfxy$x,dfxy$y,xlim=c(0,0.001))
plot(dfxy$x,(fitted(mod_fit))[1:length(dfxy$x)],xlim=c(0,0.001))
The only solution I've found is based on https://stackoverflow.com/a/54286595/6483091. So first finding the "elbow" and then applying the regression only to the reduced dataset. Everything in this way works but I was wondering if there can be other solutions (tweaking the parameter of the regression instead of making it in two steps, in some way let nlsLM "recognize" the curve using Dynamic First Derivate Threshold, but still forcing the fn for regression)
Also the "biggest problem is that I alredy know the "range" for the parameters" (i.e.
Applying a regression using "good" starting point (coefficients near the "ground truth" ynorm <- 0.973*(1-x)^(0.425)*x^(-1.008) ) but even if I give them as a starting point there is no way I obtain anything with similar values.
the "ground truth"
plot(x,yrnom) yt <- 0.973*(1-x)^(0.425)*x^(-1.008)
lines(x,yt/max(yt))
Here is a solution using nls and a hyperbolic fit:
x=c(1.000000e-05, 1.070144e-05, 1.208082e-05, 1.456624e-05, 1.861581e-05, 2.490437e-05, 3.407681e-05, 4.696710e-05,
6.474653e-05, 8.870800e-05, 1.206194e-04, 1.624442e-04, 2.172716e-04, 2.882747e-04, 3.794489e-04, 4.956619e-04,
6.427156e-04, 8.275095e-04, 1.058201e-03, 1.344372e-03, 1.697222e-03, 2.129762e-03, 2.657035e-03, 3.296215e-03,
4.067301e-03, 4.992831e-03, 6.098367e-03, 7.412836e-03, 8.968747e-03, 1.080251e-02, 1.295471e-02, 1.547045e-02,
1.839960e-02, 2.179713e-02, 2.572334e-02, 3.024414e-02, 3.543131e-02, 4.136262e-02, 4.812205e-02, 5.579985e-02,
6.449256e-02, 7.430297e-02, 8.533991e-02, 9.771803e-02, 1.115573e-01, 1.269824e-01, 1.441219e-01, 1.631074e-01,
1.840718e-01, 2.071477e-01, 2.324656e-01, 2.601509e-01, 2.903210e-01, 3.230812e-01, 3.585200e-01, 3.967033e-01,
4.376671e-01, 4.814084e-01, 5.278744e-01, 5.769469e-01, 6.284244e-01, 6.819947e-01, 7.371982e-01, 7.933704e-01,
8.495444e-01, 9.042616e-01)
ynorm=c(
1.000000e+00, 8.350558e-01, 6.531870e-01, 4.910995e-01, 3.581158e-01, 2.553070e-01, 1.814526e-01, 1.290639e-01,
9.219591e-02, 6.623776e-02, 4.817180e-02, 3.543117e-02, 2.624901e-02, 1.961542e-02, 1.478284e-02, 1.123060e-02,
8.597996e-03, 6.631400e-03, 5.151026e-03, 4.028428e-03, 3.171096e-03, 2.511600e-03, 2.001394e-03, 1.604211e-03,
1.292900e-03, 1.047529e-03, 8.530624e-04, 6.981015e-04, 5.739778e-04, 4.740553e-04, 3.932255e-04, 3.275345e-04,
2.739059e-04, 2.299339e-04, 1.937278e-04, 1.637946e-04, 1.389500e-04, 1.182504e-04, 1.009406e-04, 8.641380e-05,
7.418032e-05, 6.384353e-05, 5.508090e-05, 4.762920e-05, 4.127282e-05, 3.583451e-05, 3.116813e-05, 2.715264e-05,
2.368759e-05, 2.068935e-05, 1.808802e-05, 1.582499e-05, 1.385102e-05, 1.212452e-05, 1.061032e-05, 9.278534e-06,
8.103650e-06, 7.063789e-06, 6.140038e-06, 5.315870e-06, 4.576585e-06, 3.908678e-06, 3.298963e-06, 2.732866e-06,
2.189810e-06, 1.614149e-06)
dfxy <- data.frame(x[1:length(ynorm)],ynorm)
plot(ynorm ~ x.1.length.ynorm.., data = dfxy)
mod <- nls(ynorm ~ a/x.1.length.ynorm.. + b, data = dfxy, start = list(a = 1, b = 0))
lines(x = dfxy$x.1.length.ynorm.., y = predict(mod, newdata = dfxy$x.1.length.ynorm..))
The fit isn't perfect, though. I guess there is no continuous function to fit a right angle...
Depending on what you want to use the regression for, you could also use a loess regression:
dfxy <- data.frame(x[1:length(ynorm)],ynorm)
names(dfxy) <- c("x", "y")
plot(y ~ x, data = dfxy)
mod <- loess(y ~ x, data = dfxy, span = 0.1)
lines(x = dfxy$x, y = predict(mod, newdata = dfxy$x), col = "red")
Resulting in:

floating.pie error while using nodelables from ape package

I get an error while using the ARD model of the ace function in R. The error is
Error in floating.pie.asp(XX[i], YY[i], pie[i, ], radius = xrad[i], col = piecol) :
floating.pie: x values must be non-negative
library(ape)
library(phylobase)
tree <- read.nexus("data1.nexus")
plot(tree)
data <- read.csv("phagy_species.csv")
clade.full <- extract.clade(tree, node=91)
plot(clade.full)
clade.1 <- drop.tip(clade.full, "Bar_bre")
clade.2<- drop.tip(clade.1, "Par_pho")
clade.3<- drop.tip(clade.2, "Par_iph")
clade.4<- drop.tip(clade.3, "Eur_ser")
clade.5<- drop.tip(clade.4, "Opo_sym")
clade.6<- drop.tip(clade.5, "Mor_pel")
clade.7<- drop.tip(clade.6, "Aph_hyp")
clade.8<- drop.tip(clade.7, "Ere_oem")
clade.9<- drop.tip(clade.8, "Cal_bud")
clade.10<- drop.tip(clade.9, "Lim_red")
clade.11<- drop.tip(clade.10, "Act_str")
clade.12<- drop.tip(clade.11, "Hel_hec")
clade.13<- drop.tip(clade.12,"Col_dir")
clade.14<- drop.tip(clade.13, "Hyp_pau")
clade.15<- drop.tip(clade.14, "Nym_pol")
clade.16<- drop.tip(clade.15, "Mel_cin")
clade.17<- drop.tip(clade.16,"Apa_iri")
clade.18<- drop.tip(clade.17, "Bib_hyp")
clade.19<- drop.tip(clade.18, "Mar_ors")
clade.20<- drop.tip(clade.19, "Apo_cra")
clade.21<- drop.tip(clade.20, "Pse_par")
clade.22 <- drop.tip(clade.21, "Lep_sin")
clade.23<- drop.tip(clade.22, "Dis_spi")
plot(clade.23)
data2 <- as.numeric(data[,2])
model2 <- ace(data2, clade.23, type="discrete", method="ML", model="ARD")
summary(model2)
d <-logLik(model2)
deviance(model2)
AIC(model2)
plot(clade.23, type="phylogram", cex=0.8, font=3, label.offset = 0.004)
co <- c("red", "blue", "green", "black")
nodelabels(pie = model2$lik.anc, piecol = co, cex = 0.5)
And that is when I get the error. There is no error if I use the original tree without trimming. But, when i trim them to my requirements, it goes in the negative.
Here is the data
tree file
data file
The matrix you are using for the proportions of the pie has complex numbers in it. To see this, try:
class(model2$lik.anc[1,1])
The rows of that matrix define the proportions of the pies, and they need to sum to 1. Your code produces a plot with pies if I replace the pie matrix in the nodelabels function like this:
nodelabels(pie = matrix(0.25, 64, 4), piecol = co, cex = 0.5)
because now there is a legitimate matrix for the pie argument with rows that sum to 1.
As for why you have complex numbers in that matrix, I am not sure. It is probably related to all the warnings produced by the ace in your example. But that is a completely different issue.
I had the same problem with my data. I put my data into the matrix (like Slow Ioris suggested) and then unlisted the matrix.
x <- matrix(data=c(model2$lik.anc[,1],model2$lik.anc[,2],model2$lik.anc[,3],model2$lik.anc[,4]))
plotTree(tree,ftype="i",label.offset = 0.02)
nodelabels(pie = unlist(x))
For other people having the same problem also after purging imaginable parts of their data: The nodelabels function gives the same error when you provide a data.frame instead of a matrix to pie.

Fitting polynomial results in multiple straight lines on plot in R

I'm trying to plot a polynomial line to my data, however the plot results in multiple diagonal lines instead of one single curved line.
I've managed to correctly produce a polynomial using a fake dataset, but when I use this subset of my data (below) and attached code, I get multiple straight lines through the data. I have also tried fit2 = lm(y ~ I(x^2) + x) as a variant with no luck.
Any help is greatly appreciated.
x<-c(102.397, 101.863, 101.22, 101.426, 100.718, 100.665, 100.616,
100.844, 100.567, 100.864, 100.779, 101.002, 101.465, 102.291,
101.711, 101.208, 101.252, 100.781, 100.631, 100.87, 100.552,
100.762, 100.62, 101.044, 100.956, 101.3, 102.065, 101.581, 101.136,
101.122, 100.773, 100.55, 100.897, 100.747, 100.738, 100.585,
100.697, 100.487, 100.726, 100.706, 100.809, 101.208, 101.752,
101.498, 101.153, 101.035, 101.076, 100.544, 100.779, 100.792,
100.601, 100.454, 100.682, 100.687, 100.49, 100.552, 100.886,
100.936, 101.288, 101.284, 101.115, 101.026, 101.08, 100.777,
100.637, 100.846, 100.782, 100.492, 100.72, 100.73, 100.598,
100.261, 100.366, 100.563, 100.841)
y<- c(14.32613169, 13.72501806, 21.95599022, 16.48122392, 31.82829181,
49.09958776, 34.80769231, 29.69148033, 37.67365199, 12.75652985,
19.48755851, 15.2639087, 11.97119712, 15.69222982, 14.40972222,
20.2803527, 18.36173722, 32.52930568, 57.40697943, 33.18696557,
43.16302735, 34.08973698, 26.78616511, 16.15409518, 21.05716748,
15.06352087, 16.6404671, 18.29689298, 21.19666048, 15.7168413,
25.05327966, 59.63324601, 26.08805031, 28.93116956, 49.86901643,
49.25615364, 37.8384913, 47.14684757, 29.71848225, 20.51349921,
17.08812261, 22.06913828, 13.41404358, 19.45597049, 20.21563342,
20.82317899, 19.16648094, 54.67094703, 31.24128312, 35.30612245,
52.52072597, 34.42824882, 29.47282163, 28.90414317, 43.49371889,
21.28460091, 17.10587147, 21.67644184, 18.17082023, 16.62439474,
22.60932244, 23.04822808, 18.02791803, 33.44095941, 50.23319083,
28.65369341, 28.86940033, 32.6568959, 18.89804325, 14.54496931,
14.80571684, 43.49477683, 24.98729029, 19.12702305, 14.72747497)
plot(x,y)
fit<-lm(y ~ poly(x, 2))
lines(x, predict(fit), col = "red")
If you absolutely want to use the generic plotting functions in R, I've figured out the problem. Your x-values aren't in order, and lines simply plots things in order. To fix it, you have to order your x values:
Y<-predict(fit)
df<-data.frame(x,Y)
df2<-df[order(df$x),]
plot(x,y)
lines(df2$x,df2$Y,col="red")
You can do this nicely with a package called ggplot2:
install.packages("ggplot2")
library(ggplot2)
df<-data.frame(x,y)
ggplot(df,aes(x,y))+
geom_point()+geom_smooth(method = "lm", formula = y ~ poly(x, 2),colour="red")

Resources