Calculate AUC and variables importance in mcgv::gam in R - r

Hello my dataset looks like this:
structure(list(pa = structure(c(2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
1L, 2L, 1L, 1L, 2L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
lon = c(26.953632, 26.914444, 26.854655, 26.377477, 26.653273,
26.739085, 26.732233, 26.67895, 26.6691, 26.925116, 26.771316,
26.952233, 26.934466, 26.9493, 26.948333), lat = c(37.65571,
37.658056, 37.548262, 37.714353, 37.670897, 37.652183, 37.664717,
37.672083, 37.6934, 37.63755, 37.41155, 37.65095, 37.661533,
37.65825, 37.652166), distance = c(2664.205501, 2188.408657,
1309.509802, 2931.223857, 443.7116677, 83.4248179, 1162.349952,
1025.302461, 1447.284772, 156.3081952, 1718.49796, 2120.230705,
2940.015299, 2859.658249, 2179.706853), N = c(2L, 3L, 3L,
4L, 1L, 3L, 3L, 4L, 8L, 7L, 2L, 0L, 10L, 0L, 0L), nh4 = c(0.0911071189102672,
0.0912837530530634, 0.0887604283967188, 0.0809833919295647,
0.0806452852518153, 0.0873989977309376, 0.0854938036251452,
0.0837840217003991, 0.113291559368372, 0.139553981108798,
0.136305334431029, 0.149872598116116, 0.14975582563108, 0.149872598116116,
0.149872598116116), ppn = c(3.13649814951996, 3.38222779366539,
2.5790228332411, 1.68392748415672, 2.80087243875361, 3.2346900728285,
3.17393288172866, 2.63412894585215, 3.14572940860351, 4.80038520203728,
5.83457531216185, 5.10820325640801, 5.14342739916075, 5.10820325640801,
5.10820325640801)), row.names = c(1L, 2L, 3L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 13L, 16L, 17L, 18L, 19L), class = "data.frame")
I'm trying to fit a model with this kind of formula:
mod <- mgcv::gam(data=db, family=binomial(link="logit"), method="REML",
cbind(pa, N) ~ s(lon) + s(lat) + ti(lon, lat, distance, bs = "re") +
s(nh4) + s(ppn, k = 10) )
Where pa is a binomial variable (presence/absence) and N is the number of individuals collected (when when presence has value 1). The problem is when I run the following code to calculate the AUC, R returns errors:
library(mgcv) # library for GAM
library(ggplot2) # for beautiful plots
library(cdata) # data wrangling
library(sigr) # AUC calculation
data <- dplyr::select(db, pa, lon, lat, distance, nh4, ppn, N, season)
randn=runif(nrow(data))
train_idx=randn<=0.8
train=data[train_idx,]
test=data[!train_idx,]
performance=function(y,pred){
confmat_test=table(truth=y,predict=pred>0.5)
acc=sum(diag(confmat_test))/sum(confmat_test)
precision=confmat_test[2,2]/sum(confmat_test[,2])
recall=confmat_test[2,2]/sum(confmat_test[2,])
auc=calcAUC(pred,y)
c(acc,precision,recall,auc)
}
# Posterior probability
train$pred=predict(gam_model,newdata = train,type = "response")
test$pred=predict(gam_model,newdata=test,type="response")
# model performance evaluated using training data
perf_train=performance(train$pa_dd,train$pred)
perf_test=performance(test$pa_dd,test$pred)
perf_mat=rbind(perf_train,perf_test)
colnames(perf_mat)=c("accuracy","precision","recall","AUC")
round(perf_mat,4)
Questions are:
Is this formula correct?
How can I compute AUC?
How can I compute each variable's importance?
Thank you in advance.

Related

How to specify upper and lower parameter bounds in nlmer models of lme4 using the bobyqa optimizer

I have a dataset to which I want to fit a nonlinear model with random effects. The dataset involves different lines being observed along time. The total number of lines were split up into batches that were executed on different times in the year. When using nlmer(), I ran into issues on how to specify boundaries of parameters when using the bobyqa optimizer.
A simple version of my dataset is as follows:
batch<-c(rep("A",29),rep("B",10),rep("C",10))
line<-c(rep(1:3,9), 1,3,rep(4:5,5),rep(6:7,5))
day<-c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L,
9L, 9L)
result<-c(-2.5336544395728, -2.69179853934892, -2.85649494251061, -4.08634506491338,
-3.57079698958629, -2.62038994824068, -2.69029745619165, -2.18131299587959,
-2.1028751114459, -2.56553968316024, -2.55450633017557, -2.43072209061048,
-2.42496349148255, -2.52850292795008, -1.09958807849945, -1.49448455383069,
-0.461525929110392, -0.298569396331159, -0.520372425046126, -0.676393841451061,
-0.930448741799686, -0.414050789171074, -0.0915696466880981,
-0.239509444743891, -0.319036274966057, -0.189385981406834, -0.376015368786241,
-0.269728570922294, -0.260869642513491, -0.206260420960064, -0.790169432375232,
-0.0573210164472325, -0.202013642441365, -0.0853200223702248,
-0.13422881481997, 0.0831839881028635, -0.0288333371533044, 0.124233139837959,
-0.16906818823674, -0.299957519185662, -0.085547531863026, 0.00929447912629702,
-0.117359669415726, -0.0764263122102468, -0.00718772329252618,
0.0110076995240481, -0.0304444368953004, 0.0586926009563272,
-0.0457574905606751)
data <- data.frame(day, line, batch, result)
data$line<-as.factor(data$line)
data$batch<-as.factor(data$batch)
The nlmer() function of lme4 allows for complex random effects to be specified. I use bobyqa as optimizer, to avoid convergence issues:
#defining the function needed for nlmer()
nform <- ~ z-(p0*(z-Za))/(p0+(1-p0)*(1/(1+s))^day)
nfun <- deriv(nform, namevec = c("z","p0","Za","s"),
function.arg = c("day", "z","p0","Za","s"))
nlmerfit = nlmer(log10perfract ~ nfun(day, z, p0, Za, s) ~
(z+s+Za|batch),
data = data,
start= coef(nlsfit),
control= nlmerControl(optimizer = "bobyqa")
However, specifying upper and lower limits does not work (with nlme or nls, no issues whatsoever) :
Error in nlmerControl(optimizer = "bobyqa", lower = lower_bounds,
upper = upper_bounds) : unused arguments (lower = lower_bounds,
upper = upper_bounds)
When specifying these bounds in an optCtrl argument as a list, R returns that my starting values violate the bounds (which they do not?):
nlmerfit = nlmer(log10perfract ~ nfun(day, z, p0, Za, s) ~
(z+s+Za|batch),
data = data,
start= coef(nlsfit),
control= nlmerControl(optimizer = "bobyqa",
optCtrl = list(lower = lower_bounds,
upper = upper_bounds)
)
)
Error in (function (par, fn, lower = -Inf, upper = Inf, control =
list(), : Starting values violate bounds
I need these bounds to be working as my real data is even a bit more complex (containing different groups of data for which the bounds are needed to allow a fit).

Calculate random variables from grouped dataframe

I have a data frame called stats. I'd like to group by month_name and item and generate a random variable drawn from a normal distribution in a new column called rv.
This is the code I tried but it repeats the generation of 1 random variable in the rv column:
stats %>%
group_by(month_name, item) %>%
mutate(rv = rnorm(1, mean = mean, sd = sd))
The goal is to eventually replicate the rv output 10,000 times. How can I modify my code to generate the random variable for every row once and 10,000 times?
This is my data:
structure(list(month_name = structure(c(1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L,
5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L,
8L, 8L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 11L, 11L,
11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L), .Label = c("January",
"February", "March", "April", "May", "June", "July", "August",
"September", "October", "November", "December"), class = c("ordered",
"factor")), item = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
2L, 3L, 4L, 5L), .Label = c("a", "b", "e", "d", "c"), class = "factor"),
min = c(39853.3676768762, 11963.7336771844, 28475.0829411315,
36241.9007031999, 35743.7750504859, 16092.552892924, 12528.9369640133,
28363.8516762228, 29557.1911907891, 20577.9268503088, 26012.6643332399,
43743.1386573406, 33757.0104608081, 24012.3644652027, 29987.8232384625,
26663.1671529956, 50724.1357441692, 33156.7203254077, 36604.0975795671,
32448.5497811945, 47818.2983698804, 25173.5205474241, 29648.7882981325,
39034.0329768052, 15083.5548610647, 41560.8896893507, 40563.2944650284,
48794.4526055819, 35895.1783353774, 30085.4271923688, 39675.7305889162,
33628.9557047603, 36950.5993766457, 30593.5091646214, 28957.5398056329,
37080.7178800747, 45454.3924797489, 28755.6280571895, 34733.1340290652,
37227.9361452194, 21493.809533368, 33292.9944106622, 28137.6372068055,
25582.8046285949, 23073.0637573296, 28846.9082264882, 19454.182866794,
33869.2858697563, 19016.7538627489, 30647.6876387916, 35123.8965500988,
34146.2780735908, 40593.6508043686, 26908.3734089905, 47178.2458120079,
24665.5899193578, 22701.4906439165, 50735.1331088719, 36108.7624278488,
21415.5715318508), lower = c(54524.7101912146, 26928.6804993352,
25119.8847919585, 45942.5372327181, 52100.762800828, 23399.2712234262,
14178.7907654734, 71366.6268933559, 49209.2124037853, 54643.7588467776,
48369.7944054794, 29515.3335011807, 41577.635577101, 25357.3837384686,
43253.4733925982, 43401.4748829102, 37741.3586860236, 52294.4029786582,
58136.6122795486, 43617.5523486807, 46648.1777348884, 47822.6060157009,
37122.0182632065, 65447.4620274838, 29544.1919272749, 54822.3562275875,
64814.4174753617, 65538.2587526896, 39975.4034746898, 59117.6049731313,
49024.4324422717, 25273.7368374795, 56946.7596272533, 50660.5745923196,
37221.8185672126, 30508.2772838287, 47172.6674212663, 52956.1465111511,
45488.8349086128, 52660.1832157037, 37406.8854102724, 25601.012749268,
41414.610113642, 41145.7009104373, 26879.9690641376, 69323.7347440924,
59453.3099916568, 19260.9187209561, 14090.2250971317, 41778.9038974128,
35013.9160392596, 39672.0871995261, 57517.2881078087, 52765.3573599843,
57267.2271717807, 54869.720268229, 58525.9231470629, 44610.285805162,
47317.3995094377, 17599.590085043), mean = c(58549.8098049081,
56374.4327553941, 39864.1715264267, 85333.1530921059, 64454.2358008729,
63343.4098283811, 69838.6859070403, 41935.3881398536, 40239.4399412696,
70073.2291007902, 57535.295477502, 76197.4454180647, 60836.2074195693,
64601.7379215889, 51599.3556004457, 49092.0124309883, 47319.767991988,
63121.0872241636, 43048.0322965586, 77405.4987695189, 64320.8901918307,
53059.7915920758, 63712.4934804165, 37248.933469329, 48285.12302248,
60352.1030623367, 67648.010113929, 52282.8579266665, 63868.4373429784,
71370.1455147326, 59275.2217698193, 74524.7831867724, 62464.1935824186,
50255.8945012446, 31094.1686136834, 75833.6439248775, 32190.7391406323,
77010.5148506178, 69635.0888164364, 65885.8987213858, 54022.7135642953,
35801.3865465657, 60637.9983665307, 90783.7721781328, 57264.0603250172,
59977.2976696403, 71712.656969139, 76705.4011709067, 89462.5059367925,
76714.0458753254, 56859.5782454854, 66820.0053236744, 58243.7435076688,
52843.8704599132, 77247.3384533588, 55515.7748808548, 75004.3165800858,
88370.1869726297, 68628.9281194796, 53895.0496305422), median = c(42352.1610450345,
57330.3183802072, 55273.2047201131, 82351.3852530883, 46370.4898234873,
52386.0432388715, 47943.0683307536, 53897.781347776, 67858.0064600009,
73013.024717384, 83116.7356352266, 44401.5903576421, 69025.6068023045,
81625.3403276092, 43344.4404418446, 49701.9746204065, 44889.5603216509,
86449.7649043697, 52150.9769065634, 58675.8138647348, 55665.7047792249,
44566.4888204713, 50517.7492643733, 73778.9515308994, 60652.1631558926,
87345.0069311662, 68268.9807235179, 41356.3226356087, 41585.1763113502,
75144.8373297139, 81967.7788670882, 66041.6207332688, 55103.8870449834,
77301.4195253735, 54130.4774678618, 65176.7990367632, 46834.9652749994,
65134.3889325556, 76621.5018669346, 89066.7483257445, 79344.8597627239,
50867.4889878177, 51326.3717332736, 74843.6262595514, 66235.6184875188,
98300.5112442494, 51378.9240605971, 61277.8214283028, 48915.1245226839,
52765.9194941648, 47028.8412992194, 74841.2039136489, 70896.5761749783,
67414.0877191645, 60655.1682545525, 42707.2850070942, 51244.6187187212,
70889.9732948709, 82834.1260629236, 56029.4540887989), upper = c(96808.9361470916,
72722.9262056796, 89079.513341868, 84709.1878768955, 87694.368834914,
87860.8548839792, 80996.3827453218, 84247.9259137302, 95585.6388675179,
57338.746606262, 88681.3926853573, 87957.989278465, 87360.6574510974,
92664.4254709955, 73493.0826366849, 84230.5990186054, 81442.2517006442,
87801.9592453634, 107883.319372054, 101919.939543795, 78090.4252899963,
70239.1417329303, 100675.767786787, 99806.9236049608, 71452.5071326737,
73879.3479602876, 106131.22309752, 125238.035074805, 76731.6350473027,
105563.285669622, 98604.105083167, 88657.8428176833, 81133.2031578456,
92495.2957986084, 104836.803460225, 102419.6178137, 86160.3548401189,
87287.9179449312, 72987.3973022452, 73185.0732579627, 90916.179982239,
111282.33982277, 142168.512194455, 100479.774695548, 118375.00968986,
116099.107730658, 105747.461541425, 106715.198136428, 128585.197217447,
87996.5319472346, 67831.1501517932, 109713.080164634, 78535.3157822644,
128602.704986898, 82213.8086826659, 118591.773718681, 66518.2467960131,
91250.5061727746, 117072.914540123, 114524.034290364), max = c(137612.711045413,
142519.370905613, 137456.124250483, 149209.014602568, 158745.717583772,
144886.189765236, 168837.723206789, 148308.890270968, 158590.65413993,
152288.303209753, 154042.306686713, 143922.848061827, 147477.579594905,
147438.066965268, 141502.628117831, 150285.096748915, 148713.594899874,
156656.255445038, 151517.357942321, 146177.731181398, 130056.291991729,
150991.849546995, 150476.190905448, 140149.802748207, 162573.574139209,
124218.878401843, 140313.610415297, 156852.359228369, 147676.550419975,
139922.178103581, 131822.195549853, 143008.968758112, 142237.425864494,
148756.818388612, 123905.560034301, 157126.60664862, 132868.19652461,
137884.902850549, 142164.212835827, 144616.429331364, 154277.663061656,
156870.781144851, 170948.478868233, 154970.297432983, 144661.430142095,
151193.528913062, 136056.623739965, 132695.069145067, 144366.408646971,
154456.483407293, 143518.023088591, 145811.265404348, 139900.024678788,
127547.709882734, 149995.24047052, 145400.958382574, 159524.480570906,
118905.663549293, 161631.72583606, 147524.546274058), sd = c(9989.37951375166,
9906.50689980405, 9903.6852849217, 10008.3321579478, 10075.4653993515,
10063.7122293343, 10053.0016932606, 9826.1129055558, 9855.88655389009,
10028.7176055065, 10070.3833732403, 9941.07465801432, 10094.2667749602,
9910.53181242413, 10104.5889493016, 9851.70104229335, 9972.91821342281,
10080.4485086333, 10044.5102818099, 10037.3707232711, 10025.1107006076,
10022.3659427419, 9941.51637265177, 9873.12826319285, 10027.9036424549,
10033.6518983864, 9970.47127759776, 9937.3319252128, 10013.3439414305,
10030.3125017708, 10168.5115559098, 10213.3568382367, 9990.24289183087,
9968.82189362707, 10048.7504375345, 10015.8411633632, 10037.6851291425,
9925.92765463682, 9835.81447415085, 9782.6505066721, 10033.5360418173,
9991.76186224687, 9924.86818104305, 9970.41809893224, 9980.55197551292,
9886.97032019385, 9925.73912143071, 9971.01687402101, 9858.19281102242,
9969.19466304141, 9955.12658457894, 10139.5950943687, 9967.09479735319,
10168.1650679826, 10023.9501235604, 9821.41776472295, 10064.1149573067,
10134.8532916488, 9943.57024828908, 9833.93164357077)), row.names = c(NA,
-60L), groups = structure(list(month_name = structure(1:12, .Label = c("January",
"February", "March", "April", "May", "June", "July", "August",
"September", "October", "November", "December"), class = c("ordered",
"factor")), .rows = structure(list(1:5, 6:10, 11:15, 16:20, 21:25,
26:30, 31:35, 36:40, 41:45, 46:50, 51:55, 56:60), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
you can try data.table:
library(data.table) # load data.table
setDT(stats) # convert stats to data.table
stats[, rv := rnorm(.N, mean = mean, sd = sd), by = .(month_name, item)]
In your example there's only one record of each combination month_name and item, but I can imagine your real data has more.
The := is an assignment operator. You'll assign the result of rnorm into a new column called rv.
The by = part serves for grouping, see ?data.table.
EDIT TO ADD:
If you want 10,000 random variables, then:
stats[, new_rv := .(list(rnorm(1e4, mean, sd))), by = .(month_name, item)]
You already know the := and the by = parts, so let's dive into the expression in the middle:
The .(list()) bit will assign the resulting list (vector of 10,000 random numbers, in our case) to the variable (because we are using the assignment operator :=).
The very interesting thing is that with this .(list()) "combo" you can store complex things in a variable (column) of a data.table. I use it often to store things such as forecasts, plots or linear models, etc. by group: it is very useful!
Now, if you want to operate on your new variable, please keep in mind that it is a list, so you need to subset it accordingly:
If you want to check that the standard deviation of new_rv of row 1 is close to what you expect, the following code will throw an error:
stats[1, sd(new_rv)]
> Error in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm) :
> is.atomic(x) is not TRUE
It is complaining that you are attempting to calculate sd() on a list. The correct code is:
stats[1, sd(new_rv[[1]])]
[1] 9926.439
The [[1]]] part is correctly subsetting the first element of the list.

Error in terms.formula : '.' in formula and no 'data' argument for linear regression

I was writing a simple code for multiple linear regression in R. The code is as follows.
dataset$State = factor (dataset$State,
levels = c ('New York','California','Florida'),
labels = c ('1','2','3') )
#Splitting the dataset
library(caTools)
set.seed(123)
split = sample.split(dataset$Profit, SplitRatio = 0.8)
training_set = subset(dataset$Profit, split == TRUE)
test_set = subset(dataset$Profit, split == FALSE)
#Fitting Multiple Linear Regression to the Training set
regressor = lm(formula = Profit ~ ., data = training_set)
But i get this error when running.
Error in terms.formula(formula, data = data) : '.' in formula and
no 'data' argument
Why gives such an error?
dataset at https://drive.google.com/drive/folders/1M5HAKs1s2ABYMEzVYMwWUaATlCw2ayZC?usp=sharing
Thanks for making this reproducible
dataset <-
structure(list(R.D.Spend = c(165349.2, 162597.7, 153441.51, 144372.41,
142107.34, 131876.9, 134615.46, 130298.13, 120542.52, 123334.88,
101913.08, 100671.96, 93863.75, 91992.39, 119943.24, 114523.61,
78013.11, 94657.16, 91749.16, 86419.7, 76253.86, 78389.47, 73994.56,
67532.53, 77044.01, 64664.71, 75328.87, 72107.6, 66051.52, 65605.48,
61994.48, 61136.38, 63408.86, 55493.95, 46426.07, 46014.02, 28663.76,
44069.95, 20229.59, 38558.51, 28754.33, 27892.92, 23640.93, 15505.73,
22177.74, 1000.23, 1315.46, 0, 542.05, 0), Administration = c(136897.8,
151377.59, 101145.55, 118671.85, 91391.77, 99814.71, 147198.87,
145530.06, 148718.95, 108679.17, 110594.11, 91790.61, 127320.38,
135495.07, 156547.42, 122616.84, 121597.55, 145077.58, 114175.79,
153514.11, 113867.3, 153773.43, 122782.75, 105751.03, 99281.34,
139553.16, 144135.98, 127864.55, 182645.56, 153032.06, 115641.28,
152701.92, 129219.61, 103057.49, 157693.92, 85047.44, 127056.21,
51283.14, 65947.93, 82982.09, 118546.05, 84710.77, 96189.63,
127382.3, 154806.14, 124153.04, 115816.21, 135426.92, 51743.15,
116983.8), Marketing.Spend = c(471784.1, 443898.53, 407934.54,
383199.62, 366168.42, 362861.36, 127716.82, 323876.68, 311613.29,
304981.62, 229160.95, 249744.55, 249839.44, 252664.93, 256512.92,
261776.23, 264346.06, 282574.31, 294919.57, 0, 298664.47, 299737.29,
303319.26, 304768.73, 140574.81, 137962.62, 134050.07, 353183.81,
118148.2, 107138.38, 91131.24, 88218.23, 46085.25, 214634.81,
210797.67, 205517.64, 201126.82, 197029.42, 185265.1, 174999.3,
172795.67, 164470.71, 148001.11, 35534.17, 28334.72, 1903.93,
297114.46, 0, 0, 45173.06), State = structure(c(1L, 2L, 3L, 1L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 2L, 3L, 2L, 3L, 1L, 2L, 1L, 3L, 1L,
2L, 1L, 3L, 3L, 1L, 2L, 3L, 1L, 3L, 1L, 3L, 1L, 2L, 3L, 2L, 1L,
3L, 2L, 1L, 2L, 2L, 3L, 2L, 1L, 2L, 1L, 3L, 2L, 1L, 2L), .Label = c("1",
"2", "3"), class = "factor"), Profit = c(192261.83, 191792.06,
191050.39, 182901.99, 166187.94, 156991.12, 156122.51, 155752.6,
152211.77, 149759.96, 146121.95, 144259.4, 141585.52, 134307.35,
132602.65, 129917.04, 126992.93, 125370.37, 124266.9, 122776.86,
118474.03, 111313.02, 110352.25, 108733.99, 108552.04, 107404.34,
105733.54, 105008.31, 103282.38, 101004.64, 99937.59, 97483.56,
97427.84, 96778.92, 96712.8, 96479.51, 90708.19, 89949.14, 81229.06,
81005.76, 78239.91, 77798.83, 71498.49, 69758.98, 65200.33, 64926.08,
49490.75, 42559.73, 35673.41, 14681.4)), .Names = c("R.D.Spend",
"Administration", "Marketing.Spend", "State", "Profit"), row.names = c(NA,
-50L), class = "data.frame")
The issue is with subsetting. Replace
training_set = subset(dataset$Profit, split == TRUE)
test_set = subset(dataset$Profit, split == FALSE)
with
training_set = subset(dataset, subset = split)
test_set = subset(dataset, subset = !split)
lm(formula = Profit ~ ., data = training_set)
#Call:
#lm(formula = Profit ~ ., data = training_set)
#
#Coefficients:
# (Intercept) R.D.Spend Administration Marketing.Spend
# 4.965e+04 7.986e-01 -2.942e-02 3.268e-02
# State2 State3
# 1.213e+02 2.376e+02

nls regression and storing output coefficients and plots

I tried to make reproducible data as suggested by How to make a great R reproducible example?:
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ANK.26.1",
"ANK.35.10"), class = "factor"), DAY = c(2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L,
18L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L), carbon = c(1684.351094778, 3514.451339358,
6635.877888654, 10301.700591252, 11361.360992769, 11891.934331254,
12772.885869486, 13545.127224369, 14022.00520767, 14255.045990397,
14479.813468278, 14611.749542181, 14746.382638335, 14942.733363567,
14961.338739162, 15049.433738817, 15047.197961499, 1705.361701104,
3293.593040601, 4788.872254899, 6025.622715999, 6670.80499518,
7150.526272512, 7268.955557607, 7513.61998338, 7896.202773246,
8017.953574608, 8146.09464786, 8286.148260324, 8251.229520243,
8384.244997158, 8413.034235219, 8461.066691601, 8269.360979031
), g.rate.perc = c(NA, 1.08653133557123, 0.888168948119852,0.55242467750436,
0.102862667394628, 0.0466998046116733, 0.0740797513417739, 0.060459426536321,
0.0352066079115925, 0.0166196474238596, 0.0157675729725753, 0.00911172469120847,
0.00921402983026387, 0.0133151790542558, 0.00124511193115184,
0.00588817626489591, -0.000148562222127446, NA, 0.931316411333049,
0.45399634862756, 0.258255053647507, 0.107073129133681, 0.0719135513148148,
0.0165623173150578, 0.0336588143694119, 0.0509185706373581,0.0154189051191185,
0.0159817679236518, 0.0171927308137518, -0.00421410998016991,
0.0161206856006937, 0.00343373053515927, 0.00570929049366353,
-0.0226573929218994), max.carb = c(15049.433738817, 15049.433738817,
15049.433738817, 15049.433738817, 15049.433738817, 15049.433738817,
15049.433738817, 15049.433738817, 15049.433738817, 15049.433738817,
15049.433738817, 15049.433738817, 15049.433738817, 15049.433738817,
15049.433738817, 15049.433738817, 15049.433738817, 8461.066691601,
8461.066691601, 8461.066691601, 8461.066691601, 8461.066691601,
8461.066691601, 8461.066691601, 8461.066691601, 8461.066691601,
8461.066691601, 8461.066691601, 8461.066691601, 8461.066691601,
8461.066691601, 8461.066691601, 8461.066691601, 8461.066691601
)), .Names = c("ID", "DAY", "carbon", "g.rate.perc", "max.carb"
), row.names = c(NA, 34L), class = "data.frame")
'data.frame': 34 obs. of 5 variables:
$ ID : Factor w/ 150 levels "ANK.26.1","ANK.35.10",..: 1 1 1 1 1 1 1 1 1 1 ...
$ DAY : int 2 3 4 5 6 7 8 9 10 11 ...
$ carbon : num 1684 3514 6636 10302 11361 ...
$ g.rate.perc: num NA 1.087 0.888 0.552 0.103 ...
$ max.carb : num 15049 15049 15049 15049 15049 ...
In the sample data ID only has two levels not the indicated 150.
My nls looks like that:
res.sample <- ddply (
d, .(ID),
function(x){
mod <- nls(carbon~phi1/(1+exp(-(phi2 + phi3 * DAY))),
start=list(
phi1 = x$max.carb,
phi2 = int[1],
phi3 = mean(x$g.rate.perc)),
data=x,trace=TRUE)
return(coef(mod))
}
)
phi2 is actually the result for the intercept from
int <- coef(lm(DAY~carbon,data=sample))
Unfortunately it doesnt work anymore, since i tried to wrap it into a ddply surrounding, but i cant go through all original 150 levels of ID manually.
On top of that, i would like to store all three output values for phi1 - phi3 in a data frame/ list with the respective ID. I meant to do that by
return(coef(mod))
The cherry on top would be the plots of the actual data and the fitted curve on top. Manually with subsetting i can do this as well but it is just too time consuming.
My reduced ggplot code for that is
ggplot(data=n, aes(x = DAY, y = carbon))+
geom_point(stat="identity", size=2) +
geom_line( aes(DAY,predict(logMod) ))+
ggtitle("ID")
if somehow the ID which contains a triplet of information, is less useful, here is how you can return it into another version
sep_sample <- sample %>% separate(ID, c("algae", "id", "nutrient"))
I feel like this is so much to ask but i have really tried and i can only spend so many days on this.
Here is a summary:
I need to run the model on every level of ID / every combination of algae & nutrient if you split it.
The output phi's should be stored in some kind of frame/list/table with their respective identification of where they belong to.
Ideally there is a way to include ggplots in all of this, which are automatically generated as well and stored.
as i said, the model in itself already worked, but when i put in in the ddply structure i get the following error message:
Error in numericDeriv(form[[3L]], names(ind), env) :
Missing value or an infinity produced when evaluating the model
I hope this is something you can work with somehow and this appears as a reasonable question to ask. If there are pages that already provide a similar solution that I havent found I will gladly take a look.
Thanks a lot!
soo i came up with this solution, which is not all i wanted, but i think a step closer, since it is running
coef_list <- list()
curve_list <- list()
for(i in levels(d$ALGAE)) {
for(j in levels(d$NUTRIENT)) {
dat = d[d$ALGAE == i & d$NUTRIENT == j,]
#int <- coef(lm(DAY~carbon,data=dat))
mod <- nls(carbonlog~phi1/(1+exp(-(phi2+phi3*DAY))),
start=list(
phi1=9.364,
phi2=0,
phi3= 0.135113),
data=dat,trace=TRUE)
coef_list[[paste(i, j, sep = "_")]] = coef(mod)
plt <- ggplot(data = dat, aes(x = DAY, y = carbonlog)) + geom_point()+
geom_line( aes(DAY,predict(mod) ))+
ggtitle(paste(i,"RATIO",j,sep=" ")) +
theme.plot
curve_list[[paste(i, j, sep = "_")]] = plt
}
}
unfortunately the paramters are static and not dependend on the respective factor combination. i reckon that the letter would be more helpful finding a fit.
if i apply
curve_list[["ANK_1"]]
i get an error message though:
Error: Aesthetics must be either length 1 or the same as the data (17): x, y
i only get the message when i use the log transformed carbon values. when i use carbon in its original format it plots everything

Parameter fitting ODE models with nls.lm using short time series

I am currently trying to fit functional responses in laboratory experiments accounting for depletion using the Levenberg-Marquardt routine (nls.lm) in minpack. As an example I have been using the levenberg-marquardt routine (nls.lm) in minpack following the tutorial here (http://www.r-bloggers.com/learning-r-parameter-fitting-for-models-involving-differential-equations/).
In the example he fits the data by first setting up a function rxnrate which I modified shown below:
# rate function
rxnrate=function(t,c,parms){
# rate constant passed through a list called parms
a=parms$a
h=parms$h
# c is the concentration of species
# derivatives are computed below
r=rep(0,length(c))
r[1]=-c["B"]*a*c["A"]/(c["B"]+a*h*c["A"])#prey
r[2]=0#predator
# the computed derivatives are returned as a list
# order of derivatives needs to be the same as the order of species in c
return(list(r))
}
My problem is that rather than having a long time series to work with I have many short time series (n=6) with multiple start points. Fitting these individually with the nls.lm function would result in fairly useless estimates. My low-tech solution, which has yielded comparable results to the rogers analytic method, was to arrange them all and fit them simultaneously as in the example below.
# rate function
rxnrate=function(t,c,parms){
# rate constant passed through a list called parms
a=parms$a
h=parms$h
# c is the concentration of species
# derivatives are computed below
r=rep(0,length(c))
r[1]=-c["B"]*a*c["A"]/(c["B"]+a*h*c["A"])#prey
r[2]=0#predator
r[3]=-c["D"]*a*c["C"]/(c["D"]+a*h*c["C"])#prey2
r[4]=0#predator2
r[5]=-c["F"]*a*c["E"]/(c["F"]+a*h*c["E"])#prey3
r[6]=0#predator3
# and so on
return(list(r))
}
The problem with this is that I quickly run out of letters in addition to the fact that it is extremely inefficient to hard code all of these time series (over 100 in total).
My question is because the paired equations are all the same, is there a solution where I can write them once and have the function apply it to all the subsequent paired time series. I also wasn't sure if this solution would lead to any mathematical issues down the road with parameter estimation even though it seems to give comparable results to other methods.
here is a small working example
library(reshape2) # library for reshaping data (tall-narrow <-> short-wide)
library(deSolve) # library for solving differential equations
library(minpack.lm) # library for least squares fit using levenberg-marquart algorithm
#load population data
rate= structure(list(time = c(0, 0.5, 1, 1.5, 2, 2.5), a = c(6L, 5L,
3L, 4L, 3L, 3L), b = c(1L, 1L, 1L, 1L, 1L, 1L), c = c(6L, 3L,
3L, 4L, 2L, 3L), d = c(3L, 3L, 3L, 3L, 3L, 3L), e = c(6L, 6L,
4L, 2L, 3L, 3L), f = c(6L, 6L, 6L, 6L, 6L, 6L), g = c(12L, 8L,
8L, 8L, 8L, 7L), h = c(1L, 1L, 1L, 1L, 1L, 1L), i = c(12L, 11L,
7L, 6L, 3L, 4L), j = c(3L, 3L, 3L, 3L, 3L, 3L), k = c(24L, 12L,
11L, 15L, 8L, 7L), l = c(1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("time",
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"), row.names = c(NA,
6L), class = "data.frame")
rxnrate=function(t,c,parms){
# rate constant passed through a list called parms
a=parms$a
h=parms$h
m=parms$m
# derivatives dc/dt are computed below
r=rep(0,length(c))
holling<-c["B"]*a*c["A"]/(c["B"]+a*h*c["A"])
r[1]=-c["B"]*a*c["A"]/(c["B"]^m+a*h*c["A"]) #dN1/dt
r[2]=0
r[3]=-c["D"]*a*c["C"]/(c["D"]^m+a*h*c["C"]) #dN1/dt
r[4]=0
r[5]=-c["F"]*a*c["E"]/(c["F"]^m+a*h*c["E"]) #dN1/dt
r[6]=0
r[7]=-c["H"]*a*c["G"]/(c["H"]^m+a*h*c["G"]) #dN1/dt
r[8]=0
r[9]=-c["J"]*a*c["I"]/(c["J"]^m+a*h*c["I"]) #dN1/dt
r[10]=0
r[11]=-c["L"]*a*c["K"]/(c["L"]^m+a*h*c["K"]) #dN1/dt
r[12]=0
return(list(r))
}
ssq=function(parms){
# inital concentration
cinit=cinit
# time points for which conc is reported
# include the points where data is available
t=c(seq(0,2.5,0.5),rate$time)
t=sort(unique(t))
# parameters from the parameter estimation routin
a=parms[1]
h=parms[2]
m=parms[3]
# solve ODE for a given set of parameters
out=ode(y=cinit,times=t,func=rxnrate,parms=parms)
# Filter data that contains time points where data is available
outdf=data.frame(out)
outdf=outdf[outdf$time %in% rate$time,]
# Evaluate predicted vs experimental residual
preddf=melt(outdf,id.var="time",variable.name="species",value.name="conc")
expdf=melt(rate,id.var="time",variable.name="species",value.name="conc")
ssqres=preddf$conc-expdf$conc
return(ssqres)
}
# parameter fitting using levenberg marquart algorithm
# initial guess for parameters
control=nls.lm.control(maxiter = 1000,ptol=0.000000000000000000000001,ftol=0.0000000000000000000001)
cinit=c(A=6,B=1,C=6,D=3,E=6,F=6,G=12,H=1,I=12,J=3,K=24,L=1)
parms=list(a=1,h=0.1,m=1)
fit=nls.lm(par=parms,fn=ssq,lower=c(rep(0,3)),upper=c(2,0.5,2),control=control)

Resources