Calculate random variables from grouped dataframe - r

I have a data frame called stats. I'd like to group by month_name and item and generate a random variable drawn from a normal distribution in a new column called rv.
This is the code I tried but it repeats the generation of 1 random variable in the rv column:
stats %>%
group_by(month_name, item) %>%
mutate(rv = rnorm(1, mean = mean, sd = sd))
The goal is to eventually replicate the rv output 10,000 times. How can I modify my code to generate the random variable for every row once and 10,000 times?
This is my data:
structure(list(month_name = structure(c(1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L,
5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L,
8L, 8L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 11L, 11L,
11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L), .Label = c("January",
"February", "March", "April", "May", "June", "July", "August",
"September", "October", "November", "December"), class = c("ordered",
"factor")), item = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
2L, 3L, 4L, 5L), .Label = c("a", "b", "e", "d", "c"), class = "factor"),
min = c(39853.3676768762, 11963.7336771844, 28475.0829411315,
36241.9007031999, 35743.7750504859, 16092.552892924, 12528.9369640133,
28363.8516762228, 29557.1911907891, 20577.9268503088, 26012.6643332399,
43743.1386573406, 33757.0104608081, 24012.3644652027, 29987.8232384625,
26663.1671529956, 50724.1357441692, 33156.7203254077, 36604.0975795671,
32448.5497811945, 47818.2983698804, 25173.5205474241, 29648.7882981325,
39034.0329768052, 15083.5548610647, 41560.8896893507, 40563.2944650284,
48794.4526055819, 35895.1783353774, 30085.4271923688, 39675.7305889162,
33628.9557047603, 36950.5993766457, 30593.5091646214, 28957.5398056329,
37080.7178800747, 45454.3924797489, 28755.6280571895, 34733.1340290652,
37227.9361452194, 21493.809533368, 33292.9944106622, 28137.6372068055,
25582.8046285949, 23073.0637573296, 28846.9082264882, 19454.182866794,
33869.2858697563, 19016.7538627489, 30647.6876387916, 35123.8965500988,
34146.2780735908, 40593.6508043686, 26908.3734089905, 47178.2458120079,
24665.5899193578, 22701.4906439165, 50735.1331088719, 36108.7624278488,
21415.5715318508), lower = c(54524.7101912146, 26928.6804993352,
25119.8847919585, 45942.5372327181, 52100.762800828, 23399.2712234262,
14178.7907654734, 71366.6268933559, 49209.2124037853, 54643.7588467776,
48369.7944054794, 29515.3335011807, 41577.635577101, 25357.3837384686,
43253.4733925982, 43401.4748829102, 37741.3586860236, 52294.4029786582,
58136.6122795486, 43617.5523486807, 46648.1777348884, 47822.6060157009,
37122.0182632065, 65447.4620274838, 29544.1919272749, 54822.3562275875,
64814.4174753617, 65538.2587526896, 39975.4034746898, 59117.6049731313,
49024.4324422717, 25273.7368374795, 56946.7596272533, 50660.5745923196,
37221.8185672126, 30508.2772838287, 47172.6674212663, 52956.1465111511,
45488.8349086128, 52660.1832157037, 37406.8854102724, 25601.012749268,
41414.610113642, 41145.7009104373, 26879.9690641376, 69323.7347440924,
59453.3099916568, 19260.9187209561, 14090.2250971317, 41778.9038974128,
35013.9160392596, 39672.0871995261, 57517.2881078087, 52765.3573599843,
57267.2271717807, 54869.720268229, 58525.9231470629, 44610.285805162,
47317.3995094377, 17599.590085043), mean = c(58549.8098049081,
56374.4327553941, 39864.1715264267, 85333.1530921059, 64454.2358008729,
63343.4098283811, 69838.6859070403, 41935.3881398536, 40239.4399412696,
70073.2291007902, 57535.295477502, 76197.4454180647, 60836.2074195693,
64601.7379215889, 51599.3556004457, 49092.0124309883, 47319.767991988,
63121.0872241636, 43048.0322965586, 77405.4987695189, 64320.8901918307,
53059.7915920758, 63712.4934804165, 37248.933469329, 48285.12302248,
60352.1030623367, 67648.010113929, 52282.8579266665, 63868.4373429784,
71370.1455147326, 59275.2217698193, 74524.7831867724, 62464.1935824186,
50255.8945012446, 31094.1686136834, 75833.6439248775, 32190.7391406323,
77010.5148506178, 69635.0888164364, 65885.8987213858, 54022.7135642953,
35801.3865465657, 60637.9983665307, 90783.7721781328, 57264.0603250172,
59977.2976696403, 71712.656969139, 76705.4011709067, 89462.5059367925,
76714.0458753254, 56859.5782454854, 66820.0053236744, 58243.7435076688,
52843.8704599132, 77247.3384533588, 55515.7748808548, 75004.3165800858,
88370.1869726297, 68628.9281194796, 53895.0496305422), median = c(42352.1610450345,
57330.3183802072, 55273.2047201131, 82351.3852530883, 46370.4898234873,
52386.0432388715, 47943.0683307536, 53897.781347776, 67858.0064600009,
73013.024717384, 83116.7356352266, 44401.5903576421, 69025.6068023045,
81625.3403276092, 43344.4404418446, 49701.9746204065, 44889.5603216509,
86449.7649043697, 52150.9769065634, 58675.8138647348, 55665.7047792249,
44566.4888204713, 50517.7492643733, 73778.9515308994, 60652.1631558926,
87345.0069311662, 68268.9807235179, 41356.3226356087, 41585.1763113502,
75144.8373297139, 81967.7788670882, 66041.6207332688, 55103.8870449834,
77301.4195253735, 54130.4774678618, 65176.7990367632, 46834.9652749994,
65134.3889325556, 76621.5018669346, 89066.7483257445, 79344.8597627239,
50867.4889878177, 51326.3717332736, 74843.6262595514, 66235.6184875188,
98300.5112442494, 51378.9240605971, 61277.8214283028, 48915.1245226839,
52765.9194941648, 47028.8412992194, 74841.2039136489, 70896.5761749783,
67414.0877191645, 60655.1682545525, 42707.2850070942, 51244.6187187212,
70889.9732948709, 82834.1260629236, 56029.4540887989), upper = c(96808.9361470916,
72722.9262056796, 89079.513341868, 84709.1878768955, 87694.368834914,
87860.8548839792, 80996.3827453218, 84247.9259137302, 95585.6388675179,
57338.746606262, 88681.3926853573, 87957.989278465, 87360.6574510974,
92664.4254709955, 73493.0826366849, 84230.5990186054, 81442.2517006442,
87801.9592453634, 107883.319372054, 101919.939543795, 78090.4252899963,
70239.1417329303, 100675.767786787, 99806.9236049608, 71452.5071326737,
73879.3479602876, 106131.22309752, 125238.035074805, 76731.6350473027,
105563.285669622, 98604.105083167, 88657.8428176833, 81133.2031578456,
92495.2957986084, 104836.803460225, 102419.6178137, 86160.3548401189,
87287.9179449312, 72987.3973022452, 73185.0732579627, 90916.179982239,
111282.33982277, 142168.512194455, 100479.774695548, 118375.00968986,
116099.107730658, 105747.461541425, 106715.198136428, 128585.197217447,
87996.5319472346, 67831.1501517932, 109713.080164634, 78535.3157822644,
128602.704986898, 82213.8086826659, 118591.773718681, 66518.2467960131,
91250.5061727746, 117072.914540123, 114524.034290364), max = c(137612.711045413,
142519.370905613, 137456.124250483, 149209.014602568, 158745.717583772,
144886.189765236, 168837.723206789, 148308.890270968, 158590.65413993,
152288.303209753, 154042.306686713, 143922.848061827, 147477.579594905,
147438.066965268, 141502.628117831, 150285.096748915, 148713.594899874,
156656.255445038, 151517.357942321, 146177.731181398, 130056.291991729,
150991.849546995, 150476.190905448, 140149.802748207, 162573.574139209,
124218.878401843, 140313.610415297, 156852.359228369, 147676.550419975,
139922.178103581, 131822.195549853, 143008.968758112, 142237.425864494,
148756.818388612, 123905.560034301, 157126.60664862, 132868.19652461,
137884.902850549, 142164.212835827, 144616.429331364, 154277.663061656,
156870.781144851, 170948.478868233, 154970.297432983, 144661.430142095,
151193.528913062, 136056.623739965, 132695.069145067, 144366.408646971,
154456.483407293, 143518.023088591, 145811.265404348, 139900.024678788,
127547.709882734, 149995.24047052, 145400.958382574, 159524.480570906,
118905.663549293, 161631.72583606, 147524.546274058), sd = c(9989.37951375166,
9906.50689980405, 9903.6852849217, 10008.3321579478, 10075.4653993515,
10063.7122293343, 10053.0016932606, 9826.1129055558, 9855.88655389009,
10028.7176055065, 10070.3833732403, 9941.07465801432, 10094.2667749602,
9910.53181242413, 10104.5889493016, 9851.70104229335, 9972.91821342281,
10080.4485086333, 10044.5102818099, 10037.3707232711, 10025.1107006076,
10022.3659427419, 9941.51637265177, 9873.12826319285, 10027.9036424549,
10033.6518983864, 9970.47127759776, 9937.3319252128, 10013.3439414305,
10030.3125017708, 10168.5115559098, 10213.3568382367, 9990.24289183087,
9968.82189362707, 10048.7504375345, 10015.8411633632, 10037.6851291425,
9925.92765463682, 9835.81447415085, 9782.6505066721, 10033.5360418173,
9991.76186224687, 9924.86818104305, 9970.41809893224, 9980.55197551292,
9886.97032019385, 9925.73912143071, 9971.01687402101, 9858.19281102242,
9969.19466304141, 9955.12658457894, 10139.5950943687, 9967.09479735319,
10168.1650679826, 10023.9501235604, 9821.41776472295, 10064.1149573067,
10134.8532916488, 9943.57024828908, 9833.93164357077)), row.names = c(NA,
-60L), groups = structure(list(month_name = structure(1:12, .Label = c("January",
"February", "March", "April", "May", "June", "July", "August",
"September", "October", "November", "December"), class = c("ordered",
"factor")), .rows = structure(list(1:5, 6:10, 11:15, 16:20, 21:25,
26:30, 31:35, 36:40, 41:45, 46:50, 51:55, 56:60), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))

you can try data.table:
library(data.table) # load data.table
setDT(stats) # convert stats to data.table
stats[, rv := rnorm(.N, mean = mean, sd = sd), by = .(month_name, item)]
In your example there's only one record of each combination month_name and item, but I can imagine your real data has more.
The := is an assignment operator. You'll assign the result of rnorm into a new column called rv.
The by = part serves for grouping, see ?data.table.
EDIT TO ADD:
If you want 10,000 random variables, then:
stats[, new_rv := .(list(rnorm(1e4, mean, sd))), by = .(month_name, item)]
You already know the := and the by = parts, so let's dive into the expression in the middle:
The .(list()) bit will assign the resulting list (vector of 10,000 random numbers, in our case) to the variable (because we are using the assignment operator :=).
The very interesting thing is that with this .(list()) "combo" you can store complex things in a variable (column) of a data.table. I use it often to store things such as forecasts, plots or linear models, etc. by group: it is very useful!
Now, if you want to operate on your new variable, please keep in mind that it is a list, so you need to subset it accordingly:
If you want to check that the standard deviation of new_rv of row 1 is close to what you expect, the following code will throw an error:
stats[1, sd(new_rv)]
> Error in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm) :
> is.atomic(x) is not TRUE
It is complaining that you are attempting to calculate sd() on a list. The correct code is:
stats[1, sd(new_rv[[1]])]
[1] 9926.439
The [[1]] part correctly extracts the first element of the list.

Related

Calculate AUC and variable importance in mgcv::gam in R

Hello my dataset looks like this:
structure(list(pa = structure(c(2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
1L, 2L, 1L, 1L, 2L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
lon = c(26.953632, 26.914444, 26.854655, 26.377477, 26.653273,
26.739085, 26.732233, 26.67895, 26.6691, 26.925116, 26.771316,
26.952233, 26.934466, 26.9493, 26.948333), lat = c(37.65571,
37.658056, 37.548262, 37.714353, 37.670897, 37.652183, 37.664717,
37.672083, 37.6934, 37.63755, 37.41155, 37.65095, 37.661533,
37.65825, 37.652166), distance = c(2664.205501, 2188.408657,
1309.509802, 2931.223857, 443.7116677, 83.4248179, 1162.349952,
1025.302461, 1447.284772, 156.3081952, 1718.49796, 2120.230705,
2940.015299, 2859.658249, 2179.706853), N = c(2L, 3L, 3L,
4L, 1L, 3L, 3L, 4L, 8L, 7L, 2L, 0L, 10L, 0L, 0L), nh4 = c(0.0911071189102672,
0.0912837530530634, 0.0887604283967188, 0.0809833919295647,
0.0806452852518153, 0.0873989977309376, 0.0854938036251452,
0.0837840217003991, 0.113291559368372, 0.139553981108798,
0.136305334431029, 0.149872598116116, 0.14975582563108, 0.149872598116116,
0.149872598116116), ppn = c(3.13649814951996, 3.38222779366539,
2.5790228332411, 1.68392748415672, 2.80087243875361, 3.2346900728285,
3.17393288172866, 2.63412894585215, 3.14572940860351, 4.80038520203728,
5.83457531216185, 5.10820325640801, 5.14342739916075, 5.10820325640801,
5.10820325640801)), row.names = c(1L, 2L, 3L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 13L, 16L, 17L, 18L, 19L), class = "data.frame")
I'm trying to fit a model with this kind of formula:
mod <- mgcv::gam(data=db, family=binomial(link="logit"), method="REML",
cbind(pa, N) ~ s(lon) + s(lat) + ti(lon, lat, distance, bs = "re") +
s(nh4) + s(ppn, k = 10) )
Where pa is a binomial variable (presence/absence) and N is the number of individuals collected (when presence has value 1). The problem is that when I run the following code to calculate the AUC, R returns errors:
library(mgcv) # library for GAM
library(ggplot2) # for beautiful plots
library(cdata) # data wrangling
library(sigr) # AUC calculation
data <- dplyr::select(db, pa, lon, lat, distance, nh4, ppn, N, season)
randn=runif(nrow(data))
train_idx=randn<=0.8
train=data[train_idx,]
test=data[!train_idx,]
# Summarise binary-classification quality for a vector of predicted
# probabilities.
#
# y:    observed classes (the truth), one entry per observation.
# pred: predicted probabilities; a value above 0.5 counts as a positive call.
#
# Returns an unnamed numeric vector: accuracy, precision, recall, AUC.
# Precision is NaN when no observation is predicted positive (0 / 0).
performance <- function(y, pred) {
  # Pin the prediction levels so the confusion matrix always has both a FALSE
  # and a TRUE column, even when every prediction falls on the same side of
  # the 0.5 cut-off; otherwise the [2, 2] lookups below fail with
  # "subscript out of bounds".
  predicted <- factor(pred > 0.5, levels = c(FALSE, TRUE))
  confmat_test <- table(truth = y, predict = predicted)
  # NOTE(review): the [2, ] lookups still assume y contains two classes (or is
  # a factor with two levels) -- confirm against the caller.
  acc <- sum(diag(confmat_test)) / sum(confmat_test)
  precision <- confmat_test[2, 2] / sum(confmat_test[, 2])
  recall <- confmat_test[2, 2] / sum(confmat_test[2, ])
  # calcAUC() is expected to come from the sigr package loaded by the script.
  # NOTE(review): sigr's calcAUC() appears to take a yTarget argument that
  # defaults to TRUE -- confirm y is logical, or pass yTarget explicitly.
  auc <- calcAUC(pred, y)
  c(acc, precision, recall, auc)
}
# Posterior probability
train$pred=predict(gam_model,newdata = train,type = "response")
test$pred=predict(gam_model,newdata=test,type="response")
# model performance evaluated using training data
perf_train=performance(train$pa_dd,train$pred)
perf_test=performance(test$pa_dd,test$pred)
perf_mat=rbind(perf_train,perf_test)
colnames(perf_mat)=c("accuracy","precision","recall","AUC")
round(perf_mat,4)
Questions are:
Is this formula correct?
How can I compute AUC?
How can I compute each variable's importance?
Thank you in advance.

ddply dropping rows with zero sum

I am trying to sum my data per Meter, then average out the sumCover by Transect. My issue is that when I take the mean over the transects, any transect where no native species were recorded at the meter points where the cover data was taken is effectively dropped from the dataframe after the ddply function. I have tried using the .drop argument, but the issue is that each site has unequal transect sampling because it was scaled to site size, so it effectively adds transects to every site. What I need to figure out is how to fill in the missing Transect values from a list of numbers while taking into account that each site varies from 3 to 16 transects - EDIT - the data preview seems to have been cut off and does not have sufficient rows, so here is a file:
Here is a downloadable link of the data csv
read.csv()
# ddply() is provided by the plyr package (there is no package called
# "ddply"); library() stops with an error if plyr is missing, whereas
# require() only returns FALSE.
library(plyr)
# Total cover for each Site/Transect/Locality combination at every meter mark.
NativeNonnativeCoverperMeter <- ddply(RestoredGrasslandSurveys, c("Site","Transect","Locality","Meter"), summarise,
sumCover = sum(Cover))
# Mean of the per-meter totals for each Site/Transect/Locality combination.
# .drop = FALSE is passed to ddply() itself; it is spelled out in full because
# F is an ordinary variable that can be reassigned.
NativeNonnativeCoverperTransect <- ddply(NativeNonnativeCoverperMeter, c("Site","Transect","Locality"), summarise,
avgCover = mean(sumCover), .drop = FALSE)
dput(RestoredGrasslandSurveys[1:10, ])
structure(list(Site = structure(c(10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L), .Label = c("AzevedoNorth", "AzevedoSouth",
"Big.Banana", "BlohmRanch", "CypressGrove", "Diablo.Canyon",
"Dipsea.Moors", "Elkhorn.Nursery", "Elkhorn.Owl", "ElkhornHotwire",
"FacultyHousing", "Glass.Beach", "Hanson.ESHA", "Hanson.Uplands",
"Hawk.Hill", "LightHouse", "Modoc", "MooreCreek", "Morning.Sun",
"Noyo.Headlands", "Paradise.Ridge", "Prosper.Ridge", "RussianRidge",
"Stinson.Gulch", "Tennessee.Valley", "Watsonville.Uplands", "YoungerLagoon"
), class = "factor"), County = structure(c(4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L), .Label = c("Humboldt", "Marin", "Mendocino",
"Monterery", "Monterey", "San.Luis.Obispo", "SanMateo", "Santa.Barbara",
"SantaCruz", "Sonoma"), class = "factor"), Transect = c(3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), Meter = c(0L, 5L, 10L, 15L,
20L, 25L, 30L, 35L, 40L, 45L), Lifeform = structure(c(4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("AnnualForb", "AnnualGrass",
"Fern", "Groundcover", "Horsetail", "Nfixer", "PerennialForb",
"PerennialGrass", "PerrenialForb", "Rush", "Sedge", "Shrub",
"Tree"), class = "factor"), Locality = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Groundcover", "Native",
"Nonnative"), class = "factor"), Species = structure(c(265L,
265L, 265L, 265L, 265L, 265L, 265L, 265L, 265L, 265L), .Label = c("Achillea.millefolium",
"Acmispon.glaber", "Acmispon.maritimus", "Acmispon.parviflorus",
"Acmispon.strigosus", "Agropyron.cristatum", "Aira.caryophyllea",
"Aira.elegans", "Aira.praecox", "Amsinckia.menziesii", "Anaphalis.margaritacea",
"Angelica.hendersonii", "Anthoxanthum.odoratum", "Anthriscus.caucalis",
"Artemisia.californica", "Asclepias.fascicularis", "Atriplex.semibucatta",
"Avena.barbata", "Avena.Barbata", "Avena.fatua", "Baccharis. pilularis",
"Baccharis.pilularis", "Bareground", "Bellis.perennis", "Berberis.pinnata",
"Brachypodium.distachyon", "Brassica.nigra", "Brassica.rapa",
"Brassica.tournefortii", "Briza.maxima", "Briza.minor", "Bromus.carinatus",
"Bromus.catharticus", "Bromus.diandrus", "Bromus.hordeaceous",
"Bromus.madritensis", "Bromus.maritimus", "Bromus.tectorum",
"Calamagrostis.nutkaensis", "Calandrinia.menziesii", "Calendula.arvensis",
"Calystegia.collina", "Calystegia.purpurata", "Cardamine.oligiosperma",
"Carduus.pycnocephalus", "carex.athrostachya", "Carex.gynodynama",
"Carex.lasiocarpa", "Carex.Praegracilis", "Carex.spp", "Carex.suberecta",
"Carex.tomentosa", "Carex.tumulicola", "Carpobrotus.edulis",
"Castilleja.affinis", "Castilleja.densiflora", "Cerastium.fontanum",
"Cerastium.glomeratum", "Chlorogalum.pomeridianum", "Cirsium.brevistylum",
"Cirsium.vulgare", "Clarkia.purpurea", "Clarkia.spp", "Claytonia.perfoliata",
"Clinopodium.douglasii", "Conium.maculatum", "Convolvulus.arvensis",
"Corethrogyne.filaginifolia", "Cortaderia.jubata", "Cotula.coronopifolia",
"Crassula.connata", "Crepis.vesicaria", "Croton.setigerus", "Cynodon.dactylon",
"Cynosurus.echinatus", "Cyperus.eragrostis", "Danthonia.californica",
"Daucus.pusillus", "Deschampsia.cespitosa", "Dichelostemma.capitatum",
"Dichondra.donelliana", "Dichondra.Donelliana", "Dichondra.micrantha",
"Distichlis.spicata", "Dudleya.cymosa", "Dudleya.farinosa", "Dysphania.ambrosioides",
"Ehrharta.erecta", "Elymus.condensatus", "Elymus.glaucus", "Elymus.triticoides",
"Elymus.vancouverensis", "Epilobium.brachycarpum", "Epilobium.cilatum",
"Equisetum.arvense", "Erigeron.canadensis", "Erigeron.glaucus",
"Erigeron.sumatrensis", "Eriogonum.latifolium", "Eriogonum.parvifolium",
"Eriophyllum.staechadifolium", "Erodium.botrys", "Erodium.cicutarium",
"Erodium.moscatum", "Eschscholzia.californica", "Eucalyptus.globulus",
"Festua.muyros", "Festuca.arundinacea", "Festuca.bromioides",
"Festuca.californica", "Festuca.idahoensis", "Festuca.microstachys",
"Festuca.muyros", "Festuca.perennis", "Festuca.pratensis", "Festuca.rubra",
"Foeniculum.vulgare", "Fragaria.vesca", "Frangula.californica",
"Fritillaria.affinis", "Galium.aparine", "Galium.divaricatum",
"Galium.porrigens", "Gamochaeta.ustulata", "Genista.monspessulana",
"Geranium.dissectum", "Geranium.molle", "Gilia.capitata", "Gnaphalium.palustre",
"Grindelia.latifolia", "Grindelia.stricta", "Helminthotheca.echioides",
"Hemiparasitic.ericaceae", "Heracleum.lanatum", "Heterotheca.grandiflora",
"Heterotheca.sessiliflora", "Hirschfieldia.incana", "Holcus.lanatus",
"Hordeum.brachyantherum", "Hordeum.marinum", "Hordeum.murinum",
"Horkelia.californica", "Hosackia.gracilis", "Hypochaeris.spp",
"Iris.douglasiana", "Iris.macrosiphon", "Juncus.bufonis", "Juncus.effusus",
"Juncus.mexicanus", "Juncus.occidentalis", "Juncus.patens", "Juncus.phaeocephalus",
"Koeleria.macrantha", "Lactuca.serriola", "Lasthenia.californica",
"Lathyrus.vestitus", "Leontodon.taraxacoides", "Lichen", "Linum.bienne",
"Logfia.gallica", "Lomatium.dasycarpum", "Lomatium.utriculatum",
"Lonicera.hispidula", "Lotus.corniculatus", "Lotus.micranthus",
"Lupinus.arboreus", "Lupinus.bicolor", "Lupinus.littoralis",
"Lupinus.nanus", "Lupinus.variicolor", "Luzula.comosa", "Luzula.subsessilis",
"Lysimachia.arvensis", "Lythrum.hyssopifolia", "Madia.exigua",
"Madia.gracilis", "Madia.madioides", "Madia.spp", "Malva.parviflora",
"Marah.fabaceus", "Matricaria.discoides", "Medicago.polymorpha",
"Melica.californica", "Melica.imperfecta", "Melica.torreyana",
"Melilotus.indicus", "Melilotus.officinalis", "Modiola.caroliniana",
"Moss", "Mulch", "Mushroom.cover", "Myosotis.discolor", "Oxalis.corniculata",
"Oxalis.pes-caprae", "Parentucellia.latifolia", "Parentucellia.viscosa",
"Paronychia.franciscana", "Pennisetum.clandestinum", "Perideridia.kelloggii",
"Phacelia.californica", "Phacelia.malvifolia", "Phalaris.aquatica",
"Pholistoma.auritum", "Plagiobothyrs.nothofulvus", "Plantago.coronopus",
"Plantago.erecta", "Plantago.lanceolata", "Poa.annua", "Poa.pratensis",
"Polygonum.arenastrum", "Polygonum.aviculare", "Polypodium.califomicum",
"Polypodium.californicum", "Polypogon.monspeliensis", "Polystichum.munitum",
"Prunella.vulgaris", "Pseudognaphalium.beneolens", "Pseudognaphalium.bioletti",
"Pseudognaphalium.californicum", "Pseudognaphalium.canescens",
"Pseudognaphalium.luteoalbum", "Pseudognaphalium.ramosissimum",
"Pseudotsuga.meziesii", "Pteridium.aquilinum", "Quercus.agrifolia",
"Ranunculus.californicus", "Ranunculus.occidentalis", "Raphanus.sativus",
"Raphanus.spp", "Rock", "Rubus.armeniacus", "Rubus.ursinus",
"Rumex.acetosella", "Rumex.crispus", "Rumex.Crispus", "Rumex.transitorius",
"Salix.lasiolepis", "Sanicula.arctopoides", "Sanicula.bipinnatifida",
"Sanicula.crassicaulis", "Scandix.peten-veneris", "Senecio.vulgare",
"Sherardia.arvensis", "Sidalcea.malviflora", "Silene.gallica",
"Sisyrinchium.bellum", "Solanum.americanum", "Solidago.velutina",
"Soliva.sessilis", "Sonchus.asper", "Sonchus.oleraceus", "Spergula.arvensis",
"Stachys.ajugoides", "Stachys.bullata", "Stellaria.media", "Stipa.cernua",
"Stipa.lepida", "Stipa.pulchra", "Stipa.purpurata", "Symphiotrichum.chilensis",
"Taraxia.ovata", "Tauschia.hartwegii", "Thatch.cover", "Thatch.Cover",
"Thatch.Depth", "Thysanocarpus.laciniatus", "Toxicodendron.diversilobum",
"Toxicoscordion.fremontii", "Tragopogon.porrifolius", "Tribulus.terrestris",
"Trifolium.angustifolium", "Trifolium.barbigerum", "Trifolium.bifidum",
"Trifolium.depauperatum", "Trifolium.dubium", "Trifolium.glomeratum",
"Trifolium.hirtum", "Trifolium.hybridum", "Trifolium.macraei",
"Trifolium.microcephalum", "Trifolium.repens", "Trifolium.subterraneum",
"Trifolium.variegatum", "Trifolium.willdenovii", "Triphysaria.pusilla",
"Triphysaria.versicolor", "Trisetum.canescens", "Vaccinium.ovatum",
"Veronica.persica", "Vicia.americana", "Vicia.benghalensis",
"Vicia.sativa", "Vicia.tetrasperma", "Vicia.villosa", "Viola.adunca",
"Viola.pedunculata", "Wyethia.angustifolia", "Wyethia.glabra"
), class = "factor"), Cover = c(1, 1, 0.5, 0.5, 0.5, 8, 2, 2,
5, 1)), row.names = c(NA, 10L), class = "data.frame")

Regression of multiple dose-response curves using the drc package in R

I am trying to fit regressions (4 or 5 PL) through my experimental data. I have several compounds inhibiting my enzyme of interest. Each has its own range between 0-100% enzyme activity. All the data is in one dataframe and distinguished by one column specifying my compound ('toxin'). Therefore I want to have a regression for each of the toxins/compounds individually. I tried the following code
drc <- drm(avg ~ conc, data = testdata, toxin, fct = LL.5())
which gives the two following errors:
Error in optim(startVec, opfct, hessian = TRUE, method = optMethod,
control = list(maxit = maxIt, : non-finite finite-difference value
[24] Error in drmOpt(opfct, opdfct1, startVecSc, optMethod,
constrained, warnVal, : Convergence failed
After reading some posts on SO, this error was often solved by not using a log scale for the concentration ('conc'). In my case, the data is not log-transformed, therefore I do not really know how to proceed as I do not really understand what the error-message is telling me.
I tried the same command with only a subset of the data (only one of the toxins) and that works.
Here is the data:
testdata <- structure(list(toxin = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("toxin1",
"toxin2", "toxin3", "toxin4", "toxin5", "toxin6", "NC", "PC",
"toxin7"), class = "factor"), conc = c(80, 230, 690, 2060,
6170, 18520, 55560, 116700, 5e+05, 1500000, 10, 30, 100, 290,
860, 2600, 7700, 23300, 70000, 210000, 0.25, 0.76, 2.29, 6.69,
29.57, 61.73, 185.19, 555.56, 1666.67, 5000, 0.1, 0.3, 0.91,
2.74, 8.23, 24.69, 74.07, 222.22, 666.67, 2000, 0.19, 0.39, 0.78,
1.56, 3.125, 6.25, 12.5, 25, 50, 100, 0.05, 0.14, 0.41, 1.23,
3.7, 11.11, 33.33, 100, 300, 900, 0.25, 0.76, 2.29, 6.69, 20.57,
61.73, 185.19, 555.56, 1666.67, 5000), avg = c(93.7392909656605,
109.438977761257, 102.50389863782, 97.8565582988098, 98.7749196390328,
94.6820096545283, 88.3878644123183, 74.6531623906189, 59.8033994067719,
33.1521812859023, 84.3458578131283, 80.8432075369312, 80.5041552022783,
74.3806536115552, 65.867746238255, 46.7093609589345, 25.2625895634089,
16.5991924099889, 9.8338847737454, 9.1267136985971, 96.7637675923354,
100.217322048861, 106.911067427548, 105.869274152439, 104.26295691452,
99.924974639669, 105.178112603458, 100.834869287621, 97.0640881891228,
100.517438616909, 102.664029650058, 104.079019894009, 106.005108031173,
101.539083701953, 98.0496674854621, 67.7840816081928, 39.3101865930841,
38.410593148271, 8.98193991681226, 7.22314661576326, 84.0614720922454,
82.7675961061481, 65.2085894181738, 37.3278677636159, 24.9075938602538,
14.3617392491638, 10.7917687047216, 8.37929257644196, 8.42895771412019,
12.9194757988616, 76.5674185459266, 65.8625860764468, 47.7169920989096,
29.6780563387259, 7.69651805994566, 4.34554390880982, 4.33821927277971,
0.39797595095055, 2.38671848257005, 5.89474149920234, 107.319075979956,
110.227548845268, 116.828640966343, 107.913632096559, 110.071386130938,
106.575197414688, 105.043139402911, 98.236919454246, 104.052659508375,
84.6763301224036), sd = c(7.49544951952132, 14.9170973650272,
1.03754566304896, 3.87773637652399, 9.17174603323541, 2.0257944547102,
0.874956239047901, 3.35155947287539, 1.91936941393018, 2.02594096726786,
1.60035835782164, 1.25579403370456, 3.52866856497447, 4.04640886982452,
7.37920326517342, 6.40246869316039, 4.77482079353957, 4.68322190067079,
1.74780492483205, 0.738821067897037, 5.42050977224004, 12.2951096302121,
9.08089564089922, 7.46281702965045, 9.52060311645085, 6.66339041948764,
9.04568668161887, 10.9590666295114, 6.25902541715453, 4.96928340386536,
10.8885949633507, 15.9830841613276, 7.11298501037955, 8.54768106201583,
12.7115587453605, 5.72457692384765, 4.62110397186864, 50.9817341717873,
2.96030364454981, 2.83464116977327, 10.7124422767561, 10.3544552730142,
9.05103847553877, 13.233995551835, 4.26528894064237, 2.18416799462023,
1.17346307923401, 5.46453008680512, 3.09705214055433, 10.1345046611914,
2.11845922287944, 3.11915150865922, 6.31893385595251, 14.1295842962481,
1.33224797602539, 2.11901484197009, 5.05792906176149, 2.08503325893712,
3.05243406958019, 8.68923158027763, 8.49552648053034, 7.45485150355005,
8.70510335269844, 7.13998242209083, 6.32588028411456, 4.75860842345735,
4.09767898578108, 7.04991004776136, 9.37260366463128, 7.20137530818876
)), .Names = c("toxin", "conc", "avg", "sd"), row.names = c(NA,
-70L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars = list(
toxin), drop = TRUE, indices = list(0:9, 10:19, 20:29, 30:39,
40:49, 50:59, 60:69), group_sizes = c(10L, 10L, 10L, 10L,
10L, 10L, 10L), biggest_group_size = 10L, labels = structure(list(
toxin = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 9L), .Label = c("toxin1",
"toxin2", "toxin3", "toxin4", "toxin5", "toxin6", "NC",
"PC", "toxin7"), class = "factor")), row.names = c(NA,
-7L), class = "data.frame", vars = list(toxin), drop = TRUE, .Names = "toxin"))
The error message is telling you that the function could not find a solution for at least one of your data subsets. In your case, toxin3 is the offending dataset. If I run the following code, which omits toxin3, I get a converged result and a nice plot showing the results.
drc <- drm(avg ~ conc, data = testdata, curveid=toxin, subset=toxin %in% c("toxin1","toxin2","toxin4","toxin5","toxin6","toxin7"), fct = LL.5())
plot(drc)
The reason toxin3 fails is that the data describe a flat line that could be fit by any number of sets of LL.5() parameters. One way to get around this would be to fit each curve separately in a loop (or with an apply function) and use try/catch to handle any datasets that throw a convergence error.

Data not ordering by date using as.Date in R

I have a data set with a date column like this:
dateCol other column
"2013/11/12" some data
"2012/05/02" more data
"2013/09/22" etc
"" etc
"2013/09/17" etc
When I try to order the data frame by this column(dateCOl) by date, it just does nothing, I tried several codes, my last code was:
mydata<-mydata[with(mydata, order(as.Date(mydata[,dateCol], format="%y/%m/%d"))),]
But is not working, any ideas?
Thanks in advance!
You need to provide the right format of the dates for the conversion to succeed. In this case you need "%Y" with a capital Y for data with years including the centuries.
Try
sort(as.Date(mydata[,"dateCol"], format="%Y/%m/%d"))
#[1] "2011-07-13" "2011-08-21" "2012-05-02" "2012-07-02" "2012-07-17" "2013-01-29"
#[7] "2013-08-19" "2013-09-17" "2013-09-22" "2013-11-12" "2014-04-02"
data
# Example data: one factor column of "YYYY/MM/DD" strings in which the empty
# string marks a missing date (20 rows, 11 of them with a date).
mydata <- structure(list(dateCol = structure(c(1L, 11L, 4L, 10L, 1L, 1L,
    1L, 1L, 1L, 9L, 6L, 5L, 12L, 1L, 1L, 8L, 1L, 7L, 3L, 2L),
    .Label = c("", "2011/07/13", "2011/08/21", "2012/05/02",
    "2012/07/02", "2012/07/17", "2013/01/29", "2013/08/19",
    "2013/09/17", "2013/09/22", "2013/11/12", "2014/04/02"),
    class = "factor")), .Names = "dateCol",
    row.names = c(NA, -20L), class = "data.frame")

How to add multiple data series to a scatterplot and how to format numbers to appear in standard form on y axis

My data set:
structure(list(Site = c(2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L, 5L, 5L, 6L, 6L, 6L), Average.worm.weight..g. = c(0.1934,
0.249, 0.263, 0.262, 0.4186, 0.204, 0.311, 0.481, 0.326, 0.657,
0.347, 0.311, 0.239, 0.4156, 0.31, 0.3136, 0.4033, 0.302, 0.277
), Average.total.immune.cell.count = structure(c(8L, 16L, 11L,
12L, 10L, 1L, 4L, 15L, 4L, 3L, 17L, 13L, 18L, 7L, 5L, 6L, 9L,
14L, 2L), .Label = c("0", "168750", "18650000", "200,000", "21,600,000",
"226666.6", "22683333.33", "2533333.33", "283333.333", "291666.6",
"335833.3", "435800", "474816666.7", "500000", "6450000", "729166.667",
"7433333.3", "9916667"), class = "factor"), Average.eleocyte.number = structure(c(2L,
5L, 14L, 10L, 1L, 1L, 6L, 1L, 6L, 7L, 1L, 9L, 15L, 8L, 12L, 3L,
11L, 13L, 4L), .Label = c("0", "1266666.67", "153333.3", "168740",
"17", "200,000", "2266666.667", "22683333.33", "23116666.67",
"264000", "283333.333", "442", "500000", "7.3", "9916667"), class = "factor")), .Names = c("Site",
"Average.worm.weight..g.", "Average.total.immune.cell.count",
"Average.eleocyte.number"), class = "data.frame", row.names = c(NA,
-19L))
This is my R script so far:
Plotting multiple data series on a graph
y1<-dframe1$"Average.total.immune.cell.count"
y2<-dframe1$"Average.eleocyte.number"
x<-dframe1$"Average.worm.weight..g."
plot.default(y1~x,type="p" )
points(y2~x)
I am trying to add two y series to the same scatterplot and I am struggling to do so. I want to have different symbols for the points so as to tell apart the two different data series. Also, I would like the axes to meet at the bottom left-hand side and would appreciate being told how I can do that. I would also like the y axis to be in standard form, but do not know how to get R to do that.
Best regards.
K.
So this is an object lesson in getting your data in the correct format to begin with. Your numbers have commas, which R does not like. Hence the numbers get converted to character and imported as factors (which your structure(...) clearly shows). You need to fix that, or better yet get rid of the commas prior to exporting.
Something like this will work
# Give the four columns short names so they are easy to use in formulas.
colnames(dframe) <- c("Site","x","y1","y2")
# Strip the thousands-separator commas (fixed = TRUE matches a literal ",")
# and convert the cleaned-up text to numbers.
dframe$y1 <- as.numeric(as.character(gsub(",","",dframe$y1,fixed=TRUE)))
dframe$y2 <- as.numeric(as.character(gsub(",","",dframe$y2,fixed=TRUE)))
# Draw the first series, then overlay the second on the same axes in a
# different colour so the two can be told apart.
plot(y1~x,dframe, col="red", pch=20)
points(y2~x,dframe, col="blue", pch=20)
But there are additional problems. One of the numbers (in row 12) is a factor of 10 larger than all the others, so the plot above is not very informative. It's hard to know if this is a data input error, or a genuine outlier in your data.
EDIT: Response to OP's comment
dframe <- dframe[-12,] # remove row 12
dframe <- dframe[order(dframe$x),] # order by increasing x
plot(y1~x,dframe, col="red", pch=20, type="b")
points(y2~x,dframe, col="blue", pch=20, type="b")
legend("topleft",legend=c("y1","y2"),col=c("red","blue"),pch=20)

Resources