Package (under development): ACRER
computer: iMac (macOS 10.15.7)
RStudio version: 1.4.1717
R version: 4.1.0
The package "ACRER" has a main script "Wageningen_SQLite_2020.R" and 4 functions (Run_Queries.R,Wcombinations.R,Wexpected_visit_duration.R and Wcrosstab.R with 4 sub functions:NGroup0.R, NGroup1.R,NGroup2.R, NGroup3.R and NGroup4.R).
I get this error:
devtools::install(pkg = "/Volumes/Aias/ACRER", reload = TRUE,
build = TRUE, quiet = FALSE, build_vignettes = FALSE, force = TRUE)
Error in grepRaw("00new", r, fixed = TRUE, all = FALSE, value = FALSE) :
long vectors not supported yet: /Volumes/Builds/R4/R-4.1.0/src/main/grep.c:1450
(see the reprex output below)
I would appreciate any suggestions about this problem.
devtools::install(pkg = "/Volumes/Aias/ACRER", reload = TRUE,
build = TRUE, quiet = FALSE, build_vignettes = FALSE, force = TRUE)
#> Skipping 1 packages not available: intpoint
#> Skipping 1 packages ahead of CRAN: data.table
#> checking for file ‘/Volumes/Aias/ACRER/DESCRIPTION’ ... ✓ checking for file ‘/Volumes/Aias/ACRER/DESCRIPTION’
#> ─ preparing ‘ACRER’: (40.4s)
#> ✓ checking DESCRIPTION meta-information
#> ─ checking for LF line-endings in source and make files and shell scripts
#> ─ checking for empty or unneeded directories
#> Removed empty directory ‘ACRER/tests/testthat’
#> ─ building ‘ACRER_0.9.1.tar.gz’
#>
#> Running /Library/Frameworks/R.framework/Resources/bin/R CMD INSTALL \
#> /var/folders/6h/lzdkxf9j07bd1q5t82lphhjc0000gn/T//RtmpkmZA7M/ACRER_0.9.1.tar.gz \
#> --install-tests
#> * installing to library ‘/Library/Frameworks/R.framework/Versions/4.1/Resources/library’
#> * installing *source* package ‘ACRER’ ...
#> ** using staged installation
#> ** R
#> ** data
#> *** moving datasets to lazyload DB
#> ** tests
#> ** byte-compile and prepare package for lazy loading
#> Wcrosstab(Rmatrix = RmatrixA, Cmatrix = CmatrixA, C1 = C1A)
#> `summarise()` has grouped output by 'activity_pattern'. You can override using the `.groups` argument.
#> Wcrosstab(Rmatrix = RmatrixB, Cmatrix = CmatrixB, C1 = C1B)
#> `summarise()` has grouped output by 'groupsRecr_act_ts'. You can override using the `.groups` argument.
#> Warning in xtfrm.data.frame(x) : cannot xtfrm data frames
#> Wcrosstab(Rmatrix = RmatrixC, Cmatrix = CmatrixC, C1 = C1C)
#> Warning in rm(list = c("Run_Queries", "Wcombinations", "Wcrosstab", "Wexpected_visit_duration", :
#> object 'Wcombinations' not found
#> Warning in rm(list = c("Run_Queries", "Wcombinations", "Wcrosstab", "Wexpected_visit_duration", :
#> object 'Wcrosstab' not found
#> Warning in rm(list = c("Run_Queries", "Wcombinations", "Wcrosstab", "Wexpected_visit_duration", :
#> object 'Wexpected_visit_duration' not found
#> choicenr choicetxt
#> [1,] "1" "Pivot table for the number of persons"
#> [2,] "2" "Pivot tables for the expected visiting duration"
#> [3,] "3" "Save tables in csv format files"
#> [4,] "4" "Exit"
#> RecrID recreation_areas
#> 1 1 Beeldengalerij Het Depot
#> 2 2 Arboretum Belmonte
#> 3 3 Wageningen Berg en Bergpad
#> 4 4 Arboretum de Dreijen
#> 5 5 Uitwaarden Wageningen
#> 6 6 Renkums Beekdal
#> 7 7 Utrechtse Heuvelrug_Grebbeberg
#> 8 8 De Blauwe Kamer
#> 9 9 Ouwehands Dierenpark Rhenen
#> 10 10 Kasteel Doorwerth
#> 11 11 Edese bos
#> 12 12 AIRBORNE MUSEUM HARTENSTEIN
#> 13 13 Peppelenburg Westerrode
#> 14 14 Landgoed Kernhem
#> 15 15 Wekeromse Zand
#> 16 16 De Betuwe
#> 17 17 Roekel
#> 18 18 Hoog en Laag Amerongse Bos
#> 19 19 Noord Ginkel
#> 20 20 Amerongse Bovenpolder
#> 21 21 Lunters Buurtbosch
#> 22 22 WARNSBORN
#> 23 23 PARK ZYPENDAAL
#> 24 24 Zuid Ginkel
#> 25 25 HET NATIONALE PARK DE HOGE VELUWE
#> 26 26 BURGERS ZOO
#> 27 27 PARK SONSBEEK
#> 28 28 HET NEDERLANDS OPENLUCHTMUSEUM
#> 29 29 PARK ANGERENSTEIN
#> 30 30 PARK EN KASTEEL ROSENDAEL
#> 31 31 Nationaal Park_Utrechtse Heuvelrug
#> 32 32 GROESBEEKSE BOS
#> 33 33 VELUWETRANSFERIUM POSBANK
#> transport_type_ID transport_type_definition maximum_of_total_time
#> 1 1 on foot 60
#> 2 2 bicycle 50
#> 3 3 car 40
#> wait_time_per_full_travel minimum_distance_used maximum_distance_used
#> 1 0 0 0
#> 2 0 0 25
#> 3 0 0 100
#> speed_of_transport_type fixed_costs_per_transport_type
#> 1 0 0
#> 2 20 0
#> 3 60 0
#> per_km_costs_per_transport_type per_person_costs_per_transport_type
#> 1 0.00 0
#> 2 0.00 0
#> 3 0.65 0
#> per_km_en_per_person_costs_per_transport_type
#> 1 0
#> 2 0
#> 3 0
#> ** help
#> *** installing help indices
#> ** building package indices
#> ** installing vignettes
#> ** testing if installed package can be loaded from temporary location
#> ** testing if installed package can be loaded from final location
#> ** testing if installed package keeps a record of temporary installation path
#> Error in grepRaw("00new", r, fixed = TRUE, all = FALSE, value = FALSE) :
#> long vectors not supported yet: /Volumes/Builds/R4/R-4.1.0/src/main/grep.c:1450
#> * removing ‘/Library/Frameworks/R.framework/Versions/4.1/Resources/library/ACRER’
#> Error in (function (command = NULL, args = character(), error_on_status = TRUE, : System command 'R' failed, exit status: 1, stdout & stderr were printed
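One thing the log makes visible: the Wcrosstab() calls and the menu/table printouts appear during '** byte-compile and prepare package for lazy loading', which means top-level code in the R/ scripts is executed while the package is installed. The failing step ('** testing if installed package keeps a record of temporary installation path') scans the installed files with grepRaw(), so the error suggests that one of those files, most plausibly the lazy-load database built from the objects that top-level code creates, exceeds the roughly 2 GB size grepRaw() can handle. A minimal, hypothetical sketch of keeping such code out of the install step (the function name run_wageningen() is made up):
# Sketch only: wrap the top-level workflow from Wageningen_SQLite_2020.R in a
# function so nothing runs (and no large objects are created) at install time.
# run_wageningen() is a hypothetical name.
run_wageningen <- function(db_path) {
  # previously top-level code would go here, e.g.
  # Wcrosstab(Rmatrix = RmatrixA, Cmatrix = CmatrixA, C1 = C1A)
  invisible(NULL)
}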
Below is a reprex copy of my code. I'm trying to group the data set below by sport type; however, when I use the group_by() function, rows with the same sport type aren't grouped together. For example, below, all the 'All Track Combined' rows aren't collapsed into a single row.
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.1.3
#> Warning: package 'ggplot2' was built under R version 4.1.3
#> Warning: package 'tibble' was built under R version 4.1.3
#> Warning: package 'dplyr' was built under R version 4.1.3
install.packages("tidytuesdayR")
#> Installing package into 'C:/Users/caitl/OneDrive/Documents/R/win-library/4.1'
#> (as 'lib' is unspecified)
#> package 'tidytuesdayR' successfully unpacked and MD5 sums checked
#>
#> The downloaded binary packages are in
#> C:\Users\caitl\AppData\Local\Temp\RtmpsXMLIf\downloaded_packages
tuesdata <- tidytuesdayR::tt_load('2022-03-29')
#> --- Compiling #TidyTuesday Information for 2022-03-29 ----
#> --- There is 1 file available ---
#> --- Starting Download ---
#>
#> Downloading file 1 of 1: `sports.csv`
#> --- Download complete ---
tuesdata$sports %>%
dplyr::group_by(sports) %>%
dplyr::summarise(sports = sports, prop = (partic_men)/(partic_men + partic_women)) %>%
na.omit()
#> `summarise()` has grouped output by 'sports'. You can override using the
#> `.groups` argument.
#> # A tibble: 43,614 x 2
#> # Groups: sports [31]
#> sports prop
#> <chr> <dbl>
#> 1 All Track Combined 0.570
#> 2 All Track Combined 0.556
#> 3 All Track Combined 0.513
#> 4 All Track Combined 0.494
#> 5 All Track Combined 0.450
#> 6 All Track Combined 0.567
#> 7 All Track Combined 0.478
#> 8 All Track Combined 0.464
#> 9 All Track Combined 0.492
#> 10 All Track Combined 0.512
#> # ... with 43,604 more rows
Is this what you want?
tuesdata$sports %>%
  dplyr::group_by(sports) %>%
  dplyr::summarise(prop = sum(partic_men, na.rm = TRUE) /
                     (sum(partic_men, na.rm = TRUE) +
                        sum(partic_women, na.rm = TRUE)))
Output:
sports prop
<chr> <dbl>
1 All Track Combined 0.485
2 Archery 0.439
3 Badminton 0
4 Baseball 1
5 Basketball 0.536
6 Beach Volleyball 0.0192
7 Bowling 0.402
8 Diving 0.468
9 Equestrian 0.000675
10 Fencing 0.480
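For context on why the original attempt returned 43,614 rows: summarise(sports = sports, prop = ...) evaluates to one value per input row within each group, and recent dplyr versions allow such multi-row summaries (hence the 'grouped output' message), whereas an aggregating function such as sum() collapses each group to a single row. A sketch of a variant that averages the per-row proportions instead of pooling the counts (the two are not numerically identical):
# Sketch: mean of the per-row proportions rather than the ratio of pooled sums
tuesdata$sports %>%
  dplyr::group_by(sports) %>%
  dplyr::summarise(prop = mean(partic_men / (partic_men + partic_women),
                               na.rm = TRUE))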
I am working through 'Machine Learning with R: Expert Techniques for Predictive Modeling' by Brett Lantz, using the tidymodels suite as I try the example modeling exercises in R.
I am now on chapter 5, in which you build a decision tree with the C5.0 algorithm. I have created the model using the code shown below:
c5_v1 <- C5_rules() %>%
  set_mode('classification') %>%
  set_engine('C5.0')
c5_res_1 <- fit(object = c5_v1, formula = default ~ ., data = credit_train)
This has worked successfully:
parsnip model object
Call:
C5.0.default(x = x, y = y, trials = trials, rules = TRUE, control
= C50::C5.0Control(minCases = minCases, seed = sample.int(10^5, 1), earlyStopping
= FALSE))
Rule-Based Model
Number of samples: 900
Number of predictors: 20
Number of Rules: 22
Non-standard options: attempt to group attributes
Try as I might (Googling, reading the parsnip documentation, etc.), I cannot find out how to view the decision tree. Can anyone tell me how to view the actual tree it has created?
Do note that C5_rules() is a specification for a rule-fit model, so after fitting with C5_rules() you shouldn't expect the output to be a decision tree but a set of rules instead.
With the C5.0 engine, you can get both a decision tree output and a rules output. With the fitted model, run extract_fit_engine() to obtain the engine-specific fit embedded within the parsnip model fit, followed by summary() to extract the output.
library(tidymodels)
library(rules)
#>
#> Attaching package: 'rules'
#> The following object is masked from 'package:dials':
#>
#> max_rules
data(penguins, package = "modeldata")
# model specification
C5_decision_tree <- decision_tree() |>
  set_engine("C5.0") |>
  set_mode("classification")
C5_rules <- C5_rules() |>
  # no need to set the engine because only C5.0 is used for C5_rules()
  # verify with show_engines("C5_rules")
  set_mode("classification")
# fitting the models
C5_decision_tree_fitted <- C5_decision_tree |>
  fit(species ~ ., data = penguins)
C5_rules_fitted <- C5_rules |>
  fit(species ~ ., data = penguins)
# extracting the decision tree
C5_decision_tree_fitted |>
  extract_fit_engine() |>
  summary()
#>
#> Call:
#> C5.0.default(x = x, y = y, trials = 1, control = C50::C5.0Control(minCases =
#> 2, sample = 0))
#>
#>
#> C5.0 [Release 2.07 GPL Edition] Mon Jul 4 09:32:16 2022
#> -------------------------------
#>
#> Class specified by attribute `outcome'
#>
#> Read 333 cases (7 attributes) from undefined.data
#>
#> Decision tree:
#>
#> flipper_length_mm > 206:
#> :...island = Biscoe: Gentoo (118)
#> : island in {Dream,Torgersen}:
#> : :...bill_length_mm <= 46.5: Adelie (2)
#> : bill_length_mm > 46.5: Chinstrap (5)
#> flipper_length_mm <= 206:
#> :...bill_length_mm > 43.3:
#> :...island in {Biscoe,Torgersen}: Adelie (4/1)
#> : island = Dream: Chinstrap (59/1)
#> bill_length_mm <= 43.3:
#> :...bill_length_mm <= 42.3: Adelie (134/1)
#> bill_length_mm > 42.3:
#> :...sex = female: Chinstrap (4)
#> sex = male: Adelie (7)
#>
#>
#> Evaluation on training data (333 cases):
#>
#> Decision Tree
#> ----------------
#> Size Errors
#>
#> 8 3( 0.9%) <<
#>
#>
#> (a) (b) (c) <-classified as
#> ---- ---- ----
#> 145 1 (a): class Adelie
#> 1 67 (b): class Chinstrap
#> 1 118 (c): class Gentoo
#>
#>
#> Attribute usage:
#>
#> 100.00% flipper_length_mm
#> 64.56% bill_length_mm
#> 56.46% island
#> 3.30% sex
#>
#>
#> Time: 0.0 secs
#extracting rules
C5_rules_fitted |>
extract_fit_engine() |>
summary()
#>
#> Call:
#> C5.0.default(x = x, y = y, trials = trials, rules = TRUE, control
#> = C50::C5.0Control(minCases = minCases, seed = sample.int(10^5,
#> 1), earlyStopping = FALSE))
#>
#>
#> C5.0 [Release 2.07 GPL Edition] Mon Jul 4 09:32:16 2022
#> -------------------------------
#>
#> Class specified by attribute `outcome'
#>
#> Read 333 cases (7 attributes) from undefined.data
#>
#> Rules:
#>
#> Rule 1: (68, lift 2.2)
#> bill_length_mm <= 43.3
#> sex = male
#> -> class Adelie [0.986]
#>
#> Rule 2: (208/64, lift 1.6)
#> flipper_length_mm <= 206
#> -> class Adelie [0.690]
#>
#> Rule 3: (48, lift 4.8)
#> island = Dream
#> bill_length_mm > 46.5
#> -> class Chinstrap [0.980]
#>
#> Rule 4: (34/1, lift 4.6)
#> bill_length_mm > 42.3
#> flipper_length_mm <= 206
#> sex = female
#> -> class Chinstrap [0.944]
#>
#> Rule 5: (118, lift 2.8)
#> island = Biscoe
#> flipper_length_mm > 206
#> -> class Gentoo [0.992]
#>
#> Default class: Adelie
#>
#>
#> Evaluation on training data (333 cases):
#>
#> Rules
#> ----------------
#> No Errors
#>
#> 5 2( 0.6%) <<
#>
#>
#> (a) (b) (c) <-classified as
#> ---- ---- ----
#> 146 (a): class Adelie
#> 1 67 (b): class Chinstrap
#> 1 118 (c): class Gentoo
#>
#>
#> Attribute usage:
#>
#> 97.90% flipper_length_mm
#> 49.85% island
#> 40.84% bill_length_mm
#> 30.63% sex
#>
#>
#> Time: 0.0 secs
Created on 2022-07-04 by the reprex package (v2.0.1)
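If a graphical view is preferred over the text dump, the underlying C50 tree object also has a plot() method; a sketch reusing the fitted objects above (this applies to the tree fit, not the rule-based one):
# Sketch: plot the engine-level C5.0 tree from the fitted parsnip model
C5_decision_tree_fitted |>
  extract_fit_engine() |>
  plot()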
Can a survival model with just the treatment as a predictor be fit with a tidymodels survival function?
Here I show the example, which uses several predictors, then try to duplicate it with only one predictor. This fails.
https://www.tidyverse.org/blog/2021/11/survival-analysis-parsnip-adjacent/
has code to fit a survival tidymodel
library(survival)
bladder_train <- bladder[-c(1:3),]
bladder_test <- bladder[1:3,]
cox_spec <- proportional_hazards(penalty = 0.123) %>%
  set_engine("glmnet")
f_fit <- fit(cox_spec,
             Surv(stop, event) ~ rx + size + number + strata(enum),
             data = bladder_train)
But with only the treatment in the model, it does not work:
f_fit <- fit(cox_spec,
             Surv(stop, event) ~ rx,
             data = bladder_train)
Why? What am I missing?
It seems the error has more to do with glmnet than tidymodels. This is the error:
library(survival)
library(censored)
#> Loading required package: parsnip
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
bladder_train <- bladder[-c(1:3), ]
bladder_test <- bladder[1:3, ]
cox_spec <- proportional_hazards(penalty = 0.123) %>%
  set_engine("glmnet")
f_fit <- fit(cox_spec,
             Surv(stop, event) ~ rx,
             data = bladder_train)
#> Error in glmnet::glmnet(data_obj$x, data_obj$y, family = "cox", alpha = alpha, : x should be a matrix with 2 or more columns
Created on 2021-12-30 by the reprex package (v2.0.1)
glmnet needs a matrix with 2 or more columns. Using just rx means you'd have only one column. If I add size as an additional feature, it works just fine.
library(survival)
library(censored)
#> Loading required package: parsnip
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
bladder_train <- bladder[-c(1:3), ]
bladder_test <- bladder[1:3, ]
cox_spec <- proportional_hazards(penalty = 0.123) %>%
  set_engine("glmnet")
f_fit <- fit(cox_spec,
             Surv(stop, event) ~ rx + size,
             data = bladder_train)
f_fit
#> parsnip model object
#>
#> Fit time: NA
#>
#> Call: glmnet::glmnet(x = data_obj$x, y = data_obj$y, family = "cox", alpha = alpha, lambda = lambda)
#>
#> Df %Dev Lambda
#> 1 0 0.00 0.070850
#> 2 1 0.10 0.064560
#> 3 1 0.18 0.058820
#> 4 1 0.24 0.053600
#> 5 1 0.30 0.048840
#> 6 2 0.35 0.044500
#> 7 2 0.43 0.040550
#> 8 2 0.50 0.036940
#> 9 2 0.55 0.033660
#> 10 2 0.60 0.030670
#> 11 2 0.64 0.027950
#> 12 2 0.67 0.025460
#> 13 2 0.70 0.023200
#> 14 2 0.72 0.021140
#> 15 2 0.74 0.019260
#> 16 2 0.75 0.017550
#> 17 2 0.77 0.015990
#> 18 2 0.78 0.014570
#> 19 2 0.79 0.013280
#> 20 2 0.79 0.012100
#> 21 2 0.80 0.011020
#> 22 2 0.81 0.010040
#> 23 2 0.81 0.009151
#> 24 2 0.81 0.008338
#> 25 2 0.82 0.007597
#> 26 2 0.82 0.006922
#> 27 2 0.82 0.006308
#> 28 2 0.82 0.005747
#> 29 2 0.82 0.005237
#> 30 2 0.83 0.004771
#> 31 2 0.83 0.004348
#> 32 2 0.83 0.003961
#> 33 2 0.83 0.003609
#> 34 2 0.83 0.003289
#> 35 2 0.83 0.002997
#> 36 2 0.83 0.002730
#> 37 2 0.83 0.002488
#> 38 2 0.83 0.002267
#> 39 2 0.83 0.002065
#> 40 2 0.83 0.001882
#> 41 2 0.83 0.001715
#> The training data has been saved for prediction.
Created on 2021-12-30 by the reprex package (v2.0.1)
If you want to use just the one feature rx, consider other models, e.g. decision trees:
library(survival)
library(censored)
#> Loading required package: parsnip
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
bladder_train <- bladder[-c(1:3), ]
bladder_test <- bladder[1:3, ]
dt_spec <- decision_tree() %>%
  set_engine("rpart") %>%
  set_mode("censored regression")
f_fit <- fit(dt_spec,
             Surv(stop, event) ~ rx,
             data = bladder_train)
f_fit
#> parsnip model object
#>
#> $rpart
#> n= 337
#>
#> node), split, n, deviance, yval
#> * denotes terminal node
#>
#> 1) root 337 403.0968 1.0000000
#> 2) rx>=1.5 152 166.6335 0.7751669 *
#> 3) rx< 1.5 185 231.2927 1.1946030 *
#>
#> $survfit
#>
#> Call: prodlim::prodlim(formula = form, data = data)
#> Stratified Kaplan-Meier estimator for the conditional event time survival function
#> Discrete predictor variable: rpartFactor (0.775166899958249, 1.19460305107131)
#>
#> $levels
#> [1] "0.775166899958249" "1.19460305107131"
#>
#> attr(,"class")
#> [1] "pecRpart"
Created on 2021-12-30 by the reprex package (v2.0.1)
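A follow-up thought, sketched with the same objects loaded above: if the goal is simply a single-predictor Cox model, the unpenalized "survival" engine (the default for proportional_hazards()) has no minimum-column requirement, so it fits with rx alone; the penalty argument is only meaningful for the glmnet engine.
# Sketch: a plain Cox model via the "survival" engine works with one predictor
cox_single_spec <- proportional_hazards() %>%
  set_engine("survival")
f_fit_single <- fit(cox_single_spec,
                    Surv(stop, event) ~ rx,
                    data = bladder_train)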
I was trying to perform an exercise with correlated random normal variables in R. The purpose is pretty simple: generate correlated random variables, add some errors to them, and look at their combined standard deviations. The following code works fine but periodically fails with an error (shown after the code):
res = NULL
x1 = rnorm(n = 500, mean = 0.02, sd = 0.2)
for (i in 3:5) {
  nn = i
  wght = rep(1 / (nn + 1), nn + 1)
  x234 = scale(matrix(rnorm(nn * 500), ncol = nn))
  x1234 = cbind(scale(x1), x234)
  c1 = var(x1234)
  chol1 = solve(chol(c1))
  newx = x1234 %*% chol1
  dgn = diag(x = 1, nrow = nn + 1, ncol = nn + 1)
  corrs = runif(nn, min = 0.2, max = 0.8)
  v = c(1)
  vv = c(v, corrs)
  dgn[1, ] = vv
  dgn[, 1] = t(vv)
  newc = dgn
  chol2 = chol(newc)
  finalx = newx %*% chol2 * sd(x1) + mean(x1)
  fsd = sqrt(t(wght) %*% cov(finalx) %*% wght)
  noise = scale(matrix(rnorm((nn + 1) * 500, mean = 0, sd = 0.1), ncol = nn + 1))
  nmt = finalx + noise
  nsd = sqrt(t(wght) %*% cov(nmt) %*% wght)
  cmb = c(nn + 1, fsd, nsd)
  res = rbind(res, cmb)
  res
}
res
Here is the error message:
Error in chol.default(newc) :
the leading minor of order 4 is not positive definite
As I increase the number of random variables from 5 to 10, the success rate falls dramatically. I have done some searching but was not able to understand properly what is going on. I would appreciate it if someone could explain the reason for the error message and help improve the code so that I can increase the number of random variables. I am OK with modifying the number of observations (currently set to 500).
Thanks!
Not to discourage you from writing your own code, but have you considered using one of the packages that generate random variables correlated the way you specify and then adding your noise as desired? It seems more efficient...
# install.packages("SimMultiCorrData")
library(SimMultiCorrData)
#>
#> Attaching package: 'SimMultiCorrData'
#> The following object is masked from 'package:stats':
#>
#> poly
rcorrvar(n = 100, k_cat = 0, k_cont = 3, method = "Fleishman",
         means = c(0.02, 0.02, 0.02), vars = c(.2, .2, .2),
         skews = c(0, 0, 0), skurts = c(0, 0, 0),
         rho = matrix(c( 1,        -.8475514, -.7761684,
                        -.8475514,  1,         .7909486,
                        -.7761684,  .7909486,  1), 3, 3))
#>
#> Constants: Distribution 1
#>
#> Constants: Distribution 2
#>
#> Constants: Distribution 3
#>
#> Constants calculation time: 0 minutes
#> Intercorrelation calculation time: 0 minutes
#> Error loop calculation time: 0 minutes
#> Total Simulation time: 0 minutes
#> $constants
#> c0 c1 c2 c3
#> 1 0 1 0 0
#> 2 0 1 0 0
#> 3 0 1 0 0
#>
#> $continuous_variables
#> V1 V2 V3
#> 1 0.319695107 -0.09539562 0.04935637
#> 2 -0.044993481 0.18392534 -0.06670649
#> 3 -0.070313476 -0.06346264 -0.24941367
#> 4 0.172113990 0.34618351 0.47828409
#> 5 -0.274574396 0.34460006 0.09628439
#> 6 0.163286017 -0.10404186 -0.30498440
#> 7 0.189720419 0.34919058 -0.06916222
#> 8 0.346294222 -0.06309378 -0.17904333
#> 9 0.126299946 -0.08265343 0.04920184
#> 10 -0.280404683 0.17026612 0.51986206
#> 11 0.038499522 0.12446549 0.08325109
#> 12 -0.280384601 0.39031703 0.52271159
#> 13 0.045278970 0.46994063 0.11951804
#> 14 -0.194794669 -0.23913369 0.20371862
#> 15 -0.231546212 -0.00530418 -0.05841145
#> 16 0.346088425 -0.33119118 -0.27331346
#> 17 -0.453004492 0.60059088 0.52166094
#> 18 -0.072573425 0.05046599 0.33414391
#> 19 0.166013559 -0.18329940 0.10446314
#> 20 -0.098604755 -0.12496718 -0.61084161
#> 21 0.112571406 0.06160790 -0.16522639
#> 22 -0.089738379 0.35995382 0.18410621
#> 23 1.263601427 -0.93129093 -1.01284304
#> 24 0.467595367 -0.37048826 -0.56007336
#> 25 0.687837527 -0.71037730 -0.39024692
#> 26 -0.069806105 0.12184969 0.48233090
#> 27 0.460417179 0.11288231 -0.65215841
#> 28 -0.280200352 0.69895708 0.48867650
#> 29 -0.434993285 0.34369961 0.38985123
#> 30 0.156164881 -0.01521342 0.12130470
#> 31 0.106427524 -0.43769376 -0.38152970
#> 32 0.004461824 -0.02790287 0.13729747
#> 33 -0.617069179 0.62369153 0.74216927
#> 34 0.246206541 -0.22352474 -0.07086127
#> 35 -0.367155270 0.81098732 0.74171120
#> 36 -0.350166970 0.31690673 0.65302786
#> 37 -0.811889266 0.47066271 1.39740693
#> 38 -0.640483432 0.95157401 0.91042674
#> 39 0.288275932 -0.33698868 -0.15963674
#> 40 -0.056804796 0.29483915 0.15245274
#> 41 -0.266446983 0.09157321 -0.18294133
#> 42 0.611748802 -0.51417900 -0.22829506
#> 43 -0.052303947 -0.12391952 0.32055082
#> 44 0.127253868 0.06030743 -0.05578007
#> 45 0.395341299 -0.16222908 -0.08101956
#> 46 0.232971542 -0.09001768 0.06416376
#> 47 0.950584749 -0.67623380 -0.53429103
#> 48 0.256754894 -0.02981766 0.11701343
#> 49 0.233344371 -0.16151008 -0.05955383
#> 50 0.179751022 -0.09613500 -0.02272254
#> 51 0.097857477 0.27647838 0.40066424
#> 52 0.312418540 -0.02838812 -0.13918162
#> 53 0.705549829 -0.61698405 -0.29640094
#> 54 -0.074780651 0.42953939 0.31652087
#> 55 -0.291403183 0.05610553 0.32864232
#> 56 0.255325304 -0.55157170 -0.35415178
#> 57 0.120880052 -0.03856729 -0.61262393
#> 58 -0.648674586 0.59293157 0.79705060
#> 59 -0.404069704 0.29839572 -0.11963513
#> 60 0.029594092 0.24640773 0.27927410
#> 61 -0.127056071 0.30463198 -0.11407147
#> 62 -0.443629418 0.01942471 -0.32452308
#> 63 -0.139397963 0.20547578 0.11826198
#> 64 -0.512486967 0.24807759 0.67593407
#> 65 0.175825431 -0.15323003 -0.15738781
#> 66 -0.169247924 -0.29342285 -0.32655455
#> 67 0.540012695 -0.59459258 -0.12475814
#> 68 -0.498927728 0.05150384 0.07964582
#> 69 -0.166410612 0.07525901 -0.24507295
#> 70 0.582444257 -0.64069856 -0.60202487
#> 71 0.432974856 -0.66789588 -0.35017817
#> 72 0.484137908 -0.05404562 -0.34554109
#> 73 0.050180754 0.16226779 0.03339923
#> 74 -0.454340954 0.71886665 0.16057079
#> 75 0.776382309 -0.78986861 -1.29451966
#> 76 -0.480735672 0.43505688 0.46473186
#> 77 -0.086088864 0.54821715 0.42424756
#> 78 1.274991665 -1.26223004 -0.89524217
#> 79 0.006008305 -0.07710162 -0.07703056
#> 80 0.052344453 0.05182247 0.03126195
#> 81 -1.196792535 1.25723077 1.07875988
#> 82 0.057429049 0.06333375 -0.01933766
#> 83 0.207780426 -0.25919776 0.23279382
#> 84 0.316861262 -0.17226266 -0.24638375
#> 85 -0.032954787 -0.35399252 -0.17783342
#> 86 0.629198645 -0.85950566 -0.72744805
#> 87 0.068142675 -0.44343898 -0.17731659
#> 88 -0.244845275 0.28838443 0.32273254
#> 89 -0.206355945 -0.16599180 0.28202824
#> 90 0.023354603 0.18240309 0.30508536
#> 91 0.038201949 0.21409777 -0.05523652
#> 92 -0.961385546 1.21994616 0.71859653
#> 93 -0.916876574 0.36826421 0.35458708
#> 94 -0.135629660 -0.19348452 -0.14177523
#> 95 1.142650739 -0.94119197 -0.87394690
#> 96 0.561089630 -0.29328666 -0.63295015
#> 97 -0.054000942 -0.09673068 0.40208010
#> 98 -0.536990807 0.41466009 0.21541141
#> 99 0.015140675 -0.10702733 -0.29580071
#> 100 -0.830043387 0.77655165 0.08875664
#>
#> $summary_continuous
#> Distribution n mean sd median min max skew
#> X1 1 100 0.02 0.4472136 0.026474348 -1.196793 1.274992 0.19390551
#> X2 2 100 0.02 0.4472136 0.007060266 -1.262230 1.257231 -0.02466011
#> X3 3 100 0.02 0.4472136 0.005962144 -1.294520 1.397407 0.03116489
#> skurtosis fifth sixth
#> X1 0.7772036 0.1731516 -5.447704
#> X2 0.6009945 0.4473608 -4.123007
#> X3 0.7130090 0.1809188 -2.905017
#>
#> $summary_targetcont
#> Distribution mean sd skew skurtosis
#> 1 1 0.02 0.4472136 0 0
#> 2 2 0.02 0.4472136 0 0
#> 3 3 0.02 0.4472136 0 0
#>
#> $sixth_correction
#> [1] NA NA NA
#>
#> $valid.pdf
#> [1] "TRUE" "TRUE" "TRUE"
#>
#> $correlations
#> [,1] [,2] [,3]
#> [1,] 1.0000000 -0.8475514 -0.7761684
#> [2,] -0.8475514 1.0000000 0.7909486
#> [3,] -0.7761684 0.7909486 1.0000000
#>
#> $Sigma1
#> [,1] [,2] [,3]
#> [1,] 1.0000000 -0.8475514 -0.7761684
#> [2,] -0.8475514 1.0000000 0.7909486
#> [3,] -0.7761684 0.7909486 1.0000000
#>
#> $Sigma2
#> [,1] [,2] [,3]
#> [1,] 1.0000000 -0.8475514 -0.7761684
#> [2,] -0.8475514 1.0000000 0.7909486
#> [3,] -0.7761684 0.7909486 1.0000000
#>
#> $Constants_Time
#> Time difference of 0 mins
#>
#> $Intercorrelation_Time
#> Time difference of 0 mins
#>
#> $Error_Loop_Time
#> Time difference of 0 mins
#>
#> $Simulation_Time
#> Time difference of 0 mins
#>
#> $niter
#> 1 2 3
#> 1 0 0 0
#> 2 0 0 0
#> 3 0 0 0
#>
#> $maxerr
#> [1] 2.220446e-16
Created on 2020-05-13 by the reprex package (v0.3.0)
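As for the reason behind the chol() error itself: the matrix dgn built in the loop has a 1 in the top-left corner, an identity block elsewhere, and the random correlations in the first row and column. A matrix of that form is positive definite only when the sum of the squared correlations is less than 1, and with nn values drawn from runif(nn, 0.2, 0.8) that condition fails more and more often as nn grows, which is exactly the pattern described. A minimal sketch of checking this:
# Sketch: why chol() fails for the bordered matrix
#   dgn = [ 1  v' ]
#         [ v  I  ]
# its eigenvalues are 1, 1 + sqrt(sum(v^2)) and 1 - sqrt(sum(v^2)),
# so it is positive definite only when sum(v^2) < 1
set.seed(1)
nn  <- 5
v   <- runif(nn, min = 0.2, max = 0.8)
dgn <- diag(nn + 1)
dgn[1, -1] <- v
dgn[-1, 1] <- v
sum(v^2)               # > 1 for this draw ...
min(eigen(dgn)$values) # ... so the smallest eigenvalue is negative
# chol(dgn) would then fail; one remedy is projecting to the nearest
# positive-definite correlation matrix first, e.g.
# chol(as.matrix(Matrix::nearPD(dgn, corr = TRUE)$mat))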
Plotting my soil compaction data gives a convex-up curve. I need to determine the maximum y-value and the x-value which produces that maximum.
The 'features' package fits a smooth spline to the data and returns the features of the spline, including the y-maximum and critical x-value. I am having difficulty iterating the features() function over multiple samples, which are contained in a tidy list.
It seems that the features package is having trouble indexing into the data. The code works fine when I use data for only one sample, but when I try to use the dot placeholder and square brackets, it loses track of the data.
Below is the code showing how this process works correctly for one sample, but not for an iteration.
#load packages
library(tidyverse)
#> Warning: package 'ggplot2' was built under R version 3.6.3
#> Warning: package 'forcats' was built under R version 3.6.3
library(features)
#> Warning: package 'features' was built under R version 3.6.3
#> Loading required package: lokern
#> Warning: package 'lokern' was built under R version 3.6.3
# generate example data
df <- tibble(
  sample = rep(LETTERS[1:3], each = 4),
  w = c(seq(0.08, 0.12, by = 0.0125),
        seq(0.09, 0.13, by = 0.0125),
        seq(0.10, 0.14, by = 0.0125)),
  d = c(1.86, 1.88, 1.88, 1.87,
        1.90, 1.92, 1.92, 1.91,
        1.96, 1.98, 1.98, 1.97))
df
#> # A tibble: 12 x 3
#> sample w d
#> <chr> <dbl> <dbl>
#> 1 A 0.08 1.86
#> 2 A 0.0925 1.88
#> 3 A 0.105 1.88
#> 4 A 0.118 1.87
#> 5 B 0.09 1.9
#> 6 B 0.102 1.92
#> 7 B 0.115 1.92
#> 8 B 0.128 1.91
#> 9 C 0.1 1.96
#> 10 C 0.112 1.98
#> 11 C 0.125 1.98
#> 12 C 0.138 1.97
# use the 'features' package to fit a smooth spline and extract the spline features,
# including local y-maximum and critical point along x-axis.
# This works fine for one sample at a time:
sample1_data <- df %>% filter(sample == 'A')
sample1_features <- features(x = sample1_data$w,
                             y = sample1_data$d,
                             smoother = "smooth.spline")
sample1_features
#> $f
#> fmean fmin fmax fsd noise
#> 1.880000e+00 1.860000e+00 1.880000e+00 1.000000e-02 0.000000e+00
#> snr d1min d1max fwiggle ncpts
#> 2.707108e+11 -9.100000e-01 1.970000e+00 9.349000e+01 1.000000e+00
#>
#> $cpts
#> [1] 0.1
#>
#> $curvature
#> [1] -121.03
#>
#> $outliers
#> [1] NA
#>
#> attr(,"fits")
#> attr(,"fits")$x
#> [1] 0.0800 0.0925 0.1050 0.1175
#>
#> attr(,"fits")$y
#> [1] 1.86 1.88 1.88 1.87
#>
#> attr(,"fits")$fn
#> [1] 1.86 1.88 1.88 1.87
#>
#> attr(,"fits")$d1
#> [1] 1.9732965 0.8533784 -0.5868100 -0.9061384
#>
#> attr(,"fits")$d2
#> [1] 4.588832e-03 -1.791915e+02 -5.123866e+01 1.461069e-01
#>
#> attr(,"class")
#> [1] "features"
# But when attempting to use the pipe and the map() function
# to iterate over a list containing data for multiple samples,
# using the typical map() placeholder dot will not index to the
# list element/columns that are being passed to .f
df_split <- split(df, f= df[['sample']])
df_split
#> $A
#> # A tibble: 4 x 3
#> sample w d
#> <chr> <dbl> <dbl>
#> 1 A 0.08 1.86
#> 2 A 0.0925 1.88
#> 3 A 0.105 1.88
#> 4 A 0.118 1.87
#>
#> $B
#> # A tibble: 4 x 3
#> sample w d
#> <chr> <dbl> <dbl>
#> 1 B 0.09 1.9
#> 2 B 0.102 1.92
#> 3 B 0.115 1.92
#> 4 B 0.128 1.91
#>
#> $C
#> # A tibble: 4 x 3
#> sample w d
#> <chr> <dbl> <dbl>
#> 1 C 0.1 1.96
#> 2 C 0.112 1.98
#> 3 C 0.125 1.98
#> 4 C 0.138 1.97
df_split %>% map(.f = features, x = .[['w']], y= .[['d']], smoother = "smooth.spline")
#> Warning in min(x): no non-missing arguments to min; returning Inf
#> Warning in max(x): no non-missing arguments to max; returning -Inf
#> Error in seq.default(min(x), max(x), length = max(npts, length(x))): 'from' must be a finite number
Created on 2020-04-04 by the reprex package (v0.3.0)
You could use group_split() to split the data based on sample and map() to apply the features() function to each subset of the data.
library(features)
library(dplyr)
library(purrr)
list_model <- df %>%
  group_split(sample) %>%
  map(~ features(x = .x$w, y = .x$d, smoother = "smooth.spline"))
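If you then want the numbers the question is after (the fitted maximum and the critical x-value for each sample), here is a sketch of pulling them out of the returned objects, assuming the structure shown in the single-sample output above ($f holds fmax, $cpts the critical point):
# Sketch: extract the spline maximum and its critical x-value per sample
max_d      <- map_dbl(list_model, ~ .x$f[["fmax"]])
critical_w <- map_dbl(list_model, ~ .x$cpts[1])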