Issues with simulating correlated random variables in R

I was trying to perform an exercise with correlated random normal variables in R. The purpose is pretty simple: generate correlated random variables, add some errors to these random variables, and look at their combined standard deviations. The following code works most of the time but periodically fails with the error message shown below the code:
res = NULL
x1 = rnorm(n = 500, mean = 0.02, sd = 0.2)
for (i in 3:5) {
  nn = i
  wght = rep(1/(nn + 1), nn + 1)
  # nn further standardized normal variables alongside (scaled) x1
  x234 = scale(matrix(rnorm(nn * 500), ncol = nn))
  x1234 = cbind(scale(x1), x234)
  # remove the incidental sample correlations via the inverse Cholesky factor
  c1 = var(x1234)
  chol1 = solve(chol(c1))
  newx = x1234 %*% chol1
  # target correlation matrix: ones on the diagonal, random
  # correlations between x1 and each of the other variables
  dgn = diag(x = 1, nrow = nn + 1, ncol = nn + 1)
  corrs = runif(nn, min = 0.2, max = 0.8)
  vv = c(1, corrs)
  dgn[1, ] = vv
  dgn[, 1] = t(vv)
  newc = dgn
  chol2 = chol(newc)  # this is the call that fails intermittently
  finalx = newx %*% chol2 * sd(x1) + mean(x1)
  fsd = sqrt(t(wght) %*% cov(finalx) %*% wght)
  # add noise and recompute the weighted standard deviation
  noise = scale(matrix(rnorm((nn + 1) * 500, mean = 0, sd = 0.1), ncol = nn + 1))
  nmt = finalx + noise
  nsd = sqrt(t(wght) %*% cov(nmt) %*% wght)
  cmb = c(nn + 1, fsd, nsd)
  res = rbind(res, cmb)
}
res
Here is the error message:
Error in chol.default(newc) :
the leading minor of order 4 is not positive definite
As I increase the number of random variables from 5 to 10, the success rate falls dramatically. I have done some searching but was not able to work out what is going on. I would appreciate it if someone could explain the reason for the error message and suggest how to improve the code so that I can increase the number of random variables. I am happy to change the number of observations (currently set to 500).
Thanks!
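For reference: a correlation matrix of the form used here (ones on the diagonal, nonzero correlations only between x1 and the other variables) is positive definite exactly when the sum of the squared correlations is below 1, so random draws from [0.2, 0.8] violate the constraint more and more often as nn grows. Below is a minimal sketch of a check plus one common repair, projecting the proposed matrix to the nearest correlation matrix with Matrix::nearPD(); note that the repair perturbs the requested correlations:
library(Matrix)  # for nearPD()

set.seed(1)
nn <- 9  # ten variables in total, where the original code usually fails
dgn <- diag(nn + 1)
corrs <- runif(nn, min = 0.2, max = 0.8)
dgn[1, -1] <- corrs
dgn[-1, 1] <- corrs
sum(corrs^2)  # about 3.1 here, well above 1, so dgn is not positive definite
newc <- as.matrix(nearPD(dgn, corr = TRUE)$mat)
chol2 <- chol(newc)  # succeeds on the repaired matrix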

Not to discourage you from writing your own code, but have you considered using one of the packages that generates random variables correlated the way you specify, and then adding your noise as desired? It seems more efficient...
# install.packages("SimMultiCorrData")
library(SimMultiCorrData)
#>
#> Attaching package: 'SimMultiCorrData'
#> The following object is masked from 'package:stats':
#>
#> poly
rcorrvar(n = 100, k_cat = 0, k_cont = 3, method = "Fleishman",
         means = c(0.02, 0.02, 0.02), vars = c(.2, .2, .2),
         skews = c(0, 0, 0), skurts = c(0, 0, 0),
         rho = matrix(c(1, -.8475514, -.7761684,
                        -.8475514, 1, .7909486,
                        -.7761684, .7909486, 1), 3, 3))
#>
#> Constants: Distribution 1
#>
#> Constants: Distribution 2
#>
#> Constants: Distribution 3
#>
#> Constants calculation time: 0 minutes
#> Intercorrelation calculation time: 0 minutes
#> Error loop calculation time: 0 minutes
#> Total Simulation time: 0 minutes
#> $constants
#> c0 c1 c2 c3
#> 1 0 1 0 0
#> 2 0 1 0 0
#> 3 0 1 0 0
#>
#> $continuous_variables
#> V1 V2 V3
#> 1 0.319695107 -0.09539562 0.04935637
#> 2 -0.044993481 0.18392534 -0.06670649
#> 3 -0.070313476 -0.06346264 -0.24941367
#> 4 0.172113990 0.34618351 0.47828409
#> 5 -0.274574396 0.34460006 0.09628439
#> 6 0.163286017 -0.10404186 -0.30498440
#> 7 0.189720419 0.34919058 -0.06916222
#> 8 0.346294222 -0.06309378 -0.17904333
#> 9 0.126299946 -0.08265343 0.04920184
#> 10 -0.280404683 0.17026612 0.51986206
#> 11 0.038499522 0.12446549 0.08325109
#> 12 -0.280384601 0.39031703 0.52271159
#> 13 0.045278970 0.46994063 0.11951804
#> 14 -0.194794669 -0.23913369 0.20371862
#> 15 -0.231546212 -0.00530418 -0.05841145
#> 16 0.346088425 -0.33119118 -0.27331346
#> 17 -0.453004492 0.60059088 0.52166094
#> 18 -0.072573425 0.05046599 0.33414391
#> 19 0.166013559 -0.18329940 0.10446314
#> 20 -0.098604755 -0.12496718 -0.61084161
#> 21 0.112571406 0.06160790 -0.16522639
#> 22 -0.089738379 0.35995382 0.18410621
#> 23 1.263601427 -0.93129093 -1.01284304
#> 24 0.467595367 -0.37048826 -0.56007336
#> 25 0.687837527 -0.71037730 -0.39024692
#> 26 -0.069806105 0.12184969 0.48233090
#> 27 0.460417179 0.11288231 -0.65215841
#> 28 -0.280200352 0.69895708 0.48867650
#> 29 -0.434993285 0.34369961 0.38985123
#> 30 0.156164881 -0.01521342 0.12130470
#> 31 0.106427524 -0.43769376 -0.38152970
#> 32 0.004461824 -0.02790287 0.13729747
#> 33 -0.617069179 0.62369153 0.74216927
#> 34 0.246206541 -0.22352474 -0.07086127
#> 35 -0.367155270 0.81098732 0.74171120
#> 36 -0.350166970 0.31690673 0.65302786
#> 37 -0.811889266 0.47066271 1.39740693
#> 38 -0.640483432 0.95157401 0.91042674
#> 39 0.288275932 -0.33698868 -0.15963674
#> 40 -0.056804796 0.29483915 0.15245274
#> 41 -0.266446983 0.09157321 -0.18294133
#> 42 0.611748802 -0.51417900 -0.22829506
#> 43 -0.052303947 -0.12391952 0.32055082
#> 44 0.127253868 0.06030743 -0.05578007
#> 45 0.395341299 -0.16222908 -0.08101956
#> 46 0.232971542 -0.09001768 0.06416376
#> 47 0.950584749 -0.67623380 -0.53429103
#> 48 0.256754894 -0.02981766 0.11701343
#> 49 0.233344371 -0.16151008 -0.05955383
#> 50 0.179751022 -0.09613500 -0.02272254
#> 51 0.097857477 0.27647838 0.40066424
#> 52 0.312418540 -0.02838812 -0.13918162
#> 53 0.705549829 -0.61698405 -0.29640094
#> 54 -0.074780651 0.42953939 0.31652087
#> 55 -0.291403183 0.05610553 0.32864232
#> 56 0.255325304 -0.55157170 -0.35415178
#> 57 0.120880052 -0.03856729 -0.61262393
#> 58 -0.648674586 0.59293157 0.79705060
#> 59 -0.404069704 0.29839572 -0.11963513
#> 60 0.029594092 0.24640773 0.27927410
#> 61 -0.127056071 0.30463198 -0.11407147
#> 62 -0.443629418 0.01942471 -0.32452308
#> 63 -0.139397963 0.20547578 0.11826198
#> 64 -0.512486967 0.24807759 0.67593407
#> 65 0.175825431 -0.15323003 -0.15738781
#> 66 -0.169247924 -0.29342285 -0.32655455
#> 67 0.540012695 -0.59459258 -0.12475814
#> 68 -0.498927728 0.05150384 0.07964582
#> 69 -0.166410612 0.07525901 -0.24507295
#> 70 0.582444257 -0.64069856 -0.60202487
#> 71 0.432974856 -0.66789588 -0.35017817
#> 72 0.484137908 -0.05404562 -0.34554109
#> 73 0.050180754 0.16226779 0.03339923
#> 74 -0.454340954 0.71886665 0.16057079
#> 75 0.776382309 -0.78986861 -1.29451966
#> 76 -0.480735672 0.43505688 0.46473186
#> 77 -0.086088864 0.54821715 0.42424756
#> 78 1.274991665 -1.26223004 -0.89524217
#> 79 0.006008305 -0.07710162 -0.07703056
#> 80 0.052344453 0.05182247 0.03126195
#> 81 -1.196792535 1.25723077 1.07875988
#> 82 0.057429049 0.06333375 -0.01933766
#> 83 0.207780426 -0.25919776 0.23279382
#> 84 0.316861262 -0.17226266 -0.24638375
#> 85 -0.032954787 -0.35399252 -0.17783342
#> 86 0.629198645 -0.85950566 -0.72744805
#> 87 0.068142675 -0.44343898 -0.17731659
#> 88 -0.244845275 0.28838443 0.32273254
#> 89 -0.206355945 -0.16599180 0.28202824
#> 90 0.023354603 0.18240309 0.30508536
#> 91 0.038201949 0.21409777 -0.05523652
#> 92 -0.961385546 1.21994616 0.71859653
#> 93 -0.916876574 0.36826421 0.35458708
#> 94 -0.135629660 -0.19348452 -0.14177523
#> 95 1.142650739 -0.94119197 -0.87394690
#> 96 0.561089630 -0.29328666 -0.63295015
#> 97 -0.054000942 -0.09673068 0.40208010
#> 98 -0.536990807 0.41466009 0.21541141
#> 99 0.015140675 -0.10702733 -0.29580071
#> 100 -0.830043387 0.77655165 0.08875664
#>
#> $summary_continuous
#> Distribution n mean sd median min max skew
#> X1 1 100 0.02 0.4472136 0.026474348 -1.196793 1.274992 0.19390551
#> X2 2 100 0.02 0.4472136 0.007060266 -1.262230 1.257231 -0.02466011
#> X3 3 100 0.02 0.4472136 0.005962144 -1.294520 1.397407 0.03116489
#> skurtosis fifth sixth
#> X1 0.7772036 0.1731516 -5.447704
#> X2 0.6009945 0.4473608 -4.123007
#> X3 0.7130090 0.1809188 -2.905017
#>
#> $summary_targetcont
#> Distribution mean sd skew skurtosis
#> 1 1 0.02 0.4472136 0 0
#> 2 2 0.02 0.4472136 0 0
#> 3 3 0.02 0.4472136 0 0
#>
#> $sixth_correction
#> [1] NA NA NA
#>
#> $valid.pdf
#> [1] "TRUE" "TRUE" "TRUE"
#>
#> $correlations
#> [,1] [,2] [,3]
#> [1,] 1.0000000 -0.8475514 -0.7761684
#> [2,] -0.8475514 1.0000000 0.7909486
#> [3,] -0.7761684 0.7909486 1.0000000
#>
#> $Sigma1
#> [,1] [,2] [,3]
#> [1,] 1.0000000 -0.8475514 -0.7761684
#> [2,] -0.8475514 1.0000000 0.7909486
#> [3,] -0.7761684 0.7909486 1.0000000
#>
#> $Sigma2
#> [,1] [,2] [,3]
#> [1,] 1.0000000 -0.8475514 -0.7761684
#> [2,] -0.8475514 1.0000000 0.7909486
#> [3,] -0.7761684 0.7909486 1.0000000
#>
#> $Constants_Time
#> Time difference of 0 mins
#>
#> $Intercorrelation_Time
#> Time difference of 0 mins
#>
#> $Error_Loop_Time
#> Time difference of 0 mins
#>
#> $Simulation_Time
#> Time difference of 0 mins
#>
#> $niter
#> 1 2 3
#> 1 0 0 0
#> 2 0 0 0
#> 3 0 0 0
#>
#> $maxerr
#> [1] 2.220446e-16
Created on 2020-05-13 by the reprex package (v0.3.0)
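Alternatively, for plain multivariate normals, MASS::mvrnorm() (MASS ships with R) can reproduce a target covariance exactly via empirical = TRUE. A sketch under the same setup, with fixed correlations chosen small enough that the matrix stays positive definite:
library(MASS)  # for mvrnorm()

rho <- c(0.5, 0.4, 0.3, 0.2)  # sum(rho^2) < 1, so the matrix is positive definite
nn <- length(rho)
Sigma <- diag(nn + 1)
Sigma[1, -1] <- rho
Sigma[-1, 1] <- rho
# scale the correlation matrix so every variable has sd = 0.2
x <- mvrnorm(n = 500, mu = rep(0.02, nn + 1),
             Sigma = Sigma * 0.2^2, empirical = TRUE)
round(cor(x)[1, ], 2)  # first row reproduces c(1, rho)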

Related

Is it possible to measure distances between each and every sampling point (220 locations) in R?

I am creating a nested sampling design.
Is it possible to calculate the distance between each and every one of my 200 sampling locations (longitude/latitude)?
I would like to know where the distances fall on the lag line (e.g. how many samples are separated by 1 m, 10 m, 100 m, etc.) to check that there is a sufficient number of points at each distance.
Is this possible in R or any other free software?
Let's use sf::st_distance() as suggested by @nniloc.
Let's prepare sample occurrence data:
occ <- rgbif::occ_data(
  scientificName = "Calystegia pulchra",
  country = "GB",
  hasCoordinate = TRUE
)
occ <- head(occ$data, 220) |>
  sf::st_as_sf(coords = c("decimalLongitude", "decimalLatitude"), crs = 4326) |>
  subset(select = c("key", "scientificName"))
Let's create a distance matrix:
m <- sf::st_distance(occ)
m[1:4, 1:4]
#> Units: [m]
#> [,1] [,2] [,3] [,4]
#> [1,] 0.0 127215.11 202758.86 763395.9
#> [2,] 127215.1 0.00 98557.85 681999.6
#> [3,] 202758.9 98557.85 0.00 583484.0
#> [4,] 763395.9 681999.59 583484.00 0.0
and a function that takes a row index and counts how many entries in that row have a distance greater than a given threshold:
how_much <- function(matrix = m, row = 1, distance = 100000) {
  # count entries in this row farther away than `distance` metres
  length(which(matrix[row, ] > units::as_units(distance, "m")))
}
how_much(m, 2, 100000)
#> [1] 195
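As a side note on the original question (how many samples are separated by 1 m, 10 m, 100 m, etc.), one sketch is to bin the upper triangle of the same matrix m with cut(); the breaks below are arbitrary and should be adapted to the design:
d <- as.numeric(m[upper.tri(m)])  # pairwise distances in metres, each pair once
breaks <- c(0, 1, 10, 100, 1000, 10000, 100000, Inf)
table(cut(d, breaks = breaks))  # number of pairs per lag bin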
Let's add it to our occurrence data:
occ |>
  dplyr::mutate(row_number = dplyr::row_number()) |>
  dplyr::rowwise() |>
  dplyr::mutate(dist_200000 = how_much(m, row_number, 200000))
#> Simple feature collection with 220 features and 4 fields
#> Geometry type: POINT
#> Dimension: XY
#> Bounding box: xmin: -7.978369 ymin: 49.97314 xmax: 1.681262 ymax: 57.90633
#> Geodetic CRS: WGS 84
#> # A tibble: 220 × 5
#> # Rowwise:
#> key scientificName geometry row_number dist_20…¹
#> * <chr> <chr> <POINT [°]> <int> <int>
#> 1 3320924569 Calystegia pulchra Brum… (-0.17902 51.49553) 1 196
#> 2 3437494613 Calystegia pulchra Brum… (-1.962333 51.78564) 2 158
#> 3 3785877810 Calystegia pulchra Brum… (-2.541045 52.5979) 3 146
#> 4 3352641836 Calystegia pulchra Brum… (-6.189535 57.41207) 4 171
#> 5 3338086543 Calystegia pulchra Brum… (-2.302697 53.20301) 5 154
#> 6 3352720276 Calystegia pulchra Brum… (-3.052632 54.8225) 6 147
#> 7 3352736637 Calystegia pulchra Brum… (-1.614114 55.37932) 7 164
#> 8 3384449063 Calystegia pulchra Brum… (-4.681922 57.14801) 8 135
#> 9 3421262904 Calystegia pulchra Brum… (-6.19056 57.42103) 9 171
#> 10 3392248456 Calystegia pulchra Brum… (-6.159327 56.11731) 10 158
#> # … with 210 more rows, and abbreviated variable name ¹​dist_200000
Regards,
Grzegorz
Created on 2022-10-04 with reprex v2.0.2
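A closing side note: because the comparison is vectorized over the whole matrix, the same per-row counts can also be computed without rowwise(). A sketch using the objects above:
occ$dist_200000 <- rowSums(m > units::as_units(200000, "m"))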

Why does a tidymodels survival fit require more than one predictor?

Can a survival model with just the treatment as a predictor be fit
with a tidymodels survival function?
Below I reproduce the example, which uses several predictors, and then try to duplicate it with only one predictor, which fails.
https://www.tidyverse.org/blog/2021/11/survival-analysis-parsnip-adjacent/
has the code to fit a survival tidymodel:
library(survival)
bladder_train <- bladder[-c(1:3),]
bladder_test <- bladder[1:3,]
cox_spec <- proportional_hazards(penalty = 0.123) %>%
  set_engine("glmnet")
f_fit <- fit(cox_spec,
             Surv(stop, event) ~ rx + size + number + strata(enum),
             data = bladder_train)
But with only the treatment in the model, it does not work:
f_fit <- fit(cox_spec,
             Surv(stop, event) ~ rx,
             data = bladder_train)
Why? What am I missing?
It seems the error has more to do with glmnet than tidymodels. This is the error:
library(survival)
library(censored)
#> Loading required package: parsnip
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
bladder_train <- bladder[-c(1:3), ]
bladder_test <- bladder[1:3, ]
cox_spec <- proportional_hazards(penalty = 0.123) %>%
  set_engine("glmnet")
f_fit <- fit(cox_spec,
             Surv(stop, event) ~ rx,
             data = bladder_train)
#> Error in glmnet::glmnet(data_obj$x, data_obj$y, family = "cox", alpha = alpha, : x should be a matrix with 2 or more columns
Created on 2021-12-30 by the reprex package (v2.0.1)
glmnet needs a matrix with 2 or more columns; using just rx means you have only one column. If I add size as an additional feature, it works just fine.
library(survival)
library(censored)
#> Loading required package: parsnip
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
bladder_train <- bladder[-c(1:3), ]
bladder_test <- bladder[1:3, ]
cox_spec <- proportional_hazards(penalty = 0.123) %>%
  set_engine("glmnet")
f_fit <- fit(cox_spec,
             Surv(stop, event) ~ rx + size,
             data = bladder_train)
f_fit
#> parsnip model object
#>
#> Fit time: NA
#>
#> Call: glmnet::glmnet(x = data_obj$x, y = data_obj$y, family = "cox", alpha = alpha, lambda = lambda)
#>
#> Df %Dev Lambda
#> 1 0 0.00 0.070850
#> 2 1 0.10 0.064560
#> 3 1 0.18 0.058820
#> 4 1 0.24 0.053600
#> 5 1 0.30 0.048840
#> 6 2 0.35 0.044500
#> 7 2 0.43 0.040550
#> 8 2 0.50 0.036940
#> 9 2 0.55 0.033660
#> 10 2 0.60 0.030670
#> 11 2 0.64 0.027950
#> 12 2 0.67 0.025460
#> 13 2 0.70 0.023200
#> 14 2 0.72 0.021140
#> 15 2 0.74 0.019260
#> 16 2 0.75 0.017550
#> 17 2 0.77 0.015990
#> 18 2 0.78 0.014570
#> 19 2 0.79 0.013280
#> 20 2 0.79 0.012100
#> 21 2 0.80 0.011020
#> 22 2 0.81 0.010040
#> 23 2 0.81 0.009151
#> 24 2 0.81 0.008338
#> 25 2 0.82 0.007597
#> 26 2 0.82 0.006922
#> 27 2 0.82 0.006308
#> 28 2 0.82 0.005747
#> 29 2 0.82 0.005237
#> 30 2 0.83 0.004771
#> 31 2 0.83 0.004348
#> 32 2 0.83 0.003961
#> 33 2 0.83 0.003609
#> 34 2 0.83 0.003289
#> 35 2 0.83 0.002997
#> 36 2 0.83 0.002730
#> 37 2 0.83 0.002488
#> 38 2 0.83 0.002267
#> 39 2 0.83 0.002065
#> 40 2 0.83 0.001882
#> 41 2 0.83 0.001715
#> The training data has been saved for prediction.
Created on 2021-12-30 by the reprex package (v2.0.1)
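If the lasso penalty is not essential, another option (a sketch, assuming the censored package's "survival" engine, which wraps survival::coxph and has no minimum-column restriction) is an unpenalized Cox model with the single predictor:
library(survival)
library(censored)
library(tidymodels)
bladder_train <- bladder[-c(1:3), ]
# no penalty argument: the "survival" engine fits survival::coxph,
# which accepts a one-column model matrix
cox_spec_unpen <- proportional_hazards() %>%
  set_engine("survival")
f_fit <- fit(cox_spec_unpen,
             Surv(stop, event) ~ rx,
             data = bladder_train)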
If you want to use just the one feature rx, consider other models, e.g. decision trees:
library(survival)
library(censored)
#> Loading required package: parsnip
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
bladder_train <- bladder[-c(1:3), ]
bladder_test <- bladder[1:3, ]
dt_spec <- decision_tree() %>%
  set_engine("rpart") %>%
  set_mode("censored regression")
f_fit <- fit(dt_spec,
             Surv(stop, event) ~ rx,
             data = bladder_train)
f_fit
#> parsnip model object
#>
#> $rpart
#> n= 337
#>
#> node), split, n, deviance, yval
#> * denotes terminal node
#>
#> 1) root 337 403.0968 1.0000000
#> 2) rx>=1.5 152 166.6335 0.7751669 *
#> 3) rx< 1.5 185 231.2927 1.1946030 *
#>
#> $survfit
#>
#> Call: prodlim::prodlim(formula = form, data = data)
#> Stratified Kaplan-Meier estimator for the conditional event time survival function
#> Discrete predictor variable: rpartFactor (0.775166899958249, 1.19460305107131)
#>
#> $levels
#> [1] "0.775166899958249" "1.19460305107131"
#>
#> attr(,"class")
#> [1] "pecRpart"
Created on 2021-12-30 by the reprex package (v2.0.1)

I want to simulate and obtain the best ARIMA model a given number of times in R

I want to simulate an ARIMA(1,0,0) process with arima.sim() 100 times and find the best model with the auto.arima() function each time the simulation is run. I want the program to print the ARIMA order obtained each time.
reslt = c()
num <- 60
epselon = rnorm(num, mean = 0, sd = 1^2)
for (i in 1:10) {
  reslt[i] <- auto.arima(arima.sim(n = num, model = list(ar = 0.8, order = c(1, 0, 0)),
                                   n.start = 1, innov = c(0, epselon[-1])))
}
The above is what I tried, but it produces no result.
What I want is a printed ARIMA(p, d, q) order for each of the 10 simulations.
This will do it:
library(forecast)
nsim <- 10
result <- matrix(NA_integer_, nrow = nsim, ncol = 3)
colnames(result) <- c("p","d","q")
num <- 60
for (i in seq(nsim)) {
  result[i, ] <- arima.sim(n = num, model = list(ar = 0.8, order = c(1, 0, 0)), sd = 1) %>%
    auto.arima() %>%
    arimaorder()
}
result
#> p d q
#> [1,] 0 1 0
#> [2,] 0 1 0
#> [3,] 0 1 0
#> [4,] 1 0 0
#> [5,] 1 0 0
#> [6,] 0 1 0
#> [7,] 0 1 0
#> [8,] 1 0 0
#> [9,] 1 0 0
#> [10,] 1 0 0
Created on 2020-06-24 by the reprex package (v0.3.0)
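As a quick follow-up (my addition, not from the original answer): the order matrix summarises easily, e.g. to count how often each order is selected:
table(apply(result, 1, paste, collapse = ","))  # counts per selected (p,d,q)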
A few comments:
Your code will produce the same series every time because epselon is generated outside the loop. As you are just using random normal innovations, it is simpler to let arima.sim() handle it as in the code above.
If you wanted to keep the whole model object that is returned by auto.arima() rather than just the orders as in my code, you could modify it like this:
library(forecast)
nsim <- 10
result <- list()
num <- 60
for (i in seq(nsim)) {
  result[[i]] <- arima.sim(n = num, model = list(ar = 0.8, order = c(1, 0, 0)), sd = 1) %>%
    auto.arima()
}
result
#> [[1]]
#> Series: .
#> ARIMA(0,1,0)
#>
#> sigma^2 estimated as 1.145: log likelihood=-87.72
#> AIC=177.44 AICc=177.51 BIC=179.52
#>
#> [[2]]
#> Series: .
#> ARIMA(1,0,2) with zero mean
#>
#> Coefficients:
#> ar1 ma1 ma2
#> 0.5200 0.4086 0.4574
#> s.e. 0.1695 0.1889 0.1446
#>
#> sigma^2 estimated as 0.877: log likelihood=-80.38
#> AIC=168.77 AICc=169.5 BIC=177.15
#>
#> [[3]]
#> Series: .
#> ARIMA(0,1,0)
#>
#> sigma^2 estimated as 0.9284: log likelihood=-81.53
#> AIC=165.05 AICc=165.12 BIC=167.13
#>
#> [[4]]
#> Series: .
#> ARIMA(1,0,0) with zero mean
#>
#> Coefficients:
#> ar1
#> 0.615
#> s.e. 0.099
#>
#> sigma^2 estimated as 1.123: log likelihood=-88.35
#> AIC=180.7 AICc=180.91 BIC=184.89
#>
#> [[5]]
#> Series: .
#> ARIMA(0,0,3) with zero mean
#>
#> Coefficients:
#> ma1 ma2 ma3
#> 0.5527 0.2726 -0.3297
#> s.e. 0.1301 0.1425 0.1202
#>
#> sigma^2 estimated as 0.6194: log likelihood=-69.83
#> AIC=147.66 AICc=148.39 BIC=156.04
#>
#> [[6]]
#> Series: .
#> ARIMA(1,0,0) with non-zero mean
#>
#> Coefficients:
#> ar1 mean
#> 0.7108 0.9147
#> s.e. 0.0892 0.4871
#>
#> sigma^2 estimated as 1.332: log likelihood=-93.08
#> AIC=192.15 AICc=192.58 BIC=198.43
#>
#> [[7]]
#> Series: .
#> ARIMA(1,0,1) with non-zero mean
#>
#> Coefficients:
#> ar1 ma1 mean
#> 0.6116 0.3781 -1.0024
#> s.e. 0.1264 0.1559 0.4671
#>
#> sigma^2 estimated as 1.161: log likelihood=-88.6
#> AIC=185.2 AICc=185.92 BIC=193.57
#>
#> [[8]]
#> Series: .
#> ARIMA(1,0,0) with zero mean
#>
#> Coefficients:
#> ar1
#> 0.6412
#> s.e. 0.0969
#>
#> sigma^2 estimated as 0.8666: log likelihood=-80.6
#> AIC=165.2 AICc=165.41 BIC=169.39
#>
#> [[9]]
#> Series: .
#> ARIMA(0,1,0)
#>
#> sigma^2 estimated as 1.314: log likelihood=-91.78
#> AIC=185.57 AICc=185.64 BIC=187.64
#>
#> [[10]]
#> Series: .
#> ARIMA(1,0,0) with non-zero mean
#>
#> Coefficients:
#> ar1 mean
#> 0.6714 1.3449
#> s.e. 0.0985 0.4428
#>
#> sigma^2 estimated as 1.397: log likelihood=-94.44
#> AIC=194.89 AICc=195.32 BIC=201.17
Created on 2020-06-24 by the reprex package (v0.3.0)

How to index the features() function to iterate over a list of data frames using the map() function in R?

Plotting my soil compaction data gives a convex-up curve. I need to determine the maximum y-value and the x-value which produces that maximum.
The 'features' package fits a smooth spline to the data and returns the features of the spline, including the y-maximum and critical x-value. I am having difficulty iterating the features() function over multiple samples, which are contained in a tidy list.
It seems that the features package is having trouble indexing to the data. The code works fine when I use data for only one sample, but when I try to use the dot placeholder and square brackets it loses track of the data.
Below is the code showing how this process works correctly for one sample, but not for an iteration.
#load packages
library(tidyverse)
#> Warning: package 'ggplot2' was built under R version 3.6.3
#> Warning: package 'forcats' was built under R version 3.6.3
library(features)
#> Warning: package 'features' was built under R version 3.6.3
#> Loading required package: lokern
#> Warning: package 'lokern' was built under R version 3.6.3
# generate example data
df <- tibble(
  sample = rep(LETTERS[1:3], each = 4),
  w = c(seq(0.08, 0.12, by = 0.0125),
        seq(0.09, 0.13, by = 0.0125),
        seq(0.10, 0.14, by = 0.0125)),
  d = c(1.86, 1.88, 1.88, 1.87,
        1.90, 1.92, 1.92, 1.91,
        1.96, 1.98, 1.98, 1.97))
df
#> # A tibble: 12 x 3
#> sample w d
#> <chr> <dbl> <dbl>
#> 1 A 0.08 1.86
#> 2 A 0.0925 1.88
#> 3 A 0.105 1.88
#> 4 A 0.118 1.87
#> 5 B 0.09 1.9
#> 6 B 0.102 1.92
#> 7 B 0.115 1.92
#> 8 B 0.128 1.91
#> 9 C 0.1 1.96
#> 10 C 0.112 1.98
#> 11 C 0.125 1.98
#> 12 C 0.138 1.97
# use the 'features' package to fit a smooth spline and extract the spline features,
# including local y-maximum and critical point along x-axis.
# This works fine for one sample at a time:
sample1_data <- df %>% filter(sample == 'A')
sample1_features <- features(x = sample1_data$w,
                             y = sample1_data$d,
                             smoother = "smooth.spline")
sample1_features
#> $f
#> fmean fmin fmax fsd noise
#> 1.880000e+00 1.860000e+00 1.880000e+00 1.000000e-02 0.000000e+00
#> snr d1min d1max fwiggle ncpts
#> 2.707108e+11 -9.100000e-01 1.970000e+00 9.349000e+01 1.000000e+00
#>
#> $cpts
#> [1] 0.1
#>
#> $curvature
#> [1] -121.03
#>
#> $outliers
#> [1] NA
#>
#> attr(,"fits")
#> attr(,"fits")$x
#> [1] 0.0800 0.0925 0.1050 0.1175
#>
#> attr(,"fits")$y
#> [1] 1.86 1.88 1.88 1.87
#>
#> attr(,"fits")$fn
#> [1] 1.86 1.88 1.88 1.87
#>
#> attr(,"fits")$d1
#> [1] 1.9732965 0.8533784 -0.5868100 -0.9061384
#>
#> attr(,"fits")$d2
#> [1] 4.588832e-03 -1.791915e+02 -5.123866e+01 1.461069e-01
#>
#> attr(,"class")
#> [1] "features"
# But when attempting to use the pipe and the map() function
# to iterate over a list containing data for multiple samples,
# using the typical map() placeholder dot will not index to the
# list element/columns that are being passed to .f
df_split <- split(df, f= df[['sample']])
df_split
#> $A
#> # A tibble: 4 x 3
#> sample w d
#> <chr> <dbl> <dbl>
#> 1 A 0.08 1.86
#> 2 A 0.0925 1.88
#> 3 A 0.105 1.88
#> 4 A 0.118 1.87
#>
#> $B
#> # A tibble: 4 x 3
#> sample w d
#> <chr> <dbl> <dbl>
#> 1 B 0.09 1.9
#> 2 B 0.102 1.92
#> 3 B 0.115 1.92
#> 4 B 0.128 1.91
#>
#> $C
#> # A tibble: 4 x 3
#> sample w d
#> <chr> <dbl> <dbl>
#> 1 C 0.1 1.96
#> 2 C 0.112 1.98
#> 3 C 0.125 1.98
#> 4 C 0.138 1.97
df_split %>% map(.f = features, x = .[['w']], y= .[['d']], smoother = "smooth.spline")
#> Warning in min(x): no non-missing arguments to min; returning Inf
#> Warning in max(x): no non-missing arguments to max; returning -Inf
#> Error in seq.default(min(x), max(x), length = max(npts, length(x))): 'from' must be a finite number
Created on 2020-04-04 by the reprex package (v0.3.0)
You could use group_split() to split the data based on sample, and map() to apply the features() function to each subset of the data.
library(features)
library(dplyr)
library(purrr)
list_model <- df %>%
  group_split(sample) %>%
  map(~ features(x = .x$w, y = .x$d, smoother = "smooth.spline"))
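From there, one possible way to pull out the quantities of interest per sample (a sketch with purrr extraction helpers; the names assume group_split()'s alphabetical ordering of the sample groups):
list_model <- purrr::set_names(list_model, sort(unique(df$sample)))
purrr::map(list_model, "cpts")                # critical x-value(s) per sample
purrr::map_dbl(list_model, ~ .x$f[["fmax"]])  # spline y-maximum per sample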

using emmeans for lmer

I've been trying to calculate marginal means for my lmer and glmer models in R. I found the emmeans function and have been trying to understand it and apply it to my model. Since it seems hard to get the means for an interaction, I'm starting with just additive predictors, but the function doesn't work the way it's presented in the examples (e.g. here: https://cran.r-project.org/web/packages/emmeans/vignettes/sophisticated.html):
emmeans(Oats.lmer, "nitro")
nitro emmean SE df lower.CL upper.CL
0.0 78.89207 7.294379 7.78 61.98930 95.79484
0.2 97.03425 7.136271 7.19 80.25029 113.81822
0.4 114.19816 7.136186 7.19 97.41454 130.98179
0.6 124.06857 7.070235 6.95 107.32795 140.80919
What I'm getting is:
emmeans(model2, "VariableA")
VariableA emmean SE df lower.CL upper.CL
0.4657459 2649.742 120.8955 19.07 2396.768 2902.715
There is only one line, and the variable is averaged instead of split into 0 and 1 (which are the values in the dataset; maybe the problem is that it should be categorical?).
The model I'm running is:
model2 = lmer(rt ~ variableA + variableB + (1 | participant) + (1 | sequence/item), data = memoryData, REML = FALSE)
EDIT: The data file is quite big and I wasn't sure how to extract useful information from it, but here is the structure:
> str(memoryData)
'data.frame': 3168 obs. of 123 variables:
$ participant : int 10 10 10 10 10 10 10 10 10 10 ...
$ variableA : int 1 1 1 1 1 1 1 1 1 1 ...
$ variableB : int 1 1 1 1 1 1 1 1 1 1 ...
$ sequence: int 1 1 1 1 1 1 1 1 1 1 ...
$ item : int 25 26 27 28 29 30 31 32 33 34 ...
$ accuracy : int 1 1 1 1 1 1 0 1 1 1 ...
$ rt : num 1720 1628 1728 2247 1247 ...
Why is the function not working for me?
And as a further question, is there a way to get these means when I include an interaction between variables A and B?
EDIT 2: OK, it did work when I changed the variable to a factor; I guess my method of doing it was incorrect. But I'm still not sure how to calculate the means when there is an interaction, because with this method R says: "NOTE: Results may be misleading due to involvement in interactions"
To see marginal means of interactions, add all variables of the interaction term to emmeans(); you need the at argument if you want the marginal means at particular levels of the interaction terms.
Here are some examples, both for the average effect of the interaction and for marginal effects at different levels of the interaction term. The latter has the advantage in terms of visualization.
library(ggeffects)
library(lme4)
library(emmeans)
data("sleepstudy")
sleepstudy$inter <- sample(1:5, size = nrow(sleepstudy), replace = T)
m <- lmer(Reaction ~ Days * inter + (1 + Days | Subject), data = sleepstudy)
# average marginal effect of interaction
emmeans(m, c("Days", "inter"))
#> Days inter emmean SE df lower.CL upper.CL
#> 4.5 2.994444 298.3427 8.84715 16.98 279.6752 317.0101
#>
#> Degrees-of-freedom method: kenward-roger
#> Confidence level used: 0.95
# marginal effects at different levels of interactions -
# useful for plotting
ggpredict(m, c("Days [3,5,7]", "inter"))
#>
#> # Predicted values of Reaction
#> # x = Days
#>
#> # inter = 1
#> x predicted std.error conf.low conf.high
#> 3 279.349 8.108 263.458 295.240
#> 5 304.839 9.818 285.597 324.082
#> 7 330.330 12.358 306.109 354.551
#>
#> # inter = 2
#> x predicted std.error conf.low conf.high
#> 3 280.970 7.624 266.028 295.912
#> 5 304.216 9.492 285.613 322.819
#> 7 327.462 11.899 304.140 350.784
#>
#> # inter = 3
#> x predicted std.error conf.low conf.high
#> 3 282.591 7.446 267.997 297.185
#> 5 303.593 9.384 285.200 321.985
#> 7 324.594 11.751 301.562 347.626
#>
#> # inter = 4
#> x predicted std.error conf.low conf.high
#> 3 284.212 7.596 269.325 299.100
#> 5 302.969 9.502 284.345 321.594
#> 7 321.726 11.925 298.353 345.099
#>
#> # inter = 5
#> x predicted std.error conf.low conf.high
#> 3 285.834 8.055 270.046 301.621
#> 5 302.346 9.839 283.062 321.630
#> 7 318.858 12.408 294.540 343.177
#>
#> Adjusted for:
#> * Subject = 308
emmeans(m, c("Days", "inter"), at = list(Days = c(3, 5, 7), inter = 1:5))
#> Days inter emmean SE df lower.CL upper.CL
#> 3 1 279.3488 8.132335 23.60 262.5493 296.1483
#> 5 1 304.8394 9.824196 20.31 284.3662 325.3125
#> 7 1 330.3300 12.366296 20.69 304.5895 356.0704
#> 3 2 280.9700 7.630745 18.60 264.9754 296.9646
#> 5 2 304.2160 9.493225 17.77 284.2529 324.1791
#> 7 2 327.4621 11.901431 17.84 302.4420 352.4822
#> 3 3 282.5912 7.445982 16.96 266.8786 298.3038
#> 5 3 303.5927 9.383978 16.98 283.7927 323.3927
#> 7 3 324.5942 11.751239 16.98 299.7988 349.3896
#> 3 4 284.2124 7.601185 18.34 268.2639 300.1609
#> 5 4 302.9694 9.504102 17.85 282.9900 322.9487
#> 7 4 321.7263 11.927612 17.99 296.6666 346.7860
#> 3 5 285.8336 8.076779 23.02 269.1264 302.5409
#> 5 5 302.3460 9.845207 20.48 281.8399 322.8521
#> 7 5 318.8584 12.416642 21.02 293.0380 344.6788
#>
#> Degrees-of-freedom method: kenward-roger
#> Confidence level used: 0.95
And a plotting example:
ggpredict(m, c("Days", "inter [1,3,5]")) %>% plot()
You say that "changing the vari[a]ble to factor doesn't help", but I would think this would (as documented in the emmeans FAQ):
md <- transform(memoryData,
                variableA = factor(variableA),
                variableB = factor(variableB))
model2 <- lmer(rt ~ variableA + variableB +
                 (1 | participant) + (1 | sequence/item), data = md, REML = FALSE)
emmeans(model2, ~variableA)
emmeans(model2, ~variableB)
emmeans(model2, ~variableA + variableB)
If this really doesn't work, then we need a reproducible example ...
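And for the follow-up question about interactions, a sketch building on md above (variable names as in the question): fit the interaction explicitly, then ask emmeans() for the cell means and for comparisons of one factor within the levels of the other:
model3 <- lmer(rt ~ variableA * variableB +
                 (1 | participant) + (1 | sequence/item), data = md, REML = FALSE)
emmeans(model3, ~ variableA * variableB)           # means for each A x B cell
emmeans(model3, pairwise ~ variableA | variableB)  # effect of A within each level of B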
