How to forecast with lagged external regressors using fable::VAR - r

I'd like to use lagged external regressors in my VAR forecast. Using the VAR() function from the fable package, I am able to fit a model, but I can't use it to forecast, because the forecasts come back as NA for the dependent variables. My reprex follows examples from Forecasting: Principles and Practice v3.
Thanks in advance for any guidance.
require(fpp3)
#> Loading required package: fpp3
#> ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──
#> ✔ tibble 3.1.7 ✔ tsibble 1.0.1
#> ✔ dplyr 1.0.9 ✔ tsibbledata 0.3.0
#> ✔ tidyr 1.1.3 ✔ feasts 0.2.2
#> ✔ lubridate 1.7.10 ✔ fable 0.3.1
#> ✔ ggplot2 3.3.5
#> ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
#> ✖ lubridate::date() masks base::date()
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ tsibble::intersect() masks base::intersect()
#> ✖ tsibble::interval() masks lubridate::interval()
#> ✖ dplyr::lag() masks stats::lag()
#> ✖ tsibble::setdiff() masks base::setdiff()
#> ✖ tsibble::union() masks base::union()
# The us_change data set, taken explicitly from the fpp3 namespace.
us_change <- fpp3::us_change
# Fit a VAR for Consumption and Income, passing Unemployment and its first
# lag as exogenous regressors through xreg().
fit <- us_change %>%
  model(
    xregs_lag1 = VAR(
      vars(Consumption, Income) ~
        xreg(Unemployment, lag(Unemployment, 1))
    )
  )
fit
#> # A mable: 1 x 1
#> xregs_lag1
#> <model>
#> 1 <VAR(5) w/ mean>
new_data_ex <- new_data(us_change, 4) %>%
mutate(Unemployment = mean(us_change$Unemployment))
#############
# Here I tried creating a new_data frame that included one lag of Unemployment, and pass that to the new_data argument of forecast, but it doesn't work either
#
# new_data_ex_lags <- us_change %>%
# tail(1) %>%
# bind_rows(new_data_ex) %>%
# select(colnames(new_data_ex))
#############
fit %>%
select(xregs_lag1) %>%
forecast(new_data = new_data_ex)
#> # A fable: 4 x 6 [1Q]
#> # Key: .model [1]
#> .model Quarter .distribution .mean_Consumption .mean_Income Unemployment
#> <chr> <qtr> <dist> <dbl> <dbl> <dbl>
#> 1 xregs_lag1 2019 Q3 MVN[2] NA NA 0.00101
#> 2 xregs_lag1 2019 Q4 MVN[2] NA NA 0.00101
#> 3 xregs_lag1 2020 Q1 MVN[2] NA NA 0.00101
#> 4 xregs_lag1 2020 Q2 MVN[2] NA NA 0.00101
fit %>%
select(xregs_lag1) %>%
report()
#> Series: Consumption, Income
#> Model: VAR(5) w/ mean
#>
#> Coefficients for Consumption:
#> lag(Consumption,1) lag(Income,1) lag(Consumption,2) lag(Income,2)
#> 0.1156 0.1062 0.1479 0.0079
#> s.e. 0.0772 0.0483 0.0753 0.0509
#> lag(Consumption,3) lag(Income,3) lag(Consumption,4) lag(Income,4)
#> 0.2248 -0.0207 -0.0729 -0.0544
#> s.e. 0.0730 0.0499 0.0746 0.0500
#> lag(Consumption,5) lag(Income,5) constant Unemployment
#> -0.0217 0.0327 0.3923 -0.8602
#> s.e. 0.0708 0.0491 0.0923 0.1331
#> lag(Unemployment, 1)
#> 0.4563
#> s.e. 0.1402
#>
#> Coefficients for Income:
#> lag(Consumption,1) lag(Income,1) lag(Consumption,2) lag(Income,2)
#> 0.3715 -0.2991 0.0836 -0.0410
#> s.e. 0.1212 0.0758 0.1182 0.0799
#> lag(Consumption,3) lag(Income,3) lag(Consumption,4) lag(Income,4)
#> 0.4531 -0.1445 0.2481 -0.2475
#> s.e. 0.1145 0.0783 0.1170 0.0785
#> lag(Consumption,5) lag(Income,5) constant Unemployment
#> -0.1270 -0.1878 0.6142 -0.1100
#> s.e. 0.1111 0.0771 0.1449 0.2089
#> lag(Unemployment, 1)
#> -0.0401
#> s.e. 0.2201
#>
#> Residual covariance matrix:
#> Consumption Income
#> Consumption 0.2602 0.1341
#> Income 0.1341 0.6410
#>
#> log likelihood = -350.43
#> AIC = 760.86 AICc = 772.34 BIC = 858.74
Created on 2022-07-22 by the reprex package (v2.0.0)

Using lag() with VAR() models was not fully implemented, but I have added support for this in the development version of the fable package (https://github.com/tidyverts/fable/commit/bb15c9462b80850565aee13d8f9b33e49dfd0f33).
There are some other changes not yet pushed to CRAN such as how forecast means are represented in the fable, but the code is otherwise the same.
require(fpp3)
#> Loading required package: fpp3
#> ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──
#> ✔ tibble 3.1.7 ✔ tsibble 1.1.1
#> ✔ dplyr 1.0.9 ✔ tsibbledata 0.4.0
#> ✔ tidyr 1.2.0 ✔ feasts 0.2.2
#> ✔ lubridate 1.8.0 ✔ fable 0.3.1.9000
#> ✔ ggplot2 3.3.6
#> ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
#> ✖ lubridate::date() masks base::date()
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ tsibble::intersect() masks base::intersect()
#> ✖ tsibble::interval() masks lubridate::interval()
#> ✖ dplyr::lag() masks stats::lag()
#> ✖ tsibble::setdiff() masks base::setdiff()
#> ✖ tsibble::union() masks base::union()
# Same data and model specification as in the question; only the installed
# fable version differs.
us_change <- fpp3::us_change
# VAR for Consumption and Income with Unemployment and lag(Unemployment, 1)
# as exogenous regressors.
fit <- us_change %>%
  model(
    xregs_lag1 = VAR(
      vars(Consumption, Income) ~
        xreg(Unemployment, lag(Unemployment, 1))
    )
  )
fit
#> # A mable: 1 x 1
#> xregs_lag1
#> <model>
#> 1 <VAR(5) w/ mean>
new_data_ex <- new_data(us_change, 4) %>%
mutate(Unemployment = mean(us_change$Unemployment))
#############
# Here I tried creating a new_data frame that included one lag of Unemployment, and pass that to the new_data argument of forecast, but it doesn't work either
#
# new_data_ex_lags <- us_change %>%
# tail(1) %>%
# bind_rows(new_data_ex) %>%
# select(colnames(new_data_ex))
#############
fit %>%
select(xregs_lag1) %>%
forecast(new_data = new_data_ex)
#> Warning in if (is_transformed) {: the condition has length > 1 and only the
#> first element will be used
#> # A fable: 4 x 5 [1Q]
#> # Key: .model [1]
#> .model Quarter .distribution .mean[,"Consumption… [,"Income"] Unemployment
#> <chr> <qtr> <dist> <dbl> <dbl> <dbl>
#> 1 xregs_lag1 2019 Q3 MVN[2] 0.548 0.657 0.00101
#> 2 xregs_lag1 2019 Q4 MVN[2] 0.679 0.316 0.00101
#> 3 xregs_lag1 2020 Q1 MVN[2] 0.763 0.832 0.00101
#> 4 xregs_lag1 2020 Q2 MVN[2] 0.697 0.733 0.00101
fit %>%
select(xregs_lag1) %>%
report()
#> Series: Consumption, Income
#> Model: VAR(5) w/ mean
#>
#> Coefficients for Consumption:
#> lag(Consumption,1) lag(Income,1) lag(Consumption,2) lag(Income,2)
#> 0.1156 0.1062 0.1479 0.0079
#> s.e. 0.0772 0.0483 0.0753 0.0509
#> lag(Consumption,3) lag(Income,3) lag(Consumption,4) lag(Income,4)
#> 0.2248 -0.0207 -0.0729 -0.0544
#> s.e. 0.0730 0.0499 0.0746 0.0500
#> lag(Consumption,5) lag(Income,5) constant Unemployment
#> -0.0217 0.0327 0.3923 -0.8602
#> s.e. 0.0708 0.0491 0.0923 0.1331
#> lag(Unemployment, 1)
#> 0.4563
#> s.e. 0.1402
#>
#> Coefficients for Income:
#> lag(Consumption,1) lag(Income,1) lag(Consumption,2) lag(Income,2)
#> 0.3715 -0.2991 0.0836 -0.0410
#> s.e. 0.1212 0.0758 0.1182 0.0799
#> lag(Consumption,3) lag(Income,3) lag(Consumption,4) lag(Income,4)
#> 0.4531 -0.1445 0.2481 -0.2475
#> s.e. 0.1145 0.0783 0.1170 0.0785
#> lag(Consumption,5) lag(Income,5) constant Unemployment
#> -0.1270 -0.1878 0.6142 -0.1100
#> s.e. 0.1111 0.0771 0.1449 0.2089
#> lag(Unemployment, 1)
#> -0.0401
#> s.e. 0.2201
#>
#> Residual covariance matrix:
#> Consumption Income
#> Consumption 0.2602 0.1341
#> Income 0.1341 0.6410
#>
#> log likelihood = -350.43
#> AIC = 760.86 AICc = 772.34 BIC = 858.74
Created on 2022-07-23 by the reprex package (v2.0.1)

Related

getting a color name in an R color palette

Is there a way to get the name or code for the colours in Set3 colour palette of ggplot2?
You could use brewer.pal from RColorBrewer like this:
library(RColorBrewer)
brewer.pal(12, "Set3")
#> [1] "#8DD3C7" "#FFFFB3" "#BEBADA" "#FB8072" "#80B1D3" "#FDB462" "#B3DE69"
#> [8] "#FCCDE5" "#D9D9D9" "#BC80BD" "#CCEBC5" "#FFED6F"
Created on 2022-10-19 with reprex v2.0.2
To get the names of the color codes, you could use the function color.id from plotrix like this:
library(RColorBrewer)
colors <- brewer.pal(12, "Set3")
library(plotrix)
sapply(colors, color.id)
#> $`#8DD3C7`
#> [1] "paleturquoise3"
#>
#> $`#FFFFB3`
#> [1] "wheat1"
#>
#> $`#BEBADA`
#> [1] "lightsteelblue"
#>
#> $`#FB8072`
#> [1] "salmon"
#>
#> $`#80B1D3`
#> [1] "lightskyblue3"
#>
#> $`#FDB462`
#> [1] "sandybrown"
#>
#> $`#B3DE69`
#> [1] "darkolivegreen2"
#>
#> $`#FCCDE5`
#> [1] "thistle2"
#>
#> $`#D9D9D9`
#> [1] "gray85" "grey85"
#>
#> $`#BC80BD`
#> [1] "orchid3"
#>
#> $`#CCEBC5`
#> [1] "darkseagreen1"
#>
#> $`#FFED6F`
#> [1] "lightgoldenrod1"
Created on 2022-10-19 with reprex v2.0.2
Absolutely! These palettes come from ‘RColorBrewer’ and, using that package, you can get the color values from each palette via the following code:
# Look up the maximum number of colours the chosen palette offers in
# brewer.pal.info, then request that many colours from it.
palette <- "Set3"
brewer.pal(n = brewer.pal.info[palette, "maxcolors"], name = palette)

How to assign manually colors to each data group

Could someone help me assign a manual colour to each group? I want to use these two colours for my data: #F96167 for Beer and #FCE77D for Whisky.
Also, in the last graph, I would like different plot symbols (e.g. ∆ for Beer, O for Whisky) for each group.
library(rgeos)
#> Loading required package: sp
#> rgeos version: 0.5-9, (SVN revision 684)
#> GEOS runtime version: 3.9.1-CAPI-1.14.2
#> Please note that rgeos will be retired by the end of 2023,
#> plan transition to sf functions using GEOS at your earliest convenience.
#> GEOS using OverlayNG
#> Linking to sp version: 1.5-0
#> Polygon checking: TRUE
library(sp)
library(vegan)
#> Loading required package: permute
#> Loading required package: lattice
#> This is vegan 2.6-2
library(tidyverse)
library(Momocs)
#>
#> Attaching package: 'Momocs'
#> The following objects are masked from 'package:dplyr':
#>
#> arrange, combine, filter, mutate, rename, sample_frac, sample_n,
#> select, slice
#> The following object is masked from 'package:tidyr':
#>
#> chop
#> The following object is masked from 'package:stats':
#>
#> filter
library(caret)
#>
#> Attaching package: 'caret'
#> The following object is masked from 'package:purrr':
#>
#> lift
#> The following object is masked from 'package:vegan':
#>
#> tolerance
library(doParallel)
#> Loading required package: foreach
#>
#> Attaching package: 'foreach'
#> The following objects are masked from 'package:purrr':
#>
#> accumulate, when
#> Loading required package: iterators
#> Loading required package: parallel
library(xlsx)
library(foreach)
library(broom)
library(MASS)
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:Momocs':
#>
#> select
#> The following object is masked from 'package:dplyr':
#>
#> select
library(scales)
#>
#> Attaching package: 'scales'
#> The following object is masked from 'package:Momocs':
#>
#> rescale
#> The following object is masked from 'package:purrr':
#>
#> discard
#> The following object is masked from 'package:readr':
#>
#> col_factor
library(RColorBrewer)
library(ggspatial)
data(bot)
bot
#> Out (outlines)
#> - 40 outlines, 162 +/- 21 coords (in $coo)
#> - 2 classifiers (in $fac):
#> # A tibble: 40 × 2
#> type fake
#> <fct> <fct>
#> 1 whisky a
#> 2 whisky a
#> 3 whisky a
#> 4 whisky a
#> 5 whisky a
#> 6 whisky a
#> # … with 34 more rows
#> - also: $ldk
panel(bot, fac="type", names=TRUE)
bot.f <- efourier(bot, nb.h=10)
#> 'norm=TRUE' is used and this may be troublesome. See ?efourier
bot.f
#> An OutCoe object [ elliptical Fourier analysis ]
#> --------------------
#> - $coe: 40 outlines described, 10 harmonics
#> # A tibble: 40 × 2
#> type fake
#> <fct> <fct>
#> 1 whisky a
#> 2 whisky a
#> 3 whisky a
#> 4 whisky a
#> 5 whisky a
#> 6 whisky a
#> # … with 34 more rows
# Mean shape per bottle type, stacked into one data frame with a Group label.
ms_ <- MSHAPES(bot.f, fac = "type")$shp
datams_ <- rbind(
  data.frame(ms_$beer, Group = "Beer"),
  data.frame(ms_$whisky, Group = "Whisky")
)
# Draw both mean outlines, distinguished by colour and line type.
ggplot(datams_) +
  theme_bw() +
  geom_path(aes(x, y, color = Group, linetype = Group), size = 0.5) +
  theme_void() +
  theme(
    legend.position = c(0.5, 0.6),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    axis.text = element_blank()
  ) +
  coord_equal() +
  theme(legend.key.size = unit(5, units = "mm"))
####Also, in this code, I would like different plot symbols (e.g. ∆ for Beer, O for Whisky) for each group.
# Build the PCA plot and add its layers in ONE pipeline.
# The original snippet had no `%>%` after plot_PCA(~type), so the layer_*()
# chain started as a separate expression and layer_ellipses() was called
# without its first argument. That is presumably what raised:
#   Error in is.factor(x$f): argument "x" is missing, with no default
# Piping plot_PCA() into the first layer supplies that argument.
pca.fourier <- bot.f %>%
  PCA() %>%
  plot_PCA(~type) %>%
  layer_ellipses(conf = 0.9, lwd = 1, alpha = 0) %>%
  layer_axes(lwd = 1) %>%
  layer_axesvar(cex = 1.5) %>%
  layer_ellipsesaxes(conf = 0.5, lwd = 1.5) %>%
  layer_grid(col = "#999999", lty = 3, grid = 3) %>%
  layer_stars(alpha = 0.8) %>%
  layer_points(cex = 1.3) %>%
  # layer_eigen(nb_max = 5, cex = 1) %>%
  layer_legend(cex = 1) %>%
  layer_title(title = "Study", cex = 1)
Created on 2022-07-12 by the reprex package (v2.0.1)

step_pca() arguments are not being applied

I'm new to tidymodels, but apparently step_pca() arguments such as num_comp or threshold are not being applied when the recipe is trained. As in the example below, I'm still getting 4 components despite setting num_comp = 2.
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
rec <- recipe( ~ ., data = USArrests) %>%
step_normalize(all_numeric()) %>%
step_pca(all_numeric(), num_comp = 2)
prep(rec) %>% tidy(number = 2, type = "coef") %>%
pivot_wider(names_from = component, values_from = value, id_cols = terms)
#> # A tibble: 4 x 5
#> terms PC1 PC2 PC3 PC4
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Murder -0.536 0.418 -0.341 0.649
#> 2 Assault -0.583 0.188 -0.268 -0.743
#> 3 UrbanPop -0.278 -0.873 -0.378 0.134
#> 4 Rape -0.543 -0.167 0.818 0.0890
The full PCA is determined (so you can still compute the variances of each term) and num_comp only specifies how many of the components are retained as predictors. If you want to specify the maximal rank, you can pass that through options:
library(recipes)
#> Loading required package: dplyr
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
#>
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stats':
#>
#> step
rec <- recipe( ~ ., data = USArrests) %>%
step_normalize(all_numeric()) %>%
step_pca(all_numeric(), num_comp = 2, options = list(rank. = 2))
prep(rec) %>% tidy(number = 2, type = "coef")
#> # A tibble: 8 × 4
#> terms value component id
#> <chr> <dbl> <chr> <chr>
#> 1 Murder -0.536 PC1 pca_AoFOm
#> 2 Assault -0.583 PC1 pca_AoFOm
#> 3 UrbanPop -0.278 PC1 pca_AoFOm
#> 4 Rape -0.543 PC1 pca_AoFOm
#> 5 Murder 0.418 PC2 pca_AoFOm
#> 6 Assault 0.188 PC2 pca_AoFOm
#> 7 UrbanPop -0.873 PC2 pca_AoFOm
#> 8 Rape -0.167 PC2 pca_AoFOm
Created on 2022-01-12 by the reprex package (v2.0.1)
You could also control this via the tol argument from stats::prcomp(), also passed in as an option.
If you bake the recipe it seems to work as intended but I don't know what you aim to achieve afterward.
library(tidyverse)
library(tidymodels)
USArrests <- USArrests %>%
rownames_to_column("Countries")
rec <-
recipe( ~ ., data = USArrests) %>%
step_normalize(all_numeric()) %>%
step_pca(all_numeric(), num_comp = 2)
prep(rec) %>%
bake(new_data = NULL)
#> # A tibble: 50 x 3
#> Countries PC1 PC2
#> <fct> <dbl> <dbl>
#> 1 Alabama -0.976 1.12
#> 2 Alaska -1.93 1.06
#> 3 Arizona -1.75 -0.738
#> 4 Arkansas 0.140 1.11
#> 5 California -2.50 -1.53
#> 6 Colorado -1.50 -0.978
#> 7 Connecticut 1.34 -1.08
#> 8 Delaware -0.0472 -0.322
#> 9 Florida -2.98 0.0388
#> 10 Georgia -1.62 1.27
#> # ... with 40 more rows
Created on 2022-01-11 by the reprex package (v2.0.1)

Not being able to execute stl decomposition properly

from this ts:
australia_data <- tourism %>%
select(Quarter, Trips) %>%
summarise(TotalTrips = sum(Trips))
> head(australia_data)
# A tsibble: 6 x 4 [1D]
# Key: Region, Purpose [1]
# Groups: Region [1]
Region Purpose Quarter TotalTrips
<chr> <chr> <date> <dbl>
1 Adelaide Business 1998-01-01 135.
2 Adelaide Business 1998-04-01 110.
3 Adelaide Business 1998-07-01 166.
4 Adelaide Business 1998-10-01 127.
5 Adelaide Business 1999-01-01 137.
6 Adelaide Business 1999-04-01 200.
I want to do a STL decomposition, in order to get seasonally adjusted data :
australia_data_dcmp <- australia_data %>%
model(STL(TotalTrips))
but I'm not able to get the components:
> components(australia_data_dcmp)
Error: Problem with `mutate()` column `cmp`.
i `cmp = map(.fit, components)`.
x no applicable method for 'components' applied to an object of class "null_mdl"
> head(augment(australia_data_dcmp))
# A tsibble: 6 x 8 [1D]
# Key: Region, Purpose, .model [1]
Region Purpose .model Quarter TotalTrips .fitted .resid .innov
<chr> <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl>
1 Adelaide Business STL(TotalTrips) 1998-01-01 135. NA NA NA
2 Adelaide Business STL(TotalTrips) 1998-04-01 110. NA NA NA
3 Adelaide Business STL(TotalTrips) 1998-07-01 166. NA NA NA
4 Adelaide Business STL(TotalTrips) 1998-10-01 127. NA NA NA
5 Adelaide Business STL(TotalTrips) 1999-01-01 137. NA NA NA
6 Adelaide Business STL(TotalTrips) 1999-04-01 200. NA NA NA
Can someone please explain the mistake I'm making?
Best regards
The tourism object you show is not what you get when using the latest versions of the various packages loaded by fpp3. This is what I get.
library(fpp3)
#> ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──
#> ✓ tibble 3.1.5 ✓ tsibble 1.1.0
#> ✓ dplyr 1.0.7 ✓ tsibbledata 0.3.0
#> ✓ tidyr 1.1.4 ✓ feasts 0.2.2
#> ✓ lubridate 1.8.0 ✓ fable 0.3.1
#> ✓ ggplot2 3.3.5
#> ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
#> x lubridate::date() masks base::date()
#> x dplyr::filter() masks stats::filter()
#> x tsibble::intersect() masks base::intersect()
#> x tsibble::interval() masks lubridate::interval()
#> x dplyr::lag() masks stats::lag()
#> x tsibble::setdiff() masks base::setdiff()
#> x tsibble::union() masks base::union()
australia_data <- tourism %>%
select(Quarter, Trips) %>%
summarise(TotalTrips = sum(Trips))
australia_data
#> # A tsibble: 80 x 2 [1Q]
#> Quarter TotalTrips
#> <qtr> <dbl>
#> 1 1998 Q1 23182.
#> 2 1998 Q2 20323.
#> 3 1998 Q3 19827.
#> 4 1998 Q4 20830.
#> 5 1999 Q1 22087.
#> 6 1999 Q2 21458.
#> 7 1999 Q3 19914.
#> 8 1999 Q4 20028.
#> 9 2000 Q1 22339.
#> 10 2000 Q2 19941.
#> # … with 70 more rows
Created on 2021-11-01 by the reprex package (v2.0.1)
Perhaps you are over-writing the tourism object with a grouped version. Or perhaps you are using an old version of the tsibble package where the keys were not dropped using summarise().
In any case, without a reproducible example it is hard to provide more substantial help.

Step_dummy. Dealing with duplicated column names generated by recipe() steps, Tidymodels

Dear community,
I have been struggling for an extensive amount of time now trying to understand what is going on here when I perform the recipe() steps for my linear (glm) model using the tidymodels framework. The recipe step step_dummy(all_nominal(), -all_outcomes()) was suggested by the usemodels package (https://usemodels.tidymodels.org/index.html).
When I comment out step_dummy(), recipe() and prep() work fine; however, it's important to me that these categorical variables are dummified (...is that a word!?).
This is the first time I am using and including a reprex in a question on Stack Overflow, so please let me know if you need more information to assist on this matter.
I have looked everywhere, e.g. including a one_hot = TRUE or keep_original_cols argument in the step_dummy() but it does not seem to be effective.
It should be quite easy, as it is a matter of giving the generated columns unique names, but I do not succeed. Here is the era.af_train set.
> era.af_train
# A tibble: 7,104 x 44
logRR ID AEZ16simple PrName.Code SubPrName.Code Product Country
<dbl> <dbl> <fct> <fct> <fct> <fct> <fct>
1 -0.851 1663 Warm.Semiar~ BP TW Pearl Mill~ Niger
2 -1.17 1663 Warm.Semiar~ BP/Mu Mu-N/TW Pearl Mill~ Niger
3 -0.314 1663 Warm.Semiar~ BP TW Pearl Mill~ Niger
4 -0.776 1663 Warm.Semiar~ BP TW Pearl Mill~ Niger
5 -0.0850 1675 Warm.Semiar~ AP TPM+N Pearl Mill~ Niger
6 -0.159 1689 Warm.Subhum~ Al/AP/BP Al+N/LF/TP/TPM~ Maize Togo
7 -0.579 1701 Warm.Semiar~ BP TW Fodder (Le~ Tunisia
8 -0.662 1729 Warm.Subhum~ Al Al-N/Al+N Cassava or~ Nigeria
9 -1.80 1802 Cool.Subhum~ Al/AP Al+N/TP Wheat Ethiop~
10 -1.74 1802 Cool.Subhum~ Al/AP Al+N/TP/TPI+N Wheat Ethiop~
# ... with 7,094 more rows, and 37 more variables: Latitude <dbl>,
# Longitude <dbl>, Site.Type <fct>, Tree <fct>, Bio01_MT_Anu.Mean <dbl>,
# Bio02_MDR.Mean <dbl>, Bio03_Iso.Mean <dbl>, Bio04_TS.Mean <dbl>,
# Bio05_TWM.Mean <dbl>, Bio06_MinTCM.Mean <dbl>, Bio07_TAR.Mean <dbl>,
# Bio08_MT_WetQ.Mean <dbl>, Bio09_MT_DryQ.Mean <dbl>,
# Bio10_MT_WarQ.Mean <dbl>, Bio11_MT_ColQ.Mean <dbl>,
# Bio12_Pecip_Anu.Mean <dbl>, Bio13_Precip_WetM.Mean <dbl>,
# Bio14_Precip_DryM.Mean <dbl>, Bio15_Precip_S.Mean <dbl>,
# Bio16_Precip_WetQ.Mean <dbl>, Bio17_Precip_DryQ.Mean <dbl>,
# Mean_log.n_tot_ncs <dbl>, Mean_log.ca_mehlich3 <dbl>,
# Mean_log.k_mehlich3 <dbl>, Mean_log.mg_mehlich3 <dbl>,
# Mean_log.p_mehlich3 <dbl>, Mean_log.s_mehlich3 <dbl>,
# Mean_log.fe_mehlich3 <dbl>, Mean_db_od <dbl>, Mean_bdr <dbl>,
# Mean_sand_tot_psa <dbl>, Mean_clay_tot_psa <dbl>, Mean_ph_h2o <dbl>,
# Mean_log.ecec.f <dbl>, Mean_log.c_tot <dbl>, Mean_log.oc <dbl>,
# Slope.mean <dbl>
I am including the columns ID, AEZ16simple, PrName.Code, SubPrName.Code, Product, Country, Latitude and Longitude as "ID variables", as I wish to compare the glm model later with a random forest model and a XGBoost model.
All help is welcome!
Have a good weekend and
thank you in advance.
library(reprex)
#> Warning: package 'reprex' was built under R version 4.0.5
library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.0.5
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(recipes)
#> Warning: package 'recipes' was built under R version 4.0.5
#>
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stats':
#>
#> step
library(readr)
#> Warning: package 'readr' was built under R version 4.0.5
setwd("C:/Users/lindh011/OneDrive - Wageningen University & Research/Internship ICRAF (ERA)/ERA_Agroforestry_WURwork")
era.af_train <- read_csv("era.af_train.csv")
#>
#> -- Column specification --------------------------------------------------------
#> cols(
#> .default = col_double(),
#> AEZ16simple = col_character(),
#> PrName.Code = col_character(),
#> SubPrName.Code = col_character(),
#> Product = col_character(),
#> Country = col_character(),
#> Site.Type = col_character(),
#> Tree = col_character()
#> )
#> i Use `spec()` for the full column specifications.
era.af_train_Tib <- as_tibble(era.af_train)
# Preprocessing recipe for the glmnet model: logRR is the outcome and every
# other column of era.af_train starts out as a predictor; the recipe is
# trained immediately by the final prep().
#
# NOTE(review): step_dummy() runs BEFORE update_role(), so the columns meant
# to be ID variables (e.g. SubPrName.Code) are still selected by
# all_nominal() and dummy-encoded. The duplicated names listed in the error
# below start with `SubPrName.Code_`, which suggests that distinct levels
# containing characters such as `+`, `-` and `/` collapse to the same
# sanitised column name. Assigning the "sample ID" role first and selecting
# with all_nominal_predictors() would presumably avoid this - TODO confirm.
glmnet_recipe <-
recipe(formula = logRR ~ ., data = era.af_train) %>%
step_novel(all_nominal(), -all_outcomes()) %>%
step_dummy(all_nominal(), -all_outcomes(), naming = dummy_names) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors(), -all_nominal()) %>%
# Roles are only reassigned here, after the dummy/zv/normalize steps above
# have already been specified using the original roles.
update_role(ID,
AEZ16simple,
PrName.Code,
SubPrName.Code,
Product,
Country,
Latitude,
Longitude,
new_role = "sample ID") %>%
step_impute_mode(all_nominal(), -all_outcomes()) %>%
step_impute_knn (all_numeric_predictors()) %>%
step_impute_knn(logRR) %>%
step_corr(all_numeric_predictors()) %>%
step_nzv(all_numeric_predictors()) %>%
prep()
#> Error: Column names `SubPrName.Code_AF.N.Al.N.TP`, `SubPrName.Code_AF.N.Al.N.TP.TPM`, `SubPrName.Code_Al.N.In.N`, `SubPrName.Code_Al.N.In.N`, `SubPrName.Code_Al.N`, and 33 more must not be duplicated.
#> Use .name_repair to specify repair.
Created on 2021-07-02 by the reprex package (v2.0.0)

Resources