I have this df
dx <- structure(list(a = c(0.916290731874155, 2.89037175789616, -0.156004248476581,
-0.318453731118534, -2.07944154167984, 2.00533356952611, -1.24319351747922,
0.42744401482694, 1.29532258291416, -2.03292152604494, -0.606135803570316,
-0.693147180559945), b = c(0.550046336919272, 0.228258651980981,
-0.577634293438101, 0.135801541159061, 0.644357016390513, -2.30258509299405,
-0.0870113769896297, 1.71297859137494, 0.17958557697508, -1.65140211153313,
1.31218638896617, 0.282862786015832), c = c(0.0988458346366325,
-3.34403896782221, 1.99243016469021, -1.70474809223843, 2.62103882411258,
2.20727491318972, -1.40242374304977, -1.256836293883, -2.16905370036952,
2.91777073208428, 0.138586163286146, -0.946143695023836), d = c(0.268263986594679,
-2.83321334405622, 1.83258146374831, 1.15057202759882, 0.0613689463762919,
-2.23359222150709, 4.34236137828145, -3.44854350225935, 1.29098418131557,
-0.356674943938732, -0.21868920096483, -0.810930216216329), e = c(1.65140211153313,
0.220400065368459, -0.044951387862266, 0.0773866636154201, -1.49877234454658,
1.36219680954083, -0.295845383090942, -0.709676482511156, -0.916290731874155,
1.65822807660353, 0.451985123743057, -0.810930216216329)), class = "data.frame", row.names = 2:13)
and I need to add a row of zeros to the data frame in a loop, because I need the zero row as the first row of each sequence of rows (1:4, 2:5, 3:6, etc.).
rs <- 4
sr <- 2
for (t in (rs+1:12-sr)){
  z <- 0
  R <- Map(`+`, list(t-rs:t-1), 0:z)
  for (r in seq(R)) {
    s_df <- rbind(0, dx[R[[r]],])
  }
}
but it returns this error:
Error in xj[i] : only 0's may be mixed with negative subscripts
The full loop I'm trying to get to work is
rs <- 4
sr <- 2
for (t in (rs+1:12-sr)){
  z <- 0
  R <- Map(`+`, list(t-rs:t-1), 0:z)
  cmin <- t(as.matrix(rep(NA, ncol(dx))))
  cdf_mat <- matrix(NA, length(R), ncol(dx))
  sq <- list()
  for (r in seq(R)) {
    for (f in seq(ncol(dx))) {
      s_df <- rbind(0, dx[R[[r]],])
      df_cum <- sapply(out, function(x) ((cumsum(x)) + 1))
      x <- df_cum[,f]
      y <- df_cum[,-f]
      dif_2 <- (x - y)^2
      cmin[f] <- which.min(colSums(dif_2))
      dif_3 <- as.matrix(dif_2[,cmin[f]])
      cdf_mat[r,f] <-
        if (f <= cmin[f]) {
          cmin[f] + 1
        } else {
          cmin[f]
        }
      sq <- c(sq, list(sqrt(dif_3)))
      sqmat <- do.call(cbind, sq)
      sd <- (colSums(sqmat))/t
    }
  }
}
Can you tell me why this happens and how I can fix it?
Thank you
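The error comes from operator precedence: in R, : binds tighter than binary -, so t-rs:t-1 is parsed as t - (rs:t) - 1. As t grows, that vector mixes positive and negative values, and subsetting a data frame with such a mix is exactly what triggers "only 0's may be mixed with negative subscripts". A quick sketch (rs = 4 as in your code; the corrected form is my assumption about what was intended):
t <- 6
rs <- 4
t - rs:t - 1      # parsed as t - (rs:t) - 1, gives 1 0 -1: positive mixed with negative
(t - rs):(t - 1)  # probably what was intended: 2 3 4 5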
What about this:
insert_z <- function(df){
  i <- 1L
  ldf <- list()
  z <- rep(0, ncol(df))            # the zero row to prepend
  while (i + 2 < nrow(df)) {       # one 4-row window per starting row i
    df1 <- df[i:(i + 3), ]
    ldf[[i]] <- rbind(z, df1)      # zero row on top of each slice
    i <- i + 1L
  }
  do.call(rbind, ldf)
}
insert_z(dx)
#> a b c d e
#> 1 0.0000000 0.00000000 0.00000000 0.00000000 0.00000000
#> 2 0.9162907 0.55004634 0.09884583 0.26826399 1.65140211
#> 3 2.8903718 0.22825865 -3.34403897 -2.83321334 0.22040007
#> 4 -0.1560042 -0.57763429 1.99243016 1.83258146 -0.04495139
#> 5 -0.3184537 0.13580154 -1.70474809 1.15057203 0.07738666
#> 14 0.0000000 0.00000000 0.00000000 0.00000000 0.00000000
#> 31 2.8903718 0.22825865 -3.34403897 -2.83321334 0.22040007
#> 41 -0.1560042 -0.57763429 1.99243016 1.83258146 -0.04495139
#> 51 -0.3184537 0.13580154 -1.70474809 1.15057203 0.07738666
#> 6 -2.0794415 0.64435702 2.62103882 0.06136895 -1.49877234
#> 15 0.0000000 0.00000000 0.00000000 0.00000000 0.00000000
#> 42 -0.1560042 -0.57763429 1.99243016 1.83258146 -0.04495139
#> 52 -0.3184537 0.13580154 -1.70474809 1.15057203 0.07738666
#> 61 -2.0794415 0.64435702 2.62103882 0.06136895 -1.49877234
#> 7 2.0053336 -2.30258509 2.20727491 -2.23359222 1.36219681
#> 16 0.0000000 0.00000000 0.00000000 0.00000000 0.00000000
#> 53 -0.3184537 0.13580154 -1.70474809 1.15057203 0.07738666
#> 62 -2.0794415 0.64435702 2.62103882 0.06136895 -1.49877234
#> 71 2.0053336 -2.30258509 2.20727491 -2.23359222 1.36219681
#> 8 -1.2431935 -0.08701138 -1.40242374 4.34236138 -0.29584538
#> 17 0.0000000 0.00000000 0.00000000 0.00000000 0.00000000
#> 63 -2.0794415 0.64435702 2.62103882 0.06136895 -1.49877234
#> 72 2.0053336 -2.30258509 2.20727491 -2.23359222 1.36219681
#> 81 -1.2431935 -0.08701138 -1.40242374 4.34236138 -0.29584538
#> 9 0.4274440 1.71297859 -1.25683629 -3.44854350 -0.70967648
#> 18 0.0000000 0.00000000 0.00000000 0.00000000 0.00000000
#> 73 2.0053336 -2.30258509 2.20727491 -2.23359222 1.36219681
#> 82 -1.2431935 -0.08701138 -1.40242374 4.34236138 -0.29584538
#> 91 0.4274440 1.71297859 -1.25683629 -3.44854350 -0.70967648
#> 10 1.2953226 0.17958558 -2.16905370 1.29098418 -0.91629073
#> 19 0.0000000 0.00000000 0.00000000 0.00000000 0.00000000
#> 83 -1.2431935 -0.08701138 -1.40242374 4.34236138 -0.29584538
#> 92 0.4274440 1.71297859 -1.25683629 -3.44854350 -0.70967648
#> 101 1.2953226 0.17958558 -2.16905370 1.29098418 -0.91629073
#> 11 -2.0329215 -1.65140211 2.91777073 -0.35667494 1.65822808
#> 110 0.0000000 0.00000000 0.00000000 0.00000000 0.00000000
#> 93 0.4274440 1.71297859 -1.25683629 -3.44854350 -0.70967648
#> 102 1.2953226 0.17958558 -2.16905370 1.29098418 -0.91629073
#> 111 -2.0329215 -1.65140211 2.91777073 -0.35667494 1.65822808
#> 12 -0.6061358 1.31218639 0.13858616 -0.21868920 0.45198512
#> 112 0.0000000 0.00000000 0.00000000 0.00000000 0.00000000
#> 103 1.2953226 0.17958558 -2.16905370 1.29098418 -0.91629073
#> 113 -2.0329215 -1.65140211 2.91777073 -0.35667494 1.65822808
#> 121 -0.6061358 1.31218639 0.13858616 -0.21868920 0.45198512
#> 13 -0.6931472 0.28286279 -0.94614370 -0.81093022 -0.81093022
Created on 2022-06-04 by the reprex package (v2.0.1)
We may do this easily with the slider package:
library(slider)
# keep only the full-length windows from slide(), then prepend a zero row to each
out <- lapply(Filter(\(x) length(x) == rs, slide(seq_len(nrow(dx)),
  .after = rs - 1, .f = I)), \(x) rbind(0, dx[x, ]))
Output:
> head(out, 3)
[[1]]
a b c d e
1 0.0000000 0.0000000 0.00000000 0.000000 0.00000000
2 0.9162907 0.5500463 0.09884583 0.268264 1.65140211
3 2.8903718 0.2282587 -3.34403897 -2.833213 0.22040007
4 -0.1560042 -0.5776343 1.99243016 1.832581 -0.04495139
5 -0.3184537 0.1358015 -1.70474809 1.150572 0.07738666
[[2]]
a b c d e
1 0.0000000 0.0000000 0.000000 0.00000000 0.00000000
3 2.8903718 0.2282587 -3.344039 -2.83321334 0.22040007
4 -0.1560042 -0.5776343 1.992430 1.83258146 -0.04495139
5 -0.3184537 0.1358015 -1.704748 1.15057203 0.07738666
6 -2.0794415 0.6443570 2.621039 0.06136895 -1.49877234
[[3]]
a b c d e
1 0.0000000 0.0000000 0.000000 0.00000000 0.00000000
4 -0.1560042 -0.5776343 1.992430 1.83258146 -0.04495139
5 -0.3184537 0.1358015 -1.704748 1.15057203 0.07738666
6 -2.0794415 0.6443570 2.621039 0.06136895 -1.49877234
7 2.0053336 -2.3025851 2.207275 -2.23359222 1.36219681
If we want a single dataset, rbind the list elements:
out2 <- do.call(rbind, out)
I would like to display a boxplot with a dot for each of my data points.
Here is a downsampled extract of my data:
value value1 value2 value3 value4 value5 value6 value7 value8 value9 value10 value11 value12 value13 value14 value15 value16 value17 value18 value19 value20 value21 value22 value23 value24 value25 value26 value27 value28 value29 value30 value31 value32 value33 value34 value35 value36 value37 value38 value39 value40 value41 value42 value43 value44 value45 value46 value47 value48 value49 value50 value51 value52 value53 value54 value55 value56 value57 value58 value59 value60 value61 value62 value63 value64 value65 value66 value67 value68 value69 value70 value71 value72 value73 value74 value75 value76 value77 value78 value79 value80 value81 value82 value83 value84 value85 value86 value87 value88 value89 value90 value91 value92 value93
1 DLBCL 1994.95631 2621.3410 753.2132 0.000000 11197.10111 0.000000 176.337991 2000.983371 862.402989 8491.35251 0.000000 0.000000 0.000000 0.000000 0.000000 1293.604484 431.201495 11022.058175 6899.22391 1557.191604 0.00000 0.0000000 491.33939 0.00000 935.4880 473.089640 117093.3704 267.06673 0.000000 1201.315893 546.473181 817.685797 5550.213652 5864.340327 0.000000 756.0793 1186.963254 0.000000 0.000000 182.35834 0.000000 0.000000 2.221214e+04 546.4731813 0.000000 22467.36115 25197.16560 4527.61569 47851.49797 0.0000000 809.029514 1780.444881 466.4264055 2854.851275 2178.702289 0.000000 1155.2188880 0.000000 0.000000 0.000000 0.0000000 325.947587 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0000000 0.000000 5219.72808 0.000000 1092.946363 1914.235537 0.00000 41395.343 5012.19294 0.0000 0.00000 0.000000 0.00000 211214.036 771.94114 5792.9344 155407.942 586.647915 904.81625 5221.03431 26527.2485 118750.28 103149.05
2 HL 2685.55082 3282.5779 4598.1600 4183.367213 1465.89302 0.000000 66.245848 0.000000 161.991801 61.34601 161.991801 0.000000 485.975403 404.979503 80.995901 80.995901 161.991801 6164.020846 4211.78683 17549.958130 2601.72383 1143.4715367 1292.08891 2101.51526 8785.9960 157.980575 25628.0113 2257.43413 426.060627 3572.830049 410.593080 11519.416962 23630.893343 47042.419019 2594.830952 5964.8488 3901.738003 0.000000 0.000000 376.79150 0.000000 833.100691 1.251683e+05 3797.9859885 4500.351000 231.24480 901.51959 8990.54496 21686.09505 0.0000000 50.655417 0.000000 5081.5230881 766.069601 8594.091339 4754.510950 578.6497823 0.000000 0.000000 540.128957 5906.6921396 1897.982677 0.000000 0.000000 0.00000 517.142472 0.000000 90.021493 0.000000 0.000000 395.929041 51.1553056 0.000000 5501.47987 569.641498 1180.455105 1258.479657 0.00000 31700.549 8406.06103 650.9810 198.52612 1888.006678 183.67574 130532.228 108.74974 3400.4110 58514.733 4600.624542 1019.75167 0.00000 20734.9505 163994.61 181005.92
3 HL 3937.68099 5174.0505 14309.5447 17201.448539 6027.55676 0.000000 1566.266081 246.848582 9575.025066 966.94533 5745.015039 5106.680035 5745.015039 8298.355057 5745.015039 8936.690061 3830.010026 2595.831304 0.00000 3842.016327 932.01765 0.0000000 0.00000 0.00000 12463.7614 2256.666225 105760.7753 165061.07726 2014.690206 296.397390 808.979015 0.000000 684.694530 0.000000 1120.551505 47009.4381 0.000000 0.000000 0.000000 809.86996 0.000000 6565.731474 1.992851e+03 2831.4265541 0.000000 911.22915 0.00000 0.00000 0.00000 0.0000000 0.000000 0.000000 345.2403404 1811.236269 0.000000 1561.277973 0.0000000 0.000000 736.098023 3192.598806 0.0000000 0.000000 0.000000 0.000000 0.00000 9897.983156 0.000000 3015.232206 0.000000 1210.472305 3120.347631 2015.7947507 0.000000 89720.16482 0.000000 0.000000 0.000000 984.42025 23569.292 794.98586 570.0480 0.00000 0.000000 482.52095 42461.843 571.37679 3573.1872 25446.846 1519.791401 0.00000 0.00000 57004.8004 153509.90 112514.3
and here is my code:
data2=read.table("/../data.txt",sep="\t",header=TRUE )
data2 %>%
ggplot( aes(x=name, y=value, value1, value2, value3, value4, value5, value6, value7, value8, value9, value10, value11, value12, value13, value14, value15, value16, value17, value18, value19, value20, value21, value22, value23, value24, value25, value26, value27, value28, value29, value30, value31, value32, value33, value34, value35, value36, value37, value38, value39, value40, value41, value42, value43, value44, value45, value46, value47, value48, value49, value50, value51, value52, value53, value54, value55, value56, value57, value58, value59, value60, value61, value62, value63, value64, value65, value66, value67, value68, value69, value70, value71, value72, value73, value74, value75, value76, value77, value78, value79, value80, value81, value82, value83, value84, value85, value86, value87, value88, value89, value90, value91, value92, value93, fill=name)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha=0.6) +
geom_jitter(color="black", size=0.4, alpha=0.9) +
theme_ipsum() +
theme(
legend.position="none",
plot.title = element_text(size=11)
) +
ggtitle("Distribution of ... ") +
xlab("")
I got a plot but not all of my data appeared. I suspect only the first column (value) is taken into account.
What did I miss? Does anyone know a trick to get all my dots?
Thanks a lot!
You can try reshaping the data to long format; ggplot maps a single column to y, so the 93 value columns need to be stacked into one column first:
library(ggplot2)
library(dplyr)
library(tidyr)
#Code
data2 %>%
  rename(key = value) %>%          # free the name "value" for pivot_longer
  pivot_longer(-key) %>%           # stack value1..value93 into name/value pairs
  ggplot(aes(x = key, y = value, fill = name)) +
  geom_boxplot() +
  # scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  # theme_ipsum() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  ggtitle("Distribution of total EBV gene expression for each PTCL subtype") +
  xlab("")
Output: (plot not shown)
Goal
Create a LASSO model using mlr3.
Use nested CV, with inner CV or bootstraps for hyperparameter (lambda) determination and outer CV for model performance evaluation (instead of doing just one train-test split), and find the standard deviation of the different LASSO regression coefficients amongst the different model instances.
Do a prediction on a testing data set that is not available yet.
Issues
I am unsure whether the nested CV approach as described is implemented correctly in my code below.
I am unsure whether alpha is set correctly (alpha = 1 only).
I do not know how to access the LASSO lambda coefficients when using resampling in mlr3. (importance() in mlr3learners does not yet support LASSO.)
I don't know how to apply a possible model to the unavailable testing set in mlr3.
Code
library(readr)
library(mlr3)
library(mlr3learners)
library(mlr3pipelines)
library(reprex)
# Data ------
# Prepared according to the Blog post by Julia Silge
# https://juliasilge.com/blog/lasso-the-office/
urlfile = 'https://raw.githubusercontent.com/shudras/office_data/master/office_data.csv'
data = read_csv(url(urlfile))[-1]
#> Warning: Missing column names filled in: 'X1' [1]
#> Parsed with column specification:
#> cols(
#> .default = col_double()
#> )
#> See spec(...) for full column specifications.
# Add a factor to data
data$factor = as.factor(c(rep('a', 20), rep('b', 50), rep('c', 30), rep('a', 6), rep('c', 10), rep('b', 20)))
# Task creation
task =
TaskRegr$new(
id = 'office',
backend = data,
target = 'imdb_rating'
)
# Model creation
graph =
po('scale') %>>%
po('encode') %>>% # make factors numeric
# How to normalize predictors, leaving target unchanged?
lrn('regr.cv_glmnet', # 10-fold CV for inner loop. Is alpha permanently set to 1?
id = 'rp', alpha = 1, family = 'gaussian'
)
graph_learner = GraphLearner$new(graph)
# Execution (actual modeling)
result =
resample(
task,
graph_learner,
rsmp('cv', folds = 5) # 5-fold for outer CV
)
#> INFO [13:21:53.485] Applying learner 'scale.encode.regr.cv_glmnet' on task 'office' (iter 3/5)
#> INFO [13:21:54.937] Applying learner 'scale.encode.regr.cv_glmnet' on task 'office' (iter 2/5)
#> INFO [13:21:55.242] Applying learner 'scale.encode.regr.cv_glmnet' on task 'office' (iter 1/5)
#> INFO [13:21:55.500] Applying learner 'scale.encode.regr.cv_glmnet' on task 'office' (iter 4/5)
#> INFO [13:21:55.831] Applying learner 'scale.encode.regr.cv_glmnet' on task 'office' (iter 5/5)
# How to access results, i.e. lamda coefficients,
# and compare them (why no variable importance for glmnet)
# Access prediction
result$prediction()
#> <PredictionRegr> for 136 observations:
#> row_id truth response
#> 2 8.3 8.373798
#> 6 8.7 8.455151
#> 9 8.4 8.358964
#> ---
#> 116 9.7 8.457607
#> 119 8.2 8.130352
#> 128 7.8 8.224150
Created on 2020-06-11 by the reprex package (v0.3.0)
Edit 1 (LASSO coefficients)
According to a comment from missuse, LASSO coefficients can be accessed through result$data$learner[[1]]$model$rp$model$glmnet.fit$beta. Additionally, I found that store_models = TRUE needs to be set in the resample() call so that the models are stored and the coefficients can be accessed.
Despite setting alpha = 1, I obtained multiple sets of LASSO coefficients. I would like the 'best' LASSO coefficients (stemming e.g. from lambda = lambda.min or lambda.1se). What do the different s1, s2, s3, ... mean? Are these different lambdas?
The different coefficients indeed seem to stem from different lambda values, denoted s1, s2, s3, ... (the number is an index into the lambda sequence). I suppose the 'best' coefficients can be accessed by first finding the index of the 'best' lambda, index_lambda.1se = which(ft$lambda == ft$lambda.1se)[[1]] or index_lambda.min = which(ft$lambda == ft$lambda.min)[[1]], and then picking the corresponding set of coefficients. A more concise approach to find the 'best' coefficients is given in the comments by missuse.
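For reference, a sketch of both routes, assuming ft is one stored cv.glmnet fit (accessor path as above; coef() is glmnet's own method and accepts the lambda choice directly):
ft <- result$data$learner[[1]]$model$rp$model      # cv.glmnet fit of outer fold 1
# Route 1: via the index of the chosen lambda, as described above
index_lambda.1se <- which(ft$lambda == ft$lambda.1se)[[1]]
beta_1se <- ft$glmnet.fit$beta[, index_lambda.1se]
# Route 2: let coef() pick the column; returns a sparse matrix incl. the intercept
beta_1se_alt <- coef(ft, s = "lambda.1se")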
library(readr)
library(mlr3)
library(mlr3learners)
library(mlr3pipelines)
library(reprex)
urlfile = 'https://raw.githubusercontent.com/shudras/office_data/master/office_data.csv'
data = read_csv(url(urlfile))[-1]
# Add a factor to data
data$factor = as.factor(c(rep('a', 20), rep('b', 50), rep('c', 30), rep('a', 6), rep('c', 10), rep('b', 20)))
# Task creation
task =
TaskRegr$new(
id = 'office',
backend = data,
target = 'imdb_rating'
)
# Model creation
graph =
po('scale') %>>%
po('encode') %>>% # make factors numeric
# How to normalize predictors, leaving target unchanged?
lrn('regr.cv_glmnet', # 10-fold CV for inner loop. Is alpha permanently set to 1?
id = 'rp', alpha = 1, family = 'gaussian'
)
graph$keep_results = TRUE
graph_learner = GraphLearner$new(graph)
# Execution (actual modeling)
result =
resample(
task,
graph_learner,
rsmp('cv', folds = 5), # 5-fold for outer CV
store_models = TRUE # store the models so the coefficients can be accessed
)
# LASSO coefficients
# Why more than one coefficient per predictor?
# What are s1, s2 etc.? Shouldn't 'lrn' fix alpha = 1?
# How to obtain the best coefficients (for lambda.1se or lambda.min) if multiple?
as.matrix(result$data$learner[[1]]$model$rp$model$glmnet.fit$beta)
#> s0 s1 s2 s3 s4 s5
#> andy 0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> angela 0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> b_j_novak 0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> brent_forrester 0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> darryl 0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> dwight 0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> episode 0 0.000000000 0.00000000 0.00000000 0.010297763 0.02170423
#> erin 0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> gene_stupnitsky 0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> greg_daniels 0 0.000000000 0.00000000 0.00000000 0.001845101 0.01309437
#> jan 0 0.000000000 0.00000000 0.00000000 0.005663699 0.01357832
#> jeffrey_blitz 0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> jennifer_celotta 0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> jim 0 0.006331732 0.01761548 0.02789682 0.036853510 0.04590513
#> justin_spitzer 0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> [...]
#> s6 s7 s8 s9 s10
#> andy 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> angela 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> b_j_novak 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> brent_forrester 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> darryl 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> dwight 0.002554576 0.007006995 0.011336058 0.01526851 0.01887180
#> episode 0.031963475 0.040864492 0.047487987 0.05356482 0.05910066
#> erin 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> gene_stupnitsky 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> greg_daniels 0.023040791 0.031866343 0.040170917 0.04779004 0.05472702
#> jan 0.021030152 0.028094541 0.035062678 0.04143812 0.04725379
#> jeffrey_blitz 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> jennifer_celotta 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> jim 0.053013058 0.058503984 0.062897112 0.06683734 0.07041964
#> justin_spitzer 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> kelly 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> ken_kwapis 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> kevin 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> lee_eisenberg 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> michael 0.057190859 0.062963830 0.068766981 0.07394472 0.07865977
#> mindy_kaling 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> oscar 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> pam 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> paul_feig 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> paul_lieberstein 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> phyllis 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> randall_einhorn 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> ryan 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> season 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> toby 0.000000000 0.000000000 0.005637169 0.01202893 0.01785309
#> factor.a 0.000000000 -0.003390125 -0.022365768 -0.03947047 -0.05505681
#> factor.b 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> factor.c 0.000000000 0.000000000 0.000000000 0.00000000 0.00000000
#> s11 s12 s13 s14
#> andy 0.000000000 0.000000000 0.000000000 0.0000000000
#> angela 0.000000000 0.000000000 0.000000000 0.0000000000
#> b_j_novak 0.000000000 0.000000000 0.000000000 0.0000000000
#> brent_forrester 0.000000000 0.000000000 0.000000000 0.0000000000
#> darryl 0.000000000 0.000000000 0.000000000 0.0017042281
#> dwight 0.022170870 0.025326337 0.027880703 0.0303865693
#> episode 0.064126846 0.069018240 0.074399623 0.0794693480
#> [...]
Created on 2020-06-15 by the reprex package (v0.3.0)
Edit 2 (optional follow-up question)
Nested CV provides an evaluation of the discrepancy amongst multiple models. The discrepancy can be expressed as an error (e.g. RMSE) obtained by the outer CV. While that error may be small, the individual LASSO coefficients (importance of predictors) from the models (instantiated by the outer CV) may vary considerably.
Does mlr3 provide functionality describing the consistency in quantitative importance of the predictor variables, i.e. the RMSE of the LASSO coefficients amongst the models created by the outer CV? Or should a custom function be created, retrieving the LASSO coefficients with result$data$learner[[i]]$model$rp$model$glmnet.fit$beta (suggested by missuse), with i = 1, 2, 3, 4, 5 being the folds of the outer CV, and then taking the RMSE of the matching coefficients?
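To my knowledge mlr3 has no built-in summary of coefficient consistency across resampling iterations, so a custom function along the suggested lines seems reasonable. A minimal sketch, assuming store_models = TRUE and the accessor path above (using sd() per predictor, matching the standard-deviation goal stated at the top):
coef_per_fold <- sapply(result$data$learner, function(l) {
  ft <- l$model$rp$model                    # cv.glmnet fit of one outer fold
  as.numeric(coef(ft, s = "lambda.min"))    # coefficients at lambda.min (incl. intercept)
})
rownames(coef_per_fold) <- rownames(coef(result$data$learner[[1]]$model$rp$model))
apply(coef_per_fold, 1, sd)                 # per-predictor spread across the 5 outer folds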
I have an xts object of monthly returns across multiple columns, and I'm trying to calculate rolling annual (geometric) returns for each column.
Date Manager 1 Manager 2 Manager 3 Manager 4 Manager 5
20160430 0.0152000 0.0100700 0.0102210 0.0046160 NA
20160531 0.0462000 0.0515240 0.0287490 0.0374920 NA
20160630 0.0007000 0.0126830 0.0156410 0.0130820 NA
20160731 0.0200000 0.0158810 0.0239540 0.0214950 NA
20160831 0.0339000 0.0531980 0.0021170 0.0476160 0.0457650
20160930 -0.0071000 0.0047540 -0.0088080 0.0031540 -0.0034070
20161031 -0.0224000 -0.0181930 0.0181410 -0.0048280 0.0170850
20161130 -0.0439000 -0.0131600 -0.0243030 -0.0064650 -0.0007180
20161231 -0.0051000 0.0200130 0.0204210 0.0160740 0.0172270
20170131 0.0083000 0.0146560 0.0247000 0.0203410 0.0227060
20170228 0.0211000 -0.0067120 0.0257530 0.0029940 0.0124730
20170331 0.0530000 0.0532190 0.0283950 0.0416190 0.0237900
20170430 0.0638300 0.0592280 0.0341340 0.0437430 0.0293500
20170531 0.0339000 0.0264270 0.0287670 0.0207810 0.0179080
20170630 NA -0.0046950 -0.0091310 -0.0074520 -0.0137600
20170731 NA 0.0109280 0.0029630 0.0146560 0.0167990
20170831 NA 0.0290430 0.0372960 0.0284390 0.0229930
20170930 NA 0.0226390 0.0030190 0.0063850 -0.0087170
Expected results:
Date Manager 1 Manager 2 Manager 3 Manager 4 Manager 5
20160430
20160531
20160630
20160731
20160831
20160930
20161031
20161130
20161231
20170131
20170228
20170331 0.121979182 0.212964432 0.176317288 0.213932804
20170430 0.175724107 0.271996881 0.204161963 0.261212111
20170531 0.161901314 0.241637796 0.204183032 0.240897626
20170630 0.220330851 0.174812396 0.215746067
20170731 0.214381041 0.150728807 0.207606539 0.200188843
20170831 0.186529323 0.191124778 0.185500853 0.174054195
20170930 0.207649992 0.205337395 0.189319163 0.167798654
I've been using the PerformanceAnalytics package, but having some trouble applying the function across each column:
apply.rolling(ManagerReturns, width = 12, trim = FALSE ,FUN = Return.annualized)
apply.rolling is a wrapper around rollapply. For some reason apply.rolling doesn't work correctly with your data, but using rollapply will solve the issue.
Using rollapply I can get close to your expected outcome, with one caveat: Return.annualized drops the NA values but continues to calculate. You can see this happening with Manager1 and Manager5. This is not because of rollapply, but because of Return.annualized. For example, Return.annualized(my_data$Manager5[1:12]) returns an annualized return of 0.2207884 even though that window contains NAs.
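That value comes from annualizing only the 8 non-NA months in the window; a quick sketch of the geometric formula Return.annualized effectively applies there:
x <- as.numeric(na.omit(my_data$Manager5[1:12]))   # the 8 non-NA monthly returns
prod(1 + x)^(12 / length(x)) - 1                   # 0.2207884, same as Return.annualized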
ra <- rollapply(my_data, width = 12, FUN = Return.annualized, fill = 0)
Manager1 Manager2 Manager3 Manager4 Manager5
2016-04-30 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
2016-05-31 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
2016-06-30 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
2016-07-31 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
2016-08-31 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
2016-09-30 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
2016-10-31 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
2016-11-30 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
2016-12-31 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
2017-01-31 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
2017-02-28 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
2017-03-31 0.1219792 0.2129644 0.1763173 0.2139328 0.2207884
2017-04-30 0.1757241 0.2719969 0.2041620 0.2612121 0.2409790
2017-05-31 0.1619013 0.2416378 0.2041830 0.2408976 0.2406184
2017-06-30 0.1769613 0.2203309 0.1748124 0.2157461 0.1982881
2017-07-31 0.1682027 0.2143810 0.1507288 0.2076065 0.2001888
2017-08-31 0.1368823 0.1865293 0.1911248 0.1855009 0.1740542
2017-09-30 0.1676742 0.2076500 0.2053374 0.1893192 0.1677987
Now you could do something like ra * !is.na(my_data), which multiplies ra by 0 wherever the original data has an NA and so removes the last 4 records of Manager1. But it will not help with Manager5.
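If you want to blank every window that was computed from incomplete data (which also covers Manager5's early windows), one possible sketch, assuming the same trailing 12-month windows as above and untested beyond the logic shown:
# flag, per column, the 12-month windows with no missing values
complete12 <- rollapply(my_data, width = 12,
                        FUN = function(x) all(!is.na(x)), fill = FALSE)
ra[coredata(complete12) == 0] <- NA   # drop annualized returns from incomplete windows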
data:
my_data <- structure(c(0.0152, 0.0462, 7e-04, 0.02, 0.0339, -0.0071, -0.0224,
-0.0439, -0.0051, 0.0083, 0.0211, 0.053, 0.06383, 0.0339, NA,
NA, NA, NA, 0.01007, 0.051524, 0.012683, 0.015881, 0.053198,
0.004754, -0.018193, -0.01316, 0.020013, 0.014656, -0.006712,
0.053219, 0.059228, 0.026427, -0.004695, 0.010928, 0.029043,
0.022639, 0.010221, 0.028749, 0.015641, 0.023954, 0.002117, -0.008808,
0.018141, -0.024303, 0.020421, 0.0247, 0.025753, 0.028395, 0.034134,
0.028767, -0.009131, 0.002963, 0.037296, 0.003019, 0.004616,
0.037492, 0.013082, 0.021495, 0.047616, 0.003154, -0.004828,
-0.006465, 0.016074, 0.020341, 0.002994, 0.041619, 0.043743,
0.020781, -0.007452, 0.014656, 0.028439, 0.006385, NA, NA, NA,
NA, 0.045765, -0.003407, 0.017085, -0.000718, 0.017227, 0.022706,
0.012473, 0.02379, 0.02935, 0.017908, -0.01376, 0.016799, 0.022993,
-0.008717), .Dim = c(18L, 5L), .Dimnames = list(NULL, c("Manager1",
"Manager2", "Manager3", "Manager4", "Manager5")), index = structure(c(1461974400,
1464652800, 1467244800, 1469923200, 1472601600, 1475193600, 1477872000,
1480464000, 1483142400, 1485820800, 1488240000, 1490918400, 1493510400,
1496188800, 1498780800, 1501459200, 1504137600, 1506729600), tzone = "UTC", tclass = "Date"), class = c("xts",
"zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC")
I am having some trouble with a cluster analysis that I am trying to do with the pvclust package.
Specifically, I have a data matrix composed of species (rows) and sampling stations (columns). I want to perform a cluster analysis in order to group my sampling stations according to my species abundances (which I have previously log(x+1)-transformed).
Once I had prepared my matrix adequately, I tried to run the analysis with the pvclust package, using Ward's clustering method and Bray-Curtis as the distance index. However, every time I get the following error message:
Error in hclust(distance, method = method.hclust) :
  invalid clustering method
I then tried to perform the same analysis using another clustering method, and I had no problem. I also tried the same analysis using the hclust function from the vegan package, and there was no problem there either; the analysis ran without any issues.
To better understand my problem, I'll display part of my matrix and the script that I used to perform the analysis:
P1 P2 P3 P4 P5 P6
1 10.8750000 3.2888889 2.0769231 1.4166667 3.2395833 5.333333
3 0.3645833 0.3027778 0.3212038 0.7671958 0.4993676 0.000000
4 0.0000000 0.0000000 2.3500000 0.0000000 0.0000000 0.264000
5 0.0000000 0.7333333 0.2692308 0.0000000 0.2343750 0.000000
6 0.0000000 0.9277778 0.0000000 0.2936508 0.7291667 0.000000
7 0.4166667 6.3500000 1.0925463 0.5476190 0.1885169 0.000000
8 1.6250000 0.0000000 0.0000000 0.0000000 5.2187500 0.000000
9 0.0000000 0.8111111 0.0000000 0.0000000 0.0000000 0.000000
10 2.6770833 0.6666667 2.3304890 4.5906085 2.9652778 0.000000
15 1.8020833 0.9666667 1.4807137 3.3878968 0.1666667 0.000000
16 17.8750000 4.9555556 1.4615385 6.5000000 7.8593750 7.666667
19 4.5312500 1.0555556 3.5766941 6.7248677 2.3196181 0.000000
20 0.0000000 0.6777778 0.5384615 0.0000000 0.0000000 0.000000
21 0.0000000 0.9777778 0.0000000 0.2500000 0.0000000 0.000000
24 1.2500000 3.0583333 0.1923077 0.0000000 4.9583333 0.000000
25 0.0000000 0.0000000 2.5699634 0.0000000 0.0000000 0.000000
26 6.6666667 2.2333333 24.8730020 55.9980159 17.6239583 0.000000
Here P1-P6 are my sampling stations, and the leftmost row numbers are my different species. I'll denote this example matrix simply as "platforms".
Afterwards, I used the following lines of code:
dist <- function(x, ...){
  vegdist(x, ...)
}
result <- pvclust(platforms, method.dist = "bray", method.hclust = "ward")
Note that I ran the first three lines (redefining dist) because the Bray-Curtis index isn't natively available in the pvclust package; running them was meant to let me specify the Bray-Curtis index in the pvclust function.
Does anyone know why it doesn't work with the pvclust package?
Any help will be much appreciated.
Kind regards,
Marie
There are two related issues:
When calling method.hclust you need to pass hclust-compatible method names. In theory pvclust checks for ward and converts it to ward.D, but you probably want to pass the (correct) name of either ward.D or ward.D2.
You cannot overwrite dist in that fashion. However, you can pass a custom distance function to pvclust.
For instance, this should work:
library(vegan)
library(pvclust)
sample.data <- "P1 P2 P3 P4 P5 P6
10.8750000 3.2888889 2.0769231 1.4166667 3.2395833 5.3333330
0.3645833 0.3027778 0.3212038 0.7671958 0.4993676 0.0000000
0.0000000 0.0000000 2.3500000 0.0000000 0.0000000 0.2640000
0.0000000 0.7333333 0.2692308 0.0000000 0.2343750 0.0000000
0.0000000 0.9277778 0.0000000 0.2936508 0.7291667 0.0000000
0.4166667 6.3500000 1.0925463 0.5476190 0.1885169 0.0000000
1.6250000 0.0000000 0.0000000 0.0000000 5.2187500 0.0000000
0.0000000 0.8111111 0.0000000 0.0000000 0.0000000 0.0000000
2.6770833 0.6666667 2.3304890 4.5906085 2.9652778 0.0000000
1.8020833 0.9666667 1.4807137 3.3878968 0.1666667 0.0000000
17.8750000 4.9555556 1.4615385 6.5000000 7.8593750 7.6666670
4.5312500 1.0555556 3.5766941 6.7248677 2.3196181 0.0000000
0.0000000 0.6777778 0.5384615 0.0000000 0.0000000 0.0000000
0.0000000 0.9777778 0.0000000 0.2500000 0.0000000 0.0000000
1.2500000 3.0583333 0.1923077 0.0000000 4.9583333 0.0000000
0.0000000 0.0000000 2.5699634 0.0000000 0.0000000 0.0000000
6.6666667 2.2333333 24.8730020 55.9980159 17.6239583 0.0000000"
platforms <- read.table(text = sample.data, header = TRUE)
result <- pvclust(platforms,
                  method.dist = function(x){
                    vegdist(x, "bray")
                  },
                  method.hclust = "ward.D")