Related
Since last version, mlr3tuning package supports (custom) instantiated resampling in AutoTuner class: https://github.com/mlr-org/mlr3tuning/releases/tag/v0.17.2
I have tried to construct rolling window CV's with custom resmapling as in the following link https://towardsdatascience.com/time-series-nested-cross-validation-76adba623eb9 (figure 1).
I want to tune hyperparameters on (let's say )row ids 1:1000 (for example 1:800 train and 801:100 test). and then I would like to evaluate the model on test set, say 1001:1100.
Her is my try:
library(mlr3)
library(mlr3tuning)
library(mlr3pipelines)
library(mlr3learners)
# task
task = tsk("iris")
task_ = task$clone()
data_ = task_$data()
data_ = cbind(data_, monthid = c(rep(1, 30), rep(2, 30), rep(3, 30), rep(4, 30), rep(5, 30)))
task = as_task_classif(data_, target = "Species")
# inner custom rolling window resampling
custom = rsmp("custom")
task_ <- task$clone()
task_$set_col_roles("monthid", "group")
groups = task_$groups
rm(task_)
groups_v <- groups[, unique(group)]
train_length <- 2
test_length <- 1
train_groups <- lapply(0:(length(groups_v)-(train_length+1)), function(x) x + (1:train_length))
test_groups <- lapply(train_groups, function(x) tail(x, 1) + test_length)
train_sets <- lapply(train_groups, function(x) groups[group %in% groups_v[x], row_id])
test_sets <- lapply(test_groups, function(x) groups[group %in% groups_v[x], row_id])
custom$instantiate(task, train_sets, test_sets)
# outer custom rolling window resampling
customo = rsmp("custom")
task_ <- task$clone()
task_$set_col_roles("monthid", "group")
groups = task_$groups
rm(task_)
groups_v <- groups[, unique(group)]
train_length_out <- train_length + test_length
test_length_out <- 1
train_groups_out <- lapply(0:(length(groups_v)-(train_length_out+1)), function(x) x + (1:train_length_out))
test_groups_out <- lapply(train_groups_out, function(x) tail(x, 1) + test_length_out)
train_sets_out <- lapply(train_groups_out, function(x) groups[group %in% groups_v[x], row_id])
test_sets_out <- lapply(test_groups_out, function(x) groups[group %in% groups_v[x], row_id])
customo$instantiate(task, train_sets_out, test_sets_out)
# inspect custom cv's
custom$train_set(1)
custom$test_set(1)
(max(custom$train_set(1)) + 1) == head(custom$test_set(1), 1) # test set starts after train set
customo$train_set(1)
customo$test_set(1)
(max(customo$train_set(1)) + 1) == head(customo$test_set(1), 1) # test set starts after train set
all(c(custom$train_set(1), custom$test_set(1)) %in% customo$train_set(1)) # first outer set contains all inner sets
length(intersect(customo$test_set(1), c(custom$train_set(1), custom$test_set(1)))) == 0
# costruct graph
graph = po("removeconstants", id = "removeconstants_1", ratio = 0) %>>%
po("branch", options = c("nop_prep", "yeojohnson", "pca", "ica"), id = "prep_branch") %>>%
gunion(list(po("nop", id = "nop_prep"), po("yeojohnson"), po("pca", scale. = TRUE), po("ica"))) %>>%
po("unbranch", id = "prep_unbranch") %>>%
po("learner", learner = lrn("classif.rpart"))
plot(graph)
graph_learner = as_learner(graph)
as.data.table(graph_learner$param_set)[1:70, .(id, class, lower, upper)]
search_space = ps(
prep_branch.selection = p_fct(levels = c("nop_prep", "yeojohnson", "pca", "ica")),
pca.rank. = p_int(2, 6, depends = prep_branch.selection == "pca"),
ica.n.comp = p_int(2, 6, depends = prep_branch.selection == "ica"),
yeojohnson.standardize = p_lgl(depends = prep_branch.selection == "yeojohnson")
)
at = auto_tuner(
method = "random_search",
learner = graph_learner,
resampling = custom,
measure = msr("classif.acc"),
search_space = search_space
)
# resmpling
rr = resample(task, at, customo, store_models = TRUE)
I get an error:
INFO [09:46:49.340] [mlr3] Applying learner 'removeconstants_1.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.classif.rpart.tuned' on task 'data_' (iter 1/2)
INFO [09:46:49.362] [mlr3] Applying learner 'removeconstants_1.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.classif.rpart.tuned' on task 'data_' (iter 2/2)
Error: Train set 3 of inner resampling 'custom' contains row ids not present in task 'data_': {91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120}
The first outer training set contains these ids:
train_sets_out
#> [[1]]
#> [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
This means row 1 to 90 are available in the inner resampling.
The third training set of the inner resampling needs rows 61 to 120 but rows 91 to 120 are not available.
train_sets
#> [[3]]
#> [1] 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
So that can't work. Check out figure 4.7 in the book to see how the outer and inner resampling work together.
I want to extract specific rows of my dataframe, following a sequence of rownumbers.
The sequence should be:
7, 14, 21, 31, 38, 45, 55, 62, 69.....until 8760.
So it always is starting from row 7 and then it goes +7 +7 +10 and this should be repeated until the end.
I know rep and seq, but I don't know how to deal with that +10 after the +7.
Any ideas?
Try
x <- rep(c(7, 10), c(2, 1))
out <- cumsum(c(7, rep(x, ceiling(8760 / sum(x)))))
Result
head(out, 10)
# [1] 7 14 21 31 38 45 55 62 69 79
tail(out)
# [1] 8726 8733 8743 8750 8757 8767
If you want out to end at 8760 you might do
c(out[out < 8760], 8760)
We can use rep
x1 <- rep(c(7, 10), c(2, 1))
out <- cumsum(c(7, rep(x1, 8760 %/% sum(x1)))))
out1 <- out[out < 8760]
head(out1, 10)
#[1] 7 14 21 31 38 45 55 62 69 79
tail(out1, 10)
#[1] 8685 8695 8702 8709 8719 8726 8733 8743 8750 8757
I have a numeric vector, which I would like to extent with a sequence of three additional numbers between each two values of this vector.
Consider the following example:
# Example vector
set.seed(123)
x <- round(runif(5, 0, 100))
x
### 29 79 41 88 94
I want to insert 3 new numbers between each of these five values. The numbers should be a sequence of length 3 between each pair of values.
The output should look as follows:
# Desired output
c(seq(29, 79, length.out = 5),
seq(79, 41, length.out = 5)[- 1],
seq(41, 88, length.out = 5)[- 1],
seq(88, 94, length.out = 5)[- 1])
# 29 42 54 66 79 70 60 50 41 53 64 76 88 90 91 92 94
The added sequence between 29 and 79 is 42, 54, 66; the added sequence between 79 and 41 is 70, 60, 50; and so on...
How could I do such an operation in an automated way?
One option is Map to get the sequence of adjacent numbers by removing the last element and first element, remove the first element from the list, unlist the output and append the first element of vector
c(x[1], unlist(lapply(Map(seq, x[-length(x)], x[-1],
MoreArgs = list(length.out = 5)), `[`, -1)))
#[1] 29.00 41.50 54.00 66.50 79.00 69.50 60.00 50.50 41.00
#[11] 52.75 64.50 76.25 88.00 89.50 91.00 92.50 94.00
A not fancy solution is:
set.seed(123)
x <- round(runif(5, 0, 100))
x
c(x[1], unlist(lapply(X = 1:(length(x)-1), function(i) seq(x[i], x[i+1], length.out = 5)[-1])))
[1] 29.00 41.50 54.00 66.50 79.00 69.50 60.00 50.50 41.00 52.75 64.50 76.25 88.00 89.50 91.00 92.50 94.00
Actually your seed generates this values: 29 79 41 88 94
When trying to run this code Julia keeps giving me the error message "KeyError: key 18=>63 not found" anytime I try to access demand[i]. It seems that this error happens every time the element in dem is larger than 50.
using JuMP, Clp
hours = 1:24
dem = [43 40 36 36 35 38 41 46 49 48 47 47 48 46 45 47 50 63 75 75 72 66 57 50]
demand = Dict(zip(hours, dem))
m = Model(solver=ClpSolver())
#variable(m, x[demand] >= 0)
#variable(m, y[demand] >= 0)
for i in demand
if demand[i] > 50
#constraint(m, y[i] == demand[i])
else
#constraint(m, x[i] == demand[i])
end
end
Not sure how to solve this issue.
You are using a Python-style for x in dict. In Julia, this iterates over the key-value pairs of the dictionary, not the keys. Try
for i in keys(demand)
if demand[i] > 50
#constraint(m, y[i] == demand[i])
else
#constraint(m, x[i] == demand[i])
end
end
or
for (h, d) in demand
if d > 50
#constraint(m, y[h] == d)
else
#constraint(m, x[h] == d)
end
end
This worked for me, using Julia 1.0
using JuMP, Clp
hours = 1:24
dem = [43 40 36 36 35 38 41 46 49 48 47 47 48 46 45 47 50 63 75 75 72 66 57 50]
demand = Dict(zip(hours, dem))
m = Model()
setsolver(m, ClpSolver())
#variable(m, x[keys(demand)] >= 0)
#variable(m, y[keys(demand)] >= 0)
for (h, d) in demand
if d > 50
#constraint(m, y[h] == d)
else
#constraint(m, x[h] == d)
end
end
status = solve(m)
println("Objective value: ", getobjectivevalue(m))
println("x = ", getvalue(x))
println("y = ", getvalue(y))
REF:
Reply of #Fengyang Wang
Comment of #Wikunia at https://stackoverflow.com/a/51910619/1096140
https://jump.readthedocs.io/en/latest/quickstart.html
I have a list of 7 vectors, I would like to group the lists into 4 lists of vectors each.
B2 <- list ( c (12 , 47 ,137 ,170), c(44 , 47 ,135, 170) , c(12 , 28 , 34 , 44 , 47 , 59 , 61 , 67 , 76 , 80 , 84 ,135, 148, 170) , c(44 , 47 , 84 ,135 ,170) , c(12 , 28 , 34 , 44 , 47 , 59 , 61 , 67 , 76 , 80 , 84 ,135, 148, 156, 159, 164, 170) , c(12 , 28 , 34 , 44 , 47 , 84 ,135 ,170) , c(12 , 28 , 44 , 47 , 84, 135, 170))
# I would like to create four groups (lists) as per the following index of consecutive list sequence)
Subgroup <- c(2,4,5,7)
# Desired Output
B2 <- list ( list ( c (12 , 47 ,137 ,170), c(44 , 47 ,135, 170)) , list ( c(12 , 28 , 34 , 44 , 47 , 59 , 61 , 67 , 76 , 80 , 84 ,135, 148, 170) , c(44 , 47 , 84 ,135 ,170)) , list ( c(12 , 28 , 34 , 44 , 47 , 59 , 61 , 67 , 76 , 80 , 84 ,135, 148, 156, 159, 164, 170)) , list ( c (12 , 28 , 34 , 44 , 47 , 84 ,135 ,170) , c(12 , 28 , 44 , 47 , 84, 135, 170)))
Using split and rep:
split(B2, rep(1:length(Subgroup), diff(c(0, Subgroup))))
Another option
rng = data.frame(1+c(0, head(Subgroup,-1)), Subgroup)
apply(rng, 1,function(x) B2[x[1]:x[2]])
Here is an option using sequence
i1 <- sequence(Subgroup)
i2 <- !duplicated(i1)
res <- split(B2[i1[i2]],cumsum(c(TRUE,diff(i1)<0))[i2])
all.equal(res, Out, check.attributes=FALSE)
#[1] TRUE
Or as #Chris mentioned in the comments,
lst <- Map(`:`, c(1,head(Subgroup,-1)+1), Subgroup)
lapply(lst, function(i) B2[i])