I have a list of 7 vectors and would like to group them into 4 sub-lists of vectors.
B2 <- list(
  c(12, 47, 137, 170),
  c(44, 47, 135, 170),
  c(12, 28, 34, 44, 47, 59, 61, 67, 76, 80, 84, 135, 148, 170),
  c(44, 47, 84, 135, 170),
  c(12, 28, 34, 44, 47, 59, 61, 67, 76, 80, 84, 135, 148, 156, 159, 164, 170),
  c(12, 28, 34, 44, 47, 84, 135, 170),
  c(12, 28, 44, 47, 84, 135, 170)
)
# I would like to create four groups (lists), splitting at the following cumulative indices:
Subgroup <- c(2,4,5,7)
# Desired output (called Out below)
Out <- list(
  list(c(12, 47, 137, 170), c(44, 47, 135, 170)),
  list(c(12, 28, 34, 44, 47, 59, 61, 67, 76, 80, 84, 135, 148, 170), c(44, 47, 84, 135, 170)),
  list(c(12, 28, 34, 44, 47, 59, 61, 67, 76, 80, 84, 135, 148, 156, 159, 164, 170)),
  list(c(12, 28, 34, 44, 47, 84, 135, 170), c(12, 28, 44, 47, 84, 135, 170))
)
Using split and rep:
split(B2, rep(1:length(Subgroup), diff(c(0, Subgroup))))
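Here rep() assigns one group id per element of B2 (the run lengths come from diff()), and split() then collects the elements by id:
rep(1:length(Subgroup), diff(c(0, Subgroup)))
#> [1] 1 1 2 2 3 4 4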
Another option
rng <- data.frame(1 + c(0, head(Subgroup, -1)), Subgroup)
apply(rng, 1, function(x) B2[x[1]:x[2]])
Here is an option using sequence
i1 <- sequence(Subgroup)
i2 <- !duplicated(i1)
res <- split(B2[i1[i2]], cumsum(c(TRUE, diff(i1) < 0))[i2])
all.equal(res, Out, check.attributes=FALSE)
#[1] TRUE
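For clarity, the helper vectors evaluate to:
i1
#> [1] 1 2 1 2 3 4 1 2 3 4 5 1 2 3 4 5 6 7
i1[i2]
#> [1] 1 2 3 4 5 6 7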
Or, as @Chris mentioned in the comments:
lst <- Map(`:`, c(1, head(Subgroup, -1) + 1), Subgroup)
lapply(lst, function(i) B2[i])
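Map() pairs each start index with the matching end index, so lst holds the four ranges:
str(lst)
#> List of 4
#>  $ : int [1:2] 1 2
#>  $ : int [1:2] 3 4
#>  $ : int 5
#>  $ : int [1:2] 6 7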
Since version 0.17.2, the mlr3tuning package supports (custom) instantiated resampling in the AutoTuner class: https://github.com/mlr-org/mlr3tuning/releases/tag/v0.17.2
I have tried to construct rolling-window CVs with custom resampling as in the following link: https://towardsdatascience.com/time-series-nested-cross-validation-76adba623eb9 (figure 1).
I want to tune hyperparameters on, say, row ids 1:1000 (for example 1:800 train and 801:1000 test), and then evaluate the model on a test set, say 1001:1100.
Here is my attempt:
library(mlr3)
library(mlr3tuning)
library(mlr3pipelines)
library(mlr3learners)
# task
task = tsk("iris")
task_ = task$clone()
data_ = task_$data()
data_ = cbind(data_, monthid = c(rep(1, 30), rep(2, 30), rep(3, 30), rep(4, 30), rep(5, 30)))
task = as_task_classif(data_, target = "Species")
# inner custom rolling window resampling
custom = rsmp("custom")
task_ <- task$clone()
task_$set_col_roles("monthid", "group")
groups = task_$groups
rm(task_)
groups_v <- groups[, unique(group)]
train_length <- 2
test_length <- 1
train_groups <- lapply(0:(length(groups_v)-(train_length+1)), function(x) x + (1:train_length))
test_groups <- lapply(train_groups, function(x) tail(x, 1) + test_length)
train_sets <- lapply(train_groups, function(x) groups[group %in% groups_v[x], row_id])
test_sets <- lapply(test_groups, function(x) groups[group %in% groups_v[x], row_id])
custom$instantiate(task, train_sets, test_sets)
# outer custom rolling window resampling
customo = rsmp("custom")
task_ <- task$clone()
task_$set_col_roles("monthid", "group")
groups = task_$groups
rm(task_)
groups_v <- groups[, unique(group)]
train_length_out <- train_length + test_length
test_length_out <- 1
train_groups_out <- lapply(0:(length(groups_v)-(train_length_out+1)), function(x) x + (1:train_length_out))
test_groups_out <- lapply(train_groups_out, function(x) tail(x, 1) + test_length_out)
train_sets_out <- lapply(train_groups_out, function(x) groups[group %in% groups_v[x], row_id])
test_sets_out <- lapply(test_groups_out, function(x) groups[group %in% groups_v[x], row_id])
customo$instantiate(task, train_sets_out, test_sets_out)
# inspect custom cv's
custom$train_set(1)
custom$test_set(1)
(max(custom$train_set(1)) + 1) == head(custom$test_set(1), 1) # test set starts after train set
customo$train_set(1)
customo$test_set(1)
(max(customo$train_set(1)) + 1) == head(customo$test_set(1), 1) # test set starts after train set
all(c(custom$train_set(1), custom$test_set(1)) %in% customo$train_set(1)) # first outer set contains all inner sets
length(intersect(customo$test_set(1), c(custom$train_set(1), custom$test_set(1)))) == 0
# construct graph
graph = po("removeconstants", id = "removeconstants_1", ratio = 0) %>>%
po("branch", options = c("nop_prep", "yeojohnson", "pca", "ica"), id = "prep_branch") %>>%
gunion(list(po("nop", id = "nop_prep"), po("yeojohnson"), po("pca", scale. = TRUE), po("ica"))) %>>%
po("unbranch", id = "prep_unbranch") %>>%
po("learner", learner = lrn("classif.rpart"))
plot(graph)
graph_learner = as_learner(graph)
as.data.table(graph_learner$param_set)[1:70, .(id, class, lower, upper)]
search_space = ps(
prep_branch.selection = p_fct(levels = c("nop_prep", "yeojohnson", "pca", "ica")),
pca.rank. = p_int(2, 6, depends = prep_branch.selection == "pca"),
ica.n.comp = p_int(2, 6, depends = prep_branch.selection == "ica"),
yeojohnson.standardize = p_lgl(depends = prep_branch.selection == "yeojohnson")
)
at = auto_tuner(
method = "random_search",
learner = graph_learner,
resampling = custom,
measure = msr("classif.acc"),
search_space = search_space
)
# resampling
rr = resample(task, at, customo, store_models = TRUE)
I get an error:
INFO [09:46:49.340] [mlr3] Applying learner 'removeconstants_1.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.classif.rpart.tuned' on task 'data_' (iter 1/2)
INFO [09:46:49.362] [mlr3] Applying learner 'removeconstants_1.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.classif.rpart.tuned' on task 'data_' (iter 2/2)
Error: Train set 3 of inner resampling 'custom' contains row ids not present in task 'data_': {91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120}
The first outer training set contains these ids:
train_sets_out
#> [[1]]
#> [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
This means rows 1 to 90 are available in the inner resampling.
The third training set of the inner resampling needs rows 61 to 120, but rows 91 to 120 are not available.
train_sets
#> [[3]]
#> [1] 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
So that can't work. Check out figure 4.7 in the book to see how the outer and inner resampling work together.
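You can check the containment requirement yourself before calling resample(); a minimal sketch reusing the train_sets/test_sets objects built in the question:
# every inner train/test row id must lie inside the corresponding outer
# training set, otherwise resample() errors as shown above
all(unlist(c(train_sets, test_sets)) %in% train_sets_out[[1]])
#> [1] FALSE  # inner set 3 needs rows 91:120; the first outer train set ends at row 90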
I have a data frame. Here is a small example:
a <- rnorm(100, 5, 2)
b <- rnorm(100, 10, 3)
c <- rnorm(100, 15, 4)
df <- data.frame(a, b, c)
And I have a character variable vect <- "c('a','b')"
When I try to calculate the sum of the variables using the command
df$d <- df[vect]
which I assumed would be equivalent to
df$d <- df[c('a','b')]
I get an error instead:
Error in `[.data.frame`(df, vect) : undefined columns selected
Your assumption that
vect <- "c('a','b')"
df$d <- df[vect]
is equivalent to
df$d <- df[c('a','b')]
is incorrect.
As @Karthik points out, you should remove the quotation marks in the assignment to vect.
However, from your question it sounds like you want to sum the elements specified in vect and then assign the result to d. To do this you need to change your code slightly:
vect <- c('a','b')
df$d <- apply(X = df[vect], MARGIN = 1, FUN = sum)
This computes the row-wise sum over the columns of df specified by vect. MARGIN = 1 specifies that the sum is applied row-wise rather than column-wise.
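As a side note, the same row-wise sum can be written with base R's rowSums, which is shorter and faster than apply here:
df$d <- rowSums(df[vect])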
EDIT:
As @ThomasIsCoding points out below, if for some reason vect has to be a string, you can parse the string into an R expression using str2lang:
vect <- "c('a','b')"
parsed_vect <- eval(str2lang(vect))
df$d <- apply(X = df[parsed_vect], MARGIN = 1, FUN = sum)
Perhaps you can try
> df[eval(str2lang(vect))]
            a          b
1   8.1588519  9.0617818
2   3.9361214 13.2752377
3   5.5370983  8.8739725
4   8.4542050  8.5704234
5   3.9044461 13.2642793
6   5.6679639 12.9529061
... (rows 7 to 99 omitted)
100 4.4166832 10.4901303
I have 2880 observations in my data frame. I have to create a new data frame in which I select rows 25 to 77 out of every block of 96 rows.
df.new = df[seq(25, nrow(df), 77), ] # extract from 25 to 77
The above code takes every 77th row starting at row 25, but I want all rows from 25 to 77 within every block of 96 rows.
One option is to create a vector of indices with which to subset the data frame.
idx <- rep(25:77, times = nrow(df)/96) + 96*rep(0:29, each = 77-25+1)
df[idx, ]
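A quick sanity check on the index vector (2880 / 96 = 30 blocks of 53 rows each):
length(idx) == 30 * (77 - 25 + 1)
#> [1] TRUE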
You can use a recycling technique to extract these rows:
from = 25
to = 77
n = 96
df.new <- df[rep(c(FALSE, TRUE, FALSE), c(from - 1, to - from + 1, n - to)), ]
To explain, for this example it works as follows:
length(rep(c(FALSE, TRUE, FALSE), c(24, 53, 19))) #returns
#[1] 96
In these 96 values, positions 25 to 77 are TRUE and the rest are FALSE, which we can verify with:
which(rep(c(FALSE, TRUE, FALSE), c(24, 53, 19)))
# [1] 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
#[23] 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
#[45] 69 70 71 72 73 74 75 76 77
This logical vector is then recycled over all the remaining rows of the data frame.
First, define a Group variable with values 1 to 30, each value repeating 96 times. Then define RowWithinGroup and filter as required. Finally, undo the changes introduced to do the filtering.
library(tibble)
library(dplyr)

df <- tibble(X=rnorm(2880)) %>%
  add_column(Group=rep(1:30, each=96)) %>%
group_by(Group) %>%
mutate(RowWithinGroup=row_number()) %>%
filter(RowWithinGroup >= 25 & RowWithinGroup <= 77) %>%
select(-Group, -RowWithinGroup) %>%
ungroup()
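A quick check that the filter kept the intended rows (53 rows from each of the 30 blocks):
nrow(df)  # expected: 30 * 53 = 1590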
Welcome to SO. This question may not have been asked in this exact form before, but the principles required have been referenced in many, many questions and answers.
A one-liner base R solution:
lapply(split(df, cut(1:nrow(df), nrow(df) / 96, labels = FALSE)), `[`, 25:77, )
Note: nothing after the last comma; the empty argument selects all columns.
The code above returns a list. To combine all data together, just pass the result above into
do.call(rbind, ...)
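Putting the two steps together:
df.new <- do.call(rbind,
                  lapply(split(df, cut(1:nrow(df), nrow(df) / 96, labels = FALSE)),
                         `[`, 25:77, ))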
I have a tracking dataset with 3 time moments (36, 76, 96) for a match.
My requirement is to calculate the distances between a given player and the opponents.
The data frame contains the following 5 columns:
- time_id (second or instant)
- player (identifier for the player)
- x (x position)
- y (y position)
- team (home or away)
As an example, for home player 26, I need to calculate the distances to all away players ("12", "17", "24", "37", "69", "77") at the 3 distinct time_id values (36, 76, 96).
Here we can see the df data:
https://pasteboard.co/ICiyyFB.png
Here is the link to download a sample rds with df:
https://1drv.ms/u/s!Am7buNMZi-gwgeBpEyU0Fl9ucem-bw?e=oSTMhx
library(tidyverse)
dat <- readRDS(file = "dat.rds")
# Given home player with id 26
# I need to calculate on each time_id the euclidean distance
# with all away players on each time_id
p36_home <- dat %>% filter(player == 26)
# all away players
all_away <- dat %>% filter(team == 'away')
# I know I can compute this if I spread the positions into columns,
# but that is not elegant and it requires grouping by time_id:
# mutate(dist = round(sqrt((x1 - x2)^2 + (y1 - y2)^2), 2))
# below distances row by row should be calculated
# time_id , homePlayer, awayPlayer , distance
#
# 36 , 26 , 12 , x
# 36 , 26 , 17 , x
# 36 , 26 , 24 , x
# 36 , 26 , 37 , x
# 36 , 26 , 69 , x
# 36 , 26 , 77 , x
#
# 76 , 26 , 12 , x
# 76 , 26 , 17 , x
# 76 , 26 , 24 , x
# 76 , 26 , 37 , x
# 76 , 26 , 69 , x
# 76 , 26 , 77 , x
#
# 96 , 26 , 12 , x
# 96 , 26 , 17 , x
# 96 , 26 , 24 , x
# 96 , 26 , 37 , x
# 96 , 26 , 69 , x
# 96 , 26 , 77 , x
This solution should work for you. I simply joined the two data frames you provided and used your distance calculation, then selected the relevant columns to get the desired result.
test <- left_join(p36_home, all_away, by = "time_id")
test$dist <- round(sqrt((test$x.x - test$x.y)^2 + (test$y.x - test$y.y)^2), 2)
test <- test[, c(1, 2, 6, 10)]  # keep time_id, both player ids, and the distance
names(test) <- c("time_id", "homePlayer", "awayPlayer", "distance")
test
The result looks something like this:
time_id homePlayer awayPlayer distance
36 26 37 26.43
36 26 17 28.55
36 26 24 20.44
36 26 69 24.92
36 26 77 11.22
36 26 12 22.65
.
.
.
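For reference, the same join-and-compute can also stay inside a single dplyr pipeline; a sketch assuming dat has the columns listed in the question (time_id, player, x, y, team) and reusing p36_home and all_away:
library(dplyr)

p36_home %>%
  inner_join(all_away, by = "time_id", suffix = c("_home", "_away")) %>%
  mutate(distance = round(sqrt((x_home - x_away)^2 + (y_home - y_away)^2), 2)) %>%
  select(time_id, homePlayer = player_home, awayPlayer = player_away, distance)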
My dataset is:
x <- data.frame(v1 = c(97, 97, 85, 84, 90, 80, 81, 90, 80, 70, 90, 90, 90, 95, 88, 99),
                v2 = c(99, 91, 91, 83, 99, 95, 74, 88, 82, 80, 96, 87, 92, 96, 88, 95),
                v3 = c(89, 93, 87, 80, 96, 96, 75, 90, 78, 86, 92, 88, 80, 88, 98, 98),
                v4 = c(89, 97, 91, 86, 95, 95, 89, 88, 75, 82, 99, 92, 95, 92, 90, 98),
                v5 = c(99, 90, 93, 91, 90, 90, 77, 92, 85, 76, 90, 96, 90, 90, 90, 92))
> x
v1 v2 v3 v4 v5
1 97 99 89 89 99
2 97 91 93 97 90
3 85 91 87 91 93
4 84 83 80 86 91
5 90 99 96 95 90
6 80 95 96 95 90
7 81 74 75 89 77
8 90 88 90 88 92
9 80 82 78 75 85
10 70 80 86 82 76
11 90 96 92 99 90
12 90 87 88 92 96
13 90 92 80 95 90
14 95 96 88 92 90
15 88 88 98 90 90
16 99 95 98 98 92
I used the rpart package to fit a decision tree as follows:
# Classification Tree with rpart
library(rpart)
fit <- rpart(v5 ~ v1+v2+v3+v4,
method="class", data=x)
printcp(fit) # display the results
Classification tree:
rpart(formula = v5 ~ v1 + v2 + v3 + v4, data = x, method = "class")
Variables actually used in tree construction:
character(0)
Root node error: 9/16 = 0.5625
n= 16
CP nsplit rel error xerror xstd
1 0.01 0 1 0 0
> summary(fit) # detailed summary of splits
Call:
rpart(formula = v5 ~ v1 + v2 + v3 + v4, data = x, method = "class")
n= 16
CP nsplit rel error xerror xstd
1 0.01 0 1 0 0
Node number 1: 16 observations
predicted class=90 expected loss=0.5625 P(node) =1
class counts: 1 1 1 7 1 2 1 1 1
probabilities: 0.062 0.062 0.062 0.438 0.062 0.125 0.062 0.062 0.062
When I try to plot the tree:
# plot tree
plot(fit, uniform=TRUE,
     main="Classification Tree ")
Error in plot.rpart(fit, uniform = TRUE, main = "Classification Tree ") :
fit is not a tree, just a root
text(fit, use.n=TRUE, all=TRUE, cex=.8)
Error in text.rpart(fit, use.n = TRUE, all = TRUE, cex = 0.8) :
fit is not a tree, just a root
What did I do wrong when applying rpart? Why does it give me an error when plotting the tree, and how do I fix this error:
fit is not a tree, just a root
You use method="class" if you are building a classification tree and method="anova" if you are building a regression tree. It looks like you have a continuous response, so you should be building a regression tree (i.e. method="anova").
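A minimal sketch of the two call styles, using the x data frame from the question:
library(rpart)

# regression tree: v5 is numeric
fit_reg <- rpart(v5 ~ v1 + v2 + v3 + v4, data = x, method = "anova")

# classification tree: only appropriate if v5 is first converted to a factor
fit_cls <- rpart(factor(v5) ~ v1 + v2 + v3 + v4, data = x, method = "class")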
You are using rpart's default control parameters. With your data set, rpart cannot satisfy the default values (for example, minsplit = 20 with only 16 observations), so it cannot create a tree (no branch splitting):
rpart.control(minsplit = 20, minbucket = round(minsplit/3), cp = 0.01,
maxcompete = 4, maxsurrogate = 5, usesurrogate = 2, xval = 10,
surrogatestyle = 0, maxdepth = 30, ...)
Adjust the control parameters according to the data set.
For example:
t <- rpart(formula = v5 ~ v1 + v2 + v3 + v4, data = x, method = "anova",
           control = rpart.control(minsplit = 1, minbucket = 1, cp = 0))
But be aware that this could create an overfitted decision tree.
I ran the following code with your x data frame and got a tree as shown below:
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
fit <- rpart(v5 ~ v1+v2+v3+v4,
method="anova",
data=x,
control = rpart.control(minsplit = 6, cp = 0.01))
fancyRpartPlot(fit) # from the rattle package
Note that your method should be "anova" since v5 is a continuous variable, and you have to override the control parameters with control = rpart.control(...) to adjust the depth of the tree.