Stacking models from different packages - r

I am doing bankruptcy prediction (classification) with SVM in the e1071 package. To improve my results I would like to combine it with random forest in the caret package. First I will show my RF model, then my SVM model, and after that my attempt to combine (stack) them.
Sorry in advance for messy code. I am new to all this.
RF model (caret package)
set.seed(123)
model.rf <- train(as.factor(year.of.bankruptcy) ~ ., method = "rf", data = training.set)
predict.rf <- predict(model.rf, testing.set[,-1])
RF model accuracy
confusionMatrix(predict.rf, testing.set$year.of.bankruptcy, mode="everything")$overall[1]
-> this gives me the accuracy of the model:
Accuracy
0.7166667
SVM (e1071 package)
set.seed(123)
model1 <- function(k, d, c, g){
  model <- svm(year.of.bankruptcy ~ ., data = training.set, type = "C-classification", kernel = k, degree = d, cost = c, gamma = g)
  test_x <- testing.set[, -1]
  test_y <- testing.set$year.of.bankruptcy
  model_prediction <- predict(model, test_x)
  result <- table(model_prediction, test_y)
  return(result)
}
result<-model1(k="radial", d=2, c=2,g=0.1)
result
classAgreement(tab=result, match.names = FALSE)
classAgreement(tab=result, match.names = FALSE)$diag
-> this gives me the accuracy of the model:
[1] 0.7466667
Stacking models together
predictDF <- data.frame(predict.rf, classAgreement(tab=result, match.names = FALSE)$diag, class = testing.set$year.of.bankruptcy)
predictDF_bc <- ROSE(class ~.,predictDF, N=300, p=0.5, seed=12)$data
set.seed(123)
combined.model.gbm <- train(as.factor(class) ~ ., method = "gbm", data = predictDF_bc, distribution = "bernoulli")
combined.prediction.gbm <- predict(combined.model.gbm, predictDF)
Evaluating the model
confusionMatrix(combined.prediction.gbm, testing.set$year.of.bankruptcy)$overall[1]
-> this gives me the accuracy of the stacked model:
Accuracy
0.7166667
As you can see, the combined model does not seem to take the SVM into account, since my combined accuracy is lower than my SVM accuracy. Any suggestions as to what I could do?
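For reference, a stacking data frame normally holds each base model's per-observation predictions on the test set, not a single accuracy number like classAgreement(...)$diag (a constant that gets recycled down every row, so the meta-learner cannot learn anything from it). A minimal sketch of that idea, assuming training.set, testing.set and predict.rf from above, with the SVM refit outside the helper function using the same hyperparameters:
# sketch only: refit the SVM so its per-observation class predictions are available
svm.model <- svm(year.of.bankruptcy ~ ., data = training.set, type = "C-classification",
                 kernel = "radial", degree = 2, cost = 2, gamma = 0.1)
svm.pred <- predict(svm.model, testing.set[, -1])
# level-one data: one column of predictions per base model, plus the true class
predictDF <- data.frame(rf = predict.rf, svm = svm.pred,
                        class = testing.set$year.of.bankruptcy)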
> dput(training.set[sample(1:nrow(training.set), 50),])
structure(list(year.of.bankruptcy = c(-1, -1, -1, -1, -1, -1, -1, 1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, 1, -1, -1, -1, -1, -1), liquidity_1 = c(90.0695931477516,
85.4305617311398, 76.2455934195065, 4.34688111280157, 159.020111900801,
104.569486404834, 58.3391003460208, 42.0907973873116, 101.121495327103,
94.3786295005807, 47.7552816901408, 125.702184574231, 125.763725699637,
106.584557081952, 0, 143.6203466894, 82.5245328673209, 35.296442687747,
8.85744561490993, 12.4657534246575, 128.164489183979, 133.131146034372,
92.0528568769775, 22.8177150192555, 100.237812128419, 40.0340715502555,
91.360486091332, 129.123757904246, 92.9165443694355, 130.999694283094,
22.2526106414719, 101.714770797963, 93.1704260651629, 46.6268560361524,
125.838858750251, 106.076759061834, 86.787017476474, 84.7495991700462,
42.1171171171171, 68.806311160926, 93.1549687282835, 104.196667352397,
47.0834921845215, 77.8816199376947, 76.9065981148243, 90.988709507228,
98.9704873026767, 163.446031970576, 113.768115942029, 92.9742188833874
), profmarg_1 = c(241.916488222698, 215.221579961464, 633.490011750881,
0, 173.627703009224, 193.164652567976, 3.32179930795848, 82.390221819828,
131.842456608812, 102.044134727062, 0, 7.2447614801605, 113.608203375347,
169.208905731881, 0, 179.866439329355, 250.396558677242, 48.0632411067194,
0, 12.8082191780822, 0.963803812379525, 0, 452.279918109064,
0, 16.4090368608799, 11.4449434722007, 173.331434539068, 240.216802168022,
307.709617454261, 179.883827575665, 281.476877175535, 539.609507640068,
183.12447786132, 31.8431245965139, 151.215591721921, 95.3980099502487,
259.97695410025, 174.073375459776, 11.986986986987, 160.94322541708,
119.110493398193, 428.03949804567, 194.624475791079, 325.877466251298,
37.2322193658955, 245.71066793289, 207.343857240906, 22.49257320696,
43.6487638533674, 97.4987194809629), drmarg = c(1.46603230803275,
12.6575304731079, -0.798553144129104, 53.3333333333333, 11.8097892353249,
29.1893259137473, 60.4166666666667, -23.041601255887, 1.21518987341772,
6.1535019019915, 82.4626865671642, -4, 4.47536667920271, -3.69540873460246,
65.3543307086614, 6.46738701790362, -3.63987759703656, 0.575657894736842,
70.2460850111857, 45.4545454545455, -724.444444444444, 18.809947734191,
3.22818215293973, 92.9292929292929, 6.52173913043478, 50.8680555555556,
4.88031987730733, 19.9684115523466, 1.1446376903755, 13.3729821580289,
1.22027317479027, 4.0986955838441, -3.29607664233577, 73.4414597060314,
3.95960669678448, 28.6645874681032, 17.2991867598802, 10.8455534851063,
55.741127348643, 8.98526582981339, 7.36196319018405, 4.85894170231172,
10.4852855193919, -1.6774275224712, 16.3210702341137, 2.47726693294808,
5.64784053156146, 59.622641509434, 11.0029211295034, 50.5987773218323
), ROA = c(3.546573875803, 27.2417370683267, -5.05875440658049,
6.52032166920235, 20.5050657795252, 87.1601208459215, 2.00692041522491,
-18.9840263855655, 1.60213618157543, 6.38792102206736, 9.72711267605634,
-0.356665180561748, 5.08438367870113, -6.25296068214116, 3.53041259038707,
11.6510372264848, -9.11412824304342, 0.276679841897233, 5.87171975316337,
5.82191780821918, -6.98222317412722, 30.0983365499495, 14.6845337800112,
11.8100128369705, 1.07015457788347, 6.05028134840741, 8.45912845343207,
47.9674796747967, 3.52216025829175, 24.0599205136044, 4.37593237195425,
22.1392190152801, -6.0359231411863, 23.3860555196901, 5.98754269640346,
35.9275053304904, 46.5719224121375, 18.9380364047911, 6.68168168168168,
19.5326981937319, 9.17303683113273, 20.7981896729068, 20.5108654212734,
-5.50363447559709, 10.4541559554413, 6.15173578136541, 12.4456646076413,
13.4106662894327, 4.81670929241262, 51.5793068123613), debt_ratio_1 = c(75.6423982869379,
157.077219504965, 180.975323149236, 88.958921973484, 96.869801905338,
93.0513595166163, 78.6159169550173, 131.707948004915, 132.096128170895,
100.789779326365, 28.080985915493, 48.1497993758359, 85.6868190557573,
85.5518711511132, 75.4714305969091, 92.0431940892299, 123.551552628041,
43.8735177865613, 89.2601134451162, 69.0547945205479, 29.727993146284,
110.265600588181, 154.662199888331, 54.2362002567394, 20.9274673008323,
79.0666460172423, 150.536409380044, 101.355013550135, 145.827218471774,
45.2155304188322, 123.222277473894, 134.90662139219, 123.141186299081,
41.7043253712072, 66.2648181635523, 26.5813788201848, 95.1411561359708,
105.191926813166, 7.60760760760761, 179.997413458637, 92.7032661570535,
121.49763423164, 96.3400686237133, 129.823468328141, 39.502999143102,
136.213991769547, 119.01166781057, 84.8210496534163, 8.99403239556692,
113.957657503842), young = c(1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), medium_age = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
0, 1, 0, 1, 0, 0, 0), old = c(0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0), agriculture = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), offshore_shipping = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), transport = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), manufacturing = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), telecom_it_tech = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), electricity = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), construction = c(0,
0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1), wholesale_retail = c(0, 0, 1, 0, 1, 0,
0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
0, 0), finance = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), change_output = c(0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549), oil_price_dummy = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), fish_price_dummy = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0.180737819481274, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.180737819481274, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("year.of.bankruptcy", "liquidity_1",
"profmarg_1", "drmarg", "ROA", "debt_ratio_1", "young", "medium_age",
"old", "agriculture", "offshore_shipping", "transport", "manufacturing",
"telecom_it_tech", "electricity", "construction", "wholesale_retail",
"finance", "change_output", "oil_price_dummy", "fish_price_dummy"
), row.names = c(19L, 49L, 25L, 53L, 56L, 3L, 31L, 50L, 58L,
62L, 51L, 24L, 35L, 29L, 6L, 44L, 12L, 2L, 15L, 42L, 39L, 30L,
27L, 40L, 26L, 41L, 21L, 22L, 11L, 63L, 32L, 60L, 36L, 52L, 1L,
14L, 37L, 34L, 8L, 43L, 4L, 10L, 9L, 54L, 59L, 64L, 23L, 20L,
17L, 13L), class = "data.frame")

Stacking models is done quite easily with the caretEnsemble library.
Here is an example:
library(mlbench) #for the data set
library(caret)
library(caretEnsemble)
data(PimaIndiansDiabetes)
set.seed(123)
list the algorithms to use:
algorithmList <- c("svmRadial", "rf" )
If you would like to specify the tuning parameters for each model, use the tuneList argument of the caretList function.
In trainControl, savePredictions = "final" and classProbs = TRUE are mandatory:
control <- trainControl(method = "repeatedcv", number = 4, repeats = 3,
savePredictions = "final" , classProbs = TRUE)
models <- caretList(diabetes ~ ., data = PimaIndiansDiabetes, trControl = control,
metric = "Kappa", methodList = algorithmList)
results <- resamples(models)
summary(results)
#output
Call:
summary.resamples(object = results)
Models: svmRadial, rf
Number of resamples: 12
Accuracy
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
svmRadial 0.6979167 0.7135417 0.7343750 0.7304688 0.7447917 0.7604167 0
rf 0.7291667 0.7604167 0.7682292 0.7690972 0.7760417 0.8125000 0
Kappa
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
svmRadial 0.2637842 0.3570103 0.4053130 0.3917770 0.4394767 0.4775359 0
rf 0.3788379 0.4612661 0.4788076 0.4809233 0.5028566 0.5785880 0
now the stack,
stack.glm <- caretStack(models, method = "glm", metric = "Kappa", trControl = control)
print(stack.glm)
#output
A glm ensemble of 2 base models: svmRadial, rf
Ensemble results:
Generalized Linear Model
2304 samples
2 predictor
2 classes: 'neg', 'pos'
No pre-processing
Resampling: Cross-Validated (4 fold, repeated 3 times)
Summary of sample sizes: 1728, 1728, 1728, 1728, 1728, 1728, ...
Resampling results:
Accuracy Kappa
0.7667824 0.4685406
or a gbm stack
stack.gbm <- caretStack(models, method="gbm", metric = "Kappa", trControl = control)
print(stack.gbm)
#output
A gbm ensemble of 2 base models: svmRadial, rf
Ensemble results:
Stochastic Gradient Boosting
2304 samples
2 predictor
2 classes: 'neg', 'pos'
No pre-processing
Resampling: Cross-Validated (4 fold, repeated 3 times)
Summary of sample sizes: 1728, 1728, 1728, 1728, 1728, 1728, ...
Resampling results across tuning parameters:
interaction.depth n.trees Accuracy Kappa
1 50 0.7693866 0.4832061
1 100 0.7675058 0.4785977
1 150 0.7663484 0.4753614
2 50 0.7662037 0.4748160
2 100 0.7638889 0.4684015
2 150 0.7634549 0.4653090
3 50 0.7630208 0.4657834
3 100 0.7612847 0.4606506
3 150 0.7569444 0.4511977
Tuning parameter 'shrinkage' was held constant at a value of 0.1
Tuning parameter 'n.minobsinnode' was
held constant at a value of 10
Kappa was used to select the optimal model using the largest value.
The final values used for the model were n.trees = 50, interaction.depth = 1, shrinkage = 0.1 and n.minobsinnode
= 10.
So the kappa values for
svm: 0.3917770
rf: 0.4809233
glm ensemble: 0.4685406
gbm ensemble: 0.4832061 - this would probably be higher if more models were used
EDIT: with the data provided in the OP:
first convert year.of.bankruptcy to a factor
data$year.of.bankruptcy <- as.factor(data$year.of.bankruptcy)
set level names to something that won't throw an error:
levels(data$year.of.bankruptcy) <- c("minus", "plus")
and go forward
control <- trainControl(method = "repeatedcv", number = 4, repeats = 3,
savePredictions = "final" , classProbs = TRUE)
models <- caretList(year.of.bankruptcy ~ ., data = data, trControl = control,
metric = "Kappa", methodList = algorithmList)
I get warnings about zero-variance predictors, but this is probably caused by the small data sample. If you see errors like:
In .local(x, ...) : Variable(s) `' constant. Cannot scale data.
on the whole data set, then it's worth looking into removing near-zero-variance predictors. There is a nice chapter about this here. Good luck!
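As a rough sketch of that last point, caret's nearZeroVar() can flag such predictors so they can be dropped before calling caretList(); this assumes year.of.bankruptcy is the first column of data, as in the dput above:
# flag near-zero-variance predictors (column 1 is the outcome)
nzv <- nearZeroVar(data[, -1], saveMetrics = TRUE)
data.filtered <- data[, c(TRUE, !nzv$nzv)]
The filtered data can then be passed to caretList() in place of data.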


Why am I getting this error when using glmnet in R

When trying to predict, I am getting this error:
error in evaluating the argument 'x' in selecting a method for function 'as.matrix': Cholmod error 'X and/or Y have wrong dimensions' at file ../MatrixOps/cholmod_sdmult.c, line 90
Here is my code so far
library(data.table)
library(caret)
library(Metrics)
library(glmnet)
library(plotmo)
library(lubridate)
#Reading in the data needed for this project
train <- fread("project/volume/data/processed/start_train.csv")
test<-fread("project/volume/data/processed/start_test.csv")
example_sub<-fread("project/volume/data/processed/example_submission.csv")
card_tab <- fread("project/volume/data/processed/card_tab.csv")
#Merging the card_tab dataset with both my train and test datasets to add more variables to each
train = merge(train, card_tab, by = "id")
test = merge(test, card_tab, by = "id")
train$power = as.numeric(train$power)
train$toughness = as.numeric(train$toughness)
test$power = as.numeric(test$power)
test$toughness = as.numeric(test$toughness)
train$powerovertough = train$power/train$toughness
test$powerovertough = test$power/test$toughness
train$current_date<-as_date(train$current_date)
train<-train[order(-current_date)]
test$current_date<-as_date(test$current_date)
test<-test[order(-current_date)]
#Handling NA values in both train and test. The NA values will be set to 0
train[is.na(train)] <- 0
test[is.na(test)] <- 0
# Specifying which columns of our model we will be dropping and applying it to our train and test datasets
drops<- c('id','future_date','current_date','card_name','power','loyalty','cmc','type','colors','mana_cost','subtypes', 'text','set','set_name')
train<-train[, !drops, with = FALSE]
test<-test[, !drops, with = FALSE]
#Saving the response variable in the train dataset
train_y<-train$future_price
test$future_price<-0
#Using dummies to train model
dummies <- dummyVars(future_price ~ ., data = train)
train<-predict(dummies, newdata = train)
test<-predict(dummies, newdata = test)
train<-data.table(train)
test<-data.table(test)
#Cross-validating the model to find the best lambda value
train<-as.matrix(train)
test<-as.matrix(test)
gl_model<-cv.glmnet(train, train_y, alpha = 1,family="gaussian")
bestlam<-gl_model$lambda.min
# Fitting the full model
gl_model<-glmnet(train, train_y, alpha = 1,family="gaussian")
plot_glmnet(gl_model)
saveRDS(gl_model,"./project/volume/models/gl_model.model")
test<-as.matrix(test)
#use the full model
pred<-predict(gl_model,s=bestlam, newx = test)
I am trying to predict future_price for my test set. The error is saying my dimensions are wrong, but I don't know what could be causing them to be different. I have tried inspecting the data sets as they run through the code, and they seem to have the same variables.
Here is the dput
> dput(head(train))
structure(c(0.25, 0.1, 0.1, 0.1, 0.25, 0.25, 1, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 3, 0, 2, 0, 0, 0, 0.333333333333333,
0, 1, 0), .Dim = c(6L, 20L), .Dimnames = list(NULL, c("current_price",
"typesArtifact", "typesArtifact Creature", "typesCreature", "typesEnchantment",
"typesEnchantment Artifact", "typesEnchantment Creature", "typesInstant",
"typesLand", "typesPlaneswalker", "typesSorcery", "supertypes",
"supertypesBasic", "supertypesLegendary", "rarityCommon", "rarityMythic",
"rarityRare", "rarityUncommon", "toughness", "powerovertough"
)))
> dput(head(test))
structure(c(0.15, 0.16, 2, 0.39, 0.16, 0.19, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0), .Dim = c(6L, 18L), .Dimnames = list(
NULL, c("current_price", "typesArtifact", "typesArtifact Creature",
"typesCreature", "typesEnchantment", "typesInstant", "typesLand",
"typesPlaneswalker", "typesSorcery", "supertypes", "supertypesBasic",
"supertypesLegendary", "rarityCommon", "rarityMythic", "rarityRare",
"rarityUncommon", "toughness", "powerovertough")))

I am dealing with a DTM (document-term matrix) and I want to do k-means, hierarchical, and k-medoids clustering. Am I supposed to normalize the DTM first?

The data, AllBooks, has 590 observations of 8266 variables. Here is the code I have:
AllBooks = read_csv("AllBooks_baseline_DTM_Unlabelled.csv")
dtms = as.matrix(AllBooks)
dtms_freq = as.matrix(rowSums(dtms) / 8266)
dtms_freq1 = dtms_freq[order(dtms_freq),]
sd = sd(dtms_freq)
mean = mean(dtms_freq)
This tells me that my mean is 0.01242767
and my standard deviation is 0.01305608.
Since my standard deviation is low, the data has low variability in terms of document size. So do I not need to normalize the DTM? By normalize I mean using the scale function in R, which subtracts the mean of the data and divides by the standard deviation.
In other words, my big question is: when am I supposed to standardize data (specifically a document-term matrix) for clustering purposes?
Here is a little output of the data:
dput(head(AllBooks,10))
structure(list(budding = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), enjoyer = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), needs = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), sittest = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), eclipsed = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), engagement = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
exuberant = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), abandons = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), well = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), cheerfulness = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
hatest = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), state = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0), stained = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), production = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), whitened = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), revered = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), developed = c(0, 0, 0, 2, 0, 0, 0, 0, 0, 0),
regarded = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), enactments = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), aromatical = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0), admireth = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0
), foothold = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), shots = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), turner = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), inversion = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
lifeless = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), postponement = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), stout = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), taketh = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), kettle = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), erred = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), thinkest = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), modern = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), reigned = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), sparingly = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
visual = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), thoughts = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), illumines = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0), attire = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
explains = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
You can view full data from link: https://www.dropbox.com/s/p9v1y6oxith1prh/AllBooks_baseline_DTM_Unlabelled.csv?dl=0
You have a sparse dataset, where most of it is dominated by zeros, hence the standard deviation is very low. You can scale it if some of your non-zero counts are extremely large, e.g. some are in the 100s while others are 1s and 2s.
It might not be such a good idea to use k-means on sparse data, because it is unlikely you can find meaningful centers. There are a few options available; check this link on dimension reduction. There are also graph-based approaches, such as this one used in biology.
Below is a simplistic way to cluster and visualize:
x = read.csv("AllBooks_baseline_DTM_Unlabelled.csv")
# remove empty rows and columns that occur in only one document
x = x[rowMeans(x) > 0, colSums(x > 0) > 1]
Treat it as binary and run hierarchical clustering on a binary distance:
hc=hclust(dist(x,method="binary"),method="ward.D")
clus = cutree(hc,5)
Calculate PCA and visualize:
library(Rtsne)
library(ggplot2)
pca = prcomp(x,scale=TRUE,center=TRUE)
TS = Rtsne(pca$x[,1:30])
ggplot(data.frame(Dim1=TS$Y[,1],Dim2=TS$Y[,2],C=factor(clus)),
aes(x=Dim1,y=Dim2,col=C))+geom_point()
Cluster 5 seems to be very different, and they differ in these words:
names(tail(sort(colMeans(x[clus==5,]) - colMeans(x[clus!=5,])),10))
[1] "wisdom" "thee" "lord" "things" "god" "hath" "thou" "man"
[9] "thy" "shall"

xgb.cv with no folds and return the results based on a split of the data

I have some data which looks like:
# A tibble: 50 x 28
sanchinarro date holiday weekday weekend workday_on_holi… weekend_on_holi… protocol_active
<dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -1.01 2010-01-01 1 1 0 1 0 0
2 0.832 2010-01-02 0 0 1 0 0 0
3 1.29 2010-01-03 0 0 1 0 0 0
4 1.04 2010-01-04 0 1 0 0 0 0
5 0.526 2010-01-05 0 1 0 0 0 0
6 -0.292 2010-01-06 1 1 0 1 0 0
7 -0.394 2010-01-07 0 1 0 0 0 0
8 -0.547 2010-01-08 0 1 0 0 0 0
9 -0.139 2010-01-09 0 0 1 0 0 0
10 0.628 2010-01-10 0 0 1 0 0 0
I want to run xgb.cv on the first 40 rows and validate it on the final 10 rows.
I try the following:
library(xgboost)
library(dplyr)
X_Val <- ddd %>% select(-c(1:2))
Y_Val <- ddd %>% select(c(1)) %>% pull()
dVal <- xgb.DMatrix(data = as.matrix(X_Val), label = as.numeric(Y_Val))
xgb.cv(data = dVal, nround = 30, folds = NA, params = list(eta = 0.1, max_depth = 5))
which gives me this error:
Error in xgb.cv(data = dVal, nround = 30, folds = NA, eta = 0.1,
max_depth = 5) : 'folds' must be a list with 2 or more elements
that are vectors of indices for each CV-fold
How can I run a simple xgb.cv on the first 40 rows and test it on the last 10 rows?
I eventually want to apply a grid search with a list of parameters and save the results in a list. Since I am dealing with time-series data I do not want to mix the folds up; I just want a simple train and in-sample test split of 40:10.
Data:
ddd <- structure(list(sanchinarro = c(-1.00742964973274, 0.832453587904369,
1.29242439731365, 1.03688505875294, 0.525806381631517, -0.291919501762755,
-0.394135237187039, -0.547458840323464, -0.138595898626329, 0.628022117055801,
1.19020866188936, 1.5990716035865, 1.5990716035865, -0.70078244345989,
2.11015028070792, 1.95682667757149, 0.985777191040795, 0.883561455616511,
0.985777191040795, 0.270267043070807, 2.51901322240505, 2.41679748698077,
0.372482778495091, -0.291919501762755, -0.905213914308458, -0.905213914308458,
-0.649674575747748, 1.2413165296015, 1.54796373587436, -0.70078244345989,
-0.905213914308458, -0.0363801632020448, 1.54796373587436, 2.00793454528363,
1.54796373587436, -0.445243104899181, -0.445243104899181, 1.03688505875294,
0.628022117055801, -0.496350972611323, 0.168051307646523, -0.649674575747748,
0.0658355722222391, -1.00742964973274, -0.291919501762755, 0.0147277045100972,
0.168051307646523, -0.189703766338471, 0.219159175358665, 0.679129984767943
), date = structure(c(14610, 14611, 14612, 14613, 14614, 14615,
14616, 14617, 14618, 14619, 14620, 14621, 14622, 14623, 14624,
14625, 14626, 14627, 14628, 14629, 14630, 14631, 14632, 14633,
14634, 14635, 14636, 14637, 14638, 14639, 14640, 14641, 14642,
14643, 14644, 14645, 14646, 14647, 14648, 14649, 14650, 14651,
14652, 14653, 14654, 14655, 14656, 14657, 14658, 14659), class = "Date"),
holiday = c(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekday = c(1,
0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1), weekend = c(0, 1, 1, 0,
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
0, 1, 1, 0, 0, 0, 0, 0), workday_on_holiday = c(1, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), weekend_on_holiday = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), protocol_active = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_broken_clouds = c(0,
1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1), text_clear = c(0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1), text_fog = c(0, 1, 0, 1, 1, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 1, 0, 1, 0), text_partly_cloudy = c(0, 1, 0, 0, 0,
1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), text_partly_sunny = c(1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1), text_passing_clouds = c(1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1), text_scattered_clouds = c(1, 1,
0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1), text_sunny = c(0, 0, 0, 0,
0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1), month_1 = c(1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_2 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1), month_3 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_4 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_5 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_6 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_7 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), month_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_9 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_10 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_11 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-50L))
EDIT: List data:
The final data comes in the form of lists.
datalst <- list(structure(list(sanchinarro = c(-1.00742964973274, 0.832453587904369,
1.29242439731365, 1.03688505875294, 0.525806381631517, -0.291919501762755,
-0.394135237187039, -0.547458840323464, -0.138595898626329, 0.628022117055801,
1.19020866188936, 1.5990716035865, 1.5990716035865, -0.70078244345989
), date = structure(c(14610, 14611, 14612, 14613, 14614, 14615,
14616, 14617, 14618, 14619, 14620, 14621, 14622, 14623), class = "Date"),
holiday = c(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), weekday = c(1,
0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1), weekend = c(0, 1,
1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0), workday_on_holiday = c(1,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), weekend_on_holiday = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), protocol_active = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_broken_clouds = c(0,
1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), text_clear = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0), text_fog = c(0, 1,
0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0), text_partly_cloudy = c(0,
1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0), text_partly_sunny = c(1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1), text_passing_clouds = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), text_scattered_clouds = c(1,
1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1), text_sunny = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0), month_1 = c(1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), month_2 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_3 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_4 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), month_5 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_6 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), month_7 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), month_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_9 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), month_10 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), month_11 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), month_12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-14L)), structure(list(sanchinarro = c(0.832179838392013, 1.29225734336885,
1.03665872949283, 0.525461501740789, -0.292454062662475, -0.394693508212883,
-0.548052676538495, -0.139094894336863, 0.627700947291197, 1.19001789781844,
1.59897568002007, 1.59897568002007, -0.701411844864107, 2.11017290777211
), date = structure(c(14611, 14612, 14613, 14614, 14615, 14616,
14617, 14618, 14619, 14620, 14621, 14622, 14623, 14624), class = "Date"),
holiday = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekday = c(0,
0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1), weekend = c(1, 1,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0), workday_on_holiday = c(0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekend_on_holiday = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), protocol_active = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_broken_clouds = c(1,
0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_clear = c(0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1), text_fog = c(1, 0,
1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0), text_partly_cloudy = c(1,
0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0), text_partly_sunny = c(1,
1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0), text_passing_clouds = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), text_scattered_clouds = c(1,
0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0), text_sunny = c(0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1), month_1 = c(1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), month_2 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_3 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_4 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), month_5 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_6 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), month_7 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), month_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_9 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), month_10 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), month_11 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), month_12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-14L)), structure(list(sanchinarro = c(1.29293502084952, 1.03729933727253,
0.526027970118536, -0.292006217327851, -0.394260490758649, -0.547641900904846,
-0.138624807181653, 0.628282243549334, 1.19068074741873, 1.59969784114192,
1.59969784114192, -0.701023311051044, 2.11096920829591, 1.95758779814971
), date = structure(c(14612, 14613, 14614, 14615, 14616, 14617,
14618, 14619, 14620, 14621, 14622, 14623, 14624, 14625), class = "Date"),
holiday = c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekday = c(0,
1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0), weekend = c(1, 0,
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1), workday_on_holiday = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekend_on_holiday = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), protocol_active = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_broken_clouds = c(0,
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), text_clear = c(0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0), text_fog = c(0, 1,
1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0), text_partly_cloudy = c(0,
0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0), text_partly_sunny = c(1,
1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1), text_passing_clouds = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), text_scattered_clouds = c(0,
0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0), text_sunny = c(0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0), month_1 = c(1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), month_2 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_3 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_4 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), month_5 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_6 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), month_7 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), month_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_9 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), month_10 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), month_11 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), month_12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-14L)))
EDIT:
I think this gives me what I am after, but I need to double/triple check it (if you see any errors, please let me know).
splt <- 0.80 * nrow(ddd)
ddd[c(1:splt), "id"] = 1
ddd$id[is.na(ddd$id)] = 2
fold.ids <- unique(ddd$id)
custom.folds <- vector("list", length(fold.ids))
i <- 1
for( id in fold.ids){
custom.folds[[i]] <- which( ddd$id %in% id )
i <- i+1
}
custom.folds
cv <- xgb.cv(params = list(eta = 0.1, max_depth = 5), dVal, nround = 10, folds = custom.folds, prediction = TRUE)
cv$evaluation_log
I now need to find a way to apply this to all 3 lists in the "new" added data.
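One way to do that, assuming datalst as above and reusing the same first-80%/last-20% fold idea on each list element (the helper name run_cv is just a placeholder), is to wrap the steps in a function and lapply over the list:
run_cv <- function(df, train_frac = 0.8) {
  # same column layout as ddd: column 1 is the label, column 2 the date
  X <- as.matrix(df %>% select(-c(1:2)))
  Y <- df %>% select(c(1)) %>% pull()
  dmat <- xgb.DMatrix(data = X, label = as.numeric(Y))
  n_train <- floor(train_frac * nrow(df))
  custom.folds <- list(seq_len(n_train), seq(n_train + 1, nrow(df)))
  xgb.cv(params = list(eta = 0.1, max_depth = 5), data = dmat,
         nrounds = 10, folds = custom.folds, prediction = TRUE)
}
cv_list <- lapply(datalst, run_cv)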
Firstly, you should split the data into dtrain (the first 40 rows) and dval (the last 10 rows). Secondly, you need xgb.train rather than xgb.cv.
So your code should be modified to something like this:
library(xgboost)
library(dplyr)
# your code regarding ddd
X <- ddd %>% select(-c(1:2))
Y <- ddd %>% select(c(1)) %>% pull()
dtrain <- xgb.DMatrix(data = as.matrix(X[1:40,]), label = as.numeric(Y[1:40]))
dval <- xgb.DMatrix(data = as.matrix(X[41:50,]), label = as.numeric(Y[41:50]))
watchlist <- list(train=dtrain, val=dval)
model <- xgb.train(data=dtrain, watchlist=watchlist, nround = 30, eta = 0.1, max_depth = 5)
In my opinion, only 40+10 rows with such sparse features leave little hope of getting good results from XGBoost.
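Building on that, the grid search mentioned in the question could be sketched as a loop over parameter combinations using the same dtrain/watchlist split, keeping each result in a list; the grid values below are only placeholders:
grid <- expand.grid(eta = c(0.05, 0.1), max_depth = c(3, 5))
results <- vector("list", nrow(grid))
for (i in seq_len(nrow(grid))) {
  params <- list(eta = grid$eta[i], max_depth = grid$max_depth[i])
  fit <- xgb.train(params = params, data = dtrain, nrounds = 30,
                   watchlist = watchlist, verbose = 0)
  # evaluation_log holds the train and val metric per round for this parameter set
  results[[i]] <- list(params = params, eval_log = fit$evaluation_log)
}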

How to use ids from one dataframe to sum rows in another dataframe

I feel like this question has been asked before, but I can't seem to find an answer to it. Maybe my title is too vague, so feel free to change it.
So I have one data frame, a, with ids that correspond to column names in data frame b. Both data frames are simplified versions of a much larger data frame.
Here is data frame a:
a <- structure(list(V1 = structure(c(4L, 5L, 1L, 2L, 3L), .Label = c("GEN[D00105].GT",
"GEN[D00151].GT", "GEN[D00188].GT", "GEN[D86396].GT", "GEN[D86397].GT"
), class = "factor")), row.names = c(NA, -5L), class = "data.frame")
Here is data frame b:
b <- structure(list(`GEN[D01104].GT` = c(0, 0, 0, 0, 1, 0, 0, 2, 0,
1, 1, 1, 1, 0, 0, 0, 2, 0, 0, 0), `GEN[D01312].GT` = c(1, 0,
2, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 0, 0, 2, 0, 0, 0), `GEN[D01878].GT` = c(0,
0, 0, 2, 0, 0, 2, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 2, 0, 0), `GEN[D01882].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0), `GEN[D01952].GT` = c(0,
0, 1, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0), `GEN[D01953].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 0, 0, 2, 0), `GEN[D02053].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0), `GEN[D00316].GT` = c(0,
0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0), `GEN[D01827].GT` = c(0,
0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0), `GEN[D01881].GT` = c(0,
0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0), `GEN[D02044].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0), `GEN[D02085].GT` = c(0,
0, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0), `GEN[D02204].GT` = c(0,
0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0), `GEN[D02276].GT` = c(0,
0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0), `GEN[D02297].GT` = c(0,
0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0), `GEN[D02335].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 0), `GEN[D02397].GT` = c(0,
0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0), `GEN[D00856].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0), `GEN[D00426].GT` = c(0,
0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0), `GEN[D02139].GT` = c(0,
0, 1, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0), `GEN[D02168].GT` = c(0,
0, 2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0)), row.names = c(NA,
-20L), class = "data.frame")
I want to be able to use the ids from data frame a to sum, for each row of data frame b, the columns that have a matching id, if that makes sense.
So in the past, I just did something like
b$affected.samples <- (b$`GEN[D86396].GT` + b$`GEN[D86397].GT` + b$`GEN[D00105].GT` + b$`GEN[D00151].GT` + b$`GEN[D00188].GT`)
which got annoying and took too much time, so I moved over to
b$affected.samples <- rowSums(b[,c(1:5)])
Which isn't too bad for this example, but with my large data set my samples can be all over the place, and it's starting to take too much time to find where everything is. I was hoping there is a way to just use my data frame a to sum the correct columns in data frame b.
Hopefully I gave you all the information you need! Let me know if you have any questions.
Thanks in advance!!
Extract the 'V1' column as a character vector, use that to select the columns of 'b' (assuming these column names are found in 'b'), and get the rowSums:
rowSums( b[as.character(a$V1)], na.rm = TRUE)
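Since none of the ids in a actually appear as columns of b in the sample data shown, a slightly safer variant is to intersect the names first; a minimal sketch:
# only use ids that really exist as columns of b
ids <- intersect(as.character(a$V1), colnames(b))
b$affected.samples <- if (length(ids) > 0) rowSums(b[ids], na.rm = TRUE) else 0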

How to make a regression with many data frames

I have data about gender and petitioning. I want to run a regression between the "Femme" (woman) variable and the different issues of the petitions. I have grouped those issues into data frames under general themes, and those themes are what I want to regress against the "Femme" variable.
P.S. Some petitions cover several issues (e.g. water + science), so one petition could be counted in two data frames at the same time.
1) Here is what I did for all issues. This is an example with the "Aboriginal" issue to show how I coded the initial issues (you can also see the "Femme" variable at the beginning, which is already coded 0 and 1 in the original dataset under "Female"):
DataPetitions$Femme <- DataPetitions$Female
DataPetitions$Aboriginal <- NA
DataPetitions$Aboriginal[grepl("Aboriginal", DataPetitions$Issue)] <-1
DataPetitions$Aboriginal[!grepl("Aboriginal", DataPetitions$Issue)] <-0
# ... (same for all 24 specific issues)
2) Creating 7 data frames for general petitioning themes:
EnvironmentalIssues <- c(DataPetitions$AirQuality,DataPetitions$Biological, DataPetitions$Climate, DataPetitions$Environmental, DataPetitions$Toxic, DataPetitions$Waste, DataPetitions$Water)
EconomicIssues <- c(DataPetitions$Natural, DataPetitions$Transport)
SocialIssues <- c(DataPetitions$Aboriginal, DataPetitions$Health)
AgriculturalIssues <- c(DataPetitions$Agriculture,
DataPetitions$Fisheries, DataPetitions$Pesticides)
PoliticalIssues <- c(DataPetitions$Compliance, DataPetitions$Federal,
DataPetitions$Governance, DataPetitions$International)
ScientificIssues <- c(DataPetitions$Science)
OtherIssues <- c(DataPetitions$Other)
3) Trying to do a regression. This is my glm code:
model7 <- glm(DataPetitions$Femme ~ SocialIssues + PoliticalIssues +
ScientificIssues + EnvironmentalIssues + EconomicIssues +
AgriculturalIssues + OtherIssues, data = DataPetitions)
# When I try to run it, I get this error message:
Error in model.frame.default(formula = DataPetitions$Femme ~
SocialIssues + : variable lengths differ (found for
'SocialIssues')
With dput(head(DataPetitions,20)), I get this:
[...] class = "factor"), Femme = c(1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1), AuMoinsUneFemme = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1),
Homme = c(1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
1, 1, 1, 2), AuMoinsUnHomme = c(1, 0, 0, 0, 1, 1, 0, 1, 1,
1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1), Individual1 = c(0, 0, 0,
1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0), Group1 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1),
Organisation1 = c(1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
0, 0, 1, 0, 0, 0, 0), Aboriginal = c(1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0), Agriculture = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),
AirQuality = c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 0), Biological = c(0, 1, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), Climate = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1), Compliance = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0),
Environmental = c(0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), Federal = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Fisheries = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0), Governance = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Health = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), International = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), Natural = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), Other = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Pesticides = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), Science = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Toxic = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Transport = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Waste = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), Water = c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("Data.", "Title", "Number", "Issue", "Petitioner", "Individual", "Group", "Organisation",
"Female", "Male", "Unknown", "DateReceived", "Status", "Summary",
"Hyperlink", "Femme", "AuMoinsUneFemme", "Homme", "AuMoinsUnHomme",
"Individual1", "Group1", "Organisation1", "Aboriginal", "Agriculture",
"AirQuality", "Biological", "Climate", "Compliance", "Environmental",
"Federal", "Fisheries", "Governance", "Health", "International",
"Natural", "Other", "Pesticides", "Science", "Toxic", "Transport",
"Waste", "Water"), row.names = c(NA, 20L), class = "data.frame")
