Why am I getting this error when using glmnet in R - r

When trying to predict I am getting this error
error in evaluating the argument 'x' in selecting a method for function 'as.matrix': Cholmod error 'X and/or Y have wrong dimensions' at file ../MatrixOps/cholmod_sdmult.c, line 90
Here is my code so far
library(data.table)
library(caret)
library(Metrics)
library(glmnet)
library(plotmo)
library(lubridate)
#Reading in the necessary data needed for this project
train <- fread("project/volume/data/processed/start_train.csv")
test<-fread("project/volume/data/processed/start_test.csv")
example_sub<-fread("project/volume/data/processed/example_submission.csv")
card_tab <- fread("project/volume/data/processed/card_tab.csv")
#Merging the card_tab dataset with both my train and test datasets to add more variables to each
train = merge(train, card_tab, by = "id")
test = merge(test, card_tab, by = "id")
train$power = as.numeric(train$power)
train$toughness = as.numeric(train$toughness)
test$power = as.numeric(test$power)
test$toughness = as.numeric(test$toughness)
train$powerovertough = train$power/train$toughness
test$powerovertough = test$power/test$toughness
train$current_date<-as_date(train$current_date)
train<-train[order(-current_date)]
test$current_date<-as_date(test$current_date)
test<-test[order(-current_date)]
#Handling NA values in both train and test. The NA values will be set to 0
train[is.na(train)] <- 0
test[is.na(test)] <- 0
# Specifying which columns of our model we will be dropping and applying it to our train and test datasets
drops<- c('id','future_date','current_date','card_name','power','loyalty','cmc','type','colors','mana_cost','subtypes', 'text','set','set_name')
train<-train[, !drops, with = FALSE]
test<-test[, !drops, with = FALSE]
#Saving the response variable in the train dataset
train_y<-train$future_price
test$future_price<-0
#Using dummies to train model
dummies <- dummyVars(future_price ~ ., data = train)
train<-predict(dummies, newdata = train)
test<-predict(dummies, newdata = test)
train<-data.table(train)
test<-data.table(test)
#Cross validating the model to fin the best lamda value
train<-as.matrix(train)
test<-as.matrix(test)
gl_model<-cv.glmnet(train, train_y, alpha = 1,family="gaussian")
bestlam<-gl_model$lambda.min
# Fitting the full model
gl_model<-glmnet(train, train_y, alpha = 1,family="gaussian")
plot_glmnet(gl_model)
saveRDS(gl_model,"./project/volume/models/gl_model.model")
test<-as.matrix(test)
#use the full model
pred<-predict(gl_model,s=bestlam, newx = test)
I am trying to predict future_price for my test set. The error is saying my dimensions are wrong but I don't know what could be causing them to be different. I have tried observing the data sets as it runs through the code and they seem to have the same variables.
Here is the dput
> dput(head(train))
structure(c(0.25, 0.1, 0.1, 0.1, 0.25, 0.25, 1, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 3, 0, 2, 0, 0, 0, 0.333333333333333,
0, 1, 0), .Dim = c(6L, 20L), .Dimnames = list(NULL, c("current_price",
"typesArtifact", "typesArtifact Creature", "typesCreature", "typesEnchantment",
"typesEnchantment Artifact", "typesEnchantment Creature", "typesInstant",
"typesLand", "typesPlaneswalker", "typesSorcery", "supertypes",
"supertypesBasic", "supertypesLegendary", "rarityCommon", "rarityMythic",
"rarityRare", "rarityUncommon", "toughness", "powerovertough"
)))
> dput(head(test))
structure(c(0.15, 0.16, 2, 0.39, 0.16, 0.19, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0), .Dim = c(6L, 18L), .Dimnames = list(
NULL, c("current_price", "typesArtifact", "typesArtifact Creature",
"typesCreature", "typesEnchantment", "typesInstant", "typesLand",
"typesPlaneswalker", "typesSorcery", "supertypes", "supertypesBasic",
"supertypesLegendary", "rarityCommon", "rarityMythic", "rarityRare",
"rarityUncommon", "toughness", "powerovertough")))

Related

lapply Question: How to streamline further without generating errors

I'm looking to condense the steps in my script, but I'm having issues with lapply(). It looks to be an issue with my code as usual. Any help would be much appreciated!
library(iNEXT)
sa4 <- list(Bird = list(structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0), .Dim = c(26L,
6L), .Dimnames = list(Scientific_name = c(" Pycnonotus plumosus",
"Acridotheres javanicus", "Aegithina tiphia", "Aethopyga siparaja",
"Anthreptes malacensis", "Aplonis panayensis", "Cacatua goffiniana",
"Callosciurus notatus", "Cinnyris jugularis", "Copsychus malabaricus",
"Copsychus saularis", "Dicaeum cruentatum", "Dicrurus paradiseus",
"Gorsachius melanolophus", "Larvivora cyane", "Macronus gularis",
"Oriolus chinensis", "Orthotomus atrogularis", "Otus lempiji",
"Pitta moluccensis", "Pycnonotus goiavier", "Pycnonotus plumosus",
"Pycnonotus zeylanicus", "Spilopelia chinensis", "Todiramphus chloris",
"Zosterops simplex"), Sampling_Point = c("SA_01", "SA_02", "SA_03",
"SA_04", "SA_05", "SA_06")))), Butterfly = list(structure(c(0,
0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0), .Dim = c(10L,
4L), .Dimnames = list(Scientific_name = c("Burara harisa consobrina",
"Catopsilia pyranthe pyranthe", "Catopsilia scylla cornelia",
"Delias hyparete metarete", "Eurema sp", "Idea leuconoe clara",
"Pachliopta aristolochiae asteris", "Phalanta phalantha phalantha",
"Troides helena cerberus", "Zizula hylax pygmaea"), Sampling_Point = c("SA_01",
"SA_02", "SA_04", "SA_06")))), Mammal = list(structure(c(0, 1,
1, 1, 1, 0), .Dim = 2:3, .Dimnames = list(Scientific_name = c("Callosciurus notatus",
"Unidentified Fruit Bat sp"), Sampling_Point = c("SA_03", "SA_04",
"SA_05")))), Reptile = list(structure(1, .Dim = c(1L, 1L), .Dimnames = list(
Scientific_name = "Hemidactylus frenatus", Sampling_Point = "SA_05"))))
I've been doing it the longer way:
estimateD(sa4$Butterfly, datatype="incidence_raw") #Sampling coverage for butterflies
estimateD(sa4$Mammal, datatype="incidence_raw") #Sampling coverage for mammals
estimateD(sa4$Bird, datatype="incidence_raw") #Sampling coverage for birds
estimateD(sa4$Reptile, datatype="incidence_raw") #Sampling coverage for reptiles
Note that estimateD(sa4$Reptile, datatype="incidence_raw" generates an error since it only has one species.
Is it possible to condense the following steps via lapply? In this situation I've only have 4 taxa, but for other projects, it might be a lot more. I tried the following and it gives me a warning message--which actually is the same warning message as the one above. I'm wondering if lapply stops working if one component gives an error?
> (lapply(sa4, function(x) estimateD(x, datatype="incidence_raw")) )
Error in `[.data.frame`(tmp, , c(1, 2, 3, 7, 4, 5, 6)) :
undefined columns selected
In addition: Warning messages:
1: In FUN(X[[i]], ...) :
Invalid data type, the element of species by sites presence-absence matrix should be 0 or 1. Set nonzero elements as 1.
2: In log(b/Ub) : NaNs produced
Please let me know if I need to provide more information? Thank you!
This is a simple error trapping issue. Wrap tryCatcharound your problem function call and have the error function return information on what happened.
results <- lapply(sa4, function(x) {
tryCatch(estimateD(x, datatype="incidence_raw"),
error = function(e) e)
})
Now determine which ran alright.
ok <- !sapply(results, inherits, "error")
ok
# Bird Butterfly Mammal Reptile
# TRUE TRUE TRUE FALSE
And check those that did.
results[ok]
It is the issue with the 'Reptiles', so if we select the first 3 elements of the list, it should work
lapply(sa4[1:3], function(x) estimateD(x, datatype="incidence_raw"))

How to use ids from one dataframe to sum rows in another dataframe

I feel like this answer has been asked before, but I can't seem to find an answer to this question. Maybe my title is too vague, so feel free to change it.
So I have one data frame, a, with ids the correspond to column name in data frame b. Both data frames are simplified versions of a much larger data frame.
here is data frame a
a <- structure(list(V1 = structure(c(4L, 5L, 1L, 2L, 3L), .Label = c("GEN[D00105].GT",
"GEN[D00151].GT", "GEN[D00188].GT", "GEN[D86396].GT", "GEN[D86397].GT"
), class = "factor")), row.names = c(NA, -5L), class = "data.frame")
here is data frame b
b <- structure(list(`GEN[D01104].GT` = c(0, 0, 0, 0, 1, 0, 0, 2, 0,
1, 1, 1, 1, 0, 0, 0, 2, 0, 0, 0), `GEN[D01312].GT` = c(1, 0,
2, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 0, 0, 2, 0, 0, 0), `GEN[D01878].GT` = c(0,
0, 0, 2, 0, 0, 2, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 2, 0, 0), `GEN[D01882].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0), `GEN[D01952].GT` = c(0,
0, 1, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0), `GEN[D01953].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 0, 0, 2, 0), `GEN[D02053].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0), `GEN[D00316].GT` = c(0,
0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0), `GEN[D01827].GT` = c(0,
0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0), `GEN[D01881].GT` = c(0,
0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0), `GEN[D02044].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0), `GEN[D02085].GT` = c(0,
0, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0), `GEN[D02204].GT` = c(0,
0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0), `GEN[D02276].GT` = c(0,
0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0), `GEN[D02297].GT` = c(0,
0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0), `GEN[D02335].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 0), `GEN[D02397].GT` = c(0,
0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0), `GEN[D00856].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0), `GEN[D00426].GT` = c(0,
0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0), `GEN[D02139].GT` = c(0,
0, 1, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0), `GEN[D02168].GT` = c(0,
0, 2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0)), row.names = c(NA,
-20L), class = "data.frame")
I want to be able to use the ids from data frame a to sum the row in data frame b that have a matching id if that makes sense.
So in the past, I just did something like
b$affected.samples <- (b$`GEN[D86396].GT` + b$`GEN[D86397].GT` + b$`GEN[D00105].GT` + b$`GEN[D00151].GT` + b$`GEN[D00188].GT`)
which got annoying and took to much time, so I moved over to
b$affected.samples <- rowSums(b[,c(1:5)])
Which isn't too bad for this example but with my large data set, my sample can be all over the place, and it's starting to take too much time to finds where everything is. I was hoping there is a way just to use my data frame a to sum the correct rows in data frame b.
Hopefully, I gave this is all the information you need! Let me know if you have any questions.
Thanks in advance!!
Extract the 'V1' column as a character string, use that to select the columns of 'b' (assuming these column names are found in 'b') and get the rowSums
rowSums( b[as.character(a$V1)], na.rm = TRUE)

Planned Contrasts on glmmTMB

Apologies if this is a repeat question. Many have posted looking looking for a way to do post-hoc analyses on the conditional model (fixed factors) in glmmTMB. I want to do plannned contrasts between certain groups, not test every pairwise comparison (e.g. Tukey).
The code below worked well on nlme:lme for a lmm. However, it returns an error on the code below.
Error in modelparm.default(model, ...) :
dimensions of coefficients and covariance matrix don't match
Is there a way to do planned contrasts on a glmmTMB?
#filtdens is a dataframe and TRT,DATE,BURN,VEG are factors
filtdens <- merged %>% filter(!BLOCK %in% c("JB2","JB4","JB5") & MEAS =="DENS" &
group == "TOT" & BURN == "N" & VEG == "C")
filtdens$TD <- interaction(filtdens$TRT, filtdens$DATE)
mod2 <- glmmTMB(count~(TD)+(1|BLOCK),
data=filtdens,
zi=~1,
family=nbinom1(link = "log"))
k1 <- matrix(c(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, -1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, -1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, -1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, -1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 1), byrow = T, ncol = 12)
summary(glht(mod2, linfct=k1),test=adjusted("bonferroni"))
A reproducible example would be helpful, but: this vignette in the development version offers code that ought to enable multcomp::linfct, i.e.:
glht_glmmTMB <- function (model, ..., component="cond") {
glht(model, ...,
coef. = function(x) fixef(x)[[component]],
vcov. = function(x) vcov(x)[[component]],
df = NULL)
}
modelparm.glmmTMB <- function (model,
coef. = function(x) fixef(x)[[component]],
vcov. = function(x) vcov(x)[[component]],
df = NULL, component="cond", ...) {
multcomp:::modelparm.default(model, coef. = coef., vcov. = vcov.,
df = df, ...)
}
Test (this example is with Tukey, but I don't see why it shouldn't work more generally ...)
library(glmmTMB)
data("cbpp",package="lme4")
cbpp_b1 <- glmmTMB(incidence/size~period+(1|herd),
weights=size,family=binomial,
data=cbpp)
g1 <- glht(cbpp_b1, linfct = mcp(period = "Tukey"))
summary(g1)
This works with the current CRAN version, but the current development version of glmmTMB offers more options (e.g. emmeans(); see the above-linked vignette). You'll need to install via devtools::install_github("glmmTMB/glmmTMB/glmmTMB") (you'll need compilation tools installed as well).

Stacking models from different packages

I am doing bankruptcy prediction (classification) with SVM in the e1071 package. To improve my results I would like to combine it with random forest in the caret package. Firstly, I will show my RF model, and then I will show the SVM model. And after that I will show my attempts to combine(stack) them.
Sorry in advance for messy code. I am new to all this.
RF model (caret package)
set.seed(123)
model.rf <- train(as.factor(year.of.bankruptcy) ~ ., method = "rf", data = training.set)
predict.rf <- predict(model.rf, testing.set[,-1])
RF model accuracy
confusionMatrix(predict.rf, testing.set$year.of.bankruptcy, mode="everything")$overall[1]
-> this gives me the accuracy of the model:
Accuracy
0.7166667
SVM (e1071 package)
set.seed(123)
model1<-function(k,d,c,g){
model <-svm(year.of.bankruptcy ~., data = training.set, type = "C-classification", kernel = k, degree= d, cost =c, gamma =g)
1<-testing.set[,-1]
2<-testing.set$year.of.bankruptcy
model_prediction <- predict(model, 1)
result<-table(model_prediction, 2)
return(result)
}
result<-model1(k="radial", d=2, c=2,g=0.1)
result
classAgreement(tab=result, match.names = FALSE)
classAgreement(tab=result, match.names = FALSE)$diag
-> this gives me the accuracy of the model:
[1] 0.7466667
Stacking models together
predictDF <- data.frame(predict.rf, classAgreement(tab=result, match.names = FALSE)$diag, class = testing.set$year.of.bankruptcy)
predictDF_bc <- ROSE(class ~.,predictDF, N=300, p=0.5, seed=12)$data
set.seed(123)
combined.model.gbm <- train(as.factor(class) ~ ., method = "gbm", data = predictDF_bc, distribution = "bernoulli")
combined.prediction.gbm <- predict(combined.model.gbm, predictDF)
Evaluating the model
confusionMatrix(combined.prediction.gbm, testing.set$year.of.bankruptcy)$overall[1]`enter code here`
-> this gives me the accuracy of the stacked model:
Accuracy
0.7166667
As you can see, the combined model does not take the SVM into account. Since my combined score is lower than my SVM score. Any suggestions to what I could do?
> dput(training.set[sample(1:nrow(training.set), 50),])
structure(list(year.of.bankruptcy = c(-1, -1, -1, -1, -1, -1, -1, 1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, 1, -1, -1, -1, -1, -1), liquidity_1 = c(90.0695931477516,
85.4305617311398, 76.2455934195065, 4.34688111280157, 159.020111900801,
104.569486404834, 58.3391003460208, 42.0907973873116, 101.121495327103,
94.3786295005807, 47.7552816901408, 125.702184574231, 125.763725699637,
106.584557081952, 0, 143.6203466894, 82.5245328673209, 35.296442687747,
8.85744561490993, 12.4657534246575, 128.164489183979, 133.131146034372,
92.0528568769775, 22.8177150192555, 100.237812128419, 40.0340715502555,
91.360486091332, 129.123757904246, 92.9165443694355, 130.999694283094,
22.2526106414719, 101.714770797963, 93.1704260651629, 46.6268560361524,
125.838858750251, 106.076759061834, 86.787017476474, 84.7495991700462,
42.1171171171171, 68.806311160926, 93.1549687282835, 104.196667352397,
47.0834921845215, 77.8816199376947, 76.9065981148243, 90.988709507228,
98.9704873026767, 163.446031970576, 113.768115942029, 92.9742188833874
), profmarg_1 = c(241.916488222698, 215.221579961464, 633.490011750881,
0, 173.627703009224, 193.164652567976, 3.32179930795848, 82.390221819828,
131.842456608812, 102.044134727062, 0, 7.2447614801605, 113.608203375347,
169.208905731881, 0, 179.866439329355, 250.396558677242, 48.0632411067194,
0, 12.8082191780822, 0.963803812379525, 0, 452.279918109064,
0, 16.4090368608799, 11.4449434722007, 173.331434539068, 240.216802168022,
307.709617454261, 179.883827575665, 281.476877175535, 539.609507640068,
183.12447786132, 31.8431245965139, 151.215591721921, 95.3980099502487,
259.97695410025, 174.073375459776, 11.986986986987, 160.94322541708,
119.110493398193, 428.03949804567, 194.624475791079, 325.877466251298,
37.2322193658955, 245.71066793289, 207.343857240906, 22.49257320696,
43.6487638533674, 97.4987194809629), drmarg = c(1.46603230803275,
12.6575304731079, -0.798553144129104, 53.3333333333333, 11.8097892353249,
29.1893259137473, 60.4166666666667, -23.041601255887, 1.21518987341772,
6.1535019019915, 82.4626865671642, -4, 4.47536667920271, -3.69540873460246,
65.3543307086614, 6.46738701790362, -3.63987759703656, 0.575657894736842,
70.2460850111857, 45.4545454545455, -724.444444444444, 18.809947734191,
3.22818215293973, 92.9292929292929, 6.52173913043478, 50.8680555555556,
4.88031987730733, 19.9684115523466, 1.1446376903755, 13.3729821580289,
1.22027317479027, 4.0986955838441, -3.29607664233577, 73.4414597060314,
3.95960669678448, 28.6645874681032, 17.2991867598802, 10.8455534851063,
55.741127348643, 8.98526582981339, 7.36196319018405, 4.85894170231172,
10.4852855193919, -1.6774275224712, 16.3210702341137, 2.47726693294808,
5.64784053156146, 59.622641509434, 11.0029211295034, 50.5987773218323
), ROA = c(3.546573875803, 27.2417370683267, -5.05875440658049,
6.52032166920235, 20.5050657795252, 87.1601208459215, 2.00692041522491,
-18.9840263855655, 1.60213618157543, 6.38792102206736, 9.72711267605634,
-0.356665180561748, 5.08438367870113, -6.25296068214116, 3.53041259038707,
11.6510372264848, -9.11412824304342, 0.276679841897233, 5.87171975316337,
5.82191780821918, -6.98222317412722, 30.0983365499495, 14.6845337800112,
11.8100128369705, 1.07015457788347, 6.05028134840741, 8.45912845343207,
47.9674796747967, 3.52216025829175, 24.0599205136044, 4.37593237195425,
22.1392190152801, -6.0359231411863, 23.3860555196901, 5.98754269640346,
35.9275053304904, 46.5719224121375, 18.9380364047911, 6.68168168168168,
19.5326981937319, 9.17303683113273, 20.7981896729068, 20.5108654212734,
-5.50363447559709, 10.4541559554413, 6.15173578136541, 12.4456646076413,
13.4106662894327, 4.81670929241262, 51.5793068123613), debt_ratio_1 = c(75.6423982869379,
157.077219504965, 180.975323149236, 88.958921973484, 96.869801905338,
93.0513595166163, 78.6159169550173, 131.707948004915, 132.096128170895,
100.789779326365, 28.080985915493, 48.1497993758359, 85.6868190557573,
85.5518711511132, 75.4714305969091, 92.0431940892299, 123.551552628041,
43.8735177865613, 89.2601134451162, 69.0547945205479, 29.727993146284,
110.265600588181, 154.662199888331, 54.2362002567394, 20.9274673008323,
79.0666460172423, 150.536409380044, 101.355013550135, 145.827218471774,
45.2155304188322, 123.222277473894, 134.90662139219, 123.141186299081,
41.7043253712072, 66.2648181635523, 26.5813788201848, 95.1411561359708,
105.191926813166, 7.60760760760761, 179.997413458637, 92.7032661570535,
121.49763423164, 96.3400686237133, 129.823468328141, 39.502999143102,
136.213991769547, 119.01166781057, 84.8210496534163, 8.99403239556692,
113.957657503842), young = c(1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), medium_age = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
0, 1, 0, 1, 0, 0, 0), old = c(0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0), agriculture = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), offshore_shipping = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), transport = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), manufacturing = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), telecom_it_tech = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), electricity = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), construction = c(0,
0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1), wholesale_retail = c(0, 0, 1, 0, 1, 0,
0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
0, 0), finance = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), change_output = c(0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549, 0.0495184733103549, 0.0495184733103549, 0.0495184733103549,
0.0495184733103549), oil_price_dummy = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), fish_price_dummy = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0.180737819481274, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.180737819481274, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("year.of.bankruptcy", "liquidity_1",
"profmarg_1", "drmarg", "ROA", "debt_ratio_1", "young", "medium_age",
"old", "agriculture", "offshore_shipping", "transport", "manufacturing",
"telecom_it_tech", "electricity", "construction", "wholesale_retail",
"finance", "change_output", "oil_price_dummy", "fish_price_dummy"
), row.names = c(19L, 49L, 25L, 53L, 56L, 3L, 31L, 50L, 58L,
62L, 51L, 24L, 35L, 29L, 6L, 44L, 12L, 2L, 15L, 42L, 39L, 30L,
27L, 40L, 26L, 41L, 21L, 22L, 11L, 63L, 32L, 60L, 36L, 52L, 1L,
14L, 37L, 34L, 8L, 43L, 4L, 10L, 9L, 54L, 59L, 64L, 23L, 20L,
17L, 13L), class = "data.frame")
Stacking models is done quite easily with caretEnsemble library.
Here is an example:
library(mlbench) #for the data set
library(caret)
library(caretEnsemble)
data(PimaIndiansDiabetes)
set.seed(123)
list the algorithms to use:
algorithmList <- c("svmRadial", "rf" )
if you would like to specify the tuning parameters in each model use tuneList argument in caretList function:
in trainControl, savePredictions = "final" and classProbs = TRUE are mandatory
control <- trainControl(method = "repeatedcv", number = 4, repeats = 3,
savePredictions = "final" , classProbs = TRUE)
models <- caretList(diabetes ~ ., data = PimaIndiansDiabetes, trControl = control,
metric = "Kappa", methodList = algorithmList)
results <- resamples(models)
summary(results)
#output
Call:
summary.resamples(object = results)
Models: svmRadial, rf
Number of resamples: 12
Accuracy
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
svmRadial 0.6979167 0.7135417 0.7343750 0.7304688 0.7447917 0.7604167 0
rf 0.7291667 0.7604167 0.7682292 0.7690972 0.7760417 0.8125000 0
Kappa
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
svmRadial 0.2637842 0.3570103 0.4053130 0.3917770 0.4394767 0.4775359 0
rf 0.3788379 0.4612661 0.4788076 0.4809233 0.5028566 0.5785880 0
now the stack,
stack.glm <- caretStack(models, method = "glm", metric = "Kappa", trControl = control)
print(stack.glm)
#output
A glm ensemble of 2 base models: svmRadial, rf
Ensemble results:
Generalized Linear Model
2304 samples
2 predictor
2 classes: 'neg', 'pos'
No pre-processing
Resampling: Cross-Validated (4 fold, repeated 3 times)
Summary of sample sizes: 1728, 1728, 1728, 1728, 1728, 1728, ...
Resampling results:
Accuracy Kappa
0.7667824 0.4685406
or a gbm stack
stack.gbm <- caretStack(models, method="gbm", metric = "Kappa", trControl = control)
print(stack.gbm)
#output
A gbm ensemble of 2 base models: svmRadial, rf
Ensemble results:
Stochastic Gradient Boosting
2304 samples
2 predictor
2 classes: 'neg', 'pos'
No pre-processing
Resampling: Cross-Validated (4 fold, repeated 3 times)
Summary of sample sizes: 1728, 1728, 1728, 1728, 1728, 1728, ...
Resampling results across tuning parameters:
interaction.depth n.trees Accuracy Kappa
1 50 0.7693866 0.4832061
1 100 0.7675058 0.4785977
1 150 0.7663484 0.4753614
2 50 0.7662037 0.4748160
2 100 0.7638889 0.4684015
2 150 0.7634549 0.4653090
3 50 0.7630208 0.4657834
3 100 0.7612847 0.4606506
3 150 0.7569444 0.4511977
Tuning parameter 'shrinkage' was held constant at a value of 0.1
Tuning parameter 'n.minobsinnode' was
held constant at a value of 10
Kappa was used to select the optimal model using the largest value.
The final values used for the model were n.trees = 50, interaction.depth = 1, shrinkage = 0.1 and n.minobsinnode
= 10.
So the kappa values for
svm: 0.3917770
rf: 0.4809233
glm ensemble: 0.4685406
gbm ensemble: 0.4832061 - this would probably be higher if more models were used
EDIT: with the data provided in the OP:
first convert year.of.bankruptcy to a factor
data$year.of.bankruptcy <- as.factor(data$year.of.bankruptcy)
set level names to something that wont throw an error:
levels(data$year.of.bankruptcy) <- c("minus", "plus")
and go forward
control <- trainControl(method = "repeatedcv", number = 4, repeats = 3,
savePredictions = "final" , classProbs = TRUE)
models <- caretList(year.of.bankruptcy ~ ., data = data, trControl = control,
metric = "Kappa", methodList = algorithmList)
I get warnings about zero variance predictors but this is probably caused by the small data sample. If you see errors like:
In .local(x, ...) : Variable(s) `' constant. Cannot scale data.
on the whole data set then its worth looking into removing near zero variance predictors. There is a nice chapter about this here. Good luck

How to make a regression with many data frames

I have data about gender and petitioning. I want to make a regression between the "Femme" (woman) variable and the different issues of the petitions. I have regrouped those issues into data frames under general themes, and those themes are what I want to regress with the "Femme" (woman) variable.
P.S.: Some petitions have many issues (ex.: water + science). So one petition could be counted in two data frames at the same time.
1) Here is what I did for all issues, this one is an example with the "Aboriginal" issue to show you how I coded the initial issues (you can also see the "Femme" variable at the beginning, which is already coded "0" and "1" in the original dataset under "Female"):
DataPetitions$Femme <- DataPetitions$Female
DataPetitions$Aboriginal <- NA
DataPetitions$Aboriginal[grepl("Aboriginal", DataPetitions$Issue)] <-1
DataPetitions$Aboriginal[!grepl("Aboriginal", DataPetitions$Issue)] <-0
# ... (same for all 24 specific issues)
2) Creating 7 data frames for general petitioning themes:
EnvironmentalIssues <- c(DataPetitions$AirQuality,DataPetitions$Biological, DataPetitions$Climate, DataPetitions$Environmental, DataPetitions$Toxic, DataPetitions$Waste, DataPetitions$Water)
EconomicIssues <- c(DataPetitions$Natural, DataPetitions$Transport)
SocialIssues <- c(DataPetitions$Aboriginal, DataPetitions$Health)
AgriculturalIssues <- c(DataPetitions$Agriculture,
DataPetitions$Fisheries, DataPetitions$Pesticides)
PoliticalIssues <- c(DataPetitions$Compliance, DataPetitions$Federal,
DataPetitions$Governance, DataPetitions$International)
ScientificIssues <- c(DataPetitions$Science)
OtherIssues <- c(DataPetitions$Other)
3) Trying to do a regression. This is my glm code:
model7 <- glm(DataPetitions$Femme ~ SocialIssues + PoliticalIssues +
ScientificIssues + EnvironmentalIssues + EconomicIssues +
AgriculturalIssues + OtherIssues, data = DataPetitions)
# When I try to run it, I get this error message:
Error in model.frame.default(formula = DataPetitions$Femme ~
SocialIssues + : variable lengths differ (found for
'SocialIssues')
With dput(head(DataPetitions,20)), I get this:
[...] class = "factor"), Femme = c(1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1), AuMoinsUneFemme = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1),
Homme = c(1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
1, 1, 1, 2), AuMoinsUnHomme = c(1, 0, 0, 0, 1, 1, 0, 1, 1,
1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1), Individual1 = c(0, 0, 0,
1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0), Group1 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1),
Organisation1 = c(1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
0, 0, 1, 0, 0, 0, 0), Aboriginal = c(1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0), Agriculture = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),
AirQuality = c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 0), Biological = c(0, 1, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), Climate = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1), Compliance = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0),
Environmental = c(0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), Federal = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Fisheries = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0), Governance = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Health = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), International = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), Natural = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), Other = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Pesticides = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), Science = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Toxic = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Transport = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Waste = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), Water = c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("Data.", "Title", "Number", "Issue", "Petitioner", "Individual", "Group", "Organisation",
"Female", "Male", "Unknown", "DateReceived", "Status", "Summary",
"Hyperlink", "Femme", "AuMoinsUneFemme", "Homme", "AuMoinsUnHomme",
"Individual1", "Group1", "Organisation1", "Aboriginal", "Agriculture",
"AirQuality", "Biological", "Climate", "Compliance", "Environmental",
"Federal", "Fisheries", "Governance", "Health", "International",
"Natural", "Other", "Pesticides", "Science", "Toxic", "Transport",
"Waste", "Water"), row.names = c(NA, 20L), class = "data.frame")

Resources