Related
I am trying to create two calibration plots (for two different models) but it does not seem to work. My data (a subset of them):
structure(list(X1 = c(0.0205881308065423, 0.030107400545467,
0.0224902821967529, 0.067082269138019, 0.0128933436225658, 0.010528298470225,
0.0448801718109416, 0.0147825838164296, 0.00127338570492985,
0.0187288824619526, 0.0166935353708351, 0.000827013756910522,
0.000268624600100464, 0.00022554771787564, 0.000239290116892055,
0.00046320712675918, 0.0127930773405932, 0.123559021969098, 0.00196413334593659,
0.00267343502355055, 0.0119560304531064, 0.0151288958940289,
0.0450932732709064, 0.284128554073485, 0.0435626434150131, 0.00919667587971063,
0.241220354905637, 0.0188148171033879, 0.0116570772346002, 0.0159496690575734,
0.00518918742249186, 0.0319701660388646, 0.100234998067917, 0.0119794232466471,
0.00123658099677804, 0.00178774726967923, 0.00215162606048125,
0.028398874195245, 0.02727277199735, 0.0536089031118459, 0.00567355556708304,
0.00182798929912398, 0.0221311523302337, 0.0317268552025847,
0.241167765332718, 0.201815176728704, 0.00750328900855035, 0.00346824263327472,
0.00859464311717095, 0.00488864781312837), X2 = c(0.0123677690429329,
0.0275541038901166, 0.0166991553536275, 0.0260168210079643, 0.00693728726147325,
0.00464096927279578, 0.0124618831179862, 0.0184586073538044,
0.00569866130459529, 0.00293809224808261, 0.00119326039429657,
0.00316749683866091, 0.00419982136508501, 0.00140900547876921,
0.00110999833888004, 0.00276678547598108, 0.0162868658191231,
0.0649037872628959, 0.00123222675644274, 0.00171687152904065,
0.0152583510689248, 0.0258721612337077, 0.0392641646035583, 0.361960538193137,
0.0357326269142103, 0.0107540980920499, 0.22279499286353, 0.0301267823507665,
0.0144535141006957, 0.0124677305707919, 0.00520476987173168,
0.0320510777198151, 0.0770024283430764, 0.00793648556749427,
0.000401508352378066, 0.000498605187176815, 0.000982487695277534,
0.0399009464278308, 0.0698023981838097, 0.0506533144316593, 0.00462517180839983,
0.00275731807224233, 0.0374332227392187, 0.0582978817333271,
0.121896031487931, 0.236774303454737, 0.0106755443754257, 0.00398238213200619,
0.0113833654830731, 0.00708983623072867), X3 = c(0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0)), row.names = c(NA, 50L), class = "data.frame")
and the code I use:
par(mfrow = c(1,2),
oma=c(5,5,0,0) + 0.0,
mar=c(0,0,1,1) + 0.0)
val.prob.ci.2(p = lo$X1, y = lo$X3, smooth="loess", legendloc=F, lwd.smooth=2, lwd.ideal=2, lty.ideal=2, dostats = F, axes = F)
axis(side=2,at=c(0,0.2,0.4,0.6,0.8,1))
box(which="plot")
text(x=0, y=1, adj=0, "LG", cex=1.3)
val.prob.ci.2(p = lo$X2, y = lo$X3, smooth="loess", legendloc=F, lwd.smooth=2, lwd.ideal=2, lty.ideal=2, dostats = F, axes = F)
box(which="plot")
text(x=0, y=1, adj=0, "rf", cex=1.3)
title(xlab = list("Estimated probability",cex=1.5),
ylab = list("Observed proportion",cex=1.5),
outer = TRUE)#, line=3)
the output is:
The problem lies with the second plot. Why it does not appear?
I have a data set I'm trying to run a glm regression on, however it contains characters as age limit, race, and comorbidity class. I would like to change those columns into a continuous variable so the regression can accept it. Data below, I want to change the TBI.irace2 into (Hispanic=1, Black=2, white=3, and other=4) same with age (age 18-28=1, 29-46=2, 47-64=3, and >64=4) and with NISS (NISS 0-10=1, NISS 11-20=2, NISS 21-30=3, and NISS 31-40=4, NISS41-50=5, NISS 51-60=6, NISS 61-70=7, NISS>70= 8)
Please find summary of data below
TBI.crani = c(0, 0, 0, 0, 0, 0), TBI.vte = c(0,
0, 0, 0, 0, 0), TBI.FEMALE = c(0, 0, 1, 0, 1, 0), TBI.iracecat2 = c("Whites",
"Whites", "Whites", "Hispanics", "Whites", "Blacks"), TBI.agecat = c("Age 47-64",
"Age 29-46", "Age > 64", "Age 29-46", "Age 18-28", "Age 18-28"
), TBI.nisscategory = c("NISS 21-30", "NISS 11-20", "NISS 21-30",
"NISS 11-20", "NISS 11-20", "NISS 0-10"), TBI.LOS = c(5, 8, 1,
3, 19, 1), TBI.hospitalteach = c(0, 0, 1, 1, 1, 1), TBI.largebedsize = c(1,
1, 1, 1, 1, 1), TBI.CM_ALCOHOL = c(0, 0, 0, 1, 0, 0), TBI.CM_ANEMDEF = c(0,
0, 0, 0, 0, 0), TBI.CM_BLDLOSS = c(0, 0, 0, 0, 0, 0), TBI.CM_CHF = c(1,
0, 0, 0, 0, 0), TBI.CM_CHRNLUNG = c(0, 0, 0, 0, 0, 0), TBI.CM_COAG = c(0,
0, 0, 0, 1, 0), TBI.CM_HYPOTHY = c(0, 0, 0, 0, 0, 0), TBI.CM_LYTES = c(0,
0, 0, 0, 0, 0), TBI.CM_METS = c(0, 0, 0, 0, 0, 0), TBI.CM_NEURO = c(0,
0, 0, 0, 0, 0), TBI.CM_OBESE = c(0, 0, 0, 0, 0, 0), TBI.CM_PARA = c(0,
0, 0, 0, 0, 0), TBI.CM_PSYCH = c(0, 1, 0, 0, 0, 0), TBI.CM_TUMOR = c(0,
0, 0, 0, 0, 0), TBI.CM_WGHTLOSS = c(0, 0, 0, 0, 0, 0), TBI.UTI = c(0,
0, 0, 0, 0, 0), TBI.pneumonia = c(0, 0, 0, 0, 0, 0), TBI.AMI = c(0,
0, 0, 0, 0, 0), TBI.sepsis = c(0, 0, 0, 0, 0, 0), TBI.arrest = c(0,
0, 0, 0, 0, 0), TBI.spineinjury = c(0, 0, 0, 0, 0, 0), TBI.legfracture = c(0,
0, 0, 0, 0, 0), TBI_time_to_surg.NEW = c(0, 0, 0, 0, 0, 0)), row.names = c(NA,
6L), class = "data.frame")
A small little tip, provide a small sample set that is just big enough to address your question.
library(data.table)
# took a small sample and changed one value to Asian
dt <- data.table(
TBI.FEMALE = c(0, 0, 1, 0, 1, 0),
TBI.iracecat2 = as.character(c("Whites", "Whites", "Asian", "Hispanics", "Whites", "Blacks"))
)
# define race groups, and note I did not define Asian
convert_race <- c("Hispanics" = 1, "Blacks" = 2, "Whites" = 3) # other will all be not defined
dt[, TBI.irace2 := lapply(TBI.iracecat2, function(x) convert_race[x]), by = TBI.iracecat2]
dt[is.na(TBI.irace2), TBI.irace2 := 4]
dt
# TBI.FEMALE TBI.iracecat2 TBI.irace2
# 1: 0 Whites 3
# 2: 0 Whites 3
# 3: 1 Asian 4
# 4: 0 Hispanics 1
# 5: 1 Whites 3
# 6: 0 Blacks 2
When trying to predict I am getting this error
error in evaluating the argument 'x' in selecting a method for function 'as.matrix': Cholmod error 'X and/or Y have wrong dimensions' at file ../MatrixOps/cholmod_sdmult.c, line 90
Here is my code so far
library(data.table)
library(caret)
library(Metrics)
library(glmnet)
library(plotmo)
library(lubridate)
#Reading in the necessary data needed for this project
train <- fread("project/volume/data/processed/start_train.csv")
test<-fread("project/volume/data/processed/start_test.csv")
example_sub<-fread("project/volume/data/processed/example_submission.csv")
card_tab <- fread("project/volume/data/processed/card_tab.csv")
#Merging the card_tab dataset with both my train and test datasets to add more variables to each
train = merge(train, card_tab, by = "id")
test = merge(test, card_tab, by = "id")
train$power = as.numeric(train$power)
train$toughness = as.numeric(train$toughness)
test$power = as.numeric(test$power)
test$toughness = as.numeric(test$toughness)
train$powerovertough = train$power/train$toughness
test$powerovertough = test$power/test$toughness
train$current_date<-as_date(train$current_date)
train<-train[order(-current_date)]
test$current_date<-as_date(test$current_date)
test<-test[order(-current_date)]
#Handling NA values in both train and test. The NA values will be set to 0
train[is.na(train)] <- 0
test[is.na(test)] <- 0
# Specifying which columns of our model we will be dropping and applying it to our train and test datasets
drops<- c('id','future_date','current_date','card_name','power','loyalty','cmc','type','colors','mana_cost','subtypes', 'text','set','set_name')
train<-train[, !drops, with = FALSE]
test<-test[, !drops, with = FALSE]
#Saving the response variable in the train dataset
train_y<-train$future_price
test$future_price<-0
#Using dummies to train model
dummies <- dummyVars(future_price ~ ., data = train)
train<-predict(dummies, newdata = train)
test<-predict(dummies, newdata = test)
train<-data.table(train)
test<-data.table(test)
#Cross validating the model to fin the best lamda value
train<-as.matrix(train)
test<-as.matrix(test)
gl_model<-cv.glmnet(train, train_y, alpha = 1,family="gaussian")
bestlam<-gl_model$lambda.min
# Fitting the full model
gl_model<-glmnet(train, train_y, alpha = 1,family="gaussian")
plot_glmnet(gl_model)
saveRDS(gl_model,"./project/volume/models/gl_model.model")
test<-as.matrix(test)
#use the full model
pred<-predict(gl_model,s=bestlam, newx = test)
I am trying to predict future_price for my test set. The error is saying my dimensions are wrong but I don't know what could be causing them to be different. I have tried observing the data sets as it runs through the code and they seem to have the same variables.
Here is the dput
> dput(head(train))
structure(c(0.25, 0.1, 0.1, 0.1, 0.25, 0.25, 1, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 3, 0, 2, 0, 0, 0, 0.333333333333333,
0, 1, 0), .Dim = c(6L, 20L), .Dimnames = list(NULL, c("current_price",
"typesArtifact", "typesArtifact Creature", "typesCreature", "typesEnchantment",
"typesEnchantment Artifact", "typesEnchantment Creature", "typesInstant",
"typesLand", "typesPlaneswalker", "typesSorcery", "supertypes",
"supertypesBasic", "supertypesLegendary", "rarityCommon", "rarityMythic",
"rarityRare", "rarityUncommon", "toughness", "powerovertough"
)))
> dput(head(test))
structure(c(0.15, 0.16, 2, 0.39, 0.16, 0.19, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0), .Dim = c(6L, 18L), .Dimnames = list(
NULL, c("current_price", "typesArtifact", "typesArtifact Creature",
"typesCreature", "typesEnchantment", "typesInstant", "typesLand",
"typesPlaneswalker", "typesSorcery", "supertypes", "supertypesBasic",
"supertypesLegendary", "rarityCommon", "rarityMythic", "rarityRare",
"rarityUncommon", "toughness", "powerovertough")))
I'm looking to condense the steps in my script, but I'm having issues with lapply(). It looks to be an issue with my code as usual. Any help would be much appreciated!
library(iNEXT)
sa4 <- list(Bird = list(structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0), .Dim = c(26L,
6L), .Dimnames = list(Scientific_name = c(" Pycnonotus plumosus",
"Acridotheres javanicus", "Aegithina tiphia", "Aethopyga siparaja",
"Anthreptes malacensis", "Aplonis panayensis", "Cacatua goffiniana",
"Callosciurus notatus", "Cinnyris jugularis", "Copsychus malabaricus",
"Copsychus saularis", "Dicaeum cruentatum", "Dicrurus paradiseus",
"Gorsachius melanolophus", "Larvivora cyane", "Macronus gularis",
"Oriolus chinensis", "Orthotomus atrogularis", "Otus lempiji",
"Pitta moluccensis", "Pycnonotus goiavier", "Pycnonotus plumosus",
"Pycnonotus zeylanicus", "Spilopelia chinensis", "Todiramphus chloris",
"Zosterops simplex"), Sampling_Point = c("SA_01", "SA_02", "SA_03",
"SA_04", "SA_05", "SA_06")))), Butterfly = list(structure(c(0,
0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0), .Dim = c(10L,
4L), .Dimnames = list(Scientific_name = c("Burara harisa consobrina",
"Catopsilia pyranthe pyranthe", "Catopsilia scylla cornelia",
"Delias hyparete metarete", "Eurema sp", "Idea leuconoe clara",
"Pachliopta aristolochiae asteris", "Phalanta phalantha phalantha",
"Troides helena cerberus", "Zizula hylax pygmaea"), Sampling_Point = c("SA_01",
"SA_02", "SA_04", "SA_06")))), Mammal = list(structure(c(0, 1,
1, 1, 1, 0), .Dim = 2:3, .Dimnames = list(Scientific_name = c("Callosciurus notatus",
"Unidentified Fruit Bat sp"), Sampling_Point = c("SA_03", "SA_04",
"SA_05")))), Reptile = list(structure(1, .Dim = c(1L, 1L), .Dimnames = list(
Scientific_name = "Hemidactylus frenatus", Sampling_Point = "SA_05"))))
I've been doing it the longer way:
estimateD(sa4$Butterfly, datatype="incidence_raw") #Sampling coverage for butterflies
estimateD(sa4$Mammal, datatype="incidence_raw") #Sampling coverage for mammals
estimateD(sa4$Bird, datatype="incidence_raw") #Sampling coverage for birds
estimateD(sa4$Reptile, datatype="incidence_raw") #Sampling coverage for reptiles
Note that estimateD(sa4$Reptile, datatype="incidence_raw" generates an error since it only has one species.
Is it possible to condense the following steps via lapply? In this situation I've only have 4 taxa, but for other projects, it might be a lot more. I tried the following and it gives me a warning message--which actually is the same warning message as the one above. I'm wondering if lapply stops working if one component gives an error?
> (lapply(sa4, function(x) estimateD(x, datatype="incidence_raw")) )
Error in `[.data.frame`(tmp, , c(1, 2, 3, 7, 4, 5, 6)) :
undefined columns selected
In addition: Warning messages:
1: In FUN(X[[i]], ...) :
Invalid data type, the element of species by sites presence-absence matrix should be 0 or 1. Set nonzero elements as 1.
2: In log(b/Ub) : NaNs produced
Please let me know if I need to provide more information? Thank you!
This is a simple error trapping issue. Wrap tryCatcharound your problem function call and have the error function return information on what happened.
results <- lapply(sa4, function(x) {
tryCatch(estimateD(x, datatype="incidence_raw"),
error = function(e) e)
})
Now determine which ran alright.
ok <- !sapply(results, inherits, "error")
ok
# Bird Butterfly Mammal Reptile
# TRUE TRUE TRUE FALSE
And check those that did.
results[ok]
It is the issue with the 'Reptiles', so if we select the first 3 elements of the list, it should work
lapply(sa4[1:3], function(x) estimateD(x, datatype="incidence_raw"))
I was wondering how I can get the weighted average of my data. I have already looked on the internet, but when I try the weighted.mean function, I keep getting the same result, so I was wondering what I am doing wrong.
Below is some information of the dataset:
dput(head(new))
structure(list(comp.1 = c(0.5, 0.25, 0, 0.25, 0.31, 0.3), comp.2 = c(0.3,
0.15, 0, 0.15, 0, 0), comp.3 = c(0.2, 0.6, 1, 0.6, 0.69, 0.7),
genderMale = c(0, 1, 1, 1, 0, 0), SeniorCitizen = c(0, 0,
0, 0, 0, 0), PartnerYes = c(1, 0, 0, 0, 0, 0), DependentsYes = c(0,
0, 0, 0, 0, 0), tenure = c(-1.28015700354285, 0.064298112878097,
-1.23941593940889, 0.512449818351747, -1.23941593940889,
-0.994969554605076), MultipleLinesYes = c(0, 0, 0, 0, 0,
1), `InternetServiceFiber optic` = c(0, 0, 0, 0, 1, 1), OnlineSecurityYes = c(0,
1, 1, 1, 0, 0), OnlineBackupYes = c(1, 0, 1, 0, 0, 0), DeviceProtectionYes = c(0,
1, 0, 1, 0, 1), TechSupportYes = c(0, 0, 0, 1, 0, 0), StreamingTVYes = c(0,
0, 0, 0, 0, 1), StreamingMoviesYes = c(0, 0, 0, 0, 0, 1),
`ContractOne year` = c(0, 1, 0, 1, 0, 0), `ContractTwo year` = c(0,
0, 0, 0, 0, 0), PaperlessBillingYes = c(1, 0, 1, 0, 1, 1),
`PaymentMethodCredit card (automatic)` = c(0, 0, 0, 0, 0,
0), `PaymentMethodElectronic check` = c(1, 0, 0, 0, 1, 1),
`PaymentMethodMailed check` = c(0, 1, 1, 0, 0, 0), MonthlyCharges = c(-1.16161133177258,
-0.260859369930086, -0.363897417225722, -0.747797238601399,
0.196164226945719, 1.15840663636787), TotalCharges = c(1.47494433546539,
3.27634689625303, 2.03402652377511, 3.26499480914874, 2.18084241464668,
2.91407858538911)), row.names = c("1", "2", "3", "4", "5",
"6"), class = "data.frame")
As you can see, I have 3 components (comp.1, comp.2, comp.3). All of these components have their posterior probabilities. And I am wondering how I can get the weighted averages for all of these and the final weighted averages. I have tried:
weighted.mean(new$comp.1, new$SeniorCitizen)
weighted.mean(new$comp.2, new$SeniorCitizen)
weighted.mean(new$comp.3, new$SeniorCitizen)
It gave me the output 0.24, 0.14 and 0.61. But irrespectively which variable I put, I get the same output. What am I doing wrong?