k-fold cross validation in quanteda - r

I've been using the quanteda SML workflow as described in the quanteda tutorial (https://tutorials.quanteda.io/machine-learning/nb/) and found it extremely helpful to set up my own classification task. However, instead of the fixed held-out train/test sampling I would like to use a k-fold cross-validation. Could you point me towards the best way to implement it into the workflow? Is there an easy way to apply it in quanteda?
Many thanks
I tried to add a cross validation based on this example:
https://rdrr.io/github/quanteda/quanteda.classifiers/man/crossval.html
require(quanteda)
require(quanteda.textmodels)
require(caret)
corp_movies <- data_corpus_moviereviews
summary(corp_movies, 5)
# generate 1500 numbers without replacement
set.seed(300)
id_train <- sample(1:2000, 1500, replace = FALSE)
head(id_train, 10)
# create docvar with ID
corp_movies$id_numeric <- 1:ndoc(corp_movies)
# tokenize texts
toks_movies <- tokens(corp_movies, remove_punct = TRUE, remove_number = TRUE) %>%
tokens_remove(pattern = stopwords("en")) %>%
tokens_wordstem()
dfmt_movie <- dfm(toks_movies)
# get training set
dfmat_training <- dfm_subset(dfmt_movie, id_numeric %in% id_train)
# get test set (documents not in id_train)
dfmat_test <- dfm_subset(dfmt_movie, !id_numeric %in% id_train)
tmod_nb <- textmodel_nb(dfmat_training, dfmat_training$sentiment)
summary(tmod_nb)
dfmat_matched <- dfm_match(dfmat_test, features = featnames(dfmat_training))
actual_class <- dfmat_matched$sentiment
predicted_class <- predict(tmod_nb, newdata = dfmat_matched)
tab_class <- table(actual_class, predicted_class)
tab_class
require(confusionMatrix)
confusionMatrix(tab_class, mode = "everything", positive = "pos")
#n-fold cross validation
require(crossval)
dfmat <- dfm(toks_movies)
tmod <- textmodel_nb(dfmat, y = data_corpus_moviereviews$sentiment)
crossval(tmod, k = 5, by_class = TRUE)
crossval(tmod, k = 5, by_class = FALSE)
crossval(tmod, k = 5, by_class = FALSE, verbose = TRUE)
but it returns "Error in group.samples(Y) : argument "Y" is missing, with no default"

It should probably be a comment, but I cannot post them yet. I think your problem is caused by the usage of the crossval() function from the improper package. The link you shared suggests that you want to use it from the remote quanteda/quanteda.classifiers package, instead of crossval. The one you used presumably requires a different pipeline cause its definition is different. The used function requires additional X and Y arguments. Their lack is a reason for your error.

Related

External Cluster Validation - Categorical Data R

I've recently been attempting to evaluate output from k-modes (a cluster label), relative to a so-called True cluster label (labelled 'class' below).
In other words: I've been attempting to external validate the clustering output. However, when I tried external validation measures from the 'fpc' package, I was unsuccessful (error term posted below script).
I've attached my code for the mushroom dataset. I would appreciate if anyone could show me how to successful execute these external validation measures in the context of categorical data.
Any help appreciated.
# LIBRARIES
install.packages('klaR')
install.packages('fpc')
library(klaR)
library(fpc)
#MUSHROOM DATA
mushrooms <- read.csv(file = "https://raw.githubusercontent.com/miachen410/Mushrooms/master/mushrooms.csv", header = FALSE)
names(mushrooms) <- c("edibility", "cap-shape", "cap-surface", "cap-color",
"bruises", "odor", "gill-attachment", "gill-spacing",
"gill-size", "gill-color", "stalk-shape", "stalk-root",
"stalk-surface-above-ring", "stalk-surface-below-ring",
"stalk-color-above-ring", "stalk-color-below-ring", "veil-type",
"veil-color", "ring-number", "ring-type", "spore-print-color",
"population", "habitat")
names(mushrooms)[names(mushrooms)=="edibility"] <- "class"
indexes <- apply(mushrooms, 2, function(x) any(is.na(x) | is.infinite(x)))
colnames(mushrooms)[indexes]
table(mushrooms$class)
str(mushrooms)
#REMOVING CLASS VARIABLE
mushroom.df <- subset(mushrooms, select = -c(class))
#KMODES ANALYSIS
result.kmode <- kmodes(mushroom.df, 2, iter.max = 50, weighted = FALSE)
#EXTERNAL VALIDATION ATTEMPT
mushrooms$class <- as.factor(mushrooms$class)
class <- as.numeric(mushrooms$class))
clust_stats <- cluster.stats(d = dist(mushroom.df),
class, result.kmode$cluster)
#ERROR TERM
Error in silhouette.default(clustering, dmatrix = dmat) :
NA/NaN/Inf in foreign function call (arg 1)
In addition: Warning message:
In dist(mushroom.df) : NAs introduced by coercion

Finding the precision, recall and the f1 in R

I want to run models on a loop via and then store the performance metrics into a table. I do not want to use the confusionMatrix function in caret, but I want to compute the precision, recall and f1 and then store those in a table. Please assist, edits to the code are welcome.
My attempt is below.
library(MASS) #will load our biopsy data
library(caret)
data("biopsy")
biopsy$ID<-NULL
names(biopsy)<-c('clump thickness','uniformity cell size','uniformity cell shape',
'marginal adhesion','single epithelial cell size','bare nuclei',
'bland chromatin','normal nuclei','mitosis','class')
sum(is.na(biopsy))
biopsy<-na.omit(biopsy)
sum(is.na(biopsy))
head(biopsy,5)
set.seed(123)
inTraining <- createDataPartition(biopsy$class, p = .75, list = FALSE)
training <- biopsy[ inTraining,]
testing <- biopsy[-inTraining,]
# Run algorithms using 10-fold cross validation
control <- trainControl(method="repeatedcv", number=10,repeats = 5, verboseIter = F, classProbs = T)
#CHANGING THE CHARACTERS INTO FACTORS VARAIBLES
training<- as.data.frame(unclass(training),
stringsAsFactors = TRUE)
#CHANGING THE CHARACTERS INTO FACTORS VARAIBLES
testing <- as.data.frame(unclass(testing),
stringsAsFactors = TRUE)
models<-c("svmRadial","rf")
results_table <- data.frame(models = models, stringsAsFactors = F)
for (i in models){
model_train<-train(class~., data=training, method=i,
trControl=control,metric="Accuracy")
predictions<-predict(model_train, newdata=testing)
precision_<-posPredValue(predictions,testing)
recall_<-sensitivity(predictions,testing)
f1<-(2*precision_*recall_)/(precision_+recall_)
# put that in the results table
results_table[i, "Precision"] <- precision_
results_table[i, "Recall"] <- recall_
results_table[i, "F1score"] <- f1
}
However I get an error which says Error in posPredValue.default(predictions, testing) : inputs must be factors. i do not know where I went wrong and any edits to my code are welcome.
I know that I could get precision,recall, f1 by just using the code below (B), however this is a tutorial question where I am required not to use the code example below (B):
(B)
for (i in models){
model_train<-train(class~., data=training, method=i,
trControl=control,metric="Accuracy")
predictions<-predict(model_train, newdata=testing)
print(confusionMatrix(predictions, testing$class,mode="prec_recall"))
}
A few things need to happen.
You have to change the function calls for posPredValue and sensitivity. For both, change testing to testing$class.
for the results_table, i is a word, not a value, so you're assigning results_table["rf", "Precision"] <- precision_ (This makes a new row, where the row name is "rf".)
Here is your for statement, with changes to those functions mentioned in 1) and a modification to address the issue in 2).
for (i in models){
model_train <- train(class~., data = training, method = i,
trControl= control, metric = "Accuracy")
assign("fit", model_train)
predictions <- predict(model_train, newdata = testing)
precision_ <-posPredValue(predictions, testing$class)
recall_ <- sensitivity(predictions, testing$class)
f1 <- (2*precision_ * recall_) / (precision_ + recall_)
# put that in the results table
results_table[results_table$models %in% i, "Precision"] <- precision_
results_table[results_table$models %in% i, "Recall"] <- recall_
results_table[results_table$models %in% i, "F1score"] <- f1
}
This is what it looks like for me.
results_table
# models Precision Recall F1score
# 1 svmRadial 0.9722222 0.9459459 0.9589041
# 2 rf 0.9732143 0.9819820 0.9775785

Create a multivariate matrix in tidymodels recipes::recipe()

I am trying to do a k-fold cross validation on a model that predicts the joint distribution of the proportion of tree species basal area from satellite imagery. This requires the use of the DiricihletReg::DirichReg() function, which in turn requires that the response variables be prepared as a matrix using the DirichletReg::DR_data() function. I originally tried to accomplish this in the caret:: package, but I found out that caret:: does not support multivariate responses. I have since tried to implement this in the tidymodels:: suite of packages. Following the documentation on how to register a new model in the parsnip:: (I appreciate Max Kuhn's vegetable humor) package, I created a "DREG" model and a "DR" engine. My registered model works when I simply call it on a single training dataset, but my goal is to do kfolds cross-validation, implementing the vfolds_cv(), a workflow(), and the 'fit_resample()' function. With the code I currently have I get warning message stating:
Warning message:
All models failed. See the `.notes` column.
Those notes state that Error in get(resp_char, environment(oformula)): object 'cbind(PSME, TSHE, ALRU2)' not found This, I believe is due to the use of DR_data() to preprocess the response variables into the format necessary for Dirichlet::DirichReg() to run properly. I think the solution I need to implement involve getting this pre-processing to happen in either the recipe() call or in the set_fit() call when I register this model with parsnip::. I have tried to use the step_mutate() function when specifying the recipe, but that performs a function on each column as opposed to applying the function with the columns as inputs. This leads to the following error in the "notes" from the output of fit_resample():
Must subset columns with a valid subscript vector.
Subscript has the wrong type `quosures`.
It must be numeric or character.
Is there a way to get the recipe to either transform several columns to a DirichletRegData class using the DR_data() function with a step_*() function or using the pre= argument in set_fit() and set_pred()?
Below is my reproducible example:
##Loading Necessary Packages##
library(tidymodels)
library(DirichletReg)
##Creating Fake Data##
set.seed(88)#For reproducibility
#Response variables#
PSME_BA<-rnorm(100,50, 15)
TSHE_BA<-rnorm(100,40,12)
ALRU2_BA<-rnorm(100,20,0.5)
Total_BA<-PSME_BA+TSHE_BA+ALRU2_BA
#Predictor variables#
B1<-runif(100, 0, 2000)
B2<-runif(100, 0, 1800)
B3<-runif(100, 0, 3000)
#Dataset for modeling#
DF<-data.frame(PSME=PSME_BA/Total_BA, TSHE=TSHE_BA/Total_BA, ALRU2=ALRU2_BA/Total_BA,
B1=B1, B2=B2, B3=B3)
##Modeling the data using Dirichlet regression with repeated k-folds cross validation##
#Registering the model to parsnip::#
set_new_model("DREG")
set_model_mode(model="DREG", mode="regression")
set_model_engine("DREG", mode="regression", eng="DR")
set_dependency("DREG", eng="DR", pkg="DirichletReg")
set_model_arg(
model = "DREG",
eng = "DR",
parsnip = "param",
original = "model",
func = list(pkg = "DirichletReg", fun = "DirichReg"),
has_submodel = FALSE
)
DREG <-
function(mode = "regression", param = NULL) {
# Check for correct mode
if (mode != "regression") {
rlang::abort("`mode` should be 'regression'")
}
# Capture the arguments in quosures
args <- list(sub_classes = rlang::enquo(param))
# Save some empty slots for future parts of the specification
new_model_spec(
"DREG",
args=args,
eng_args = NULL,
mode = mode,
method = NULL,
engine = NULL
)
}
set_fit(
model = "DREG",
eng = "DR",
mode = "regression",
value = list(
interface = "formula",
protect = NULL,
func = c(pkg = "DirichletReg", fun = "DirichReg"),
defaults = list()
)
)
set_encoding(
model = "DREG",
eng = "DR",
mode = "regression",
options = list(
predictor_indicators = "none",
compute_intercept = TRUE,
remove_intercept = TRUE,
allow_sparse_x = FALSE
)
)
set_pred(
model = "DREG",
eng = "DR",
mode = "regression",
type = "numeric",
value = list(
pre = NULL,
post = NULL,
func = c(fun = "predict.DirichletRegModel"),
args =
list(
object = expr(object$fit),
newdata = expr(new_data),
type = "response"
)
)
)
##Running the Model##
DF$Y<-DR_data(DF[,c(1:3)]) #Preparing the response variables
dreg_spec<-DREG(param="alternative") %>%
set_engine("DR")
dreg_mod<-dreg_spec %>%
fit(Y~B1+B2+B3, data = DF)#Model works when simply run on single dataset
##Attempting Crossvalidation##
#First attempt - simply call Y as the response variable in the recipe#
kfolds<-vfold_cv(DF, v=10, repeats = 2)
rcp<-recipe(Y~B1+B2+B3, data=DF)
dreg_fit<- workflow() %>%
add_model(dreg_spec) %>%
add_recipe(rcp)
dreg_rsmpl<-dreg_fit %>%
fit_resamples(kfolds)#Throws warning about all models failing
#second attempt - use step_mutate_at()#
rcp<-recipe(~B1+B2+B3, data=DF) %>%
step_mutate_at(fn=DR_data, var=vars(PSME, TSHE, ALRU2))
dreg_fit<- workflow() %>%
add_model(dreg_spec) %>%
add_recipe(rcp)
dreg_rsmpl<-dreg_fit %>%
fit_resamples(kfolds)#Throws warning about all models failing
This works, but I'm not sure if it's what you were expecting.
First--getting the data setup for CV and DR_data()
I don't know of any package that has built what would essentially be a translation for CV and DirichletReg. Therefore, that part is manually done. You might be surprised to find it's not all that complicated.
Using the data you created and the modeling objects you created for tidymodels (those prefixed with set_), I created the CV structure that you were trying to use.
df1 <- data.frame(PSME = PSME_BA/Total_BA, TSHE = TSHE_BA/Total_BA,
ALRU2=ALRU2_BA/Total_BA, B1, B2, B3)
set.seed(88)
kDf2 <- kDf1 <- vfold_cv(df1, v=10, repeats = 2)
For each of the 20 subset data frames identified in kDf2, I used DR_data to set the data up for the models.
# convert to DR_data (each folds and repeats)
df2 <- map(1:20,
.f = function(x){
in_ids = kDf1$splits[[x]]$in_id
dd <- kDf1$splits[[x]]$data[in_ids, ] # filter rows BEFORE DR_data
dd$Y <- DR_data(dd[, 1:3])
kDf1$splits[[x]]$data <<- dd
})
Because I'm not all that familiar with tidymodels, next conducted the modeling using DirichReg. I then did it again with tidymodels and compared them. (The output is identical.)
DirichReg Models and summaries of the fits
set.seed(88)
# perform crossfold validation on Dirichlet Model
df2.fit <- map(1:20,
.f = function(x){
Rpt = kDf1$splits[[x]]$id$id
Fld = kDf1$splits[[x]]$id$id2
daf = kDf1$splits[[x]]$data
fit = DirichReg(Y ~ B1 + B2, daf)
list(Rept = Rpt, Fold = Fld, fit = fit)
})
# summary of each fitted model
fit.a <- map(1:20,
.f = function(x){
summary(df2.fit[[x]]$fit)
})
tidymodels and summaries of the fits (the code looks the same, but there are a few differences--the output is the same, though)
# I'm not sure what 'alternative' is supposed to do here?
dreg_spec <- DREG(param="alternative") %>% # this is not model = alternative
set_engine("DR")
set.seed(88)
dfa.fit <- map(1:20,
.f = function(x){
Rpt = kDf1$splits[[x]]$id$id
Fld = kDf1$splits[[x]]$id$id2
daf = kDf1$splits[[x]]$data
fit = dreg_spec %>%
fit(Y ~ B1 + B2, data = daf)
list(Rept = Rpt, Fold = Fld, fit = fit)
})
afit.a <- map(1:20,
.f = function(x){
summary(dfa.fit[[x]]$fit$fit) # extra nest for parsnip
})
If you wanted to see the first model?
fit.a[[1]]
afit.a[[1]]
If you wanted the model with the lowest AIC?
# comare AIC, BIC, and liklihood?
# what do you percieve best fit with?
fmin = min(unlist(map(1:20, ~fit.a[[.x]]$aic))) # dir
# find min AIC model number
paste0((map(1:20, ~ifelse(fit.a[[.x]]$aic == fmin, .x, ""))), collapse = "")
fit.a[[19]]
afit.a[[19]]

tokens_compound() in quanteda changes the order of features

I found tokens_compound() in quanteda changes the order of tokens across different R sessions. That is, the result varies every time after restarting a session even if a seed value is fixed, though it does not change in a single session.
Here is the replication procedure:
Find collocations, compound tokens, and save them.
library(quanteda)
set.seed(12345)
data(data_corpus_inaugural)
toks <- data_corpus_inaugural %>%
tokens(remove_punct = TRUE,
remove_symbol = TRUE,
padding = TRUE) %>%
tokens_tolower()
col <- toks %>%
textstat_collocations()
toks.col <- toks %>%
tokens_compound(pattern = col[col$z > 3])
write(attr(toks.col, "types"), "col1.txt")
End and restart R session and run the above code again with "col1.txt" replaced by "col2.txt".
Compare the two sets of tokens and find they are different.
col1 <- read.table("col1.txt")
col2 <- read.table("col2.txt")
identical(col1$V1, col2$V1) # This should return FALSE.
col1$V1[head(which(col1$V1 != col2$V1))]
col2$V1[head(which(col1$V1 != col2$V1))]
This does not matter for many cases but the result of LDA (by {topicmodels}) changes in different sessions. I guess so because the result of LDA is constant if I reset the order of features in tokens by as.list() and thereafter as.tokens() (dfm_sort() does not work for this).
I wonder whether this happens only for me (Ubuntu 18.04.5, R 4.0.4, and quanteda 2.1.2) and would be happy to hear another (easier) solution.
Updated on Feb 20
For example, the output of LDA is not reproduced.
lis <- list()
for (i in seq_len(2)) {
set.seed(123)
lis[[i]] <- tokens_compound(toks, pattern = col[col$z > 3]) %>%
dfm() %>%
convert(to = "topicmodels") %>%
LDA(k = 5,
method = "Gibbs",
control = list(seed = 12345,
iter = 100))
}
head(lis[[1]]#gamma)
head(lis[[2]]#gamma)
An interesting investigation but this is neither an error nor anything to be concerned with. Within a quanteda tokens object, the types are not determinate in order, after a processing step such as textstat_compound(). This is because this function is parallelised in C++ and how these threads operate is not fixed by set.seed() from R. But this will not affect the important part, which is the set of types, or anything about the tokens themselves. If you want the order of the types that you extract to be the same, then you should sort them upon extraction.
library("quanteda")
## Package version: 2.1.2
toks <- data_corpus_inaugural %>%
tokens(
remove_punct = TRUE,
remove_symbol = TRUE,
padding = TRUE
) %>%
tokens_tolower()
col <- quanteda.textstats::textstat_collocations(toks)
It turns out that you do not need to save the output or restart R - this happens within a single session.
# types are differently indexed, but are the same set
lis <- list()
for (i in seq_len(2)) {
set.seed(123)
toks.col <- tokens_compound(toks, pattern = col[col$z > 3])
lis <- c(lis, list(types = types(toks.col)))
}
dframe <- data.frame(lis)
sum(dframe$types != dframe$types.1)
## [1] 19898
head(dframe[dframe$types != dframe$types.1, ])
## types types.1
## 8897 at_this_second my_fellow_citizens
## 8898 to_take_the_oath_of_the_presidential_office no_people
## 8899 there_is on_earth
## 8900 occasion_for cause_to_be_thankful
## 8901 an_extended this_is_said
## 8902 there_was spirit_of
However the (unordered) set of types is identical:
# but
setequal(dframe$types, dframe$types.1)
## [1] TRUE
More important is that when we compare the values of each token, which is ordered, these are identical:
# tokens are the same
lis <- list()
for (i in seq_len(2)) {
set.seed(123)
toks.col <- tokens_compound(toks, pattern = col[col$z > 3])
lis <- c(lis, list(toks = as.character(toks.col)))
}
dframe <- data.frame(lis)
all.equal(dframe$toks, dframe$toks.1)
## [1] TRUE
Created on 2021-02-18 by the reprex package (v1.0.0)
An additional comment, whose importance is underscored by this analysis: We strongly discourage direct access to object attributes. Use types(x) as above, not attr(x, "types"). The former will always work. The latter relies on our implementation of the object, which may change as we improve the package.

Please select a longer horizon when the forecasts are first computed in forecast package in r

When I run the following code, I do NOT get this error:
## https://www.dataiku.com/learn/guide/code/r/time_series.html
library(readxl)
library(forecast)
library(dplyr)
library(prophet)
library(rstan)
library(Hmisc)
library(caret)
data<-read_excel("Time Series/Items.xlsx", col_types = c("text", "numeric"))
Nper=0.75
stmodels=c("meanf","naive","snaive","rwf","croston","stlf","ses","holt","hw","splinef","thetaf","ets","auto.arima","tbats","prophet")
gkuniforecast = function(data, Np, Ncolumn, tsfreq, model) {
## Preparation
N = ceiling(Np*nrow(data))
## Models
if (model=="prophet"){
df=data
names(df)=c("ds","y")
df$ds=as.Date(paste(df$ds,"-01",sep=""), "%Y-%b-%d")
train.df = df[1:N,]
na.df=data.frame(ds=rep(NA, N),y=rep(NA, N))
test.df <- rbind(na.df, df[(N+1):nrow(data),])
m <- prophet(train.df)
future <- make_future_dataframe(m, periods = nrow(data)-N, freq = 'month')
pro_forecast <- predict(m, future)
plot(m, pro_forecast)
##prophet_plot_components(m, forecast)
acc=matrix(rep(NA, 16),nrow=2,ncol=8,dimnames=list(c("Training set", "Test set"),c("ME","RMSE","MAE","MPE","MAPE","MASE","ACF1","Theil's U")))
acc["Test set","RMSE"]=sqrt(mean((pro_forecast$yhat - test.df)^2, na.rm = TRUE))
}else{
x=pull(data,Ncolumn)
train.x = ts(x[1:N], frequency=tsfreq)
test.x <- ts(c(rep(NA, N), x[(N+1):NROW(x)]), frequency=tsfreq)
str1=paste0("m_",model," = ",model,"(train.x)")
if (Np==1) {str2=paste0("f_",model," = forecast(m_",model,", h=NROW(x)")
} else {str2=paste0("f_",model," = forecast(m_",model,", h=NROW(x)-N)")}
str3=paste0("plot(f_",model,")")
str4="lines(test.x)"
str5=paste0("acc=accuracy(f_",model,",test.x)")
str=paste0(str1,";",str2,";",str3,";",str4,";",str5)
eval(parse(text=str))
}
return(acc)
}
acc = lapply(stmodels, gkuniforecast, data=data, Np=Nper, Ncolumn=2,tsfreq=12)
But when I run this code, I do:
##Forecast data prep
tsfreq=5
x=pull(data,1)
train.x = ts(x[1:N], frequency=tsfreq)
test.x <- ts(c(rep(NA, N), x[(N+1):NROW(x)]), frequency=tsfreq)
stmodels=c("meanf","naive","snaive","rwf","croston","stlf","ses","holt","hw"##,"splinef"
,"thetaf","ets","auto.arima","tbats")
for (i in 1:length(stmodels)){
str1=paste0("m_",stmodels[i]," = ",stmodels[i],"(train.x)")
str2=paste0("f_",stmodels[i]," = forecast(m_",stmodels[i],", h=NROW(x)-N)")
str3=paste0("plot(f_",stmodels[i],")")
str4="lines(test.x)"
str5=paste0('acc[["',stmodels[i],'"]]=accuracy(f_',stmodels[i],',test.x)')
str=paste0(str1,";",str2,";",str3,";",str4,";",str5)
eval(parse(text=str))
}
There seems to be a problem with 'hw' (splinef is commented out, because it gives me another error), but I do not understand why in the first dataset, I get no errors and I do with the second dataset. What is also different is the frequency.
Again the error is:
Please select a longer horizon when the forecasts are first computed
You are mixing functions that create forecasts directly (like meanf()) with functions that generate models (like ets()). For functions that generate forecasts directly, you need to specify the forecast horizon when you call the function. See https://otexts.org/fpp2/the-forecast-package-in-r.html for a list of functions that produce forecasts directly.

Resources