I am working with the C50 package and I cannot get the algorithm to run. So far I have checked the two basic things: the dataset has no nulls and the variable 'credit$default' is a factor.
I am wondering what is wrong.
The message after executing this code is:
c50 code called exit with value 1
Code:
library(gmodels)
library(readr)
library(dplyr)
library(class)
library(C50)
# Step 1: Get the data
url_data <- "https://raw.githubusercontent.com/pakinja/Data-R-Value/master/MachineLearning_C5.0_Classification_Algorithm/credit.csv"
credit <- read_csv(url(url_data))
# Step 2: Exploring and preparing the data
credit$default[credit$default == 1] <- "no"
credit$default[credit$default == 2] <- "yes"
credit$default <- as.factor(credit$default)
# Creating Random test
set.seed(123)
train_sample <- sample(1000,900)
str(train_sample)
credit_train <- credit[train_sample,]
credit_test <- credit[-train_sample,]
prop.table(table(credit_test$default))
prop.table(table(credit_train$default))
# Step 3: Training a model on the data
credit_model <- C5.0(credit_train[,-17], credit_train$default, trials = 1,
rules = FALSE)
summary(credit_model)
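For completeness, here is a check I can run on the column types. read_csv returns a tibble whose string columns stay character rather than factor; my assumption (not confirmed by the error message) is that C5.0 wants factor predictors in a plain data frame:
# Assumption, not a confirmed fix: convert character predictors to factors
# in a plain data frame and refit.
str(credit_train[, -17])                     # inspect the predictor column types
credit_df <- as.data.frame(credit)
credit_df[] <- lapply(credit_df, function(col)
  if (is.character(col)) as.factor(col) else col)
credit_train2 <- credit_df[train_sample, ]
credit_model <- C5.0(credit_train2[, -17], credit_train2$default, trials = 1,
                     rules = FALSE)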
My course requires me to use Udacity's Enron financial data to craft a financial fraud detection model in R.
I wrote a function for the calculation (split_train_set just splits the data into a 70-30 training and testing set).
library(e1071)
library(caret)
nb_runner <- function(dataset, rm.na = FALSE) {
  split_df <- split_train_set(dataset, rm.na)
  nb <- naiveBayes(x = split_df$x_train_set, y = split_df$y_train_set$poi)
  nb_predict <- predict(nb, newdata = split_df$x_test_set, type = 'class')
  cm <- confusionMatrix(nb_predict, split_df$y_test_set$poi, positive = 'True')
  return(cm)
}
It worked fine in the beginning.
However, after I cleaned up the data by removing the rows with more than 15 NAs using the following code and reran the same nb_runner(), it no longer works.
remove_high_na <- function(dataset, threshold = 0.7) {
  # The number of NAs per row ranges from 2 to 17.
  # Since we have only 22 features in the dataset, a high number of NAs makes the row useless.
  # Hence, we remove rows with a high proportion of NAs, with the threshold set at 0.7.
  # Rows whose share of NAs exceeds 0.7 of the columns will be removed.
  threshold_cols <- floor(ncol(dataset) * threshold)
  df <- subset(dataset, rowSums(is.na(dataset)) <= threshold_cols)
  # df <- dataset[-which(rowSums(is.na(dataset)) > threshold_cols),]
  return(df)
}
Error in object$levels[apply(L, 2, which.max)] :
invalid subscript type 'list'
The code failed and the traceback is as follows:
4. factor(object$levels[apply(L, 2, which.max)], levels = object$levels)
3. predict.naiveBayes(nb, newdata = split_df$x_test_set, type = "class")
2. predict(nb, newdata = split_df$x_test_set, type = "class") at POI_helpers.R#38
1. nb_runner(df_1)
I am not quite sure what I was doing wrong since the same dataset worked fine in other classifiers.
Thank you in advance for your help.
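For reference, here are a few extra checks I ran on the cleaned split (the names follow the split_df structure above; the idea that a misaligned split or an all-NA column is the source of the list is only my assumption):
# Assumption, not a confirmed diagnosis: check that the cleaned x/y pieces
# still line up and that no test column is list-typed or entirely NA.
df_clean <- remove_high_na(df_1)
split_df <- split_train_set(df_clean, rm.na = FALSE)
stopifnot(nrow(split_df$x_train_set) == nrow(split_df$y_train_set))
stopifnot(nrow(split_df$x_test_set) == nrow(split_df$y_test_set))
sapply(split_df$x_test_set, function(col) class(col)[1])                  # any list columns?
which(colSums(is.na(split_df$x_test_set)) == nrow(split_df$x_test_set))   # all-NA columns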
I am trying to run a negative binomial regression using the glmnet 4.0 package. I have implemented the regression using code from the section entitled 'Fitting Other GLMs' of this webpage. However, I keep getting the following error:
Error in seq.default(log(lambda_max), log(lambda_max * lambda.min.ratio), : 'from' must be a finite number
I haven't been able to find examples of other people experiencing this error, perhaps because it is specific to this new version of the package.
Below is an example which should reproduce the error. This is not the data I have been using for my analysis and is simply for example purposes.
library(eventdataR)
library(glmnet)
library(MASS)
df <- subset(traffic_fines, activity == "Create Fine" | activity == "Add penalty" )
df <- df[,c(4,6,7,9,13,14,18)]
df$resource <- as.numeric(df$resource)
dfm <- as.matrix(df[,-3])
newfit <- glmnet(dfm, df$amount, family = negative.binomial(theta = 5))
Does anyone know why this error might be occurring and what I can do to stop it?
In the example you provided, there are no rows without NAs:
table(complete.cases(df))
FALSE
14635
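One way to see which columns are responsible is to count the NAs per column:
# Count NAs per column to see which variables prevent complete cases
colSums(is.na(df))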
If we choose some other columns:
df <- subset(traffic_fines, activity == "Create Fine" | activity == "Add penalty" )
df <- df[,c("points","article","amount","resource")]
df = df[complete.cases(df),]
df$resource <- as.numeric(df$resource)
dfm <- as.matrix(df[,-3])
It will run:
newfit <- glmnet(dfm, df$amount, family = negative.binomial(theta = 5))
newfit
Call: glmnet(x = dfm, y = df$amount, family = negative.binomial(theta = 5))
Df %Dev Lambda
1 0 0.00 0.46180
2 1 8.23 0.42070
3 1 14.92 0.38340
4 1 20.42 0.34930
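As a quick sanity check on the fitted object (not part of the original fit; the lambda value here is arbitrary, picked from the printed path), you can predict on a few rows of the design matrix:
# Predictions at an arbitrary lambda from the fitted path, just to confirm
# the model object is usable.
predict(newfit, newx = dfm[1:5, ], s = 0.35, type = "response")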
I am trying to plot trait data on a phylogeny using the phytools package. I'm sure this should be simple but I'm getting an unhelpful error message and I don't know what to try.
Here is my code including data download.
# General
library(dplyr)
# Phylogenetic libraries.
library(caper)
library(phytools)
#+ data_read
p <- read.table(file = 'http://esapubs.org/archive/ecol/E090/184/PanTHERIA_1-0_WR05_Aug2008.txt',
header = TRUE, sep = "\t", na.strings = c("-999", "-999.00"))
## Some data cleaning
# Remove NAs in the response, and rows where litter size is less than one (doesn't make sense).
p <- p %>%
filter(!is.na(X15.1_LitterSize)) %>%
filter(X15.1_LitterSize >= 1) %>%
mutate(y = log1p(X15.1_LitterSize)) %>%
dplyr::select(-X15.1_LitterSize, -References, -X24.1_TeatNumber)
## Get phylogeny data.
### read in phylogeny data.
# Read in trees
tree <- read.nexus('https://onlinelibrary.wiley.com/action/downloadSupplement?doi=10.1111%2Fj.1461-0248.2009.01307.x&file=ELE_1307_sm_SA1.tre')
# Select best supported tree
tree <- tree[[1]]
tree$tip.label <- gsub('_', ' ', tree$tip.label)
# Check if species are available.
mean(p$MSW05_Binomial %in% tree$tip.label)
in_phylo <- p$MSW05_Binomial %in% tree$tip.label
# Remove data that is not in the phylogeny.
p <- p %>% filter(in_phylo)
# Try just vulpes.
unneededTips <- tree$tip.label[!grepl('Vulpes', tree$tip.label) | !(tree$tip.label %in% p$MSW05_Binomial)]
# Prune tree down to only needed tips.
pruneTree <- drop.tip(tree, unneededTips)
dotTree(pruneTree, p$y[grepl('Vulpes', p$MSW05_Binomial)])
# Try all species
unneededTips <- tree$tip.label[!(tree$tip.label %in% p$MSW05_Binomial)]
# Prune tree down to only needed tips.
pruneTree <- drop.tip(tree, unneededTips)
dotTree(pruneTree, p$y)
I have tried plotting a smaller subset of the tree and the full tree but in both cases I get the error:
Error in if (k <= 0.8 && any(rr > (strwidth("W") * fsize/2))) rr <- rr/max(rr) * :
missing value where TRUE/FALSE needed
For dotTree and similar functions in phytools (e.g. contMap), your trait values must be a named vector, with the names corresponding to the tips in your tree.
In your example you need to make sure p$y is a named vector (!is.null(names(p$y)) should be TRUE):
## Prune down the non Vulpes tips
vulpes_tree <- drop.tip(tree, tree$tip.label[-grep("Vulpes", tree$tip.label)])
## Naming the variables in p$y
all_vulpes <- grepl('Vulpes', p$MSW05_Binomial)
traits_to_plot <- p$y[all_vulpes]
names(traits_to_plot) <- p$MSW05_Binomial[all_vulpes]
## Plotting the Vulpes and the traits
dotTree(vulpes_tree, traits_to_plot)
You can apply the same procedure to your bigger tree.
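For instance, a minimal sketch for the second pruneTree from your code (the one pruned to all species present in the data), using the same naming step and no extra packages:
## Name the traits by species and keep only the tips present in the pruned tree
full_traits <- p$y
names(full_traits) <- p$MSW05_Binomial
full_traits <- full_traits[names(full_traits) %in% pruneTree$tip.label]
dotTree(pruneTree, full_traits)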
I suggest you use the function clean.data from the dispRity package to match your tree and your dataset:
## Matching the tree and the data (using the dispRity package)
library(dispRity)
## Attributing rownames to the dataset
rownames(p) <- p$MSW05_Binomial
## Cleaning both the data and the tree
cleaned_data <- dispRity::clean.data(p, tree)
## Extracting the cleaned dataset and the cleaned tree
clean_p <- cleaned_data$data
clean_tree <- cleaned_data$tree
## Same for the complete tree
all_traits <- clean_p$y
names(all_traits) <- clean_p$MSW05_Binomial
## Plotting all species and their traits
dotTree(clean_tree, all_traits)
I am trying to run randomForest multicore mode using foreach function. The fitting of the trees seems to be working, however when trying to use predict on the resulting model it gives me the following error message:
Error in UseMethod("predict") :
no applicable method for 'predict' applied to an object of class "call"
It seems like the foreach call gives back a simple list instead of a proper randomForest model.
Here is the complete code I am trying to run:
# Packages used below (not listed in the original post): plyr provides
# create_progress_bar(), caret provides confusionMatrix(); doParallel is one
# possible backend for %dopar%.
library(randomForest)
library(foreach)
library(doParallel)
library(plyr)
library(caret)
registerDoParallel(cores = 8)

k <- 10  # number of CV folds; k is not defined in the original post, value assumed

# sample from 1 to k, nrow times (the number of observations in the data)
labeled_data <- bundesliga[bundesliga$Season < 2017, ]
labeled_data$id <- sample(1:k, nrow(labeled_data), replace = TRUE)
list <- 1:k
# prediction and testset data frames that we add to with each iteration over
# the folds
#Creating a progress bar to know the status of CV
progress.bar <- create_progress_bar("text")
progress.bar$init(k)
prediction <- data.frame()
testsetCopy <- data.frame()
accuracy <- list()
rf.formula <- as.formula(paste("as.factor(FTR)","~",paste("AvgAgeHome",
"AvgAge_Away",
"AvgMarketValueHome_z_score",
"AvgMarketValue_Away_z_score",
"ForeignPlayersHome",
"ForeignPlayers_Away",
"KaderHome",
"Kader_Away",
"no_won_matches_last_20_home",
"no_won_matches_last_20_away",
"no_won_matches_last_15_home",
"no_won_matches_last_15_away",
"no_won_matches_last_10_home",
"no_won_matches_last_10_away",
"no_won_matches_last_5_home",
"no_won_matches_last_5_away",
"no_won_matches_last_3_home",
"no_won_matches_last_3_away",
"no_won_matches_last_2_home",
"no_won_matches_last_2_away",
"won_last_1_matches_away",
"won_last_1_matches_home",
"OverallMarketValueHome_z_score",
"OverallMarketValue_Away_z_score",
"roll_FTHG_Home",
"roll_FTAG_Away",
"Stadium.Capacity.y",
"WDL_3_roll_matches_away",
"WDL_3_roll_matches_home",
"WDL_2_roll_matches_home",
"WDL_2_roll_matches_away",
"WDL_1_roll_matches_home",
"WDL_1_roll_matches_away",sep="+")))
for (i in 1:k) {
  # remove rows with id i from the dataframe to create the training set
  # select rows with id i to create the test set
  trainingset <- subset(labeled_data, id %in% list[-i])
  testset <- subset(labeled_data, id %in% c(i))
  # run a random forest model
  rf <- foreach(ntree = rep(1, 8),
                .combine = combine, .packages = 'randomForest') %dopar% {
    environment(rf.formula) <- environment()
    randomForest(rf.formula, data = trainingset, ntree = ntree)
  }
  print(class(rf))
  # remove response column 1
  pred <- predict(rf, testset[,-1])
  temp <- as.data.frame(pred)
  match_test_pred <- cbind(as.data.frame(testset), temp)
  accuracy_fold <- sum(match_test_pred$Correct)/nrow(match_test_pred)
  accuracy <- rbind(accuracy, accuracy_fold)
  # append this iteration's predictions to the end of the prediction data frame
  prediction <- rbind(prediction, temp)
  # append this iteration's test set to the test set copy data frame
  # keep only the FTR column
  testsetCopy <- rbind(testsetCopy, as.data.frame(testset$FTR))
  print(confusionMatrix(pred, testset$FTR))
  progress.bar$step()
}
Thanks in advance for your help!
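Since the data above is not reproducible, here is a minimal, self-contained sketch of the same foreach pattern on the built-in iris data, with .combine explicitly namespaced to randomForest::combine so that a combine() from another loaded package cannot be picked up, and .multicombine enabled. Whether such masking is the actual cause of the error above is an assumption on my part, not a confirmed diagnosis:
# Minimal parallel random forest with an explicitly namespaced combiner
library(randomForest)
library(foreach)
library(doParallel)
registerDoParallel(cores = 2)

rf_small <- foreach(ntree = rep(25, 4),
                    .combine = randomForest::combine,
                    .multicombine = TRUE,
                    .packages = "randomForest") %dopar% {
  randomForest(Species ~ ., data = iris, ntree = ntree)
}

class(rf_small)                    # should be "randomForest", not "list" or "call"
predict(rf_small, iris[1:5, -5])   # predict() now dispatches on randomForest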
I am trying to use the pgmm function from the plm package for R. The regression runs and I can call up the results; however, asking for the summary gives the following error:
Error in t(y) %*% x : non-conformable arguments
I've imported the data from the World Bank using the WDI package:
library(plm) # load package
library(WDI) # Load package
COUNTRIES <- c("AGO","BEN","BWA","BFA","BDI") # Specify countries
INDICATORS <- c("NY.GDP.PCAP.KN", "SP.DYN.TFRT.IN", "SP.DYN.CBRT.IN", "SP.POP.TOTL") # Specify indicators
LONG <- WDI(country=COUNTRIES, indicator=INDICATORS, start=2005, end=2009, extra=FALSE) # Load data
PANEL <- pdata.frame(LONG, c("iso2c","year")) # Transform to PANEL dataframe
PANEL$year <- as.numeric(as.character(PANEL$year)) # Encode year
EQ <- pgmm( log(fertility) ~ log(gdp) + lag(log(fertility), 2) | lag(log(fertility), 2), data=PANEL, effect="twoways", model="twosteps", gmm.inst=~log(fertility) ) # Run regression
Calling the results as follows works.
EQ
But the summary (below) gives the error message mentioned above.
summary(EQ)
I think the error occurs because summary.pgmm tries to do a second-order Arellano-Bond test of serial correlation on your data, but your data only have two points (2008 and 2009), so it fails.
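A quick way to check this on your fitted model (this assumes pgmm stores per-individual residual vectors, which is also what the patch below relies on):
## Number of residual time points for the first individual; the
## second-order test needs more than two of them
length(EQ$residuals[[1]])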
To fix this problem, you could patch the function so that it checks whether you only have two points in the data set and runs the test only if you have more than two points. I provide a patched function below:
summary.pgmm.patched <- function (object, robust = FALSE, time.dummies = FALSE, ...)
{
  model <- plm:::describe(object, "model")
  effect <- plm:::describe(object, "effect")
  transformation <- plm:::describe(object, "transformation")
  if (robust) {
    vv <- vcovHC(object)
  }
  else {
    vv <- vcov(object)
  }
  if (model == "onestep")
    K <- length(object$coefficients)
  else K <- length(object$coefficients[[2]])
  Kt <- length(object$args$namest)
  if (!time.dummies && effect == "twoways")
    rowsel <- -c((K - Kt + 1):K)
  else rowsel <- 1:K
  std.err <- sqrt(diag(vv))
  b <- coef(object)
  z <- b/std.err
  p <- 2 * pnorm(abs(z), lower.tail = FALSE)
  CoefTable <- cbind(b, std.err, z, p)
  colnames(CoefTable) <- c("Estimate", "Std. Error", "z-value",
                           "Pr(>|z|)")
  object$CoefTable <- CoefTable[rowsel, , drop = FALSE]
  object$sargan <- sargan(object)
  object$m1 <- plm:::mtest(object, 1, vv)
  # The problem line:
  # object$m2 <- mtest(object, 2, vv)
  if (length(object$residuals[[1]]) > 2) object$m2 <- plm:::mtest(object, 2, vv)
  object$wald.coef <- plm:::wald(object, "param", vv)
  if (plm:::describe(object, "effect") == "twoways")
    object$wald.td <- plm:::wald(object, "time", vv)
  class(object) <- "summary.pgmm"
  object
}
You might want to write to the author of the plm package and show him this post. The author will be able to write a less 'hacky' patch.
Using your own (slightly modified) example data, here is how you would use the function:
library(WDI) # Load package
library(plm)
COUNTRIES <- c("AGO","BEN","BWA","BFA","BDI") # Specify countries
INDICATORS <- c("NY.GDP.PCAP.KN", "SP.DYN.TFRT.IN", "SP.DYN.CBRT.IN", "SP.POP.TOTL") # Specify indicators
LONG <- WDI(country=COUNTRIES, indicator=INDICATORS, start=2005, end=2009, extra=FALSE) # Load data
PANEL <- pdata.frame(LONG, c("iso2c","year")) # Transform to PANEL dataframe
PANEL$year <- as.numeric(as.character(PANEL$year)) # Encode year
names(PANEL)[c(4,5)] <- c('gdp','fertility')
EQ <- pgmm( log(fertility) ~ log(gdp) + lag(log(fertility), 2) | lag(log(fertility), 2), data=PANEL, effect="twoways", model="twosteps", gmm.inst=~log(fertility) ) # Run regression
summary.pgmm.patched(EQ)