My course requires me to use Udacity's Enron financial data to build a financial fraud detection model in R.
I wrote a function for the calculation (split_train_set() simply splits the data into a 70-30 training and testing set):
library(e1071)
library(caret)

nb_runner <- function(dataset, rm.na=FALSE) {
  split_df <- split_train_set(dataset, rm.na)
  nb <- naiveBayes(x = split_df$x_train_set, y = split_df$y_train_set$poi)
  nb_predict <- predict(nb, newdata = split_df$x_test_set, type = 'class')
  cm <- confusionMatrix(nb_predict, split_df$y_test_set$poi, positive = 'True')
  return(cm)
}
It worked fine in the beginning.
However, after I tried to clean up the data by removing the rows with more than 15 NAs using the following code, rerunning the same nb_runner() failed with the error below.
remove_high_na <- function(dataset, threshold = 0.7) {
  # The number of NAs per row ranges from 2 to 17.
  # Since we have only 22 features in the dataset, a high level of NA makes the row useless.
  # Hence, we remove rows with a high level of NA, using a threshold of 0.7:
  # rows with more than floor(22 * 0.7) = 15 NAs will be removed.
  threshold_cols <- floor(ncol(dataset) * threshold)
  df <- subset(dataset, rowSums(is.na(dataset)) <= threshold_cols)
  # df <- dataset[-which(rowSums(is.na(dataset)) > threshold_cols), ]
  return(df)
}
Error in object$levels[apply(L, 2, which.max)] :
invalid subscript type 'list'
The traceback is as follows:
4. factor(object$levels[apply(L, 2, which.max)], levels = object$levels)
3. predict.naiveBayes(nb, newdata = split_df$x_test_set, type = "class")
2. predict(nb, newdata = split_df$x_test_set, type = "class") at POI_helpers.R#38
1. nb_runner(df_1)
I am not quite sure what I am doing wrong, since the same dataset worked fine with other classifiers.
Thank you in advance for your help.
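For reference, a small diagnostic sketch (not a fix) that reuses split_train_set() and remove_high_na() from above; "dataset" stands for the original, unfiltered data frame, and poi is assumed to be a factor. The idea is to check whether the NA filtering leaves predictors that are entirely NA, or otherwise changes what predict() receives, since that is one plausible way for predict.naiveBayes() to end up with an empty posterior for some test rows:

# diagnostic sketch only; object names are placeholders
df_1 <- remove_high_na(dataset)
split_raw <- split_train_set(dataset, rm.na = FALSE)
split_filt <- split_train_set(df_1, rm.na = FALSE)
# predictors that became entirely NA after the row filtering
sapply(split_filt$x_train_set, function(col) all(is.na(col)))
# compare the structure of the test predictors before and after filtering
str(split_raw$x_test_set)
str(split_filt$x_test_set)
# check that the class labels are still a factor with both levels present
str(split_filt$y_train_set$poi)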
I have run 17 meta-analyses (identified by specific names) and I need to extract the models' outputs into one single table, adding a column with the name of each analysis. I have done it manually, but I was wondering if I could build a loop to do so.
I'm attaching the first three of the 17 analyses, the "names" being "cent", "dist", and "sqrs":
# meta-analyses
res_cent <- rma.mv(yi, vi, mods = ~ factor(drug) - 1,
                   random = list(~ 1 | publication_id, ~ 1 | strain_def),
                   data = SR_meta, subset = (SR_meta$measure == "cent"))
res_dist <- rma.mv(yi, vi, mods = ~ factor(drug) - 1,
                   random = list(~ 1 | publication_id, ~ 1 | strain_def),
                   data = SR_meta, subset = (SR_meta$measure == "dist"))
res_sqrs <- rma.mv(yi, vi, mods = ~ factor(drug) - 1,
                   random = list(~ 1 | publication_id, ~ 1 | strain_def),
                   data = SR_meta, subset = (SR_meta$measure == "sqrs"))
# Creating list for model output - cent
list_cent <- coef(summary(res_cent))
list_cent <- setNames(cbind(rownames(list_cent), list_cent, row.names = NULL),
                      c("Drug", "Estimate", "se", "zval", "p-value", "CI_l", "CI_u"))
df_cent <- list_cent[-c(3, 4)]
df_cent$Drug <- gsub("factor*", "", df_cent$Drug)
df_cent$Drug <- gsub("drug*", "", df_cent$Drug)
df_cent$Drug <- gsub("[[:punct:]]", "", df_cent$Drug)
n_cent <- plyr::count(cent_sum2, vars = "drug")
names(n_cent)[names(n_cent) == "freq"] <- "n_cent"
df_cent <- cbind(df_cent, n_cent[2])
## the same thing is repeated for the other two measures, "dist" and "sqrs"
The output is a data frame that contains the names of the drugs used as factors in the meta-analyses, their estimated effect sizes, p-values, confidence intervals, and how many measures we have per factor (n). I want to compile all of these outputs into one table (called "matrix_ps" at the end of the code) and add a column with the name of the measure.
I have done all the steps manually (below), but it looks extremely inefficient.
Is there a way to create a loop to do this, in which all the measure names are substituted and the outcome is appended?
Something like:
measures <- c("cent", "dist", "sqrs")
for (i in measures)  # not sure how to continue from here
matrix_cent<-data.frame(df_cent$Drug,list_cent$`p-value`,df_cent$n_cent,df_cent$Estimate,df_cent$CI_l,df_cent$CI_u)
matrix_dist<-data.frame(df_dist$Drug,list_dist$`p-value`,df_dist$n_dist,df_dist$Estimate,df_dist$CI_l,df_dist$CI_u)
matrix_sqrs<-data.frame(df_sqrs$Drug,list_sqrs$`p-value`,df_sqrs$n_sqrs,df_sqrs$Estimate,df_sqrs$CI_l,df_sqrs$CI_u)
matrix_cent$measure<-"cent"
matrix_dist$measure<-"dist"
matrix_sqrs$measure<-"sqrs"
matrix_cent<-matrix_cent%>% rename(drug=df_cent.Drug,measure=measure,p=list_cent..p.value.,n=df_cent.n_cent,estimate=df_cent.Estimate,ci_low=df_cent.CI_l,ci_up=df_cent.CI_u)
matrix_dist<-matrix_dist%>% rename(drug=df_dist.Drug,measure=measure,p=list_dist..p.value.,n=df_dist.n_dist,estimate=df_dist.Estimate,ci_low=df_dist.CI_l,ci_up=df_dist.CI_u)
matrix_sqrs<-matrix_sqrs%>% rename(drug=df_sqrs.Drug,measure=measure,p=list_sqrs..p.value.,n=df_sqrs.n_sqrs,estimate=df_sqrs.Estimate,ci_low=df_sqrs.CI_l,ci_up=df_sqrs.CI_u)
matrix_ps<-rbind(matrix_cent,matrix_dist,matrix_rear,matrix_sqrs,matrix_toa,matrix_eca,matrix_eoa,matrix_trans,matrix_dark,matrix_light,matrix_stps,matrix_rrs,matrix_time,matrix_toc,matrix_cross,matrix_hd,matrix_lat)
We don't have your data, but you can put all your code in a function:
get_result <- function(x, y) {
  # x: a fitted rma.mv model; y: the name to give the per-measure count column
  coefs <- coef(summary(x))
  coefs <- setNames(cbind(rownames(coefs), coefs, row.names = NULL),
                    c("Drug", "Estimate", "se", "zval", "p-value", "CI_l", "CI_u"))
  df <- coefs[-c(3, 4)]
  df$Drug <- gsub("factor*", "", df$Drug)
  df$Drug <- gsub("drug*", "", df$Drug)
  df$Drug <- gsub("[[:punct:]]", "", df$Drug)
  # cent_sum2 is not shown in the question; you may need the per-measure
  # counterpart of this object for each analysis
  n_counts <- plyr::count(cent_sum2, vars = "drug")
  names(n_counts)[names(n_counts) == "freq"] <- y
  df <- cbind(df, n_counts[2])
  return(df)
}
Now, assuming all your analyses follow the naming pattern 'res_', you can do:
library(purrr)
list_models <- mget(ls(pattern = 'res_'))
result <- imap(list_models, get_result) %>% reduce(inner_join)
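If the end goal is the single stacked table matrix_ps with a measure column described in the question, a variant of the same idea is to bind the per-measure results by rows instead of joining them. This is only a sketch: it reuses list_models and get_result() from above (so the cent_sum2 count inside get_result() would still need its per-measure counterpart), and it passes "n" as the count column name so every piece has identical columns:

library(purrr)
library(dplyr)
# imap() passes each element's name ("res_cent", ...) as the second argument;
# strip the "res_" prefix and keep it as the measure label
matrix_ps <- imap_dfr(list_models, function(mod, nm) {
  out <- get_result(mod, "n")
  out$measure <- sub("^res_", "", nm)
  out
})
matrix_ps <- matrix_ps %>%
  rename(drug = Drug, p = `p-value`, estimate = Estimate, ci_low = CI_l, ci_up = CI_u)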
I am applying the kNN algorithm to a data set with 6 predictors and 1 class variable across 1599 rows. I have reviewed my syntax many times and gone back over other examples to try to find my error, but I am totally bamboozled at present. I have broken the data set up into test_set, test_set_class, training_set, training_set_class. Any assistance would be fantastic; see below for the code and error.
num_obs <- nrow(wine_preds3)
# set the sample size to 80% of the observations
sample_size <- as.integer(num_obs * 0.8)
# set the seed for the sample split
set.seed(0)
# randomly sample 80% of the row indexes of the reduced wine data
split_index <- sample(num_obs, size = sample_size, replace = FALSE)
# subset the reduced wine data into a testing subset of 20%
test_wine_preds <- wine_preds3[-split_index, 1:6]
test_wine_class <- wine_preds3[-split_index, 7]
# subset the reduced wine data into a training subset of 80%
train_wine_preds <- wine_preds3[split_index, 1:6]
train_wine_class <- wine_preds3[split_index, 7]
Pred_class <- kNN(train = train_wine_preds, test = test_wine_preds, cl = train_wine_class, k = 15)
Error in kNN(train = train_wine_preds, test = test_wine_preds, cl = train_wine_class, :
unused arguments (train = train_wine_preds, test = test_wine_preds, cl = train_wine_class)
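For what it is worth, one guess (the question does not show which package the capital-N kNN() comes from): "unused arguments" means the kNN() that is actually being called has no train/test/cl parameters; kNN() from packages such as dbscan has a different interface, whereas class::knn() takes exactly these arguments. A sketch of that variant, using the objects defined above:

library(class)
# cl must be a plain vector/factor, not a one-column data frame
Pred_class <- class::knn(train = train_wine_preds,
                         test = test_wine_preds,
                         cl = train_wine_class,
                         k = 15)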
I have a dataset of 25 variables and 248 rows.
There are 8 factor variables and the rest are integers and numeric values.
I am trying to run XGBoost. I have used the following code:
# Partition Data
set.seed(1234)
ind <- sample(2, nrow(mission), replace = T, prob = c(0.7,0.3))
train <- mission[ind == 1,]
test <- mission[ind == 2,]
# Create matrix - One-Hot Encoding for Factor variables
trainm <- sparse.model.matrix(GRL ~ .-1, data = train)
head(trainm)
train_label <- train[,"GRL"]
train_matrix <- xgb.DMatrix(data = as.matrix(trainm), label = train_label)
testm <- sparse.model.matrix(GRL~.-1, data = test)
test_label <- test[,"GRL"]
test_matrix <- xgb.DMatrix(data = as.matrix(testm),label = test_label)
The response variable here is "GRL", and the line test_label <- test[,"GRL"] runs fine.
However, when I try to use it in xgb.DMatrix, I encounter the following error:
Error in setinfo.xgb.DMatrix(dmat, names(p), p[[1]]) :
The length of labels must equal to the number of rows in the input data
I have partitioned the data into 70:30.
test[,"GRL"] returns a data.frame, and XGBoost needs the label to be a vector.
Just use teste$GRL or test[["GRL"]] instead. You also need to do the same for the training dataset
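A minimal sketch of that fix, using the object names from the question:

# labels as plain vectors rather than one-column data frames
train_label <- train$GRL
test_label <- test$GRL
# if GRL is a factor, it may also need to be converted, e.g. as.numeric(GRL) - 1,
# depending on the xgboost objective
train_matrix <- xgb.DMatrix(data = as.matrix(trainm), label = train_label)
test_matrix <- xgb.DMatrix(data = as.matrix(testm), label = test_label)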
I have a training data set with 28 variables (13 labels and 15 features) and a test data set with 15 features; I have to predict the labels for the test data set based on those features. I made kNN classifiers for all 13 labels individually.
Is there a possibility of combining these 13 individual single-label kNN classifiers into one multi-label classifier?
My current code for a single label:
library(class)
train_from_train <- train[1:600, 2:16]
target_a_train_from_train <- train[1:600, 17]
test_from_train <- train[601:800, 2:16]
target_a_test_from_train <- train[601:800, 17]
knn_pred_a <- knn(train = train_from_train, test = test_from_train, cl = target_a_train_from_train, k = 29)
table(knn_pred_a, target_a_test_from_train)
mean(knn_pred_a != target_a_test_from_train)
knn_pred_a_ON_TEST <- knn(train = train[, 2:16], test = test[, 2:16], cl = train[, 17], k = 29)
knn_pred_a_ON_TEST
I scoured the internet and the mldr package seems to be an option, but I couldn't adapt it to my needs.
You can use the RANN package for this. However, it is not exact, as far as I know.
library(RANN)
library(reshape2)
####
## generate some sample data and randomize the order
iris.knn <- iris[sample(1:150, 150), ]
# add a second class label
iris.knn["Class2"] <- iris.knn[, 5] == "versicolor"
iris.knn$org.row.id <- 1:nrow(iris.knn)
train <- iris.knn[1:100, ]
test <- iris.knn[101:150, ]
##
#####
## get nearest neighbours (the indices returned by nn2 refer to rows of `train`,
## which here carry org.row.id 1 to 100)
nn.idx <- as.data.frame(nn2(train[1:4], query = test[1:4], k = 4)$nn.idx)
## add the id of each test row
nn.idx$row.id <- test$org.row.id
# classes and row id
multiclass.vec <- data.frame(row.id = 1:150, iris.knn[, 5:6])
# 1 row per nearest neighbour
melted <- melt(nn.idx, id.vars = "row.id")
merged <- merge(melted, multiclass.vec, by.x = "value", by.y = "row.id")
# aggregate a single class
aggregate(merged$Species, list(merged$row.id), function(x) names(which.max(table(x))))
#### aggregate for all classes
all.classes <- melt(merged[c(2, 4, 5)], id.vars = "row.id")
fun.agg <- function(x) {
  if (length(x) == 0) {
    ""  # <-- default value, adaptation might be needed
  } else {
    names(which.max(table(x)))
  }
}
dcast(all.classes, row.id ~ variable, fun.aggregate = fun.agg, fill = NULL)
I did the aggregation with aggregate() for a single class only; doing this step for all classes in parallel requires the extra melt operation shown above and makes the code somewhat messier.