SVM a dataframe based on the last column - r

I have been trying for hours to fit an SVM to a data frame, using the last column as the class label.
I have this data frame
# Fill the data frame: read the comma-separated car data from the UCI URL.
# Six column names are given explicitly; the seventh is an empty string.
df = read.table("https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data",
sep=",",
col.names=c("buying", "maint", "doors", "persons", "lug_boot", "safety", ""),
fill=TRUE,
strip.white=TRUE)
# Keep the name of the last column of df as a character string.
lastColName <- colnames(df)[ncol(df)]
...
# Fit a polynomial-kernel C-classification SVM on df.
# `degree` and `cost` are not defined in this snippet (presumably set in the
# elided part above - confirm).
# NOTE(review): the formula below uses the symbol lastColName itself as the
# response, not the column name stored in it.
model <- svm(lastColName~.,
data = df,
kernel="polynomial",
degree = degree,
type = "C-classification",
cost = cost)
I'm getting either NULL or Error in model.frame.default(formula = str(lastColName) ~ ., data = df1, : invalid type (NULL) for variable 'str(lastColName)'. I understand that the NULL appears when the column has no name, but I don't understand the other error, since I am passing the name of the last column.
Any idea?

You have to use as.formula when the variable name in the formula is held in another variable (a dynamic name). For details see ?as.formula
The following code works fine:
library(e1071)

# Read the comma-separated car data from the UCI URL. Six column names are
# given explicitly; the seventh (the class column) is passed as "".
df_1 <- read.table(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data",
  sep = ",",
  col.names = c("buying", "maint", "doors", "persons", "lug_boot", "safety", ""),
  fill = TRUE,
  strip.white = TRUE,
  # Since R 4.0 strings are no longer converted to factors by default, and
  # C-classification needs a factor (or integer) response, so ask for factors
  # explicitly. This matches the pre-4.0 default behaviour.
  stringsAsFactors = TRUE
)

# Name of the last column, as a character string.
lastColName <- colnames(df_1)[ncol(df_1)]

# Build the formula from the string held in lastColName; writing
# `lastColName ~ .` directly would use the symbol, not the column name.
model <- svm(
  as.formula(paste(lastColName, "~ .")),
  data = df_1,
  kernel = "polynomial",
  degree = 3,
  type = "C-classification",
  cost = 1
)
# to predict on the data remove the last column
prediction <- predict(model, df_1[, -ncol(df_1)])
# The output
table(prediction)
# The output is:
prediction
acc good unacc vgood
0 0 1728 0
# Since this is a highly unbalanced classification the model is not doing a very good job

Related

Error with rpart function in R: Error in terms.formula(formula, data = data) : duplicated name 'X.' in data frame using '.'

I was running some classification models for twitter sentiment analysis and I came across this error when using the rpart function:
Error in terms.formula(formula, data = data) : duplicated name 'X.' in data frame using '.'
Hope someone could help me solve it.
Here are the entire codes that I have used so far (apart from loading in the libraries):
# Work from the project directory and load the raw tweets.
setwd("D:/ProjectTwitterTrial")
ustweets <- read.csv('tweets.csv', header = T)
str(ustweets)
# Shuffle the rows reproducibly.
set.seed(123)
tweets <- ustweets[sample(nrow(ustweets)),]
# Keep only the five columns used below.
tweets <- tweets %>%
select(airline_sentiment, negativereason, airline, text, tweet_location)
# NOTE(review): the result of distinct() is not assigned, so `tweets` itself
# is left unchanged by the next line.
tweets %>% distinct() # Keep only unique/distinct rows from the data frame
# Tag each tweet with a language via textcat() and keep the rows tagged 'english'.
tweets$language = textcat(tweets$text)
tweets = subset(tweets, language =='english')
# Build a document-term matrix and pass it through removeSparseTerms().
# NOTE(review): `corpus` is not defined anywhere in this snippet.
datadtm = DocumentTermMatrix(corpus)
datadtm = removeSparseTerms(datadtm, 0.999)
# Convert the matrix to a data frame with syntactically valid column names.
dataset <- as.data.frame(as.matrix(datadtm))
colnames(dataset) <- make.names(colnames(dataset))
# Attach the sentiment label from `tweets` and make it a factor.
dataset$airline_sentiment <- tweets$airline_sentiment
str(dataset$airline_sentiment)
dataset$airline_sentiment <- as.factor(dataset$airline_sentiment)
# Random split: each row is assigned to group 1 or 2 with probabilities 0.8 / 0.2.
set.seed(222)
split = sample(2,nrow(dataset),prob = c(0.8,0.2),replace = TRUE)
train_set = dataset[split == 1,]
test_set = dataset[split == 2,]
# Print a small slice (rows 4-6, columns 57-59) of each set.
train_set[4:6,57:59]
test_set[4:6,57:59]
# Fit a random forest with 20 trees on all remaining columns.
rf_classifier = randomForest(airline_sentiment ~., data=train_set, ntree = 20)

regression model in R use dummy cols

I want to build a regression model.
I have data about job seekers: academics, non-academics, women and men.
The data is separated into 7 areas.
I tried the code below, but I'm not sure that it is the right model for my data.
I would be happy to get help with this.
cleanData - the csv file, contains data from 01/2010 - 04/2020
I deleted the columns of month, city and the name of the city
# Start from the cleaned data and drop the three columns that are not used
# as predictors.
regressionSubset <- cleanData
regressionSubset[["MONTH"]] <- NULL
regressionSubset[["CITY"]] <- NULL
regressionSubset[["LOCALITY.NAME"]] <- NULL

# Expand the remaining columns with dummy_cols(), then scale() the result
# and wrap it back into a data frame.
regressionData <- data.frame(
  scale(
    dummy_cols(
      regressionSubset,
      remove_first_dummy = TRUE,
      remove_selected_columns = TRUE
    )
  )
)

set.seed(42)

# Full linear model on every column, then stepAIC() on that fit.
model <- lm(ESTIMATED.CITY.UNEMPLOYMENT ~ ., data = regressionData)
step_model <- stepAIC(model, trace = FALSE)

# Report the selected model and print its equation, one term per line.
summary(step_model)
extract_eq(
  step_model,
  use_coefs = TRUE,
  wrap = TRUE,
  terms_per_line = 1
)

I am trying to run XGBoost in R but am facing some issues

I have a dataset of 25 variables and 248 rows.
There are 8-factor variables and the rest are integers and numbers.
I am trying to run XGBoost.
I have done the following code: -
# Partition Data: assign each row of `mission` to group 1 or 2 with
# probabilities 0.7 / 0.3, using a fixed seed.
set.seed(1234)
ind <- sample(2, nrow(mission), replace = T, prob = c(0.7,0.3))
train <- mission[ind == 1,]
test <- mission[ind == 2,]
# Create matrix - One-Hot Encoding for Factor variables
# The formula `GRL ~ .-1` uses every other column and removes the intercept.
trainm <- sparse.model.matrix(GRL ~ .-1, data = train)
head(trainm)
# Labels are taken from the GRL column of the same partition.
# NOTE(review): if any predictor contains NA, the model matrix may end up with
# fewer rows than `train` - confirm nrow(trainm) == length(train_label).
train_label <- train[,"GRL"]
train_matrix <- xgb.DMatrix(data = as.matrix(trainm), label = train_label)
# Same construction for the test partition.
testm <- sparse.model.matrix(GRL~.-1, data = test)
test_label <- test[,"GRL"]
test_matrix <- xgb.DMatrix(data = as.matrix(testm),label = test_label)
The response variable here is "GRL" and I am running the test_label <- test[,"GRL"]
The above code is getting executed but when I am trying to use it in xgb.DMatrix, I am encountering the following error:
Error in setinfo.xgb.DMatrix(dmat, names(p), p[[1]]) :
The length of labels must equal to the number of rows in the input data
I have partitioned the data into 70:30.
test[,"GRL"] returns a data.frame, and XGBoost needs the label to be a vector.
Just use test$GRL or test[["GRL"]] instead. You also need to do the same for the training dataset.

R : Error in linear regression model

I have two different data frames on which I would like to perform a linear regression.
I have written the following code for it:
# Directory holding the per-subject volume files.
mydir<- "/media/dev/Daten/Task1/subject1/t1"
# multiple subject paths should be given here
# List the matching files with their full paths.
myfiles<- list.files(mydir,pattern = "regional_vol*",full.names=T)
# Initialise the data frame from the first file: skip 3 lines, read 1 row.
df<- read.table( myfiles[1], header = F,row.names = NULL, skip = 3, nrows = 1,sep = "\t")
# [-c(1:3),]
df
# Read all the other files the same way (skip 3 lines, read 1 row each);
# the result is a list with one single-row data frame per file.
ans<- lapply(myfiles[-1], function(x){ read.table( x, header = F, skip = 3, nrows = 1,sep = "\t") })
ans
# Update the data frame: append each row to df through global assignment (<<-).
#[-c(1:3),]
lapply(ans, function(x){df<<-rbind(df,x)} )
# this should be the required dataframe
uncorrect<- array(df)
# Linear regression of ICV extracted from global size FSL
# Location where your icv is located
ICVdir <- "/media/dev/Daten/Task1/T1_Images"
# Load the csv file(s) found in the ICV directory.
mycsv <- list.files(ICVdir,pattern = "*.csv",full.names = T )
af<- read.csv(file = mycsv,header = TRUE)
# Keep the second column of the csv, wrapped in a data frame.
ICV<- as.data.frame(af[,2],drop=FALSE)
#af[1,]
#we take into consideration second column of csv
#finalcsv <-lapply(mycsv[-1],fudnction(x){read.csv(file="global_size_FSL")})
# A one-column data frame holding the constant 0.824 repeated 304 times.
subj1<- as.data.frame(rep(0.824,each=304))
# NOTE(review): both df and subj1 are data frames here, not columns, yet they
# are used directly as formula terms in plot() and lm() below.
plot(df ~ subj1, data = df,
xlab = "ICV value of each subject",
ylab = "Original uncorrected volume",
main="intercept calculation"
)
fit <- lm(subj1 ~ df )
The data frame df has 304 values in following format
6433 6433
1430 1430
1941 1941
3059 3059
3932 3932
6851 6851
and another data frame Subj1 has 304 values in following format
0.824
0.824
0.824
0.824
0.824
When i run my code i am incurring following error
Error in model.frame.default(formula = subj1 ~ df, drop.unused.levels = TRUE) :
invalid type (list) for variable 'subj1'
Any suggestions as to why the data frame values in the variable subj1 are invalid?
As mentioned, you are trying to give a data.frame as an independent variable. Try:
fit <- lm(subj1 ~ ., data=df )
This will use all variables in the data frame, as long as subj1 is the dependent variable's name, and not a data frame by itself.
If df has two columns which are the predictors, and subj1 is the predicted (dependent) variable, combine the two, give them proper column names, and create the model in the format above.
Something like:
# Combine the two predictor columns and the response into one data frame and
# give the three columns explicit names.
data <- cbind(df, subj1)
names(data) <- c("var1", "var2", "subj1")
# Fit on the combined frame: var1, var2 and subj1 exist as columns only in
# `data`, so passing df here would not find them.
fit <- lm(subj1 ~ var1 + var2, data = data)
Edit: some pointers:
make sure you use a single data frame that holds all of your independent variables, and your dependent variable.
The number of rows should be equal.
If an independent variable is a constant, it has no variance across the different values of the dependent variable, and so will have no meaning. If the dependent variable is a constant, there is no point in running a regression - we can predict the value with 100% accuracy.

Applying logistic regression in titanic dataset

I have the famous titanic data set from Kaggle's website. I want to predict the survival of the passengers using logistic regression. I am using the glm() function in R. I first divide my data frame(total rows = 891) into two data frames i.e. train(from row 1 to 800) and test(from row 801 to 891).
The code is as follows
`
>> data <- read.csv("train.csv", stringsAsFactors = FALSE)
>> names(data)
`[1] "PassengerId" "Survived" "Pclass" "Name" "Sex" "Age" "SibSp"
[8] "Parch" "Ticket" "Fare" "Cabin" "Embarked" `
#Replacing NA values in Age column with mean value of non NA values of Age.
>> data$Age[is.na(data$Age)] <- mean(data$Age, na.rm = TRUE)
#Converting sex into binary values. 1 for males and 0 for females.
>> sexcode <- ifelse(data$Sex == "male",1,0)
#dividing data into train and test data frames
>> train <- data[1:800,]
>> test <- data[801:891,]
#setting up the model using glm()
>> model <- glm(Survived~sexcode[1:800]+Age+Pclass+Fare,family=binomial(link='logit'),data=train, control = list(maxit = 50))
#creating a data frame
>> newtest <- data.frame(sexcode[801:891],test$Age,test$Pclass,test$Fare)
>> prediction <- predict(model,newdata = newtest,type='response')
`
And as I run the last line of code
prediction <- predict(model,newdata = newtest,type='response')
I get the following error
Error in eval(expr, envir, enclos) : object 'Age' not found
Can anyone please explain what the problem is? I have checked the newtest variable and there doesn't seem to be any problem with it.
Here is the link to titanic data set https://www.kaggle.com/c/titanic/download/train.csv
First, you should add the sexcode directly to the dataframe:
data$sexcode <- ifelse(data$Sex == "male",1,0)
Then, as I commented, you have a problem in your columns names in the newtest dataframe because you create it manually. You can use directly the test dataframe.
So here is your full working code:
# Load the training file, keeping text columns as character.
data <- read.csv("train.csv", stringsAsFactors = FALSE)

# Replace missing ages with the mean of the observed ages.
data[is.na(data$Age), "Age"] <- mean(data$Age, na.rm = TRUE)

# Numeric sex indicator stored in the data frame itself: 1 for "male", 0 otherwise.
data$sexcode <- as.numeric(data$Sex == "male")

# Rows 1-800 are used for fitting, rows 801-891 for prediction.
train <- data[seq_len(800), ]
test <- data[seq(801, 891), ]

# Logistic regression of Survived on the four predictors.
model <- glm(
  Survived ~ sexcode + Age + Pclass + Fare,
  family = binomial(link = "logit"),
  data = train,
  control = list(maxit = 50)
)

# Predictions on the response scale for the held-out rows; `test` already has
# the column names the model expects.
prediction <- predict(model, newdata = test, type = "response")

Resources