I'm fitting a GBM model in R.
Data sample:
a=structure(list(yield = c(1842L, 2147L, 2444L, 3850L, 1866L, 2897L,
1783L, 2434L, 2650L, 2863L), time.diff = c(122, 186, 177, 168,
162, 186, 161, 125, 187, 185), Biomass = c(18400L, 6400L, 8620L,
12800L, 5400L, 10400L, 6000L, 8800L, 9080L, 60000L)), class = "data.frame", row.names = c(NA,
-10L))
My code:
library(caret) # for createDataPartition()
library(gbm)   # for gbm()

indexes = createDataPartition(a$yield, p = .7, list = F)
train = a[indexes, ]
test = a[-indexes, ]
write.csv(test,"test.csv")
ames_train <- train
ames_test <- test
str(ames_train)
# train GBM model
gbm.fit <- gbm(
  formula = yield ~ .,
  distribution = "gaussian",
  data = ames_train,
  n.trees = 10000,
  interaction.depth = 1,
  shrinkage = 0.001,
  cv.folds = 5,
  n.cores = NULL, # will use all cores by default
  verbose = FALSE
)
# print results
print(gbm.fit)
# get the minimum cross-validated MSE and compute RMSE
sqrt(min(gbm.fit$cv.error))
This gives me the MSE and the RMSE.
How can I calculate R-squared (the coefficient of multiple determination) for this model?
This way you get the R² from predictions obtained by cross-validation, which are true out-of-sample predictions. The fitted gbm object stores them in cv.fitted (available because cv.folds > 1):
r <- cor(gbm.fit$cv.fitted, ames_train$yield) # correlation between CV predictions and observed values
rsq <- round(r^2, 2)
rsq
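Alternatively, here is a short sketch (assuming caret is already loaded for createDataPartition, and scoring the held-out test split instead of the CV folds) that reports RMSE, R² and MAE in one call:
# pick the CV-optimal number of trees, then score the test set
best_iter <- gbm.perf(gbm.fit, method = "cv", plot.it = FALSE)
preds <- predict(gbm.fit, ames_test, n.trees = best_iter)
postResample(pred = preds, obs = ames_test$yield) # RMSE, Rsquared, MAE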
I would like to fit a logistic regression model in R but adjust for age only. How can this be done? I know there is a way to fit univariable logistic models using gtsummary's tbl_uvregression() function. Can we tweak this?
library(tidyverse)
library(gtsummary)
set.seed(100)
trialdata <- tibble(
  outcome = factor(rbinom(1000, 1, 0.10),
                   labels = c("No", "Yes")),
  age = floor(rnorm(1000, 16, 3)),
  score = floor(runif(1000, 30, 99)),
  school = factor(rbinom(1000, 1, 0.35),
                  labels = c("No", "Yes")),
  education = factor(rbinom(1000, 2, 0.20),
                     labels = c("Primary", "Secondary", "Tertiary"))
)
tbl_uv_ex1 <-
  trialdata %>%
  tbl_uvregression(
    method = glm,
    y = outcome,
    method.args = list(family = binomial),
    exponentiate = TRUE
  )
tbl_uv_ex1
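One possible tweak, sketched below: tbl_uvregression() has a formula argument that takes glue syntax, so age can be added to every univariable model, and an include argument to keep age from also appearing as its own predictor row. Treat the exact argument spellings as something to verify against your gtsummary version:
tbl_uv_adj <-
  trialdata %>%
  tbl_uvregression(
    method = glm,
    y = outcome,
    include = -age,                # don't tabulate age as a predictor of its own
    method.args = list(family = binomial),
    exponentiate = TRUE,
    formula = "{y} ~ {x} + age"    # adjust each univariable model for age
  )
tbl_uv_adj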
I'm building a machine-learning model using XGBoost to predict qty based on the price of the product.
Here's my data:
df <- structure(list(price= c(80000, 85000, 87000, 88000, 89000,
90000, 91000, 92000, 93000, 94000, 95000, 96000, 97000, 98000,
102000, 106000, 107000, 108000, 110000, 111000, 112000, 113000,
114000, 115000, 116000, 117000, 118000, 119000), qty = c(1, 212,
298, 243, 309, 195, 248.75, 377, 179, 111, 311.75, 115, 274.5,
121.571428571429, 143, 128.5, 203, 215, 193, 198, 128, 136.25,
134.666666666667, 124.125, 125.75, 144.3, 148.142857142857, 115.333333333333
)), class = "data.frame", row.names = c(NA, -28L))
And here's the code for my model:
library(caret) # for trainControl() and train()

trcontrol <- trainControl(method = "repeatedcv",
                          number = 10,
                          repeats = 3)
tune_grid <- expand.grid(nrounds = 200,
                         max_depth = 5,
                         eta = 0.05,
                         gamma = 0.01,
                         colsample_bytree = 0.75,
                         min_child_weight = 0,
                         subsample = 0.5)
model_train_xgb <- train(qty ~ price, data = df,
                         method = "xgbTree",
                         trControl = trcontrol,
                         tuneGrid = tune_grid,
                         verbose = FALSE,
                         objective = "reg:squarederror")
I'm wondering why, when I predict with this model outside the range of the training data, it always returns the same value.
For example, if I predict using this code:
predict(model_train_xgb, tibble(price = 150000))
it returns 119.7586, and if I change the price to a higher number it still returns 119.7586.
It returns the same number for whatever value I pass in, as long as that value is higher than the training data.
Can you tell me why?
Thanks
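A quick sketch that reproduces the flat region (reusing the fitted model from above): any price past the largest training value of 119000 lies beyond the last split the trees learned, so every such input reaches the same terminal leaves.
# predictions for any price above the training maximum come out identical
predict(model_train_xgb, tibble(price = c(120000, 150000, 1e6)))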
I'm using random forest for the first time, and I was wondering why I can't see all the statistics in the call after running the model. I expected to also see accuracy and OOB error, for instance.
library(randomForest) # for randomForest()

rf <- randomForest(VWC ~ ., data = Data_All,
                   mtry = best.m, importance = TRUE,
                   ntree = 500)
print(rf)
Call:
randomForest(formula = VWC ~ ., data = Data_All, mtry = best.m, importance = TRUE, ntree = 500)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 41
Mean of squared residuals: 4422.255
% Var explained: 49.43
I selected a few columns because it is a huge dataframe:
dput(Data_All[1:20, c(1, 8)])
structure(list(VWC = c(56.1, 50.6, 60.7, 40.1, 47.3, 52.8, 51.4,
44, 47.1, 51.9, 47.7, 51, 45.3, 49.3, 52.4, 51.8, 52.2, 49, 46.1,
44.4), vh_glcm_7 = c(0.910762965, 0.910106623, 0.91908574, 0.926299954,
0.945425676, 0.948440292, 0.948440292, 0.940617102, 0.938677993,
0.927837995, 0.928881178, 0.931472503, 0.927350345, 0.919106027,
0.917548344, 0.922729618, 0.93057993, 0.93057993, 0.931927558,
0.927018363)), row.names = c(NA, 20L), class = "data.frame")
For regression there is no accuracy or OOB classification error. Instead you get the MSE ("Mean of squared residuals") and "% Var explained", an R²-type measure; these are the standard measures for regression, and you should look into them if you do not know them yet.
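If you want those numbers programmatically, here is a small sketch based on the components a regression randomForest object exposes (mse and rsq hold one out-of-bag value per tree, so take the last):
tail(rf$mse, 1) # mean of squared residuals (OOB MSE)
tail(rf$rsq, 1) # pseudo R-squared; multiply by 100 for '% Var explained'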
I'm currently practicing SVM with a sample dataset and trying to add the abline to the plot, but R says w[1,2] does not have the correct dimensions, which leads me back to the created variable w, where coefs and SV are not working in the code.
I've tried lowercasing SV, which runs but results in w being numeric (empty) instead of num [1, 1:2] -0.035 -0.0188.
animal_data <- structure(list(Height = c(44, 52.1, 57.1, 33, 27.8, 27.2, 32,
45.1, 56.7, 56.9, 122.1, 123.9, 122.9, 101.1, 128.9, 137.1, 127,
103, 141.6, 102.4), Weight = c(126.3, 136.9, 109.2, 148.3, 110.4,
107.8, 128.4, 120.2, 140.2, 139.2, 154.1, 170.8, 183.1, 164,
193.6, 181.7, 164.8, 174.6, 185.8, 176.9)), class = "data.frame", row.names = c(NA,
-20L))
#create an is horse indicator variable
ishorse <- c(rep(-1,10),rep(+1,10))
library(e1071)
#create data frame for performing svm
data <- data.frame(Height = animal_data['Height'],
                   Weight = animal_data['Weight'],
                   animal = as.factor(ishorse))
#plot data, coloring by class
plot(data[,-3], col = (ishorse + 3)/2, pch = 19); abline(h = 0, v = 0, lty = 3)
#perform svm
svm.model <- svm(animal ~ .,
                 data = data,
                 type = 'C-classification',
                 kernel = 'linear',
                 scale = FALSE)
#show support vectors
points(data[svm.model$index, c(1,2)], col="orange", cex = 2)
#get parameters of hyperplane
w <- t(svm.model$coefs) %% svm.model$SV
b <- -svm.model$rho
#in this 2D case the hyperplane is the line w[1,1]*x1 + w[1,2]*x2 + b = 0
abline(a = -b/w[1,2], b = -w[1,1]/w[1,2], col = "blue", lty = 3)
The correct syntax for matrix multiplication is %*%. (%% is R's modulo operator, so t(svm.model$coefs) %% svm.model$SV attempts element-wise remainders instead of a matrix product, which is why w does not come out as a 1 x 2 matrix.) Try this:
svm.model <- svm(animal ~ .,
                 data = data,
                 type = 'C-classification',
                 kernel = 'linear',
                 scale = FALSE)
#show support vectors
points(data[svm.model$index, c(1,2)], col="orange", cex = 2)
#get parameters of hyperplane
w <- t(svm.model$coefs) %*% svm.model$SV
b <- -1*svm.model$rho
#in this 2D case the hyperplane is the line w[1,1]*x1 + w[1,2]*x2 + b = 0
abline(a = -b/w[1,2], b = -w[1]/w[2], col = "blue", lty = 3)
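As a quick sanity check (a sketch; the sign convention can flip depending on the factor-level ordering e1071 picked), the hand-computed decision values w·x + b should line up with what predict() reports:
# compare manual decision values against e1071's
manual <- as.matrix(data[, 1:2]) %*% t(w) + b
svm_dv <- attr(predict(svm.model, data[, -3], decision.values = TRUE),
               "decision.values")
head(cbind(manual, svm_dv))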
I would like to plot both a linear model (LM) and a non-linear (GLM) model of the same data.
The range between 16% and 84% should line up between the LM and GLM (citation: section 3.5).
I have included a fairly complete chunk of the code because I am not sure at which point I should try to cut the linear model, or at which point I have messed up; I think it is in the linear model.
The code below results in the following image:
My objective (taken from the previous citation link).
Here is my data:
mydata3 <- structure(list(
dose = c(0, 0, 0, 3, 3, 3, 7.5, 7.5, 7.5, 10, 10, 10, 25, 25, 25, 50, 50, 50),
total = c(25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L),
affected = c(1, 0, 1.2, 2.8, 4.8, 9, 2.8, 12.8, 8.6, 4.8, 4.4, 10.2, 6, 20, 14, 12.8, 23.4, 21.6),
probability = c(0.04, 0, 0.048, 0.112, 0.192, 0.36, 0.112, 0.512, 0.344, 0.192, 0.176, 0.408, 0.24, 0.8, 0.56, 0.512, 0.936, 0.864)),
.Names = c("dose", "total", "affected", "probability"),
row.names = c(NA, -18L),
class = "data.frame")
My script:
#load libraries
library(ggplot2)
library(drc) # glm model
library(plyr) # rename function
library(scales) #log plot scale
#Creating linear model
mod_linear <- lm(probability ~ (dose), weights = total, data = mydata3)
#Creating data.frame: the values 3 and 120 refer to the 16% and 84% responses in the sigmoidal plot
line_df <- expand.grid(dose = exp(seq(log(3), log(120), length = 200)))
#Extracting values from linear model
p_line_df <- as.data.frame(cbind(dose = line_df,
                                 predict(mod_linear, newdata = data.frame(dose = line_df),
                                         interval = "confidence", level = 0.95)))
#Renaming linear df columns
p_line_df <- rename(p_line_df, c("fit" = "probability"))
p_line_df <- rename(p_line_df, c("lwr" = "Lower"))
p_line_df <- rename(p_line_df, c("upr" = "Upper"))
p_line_df$model <- "Linear"
#Create sigmoidal dose-response curve using drc package
mod3 <- drm(probability ~ (dose), weights = total, data = mydata3, type ="binomial", fct=LL.2(names=c("Slope:b","ED50:e")))
#data frame for ggplot2
base_DF_3 <- expand.grid(dose = exp(seq(log(1.0000001), log(10000), length = 200)))
#extract data from model
p_df3 <- as.data.frame(cbind(dose = base_DF_3,
                             predict(mod3, newdata = data.frame(dose = base_DF_3),
                                     interval = "confidence", level = .95)))
#renaming columns
p_df3 <- rename(p_df3, c("Prediction" = "probability"))
p_df3$model <- "Sigmoidal"
#combining both data frames
p_df_all <- rbind(p_df3, p_line_df)
#plotting
ggplot(p_df_all, aes(x = dose, y = probability, group = model)) +
  geom_line(aes(linetype = model), show.legend = TRUE) +
  scale_x_log10(breaks = c(0.000001, 10^(0:10)), labels = c(0, math_format()(0:10)))
Looking at the reference you provided, what the authors describe is the use of a linear model to approximate the central portion of a (sigmoidal) logistic function. The linear model that achieves this is a straight line that passes through the inflection point of the logistic curve, and has the same slope as the logistic function at that inflection point. We can use some basic algebra and calculus to solve this problem.
From ?LL.2, we see that the form of the logistic function being fitted by drm is
f(x) = 1 / {1 + exp(b(log(x) - log(e)))}
We can get the values of the coefficients in this equation by
b = mod3$coefficients[1]
e = mod3$coefficients[2]
Now, by differentiating with respect to log(x) (the relevant coordinate here, since the plot uses a log-scaled dose axis), the slope of the logistic function is given by
dy/d(log x) = -(b * exp((log(x)-log(e))*b)) / (1+exp((log(x)-log(e))*b))^2
At the inflection point the dose x equals the coefficient e, so the exponential terms become exp(0) = 1 and the slope simplifies (greatly) to
sl50 = -b/4
Since we also know that the inflection point occurs at the point where probability = 0.5 and dose = e, we can construct the straight line (in log-transformed coordinates) like this:
linear_probability = sl50 * (log(p_df3$dose) - log(e)) + 0.5
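As an aside, if you also want the doses at which this tangent line crosses the 16% and 84% responses mentioned in the question, here is a small sketch that just inverts the line equation above:
# solve p = sl50 * (log(x) - log(e)) + 0.5 for x at p = 0.16 and p = 0.84
dose_16 <- exp(log(e) + (0.16 - 0.5) / sl50)
dose_84 <- exp(log(e) + (0.84 - 0.5) / sl50)
c(dose_16, dose_84)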
Now, to plot the logistic and linear functions together:
p_df3_lin <- p_df3
p_df3_lin$model <- 'linear'
p_df3_lin$probability <- linear_probability
p_df_all <- rbind(p_df3, p_df3_lin)
ggplot(p_df_all, aes(x = dose, y = probability, group = model)) +
  geom_line(aes(linetype = model), show.legend = TRUE) +
  scale_x_log10(breaks = c(0.000001, 10^(0:10)), labels = c(0, math_format()(0:10))) +
  scale_y_continuous(limits = c(0, 1))
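Note that scale_y_continuous(limits = c(0, 1)) drops (with a warning about removed rows) the parts of the straight line that fall below 0 or above 1, so only the central portion that approximates the logistic curve is drawn.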