R GLM: Modify coefficients of an existing glm model - r

I have been trying to adjust the coefficients of an existing glm model but the predictions don't seem to change. The idea is to enhance an existing logistic model by incorporating 'qualitative' parameters in the quantitative coefficients (see 'adj model' block). I replicated the problem below.
I really appreciate any. Thank you!
set.seed(100)
#create sim data (correlated)
input_size <- 200
scale <- 10000
y_var = sample(0:1, input_size, replace = TRUE)
input_data <- cbind.data.frame(y_var, x1 = sample(1:1000, input_size, replace = TRUE) + (y_var*200), x2 = sample(1:50, input_size, replace = TRUE) - (y_var*30))
cor(input_data)
#build log-reg model
reg1 <- glm(input_data$y ~ input_data$x1 + input_data$x2, data = input_data, family = "binomial")
reg1$coefficients
#test log-reg model
input_test <- cbind.data.frame(x1 = sample(1:1000, input_size, replace = TRUE) + (y_var*400), x2 = sample(1:50, input_size, replace = TRUE) - (y_var*10))
y_predict <- predict(reg1, input_test, type="response")
#adjust log-reg model
adj_coeff <- round(c(intercept = reg1$coefficients[1], x1 = reg1$coefficients[2] * 3, x2 = -reg1$coefficients[3] * 0.5), 4)
reg2 <- reg1
reg2$coefficients <- as.numeric(adj_coeff)
reg2$coefficients
#visualize predication of the log-reg models
y2_predict <- predict(reg1, input_test, type="response")
plot(y_predict, type = "p", lwd = 2)
lines(y2_predict, type = "p", pch = 3, col = "orange")

Related

neural network with R package nnet: rubbish prediction due to overfitting?

Trying to figure out if I have an R problem or a general neural net problem.
Say I have this data:
set.seed(123)
n = 1e3
x = rnorm(n)
y = 1 + 3*sin(x/2) + 15*cos(pi*x) + rnorm(n = length(x))
df = data.frame(y,x)
df$train = sample(c(TRUE, FALSE), length(y), replace=TRUE, prob=c(0.7,0.3))
df_train = subset(df, train = TRUE)
df_test = subset(df, train = FALSE)
then you train the neural net and it looks good on the holdout:
library(nnet)
nn = nnet(y~x, data = df_train, size = 60, linout=TRUE)
yhat_nn = predict(nn, newdata = df_test)
plot(df_test$x,df_test$y)
points(df_test$x, yhat_nn, col = 'blue')
Ok, so then I thought, let's just generate new data and then predict using the trained net. But the predictions are way off:
x2 = rnorm(n)
y2 = 1 + 3*sin(x2/2) + 15*cos(pi*x2) + rnorm(n = length(x2))
df2 = data.frame(y2,x2)
plot(df2$x, df2$y)
points(df2$x, predict(nn, newdata = df2), col = 'blue')
Is this because I overfitted to the training set? I thought by splitting the original data into test-train I would avoid overfitting.
The fatal issue is that your new data frame, df2, does not have the correct variable names. As a result, predict.nnet can not find the right values.
names(df)
#[1] "y" "x" "train"
names(df2)
#[1] "y2" "x2"
Be careful when you construct a data frame for predict.
## the right way
df2 <- data.frame(y = y2, x = x2)
## and it solves the mystery
plot(df2$x, df2$y)
points(df2$x, predict(nn, newdata = df2), col = 'blue')
Another minor issue is your use of subset. It should be
## not train = TRUE or train = FALSE
df_train <- subset(df, train == TRUE) ## or simply subset(df, train)
df_test <- subset(df, train == FALSE) ## or simply subset(df, !train)
This has interesting effect:
nrow(subset(df, train == TRUE))
#[1] 718
nrow(subset(df, train = TRUE)) ## oops!!
#[1] 1000
The complete R session
set.seed(123)
n = 1e3
x = rnorm(n)
y = 1 + 3*sin(x/2) + 15*cos(pi*x) + rnorm(n = length(x))
df = data.frame(y,x)
df$train = sample(c(TRUE, FALSE), length(y), replace=TRUE, prob=c(0.7,0.3))
df_train = subset(df, train == TRUE) ## fixed
df_test = subset(df, train == FALSE) ## fixed
library(nnet)
nn = nnet(y~x, data = df_train, size = 60, linout=TRUE)
yhat_nn = predict(nn, newdata = df_test)
plot(df_test$x,df_test$y)
points(df_test$x, yhat_nn, col = 'blue')
x2 = rnorm(n)
y2 = 1 + 3*sin(x2/2) + 15*cos(pi*x2) + rnorm(n = length(x2))
df2 = data.frame(y = y2, x = x2) ## fixed
plot(df2$x, df2$y)
points(df2$x, predict(nn, newdata = df2), col = 'blue')

LIME in R on xgboost model with objective 'count:poisson'

I am trying to use LIME in R to explain an xgboost model with the objective 'count:poisson'. It seems to work just fine for the standard 'reg:linear'. Is there a way around this? The question was previously asked here, but with no accepted answers.
Can the R version of lime explain xgboost models with count:poisson objective function?
require(dplyr)
require(xgboost)
require(lime)
#generate data
df_train <- data.frame(
x1 = rnorm(n = 1000),
x2 = rnorm(n = 1000),
x3 = rnorm(n = 1000)) %>%
mutate(y = rpois(1000, pmax(0, x1 + 2*x2 - 0.5*x3)))
df_hold_out <- data.frame(
x1 = rnorm(n = 5),
x2 = rnorm(n = 5),
x3 = rnorm(n = 5)) %>%
mutate(y = rpois(5, pmax(0, x1 + 2*x2 - 0.5*x3)))
#set matrix
dmat <- xgb.DMatrix(data = as.matrix(df_train[, c("x1", "x2", "x3")]), label = df_train[["y"]])
#train with linear objective
mod_linear <- xgboost(data = dmat, nrounds = 100, params = list(objective = "reg:linear"))
#train with poisson objective
mod_poisson <- xgboost(data = dmat, nrounds = 100, params = list(objective = "count:poisson"))
#explain linear model
explainer_linear <- lime(x = df_hold_out, model = mod_linear, n_bins = 5)
explanation_linear <- lime::explain(
x = df_hold_out[, c("x1", "x2", "x3")],
explainer = explainer_linear,
n_permutations = 5000,
dist_fun = "gower",
kernel_width = .75,
n_features = 10,
feature_select = "highest_weights")
#plot
plot_features(explanation_linear)
#explain poisson model
explainer_poisson <- lime(x = df_hold_out, model = mod_poisson, n_bins = 5)
explanation_poisson <- lime::explain(
x = df_hold_out[, c("x1", "x2", "x3")],
explainer = explainer_poisson,
n_permutations = 5000,
dist_fun = "gower",
kernel_width = .75,
n_features = 10,
feature_select = "highest_weights")
#plot
plot_features(explanation_poisson)
trying to run the explain function on the poisson explainer ultimately tosses this error
Error: Unsupported model type

How to avoid getting grid arrangements with plotROCCurves()?

I want to plot ROC Curves into one plot, so that I can compare each one. An easy on going tutorial for that can be found on MLR-org.com.
There you found this example:
library(mlr)
n = getTaskSize(sonar.task)
train.set = sample(n, size = round(2/3 * n))
test.set = setdiff(seq_len(n), train.set)
lrn1 = makeLearner("classif.lda", predict.type = "prob")
mod1 = train(lrn1, sonar.task, subset = train.set)
pred1 = predict(mod1, task = sonar.task, subset = test.set)
lrn2 = makeLearner("classif.ksvm", predict.type = "prob")
mod2 = train(lrn2, sonar.task, subset = train.set)
pred2 = predict(mod2, task = sonar.task, subset = test.set)
df = generateThreshVsPerfData(list(lda = pred1, ksvm = pred2), measures = list(fpr, tpr))
plotROCCurves(df)
This should generate a graphic like this:
But instead I get always this one:
Is anybody there who can help me to get only one graphic?

Iterate two arguments with map2 (purrr function)

I want to calculate all possible predictions with different probabilities of my data with multiple models. The result is a list.
df<-iris
df$y<-sample(0:1,nrow(df),replace=TRUE)
set.seed(101)
#Now Selecting 80% of data as sample from total 'n' rows of the data
sample <- sample.int(n = nrow(df), size = floor(.8*nrow(df)), replace = F)
train <- df[sample, ]
test <- df[-sample, ]
Then i create a logistic model:
full <- glm(y~., data = train, family = "binomial")
min <- glm( y~ 1, data = train, family = "binomial")
backward <- step(full,direction = "backward",trace=0)
forward <- step(min,scope=list(lower=min, upper=full),direction = "forward",trace=0)
model2<- glm(y~Sepal.Length+Sepal.Width , data = train, family = "binomial")
models<-list(backward,forward,model2)
prediction<- lapply(models, function(x){predict(x,newdata=test,type="response")})
First of all i have table with predictions. Then i created a vector with all posible probabilities.
p <- seq(from = 0.1, to = 0.9, by = 0.5)
Problem is i want to apply differents breaks point. I tried with map2 function of purrr package but it doesn't work.
pred = map2(prediction,p, function(x,pi){ifelse(x > pi, 1, 0)})
The problem is:
Error: .x (3) and .y (2) are different lengths
Anyone can help?
I think is best to change apply to sapply, then i will have a data.frame.
prediction<- sapply(models, function(x){predict(x, newdata=test,type="response")},
simplify = T,USE.NAMES = TRUE)
Then i could use pmap function?
thanks
EDIT: I updated with all code.
See if this makes sense:
df<-iris
df$y<-sample(0:1,nrow(df),replace=TRUE)
set.seed(101)
#Now Selecting 80% of data as sample from total 'n' rows of the data
sample <- sample.int(n = nrow(df), size = floor(.8*nrow(df)), replace = F)
train <- df[sample, ]
test <- df[-sample, ]
full <- glm(y~., data = train, family = "binomial")
min <- glm( y~ 1, data = train, family = "binomial")
backward <- step(full,direction = "backward",trace=0)
forward <- step(min,scope=list(lower=min, upper=full),direction = "forward",trace=0)
model2<- glm(y~Sepal.Length+Sepal.Width , data = train, family = "binomial")
models<-list(backward,forward,model2)
prediction<- lapply(models, function(x){predict(x,newdata=test,type="response")})
p <- seq(from = 0.1, to = 0.9, by = 0.5)
combn = cross2(prediction, p)
pred <- map(combn,
function(combination) {
x <- combination[[1]]
pi <- combination[[2]]
ifelse(x > pi, 1, 0)
}
)

Bootstrapping of multiple values using boot::boot()

I try to estimate confidence intervals for several parameters of a nonlinear model using bootstrapping. Right now, I do bootstrapping for for each parameter individually. Therefore I have to gererate the model serveral times.
Here is an example:
library(boot)
# generate some data:
x <- rnorm(300, mean = 5, sd = 2)
y <- xvalues^2*rnorm(300, mean = 1.5, sd = 1) + rnorm(300, mean = 3, sd = 1)
data <- data.frame(x = x, y = y)
# this is my model: nls(y ~ b1*x^2+b2, data = data, start = list(b1 = 1.5,b2 = 3))
# functions for bootstrapping:
getParamB1 <- function(x1, idx){
data <- x1 %>%
dplyr::slice(idx)
model <- nls(y ~ b1*x^2+b2, data = data, start = list(b1 = 1.5,b2 = 3))
coef(model)[['b1']]
}
getParamB2 <- function(x1, idx){
data <- x1 %>%
dplyr::slice(idx)
model <- nls(y ~ b1*x^2+b2, data = data, start = list(b1 = 1.5,b2 = 3))
coef(model)[['b2']]
}
# Calculate bootstrap confidence intervals
btrpB1 <- boot(data, statistic = getParamB1, R=200)
btrpB2 <- boot(data, statistic = getParamB2, R=200)
ciB1 <- boot.ci(btrpB1)
ciB2 <- boot.ci(btrpB2)
This is of course not very nice code. Is there a way to estiamte confidence intervals for several parameters (here b1 and b2) at once?
How about this?
library(boot)
# generate some data:
x <- rnorm(300, mean = 5, sd = 2)
y <- x^2 * rnorm(300, mean = 1.5, sd = 1) + rnorm(300, mean = 3, sd = 1)
df <- data.frame(x = x, y = y)
m1 <- nls(y ~ b1 * x^2 + b2, data = df, start = list(b1 = 1.5, b2 = 3))
boot.coef <- function(mod, data, indices) {
assign(deparse(mod$data), data[indices, ])
m <- eval(mod$call)
return(coef(m))
}
results <- boot(data = df, statistic = boot.coef,
R = 1000, mod = m1)

Resources