I would like to fit the data and predict y values over a wider x range.
Let's assume I have the 'iris' data set and use the following code for prediction, from this post:
library(dplyr)
library(minpack.lm)  # provides nlsLM and nls.lm.control
cc <- iris %>%
  group_by(Species) %>%
  do({
    mod <- nlsLM(Sepal.Length ~ k*Sepal.Width/2 + U, start = c(k = 10, U = 5),
                 data = ., trace = F, control = nls.lm.control(maxiter = 100))
    pred <- predict(mod, newdata = .["Sepal.Width"])
    data.frame(., pred)
  })
This is the fitting plot:
I want to fit this data over a wider Sepal.Width range, such that
new.range <- data.frame(x = seq(2, 10, length.out = 20))
and modify the script to
pred <- predict(mod, newdata = new.range)
To plot the fit over new.range:
library(ggplot2)
ggplot(cc, aes(y = Sepal.Length, x = Sepal.Width, col = factor(Species))) +
  geom_point() +
  facet_wrap(~Species) +
  geom_line(aes(x = new.range, y = pred), size = 1)
Error in (function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, :
  arguments imply differing number of rows: 20, 150
I cannot understand why I am getting this error. I assumed that pred is calculated from new.range, so they should have the same length?
Similar posts:
using-predict-in-nls
trouble-with-predict-function-in-r
predict-maybe-im-not-understanding-it?
This achieves what you want. The cause of your original problem is that in your regression the predictor is named Sepal.Width, not x, so your prediction does not use new.range at all; you have to write new.range <- data.frame(Sepal.Width = seq(2, 10, length.out = 50)) to make predictions on new.range.
Another problem is that new.range must have length 50, so that pred and new.range line up with each species' 50 rows when they are bound into the original data.frame.
Then you can draw the plot you want; note that inside cc the new range appears as Sepal.Width.1.
library(dplyr)
library(minpack.lm)  # provides nlsLM and nls.lm.control
cc <- iris %>%
  group_by(Species) %>%
  do({
    mod <- nlsLM(Sepal.Length ~ k*Sepal.Width/2 + U, start = c(k = 10, U = 5),
                 data = ., trace = F, control = nls.lm.control(maxiter = 100))
    new.range <- data.frame(Sepal.Width = seq(2, 10, length.out = 50))
    pred <- predict(mod, newdata = new.range)
    # pred <- predict(mod, newdata = .["Sepal.Width"])
    data.frame(., new.range, pred)
  })
library(ggplot2)
ggplot(cc, aes(y = Sepal.Length, x = Sepal.Width, col = factor(Species))) +
  geom_point() +
  facet_wrap(~Species) +
  geom_line(aes(x = Sepal.Width.1, y = pred), size = 1)
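As a variation (a sketch, not part of the original answer), you can keep the predictions in a separate data frame and pass it to geom_line() via its data argument; this avoids the Sepal.Width.1 column and lets new.range have any length:
library(dplyr)
library(minpack.lm)
library(ggplot2)
# fitted curves in their own data frame; do() keeps the Species column
preds <- iris %>%
  group_by(Species) %>%
  do({
    mod <- nlsLM(Sepal.Length ~ k*Sepal.Width/2 + U, start = c(k = 10, U = 5), data = .)
    new.range <- data.frame(Sepal.Width = seq(2, 10, length.out = 50))
    data.frame(new.range, pred = predict(mod, newdata = new.range))
  })
ggplot(iris, aes(x = Sepal.Width, col = factor(Species))) +
  geom_point(aes(y = Sepal.Length)) +
  facet_wrap(~Species) +
  geom_line(data = preds, aes(y = pred), size = 1)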
Related
I would like to run some regression models with different y (so the independent variables stay the same for all the models), and then extract the residuals from each of these models and add them to the original data set.
I will use diamonds to show what I came up with:
# In my example, the models are: x or y or z = carat + cut + color + clarity + price
dependent = c("x", "y", "z")
model = function(y, dataset) {
  a = map(
    setNames(y, y),
    ~ glm(reformulate(termlabels = c("carat", "cut", "color", "clarity", "price"),
                      response = y),
          family = gaussian(link = "identity"),
          data = dataset)
  )
  resids = map_dfr(a, residuals)
  df = bind_cols(dataset, resids)
  print(df)
}
model(y = dependent, dataset = diamonds)
But this code doesn't work. I would also like to have sensible names for the residuals that are added as new columns; otherwise it is difficult to tell the residuals apart when the number of models is big.
Generate example data
library(tidyverse)
set.seed(101)
dd <- diamonds
dependent <- c("x", "y", "z")
for (d in dependent) {
  dd[[d]] <- rnorm(nrow(diamonds))
}
Process
library(broom)
res <- (dependent
  ## set names so .id = argument works downstream
  %>% setNames(dependent)
  ## construct list of formulas
  %>% map(reformulate, termlabels = c("carat", "cut", "color", "clarity", "price"))
  ## fit glms
  %>% map(glm, family = gaussian(link = "identity"), data = dd,
          na.action = na.exclude)
  ## compute resids (add observation number) and collapse to tibble
  %>% map_dfr(~ tibble(.obs = seq(nrow(dd)), .resid = residuals(.)), .id = "response")
  ## widen data → residuals from each response variable as a column
  %>% pivot_wider(names_from = "response", values_from = ".resid",
                  names_prefix = "res_")
  %>% select(-.obs)
)
## combine with original data
res2 <- bind_cols(dd, res)
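As a quick check (a sketch), the residual columns now carry the res_ prefix, which also gives them the sensible names you asked for:
## the three residual columns should be res_x, res_y, res_z
grep("^res_", names(res2), value = TRUE)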
Notes:
it's not obvious to me why you're using glm(., family = gaussian(link = "identity")) here, unless it's a placeholder for something more complicated you're doing with your real data. (If this is your actual model, then using lm() will be simpler and faster; a sketch follows these notes.)
adding na.action = na.exclude to the [g]lm() call will include NA values in the predictions, residuals, etc., which will help your residuals line up better with the original data.
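If the Gaussian identity-link model really is the final model, the same pipeline with lm() might look like this (a sketch under that assumption):
res_lm <- (dependent
  %>% setNames(dependent)
  %>% map(reformulate, termlabels = c("carat", "cut", "color", "clarity", "price"))
  ## lm() instead of glm(); na.exclude keeps residuals aligned with rows
  %>% map(lm, data = dd, na.action = na.exclude)
  %>% map_dfr(~ tibble(.obs = seq(nrow(dd)), .resid = residuals(.)), .id = "response")
  %>% pivot_wider(names_from = "response", values_from = ".resid",
                  names_prefix = "res_")
  %>% select(-.obs)
)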
Using the 'iris' dataset (slightly modified as below), I plot the results of an LME.
PLEASE NOTE: I am only using the iris dataset as mock data for the purpose of plotting, so please do not critique the appropriateness of this test. I'm not interested in the statistics, rather the plotting.
Using the ggpredict function and plotting the results, the plot extends the predictions beyond the range of the data. Is there a systematic way to plot predictions only within the range of each facet's data?
I can plot each facet separately, limit the axes per plot manually, and cowplot them back together, but if there is a way to say 'predict only to the max. and min. of the data for that group', this would be great.
Given that these are facets of a single model, perhaps not showing the predictions for different groups is in fact misleading, and I should rather create three different models if I only want predictions within those data subsets?
library(lme4)
library(ggeffects)
library(ggplot2)
library(dplyr)  # needed for %>%, group_by(), mutate(), glimpse()
data(iris)
glimpse(iris)
df = iris
glimpse(df)
df_ed = df %>% group_by(Species) %>%
  mutate(Sepal.Length = ifelse(Species == "setosa", Sepal.Length + 10, Sepal.Length + 0))
df_ed = df_ed %>% group_by(Species) %>%
  mutate(Sepal.Length = ifelse(Species == "versicolor", Sepal.Length - 3, Sepal.Length + 0))
glimpse(df_ed)
m_test =
lmer(Sepal.Width ~ Sepal.Length * Species +
(1|Petal.Width),
data = df_ed, REML = T)
summary(m_test)
test_plot = ggpredict(m_test, c("Sepal.Length", "Species"), type = "re") %>%
  plot(rawdata = T, dot.alpha = 0.6, facet = T, alpha = 0.3)
As per the OP's comment, I think this will provide a solution. In this example, I use data from the sleepstudy dataset that comes with the lme4 package. First, we have to postulate a mixed model, which I generically call fit.
Note that I do not perform any hypothesis test to formally select an appropriate random-effects structure. Of course, this is essential to adequately capture the correlations in the repeated measurements, but falls outside the scope of this post.
library(lme4)
library(splines)
# quantiles of Days
quantile(sleepstudy$Days, c(0.05, 0.95))
# 5% 95%
# 0 9
# mixed model
fit <- lmer(Reaction ~ ns(Days, df = 2, Boundary.knots = c(0, 9)) +
              (Days | Subject), data = sleepstudy)
# new data.frame for prediction
ND <- with(sleepstudy, expand.grid(Days = seq(0L, 9L, len = 50)))
Then, we need a function that enables us to obtain predictions from fit for certain values of the covariates. The function effectPlot_lmer() takes the following arguments:
object: the fitted merMod object (the mixed model).
ND: the new data.frame specifying the values of the covariates for which we want to obtain predictions.
orig_data: the data.frame on which the mixed model was fitted.
# function to obtain predicted reaction times
effectPlot_lmer <- function (object, ND, orig_data) {
    form <- formula(object, fixed.only = TRUE)   # fixed-effects part of the model formula
    namesVars <- all.vars(form)
    betas <- fixef(object)                       # fixed-effects coefficients
    V <- vcov(object)                            # covariance matrix of the coefficients
    orig_data <- orig_data[complete.cases(orig_data[namesVars]), ]
    Terms <- delete.response(terms(form))
    mfX <- model.frame(Terms, data = orig_data)
    Terms_new <- attr(mfX, "terms")
    mfX_new <- model.frame(Terms_new, ND, xlev = .getXlevels(Terms, mfX))
    X <- model.matrix(Terms_new, mfX_new)        # design matrix for the new data
    pred <- c(X %*% betas)                       # population-level predictions
    ses <- sqrt(diag(X %*% V %*% t(X)))          # standard errors of the predictions
    ND$pred <- pred
    ND$low <- pred - 1.96 * ses                  # 95% pointwise confidence band
    ND$upp <- pred + 1.96 * ses
    return(ND)
}
Finally, we can make an effect plot with ggplot.
# effect plot
library(ggplot2)
ggplot(effectPlot_lmer(fit, ND, orig_data = sleepstudy),
aes(x = Days, y = pred)) +
geom_line(size = 1.2, colour = 'blue4') +
geom_ribbon(aes(ymin = low, ymax = upp), colour = NA,
fill = adjustcolor('blue4', 0.2)) +
theme_bw() + ylab('Expected Reaction (ms)')
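To tie this back to the original question of predicting only within each group's range, here is a sketch (assuming the m_test model and df_ed data from the question) that builds the prediction grid per Species from each group's observed minimum and maximum, then reuses effectPlot_lmer():
library(dplyr)
# per-group grid limited to each Species' observed Sepal.Length range
ND_grouped <- df_ed %>%
  group_by(Species) %>%
  do(data.frame(Sepal.Length = seq(min(.$Sepal.Length), max(.$Sepal.Length),
                                   length.out = 50))) %>%
  as.data.frame()
eff <- effectPlot_lmer(m_test, ND_grouped, orig_data = as.data.frame(df_ed))
ggplot(eff, aes(x = Sepal.Length, y = pred)) +
  geom_line(size = 1.2, colour = 'blue4') +
  geom_ribbon(aes(ymin = low, ymax = upp), colour = NA,
              fill = adjustcolor('blue4', 0.2)) +
  facet_wrap(~ Species, scales = "free_x") +
  theme_bw() + ylab('Expected Sepal.Width')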
I am plotting box-plots of the accuracy scores of resamples of yearly models trained with caret.
The models are named by the years they refer to: 2000, 2001, 2002, ..., 2010.
I want the models to appear in the box-plots with ascending order based on the year i.e. name of the model.
The summary of the resamples, based on the code below,
fit.year.res <- resamples(fit.year)
summary(fit.year.res)
looks like this:
But then, the different yearly models in the box plot are not sorted:
scales <- list(x=list(relation="free"), y=list(relation="free"))
bwplot(fit.year.res, scales=scales)
I have tried converting the models element of the resamples object, fit.year.res$models, from character to factor, but it didn't make any difference.
I am not aware of an easy solution using bwplot method from caret package. Perhaps there is one but my lattice skills are lacking. I recommend plotting the boxplots manually using ggplot2. This way you will have much better control over the final plot.
Since you did not post an example with data, I will use one of the examples from ?caret:::bwplot.resamples:
library(caret)
library(party)
library(RWeka)
load(url("http://topepo.github.io/caret/exampleModels.RData"))
resamps <- resamples(list(CART = rpartFit,
CondInfTree = ctreeFit,
MARS = earthFit))
bwplot(resamps,
metric = "RMSE")
produces:
To make the plot manually using ggplot you will need some data manipulation:
library(tidyverse)
resamps$values %>% #extract the values
select(1, ends_with("RMSE")) %>% #select the first column and all columns with a name ending with "RMSE"
gather(model, RMSE, -1) %>% #convert to long table
mutate(model = sub("~RMSE", "", model)) %>% #leave just the model names
ggplot()+ #call ggplot
geom_boxplot(aes(x = RMSE, y = model)) -> p1 #and plot the box plot
p1
To set a specific order on the y axis:
p1 +
scale_y_discrete(limits = c("MARS", "CART", "CondInfTree"))
If you prefer lattice:
library(lattice)
resamps$values %>%
select(1, ends_with("RMSE")) %>%
gather(model, RMSE, -1) %>%
mutate(model = sub("~RMSE", "", model)) %>%
{bwplot(model ~ RMSE, data = .)}
To change the order, change the levels of model (this approach also works with ggplot2; a ggplot2 version is sketched after the lattice code):
resamps$values %>%
select(1, ends_with("RMSE")) %>%
gather(model, RMSE, -1) %>%
mutate(model = sub("~RMSE", "", model),
model = factor(model, levels = c("MARS", "CART", "CondInfTree"))) %>%
{bwplot(model ~ RMSE, data = .)}
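For completeness, the same factor-levels trick in the ggplot2 version (a sketch):
resamps$values %>%
  select(1, ends_with("RMSE")) %>%
  gather(model, RMSE, -1) %>%
  mutate(model = sub("~RMSE", "", model),
         model = factor(model, levels = c("MARS", "CART", "CondInfTree"))) %>%
  ggplot() +
  geom_boxplot(aes(x = RMSE, y = model))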
The function bwplot.resamples is used to generate this plot, and if you look at the underlying code, the models are converted to a factor ordered by their median performance on the metric of interest.
Below I have the relevant code that does the factorization:
bwplot.resamples <- function (x, data = NULL, models = x$models, metric = x$metric, ...)
{
....
avPerf <- ddply(subset(plotData, Metric == metric[1]),
.(Model),
function(x) c(Median = median(x$value, na.rm = TRUE)))
avPerf <- avPerf[order(avPerf$Median),]
......
}
I guess what you need to do is to make the plot manually:
library(caret)
data(BloodBrain)
gbmFit <- train(bbbDescr[,-3], logBBB,"gbm",tuneLength=6,
trControl = trainControl(method = "cv"),verbose=FALSE)
glmnetFit <- train(bbbDescr[,-3], logBBB,"glmnet",tuneLength=6,
trControl = trainControl(method = "cv"))
rfFit <- train(bbbDescr[,-3], logBBB,"rf",tuneLength=6,
trControl = trainControl(method = "cv"))
knnFit <- train(bbbDescr[,-3], logBBB,"knn",tuneLength=6,
trControl = trainControl(method = "cv"))
resamps <- resamples(list(gbm = gbmFit,glmnet=glmnetFit,knn=knnFit,rf=rfFit))
If you plot, you can see they are sorted according to the medians (the solid dot):
bwplot(resamps,metric="MAE")
You can access the values under $values and make a function to plot it, something like below:
plotMet = function(obj, metric, var_order){
  mat = obj$values                                     # resample results: one column per "model~metric"
  mat = mat[, grep(metric, colnames(mat))]             # keep only the columns for the chosen metric
  colnames(mat) = gsub("[~][^ ]*", "", colnames(mat))  # strip the "~metric" suffix, keeping the model name
  boxplot(mat[, var_order], horizontal = TRUE, las = 2, xlab = metric)
}
plotMet(resamps, "MAE", c("rf", "knn", "gbm", "glmnet"))
Also, it is not a very good idea to name your models with bare numbers; try something like model_2000, model_2001, etc., as sketched below.
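For example (a sketch, assuming fit.year is your named list of train objects from the question):
# prefix the purely numeric names before calling resamples()
names(fit.year) <- paste0("model_", names(fit.year))
fit.year.res <- resamples(fit.year)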
I know that glmnet does not regularize the intercept by default, but I would like to do so anyway. I was taking a look at this question and tried what whuber suggested (adding a constant variable and setting the intercept parameter to FALSE), but as a result glmnet does not fit the added constant either.
library(dplyr)
library(glmnet)
X <-
  mtcars %>%
  mutate(intercept = 1) %>%
  select(-c(mpg)) %>%
  as.matrix()
y <-
  mtcars %>%
  select(mpg) %>%
  as.matrix()
model <- glmnet(X, y, intercept = FALSE, alpha = 0, lambda = 0)
coef(model)
I'm running a binomial GLM to predict the probability of a species occurrence, where I am training on one dataset and testing the model on another dataset:
TrainingData<-read.csv("TrainingData.csv")[,-1]
TrainingData[,1]<-as.factor(TrainingData[,1])
TrainingData[,4]<-as.factor(TrainingData[,4])
TestData<-read.csv("TestData.csv")[,-1]
TestData[,1]<-as.factor(TestData[,1])
TestData[,4]<-as.factor(TestData[,4])
mod<-glm(presence~var1+var2+var3, family=binomial, data=TrainingData)
probs=predict(mod, TestData, type="response")
What is the best way (or function) to create response curves to plot the relationship between the probability of presence and each predictor variable?
Thanks!
The marginal probabilities can be calculated from predict.glm with type = "terms", since each of the terms is calculated with the remaining variables set at their mean values. These are converted back to a probability scale with plogis(term + intercept).
Second, because your data set contains a combination of continuous values and factors for your predictor variables, separate plots are made for each type and combined with grid.arrange.
Although this answers your question directly based on the glm model you presented, I would still recommend examining the spatial autocorrelation of both your predictor and response variables, as this could well have an impact on your final model.
library(reshape2)
library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
TrainingData <- read.csv("~/Downloads/TrainingData.csv", header = TRUE)
TrainingData[['presence']] <- as.factor(TrainingData[['presence']])
TrainingData[['var3']] <- as.factor(TrainingData[['var3']])
TrainingData[['X']] <- NULL # Not used in the model
TestData <- read.csv("~/Downloads/TestData.csv", header = TRUE)
TestData[['presence']] <- as.factor(TestData[['presence']])
TestData[['var3']] <- as.factor(TestData[['var3']])
TestData[['X']] <- NULL
Presence/Absence model
mod <- glm(presence ~ var1 + var2 + var3, family = binomial, data = TrainingData)
Get the predicted probabilities for each of the centered variables (i.e., with the remaining variables set to their mean).
mod_terms <- predict(mod, newdata = TestData, type = "terms")
mod_prob <- data.frame(idx = 1:nrow(TestData),
                       plogis(mod_terms + attr(mod_terms, "constant")))
mod_probg <- mod_prob %>% gather(variable, probability, -idx)
Melt the Test data into long format
TestData['idx'] <- 1:nrow(TestData) # Add index to data (X was already dropped above)
data_long <- melt(TestData, id = c("presence", "idx"))
data_long[['value']] <- as.numeric(data_long[['value']])
Merge TestData with the predictions and separate the data into continuous variables (var1 and var2) and factors (var3).
# Merge TestData with predictions
data_df <- merge(data_long, mod_probg, by = c("idx", "variable"))
data_df <- data_df %>% arrange(variable, value)
data_continuous <- data_df %>%
  filter(variable != "var3") %>%
  transform(value = as.numeric(value)) %>%
  arrange(variable, value)
data_factor <- data_df %>%
  filter(variable == "var3") %>%
  transform(value = as.factor(value)) %>%
  arrange(idx)
ggplot output
g_continuous <- ggplot(data_continuous, aes(x = value, y = probability)) +
  geom_point() +
  facet_wrap(~variable, scales = "free_x")
g_factor <- ggplot(data_factor, aes(x = value, y = probability)) +
  geom_boxplot() +
  facet_wrap(~variable)
grid.arrange(g_continuous, g_factor, nrow = 1)