when I tried to plot a graph of decision boundary in R, I met some problem and it returned a error "Continuous value supplied to discrete scale". I think the problem happened in the scale_colur_manual but I don't know how to fix it. Below is the code attached.
library(caTools)
set.seed(123)
split = sample.split(df$Purchased,SplitRatio = 0.75)
training_set = subset(df,split==TRUE)
test_set = subset(df,split==FALSE)
# Feature Scaling
training_set[,1:2] = scale(training_set[,1:2])
test_set[,1:2] = scale(test_set[,1:2])
# Fitting logistic regression to the training set
lr = glm(formula = Purchased ~ .,
family = binomial,
data = training_set)
#Predicting the test set results
prob_pred = predict(lr,type = 'response',newdata = test_set[-3])
y_pred = ifelse(prob_pred > 0.5, 1, 0)
#Making the Confusion Matrix
cm = table(test_set[,3],y_pred)
cm
#Visualizing the training set results
library(ggplot2)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
prob_set = predict(lr, type = 'response', newdata = grid_set)
y_grid = ifelse(prob_set > 0.5, 1,0)
ggplot(grid_set) +
geom_tile(aes(x = Age, y = EstimatedSalary, fill = factor(y_grid)),
show.legend = F) +
geom_point(data = set, aes(x = Age, y = EstimatedSalary, color = Purchased),
show.legend = F) +
scale_fill_manual(values = c("orange", "springgreen3")) +
scale_colour_manual(values = c("red3", "green4")) +
scale_x_continuous(breaks = seq(floor(min(X1)), ceiling(max(X2)), by = 1)) +
labs(title = "Logistic Regression (Training set)",
ylab = "Estimated Salary", xlab = "Age")
Is your Purchased variable a factor? If not, it has to be. Try this:
grid_set %>%
mutate(Purchased=factor(Purchased)) %>%
ggplot() +
geom_tile(aes(x = Age, y = EstimatedSalary, fill = factor(y_grid)),
show.legend = F) + ... # add the rest of your commands.
Related
I am trying to fit association-dissociation SPR kinetics data for a protein and small molecule for two concentrations using ggplot2. The data is here.
The time variable indicates the time in seconds, the sample variable indicates the two concentrations (32nM and 8nM), and the values variable is the readout.
I have imported the data and running the following code to plot:
# LINE PLOT
ggplot(data) +
geom_point(aes(x = time, y = values), size = 1, color = "black") +
geom_smooth(aes(x = time, y = values, color = sample), method = "loess", se = F) +
scale_x_continuous(expand = c(0, 0), limits = c(0, NA)) +
#scale_y_continuous(expand = c(0, 0), limits = c(0, 60)) +
scale_color_npg(breaks = c("2nM", "4nM", "8nM", "16nM", "32nM")) +
theme_linedraw() +
labs(x = "Time (seconds)",
y = "Response Units") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
Here is the plot:
As you can see that the fit did not work using method = "loess". I need something like this(there are 5 concentrations here):
The fitting requires 1:1 Langmuir model but I am not sure how I can do that in ggplot. Can someone please help me?
Here is the equation:
This is from the pbm package that fits this kind of plots.
Your data are smooth enough that you need only use geom_line, not geom_smooth:
df %>%
ggplot(aes(time, values, color = sample)) +
geom_line(size = 2, na.rm = TRUE) +
geom_point(color = 'black', size = 1) +
theme_linedraw(base_size = 16) +
xlim(c(0, 400))
Edit
It is possible to fit the results to the data using non-linear least squares, employing the binding1to1 function from pbm, but it requires a bit of method tweaking to get the model to fit. It would probably be better to create a model then plot the predictions rather than using geom_smooth. However, if you really wanted to, you could do:
df %>%
ggplot(aes(time, values, color = sample)) +
geom_smooth(method = nls, se = FALSE, n = 1000,
formula = y ~ binding1to1(x, 123, 32e-9, kon, koff, rmax),
method.args = list(
start = list(kon = 2000, koff = 0.02, rmax = 2e4),
control = nls.control(minFactor = 1e-6, maxiter = 1000)
),
data = df[df$time > 0 & df$sample == "32nM",]) +
geom_smooth(method = nls, se = FALSE, n = 1000,
formula = y ~ binding1to1(x, 123, 8e-9, kon, koff, rmax),
method.args = list(
start = list(kon = 3000, koff = 0.02, rmax = 2e4),
control = nls.control(minFactor = 1e-9, maxiter = 10000)
),
data = df[df$time > 0 & df$sample == "8nM",]) +
geom_point(color = 'black', size = 1) +
theme_linedraw(base_size = 16) +
xlim(c(0, 400))
If you want to actually fit a model from which to extract the parameters and plot, you can do:
library(tidyverse)
library(pbm)
df <- read.csv("SPR.csv") %>%
filter(time >= 0) %>%
mutate(sample = as.numeric(gsub("\\D+", "", sample)) * 1e-9,
values = values * 1e-3) %>%
group_by(sample) %>%
mutate(tmax = time[which.max(values)])
fit_fun <- function(time, tmax, sample, kon, koff, rmax) {
unlist(Map(function(time, tmax, sample) {
binding1to1(time, tmax, sample, kon, koff, rmax)
}, time, tmax, sample))
}
mod <- nls(values ~ fit_fun(time, tmax, sample, kon, koff, rmax),
data = df,
start = list(kon = 3000, koff = 0.02, rmax = 2),
control = nls.control(minFactor = 1e-9, maxiter = 10000))
This gives us a model with the best fitting values for the various parameters:
mod
#> Nonlinear regression model
#> model: values ~ fit_fun(time, tmax, sample, kon, koff, rmax)
#> data: df
#> kon koff rmax
#> 8.925e+05 2.521e-03 5.445e-02
#> residual sum-of-squares: 5.219e-05
#>
#> Number of iterations to convergence: 536
#> Achieved convergence tolerance: 5.155e-07
We can then predict the output of the model over the range of our input variables:
pred_df <- expand.grid(time = 0:400, sample = c(8, 32) * 1e-9,
tmax = df$tmax[1])
pred_df$values <- predict(mod, pred_df)
And we can plot it like this:
df %>%
ggplot(aes(time, values, color = factor(sample))) +
geom_line(data = pred_df, size = 1) +
geom_point(color = 'black', size = 1) +
theme_linedraw(base_size = 16) +
xlim(c(0, 400))
I am willing to interpolate two curves, according to the function Curing ~ a * atan(b * Time), fitting the data reported in the code below. I am getting two problems with this:
library(tidyverse)
library(investr)
library(ggplot2)
#DATAFRAME
RawData <- data.frame("Time" = c(0, 4, 8, 24, 28, 32, 0, 4, 8, 24, 28, 32), "Curing" = c(0, 28.57, 56.19, 86.67, 89.52, 91.42, 0, 85.71, 93.33, 94.28, 97.62, 98.09), "Grade" = c("Product A", "Product A", "Product A", "Product A", "Product A", "Product A", "Product B", "Product B", "Product B", "Product B", "Product B", "Product B"))
attach(RawData)
model <- nls(Curing ~ a * atan(b * Time), data= RawData, control=nls.control(printEval=TRUE, minFactor=2^-24, warnOnly=TRUE))
new.data <- data.frame(time=seq(1, 32, by = 0.1))
interval <- as_tibble(predFit(model, newdata = new.data, interval = "confidence", level= 0.9)) %>% mutate(Time = RawData$Time)
The first is an error as soon as I input the last line:
Error in assign(xname, newdata[, xname]) : first argument not valid
I have tried to change the values of new.data without success. If I remove the optional argument newdata = I can fit, but it looks like the fitting is made interpolating the whole set of data without differentiating the two series.
Below the command lines for getting the graph:
Graph <- ggplot(data=RawData, aes(x=`Time`, y=`Curing`, col=Grade)) + geom_point(aes(color = Grade), shape = 1, size = 2.5)
Graph + geom_line(data=interval, aes(x = Time, y = fit))+
geom_ribbon(data=interval, aes(x=Time, ymin=lwr, ymax=upr), alpha=0.5, inherit.aes=F, fill="blue")+
theme_classic()
Is it possible to have both: a smooth and series-separated fitting?
Your error is caused by a typo (time instead of Time in new.data). However, this will not fix the problem of getting one ribbon for each series.
To do this as a one-off, you will need two separate models for the two different sets of data. It is best to use the split-apply-bind idiom to create a single prediction data frame. It also helps plotting if this has a Grade column and the fit column is renamed to Curing
library(tidyverse)
library(investr)
library(ggplot2)
pred_df <- do.call(rbind, lapply(split(RawData, RawData$Grade), function(d) {
new.data <- data.frame(Time = seq(0, 32, by = 0.1))
nls(Curing ~ a * atan(b * Time), data = d, start = list(a = 5, b = 1)) %>%
predFit(newdata = new.data, interval = "confidence", level = 0.9) %>%
as_tibble() %>%
mutate(Time = new.data$Time,
Grade = d$Grade[1],
Curing = fit)
}))
This then allows the plot to be quite straightforward:
ggplot(data = RawData, aes(x = Time, y = Curing, color = Grade)) +
geom_point(shape = 1, size = 2.5) +
geom_ribbon(data = pred_df, aes(ymin = lwr, ymax = upr, fill = Grade),
alpha = 0.3, color = NA) +
geom_line(data = pred_df) +
theme_classic(base_size = 16)
General approach
I think this is quite a useful technique, and might be of broader interest, so a more general solution if one wishes to plot confidence bands with an nls model using geom_smooth would be to create little wrappers around nls and predFit:
nls_se <- function(formula, data, start, ...) {
mod <- nls(formula, data, start)
class(mod) <- "nls_se"
mod
}
predict.nls_se <- function(model, newdata, level = 0.9, ...) {
class(model) <- "nls"
p <- investr::predFit(model, newdata = newdata,
interval = "confidence", level = level)
list(fit = p, se.fit = p[,3] - p[,1])
}
This allows very simple plotting with ggplot:
ggplot(data = RawData, aes(x = Time, y = Curing, color = Grade)) +
geom_point(size = 2.5) +
geom_smooth(method = nls_se, formula = y ~ a * atan(b * x),
method.args = list(start = list(a = 5, b = 1))) +
theme_minimal(base_size = 16)
To put both prediction and confidence bands, we can do:
nls_se <- function(formula, data, start, type = "confidence", ...) {
mod <- nls(formula, data, start)
class(mod) <- "nls_se"
attr(mod, "type") <- type
mod
}
predict.nls_se <- function(model, newdata, level = 0.9, interval, ...) {
class(model) <- "nls"
p <- investr::predFit(model, newdata = newdata,
interval = attr(model, "type"), level = level)
list(fit = p, se.fit = p[,3] - p[,1])
}
ggplot(data = RawData, aes(x = Time, y = Curing, color = Grade)) +
geom_point(size = 2.5) +
geom_smooth(method = nls_se, formula = y ~ a * atan(b * x),
method.args = list(start = list(a = 5, b = 1),
type = "prediction"), alpha = 0.2,
aes(fill = after_scale(color))) +
geom_smooth(method = nls_se, formula = y ~ a * atan(b * x),
method.args = list(start = list(a = 5, b = 1)),
aes(fill = after_scale(color))) +
theme_minimal(base_size = 16)
This is a follow up question to Combine ggflags with linear regression in ggplot2
I have a plot like below with a log-linear model for x and y for certain countries that I have made in R with ggplot2 and ggflags:
The problem is when I want to print out the regression equation, the R2 and the p-value with the help of stat_regline_equation and stat_cor, I get values for a linear model and not the log-linear model I want to use.
How can I solve this?
library(ggplot2)
library(ggflags)
library(ggpubr)
library(SciViews)
set.seed(123)
Data <- data.frame(
country = c("at", "be", "dk", "fr", "it"),
x = runif(5),
y = runif(5)
)
ggplot(Data, aes(x = x, y = y, country = country, size = 11)) +
geom_flag() +
scale_country() +
scale_size(range = c(10, 10)) +
geom_smooth(aes(group = 1), method = "lm", , formula = y ~ log(x), se = FALSE, size = 1) +
stat_regline_equation(label.y = 0.695,
aes(group = 1, label = ..eq.label..), size = 5.5) +
stat_cor(aes(group = 1,
label =paste(..rr.label.., ..p.label.., sep = "~`,`~")),
label.y = 0.685, size = 5.5, digits= 1)
edit: I have also tried to use ln(x) instead of log(x) but I do not get any results when printing out the coefficient from that either.
There are four things you need to do:
Provide your regression formula to the formula argument of stat_regline_equation
Use sub to change "x" to "log(x)" in eq.label
Change the x aesthetic of stat_cor to log(x)
Fix the x limits inside coord_cartesian to compensate
ggplot(Data, aes(x = x, y = y, country = country, size = 11)) +
geom_flag() +
scale_country() +
scale_size(range = c(10, 10)) +
geom_smooth(aes(group = 1), method = "lm", , formula = y ~ log(x),
se = FALSE, size = 1) +
stat_regline_equation(label.y = 0.695, label.x = 0.25,
aes(group = 1, label = sub("x", "log(x)", ..eq.label..)),
size = 5.5,
formula = y ~ log(x),
check_overlap = TRUE, output.type = "latex") +
stat_cor(aes(group = 1, x = log(x),
label =paste(..rr.label.., ..p.label.., sep = "~`,`~")),
label.x = 0.25,
label.y = 0.65, size = 5.5, digits= 1, check_overlap = TRUE) +
coord_cartesian(xlim = c(0.2, 1))
I want to generate a number of plots of linear regressions (bacterial OTUs plotted against temperature) using ggplot. I want the titles of the plots to be the linear regression equation, which I am determining with a function. The code works when I make the plots individually but not when I use a for loop.
I keep getting the following error:
Error in model.frame.default(formula = taxa_list[i] ~ Temperature, data = dataframe, :
variable lengths differ (found for 'Temperature')
See below for my code. Do I need a nested for loop to make this work?
taxa_list <- c("Vibrio","Salmonella","Campylobacter","Listeria","Streptococcus","Legionella")
taxa_list <- sort(taxa_list)
for (i in seq_along(taxa_list)) {
lm_eqn <- function(dataframe) {
m <- lm(taxa_list[i] ~ Temperature, dataframe)
p <- summary(m)
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2 %.% italic(x)*","~~italic(p)~"="~p0,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3),
p0 = format(p$coefficients[8], digits = 3)))
as.expression(eq);
}
plot <- ggplot(data = all_data, aes(x = Temperature, y = taxa_list[i], fill = taxa_list[i])) +
geom_point(data = all_data, aes(x = Temperature, y = taxa_list[i]), color = "black", size = 3) +
geom_smooth(method = "lm", size = 1, color = "black", fill = "gray") +
labs(title = lm_eqn(dataframe = all_data), subtitle = "") + xlab("Temperature") + ylab("Number of OTUs")
print(plot)
}
I tried to rewrite your code to make it more readable, efficient and maintainable. I used tidyverse choices. I believe there was an extra * x in your original eq function that I removed.
library(dplyr)
library(ggplot2)
library(purrr)
library(broom)
taxa_list <- c("Vibrio","Salmonella","Campylobacter","Listeria","Streptococcus","Legionella")
taxa_list <- sort(taxa_list)
MyFunctionNew <- function(data, bacteria, temperature)
{
my_lm <- lm(as.formula(paste(bacteria, "~", temperature)), data = data)
terms_info <- broom::tidy(my_lm)
model_info <- broom::glance(my_lm)
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2 *","~~italic(p)~"="~p0,
list(a = format(terms_info$estimate[1], digits = 2),
b = format(terms_info$estimate[2], digits = 2),
r2 = format(model_info$r.squared, digits = 3),
p0 = format(model_info$p.value, digits = 3)))
plot <- ggplot(data = data, aes_string(x = temperature, y = bacteria, fill = bacteria)) +
geom_point(size = 3, show.legend = TRUE) +
geom_smooth(method = "lm", size = 1, color = "black", fill = "gray") +
labs(title = eq, subtitle = "") + xlab("Temperature") + ylab("Number of OTUs")
return(plot)
}
MyFunctionNew(dat1, "Vibrio", "Temperature")
#> `geom_smooth()` using formula 'y ~ x'
purrr::map(taxa_list, ~ MyFunctionNew(dat1, .x, "Temperature"))
#> [[1]]
#> `geom_smooth()` using formula 'y ~ x'
Here's some made up data that should more or less be close enough
set.seed(1111)
dat1 <- data.frame(Temperature = runif(200, min = 32, max = 100),
Vibrio = rnorm(200),
Salmonella = rnorm(200),
Campylobacter = rnorm(200),
Listeria = rnorm(200),
Streptococcus = rnorm(200),
Legionella = rnorm(200)
)
I know that this is possible with other types of models (e.g., this), but I haven't come across this for a model fit with "brms".
Has anyone had luck plotting the coefficients of multiple models, with different DVs, that were fit through brms?
Edit:
This is as far as I've gotten trying to use the method from that link for brms.
m1h<-fixef(mcmcm1_h1)
m1e<-fixef(mcmcm1_e1) #these extract fixed effect info from a model fit with brms; below is an example of their output#
m1h
Estimate Est.Error 2.5%ile 97.5%ile
Intercept 0.2615716 0.1482702 -0.01995366 0.5593722
m1hframe <- data.frame(Variable = colnames(m1h),Coefficient = m1h[, 1],SE =
m1h[, 2],min = m1h[, 3],max = m1h[, 4],modelName = "HH")
m1eframe <- data.frame(Variable = colnames(m1e),Coefficient = m1e[, 1],SE =
m1e[, 2],min = m1e[, 3],max = m1e[, 4],modelName = "Em")
allModelFrame <- data.frame(rbind(m1hframe, m1eframe))
library(ggplot2)
zp1 <- ggplot(allModelFrame, aes(colour = modelName))
zp1 <- zp1 + geom_hline(yintercept = 0, colour = gray(1/2), lty = 2)
zp1 <- zp1 + geom_linerange(aes(x = Variable, ymin = min,ymax = max),lwd = 1,
position = position_dodge(width = 1/2))
zp1 <- zp1 + geom_pointrange(aes(x = Variable, y = Coefficient, ymin = min,
ymax = max,lwd = 1/2, position = position_dodge(width = 1/2),shape = 21, fill
= "WHITE"))
zp1 <- zp1 + coord_flip() + theme_bw()
zp1 <- zp1 + ggtitle("two models")
print(zp1)
This is the error I am getting:
Don't know how to automatically pick scale for object of type PositionDodge/Position/ggproto. Defaulting to continuous.
Error: A continuous variable can not be mapped to shape
A slightly simpler solution using broom:
multiplot <- function(x) {
x %>% purrr::map(function(.) {
broom::tidy(., conf.int = TRUE, par_type = "non-varying") }) %>%
dplyr::bind_rows(.id = "model") %>%
ggplot(aes(term, estimate, ymin = lower, ymax = upper, color = model)) +
geom_pointrange(position = position_dodge(width = 0.3)) + coord_flip()
}
Use like this multiplot(list(m1, m2, m3)).