I've learnt to make this type of plot with R and to add regression lines predicted from a model.
## Predict values of the model ##
# Predict at espaje = 1..4 for each COH value. The new-data column must be
# named `espaje` (the name used in the second call and in the plot formula);
# a misspelled column name is not matched to the model's predictor.
p11 <- predict(model.coh1, data.frame(COH = coh1, espaje = 1:4))
p12 <- predict(model.coh1, data.frame(COH = coh2, espaje = 1:4))
p11
1 2 3 4
1.996689 2.419994 2.843298 3.266602
p12
1 2 3 4
1.940247 2.414299 2.888351 3.362403
## PLOT ##
# Scatter of espapli against espaje, then one predicted line per COH value
# drawn over espaje = 1..4.
plot(espapli ~ espaje, data = mydata)
lines(x = 1:4, y = p11, col = "red")
lines(x = 1:4, y = p12, col = "green")
Now, I would like to do something similar using ggplot, is that possible? That is, introducing a regression line for these particular values.
@gennaroTedesco gives an answer using the built-in smoothing method, but I'm not sure that is what the OP asked for. You can draw the predicted values yourself via geom_line:
# example data
set.seed(2125)
x <- rnorm(100)
y <- 1 + 2.5 * x + rnorm(100, sd = 0.5)
lm1 <- lm(y ~ x)
x2 <- rnorm(100)
# Predictions at the new x values, with confidence bounds; the result is a
# matrix with columns fit / lwr / upr.
p1 <- predict(lm1, data.frame(x = x2), interval = "confidence")
library(ggplot2)
df <- data.frame(
  x = x2,
  yhat = p1[, "fit"],
  lw = p1[, "lwr"],
  up = p1[, "upr"]
)
# plot just the fitted points
ggplot(df, aes(x = x, y = yhat)) + geom_line()
# also plot the confidence interval
# `colour` is set outside aes() so the bounds are drawn in literal red.
# Inside aes() the string would be mapped through the colour scale instead,
# which also creates a legend that then has to be hidden.
ggplot(df, aes(x = x, y = yhat)) +
  geom_line() +
  geom_line(aes(y = up), colour = "red") +
  geom_line(aes(y = lw), colour = "red")
# only the last plot is shown
As a general rule, regression lines can be added to a ggplot with the function geom_smooth (see its documentation for the full set of options). If the values to be fitted are the same ones used in the general aesthetic, then
# Base plot: the data and the x/y aesthetics shared by every layer.
p <- ggplot(data, aes(x = x, y = y))
# Add a linear-model fit computed from those same aesthetics.
p <- p + geom_smooth(method = "lm")
does the job. Otherwise you need to fully specify the set of data and the model in the geom_smooth aesthetics.
Related
I have the following data
# x cycles through the same five values three times; y holds the fifteen
# corresponding measurements.
df <- data.frame(
  x = rep(c(0, 1, 10, 100, 1000), times = 3),
  y = c(
    7, 15, 135, 1132, 6459,
    -3, 11, 127, 1120, 6249,
    -5, 13, 126, 1208, 6208
  )
)
After fitting a linear model to the data, I used the model to predict y values from known x values, and stored the predicted y values in a data frame "pred.fits".
# Least-squares fit of y on x using every row of df.
fit <- lm(y ~ x, data = df)
# Prediction grid: 2001 evenly spaced x values from 1 to 2000.
pred.fits <- expand.grid(x = seq(from = 1, to = 2000, length.out = 2001))
# predict() returns a matrix with fit / lwr / upr columns; keep the fit.
pm <- predict(fit, newdata = pred.fits, interval = "confidence")
pred.fits$py <- pm[, "fit"]
I plot the data and use both geom_smooth() and geom_line(), they seem to be quite coincident.
# Raw points, ggplot's own linear fit (thick line), and the line built from
# the predict() values in pred.fits (thin line).
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(formula = y ~ x, method = "lm", size = 1.5, se = FALSE) +
  geom_line(mapping = aes(x = x, y = py), data = pred.fits, size = .2)
However, when I plot the same data with the axes set to log scale, the two regression lines differ drastically.
# Same layers as the linear-scale plot, with both axes switched to log10.
# The geom_line values still come from the model fitted on untransformed data.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(formula = y ~ x, method = "lm", size = 1.5, se = FALSE) +
  geom_line(mapping = aes(x = x, y = py), data = pred.fits, size = .2) +
  scale_x_log10() +
  scale_y_log10()
Am I missing something here?
UPDATE
After @Duck pointed me in the right direction, I was able to get it right. The issue was that I wanted the data to stay untransformed but the axes to be on a log10 scale. This is how I was able to do it.
# Keep only rows with x >= 1 (removes annoying warning msgs).
df2 <- df[df$x >= 1, ]
# Fit the model on the log10 scale of both variables.
fit2 <- lm(log10(y) ~ log10(x), data = df2)
# Prediction grid: 200 evenly spaced x values from 1 to 1000.
pred.fits2 <- expand.grid(x = seq(from = 10^0, to = 10^3, length.out = 200))
pm2 <- predict(fit2, newdata = pred.fits2, interval = "confidence")
# Convert the predicted y values back to the linear scale.
pred.fits2$py <- 10^(pm2[, "fit"])
# Untransformed data drawn on log10 axes, with ggplot's fit and the
# back-transformed prediction line (long dashes) overlaid.
ggplot(data = df2, mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(formula = y ~ x, method = "lm", size = 1.5, se = FALSE) +
  geom_line(
    mapping = aes(x = x, y = py),
    data = pred.fits2,
    linetype = "longdash",
    size = 1.5
  ) +
  scale_x_log10() +
  scale_y_log10()
Thanks everyone for your help.
This code may help your understanding (thanks to @BWilliams for the valuable comment). You want x and y on a log scale, and mixing a model fitted on the linear scale with log-scaled axes can mess everything up. If you want comparable results, it is better to train a separate model on the log-transformed variables and then plot it using the matching values. Here is an approach where we build a log-log model and then plot it (problem rows, i.e. values of one or negative values, are left out of a new data frame df2). Here is the code:
First linear model:
library(ggplot2)
# Data: x cycles through the same five values three times.
df <- data.frame(
  x = rep(c(0, 1, 10, 100, 1000), times = 3),
  y = c(
    7, 15, 135, 1132, 6459,
    -3, 11, 127, 1120, 6249,
    -5, 13, 126, 1208, 6208
  )
)
# Model 1: linear fit on all observations.
fit <- lm(y ~ x, data = df)
# Predict on 2001 evenly spaced x values from 1 to 2000 and keep the fit column.
pred.fits <- expand.grid(x = seq(from = 1, to = 2000, length.out = 2001))
pm <- predict(fit, newdata = pred.fits, interval = "confidence")
pred.fits$py <- pm[, "fit"]
# Plot 1: points, ggplot's linear fit (thick), predict()-based line (thin).
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(formula = y ~ x, method = "lm", size = 1.5, se = FALSE) +
  geom_line(mapping = aes(x = x, y = py), data = pred.fits, size = .2)
Output:
Now the sketch for the log variables; notice how we use log() on the main variables and also how the model is built:
# First remove issue values: keep only rows with x > 1.
df2 <- df[df$x > 1, ]
# Train a new model on the natural-log scale of both variables.
fit2 <- lm(log(y) ~ log(x), data = df2)
# Predict on 2001 evenly spaced x values from 1 to 2000; the predictions
# stay on the log scale.
pred.fits2 <- expand.grid(x = seq(from = 1, to = 2000, length.out = 2001))
pm2 <- predict(fit2, newdata = pred.fits2, interval = "confidence")
pred.fits2$py <- pm2[, "fit"]
# Plot 2: everything is drawn in log coordinates, so the prediction line
# uses log(x) on the x axis and the log-scale predictions on the y axis.
ggplot(data = df2, mapping = aes(x = log(x), y = log(y))) +
  geom_point() +
  geom_smooth(formula = y ~ x, method = "lm", size = 1.5, se = FALSE) +
  geom_line(mapping = aes(x = log(x), y = py), data = pred.fits2, size = .2)
Output:
I have a very simple question, but so far I couldn't find an easy solution for it. Let's say I have some data that I want to fit, and I want to show the x-axis value at which y takes a particular value; in this case, the x value where y = 0. The model for fitting is very simple (y ~ x), but I don't know how to estimate the x value from it. Anyway,
sample data
library(ggplot2)
library(scales)
# Eight random x values (10 raised to a uniform draw on [-6, 1]), sorted in
# decreasing order, paired with eight evenly spaced y values from -4 to 4.
df <- data.frame(
  x = sort(10^runif(8, -6, 1), decreasing = TRUE),
  y = seq(-4, 4, length.out = 8)
)
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  # Alternative kept for reference (straight-line fit instead of the default smoother):
  # geom_smooth(method = "lm", formula = y ~ x, size = 1, linetype = "dashed", col = "black", se = FALSE, fullrange = TRUE) +
  geom_smooth(se = FALSE) +
  labs(title = "Made-up data") +
  # Log10 x axis with breaks labelled as powers of ten, limited to 1e-6..1.
  scale_x_log10(
    breaks = c(1e-6, 1e-4, 1e-2, 1),
    labels = trans_format("log10", math_format(10^.x)),
    limits = c(1e-6, 1)
  ) +
  # Dashed red reference line at y = 0.
  geom_hline(yintercept = 0, linetype = "dashed", colour = "red", size = 0.6)
I would like to convert 1e-10 input to 10^-10 format and annotate it on the plot. As I indicated in the plot.
thanks in advance!
Because geom_smooth() uses R functions to calculate the smooth line, you can obtain the predicted values outside the ggplot() environment. One option is then to use approx() to get a linear approximation of the x-value, given the predicted y-value 0.
# Fit the loess smoother directly so its fitted values can be inspected.
# NOTE(review): the variable is called `formula` but holds the fitted loess
# object, not a formula.
formula <- loess(y ~ x, df)
# Approximate the x at which the fitted curve is 0: interpolate linearly with
# the fitted y values as input and the x values as output.
xval <- approx(x = formula$fitted, y = formula$x, xout = 0)$y
# Add to plot: `ggplot(...)` stands for the existing plot call. The label is
# the interpolated x value, which is the only value computed above.
ggplot(...) + annotate("text", x = xval, y = 0, label = xval)
I'd like to analyse monthly rainfall data (make a time series plot plus a regression equation for the time series). I've written code in R that plots the monthly time series data, and I've tried to fit different regression equations (linear and non-linear) and show these equations on the same graph as the time series plot, but unfortunately I cannot. Maybe that is because I'm a new user of the R / RStudio statistical packages.
Data style
Date monthly rainfall (mm)
jan94 12
Feb94 11
.
.
. Dec14 1x
The code
# plotting of time series rainfall data (option1)
# step1: read files
MR <- read.table("C:\\Users\\Salam\\Desktop\\trend Kufa\\CSV2 Habel\\Monthly rainfall.csv", header = TRUE, sep = ",")
summary(MR)
names(MR)
MR
# step2: plot observed discharge
# Use the second column of the first 252 rows; 252 matches the length of the
# monthly index built below.
MR1 <- MR[1:252, 2]
summary(MR1)
MR1
class(MR1)
# library() stops with an error if zoo is missing, instead of returning FALSE.
library(zoo)
# Monthly index: 252 consecutive months starting at 1994.
x <- yearmon(1994 + seq(0, 251) / 12)
x
y <- MR1
y
pcp <- y ~ x
plot(pcp, type = "l", xlab = "Month", ylab = "Monthly Rainfall(mm)", axes = TRUE)
grid(nx = 250, ny = 250, col = "lightgray", lty = "solid")
lines(pcp, lwd = 2, col = "blue")
box(which = "plot")
title("Monthly Observed rainfall(mm)")
## Regression
# NOTE(review): `z` is not defined anywhere in this script; confirm it is a
# column of MR, or replace it with the time index `x`.
S1 <- lm(y ~ z, data = MR)
abline(S1, col = "red", lwd = 3)
summary(S1)
S2 <- lm(y ~ poly(x, 3), data = MR)
summary(S2)
# abline() only draws a straight line from the first two coefficients, so
# the cubic fit is drawn from its fitted values instead.
# NOTE(review): assumes y has no missing values, so fitted(S2) lines up with x.
lines(fitted(S2) ~ x, col = "green", lwd = 3)
S3 <- nls(y ~ exp(a + b / x), start = list(a = 0, b = 0))
summary(S3)
S4 <- nls(y ~ (a + b * log(x)), start = list(a = 0, b = 0))
summary(S4)
You can use the text function to put the equations on the plots.
# Write the equation string at plot coordinates (x, y) with a reduced font size.
text(x = x, y = y, labels = "S1 <- lm(y ~ z, data=x)", cex = 0.8)
the x and y are the coordinates on the plot where you would like the equation
put the equation in quotes
data is your data frame
cex controls the font size
for more info & options on text use ?text
Check the following example and you can modify yours easily.
library(ggplot2)
# example dataset
dt <- data.frame(
  date = 1:10,
  value = c(10, 11, 15, 13, 16, 17, 18, 19, 16, 22)
)
# plot everything in one graph
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
ggplot(dt, aes(date, value)) +
  geom_point() + # plot the points
  stat_smooth(method = "lm", se = FALSE, level = 0.95, col = "red") + # linear reg
  stat_smooth(method = "lm", formula = y ~ poly(x, 2, raw = TRUE), se = FALSE, level = 0.95, col = "blue") + # quadratic reg
  stat_smooth(method = "lm", formula = y ~ poly(x, 3, raw = TRUE), se = FALSE, level = 0.95, col = "green") # cubic reg
# plot everything separately, each fit with its 95% confidence band
library(gridExtra)
plot1 <- ggplot(dt, aes(date, value)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE, level = 0.95, col = "red")
plot2 <- ggplot(dt, aes(date, value)) +
  geom_point() +
  stat_smooth(method = "lm", formula = y ~ poly(x, 2, raw = TRUE), se = TRUE, level = 0.95, col = "blue")
plot3 <- ggplot(dt, aes(date, value)) +
  geom_point() +
  stat_smooth(method = "lm", formula = y ~ poly(x, 3, raw = TRUE), se = TRUE, level = 0.95, col = "green")
# Arrange the three plots together on one page.
grid.arrange(plot1, plot2, plot3)
I hope you are familiar with the ggplot2 package as it is the most important in this case. You can then investigate ways to add titles, change colours, change confidence intervals, etc.
I created a ggplot with a linear geom_smooth; now I would like the points from geom_point to have a different colour depending on whether they lie below or above the linear smooth line.
I know I can add the color to the point by doing geom_point(aes(x, y, colour = z)). My problem is how to determine if a point in the plot is below or above the linear line.
Can ggplot2 do this, or do I have to create a new column in the data frame first?
Below is the sample code with geom_smooth but without the different colours above and below the line.
Any help is appreciated.
library(ggplot2)
# 100 random normal draws for x and another 100 for y.
df <- data.frame(x = rnorm(100), y = rnorm(100))
# Scatter plot with a linear-model smooth on top.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm")
I believe ggplot2 can't do this for you. As you say, you could create a new variable in df to make the colouring. You can do so, based on the residuals of the linear model.
For example:
library(ggplot2)
set.seed(2015)
df <- data.frame(x = rnorm(100), y = rnorm(100))
# Fit linear regression
l <- lm(y ~ x, data = df)
# Label each point by the sign of its residual: non-negative residuals are
# "above" the fitted line, negative ones "below".
df$group <- unname(ifelse(residuals(l) >= 0, "above", "below"))
# Make the plot; colour is mapped in geom_point() only.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point(mapping = aes(colour = group)) +
  geom_smooth(method = "lm")
Note that the colour argument has to be passed to geom_point(), otherwise geom_smooth() will produce a fit to each group separately.
Result:
I have a bivariate data set:
set.seed(45)
# library() stops with an error if mvtnorm is missing, instead of returning
# FALSE and failing later at rmvnorm().
library(mvtnorm)
# 2 x 2 covariance matrix: variance 3 on the diagonal, covariance 2.
sigma <- matrix(c(3, 2, 2, 3), ncol = 2)
# 100 bivariate normal draws, stored as columns u and v.
df <- as.data.frame(rmvnorm(100, sigma = sigma))
names(df) <- c("u", "v")
Setting up v as the dependent variable, with ggplot I can easily show the "usual" least-squares regression of v on u:
# library() stops with an error if ggplot2 is missing, instead of returning FALSE.
library(ggplot2)
# Scatter of v against u with the least-squares line of v on u.
# ggplot() + geom_point() replaces qplot(), which is deprecated in current ggplot2.
ggplot(df, aes(u, v)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
... but I'd also like to show the least-squares regression of u on v (at the same time).
This is how I naively tried to do it, by passing a different aes to geom_smooth:
# Naive attempt: add a second linear smooth with the x and y aesthetics swapped.
last_plot() +
  geom_smooth(mapping = aes(x = v, y = u), method = "lm", se = FALSE, colour = "red")
Of course, that doesn't quite work. The second geom_smooth shows the inverse of the proper line (I think). I'm expecting it to have a steeper slope than the first line.
Moreover, the confidence intervals are wrongly shaped. I don't particularly care about those, but I do think they might be a clue.
Am I asking for something that can't easily be done with ggplot2?
EDIT: Here is a bit more, showing the lines I expect:
# (1) Least-squares regression of v on u
mod <- lm(v ~ u, data = df)
v_intercept <- mod$coefficients[1]
v_slope <- mod$coefficients[2]
# Dashed blue line drawn directly from the fitted coefficients.
last_plot() +
  geom_abline(intercept = v_intercept, slope = v_slope, linetype = 2, color = "blue")
# (2) Least-squares regression of u on v
mod2 <- lm(u ~ v, data = df)
u_intercept <- mod2$coefficients[1]
u_slope <- mod2$coefficients[2]
# NOTE: the plot is still in the original (u, v) coordinate frame, so the
# fitted line u = a + b * v is rewritten as v = -a / b + (1 / b) * u.
last_plot() +
  geom_abline(
    intercept = -u_intercept / u_slope,
    slope = 1 / u_slope,
    linetype = 2,
    color = "red"
  )
# Both lines are drawn in the same (u, v) frame: x = u, y = v.
ggplot(df, aes(u, v)) +
  # Least-squares regression of v on u.
  geom_smooth(method = "lm") +
  # Least-squares regression of u on v. orientation = "y" makes the smoother
  # treat y as the predictor, so the fit is drawn in the same frame rather
  # than with the axes swapped.
  geom_smooth(method = "lm", orientation = "y", colour = "red")