multiple density plot with slope line added in r - r

I want to creat density plot with multiple groups and add slope line for the means. The plot looks like following:
library(tidyverse)
library(ggridges)
data1 <- data.frame(x1 = c(rep(1,50), rep(2,50), rep(3,50), rep(4,50), rep(5,50)),
y1 = c(rnorm(50,10,1), rnorm(50,15,2), rnorm(50,20,3), rnorm(50,25,3), rnorm(50,30,4)))
data1$x1 <- as.factor(data1$x1)
ggplot(data1, aes(x = y1, y = x1, fill = 0.5 - abs(0.5 - stat(ecdf)))) +
stat_density_ridges(geom = "density_ridges_gradient", calc_ecdf = TRUE) +
scale_fill_viridis_c(name = "Tail probability", direction = -1)

There are two ways to construct the red line. You can either (1) use geom_line through points representing the group means, or (2) fit a regression through the data.
(1) will be truncated to fit the data, (2) can be extended beyond the data, but will only look right if there is an overall linear relationship between your x and y.
Code for (1)
means <- aggregate(y1 ~ x1, data=data1, FUN=mean)
ggplot(data1, aes(x = y1, y = x1, fill = 0.5 - abs(0.5 - stat(ecdf)))) +
stat_density_ridges(geom = "density_ridges_gradient", calc_ecdf = TRUE) +
scale_fill_viridis_c(name = "Tail probability", direction = -1) +
geom_line(aes(x=y1, y=as.numeric(x1), fill=1), data=means, colour="red")
// NB: need to override the fill aesthetic or you get an error
Code for (2)
regressionLine <- coef(lm(as.numeric(x1) ~ y1 , data=data1))
ggplot(data1, aes(x = y1, y = x1, fill = 0.5 - abs(0.5 - stat(ecdf)))) +
stat_density_ridges(geom = "density_ridges_gradient", calc_ecdf = TRUE) +
scale_fill_viridis_c(name = "Tail probability", direction = -1) +
geom_abline(intercept=regressionLine[1], slope=regressionLine[2], colour="red")

Related

Plot logistic regression using parameters in ggplot2

I would like to plot a logistic regression directly from the parameter estimates using ggplot2, but not quite sure how to do it.
For example, if I had 1500 draws of alpha and beta parameter estimates, I could plot each of the lines thus:
alpha_post = rnorm(n=1500,mean=1.1,sd = .15)
beta_post = rnorm(n=1500,mean=1.8,sd = .19)
X_lim = seq(from = -3,to = 2,by=.01)
for (i in 1:length(alpha_post)){
print(i)
y = exp(alpha_post[i] + beta_post[i]*X_lim)/(1+ exp(alpha_post[i] + beta_post[i]*X_lim) )
if (i==1){plot(X_lim,y,type="l")}
else {lines(X_lim,y,add=T)}
}
How would I do this in ggplot2? I know how to use geom_smooth(), but this is a little different.
As always in ggplot, you want to make a data.frame with all data that needs to be plotted:
d <- data.frame(
alpha_post = alpha_post,
beta_post = beta_post,
X_lim = rep(seq(from = -3,to = 2,by=.01), each = length(alpha_post))
)
d$y <- with(d, exp(alpha_post + beta_post * X_lim) / (1 + exp(alpha_post + beta_post * X_lim)))
Then the plotting itself becomes quite easy:
ggplot(d, aes(X_lim, y, group = alpha_post)) + geom_line()
If you want to be more fancy, add a summary line with e.g. the mean:
ggplot(d, aes(X_lim, y)) +
geom_line(aes(group = alpha_post), alpha = 0.3) +
geom_line(size = 1, color = 'firebrick', stat = 'summary', fun.y = 'mean')

Use ggplot to plot partial effects obtained with effects library

I would like to use ggplot to replicate the plots partial effects (with partial residuals), as obtained with the "effect" package. To do this I need to retrieve some information.
This is the plot I want to replicate with ggplot.
library(effects)
mod <- lm(log(prestige) ~ income:type + education, data=Prestige)
eff = effect("education", mod, partial.residuals=T)
plot(eff)
From the eff object I am able to retrieve the partial residuals, as eff$residuals, but they are not sufficient to replicate the plot. I think that what I need is the both the residuals, AND the marginal predicted effect. However I was not able to retrieve them from my eff object.
Otherwise I only have the residuals scores that cannot be plotted against the line of the marginal effect.
Any hint on how to retrieve this information?
You have almost all the information available. This would take some more time to generalize, but here's some code that results in a figure approximately like from the effects package. Notice that the smoother is off, but I didn't bother to dig up why.
The code should be self explanatory. I only copied function closest from the package.
mod <- lm(log(prestige) ~ income:type + education, data=Prestige)
eff = effect("education", mod, partial.residuals=T)
library(ggplot2)
library(gridExtra)
closest <- function(x, x0) apply(outer(x, x0, FUN=function(x, x0) abs(x - x0)), 1, which.min)
x.fit <- unlist(eff$x.all)
trans <- I
x <- data.frame(lower = eff$lower, upper = eff$upper, fit = eff$fit, education = eff$x$education)
xy <- data.frame(x = x.fit, y = x$fit[closest(trans(x.fit), x$education)] + eff$residuals)
g <- ggplot(x, aes(x = education, y = fit)) +
theme_bw() +
geom_line(size = 1) +
geom_point(data = xy, aes(x = x, y = y), shape = 1, col = "blue", size = 2) +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.5) +
geom_smooth(data = xy, aes(x = trans(x), y = y),
method = "loess", span = 2/3, linetype = "dashed", se = FALSE)
grid.arrange(plot(eff), g, ncol = 2)

Scatter plot in R with large overlap and 3000+ points

I am making a scatter plot in R with ggplot2. I am comparing the fraction of votes Hillary and Bernie received in the primary and education level. There is a lot over overlap and way to many points. I tried to use transparency so I could see the overlap but it still looks bad.
Code:
demanalyze <- function(infocode, n = 1){
infoname <- filter(infolookup, column_name == infocode)$description
infocolumn <- as.vector(as.matrix(mydata[infocode]))
ggplot(mydata) +
aes(x = infocolumn) +
ggtitle(infoname) +
xlab(infoname) +
ylab("Fraction of votes each canidate recieved") +
xlab(infoname) +
geom_point(aes(y = sanders_vote_fraction, colour = "Bernie Sanders")) +#, color = alpha("blue",0.02), size=I(1)) +
stat_smooth(aes(y = sanders_vote_fraction), method = "lm", formula = y ~ poly(x, n), size = 1, color = "darkblue", se = F) +
geom_point(aes(y = clinton_vote_fraction, colour = "Hillary Clinton")) +#, color = alpha("red",0.02), size=I(1)) +
stat_smooth(aes(y = clinton_vote_fraction), method = "lm", formula = y ~ poly(x, n), size = 1, color = "darkred", se = F) +
scale_colour_manual("",
values = c("Bernie Sanders" = alpha("blue",0.02), "Hillary Clinton" = alpha("red",0.02))
) +
guides(colour = guide_legend(override.aes = list(alpha = 1)))
}
What could I change to make the overlap areas look less messy?
The standard way to plot a large number of points over 2 dimensions is to use 2D density plots:
With reproducible example:
x1 <- rnorm(1000, mean=10)
x2 <- rnorm(1000, mean=10)
y1 <- rnorm(1000, mean= 5)
y2 <- rnorm(1000, mean = 7)
mydat <- data.frame(xaxis=c(x1, x2), yaxis=c(y1, y2), lab=rep(c("H","B"),each=1000))
head(mydat)
library(ggplot2)
##Dots and density plots (kinda messy, but can play with alpha)
p1 <-ggplot(mydat) + geom_point(aes(x=xaxis, y = yaxis, color=lab),alpha=0.4) +
stat_density2d(aes(x=xaxis, y = yaxis, color=lab))
p1
## just density
p2 <-ggplot(mydat) + stat_density2d(aes(x=xaxis, y = yaxis, color=lab))
p2
There are many parameters to play with, so look here for the full info on the plot type in ggplot2.

graphing confidence intervals nls r

I'm in the process of putting some incidence data together for a proposal. I know that the data takes on a sigmoid shape overall so I fit it using NLS in R. I was trying to get some confidence intervals to plot as well so I used bootstrapping for the parameters, made three lines and here's where I'm having my problem. The bootstrapped CIs give me three sets of values, but because of equation the lines they are crossing.
Picture of Current Plot with "Ideal" Lines in Black
NLS is not my strong suit so perhaps I'm not going about this the right way. I've used mainly a self start function to this point just to get something down on the plot. The second NLS equation will give the same output, but I've put it down now so that I can alter later if needed.
Here is my code thus far:
data <- readRDS(file = "Incidence.RDS")
inc <- nls(y ~ SSlogis(x, beta1, beta2, beta3),
data = data,
control = list(maxiter = 100))
b1 <- summary(inc)$coefficients[1,1]
b2 <- summary(inc)$coefficients[2,1]
b3 <- summary(inc)$coefficients[3,1]
inc2 <- nls(y ~ phi1 / (1 + exp(-(x - phi2) / phi3)),
data = data,
start = list(phi1 = b1, phi2 = b2, phi3 = b3),
control = list(maxiter = 100))
inc2.boot <- nlsBoot(inc2, niter = 1000)
phi1 <- summary(inc2)$coefficients[1,1]
phi2 <- summary(inc2)$coefficients[2,1]
phi3 <- summary(inc2)$coefficients[3,1]
phi1_L <- inc2.boot$bootCI[1,2]
phi2_L <- inc2.boot$bootCI[2,2]
phi3_L <- inc2.boot$bootCI[3,2]
phi1_U <- inc2.boot$bootCI[1,3]
phi2_U <- inc2.boot$bootCI[2,3]
phi3_U <- inc2.boot$bootCI[3,3]
#plot lines
age <- c(20:95)
mean_incidence <- phi1 / (1 + exp(-(age - phi2) / phi3))
lower_incidence <- phi1_L / (1 + exp(-(age - phi2_L) / phi3_L))
upper_incidence <- phi1_U / (1 + exp(-(age - phi2_U) / phi3_U))
inc_line <- data.frame(age, mean_incidence, lower_incidence, upper_incidence)
p <- ggplot()
p <- (p
+ geom_point(data = data, aes(x = x, y = y), color = "darkgreen")
+ geom_line(data = inc_line,
aes(x = age, y = mean_incidence),
color = "blue",
linetype = "solid")
+ geom_line(data = inc_line,
aes(x = age, y = lower_incidence),
color = "blue",
linetype = "dashed")
+ geom_line(data = inc_line,
aes(x = age, y = upper_incidence),
color = "blue",
linetype = "dashed")
+ geom_ribbon(data = inc_line,
aes(x = age, ymin = lower_incidence, ymax = upper_incidence),
fill = "blue", alpha = 0.20)
+ labs(x = "\nAge", y = "Incidence (per 1,000 person years)\n")
)
print(p)
Here's a link to the data.
Any help on what to do next or if this is even possible given my current set up would be appreciated.
Thanks
Try plot.drc in the drc package.
library(drc)
fm <- drm(y ~ x, data = data, fct = LL.3())
plot(fm, type = "bars")
P.S. Please include the library calls in your questions so that the code is self contained and complete. In the case of the question here: library(ggplot2); library(nlstools) .

Computing weighted average using lowess mthod in R

I am trying to use the lowess method from R to compute the weighted average of a data set which is not uniformly distributed along x axis. For example, the first 5 data points are like this, where the first column is the x and the second is the y.
375.0 2040.0
472.0 5538.0
510.0 4488.0
573.0 2668.0
586.0 7664.0
I used the following command in R:
x<-read.table(add,header=FALSE,sep="\t")
y<-lowess(x[,1],x[,2],f=0.01)
write.table(y, file = results , sep = "\t", col.names =FALSE, row.names =FALSE)
The output looks like this:
The green line shows the average computed by the smooth function in matlab (tri-cubic kernel), and the red line is the average line computed by lowess method in R. The blue dots are the data points.
I can't find why the method in R does not work. Do you have any idea?
Here is a link to part of the data.
Thanks a lot for your help.
Th smooth function in matlab is like a filter ,
yy = smooth(y)
yy(1) = y(1)
yy(2) = (y(1) + y(2) + y(3))/3
yy(3) = (y(1) + y(2) + y(3) + y(4) + y(5))/5 ## convolution of size 5
yy(4) = (y(2) + y(3) + y(4) + y(5) + y(6))/5
I think it is better to do a simple smooth here.
Here some attempts using loess, lowesss with f = 0.2(1/5) and using smooth.spline
I am using ggplot2 to plot ( to use geom_jitter with some alpha )
library(ggplot2)
dat <- subset(data, V2 < 5000)
#dat <- data
xy <- lowess(dat$V1,dat$V2,f = 0.8)
xy <- as.data.frame(do.call(cbind,xy))
p1<- ggplot(data = dat, aes(x= V1, y = V2))+
geom_jitter(position = position_jitter(width = .2), alpha= 0.1)+
geom_smooth()
xy <- lowess(dat$V1,dat$V2,f = 0.2)
xy <- as.data.frame(do.call(cbind,xy))
xy.smooth <- smooth.spline(dat$V1,dat$V2)
xy.smooth <- data.frame(x= xy.smooth$x,y = xy.smooth$y)
p2 <- ggplot(data = dat, aes(x= V1, y = V2))+
geom_jitter(position = position_jitter(width = .2), alpha= 0.1)+
geom_line(data = xy, aes(x=x, y = y, group = 1 ), color = 'red')+
geom_line(data = xy.smooth, aes(x=x, y = y, group = 1 ), color = 'blue')
library(gridExtra)
grid.arrange(p1,p2)

Resources