I'm trying to make a boxplot to visualize this regression model
library(lme4)
# Mixed model for dv1 with a fixed effect of intervention and random
# intercepts for id and for area nested within id, fitted by REML.
lmer(dv1 ~ intervention + (1 | id / area),
  data = data,
  REML = TRUE # spelled out: `T` is an ordinary variable and can be reassigned
)
In this experiment, the control and treatment intervention are both applied to a subject within discrete areas.
Here's the data I'm using
# Simulated example data: 54 rows with id cycling 1, 2, area cycling 1, 2, 3,
# the first 27 rows under intervention "a" and the last 27 under "b", and
# three standard-normal outcome columns. id and area are stored as factors.
data <- data.frame(
  id = factor(rep_len(1:2, 54)),
  intervention = rep(c("a", "b"), each = 27),
  area = factor(rep_len(1:3, 54)),
  dv1 = rnorm(54),
  dv2 = rnorm(54),
  dv3 = rnorm(54)
)
Here's what I've tried
library(ggplot2)
# Raw points plus boxplots of dv1 for each area, coloured by intervention,
# with one panel per id.
# Note: the boxplots are dodged by colour, but geom_point() keeps its default
# position, so the points are drawn at the centre of each area instead of
# over their own box.
ggplot(data, aes(x = area, y = dv1, col = intervention)) +
  geom_point() +
  geom_boxplot(alpha = 0.2) +
  facet_wrap(~id) +
  ggtitle("DV1") +
  xlab("Area") + # the x aesthetic is area, not intervention
  ylab("DV1")
Instead of the red points overlaying the red boxplot, they're all over the place. How do I fix this?
Edit: I used the jitter options that u/eipi10 suggested and this is what I have now.
# Boxplots of dv1 per area and intervention with the raw observations
# jittered horizontally and dodged so they line up with their own box.
ggplot(data, aes(x = area, y = dv1, col = intervention)) +
  geom_point(
    position = position_jitterdodge(
      dodge.width = 0.75, jitter.height = 0, jitter.width = 0.25
    ),
    alpha = 0.6
  ) +
  # Every observation is already drawn by geom_point(), so hide the boxplot's
  # own outlier markers; otherwise outliers appear twice (once jittered,
  # once unjittered).
  geom_boxplot(alpha = 0.2, size = 0.3, outlier.shape = NA) +
  facet_wrap(~id) +
  ggtitle("DV1") +
  xlab("Area") +
  ylab("DV1")
Related
In the R statistical package, is there a way to plot a graph of a second order polynomial regression with one continuous variable and one categorical variable?
To generate a linear regression graph with one categorical variable:
library(ggplot2)
library(ggthemes) ## theme_few()
# Simulated warm-up data: 60 observations alternating between categories
# "a" and "b", with an end time that grows non-linearly with minutes.
set.seed(1)
df <- data.frame(minutes = runif(60, 5, 15), endtime = 60, category = "a")
# Alternate a, b, a, b, ... down the rows (one assignment, explicit recycling).
df$category <- rep_len(letters[1:2], nrow(df))
df$endtime <- df$endtime + df$minutes^3 / 180 + df$minutes * runif(60, 1, 2)
# Scatter plot of end time against minutes with a separate straight-line
# fit for each category.
ggplot(data = df, mapping = aes(x = minutes, y = endtime, colour = category)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_few()
To plot a polynomial graph with one continuous variable:
# Scatter plot with a single second-order polynomial fit across all points.
ggplot(data = df, mapping = aes(x = minutes, y = endtime)) +
  geom_point() +
  stat_smooth(method = "lm", formula = y ~ poly(x, 2), size = 1) +
  labs(x = "Minutes of warm up", y = "End time")
But I can’t figure out how to plot a polynomial graph with one continuous variable and one categorical variable.
Just add a colour or group mapping. This will make ggplot fit and display separate polynomial regressions for each category. (1) It's not possible to display an additive mixed-polynomial regression (i.e. lm(y ~ poly(x,2) + category)); (2) what's shown here is not quite equivalent to the results of the interaction model lm(y ~ poly(x,2)*col), because the residual variances (and hence the widths of the confidence ribbons) are estimated separately for each group.
# Mapping colour to category makes stat_smooth() fit and draw one
# second-order polynomial per category.
ggplot(data = df, mapping = aes(x = minutes, y = endtime, colour = category)) +
  geom_point() +
  stat_smooth(method = "lm", formula = y ~ poly(x, 2)) +
  xlab("Minutes of warm up") +
  ylab("End time") +
  theme_few()
For descriptive plots in R studio, I would like to fit a regression curve in my spaghetti plot. To create the spaghetti plot I used:
library(lattice)
# Copy the columns of interest out of data_head into free-standing vectors.
GCIP <- data_head[["GCIP"]]
time_since_on <- data_head[["time_since_on"]]
Patient <- data_head[["Patient"]]
Eye <- data_head[["Eye"]]

# Spaghetti plot: one points-and-lines trace per patient.
xyplot(
  GCIP ~ time_since_on,
  data = data_head,
  groups = Patient,
  type = "b"
)
and I've got this plot
Then I wanted to fit a polynomial curve, so I used this code:
# Scatter plot of GCIP against time with a lowess smoother drawn on top.
# plot() is called for its side effect only: it returns NULL, and storing
# that under the name `plot.new` would shadow graphics::plot.new.
plot(time_since_on, GCIP)
lines(lowess(GCIP ~ time_since_on))
This is what I've got:
What I want is to fit a curve like the one I've got in the image 2 but over the spaghetti plot (with the longitudinal data for each subject).
I've tried to use this code:
library(ggplot2)
library(reshape2)
# Copy the columns of interest out of data_head into free-standing vectors.
GCIP <- data_head$GCIP
time_since_on <- data_head$time_since_on
Patient.ID <- data_head$Patient.ID
Eye <-data_head$Eye
Visit <-data_head$Visit
Patient<-data_head$Patient
# NOTE(review): `reprex`, and the `x`, `y` and `label` variables mapped below,
# are not defined anywhere in the visible code; the data extracted above
# lives in `data_head`. Presumably this was copied from an example and the
# data frame / column names were not adapted -- confirm.
ggplot(data = reprex, aes(x,y)) +
geom_point(alpha=1, size=2) +
aes(colour=Patient.ID) +
geom_text(aes(label=label), size=2, colour='white') +
geom_path(aes(group=Patient.ID))
# Second attempt: points coloured by patient, one path per patient, and a
# linear fit per patient.
# NOTE(review): this still reads from `reprex`, and `colour = group` refers to
# a variable `group` that is not defined in the visible code. Because the
# smoother is grouped by Patient.ID it draws one straight line per patient,
# not a single overall curve.
ggplot(data= reprex, aes(x = time_since_on, y = GCIP)) +
geom_point(size = 2, alpha= 1, aes(color = Patient.ID)) + #colour points by group
geom_path(aes(group = Patient.ID)) + #spaghetti plot
stat_smooth(method = "lm", formula = y ~ x, aes(group = Patient.ID, colour = group)) + #line of best fit by group
ylab("GCIP (volume)") + xlab("time_since_on (months)") +
theme_bw()
But I don't get anything from this.
Could anyone help me please?
Here an example taken from the internet
Million Thanks.
Lili
Working with this data in Rstudio. I need to run a simple regression of ed76 on lwage76 and a saturated regression that turns ed76 into a dummy variable for every level within the column. Then I need to plot both regressions in an XY plot with lwage76 as the Y axis and ed76 as the X axis. This is what I have so far:
# Simple linear regression of log wage on years of education.
# The columns are named in the formula and supplied through `data =` rather
# than as nlsdata$... so the model keeps proper variable names.
regression <- lm(lwage76 ~ ed76, data = nlsdata)
# One fitted value per observation, paired with that observation's education.
predicted <- data.frame(Edu = nlsdata$ed76, Wage = predict(regression))
# Collapse to one row per education level before plotting.
aggplot <- aggregate(Wage ~ Edu, data = predicted, mean)
xyplot(Wage ~ Edu, data = aggplot, grid = TRUE, type = c("p", "l"))
This gives me a very nice XY plot, but now I need to add the predicted values from my saturated model:
# Saturated regression: one coefficient per distinct level of ed76.
# factor(ed76) lets lm() build the level dummies itself. The previous
# ed76 * edu_k terms added an ed76:edu_k interaction for every dummy, which
# (assuming edu_k is the indicator for ed76 == k -- confirm) is collinear
# with edu_k and only produces NA coefficients in a rank-deficient fit.
satreg <- lm(lwage76 ~ factor(ed76), data = nlsdata)
# One fitted value per observation, paired with that observation's education.
satmodel <- data.frame(Edu = nlsdata$ed76, Wage = predict(satreg))
So how do I add the second data set to the graph that I have?
Solution in ggplot:
# Linear-model predictions in the default colour, with the saturated-model
# predictions overlaid in blue by passing a second data set to extra layers.
ggplot(predicted, mapping = aes(x = Edu, y = Wage)) +
  geom_line() +
  geom_point() +
  geom_line(data = satmodel, color = "blue") +
  geom_point(data = satmodel, color = "blue")
Alternatively, you can label each of your tables and combine them into a single data.frame.
# Tag each prediction table with its model, stack them, and let ggplot
# colour by model. Plain column assignment is used so the snippet does not
# depend on dplyr/magrittr, which are never loaded here.
satmodel$type <- "sat_model"
predicted$type <- "predicted"
df <- rbind(satmodel, predicted)
ggplot(df, aes(Edu, Wage, colour = type)) +
  geom_line() +
  geom_point()
As per ex18q1 in "R for Data Science" I am trying to find the best model for the data:
# Simulated data: x takes each of 1..10 three times; y is linear in x
# (slope 1.5, intercept 6) plus noise drawn from a t distribution with 2 df.
sim1a <- tibble(
  x = rep(seq_len(10), each = 3),
  y = 6 + 1.5 * x + rt(n = length(x), df = 2)
)
I've applied linear model and am trying to plot the results on a graph using ggplot:
# Fit a linear model to sim1a and overlay its line on a scatter plot.
# NOTE(review): the formula regresses x on y, whereas the plot below puts x on
# the horizontal axis and y on the vertical axis, so the intercept and slope
# passed to geom_abline() describe x as a function of y, not the plotted
# relationship.
sim1a_mod <- lm(x ~ y, data = sim1a)
ggplot(sim1a, aes(x, y)) +
geom_point(size = 2, colour= "gray") +
# First coefficient is used as the intercept, second as the slope.
geom_abline(intercept = coef(sim1a_mod)[[1]], slope = coef(sim1a_mod)[[2]], colour = "red")
coef(sim1a_mod)[[1]] prints -1.14403
coef(sim1a_mod)[[2]] prints 0.4384473
I create the plot with the data points, but the model is not showing. What am I doing wrong?
The nomenclature for typing formulas for model functions like lm(), glm(), lmer() etc. in R is always DV ~ IV1 + IV2 + ... + IVn where DV is your dependent variable and IVn is your list of independent variables. We typically chart the dependent variable on the y-axis and the independent variable on the x-axis, so in your case you'll need to change your sim1a_mod model to lm(y ~ x, data = sim1a).
In your original code, because you were running a different model, your line was being charted, but it was outside of your view. If you attempt to chart again with your original model with the following code you will then see your regression line:
# Same scatter plot and model line, with both axes widened to [-30, 30] so
# the line falls inside the visible area.
ggplot(data = sim1a, mapping = aes(x = x, y = y)) +
  geom_point(size = 2, colour = "gray") +
  geom_abline(
    intercept = coef(sim1a_mod)[[1]],
    slope = coef(sim1a_mod)[[2]],
    colour = "red"
  ) +
  xlim(-30, 30) +
  ylim(-30, 30)
I am trying to plot the model predictions from a binary choice glm against the empirical probability using data from the Titanic. To show differences across class and sex I am using faceting, but there are two things I can't quite figure out. The first is that I'd like to restrict the loess curve to be between 0 and 1, but if I add the option ylim(c(0,1)) to the end of the plot, the ribbon around the loess curve gets cut off if one side of it is outside the bound. The second thing I'd like to do is draw a line from the minimum x-value (predicted probability from the glm) for each facet, to the maximum x-value (within the same facet) and y = 1 so as to show glm predicted probability.
# info on this data http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt
load(url("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.sav"))

# Keep every column except 3 and 8-14, then discard the original object.
titanic <- titanic3[, -c(3, 8:14)]
rm(titanic3)
titanic <- na.omit(titanic) # probably missing completely at random

# Coerce the modelling columns to plain numeric / integer vectors.
titanic$age <- as.numeric(titanic$age)
titanic$sibsp <- as.integer(titanic$sibsp)
titanic$survived <- as.integer(titanic$survived)

# Random half of the rows for fitting; the rows that were not drawn form the
# validation set (matched through row names).
training.df <- titanic[sample(nrow(titanic), nrow(titanic) / 2), ]
validation.df <- titanic[!row.names(titanic) %in% row.names(training.df), ]

# Probit model for survival, fitted on the training half.
glm.fit <- glm(
  survived ~ sex + sibsp + age + I(age^2) + factor(pclass) + sibsp:sex,
  family = binomial(link = "probit"),
  data = training.df
)

# Predicted survival probabilities for the validation half, paired with the
# observed outcome and the faceting variables.
glm.predict <- predict(
  glm.fit,
  newdata = validation.df,
  se.fit = TRUE,
  type = "response"
)
plot.data <- with(
  validation.df,
  data.frame(
    mean = glm.predict$fit,
    response = survived,
    class = pclass,
    sex = sex
  )
)
# library() stops with an error if ggplot2 is missing; require() would only
# return FALSE and let the plotting code fail later.
library(ggplot2)
# Observed survival (0/1) against predicted probability with a loess
# smoother, one panel per class/sex combination.
ggplot(data = plot.data, aes(x = as.numeric(mean), y = as.integer(response))) +
  geom_point() +
  stat_smooth(method = "loess", formula = y ~ x) +
  # `scales` spelled out in full instead of relying on partial matching.
  facet_wrap(~ class + sex, scales = "free") +
  # Note: ylim() sets the scale limits, which is why the loess ribbon is cut
  # off wherever part of it falls outside [0, 1].
  ylim(c(0, 1)) +
  xlab("Predicted Probability of Survival") +
  ylab("Empirical Survival Rate")
The answer to your first question is to use coord_cartesian(ylim=c(0,1)) instead of ylim(0,1); this is a moderately FAQ.
For your second question, there may be a way to do it within ggplot but it was easier for me to summarize the data externally:
# Base plot: observed survival against predicted probability with a loess
# smoother, one free-scaled panel per class/sex combination.
# coord_cartesian() zooms the y axis to [0, 1] without dropping data, so the
# confidence ribbon is not cut off.
g0 <- ggplot(data = plot.data, aes(x = mean, y = response)) +
  geom_point() +
  stat_smooth(method = "loess", formula = y ~ x) +
  # `scales` spelled out in full instead of relying on partial matching.
  facet_wrap(~ class + sex, scales = "free") +
  coord_cartesian(ylim = c(0, 1)) +
  labs(
    x = "Predicted Probability of Survival",
    y = "Empirical Survival Rate"
  )
(I shortened your code slightly by eliminating some default values and using labs.)
# Smallest and largest predicted probability within each class/sex panel.
# Base aggregate()/merge() are used so the snippet does not depend on plyr,
# which is never loaded here; cbind(name = column) names the result column.
ss <- merge(
  aggregate(cbind(minx = mean) ~ class + sex, data = plot.data, FUN = min),
  aggregate(cbind(maxx = mean) ~ class + sex, data = plot.data, FUN = max)
)
# Per panel, draw the y = x segment from the minimum to the maximum predicted
# probability.
g0 + geom_segment(
  data = ss,
  aes(x = minx, y = minx, xend = maxx, yend = maxx),
  colour = "red", alpha = 0.5
)