smoothed grouped proportion plot - r

I have the following data set:
set.seed(10)
start_date <- as.Date('2000-01-01')
end_date <- as.Date('2000-01-10')
Data <- data.frame(
id = rep((1:1000),10),
group = rep(c("A","B"), 25),
x = sample(1:100),
y = sample(c("1", "0"), 10, replace = TRUE),
date = as.Date(
sample(as.numeric(start_date):
as.numeric(end_date), 1000,
replace = T), origin = '2000-01-01'))
With that, I create the following plot:
Data %>% mutate(treated = factor(group)) %>%
mutate(date = as.POSIXct(date)) %>% #convert date to date
group_by(treated, date) %>% #group
summarise(prop = sum(y=="1")/n()) %>% #calculate proportion
ggplot()+ theme_classic() +
geom_line(aes(x = date, y = prop, color = treated)) +
geom_point(aes(x = date, y = prop, color = treated)) +
geom_vline(xintercept = as.POSIXct("2000-01-05 12:00 GMT"), color = 'black', lwd = 1)
Unfortunately the plot is pretty 'jumpy' and I would like to smooth it. I tried geom_smooth() but can't get it to work. Other questions regarding smoothing didn't help me because they missed the grouping aspect and therefore had a different structure. However, the example data set is in reality part of a larger data set so I need to stick to that code.
[Edit: the geom_smooth() code I tried is geom_smooth(method = 'auto', formula = y ~ x)]
Can someone point me into the right direction?
Many thanks and all the best.

Is this what you want by a smoothed line? You call geom_smooth with aesthetics, not in combination with geom_line. You can choose different smoothing methods, though the default loess with low observations is usually what people want. As an aside, I don't think this is necessarily nicer to look at than the geom_line version, and in fact is slightly less readable. geom_smooth is best used when there are many y observations for every x which makes patterns hard to see, geom_line is good for 1-1.
EDIT: After looking at what you're doing more closely, I added a second plot that doesn't directly calculate the treatment-date means and just uses geom_smooth directly. That lets you get a more reasonable confidence interval instead of having to remove it as before.
set.seed(10)
start_date <- as.Date('2000-01-01')
end_date <- as.Date('2000-01-10')
Data <- data.frame(
id = rep((1:1000),10),
group = rep(c("A","B"), 25),
x = sample(1:100),
y = sample(c("1", "0"), 10, replace = TRUE),
date = as.Date(
sample(as.numeric(start_date):
as.numeric(end_date), 1000,
replace = T), origin = '2000-01-01'))
library(tidyverse)
Data %>%
mutate(treated = factor(group)) %>%
mutate(date = as.POSIXct(date)) %>% #convert date to date
group_by(treated, date) %>% #group
summarise(prop = sum(y=="1")/n()) %>% #calculate proportion
ggplot() +
theme_classic() +
geom_smooth(aes(x = date, y = prop, color = treated), se = F) +
geom_point(aes(x = date, y = prop, color = treated)) +
geom_vline(xintercept = as.POSIXct("2000-01-05 12:00 GMT"), color = 'black', lwd = 1)
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Data %>%
mutate(treated = factor(group)) %>%
mutate(y = ifelse(y == "0", 0, 1)) %>%
mutate(date = as.POSIXct(date)) %>% #convert date to date
ggplot() +
theme_classic() +
geom_smooth(aes(x = date, y = y, color = treated), method = "loess") +
geom_vline(xintercept = as.POSIXct("2000-01-05 12:00 GMT"), color = 'black', lwd = 1)
Created on 2018-03-27 by the reprex package (v0.2.0).

Related

How to vary line types when using the autoplot() function to help the colour-blind among us?

Running the code below plots actual data (black line) against 4-month forecasts for that data. However, the forecast lines are indistinguishable to me since I don't see colours. How can the lines be distinguished from each other (with the actual (non-forecast) data line, the black line, made thicker than the others), either via dashed lines or the use of markers? In XLS I used dashed lines/markers to distinguish.
I have fooled around with ggplot(...scale_linetype_manual(values = c("TRUE" = "solid", "FALSE" = "dotted"))...) with no luck.
Code:
library(feasts)
library(fable)
library(ggplot2)
library(tsibble)
tmp <- data.frame(
Month = c(1,2,3,4,5,6,7,8,9,10),
StateX = c(1527,1297,933,832,701,488,424,353,302,280)
) %>%
as_tsibble(index = Month)
tmpNext4 <- data.frame(
Month = c(11,12,13,14),
StateX = c(211,182,153,125)
) %>%
as_tsibble(index = Month)
# Fit the models to tmp dataframe:
fit <- tmp %>%
model(
Mean = MEAN(StateX),
`Naïve` = NAIVE(StateX),
Drift = NAIVE(StateX ~ drift())
)
# Produce forecasts for the next 4 months:
fcTmp <- fit %>%
forecast(new_data = tmpNext4)
# Plot the forecasts:
fcTmp %>%
autoplot(tmp, level = NULL) +
autolayer(tmpNext4, StateX, colour = "black") +
labs(y = "Units",
title = "Units reaching dead state X",
subtitle = "(Months 11 - 15)") +
guides(colour = guide_legend(title = "Forecast"))
fcTmp %>%
ggplot(aes(Month, .mean)) +
geom_line(aes(linetype = .model, color = .model)) +
geom_line(aes(y = StateX, linetype = "Next4", color = "Next4"), data = tmpNext4) +
geom_line(aes(y = StateX), data = tmp)

Adding a power curve to scatterplot

I want to add a power curve with confidence intervals to my diamter-weight relationship, which clearly follows a y=a*x^b regression. So far, I used the geom_smooth "loess" version, but this is not yet quite right and perfect. Any suggestion how to add a power regression line would be much appreciated. Below is the used code:
p2<-ggplot(Data,aes(x=Diameter,y=Wet_weight,colour=Site))+
geom_point(size=3.5,alpha=0.3)+
geom_smooth(aes(group=Species),method=loess,colour="black")+
labs(x="\nUmbrella diamter (mm)",y="Wet weight (mg)\n")+theme_classic()+
scale_colour_manual(values=c("black","dark blue","blue","dark green","green"))+
theme(axis.title.x=element_text(size=20),
axis.text.x=element_text(size=18,colour="black"),
axis.title.y=element_text(size=20),
axis.text.y=element_text(size=18,colour="black"),
axis.ticks=element_line(colour="black",size=1),
axis.line=element_line(colour="black",size=1,linetype="solid"),
legend.position=c(0.18,0.75),
legend.text=element_text(colour="black",size=17),
legend.title=element_text(colour="black",size=18))
p2
Thank you!
I used this to get many equations, R2, and plots.
df= #change your data frame so it fits the current code
variables=c("group","year") #if you have multiple groups/seasons/years/elements add them here
df$y= #which variable will be your y
df$x= #which variable will be your x
#No changes get the equations
text=df %>%
group_by(across(all_of(variables))) %>% #your grouping variables
do(broom::tidy(lm(log(y) ~ log(x), data = .))) %>%
ungroup() %>%
mutate(y = round(ifelse(term=='(Intercept)',exp(estimate),estimate),digits = 2)) %>% #your equation values rounded to 2
select(-estimate,-std.error,-statistic ,-p.value) %>%
pivot_wider(names_from = term,values_from = y) %>%
rename(.,a=`(Intercept)`,b=`log(x)`)
#CHANGE before running!! add your grouping variables
rsq=df %>%
split(list(.$group,.$year)) %>% #---- HERE add the names after $
map(~lm(log(y) ~ log(x), data = .)) %>%
map(summary) %>%
map_dbl("r.squared") %>%
data.frame()
#Join the R2 and y results for the plot in a single data frame and write the equations
labels.df=mutate(rsq,groups=row.names(rsq)) %>%
separate(col = groups,into = c(variables),sep = "[.]",
convert = TRUE, remove = T, fill = "right") %>%
rename("R"='.') %>%
left_join(text,.) %>%
mutate(R=round(R,digits = 4), #round your R2 digits
eq= paste('y==',a,"~x^(",b,")", sep = ""),
rsql=paste("R^2==",R),
full= paste('y==',a,"~x^(",b,")","~~R^2==",R, sep = ""))
# plot
ggplot(df,aes(x = x,y = y)) +
geom_point(size=4,mapping = aes(
colour=factor(ifelse(is.na(get(variables[2])),"",(get(variables[2])))), #points colour
shape=get(variables[1]))) + # different shapes
facet_wrap(get(variables[1])~ifelse(is.na(get(variables[2])),"",get(variables[2])),
scales = "free",labeller = labeller(.multi_line = F))+ #for multiple groups; join text in one line
stat_smooth(mapping=aes(colour=get(variables[1])), #colours for our trend
method = 'nls', formula = 'y~a*x^b',
method.args = list(start=c(a=1,b=1)),se=FALSE) +
geom_text(labels.df,x = Inf, y = Inf,size=5, mapping = aes(label = (eq)), parse = T,vjust=1, hjust=1)+
geom_text(labels.df,x = Inf,y = Inf,size=5, mapping = aes(label = (rsql)), parse = T,vjust=2.5, hjust=1)+
#scale_y_log10() + #add this to avoid problems with big y values
labs(x="Your x label",y="your y label")+
theme_bw(base_size = 16) +
theme(legend.position = "none",
strip.background = element_rect(fill="#b2d6e2"))

In ggplot how do I plot the mean line for two groups in a scatterplot

I would like to show the mean of two groups in a scatterplot. I have sorted the data so the groups are next to each other. Group 1 is the first 11 records and group2 is the next 133. How can I tell ggplot to draw one line across the range for the first group (House 1-11) and a second line for the second (House 12-133).
Here is what I have so far:
And the code is here:
library(tidyverse)
library(tidymodels)
data(ames)
ames <- AmesHousing::make_ames()
set.seed(1)
split <- initial_split(ames, prop = 0.95, strata = "Sale_Price")
ames_plot <- testing(split)
model1 <- lm(Sale_Price ~ Central_Air, data = ames_plot)
p1 <- model1 %>%
broom::augment() %>%
arrange(Central_Air) %>%
mutate(House = row_number()) %>%
ggplot(aes(House, Sale_Price, color = Central_Air)) +
geom_point(size = 1, alpha = 0.3) +
geom_segment(aes(x = 1, y = .fitted, xend = 144, yend =.fitted)) +
scale_y_continuous(labels = scales::dollar)
p1
Using geom_smooth(formula = 'y ~ x', se = FALSE, method = "lm") instead of geom_segment() gets me close to what I want but I want to show the actual predicted values coming form the lm().
It would be best just to summarize your data for that layer. For example
model1 %>%
broom::augment() %>%
arrange(Central_Air) %>%
mutate(House = row_number()) %>%
ggplot(aes(House, Sale_Price, color = Central_Air)) +
geom_point(size = 1, alpha=.3) +
geom_segment(aes(x = first, y = .fitted, xend = last, yend =.fitted),
data = function(x) {
x %>%
group_by(Central_Air) %>%
summarize(first=first(House), last=last(House), .fitted=mean(.fitted), .groups="drop_last")
}) +
scale_y_continuous(labels = scales::dollar)

How to create two lines and scatter plots using ggplot

I have the following data in R:
id <- factor(seq(1:72))
initial.e <- rnorm(n=72, mean = 21.51, sd = 6.58)
initial.f <- rnorm(n = 72, mean = 20.75, sd = 3.378)
final.e <- rnorm(n = 72, mean = 19.81, sd = 7.48)
final.f <- rnorm(n = 72, mean = 19.77, sd = 5.389)
data <- data.frame(id,initial.e, initial.f, final.e, final.f)
I need to create a scatter plot with two straight trendlines for e and f, but I'm lost on how to create that. I found this article: https://sakaluk.wordpress.com/2015/08/27/6-make-it-pretty-plotting-2-way-interactions-with-ggplot2/ which I tried following, but didn't work the way I wanted.
I also tried using melt from reshape2 package, but I can't get the plots to show the way I want to - with two trendlines for e and f in the scatter plot.
datamelt <- melt(data, id = 'id')
datamelt <- datamelt %>% mutate(names = ifelse(datamelt$variable %in% c('initial.e', 'initial.f'), 'Before', 'After'))
datamelt <- datamelt %>% mutate(types = ifelse(datamelt$variable %in% c('final.e', 'final.f'), 'e', 'f'))
After this things went downhill. All the codes I tried either have some basic scatter plot with geom_smooth() or just some generic error.
EDIT
The plot should contain scatterplot containing relationship between intial.e and initial.f with a trend line, and another relationship between final.e and final.f with a trend line in the same plot.
I think what you're looking for is something like this: I haven't tested the code, but it should give you an idea
ggplot(data) +
geom_point(aes(x=initial.e, y=initial.f)) +
geom_smooth(method = "lm", se = FALSE, aes(initial.e, final.e)) +
geom_point(aes(x=final.e, y = final.f)) +
geom_smooth(method = "lm", se = FALSE, aes(final.e, final.f))
How about something like this?
data %>%
gather(k, value, -id) %>%
mutate(
state = gsub("(\\.e$|\\.f$)", "", k),
what = gsub("(initial\\.|final\\.)", "", k)) %>%
ggplot(aes(id, value, colour = what)) +
geom_line() +
facet_wrap(~ state)
Or with points
data %>%
gather(k, value, -id) %>%
mutate(
state = gsub("(\\.e$|\\.f$)", "", k),
what = gsub("(initial\\.|final\\.)", "", k)) %>%
ggplot(aes(id, value, colour = what)) +
geom_line() +
geom_point() +
facet_wrap(~ state)
Update
data %>%
gather(k, value, -id) %>%
mutate(
state = gsub("(\\.e$|\\.f$)", "", k),
what = gsub("(initial\\.|final\\.)", "", k)) %>%
select(-k) %>%
spread(state, value) %>%
ggplot(aes(x = initial, y = final, colour = what, fill = what)) +
geom_smooth(fullrange = T, method = "lm") +
geom_point()
We're showing a trend-line based on a simple linear regression lm, including confidence band (disable with se = F inside geom_smooth). You could also show a LOESS trend with method = loess inside geom_smooth. See ?geom_smooth for more details.

Normalizing series for plotting in ggplot2

I have a dataframe that I would like to plot, generated by the following code.
df_rn1 = as.data.frame(cbind(rnorm(40, 1, 1), rep("rn1", 40)))
df_rn2 = as.data.frame(cbind(rnorm(40, 10, 1), rep("rn2", 40)))
df_rn3 = as.data.frame(cbind(rnorm(40, 100, 1), rep("rn3", 40)))
df_test = rbind(df_rn1, df_rn2, df_rn3)
colnames(df_test) <- c("value", "type")
I would like to plot the dataframe normalized by the respective first observation s.t. they are scaled properly. However, I am not getting further than this:
ggplot(aes(x = rep(1:40, 3), y=as.numeric(as.character(value)), color = type), data = df_test) +
geom_line()
Is it possible to do the normalization by types directly in the ggplot code?
Thx
How about this?
library(tidyverse);
df_test %>%
group_by(type) %>%
mutate(
value = as.numeric(as.character(value)),
value.scaled = (value - mean(value)) / sd(value),
idx = 1:n()) %>%
ggplot(aes(idx, value.scaled, colour = type)) + geom_line()
Note that values are scaled within type; not sure what you're after, for global scaling, see #ManishSaraswat's answer.
You can use scale function to normalize the values.
df_test %>%
mutate(value = scale(value)) %>%
ggplot(aes(x = rep(1:40, 3), y = value, color=type))+
geom_line()

Resources