ggplot sum and mean? - r

How can I add a line showing the median of the sums?
What I currently have
# Drop the time part of the timestamp: everything from the first "T" onwards
dataset$`Created Date`<- gsub("T.*","",dataset$`Created Date`)
# Parse the remaining "YYYY-MM-DD" text (ymd() is presumably lubridate::ymd — confirm it is attached)
dataset$`Created Date`<- ymd(strptime(dataset$`Created Date`, format="%Y-%m-%d"))
names(dataset) <- gsub(" ","_",names(dataset)) # replace spaces in column names with underscores
dfcount <- data.frame(count(dataset, `Created_Date`)) # rows per date (presumably dplyr::count, tally in column n)
dfcount$Created_Date <- as.POSIXlt(dfcount$Created_Date) # convert to POSIXlt for the weekday filter
# NOTE(review): no `Month` column is created anywhere in this snippet — confirm
# that dfcount has one before it is mapped to x below.
Monthlywithavg <- ggplot(dfcount,aes(Month, n))+
# first layer: line through the sum of n at each x value
stat_summary(fun.y = sum, geom = "line") +
scale_x_date(labels = date_format("%Y-%m"))+
# second layer: line through the mean of n at each x value
stat_summary(fun.y = mean, geom = "line")
Monthlywithavg
Let me know if there's anything else I should change too.
Thanks!

Here is another alternative to CPak's, it is essentially the same.
library(ggplot2)
library(plyr)

# Total mpg for each cylinder count; the summed column keeps the name `mpg`.
agg <- plyr::ddply(
  mtcars,
  "cyl",
  summarize,
  mpg = sum(mpg)
)

# The line shows the per-cyl sums computed by stat_summary(); the red
# horizontal line marks the median of those sums, taken from `agg`.
ggplot(mtcars, aes(x = cyl, y = mpg)) +
  stat_summary(fun.y = sum, geom = "line") +
  geom_hline(
    data = agg,
    aes(yintercept = median(mpg)),
    color = "red"
  )

Hopefully someone has a better answer but I think you'll have to calculate the median of the sum yourself. See this reproducible example.
library(ggplot2)
library(dplyr)
library(magrittr)

# One-row table holding the median (column `median`) of the per-cyl mpg sums.
median_of_sum <- mtcars %>%
  group_by(cyl) %>%
  summarise(sum = sum(mpg)) %>%
  ungroup() %>%
  summarise(median = median(sum))

# Line of per-cyl sums, with the pre-computed median drawn as a red reference line.
ggplot(mtcars, aes(x = cyl, y = mpg)) +
  stat_summary(fun.y = sum, geom = "line") +
  geom_hline(
    data = median_of_sum,
    aes(yintercept = median),
    color = "red"
  )

Related

How to normalize data series to start value = 0?

I have a dataset similar to this:
library(ggplot2)
data(economics_long)

# Numeric copy of the date column, shifted by 915, used as the x axis.
economics_long[["date2"]] <- as.numeric(economics_long[["date"]]) + 915

# One line per series, coloured by series name.
ggplot(
  data = economics_long,
  mapping = aes(x = date2, y = value01, colour = variable)
) +
  geom_line()
Which gives the following plot:
Now I would like to normalize it to the start value of the green line (or the mean), so all variables start at the same point of the Y axes. Similar to this:
Thanks for any help.
You could use by() to split the data by `variable` and subtract each series' starting value from that series.
library(ggplot2)

# Split the data by series and subtract each series' first value01 from it.
# The shifted values go into a new column, `varnorm`.
# (assumes the rows of each series are already in date order — confirm)
l <- by(economics_long, economics_long$variable, function(x)
  within(x, varnorm <- value01 - value01[1]))
dat <- do.call(rbind, l)

# Plot the shifted column created above; the original mapped `value01.n`,
# which is never created. `date2` is the numeric date column added to
# economics_long by the question's setup code.
ggplot(dat, aes(date2, varnorm, colour = variable)) +
  geom_line()
Alternatively, use group_by() and mutate() to shift each variable by its initial y-value.
library(tidyverse)
data(economics_long)

economics_long %>%
  # data() above restores the packaged copy of the dataset, so the numeric
  # x column from the question has to be recreated before it can be plotted.
  mutate(date2 = as.numeric(date) + 915) %>%
  group_by(variable) %>%
  # shift every series so that its first value becomes 0
  mutate(value_shifted = value01 - value01[1]) %>%
  ungroup() %>%
  ggplot(aes(date2, value_shifted, colour = variable)) +
  geom_line()

unable to set xlim and ylim using min() and max() in ggplot

I am missing something crucial here and can't see it.
Why does min and max not work to set the axis limits?
# Failing attempt: xlim() and ylim() are plain function calls added to the plot,
# so `mpg` and `disp` are not looked up as columns of the piped data.
mtcars %>%
select(mpg, cyl, disp, wt) %>%
filter(complete.cases(disp)) %>%
ggplot() +
geom_point(aes(x=mpg, y=disp, colour=cyl), size=3) +
# the bare column names below are the part that does not work
xlim(min(mpg, na.rm=TRUE),max(mpg, na.rm=TRUE)) +
ylim(min(disp, na.rm=TRUE),max(disp, na.rm=TRUE)) +
scale_colour_gradient(low="red",high="green", name = "cyl")
This works:
# Same plot with the explicit axis limits commented out.
mtcars %>%
  select(mpg, cyl, disp, wt) %>%
  filter(complete.cases(disp)) %>%
  ggplot() +
  geom_point(
    mapping = aes(x = mpg, y = disp, colour = cyl),
    size = 3
  ) +
  # xlim(min(mpg, na.rm=TRUE),max(mpg, na.rm=TRUE)) +
  # ylim(min(disp, na.rm=TRUE),max(disp, na.rm=TRUE)) +
  scale_colour_gradient(low = "red", high = "green", name = "cyl")
ggplot2 cannot look up bare column names outside aes() the way dplyr verbs can, so xlim() and ylim() do not find `mpg` and `disp`.
You need to refer to the data explicitly:
# Axis limits computed from explicitly referenced columns (mtcars$...), because
# xlim()/ylim() cannot see the columns of the piped data.
mtcars %>%
  select(mpg, cyl, disp, wt) %>%
  filter(complete.cases(disp)) %>%
  ggplot() +
  geom_point(
    mapping = aes(x = mpg, y = disp, colour = cyl),
    size = 3
  ) +
  xlim(
    min(mtcars$mpg, na.rm = TRUE),
    max(mtcars$mpg, na.rm = TRUE)
  ) +
  ylim(
    min(mtcars$disp, na.rm = TRUE),
    max(mtcars$disp, na.rm = TRUE)
  ) +
  scale_colour_gradient(low = "red", high = "green", name = "cyl")
You can't reference column names in ggplot objects except inside aes() and in a formula or vars() in a facet_* function. But the helper function expand_scale is there to help you expand the scales in a more controlled way.
For example:
# Three independent examples; add one of them to a plot with `+`.
# NOTE: ggplot2 >= 3.3.0 deprecates expand_scale() in favour of expansion(),
# which takes the same `mult` and `add` arguments.

# pad both ends of the x-axis by one data unit
scale_x_continuous(expand = expand_scale(add = 1))
# no padding at all: the axis spans exactly the data range
scale_x_continuous(expand = expand_scale(mult = 0, add = 0))
# pad both ends by 10% of the data range
scale_x_continuous(expand = expand_scale(mult = 0.1))
See ?scale_x_continuous and especially ?expand_scale for details. It's also possible to selectively pad just the top or just the bottom of each scale; there are examples in ?expand_scale.

Add titles to ggplots created with map()

What's the easiest way to add titles to each ggplot that I've created below using the map function? I want the titles to reflect the name of each data frame - i.e. 4, 6, 8 (cylinders).
Thanks :)
# Split mtcars into one data frame per cylinder count; split() names the list
# elements after the cyl values.
mtcars_split <-
  mtcars %>%
  split(mtcars$cyl)

# One jittered scatter plot per data frame. The closing parenthesis of map()
# is on its own line so the commented-out title placeholder no longer hides it
# (the original snippet did not parse for that reason).
plots <-
  mtcars_split %>%
  map(
    ~ ggplot(data = ., mapping = aes(y = mpg, x = wt)) +
      geom_jitter()
    # + ggtitle(....)
  )
plots
Use map2 with names.
# Walk over the data frames and their list names in parallel, using each name
# as the title of the matching plot.
plots <- map2(
  .x = mtcars_split,
  .y = names(mtcars_split),
  .f = function(cyl_data, cyl_label) {
    ggplot(data = cyl_data, mapping = aes(y = mpg, x = wt)) +
      geom_jitter() +
      ggtitle(cyl_label)
  }
)
Edit: alistaire pointed out this is the same as imap
# imap() passes each list element together with its name, so the names do not
# have to be supplied separately.
plots <- imap(
  .x = mtcars_split,
  .f = function(cyl_data, cyl_label) {
    ggplot(data = cyl_data, mapping = aes(y = mpg, x = wt)) +
      geom_jitter() +
      ggtitle(cyl_label)
  }
)
Perhaps you'd be interested in using facet_wrap instead
# A single plot with one panel per cylinder count, instead of a list of plots.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_jitter() +
  facet_wrap(~cyl)
You can use purrr::map2():
# One data frame per cylinder count; the list names are the cyl values.
mtcars_split <- mtcars %>% split(mtcars$cyl)

# Title each plot with the name of its list element. The original passed an
# undefined `titles` object; names(mtcars_split) supplies the labels. The axes
# follow the question (x = wt, y = mpg), which the original had swapped.
plots <- map2(
  mtcars_split,
  names(mtcars_split),
  ~ ggplot(data = .x, aes(x = wt, y = mpg)) +
    geom_jitter() +
    ggtitle(.y)
)
EDIT
Sorry duplicated with Paul's answer.

How to assign standard deviation using facet_grid operator?

I created a ggplot using the following script:
# Read the wide input table: one row per sample, with expression columns
# UD..D16 and matching SD_* columns.
data <- fread("Book1.txt")
names(data)
days <- c("UD","D4","D6","D10","D16")
sample <-c("H1","H2","H3")
# NOTE(review): 'SD_16' does not match the 'SD_D16' column used below; this
# vector is not referenced again in the snippet.
SD <- c('SD_UD', 'SD_D4', 'SD_D6', 'SD_D10','SD_16')
# First reshape: stack the expression columns into key `days` / value `expression`
vol <- data.table(gather(data, days, expression, UD:D16))
# Second reshape: stack the SD columns into key `expression` / value `SD`.
# NOTE(review): this is done independently of the first reshape and reuses the
# column name `expression`, so SD values are not matched to their own day —
# presumably the cause of the wrong error bars; confirm.
volL <- data.table(gather(vol, expression, SD, SD_UD:SD_D16))
dodge <- position_dodge(width=0.9)
# Bars of expression per day, one facet row per sample, with +/- SD error bars
gg <-ggplot(volL, aes(days, expression, fill= sample)) + geom_bar(stat="identity", width = 0.9, position = dodge) +
geom_errorbar(aes(ymin=expression-SD, ymax=expression+SD), color= "grey", width=.1) +
facet_grid(sample~.) +
# keep the days in the order given in `days`
scale_x_discrete(limits=days, labels = days) +
ggtitle("Expression vs days")
Data table :
sample UD D4 D6 D10 D16 SD_UD SD_D4 SD_D6 SD_D10 SD_D16
H1 9.96113E-05 0.000276321 0.105211427 0.098271655 0.06978369 1.17174E-05 9.84763E-05 0.03589122 0.05107505 0.017763717
H2 7.9913E-06 4.43916E-05 0.040212602 0.106493626 0.162614138 9.31988E-07 4.28076E-05 0.012332228 0.0441571 0.063304324
H3 0.000233391 0.000382084 0.001415172 0.003544547 0.018624673 0.000103126 0.000110262 0.000612986 0.000572592 0.010861883
Using the above script I got the following graph;
Problem: the error bars were not assigned the standard deviations from the data frame. Could anyone help me assign the SD values correctly?
It did assign the standard deviation according to the data.frame. The problem is that the data.frame contains 4 different values of SD for each expression.
To solve, using dplyr:
library(dplyr)
library(tidyr) # gather() comes from tidyr, not dplyr
library(ggplot2)

# NOTE: `df` is the wide input table; `dodge` and `days` are the objects
# defined in the question's script (a position_dodge() and the day labels).

# Long table of expression values: one row per sample and day.
expression <- df %>%
  select(sample, UD:D16) %>%
  gather(days, expr, UD:D16)

# Long table of standard deviations. Removing the "SD_" prefix makes the
# `days` key match the one in `expression`.
SD <- df %>%
  select(sample, SD_UD:SD_D16) %>%
  gather(days, SD, SD_UD:SD_D16) %>%
  mutate(days = gsub(x = days, pattern = 'SD_', ''))

# Join on explicit keys so every expression value is paired with its own SD.
df1 <- inner_join(expression, SD, by = c("sample", "days"))

# Bars of expression per day, one facet row per sample, with +/- SD error bars.
gg <- ggplot(df1, aes(days, expr, fill = sample)) +
  geom_bar(stat = "identity", width = 0.9, position = dodge) +
  geom_errorbar(aes(ymin = expr - SD, ymax = expr + SD), color = "grey", width = .1) +
  facet_grid(sample ~ .) +
  scale_x_discrete(limits = days, labels = days) +
  ggtitle("Expression vs days")
Hope this helps.

Pass column names to a function

How can I turn this ggplot() call into a function? I can't figure out how to get R to recognize the column names I want to pass to the function. I've come across several similar sounding questions, but I've not had success adapting ideas. See here for substitute().
# setup
library(dplyr)
library(ggplot2)

set.seed(205)
# Two time points (t = 1, 2) for each of ten pairs, with random normal values.
dat <- data.frame(
  t = rep(1:2, each = 10),
  pairs = rep(1:10, times = 2),
  value = rnorm(20)
)

# working example
# Per pair, the slope is the change in value from t == 1 to t == 2; lines are
# coloured by the sign of that slope, and a thick line joins the per-t means.
dat %>%
  group_by(pairs) %>%
  mutate(slope = (value[t == 2] - value[t == 1]) / (2 - 1)) %>%
  ggplot(aes(t, value, group = pairs, colour = slope > 0)) +
  geom_point() +
  geom_line() +
  stat_summary(fun.y = mean, geom = "line", lwd = 2, aes(group = 1))
# attempt at turning into a function (does not work; kept as posted)
plotFun <- function(df, groupBy, dv, time) {
# substitute() captures the unevaluated argument expressions, e.g. the symbol `pairs`
groupBy2 <- substitute(groupBy)
dv2 <- substitute(dv)
time2 <- substitute(time)
# NOTE(review): the captured symbols are used below as if they were column
# names, but group_by(), mutate() and aes() do not swap them in for the
# columns — presumably the reason the call at the bottom errors; confirm.
ggplot(df %>% group_by(groupBy2) %>%
mutate(slope = (dv2[time2==2] - dv2[time2==1])/(2-1)),
aes(time2, dv2, group=groupBy2, colour=slope > 0)) +
geom_point() +
geom_line() +
stat_summary(fun.y=mean,geom="line",lwd=2,aes(group=1))
}
# error time
plotFun(dat, pairs, value, t)
Update
I took @joran's advice to look at this answer, and here's what I came up with:
library(dplyr)
library(ggplot2)
library(lazyeval)
# Second attempt: the column names are now passed as strings.
plotFun <- function(df, groupBy, dv, time) {
# group_by_() / mutate_() accept the column names as strings; interp() puts the
# symbols built by as.name() in place of dv2 and time2 in the slope formula
ggplot(df %>% group_by_(groupBy) %>%
mutate_(slope = interp(~(dv2[time2==2] - dv2[time2==1])/(2-1),
dv2=as.name(dv),
time2=as.name(time))),
# NOTE(review): aes() here refers to the function arguments time, dv and
# groupBy, which hold strings rather than data columns — presumably why
# geom_line() reports one observation per group; confirm.
aes(time, dv, group=groupBy, colour=slope > 0)) +
geom_point() +
geom_line() +
stat_summary(fun.y=mean,geom="line",lwd=2,aes(group=1))
}
plotFun(dat, "pairs", "value", "t")
The code runs but the plot is not correct:
geom_path: Each group consists of only one observation. Do you need to
adjust the group aesthetic?
Here's the working solution informed by all of the commenters:
# setup
library(dplyr)
library(ggplot2)
library(lazyeval)

set.seed(205)
# Two time points (t = 1, 2) for each of ten pairs, with random normal values.
dat <- data.frame(
  t = rep(1:2, each = 10),
  pairs = rep(1:10, times = 2),
  value = rnorm(20)
)
# function
# Draw one line per group between the time points, coloured by whether the
# group's value rises from time 1 to time 2, plus a thick line through the
# per-time means.
#   df       data frame holding the columns named below
#   groupBy  name (string) of the column identifying each group
#   dv       name (string) of the outcome column
#   time     name (string) of the time column; the slope uses its values 1 and 2
# Returns the ggplot object.
plotFun <- function(df, groupBy, dv, time) {
  # Convert the column-name strings to symbols once; `!!` then splices them
  # into the dplyr and ggplot2 calls. This replaces the deprecated
  # group_by_(), mutate_() + lazyeval::interp() and aes_string().
  group_col <- as.name(groupBy)
  dv_col <- as.name(dv)
  time_col <- as.name(time)

  # Per group: change in the outcome between time 2 and time 1.
  slopes <- df %>%
    group_by(!!group_col) %>%
    mutate(
      slope = (
        (!!dv_col)[(!!time_col) == 2] - (!!dv_col)[(!!time_col) == 1]
      ) / (2 - 1)
    )

  ggplot(
    slopes,
    aes(!!time_col, !!dv_col, group = !!group_col, colour = slope > 0)
  ) +
    geom_point() +
    geom_line() +
    # group = 1 pools all rows so the mean line spans every group
    stat_summary(fun.y = mean, geom = "line", lwd = 2, aes(group = 1))
}
# plot
plotFun(dat, "pairs", "value", "t")

Resources