Related
I have two gene-expression time-course data sets:
First, gene expression was measured over 14 time points from 4 groups:
df1 <- structure(list(val = c(-0.1, -0.13, -0.4, -0.3, -0.3, -0.2, -0.24,
0.1, 0.2, 0.13, 0, 0.63, 0.83, 0.85, -0.07, -0.07, -0.27, -0.2,
-0.2, -0.1, 0.2, 0.1, 0.07, 0.17, 0.6, 0.75, 1.1, 1.1, -0.13,
-0.15, -0.26, -0.25, -0.14, 0.04, 0.2, 0.24, 0.23, 0.2, 0.1,
0.73, 1, 1.3, 0, 0.06, -0.24, -0.17, -0.17, -0.04, 0.16, 0.1,
0.14, 0.27, 0.34, 0.9, 0.97, 1.04),
time = c(-1, 0, 1, 1.58,2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17,7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58,6.17, 7.39),
group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,4L),
.Label = c("a", "b", "c", "d"), class = "factor")), .Names = c("val","time", "group"),
row.names = c(NA, -56L), class = "data.frame")
df1$group <- factor(df1$group,levels=c("a","b","c","d"))
which looks like this (adding a loess smoothed trend line):
library(ggplot2)
ggplot(df1,aes(x=time,y=val,color=group))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
Second, gene expression was measured over similar 14 time points but now from 2 different groups, each represented by the two sexes:
df2 <- structure(list(val = c(-0.23, -0.01, -0.14, -0.01, -0.21, -0.16,
-0.24, -0.11, 0.02, -0.11, -0.01, -0.25, -0.47, -1.25, 0.02,
-0.3, -0.02, 0.14, 0.25, -0.05, 0.15, 0.11, -0.24, -0.18, -0.39,
-0.49, -0.5, -0.65, -0.06, 0.09, 0.1, 0.15, 0.08, 0.15, 0.4,
0.24, 0.07, 0.08, -0.18, -0.35, -0.19, -0.81, -0.16, 0.29, -0.05,
0.14, 0.14, 0.48, 0.34, 0.11, -0.07, -0.13, -0.41, -0.22, -0.54,
-0.76, 0.35, 0.34, -0.06, 0.21, 0.14, 0.14, 0.25, 0.22, 0.25,
0.16, 0.3, 0.44, 0.08, 0.48, 0.1, 0.16, -0.03, -0.22, 0.2, 0.01,
-0.09, -0.02, -0.01, 0.06, -0.13, 0.19, 0.11, -0.04, -0.39, 0.03,
-0.01, 0.09, 0.1, -0.14, -0.12, -0.1, 0.36, 0.08, 0.09, 0.09,
0.42, 0.37, -0.14, 0.12, 0.09, 0.03, 0.06, -0.25, 0.2, -0.06,
-0.44, 0.23, 0.03, 0.16, 0.81, 0.83),
time = c(-1, 0, 1, 1.58,2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0,1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17,7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58,6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58,5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17,4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39),
sex = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
.Label = c("F", "M"), class = "factor"), group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
.Label = c("a", "b"), class = "factor")), .Names = c("val", "time", "sex", "group"), row.names = c(NA, -112L), class = "data.frame")
df2$sex <- ordered(df2$sex,levels=c("M","F"))
df2$group <- ordered(df2$group,levels=c("a","b"))
df2$col <- factor(paste0(df2$group,":",df2$sex))
which looks like this (adding a loess smoothed trend line):
ggplot(df2,aes(x=time,y=val,color=col))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
For df1, I would like to estimate the effect of time on val, adjusting for group.
For df2, I would like to estimate the effect of time:group on val, adjusting for sex.
Looking at the data I thought using spline regressions would be appropriate so I used the gam function from the mgcv package, which as far as I understand optimizes the degrees of freedom of the splines fitted to the data.
This is what I fitted for df1:
mgcv1.fit <- mgcv::gam(val ~ group+s(time),data=df1)
Which gives:
Family: gaussian
Link function: identity
Formula:
val ~ group + s(time)
Estimated degrees of freedom:
7.18 total = 11.18
GCV score: 0.01258176
But 7.18 degrees of freedom seems too much for these data.
For df2:
mgcv2.fit <- mgcv::gam(val ~ sex+s(time,by=group),data=df2)
which gives:
Family: gaussian
Link function: identity
Formula:
val ~ sex + s(time, by = group)
Estimated degrees of freedom:
1.72 total = 3.72
GCV score: 0.08522094
I guess that in this case I'd imagine the degrees of freedom to be slightly higher.
One more point. Plotting the fitted values for these two data sets:
df1$mgcv <- mgcv1.fit$fitted.values
ggplot(df1,aes(x=time,y=mgcv,color=group))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
which looks fine.
But for df2
df2$mgcv <- mgcv2.fit$fitted.values
ggplot(df2,aes(x=time,y=mgcv,color=col))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
Looks like it flipped the group labels.
So my questions are:
Am I using mgcv::gam correctly for optimizing the spline degrees of freedom for my questions?
Does mgcv reorders the samples in its fitted.values?
First of all, mgcv does the right thing on the factor levels. If you check str(df2$sex), you will see that "M" (male) is the first level and "F" (female) is the second. But it seems from str(df2$col) that "F" is the first, so you get some mislabeling when making plot.
Secondly, your second model has not been specified correctly.
The spline s(time) is under centering constraint when there is no "by" variable, or the "by" is a factor. So you to provide your "by" variable group as a separate term in your model formula to catch its marginal effect;
Since the "by" variable group is an ordered variable, mgcv applies contrasts on it, dropping the first level "a" when constructing the s(time, by = group). So you need to provide a separate s(time) as the baseline smooth.
Your current mgcv2.fit is a rather poor model (not surprising), giving an explained deviance of 9%. But if you do the following you get 64%.
gam(val ~ sex + s(time) + group + s(time, by = group), data = df2, method = "REML")
The ggplot now looks right (I haven't changed df2$col so the coloring is still probably reversed).
gam defaults to use "GCV.Cp" as smoothing parameter selection method. But it is recommended to use "REML" as it is less prone to overfitting.
Remark 1
If the "by" variable group is a (non-ordered) factor, it is not subject to contrasts. So the model formula should be:
val ~ sex + group + s(time, by = group)
The following is quoted from 'by' variables section of ?gam.models:
If a ‘by’ variable is a ‘factor’ then it generates an indicator
vector for each level of the factor, unless it is an ‘ordered’
factor. In the non-ordered case, the model matrix for the smooth
term is then replicated for each factor level, and each copy has
its rows multiplied by the corresponding rows of its indicator
variable. The smoothness penalties are also duplicated for each
factor level. In short a different smooth is generated for each
factor level (the ‘id’ argument to ‘s’ and ‘te’ can be used to
force all such smooths to have the same smoothing parameter).
‘ordered’ ‘by’ variables are handled in the same way, except that
no smooth is generated for the first level of the ordered factor
(see ‘b3’ example below). This is useful for setting up
identifiable models when the same smooth occurs more than once in
a model, with different factor ‘by’ variables.
Remark 2
I am not to judge your model, but there seems to be a clear within-group difference between "F" and "M". From your data we see that "F" and "M" has a bigger difference in group "b" than in group "a". At the moment the effect of sex is identical in both groups, and it is just a vertical shift. You can observe this in the above ggplot in this answer. It is up to you to decide the model in the end, but just in case that you want to model this sex-group interaction, you can do
df2$sex_group <- with(df2, interaction(sex, group)) ## the new variable is unordered
test <- gam(val ~ sex + group + s(time, by = sex_group), data = df2, method = "REML")
Note how I provide two factor variables to by. An auxiliary variable sex_group is created.
Running the following script I was hoping to have one datapoint for each of the six terms with different colors depending on the dataset, facetted by adjustment. However, I get three and four point for each term in each facet. Any idea how this can happen when I only have 24 rows in the dataset?
library(ggplot2)
tb5 <- structure(list(term = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L,
6L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"),
coef = c(-1.444, 0.035, -0.034, 0.005, 0.001, 2.43, -1.032,
0.032, -0.024, 0.025, 0.003, 1.758, -1.148, 0.02, 0.003,
0.027, 0.003, 12.713, -1.494, 0.028, -0.021, 0.007, 0.004,
13.499), ci.lb = c(-1.826, 0.025, -0.087, -0.011, -0.004,
0.3, -1.293, 0.026, -0.061, 0.016, -0.001, -0.273, -1.48,
0.011, -0.045, 0.014, -0.003, 11.858, -1.931, 0.015, -0.08,
-0.014, -0.002, 12.624), ci.ub = c(-1.071, 0.045, 0.019,
0.022, 0.007, 7.305, -0.775, 0.038, 0.012, 0.035, 0.007,
6.613, -0.816, 0.029, 0.051, 0.039, 0.008, 13.569, -1.056,
0.04, 0.038, 0.027, 0.01, 14.375), Adjusted = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Adjusted", "Unadjusted"
), class = "factor"), Dataset = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor")), .Names = c("term",
"coef", "ci.lb", "ci.ub", "Adjusted", "Dataset"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -24L))
ggplot(data = tb5,aes(x=term,y=coef,color=Dataset))+geom_point()+
facet_grid(facets = ~Adjusted)+
geom_jitter(height = .8)
I am using ezPlot from the ez package in R to plot results of a mixed within and between-ss design. The data point from the two groups I have overlap so that I would like to jitter both the data point and associated error bar.
data<-structure(list(Sub = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("WW", "XX", "YY",
"ZZ"), class = "factor"), DepVar = c(0.67, 0.35, 0.09, 0.2, 0.19,
0.13, 0.45, 0.23, 0.08, 0.32, 0.17, 0.18, 0.67, 0.36, 0.55, 0.4,
0.37, 0.05, 0.26, 0.11, 0.08, 0.46, 0.29, 0.18, 0.16, 0, 0.38,
0.22, 0.08, 0.1, 0.54, 0.17, 0.07, 0.38, 0.75, 0.87, 0.27, 0.57,
0.31, 0.28, 0.07, 0.12, 0.75, 0.33, 0.23, 0.33, 0.26, 0.18),
Group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"),
Con = structure(c(1L, 3L, 3L, 3L, 4L, 5L, 2L, 3L, 4L, 1L,
2L, 3L, 1L, 3L, 3L, 3L, 4L, 5L, 2L, 3L, 4L, 1L, 2L, 3L, 1L,
3L, 3L, 3L, 4L, 5L, 2L, 3L, 4L, 1L, 2L, 3L, 1L, 3L, 3L, 3L,
4L, 5L, 2L, 3L, 4L, 1L, 2L, 3L), .Label = c("C", "D", "E",
"F", "G"), class = "factor")), .Names = c("Sub", "DepVar",
"Group", "Con"), class = "data.frame", row.names = c(NA, -48L))
ezPlot( data,
dv = .(DepVar),
wid = .(Sub), # subject
within = .(Con),
between=.(Group),
split=.(Group),
do_bars=TRUE,
type = 2,
x = .(Con))
A non elegant trick is so set scale-color manual white so that the underlying data points disappear and then using geom-point position dodge(0.4))
ezPlot( data,
dv = .(DepVar),
wid = .(Sub), # subject
within = .(Con),
between=.(Group),
split=.(Group),
do_bars=TRUE,
type = 2,
x = .(Con))+
scale_color_manual(values=c("white", "white"))+
geom_point(aes(fill=Group), color="black", pch= 21, size= 3, position=position_dodge(0.4))+
geom_line(aes(group = Group), lty = 3, lwd = 1.3, color='black')
however, I would like to have the error bar plotted and I don't know how to achieve this or if other workarounds are possible. I would like to stick to ezplot. Thanks!
One way is to use set print_code = TRUE, to produce data to be plotted, as well as the ggplot code:
library(ggplot2)
stats <- ezPlot( data,
dv = .(DepVar),
wid = .(Sub), # subject
within = .(Con),
between=.(Group),
split=.(Group),
do_bars=TRUE,
type = 2,
x = .(Con),
print_code = TRUE)
Then, manually modify the code to add position = position_dodge(0.4) to each geom, then run the ggplot code.
A more efficient way to do the same thing would be to capture.output the code as a character vector, use gsub to add position = position_dodge(0.4), then eval(parse(text = ...)) the modified code:
gg_code <- capture.output(stats <- ezPlot( data,
dv = .(DepVar),
wid = .(Sub), # subject
within = .(Con),
between=.(Group),
split=.(Group),
do_bars=TRUE,
type = 2,
x = .(Con),
print_code = TRUE))
gg_code <- gsub("alpha", "position = position_dodge(0.4), alpha", gg_code)
eval(parse(text = paste(gg_code, collapse = "")))
Output:
I am using facet_grid to plot my data. I have three plots in grid and I Want to fix the ymax (or ylimit) for two of the plots. Currently, I am using following code
f <- ggplot(data=newmel,aes(x=timestamp,y=value,ymin=0,ymax=value))+facet_grid(variable~., scales = "free_y")+
theme(axis.title.x=element_blank(),axis.title.y=element_blank())
f1 <- f + geom_linerange(subset=.(variable=="hpanom")) # require(plyr) for dot function
f2 <- f1 + geom_linerange(subset=.(variable=="lofanom"))
f3 <- f2 + geom_line(subset=.(variable=="power"))
f3
The output plot is:
I want to fix the range of first two (hpanom and lofanom) plots from 0-1 and I do not care for the third one. This is because first two represent probabilites, hence range is always fixed, whereas the third one represent values of which I do not know limits.
Here I am attaching my whole dataset,so that it become easy to replicate the case
structure(list(timestamp = structure(c(1438450200, 1438536600,
1438623000, 1438709400, 1438795800, 1438882200, 1438968600, 1439055000,
1439141400, 1439227800, 1439314200, 1439400600, 1439487000, 1439573400,
1439659800, 1439746200, 1439832600, 1439919000, 1440005400, 1440091800,
1440178200, 1440264600, 1440351000, 1440437400, 1440523800, 1440610200,
1440696600, 1440783000, 1440869400, 1440955800, 1438450200, 1438536600,
1438623000, 1438709400, 1438795800, 1438882200, 1438968600, 1439055000,
1439141400, 1439227800, 1439314200, 1439400600, 1439487000, 1439573400,
1439659800, 1439746200, 1439832600, 1439919000, 1440005400, 1440091800,
1440178200, 1440264600, 1440351000, 1440437400, 1440523800, 1440610200,
1440696600, 1440783000, 1440869400, 1440955800, 1438450200, 1438536600,
1438623000, 1438709400, 1438795800, 1438882200, 1438968600, 1439055000,
1439141400, 1439227800, 1439314200, 1439400600, 1439487000, 1439573400,
1439659800, 1439746200, 1439832600, 1439919000, 1440005400, 1440091800,
1440178200, 1440264600, 1440351000, 1440437400, 1440523800, 1440610200,
1440696600, 1440783000, 1440869400, 1440955800), tzone = "Asia/Kolkata", tclass = c("POSIXct",
"POSIXt"), class = c("POSIXct", "POSIXt")), variable = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("power", "hpanom",
"lofanom"), class = "factor"), value = c(713.818544290426, 1.60000010638915e-16,
1588.93456060134, 1080.34496835479, 1602.88616015399, 1325.85664208325,
1487.27242035101, 647.925289288333, 1.60000010638915e-16, 1280.71707200196,
1558.88686231823, 1349.32481298453, 1386.23617766603, 995.841940511863,
961.865845980269, 1153.33076383446, 1601.85509748887, 1346.15843354498,
1209.13461354976, 1060.91957740428, 1963.3986755642, 1995.90120364349,
1.60000010638915e-16, 1391.2765167008, 1198.11633766185, 1202.33001076712,
1508.21685464222, 1299.09592097037, 1.60000010638915e-16, 1.60000010638915e-16,
0.718682639785718, 0.835994256774337, 0.323313580710195, 0.323826587878846,
0.252277897807753, 0.427692277575098, 0, 0.737219412602466, 0.835994256774337,
0.165199776286282, 0.0893388057418736, 0.20964070918484, 0.13145481071022,
0.408292163070401, 0.824540692174425, 0.71239540630304, 0.323313580710195,
0.485828441524218, 0.188258288292337, 0.231165659725455, 0.907512281748878,
0.951462340841186, 0.835994256774337, 0.29824669841755, 0.245223685520087,
0.108813825489535, 0.387339161244294, 0.359556716396615, 0.835994256774337,
0.835994256774337, 0.4, 0.43, 0.13, 0.28, 0.11, 0.1, 0.13, 0.43,
0.43, 0.13, 0.11, 0.01, 0.12, 0.3, 0.6, 0.4, 0.2, 0.11, 0.1,
0.15, 0.67, 1, 0.43, 0.08, 0.08, 0.12, 0.07, 0.08, 0.43, 0.43
)), row.names = c(NA, -90L), .Names = c("timestamp", "variable",
"value"), class = "data.frame")
It's a bit of a hack, but you can plot an "invisible" point by using size=0 at y=1 to force the limit to 1 on the two linerange plots as shown below. I've also removed need to use plyr.
f <- ggplot(data=newmel,aes(x=timestamp,y=value,ymin=0,ymax=value))+facet_grid(variable~., scales = "free_y")
f <- f + theme(axis.title.x=element_blank(),axis.title.y=element_blank())
f <- f + geom_linerange(data=subset(newmel, variable %in% c("hpanom","lofanom")))
# plot invisible point ( size=0) to set upper limit of y axis to 1
f <- f + geom_point(data=subset(newmel, variable %in% c("hpanom","lofanom")),
aes(x=min(timestamp), y=1), size=0)
f <- f + geom_line(data=subset(newmel, variable=="power"))
f
Update for ggplot 2.0
In ggplot 2.0, setting size=0 no longer makes the point invisible. Instead, use colour = NA to make it transparent . New solution is
library(ggplot2)
f <- ggplot(data=newmel,aes(x=timestamp,y=value,ymin=0,ymax=value))+facet_grid(variable~., scales = "free_y")
f <- f + theme(axis.title.x=element_blank(),axis.title.y=element_blank())
f <- f + geom_linerange(data=subset(newmel, variable %in% c("hpanom","lofanom")))
# plot transparent point ( colour = NA) to set upper limit of y axis at 1
f <- f + geom_point(data=subset(newmel, variable %in% c("hpanom","lofanom")),
aes(x=timestamp[1], y=1), colour=NA )
f <- f + geom_line(data=subset(newmel, variable=="power"))
f
I have two questions on building a bar plot by using ggplot().
How to display data format (Sep-12)?
I would like to display the date in the format of Sep-12. My data is a quarterly summary. I would like to show Mar, Jun, Sep and Dec quarters. However, I used the as.Date(YearQuarter) within the ggplot() function. It shows a different sequence of Apr, July, Oct, Jan.
How to increase y axis limit?
The y axis is set at 70%, one of value label is out of the picutre. I have added ylim(0,1) to increase the y limit to 1. However, I lost the percentage format as the y axis is not displaying the percentage anymore.
x4.can.t.m <- structure(list(NR_CAT = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L), .Label = c("0%", "1 to 84%", "85% +"
), class = "factor"), TYPE = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("PM BUSINESS", "PM CONSUMER",
"PREPAY"), class = "factor"), YearQuarter = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("2011-09-01",
"2011-12-01", "2012-03-01", "2012-06-01", "2012-09-01"), class = "factor"),
value = c(0.5, 0, 0.5, 0.35, 0, 0.65, 0.28, 0.02, 0.7, 0.4,
0, 0.6, 0.38, 0, 0.62, 0.43, 0.01, 0.56, 0.57, 0, 0.43, 0.35,
0, 0.65, 0.39, 0.01, 0.6, 0.55, 0, 0.45, 0.4, 0.02, 0.58,
0.35, 0.02, 0.63, 0.35, 0, 0.65, 0.55, 0.01, 0.44, 0.47,
0, 0.53)), .Names = c("NR_CAT", "TYPE", "YearQuarter", "value"
), row.names = c(NA, -45L), class = "data.frame")
This is my plot code:
x4.can.t.m$YearQuarter <- as.Date(x4.can.t.m$YearQuarter)
x4.can.t.d.bar <- ggplot(data=x4.can.t.m, aes(x=YearQuarter, y=value,fill=NR_CAT)) +
geom_bar(stat="identity",position = "dodge",ymax=NR_CAT+0.2) +
facet_wrap(~TYPE,ncol=1) +
geom_text(aes(label =paste(round(value*100,0),"%",sep="")),
position=position_dodge(width=0.9),
vjust=-0.25,size=3) +
scale_y_continuous(formatter='percent',ylim=1) +
labs(y="Percentage",x="Year Quarter") +
ylim(0,100%)
x4.can.t.d.bar +scale_fill_manual("Canopy Indicators",values=tourism.cols(c(6,9,8)))+
opts(title="Canopy Indicator: All Customers portout for Network
Issues",size=4)
It looks like you have an older version of ggplot; the following is for ggplot 0.2.9.1. I had to fix several things to make your plot work. Starting from your original definition of x4.can.t.m:
x4.can.t.m$YearQuarter <- format(as.Date(x4.can.t.m$YearQuarter),"%b-%y")
library("scales")
ggplot(data=x4.can.t.m, aes(x=YearQuarter, y=value, fill=NR_CAT)) +
geom_bar(stat="identity", position = "dodge") +
geom_text(aes(label = paste(round(value*100,0),"%",sep=""), group=NR_CAT),
position=position_dodge(width=0.9),
vjust=-0.25, size=3) +
scale_y_continuous("Percentage", labels=percent, limits=c(0,1)) +
labs(x="Year Quarter") +
scale_fill_discrete("Canopy Indicators") +
facet_wrap(~TYPE,ncol=1) +
ggtitle("Canopy Indicator: All Customers portout for Network Issues") +
theme(plot.title = element_text(size=rel(1.2)))
The first part of the question is just achieved by formatting YearQuarter into the format you wanted, leaving it as a string.
The second part specifies the limits in scale_y_continuous and uses the labels argument to specify the formatting function. Note that library("scales") is needed for this part to work.