I want to estimate an "Average curve" from curves of multiple trials. I have done this before using approx() , but then I had a fixed set of x-axis values against which y was measured.
In this dataset, the values vary for both x and y (i.e., there is no fixed set of x values at which y was measured). Instead, each trial has a different set of x values.
Is there a way to average curves in these situations (with standard errors)?
Alternatively :
How would you extract y-values (for a fixed set of x-values) from different curves and construct a new dataframe ?
I have provided a sample dataset (melted) - and the code for plotting the curves for individual trials.
P1, P2, P3, P4 and P5 are the names/IDs of the individual trials.
> dput(head(dat,74))
structure(list(ID = structure(c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L), .Label = c("LCRA_P1", "LCRA_P2",
"LCRA_P3", "LCRA_P4", "LCRA_P5", "LCRA_P6", "P1", "P2", "P3",
"P4", "P5"), class = "factor"), Time = c(170L, 452L, 572L, 692L,
812L, 932L, 1052L, 1172L, 1292L, 1412L, 1532L, 1652L, 1772L,
1892L, 2012L, 2132L, 2252L, 54L, 290L, 410L, 530L, 650L, 770L,
890L, 1010L, 1130L, 1250L, 1370L, 1490L, 1610L, 1730L, 1850L,
1970L, 115L, 235L, 355L, 475L, 595L, 715L, 835L, 955L, 1075L,
1195L, 1315L, 1435L, 1555L, 1675L, 1795L, 135L, 201L, 321L, 441L,
561L, 681L, 801L, 921L, 1041L, 1161L, 1281L, 1401L, 100L, 251L,
371L, 431L, 491L, 611L, 731L, 791L, 851L, 911L, 971L, 1031L,
1091L, 1151L), I = c(154.5066034, 138.3819058, 104.8425346, 61.6283449,
40.34374398, 35.18384073, 29.37894957, 40.34374398, 44.85865933,
27.44398585, 31.9589012, 41.6337198, 54.53347792, 64.20829652,
70.65817559, 66.78824815, 66.78824815, 154.5066034, 90.00781278,
73.88311512, 62.2733328, 61.6283449, 57.75841746, 53.24350211,
48.08359886, 55.17846583, 51.30853839, 42.92369561, 53.24350211,
50.66355049, 54.53347792, 38.40878026, 54.53347792, 154.5066034,
73.88311512, 62.2733328, 61.6283449, 57.75841746, 53.24350211,
48.08359886, 55.17846583, 51.30853839, 42.92369561, 38.40878026,
54.53347792, 37.79284177, 35.21289014, 39.08281758, 154.5066034,
129.997063, 84.84790953, 51.30853839, 40.98873189, 33.24887701,
29.37894957, 27.44398585, 33.24887701, 33.89386492, 31.9589012,
31.9589012, 135.1569662, 85.49289744, 48.08359886, 48.08359886,
22.2840826, 27.44398585, 49.37357467, 51.30853839, 31.9589012,
28.73396167, 23.57405841, 21.63909469, 9.384324471, 25.50902213
)), .Names = c("ID", "Time", "I"), row.names = c(NA, 74L), class = "data.frame")
(The code for plotting is included)
# Draw every trial (ID) as its own coloured series: points plus a connecting
# line. The ggplot() call is closed before the first `+` so the layers that
# follow are added to the plot rather than parsed as arguments.
ggplot(dat, aes(x = Time, y = I, colour = ID)) +
  geom_point() +
  labs(x = "Time (Seconds)", y = "Infiltration (mm/hour)") +
  scale_x_continuous(breaks = seq(0, 2500, 100)) +
  scale_y_continuous(breaks = seq(0, 160, 10)) +
  geom_line(aes(group = ID))
To average, I used this :
# Summarise I with mean_se and draw the result with the "smooth" geom.
# Extra arguments for the summary function (here mult = 1, i.e. +/- one
# standard error) have to be supplied through fun.args; stat_summary() does
# not forward a bare `mult` argument to mean_se.
# Note: stat_summary() summarises y separately at each distinct x value.
ggplot(df2, aes(x = Time, y = I)) +
  stat_summary(
    fun.data = "mean_se",
    fun.args = list(mult = 1),
    geom = "smooth"
  )
The result (the figure below) is not making any sense.
I'm still not sure what's the exact output you want, but here are a few simple examples you can adapt. I think you still had the color or group set in your aes when you made the geom_smooth, which is why you have lots of lines. If you want lines or points or any other geom for the different IDs, but then want a single smoothing line that averages all the IDs, you need to separate what gets a color or group and what doesn't.
Study up on the arguments to stat_smooth—there's a lot you can do to specify the curve it draws, including the method and formula, and arguments depending on the method. Note (from the output geom_smooth gives) that the default for a small number of observations is a loess curve, which might be the type of averaging you're looking for.
Here are examples of where you might want to take this:
library(ggplot2)

# Colour is mapped only inside geom_point(), so geom_smooth() sees a single
# group and fits one curve through the observations of all IDs.
ggplot(df, aes(x = Time, y = I)) +
  geom_point(aes(color = ID)) +
  geom_smooth()
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Same points, but a straight-line fit without the confidence band.
ggplot(df, aes(x = Time, y = I)) +
  geom_point(aes(color = ID)) +
  geom_smooth(se = FALSE, method = "lm")

# One grey line per ID (grouped, not coloured) plus a single, less smoothed
# curve (span = 0.2) over all IDs.
ggplot(df, aes(x = Time, y = I)) +
  geom_line(aes(group = ID), alpha = 0.5) +
  geom_smooth(size = 0.8, se = FALSE, span = 0.2)
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Created on 2018-06-14 by the reprex package (v0.2.0).
Related
I am trying to reorder the bars in a ggplot2 bar plot from the highest values to the lowest values, so that the highest values are at the top of the bar chart and the lowest values are at the bottom.
I've used this stack overflow post in other plots and it works with no problem.
However, ggPlot2 seems to have a problem when there are the same values in both facets. It does not produce the correct ordering in the plot.
Here is what it looks like now. As you can see, it is out of order. Ideally, I'd like the Unvax_to_Vax facet to read (from top to bottom): safe, sheep, good, dumb, stupid, scared and I'd like the Vax_to_Unvax facet to read (from top to bottom): stupid, selfish, ignorant, dumb, unsafe, foolish.
Here is the data and code to reproduce the figure.
df <- structure(list(Var1 = structure(c(8L, 7L, 4L, 1L, 9L, 2L, 5L,
10L, 3L, 1L, 8L, 6L), .Label = c("dumb", "foolish", "good", "ignorant",
"safe", "scared", "selfish", "stupid", "unsafe", "sheep"), class = "factor"),
Freq = c(101L, 94L, 47L, 33L, 29L, 24L, 27L, 22L, 18L, 15L,
15L, 11L), Percent = c(8.82096069868996, 8.20960698689956,
4.10480349344978, 2.882096069869, 2.53275109170306, 2.09606986899563,
5.54414784394251, 4.51745379876797, 3.69609856262834, 3.08008213552361,
3.08008213552361, 2.25872689938398), Group = c("Vax_to_Unvax",
"Vax_to_Unvax", "Vax_to_Unvax", "Vax_to_Unvax", "Vax_to_Unvax",
"Vax_to_Unvax", "Unvax_to_Vax", "Unvax_to_Vax", "Unvax_to_Vax",
"Unvax_to_Vax", "Unvax_to_Vax", "Unvax_to_Vax")), row.names = c(319L,
292L, 147L, 82L, 375L, 98L, 173L, 182L, 76L, 54L, 190L, 176L), class = "data.frame")
# Horizontal bars of Percent per word, one free-scaled panel per Group.
# reorder(Var1, Freq) computes ONE ordering of the Var1 levels for the whole
# data frame (by the mean Freq of each level), so a word that occurs in both
# groups gets a single position shared by both panels.
ggplot(df, aes(x = reorder(Var1, Freq), y = Percent, fill = Group)) +
  geom_col() +
  facet_wrap(Group ~ ., scales = "free") +
  coord_flip()
Thank you for your help.
The code I used and the result can be seen in the image below. The main problem is that the title doesn't appear in the center and the x and y labels don't appear at all. How do I fix this?
The graph and code
You should upload your code as a snippet and your data so we can reproduce this on our own machines easily...
Take the example below. You can recreate the data set and then run the code immediately.
Using ggtitle, xlab, ylab you can plot the text and center it with theme.
If this does not help you have the wrong print / render settings.
# Load the packages first: data.table() is needed to build the example data.
library(ggplot2)
library(reshape2)
library(data.table)

# Self-contained example data: read counts per genus (rows) and sample (S1..S10).
balloon <- data.table(structure(list(
  Genera = c("Prevotella", "Treponema", "Fusobacterium", "Selenomonas",
             "Veillonella", "Porphyromonas", "Streptococcus", "Leptotrichia",
             "Aggregatibacter", "Succiniclasticum"),
  S1 = c(97L, 28L, 11L, 40L, 5L, 13L, 10L, 24L, 0L, 16L),
  S3 = c(5370L, 3760L, 5551L, 2087L, 533L, 873L, 1330L, 5877L, 1213L, 44L),
  S4 = c(7892L, 8004L, 11017L, 19712L, 5115L, 2695L, 7451L, 13611L, 301L,
         2557L),
  S5 = c(23L, 79L, 30L, 7L, 0L, 34L, 0L, 2L, 2L, 0L),
  S6 = c(8310L, 3379L, 38058L, 1133L, 2506L, 17811L, 12103L, 403L, 668L, 3L),
  S2 = c(7379L, 14662L, 10085L, 148L, 1502L, 5222L, 1010L, 2463L, 4790L, 28L),
  S7 = c(6238L, 18977L, 2674L, 2198L, 27L, 2999L, 174L, 1197L, 5268L, 5L),
  S8 = c(20019L, 18674L, 15306L, 1472L, 1898L, 9600L, 1683L, 2221L, 3435L,
         1109L),
  S9 = c(153L, 12L, 23L, 36L, 15L, 15L, 6L, 41L, 0L, 30L),
  S10 = c(20103L, 29234L, 10857L, 2869L, 4923L, 14206L, 1415L, 4574L, 649L,
          2160L)
), .Names = c("Genera", "S1", "S3", "S4", "S5", "S6", "S2", "S7", "S8", "S9",
              "S10"),
class = c("data.table", "data.frame"), row.names = c(NA, -10L)))

# To use your own file instead of the example data above, read it here:
# balloon <- fread("Downloads/balloon.csv")
balloon

# Long format: one row per (Genera, sample) pair. Naming the id column
# explicitly avoids relying on melt() guessing it from the column types.
balloon_melted <- melt(balloon, id.vars = "Genera")
head(balloon_melted)

p <- ggplot(balloon_melted, aes(x = variable, y = Genera))
p +
  geom_point(aes(size = value)) +
  theme(panel.background = element_blank(),
        panel.border = element_rect(colour = "blue", fill = NA, size = 1)) +
  ggtitle("Pretty title") +
  xlab("x lab label") +
  ylab("y lab label") +
  # hjust = 0.5 centres the title horizontally
  theme(plot.title = element_text(hjust = 0.5))
I have a simple (yet very large) data set of counts made at different sites from Apr to Aug.
Between mid Apr and July there are no zero counts - yet a line at zero extends from the earliest to latest date.
Here is the part of the data used to make the above chart (columns are- Site.ID, DATE, Visible Number):
data=structure(list(Site.ID = c(302L, 302L, 302L, 302L, 302L, 302L,
302L, 302L, 302L, 302L, 302L, 302L, 304L, 304L, 304L, 304L, 304L,
304L, 304L, 304L, 304L, 304L, 304L, 304L), DATE = structure(c(1L,
2L, 5L, 3L, 4L, 6L, 8L, 7L, 9L, 10L, 11L, 12L, 1L, 2L, 5L, 3L,
4L, 6L, 8L, 7L, 9L, 10L, 11L, 12L), .Label = c("3/21/2014", "3/27/2014",
"4/17/2014", "4/28/2014", "4/8/2014", "5/13/2014", "6/17/2014",
"6/6/2014", "7/10/2014", "7/22/2014", "7/29/2014", "8/5/2014"
), class = "factor"), Visible.Number = c(0L, 0L, 5L, 14L, 20L,
21L, 6L, 8L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 7L, 7L, 7L, 7L, 5L,
0L, 0L, 0L, 0L)), .Names = c("Site.ID", "DATE", "Visible.Number"
), class = "data.frame", row.names = c(NA, -24L))
# attach() puts the columns of `data` on the search path so they can be
# referred to by bare name (DATE, Visible.Number, ...).
attach(data)
# This creates a new variable DATE in the workspace holding the parsed dates;
# the DATE column inside `data` is not modified and stays a factor.
DATE<-as.Date(DATE,"%m/%d/%Y")
# type="l" joins all rows in their stored order as one continuous line, so the
# rows of both sites are drawn as a single path.
plot(data$Visible.Number~DATE, type="l", ylab="Visible Number")
I have two sites but there are three lines. How to make R not plot a line along zero?
Thank you for your help!
Your problem is with the multiple site IDs. It plots the first one, then goes back (drawing a line) to draw the second one. Essentially, base plotting tries to draw all the lines without "lifting the pen". With base plotting, your option is to plot them separately with lines, perhaps in a for loop. I think stuff like this is easier with ggplot2.
library(ggplot2)

# ggplot() looks DATE up inside `data`, where it is a factor of "m/d/Y"
# strings. Convert the column itself so the x axis is a real, chronologically
# ordered date axis (a no-op if the column is already of class Date).
data$DATE <- as.Date(data$DATE, format = "%m/%d/%Y")

# group = Site.ID makes geom_line() draw one separate line per site.
ggplot(data, aes(x = DATE, y = Visible.Number, group = Site.ID)) +
  geom_line()

# if you prefer more base-like styling
ggplot(data, aes(x = DATE, y = Visible.Number, group = Site.ID)) +
  geom_line() +
  theme_bw()
In base:
# The DATE column must be of class Date for plot() to set up a date axis
# (a no-op if the column has already been converted).
data$DATE <- as.Date(data$DATE, format = "%m/%d/%Y")

# Set up empty axes covering the full range of both sites ...
plot(data$DATE, data$Visible.Number, type = "n",
     ylab = "Visible Number", xlab = "Date")
# ... then add one line per site, so the pen is "lifted" between sites.
for (site in unique(data$Site.ID)) {
  lines(Visible.Number ~ DATE, data = data[data$Site.ID == site, ])
}
N.B. I did not attach my data as you did, so I don't know if the subsetting in the base solution will work properly for you if you do attach. In general, avoid attach; with is a nice way to save typing without attaching, and is much less "risky" in that it doesn't copy your data columns into isolated vectors, thus making them more difficult to keep track of as you subset or otherwise work with your data.
I have factors on x-axis and order those factor levels in a way that's intuitive to plot with ggplot. It works fine. However, when I use the subset command within ggplot, it re-orders my original sequence of factors.
Is it possible to do subsetting within ggplot and preserve the order of factor levels?
Here is the data and code:
library(ggplot2)
library(plyr)
dat <- structure(list(SubjectID = structure(c(12L, 4L, 6L, 7L, 12L,
7L, 5L, 8L, 14L, 1L, 15L, 1L, 7L, 1L, 7L, 5L, 4L, 2L, 9L, 6L,
7L, 13L, 12L, 2L, 15L, 3L, 5L, 13L, 13L, 10L, 7L, 8L, 10L, 10L,
1L, 10L, 12L, 7L, 6L, 10L), .Label = c("s001", "s002", "s003",
"s004", "s005", "s006", "s007", "s008", "s009", "s010", "s011",
"s012", "s013", "s014", "s015"), class = "factor"), Parameter = structure(c(7L,
3L, 5L, 3L, 6L, 4L, 6L, 7L, 7L, 4L, 7L, 12L, 8L, 11L, 1L, 4L,
3L, 4L, 6L, 4L, 6L, 6L, 12L, 5L, 12L, 1L, 7L, 13L, 11L, 1L, 4L,
1L, 6L, 13L, 10L, 10L, 10L, 13L, 5L, 8L), .Label = c("(Intercept)",
"c0.008", "c0.01", "c0.015", "c0.02", "c0.03", "PrevCorr1", "PrevFail1",
"c0.025", "c0.004", "c0.006", "c0.009", "c0.012", "c0.005"), class = "factor"),
Weight = c(0.0352725634087837, 1.45546697427904, 2.29457594510248,
0.479548914792514, 6.39680995359234, 1.48829600339586, 2.69253113220079,
-0.171219812386926, -0.453625394224277, 1.43732884325816,
0.742416863226952, 0.256935761466245, -0.29401087047524,
0.34653127811481, 0.33120592543102, 2.79213318878505, 2.47047299128637,
1.022450287681, 6.92891513416868, 0.648982326396105, 6.58336282626389,
6.40600461501379, 1.80062359655524, 3.86658202530889, 1.23833324887194,
-0.026560261876089, 0.121670468861011, 0.9290824087063, 0.349104382483186,
0.24722583823016, 1.82473621255801, -0.712668411699556, 6.51789901685784,
0.74682257127003, 0.0755807984938072, 0.131705709322157,
0.246465073382095, 0.876279316248929, 1.83442709571662, -0.579086982613267
)), .Names = c("SubjectID", "Parameter", "Weight"), row.names = c(2924L,
784L, 1537L, 1663L, 3138L, 1744L, 1266L, 1996L, 3548L, 86L, 3692L,
230L, 1613L, 213L, 1627L, 1024L, 832L, 384L, 2418L, 1568L, 1714L,
3362L, 3200L, 497L, 3632L, 683L, 1020L, 3281L, 3263L, 2779L,
1632L, 1995L, 2674L, 2753L, 312L, 2638L, 3198L, 1809L, 1569L,
2589L), class = "data.frame")
## Arrange the Parameter levels in the order they should appear on the x-axis:
## the non-contrast terms first ("(Intercept)", "PrevCorr1", "PrevFail1"),
## followed by the contrast terms ("c0.004", "c0.005", ...) in sorted order.
paramNames <- levels(dat$Parameter)
contrastNames <- sort(grep("c0", paramNames, value = TRUE))
biasNames <- setdiff(paramNames, contrastNames)
dat$Parameter <- factor(dat$Parameter, levels = c(biasNames, contrastNames))

## Colour group for each row: "Contrast" by default, overridden for the
## intercept and the two "Prev*" parameters.
dat$plotColor <- "Contrast"
dat$plotColor[dat$Parameter == "(Intercept)"] <- "Intercept"
dat$plotColor[grepl("PrevCorr", dat$Parameter)] <- "PrevSuccess"
dat$plotColor[grepl("PrevFail", dat$Parameter)] <- "PrevFail"
# Mean Weight per Parameter, drawn as a large coloured point with a smaller
# white point on top of it.
p <- ggplot(dat, aes(x=Parameter, y=Weight)) +
# The line layer below is restricted to the rows with plotColor=="Contrast".
# With it active as the FIRST layer, the x-axis no longer follows the factor
# level order set above.
#geom_line(subset=.(plotColor=="Contrast"), aes(group=1), stat="summary", fun.y="mean", color="grey50", size=1) +
# Large point, coloured by parameter type (one mean per Parameter)
geom_point(aes(group=Parameter, color=plotColor), size=5, stat="summary", fun.y="mean") +
# Smaller white point drawn over the coloured one
geom_point(aes(group=Parameter), size=2.5, color="white", stat="summary", fun.y="mean") +
# Rotate the x-axis labels by 45 degrees
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))
print(p)
Here is the plot when geom line is commented
And here is what happens when geom_line is uncommented
If you switch the order in which you plot the objects, the problem disappears:
p <- ggplot(dat, aes(x = Parameter, y = Weight)) +
  # Point layer first: it uses the full data set, so the x-axis is set up
  # from all Parameter levels before the subsetted line layer is added.
  geom_point(aes(group = Parameter, color = plotColor), size = 5,
             stat = "summary", fun.y = "mean") +
  # Line through the means of the "Contrast" parameters only. The rows are
  # selected with a layer-level `data` argument; the old `subset = .()`
  # layer argument is not supported by current ggplot2 releases.
  # subset() keeps all factor levels, so the level order is unchanged.
  geom_line(data = subset(dat, plotColor == "Contrast"), aes(group = 1),
            stat = "summary", fun.y = "mean", color = "grey50", size = 1) +
  # Smaller white point drawn on top of the coloured one
  geom_point(aes(group = Parameter), size = 2.5, color = "white",
             stat = "summary", fun.y = "mean") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
# NOTE: `fun.y` is the argument name of the ggplot2 version this was written
# for; ggplot2 >= 3.3.0 names it `fun`.
print(p)
I think the problem lies in plotting the subsetted data first, it ditches the levels for the original data, and when you add back in the points, it doesn't know where to put them. When you plot with the original data first, it maintains the levels. I'm not sure though, you might have to take my word on it.
I'd like to plot this vs. time, with the actual dates (years actually: 1997, 1998, ..., 2010). The dates are in a raw format, à la SAS: days since 1960 (hence the as.Date conversion). If I convert the dates using as.Date to variable x and do the GAM plot, I get an error. It works fine with the raw day numbers. But I want the plot to display the years (the data are not equally spaced).
structure(list(site = c(928L, 928L, 928L, 928L, 928L, 928L, 928L,
928L, 928L, 928L, 928L, 928L, 928L, 928L, 928L, 928L, 928L, 928L,
928L, 928L, 928L, 928L, 928L, 928L, 928L, 928L), date = c(13493L,
13534L, 13566L, 13611L, 13723L, 13752L, 13804L, 13837L, 13927L,
14028L, 14082L, 14122L, 14150L, 14182L, 14199L, 16198L, 16279L,
16607L, 16945L, 17545L, 17650L, 17743L, 17868L, 17941L, 18017L,
18092L), y = c(7L, 7L, 17L, 18L, 17L, 17L, 10L, 3L, 17L, 24L,
11L, 5L, 5L, 3L, 5L, 14L, 2L, 9L, 9L, 4L, 7L, 6L, 1L, 0L, 5L,
0L)), .Names = c("site", "date", "y"), class = "data.frame", row.names = c(NA,
-26L))
# Fit a smooth of y against the raw day number and get fitted values with
# standard errors. `sites` is presumably the data frame shown above - confirm.
sgam1 <- gam(sites$y ~ s(sites$date))
sgam <- predict(sgam1, se=TRUE)
# Scatter plot on the raw day-number scale, with the default x-axis suppressed
plot(sites$date,sites$y,xaxt="n", xlab='Time', ylab='Counts')
x<-as.Date(sites$date, origin="1960-01-01")
# Ticks are requested at x = 1..26 (row indices), but the plotted x values are
# day numbers in the range 13493-18092, so these positions are off the plot.
axis(1, at=1:26,labels=x)
# Fitted curve (solid) and fit +/- 1.96 standard errors (dashed)
lines(sites$date,sgam$fit, lty = 1)
lines(sites$date,sgam$fit + 1.96* sgam$se, lty = 2)
lines(sites$date,sgam$fit - 1.96* sgam$se, lty = 2)
ggplot2 has a solution (it doesn't mind the as.date thing) but it gives me other problems...
Use the origin= argument to as.Date() to specify a particular offset:
R> as.Date(c(928, 928, 930), origin="1960-01-01")
[1] "1962-07-17" "1962-07-17" "1962-07-19"
R>
Once you have a Date type for your data, you have options for formatting the axis as you wish.
# Read the tab-separated data; `date` holds days since 1960-01-01.
sites <- read.table("349.txt", header = TRUE, sep = "\t", quote = "\"",
                    dec = ".")
# Calendar dates, used only for the x-axis of the plot
p <- as.Date(sites$date, origin = "1960-01-01")

# Fit the smooth on the raw day numbers. Column names plus `data =` are used
# instead of `sites$...` inside the formula so the model stays usable with
# predict(newdata = ...). gam() must come from a package loaded elsewhere
# (e.g. mgcv) - not shown in this snippet.
sgam1 <- gam(y ~ s(date), data = sites)
# Spell out se.fit (argument and list element) rather than relying on
# partial matching of `se`.
sgam <- predict(sgam1, se.fit = TRUE)

# Plotting against the Date vector gives a date-formatted x-axis.
plot(p, sites$y, xlab = "Time", ylab = "Counts")
lines(p, sgam$fit, lty = 1)
# Approximate 95% band: fit +/- 1.96 standard errors
lines(p, sgam$fit + 1.96 * sgam$se.fit, lty = 2)
lines(p, sgam$fit - 1.96 * sgam$se.fit, lty = 2)
This works!