Preserve timestamp after decomposing xts in R - r

I have an xts timeseries called hourplot in R with a period of 24 (hourly data) over two weeks, indexed by timestamp objects of POSIXlt class, like the following:
> dput(hourplot)
structure(c(1, 1, 1, 1, 1, 1, 1.11221374045802, 1.3368, 1.18,
1.0032, 1, 1, 1, 1, 1, 1, 1.0736, 1.2536, 1, 1.0032, 1.1856,
1.0048, 1, 1, 1, 1, 1, 1, 1, 1, 1.04045801526718, 1.20229007633588,
1.00229007633588, 1, 1, 1, 1, 1, 1, 1, 1.1152, 1.008, 1, 1, 1.2648,
1.1832, 1, 1, 1, 1, 1, 1, 1, 1.0424, 1.2952, 1.6496, 1.1208,
1.0216, 1, 1, 1, 1, 1, 1, 1.1256, 1, 1, 1, 1.0192, 1.3056, 1,
1.008, 1, 1, 1, 1, 1, 1, 1.0192, 1.0976, 1.0016, 1, 1, 1, 1,
1, 1, 1.0016, 1.6512, 1.8112, 1, 1, 1.2712, 1.0288, 1.0248, 1.1984,
1.0736, 1, 1, 1, 1, 1, 1.112, 1.336, 1.6224, 1.38, 2.2848, 1.628,
1, 1, 1, 1.0464, 1.4328, 1.6088, 1, 1, 1.0208, 1.2088, 1.02666666666667,
1.0784, 1.16, 1, 1.0064, 1.0616, 1.008, 1.0768, 1.4304, 1.3112,
1.0416, 1.00133333333333, 1.028, 1, 1, 1.1736, 1, 1.1264, 2.6744,
1.4696, 1, 1, 1.262, 1.2576, 1.0288, 1.1112, 1.008, 1, 1.01866666666667,
1.01733333333333, 1, 1, 1.294, 1.5744, 1.264, 1.182, 1.008, 1,
1, 1, 1, 1, 1, 1, 1.072, 1.8, 1.6424, 1.488, 1.1176, 1, 1, 1,
1, 1, 1.012, 1.2904, 1, 1, 1, 1, 1, 1.3072, 1.2056, 1.06, 1.0016,
1, 1, 1, 1.048, 1.0688, 1, 1, 1, 1, 1, 1, 1, 1, 1.51333333333333,
1.362, 1, 1, 1.0416, 1.03733333333333, 1.0288, 1.0712, 1, 1,
1, 1, 1, 1, 1.1664, 1.3464, 1.004, 1.0016, 1, 1, 1, 1, 1, 1,
1.15801526717557, 1.18396946564885, 1, 1, 1.1128, 1.2352, 1.0712,
1, 1, 1, 1, 1, 1, 1, 1.0744, 1.0048, 1, 1, 1, 1, 1, 1, 1, 1.0512,
1.0616, 1.4352, 1, 1.2064, 1.3368, 1.3296, 1.1608, 1.6696, 1.0328,
1.1976, 1.0912, 1.2024, 1, 1.0128, 1.1528, 1.26, 1, 1, 1.0192,
1, 1, 1, 1, 1, 1.704, 1.6152, 1, 1, 1.088, 1.4096, 1.0832, 1.1224,
1.0144, 1, 1.336, 1.552, 1.2248, 1.168, 1.4288, 1.4224, 1.2536,
1.0048, 1, 1, 1, 1.012, 1, 1, 1.21, 1.008, 1, 1, 1, 1.016, 1,
1.0048, 1, 1, 1.0176, 1.068, 1, 1, 1.0056, 1.2408, 1.0016, 1,
1, 1, 1, 1, 1.1632, 1.124, 1, 1, 1, 1, 1.0032, 1.0544, 1.0112,
1.008, 1.016, 1.0208, 1.084, 1.1688, 1.2384, 1.1736, 1.6168,
1.5984, 1.2784, 1.0608, 1, 1, 1, 1, 1.02824427480916, 1, 1.3064,
1.5216, 1, 1, 1.0672, 1.051), .Dim = c(346L, 1L), index = structure(c(1484722282,
1484725287, 1484729469, 1484731265, 1484737199, 1484740697, 1484744294,
1484747896, 1484751493, 1484755097, 1484758693, 1484762294, 1484765898,
1484769495, 1484773093, 1484776694, 1484780298, 1484783899, 1484787499,
1484791095, 1484794698, 1484798299, 1484800723, 1484805577, 1484809098,
1484812688, 1484816293, 1484819889, 1484823492, 1484827094, 1484830692,
1484834292, 1484837891, 1484841494, 1484845158, 1484848699, 1484852298,
1484855895, 1484859499, 1484863096, 1484866699, 1484870302, 1484873901,
1484877501, 1484881100, 1484884696, 1484888301, 1484891900, 1484895499,
1484899097, 1484902699, 1484906297, 1484909902, 1484913499, 1484917102,
1484920702, 1484924298, 1484927902, 1484931499, 1484935101, 1484938698,
1484942300, 1484945897, 1484949495, 1484953100, 1484956702, 1484960299,
1484963902, 1484967501, 1484971104, 1484974700, 1484978300, 1484981900,
1484985500, 1484989099, 1484992701, 1484996299, 1484999900, 1485003503,
1485007104, 1485010704, 1485014299, 1485017903, 1485021500, 1485025102,
1485028701, 1485032300, 1485035899, 1485039502, 1485043100, 1485046701,
1485050304, 1485053906, 1485057500, 1485061102, 1485064701, 1485068302,
1485071901, 1485075504, 1485079101, 1485082703, 1485086300, 1485089903,
1485093500, 1485097100, 1485100702, 1485104305, 1485107903, 1485111501,
1485115105, 1485118701, 1485122306, 1485125905, 1485129506, 1485133103,
1485136701, 1485140306, 1485143906, 1485147503, 1485151105, 1485154703,
1485158303, 1485161904, 1485165481, 1485169077, 1485172682, 1485176276,
1485179879, 1485183479, 1485187080, 1485190681, 1485194277, 1485197877,
1485201478, 1485205077, 1485208680, 1485212281, 1485215878, 1485219477,
1485223082, 1485226680, 1485230278, 1485233881, 1485237478, 1485241076,
1485244677, 1485248282, 1485251882, 1485255482, 1485259196, 1485262680,
1485265335, 1485312724, 1485316675, 1485320277, 1485323879, 1485327478,
1485331075, 1485334678, 1485338280, 1485341881, 1485345479, 1485349077,
1485352684, 1485356277, 1485359878, 1485363478, 1485367079, 1485370672,
1485374276, 1485377885, 1485381477, 1485385080, 1485388679, 1485392280,
1485395878, 1485399478, 1485403082, 1485406677, 1485410284, 1485413877,
1485417477, 1485421081, 1485424679, 1485428278, 1485431880, 1485435481,
1485439080, 1485442680, 1485446279, 1485449884, 1485453481, 1485457082,
1485460680, 1485464280, 1485467876, 1485471483, 1485475081, 1485478680,
1485482280, 1485485880, 1485489485, 1485493082, 1485496679, 1485500280,
1485503879, 1485507485, 1485511079, 1485514682, 1485518285, 1485521885,
1485525482, 1485529085, 1485532684, 1485536281, 1485539882, 1485543484,
1485547081, 1485550679, 1485554281, 1485557884, 1485561483, 1485565082,
1485568685, 1485572287, 1485575886, 1485579483, 1485583083, 1485586685,
1485590282, 1485593886, 1485597487, 1485601085, 1485604681, 1485608285,
1485611885, 1485615484, 1485619082, 1485622681, 1485626287, 1485629882,
1485633484, 1485637083, 1485640681, 1485644283, 1485647889, 1485651484,
1485655086, 1485658686, 1485662288, 1485665889, 1485669486, 1485673085,
1485676685, 1485680283, 1485683886, 1485687488, 1485691085, 1485694687,
1485698288, 1485701886, 1485705489, 1485709089, 1485712685, 1485716287,
1485719884, 1485723484, 1485727084, 1485730688, 1485734287, 1485737884,
1485741487, 1485745088, 1485748690, 1485752291, 1485755885, 1485759487,
1485763085, 1485766686, 1485770289, 1485773889, 1485777486, 1485781093,
1485784691, 1485788287, 1485791887, 1485795492, 1485799088, 1485802689,
1485806287, 1485809890, 1485813491, 1485817088, 1485820693, 1485824289,
1485827888, 1485831491, 1485835093, 1485838688, 1485842289, 1485845889,
1485849489, 1485853090, 1485856691, 1485860290, 1485863888, 1485867490,
1485871089, 1485874693, 1485878289, 1485881888, 1485885488, 1485889091,
1485892688, 1485896288, 1485899890, 1485903494, 1485907096, 1485910694,
1485914292, 1485917890, 1485921490, 1485925090, 1485928695, 1485932291,
1485935888, 1485939492, 1485943093, 1485946690, 1485950293, 1485953895,
1485957493, 1485961096, 1485964692, 1485968291, 1485971892, 1485975492,
1485979084, 1485982689, 1485986289, 1485989895, 1485993493, 1485997092,
1486000694, 1486004292, 1486006761), tzone = "", tclass = c("POSIXlt",
"POSIXt")), .indexCLASS = c("POSIXlt", "POSIXt"), .indexTZ = "", tclass = c("POSIXlt",
"POSIXt"), tzone = "", class = c("xts", "zoo"), frequency = 24)
Now, I want to decompose this timeseries and look at the seasonality, so I run:
dec <- decompose(as.ts(hourplot))
plot(dec)
The plot I get looks like:
Now, I want to have the timestamps mentioned under the individual trend and seasonality plots too, but R seems to strip off the timestamp and put 1-14 in the X axis instead.
How can I preserve the timestamps in the plots?

Here's a function that will decompose an xts series and return an object of class "decomposed.xts".
decompose.xts <-
function (x, type = c("additive", "multiplicative"), filter = NULL)
{
dts <- decompose(as.ts(x), type, filter)
dts$x <- .xts(dts$x, .index(x))
dts$seasonal <- .xts(dts$seasonal, .index(x))
dts$trend <- .xts(dts$trend, .index(x))
dts$random <- .xts(dts$random, .index(x))
with(dts,
structure(list(x = x, seasonal = seasonal, trend = trend,
random = if (type == "additive") x - seasonal - trend else x/seasonal/trend,
figure = figure, type = type), class = "decomposed.xts"))
}
And here's a plot.decomposed.xts() method:
plot.decomposed.xts <-
function(x, ...)
{
xx <- x$x
if (is.null(xx))
xx <- with(x,
if (type == "additive") random + trend + seasonal
else random * trend * seasonal)
p <- cbind(observed = xx, trend = x$trend, seasonal = x$seasonal, random = x$random)
plot(p, main = paste("Decomposition of", x$type, "time series"), multi.panel = 4,
yaxis.same = FALSE, major.ticks = "days", grid.ticks.on = "days", ...)
}
And an example of using it on your data:
dex <- decompose.xts(hourplot)
plot(dex)

I would translate the decomposed data into a dataframe:
n = length(index(hourplot))
df1 = data.frame(date = index(hourplot), name = rep("random", n), data = as.numeric(dec$random))
df2 = data.frame(date = index(hourplot), name = rep("seasonal", n), data = as.numeric(dec$seasonal))
df3 = data.frame(date = index(hourplot), name = rep("trend", n), data = as.numeric(dec$trend))
df4 = data.frame(date = index(hourplot), name = rep("observed", n), data = as.numeric(dec$x))
df = rbind(df1, df2, df3, df4)
And then use ggplot2:
library(ggplot2)
ggplot(df, aes(x = date, y = data)) +
geom_line() +
facet_wrap(~name, ncol = 1, scales = "free") +
scale_x_datetime(date_labels = "%d", date_breaks = "24 hours")
I'll leave it to you to fine-tune the scales and labels in scale_x_datetime.

Related

Calculating t-test with apply (by row) returns data are essentially constant

I'm trying to use the t-test with just two values:
1 and n.
This works:
a <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
8.97, 8.97, 8.97, 8.97, 8.97, 8.97, 8.97, 8.97, 8.97, 8.97, 8.97, 8.97, 8.97, 8.97, 8.97)
or
a <- rep(c(1, 8.97), each = 15)
and:
t.test(x = a, mu = 2.33, alternative = "greater")
One Sample t-test
data: a
t = 3.5879, df = 29, p-value = 0.0006046
alternative hypothesis: true mean is greater than 2.33
95 percent confidence interval:
3.727653 Inf
sample estimates:
mean of x
4.985
Right, but:
variavel <- seq(from = 1, to = 9, by = .01)
df <- data.frame(fix1 = 1, fix2 = 1, fix3 = 1, fix4 = 1, fix5 = 1, fix6 = 1, fix7 = 1, fix8 = 1, fix9 = 1, fix10 = 1, fix11 = 1, fix12 = 1, fix13 = 1, fix14 = 1, fix15 = 1,
ge1 = variavel, ge2 = variavel, ge3 = variavel, ge4 = variavel, ge5 = variavel, ge6 = variavel, ge7 = variavel, ge8 = variavel, ge9 = variavel, ge10 = variavel, ge11 = variavel, ge12 = variavel, ge13 = variavel, ge14 = variavel, ge15 = variavel
)
When I go to calculate the test for each line and obtain the p-value,
apply(X = df, MARGIN = 1, FUN = function(x) {
t.test(x = x, mu = 2.33, alternative = "greater")$p.value
})
it gives me the error:
Error in t.test.default(x = x, mu = 2.33) :
data are essentially constant
What happens?
Your first row is invariant, and since that's an error, the rest of the data is never tested.
table(unlist(df[1,]))
# 1
# 30
You have two options:
Omit row 1 in your calculation:
head(apply(X = df[-1,], MARGIN = 1, FUN = function(x) {
t.test(x = x, mu = 2.33, alternative = "greater")$p.value
}))
# 2 3 4 5 6 7
# 1 1 1 1 1 1
Catch it inside the loop:
head(apply(X = df, MARGIN = 1, FUN = function(x) {
tryCatch(t.test(x = x, mu = 2.33, alternative = "greater")$p.value,
error = function(e) NA_real_)
}))
# [1] NA 1 1 1 1 1

How To Obtain Monthly Data from Quarterly Data

Problem:
I have quarterly level data. I need to perform a month over month analysis. Is there a function, or a ggplot feature that will interpolate the quarterly data and fill in monthly data?
Reference Data:
dput(HPF[1:25, ])
structure(list(region = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), path = c(1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
), date = structure(c(16116, 16205, 16297, 16389, 16481, 16570,
16662, 16754, 16846, 16936, 17028, 17120, 17212, 17301, 17393,
17485, 17577, 17666, 17758, 17850, 17942, 18031, 18123, 18215,
18307), class = "Date"), index_value = c(1, 1.033852765, 1.041697122,
1.038876363, 1.041043093, 1.060900982, 1.073728928, 1.075879441,
1.080898915, 1.10368893, 1.119240863, 1.122827602, 1.128639801,
1.15275796, 1.169021733, 1.172707492, 1.178666441, 1.203634882,
1.220348482, 1.223890323, 1.229770019, 1.255791539, 1.273560554,
1.278236959, 1.285508086), index = c(0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
24), counter = 1:25, BaseQoQ = c(0, 0.033852765, 0.00758749917354051,
-0.00270784947028013, 0.00208564760655761, 0.0190749923163842,
0.0120915582298895, 0.00200284535874973, 0.00466546139717505,
0.0210843166587877, 0.0140908661646175, 0.00320461762840418,
0.00517639483536669, 0.0213692260175751, 0.0141085757499344,
0.00315285755256367, 0.00508136004984272, 0.0211836361259394,
0.0138859385432799, 0.00290231933930496, 0.00480410367620832,
0.021159663675294, 0.0141496533844698, 0.00367191413499146, 0.00568840303732765
), fdate = structure(c(16116, 16205, 16297, 16389, 16481, 16570,
16662, 16754, 16846, 16936, 17028, 17120, 17212, 17301, 17393,
17485, 17577, 17666, 17758, 17850, 17942, 18031, 18123, 18215,
18307), class = "Date"), StressC = c(0.99749, 1.031342765, 1.039187122,
1.036366363, 1.038533093, 1.058390982, 1.071218928, 1.073369441,
1.078388915, 1.10117893, 1.116730863, 1.120317602, 1.126129801,
1.15024796, 1.166511733, 1.170197492, 1.176156441, 1.201124882,
1.217838482, 1.221380323, 1.229770019, 1.255791539, 1.273560554,
1.278236959, 1.285508086), StressQoQ = c(0, 0.0339379492526242,
0.00760596502560418, -0.0027143898728953, 0.00209069888540969,
0.0191210941026796, 0.0121202336548254, 0.00200753827606026,
0.00467637125510434, 0.0211333913794913, 0.0141229845362187,
0.00321182042946733, 0.00518799221722843, 0.021416855302633,
0.0141393626118667, 0.00315964160130755, 0.00509225924746737,
0.021228843485116, 0.0139149560969629, 0.00290830110260876, 0.0068690282969297,
0.021159663675294, 0.0141496533844698, 0.00367191413499146, 0.00568840303732765
)), .Names = c("region", "path", "date", "index_value", "index",
"counter", "BaseQoQ", "fdate", "StressC", "StressQoQ"), row.names = c(NA,
-25L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars = "region", drop = TRUE, indices = list(
0:24), group_sizes = 25L, biggest_group_size = 25L, labels = structure(list(
region = 1), row.names = c(NA, -1L), class = "data.frame", vars = "region", drop = TRUE, .Names = "region"))
Any insight would be much appreciated! I hope the data I have provided is enough to offer suggestions.
Goal:
I only need to plot/graph the monthly information, I do not need it reference in a data.frame.
So this is what you can do:
1) create a vector of dates to interpolate
months <- lapply(X = data$date, FUN = seq.Date, by = "month", length.out = 3)
months <- data.frame(date = do.call(what = c, months))
2) left join you date.frame to the months data.frame to create NAs for extrapolation
library(dplyr)
monthly_data <- left_join(x = months, y = data, by = "date")
3) use one of na.locf() na.appox() na.spline() to interpolate f.ex StressC
library(zoo)
monthly_data$StressC <- na.spline(object = monthly_data$StressC)
Note: the extrapolations above are
na.locf() - most recent point
na.appox() - linear
na.spline() - spline (often used in graphics)
Run the following (before interpolation) to see the difference:
plot(x = monthly_data$StressC, ylab = "StressC", xlab="", xaxt = "n")
lines(x = na.locf(monthly_data$StressC), col = "red")
lines(x = na.approx(monthly_data$StressC), col = "green")
lines(x = na.spline(monthly_data$StressC), col = "blue")
Can also do something like this to get gglot:
ggplot(monthly_data, aes(x=date)) +
geom_point(aes(y = StressC), colour="black") +
geom_line(aes(y = na.locf(StressC)), col="red") +
geom_line(aes(y = na.spline(StressC)), col="red")

top_n function returning more rows than expected

I'm pretty new to r (and pretty tired- I imagine my brain is just not currently working properly) but to me the below code should only return 10 rows- it returns 66. Why is this?
library(dplyr)
a <- structure(list(calls_in_range = c(17, 14, 6, 4, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), mid_point = c(-20680,
-20660, -20640, -20700, -36900, -36500, -36380, -36020, -35960,
-35260, -35140, -34640, -33060, -32600, -30920, -29340, -29100,
-28780, -27980, -27640, -27220, -27160, -26980, -26800, -26740,
-26500, -25640, -25540, -24840, -24820, -24800, -24380, -23880,
-23820, -23720, -23320, -22220, -21920, -21860, -21760, -21060,
-20240, -19780, -18700, -18500, -17500, -16740, -16500, -14260,
-14200, -14120, -13860, -13420, -13120, -12780, -12740, -12460,
-12420, -12280, -12260, -11720, -10660, -10060, -9960, -6380,
-5520), lower_range = c(-20690, -20670, -20650, -20710, -36910,
-36510, -36390, -36030, -35970, -35270, -35150, -34650, -33070,
-32610, -30930, -29350, -29110, -28790, -27990, -27650, -27230,
-27170, -26990, -26810, -26750, -26510, -25650, -25550, -24850,
-24830, -24810, -24390, -23890, -23830, -23730, -23330, -22230,
-21930, -21870, -21770, -21070, -20250, -19790, -18710, -18510,
-17510, -16750, -16510, -14270, -14210, -14130, -13870, -13430,
-13130, -12790, -12750, -12470, -12430, -12290, -12270, -11730,
-10670, -10070, -9970, -6390, -5530), upper_range = c(-20670,
-20650, -20630, -20690, -36890, -36490, -36370, -36010, -35950,
-35250, -35130, -34630, -33050, -32590, -30910, -29330, -29090,
-28770, -27970, -27630, -27210, -27150, -26970, -26790, -26730,
-26490, -25630, -25530, -24830, -24810, -24790, -24370, -23870,
-23810, -23710, -23310, -22210, -21910, -21850, -21750, -21050,
-20230, -19770, -18690, -18490, -17490, -16730, -16490, -14250,
-14190, -14110, -13850, -13410, -13110, -12770, -12730, -12450,
-12410, -12270, -12250, -11710, -10650, -10050, -9950, -6370,
-5510)), class = "data.frame", row.names = c(NA, -66L), .Names = c("calls_in_range",
"mid_point", "lower_range", "upper_range"))
top_n(a, 10, calls_in_range)
If you inspect the calls_in_range column, you can see there are ties. This is the variable used for ordering. According to the documentation for the n argument in the top_n function:
number of rows to return. If x is grouped, this is the number of rows per group. Will include more than n rows if there are ties. If n is positive, selects the top n rows. If negative, selects the bottom n rows.
This is why it is returning more rows than expected.

determining age from min max dates for each item in dataset [duplicate]

This question is very similar to a question asked in another thread which can be found here. I'm trying to achieve something similar: within groups (events) subtract the first date from the last date. I'm using the dplyr package and code provided in the answers of this thread. Subtracting the first date from the last date works, however it does not provide satisfactory results; the resulting time difference is displayed in numbers, and there seems to be no distinction between different time units (e.g., minutes and hours) --> subtractions in first 2 events are correct, however in the 3rd one it is not i.e. should be minutes. How can I manipulate the output by dplyr so that the resulting subtractions are actually a correct reflection of the time difference? Below you will find a sample of my data (1 group only) and the code that I used:
df<- structure(list(time = structure(c(1428082860, 1428083340, 1428084840,
1428086820, 1428086940, 1428087120, 1428087240, 1428087360, 1428087480,
1428087720, 1428088800, 1428089160, 1428089580, 1428089700, 1428090120,
1428090240, 1428090480, 1428090660, 1428090780, 1428090960, 1428091080,
1428091200, 1428091500, 1428091620, 1428096060, 1428096420, 1428096540,
1428096600, 1428097560, 1428097860, 1428100440, 1428100560, 1428100680,
1428100740, 1428100860, 1428101040, 1428101160, 1428101400, 1428101520,
1428101760, 1428101940, 1428102240, 1428102840, 1428103080, 1428103620,
1428103980, 1428104100, 1428104160, 1428104340, 1428104520, 1428104700,
1428108540, 1428108840, 1428108960, 1428110340, 1428110460, 1428110640
), class = c("POSIXct", "POSIXt"), tzone = ""), event = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3)), .Names = c("time",
"event"), class = "data.frame", row.names = c(NA, 57L))
df1 <- df %>%
group_by(event) %>%
summarize(first(time),last(time),difference = last(time)-first(time))
We can use difftime and specify the unit to get all the difference in the same unit.
df %>%
group_by(event) %>%
summarise(First = first(time),
Last = last(time) ,
difference= difftime(last(time), first(time), unit='hour'))

Creating arrow head matching size (or lwd) in ggplot2

To draw arrows in ggplot, I use geom_segment and arrow=arrow().
I would like the arrow head size to match the segment width (or size).
However, arrow does not recognize variables directly from the data argument in ggplot and one must specify data.frame containing the variable using the $ operator. This causes a disjunct between the values used for plotting the line and those used for plotting the arrow head (the largest arrow head can be on the thinest segment).
Example:
d <- structure(list(Process = structure(c(2L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 1L, 2L), .Label = c("First", "Second"), class = "factor"),
x.sink = c(1, 3, 1, 2, 2, 3, 3, 2, 2, 2, 2), y.sink = c(1,
1, 1, 2, 2, 1, 1, 1, 1, 2, 2), x.source = c(2, 2, 2, 2, 2,
2, 2, 1, 1, 1, 3), y.source = c(2, 2, 2, 1, 1, 1, 1, 1, 1,
2, 1), offset = c(1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1),
Std.Flux = c(0.179487179487179, 0.170940170940171, 0.944444444444444,
0.0854700854700855, 0.726495726495726, 0.128205128205128,
0.213675213675214, 0.213675213675214, 0.128205128205128,
0.106837606837607, 1)), .Names = c("Process", "x.sink", "y.sink",
"x.source", "y.source", "offset", "Std.Flux"), class = "data.frame", row.names = c(NA,
-11L))
p <- qplot(data=d,
#alpha=I(0.4),
colour=Process,
size=Std.Flux,
xlim=c(0,4),
ylim=c(0,3),
x=x.source+as.numeric(Process)/10,
y=y.source+as.numeric(Process)/10,
xend=x.sink+as.numeric(Process)/10,
yend=y.sink+as.numeric(Process)/10,
geom="segment",
arrow = arrow(type="closed",
length = unit(d$Std.Flux,"cm")))
print(p)
Any suggestions?
Here's one way:
require(ggplot2)
df <- mtcars
arrow_pos <- data.frame(y = 250)
ggplot(df, aes(x=factor(cyl), y=mpg)) +
geom_bar(width = .4, stat="identity", fill="darkblue") +
geom_segment(data=arrow_pos,
aes(x=1.526, xend=1.01, y=y + 90.02, yend=y + 0.25),
arrow=arrow(length=unit(4.2, "mm")), lwd=2,
color="black") +
geom_segment(data=arrow_pos,
aes(x=1.525, xend=1.01, y=y + 90, yend=y + 0.25),
arrow=arrow(length=unit(4, "mm")), lwd=1,
color="gold2") +
annotate("text", x=2.39, y=360,
label='This arrow points to the highest MPG.') +
scale_y_continuous(limits = c(0,400)) +
xlab('CYL') + ylab('MPG')
Output:
Must have been fixed in the last 8 years :)
Here translated into a call to ggplot()
library(ggplot2)
ggplot(d, aes(colour=Process, size=Std.Flux)) +
geom_segment(aes(x=x.source+as.numeric(Process)/10,
y=y.source+as.numeric(Process)/10,
xend=x.sink+as.numeric(Process)/10,
yend=y.sink+as.numeric(Process)/10),
arrow = arrow(type="closed",
length = unit(d$Std.Flux,"cm")))

Resources