I have used hclust in the TSclust package to do agglomerative hierarchical clustering. My question is: can I get the dissimilarity (distance) matrix back from hclust? I want the distance values so I can rank which variables are closest to a single variable within the group of variables.
example: If (x1,x2, x3,x4,x5,x6,x7,x8,x9,x10) are the variables used to form the distance matrix, then what I wanted is the distance between x3 and the rest of variables (x3x1,x3x2,x3x4,x3x5, and so on). Can we do that? Here is the code and reproducible data.
Data:
structure(list(x1 = c(186.41, 100.18, 12.3, 14.38, 25.97, 0.06,
0, 6.17, 244.06, 19.26, 256.18, 255.69, 121.88, 75, 121.45, 11.34,
34.68, 3.09, 34.3, 26.13, 111.31), x2 = c(327.2, 8.05, 4.23,
6.7, 3.12, 1.91, 37.03, 39.17, 140.06, 83.72, 263.29, 261.22,
202.48, 23.27, 2.87, 7.17, 14.48, 3.41, 5.95, 70.56, 91.58),
x3 = c(220.18, 126.14, 98.59, 8.56, 0.5, 0.9, 17.45, 191.1,
164.64, 224.36, 262.86, 237.75, 254.88, 42.05, 9.12, 0.04,
12.22, 0.61, 61.86, 114.08, 78.94), x4 = c(90.74, 26.11,
47.86, 10.86, 3.74, 23.69, 61.79, 68.12, 87.92, 171.76, 260.98,
266.62, 96.27, 57.15, 78.89, 16.73, 6.59, 49.44, 57.21, 202.2,
67.17), x5 = c(134.09, 27.06, 7.44, 4.53, 17, 47.66, 95.96,
129.53, 40.23, 157.37, 172.61, 248.56, 160.84, 421.94, 109.93,
22.77, 2.11, 49.18, 64.13, 52.61, 180.87), x6 = c(173.17,
46.68, 6.54, 3.05, 0.35, 0.12, 5.09, 72.46, 58.19, 112.31,
233.77, 215.82, 100.63, 65.84, 2.69, 0.01, 3.63, 12.93, 66.55,
28, 61.74), x7 = c(157.22, 141.81, 19.98, 116.18, 16.55,
122.3, 62.67, 141.84, 78.3, 227.27, 340.22, 351.38, 147.73,
0.3, 56.12, 33.2, 5.51, 54.4, 82.98, 152.66, 218.26), x8 = c(274.08,
51.92, 54.86, 15.37, 0.31, 0.05, 36.3, 162.04, 171.78, 181.39,
310.73, 261.55, 237.99, 123.99, 1.92, 0.74, 0.23, 18.51,
7.68, 65.55, 171.33), x9 = c(262.71, 192.34, 2.75, 21.68,
1.69, 3.92, 0.09, 9.33, 120.36, 282.92, 236.7, 161.59, 255.44,
126.44, 7.63, 2.04, 1.02, 0.12, 5.87, 146.25, 134.11), x10 = c(82.71,
44.09, 1.52, 2.63, 4.38, 28.64, 168.43, 80.62, 20.36, 39.29,
302.31, 247.52, 165.73, 18.27, 2.67, 1.77, 23.13, 53.47,
53.14, 46.61, 86.29)), class = "data.frame", row.names = c(NA,
-21L))
Code:
# NOTE(review): the result of as.ts() is discarded; presumably the intent was
# cdata <- as.ts(cdata) -- confirm. The code below works on the raw data frame
# either way, because diss() is given t(cdata) directly.
as.ts(cdata)

library(dplyr)    # data wrangling
library(ggplot2)  # grammar of graphics
library(ggdendro) # dendrograms
library(TSclust)  # cluster time series

# Cluster analysis (this line was a bare statement; without '#' it is a syntax error)
dist_ts <- TSclust::diss(SERIES = t(cdata), METHOD = "INT.PER") # the data frame must be transposed: diss() expects one series per row
hc <- stats::hclust(dist_ts, method = "complete") # method can also be "average"; or use diana (for DIvisive ANAlysis Clustering)
hcdata <- ggdendro::dendro_data(hc)
names_order <- hcdata$labels$label
# To remove labels from the dendrogram (so they are not doubled up), set:
#   hcdata$labels$label <- ""
hcdata %>% ggdendro::ggdendrogram(rotate = FALSE, leaf_labels = FALSE)
I believe the object you are looking for is stored in the variable dist_ts:
dist_ts <- TSclust::diss(SERIES = t(cdata), METHOD = "INT.PER")
print(dist_ts)
I am now facing a problem with how to make a moving average crossover plot in R. I added ma5 and ma20 as two moving average lines based on my price data.
Here is my sample code:
library("TTR")
library(ggplot2)
price<- c(3.23, 3.29, 3.29 , 3.21, 3.19, 3.18, 3.11, 3.21, 3.25,
3.40, 3.39, 3.28, 3.31 , 3.32, 3.21, 3.19, 3.16, 3.20,
3.26, 3.30, 3.42, 3.44, 3.40, 3.41, 3.59, 3.83, 3.70,
3.86, 3.95, 3.89, 3.94, 3.78, 3.69, 3.74, 3.67, 3.69,
3.69, 3.61, 3.64, 3.83, 3.88, 3.98, 3.98, 3.86, 3.87,
3.93, 4.05, 3.97, 3.90, 3.93, 4.00, 3.85, 3.81, 4.20,
4.17, 4.05, 3.95, 3.96, 3.97, 3.96, 3.88, 3.85, 3.79,
3.83, 3.68, 3.72, 3.73, 3.81, 3.80, 3.81, 3.75, 3.87,
3.90, 3.89, 3.86, 3.81, 3.86, 3.78, 3.83, 3.87, 3.91,
4.05, 4.07, 4.02, 4.01, 4.00, 4.13, 4.07, 4.11, 4.26,
4.33, 4.32, 4.39, 4.30, 4.39, 4.68, 4.69, 4.70, 4.60,
4.71, 4.81, 4.73, 4.78, 4.64, 4.64, 4.64, 4.61, 4.44)
date<- c("2004-01-23", "2004-01-26", "2004-01-27", "2004-01-28",
"2004-02-02", "2004-02-03", "2004-02-04", "2004-02-05",
"2004-02-06", "2004-02-11", "2004-02-12", "2004-02-13",
"2004-02-17", "2004-02-18", "2004-02-19", "2004-02-20",
"2004-02-23", "2004-02-24", "2004-02-25", "2004-02-26",
"2004-02-27", "2004-03-01", "2004-03-02", "2004-03-03",
"2004-03-04", "2004-03-05", "2004-03-08", "2004-03-09",
"2004-03-10", "2004-03-11", "2004-03-12", "2004-03-15",
"2004-03-16", "2004-03-17", "2004-03-18", "2004-03-19",
"2004-03-22", "2004-03-23", "2004-03-24", "2004-03-25",
"2004-03-26", "2004-03-29", "2004-03-30", "2004-03-31",
"2004-04-01", "2004-04-02", "2004-04-05", "2004-04-06",
"2004-04-07", "2004-04-08", "2004-04-12", "2004-04-13",
"2004-04-14", "2004-04-15", "2004-04-16", "2004-04-19",
"2004-04-20", "2004-04-21", "2004-04-22", "2004-04-23",
"2004-04-26", "2004-04-27", "2004-04-28", "2004-04-29",
"2004-04-30", "2004-05-03", "2004-05-04", "2004-05-05",
"2004-05-06", "2004-05-07", "2004-05-10", "2004-05-11",
"2004-05-12", "2004-05-13", "2004-05-14", "2004-05-17",
"2004-05-18", "2004-05-19", "2004-05-20", "2004-05-21",
"2004-05-24", "2004-05-25", "2004-05-26", "2004-05-27",
"2004-05-28", "2004-06-01", "2004-06-02", "2004-06-03",
"2004-06-04", "2004-06-07", "2004-06-08", "2004-06-09",
"2004-06-10", "2004-06-14", "2004-06-15", "2004-06-16",
"2004-06-17", "2004-06-18", "2004-06-21", "2004-06-22",
"2004-06-23", "2004-06-24", "2004-06-25", "2004-06-28",
"2004-06-29", "2004-06-30", "2004-07-01", "2004-07-02")
# 5- and 20-day simple moving averages; the first n-1 values of each are NA.
price5 <- SMA(price, n = 5)
price20 <- SMA(price, n = 20)
# Convert the character dates to Date so ggplot draws a continuous time axis
# instead of treating every date string as a separate discrete factor level.
pricedf <- data.frame(date = as.Date(date), price5, price20, price)
ggplot(pricedf, aes(date)) +
  geom_line(aes(y = price5, colour = "ma5"), group = 1) +
  geom_line(aes(y = price20, colour = "ma20"), group = 1) +
  xlab("Date") +
  ylab("Price")
There are a couple of crossovers on this plot. What I want is: when ma5 is above ma20, mark that stretch as a green line on the 'price' plot ('price' is one feature in my pricedf). On the other hand, when ma5 is under ma20, mark it as a red line on the 'price' plot.
The example plot looks like this picture,
I was thinking of subtracting price20 from price5 and checking whether the values are greater than 0. But how can I draw them on another plot with different colors?
Here is how I solved it.
library("TTR")
library(ggplot2)
price<- c(3.23, 3.29, 3.29 , 3.21, 3.19, 3.18, 3.11, 3.21, 3.25,
3.40, 3.39, 3.28, 3.31 , 3.32, 3.21, 3.19, 3.16, 3.20,
3.26, 3.30, 3.42, 3.44, 3.40, 3.41, 3.59, 3.83, 3.70,
3.86, 3.95, 3.89, 3.94, 3.78, 3.69, 3.74, 3.67, 3.69,
3.69, 3.61, 3.64, 3.83, 3.88, 3.98, 3.98, 3.86, 3.87,
3.93, 4.05, 3.97, 3.90, 3.93, 4.00, 3.85, 3.81, 4.20,
4.17, 4.05, 3.95, 3.96, 3.97, 3.96, 3.88, 3.85, 3.79,
3.83, 3.68, 3.72, 3.73, 3.81, 3.80, 3.81, 3.75, 3.87,
3.90, 3.89, 3.86, 3.81, 3.86, 3.78, 3.83, 3.87, 3.91,
4.05, 4.07, 4.02, 4.01, 4.00, 4.13, 4.07, 4.11, 4.26,
4.33, 4.32, 4.39, 4.30, 4.39, 4.68, 4.69, 4.70, 4.60,
4.71, 4.81, 4.73, 4.78, 4.64, 4.64, 4.64, 4.61, 4.44)
date<- c("2004-01-23", "2004-01-26", "2004-01-27", "2004-01-28",
"2004-02-02", "2004-02-03", "2004-02-04", "2004-02-05",
"2004-02-06", "2004-02-11", "2004-02-12", "2004-02-13",
"2004-02-17", "2004-02-18", "2004-02-19", "2004-02-20",
"2004-02-23", "2004-02-24", "2004-02-25", "2004-02-26",
"2004-02-27", "2004-03-01", "2004-03-02", "2004-03-03",
"2004-03-04", "2004-03-05", "2004-03-08", "2004-03-09",
"2004-03-10", "2004-03-11", "2004-03-12", "2004-03-15",
"2004-03-16", "2004-03-17", "2004-03-18", "2004-03-19",
"2004-03-22", "2004-03-23", "2004-03-24", "2004-03-25",
"2004-03-26", "2004-03-29", "2004-03-30", "2004-03-31",
"2004-04-01", "2004-04-02", "2004-04-05", "2004-04-06",
"2004-04-07", "2004-04-08", "2004-04-12", "2004-04-13",
"2004-04-14", "2004-04-15", "2004-04-16", "2004-04-19",
"2004-04-20", "2004-04-21", "2004-04-22", "2004-04-23",
"2004-04-26", "2004-04-27", "2004-04-28", "2004-04-29",
"2004-04-30", "2004-05-03", "2004-05-04", "2004-05-05",
"2004-05-06", "2004-05-07", "2004-05-10", "2004-05-11",
"2004-05-12", "2004-05-13", "2004-05-14", "2004-05-17",
"2004-05-18", "2004-05-19", "2004-05-20", "2004-05-21",
"2004-05-24", "2004-05-25", "2004-05-26", "2004-05-27",
"2004-05-28", "2004-06-01", "2004-06-02", "2004-06-03",
"2004-06-04", "2004-06-07", "2004-06-08", "2004-06-09",
"2004-06-10", "2004-06-14", "2004-06-15", "2004-06-16",
"2004-06-17", "2004-06-18", "2004-06-21", "2004-06-22",
"2004-06-23", "2004-06-24", "2004-06-25", "2004-06-28",
"2004-06-29", "2004-06-30", "2004-07-01", "2004-07-02")
# 5- and 20-day simple moving averages; the first n-1 values of each are NA.
price5 <- SMA(price, n = 5)
price20 <- SMA(price, n = 20)
pricedf <- data.frame(date, price5, price20, price)

# Colour by crossover state: green where the fast MA (ma5) is above the slow
# MA (ma20), red otherwise.
coldf <- ifelse(price5 - price20 > 0, 'green', 'red')
# The first 19 comparisons are NA (SMA warm-up period); default them to green.
coldf[is.na(coldf)] <- 'green'
coldf

ggplot(pricedf) +
  geom_line(aes(x = date, y = price, group = 1, color = coldf)) +
  # Without scale_color_identity(), ggplot treats "green"/"red" as factor
  # levels and substitutes its own default palette; the identity scale makes
  # it use the literal colour names.
  scale_color_identity() +
  xlab("Date") +
  ylab("Price")
Which creates this
graph,
I used an ifelse statement to find where price5 is greater than price20. The problem is that this creates NAs, which I filled with green. I am not 100% sure which way round you wanted the green and the red. You can simply change the
coldf <- ifelse(price5 - price20 > 0, 'green', 'red')
to
coldf <- ifelse(price5 - price20 > 0, 'red', 'green')
Which looks like graph2.
This is a continuum to a Cross-validated question where I asked about plausible methods for the problem. This question is more programming orientated, so I post it here on SO.
Background
I have a curve with known dates spanning over a year. The y-values of this curve are predictions for d18O values calculated from daily temperature and salinity records. I also have measured d18O values from a shell consisting of calcium carbonate. These values are measured along a distance axis, where the first and the last measurements take place approximately (but not exactly) at the same time as the beginning and the end of the curve.
It is known that d18O values match with the predicted values in the curve within some unknown random error. I want to get the best fit for the measured values to the curve by changing the x-axis for the measured values (or at least by matching the index with the index in the curve). In this way I can get estimates for the dates of the measured values and can further estimate the growth rate for the shell over the year. The growth rate is expected to be variable and there might be a growth hiatus (i.e. the growth stops). However, the growth between the measured values has to be > 0 (a constraint).
Example data
Here are the example datasets (curve and measured):
meas <- structure(list(index = 1:10, distance = c(0.1, 1, 3, 5, 7, 8,
13, 20, 22, 25), value = c(3.5, 4.2, 4.5, 4.4, 4.7, 4.8, 5.1,
4.9, 4.1, 3.7)), .Names = c("index", "distance", "value"), class = "data.frame",
row.names = c(NA, -10L))
curve <- structure(list(date = structure(c(15218, 15219, 15220, 15221,
15222, 15223, 15224, 15225, 15226, 15227, 15228, 15229, 15230,
15231, 15232, 15233, 15234, 15235, 15236, 15237, 15238, 15239,
15240, 15241, 15242, 15243, 15244, 15245, 15246, 15247, 15248,
15249, 15250, 15251, 15252, 15253, 15254, 15255, 15256, 15257,
15258, 15259, 15260, 15261, 15262, 15263, 15264, 15265, 15266,
15267, 15268, 15269, 15270, 15271, 15272, 15273, 15274, 15275,
15276, 15277, 15278, 15279, 15280, 15281, 15282, 15283, 15284,
15285, 15286, 15287, 15288, 15289, 15290, 15291, 15292, 15293,
15294, 15295, 15296, 15297, 15298, 15299, 15300, 15301, 15302,
15303, 15304, 15305, 15306, 15307, 15308, 15309, 15310, 15311,
15312, 15313, 15314, 15315, 15316, 15317, 15318, 15319, 15320,
15321, 15322, 15323, 15324, 15325, 15326, 15327, 15328, 15329,
15330, 15331, 15332, 15333, 15334, 15335, 15336, 15337, 15338,
15339, 15340, 15341, 15342, 15343, 15344, 15345, 15346, 15347,
15348, 15349, 15350, 15351, 15352, 15353, 15354, 15355, 15356,
15357, 15358, 15359, 15360, 15361, 15362, 15363, 15364, 15365,
15366, 15367, 15368, 15369, 15370, 15371, 15372, 15373, 15374,
15375, 15376, 15377, 15378, 15379, 15380, 15381, 15382, 15383,
15384, 15385, 15386, 15387, 15388, 15389, 15390, 15391, 15392,
15393, 15394, 15395, 15396, 15397, 15398, 15399, 15400, 15401,
15402, 15403, 15404, 15405, 15406, 15407, 15408, 15409, 15410,
15411, 15412, 15413, 15414, 15415, 15416, 15417, 15418, 15419,
15420, 15421, 15422, 15423, 15424, 15425, 15426, 15427, 15428,
15429, 15430, 15431, 15432, 15433, 15434, 15435, 15436, 15437,
15438, 15439, 15440, 15441, 15442, 15443, 15444, 15445, 15446,
15447, 15448, 15449, 15450, 15451, 15452, 15453, 15454, 15455,
15456, 15457, 15458, 15459, 15460, 15461, 15462, 15463, 15464,
15465, 15466, 15467, 15468, 15469, 15470, 15471, 15472, 15473,
15474, 15475, 15476, 15477, 15478, 15479, 15480, 15481, 15482,
15483, 15484, 15485, 15486, 15487, 15488, 15489, 15490, 15491,
15492, 15493, 15494, 15495, 15496, 15497, 15498, 15499, 15500,
15501, 15502, 15503, 15504, 15505, 15506, 15507, 15508, 15509,
15510, 15511, 15512, 15513, 15514, 15515, 15516, 15517, 15518,
15519, 15520, 15521, 15522, 15523, 15524, 15525, 15526, 15527,
15528, 15529, 15530, 15531, 15532, 15533, 15534, 15535, 15536,
15537, 15538, 15539, 15540, 15541, 15542, 15543, 15544, 15545,
15546, 15547, 15548, 15549, 15550, 15551, 15552, 15553, 15554,
15555, 15556, 15557, 15558, 15559, 15560, 15561, 15562, 15563,
15564, 15565, 15566, 15567, 15568, 15569, 15570, 15571, 15572,
15573, 15574, 15575, 15576, 15577, 15578, 15579, 15580, 15581,
15582, 15583, 15584), class = "Date"), index = 1:367, value = c(3.33,
3.35, 3.36, 3.38, 3.4, 3.42, 3.43, 3.45, 3.47, 3.48, 3.5, 3.52,
3.53, 3.55, 3.56, 3.58, 3.6, 3.61, 3.63, 3.64, 3.66, 3.67, 3.69,
3.7, 3.72, 3.73, 3.75, 3.76, 3.78, 3.79, 3.81, 3.82, 3.83, 3.85,
3.86, 3.88, 3.89, 3.9, 3.92, 3.93, 3.94, 3.96, 3.97, 3.98, 3.99,
4.01, 4.02, 4.03, 4.04, 4.06, 4.07, 4.08, 4.09, 4.1, 4.11, 4.13,
4.14, 4.15, 4.16, 4.17, 4.18, 4.19, 4.2, 4.21, 4.22, 4.23, 4.24,
4.25, 4.26, 4.27, 4.28, 4.28, 4.29, 4.3, 4.31, 4.32, 4.33, 4.33,
4.34, 4.35, 4.36, 4.36, 4.37, 4.38, 4.38, 4.39, 4.4, 4.41, 4.41,
4.42, 4.42, 4.43, 4.44, 4.44, 4.45, 4.45, 4.46, 4.46, 4.47, 4.47,
4.47, 4.48, 4.48, 4.49, 4.49, 4.49, 4.5, 4.5, 4.5, 4.51, 4.51,
4.51, 4.52, 4.52, 4.53, 4.53, 4.53, 4.54, 4.54, 4.54, 4.55, 4.55,
4.56, 4.57, 4.57, 4.58, 4.58, 4.59, 4.6, 4.61, 4.61, 4.62, 4.63,
4.64, 4.64, 4.65, 4.66, 4.67, 4.67, 4.68, 4.69, 4.7, 4.7, 4.71,
4.72, 4.72, 4.73, 4.74, 4.74, 4.75, 4.75, 4.75, 4.76, 4.76, 4.76,
4.76, 4.76, 4.76, 4.76, 4.76, 4.76, 4.75, 4.75, 4.75, 4.75, 4.74,
4.74, 4.73, 4.73, 4.73, 4.72, 4.72, 4.72, 4.71, 4.71, 4.71, 4.71,
4.7, 4.7, 4.7, 4.71, 4.71, 4.71, 4.71, 4.72, 4.72, 4.73, 4.74,
4.75, 4.75, 4.76, 4.78, 4.79, 4.8, 4.81, 4.82, 4.83, 4.84, 4.85,
4.86, 4.88, 4.89, 4.9, 4.91, 4.92, 4.92, 4.93, 4.94, 4.95, 4.95,
4.95, 4.96, 4.96, 4.96, 4.96, 4.96, 4.95, 4.95, 4.95, 4.94, 4.93,
4.92, 4.92, 4.91, 4.9, 4.89, 4.88, 4.87, 4.86, 4.85, 4.84, 4.83,
4.82, 4.8, 4.79, 4.78, 4.77, 4.76, 4.75, 4.75, 4.74, 4.73, 4.72,
4.72, 4.71, 4.71, 4.71, 4.7, 4.7, 4.7, 4.7, 4.7, 4.7, 4.7, 4.7,
4.7, 4.7, 4.7, 4.7, 4.7, 4.69, 4.69, 4.69, 4.69, 4.69, 4.69,
4.69, 4.69, 4.68, 4.68, 4.68, 4.67, 4.67, 4.67, 4.66, 4.65, 4.65,
4.64, 4.63, 4.62, 4.61, 4.6, 4.59, 4.58, 4.57, 4.56, 4.55, 4.54,
4.53, 4.51, 4.5, 4.49, 4.48, 4.47, 4.46, 4.45, 4.43, 4.42, 4.41,
4.4, 4.39, 4.38, 4.37, 4.36, 4.35, 4.34, 4.33, 4.32, 4.32, 4.31,
4.3, 4.29, 4.28, 4.28, 4.27, 4.26, 4.25, 4.24, 4.24, 4.23, 4.22,
4.21, 4.21, 4.2, 4.19, 4.18, 4.17, 4.17, 4.16, 4.15, 4.14, 4.14,
4.13, 4.12, 4.12, 4.11, 4.1, 4.09, 4.08, 4.08, 4.07, 4.06, 4.05,
4.05, 4.04, 4.03, 4.02, 4.02, 4.01, 4, 4, 3.99, 3.98, 3.97, 3.97,
3.96, 3.95, 3.94, 3.94, 3.93, 3.92, 3.92, 3.91, 3.9, 3.9, 3.89,
3.88)), .Names = c("date", "index", "value"), row.names = c(NA,
-367L), class = "data.frame")
...and here is what it looks like:
library(ggplot2)
library(scales)
library(gridExtra)
p.curve <- ggplot() + geom_line(data = curve, aes(x = date, y = value)) + scale_x_date(name = "Month", breaks = date_breaks("months"), labels = date_format("%b")) + labs(title = "curve")
p.meas <- ggplot(meas, aes(x = distance, y = value)) + geom_point(color = "red") + labs(title = "measured", x = "Distance (mm)")
grid.arrange(p.curve, p.meas, ncol = 1)
The problem in practice
I want to find a mathematical/statistical method for R to fit meas to curve by changing the x-axis for meas. In addition I want to get some kind of goodness of fit statistics to compare the fitted "x-axes" among each other (in case I run several models with different constraints). I call the "x-axis model" a growth model, because that is what it essentially is. I want to constrain the fitting by specifying that distance between meas values has to be > 0. i.e. Measvalue with index == 2 has to occur after the value with index == 1. I also want to be able to constrain the growth rate (i.e. the maximum distance between two adjacent index points). To demonstrate this I will do it manually:
ggplot() + geom_line(data = curve, aes(x = index, y = value)) + geom_line(data = meas, aes(x = index, y = value), color = "red", linetype = 2) + scale_x_continuous(breaks = seq(0,370,10)) + scale_y_continuous(breaks = seq(3,5,0.1))
First, some of the indices in meas(red dashed line) have to be anchored to the indices of curve(black line). I choose to anchor the first and the last point plus the point with the highest value.
anchor <- data.frame(meas.index = c(1,7,10), curve.index = c(11,215,367))
example.fit <- merge(meas, anchor, by.x = "index", by.y = "meas.index", all = T, sort = F)
example.fit <- example.fit[with(example.fit, order(distance)),]
Then, I assume a linear growth between these anchored points. The growth will be along curve indices. Curve has one value per day. Hence the growth will be on a daily scale.
library(zoo)
example.fit$curve.index <- round(na.approx(example.fit$curve.index),0)
After this I replace the indices with dates and plot the results.
library(plyr)
# Look up the date for each interpolated curve.index directly: match() finds
# the row of curve with the same index, and subscripting curve$date keeps the
# Date class. This replaces the fragile character round-trip through
# plyr::mapvalues() + as.character() + as.Date(), with identical results.
example.fit$date <- curve$date[match(example.fit$curve.index, curve$index)]
a <- ggplot() + geom_line(data = curve, aes(x = date, y = value)) + geom_point(data = example.fit, aes(x = date, y = value), color = "red") + scale_x_date(limits = range(curve$date), name = "Month", breaks = date_breaks("months"), labels = date_format("%b"))
b <- ggplot(example.fit, aes(x = date, y = distance)) + geom_line() + scale_x_date(limits = range(curve$date), name = "Month", breaks = date_breaks("months"), labels = date_format("%b"))
grid.arrange(a, b)
The plot above shows the resulting fit, which is based on three anchor points. The plot below shows the modeled growth over time at a daily interval. The bend in the growth curve at the beginning of March is some mathematical artifact I do not understand (due to the na.approx function from the zoo package).
What have I tried
From my previous question I learned that dynamic time warping could be a solution. I also found an R package, which contains dtw functions. Nice. Dynamic time warping, indeed, worked for my example dataset in that question (except for setting the constraint), but I cannot get it to work for this dataset, where curve has much more data points than meas (called points in the previous question). I will try to save some space and will not copy the code/figures here. You can see what I have tried in my answer to that question. The problem seems to be that none of the step patterns, except for the simplest one, can handle these type of data. The simplest step-pattern matches the measured values several times to the curve, which is something I want to avoid, because I need defined dates for each measurement point. Also setting the constraint that growth rate has to be >0 between measurement points seems difficult.
Question
My question is twofold: first, would there be a better method to solve the problem than dynamic time warping? Second, how do I do this in practice in R?
EDITS 9. Dec 2013 I tried to make the question clearer.
I'm not sure I understand 100% what the objective is, but if you're looking to fit the measured points to the reference curve then using dtw seems sensible. Fitting the 10 measured points to the 370-odd curve points does give a slightly strange result (which is just the optimization with the symmetric step.pattern). I think that's largely a function of the small number of points.
One option which may help is to use ggplot() (or another function) to smooth the measured curve and provide some additional points for matching. But obviously it can only do so much depending on the limitation of the measured points. With so few points you might lose information in the process of fitting your data.
If you could trim curve to be exactly contemporaneous with the first and last point of the meas observations, that would also help since you're matching with open.begin and open.end FALSE, but I'm not sure whether the exact dates are available.
This shows smoothing meas out to 80 points, and mapping the 10-point raw data and 80-point smooth to the reference curve
# library() errors immediately if a package is missing; require() only
# returns FALSE, which silently breaks everything downstream.
library(ggplot2)
library(scales)
library(gridExtra)
library(dtw)
library(plyr)

# Use ggplot's default smoother to interpolate the 10-point measured series;
# the smoothed (blue) curve supplies extra points for dtw matching.
meas.plot.smooth <- ggplot(meas, aes(x = distance, y = value)) +
  geom_line() +
  labs(title = "ggplot smoothed (blue curve)") +
  geom_smooth()
# ggplot_build() exposes the rendered layer data; data[[2]] is the
# geom_smooth layer, i.e. the smoothed points.
meas.curve.smooth <- ggplot_build(meas.plot.smooth)$data[[2]]

# Align the 10 raw measured values against the 367-point reference curve.
orig.align <- dtw(meas$value, curve$value, keep = TRUE, step.pattern = symmetric1)
orig.freqs <- count(orig.align$index1)
# Cumulative match counts index into curve rows, which carry the dates.
orig.freqs$cumsum <- cumsum(orig.freqs$freq)

g.10 <- ggplot() + geom_line(data = curve, aes(x = date, y = value)) +
geom_line(aes(x = curve[orig.freqs$cumsum,"date"], y = meas$value),color="red") +
geom_text(aes(x = curve[orig.freqs$cumsum,"date"], y = meas$value, label=orig.freqs$x),color="red",size=5) +
scale_x_date(name = "Month", breaks = date_breaks("months"), labels = date_format("%b")) +
labs(title = "Native 10 pt curve - dtw mapped")

# Repeat the alignment using the smoothed (~80 point) curve.
smooth.align <- dtw(meas.curve.smooth$y, curve$value, keep = TRUE, step.pattern = symmetric1)
smooth.freqs <- count(smooth.align$index1)
smooth.freqs$cumsum <- cumsum(smooth.freqs$freq)

g.80 <- ggplot() + geom_line(data = curve, aes(x = date, y = value)) +
geom_line(aes(x = curve[smooth.freqs$cumsum,"date"], y = meas.curve.smooth$y),color="red") +
scale_x_date(name = "Month", breaks = date_breaks("months"), labels = date_format("%b")) +
geom_text(aes(x = curve[smooth.freqs$cumsum,"date"], y = meas.curve.smooth$y, label=smooth.freqs$x),color="red",size=3.5,position="jitter") +
labs(title = "80 point loess curve - dtw mapped")

grid.arrange(meas.plot.smooth, g.10, g.80, ncol = 1)
EDIT
Obviously part of the problem is confidence intervals. I've included an example here to build a random curve within the standard error levels around the smoothed curve. As you can see, it's quite different to using the projected curve itself. I think the issue is that when you're trying to map 10 measures against a 370-point reference curve, unless they track extremely tightly, it's going to be difficult to get precise predictions.
# Draw a random curve uniformly within the smoother's CI band (between ymin
# and ymax from the geom_smooth layer) and align it to the reference curve.
# NOTE(review): no set.seed() here, so each run gives a different alignment;
# get_scenario() seeds per scenario for reproducibility.
rand.align <- dtw(
  meas.curve.smooth$ymin +
    (meas.curve.smooth$ymax - meas.curve.smooth$ymin) *
      runif(length(meas.curve.smooth$ymin)),
  curve$value, keep = TRUE, step.pattern = symmetric1)
rand.freqs <- count(rand.align$index1)
rand.freqs$cumsum <- cumsum(rand.freqs$freq)
g.rand <- ggplot() + geom_line(data = curve, aes(x = date, y = value)) +
geom_line(aes(x = curve[rand.freqs$cumsum,"date"], y = meas.curve.smooth$y),color="red") +
scale_x_date(name = "Month", breaks = date_breaks("months"), labels = date_format("%b")) +
geom_text(aes(x = curve[rand.freqs$cumsum,"date"], y = meas.curve.smooth$y, label=rand.freqs$x),color="red",size=3.5,position="jitter") +
labs(title = "Random curve within standard CI - dtw mapped")
grid.arrange(meas.plot.smooth, g.10, g.80, g.rand, ncol = 1)
EDIT updated to include simulation.
OK - this is updated to run 1000 simulations. It creates curves for mapping which are randomised from within the 95% CI. I changed n to 10 (from 80) in the geom_smooth() function to try and preserve as much info as possible from the measured curve.
It models the cumulative growth (assuming linear growth between unmeasured days)
Not sure if it's completely useful, but provides a decent way of visualizing the uncertainty.
# Build one randomised growth scenario.
#
# Draws a random curve uniformly between the confidence bounds (ymin/ymax)
# of the smoothed measurement curve, warps it onto the reference curve with
# dtw, and returns the implied cumulative growth per matched curve day.
#
# Args:
#   i: scenario number; also used as the RNG seed so scenario i is
#      reproducible.
# Returns:
#   data.frame with columns x (curve index, i.e. day), growth (cumulative
#   growth) and scenario (the input i).
# NOTE(review): relies on globals meas.curve.smooth and curve, and on the
# dtw and plyr packages being attached.
get_scenario <- function(i) {
  set.seed(i)
  # Create a random curve within the CI band of the smoothed measurements.
  rand.values <- meas.curve.smooth$ymin +
    (meas.curve.smooth$ymax - meas.curve.smooth$ymin) *
      runif(length(meas.curve.smooth$ymin))
  rand.align <- dtw(rand.values, curve$value, keep = TRUE,
                    step.pattern = symmetric1)
  # How many reference points each measurement index maps to; the cumulative
  # sum gives the curve index (day) where each match ends.
  rand.freqs <- count(rand.align$index1)
  rand.freqs$cumsum <- cumsum(rand.freqs$freq)
  growth.index <- data.frame(cumsum = curve$index, val = curve$value)
  merged <- merge(growth.index, rand.freqs, by = "cumsum")
  data.frame(x = merged$cumsum,
             growth = cumsum(merged$val * merged$freq),
             scenario = i)
}
# Run 1000 randomised scenarios and stack them into one data frame.
# get_scenario() already returns a data.frame, so ldply can take the index
# vector directly -- the lapply/data.frame round-trip was redundant.
scenario.set <- ldply(1:1000, get_scenario)

g.s <- ggplot(scenario.set, aes(x, growth)) +
  geom_line(aes(group = scenario, color = scenario), alpha = 0.25) +
  scale_colour_gradient(low = "yellow", high = "orangered") +
  xlab("Days from start") + ylab("Cumulative Growth")

g.xmax <- max(scenario.set$x) # get the final day (or set to another day)
g.xmin <- g.xmax - 30         # thirty day window from end

# Pull the rendered line data back out of the plot so the y-limits of the
# zoomed subplot can be computed from what was actually drawn.
b <- ggplot_build(g.s)
build.data <- b$data[[1]]
ylims <- build.data[build.data$x <= g.xmax & build.data$x >= g.xmin, ]$y

g.subplot <- g.s +
  geom_point(aes(x, growth, color = scenario), alpha = 0.25, size = 5, position = "jitter") +
  coord_cartesian(xlim = c(g.xmin, g.xmax), ylim = c(min(ylims), max(ylims)))

grid.arrange(meas.plot.smooth, g.s, g.subplot, ncol = 1)
and here are some other ways of looking at the tail:
# Re-create the scenario overview plot, this time zooming on the last 50 days.
g.s<-ggplot(scenario.set,aes(x,growth)) +
geom_line(aes(,group=scenario,color=scenario),alpha=0.25) +
scale_colour_gradient(low = "yellow", high = "orangered") +
xlab("Days from start") + ylab("Cumulative Growth")
g.xmax<-max(scenario.set$x) # get the final day (or set to another day)
g.xmin<-g.xmax-50 # fifty day window from end (the earlier comment said thirty but the value here is 50)
# Extract the rendered line data so y-limits for the zoom window can be
# computed from what was actually drawn.
b<-ggplot_build(g.s)
build.data<-b$data[[1]]
ylims<-build.data[build.data$x<=g.xmax & build.data$x>=g.xmin,]$y
g.subplot<-g.s+geom_point(aes(x,growth,color=scenario),alpha=0.25,size=5,position="jitter")+coord_cartesian(xlim=c(g.xmin,g.xmax),ylim=c(min(ylims),max(ylims)))
grid.arrange(meas.plot.smooth,g.s,g.subplot,ncol=1)
# Weekly box plots of the simulated growth paths over the zoom window.
g.box<-ggplot(build.data)+
geom_boxplot(aes(x,y,group=cut(x,max(x)/7),fill=cut(x,max(x)/7)),alpha=0.5)+ # bucket by group
theme(legend.position="none")+
coord_cartesian(xlim=c(g.xmin,g.xmax),ylim=c(min(ylims)-50,max(ylims)+50))
# Per-day min/max/mean envelope across the simulated paths.
build.data.sum<-ddply(build.data,.(x),summarise,ymax=max(y),ymin=min(y),mean=mean(y))
# Jittered spot plot with the min/max ribbon and a loess smooth on top.
# NOTE(review): 'group' here is the group column ggplot_build() added to
# build.data -- presumably a scenario identifier; confirm before relying on it.
g.spots<-ggplot(build.data)+
geom_point(aes(x,y,color=group),size=10,alpha=0.25,position="jitter")+
theme(legend.position="none")+scale_colour_gradient(low = "yellow", high = "orangered")+
geom_ribbon(data=build.data.sum,aes(x,ymax=ymax,ymin=ymin),alpha=0.25)+
coord_cartesian(xlim=c(g.xmax-50,g.xmax+1),ylim=c(min(ylims)-50,max(ylims)+50))+geom_smooth(aes(x,y),n=max(build.data$x))
grid.arrange(g.box,g.spots,ncol=1)