Frequency table for intervals - r

I saved data into the object datos so I could calculate AF (absolute frequency) and RF (relative frequency) for a continuous variable in column V1. But I want the frequencies to be grouped into intervals.
I don't really know how to do it so I need your help. If anyone has any idea about how to do it, here is my code:
k is the number of intervals I'm using
and largo is the quantity of data I have.
# Read the raw data: a single unnamed numeric column, accessed as datos$V1.
datos <- read.table("datos.txt", header = FALSE)
largo <- length(datos$V1)  # number of observations
# Sturges-style rule for the number of intervals: k = 1 + log2(n), rounded.
k <- 1 + log2(largo)
k <- round(k, digits = 0)
# BUG FIX: the column is V1, not v1 — R is case-sensitive, so datos$v1 was NULL.
vectordatos <- datos$V1
histograma <- hist(datos$V1, breaks = k)
# NOTE(review): these tabulate each distinct raw value, not intervals;
# for interval frequencies use table(cut(datos$V1, breaks = k)).
FA <- table(datos$V1)
FR <- table(datos$V1) / largo
FA
FR
The datos object is as follows:
# Reproducible data: 36 continuous observations in column V1.
# FIX: "row .names" (with a space) was a syntax error; must be row.names.
datos <- structure(list(V1 = c(6.16, 5.83, 5.66, 3.63, 1.38, 9.64, 7.46,
5.34, 7.93, 8.5, 4.18, 5.18, 10.27, 5.41, 4.76, 4.67, 10.02,
7.1, 5.38, 8.55, 4.85, 8.28, 2.9, 7.18, 6.54, 5.66, 7.26, 6.45,
3.97, 6.55, 5.15, 7.83, 5.52, 7.21, 7.3, 6.19)), class = "data.frame",
row.names = c(NA, -36L))

You can use cut to create k intervals and table to represent the frequency per interval. You can use the following code:
table(cut(datos$V1,k))
Output:
(1.37,2.86] (2.86,4.34] (4.34,5.83] (5.83,7.31] (7.31,8.79] (8.79,10.3]
1 4 11 11 6 3

Related

Getting the distance matrix back from already clustered data

I have used hclust in the TSclust package to do agglomerative hierarchical clustering. My question is: can I get the dissimilarity (distance) matrix back from hclust? I wanted the values of the distance to rank which variable is closer to a single variable in the group of variables.
example: If (x1,x2, x3,x4,x5,x6,x7,x8,x9,x10) are the variables used to form the distance matrix, then what I wanted is the distance between x3 and the rest of variables (x3x1,x3x2,x3x4,x3x5, and so on). Can we do that? Here is the code and reproducible data.
Data:
# Reproducible data: 21 observations for each of ten series x1..x10.
# FIX: assign the literal to cdata — the name the code below actually uses;
# as posted, the data frame was printed and discarded.
cdata <- structure(list(x1 = c(186.41, 100.18, 12.3, 14.38, 25.97, 0.06,
0, 6.17, 244.06, 19.26, 256.18, 255.69, 121.88, 75, 121.45, 11.34,
34.68, 3.09, 34.3, 26.13, 111.31), x2 = c(327.2, 8.05, 4.23,
6.7, 3.12, 1.91, 37.03, 39.17, 140.06, 83.72, 263.29, 261.22,
202.48, 23.27, 2.87, 7.17, 14.48, 3.41, 5.95, 70.56, 91.58),
x3 = c(220.18, 126.14, 98.59, 8.56, 0.5, 0.9, 17.45, 191.1,
164.64, 224.36, 262.86, 237.75, 254.88, 42.05, 9.12, 0.04,
12.22, 0.61, 61.86, 114.08, 78.94), x4 = c(90.74, 26.11,
47.86, 10.86, 3.74, 23.69, 61.79, 68.12, 87.92, 171.76, 260.98,
266.62, 96.27, 57.15, 78.89, 16.73, 6.59, 49.44, 57.21, 202.2,
67.17), x5 = c(134.09, 27.06, 7.44, 4.53, 17, 47.66, 95.96,
129.53, 40.23, 157.37, 172.61, 248.56, 160.84, 421.94, 109.93,
22.77, 2.11, 49.18, 64.13, 52.61, 180.87), x6 = c(173.17,
46.68, 6.54, 3.05, 0.35, 0.12, 5.09, 72.46, 58.19, 112.31,
233.77, 215.82, 100.63, 65.84, 2.69, 0.01, 3.63, 12.93, 66.55,
28, 61.74), x7 = c(157.22, 141.81, 19.98, 116.18, 16.55,
122.3, 62.67, 141.84, 78.3, 227.27, 340.22, 351.38, 147.73,
0.3, 56.12, 33.2, 5.51, 54.4, 82.98, 152.66, 218.26), x8 = c(274.08,
51.92, 54.86, 15.37, 0.31, 0.05, 36.3, 162.04, 171.78, 181.39,
310.73, 261.55, 237.99, 123.99, 1.92, 0.74, 0.23, 18.51,
7.68, 65.55, 171.33), x9 = c(262.71, 192.34, 2.75, 21.68,
1.69, 3.92, 0.09, 9.33, 120.36, 282.92, 236.7, 161.59, 255.44,
126.44, 7.63, 2.04, 1.02, 0.12, 5.87, 146.25, 134.11), x10 = c(82.71,
44.09, 1.52, 2.63, 4.38, 28.64, 168.43, 80.62, 20.36, 39.29,
302.31, 247.52, 165.73, 18.27, 2.67, 1.77, 23.13, 53.47,
53.14, 46.61, 86.29)), class = "data.frame", row.names = c(NA,
-21L))
Code:
as.ts(cdata)  # NOTE(review): result is discarded — presumably cdata <- as.ts(cdata) was intended; confirm
library(dplyr)    # data wrangling
library(ggplot2)  # grammar of graphics
library(ggdendro) # dendrograms
library(TSclust)  # cluster time series
# FIX: "cluster analysis" was a bare text line (syntax error); made it a comment.
# cluster analysis
dist_ts <- TSclust::diss(SERIES = t(cdata), METHOD = "INT.PER") # note the data frame must be transposed
hc <- stats::hclust(dist_ts, method = "complete") # method can be also "average" or diana (for DIvisive ANAlysis Clustering)
hcdata <- ggdendro::dendro_data(hc)
names_order <- hcdata$labels$label
# Use the following to remove labels from dendrogram so not doubling up - but good for checking: hcdata$labels$label <- ""
hcdata %>% ggdendro::ggdendrogram(., rotate = FALSE, leaf_labels = FALSE)
I believe the object you are looking for is stored in the variable dist_ts:
# The pairwise dissimilarity matrix is exactly what diss() returns — it is
# already stored in dist_ts before hclust() consumes it; just print it.
dist_ts <- TSclust::diss(SERIES = t(cdata), METHOD = "INT.PER")
print(dist_ts)

Creating a 2D-grid or raster in R comparing all respondents with all variables

reproducible example for my data:
df_1 <- data.frame(cbind("Thriving" = c(2.33, 4.21, 6.37, 5.28, 4.87, 3.92, 4.16, 5.53), "Satisfaction" = c(3.45, 4.53, 6.01, 3.87, 2.92, 4.50, 5.89, 4.72), "Wellbeing" = c(2.82, 3.45, 5.23, 3.93, 6.18, 4.22, 3.68, 4.74), "id" = c(1:8)))
As you can see, it includes three variables of psychological measures and one identifier with an id for each respondent.
Now, my aim is to create a 2D-grid with which I can have a nice overview of all the values for all respondents concerning each of the variables. So on the x-axis I would have the id of all the respondents and on the y-axis all variables, whereas the colour of the particular field depends on the value - 1 to 3 in red, 3 to 5 in yellow and 5 to 7 in green. The style of the grid should be like this image.
All I have achieved so far is the following code which compresses all the variables/items into one column so they can together be portrayed on the y-axis - the id is of course included in its own column as are the values:
# NOTE(review): this pipeline is incomplete as posted — it ends with a
# trailing %>% and the result is never assigned or plotted.
df_1 %>%
select("Thr" = Thriving, "Stf" = Satisfaction, "Wb" = Wellbeing, "id" = id) %>%
na.omit %>%
gather(key = "variable", value = "value", -id) %>%
I am looking for a solution that works without storing the data in a new frame.
Also, I am looking for a solution that would be useful for even 100 or more respondents and up to about 40 variables. It would not matter if one rectangle would then be very small, I just want to have a nice colour play which would give a nice taste of where an organisation may be achieving low or high - and how it is achieving in general.
Thanks for reading, very grateful for any help!
There is probably a better graphics oriented approach, but you can do this with base plot and by treating your data as a raster:
library(raster)
# Example data as a plain numeric matrix (cbind), one row per respondent.
df_1 <- cbind("Thriving" = c(2.33, 4.21, 6.37, 5.28, 4.87, 3.92, 4.16, 5.53), "Satisfaction" = c(3.45, 4.53, 6.01, 3.87, 2.92, 4.50, 5.89, 4.72), "Wellbeing" = c(2.82, 3.45, 5.23, 3.93, 6.18, 4.22, 3.68, 4.74), "id" = c(1:8))
# One raster cell per respondent/variable pair (3 rows of variables).
r <- raster(ncol=nrow(df_1), nrow=3, xmn=0, xmx=8, ymn=0, ymx=3)
values(r) <- as.vector(as.matrix(df_1[,1:3]))
# FIX: spell out TRUE/FALSE — T and F are reassignable variables, not keywords.
plot(r, axes=FALSE, box=FALSE, asp=NA)
axis(1, at=seq(-0.5, 8.5, 1), 0:9)
axis(2, at=seq(-0.5, 3.5, 1), c("", colnames(df_1)), las=1)

How to set different colors in different ranges of one single line in R?

I am now facing a problem about how to make a moving-average crossover plot in R. I added ma5 and ma20 as two moving-average plots based on my price data.
It is my sample code here..
library("TTR")
library(ggplot2)
# Daily closing prices (108 trading days, aligned with the 'date' vector below).
price<- c(3.23, 3.29, 3.29 , 3.21, 3.19, 3.18, 3.11, 3.21, 3.25,
3.40, 3.39, 3.28, 3.31 , 3.32, 3.21, 3.19, 3.16, 3.20,
3.26, 3.30, 3.42, 3.44, 3.40, 3.41, 3.59, 3.83, 3.70,
3.86, 3.95, 3.89, 3.94, 3.78, 3.69, 3.74, 3.67, 3.69,
3.69, 3.61, 3.64, 3.83, 3.88, 3.98, 3.98, 3.86, 3.87,
3.93, 4.05, 3.97, 3.90, 3.93, 4.00, 3.85, 3.81, 4.20,
4.17, 4.05, 3.95, 3.96, 3.97, 3.96, 3.88, 3.85, 3.79,
3.83, 3.68, 3.72, 3.73, 3.81, 3.80, 3.81, 3.75, 3.87,
3.90, 3.89, 3.86, 3.81, 3.86, 3.78, 3.83, 3.87, 3.91,
4.05, 4.07, 4.02, 4.01, 4.00, 4.13, 4.07, 4.11, 4.26,
4.33, 4.32, 4.39, 4.30, 4.39, 4.68, 4.69, 4.70, 4.60,
4.71, 4.81, 4.73, 4.78, 4.64, 4.64, 4.64, 4.61, 4.44)
# Trading dates as character strings ("2004-01-23" .. "2004-07-02");
# note they are NOT converted to Date, so ggplot treats the axis as discrete.
date<- c("2004-01-23", "2004-01-26", "2004-01-27", "2004-01-28",
"2004-02-02", "2004-02-03", "2004-02-04", "2004-02-05",
"2004-02-06", "2004-02-11", "2004-02-12", "2004-02-13",
"2004-02-17", "2004-02-18", "2004-02-19", "2004-02-20",
"2004-02-23", "2004-02-24", "2004-02-25", "2004-02-26",
"2004-02-27", "2004-03-01", "2004-03-02", "2004-03-03",
"2004-03-04", "2004-03-05", "2004-03-08", "2004-03-09",
"2004-03-10", "2004-03-11", "2004-03-12", "2004-03-15",
"2004-03-16", "2004-03-17", "2004-03-18", "2004-03-19",
"2004-03-22", "2004-03-23", "2004-03-24", "2004-03-25",
"2004-03-26", "2004-03-29", "2004-03-30", "2004-03-31",
"2004-04-01", "2004-04-02", "2004-04-05", "2004-04-06",
"2004-04-07", "2004-04-08", "2004-04-12", "2004-04-13",
"2004-04-14", "2004-04-15", "2004-04-16", "2004-04-19",
"2004-04-20", "2004-04-21", "2004-04-22", "2004-04-23",
"2004-04-26", "2004-04-27", "2004-04-28", "2004-04-29",
"2004-04-30", "2004-05-03", "2004-05-04", "2004-05-05",
"2004-05-06", "2004-05-07", "2004-05-10", "2004-05-11",
"2004-05-12", "2004-05-13", "2004-05-14", "2004-05-17",
"2004-05-18", "2004-05-19", "2004-05-20", "2004-05-21",
"2004-05-24", "2004-05-25", "2004-05-26", "2004-05-27",
"2004-05-28", "2004-06-01", "2004-06-02", "2004-06-03",
"2004-06-04", "2004-06-07", "2004-06-08", "2004-06-09",
"2004-06-10", "2004-06-14", "2004-06-15", "2004-06-16",
"2004-06-17", "2004-06-18", "2004-06-21", "2004-06-22",
"2004-06-23", "2004-06-24", "2004-06-25", "2004-06-28",
"2004-06-29", "2004-06-30", "2004-07-01", "2004-07-02")
# 5-day and 20-day simple moving averages of the price series.
price5 <- SMA(price, n = 5)
price20 <- SMA(price, n = 20)
pricedf <- data.frame(date, price5, price20, price)
# Plot both moving averages against date; group = 1 joins the discrete
# date categories into one continuous line per series.
ggplot(pricedf, aes(date)) +
  geom_line(group = 1, aes(y = price5, colour = "ma5")) +
  geom_line(group = 1, aes(y = price20, colour = "ma20")) +
  xlab("Date") +
  ylab("Price")
There are a couple of crossovers on this plot. What I want is: when ma5 is above ma20, mark the 'price' series (one column in my pricedf) as a green line; on the other hand, when ma5 is under ma20, mark the 'price' series as a red line.
The example plot looks like this picture,
I was thinking subtract price5 to price20 and compare whether the values are greater than 0. But how can I draw them on another plot with different colors?
Here is how I solved it.
library("TTR")
library(ggplot2)
# Daily closing prices (same data as in the question above).
price<- c(3.23, 3.29, 3.29 , 3.21, 3.19, 3.18, 3.11, 3.21, 3.25,
3.40, 3.39, 3.28, 3.31 , 3.32, 3.21, 3.19, 3.16, 3.20,
3.26, 3.30, 3.42, 3.44, 3.40, 3.41, 3.59, 3.83, 3.70,
3.86, 3.95, 3.89, 3.94, 3.78, 3.69, 3.74, 3.67, 3.69,
3.69, 3.61, 3.64, 3.83, 3.88, 3.98, 3.98, 3.86, 3.87,
3.93, 4.05, 3.97, 3.90, 3.93, 4.00, 3.85, 3.81, 4.20,
4.17, 4.05, 3.95, 3.96, 3.97, 3.96, 3.88, 3.85, 3.79,
3.83, 3.68, 3.72, 3.73, 3.81, 3.80, 3.81, 3.75, 3.87,
3.90, 3.89, 3.86, 3.81, 3.86, 3.78, 3.83, 3.87, 3.91,
4.05, 4.07, 4.02, 4.01, 4.00, 4.13, 4.07, 4.11, 4.26,
4.33, 4.32, 4.39, 4.30, 4.39, 4.68, 4.69, 4.70, 4.60,
4.71, 4.81, 4.73, 4.78, 4.64, 4.64, 4.64, 4.61, 4.44)
# Trading dates as character strings (same data as in the question above).
date<- c("2004-01-23", "2004-01-26", "2004-01-27", "2004-01-28",
"2004-02-02", "2004-02-03", "2004-02-04", "2004-02-05",
"2004-02-06", "2004-02-11", "2004-02-12", "2004-02-13",
"2004-02-17", "2004-02-18", "2004-02-19", "2004-02-20",
"2004-02-23", "2004-02-24", "2004-02-25", "2004-02-26",
"2004-02-27", "2004-03-01", "2004-03-02", "2004-03-03",
"2004-03-04", "2004-03-05", "2004-03-08", "2004-03-09",
"2004-03-10", "2004-03-11", "2004-03-12", "2004-03-15",
"2004-03-16", "2004-03-17", "2004-03-18", "2004-03-19",
"2004-03-22", "2004-03-23", "2004-03-24", "2004-03-25",
"2004-03-26", "2004-03-29", "2004-03-30", "2004-03-31",
"2004-04-01", "2004-04-02", "2004-04-05", "2004-04-06",
"2004-04-07", "2004-04-08", "2004-04-12", "2004-04-13",
"2004-04-14", "2004-04-15", "2004-04-16", "2004-04-19",
"2004-04-20", "2004-04-21", "2004-04-22", "2004-04-23",
"2004-04-26", "2004-04-27", "2004-04-28", "2004-04-29",
"2004-04-30", "2004-05-03", "2004-05-04", "2004-05-05",
"2004-05-06", "2004-05-07", "2004-05-10", "2004-05-11",
"2004-05-12", "2004-05-13", "2004-05-14", "2004-05-17",
"2004-05-18", "2004-05-19", "2004-05-20", "2004-05-21",
"2004-05-24", "2004-05-25", "2004-05-26", "2004-05-27",
"2004-05-28", "2004-06-01", "2004-06-02", "2004-06-03",
"2004-06-04", "2004-06-07", "2004-06-08", "2004-06-09",
"2004-06-10", "2004-06-14", "2004-06-15", "2004-06-16",
"2004-06-17", "2004-06-18", "2004-06-21", "2004-06-22",
"2004-06-23", "2004-06-24", "2004-06-25", "2004-06-28",
"2004-06-29", "2004-06-30", "2004-07-01", "2004-07-02")
price5 <- SMA(price, n = 5)
price20 <- SMA(price, n = 20)
pricedf <- data.frame(date, price5, price20, price)
# Colour flag: green where the fast MA is above the slow MA, red otherwise.
coldf <- ifelse(price5 - price20 > 0, 'green', 'red')
coldf[is.na(coldf)] <- 'green'  # leading NAs come from the 20-day SMA window
coldf
# FIX: aes(color = coldf) maps the strings through the default discrete
# palette, so the lines were NOT actually green/red; scale_color_identity()
# tells ggplot to use the literal colour names.
ggplot(pricedf) +
  geom_line(aes(x = date, y = price, group = 1, color = coldf)) +
  scale_color_identity() +
  xlab("Date") +
  ylab("Price")
Which creates this
graph,
I used an ifelse statement to find where price5 is greater than price20. The problem is that this creates NAs, which I filled with green. I am not 100% sure which way you wanted it to be in terms of the green and the red. You can simply change the
coldf <- ifelse(price5 - price20 > 0, 'green', 'red')
to
coldf <- ifelse(price5 - price20 > 0, 'red', 'green')
Which looks like graph2.

texreg: How to save space in htmlreg-regression tables?

Is there a way to reduce the vertical size of an htmlreg table? I have several models with about 10 or more IVs, so at the moment I need an entire page to present my regression results. I would like to save some lines by reporting the SD or SE (in parentheses) inline (next to) the coefficients. The straightforward way is creating output tables in LaTeX by hand. Is there an easy solution (a more elegant way)?
library(texreg)

# Toy regression: log-weight on log-length for 15 alligators.
# FIX: use <- for assignment (idiomatic R) and consistent spacing.
alligator <- data.frame(
  lnLength = c(3.87, 3.61, 4.33, 3.43, 3.81, 3.83, 3.46, 3.76,
               3.50, 3.58, 4.19, 3.78, 3.71, 3.73, 3.78),
  lnWeight = c(4.87, 3.93, 6.46, 3.33, 4.38, 4.70, 3.50, 4.50,
               3.58, 3.64, 5.90, 4.43, 4.38, 4.42, 4.25)
)
alli.mod <- lm(lnWeight ~ lnLength, data = alligator)
# Write the regression table to an HTML file (side effect).
htmlreg(list(alli.mod),
        file = "MWE_regression.html",
        caption = "MWE Regression",
        caption.above = TRUE,
        include.rs = TRUE,
        include.adjrs = FALSE,
        digits = 3,
        stars = c(0.01, 0.05, 0.1))
Thanks :)
Update The amazing, simple and elegant solution is using the stargazer-package. Quite new: http://www.r-statistics.com/2013/01/stargazer-package-for-beautiful-latex-tables-from-r-statistical-models-output/ this package can export wonderful latex-tables, much much better than the texreg.
If you'd still like to accomplish this with texreg and its htmlreg function, just use the argument single.row = TRUE. Here is your full example:
library(texreg)

# Same toy regression; single.row = TRUE puts the standard error in
# parentheses on the same line as each coefficient, halving table height.
alligator <- data.frame(
  lnLength = c(3.87, 3.61, 4.33, 3.43, 3.81, 3.83, 3.46, 3.76,
               3.50, 3.58, 4.19, 3.78, 3.71, 3.73, 3.78),
  lnWeight = c(4.87, 3.93, 6.46, 3.33, 4.38, 4.70, 3.50, 4.50,
               3.58, 3.64, 5.90, 4.43, 4.38, 4.42, 4.25)
)
alli.mod <- lm(lnWeight ~ lnLength, data = alligator)
htmlreg(
  list(alli.mod),
  single.row = TRUE,
  file = "MWE_regression.html",
  caption = "MWE Regression",
  caption.above = TRUE,
  include.rs = TRUE,
  include.adjrs = FALSE,
  digits = 3,
  stars = c(0.01, 0.05, 0.1)
)
Your original result is on the left, the new output on the right:
Use the texreg function instead of htmlreg if you are interested in LaTeX output rather than HTML, as mentioned in your additional comment.
Edit: The HTML output now looks a bit nicer with more recent versions of texreg.

How to fit known values to a known curve by changing x-axis

This is a continuum to a Cross-validated question where I asked about plausible methods for the problem. This question is more programming orientated, so I post it here on SO.
Background
I have a curve with known dates spanning over a year. The y-values of this curve are predictions for d18O values calculated from daily temperature and salinity records. I also have measured d18O values from a shell consisting of calcium carbonate. These values are measured along distance axis, where the first and the last measurement takes place approximately (but not exactly) at the same time than the beginning and the end of the curve.
It is known that d18O values match with the predicted values in the curve within some unknown random error. I want to get the best fit for the measured values to the curve by changing the x-axis for the measured values (or at least by matching the index with the index in the curve). In this way I can get estimates for the dates of the measured values and can further estimate the growth rate for the shell over the year. The growth rate is expected to be variable and there might be a growth hiatus (i.e. the growth stops). However, the growth between the measured values has to be > 0 (a constraint).
Example data
Here are the example datasets (curve and measured):
# Measured d18O values: 10 points along a distance axis (mm) on the shell.
meas <- structure(list(index = 1:10, distance = c(0.1, 1, 3, 5, 7, 8,
13, 20, 22, 25), value = c(3.5, 4.2, 4.5, 4.4, 4.7, 4.8, 5.1,
4.9, 4.1, 3.7)), .Names = c("index", "distance", "value"), class = "data.frame",
row.names = c(NA, -10L))
# Predicted d18O curve: one value per day for 367 consecutive days.
# 'date' is stored as days-since-epoch integers wrapped in class "Date".
curve <- structure(list(date = structure(c(15218, 15219, 15220, 15221,
15222, 15223, 15224, 15225, 15226, 15227, 15228, 15229, 15230,
15231, 15232, 15233, 15234, 15235, 15236, 15237, 15238, 15239,
15240, 15241, 15242, 15243, 15244, 15245, 15246, 15247, 15248,
15249, 15250, 15251, 15252, 15253, 15254, 15255, 15256, 15257,
15258, 15259, 15260, 15261, 15262, 15263, 15264, 15265, 15266,
15267, 15268, 15269, 15270, 15271, 15272, 15273, 15274, 15275,
15276, 15277, 15278, 15279, 15280, 15281, 15282, 15283, 15284,
15285, 15286, 15287, 15288, 15289, 15290, 15291, 15292, 15293,
15294, 15295, 15296, 15297, 15298, 15299, 15300, 15301, 15302,
15303, 15304, 15305, 15306, 15307, 15308, 15309, 15310, 15311,
15312, 15313, 15314, 15315, 15316, 15317, 15318, 15319, 15320,
15321, 15322, 15323, 15324, 15325, 15326, 15327, 15328, 15329,
15330, 15331, 15332, 15333, 15334, 15335, 15336, 15337, 15338,
15339, 15340, 15341, 15342, 15343, 15344, 15345, 15346, 15347,
15348, 15349, 15350, 15351, 15352, 15353, 15354, 15355, 15356,
15357, 15358, 15359, 15360, 15361, 15362, 15363, 15364, 15365,
15366, 15367, 15368, 15369, 15370, 15371, 15372, 15373, 15374,
15375, 15376, 15377, 15378, 15379, 15380, 15381, 15382, 15383,
15384, 15385, 15386, 15387, 15388, 15389, 15390, 15391, 15392,
15393, 15394, 15395, 15396, 15397, 15398, 15399, 15400, 15401,
15402, 15403, 15404, 15405, 15406, 15407, 15408, 15409, 15410,
15411, 15412, 15413, 15414, 15415, 15416, 15417, 15418, 15419,
15420, 15421, 15422, 15423, 15424, 15425, 15426, 15427, 15428,
15429, 15430, 15431, 15432, 15433, 15434, 15435, 15436, 15437,
15438, 15439, 15440, 15441, 15442, 15443, 15444, 15445, 15446,
15447, 15448, 15449, 15450, 15451, 15452, 15453, 15454, 15455,
15456, 15457, 15458, 15459, 15460, 15461, 15462, 15463, 15464,
15465, 15466, 15467, 15468, 15469, 15470, 15471, 15472, 15473,
15474, 15475, 15476, 15477, 15478, 15479, 15480, 15481, 15482,
15483, 15484, 15485, 15486, 15487, 15488, 15489, 15490, 15491,
15492, 15493, 15494, 15495, 15496, 15497, 15498, 15499, 15500,
15501, 15502, 15503, 15504, 15505, 15506, 15507, 15508, 15509,
15510, 15511, 15512, 15513, 15514, 15515, 15516, 15517, 15518,
15519, 15520, 15521, 15522, 15523, 15524, 15525, 15526, 15527,
15528, 15529, 15530, 15531, 15532, 15533, 15534, 15535, 15536,
15537, 15538, 15539, 15540, 15541, 15542, 15543, 15544, 15545,
15546, 15547, 15548, 15549, 15550, 15551, 15552, 15553, 15554,
15555, 15556, 15557, 15558, 15559, 15560, 15561, 15562, 15563,
15564, 15565, 15566, 15567, 15568, 15569, 15570, 15571, 15572,
15573, 15574, 15575, 15576, 15577, 15578, 15579, 15580, 15581,
15582, 15583, 15584), class = "Date"), index = 1:367, value = c(3.33,
3.35, 3.36, 3.38, 3.4, 3.42, 3.43, 3.45, 3.47, 3.48, 3.5, 3.52,
3.53, 3.55, 3.56, 3.58, 3.6, 3.61, 3.63, 3.64, 3.66, 3.67, 3.69,
3.7, 3.72, 3.73, 3.75, 3.76, 3.78, 3.79, 3.81, 3.82, 3.83, 3.85,
3.86, 3.88, 3.89, 3.9, 3.92, 3.93, 3.94, 3.96, 3.97, 3.98, 3.99,
4.01, 4.02, 4.03, 4.04, 4.06, 4.07, 4.08, 4.09, 4.1, 4.11, 4.13,
4.14, 4.15, 4.16, 4.17, 4.18, 4.19, 4.2, 4.21, 4.22, 4.23, 4.24,
4.25, 4.26, 4.27, 4.28, 4.28, 4.29, 4.3, 4.31, 4.32, 4.33, 4.33,
4.34, 4.35, 4.36, 4.36, 4.37, 4.38, 4.38, 4.39, 4.4, 4.41, 4.41,
4.42, 4.42, 4.43, 4.44, 4.44, 4.45, 4.45, 4.46, 4.46, 4.47, 4.47,
4.47, 4.48, 4.48, 4.49, 4.49, 4.49, 4.5, 4.5, 4.5, 4.51, 4.51,
4.51, 4.52, 4.52, 4.53, 4.53, 4.53, 4.54, 4.54, 4.54, 4.55, 4.55,
4.56, 4.57, 4.57, 4.58, 4.58, 4.59, 4.6, 4.61, 4.61, 4.62, 4.63,
4.64, 4.64, 4.65, 4.66, 4.67, 4.67, 4.68, 4.69, 4.7, 4.7, 4.71,
4.72, 4.72, 4.73, 4.74, 4.74, 4.75, 4.75, 4.75, 4.76, 4.76, 4.76,
4.76, 4.76, 4.76, 4.76, 4.76, 4.76, 4.75, 4.75, 4.75, 4.75, 4.74,
4.74, 4.73, 4.73, 4.73, 4.72, 4.72, 4.72, 4.71, 4.71, 4.71, 4.71,
4.7, 4.7, 4.7, 4.71, 4.71, 4.71, 4.71, 4.72, 4.72, 4.73, 4.74,
4.75, 4.75, 4.76, 4.78, 4.79, 4.8, 4.81, 4.82, 4.83, 4.84, 4.85,
4.86, 4.88, 4.89, 4.9, 4.91, 4.92, 4.92, 4.93, 4.94, 4.95, 4.95,
4.95, 4.96, 4.96, 4.96, 4.96, 4.96, 4.95, 4.95, 4.95, 4.94, 4.93,
4.92, 4.92, 4.91, 4.9, 4.89, 4.88, 4.87, 4.86, 4.85, 4.84, 4.83,
4.82, 4.8, 4.79, 4.78, 4.77, 4.76, 4.75, 4.75, 4.74, 4.73, 4.72,
4.72, 4.71, 4.71, 4.71, 4.7, 4.7, 4.7, 4.7, 4.7, 4.7, 4.7, 4.7,
4.7, 4.7, 4.7, 4.7, 4.7, 4.69, 4.69, 4.69, 4.69, 4.69, 4.69,
4.69, 4.69, 4.68, 4.68, 4.68, 4.67, 4.67, 4.67, 4.66, 4.65, 4.65,
4.64, 4.63, 4.62, 4.61, 4.6, 4.59, 4.58, 4.57, 4.56, 4.55, 4.54,
4.53, 4.51, 4.5, 4.49, 4.48, 4.47, 4.46, 4.45, 4.43, 4.42, 4.41,
4.4, 4.39, 4.38, 4.37, 4.36, 4.35, 4.34, 4.33, 4.32, 4.32, 4.31,
4.3, 4.29, 4.28, 4.28, 4.27, 4.26, 4.25, 4.24, 4.24, 4.23, 4.22,
4.21, 4.21, 4.2, 4.19, 4.18, 4.17, 4.17, 4.16, 4.15, 4.14, 4.14,
4.13, 4.12, 4.12, 4.11, 4.1, 4.09, 4.08, 4.08, 4.07, 4.06, 4.05,
4.05, 4.04, 4.03, 4.02, 4.02, 4.01, 4, 4, 3.99, 3.98, 3.97, 3.97,
3.96, 3.95, 3.94, 3.94, 3.93, 3.92, 3.92, 3.91, 3.9, 3.9, 3.89,
3.88)), .Names = c("date", "index", "value"), row.names = c(NA,
-367L), class = "data.frame")
...and here is how it looks like:
library(ggplot2)
library(scales)
library(gridExtra)
# Predicted curve over the year, with month labels on the x-axis.
p.curve <- ggplot() +
  geom_line(data = curve, aes(x = date, y = value)) +
  scale_x_date(name = "Month", breaks = date_breaks("months"),
               labels = date_format("%b")) +
  labs(title = "curve")
# Measured points along the distance axis of the shell.
p.meas <- ggplot(meas, aes(x = distance, y = value)) +
  geom_point(color = "red") +
  labs(title = "measured", x = "Distance (mm)")
grid.arrange(p.curve, p.meas, ncol = 1)
The problem in practice
I want to find a mathematical/statistical method for R to fit meas to curve by changing the x-axis for meas. In addition I want to get some kind of goodness of fit statistics to compare the fitted "x-axes" among each other (in case I run several models with different constraints). I call the "x-axis model" a growth model, because that is what it essentially is. I want to constrain the fitting by specifying that distance between meas values has to be > 0. i.e. Measvalue with index == 2 has to occur after the value with index == 1. I also want to be able to constrain the growth rate (i.e. the maximum distance between two adjacent index points). To demonstrate this I will do it manually:
ggplot() + geom_line(data = curve, aes(x = index, y = value)) + geom_line(data = meas, aes(x = index, y = value), color = "red", linetype = 2) + scale_x_continuous(breaks = seq(0,370,10)) + scale_y_continuous(breaks = seq(3,5,0.1))
First, some of the indices in meas(red dashed line) have to be anchored to the indices of curve(black line). I choose to anchor the first and the last point plus the point with the highest value.
# Pin three measured indices (first, peak, last) to chosen curve indices.
anchor <- data.frame(meas.index = c(1, 7, 10), curve.index = c(11, 215, 367))
# Left-join the anchors onto the measurements, then order by distance.
example.fit <- merge(meas, anchor, by.x = "index", by.y = "meas.index",
                     all = TRUE, sort = FALSE)
example.fit <- example.fit[order(example.fit$distance), ]
Then, I assume a linear growth between these anchored points. The growth will be along curve indices. Curve has one value per day. Hence the growth will be on a daily scale.
library(zoo)
# Linearly interpolate the curve indices between anchored points
# (na.approx fills the NAs), then snap to whole days by rounding.
example.fit$curve.index <- round(na.approx(example.fit$curve.index),0)
After this I replace the indices with dates and plot the results.
library(plyr)
# Translate each interpolated curve index into its calendar date.
example.fit$date <- as.Date(mapvalues(example.fit$curve.index, from = curve$index, to = as.character(curve$date)))
# (a) fitted measurements overlaid on the predicted curve by date;
a <- ggplot() + geom_line(data = curve, aes(x = date, y = value)) + geom_point(data = example.fit, aes(x = date, y = value), color = "red") + scale_x_date(limits = range(curve$date), name = "Month", breaks = date_breaks("months"), labels = date_format("%b"))
# (b) implied growth: cumulative distance along the shell vs date.
b <- ggplot(example.fit, aes(x = date, y = distance)) + geom_line() + scale_x_date(limits = range(curve$date), name = "Month", breaks = date_breaks("months"), labels = date_format("%b"))
grid.arrange(a,b)
The plot above shows the resulting fit, which is based on three anchor points. The plot below shows the modeled growth over time at a daily interval. The bend in the growth curve at the beginning of March is some funny mathematical artifact I do not understand (due to the na.approx function from the zoo package).
What have I tried
From my previous question I learned that dynamic time warping could be a solution. I also found an R package, which contains dtw functions. Nice. Dynamic time warping, indeed, worked for my example dataset in that question (except for setting the constraint), but I cannot get it to work for this dataset, where curve has much more data points than meas (called points in the previous question). I will try to save some space and will not copy the code/figures here. You can see what I have tried in my answer to that question. The problem seems to be that none of the step patterns, except for the simplest one, can handle these type of data. The simplest step-pattern matches the measured values several times to the curve, which is something I want to avoid, because I need defined dates for each measurement point. Also setting the constraint that growth rate has to be >0 between measurement points seems difficult.
Question
My question is twofold: first, would there be a better method to solve the problem than dynamic time warping? Second, how do I do this in practice in R?
EDITS 9. Dec 2013 I tried to make the question clearer.
I'm not sure I understand 100% what the objective is, but if you're looking to fit the measured points to the reference curve then using dtw seems sensible. Fitting the 10 measured points to the 370-odd curve points does give a slightly strange result (which is just the optimization with the symmetric step.pattern). I think that's largely a function of the small number of points.
One option which may help is to use ggplot() (or another function) to smooth the measured curve and provide some additional points for matching. But obviously it can only do so much depending on the limitation of the measured points. With so few points you might lose information in the process of fitting your data.
If you could trim curve to be exactly contemporaneous with the first and last point of the meas observations, that would also help since you're matching with open.begin and open.end FALSE, but I'm not sure whether the exact dates are available.
This shows smoothing meas out to 80 points, and mapping the 10-point raw data and 80-point smooth to the reference curve
require(ggplot2)
require(scales)
require(gridExtra)
require(dtw)
require(plyr)
# Smooth the 10-point measured series with ggplot's default smoother
# (a blue geom_smooth curve on top of the raw line).
meas.plot.smooth <- ggplot(meas, aes(x = distance, y = value)) +
  geom_line() +
  labs(title = "ggplot smoothed (blue curve)") +
  geom_smooth()
# Pull the smoothed points back out of the built plot object
# (layer 2 is the geom_smooth layer).
meas.curve.smooth <- ggplot_build(meas.plot.smooth)$data[[2]]
# Align the 10 raw measurements to the 367-point curve with DTW
# (symmetric1 step pattern; keep=T retains the alignment for inspection).
orig.align<-dtw(meas$value,curve$value,keep=T,step.pattern=symmetric1)
# Count how many curve points each measured point was matched to.
orig.freqs<-count(orig.align$index1)
# reference the matching points (which are effectively dates)
orig.freqs$cumsum<-cumsum(orig.freqs$freq)
# Overlay the mapped measurements (red, labelled by meas index) on the curve.
g.10<-ggplot() + geom_line(data = curve, aes(x = date, y = value)) +
geom_line(aes(x = curve[orig.freqs$cumsum,"date"], y = meas$value),color="red") +
geom_text(aes(x = curve[orig.freqs$cumsum,"date"], y = meas$value, label=orig.freqs$x),color="red",size=5) +
scale_x_date(name = "Month", breaks = date_breaks("months"), labels = date_format("%b")) +
labs(title = "Native 10 pt curve - dtw mapped")
# Repeat the DTW mapping, but using the loess-smoothed (80-point) series
# instead of the 10 raw measurements.
smooth.align<-dtw(meas.curve.smooth$y,curve$value,keep=T,step.pattern=symmetric1)
smooth.freqs<-count(smooth.align$index1)
smooth.freqs$cumsum<-cumsum(smooth.freqs$freq)
smooth.freqs$cumsum<-cumsum(smooth.freqs$freq)
g.80<-ggplot() + geom_line(data = curve, aes(x = date, y = value)) +
geom_line(aes(x = curve[smooth.freqs$cumsum,"date"], y = meas.curve.smooth$y),color="red") +
scale_x_date(name = "Month", breaks = date_breaks("months"), labels = date_format("%b")) +
geom_text(aes(x = curve[smooth.freqs$cumsum,"date"], y = meas.curve.smooth$y, label=smooth.freqs$x),color="red",size=3.5,position="jitter") +
labs(title = "80 point loess curve - dtw mapped")
grid.arrange(meas.plot.smooth,g.10,g.80,ncol=1)
EDIT
Obviously part of the problem is confidence intervals. I've included an example here to build a random curve within the standard error levels around the smoothed curve. As you can see, it's quite different to using the projected curve itself. I think the issue is that when you're trying to map 10 measures against a 370-point reference curve, unless they track extremely tightly, it's going to be difficult to get precise predictions.
# Draw one random curve uniformly between the smoother's ymin/ymax
# confidence band, and DTW-map that instead of the smoothed mean.
rand.align<-dtw(meas.curve.smooth$ymin+(meas.curve.smooth$ymax-meas.curve.smooth$ymin)*runif(length(meas.curve.smooth$ymin)),curve$value,keep=T,step.pattern=symmetric1)
rand.freqs<-count(rand.align$index1)
rand.freqs$cumsum<-cumsum(rand.freqs$freq)
g.rand<-ggplot() + geom_line(data = curve, aes(x = date, y = value)) +
geom_line(aes(x = curve[rand.freqs$cumsum,"date"], y = meas.curve.smooth$y),color="red") +
scale_x_date(name = "Month", breaks = date_breaks("months"), labels = date_format("%b")) +
geom_text(aes(x = curve[rand.freqs$cumsum,"date"], y = meas.curve.smooth$y, label=rand.freqs$x),color="red",size=3.5,position="jitter") +
labs(title = "Random curve within standard CI - dtw mapped")
grid.arrange(meas.plot.smooth,g.10,g.80,g.rand,ncol=1)
EDIT updated to include simulation.
OK - this is updated to run 1000 simulations. It creates curves for mapping which are randomised from within the 95% CI. I changed n to 10 (from 80) in the geom_smooth() function to try and preserve as much info as possible from the measured curve.
It models the cumulative growth (assuming linear growth between unmeasured days)
Not sure if it's completely useful, but provides a decent way of visualizing the uncertainty.
# One simulation run: draw a random measured curve inside the smoother's
# 95% CI, DTW-map it to the reference curve, and return the cumulative
# growth path (x = day index, growth = cumulative value*freq, scenario = i).
# Relies on globals meas.curve.smooth and curve; seeded by i for reproducibility.
get_scenario<-function(i){
set.seed(i)
# create random curve within the CI
rand.align<-dtw(meas.curve.smooth$ymin+(meas.curve.smooth$ymax-meas.curve.smooth$ymin)*runif(length(meas.curve.smooth$ymin)),curve$value,keep=T,step.pattern=symmetric1)
rand.freqs<-count(rand.align$index1)
rand.freqs$cumsum<-cumsum(rand.freqs$freq)
# Join the mapped frequencies back to the daily curve values.
growth.index<-data.frame(cumsum=curve$index,val=curve$value)
merged<-merge(growth.index,rand.freqs,by="cumsum")
return(data.frame(x=merged$cumsum,growth=cumsum(merged$val*merged$freq),scenario=i))
}
scenario.set <- ldply(lapply(1:1000,function(l)get_scenario(l)), data.frame)
# All simulated growth paths, coloured by scenario number.
g.s <- ggplot(scenario.set, aes(x, growth)) +
  # FIX: removed the stray leading comma inside aes(,group=...) — it passed
  # a missing argument as the first (x) aesthetic.
  geom_line(aes(group = scenario, color = scenario), alpha = 0.25) +
  scale_colour_gradient(low = "yellow", high = "orangered") +
  xlab("Days from start") + ylab("Cumulative Growth")
g.xmax <- max(scenario.set$x) # get the final day (or set to another day)
g.xmin <- g.xmax - 30 # thirty day window from end
b <- ggplot_build(g.s)
build.data <- b$data[[1]]
# y-range of the simulated paths inside the window, used to zoom the subplot.
ylims <- build.data[build.data$x <= g.xmax & build.data$x >= g.xmin, ]$y
g.subplot <- g.s +
  geom_point(aes(x, growth, color = scenario), alpha = 0.25, size = 5, position = "jitter") +
  coord_cartesian(xlim = c(g.xmin, g.xmax), ylim = c(min(ylims), max(ylims)))
grid.arrange(meas.plot.smooth, g.s, g.subplot, ncol = 1)
and here are some other ways of looking at the tail:
# Same overview plot, but zooming the subplot on the final 50 days.
g.s <- ggplot(scenario.set, aes(x, growth)) +
  # FIX: removed the stray leading comma inside aes(,group=...) — it passed
  # a missing argument as the first (x) aesthetic.
  geom_line(aes(group = scenario, color = scenario), alpha = 0.25) +
  scale_colour_gradient(low = "yellow", high = "orangered") +
  xlab("Days from start") + ylab("Cumulative Growth")
g.xmax <- max(scenario.set$x) # get the final day (or set to another day)
g.xmin <- g.xmax - 50 # fifty day window from end (FIX: comment said "thirty")
b <- ggplot_build(g.s)
build.data <- b$data[[1]]
# y-range of the simulated paths inside the window, used to zoom the subplot.
ylims <- build.data[build.data$x <= g.xmax & build.data$x >= g.xmin, ]$y
g.subplot <- g.s +
  geom_point(aes(x, growth, color = scenario), alpha = 0.25, size = 5, position = "jitter") +
  coord_cartesian(xlim = c(g.xmin, g.xmax), ylim = c(min(ylims), max(ylims)))
grid.arrange(meas.plot.smooth, g.s, g.subplot, ncol = 1)
# Two alternative views of the tail: weekly boxplots of the simulated
# growth, and a jittered point cloud with a min/max ribbon plus a smooth.
g.box<-ggplot(build.data)+
geom_boxplot(aes(x,y,group=cut(x,max(x)/7),fill=cut(x,max(x)/7)),alpha=0.5)+ # bucket by group
theme(legend.position="none")+
coord_cartesian(xlim=c(g.xmin,g.xmax),ylim=c(min(ylims)-50,max(ylims)+50))
# Per-day summary (max/min/mean) across all simulated paths.
build.data.sum<-ddply(build.data,.(x),summarise,ymax=max(y),ymin=min(y),mean=mean(y))
g.spots<-ggplot(build.data)+
geom_point(aes(x,y,color=group),size=10,alpha=0.25,position="jitter")+
theme(legend.position="none")+scale_colour_gradient(low = "yellow", high = "orangered")+
geom_ribbon(data=build.data.sum,aes(x,ymax=ymax,ymin=ymin),alpha=0.25)+
coord_cartesian(xlim=c(g.xmax-50,g.xmax+1),ylim=c(min(ylims)-50,max(ylims)+50))+geom_smooth(aes(x,y),n=max(build.data$x))
grid.arrange(g.box,g.spots,ncol=1)

Resources