For Loop and Appending to Vector in R

I am trying to create a for loop in R to fill a vector with forecasted values generated via the auto.arima function.
I am new to R, so I am not sure if this is done correctly.
The code I am using is the following:
library(dplyr)
library(forecast)
dfts <- ts(df$Price_REG1)
fc=c()
for (i in 0:7) {
  modArima <- auto.arima(dfts[0+(i*24):168+(i*24)])
  forecast <- forecast(modArima, h = 24)
  forecast_values <- forecast$mean
  fc <- append(fc, forecast_values)
}
I use longer sets in reality, but made it smaller here to make it more understandable.
What I am trying to achieve is to use the first week of data (168 hours in one week) to estimate the coefficients for the model. Then I want to put the generated predictions for the first 24 hours after the training set in the Vector fc.
I then want to move the window one day, re-estimate the coefficients, generate the forecasts for the following day, and save them into the vector.
I am a bit unsure about the dfts[0+(i*24):168+(i*24)] part, since df <- df[0:168] doesn't work and needs to be df <- df[0:168,]. But if I subset dfts the same way I get

Error in `[.default`(dfts, 0 + (i * 24):874 + (i * 24), ) :
  incorrect number of dimensions
EDIT:
Sample of Data:
structure(c(28.78, 28.45, 27.9, 27.52, 27.54, 26.55, 25.83, 25.07,
25.65, 26.15, 26.77, 27.4, 28.08, 28.69, 29.37, 29.97, 30.46,
30.39, 30.06, 29.38, 27.65, 27.33, 25.88, 24.81, 12.07, 13.13,
19.07, 21.12, 24.29, 26.27, 27.74, 28.39, 29.37, 29.95, 29.91,
29.96, 29.94, 29.94, 30.18, 30.96, 31.2, 30.98, 30.35, 29.27,
28.17, 28.02, 27.69, 24.39, 18.93, 9.98, 1.53, 0.14, 0.85, 9.92,
24.48, 26.68, 28.12, 28.58, 28.16, 28.78, 28.31, 28.44, 28.96,
29.86, 30.15, 30.07, 29.54, 29.11, 27.91, 27.03, 25.7, 22.04,
21.73, 15.95, 16.23, 6.45, 3.83, 4.03, 4.04, 19.07, 17.49, 24.18,
24.94, 25.11, 24.94, 24.95, 25.25, 26.33, 27.36, 28.88, 29.58,
29.42, 27.71, 27.4, 27.37, 25.77, 26.65, 27.13, 27.11, 27.42), tsp = c(1,
5.125, 24), class = "ts")
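Before the example, a note on the error itself: an object of class "ts" is a plain vector with attributes, not a matrix, so subsetting it with a trailing comma as in dfts[i:j, ] gives "incorrect number of dimensions". Separately, : binds tighter than + in R, so 0+(i*24):168+(i*24) parses as 0 + ((i*24):168) + (i*24), not as the window you intended. A minimal sketch of the original loop with both issues fixed (assuming a 168-point training window stepped forward 24 points at a time; R indexing is 1-based):
fc <- c()
for (i in 0:7) {
  train <- dfts[(1 + i*24):(168 + i*24)] # parenthesize both endpoints
  modArima <- auto.arima(train)
  fc <- append(fc, forecast(modArima, h = 24)$mean)
}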

Here is an example with the built-in data set AirPassengers of how to run a rolling forecast with package forecast.
The code below makes use of the time series functions:
- window to subset objects of class "ts";
- frequency and start to get those attributes.
The output vector is created beforehand, not extended in the loop with append.
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
data("AirPassengers", package = "datasets")
fc <- ts(
  data = rep(NA, length(AirPassengers)),
  start = start(AirPassengers),
  frequency = frequency(AirPassengers)
)
start <- start(AirPassengers)[1]
freq <- frequency(AirPassengers)
i_fc <- seq_len(freq)
fc[i_fc] <- AirPassengers[i_fc]
for (i in 1:11) {
  w <- window(AirPassengers, start = start + i - 1L, end = c(start + i - 1L, freq))
  modArima <- auto.arima(w)
  y <- forecast(modArima, h = freq)$mean
  i_fc <- i_fc + freq
  fc[i_fc] <- y
}
plot(cbind(AirPassengers, fc))
Created on 2022-12-20 with reprex v2.0.2
Edit
I believe that the code below forecasts the next day given a certain initial number of days.
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
fill_first_periods <- function(x, weeks = 1L, week_days) {
  if (missing(week_days)) week_days <- 7L
  fc <- ts(
    data = rep(NA, length(x)),
    start = start(x),
    frequency = frequency(x)
  )
  i_fc <- seq_len(frequency(x) * week_days * weeks)
  fc[i_fc] <- x[i_fc]
  fc
}
# not enough data to run an example for 1 week
# three days only
weeks <- 1L
week_days <- 3L
fc <- fill_first_periods(dfts, weeks = weeks, week_days)
n <- length(fc)
i_last <- length(fc[!is.na(fc)])
h <- frequency(fc)
curr_start <- start(fc)
curr_end <- c(curr_start[1] + weeks*week_days - 1L, frequency(fc))
for (i in 2:(end(fc)[1] - 1L)) {
  if (n - i_last < h) {
    h <- end(fc)[2]
    i_fc <- tail(seq_len(n), h)
  } else {
    i_fc <- (i_last + 1L):(i_last + h)
    i_last <- i_last + h
  }
  w <- window(dfts, start = curr_start, end = curr_end)
  modArima <- auto.arima(w)
  fc[i_fc] <- forecast(modArima, h = h)$mean
  curr_start[1] <- curr_start[1L] + 1L
  curr_end <- c(curr_end[1L] + 1L, h)
}
plot(cbind(dfts, fc))
Created on 2022-12-21 with reprex v2.0.2

Related

Missing value or an infinity produced when evaluating the model in R

I am trying to fit Planck's equation to a data frame but keep getting the error above.
# create data frame from provided data
data <- data.frame(x = c(2.27, 2.72, 3.18, 3.63, 4.08, 4.54, 4.99, 5.45, 5.90, 6.35, 6.81, 7.26, 7.71, 8.17, 8.62, 9.08, 9.53, 9.98, 10.44, 10.89, 11.34, 11.80, 12.25, 12.71, 13.16, 13.61, 14.07, 14.52, 14.97, 15.43, 15.88, 16.34, 16.79, 17.24, 17.70, 18.15, 18.61, 19.06, 19.51, 19.97, 20.42, 20.87, 21.33),
brightness = c(200.723, 249.508, 293.024, 327.770, 354.081, 372.079, 381.493, 383.478, 378.901, 368.833, 354.063, 336.278, 316.076, 293.924, 271.432, 248.239, 225.940, 204.327, 183.262, 163.830, 145.750, 128.835, 113.568, 99.451, 87.036, 75.876, 65.766, 57.008, 49.223, 42.267, 36.352, 31.062, 26.580, 22.644, 19.255, 16.391, 13.811, 11.716, 9.921, 8.364, 7.087, 5.801, 4.523))
freq <- (data$x)*29.9792458
planck <- function(freq, t, h, c, k) {
  (2 * h * freq^3) / (c^2 * (exp((h * freq) / (k * t)) - 1))
}
# fit the data using nls
library(stats)
h = 6.62607e-34
c = 3e8
k = 1.38065e-23
fit <- nls(brightness ~ planck(freq, t, h, c, k), start = list(t = 3), data = data)
# create a sequence of frequencies to use for the fitted curve
freq_seq <- seq(min(data$freq), max(data$freq), length.out = 100)
# use the predict function to generate the fitted curve
brightness_fit <- predict(fit, list(freq = freq_seq))
# create a new data frame with the fitted curve
fit_data <- data.frame(freq = freq_seq, brightness = brightness_fit)
# plot the data and the fitted curve
plot(data$freq, data$brightness, xlab = "Frequency (Hz)", ylab = "Brightness")
lines(fit_data$freq, fit_data$brightness, col = "red")
I am expecting it to create a scatter plot of the original data, with the x-axis as frequency, y-axis as brightness, and then add a line representing the fitted curve in red color.
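A likely culprit here is mixed units: freq is computed in GHz (x * 29.9792458) while h, c and k are in SI units, so the model values are many orders of magnitude away from the brightness data, and nls can wander into t values where exp() overflows, which produces exactly this error. There is also a smaller bug: freq is a standalone vector, so data$freq in the plotting code is NULL. A minimal sketch fixing both, under the assumption that x is a wavenumber in 1/cm and brightness is in MJy/sr (1 MJy = 1e-20 W m^-2 Hz^-1):
# convert wavenumber (1/cm) to frequency in Hz and keep it in the data frame
data$freq <- data$x * 2.99792458e10
planck_mjy <- function(freq, t) {
  h <- 6.62607e-34; c <- 2.99792458e8; k <- 1.38065e-23
  # SI Planck function rescaled to MJy/sr so model and data share a scale
  1e20 * (2 * h * freq^3) / (c^2 * (exp((h * freq) / (k * t)) - 1))
}
fit <- nls(brightness ~ planck_mjy(freq, t), start = list(t = 3), data = data)
coef(fit) # under these unit assumptions t should land near 2.7
freq_seq <- seq(min(data$freq), max(data$freq), length.out = 100)
plot(data$freq, data$brightness, xlab = "Frequency (Hz)", ylab = "Brightness")
lines(freq_seq, predict(fit, list(freq = freq_seq)), col = "red")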

First Derivative of Scatter Plot R

Hello, I am working with sigmoidal data and am attempting to plot two scatter plots on top of each other: the raw data and the first derivative of the raw data. My issue doesn't lie in plotting the data, but more in finding a function that will create an accurate representation of the first derivative.
What I have tried: creating a function that calculates the slope between the current and next point, (y2-y1)/(x2-x1), and assigning the value to the current temperature.
dput() of Data Frame:
structure(list(Temperature = c(4.98, 5.49, 6.01, 6.5, 7.02, 7.52, 8.03, 8.52, 9.03, 9.54, 10.04, 10.54, 11.05, 11.55, 12.05, 12.55, 13.05, 13.56, 14.06, 14.57, 15.07, 15.57, 16.07, 16.59, 17.08, 17.59, 18.08, 18.59, 19.09, 19.6, 20.1, 20.64, 21.12, 21.63, 22.13, 22.62, 23.13, 23.63, 24.13, 24.63, 25.11, 25.62, 26.11, 26.68, 27.19, 27.7, 28.2, 28.71, 29.21, 29.71, 30.21, 30.7, 31.21, 31.69, 32.19, 32.69, 33.19, 33.7, 34.19, 34.68, 35.19, 35.68, 36.19, 36.69, 37.19, 37.7, 38.19, 38.7, 39.2, 39.7, 40.21, 40.7, 41.22, 41.71, 42.21, 42.71, 43.21, 43.72, 44.22, 44.72, 45.22, 45.73, 46.23, 46.73, 47.23, 47.97, 48.71, 49.23, 49.74, 50.23, 50.73, 51.23, 51.73, 52.24, 52.75, 53.24, 53.75, 54.24, 54.75, 55.26, 55.75, 56.25, 56.75, 57.24, 57.75, 58.27, 58.77, 59.26, 59.77, 60.26, 60.78, 61.27, 61.79, 62.27, 62.77, 63.29, 63.79, 64.27, 64.78, 65.3, 65.8, 66.27, 66.8, 67.3, 67.8, 68.31, 68.78, 69.3, 69.8, 70.32, 70.81, 71.32, 71.81, 72.33, 72.82, 73.31, 73.83, 74.33, 74.82, 75.32, 75.83, 76.34, 76.84, 77.35, 77.82, 78.34, 78.85, 79.36, 79.84, 80.35, 80.85, 81.36, 81.86, 82.37, 82.86, 83.37, 83.88, 84.36, 84.88, 85.38, 85.88, 86.38, 86.89, 87.38, 87.89, 88.39, 88.89, 89.4, 89.9, 90.39, 90.9, 91.4, 91.91, 92.37, 92.89, 93.4, 93.91, 94.41, 94.91, 95.42), Absorbance = c(1.401351929, 1.403320313, 1.405181885, 1.406326294, 1.407440186, 1.409118652, 1.410095215, 1.410797119, 1.411560059, 1.412918091, 1.413970947, 1.414245605, 1.416000366, 1.415435791, 1.41809082, 1.4190979, 1.419677734, 1.420150757, 1.421966553, 1.420333862, 1.422637939, 1.422790527, 1.423461914, 1.426513672, 1.426315308, 1.426071167, 1.426467896, 1.428710938, 1.428070068, 1.428817749, 1.429733276, 1.432144165, 1.432434082, 1.433227539, 1.434616089, 1.435806274, 1.434814453, 1.436096191, 1.436096191, 1.436447144, 1.437896729, 1.4375, 1.438934326, 1.440139771, 1.440139771, 1.441741943, 1.442108154, 1.443969727, 1.444778442, 1.443862915, 1.444534302, 1.445648193, 1.444473267, 1.446395874, 1.447219849, 1.446151733, 1.449569702, 1.449066162, 1.448852539, 1.4503479, 1.451385498, 1.45111084, 1.451217651, 1.453125, 1.452560425, 1.455047607, 1.455093384, 1.456665039, 1.457977295, 1.457336426, 1.458648682, 1.46043396, 1.462158203, 1.464813232, 1.463531494, 1.468048096, 1.468643188, 1.470748901, 1.471878052, 1.476257324, 1.478057861, 1.482040405, 1.484466553, 1.486129761, 1.48815918, 1.496520996, 1.499786377, 1.504302979, 1.507217407, 1.512985229, 1.517471313, 1.524108887, 1.528198242, 1.534637451, 1.539169312, 1.546142578, 1.554611206, 1.55809021, 1.56854248, 1.572875977, 1.580307007, 1.585739136, 1.592514038, 1.600067139, 1.609222412, 1.616607666, 1.622375488, 1.631469727, 1.635635376, 1.642929077, 1.649780273, 1.655014038, 1.661483765, 1.663742065, 1.671859741, 1.677200317, 1.677108765, 1.683380127, 1.684082031, 1.687438965, 1.694595337, 1.694961548, 1.696685791, 1.696685791, 1.699768066, 1.702514648, 1.703613281, 1.705093384, 1.70022583, 1.707595825, 1.707962036, 1.709075928, 1.705276489, 1.71055603, 1.709259033, 1.70916748, 1.709732056, 1.710189819, 1.710281372, 1.711868286, 1.711883545, 1.713104248, 1.713760376, 1.711120605, 1.709716797, 1.711776733, 1.712814331, 1.714324951, 1.711120605, 1.713378906, 1.712432861, 1.716125488, 1.710006714, 1.710845947, 1.711502075, 1.711120605, 1.710006714, 1.70980835, 1.708602905, 1.708236694, 1.710189819, 1.707672119, 1.706939697, 1.710006714, 1.706192017, 1.706573486, 1.706207275, 1.705734253, 1.706207275, 1.705184937, 1.70954895, 1.705841064, 1.702972412, 1.703979492, 1.703063965, 1.709350586, 1.703338623, 
1.700408936, 1.705276489, 1.705368042)), row.names = 1621:1800, class = "data.frame")
Code For my Attempt
raw = "<insert dput line>>"
columns = c("Temperature","Absorbance")
first = data.frame(matrix(nrow=0,ncol=2))
colnames(dFrame) = columns
for (i in 1:nrow(raw)) {
if(i != nrow(raw)) {
cAbs = raw[i,2]
nextAbs = raw[i+1,2]
cT = raw[i,1]
nextT = raw[i+1,1]
Temperature = raw[i,1]
Absorbance =((nextAbs-cAbs)/(nextT-cT))
t <- data.frame(Temperature,Absorbance)
names(t) <- names(raw)
first <- rbind(first, t)
}
}
ggplot()+
geom_point(data=raw, aes(x=Temperature,y=Absorbance), color = "red") +
geom_point(data = first, aes(x=Temperature,y = Absorbance), color = "blue")
What I was expecting
I was expecting an output shaped something like the following:
library(dplyr); library(ggplot2)
df %>%
  arrange(Temperature) %>%
  mutate(slope = (Absorbance - lag(Absorbance)) /
           (Temperature - lag(Temperature))) %>%
  ggplot(aes(Temperature)) +
  geom_line(aes(y = Absorbance, color = "Absorbance"), size = 1.2) +
  geom_point(aes(y = slope * 20 + 1.4, color = "slope")) +
  geom_smooth(aes(y = slope * 20 + 1.4, color = "slope"), se = FALSE, size = 0.8) +
  scale_y_continuous(sec.axis = sec_axis(trans = ~ (.x - 1.4) / 20, name = "slope"))
If the data is even a little noisy, calculating the derivative by first differencing can be very noisy.
You can get a better estimate by fitting a smoothing spline function and calculating the derivative of the spline function. By differentiating a smooth function, you get a smooth derivative.
In most cases, smooth.spline with default arguments is fine, but I recommend taking a look at the result and possibly tuning the smooth.spline parameters for more or less smoothing, depending on your judgment.
edit: I learned this approach from the Numerical Recipes textbook.
library(tidyverse)
df <- tibble(
  x = seq(1, 15, by = 0.1),
  y = sin(x) + runif(length(x), -0.2, 0.2),
  d1_diff = c(NA, diff(y) / diff(x)),
  d1_spline = smooth.spline(x, y) %>% predict(x, deriv = 1) %>% pluck("y")
)
df %>%
  pivot_longer(-x) %>%
  mutate(name = factor(name, unique(name))) %>%
  ggplot() + aes(x, value, color = name) + geom_point() + geom_line() +
  facet_wrap(~name, ncol = 1)
#> Warning: Removed 1 rows containing missing values (geom_point).
#> Warning: Removed 1 row(s) containing missing values (geom_path).
Created on 2022-10-26 with reprex v2.0.2
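If the default amount of smoothing is not right for your data, the spar argument of smooth.spline is one knob to reach for (larger spar means a smoother curve, and hence a smoother derivative). A small sketch of manual tuning:
# explicit smoothing parameter instead of the automatic choice;
# inspect the derivative and adjust spar up or down to taste
fit <- smooth.spline(df$x, df$y, spar = 0.8)
d1_tuned <- predict(fit, df$x, deriv = 1)$y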

qqplot would not graph. Error in converting dataframe into a vector

data = read.csv("HeatofCombustion.csv", header=T)
attach(data)
library(lattice)
x = data[ , "Qc"]
qqplot(x = qexp(x), y = data, main = "Exponential Q-Q Plot",
       xlab = "Theoretical Quantiles", ylab = "Your Data Quantiles")
Errors:
Error in `[.data.frame`(x, order(x, na.last = na.last, decreasing = decreasing)) :
undefined columns selected
In addition: Warning messages:
1: In qexp(x) : NaNs produced
2: In xtfrm.data.frame(x) : cannot xtfrm data frames
Why does this happen? I thought I had already converted the data frame into a vector by using x = data[, "Qc"].
I am trying to graph an exponential Q-Q plot in R. Many thanks.
Actual data for the variable Qc (heat of combustion):
Qc = c(17.39, 6.68, 23.31, 47.74,
19.53, 45.8, 26.75, 26.86, 29.62, 28.39, 34.21, 43.65, 24.13,
31.37, 25.42, 27.91, 30.9, 31.07, 38.35, 29.18, 26.45, 25.27,
26.92, 24.97, 39.84, 29.38, 31.53, 31.06, 18.71, 29.92, 32.5,
31.07, 31.48, 31.23, 31.15, 31.65, 26.03, 28.61, 30.65, 34.39,
30.28, 30.63, 34.89, 26.5, 29.59, 29.06, 26.54, 25.92, 33.64)
This function is perhaps a little too fancy, but should do what you want. (The qfun.args/do.call nonsense is to allow you to include extra shape parameters for the target distribution, which doesn't seem to be necessary here — because of the way that Q-Q plots are assessed, changes in scale and location parameters don't affect their appearance much.)
It's basically just encapsulating and generalizing the chi-squared example shown in ?qqplot ... to generate the x-variable, you use ppoints() to generate an appropriate set of equally spaced quantile points, then the quantile (q*) function of your target distribution to convert those to theoretical quantiles.
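As for why the original call fails: qexp() is a quantile function, so it expects probabilities between 0 and 1; feeding it the raw Qc values (all well above 1) is what produces the NaNs, and passing the whole data frame as y is what triggers the xtfrm error. A minimal direct version along these lines:
# theoretical exponential quantiles at evenly spaced probabilities,
# plotted against the sorted data
qqplot(qexp(ppoints(length(Qc))), Qc,
       main = "Exponential Q-Q Plot",
       xlab = "Theoretical Quantiles", ylab = "Your Data Quantiles")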
qfun <- function(y, qfun = qnorm, qfun.args = NULL, ...) {
  n <- length(y)
  qqplot(do.call(qfun, c(list(ppoints(n)), qfun.args)),
         y, xlab = "", ...)
  qqline(y,
         distribution = function(p) do.call(qfun, c(list(p), qfun.args)),
         probs = c(0.1, 0.6), col = 2)
}
Try it out:
qfun(Qc, main = "Gaussian")
qfun(Qc, qexp, main = "Exponential")
library(VGAM)
qfun(Qc, qgumbel, main = "Gumbel")

how to calculate the radius of a trajectory with given 'x' and 'y' positions in R

I have trajectories of some moving objects in a data frame; for example, three trajectories are considered here, as shown below:
> d1 <- data.frame(X1 = c(86.46, 79.88, 73.63, 67.63, 61.78, 56.05, 50.40, 44.80, 39.25, 33.79, 28.49, 23.40, 18.53, 13.84, 9.31, 5.04, 1.12),
Y1 = c(4.28, 5.49, 6.80, 8.16, 9.59, 11.18, 13.05, 15.28, 17.86, 20.81, 24.12, 27.75, 31.68, 35.87, 40.31, 44.92, 49.61))
> d2 <- data.frame(X2 = c(0.32, 4.00, 8.00, 12.31, 16.87, 21.64, 26.60, 31.75, 37.08, 42.62, 48.36, 54.33, 60.59, 67.25, 74.48, 82.42),
Y2 = c(57.55, 52.67, 47.98, 43.49, 39.24, 35.26, 31.59, 28.24, 25.19, 22.42, 19.92, 17.65, 15.55, 13.54, 11.54, 9.47))
> d3 <- data.frame(X3 = c(0.04, 1.76, 3.61, 5.63, 7.89, 10.42, 13.19, 16.14, 19.25, 22.61, 26.29, 30.35, 34.83, 39.71, 44.97, 50.58, 56.47, 62.56, 68.79, 75.19, 81.82),
Y3 = c(58.34, 55.97, 53.49, 50.89, 48.15, 45.27, 42.35, 39.43, 36.50, 33.57, 30.66, 27.85, 25.18, 22.66, 20.27, 18.02, 15.94, 14.02, 12.22, 10.48, 8.83))
my.lists <- list(d1, d2, d3)
df12 <- do.call(qpcR:::cbind.na, my.lists)
Now I would like to calculate the radius of each trajectory in the df12 data frame.
From each trajectory we can get useful parameters such as arc length and chord length, as follows:
# for arc length
> library(geosphere) ## for calculating distance between successive points
# step 1:
> d11 <- data.frame(Distance = sqrt(diff(d1$X1)^2 + diff(d1$Y1)^2))
# step 2:
> d11$csum1 <- ave(d11$Distance, FUN = cumsum)
# step 3: for chord length
> sqrt((d1[1,1] - d1[17,1])^2 + (d1[1,2] - d1[17,2])^2)
Is there any approach to calculate the radius of a trajectory?
Thanks in advance.
It is not completely clear what you mean by "radius of a trajectory".
Here I assume you want to fit data points of a trajectory with a circle
that runs through these points. We will look at your first example.
library(pracma)
x <- d1$X1; y <- d1$Y1 # data points
res <- circlefit(x, y, fast=TRUE) # "fitting a circle"
res
## RMS error: 0.2696326
## [1] 93.85894 123.25466 118.51384
This computes a circle of radius r0 = 118.51384 and the center at
(93.85894, 123.25466) with an RMS error of about 0.27. To visualize:
x0 <- res[1]; y0 <- res[2]; r0 <- res[3] # center and radius
ts <- seq(0, 2*pi, length.out = 100)
xs <- x0 + r0*cos(ts); ys <- y0 + r0*sin(ts)
plot(xs, ys, type='l', col = "red", asp=1)
points(x, y); grid()
Do not use the fast = FALSE option in circlefit; the optimization process steered by optim() runs wild.
You can also apply the CircleFitBy...() functions in package conicfit -- maybe that's even the better alternative. It also depends on what your preferred 'measure of fit' is.
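For completeness, a sketch with conicfit (assuming the package is installed; I believe its CircleFitBy*() functions take an n x 2 matrix of points and return the center coordinates followed by the radius):
library(conicfit)
xy <- cbind(d1$X1, d1$Y1)    # trajectory points as an n x 2 matrix
res <- CircleFitByTaubin(xy) # algebraic (Taubin) circle fit
res                          # c(x0, y0, r0)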

Data Smoothing in R

This question is related to this one that I asked before. But referring to that question is not necessary to answer this one.
Data
I have a data set containing velocities of 2169 vehicles recorded at intervals of 0.1 seconds, so there are many rows for an individual vehicle. Here I am reproducing the data only for vehicle #2:
> dput(uma)
structure(list(Frame.ID = 13:445, Vehicle.velocity = c(40, 40,
40, 40, 40, 40, 40, 40.02, 40.03, 39.93, 39.61, 39.14, 38.61,
38.28, 38.42, 38.78, 38.92, 38.54, 37.51, 36.34, 35.5, 35.08,
34.96, 34.98, 35, 34.99, 34.98, 35.1, 35.49, 36.2, 37.15, 38.12,
38.76, 38.95, 38.95, 38.99, 39.18, 39.34, 39.2, 38.89, 38.73,
38.88, 39.28, 39.68, 39.94, 40.02, 40, 39.99, 39.99, 39.65, 38.92,
38.52, 38.8, 39.72, 40.76, 41.07, 40.8, 40.59, 40.75, 41.38,
42.37, 43.37, 44.06, 44.29, 44.13, 43.9, 43.92, 44.21, 44.59,
44.87, 44.99, 45.01, 45.01, 45, 45, 45, 44.79, 44.32, 43.98,
43.97, 44.29, 44.76, 45.06, 45.36, 45.92, 46.6, 47.05, 47.05,
46.6, 45.92, 45.36, 45.06, 44.96, 44.97, 44.99, 44.99, 44.99,
44.99, 45.01, 45.02, 44.9, 44.46, 43.62, 42.47, 41.41, 40.72,
40.49, 40.6, 40.76, 40.72, 40.5, 40.38, 40.43, 40.38, 39.83,
38.59, 37.02, 35.73, 35.04, 34.85, 34.91, 34.99, 34.99, 34.97,
34.96, 34.98, 35.07, 35.29, 35.54, 35.67, 35.63, 35.53, 35.53,
35.63, 35.68, 35.55, 35.28, 35.06, 35.09, 35.49, 36.22, 37.08,
37.8, 38.3, 38.73, 39.18, 39.62, 39.83, 39.73, 39.58, 39.57,
39.71, 39.91, 40, 39.98, 39.97, 40.08, 40.38, 40.81, 41.27, 41.69,
42.2, 42.92, 43.77, 44.49, 44.9, 45.03, 45.01, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 44.99, 45.03, 45.26, 45.83, 46.83,
48.2, 49.68, 50.95, 51.83, 52.19, 52, 51.35, 50.38, 49.38, 48.63,
48.15, 47.87, 47.78, 48.01, 48.63, 49.52, 50.39, 50.9, 50.96,
50.68, 50.3, 50.05, 49.94, 49.87, 49.82, 49.82, 49.88, 49.96,
50, 50, 49.98, 49.98, 50.16, 50.64, 51.43, 52.33, 53.01, 53.27,
53.22, 53.25, 53.75, 54.86, 56.36, 57.64, 58.28, 58.29, 57.94,
57.51, 57.07, 56.64, 56.43, 56.73, 57.5, 58.27, 58.55, 58.32,
57.99, 57.89, 57.92, 57.74, 57.12, 56.24, 55.51, 55.1, 54.97,
54.98, 55.02, 55.03, 54.86, 54.3, 53.25, 51.8, 50.36, 49.41,
49.06, 49.17, 49.4, 49.51, 49.52, 49.51, 49.45, 49.24, 48.84,
48.29, 47.74, 47.33, 47.12, 47.06, 47.07, 47.08, 47.05, 47.04,
47.25, 47.68, 47.93, 47.56, 46.31, 44.43, 42.7, 41.56, 41.03,
40.92, 40.92, 40.98, 41.19, 41.45, 41.54, 41.32, 40.85, 40.37,
40.09, 39.99, 39.99, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
39.98, 39.97, 40.1, 40.53, 41.36, 42.52, 43.71, 44.57, 45.01,
45.1, 45.04, 45, 45, 45, 45, 45, 45, 44.98, 44.97, 45.08, 45.39,
45.85, 46.2, 46.28, 46.21, 46.29, 46.74, 47.49, 48.35, 49.11,
49.63, 49.89, 49.94, 49.97, 50.14, 50.44, 50.78, 51.03, 51.12,
51.05, 50.85, 50.56, 50.26, 50.06, 50.1, 50.52, 51.36, 52.5,
53.63, 54.46, 54.9, 55.03, 55.09, 55.23, 55.35, 55.35, 55.23,
55.07, 54.99, 54.98, 54.97, 55.06, 55.37, 55.91, 56.66, 57.42,
58.07, 58.7, 59.24, 59.67, 59.95, 60.02, 60, 60, 60, 60, 60,
60.01, 60.06, 60.23, 60.65, 61.34, 62.17, 62.93, 63.53, 64, 64.41,
64.75, 65.04, 65.3, 65.57, 65.75, 65.74, 65.66, 65.62, 65.71,
65.91, 66.1, 66.26, 66.44, 66.61, 66.78, 66.91, 66.99, 66.91,
66.7, 66.56, 66.6, 66.83, 67.17, 67.45, 67.75, 68.15, 68.64,
69.15, 69.57, 69.79, 69.79, 69.72, 69.72, 69.81, 69.94, 70, 70.01,
70.02, 70.03)), .Names = c("Frame.ID", "Vehicle.velocity"), class = "data.frame", row.names = c(NA,
433L))
Frame.ID is the time frame in which the Vehicle.velocity was observed. There is some noise in the velocity variable and I want to smooth it.
Methodology
To smooth the velocity I am using the following equation:

smoothed x_alpha(t_i) = ( sum over k = i-D, ..., i+D of x_alpha(t_k) * exp(-|i-k|/delta) ) / Z_i
with Z_i = sum over k = i-D, ..., i+D of exp(-|i-k|/delta)

where,
delta = 10
Nalpha = number of data points (rows)
i = 1, ..., Nalpha (i.e. the row number)
D = minimum of {i-1, Nalpha - i, 3*delta = 30}
xalpha = velocity
Question
I have gone through the documentation of filter and convolution in R. It seems that I have to know about convolution to do this. However, I have tried my best and can't understand how convolution works! The linked question has an answer which helped me in understanding some of the inner workings in the function but I am still not sure.
Could anyone here on SO please explain how this thing works? Or guide me to an alternative methodology to achieve the same purpose i.e. apply the equation?
My current code which works but is lengthy
Here is what uma looks like:
> head(uma)
  Frame.ID Vehicle.velocity
1       13               40
2       14               40
3       15               40
4       16               40
5       17               40
6       18               40
uma$i <- 1:nrow(uma) # this is i
uma$im1 <- uma$i - 1
uma$Nai <- nrow(uma) - uma$i # this is Nalpha - i
uma$delta3 <- 30 # this is 3 times delta
uma$D <- pmin(uma$im1, uma$Nai, uma$delta3) # selecting the minimum of {i-1, Nalpha - i, 3*delta=30}
uma$imD <- uma$i - uma$D # i-D
uma$ipD <- uma$i + uma$D # i+D
uma <- ddply(uma, .(Frame.ID), transform, k = imD:ipD) # to include all k in the data frame
umai <- uma
umai$imk <- umai$i - umai$k # i-k
umai$aimk <- (-1) * abs(umai$imk) # -|i-k|
umai$delta <- 10
umai$kernel <- exp(umai$aimk/umai$delta) # The kernel in the equation i.e. EXP^-|i-k|/delta
umai$p <- umai$Vehicle.velocity[match(umai$k,umai$i)] #observed velocity in kth row as described in equation as t(k)
umai$kernelp <- umai$p * umai$kernel # the product of kernel and observed velocity in kth row as described in equation as t(k)
umair <- ddply(umai, .(Frame.ID), summarize, Z = sum(kernel), prod = sum(kernelp)) # summing the kernel to get Z and summing the product to get the numerator of the equation
umair$new.Y <- umair$prod/umair$Z # the final step to get the smoothed velocity
Plot
Just for reference, if I plot the observed and smoothed velocities against time frames we can see the result of smoothing:
ggplot() +
  geom_point(data = uma, aes(y = Vehicle.velocity, x = Frame.ID)) +
  geom_point(data = umair, aes(y = new.Y, x = Frame.ID), color = "red")
Please help me making my code short and applicable to all vehicles (represented by Vehicle.ID in the data set) by guiding me about use of convolution.
dplyr
Alright, so I used the following code and it works, but it takes 3 hours on 32 GB of RAM. Can anyone suggest improvements to speed it up (1 hour each is taken by umal, umav and umaa)?
uma <- tbl_df(uma)
uma <- uma %>% # take data frame
  group_by(Vehicle.ID) %>% # group by Vehicle ID
  mutate(i = 1:length(Frame.ID), im1 = i - 1, Nai = length(Frame.ID) - i,
         Dv = pmin(im1, Nai, 30),
         Da = pmin(im1, Nai, 120),
         Dl = pmin(im1, Nai, 15),
         imDv = i - Dv,
         ipDv = i + Dv,
         imDa = i - Da,
         ipDa = i + Da,
         imDl = i - Dl,
         ipDl = i + Dl) %>% # finding i, i-1 and Nalpha-i, D, i-D and i+D for location, velocity and acceleration
  ungroup()
umav <- uma %>%
  group_by(Vehicle.ID, Frame.ID) %>%
  do(data.frame(kv = .$imDv:.$ipDv)) %>%
  left_join(x = ., y = uma) %>%
  mutate(imk = i - kv, aimk = (-1) * abs(imk), delta = 10, kernel = exp(aimk/delta)) %>%
  ungroup() %>%
  group_by(Vehicle.ID) %>%
  mutate(p = Vehicle.velocity2[match(kv, i)], kernelp = p * kernel) %>%
  ungroup() %>%
  group_by(Vehicle.ID, Frame.ID) %>%
  summarise(Z = sum(kernel), prod = sum(kernelp)) %>%
  mutate(svel = prod/Z) %>%
  ungroup()
umaa <- uma %>%
  group_by(Vehicle.ID, Frame.ID) %>%
  do(data.frame(ka = .$imDa:.$ipDa)) %>%
  left_join(x = ., y = uma) %>%
  mutate(imk = i - ka, aimk = (-1) * abs(imk), delta = 10, kernel = exp(aimk/delta)) %>%
  ungroup() %>%
  group_by(Vehicle.ID) %>%
  mutate(p = Vehicle.acceleration2[match(ka, i)], kernelp = p * kernel) %>%
  ungroup() %>%
  group_by(Vehicle.ID, Frame.ID) %>%
  summarise(Z = sum(kernel), prod = sum(kernelp)) %>%
  mutate(sacc = prod/Z) %>%
  ungroup()
umal <- uma %>%
  group_by(Vehicle.ID, Frame.ID) %>%
  do(data.frame(kl = .$imDl:.$ipDl)) %>%
  left_join(x = ., y = uma) %>%
  mutate(imk = i - kl, aimk = (-1) * abs(imk), delta = 10, kernel = exp(aimk/delta)) %>%
  ungroup() %>%
  group_by(Vehicle.ID) %>%
  mutate(p = Local.Y[match(kl, i)], kernelp = p * kernel) %>%
  ungroup() %>%
  group_by(Vehicle.ID, Frame.ID) %>%
  summarise(Z = sum(kernel), prod = sum(kernelp)) %>%
  mutate(ycoord = prod/Z) %>%
  ungroup()
umal <- select(umal, c("Vehicle.ID", "Frame.ID", "ycoord"))
umav <- select(umav, c("Vehicle.ID", "Frame.ID", "svel"))
umaa <- select(umaa, c("Vehicle.ID", "Frame.ID", "sacc"))
umair <- left_join(uma, umal) %>% left_join(x = ., y = umav) %>% left_join(x = ., y = umaa)
A good first step would be to take a for loop (which I'll hide with sapply) and perform the exponential smoothing for each index:
josilber1 <- function(uma) {
  delta <- 10
  sapply(1:nrow(uma), function(i) {
    D <- min(i - 1, nrow(uma) - i, 30)
    rng <- (i - D):(i + D)
    rng <- rng[rng >= 1 & rng <= nrow(uma)]
    expabs <- exp(-abs(i - rng)/delta)
    return(sum(uma$Vehicle.velocity[rng] * expabs) / sum(expabs))
  })
}
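As to the convolution part of the question: away from the ends of the series D is always 30, and the equation is then a plain weighted moving average, which is exactly what stats::filter computes in one vectorized call; only the first and last 30 points, where the window shrinks, need special handling. A sketch:
delta <- 10; D <- 30
w <- exp(-abs(-D:D) / delta) # symmetric exponential kernel
sm <- stats::filter(uma$Vehicle.velocity, w / sum(w), sides = 2)
# sm agrees with josilber1(uma) wherever the full window fits; the first
# and last D entries come back NA and need the shrinking-window rule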
A more involved approach would be to only compute the incremental change in the exponential smoothing function for each index (as opposed to re-summing at each index). The exponential smoothing function has a lower part (data before the current index; I include the current index in low in the code below) and an upper part (data after the current index; high in the code below). As we loop through the vector, all the data in the lower part gets weighted less (we divide by mult) and all the data in the upper part gets weighted more (we multiply by mult). The leftmost element is dropped from low, the leftmost element in high moves to low, and one element is added to the right side of high.
The actual code is a bit messier to deal with the beginning and ending of the vector and to deal with numerical stability issues (errors in high are multiplied by mult each iteration):
josilber2 <- function(uma) {
  delta <- 10
  x <- uma$Vehicle.velocity
  ret <- c(x[1], rep(NA, nrow(uma) - 1))
  low <- x[1]
  high <- 0
  norm <- 1
  old.D <- 0
  mult <- exp(1/delta)
  for (i in 2:nrow(uma)) {
    D <- min(i - 1, nrow(uma) - i, 30)
    if (D == old.D + 1) {
      low <- low / mult + x[i]
      high <- high * mult - x[i] + x[i+D-1]/mult^(D-1) + x[i+D]/mult^D
      norm <- norm + 2 / mult^D
    } else if (D == old.D) {
      low <- low / mult - x[i-(D+1)]/mult^(D+1) + x[i]
      high <- high * mult - x[i] + x[i+D]/mult^D
    } else {
      low <- low / mult - x[i-(D+2)]/mult^(D+2) - x[i-(D+1)]/mult^(D+1) + x[i]
      high <- high * mult - x[i]
      norm <- norm - 2 / mult^(D+1)
    }
    # For numerical stability, recompute high every so often
    if (i %% 50 == 0) {
      rng <- (i+1):(i+D)
      expabs <- exp(-abs(i - rng)/delta)
      high <- sum(x[rng] * expabs)
    }
    ret[i] <- (low + high)/norm
    old.D <- D
  }
  return(ret)
}
R code like josilber2 can often be sped up considerably using the Rcpp package:
library(Rcpp)
josilber3 <- cppFunction("
NumericVector josilber3(NumericVector x) {
  double delta = 10.0;
  NumericVector ret(x.size(), 0.0);
  ret[0] = x[0];
  double low = x[0];
  double high = 0.0;
  double norm = 1.0;
  int oldD = 0;
  double mult = exp(1/delta);
  for (int i = 1; i < x.size(); ++i) {
    int D = i;
    if (x.size() - i - 1 < D) D = x.size() - i - 1;
    if (30 < D) D = 30;
    if (D == oldD + 1) {
      low = low / mult + x[i];
      high = high * mult - x[i] + x[i+D-1]/pow(mult, D-1) + x[i+D]/pow(mult, D);
      norm = norm + 2 / pow(mult, D);
    } else if (D == oldD) {
      low = low / mult - x[i-(D+1)]/pow(mult, D+1) + x[i];
      high = high * mult - x[i] + x[i+D]/pow(mult, D);
    } else {
      low = low / mult - x[i-(D+2)]/pow(mult, D+2) - x[i-(D+1)]/pow(mult, D+1) + x[i];
      high = high * mult - x[i];
      norm = norm - 2 / pow(mult, D+1);
    }
    // for numerical stability, recompute high every so often
    if (i % 50 == 0) {
      high = 0.0;
      for (int j = i+1; j <= i+D; ++j) {
        high += x[j] * exp((i-j)/delta);
      }
    }
    ret[i] = (low + high)/norm;
    oldD = D;
  }
  return ret;
}")
We can now benchmark the improvements from these three new approaches:
all.equal(umair.fxn(uma), josilber1(uma))
# [1] TRUE
all.equal(umair.fxn(uma), josilber2(uma))
# [1] TRUE
all.equal(umair.fxn(uma), josilber3(uma$Vehicle.velocity))
# [1] TRUE
library(microbenchmark)
microbenchmark(umair.fxn(uma), josilber1(uma), josilber2(uma), josilber3(uma$Vehicle.velocity))
# Unit: microseconds
#                            expr        min          lq         mean     median         uq        max neval
#                  umair.fxn(uma) 370006.728 382327.4115 398554.71080 393495.052 404186.153 572801.355   100
#                  josilber1(uma)  12879.268  13640.1310  15981.82099  14265.610  14805.419  28959.230   100
#                  josilber2(uma)   4324.724   4502.8125   5753.47088   4918.835   5244.309  17328.797   100
# josilber3(uma$Vehicle.velocity)     41.582     54.5235     57.76919     57.435     60.099     90.998   100
We got a lot of improvement (25x) with the simpler josilber1 and a 70x total speedup with josilber2 (the advantage would be more with a larger delta value). With josilber3 we achieve a 6800x speedup, getting the runtime all the way down to 54 microseconds to process a single vehicle!
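Finally, to run any of these per vehicle (the full data set groups rows by Vehicle.ID), a simple split/apply sketch:
# smooth each vehicle's velocity trace separately, then reassemble in order
smoothed <- unlist(
  lapply(split(uma, uma$Vehicle.ID), josilber1),
  use.names = FALSE
)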
