Interpolate NA values - r

I have two set of samples that are time independent. I would like to merge them and calculate the missing values
for the times where I do not have values of both. Simplified example:
A <- cbind(time=c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
Avalue=c(1, 2, 3, 2, 1, 2, 3, 2, 1, 2))
B <- cbind(time=c(15, 30, 45, 60), Bvalue=c(100, 200, 300, 400))
C <- merge(A,B, all=TRUE)
time Avalue Bvalue
1 10 1 NA
2 15 NA 100
3 20 2 NA
4 30 3 200
5 40 2 NA
6 45 NA 300
7 50 1 NA
8 60 2 400
9 70 3 NA
10 80 2 NA
11 90 1 NA
12 100 2 NA
By assuming linear change between each sample, it is possible to calculate the missing NA values.
Intuitively it is easy to see that the A value at time 15 and 45 should be 1.5. But a proper calculation for B
for instance at time 20 would be
100 + (20 - 15) * (200 - 100) / (30 - 15)
which equals 133.33333.
The first parenthesis being the time between estimate time and the last sample available.
The second parenthesis being the difference between the nearest samples.
The third parenthesis being the time between the nearest samples.
How can I use R to calculate the NA values?

Using the zoo package:
library(zoo)
Cz <- zoo(C)
index(Cz) <- Cz[,1]
Cz_approx <- na.approx(Cz)

The proper way to do this statistically and still get valid confidence intervals is to use Multiple Imputation. See Rubin's classic book, and there's an excellent R package for this (mi).

An ugly and probably inefficient Base R solution:
# Data provided:
A <- cbind(time=c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
Avalue=c(1, 2, 3, 2, 1, 2, 3, 2, 1, 2))
B <- cbind(time=c(15, 30, 45, 60), Bvalue=c(100, 200, 300, 400))
C <- merge(A,B, all=TRUE)
# Scalar valued at the minimum time difference: -> min_time_diff
min_time_diff <- min(diff(C$time))
# Adjust frequency of the series to hold all steps in range: -> df
df <- merge(C,
data.frame(time = seq(min_time_diff,
max(C$time),
by = min_time_diff)),
by = "time",
all = TRUE)
# Linear interpolation function handling ties,
# returns interpolated vector the same length
# a the input vector: -> vector
l_interp_vec <- function(na_vec){
approx(x = na_vec,
method = "linear",
ties = "constant",
n = length(na_vec))$y
}
# Applied to a dataframe, replacing NA values
# in each of the numeric vectors,
# with interpolated values.
# input is dataframe: -> dataframe()
interped_df <- data.frame(lapply(df, function(x){
if(is.numeric(x)){
# Store a scalar of min row where x isn't NA: -> min_non_na
min_non_na <- min(which(!(is.na(x))))
# Store a scalar of max row where x isn't NA: -> max_non_na
max_non_na <- max(which(!(is.na(x))))
# Store scalar of the number of rows needed to impute prior
# to first NA value: -> ru_lower
ru_lower <- ifelse(min_non_na > 1, min_non_na - 1, min_non_na)
# Store scalar of the number of rows needed to impute after
# the last non-NA value: -> ru_lower
ru_upper <- ifelse(max_non_na == length(x),
length(x) - 1,
(length(x) - (max_non_na + 1)))
# Store a vector of the ramp to function: -> l_ramp_up:
ramp_up <- as.numeric(
cumsum(rep(x[min_non_na]/(min_non_na), ru_lower))
)
# Apply the interpolation function on vector "x": -> y
y <- as.numeric(l_interp_vec(as.numeric(x[min_non_na:max_non_na])))
# Create a vector that combines the ramp_up vector
# and y if the first NA is at row 1: -> z
if(length(ramp_up) > 1 & max_non_na != length(x)){
# Create a vector interpolations if there are
# multiple NA values after the last value: -> lower_l_int
lower_l_int <- as.numeric(cumsum(rep(mean(diff(c(ramp_up, y))),
ru_upper+1)) +
as.numeric(x[max_non_na]))
# Store the linear interpolations in a vector: -> z
z <- as.numeric(c(ramp_up, y, lower_l_int))
}else if(length(ramp_up) > 1 & max_non_na == length(x)){
# Store the linear interpolations in a vector: -> z
z <- as.numeric(c(ramp_up, y))
}else if(min_non_na == 1 & max_non_na != length(x)){
# Create a vector interpolations if there are
# multiple NA values after the last value: -> lower_l_int
lower_l_int <- as.numeric(cumsum(rep(mean(diff(c(ramp_up, y))),
ru_upper+1)) +
as.numeric(x[max_non_na]))
# Store the linear interpolations in a vector: -> z
z <- as.numeric(c(y, lower_l_int))
}else{
# Store the linear interpolations in a vector: -> z
z <- as.numeric(y)
}
# Interpolate between points in x, return new x:
return(as.numeric(ifelse(is.na(x), z, x)))
}else{
x
}
}
)
)
# Subset interped df to only contain
# the time values in C, store a data frame: -> int_df_subset
int_df_subset <- interped_df[interped_df$time %in% C$time,]

Related

R: Inter- and extrapolate values in dataframe by matching column of another dataframe

I have two dataframes:
df1 <- data.frame(levels = c(1, 3, 5, 7, 9),
values = c(2.2, 5.3, 7.9, 5.4, 8.7))
df2 <- data.frame(levels = c(1, 4, 8, 12)) # other columns not necessary
I want the df1$values to be interpolated to the df2$levels, based on what the numbers in df1$levels are. So there is some interpolation, but also extrapolation to level 12 in the second dataframe.
Perhaps, do a complete based on the union of levels of both datasets and then use na.approx (from zoo) with rule = 2 (for extrapolation)
library(dplyr)
library(tidyr)
library(zoo)
df1 <- df1 %>%
complete(levels = union(levels, df2$levels)) %>%
mutate(values = na.approx(values, maxgap = Inf, rule = 2))
-output
df1
# A tibble: 8 x 2
# levels values
# <dbl> <dbl>
#1 1 2.2
#2 3 5.3
#3 4 6.6
#4 5 7.9
#5 7 5.4
#6 8 7.05
#7 9 8.7
#8 12 8.7
I'm sure this can be condensed, this is some code I wrote a long time ago that handles having to extrapolate at the head/tail of an ordered vector:
# Function to interpolate / extrapolate: l_estimate => function()
l_estimate <- function(vec){
# Function to perform-linear interpolation and return vector:
# .l_interp_vec => function()
.l_interp_vec <- function(vec){
interped_values <-
approx(x = vec, method = "linear", ties = "constant", n = length(vec))$y
return(ifelse(is.na(vec), interped_values[is.na(vec)], vec))
}
# Store a vector denoting the indices of the vector that are NA:
# na_idx => integer vector
na_idx <- is.na(vec)
# Store a scalar of min row where x isn't NA: min_non_na => integer vector
min_non_na <- min(which(!(na_idx)))
# Store a scalar of max row where x isn't NA: max_non_na => integer vector
max_non_na <- max(which(!(na_idx)))
# Store scalar of the number of rows needed to impute prior
# to first NA value: ru_lower => integer vector
ru_lower <- ifelse(min_non_na > 1, min_non_na - 1, min_non_na)
# Store scalar of the number of rows needed to impute after
# the last non-NA value: ru_upper => integer vector
ru_upper <- ifelse(
max_non_na == length(vec),
length(vec) - 1,
(length(vec) - (max_non_na + 1))
)
# Store a vector of the ramp to function: ramp_up => numeric vector
ramp_up <- as.numeric(
cumsum(rep(vec[min_non_na]/(min_non_na), ru_lower))
)
# Apply the interpolation function on vector: y => numeric vector
y <- as.numeric(.l_interp_vec(as.numeric(vec[min_non_na:max_non_na])))
# Create a vector that combines the ramp_up vector
# and y if the first NA is at row 1:
if(length(ramp_up) >= 1 & max_non_na != length(vec)){
# Create a vector interpolations if there are
# multiple NA values after the last value: lower_l_int => numeric vector
lower_l_int <- as.numeric(
cumsum(rep(mean(diff(c(ramp_up, y))), ru_upper+1)) +
as.numeric(vec[max_non_na])
)
# Store the linear interpolations in a vector: z => numeric vector
z <- as.numeric(c(ramp_up, y, lower_l_int))
}else if(length(ramp_up) > 1 & max_non_na == length(vec)){
# Store the linear interpolations in a vector: z => numeric
z <- as.numeric(c(ramp_up, y))
}else if(min_non_na == 1 & max_non_na != length(vec)){
# Create a vector interpolations if there are
# multiple NA values after the last value: lower_l_int => numeric vector
lower_l_int <- as.numeric(
cumsum(rep(mean(diff(c(ramp_up, y))), ru_upper+1)) +
as.numeric(vec[max_non_na])
)
# Store the linear interpolations in a vector: z => numeric vector
z <- as.numeric(c(y, lower_l_int))
}else{
# Store the linear interpolations in a vector: z => numeric vector
z <- as.numeric(y)
}
# Interpolate between points in x, return new x:
return(as.numeric(ifelse(is.na(vec), z, vec)))
}
# Apply the function on ordered data: data.frame => stdout(console)
transform(full_df[order(full_df$levels),],
values = l_estimate(values)
)

Fast sum of values of a vector above given thresholds

I have a vector of threshold values, thresholds, and another vector, x. I'd like to create a new vector, say vec_sum, of the same length as thresholds, that stores, for each element of thresholds, the sum of values of x larger than this element.
What is the fastest way of doing this?
The naive way I'm doing it is
vec_sum <- rep(NA,length(thresholds))
for(i in seq_along(thresholds))
{
vec_sum[i] <- sum(x[x>thresholds[i]])
}
In case it helps, thresholds is already sorted.
Here is another solution using cumsum:
f1 <- function(v, th){
v2 <- v[order(v)]
v2s <- rev(cumsum(rev(v2)))
return(v2s[findInterval(th, v2) + 1])
}
Here are some tests and comparison with the other answer (as well as the example data) by Ronak:
f2 <- function(x, thresholds){
if (all(x < thresholds[1])) return(rep(0, length(thresholds)))
if (all(x > thresholds[length(thresholds)])) return(rep(sum(x), length(thresholds)))
return(rev(cumsum(rev(tapply(x,
findInterval(x, thresholds, left.open = TRUE), sum)[-1]))))
}
test_th <- c(3, 5, 10)
test_x <- c(2, 3, 1, 19, 4, 6, 5, 15, 7:14, 16:18, 20)
vec_sum <- rep(NA,length(test_th))
for(i in seq_along(test_th)) {
vec_sum[i] <- sum(test_x[test_x>test_th[i]])
}
all(dplyr::near(f1(test_x, test_th), vec_sum))
# [1] TRUE
all(dplyr::near(f2(test_x, test_th), vec_sum))
# [1] TRUE
set.seed(123)
test_x <- rnorm(10000)
test_th <- sort(rnorm(100)) ## f2 requires sorted threshold values
vec_sum <- rep(NA,length(test_th))
for(i in seq_along(test_th)) {
vec_sum[i] <- sum(test_x[test_x>test_th[i]])
}
all(dplyr::near(f1(test_x, test_th), vec_sum))
# [1] TRUE
all(dplyr::near(f2(test_x, test_th), vec_sum))
# [1] FALSE
# Warning message:
# In x - y : longer object length is not a multiple of shorter object length
library(microbenchmark)
microbenchmark(
a = f1(test_x, test_th),
b = f2(test_x, test_th)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# a 587.116 682.864 900.3572 694.713 703.726 10647.206 100
# b 1157.213 1203.063 1260.0663 1223.600 1258.552 2143.069 100
Not sure if this is any faster, but we can use findInterval to cut x by thresholds. We take sum of each group using tapply and take cumsum in reverse.
as.integer(rev(cumsum(rev(tapply(x,
findInterval(x, thresholds, left.open = TRUE), sum)[-1]))))
Tested on
thresholds <- c(3, 5, 10)
x <- c(2, 3, 1, 19, 4, 6, 5, 15, 7:14, 16:18, 20) #1:20 in random order
vec_sum <- rep(NA,length(thresholds))
for(i in seq_along(thresholds)) {
vec_sum[i] <- sum(x[x>thresholds[i]])
}
vec_sum
#[1] 204 195 155
Using the proposed solution
as.integer(rev(cumsum(rev(tapply(x,
findInterval(x, thresholds, left.open = TRUE), sum)[-1]))))
#[1] 204 195 155
Explaining the answer. findInterval returns groups where each value of x belongs
findInterval(x, thresholds, left.open = TRUE)
#[1] 0 0 0 3 1 2 1 3 2 2 2 2 3 3 3 3 3 3 3 3
We use tapply to get sum of each group
tapply(x, findInterval(x, thresholds, left.open = TRUE), sum)
# 0 1 2 3
# 6 9 40 155
0-group should be excluded since they are smaller than all the values of threshold (hence -1). Group 2 should also contain sum from group 1 and group 3 should contain sum of group 1 and 2. So we reverse the sequence and take cumsum
cumsum(rev(tapply(x, findInterval(x, thresholds, left.open = TRUE), sum)[-1]))
# 3 2 1
#155 195 204
To get it in original order and to match it with threshold we reverse it again
rev(cumsum(rev(tapply(x, findInterval(x, thresholds, left.open = TRUE), sum)[-1])))
# 1 2 3
#204 195 155
Edge Cases :
If there are all values below threshold or all values above threshold, we might need to do an extra check and return the following.
if (all(x < thresholds[1])) rep(0, length(thresholds))
if (all(x > thresholds[length(thresholds)])) rep(sum(x), length(thresholds))

How to find the three closest (nearest) values within a vector?

I would like to find out the three closest numbers in a vector.
Something like
v = c(10,23,25,26,38,50)
c = findClosest(v,3)
c
23 25 26
I tried with sort(colSums(as.matrix(dist(x))))[1:3], and it kind of works, but it selects the three numbers with minimum overall distance not the three closest numbers.
There is already an answer for matlab, but I do not know how to translate it to R:
%finds the index with the minimal difference in A
minDiffInd = find(abs(diff(A))==min(abs(diff(A))));
%extract this index, and it's neighbor index from A
val1 = A(minDiffInd);
val2 = A(minDiffInd+1);
How to find two closest (nearest) values within a vector in MATLAB?
My assumption is that the for the n nearest values, the only thing that matters is the difference between the v[i] - v[i - (n-1)]. That is, finding the minimum of diff(x, lag = n - 1L).
findClosest <- function(x, n) {
x <- sort(x)
x[seq.int(which.min(diff(x, lag = n - 1L)), length.out = n)]
}
findClosest(v, 3L)
[1] 23 25 26
Let's define "nearest numbers" by "numbers with minimal sum of L1 distances". You can achieve what you want by a combination of diff and windowed sum.
You could write a much shorter function but I wrote it step by step to make it easier to follow.
v <- c(10,23,25,26,38,50)
#' Find the n nearest numbers in a vector
#'
#' #param v Numeric vector
#' #param n Number of nearest numbers to extract
#'
#' #details "Nearest numbers" defined as the numbers which minimise the
#' within-group sum of L1 distances.
#'
findClosest <- function(v, n) {
# Sort and remove NA
v <- sort(v, na.last = NA)
# Compute L1 distances between closest points. We know each point is next to
# its closest neighbour since we sorted.
delta <- diff(v)
# Compute sum of L1 distances on a rolling window with n - 1 elements
# Why n-1 ? Because we are looking at deltas and 2 deltas ~ 3 elements.
withingroup_distances <- zoo::rollsum(delta, k = n - 1)
# Now it's simply finding the group with minimum within-group sum
# And working out the elements
group_index <- which.min(withingroup_distances)
element_indices <- group_index + 0:(n-1)
v[element_indices]
}
findClosest(v, 2)
# 25 26
findClosest(v, 3)
# 23 25 26
A base R option, idea being we first sort the vector and subtract every ith element with i + n - 1 element in the sorted vector and select the group which has minimum difference.
closest_n_vectors <- function(v, n) {
v1 <- sort(v)
inds <- which.min(sapply(head(seq_along(v1), -(n - 1)), function(x)
v1[x + n -1] - v1[x]))
v1[inds: (inds + n - 1)]
}
closest_n_vectors(v, 3)
#[1] 23 25 26
closest_n_vectors(c(2, 10, 1, 20, 4, 5, 23), 2)
#[1] 1 2
closest_n_vectors(c(19, 23, 45, 67, 89, 65, 1), 2)
#[1] 65 67
closest_n_vectors(c(19, 23, 45, 67, 89, 65, 1), 3)
#[1] 1 19 23
In case of tie this will return the numbers with smallest value since we are using which.min.
BENCHMARKS
Since we have got quite a few answers, it is worth doing a benchmark of all the solutions till now
set.seed(1234)
x <- sample(100000000, 100000)
identical(findClosest_antoine(x, 3), findClosest_Sotos(x, 3),
closest_n_vectors_Ronak(x, 3), findClosest_Cole(x, 3))
#[1] TRUE
microbenchmark::microbenchmark(
antoine = findClosest_antoine(x, 3),
Sotos = findClosest_Sotos(x, 3),
Ronak = closest_n_vectors_Ronak(x, 3),
Cole = findClosest_Cole(x, 3),
times = 10
)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
#antoine 148.751 159.071 163.298 162.581 167.365 181.314 10 b
# Sotos 1086.098 1349.762 1372.232 1398.211 1453.217 1553.945 10 c
# Ronak 54.248 56.870 78.886 83.129 94.748 100.299 10 a
# Cole 4.958 5.042 6.202 6.047 7.386 7.915 10 a
An idea is to use zoo library to do a rolling operation, i.e.
library(zoo)
m1 <- rollapply(v, 3, by = 1, function(i)c(sum(diff(i)), c(i)))
m1[which.min(m1[, 1]),][-1]
#[1] 23 25 26
Or make it into a function,
findClosest <- function(vec, n) {
require(zoo)
vec1 <- sort(vec)
m1 <- rollapply(vec1, n, by = 1, function(i) c(sum(diff(i)), c(i)))
return(m1[which.min(m1[, 1]),][-1])
}
findClosest(v, 3)
#[1] 23 25 26
For use in a dataframe,
data%>%
group_by(var1,var2)%>%
do(data.frame(findClosest(.$val,3)))

subset data.frame for every level of a factor

Given the following vectors to build a dataframe :
set.seed(1)
x <- sample( LETTERS[1:4], 100, replace=TRUE)
y <- runif(100,0,100)
df <- data.frame(x,y)
I would like to have if possible, a clean code with a loop or apply or any other method to subset the data.frame by different conditionals for every level of factor x. For example:
level A y >30 | y <20
level B y >21 | y <12
level C y >42 | y <21
level D y >58 | y <13
A split apply combine approach where we use Map to iterate over the subsets and the conditions in parallel.
do.call(rbind,
Map(function(data, left, right) {
subset(x = data, subset = y > left | y < right)
},
data = split(df, df$x),
left = c(30, 21, 42, 58),
right = c(20, 12, 21, 13)
))
# x y
#A.5 A 63.349326
#A.10 A 59.876097
#A.11 A 97.617069
#A.12 A 73.179251
#A.22 A 49.559358
#A.24 A 17.344233
# ...
We split your data by x, subset each according to your conditions and combine the list to a single dataframe.
What about something like this
df[df$x == 'A' & (df$y > 30 | df$y < 20),]
# x y
# 2 A 71.117606
# 3 A 44.438057
# 6 A 63.244699
# 7 A 54.185802
# 11 A 39.577617
# 13 A 8.681545
# 29 A 94.437431
# ...
# or depending on what you mean by '&'
df[df$x == 'A' & df$y > 30,]
# x y
# 2 A 71.11761
# 3 A 44.43806
# 6 A 63.24470
# 7 A 54.18580
# 11 A 39.57762
# 29 A 94.43743
# 31 A 54.17604
# ...
# and then accordingly for the other cases
using library(data.table) we can do
lower = c(20, 12, 21, 13)
upper = c(30, 21, 42, 58)
setDT(df)[!between(y, lower[x], upper[x]), .SD, keyby=x]
# x y
# 1: A 63.349326
# 2: A 59.876097
# 3: A 97.617069
# 4: A 73.179251
# 5: A 49.559358
# 6: A 17.344233
# 7: A 51.116978
# ...

Find the lower points of the two data columns and compare [duplicate]

I'm looking for a computationally efficient way to find local maxima/minima for a large list of numbers in R.
Hopefully without for loops...
For example, if I have a datafile like 1 2 3 2 1 1 2 1, I want the function to return 3 and 7, which are the positions of the local maxima.
diff(diff(x)) (or diff(x,differences=2): thanks to #ZheyuanLi) essentially computes the discrete analogue of the second derivative, so should be negative at local maxima. The +1 below takes care of the fact that the result of diff is shorter than the input vector.
edit: added #Tommy's correction for cases where delta-x is not 1...
tt <- c(1,2,3,2,1, 1, 2, 1)
which(diff(sign(diff(tt)))==-2)+1
My suggestion above ( http://statweb.stanford.edu/~tibs/PPC/Rdist/ ) is intended for the case where the data are noisier.
#Ben's solution is pretty sweet. It doesn't handle the follwing cases though:
# all these return numeric(0):
x <- c(1,2,9,9,2,1,1,5,5,1) # duplicated points at maxima
which(diff(sign(diff(x)))==-2)+1
x <- c(2,2,9,9,2,1,1,5,5,1) # duplicated points at start
which(diff(sign(diff(x)))==-2)+1
x <- c(3,2,9,9,2,1,1,5,5,1) # start is maxima
which(diff(sign(diff(x)))==-2)+1
Here's a more robust (and slower, uglier) version:
localMaxima <- function(x) {
# Use -Inf instead if x is numeric (non-integer)
y <- diff(c(-.Machine$integer.max, x)) > 0L
rle(y)$lengths
y <- cumsum(rle(y)$lengths)
y <- y[seq.int(1L, length(y), 2L)]
if (x[[1]] == x[[2]]) {
y <- y[-1]
}
y
}
x <- c(1,2,9,9,2,1,1,5,5,1)
localMaxima(x) # 3, 8
x <- c(2,2,9,9,2,1,1,5,5,1)
localMaxima(x) # 3, 8
x <- c(3,2,9,9,2,1,1,5,5,1)
localMaxima(x) # 1, 3, 8
Use the zoo library function rollapply:
x <- c(1, 2, 3, 2, 1, 1, 2, 1)
library(zoo)
xz <- as.zoo(x)
rollapply(xz, 3, function(x) which.min(x)==2)
# 2 3 4 5 6 7
#FALSE FALSE FALSE TRUE FALSE FALSE
rollapply(xz, 3, function(x) which.max(x)==2)
# 2 3 4 5 6 7
#FALSE TRUE FALSE FALSE FALSE TRUE
Then pull the index using the 'coredata' for those values where 'which.max' is a "center value" signaling a local maximum. You could obviously do the same for local minima using which.min instead of which.max.
rxz <- rollapply(xz, 3, function(x) which.max(x)==2)
index(rxz)[coredata(rxz)]
#[1] 3 7
I am assuming you do not want the starting or ending values, but if you do , you could pad the ends of your vectors before processing, rather like telomeres do on chromosomes.
(I'm noting the ppc package ("Peak Probability Contrasts" for doing mass spectrometry analyses, simply because I was unaware of its availability until reading #BenBolker's comment above, and I think adding these few words will increase the chances that someone with a mass-spec interest will see this on a search.)
I took a stab at this today. I know you said hopefully without for loops but I stuck with using the apply function. Somewhat compact and fast and allows threshold specification so you can go greater than 1.
The function:
inflect <- function(x, threshold = 1){
up <- sapply(1:threshold, function(n) c(x[-(seq(n))], rep(NA, n)))
down <- sapply(-1:-threshold, function(n) c(rep(NA,abs(n)), x[-seq(length(x), length(x) - abs(n) + 1)]))
a <- cbind(x,up,down)
list(minima = which(apply(a, 1, min) == a[,1]), maxima = which(apply(a, 1, max) == a[,1]))
}
To a visualize it/play with thresholds you can run the following code:
# Pick a desired threshold # to plot up to
n <- 2
# Generate Data
randomwalk <- 100 + cumsum(rnorm(50, 0.2, 1)) # climbs upwards most of the time
bottoms <- lapply(1:n, function(x) inflect(randomwalk, threshold = x)$minima)
tops <- lapply(1:n, function(x) inflect(randomwalk, threshold = x)$maxima)
# Color functions
cf.1 <- grDevices::colorRampPalette(c("pink","red"))
cf.2 <- grDevices::colorRampPalette(c("cyan","blue"))
plot(randomwalk, type = 'l', main = "Minima & Maxima\nVariable Thresholds")
for(i in 1:n){
points(bottoms[[i]], randomwalk[bottoms[[i]]], pch = 16, col = cf.1(n)[i], cex = i/1.5)
}
for(i in 1:n){
points(tops[[i]], randomwalk[tops[[i]]], pch = 16, col = cf.2(n)[i], cex = i/1.5)
}
legend("topleft", legend = c("Minima",1:n,"Maxima",1:n),
pch = rep(c(NA, rep(16,n)), 2), col = c(1, cf.1(n),1, cf.2(n)),
pt.cex = c(rep(c(1, c(1:n) / 1.5), 2)), cex = .75, ncol = 2)
There are some good solutions provided, but it depends on what you need.
Just diff(tt) returns the differences.
You want to detect when you go from increasing values to decreasing values. One way to do this is provided by #Ben:
diff(sign(diff(tt)))==-2
The problem here is that this will only detect changes that go immediately from strictly increasing to strictly decreasing.
A slight change will allow for repeated values at the peak (returning TRUE for last occurence of the peak value):
diff(diff(x)>=0)<0
Then, you simply need to properly pad the front and back if you want to detect maxima at the beginning or end of
Here's everything wrapped in a function (including finding of valleys):
which.peaks <- function(x,partial=TRUE,decreasing=FALSE){
if (decreasing){
if (partial){
which(diff(c(FALSE,diff(x)>0,TRUE))>0)
}else {
which(diff(diff(x)>0)>0)+1
}
}else {
if (partial){
which(diff(c(TRUE,diff(x)>=0,FALSE))<0)
}else {
which(diff(diff(x)>=0)<0)+1
}
}
}
Late to the party, but this might be of interest for others. You can nowadays use the (internal) function find_peaks from ggpmisc package. You can parametrize it using threshold, span and strict arguments. Since ggpmisc package is aimed for using with ggplot2 you can directly plot minima and maxima using thestat_peaks and stat_valleys functions:
set.seed(1)
x <- 1:10
y <- runif(10)
# Maxima
x[ggpmisc:::find_peaks(y)]
[1] 4 7
y[ggpmisc:::find_peaks(y)]
[1] 0.9082078 0.9446753
# Minima
x[ggpmisc:::find_peaks(-y)]
[1] 5
y[ggpmisc:::find_peaks(-y)]
[1] 0.2016819
# Plot
ggplot(data = data.frame(x, y), aes(x = x, y = y)) + geom_line() + stat_peaks(col = "red") + stat_valleys(col = "green")
Answer by #42- is great, but I had a use case where I didn't want to use zoo. It's easy to implement this with dplyr using lag and lead:
library(dplyr)
test = data_frame(x = sample(1:10, 20, replace = TRUE))
mutate(test, local.minima = if_else(lag(x) > x & lead(x) > x, TRUE, FALSE)
Like the rollapply solution, you can control the window size and edge cases through the lag/lead arguments n and default, respectively.
In the case I'm working on, duplicates are frequent. So I have implemented a function that allows finding first or last extrema (min or max):
locate_xtrem <- function (x, last = FALSE)
{
# use rle to deal with duplicates
x_rle <- rle(x)
# force the first value to be identified as an extrema
first_value <- x_rle$values[1] - x_rle$values[2]
# differentiate the series, keep only the sign, and use 'rle' function to
# locate increase or decrease concerning multiple successive values.
# The result values is a series of (only) -1 and 1.
#
# ! NOTE: with this method, last value will be considered as an extrema
diff_sign_rle <- c(first_value, diff(x_rle$values)) %>% sign() %>% rle()
# this vector will be used to get the initial positions
diff_idx <- cumsum(diff_sign_rle$lengths)
# find min and max
diff_min <- diff_idx[diff_sign_rle$values < 0]
diff_max <- diff_idx[diff_sign_rle$values > 0]
# get the min and max indexes in the original series
x_idx <- cumsum(x_rle$lengths)
if (last) {
min <- x_idx[diff_min]
max <- x_idx[diff_max]
} else {
min <- x_idx[diff_min] - x_rle$lengths[diff_min] + 1
max <- x_idx[diff_max] - x_rle$lengths[diff_max] + 1
}
# just get number of occurences
min_nb <- x_rle$lengths[diff_min]
max_nb <- x_rle$lengths[diff_max]
# format the result as a tibble
bind_rows(
tibble(Idx = min, Values = x[min], NB = min_nb, Status = "min"),
tibble(Idx = max, Values = x[max], NB = max_nb, Status = "max")) %>%
arrange(.data$Idx) %>%
mutate(Last = last) %>%
mutate_at(vars(.data$Idx, .data$NB), as.integer)
}
The answer to the original question is:
> x <- c(1, 2, 3, 2, 1, 1, 2, 1)
> locate_xtrem(x)
# A tibble: 5 x 5
Idx Values NB Status Last
<int> <dbl> <int> <chr> <lgl>
1 1 1 1 min FALSE
2 3 3 1 max FALSE
3 5 1 2 min FALSE
4 7 2 1 max FALSE
5 8 1 1 min FALSE
The result indicates that the second minimum is equal to 1 and that this value is repeated twice starting at index 5. Therefore, a different result could be obtained by indicating this time to the function to find the last occurrences of local extremas:
> locate_xtrem(x, last = TRUE)
# A tibble: 5 x 5
Idx Values NB Status Last
<int> <dbl> <int> <chr> <lgl>
1 1 1 1 min TRUE
2 3 3 1 max TRUE
3 6 1 2 min TRUE
4 7 2 1 max TRUE
5 8 1 1 min TRUE
Depending on the objective, it is then possible to switch between the first and the last value of a local extremas. The second result with last = TRUE could also be obtained from an operation between columns "Idx" and "NB"...
Finally to deal with noise in the data, a function could be implemented to remove fluctuations below a given threshold. Code is not exposed since it goes beyond the initial question. I have wrapped it in a package (mainly to automate the testing process) and I give below a result example:
x_series %>% xtrem::locate_xtrem()
x_series %>% xtrem::locate_xtrem() %>% remove_noise()
Here's the solution for minima:
#Ben's solution
x <- c(1,2,3,2,1,2,1)
which(diff(sign(diff(x)))==+2)+1 # 5
Please regard the cases at Tommy's post!
#Tommy's solution:
localMinima <- function(x) {
# Use -Inf instead if x is numeric (non-integer)
y <- diff(c(.Machine$integer.max, x)) > 0L
rle(y)$lengths
y <- cumsum(rle(y)$lengths)
y <- y[seq.int(1L, length(y), 2L)]
if (x[[1]] == x[[2]]) {
y <- y[-1]
}
y
}
x <- c(1,2,9,9,2,1,1,5,5,1)
localMinima(x) # 1, 7, 10
x <- c(2,2,9,9,2,1,1,5,5,1)
localMinima(x) # 7, 10
x <- c(3,2,9,9,2,1,1,5,5,1)
localMinima(x) # 2, 7, 10
Please regard: Neither localMaxima nor localMinima can handle duplicated maxima/minima at start!
I had some trouble getting the locations to work in previous solutions and came up with a way to grab the minima and maxima directly. The code below will do this and will plot it, marking the minima in green and the maxima in red. Unlike the which.max() function this will pull all indices of the minima/maxima out of a data frame. The zero value is added in the first diff() function to account for the missing decreased length of the result that occurs whenever you use the function. Inserting this into the innermost diff() function call saves from having to add an offset outside of the logical expression. It doesn't matter much, but i feel it's a cleaner way to do it.
# create example data called stockData
stockData = data.frame(x = 1:30, y=rnorm(30,7))
# get the location of the minima/maxima. note the added zero offsets
# the location to get the correct indices
min_indexes = which(diff( sign(diff( c(0,stockData$y)))) == 2)
max_indexes = which(diff( sign(diff( c(0,stockData$y)))) == -2)
# get the actual values where the minima/maxima are located
min_locs = stockData[min_indexes,]
max_locs = stockData[max_indexes,]
# plot the data and mark minima with red and maxima with green
plot(stockData$y, type="l")
points( min_locs, col="red", pch=19, cex=1 )
points( max_locs, col="green", pch=19, cex=1 )
This function by Timothée Poisot is handy for noisy series:
May 3, 2009
An Algorithm To Find Local Extrema In A Vector
Filed under: Algorithm — Tags: Extrema, Time series — Timothée Poisot # 6:46pm
I spend some time looking for an algorithm to find local extrema in
a vector (time series). The solution I used is to “walk” through the
vector by step larger than 1, in order to retain only one value even
when the values are very noisy (see the picture at the end of the
post).
It goes like this :
findpeaks <- function(vec,bw=1,x.coo=c(1:length(vec)))
{
pos.x.max <- NULL
pos.y.max <- NULL
pos.x.min <- NULL
pos.y.min <- NULL for(i in 1:(length(vec)-1)) { if((i+1+bw)>length(vec)){
sup.stop <- length(vec)}else{sup.stop <- i+1+bw
}
if((i-bw)<1){inf.stop <- 1}else{inf.stop <- i-bw}
subset.sup <- vec[(i+1):sup.stop]
subset.inf <- vec[inf.stop:(i-1)]
is.max <- sum(subset.inf > vec[i]) == 0
is.nomin <- sum(subset.sup > vec[i]) == 0
no.max <- sum(subset.inf > vec[i]) == length(subset.inf)
no.nomin <- sum(subset.sup > vec[i]) == length(subset.sup)
if(is.max & is.nomin){
pos.x.max <- c(pos.x.max,x.coo[i])
pos.y.max <- c(pos.y.max,vec[i])
}
if(no.max & no.nomin){
pos.x.min <- c(pos.x.min,x.coo[i])
pos.y.min <- c(pos.y.min,vec[i])
}
}
return(list(pos.x.max,pos.y.max,pos.x.min,pos.y.min))
}
Link to original blog post
In the pracma package, use the
tt <- c(1,2,3,2,1, 1, 2, 1)
tt_peaks <- findpeaks(tt, zero = "0", peakpat = NULL,
minpeakheight = -Inf, minpeakdistance = 1, threshold = 0, npeaks = 0, sortstr = FALSE)
[,1] [,2] [,3] [,4]
[1,] 3 3 1 5
[2,] 2 7 6 8
That returns a matrix with 4 columns.
The first column is showing the local peaks' absolute values.
The 2nd column are the indices
The 3rd and 4th column are the start and end of the peaks (with potential overlap).
See https://www.rdocumentation.org/packages/pracma/versions/1.9.9/topics/findpeaks for details.
One caveat: I used it in a series of non-integers, and the peak was one index too late (for all peaks) and I do not know why. So I had to manually remove "1" from my index vector (no big deal).
Finding local maxima and minima for a not so easy sequence e.g. 1 0 1 1 2 0 1 1 0 1 1 1 0 1 I would give their positions at (1), 5, 7.5, 11 and (14) for maxima and 2, 6, 9, 13 for minima.
#Position 1 1 1 1 1
# 1 2 3 4 5 6 7 8 9 0 1 2 3 4
x <- c(1,0,1,1,2,0,1,1,0,1,1,1,0,1) #Frequency
# p v p v p v p v p p..Peak, v..Valey
peakPosition <- function(x, inclBorders=TRUE) {
if(inclBorders) {y <- c(min(x), x, min(x))
} else {y <- c(x[1], x)}
y <- data.frame(x=sign(diff(y)), i=1:(length(y)-1))
y <- y[y$x!=0,]
idx <- diff(y$x)<0
(y$i[c(idx,F)] + y$i[c(F,idx)] - 1)/2
}
#Find Peaks
peakPosition(x)
#1.0 5.0 7.5 11.0 14.0
#Find Valeys
peakPosition(-x)
#2 6 9 13
peakPosition(c(1,2,3,2,1,1,2,1)) #3 7
We see many nice functions and ideas with different features here. One issue of almost all examples is the efficiency. Many times we see the use of complex functions like diff() or for()-loops, which become slow when large data sets are involved. Let me introduce an efficient function I use every day, with minimal features, but very fast:
Local Maxima Function amax()
The purpose is to detect all local maxima in a real valued vector.
If the first element x[1] is the global maximum, it is ignored,
because there is no information about the previous emlement. If there
is a plateau, the first edge is detected.
#param x numeric vector
#return returns the indicies of local maxima. If x[1] = max, then
it is ignored.
amax <- function(x)
{
a1 <- c(0,x,0)
a2 <- c(x,0,0)
a3 <- c(0,0,x)
e <- which((a1 >= a2 & a1 > a3)[2:(length(x))])
if(!is.na(e[1] == 1))
if(e[1]==1)
e <- e[-1]
if(length(e) == 0) e <- NaN
return (e)
}
a <- c(1,2,3,2,1,5,5,4)
amax(a) # 3, 6
I posted this elsewhere, but I think this is an interesting way to go about it. I'm not sure what its computational efficiency is, but it's a very concise way of solving the problem.
vals=rbinom(1000,20,0.5)
text=paste0(substr(format(diff(vals),scientific=TRUE),1,1),collapse="")
sort(na.omit(c(gregexpr('[ ]-',text)[[1]]+1,ifelse(grepl('^-',text),1,NA),
ifelse(grepl('[^-]$',text),length(vals),NA))))
An enhancement (fast and simple method) to the formula proposed by #BEN and regarding to the cases proposed by #TOMMY:
the following recursive formula handle any cases:
dx=c(0,sign(diff(x)))
numberofzeros= length(dx) - sum(abs(dx)) -1 # to find the number of zeros
# in the dx minus the first one
# which is added intentionally.
#running recursive formula to clear middle zeros
# iterate for the number of zeros
for (i in 1:numberofzeros){
dx = sign(2*dx + c(0,rev(sign(diff(rev(dx))))))
}
Now, the formula provided by #Ben Bolker can be used with a little change:
plot(x)
points(which(diff(dx)==2),x[which(diff(dx)==2)],col = 'blue')#Local MIN.
points(which(diff(dx)==-2),x[which(diff(dx)==-2)],col = 'red')#Local MAX.
I liked #mikeck's solution so that I wouldn't have to convert my dataframes back and forth from a zoo object. But I also wanted to use a window wider than 1. Their solution only looks at the xth value away from the value of interest, not the values within x distance. Here is what I came up with. You would need to add an extra lag/lead line for every value away from the value of interest that you want to look.
x <- data.frame(AIC = c(98, 97, 96, 97, 98, 99, 98, 98, 97, 96, 95, 94, 93, 92, 93, 94, 95, 96, 95, 94, 93, 92, 91, 90, 89, 88))
x <- x %>%
mutate(local.minima = if_else(lag(AIC) > AIC & lead(AIC) > AIC &
lag(AIC, 2) > AIC & lead(AIC, 2) > AIC &
lag(AIC, 3) > AIC & lead(AIC, 3) > AIC, TRUE, FALSE),
local.minima = if_else(is.na(local.minima), TRUE, local.minima))

Resources