Backtesting open position counter for trading in R

Getting started on backtesting some trading data, in particular a very basic mean-reversion idea, and I can't get my head around how to approach this concept.
How would I go about having a running 'posy' increase by 1 once DifFromFv (the deviation from fair value) reaches -10, increase by 1 again each time DifFromFv extends by a further multiple of -3 (-13, -16, -19, etc.), and decrease by 1 every time DifFromFv reverts +5 from the level at which 'posy' last changed? Simply put, I am buying once DifFromFv reaches -10 points and averaging in every further 3 points, whilst taking each individual average out for 5 points of profit.
E.g:
DifFromFv   posy
   0.00      0
 -10.00      1   # initial clip (target profit -5.00)
 -11.50      1
 -13.00      2   # avg #1 (target profit -8.00)
 -16.60      3   # avg #2 (target profit -11.00)
 -12.30      3
 -11.00      2   # taking profit on avg #2
 -14.10      2
  -8.00      1   # taking profit on avg #1
  -7.00      1
  -5.00      0   # taking profit on initial clip
It should be noted that the take profit for every clip is consistently set at the -5, -8, -11, ... increments regardless of where the averages are actually filled, as seen by the target profit for avg #2 sitting at -11.00 rather than -11.60. This both reduces the margin of error between real-life fills and data fills and, I'm fairly sure, makes the concept easier to reason about.
Thanks in advance!

Next time please provide some code, even though your explanation is quite clear.
However, you didn't mention how you want to deal with large jumps in DifFromFv (for instance, if it goes straight from -3 to -18), so I leave that up to you.
Here is the code with comments:
library(plyr) # for round_any()

firstPosy = FALSE
DiffFair <- c(0, -10, -11.5, -13, -16.6, -12.3, -11, -14.1, -8, -7, -5) # Your data here
posy <- c(0)
buyPrices <- c(0)    # Stores the prices at which you buy your asset
targetProfit <- c(0) # Stores the target profit alongside the vector above
steps <- c(0)        # Stores your multiples of -3 after -10 (-10, -13, -16...)
PNL = 0

for (i in 2:length(DiffFair)) {
  # Case where posy increments for the first time by one
  if (DiffFair[i] <= -10 & DiffFair[i] > -13 & firstPosy == FALSE) {
    firstPosy = TRUE
    posy <- c(posy, 1)
    steps <- c(steps, round_any(DiffFair[i], 10, f = ceiling))
    lastChangePosy = DiffFair[i]
    buyPrices <- c(buyPrices, DiffFair[i])
    targetProfit <- c(targetProfit, -5)
  }
  # Case where the first fill jumps straight past -13 (several clips at once)
  else if (DiffFair[i] <= -13 & firstPosy == FALSE) {
    firstPosy = TRUE
    lastChangePosy = DiffFair[i]
    steps <- c(steps, round_any(DiffFair[i] + 10, 3, f = ceiling) - 10)
    buyPrices <- c(buyPrices, DiffFair[i])
    targetProfit <- c(targetProfit, -5)
    posy <- c(posy, tail(posy, n = 1) + (-round_any(DiffFair[i] + 10, 3, f = ceiling) / 3) + 1)
  }
  # Posy increase
  else if (tail(steps, n = 1) > round_any(DiffFair[i] + 10, 3, f = ceiling) - 10 & DiffFair[i] <= -10) {
    posy <- c(posy, posy[i - 1] + 1)
    steps <- c(steps, round_any(DiffFair[i] + 10, 3, f = ceiling) - 10)
    lastChangePosy = DiffFair[i]
    buyPrices <- c(buyPrices, DiffFair[i])
    targetProfit <- c(targetProfit, tail(targetProfit, n = 1) - 3)
  }
  # Posy decrease
  else if (DiffFair[i] >= tail(targetProfit, n = 1) & tail(posy, n = 1) > 0) {
    if (tail(targetProfit, n = 1) == -5) {
      posy <- c(posy, 0)
    } else {
      posy <- c(posy, posy[i - 1] - 1)
    }
    lastChangePosy = DiffFair[i]
    # Compute PNL and delete the target profit and buy price from the vectors
    PNL = PNL + (DiffFair[i] - tail(buyPrices, n = 1))
    buyPrices <- buyPrices[-length(buyPrices)]
    targetProfit <- targetProfit[-length(targetProfit)]
    steps <- steps[-length(steps)]
    if (DiffFair[i] > -10) {
      firstPosy = FALSE
    }
  }
  # Posy doesn't change
  else {
    posy <- c(posy, posy[i - 1])
  }
}
print(PNL)
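A quick sanity check against the example table in the question:
# posy should reproduce the asker's expected column
data.frame(DiffFair, posy)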

Related

Create a function in R for simulating a coin flip game

In a coin flip game, you flip a fair coin until the difference between the number of heads and the number of tails is 3. You are paid $8 at the end, but you have to pay $1 for each flip of the coin. Use N = 100,000 simulations and find the expected amount you could win.
I was able to use the following code for 1 game, but it takes far too long for N = 100,000. How can I turn this into a function that will give me the expected payoff over 100,000 games?
# For 1 game
# number of games to play
n_games <- 1
bias <- 0.5
game_payoff <- c()
for (i in seq_len(n_games)) {
  cost <- 0
  flip_record <- c()
  payoff <- c()
  repeat {
    cost <- cost + 1
    flip <- rbinom(1, 1, prob = bias)
    flip_record <- c(flip_record, flip)
    # number of 0s/tails
    n_tails <- length(flip_record) - sum(flip_record)
    # number of 1s/heads
    n_heads <- sum(flip_record)
    if (abs(n_tails - n_heads) == 3) {
      # record game payoff
      game_payoff <- c(game_payoff, 8 - cost)
      # print game payoff
      print(paste0("single game payoff: ", 8 - cost))
      break
    }
  }
}
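For reference, the immediate bottleneck in the loop above is the growing flip_record vector; tracking only the running head/tail difference makes even a plain loop workable for N = 100,000. A minimal sketch of that fix (mine, not from either answer below):
n_games <- 1e5
game_payoff <- numeric(n_games)
for (i in seq_len(n_games)) {
  cost <- 0L
  difference <- 0L
  while (abs(difference) < 3L) {    # stop when |heads - tails| == 3
    cost <- cost + 1L
    difference <- difference + sample(c(-1L, 1L), 1)
  }
  game_payoff[i] <- 8L - cost
}
mean(game_payoff)                   # close to the theoretical -1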
Here is a solution. I have added a bankruptcy argument in order to avoid very long games. In the example run below, only 10,000 games (n_games) are played.
game <- function(prob = 0.5, pay = 8L, bankruptcy = 1e6) {
  cost <- 0L
  difference <- 0L
  flip_record <- integer(bankruptcy)
  # loop until the difference between the number of heads and
  # number of tails is 3 or the coin was flipped
  # more times than what the player can afford to flip
  while (abs(difference) < 3 && cost < bankruptcy) {
    flip <- sample(c(-1, 1), 1, prob = c(prob, 1 - prob))
    cost <- cost + 1L
    difference <- difference + flip
    flip_record[cost] <- flip
  }
  win <- (abs(difference) == 3L) * pay
  flip_record <- flip_record[seq.int(cost)]
  list(win = win, cost = cost, flips = flip_record)
}
set.seed(2022)
n_games <- 1e4
prob <- 0.5
# the games are independent, this could use parallelization
system.time(
  games_played <- replicate(n_games, game(), simplify = FALSE)
)
#> user system elapsed
#> 22.62 16.36 39.00
# extract the relevant values from the returned list
Wins <- sapply(games_played, `[[`, 'win')
Costs <- sapply(games_played, `[[`, 'cost')
# example differences sequences (game 1 and game 10)
Flips <- sapply(games_played, `[[`, 'flips')
cumsum(Flips[[1]])
#> [1] -1 -2 -1 -2 -1 -2 -1 0 1 0 1 2 3
cumsum(Flips[[10]])
#> [1] 1 0 -1 0 1 2 3
# it pays more times than not
table(Wins - Costs > 0)
#>
#> FALSE TRUE
#> 4258 5742
# the proportion of times the player turns a profit
mean(Wins - Costs > 0)
#> [1] 0.5742
# but when the players lose, they lose more
# than the profit they get
mean(Wins - Costs) * n_games
#> [1] -9898
# the same number, different way of computing it
# and with formatted output
formatC(sum(Wins - Costs), big.mark = ",")
#> [1] "-9,898"
# the distribution of wins/losses is left skewed
# with a long tail => it doesn't pay to play
profit <- table(Wins - Costs)
barplot(profit)
Created on 2022-10-06 with reprex v2.0.2
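As noted in the comment above, the games are independent, so the replicate() call is embarrassingly parallel. A minimal sketch with the base parallel package (mclapply forks, so this runs on Unix-alikes only; the L'Ecuyer RNG gives reproducible parallel streams):
library(parallel)
RNGkind("L'Ecuyer-CMRG")
set.seed(2022)
games_played <- mclapply(seq_len(n_games), function(i) game(),
                         mc.cores = max(1L, detectCores() - 1L))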
It will be more performant to vectorize over the games:
N <- 1e5
d <- rep(1L, N) # current flip difference for all N games; always 1 after the first flip
n <- 1L # the number of flips so far
flip <- c(-1L, 1L)
cost <- 0L # initialize the total cost of playing N games
while (length(d)) {
  d <- d + sample(flip, length(d), TRUE) # each flip adds or subtracts 1 from the difference
  idx <- which(abs(d) < 3L) # which games have not ended?
  cost <- cost + (n <- n + 1L) * (length(d) - length(idx)) # add the cost from the ended games
  d <- d[idx] # remove the ended games from the simulation
}
# expected payoff
8 - cost/N
#> [1] -0.99824
This matches the theory: for a fair coin, the expected number of flips until the head/tail difference reaches ±3 is 3^2 = 9, so the expected payoff is 8 - 9 = -1.

Sequence with stopping values

This is like a regular tribonacci sequence; however, I want the sequence to stop whenever a term reaches the min or max value.
This is what I have started
sequence <- function(a1 = 0, a2 = 0, a3 = 1, min = 0, max = 30) {
  an <- c()
  a[1] <- a1
  a[2] <- a2
  a[3] <- a3
  while (a <= max || a >= min) {
    a[i] <- a[i-1] + a[i-2] + a[i-3]
    an <- c(an, a[i])
  }
  an
}
Are there any suggestions regarding my code setup, and how to fix the indexing?
Here is a way of coding this using abs():
tribonacci <- function(start = c(0, 0, 1), maxval = 15) {
  a <- start
  i <- 3
  while (abs(a[i]) <= maxval) {
    i <- i + 1
    a[i] <- a[i-3] + a[i-2] + a[i-1]
  }
  a[1:(i-1)]
}
tribonacci(maxval=24)
# [1] 0 0 1 1 2 4 7 13 24
tribonacci(start=c(0, 1, -3), maxval=30)
# [1] 0 1 -3 -2 -4 -9 -15 -28
Since we use the absolute value, we just need one argument to specify the maximum/minimum.

Generate vector of 'random' proportions of a given length within specific boundaries

I want to generate a vector of a given length, e.g., n = 5. Each value in the vector should be a proportion (i.e., a value between 0 and 1) so that across n elements they sum up to 1.
In addition, I have two vectors: one (mymins) defines the allowed lower boundary of each proportion and the other (mymaxs) defines the allowed upper boundary of each proportion.
In my example below the desired proportion for the first element is allowed to fall anywhere between 0.3 and 0.9. And for the last element, the desired proportion is allowed to fall between 0.05 and 0.7.
mymins <- c(0.3, 0.1, 0, 0.2, 0.05)
mymaxs <- c(0.9, 1, 1, 1, 0.7)
Let's assume that mymins are always 'legitimate' (i.e., their sum is never larger than 1).
How could I find a set of 5 proportions such that they all sum to 1 but lie within the boundaries?
Here is what I tried:
n <- 5
mydif <- mymaxs - mymins     # possible range for each proportion
myorder <- rank(mydif)       # order those differences from smallest to largest
mytarget <- sum(mydif)       # sum up the 5 ranges
x <- sort(runif(n))[myorder] # generate 5 random values and sort them in the order of mydif
x2 <- mymins + x / sum(x) * mytarget # rescale random values to sum up to mytarget and add them to mymins
x3 <- x2 / sum(x2)           # rescale x2 to sum up to 1
As you can see, I did not get very far: after rescaling, some values fall outside their allowed boundaries.
I should probably also mention that I need this operation to be fast - because I am using it in an optimization loop.
I also tried to find a solution using optim; however, the problem is that it always finds the same solution, and I need to generate a DIFFERENT solution every time I compute the proportions:
myfun <- function(x) {
  x <- round(x, 4)
  abovemins <- x - mymins
  n_belowmins <- sum(abovemins < 0)
  if (n_belowmins > 0) return(100000)
  belowmax <- x - mymaxs
  n_abovemax <- sum(belowmax > 0)
  if (n_abovemax > 0) return(100000)
  mydist <- abs(sum(x) - 1)
  return(mydist)
}
myopt <- optim(par = mymins + 0.01, fn = myfun)
myopt$par
sum(round(myopt$par, 4))
Thank you very much for your suggestions!
Perhaps it's better to think of this in a different way. Your samples actually need to sum to 0.35 (which is 1 - sum(mymins)) and then be added on to the minimum values:
constrained_sample <- function(mymins, mymaxs)
{
  sizes <- mymaxs - mymins
  samp <- runif(5) * sizes
  samp / sum(samp) * (1 - sum(mymins)) + mymins
}
It works like this:
constrained_sample(mymins, mymaxs)
#> [1] 0.31728333 0.17839397 0.07196067 0.29146744 0.14089459
We can test this works by running the following loop, which will print a message to the console if any of the criteria aren't met:
for(i in 1:1000)
{
  test <- constrained_sample(mymins, mymaxs)
  if(!all(test > mymins) | !all(test < mymaxs) | abs(sum(test) - 1) > 1e-6) cat("failure")
}
This throws no errors, since the criteria are always met. However, as @GregorThomas points out, the bounds aren't realistic in this case. We can see the range of solutions constrained by your conditions using a boxplot:
library(ggplot2)

samp <- constrained_sample(mymins, mymaxs)
for(i in 1:999) samp <- rbind(samp, constrained_sample(mymins, mymaxs))
df <- data.frame(val = c(samp[,1], samp[,2], samp[,3], samp[,4], samp[,5]),
                 index = factor(rep(1:5, each = 1000)))
ggplot(df, aes(x = index, y = val)) + geom_boxplot()
Because you need 5 random numbers to sum to 1, you really only have 4 independent numbers and one dependent number.
mymins <- c(0.3, 0.1, 0, 0.2, 0.05)
mymaxs <- c(0.9, 1, 1, 1, 0.7)
set.seed(42)
iter <- 1000
while(iter > 0 &&
      (
        (1 - sum(x <- runif(4, mymins[-5], mymaxs[-5]))) < mymins[5] ||
        (1 - sum(x)) > mymaxs[5]
      )
) iter <- iter - 1
if (iter < 1) {
  # failed
  stop("unable to find something within 1000 iterations")
} else {
  x <- c(x, 1 - sum(x))
}
sum(x)
# [1] 1
all(mymins <= x & x <= mymaxs)
# [1] TRUE
x
# [1] 0.37732330 0.21618036 0.07225311 0.24250359 0.09173965
The reason I use iter there is to make sure you don't take an "infinite" amount of time to find something. If your mymins and mymaxs combination makes this mathematically infeasible (as your first example was), then you don't need to spin forever. If it is mathematically improbable to find a solution in a reasonable amount of time, you need to weigh how long you want to keep trying.
One reason this takes so long is that we are iteratively pulling entropy. If you expect this to go for a long time, then it is generally better to pre-calculate as much as you think you'll need (overall) and run things as a matrix.
set.seed(42)
n <- 10000
m <- matrix(runif(prod(n, length(mymins)-1)), nrow = n)
m <- t(t(m) * (mymaxs[-5] - mymins[-5]) + mymins[-5])
remainders <- (1 - rowSums(m))
ind <- mymins[5] <= remainders & remainders <= mymaxs[5]
table(ind)
# ind
# FALSE TRUE
# 9981 19
m <- cbind(m[ind,,drop=FALSE], remainders[ind])
nrow(m)
# [1] 19
rowSums(m)
# [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
head(m)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0.3405821 0.1306152 0.05931363 0.2199362 0.24955282
# [2,] 0.3601376 0.1367465 0.20235704 0.2477507 0.05300821
# [3,] 0.4469526 0.1279795 0.02265618 0.2881733 0.11423845
# [4,] 0.5450527 0.1029903 0.07503371 0.2052423 0.07168103
# [5,] 0.3161519 0.1469783 0.15290720 0.3268470 0.05711557
# [6,] 0.4782448 0.1185735 0.01664063 0.2178225 0.16871845
all(
  mymins[1] <= m[,1] & m[,1] <= mymaxs[1],
  mymins[2] <= m[,2] & m[,2] <= mymaxs[2],
  mymins[3] <= m[,3] & m[,3] <= mymaxs[3],
  mymins[4] <= m[,4] & m[,4] <= mymaxs[4],
  mymins[5] <= m[,5] & m[,5] <= mymaxs[5]
)
# [1] TRUE
This time it took 10000 attempts to make 19 valid combinations. It might take more or fewer attempts based on randomness, so ymmv with regards to how much you need to pre-generate.
If your example bounds are realistic, we can refine them quite a bit, narrowing the range of possibilities. For the current version of the question with:
mymins = c(0.3, 0.1, 0, 0.2, 0.05)
mymaxs = c(0.9, 1, 1, 1, 0.7)
What's the max for x[1]? Well, if x[2:5] take on minimum values, they will add up to 0.1 + 0 + 0.2 + 0.05 = 0.35, so based on the other mins only we know that max value for x[1] is 1 - 0.35 = 0.65. The 0.9 in mymaxs is way too high.
We can calculate the actual max values by taking the element-wise minimum of the mymaxs vector and the max values implied by the other minimums:
new_max = pmin(mymaxs, 1 - (sum(mymins) - mymins))
new_max
# [1] 0.65 0.45 0.35 0.55 0.40
We can similarly revise the min bounds, though in this case even the revised max bounds new_max are high enough that the revision has no impact on the minimums.
new_min = pmax(mymins, 1 - (sum(new_max) - new_max))
new_min
# [1] 0.30 0.10 0.00 0.20 0.05
With these adjustments, we should be able to see easily if any solutions are possible (all(new_min < new_max)). And then generating random numbers as in r2evans's answer should go much quicker using the new bounds.
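Putting the pieces together, a sketch of the feasibility check plus one rejection draw using the refined bounds (reusing new_min and new_max from above):
stopifnot(all(new_min < new_max)) # any solution possible at all?
repeat {
  x <- runif(4, new_min[-5], new_max[-5])
  last <- 1 - sum(x)              # the dependent fifth number
  if (new_min[5] <= last && last <= new_max[5]) break
}
x <- c(x, last)
sum(x) # 1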

How to select top and bottom points in line graph [duplicate]

I'm looking for a computationally efficient way to find local maxima/minima for a large list of numbers in R.
Hopefully without for loops...
For example, if I have a data series like 1 2 3 2 1 1 2 1, I want the function to return 3 and 7, which are the positions of the local maxima.
diff(diff(x)) (or diff(x, differences = 2): thanks to @ZheyuanLi) essentially computes the discrete analogue of the second derivative, so it should be negative at local maxima. The +1 below takes care of the fact that the result of diff is shorter than the input vector.
edit: added @Tommy's correction for cases where delta-x is not 1...
tt <- c(1,2,3,2,1, 1, 2, 1)
which(diff(sign(diff(tt)))==-2)+1
My suggestion above ( http://statweb.stanford.edu/~tibs/PPC/Rdist/ ) is intended for the case where the data are noisier.
@Ben's solution is pretty sweet. It doesn't handle the following cases though:
# all these return numeric(0):
x <- c(1,2,9,9,2,1,1,5,5,1) # duplicated points at maxima
which(diff(sign(diff(x)))==-2)+1
x <- c(2,2,9,9,2,1,1,5,5,1) # duplicated points at start
which(diff(sign(diff(x)))==-2)+1
x <- c(3,2,9,9,2,1,1,5,5,1) # start is maxima
which(diff(sign(diff(x)))==-2)+1
Here's a more robust (and slower, uglier) version:
localMaxima <- function(x) {
  # Use -Inf instead if x is numeric (non-integer)
  y <- diff(c(-.Machine$integer.max, x)) > 0L
  y <- cumsum(rle(y)$lengths)
  y <- y[seq.int(1L, length(y), 2L)]
  if (x[[1]] == x[[2]]) {
    y <- y[-1]
  }
  y
}
x <- c(1,2,9,9,2,1,1,5,5,1)
localMaxima(x) # 3, 8
x <- c(2,2,9,9,2,1,1,5,5,1)
localMaxima(x) # 3, 8
x <- c(3,2,9,9,2,1,1,5,5,1)
localMaxima(x) # 1, 3, 8
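As the comment inside localMaxima says, for non-integer input you can swap the integer sentinel for -Inf. A minimal sketch of that variant (the name localMaximaNum is mine, not from the original answer):
localMaximaNum <- function(x) {
  y <- diff(c(-Inf, x)) > 0L
  y <- cumsum(rle(y)$lengths)
  y <- y[seq.int(1L, length(y), 2L)]
  if (x[[1]] == x[[2]]) y <- y[-1]
  y
}
localMaximaNum(c(0.5, 2.1, 2.1, 0.3, 1.7)) # 2, 5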
Use the zoo library function rollapply:
x <- c(1, 2, 3, 2, 1, 1, 2, 1)
library(zoo)
xz <- as.zoo(x)
rollapply(xz, 3, function(x) which.min(x)==2)
# 2 3 4 5 6 7
#FALSE FALSE FALSE TRUE FALSE FALSE
rollapply(xz, 3, function(x) which.max(x)==2)
# 2 3 4 5 6 7
#FALSE TRUE FALSE FALSE FALSE TRUE
Then pull the index using the 'coredata' for those values where 'which.max' is a "center value" signaling a local maximum. You could obviously do the same for local minima using which.min instead of which.max.
rxz <- rollapply(xz, 3, function(x) which.max(x)==2)
index(rxz)[coredata(rxz)]
#[1] 3 7
I am assuming you do not want the starting or ending values, but if you do, you could pad the ends of your vectors before processing, rather like telomeres do on chromosomes.
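A minimal sketch of that padding idea, using -Inf so the endpoints can win the which.max comparison (the indices are then shifted back by the one padded element):
xz_pad <- as.zoo(c(-Inf, x, -Inf))
rxz_pad <- rollapply(xz_pad, 3, function(x) which.max(x) == 2)
index(rxz_pad)[coredata(rxz_pad)] - 1
#[1] 3 7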
(I'm noting the ppc package ("Peak Probability Contrasts", for doing mass spectrometry analyses) simply because I was unaware of its availability until reading @BenBolker's comment above, and I think adding these few words will increase the chances that someone with a mass-spec interest will see this on a search.)
I took a stab at this today. I know you said hopefully without for loops, but I stuck with the apply family. It is somewhat compact and fast, and it allows threshold specification so you can go greater than 1.
The function:
inflect <- function(x, threshold = 1) {
  up   <- sapply(1:threshold, function(n) c(x[-(seq(n))], rep(NA, n)))
  down <- sapply(-1:-threshold, function(n) c(rep(NA, abs(n)), x[-seq(length(x), length(x) - abs(n) + 1)]))
  a <- cbind(x, up, down)
  list(minima = which(apply(a, 1, min) == a[,1]), maxima = which(apply(a, 1, max) == a[,1]))
}
To visualize it and play with thresholds, you can run the following code:
# Pick a desired threshold to plot up to
n <- 2
# Generate data
randomwalk <- 100 + cumsum(rnorm(50, 0.2, 1)) # climbs upwards most of the time
bottoms <- lapply(1:n, function(x) inflect(randomwalk, threshold = x)$minima)
tops <- lapply(1:n, function(x) inflect(randomwalk, threshold = x)$maxima)
# Color functions
cf.1 <- grDevices::colorRampPalette(c("pink", "red"))
cf.2 <- grDevices::colorRampPalette(c("cyan", "blue"))
plot(randomwalk, type = 'l', main = "Minima & Maxima\nVariable Thresholds")
for(i in 1:n) {
  points(bottoms[[i]], randomwalk[bottoms[[i]]], pch = 16, col = cf.1(n)[i], cex = i/1.5)
}
for(i in 1:n) {
  points(tops[[i]], randomwalk[tops[[i]]], pch = 16, col = cf.2(n)[i], cex = i/1.5)
}
legend("topleft", legend = c("Minima", 1:n, "Maxima", 1:n),
       pch = rep(c(NA, rep(16, n)), 2), col = c(1, cf.1(n), 1, cf.2(n)),
       pt.cex = c(rep(c(1, c(1:n) / 1.5), 2)), cex = .75, ncol = 2)
There are some good solutions provided, but it depends on what you need.
Just diff(tt) returns the differences.
You want to detect when you go from increasing values to decreasing values. One way to do this is provided by @Ben:
diff(sign(diff(tt)))==-2
The problem here is that this will only detect changes that go immediately from strictly increasing to strictly decreasing.
A slight change will allow for repeated values at the peak (returning TRUE for the last occurrence of the peak value):
diff(diff(x)>=0)<0
Then, you simply need to properly pad the front and back if you want to detect maxima at the beginning or end of the sequence.
Here's everything wrapped in a function (including finding of valleys):
which.peaks <- function(x, partial = TRUE, decreasing = FALSE) {
  if (decreasing) {
    if (partial) {
      which(diff(c(FALSE, diff(x) > 0, TRUE)) > 0)
    } else {
      which(diff(diff(x) > 0) > 0) + 1
    }
  } else {
    if (partial) {
      which(diff(c(TRUE, diff(x) >= 0, FALSE)) < 0)
    } else {
      which(diff(diff(x) >= 0) < 0) + 1
    }
  }
}
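For example, with the vector from the question:
x <- c(1, 2, 3, 2, 1, 1, 2, 1)
which.peaks(x)                    # 3 7 (peaks)
which.peaks(x, decreasing = TRUE) # 1 6 8 (valleys; partial = TRUE includes the edges)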
Late to the party, but this might be of interest for others. You can nowadays use the (internal) function find_peaks from the ggpmisc package. You can parametrize it using the threshold, span and strict arguments. Since the ggpmisc package is designed for use with ggplot2, you can directly plot the minima and maxima using the stat_peaks and stat_valleys functions:
set.seed(1)
x <- 1:10
y <- runif(10)
# Maxima
x[ggpmisc:::find_peaks(y)]
[1] 4 7
y[ggpmisc:::find_peaks(y)]
[1] 0.9082078 0.9446753
# Minima
x[ggpmisc:::find_peaks(-y)]
[1] 5
y[ggpmisc:::find_peaks(-y)]
[1] 0.2016819
# Plot (attach ggplot2 and ggpmisc for stat_peaks/stat_valleys)
library(ggplot2)
library(ggpmisc)
ggplot(data = data.frame(x, y), aes(x = x, y = y)) +
  geom_line() + stat_peaks(col = "red") + stat_valleys(col = "green")
The answer by @42- is great, but I had a use case where I didn't want to use zoo. It's easy to implement this with dplyr using lag and lead:
library(dplyr)
test = data_frame(x = sample(1:10, 20, replace = TRUE))
mutate(test, local.minima = if_else(lag(x) > x & lead(x) > x, TRUE, FALSE))
Like the rollapply solution, you can control the window size and edge cases through the lag/lead arguments n and default, respectively.
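For example, default = Inf lets the endpoints qualify as minima instead of turning into NA (a sketch, reusing test from above):
mutate(test, local.minima = lag(x, default = Inf) > x & lead(x, default = Inf) > x)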
In the case I'm working on, duplicates are frequent. So I have implemented a function that allows finding either the first or the last extremum (min or max):
library(dplyr)
library(tibble)

locate_xtrem <- function(x, last = FALSE) {
  # use rle to deal with duplicates
  x_rle <- rle(x)
  # force the first value to be identified as an extremum
  first_value <- x_rle$values[1] - x_rle$values[2]
  # differentiate the series, keep only the sign, and use 'rle' to
  # locate increases or decreases spanning multiple successive values.
  # The result is a series of (only) -1 and 1.
  #
  # ! NOTE: with this method, the last value will be considered an extremum
  diff_sign_rle <- c(first_value, diff(x_rle$values)) %>% sign() %>% rle()
  # this vector will be used to get the initial positions
  diff_idx <- cumsum(diff_sign_rle$lengths)
  # find min and max
  diff_min <- diff_idx[diff_sign_rle$values < 0]
  diff_max <- diff_idx[diff_sign_rle$values > 0]
  # get the min and max indexes in the original series
  x_idx <- cumsum(x_rle$lengths)
  if (last) {
    min <- x_idx[diff_min]
    max <- x_idx[diff_max]
  } else {
    min <- x_idx[diff_min] - x_rle$lengths[diff_min] + 1
    max <- x_idx[diff_max] - x_rle$lengths[diff_max] + 1
  }
  # just get the number of occurrences
  min_nb <- x_rle$lengths[diff_min]
  max_nb <- x_rle$lengths[diff_max]
  # format the result as a tibble
  bind_rows(
    tibble(Idx = min, Values = x[min], NB = min_nb, Status = "min"),
    tibble(Idx = max, Values = x[max], NB = max_nb, Status = "max")) %>%
    arrange(.data$Idx) %>%
    mutate(Last = last) %>%
    mutate_at(vars(.data$Idx, .data$NB), as.integer)
}
The answer to the original question is:
> x <- c(1, 2, 3, 2, 1, 1, 2, 1)
> locate_xtrem(x)
# A tibble: 5 x 5
Idx Values NB Status Last
<int> <dbl> <int> <chr> <lgl>
1 1 1 1 min FALSE
2 3 3 1 max FALSE
3 5 1 2 min FALSE
4 7 2 1 max FALSE
5 8 1 1 min FALSE
The result indicates that the second minimum equals 1 and that this value is repeated twice starting at index 5. A different result can therefore be obtained by telling the function to find the last occurrence of each local extremum:
> locate_xtrem(x, last = TRUE)
# A tibble: 5 x 5
Idx Values NB Status Last
<int> <dbl> <int> <chr> <lgl>
1 1 1 1 min TRUE
2 3 3 1 max TRUE
3 6 1 2 min TRUE
4 7 2 1 max TRUE
5 8 1 1 min TRUE
Depending on the objective, it is then possible to switch between the first and the last value of a local extremum. The second result with last = TRUE could also be obtained from an operation between the "Idx" and "NB" columns...
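For instance (a sketch, reusing the first-occurrence table from above):
res <- locate_xtrem(x)
res %>% mutate(Idx = Idx + NB - 1L, Last = TRUE) # shift each Idx to the end of its plateau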
Finally, to deal with noise in the data, a function could be implemented to remove fluctuations below a given threshold. The code is not shown since it goes beyond the initial question. I have wrapped it in a package (mainly to automate the testing process), and below is an example result:
x_series %>% xtrem::locate_xtrem()
x_series %>% xtrem::locate_xtrem() %>% remove_noise()
Here's the solution for minima:
# @Ben's solution
x <- c(1, 2, 3, 2, 1, 2, 1)
which(diff(sign(diff(x))) == +2) + 1 # 5
Please bear in mind the cases in @Tommy's post!
# @Tommy's solution:
localMinima <- function(x) {
  # Use Inf instead if x is numeric (non-integer)
  y <- diff(c(.Machine$integer.max, x)) > 0L
  y <- cumsum(rle(y)$lengths)
  y <- y[seq.int(1L, length(y), 2L)]
  if (x[[1]] == x[[2]]) {
    y <- y[-1]
  }
  y
}
x <- c(1,2,9,9,2,1,1,5,5,1)
localMinima(x) # 1, 7, 10
x <- c(2,2,9,9,2,1,1,5,5,1)
localMinima(x) # 7, 10
x <- c(3,2,9,9,2,1,1,5,5,1)
localMinima(x) # 2, 7, 10
Note: neither localMaxima nor localMinima can handle duplicated maxima/minima at the start!
I had some trouble getting the locations to work in previous solutions and came up with a way to grab the minima and maxima directly. The code below will do this and will plot it, marking the minima in red and the maxima in green. Unlike the which.max() function, this will pull all indices of the minima/maxima out of a data frame. The zero value is added in the first diff() call to account for the result of diff() being one element shorter than its input. Inserting it into the innermost diff() call saves having to add an offset outside of the logical expression. It doesn't matter much, but I feel it's a cleaner way to do it.
# create example data called stockData
stockData = data.frame(x = 1:30, y=rnorm(30,7))
# get the location of the minima/maxima. note the added zero offsets
# the location to get the correct indices
min_indexes = which(diff( sign(diff( c(0,stockData$y)))) == 2)
max_indexes = which(diff( sign(diff( c(0,stockData$y)))) == -2)
# get the actual values where the minima/maxima are located
min_locs = stockData[min_indexes,]
max_locs = stockData[max_indexes,]
# plot the data and mark minima with red and maxima with green
plot(stockData$y, type="l")
points( min_locs, col="red", pch=19, cex=1 )
points( max_locs, col="green", pch=19, cex=1 )
This function by Timothée Poisot is handy for noisy series:
May 3, 2009. An Algorithm To Find Local Extrema In A Vector. Filed under: Algorithm. Tags: Extrema, Time series. By Timothée Poisot.
I spent some time looking for an algorithm to find local extrema in a vector (time series). The solution I used is to "walk" through the vector by steps larger than 1, in order to retain only one value even when the values are very noisy (see the picture at the end of the post).
It goes like this:
findpeaks <- function(vec, bw = 1, x.coo = c(1:length(vec)))
{
  pos.x.max <- NULL
  pos.y.max <- NULL
  pos.x.min <- NULL
  pos.y.min <- NULL
  for(i in 1:(length(vec)-1)) {
    if ((i + 1 + bw) > length(vec)) {
      sup.stop <- length(vec)
    } else {
      sup.stop <- i + 1 + bw
    }
    if ((i - bw) < 1) {
      inf.stop <- 1
    } else {
      inf.stop <- i - bw
    }
    subset.sup <- vec[(i+1):sup.stop]
    subset.inf <- vec[inf.stop:(i-1)]
    is.max   <- sum(subset.inf > vec[i]) == 0
    is.nomin <- sum(subset.sup > vec[i]) == 0
    no.max   <- sum(subset.inf > vec[i]) == length(subset.inf)
    no.nomin <- sum(subset.sup > vec[i]) == length(subset.sup)
    if (is.max & is.nomin) {
      pos.x.max <- c(pos.x.max, x.coo[i])
      pos.y.max <- c(pos.y.max, vec[i])
    }
    if (no.max & no.nomin) {
      pos.x.min <- c(pos.x.min, x.coo[i])
      pos.y.min <- c(pos.y.min, vec[i])
    }
  }
  return(list(pos.x.max, pos.y.max, pos.x.min, pos.y.min))
}
Link to original blog post
In the pracma package, use the findpeaks function:
library(pracma)
tt <- c(1, 2, 3, 2, 1, 1, 2, 1)
tt_peaks <- findpeaks(tt, zero = "0", peakpat = NULL,
                      minpeakheight = -Inf, minpeakdistance = 1,
                      threshold = 0, npeaks = 0, sortstr = FALSE)
tt_peaks
[,1] [,2] [,3] [,4]
[1,] 3 3 1 5
[2,] 2 7 6 8
That returns a matrix with 4 columns.
The first column shows the peaks' values.
The 2nd column holds the indices of the peaks.
The 3rd and 4th columns mark where each peak starts and ends (with potential overlap).
See https://www.rdocumentation.org/packages/pracma/versions/1.9.9/topics/findpeaks for details.
One caveat: I used it on a series of non-integers, and every peak came out one index too late; I do not know why. So I had to manually subtract 1 from my index vector (no big deal).
Finding local maxima and minima for a not-so-easy sequence, e.g. 1 0 1 1 2 0 1 1 0 1 1 1 0 1, I would put the maxima at (1), 5, 7.5, 11 and (14), and the minima at 2, 6, 9, 13.
# Positions:    1  2  3  4  5  6  7  8  9 10 11 12 13 14
x         <- c(1, 0, 1, 1, 2, 0, 1, 1, 0, 1, 1, 1, 0, 1) # Frequency
# peaks (p) at (1), 5, 7.5, 11, (14); valleys (v) at 2, 6, 9, 13
peakPosition <- function(x, inclBorders = TRUE) {
  if (inclBorders) {
    y <- c(min(x), x, min(x))
  } else {
    y <- c(x[1], x)
  }
  y <- data.frame(x = sign(diff(y)), i = 1:(length(y)-1))
  y <- y[y$x != 0, ]
  idx <- diff(y$x) < 0
  (y$i[c(idx, F)] + y$i[c(F, idx)] - 1) / 2
}
# Find Peaks
peakPosition(x)
# 1.0 5.0 7.5 11.0 14.0
# Find Valleys
peakPosition(-x)
# 2 6 9 13
peakPosition(c(1, 2, 3, 2, 1, 1, 2, 1)) # 3 7
We see many nice functions and ideas with different features here. One issue with almost all of the examples is efficiency. Many rely on relatively expensive constructs such as diff() or for loops, which become slow when large data sets are involved. Let me introduce an efficient function I use every day, with minimal features, but very fast:
Local Maxima Function amax()
The purpose is to detect all local maxima in a real-valued vector. If the first element x[1] is the global maximum, it is ignored, because there is no information about the previous element. If there is a plateau, the first edge is detected.
# @param x numeric vector
# @return the indices of the local maxima. If x[1] = max, it is ignored.
amax <- function(x)
{
  a1 <- c(0, x, 0)
  a2 <- c(x, 0, 0)
  a3 <- c(0, 0, x)
  e <- which((a1 >= a2 & a1 > a3)[2:(length(x))])
  if (!is.na(e[1] == 1))
    if (e[1] == 1)
      e <- e[-1]
  if (length(e) == 0) e <- NaN
  return(e)
}
a <- c(1,2,3,2,1,5,5,4)
amax(a) # 3, 6
I posted this elsewhere, but I think this is an interesting way to go about it. I'm not sure how computationally efficient it is, but it's a very concise way of solving the problem: each step of diff(vals) is encoded by the first character of its formatted value (a space for a non-negative step, '-' for a fall), and a regular expression then locates the rise-to-fall transitions, with the two grepl calls handling a maximum at the very start or end.
vals <- rbinom(1000, 20, 0.5)
text <- paste0(substr(format(diff(vals), scientific = TRUE), 1, 1), collapse = "")
sort(na.omit(c(gregexpr('[ ]-', text)[[1]] + 1,
               ifelse(grepl('^-', text), 1, NA),
               ifelse(grepl('[^-]$', text), length(vals), NA))))
An enhancement (fast and simple) of the formula proposed by @Ben, covering the cases raised by @Tommy: the following recursive formula handles any of them:
dx <- c(0, sign(diff(x)))
# the number of zeros in dx, minus the first one, which is added intentionally
numberofzeros <- length(dx) - sum(abs(dx)) - 1
# run the recursive formula to clear the middle zeros,
# iterating once per zero
for (i in 1:numberofzeros) {
  dx <- sign(2*dx + c(0, rev(sign(diff(rev(dx))))))
}
Now, the formula provided by @Ben Bolker can be used with a little change:
plot(x)
points(which(diff(dx) == 2), x[which(diff(dx) == 2)], col = 'blue')  # Local MIN
points(which(diff(dx) == -2), x[which(diff(dx) == -2)], col = 'red') # Local MAX
I liked @mikeck's solution because I wouldn't have to convert my data frames back and forth to a zoo object. But I also wanted to use a window wider than 1. That solution only looks at the xth value away from the value of interest, not the values within x distance. Here is what I came up with. You would need to add an extra lag/lead line for every value away from the value of interest that you want to look at.
library(dplyr)

x <- data.frame(AIC = c(98, 97, 96, 97, 98, 99, 98, 98, 97, 96, 95, 94, 93,
                        92, 93, 94, 95, 96, 95, 94, 93, 92, 91, 90, 89, 88))
x <- x %>%
  mutate(local.minima = if_else(lag(AIC) > AIC & lead(AIC) > AIC &
                                  lag(AIC, 2) > AIC & lead(AIC, 2) > AIC &
                                  lag(AIC, 3) > AIC & lead(AIC, 3) > AIC, TRUE, FALSE),
         local.minima = if_else(is.na(local.minima), TRUE, local.minima))

Missing last sequence in seq() in R

I have this example data:
by <- 200
to <- seq(from = by, to = 35280, by = by)
The problem is that to ends at 35200 and ignores the last 80, which I need included as the last value.
Is there any straightforward way to achieve this?
I have tried the along.with and length.out parameters, but I cannot get there.
You can place an if statement for the last element of the vector, such as in the following function:
seqlast <- function(from, to, by) {
  vec <- do.call(what = seq, args = list(from, to, by))
  if (tail(vec, 1) != to) {
    return(c(vec, to))
  } else {
    return(vec)
  }
}
Then
by <- 200
to <- seqlast(from=by,to=35280,by=by)
will return
> head(to)
[1] 200 400 600 800 1000 1200
> tail(to)
[1] 34400 34600 34800 35000 35200 35280
In seq(), "The second form generates from, from+by, ..., up to the sequence value less than or equal to to." And also since 35280 is not in the requested sequence, it is not returned.
But you can use a calculation in the arguments it you wan to include the next value. Since you know the to value, assign it a name and use it.
by <- 200
out <- 35280
x <- seq(from = by, to = (out + by - out %% by), by = by)
length(x)
# [1] 177
x[length(x)]
# [1] 35400
If you want to include the to value, even if it is not in the requested sequence, you can write a little function to add it back on (here fetching to from the ... arguments, which assumes it is passed by name):
seqil <- function(..., include.last = TRUE) {
  args <- list(...)
  x <- do.call(seq.default, args)
  if (include.last) c(x, args$to) else x
}
by <- 200
x <- seqil(from = by, to = 35280, by = by)
tail(x)
# [1] 34400 34600 34800 35000 35200 35280
First of all, seq() is behaving as it should in your example. You want something that seq() by itself will simply not deliver.
One solution (there are certainly many) is to check whether "there was anything left" at the end of your sequence and, if so, add another element to it. (Or modify the last element of your sequence; it is not exactly clear what you are trying to achieve.) Like so:
step <- 200
end <- 35280
to <- seq(from = step, to = end, by = step)
# the modulus of your end point by step
m <- end %% step
# if not zero, you have something to add to your sequence
if (m != 0) {
  # add the end point
  to <- c(to, end)
}
tail(to, 2)
# 35200 35280
Whilst not solving the exact issue raised, my preferred solution is to extend the sequence by one additional value, so that the to value is covered by the sequence rather than just appended at the end.
This builds on the answers by both @djas and @Etienne Kintzler.
seq0 <- function(from = 1, to = 1, by = 1, incLast = TRUE) {
  out <- do.call(what = seq, args = list(from, to, by))
  if (incLast & to %% by != 0) {
    out <- c(out, tail(out, 1) + by)
  }
  return(out)
}
Example outputs:
> seq0(from = 0, to = 20, by = 6, incLast = FALSE)
[1] 0 6 12 18
> seq0(from = 0, to = 20, by = 6, incLast = TRUE)
[1] 0 6 12 18 24
> seq0(from = 0, to = -20, by = -6, incLast = FALSE)
[1] 0 -6 -12 -18
> seq0(from = 0, to = -20, by = -6, incLast = TRUE)
[1] 0 -6 -12 -18 -24
This is a modification of Rich Scriven's answer that does not add an extra number when it is not necessary (as with 20):
by <- 2
out <- 21 # the formula rounds this case up to 22
out <- 20 # with this case, no extra element is added
x <- seq(from = 0, to = out + by * ifelse(out %% by > 0, 1, 0) - out %% by, by = by)
x
You can use c() to include the last number.
by<-200
c(seq(from=by,to=35280,by=by), 35280)
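One caveat with this approach: if to already falls on the sequence (say 35200), c() would append a duplicate; wrapping the result in unique() guards against that.
by <- 200
unique(c(seq(from = by, to = 35280, by = by), 35280))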
