create sequence of numbers which are cyclic - r

I have a vector of months
m_vec <- c(3, 7, 11)
These months represent the start month of a season. All the months in each season are shown below:
season1 <- c(3,4,5,6)
season2 <- c(7,8,9,10)
season3 <- c(11,12,1,2)
I want to create a small function that takes a vector of start months and
generate the vector of months in each season. Some more examples are show below:
m_vec <- c(9,12,4,8)
season1 <- c(9,10,11)
season2 <- c(12,1,2,3)
season3 <- c(4,5,6,7,8)
m_vec <- c(12, 5, 9)
season1 <- c(12, 1, 2,3,4)
season2 <- c(5,6,7,8)
season3 <- c(9,10,11)
My for loop is not complete and I can't seem to even know where to get started with the logic
n_season <- length(m_vec)
temp_list <- list()
for(m in seq_along(m_vec)){
month_start <- m_vec[m]
month_start_next <- m_vec[m + 1]
month_start:month_start_next
}

First we can create some helper functions
cycle <- function(n) { function(x) (x-1) %% n + 1 }
split_at <- function(b) { function(x) split(x, cumsum(x %in% b)) }
The cycle() helper will return a function that will keep values in the range from 1 to the n you pass in. It does that using the modulus % operator. The split_at helper will return a function that takes a vector and splits it up when the values you pass in are found. It does that by using cumsum() to count when each of the break points are found.
Then we can take your input, create a vector of 12 months from your first starting month, wrap in in a cycler to keep it from 1-12, and then we can use split it up using your season breakpoints. Here's what that would look like:
month_cycle <- cycle(12)
season_splitter <- split_at(m_vec)
m_vec <- c(12, 5, 9)
seq(m_vec[1], length.out=12) |>
month_cycle() |>
season_splitter()
# $`1`
# [1] 12 1 2 3 4
# $`2`
# [1] 5 6 7 8
# $`3`
# [1] 9 10 11

m_vec <- c(12, 5, 9)
Map(function(x, y) head((((x:(x + ((y - x) %% 12))) - 1) %% 12) + 1, -1),
m_vec,
c(m_vec[-1], m_vec[1]))
#[[1]]
#[1] 12 1 2 3 4
#[[2]]
#[1] 5 6 7 8
#[[3]]
#[1] 9 10 11

One option is to convert to Date class, get the sequence and extract the months
library(lubridate)
fn1 <- function(mvec) {
new <- pmax(mvec-1, 1)
out <- Map(function(i, j) {
date1 <- mdy(i, truncated = 2)
date2 <- mdy(j, truncated = 2)
if(date1 > date2) {
date2 <- date2 + years(1)}
month(seq(date1, date2, by = "month"))
}, mvec, c(new[-1], new[1]))
if(length(out[[length(out)]]) < 2) {
out[[length(out)-1]] <- c(out[[length(out)-1]], out[[length(out)]])
out[[length(out)]] <- NULL
}
names(out) <- paste0("season", seq_along(out))
return(out)
}
-testing
> fn1(m_vec)
$season1
[1] 3 4 5 6
$season2
[1] 7 8 9 10
$season3
[1] 11 12 1 2
> fn1(c(9, 12, 4, 8))
$season1
[1] 9 10 11
$season2
[1] 12 1 2 3
$season3
[1] 4 5 6 7 8
> fn1(c(1, 5, 11))
$season1
[1] 1 2 3 4
$season2
[1] 5 6 7 8 9 10
$season3
[1] 11 12 1

Related

Perform set operations on duplicate items, in R (base R preferably) [duplicate]

I have 3 vectors
x <- c(1,3,5,7,3,8)
y <- c(3,5,7)
z <- c(3,3,8)
I want to find the elements of x that are not in y and not in z. Is there a function f that would give me the following output:
> f(x,y)
1 3 8
> f(x,z)
1 5 7
In other words, I want to find the "set difference" between 2 vectors, either of which may have repeated values. The functions %in%, match and setdiff do not work in this case for obvious reasons.
There should be some better ways to do this but here is one option
get_diff_vectors <- function(x, y) {
count_x <- table(x)
count_y <- table(y)
same_counts <- match(names(count_y), names(count_x))
count_x[same_counts] <- count_x[same_counts] - count_y
as.numeric(rep(names(count_x), count_x))
}
get_diff_vectors(x, y)
#[1] 1 3 8
get_diff_vectors(x, z)
#[1] 1 5 7
get_diff_vectors(x, c(5, 7))
#[1] 1 3 3 8
We count the frequency of x and y using table, match the numbers which occur in both and subtract the counts y from x. Finally recreate the remaining vector using rep.
Still not able to find a better way but here is dplyr way using the somewhat similar logic.
library(dplyr)
get_diff_vectors_dplyr <- function(x, y) {
df1 <- data.frame(x) %>% count(x)
df2 <- data.frame(y) %>% count(y)
final <- left_join(df1, df2, by = c("x" = "y")) %>%
mutate_at(c("n.x", "n.y"), funs(replace(., is.na(.), 0))) %>%
mutate(n = n.x - n.y)
rep(final$x, final$n)
}
get_diff_vectors_dplyr(x, y)
#[1] 1 3 8
get_diff_vectors_dplyr(x, z)
#[1] 1 5 7
get_diff_vectors_dplyr(x, c(5, 7))
#[1] 1 3 3 8
The vecsets package mentioned by OP has function vsetdiff which does this very easily
vecsets::vsetdiff(x, y)
#[1] 1 3 8
vecsets::vsetdiff(x, z)
#[1] 1 5 7
vecsets::vsetdiff(x, c(5, 7))
#[1] 1 3 3 8
Here's an attempt using make.unique to account for duplicates:
dupdiff <- function(x,y) x[-match(
make.unique(as.character(y)),
make.unique(as.character(x)),
nomatch=0
)]
Testing:
dupdiff(x,y)
#[1] 1 3 8
dupdiff(x,z)
#[1] 1 5 7
dupdiff(x, c(5, 7))
#[1] 1 3 3 8
dupdiff(x, c(5, 7, 9))
#[1] 1 3 3 8
match with a little for-loop does work:
> f(x, y)
[1] 1 3 8
> f(x, z)
[1] 1 5 7
Code
f <- function(s, r) {
for(i in 1:length(s)){
j <- match(s[i], r)
if(!is.na(j)) {
s[i] <- NA
r[j] <- NA
}
}
print(s[complete.cases(s)])
}
There is the new Hadley-verse package waldo
Makes real nice and concise overviews of the differences of objects, not only vectors
library(waldo)
compare(x, y)
#> `old`: 1 3 5 7 3 8
#> `new`: 3 5 7
compare(x, z)
#> `old`: 1 3 5 7 3 8
#> `new`: 3 3 8

Extract multiple ranges from a numeric vector

First, I simplify my question. I want to extract certain ranges from a numeric vector. For example, extracting 3 ranges from 1:20 at the same time :
1 < x < 5
8 < x < 12
17 < x < 20
Therefore, the expected output is 2, 3, 4, 9, 10, 11, 18, 19.
I try to use the function findInterval() and control arguments rightmost.closed and left.open to do that, but any arguments sets cannot achieve the goal.
x <- 1:20
v <- c(1, 5, 8, 12, 17, 20)
x[findInterval(x, v) %% 2 == 1]
# [1] 1 2 3 4 8 9 10 11 17 18 19
x[findInterval(x, v, rightmost.closed = T) %% 2 == 1]
# [1] 1 2 3 4 8 9 10 11 17 18 19 20
x[findInterval(x, v, left.open = T) %% 2 == 1]
# [1] 2 3 4 5 9 10 11 12 18 19 20
By the way, the conditions can also be a matrix like that :
[,1] [,2]
[1,] 1 5
[2,] 8 12
[3,] 17 20
I don't want to use for loop if it's not necessary.
I am grateful for any helps.
I'd probably do it using purrr::map2 or Map, passing your lower-bounds and upper-bounds as arguments and filtering your dataset with a custom function
library(purrr)
x <- 1:20
lower_bounds <- c(1, 8, 17)
upper_bounds <- c(5, 12, 20)
map2(
lower_bounds, upper_bounds, function(lower, upper) {
x[x > lower & x < upper]
}
)
You may use data.table::inrange and its incbounds argument. Assuming ranges are in a matrix 'm', as shown in your question:
x[data.table::inrange(x, m[ , 1], m[ , 2], incbounds = FALSE)]
# [1] 2 3 4 9 10 11 18 19
m <- matrix(v, ncol = 2, byrow = TRUE)
You were on the right path, and left.open indeed helps, but rightmost.closed actually concerns only the last interval rather than the right "side" of each interval. Hence, we need to use left.open twice. As you yourself figured out, it looks like an optimal way to do that is
x[findInterval(x, v) %% 2 == 1 & findInterval(x, v, left.open = TRUE) %% 2 == 1]
# [1] 2 3 4 9 10 11 18 19
Clearly there are alternatives. E.g.,
fun <- function(x, v)
if(length(v) > 1) v[1] < x & x < v[2] | fun(x, v[-1:-2]) else FALSE
x[fun(x, v)]
# [1] 2 3 4 9 10 11 18 19
I found an easy way just with sapply() :
x <- 1:20
v <- c(1, 5, 8, 12, 17, 20)
(v.df <- as.data.frame(matrix(v, 3, 2, byrow = T)))
# V1 V2
# 1 1 5
# 2 8 12
# 3 17 20
y <- sapply(x, function(x){
ind <- (x > v.df$V1 & x < v.df$V2)
if(any(ind)) x else NA
})
y[!is.na(y)]
# [1] 2 3 4 9 10 11 18 19

Loop with table function in R

I am a very beginner with R. I have a question about the table function. I have a result like this :
table(my_vector)
1 2 3
11 23 7
And I want to extract elements from a matrix :
From 1 to 11 as my_matrix[1:11,]
Form 11+1 to 11+23 as my_matrix[12:34,]
Form 11+23+1 to 11+23+7 as my_matrix[35:41,]
How can I do a loop with this ?
Think this should do it
my_matrix <- matrix(rep(1:41, times=3), 41)
my_vector <- rep(1:3, times=c(11, 23, 7))
my_tab <- table(my_vector)
my_csum1 <- c(0, cumsum(my_tab)) + 1
my_csum2 <- cumsum(my_tab)
my_list <- list()
for (i in 1:length(my_csum2)) {
my_list[[i]] <- my_matrix[my_csum1[i]:my_csum2[i], ]
}
lapply(my_list, range)
# [[1]]
# [1] 1 11
# [[2]]
# [1] 12 34
# [[3]]
# [1] 35 41

storing results of a for function in list or

add <- c( 2,3,4)
for (i in add){
a <- i +3
b <- a + 3
z <- a + b
print(z)
}
# Result
[1] 13
[1] 15
[1] 17
In R, it can print the result, but I want to save the results for further computation in a vector, data frame or list
Thanks in advance
Try something like:
add <- c(2, 3, 4)
z <- rep(0, length(add))
idx = 1
for(i in add) {
a <- i + 3
b <- a + 3
z[idx] <- a + b
idx <- idx + 1
}
print(z)
This is simple algebra, no need in a for loop at all
res <- (add + 3)*2 + 3
res
## [1] 13 15 17
Or if you want a data.frame
data.frame(a = add + 3, b = add + 6, c = (add + 3)*2 + 3)
# a b c
# 1 5 8 13
# 2 6 9 15
# 3 7 10 17
Though in general, when you are trying to something like that, it is better to create a function, for example
myfunc <- function(x) {
a <- x + 3
b <- a + 3
z <- a + b
z
}
myfunc(add)
## [1] 13 15 17
In cases when a loop is actually needed (unlike in your example) and you want to store its results, it is better to use *apply family for such tasks. For example, use lapply if you want a list back
res <- lapply(add, myfunc)
res
# [[1]]
# [1] 13
#
# [[2]]
# [1] 15
#
# [[3]]
# [1] 17
Or use sapply if you want a vector back
res <- sapply(add, myfunc)
res
## [1] 13 15 17
For a data.frame to keep all the info
add <- c( 2,3,4)
results <- data.frame()
for (i in add){
a <- i +3
b <- a + 3
z <- a + b
#print(z)
results <- rbind(results, cbind(a,b,z))
}
results
a b z
1 5 8 13
2 6 9 15
3 7 10 17
If you just want z then use a vector, no need for lists
add <- c( 2,3,4)
results <- vector()
for (i in add){
a <- i +3
b <- a + 3
z <- a + b
#print(z)
results <- c(results, z)
}
results
[1] 13 15 17
It might be instructive to compare these two results with those of #dugar:
> sapply(add, function(x) c(a=x+3, b=a+3, z=a+b) )
[,1] [,2] [,3]
a 5 6 7
b 10 10 10
z 17 17 17
That is the result of lazy evaluation and sometimes trips us up when computing with intermediate values. This next one should give a slightly more expected result:
> sapply(add, function(x) c(a=x+3, b=(x+3)+3, z=(x+3)+((x+3)+3)) )
[,1] [,2] [,3]
a 5 6 7
b 8 9 10
z 13 15 17
Those results are the transpose of #dugar. Using sapply or lapply often saves you the effort off setting up a zeroth case object and then incrementing counters.
> lapply(add, function(x) c(a=x+3, b=(x+3)+3, z=(x+3)+((x+3)+3)) )
[[1]]
a b z
5 8 13
[[2]]
a b z
6 9 15
[[3]]
a b z
7 10 17

How do I get the column name (or number) of a multi series zoo object when I apply by row?

I have a zoo object of 12 sets of monthly returns on stock tickers. I want to get the symbol, which is the name of the series, or at least the column, of each month's best performing stock. I've been trying to do this with applying the max function, by row. How do I get the column name?
#Apply 'max' function across each row. I need to get the col number out of this.
apply(tsPctChgs, 1, max, na.rm = TRUE)
The usual answer would be via which.max() however, do note that this will return only the first of the maximums if there are two or more observations taking the maximum value.
An alternative is which(x == max(x)), which would return all value taking the maximum in the result of a tie.
You can then use the index returned to select the series maximum. Handling NAs is covered below to try to keep the initial discussion simple.
require("zoo")
set.seed(1)
m <- matrix(runif(50), ncol = 5)
colnames(m) <- paste0("Series", seq_len(ncol(m)))
ind <- seq_len(nrow(m))
mz <- zoo(m, order.by = ind)
> apply(mz, 1, which.max)
1 2 3 4 5 6 7 8 9 10
3 5 5 1 4 1 1 2 3 2
> apply(mz, 1, function(x) which(x == max(x)))
1 2 3 4 5 6 7 8 9 10
3 5 5 1 4 1 1 2 3 2
So use that to select the series name
i1 <- apply(mz, 1, function(x) which(x == max(x)))
colnames(mz)[i1]
> i1 <- apply(mz, 1, function(x) which(x == max(x)))
> colnames(mz)[i1]
[1] "Series3" "Series5" "Series5" "Series1" "Series4" "Series1" "Series1"
[8] "Series2" "Series3" "Series2"
Handling tied maximums
To illustrate the different behaviour, copy the maximum from month 1 (series 3) into series 1
mz2 <- mz ## copy
mz2[1,1] <- mz[1,3]
mz2[1,]
> mz2[1,]
1 0.9347052 0.2059746 0.9347052 0.4820801 0.8209463
Now try the two approaches again
> apply(mz2, 1, which.max)
1 2 3 4 5 6 7 8 9 10
1 5 5 1 4 1 1 2 3 2
> apply(mz2, 1, function(x) which(x == max(x)))
$`1`
Series1 Series3
1 3
.... ## truncated output ###
Notice how which.max only returns the maximum in series 1.
To use this approach to select the series name, you need to apply something to the list returned by apply(), e.g.
i2 <- apply(mz2, 1, function(x) which(x == max(x)))
lapply(i2, function (i, zobj) colnames(zobj)[i], zobj = mz2)
$`1`
[1] "Series1" "Series3"
$`2`
[1] "Series5"
$`3`
[1] "Series5"
$`4`
[1] "Series1"
$`5`
[1] "Series4"
$`6`
[1] "Series1"
$`7`
[1] "Series1"
$`8`
[1] "Series2"
$`9`
[1] "Series3"
$`10`
[1] "Series2"
Handling NAs
As you have potential for NAs, I would do the following:
apply(mz, 1, which.max, na.rm = TRUE) ## as you did already
apply(mz, 1, function(x, na.rm = TRUE) {
if(na.rm) {
x <- x[!is.na(x)]
}
which(x == max(x))
})
Since apply converts to matrix, I would use rollapply with width=1:
require("zoo")
set.seed(1)
m <- matrix(runif(50), ncol=5)
mz <- setNames(zoo(m, seq(nrow(m))), paste0("Series",seq(ncol(m))))
rollapply(mz, 1, function(r) colnames(mz)[which.max(r)], by.column=FALSE)

Resources