R: Creating and Labelling Groups by Increments

I am working with the R programming language.
I have the following data:
set.seed(123)
my_data = data.frame(var1 = rnorm(100,100,100))
min = min(my_data$var1)
max = max(my_data$var1)
Here is what I am trying to do:
Starting from the smallest value of var1, I would like to create a variable that groups values of var1 by some "fixed increment" (e.g. by 10) until the maximum value of var1 is reached.
Then, I would like to create another variable which labels each of these groups by the min/max value of that group.
Here is my attempt to do this:
# create a vector of increments
breaks <- seq(min(my_data$var1), max(my_data$var1), by = 10)
# initialize new variables
my_data$class <- NA
my_data$label <- NA
# get the number of breaks
n <- length(breaks)
# Loop
for (i in 1:(n - 1)) {
  # find which "class" (i.e. break) each value of var1 is located within
  indices <- which(my_data$var1 > breaks[i] & my_data$var1 <= breaks[i + 1])
  # make assignment
  my_data$class[indices] <- i
  # create labels
  my_data$label[indices] <- paste(breaks[i], breaks[i + 1])
}
The code seems to have run, but I am not sure it is correct, because I see some NA's.
Can someone please show me how to do this correctly?
Thanks!

This could be done with a non-equi join:
library(data.table)
my_data1 <- copy(my_data)
setDT(my_data1)[data.table(start = breaks,
                           end = shift(breaks, type = "lead", fill = last(breaks))),
                c("indices", "label") := .(.GRP, paste(start, end)),
                on = .(var1 > start, var1 <= end), by = .EACHI]
Output:
> head(my_data1)
var1 indices label
1: 43.95244 18 39.0831124359188 49.0831124359188
2: 76.98225 21 69.0831124359188 79.0831124359188
3: 255.87083 39 249.083112435919 259.083112435919
4: 107.05084 24 99.0831124359188 109.083112435919
5: 112.92877 25 109.083112435919 119.083112435919
6: 271.50650 41 269.083112435919 279.083112435919
Compare it with the OP's for loop:
> head(my_data)
var1 class label
1 43.95244 18 39.0831124359188 49.0831124359188
2 76.98225 21 69.0831124359188 79.0831124359188
3 255.87083 39 249.083112435919 259.083112435919
4 107.05084 24 99.0831124359188 109.083112435919
5 112.92877 25 109.083112435919 119.083112435919
6 271.50650 41 269.083112435919 279.083112435919
Regarding the NAs in the output: they come from how the breaks returned by seq() line up with the data.
> breaks
[1] -130.9168876 -120.9168876 -110.9168876 -100.9168876 -90.9168876 -80.9168876 -70.9168876 -60.9168876 -50.9168876 -40.9168876 -30.9168876
[12] -20.9168876 -10.9168876 -0.9168876 9.0831124 19.0831124 29.0831124 39.0831124 49.0831124 59.0831124 69.0831124 79.0831124
[23] 89.0831124 99.0831124 109.0831124 119.0831124 129.0831124 139.0831124 149.0831124 159.0831124 169.0831124 179.0831124 189.0831124
[34] 199.0831124 209.0831124 219.0831124 229.0831124 239.0831124 249.0831124 259.0831124 269.0831124 279.0831124 289.0831124 299.0831124
[45] 309.0831124
Note that the last break is 309.083. Also, the first break equals the minimum of var1, and var1 > -130.9168876 returns FALSE for the value exactly equal to it, so the minimum row gets an NA; the condition should be var1 >= -130.9168876 for the first interval. In addition, seq() can stop short of the maximum, which would drop values above the last break. To correct this, we concatenate max at the end and take the unique values (in case the last break already equals the maximum):
breaks <- unique(c(seq(min, max, by = 10), max))
Now, we do the same join with the corrected breaks and >= on the lower bound:
> setDT(my_data1)[data.table(start = breaks, end = shift(breaks,
+ type = "lead", fill = last(breaks))), c("indices", "label") := .(.GRP, paste(start, end)),
+ on = .(var1 >= start, var1 <= end), by = .EACHI]
>
> head(my_data1)
var1 indices label
1: 43.95244 18 39.0831124359188 49.0831124359188
2: 76.98225 21 69.0831124359188 79.0831124359188
3: 255.87083 39 249.083112435919 259.083112435919
4: 107.05084 24 99.0831124359188 109.083112435919
5: 112.92877 25 109.083112435919 119.083112435919
6: 271.50650 41 269.083112435919 279.083112435919
> head(my_data)
var1 class label
1 43.95244 18 39.0831124359188 49.0831124359188
2 76.98225 21 69.0831124359188 79.0831124359188
3 255.87083 39 249.083112435919 259.083112435919
4 107.05084 24 99.0831124359188 109.083112435919
5 112.92877 25 109.083112435919 119.083112435919
6 271.50650 41 269.083112435919 279.083112435919
> my_data1[is.na(indices)]
Empty data.table (0 rows and 3 cols): var1,indices,label
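For comparison, the same grouping can be done without a join or a loop using base R's cut(); a minimal sketch built on the corrected breaks (class2 and label2 are illustrative column names):
labs <- paste(head(breaks, -1), tail(breaks, -1))
my_data$class2 <- cut(my_data$var1, breaks = breaks,
                      labels = FALSE, include.lowest = TRUE)
my_data$label2 <- as.character(cut(my_data$var1, breaks = breaks,
                                   labels = labs, include.lowest = TRUE))
cut() uses (start, end] intervals by default, matching the join condition above, and include.lowest = TRUE keeps the minimum value in the first group.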

Related

Non-equi joins - comparing two data frames in R

I would like to filter a data frame based on the values present in a second data frame.
For example, match the rows from the first data frame whose "BP" value is greater than a "start_pos" value and smaller than the corresponding "end_pos" value in the second data frame.
I need to repeat this procedure for all the values in the second data frame. Currently, I am doing this with a for loop, but I would like to do it in a single command.
Data frame 1
CHR BP
29 836019
29 4417047
29 7589996
29 11052921
29 14009294
29 33174196
Data frame 2
start_pos end_pos gene_id
19774 19899 ENSBTAG00000046619
34627 35558 ENSBTAG00000006858
69695 71121 ENSBTAG00000039257
83323 84281 ENSBTAG00000035349
124849 179713 ENSBTAG00000001753
264298 264843 ENSBTAG00000005540
# note: `interval` is defined elsewhere in the original script;
# the second and third clauses below are duplicates, kept as in the original
for (j in 1:nrow(tmp_markers)) {
  temp_out_markers <- tmp_markers[j, ]
  tmp_search <- tmp_gene[which(
    (tmp_markers[j, "BP"] >= tmp_gene[, "start_pos"] & tmp_markers[j, "BP"] <= tmp_gene[, "end_pos"]) |
      (tmp_markers[j, "BP"] + interval >= tmp_gene[, "start_pos"] & tmp_markers[j, "BP"] + interval <= tmp_gene[, "end_pos"]) |
      (tmp_markers[j, "BP"] + interval >= tmp_gene[, "start_pos"] & tmp_markers[j, "BP"] + interval <= tmp_gene[, "end_pos"]) |
      (tmp_markers[j, "BP"] + interval >= tmp_gene[, "start_pos"] & tmp_markers[j, "BP"] + interval >= tmp_gene[, "end_pos"] & tmp_markers[j, "BP"] <= tmp_gene[, "start_pos"]) |
      (tmp_markers[j, "BP"] - interval <= tmp_gene[, "end_pos"] & tmp_markers[j, "BP"] - interval >= tmp_gene[, "start_pos"]) |
      (tmp_markers[j, "BP"] - interval <= tmp_gene[, "end_pos"] & tmp_markers[j, "BP"] - interval <= tmp_gene[, "start_pos"] & tmp_markers[j, "BP"] >= tmp_gene[, "end_pos"])
  ), ]
  if (nrow(tmp_search) > 0) {
    temp_out <- cbind(temp_out_markers[rep(seq_len(nrow(tmp_search))), ], tmp_search)
    temp_out[, "Distance_from_gene_start"] <- temp_out[, "BP"] - temp_out[, "start_pos"]
    temp_out[, "Distance_from_gene_end"] <- temp_out[, "BP"] - temp_out[, "end_pos"]
    output_genes <- rbind(temp_out, output_genes)
  }
}
At the end, I want a data frame with all the rows that are within my tested intervals.
As I stated in a comment, your mock data won't result in a match, as the smallest BP value (836019) is larger than the largest end_pos (264843).
It could also be that I misunderstood your problem altogether!
I understand that you want to match the rows in df1 to those in df2 for which BP >= start_pos and BP <= end_pos. If so, we can achieve that using the non-equi joins provided by package data.table.
library(data.table)
result <- dt1[dt2,
              .(BP, CHR, gene_id),
              on = .(BP >= start_pos, BP <= end_pos),
              nomatch = NULL,
              by = .EACHI]
setnames(result, 1:2, names(dt2)[1:2])
result
start_pos end_pos BP CHR gene_id
1: 0.000000 2.000000 0 29 ABCD01
2: 4.571429 6.571429 6 30 ABCD03
3: 11.428571 13.428571 12 31 ABCD06
4: 16.000000 18.000000 18 32 ABCD08
5: 22.857143 24.857143 24 33 ABCD011
6: 29.714286 31.714286 30 34 ABCD014
In case you need the full 15 rows of dt2, simply omit the nomatch = NULL part.
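For instance, the same call without that argument (a minimal sketch; unmatched rows of dt2 then appear with NA where no dt1 row matched):
result_all <- dt1[dt2,
                  .(BP, CHR, gene_id),
                  on = .(BP >= start_pos, BP <= end_pos),
                  by = .EACHI]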
DATA USED:
dt1 <- data.table(CHR = 29:34,
BP = seq(0, 30, length.out = 6),
key = "BP")
dt2 <- data.table(start_pos = seq(0, 32, length.out = 15),
gene_id = paste0("ABCD", rep(0, 3), 1:15))
dt2[, end_pos := start_pos + 2]
setcolorder(dt2, c(1, 3, 2))
Alternative with foverlaps
As #r2evans mentioned in a comment, data.table has another function, foverlaps, that can be useful here. It checks whether a range overlaps with one in another table, so we need a small trick to create a 0-width range in dt1:
dt1[, BP2 := BP]
We also need to have keyed data.tables:
setkey(dt1, "BP", "BP2")
setkey(dt2, "start_pos", "end_pos")
And then pass it to the workhorse:
foverlaps(dt1, dt2)
start_pos end_pos gene_id CHR BP BP2
1: 0.000000 2.000000 ABCD01 29 0 0
2: 4.571429 6.571429 ABCD03 30 6 6
3: 11.428571 13.428571 ABCD06 31 12 12
4: 16.000000 18.000000 ABCD08 32 18 18
5: 22.857143 24.857143 ABCD011 33 24 24
6: 29.714286 31.714286 ABCD014 34 30 30
Of course we can get rid of BP2 later on with dt1[, BP2 := NULL].
If we want the full 15 rows of dt2, it's just a matter of inverting the order of the objects in the call:
foverlaps(dt2, dt1)
Thank you very much!
I ended up with this solution and it is working very well.
foverlaps(tmp_gene, tmp_markers, by.x = c("start_pos", "end_pos"),
          by.y = key(tmp_markers), nomatch = 0)
Cheers.
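For completeness, recent dplyr can express the same match; a sketch assuming dplyr >= 1.1.0, where join_by() supports non-equi conditions, applied to the dt1/dt2 data above:
library(dplyr)
inner_join(dt1, dt2, by = join_by(between(BP, start_pos, end_pos)))
Use left_join() instead if unmatched rows should be kept.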

How can I add additional columns to an existing data.frame that are aligned on one specific column already in the data.frame?

I'm a new R user and I'm having trouble trying to replicate a basic left join and update that I would normally do in SQL. I've checked several previously asked questions on Stack Overflow but still cannot quite get this code right.
I've been trying to build out a data.frame, starting with a single data.frame representing all possible zip codes. I have several additional data.frames, each of which counts construction years over a certain range (say 1990-1999), grouped by zip code. Note that each subsequent data.frame contains only a subset of the zip codes from the first data.frame. Essentially, I'm trying to build out a table, starting with a data.frame of all possible zip codes, and link each individual range data.frame to it so that the final table shows all ranges for each zip code. Each range data.frame needs to be aligned with the "ZIPS_ALL" variable. The 1990-1999, 2000-2009 and zip_codes_all data.frames are below:
1990-1999       2000-2009       zip_codes_all
ZIP     Count   ZIP     Count   ZIPS_ALL
19145   1       19145   1       19145
19146   2       19147   3       19146
19147   2       19148   1       19147
                                19148
I've tried several left_join and merge calls from dplyr/base R, but when attaching each range, the join overwrites the previous range, so my final table contains all zip codes and only the final range. I need to keep all ranges, so that the final table shows all zip codes from "All Zip Codes", aligned with the ZIPS_ALL variable.
# note: R object names cannot start with a digit, so the range tables
# are referenced with backticks here
df_1990_1999 <- left_join(x = zip_codes_all, y = `1990-1999`,
                          by = c("ZIP_ALL" = "ZIP"))
df_2000_2009 <- left_join(x = zip_codes_all, y = `2000-2009`,
                          by = c("ZIP_ALL" = "ZIP"))
Expected results would have all range data.frames lined up with all possible zip codes data.frame where missing entries, would just be NA values; See below:
1990-1999   2000-2009   zip_codes_all
Count       Count       ZIPS_ALL
1           1           19145
2           NA          19146
2           1           19147
NA          1           19148
The dput code for my zip_codes_all variable is:
dput(droplevels(zip_codes_all[1:10,]))
structure(list(ZIP_ALL = c(23115L, 22960L, 22578L, 23936L, 23308L,
23875L, 23518L, 23139L, 23917L, 22967L)), row.names = c(NA, -10L),
class = c("data.table", "data.frame"))
Here is my updated code with actual variable names. It works, but I am wondering if there is a more efficient way where I don't have to add each range manually, since I have numerous ranges to build out.
#create your range counts by group
nn_data_1939_range <- nn_data[yearbuilt <= 1939 ,.N, by = ZIP][order(ZIP)]
nn_data_1949_range <- nn_data[yearbuilt >= 1940 & yearbuilt <= 1949 ,.N, by = ZIP][order(ZIP)]
nn_data_1959_range <- nn_data[yearbuilt >= 1950 & yearbuilt <= 1959 ,.N, by = ZIP][order(ZIP)]
nn_data_1969_range <- nn_data[yearbuilt >= 1960 & yearbuilt <= 1969 ,.N, by = ZIP][order(ZIP)]
nn_data_1979_range <- nn_data[yearbuilt >= 1970 & yearbuilt <= 1979 ,.N, by = ZIP][order(ZIP)]
nn_data_1989_range <- nn_data[yearbuilt >= 1980 & yearbuilt <= 1989 ,.N, by = ZIP][order(ZIP)]
nn_data_1999_range <- nn_data[yearbuilt >= 1990 & yearbuilt <= 1999 ,.N, by = ZIP][order(ZIP)]
nn_data_2004_range <- nn_data[yearbuilt >= 2000 & yearbuilt <= 2004 ,.N, by = ZIP][order(ZIP)]
nn_data_2005_range <- nn_data[yearbuilt >= 2005,.N, by = ZIP][order(ZIP)]
#Build your table by each range; adding each range to the previously created data.frame; join zip_all to zip
tbl_LessThan_1939 <- left_join(x = zip_codes_all, y = nn_data_1939_range, by = c("ZIP_ALL" = "ZIP"))
tbl_0_1949 <- left_join(x = tbl_LessThan_1939, nn_data_1949_range, by = c("ZIP_ALL" = "ZIP"))
tbl_0_1959 <- left_join(x = tbl_0_1949, nn_data_1959_range, by = c("ZIP_ALL" = "ZIP"))
tbl_0_1969 <- left_join(x = tbl_0_1959, nn_data_1969_range, by = c("ZIP_ALL" = "ZIP"))
tbl_0_1979 <- left_join(x = tbl_0_1969, nn_data_1979_range, by = c("ZIP_ALL" = "ZIP"))
tbl_0_1989 <- left_join(x = tbl_0_1979, nn_data_1989_range, by = c("ZIP_ALL" = "ZIP"))
tbl_0_1999 <- left_join(x = tbl_0_1989, nn_data_1999_range, by = c("ZIP_ALL" = "ZIP"))
tbl_0_2004 <- left_join(x = tbl_0_1999, nn_data_2004_range, by = c("ZIP_ALL" = "ZIP"))
tbl_0_present <- left_join(x = tbl_0_2004, nn_data_2005_range, by = c("ZIP_ALL" = "ZIP"))
All right, my best guess is your data looks something like this (though probably much bigger):
library(data.table)
set.seed(47)
nn_data_sample = data.table(
yearbuilt = rep(c(1938, 1942, 1951, 1963), each = 4),
ZIP = sample(c(90210, 19145, 19146, 19147, 19148, 19149), size = 16, replace = TRUE)
)
nn_data_sample
# yearbuilt ZIP
# 1: 1938 19149
# 2: 1938 19146
# 3: 1938 19148
# 4: 1938 19148
# 5: 1942 19147
# 6: 1942 19148
# 7: 1942 19146
# 8: 1942 19146
# 9: 1951 19147
This is nicely formatted data, in long format, which is easy to work with. You seem to want to (a) count rows by zipcode and by the decade they were built (more-or-less, with a little more granularity recently), and then (b) convert the long data (with one zipcode column and one time column) into a wide format, where the times are spread across many columns.
For (a), we will use the cut function to divide the years into the decade-like intervals you want, and then aggregate the rows by zip code and decade.
decade_data = nn_data_sample[, decade_built := cut(
yearbuilt,
breaks = c(0, seq(1939, 1999, by = 10), 2004, Inf))
][, .(n = .N), by = .(decade_built, ZIP)]
decade_data
# decade_built ZIP n
# 1: (0,1939] 19149 1
# 2: (0,1939] 19146 1
# 3: (0,1939] 19148 2
# 4: (1939,1949] 19147 1
# 5: (1939,1949] 19148 1
# 6: (1939,1949] 19146 2
# 7: (1949,1959] 19147 1
# 8: (1949,1959] 19149 1
# ...
For a lot of use cases, this is a great format to work with; data.table makes it easy to do things "by group", so if you have more operations to apply to each decade, this should be your starting point. (Since we used :=, the decade_built column became part of the original data; you can look at it to verify that it worked.)
But, if you want to change to wide format, dcast does that for us:
dcast(decade_data, ZIP ~ decade_built, value.var = "n")
# ZIP (0,1939] (1939,1949] (1949,1959] (1959,1969]
# 1: 19146 1 2 NA NA
# 2: 19147 NA 1 1 2
# 3: 19148 2 1 1 NA
# 4: 19149 1 NA 1 1
# 5: 90210 NA NA 1 1
If you want to edit the column names, you can either specify them up front, using the labels argument of the cut function, or simply rename the columns at the end. Or do it in the middle, modifying the values of the decade_built column after it's created; do it wherever feels easiest. A sketch of the labels route follows.
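For instance, a minimal sketch of the labels argument (the label strings are illustrative, one per interval):
decade_labels <- c("pre-1940", "1940-1949", "1950-1959", "1960-1969",
                   "1970-1979", "1980-1989", "1990-1999", "2000-2004", "2005+")
nn_data_sample[, decade_built := cut(
  yearbuilt,
  breaks = c(0, seq(1939, 1999, by = 10), 2004, Inf),
  labels = decade_labels)]
dcast() will then use these labels as the column names.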

identify consecutively overlapping segments in R

I need to aggregate overlapping segments into a single segment ranging all connected segments.
Note that a simple foverlaps cannot detect connections between non-overlapping but connected segments; see the example for clarification. If it were to rain on the segments in my plot, I would be looking for the stretches of dry ground.
So far I have solved this problem with an iterative algorithm, but I'm wondering if there is a more elegant and straightforward way. I'm surely not the first one to face this problem.
I was thinking about a non-equi rolling join, but failed to implement that.
library(data.table)
(x <- data.table(start = c(41,43,43,47,47,48,51,52,54,55,57,59),
end = c(42,44,45,53,48,50,52,55,57,56,58,60)))
# start end
# 1: 41 42
# 2: 43 44
# 3: 43 45
# 4: 47 53
# 5: 47 48
# 6: 48 50
# 7: 51 52
# 8: 52 55
# 9: 54 57
# 10: 55 56
# 11: 57 58
# 12: 59 60
setorder(x, start)[, i := .I] # i is just a helper for plotting segments
plot(NA, xlim = range(x[,.(start,end)]), ylim = rev(range(x$i)))
do.call(segments, list(x$start, x$i, x$end, x$i))
x$grp <- c(1,3,3,2,2,2,2,2,2,2,2,4) # the grouping I am looking for
do.call(segments, list(x$start, x$i, x$end, x$i, col = x$grp))
(y <- x[, .(start = min(start), end = max(end)), keyby = grp])
# grp start end
# 1: 1 41 42
# 2: 2 47 58
# 3: 3 43 45
# 4: 4 59 60
do.call(segments, list(y$start, 12.2, y$end, 12.2, col = 1:4, lwd = 3))
EDIT:
That's brilliant, thanks; cummax & cumsum do the job. Uwe's answer is slightly better than David's comment.
end[.N] can give wrong results; try the example data x below.
max(end) is correct in all cases, and faster.
x <- data.table(start = c(11866, 12696, 13813, 14011, 14041),
end = c(13140, 14045, 14051, 14039, 14045))
min(start) and start[1L] give the same result (as x is ordered by start); the latter is faster.
Computing grp on the fly is significantly faster; unfortunately, I need grp assigned.
cumsum(cummax(shift(end, fill = 0)) < start) is significantly faster than cumsum(c(0, start[-1L] > cummax(head(end, -1L)))).
I did not test the package GenomicRanges solution.
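To see the end[.N] point concretely, here is a quick sketch grouping the example data x above: all five segments fall into one group, but the last row in start order does not carry the largest end.
x[order(start, end),
  .(end_last = end[.N], end_max = max(end)),
  by = .(grp = cumsum(cummax(shift(end, fill = 0)) < start))]
#    grp end_last end_max
# 1:   1    14045   14051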
The OP has requested to aggregate overlapping segments into a single segment ranging all connected segments.
Here is another solution which uses cummax() and cumsum() to identify groups of overlapping or adjacent segments:
x[order(start, end), grp := cumsum(cummax(shift(end, fill = 0)) < start)][
, .(start = min(start), end = max(end)), by = grp]
grp start end
1: 1 41 42
2: 2 43 45
3: 3 47 58
4: 4 59 60
Disclaimer: I have seen that clever approach somewhere else on SO but I cannot remember exactly where.
Edit:
As David Arenburg has pointed out, it is not necessary to create the grp variable separately. This can be done on-the-fly in the by = parameter:
x[order(start, end), .(start = min(start), end = max(end)),
by = .(grp = cumsum(cummax(shift(end, fill = 0)) < start))]
Visualisation
The OP's plot can be amended to also show the aggregated segments (quick and dirty):
plot(NA, xlim = range(x[,.(start,end)]), ylim = rev(range(x$i)))
do.call(segments, list(x$start, x$i, x$end, x$i))
x[order(start, end), .(start = min(start), end = max(end)),
  by = .(grp = cumsum(cummax(shift(end, fill = 0)) < start))][
  , segments(start, grp + 0.5, end, grp + 0.5, col = "red", lwd = 4)]
You can try a GenomicRanges approach. In the output each row is a group.
library(GenomicRanges)
x_gr <- with(x, GRanges(1, IRanges(start, end)))
as.data.table(reduce(x_gr, min.gapwidth=0))[,2:3]
start end
1: 41 42
2: 43 45
3: 47 58
4: 59 60
A visual inspection can be done using Gviz. One has to know that the package was built for biologists and genetic data; the underlying positions are DNA bases. Hence, one has to subtract 1 from the segment ends to get the correct plot.
library(Gviz)
ga <- GenomeAxisTrack()
xgr <- with(x, GRanges(1, IRanges(start, end = end - 1)))
xgr_red <- reduce(xgr, min.gapwidth = 1)
GT <- lapply(xgr, GeneRegionTrack)
GT_red <- lapply(xgr_red, GeneRegionTrack, fill = "lightblue")
plotTracks(c(ga, GT, GT_red),from = min(x$start), to = max(x$start)+2)

Call objects in function wrapped around ggplot2-function [duplicate]

This question already has answers here:
How to use a variable to specify column name in ggplot
(6 answers)
Closed 4 years ago.
I am about to write a function to iterate my plots over various variables. Unfortunately I am getting an error I don't understand.
library(ggplot2)
library(dplyr)
library(purrr)
df <- data.frame(af = c(rep(1,6),rep(2,6),rep(3,6)),
p = c(rep(c(rep("A",2),rep("B",2),rep("C",2)),3)),
ele.1 = sample(c(1:100), size=6),
ele.2 = sample(c(1:100), size=6),
ele.3 = sample(c(1:100), size=6))
af p ele.1 ele.2 ele.3
1 A 99 1 68
1 A 55 38 72
1 B 70 36 13
1 B 86 77 89
1 C 7 24 49
1 C 89 23 53
2 A 99 1 68
2 A 55 38 72
....
test <- function(.x = df, .af = 1, .p = c("A", "B"), .var = ele.1) {
  .x %>%
    filter(af == .af & p %in% .p) %>%
    ggplot(aes(x = .var, y = ele.2)) +
    geom_point() +
    geom_path()
}
test(df)
This results in:
Error in FUN(X[[i]], ...) : object 'ele.1' not found
In addition: Warning message:
In FUN(X[[i]], ...) : restarting interrupted promise evaluation
How could I call the object ele.1 inside ggplot wrapped in that function?
I hope this is not a rehash of another question.
Cheers
If you pass the .var argument as a character string, it runs. Is this what you are looking for?
test <- function(.x = df, .af = 1, .p = c("A", "B"), .var = "ele.1") {
  # keep the filtered data and put the mapping in ggplot()
  # so both layers inherit it
  .x <- .x %>% filter(af == .af & p %in% .p)
  ggplot(.x, aes(x = .x[[.var]], y = .x[["ele.2"]])) +
    geom_point() +
    geom_path()
}
test(df)
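As an alternative sketch, tidy evaluation lets the function take a bare column name instead of a string (assumes ggplot2 >= 3.0 and rlang >= 0.4 for the {{ }} operator; test2 is an illustrative name):
library(dplyr)
library(ggplot2)
test2 <- function(.x = df, .af = 1, .p = c("A", "B"), .var = ele.1) {
  .x %>%
    filter(af == .af & p %in% .p) %>%
    ggplot(aes(x = {{ .var }}, y = ele.2)) +
    geom_point() +
    geom_path()
}
test2(df, .var = ele.1)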

Rolling window with dplyr to find value of factor

I have a tibble like this:
head(a)
# A tibble: 6 x 4
date ROE ROFE ROTFE
<date> <dbl> <dbl> <dbl>
1 2000-01-31 0.033968932 0.0324214815 0.010205926
2 2000-02-29 0.006891111 -0.0003352941 -0.005230147
3 2000-03-31 0.006158519 0.0213992647 0.040399265
4 2000-04-28 0.060022222 0.0151191176 0.047586029
5 2000-05-31 -0.016960000 -0.0287617647 -0.036209559
6 2000-06-30 0.034133577 0.0144456522 0.030756522
I want to pick, at each date, the value of the factor which has the highest cumulative return over the last 2 months.
I have done something like this and it works.
However, my friend told me that it can be done in one or two lines of dplyr and I'm wondering if you could please show me how to do that.
index <- as.Date(unique(a$date))
nmonth <- 2
mean.ROE <- numeric()
for (i in 1:(length(index) - nmonth)) {
  index1 <- index[i]
  index2 <- index[nmonth + i]
  index3 <- index[nmonth + i + 1]
  # Take a 2-month window of ROE returns:
  b <- a[a$date >= index1 & a$date < index2, ] %>%
    mutate(cum.ROE = cumprod(1 + ROE)) %>%
    mutate(cum.ROFE = cumprod(1 + ROFE)) %>%
    mutate(cum.ROTFE = cumprod(1 + ROTFE))
  # Use the cumulative return over the 2-month window to determine which factor is best.
  mean.ROE1 <- ifelse(b$cum.ROE[nmonth] > b$cum.ROFE[nmonth] & b$cum.ROE[nmonth] > b$cum.ROTFE[nmonth],
                      a[a$date == index3, ]$ROE,
                      ifelse(b$cum.ROFE[nmonth] > b$cum.ROE[nmonth] & b$cum.ROFE[nmonth] > b$cum.ROTFE[nmonth],
                             a[a$date == index3, ]$ROFE,
                             a[a$date == index3, ]$ROTFE))
  # Bind the answer to the answer vector
  mean.ROE <- rbind(mean.ROE, mean.ROE1)
}
Create a function maxret which takes 2 + nmonth rows, x, and calculates the cumulative returns, r, for each column over the first two rows. For the largest of those, it returns the value in the last row of x.
Now use rollapplyr to apply it over a rolling window of width 2 + nmonth:
library(zoo)
maxret <- function(x) {
r <- apply(1 + x[1:2, ], 2, prod)
x[2 + nmonth, which.max(r)]
}
z <- read.zoo(as.data.frame(a))
res <- rollapplyr(z, 2 + nmonth, maxret, by.column = FALSE)
giving the zoo series:
> res
2000-04-28 2000-05-31 2000-06-30
0.06002222 -0.03620956 0.03075652
If you want a data frame, use fortify.zoo(res).
Note 1: The input was not provided in reproducible form in the question, so I have assumed this data.frame:
Lines <-
"date ROE ROFE ROTFE
1 2000-01-31 0.033968932 0.0324214815 0.010205926
2 2000-02-29 0.006891111 -0.0003352941 -0.005230147
3 2000-03-31 0.006158519 0.0213992647 0.040399265
4 2000-04-28 0.060022222 0.0151191176 0.047586029
5 2000-05-31 -0.016960000 -0.0287617647 -0.036209559
6 2000-06-30 0.034133577 0.0144456522 0.030756522"
a <- read.table(text = Lines, header = TRUE)
Note 2: With the input in Note 1, or with zoo 1.8.1 (the development version of zoo), this line:
z <- read.zoo(as.data.frame(a))
could be simplified to just:
z <- read.zoo(a)
but we have added the as.data.frame part in the main code so it works with tibbles as well as straight data frames even with the current version of zoo on CRAN.
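As for the requested short dplyr version: here is a sketch of the same logic using lag(), with the nmonth = 2 window hard-coded (the two window returns sit at lags 3 and 2 relative to the row being picked); under that assumption it should reproduce the zoo result above.
library(dplyr)
a %>%
  mutate(cum.ROE   = (1 + lag(ROE,   3)) * (1 + lag(ROE,   2)),
         cum.ROFE  = (1 + lag(ROFE,  3)) * (1 + lag(ROFE,  2)),
         cum.ROTFE = (1 + lag(ROTFE, 3)) * (1 + lag(ROTFE, 2))) %>%
  filter(!is.na(cum.ROE)) %>%
  mutate(best = c("ROE", "ROFE", "ROTFE")[max.col(cbind(cum.ROE, cum.ROFE, cum.ROTFE))],
         mean.ROE = case_when(best == "ROE"  ~ ROE,
                              best == "ROFE" ~ ROFE,
                              TRUE           ~ ROTFE)) %>%
  select(date, best, mean.ROE)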
