how to enclose combined cells as vector - r

I have got a data table like
library(data.table)
library(lifecontingencies)
dt <- data.table(cash = c(100,120), Flows = c(110,130),time = c(1,1),
Ids = c(2,2), int = c(0.02,0.04), Rates = c(0.02,0.04),
proba = c(0.9,0.8), bilities = c(0.7,0.6))
dt
# cash Flows time Ids int Rates proba bilities
#1: 100 110 1 2 0.02 0.02 0.9 0.7
#2: 120 130 1 2 0.04 0.04 0.8 0.6
and want to calculate
#presentValue(cashFlows, timeIds, interestRates, probabilities)
row-wise. How can I do it automatically instead of manually like so:
pV1 <- presentValue(cashFlows = c(100,110),
timeIds = c(1,2),
interestRates = c(0.02,0.02),
probabilities = c(0.9,0.7))
pV2 <- presentValue(cashFlows = c(120,130),
timeIds = c(1,2),
interestRates = c(0.04,0.04),
probabilities = c(0.8,0.6))
result <- c(pV1,pV2)
result
#162.2453 164.4231

As we are using data.table, one approach is to group by sequence of rows and apply the function
dt[, .(presValue = presentValue(cashFlows = unlist(c(cash, Flows)),
timeIds = unlist(c(time, Ids)),
interestRates = unlist(c(int, Rates)),
probabilities = unlist(c(proba, bilities)))), by = .(Row = 1:nrow(dt))]
# Row presValue
#1: 1 162.2453
#2: 2 164.4231
Another approach is to combine the multiple columns into one by melting and then apply the presentValue
dM <- melt(dt, measure = patterns('cash|Flow', 'time|Ids', 'int|Rates', 'proba|bilities'),
value.name = c('cashFlows', 'timeIds', 'interestRates', 'probabilities'))[,
rn := rowid(variable)][]
dM[, .(presValue = do.call(presentValue, .SD)),
by = .(Row = rn), .SDcols = cashFlows:probabilities]
# Row presValue
#1: 1 162.2453
#2: 2 164.4231

I would vectorize your calculation via apply:
apply(dt, 1, function(x) presentValue(cashFlows = x[1:2],
timeIds = x[3:4],
interestRates = x[5:6],
probabilities = x[7:8]))
# [1] 162.2453 164.4231
Note that you can manipulate indexes of row any way you want, for example, x[1:2] here stands for first and second cells of a row. You can choose first and fourth cells through x[c(1,4)], or just second cell via x[2]

It took me a minute to understand the presentValue and what it needed but I think this should do what you want.
apply(dt, 1, function(row) {
cashFlows <- c(row[1], row[2])
tIds <- c(row[3], row[4])
interestRates <- c(row[5], row[6])
probabilities <- c(row[7], row[8])
presentValue(cashFlows = cashFlows,
timeIds = tIds,
interestRates = interestRates,
probabilities = probabilities)
})

Related

Use of tail() in out-of-sample prediction

Below you see an out of sample rolling window estimation I found here: (https://www.r-bloggers.com/2017/11/formal-ways-to-compare-forecasting-models-rolling-windows/)
Here is my question: I know the tail() function returns the last n rows of a dataset. But I don't understand its purpose when its used in the random walk in line 13 or when calculating the errors in line 17 and 18. Any help on clarifying this would be much appreciated.
# = Number of windows and window size
w_size = 300
n_windows = nrow(X) - 300
# = Rolling Window Loop = #
forecasts = foreach(i=1:n_windows, .combine = rbind) %do%{
# = Select data for the window (in and out-of-sample) = #
X_in = X[i:(w_size + i - 1), ] # = change to X[1:(w_size + i - 1), ] forxpanding window
X_out = X[w_size + i, ]
# = Regression Model = #
m1 = lm(infl0 ~ . - prodl0, data = X_in)
f1 = predict(m1, X_out)
# = Random Walk = #
f2 = tail(X_in$infl0, 1)
return(c(f1, f2))
}
# = Calculate and plot errors = #
e1 = tail(X[ ,"infl0"], nrow(forecasts)) - forecasts[ ,1]
e2 = tail(X[ ,"infl0"], nrow(forecasts)) - forecasts[ ,2]
Here the function tail is applied to a vector because you select only the "inf10" column. In this case tail return the last element of the selected column.
df <- data.frame(A = c(1,2), B = c(3,4))
df[,"A"] # will return c(1,2)
tail(df[,"A"], 1) # will return 2
tail(df$B, 1) # will return 4

metaprogramming map on data.table list-columns

I cannot map over a nested column using data.table.
I made it an example.
library(data.table)
library(purrr)
DT <- setDT(list(
gp = c("A", "B"),
data = list(
setDT(list(d1 = c(1, 2, 3), d2 = c(2, 2, 4), d3 = c(0.2, 0.2, 0.4))),
setDT(list(d1 = c(10, 20, 30), d2 = c(20, 20, 40), d3 = c(0.2, 0.2, 0.4)))
),
metric = c("max", "min")
))
choose_a and choose_b are two of the n columns nested.
calc_name is the name of the calculated new column, that has been opereted by
the calc_metric_mean function
calc_metric_mean <- function(a, b, metric){
if(metric == "max"){
return(mean(c(max(a), max(b))))
}
if(metric == "min"){
return(mean(c(min(a), min(b))))
}
if(metric == "q74"){
return(mean(c(quantile(a, 74), quantile(b, 74))))
}
}
choose_a <- c("d1", "d2", "d2")
choose_b <- c("d3", "d1", "d2")
calc_name <- paste(choose_a, choose_b, sep = '')
metric <- "max"
for(i in 1:length(calc_name)){
DT[, calc_name[[i]] := map_dbl(
.x = data,
~calc_metric_mean(
a = choose_a[[i]],
b = choose_b[[i]],
metric = "max"
)
)]
}
The result would be
gp data d1d3 d2d1 d2d2
1: A <data.table[3x3]> 1.7 3.5 4
2: B <data.table[3x3]> 15.2 35.0 40
ADDED 2021-03-18
Second quiz: How about if you have the parameter "metric" in a column, outside the nested data?
The result would be
gp data metric d1d3 d2d1 d2d2
1: A <data.table[3x3]> max 1.7 3.5 4
2: B <data.table[3x3]> min 5.1 15 20
Sorry, if I haven't understood the question correctly, but if you're trying to produce the desired output using DT, using a for() loop with set() is an option:
for(i in 1:length(calc_name)){
set(DT, NULL, j = calc_name[i],
value = lapply(DT$data, function(x){
calc_metric_mean(a = x[[choose_a[i]]], b = x[[choose_b[i]]], metric = "max")
}
)
)
}
DT
This approach is in someways a nested for-loop, which isn't the most elegant, but it gets the job done and looping with set() can still be quite fast since it's updating by reference. One note is that this approach takes advantage of the fact that a data.table is a list with x[[choose_a[i]].
To get my code to work, I had to make two small changes to your example set up. First, because you created DT with structure, you need setDT(DT) to use set(). Second, I edited calc_metric_mean() to be more explicit about what it returns. Otherwise, it returned NULL for me:
calc_metric_mean <- function(a, b, metric){
if(metric == "max"){
return(mean(c(max(a), max(b))))
}
if(metric == "min"){
return(mean(c(min(a), min(b))))
}
if(metric == "q74"){
return(mean(c(quantile(a, 74), quantile(b, 74))))
}
}
There's another answer thanks to wonderful #diaggy 's answer.
for(i in 1:length(calc_name)){
DT[, calc_name[i] := lapply(DT$data, function(x){
calc_metric_mean(a = x[[choose_a[i]]], b = x[[choose_b[i]]], metric = "max")
})][]
}
This leads to the desired result too.
> DT
gp data d1d3 d2d1 d2d2
1: A <data.table[3x3]> 1.7 3.5 4
2: B <data.table[3x3]> 15.2 35 40
There're some comments to do:
The final empty [] is neccesary to list off the := result in the data.table (see 2.23 in faqs).
The double call x[[ is neccesary to assess the inner columns in a list-column. For some reason, x[, choose_a[i]] returns the character choose_a[i] and this won't work.
In the comparison, it is better #diaggy 's solution:
expr min lq mean median uq max neval
eval(diaggys_set) 3.589102 3.849702 4.487934 4.054001 4.516901 10.4261 100
eval(direct) 4.749001 5.127901 5.844534 5.386051 5.985651 12.9724 100
First Variation: using variables from the nested objetive
lapply is enough. See the #diaggy's Answer.
Second Variation: using variables from and Outside the nested objetive
If you have to load a parameter from other column, it is neccesary pass from lapply, to mapply.
for(i in 1:length(calc_name)){
set(DT, NULL, j = calc_name[i],
value = mapply(function(x, m){
calc_metric_mean(a = x[[choose_a[i]]], b = x[[choose_b[i]]], metric = m)
}, x = DT$data, m = DT$metric, SIMPLIFY = FALSE
)
)
}
> DT
gp data metric d1d3 d2d1 d2d2
1: A <data.table[3x3]> max 1.7 3.5 4
2: B <data.table[3x3]> min 5.1 15 20
SIMPLIFY = FALSE is required if it will return a list instead a vector.

Specific separator can separate the data frame

I try to execute this command
df2 <- as.data.frame.matrix(table(stack(setNames(strsplit(df$col1, "---", fixed = TRUE), df$id))[2:1]))
However I receive this error:
Error in table(stack(setNames(strsplit(df$col1, :
attempt to make a table with >= 2^31 elements
Any idea why this error happaned? Unfortunately I can't provide a reproducable example with this code because I can't find what caused this error.
What makes this command is that it make 0 and 1 values which separate by ---.
Example input:
data.frame(id = c(1,2), col1 = c("text---here","text---there"))
expected output
data.frame(id = c(1,2), text = c(1,1), here = c(1,0), there = c(0,1))
If the task in question is complex, it is worth splitting it into chunks. Try this:
x = data.frame(id = c(1,2), col1 = c("text---here","text---there")); x$col1 = as.vector(x$col1)
Split = strsplit(as.vector(x$col1), split = "---")
levels = unique(unlist(Split))
x = cbind(x, matrix(ncol = length(levels), nrow = nrow(x)))
for(i in 1:length(levels))
{
x[,ncol(x)-length(levels)+i] <- sapply(Split, function(x) max(x == levels[i]))
}
colnames(x) <- c("id", "col1", levels)
x
# id col1 text here there
# 1 1 text---here 1 1 0
# 2 2 text---there 1 0 1

how can i group a large number of elements quickly

I've created some 'data' here as an example. It's contains 100 elements where each start with a letter and are followed by 3 random numbers.
I want to know the best/quickest way to convert them into groups like i've started to do in the for loop.
Assume i need to create 50 groups and instead of 100 elements in 'data' there are a million.
The groupings themselves will be fairly random. I've used A000-A599 and A600-A999 as the first 2 groupings in my example but the groupings are not neatly spaced out e.g B000-B599 and B600-B999 are not necessarily the next groupings. The next groupings could for example be B000-C299,C300-C799,C800-D499 etc. I would need to enter these groupings manually.
I'm guessing a for loop isn't the best way to do this as it would take a long time to complete the loop.
library(stringr)
library(magicfor)
data <- paste(sample(LETTERS, 100, replace = T),
sample(str_pad(000:999, width = 3, side = "left", pad = "0"), 100, replace = T), sep = "")
magic_for()
for(x in seq_along(data)){
if( grepl("A[0-5]", data[1])){
range <- "A000-A599"
}elseif( grepl("A[6-9]", data[1])){
range <- "A600-A999"
}
put(range)
}
You can try something like this:
> #Round function
> roundUp <- function(x,to=10) {
+ to*(x%/%to + as.logical(x%%to))
+ }
> #Create a dataframe for easy store
> df <- data.frame(data = data, stringsAsFactors = F)
> df %>%
+ mutate(C = substr(data, 1, 1),
+ N = as.integer(substr(data, 2, 4))) %>%
+ mutate(N = roundUp(N, to = 500)) %>%
+ mutate(data2 = paste0(C, N)) %>%
+ select(data, data2)
data data2
1 U493 U500
2 A429 A500
3 N564 N1000
4 W656 W1000
5 J978 J1000
6 B232 B500
7 D240 D500
8 I796 I1000
9 E831 E1000
...(truncated)
The data2 field contains the new groups
Using the tidyverse packages, I would convert your data vector into a data.frame (or tibble) format.
library(tidyverse)
df <- tibble(my_variable = data) %>%
mutate(
first_char = substr(my_variable, 1, 1),
random_numbers = substr(my_variable, 2, 4)
)
Once you've gotten that far, it's easy to group the data however you want. Your loop could be achieved like so:
df %>%
mutate(
group = ifelse(as.numeric(random_numbers) < 600, "000-599", "600-999"),
desired_result = paste0(first_char, group)
)
I recommend reading the following (free) ebook cover to cover, it will equip you with a bunch of useful tools for everyday R tasks like the one you've outlined:
https://r4ds.had.co.nz/index.html
The groupings themselves will be fairly random. I've used A000-A599 and A600-A999 as the first 2 groupings in my example but the groupings are not neatly spaced out e.g B000-B599 and B600-B999 are not necessarily the next groupings. The next groupings could for example be B000-C299,C300-C799,C800-D499 etc.
Since your groups are lexicographic intervals, you could use rolling joins. In this case you only need to specify the lower bound for each group:
library(data.table)
# define decrement function
dec = function(x){
ltr = substr(x, 1, 1)
num = as.integer(substr(x, 2, 4))
w0 = num == 0L
ltr = replace(ltr, w0, LETTERS[match(ltr[w0], LETTERS) - 1L])
num = replace(num - 1L, w0, 999L)
sprintf("%s%03d", ltr, num)
}
# enumerate lower bounds and derive ranges
rangeDT = data.table(lb = c("A000", "A600", "B000", "C300", "C800"))
rangeDT[, ub := dec(shift(lb, type="lead", fill="Z999"))]
rangeDT[, range := sprintf("%s-%s", lb, ub)]
# lb ub range
# 1: A000 A599 A000-A599
# 2: A600 A999 A600-A999
# 3: B000 C299 B000-C299
# 4: C300 C799 C300-C799
# 5: C800 Z998 C800-Z998
Then the rolling update join is...
DT = data.table(x = data)
DT[, range := rangeDT[.SD, on=.(lb = x), roll=TRUE, x.range]]
The result looks like
> head(DT)
x range
1: C965 C800-Z999
2: Q973 C800-Z999
3: V916 C800-Z999
4: C701 C300-C799
5: A363 A000-A599
6: F144 C800-Z999
If your data were numeric, cut or findInterval from base R would work, but for whatever reason those do not support strings.
How about this?
library(data.table)
ranges <- c(paste0(LETTERS, "[0-5]"),paste0(LETTERS, "[6-9]"))
final <-lapply(ranges, function(y) {
matches <- grepl(y, data)
if(sum(matches)>0){
tmp <-data.table(element=data[matches], range=
paste0(str_sub(y,1,1), str_sub(y,3,3),0,0,"-", str_sub(y,1,1), str_sub(y,5,5),9,9))}
else return(NULL)
})
final_2 <- rbindlist(final)
# element range
# A374 A000-A599
# B498 B000-B599
# B064 B000-B599
# C131 C000-C599
# C460 C000-C599
# C099 C000-C599
structure(list(element = c("A374", "B498", "B064", "C131", "C460", "C099", "C193", "E428", "E108", "E527", "E138", "E375", "E312", "F046", "F417", "F094", "G142", "G461", "G068", "H372", "H523", "H027", "H506", "I470", "I169", "I050", "I495", "I405", "J298", "K165", "K169", "K131", "L510", "L210", "L277", "N257", "N554", "N452", "N484", "N247", "N373", "N492", "O347", "O221", "O176", "P578", "P477", "Q062", "Q257", "Q083", "R306", "S415", "S154", "S226", "S400", "T132", "T181", "T321", "V109", "V118", "V267", "W381", "W047", "X317", "X192", "Y390", "Y132", "Y327", "Y141", "Y353", "Z429", "C981", "D813", "F934", "G910", "G673", "G664", "I754", "I624", "L603", "N991", "N996", "O689", "O932", "P854", "P689", "P761", "P681", "Q631", "S620", "T923", "T841", "U787", "U929", "W942", "W702", "X770", "X880", "Y719", "Y969"), range = c("A000-A599", "B000-B599", "B000-B599", "C000-C599", "C000-C599", "C000-C599", "C000-C599", "E000-E599", "E000-E599", "E000-E599", "E000-E599", "E000-E599", "E000-E599", "F000-F599", "F000-F599", "F000-F599", "G000-G599", "G000-G599", "G000-G599", "H000-H599", "H000-H599", "H000-H599", "H000-H599", "I000-I599", "I000-I599", "I000-I599", "I000-I599", "I000-I599", "J000-J599", "K000-K599", "K000-K599", "K000-K599", "L000-L599", "L000-L599", "L000-L599", "N000-N599", "N000-N599", "N000-N599", "N000-N599", "N000-N599", "N000-N599", "N000-N599", "O000-O599", "O000-O599", "O000-O599", "P000-P599", "P000-P599", "Q000-Q599", "Q000-Q599", "Q000-Q599", "R000-R599", "S000-S599", "S000-S599", "S000-S599", "S000-S599", "T000-T599", "T000-T599", "T000-T599", "V000-V599", "V000-V599", "V000-V599", "W000-W599", "W000-W599", "X000-X599", "X000-X599", "Y000-Y599", "Y000-Y599", "Y000-Y599", "Y000-Y599", "Y000-Y599", "Z000-Z599", "C600-C999", "D600-D999", "F600-F999", "G600-G999", "G600-G999", "G600-G999", "I600-I999", "I600-I999", "L600-L999", "N600-N999", "N600-N999", "O600-O999", "O600-O999", "P600-P999", "P600-P999", "P600-P999", "P600-P999", "Q600-Q999", "S600-S999", "T600-T999", "T600-T999", "U600-U999", "U600-U999", "W600-W999", "W600-W999", "X600-X999", "X600-X999", "Y600-Y999", "Y600-Y999")), row.names = c(NA,
-100L), class = c("data.table", "data.frame"))

how to split data frame by time interval

I have two data frames, first is the daily return of 3 securities, second is the weights of the securities, as the following:
daily.return <- data.frame(date = seq.Date(from = as.Date("2015-01-01"),
by = "days",
length.out = 100),
a = runif(100,-0.1,0.1),
b = runif(100,-0.1,0.1),
c = runif(100,-0.1,0.1))
weights <- data.frame(startDate = c(as.Date("2015-01-01"),
as.Date("2015-02-10"),
as.Date("2015-03-15")),
endDate = c(as.Date("2015-02-09"),
as.Date("2015-03-14"),
as.Date("2015-04-10")),
a = c(0.3,0.5,0.2),
b = c(0.4,0.2,0.1),
c = c(0.3,0.3,0.7)
)
I know how to split data fame by weeks etc.., if we convert data frame to xts;but how to split this daily.return according to startDate and endDate in weights?
Suppose a fund have this three securities,how to calculate the fund nav and daily return?
This should do the job.
daily.return <- data.frame(date = seq.Date(from = as.Date("2015-01-01"),
by = "days",
length.out = 100),
a = runif(100,-0.1,0.1),
b = runif(100,-0.1,0.1),
c = runif(100,-0.1,0.1))
weights <- data.frame(startDate = c(as.Date("2015-01-01"),
as.Date("2015-02-10"),
as.Date("2015-03-15")),
endDate = c(as.Date("2015-02-09"),
as.Date("2015-03-14"),
as.Date("2015-04-10")),
a = c(0.3,0.5,0.2),
b = c(0.4,0.2,0.1),
c = c(0.3,0.3,0.7)
)
library(quantmod)
daily.xts <- as.xts(daily.return[,-1],daily.return[,1])
# Assuming that the total period is the same in both the data frames
weights.xts <- xts(matrix(NA,nrow(daily.xts),3),order.by=index(daily.xts))
names(weights.xts) <- c("a","b","c")
for (i in 1:nrow(weights)){
temp.inputs <- weights[i,]
temp.period <- paste(temp.inputs[,1],temp.inputs[,2],sep="/")
len <- nrow(weights.xts[temp.period])
weights.xts[temp.period,1:3] <- matrix(rep(as.numeric(temp.inputs[,3:5]),len),len,byrow=T)
}
weighted.returns <- daily.xts * weights.xts
weighted.returns <- as.xts(rowSums(weighted.returns),index(weighted.returns))
names(weighted.returns) <- "Weighted Returns"
weighted.returns$Cumulative <- cumsum(weighted.returns)
plot(weighted.returns$Cumulative)
You can split daily.return according to start and end date in weights using apply, performing row-wise operation
apply(weights, 1, function(x) daily.return[daily.return$date >= x[1]
& daily.return$date <= x[2], ])
This will give a list of 3 dataframes splitted according to the range in weights.
EDIT
If I have understood correctly, you want each value in the column a, b, c of the daily.return to multiply with respective columns in the weights.
apply(weights, 1, function(x) {
A <- daily.return[daily.return$date >= x[1] & daily.return$date <= x[2], ]
t(t(A[, 2:4]) * as.numeric(x[3:5]))
}
)

Resources