How to use multiple rolling windows using slider in R?

Here is my toy data.
df <- tibble::tribble(
~fund, ~dates, ~y, ~x,
"Fund_A", "03/31/2021", 0.04, 0.04,
"Fund_A", "04/30/2021", 0.04, -0.03,
"Fund_A", "05/31/2021", 0.03, 0.04,
"Fund_A", "06/30/2021", -0.01, 0.03,
"Fund_A", "07/31/2021", -0.06, -0.03,
"Fund_A", "08/31/2021", 0.04, 0.05,
"Fund_A", "09/30/2021", 0.01, -0.04,
"Fund_A", "10/31/2021", 0.02, -0.01,
"Fund_A", "11/30/2021", 0.03, -0.03,
"Fund_A", "12/31/2021", -0.02, 0.06,
"Fund_B", "03/31/2021", 0.01, 0.02,
"Fund_B", "04/30/2021", 0.01, 0.05,
"Fund_B", "05/31/2021", 0.05, -0.05,
"Fund_B", "06/30/2021", 0.01, -0.02,
"Fund_B", "07/31/2021", 0.04, 0.09,
"Fund_B", "08/31/2021", 0.02, -0.01,
"Fund_B", "09/30/2021", 0.02, 0.02,
"Fund_B", "10/31/2021", -0.01, 0.01,
"Fund_B", "11/30/2021", 0.05, 0.01,
"Fund_B", "12/31/2021", -0.03, 0.02
)
I have code that runs a rolling regression and spits out the regression output using the slider package.
library(tidyverse)
library(slider)
library(broom)
df %>%
  group_by(fund) %>%
  mutate(model = slide(.x = cur_data(),
                       .f = possibly(~ (lm(y ~ x, data = .x) %>%
                                          tidy() %>%
                                          filter(term != "(Intercept)")),
                                     otherwise = NA),
                       .before = 5)) %>%
  ungroup() %>%
  unnest(model)
Now I want to be able to run the above code, across the funds, with multiple .before values and combine the results into one data frame. In other words, I want the above code to work on, say, .before = seq(4, 7, 1). It would be interesting to see an attempt using purrr::map!

To carry out the same operation multiple times, we can use a for loop, an apply function, or purrr's map functions.
To keep the code tidy, I first made a function out of the code to repeat, with the value of .before as a parameter. map_dfr() then executes that function for each value of .before and binds the resulting data frames together (alternatively, lapply() followed by do.call(rbind) does the same).
df <- tibble::tribble(
~fund, ~dates, ~y, ~x,
"Fund_A", "03/31/2021", 0.04, 0.04,
...
"Fund_B", "12/31/2021", -0.03, 0.02
)
library('tidyverse')
library('slider')
library('broom')
#
# function that performs the action for a single value of .before; returns a data frame
# example: calculate_lm_values(df, 4)
#
calculate_lm_values <- function(df, .before) {
  df %>%
    group_by(fund) %>%
    mutate(model = slide(.x = cur_data(),
                         .f = possibly(~ (lm(y ~ x, data = .x) %>%
                                            tidy() %>%
                                            filter(term != "(Intercept)")),
                                       otherwise = NA),
                         .before = .before),
           before = .before) %>%
    ungroup() %>%
    unnest(model)
}
#
# run function multiple times and bind rows together
#
df_results <- map_dfr(4:7, ~ calculate_lm_values(df, .x))
# alternatively:
# df_results <- lapply(4:7, function(x) calculate_lm_values(df, x))
# df_results <- do.call(rbind, df_results)
df_results
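Side note: newer dplyr releases (1.1.0 and later) deprecate cur_data() in favour of pick(). A minimal sketch of the same helper using pick(), assuming dplyr >= 1.1.0; everything else is unchanged and the function name is just illustrative:
calculate_lm_values_pick <- function(df, .before) {
  df %>%
    group_by(fund) %>%
    # pick(y, x) hands the current group's y and x columns to slide(),
    # replacing the deprecated cur_data()
    mutate(model = slide(.x = pick(y, x),
                         .f = possibly(~ lm(y ~ x, data = .x) %>%
                                         tidy() %>%
                                         filter(term != "(Intercept)"),
                                       otherwise = NA),
                         .before = .before),
           before = .before) %>%
    ungroup() %>%
    unnest(model)
}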

Related

Output selected variables to the global environment from an R function

I have a function which is an extension of an earlier question here:
Function to calculate median by column for an R dataframe that is applied regularly to multiple dataframes
My function is below:
library(outliers)
MscoreMax <- 3
scores_na <- function(x, ...) {
  not_na <- !is.na(x)
  scores <- rep(NA, length(x))
  scores[not_na] <- outliers::scores(na.omit(x), ...)
  scores
}
mediansFunction <- function(x){
  labmedians <- sapply(x[-1], median)
  median_of_median <- median(labmedians)
  grand_median <- median(as.matrix(x[-1]))
  labMscore <- as.vector(round(abs(scores_na(labmedians, "mad")), digits = 2)) # calculate M-score by lab
  labMscoreIndex <- which(labMscore > MscoreMax) # positions in the vector that exceed MscoreMax
  x[-1][labMscoreIndex] <- NA # discard values above the threshold by setting them to NA
  return(x)
}
The function has the desired outcome of converting my M-score values above the threshold to NA. However, from within the function I would like to send
labmedians
grand_median
labMscore
to the global environment as their own variables, not as a list of three items. Can I do this, or is it better to create a second, slightly different function that sends the variables to the global environment as a list, and then use list2env outside the function afterwards to extract the variables as separate items?
my df below
structure(list(Determination_No = 1:6, `2` = c(0.08, 0.08, 0.08,
0.08, 0.08, 0.08), `3` = c(0.08, 0.07, 0.07, 0.08, 0.07, 0.07
), `4` = c(0.07, 0.08, 0.08, 0.08, 0.07, 0.08), `5` = c(0.08,
0.08, 0.08, 0.08, 0.09, 0.09), `7` = c(0.09, 0.09, 0.11, 0.1,
0.1, 0.1), `8` = c(0.086, 0.087, 0.086, 0.09, 0.083, 0.079),
`10` = c(0.049748274, 0.049748274, 0.066331032, 0.066331032,
0.066331032, 0.049748274), `12` = c(0.086, 0.078, 0.078,
0.077, 0.077, 0.068)), class = "data.frame", row.names = c(NA,
-6L))
It is not recommended to write to the global environment from inside a function. If you want to create multiple objects in the global environment, return a named list from the function and use list2env().
mediansFunction <- function(x){
  labmedians <- sapply(x[-1], median)
  median_of_median <- median(labmedians)
  grand_median <- median(as.matrix(x[-1]))
  labMscore <- as.vector(round(abs(scores_na(labmedians, "mad")), digits = 2)) # calculate M-score by lab
  labMscoreIndex <- which(labMscore > MscoreMax) # positions in the vector that exceed MscoreMax
  x[-1][labMscoreIndex] <- NA # discard values above the threshold by setting them to NA
  dplyr::lst(data = x, labmedians, grand_median, labMscore)
}
result <- mediansFunction(df)
list2env(result, .GlobalEnv)
Now you have variables data, labmedians, grand_median and labMscore in the global environment.
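If you would rather not write to the global environment at all, you can also work with the returned list directly; for example:
result <- mediansFunction(df)   # same call as above
result$grand_median             # single value
result$labMscore                # M-score per lab
head(result$data)               # data with over-threshold values replaced by NA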

Split data by the difference between row values

I'm looking to split the dataset into chunks of 10 days. The step between days is not always 1: it could be 2, as in the case of -149 -> -147.
Is there any smarter way than testing the difference between days each time and recording the begin and end indexes for the split?
df = structure(list(day = c(-155, -153, -152, -151, -150, -149, -147,
-146, -145, -144, -143, -142, -141, -140, -139, -138, -137, -135,
-134, -131), margin = c(0.02, 0.03, 0.065, 0.06, 0.07, 0.05,
0.035, 0.06, 0.0266666666666667, 0.03, 0.04, 0.06, 0.0366666666666667,
0.035, 0.09, 0.12, 0.045, 0.04, 0.02, 0.06)), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
An option is to take the diff of adjacent elements of the 'day' column and then the cumulative sum (cumsum) to create a diff column. Integer division of that column by 10 (%/%) returns an index that increments at every 10, which creates the grouping; that column can then be used in group_split to split the data into a list of data.frames.
library(dplyr)
df %>%
  mutate(diff = cumsum(c(0, diff(day))),
         diff = pmax(0, (diff - 1)) %/% 10) %>%
  group_split(diff, .keep = FALSE)
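To see how the grouping index is built before the split, here is a small sketch that keeps the intermediate columns instead of dropping them (the column names step, elapsed and grp are just illustrative):
df %>%
  mutate(step    = c(0, diff(day)),               # gap to the previous day
         elapsed = cumsum(step),                  # days elapsed since the first row
         grp     = pmax(0, elapsed - 1) %/% 10)   # increments every 10 elapsed days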

How to change the x axis to a time series in ggplot2

I'm trying to replicate the graph provided at https://www.chicagofed.org/research/data/cfnai/current-data since I will soon need graphs that look like this for other data sets. I'm almost there, but I can't seem to figure out how to change the x axis to the dates when using ggplot2. Specifically, I would like to change it to the dates in the Date column. I tried about a dozen ways and nothing is working. The data for this graph is under "indexes" on the website. Here's my code and the graph, where dataSet is the data from the website:
library(ggplot2)
library(reshape2)
library(tidyverse)
library(lubridate)
df = data.frame(time = index(dataSet), melt(as.data.frame(dataSet)))
df
str(df)
df$data1.Date = as.Date(as.character(df$data1.Date))
str(df)
replicaPlot1 = ggplot(df, aes(x = time, y = value)) +
  geom_area(aes(colour = variable, fill = variable)) +
  stat_summary(fun = sum, geom = "line", size = 0.4) +
  labs(title = "Chicago Fed National Activity Index (CFNAI) Current Data")
replicaPlot1 + scale_x_continuous(name = "time", breaks = waiver(), labels = waiver(),
                                  limits = df$data1.Date)
replicaPlot1
Any sort of help on this would be very much appreciated!
I'm not sure what your intention is with data.frame(time = index(dataSet), melt(as.data.frame(dataSet))). When I download the data and read it via readxl::read_excel, I get a nice tibble with a datetime column which, after reshaping via tidyr::pivot_longer, can easily be plotted, and by making use of scale_x_datetime it gets a nicely formatted date axis.
Using just the first 20 rows of data, try this:
library(ggplot2)
library(readxl)
library(tidyr)
df <- pivot_longer(df, -Date, names_to = "variable")
ggplot(df, aes(x = Date, y = value)) +
  geom_area(aes(colour = variable, fill = variable)) +
  stat_summary(fun = sum, geom = "line", size = 0.4) +
  labs(title = "Chicago Fed National Activity Index (CFNAI) Current Data") +
  scale_x_datetime(name = "time")
#> Warning: Removed 4 rows containing non-finite values (stat_summary).
#> Warning: Removed 4 rows containing missing values (position_stack).
Created on 2021-01-28 by the reprex package (v1.0.0)
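If you want more control over the tick spacing and label format, scale_x_datetime() also takes date_breaks and date_labels; a small sketch on the same df (the chosen break interval and label format are just an example):
ggplot(df, aes(x = Date, y = value)) +
  geom_area(aes(colour = variable, fill = variable)) +
  scale_x_datetime(name = "time", date_breaks = "6 months", date_labels = "%b %Y")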
DATA
# Data downloaded from https://www.chicagofed.org/~/media/publications/cfnai/cfnai-data-series-xlsx.xlsx?la=en
# df <- readxl::read_excel("cfnai-data-series-xlsx.xlsx")
# dput(head(df, 20))
df <- structure(list(Date = structure(c(
-87004800, -84412800, -81734400,
-79142400, -76464000, -73785600, -71193600, -68515200, -65923200,
-63244800, -60566400, -58060800, -55382400, -52790400, -50112000,
-47520000, -44841600, -42163200, -39571200, -36892800
), tzone = "UTC", class = c(
"POSIXct",
"POSIXt"
)), P_I = c(
-0.26, 0.16, -0.43, -0.09, -0.19, 0.58, -0.05,
0.21, 0.51, 0.33, -0.1, 0.12, 0.07, 0.04, 0.35, 0.04, -0.1, 0.14,
0.05, 0.11
), EU_H = c(
-0.06, -0.09, 0.01, 0.04, 0.1, 0.22, -0.04,
0, 0.32, 0.16, -0.2, 0.34, 0.06, 0.17, 0.17, 0.07, 0.12, 0.12,
0.15, 0.18
), C_H = c(
-0.01, 0.01, -0.05, 0.08, -0.07, -0.01,
0.12, -0.11, 0.1, 0.15, -0.04, 0.04, 0.17, -0.03, 0.05, 0.08,
0.09, 0.05, -0.06, 0.09
), SO_I = c(
-0.01, -0.07, -0.08, 0.02,
-0.16, 0.22, -0.08, -0.07, 0.38, 0.34, -0.13, -0.1, 0.08, -0.07,
0.06, 0.07, 0.12, -0.3, 0.35, 0.14
), CFNAI = c(
-0.34, 0.02, -0.55,
0.04, -0.32, 1, -0.05, 0.03, 1.32, 0.97, -0.46, 0.39, 0.38, 0.11,
0.63, 0.25, 0.22, 0.01, 0.49, 0.52
), CFNAI_MA3 = c(
NA, NA, -0.29,
-0.17, -0.28, 0.24, 0.21, 0.33, 0.43, 0.77, 0.61, 0.3, 0.1, 0.29,
0.37, 0.33, 0.37, 0.16, 0.24, 0.34
), DIFFUSION = c(
NA, NA, -0.17,
-0.14, -0.21, 0.16, 0.11, 0.17, 0.2, 0.5, 0.41, 0.28, 0.2, 0.32,
0.36, 0.32, 0.33, 0.25, 0.31, 0.47
)), row.names = c(NA, -20L), class = c(
"tbl_df",
"tbl", "data.frame"
))

tidyverse divide several columns by other columns n positions later (avoiding loops)

library(tidyverse)
dat <- tribble(
~Scenario, ~V1, ~V2, ~V3, ~V4,
1, 0.97, 0.46, 0.79, 0.25,
1, 0.21, 0.45, 0.23, 0.63,
1, 0.95, 0.97, 0.07, 0.61,
1, 0.93, 0.79, 0.23, 0.86,
2, 0.22, 0.01, 0.42, 0.47,
2, 0.71, 0.17, 0.16, 0.88,
3, 0.73, 0.38, 0.10, 0.77,
3, 0.49, 0.37, 0.90, 0.52,
3, 0.99, 0.71, 0.66, 0.05,
3, 0.72, 0.75, 0.69, 0.01,
3, 0.15, 0.87, 0.12, 0.02,
4, 0.94, 0.30, 0.91, 0.99)
I'm adding four new columns to this data, where each new column represents the sum of each V1:V4 column grouped by Scenario:
dat_new <- dat %>%
  group_by(Scenario) %>%
  mutate_at(vars(-group_cols()), .funs = list(sum = sum))
I'm looking for a simple way to divide V1 by V1_sum, V2 by V2_sum and so on, assuming that a) I have as many original V columns as I have sum columns and b) the data is correctly ordered and follows my pattern where all the V columns come first, followed by the sum columns.
I just asked another question here on SO where the focus was on pivoting the data to long format and then transforming it back to wide format, but I was wondering if there was an easier solution in the tidyverse.
Note: I could probably just loop through each column and divide it by the column four positions later, but I was looking for a more elegant solution.
Instead of creating temporary sum columns and then dividing, we can do the division directly inside the function passed to .funs:
library(dplyr)
dat %>%
  group_by(Scenario) %>%
  mutate_at(vars(-group_cols()), .funs = list(percentage = ~ . / sum(.)))
If it should be computed from dat_new, one option is map2_dfc from purrr:
library(purrr)
map2_dfc(dat %>%
           select(V1:V4),
         dat_new %>%
           ungroup %>%
           select(ends_with('sum')),
         `/`)
Or using base R
dat[2:5]/dat_new[6:9]
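Side note: in current dplyr (1.0.0 and later), mutate_at() is superseded by across(). A sketch of the first approach rewritten with across(), assuming dplyr >= 1.0.0:
dat %>%
  group_by(Scenario) %>%
  # divide each V column by its group sum; .names controls the new column names
  mutate(across(V1:V4, ~ .x / sum(.x), .names = "{.col}_percentage")) %>%
  ungroup()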

Text to expression in a function for variance estimation of derived parameters via the Delta Method

I have written a function to perform matrix multiplication on each row of the data set pd.matrix. The function my.var.function performs as intended. However, now I want to generalize the function to handle matrices of variable sizes instead of just the example matrix with five columns.
To generalize the function I imagine that I will need to replace x[1], x[2], x[3], x[4], x[5] in the apply statement with something like x[1]:x[ncol(pd.matrix)]. I imagine I similarly will need to replace the two instances of (x1, x2, x3, x4, x5) within the function.
I have tried making these changes with eval(parse(text= followed by paste0 to create the desired series of x1, x2, x3, x4, x5 or x[1], x[2], x[3], x[4], x[5] for this example. However, I have been unable to get eval(parse(text= to work after trying numerous permutations.
How can I generalize the function and apply statement to handle a pd.matrix of n columns rather than five columns?
pd.matrix <- matrix(c(0.10, 0.20, 0.30, 0.40, 0.50,
0.11, 0.21, 0.31, 0.41, 0.51,
0.12, 0.22, 0.32, 0.42, 0.52,
0.13, 0.23, 0.33, 0.43, 0.53,
0.14, 0.24, 0.34, 0.44, 0.54), nrow = 5, byrow = TRUE)
vcv.mat = matrix(c(0.01, 0.0020, 0.0030, 0.0040, 0.0050,
0.0020, 0.02, 0.0031, 0.0041, 0.0051,
0.0030, 0.0031, 0.03, 0.0042, 0.0052,
0.0040, 0.0041, 0.0042, 0.04, 0.0053,
0.0050, 0.0051, 0.0052, 0.0053, 0.05), nrow = 5, byrow = TRUE)
my.var.function <- function(x1, x2, x3, x4, x5) {
  my.pd <- matrix(c(x1, x2, x3, x4, x5), nrow = 1)
  my.mat = my.pd %*% vcv.mat
  my.var = my.mat %*% t(my.pd)
  return(my.var = my.var)
}
apply(pd.matrix, 1, function(x) my.var.function(x[1], x[2], x[3], x[4], x[5]))
# [1] 0.0303160 0.0319642 0.0336588 0.0353998 0.0371872
The solution turned out to be very simple. Not sure why I did not see this solution before.
pd.matrix <- matrix(c(0.10, 0.20, 0.30, 0.40, 0.50,
0.11, 0.21, 0.31, 0.41, 0.51,
0.12, 0.22, 0.32, 0.42, 0.52,
0.13, 0.23, 0.33, 0.43, 0.53,
0.14, 0.24, 0.34, 0.44, 0.54), nrow = 5, byrow = TRUE)
vcv.mat = matrix(c(0.01, 0.0020, 0.0030, 0.0040, 0.0050,
0.0020, 0.02, 0.0031, 0.0041, 0.0051,
0.0030, 0.0031, 0.03, 0.0042, 0.0052,
0.0040, 0.0041, 0.0042, 0.04, 0.0053,
0.0050, 0.0051, 0.0052, 0.0053, 0.05), nrow = 5, byrow = TRUE)
my.var.function <- function(x) {
  my.pd <- matrix(c(x), nrow = 1)
  my.mat = my.pd %*% vcv.mat
  my.var = my.mat %*% t(my.pd)
  return(my.var = my.var)
}
apply(pd.matrix, 1, function(x) my.var.function(x))
# [1] 0.0303160 0.0319642 0.0336588 0.0353998 0.0371872
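For completeness, the same per-row quadratic forms can be computed in a single matrix operation, without apply(), using the objects already defined above:
# each row x_i of pd.matrix contributes x_i %*% vcv.mat %*% t(x_i),
# which is the i-th diagonal element of pd.matrix %*% vcv.mat %*% t(pd.matrix)
diag(pd.matrix %*% vcv.mat %*% t(pd.matrix))
# same values as the apply() call above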
