Calculate correlation on a monthly/weekly level - r

I am having trouble calculating the correlation coefficient between electricity prices of different countries on monthly/ weekly level. The dataset (https://github.com/Argiro1983/prices_df.git) looks like this:
prices_df<-structure(list(DATETIME = structure(c(1609459200, 1609462800,
1609466400, 1609470000, 1609473600, 1609477200, 1609480800, 1609484400,
1609488000, 1609491600), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
GR = c(50.87, 48.19, 44.68, 42.92, 40.39, 20.96, 39.63, 40.1,
20, 40.74), IT = c(50.87, 48.19, 44.68, 42.92, 40.39, 40.2,
39.63, 40.09, 41.27, 41.67), BG = c(49.95, 48.05, 49.62,
46.73, 45.39, 44.25, 36.34, 19.97, 20, 20.43), HU = c(45.54,
41.59, 40.05, 36.9, 34.47, 32.82, 27.7, 15, 8.43, 20.77),
TR = c(26.31, 24.06, 24.21, 23.2, 23.2, 26.31, 24.98, 26.31,
24.04, 26.31), SR = c(38.89, 34.86, 33.62, 28.25, 29.03,
29.22, 29.71, 1.08, 1.1, 36.07)), row.names = c(NA, 10L), class = "data.frame")
I have tried converting it to xts and using apply.monthly (or apply.weekly) as follows, but it does not work.
library(xts)
SEE_prices <- xts(x = prices_df, order.by = DATETIME)
storage.mode(SEE_prices) <- "numeric"
SEE_prices <- na.locf(SEE_prices)
library(tidyverse)
library(tidyquant)
apply.monthly(SEE_prices, cor(SEE_prices$GR, SEE_prices$SR))
Another way I tried to get correlation on weekly level was to use the dplyr package, but it also did not work:
library(lubridate)
library(magrittr)
library(dplyr)
prices_df %<>% mutate( DATETIME = ymd_hms(DATETIME) )
table1<- prices_df %>% group_by( year( DATETIME ), isoweek( DATETIME ) ) %>%
summarise( DateCount = n_distinct(date(DATETIME)), correlation = cor(prices_df$GR, prices_df$SR))
Does anybody have an idea on how to calculate weekly/monthly correlation on a dataset?
Thank you in advance.

Don't use $ in dplyr pipes. To calculate correlation try -
library(dplyr)
library(lubridate)
prices_df %>%
mutate(DATETIME = ymd_hms(DATETIME),
year = year(DATETIME), week = isoweek(DATETIME)) %>%
group_by(year, week) %>%
summarise(DateCount = n_distinct(date(DATETIME)),
correlation = cor(GR, SR), .groups = 'drop')

Related

First Derivative of Scatter Plot R

Hello I am working with sigmoidal data and am attempting to plot two scatter plots on top of each other: the raw data & the first derivative of the raw data. My issue doesn't lie in plotting the data, but more-so finding a function that will create an accurate representation of the first derivative.
What have I tried: Creating a function that calculates the slope of the current & next point: (y2-y1)/(x2-x1) & assigning the value to the current temperature.
dput() of Data Frame:
structure(list(Temperature = c(4.98, 5.49, 6.01, 6.5, 7.02, 7.52, 8.03, 8.52, 9.03, 9.54, 10.04, 10.54, 11.05, 11.55, 12.05, 12.55, 13.05, 13.56, 14.06, 14.57, 15.07, 15.57, 16.07, 16.59, 17.08, 17.59, 18.08, 18.59, 19.09, 19.6, 20.1, 20.64, 21.12, 21.63, 22.13, 22.62, 23.13, 23.63, 24.13, 24.63, 25.11, 25.62, 26.11, 26.68, 27.19, 27.7, 28.2, 28.71, 29.21, 29.71, 30.21, 30.7, 31.21, 31.69, 32.19, 32.69, 33.19, 33.7, 34.19, 34.68, 35.19, 35.68, 36.19, 36.69, 37.19, 37.7, 38.19, 38.7, 39.2, 39.7, 40.21, 40.7, 41.22, 41.71, 42.21, 42.71, 43.21, 43.72, 44.22, 44.72, 45.22, 45.73, 46.23, 46.73, 47.23, 47.97, 48.71, 49.23, 49.74, 50.23, 50.73, 51.23, 51.73, 52.24, 52.75, 53.24, 53.75, 54.24, 54.75, 55.26, 55.75, 56.25, 56.75, 57.24, 57.75, 58.27, 58.77, 59.26, 59.77, 60.26, 60.78, 61.27, 61.79, 62.27, 62.77, 63.29, 63.79, 64.27, 64.78, 65.3, 65.8, 66.27, 66.8, 67.3, 67.8, 68.31, 68.78, 69.3, 69.8, 70.32, 70.81, 71.32, 71.81, 72.33, 72.82, 73.31, 73.83, 74.33, 74.82, 75.32, 75.83, 76.34, 76.84, 77.35, 77.82, 78.34, 78.85, 79.36, 79.84, 80.35, 80.85, 81.36, 81.86, 82.37, 82.86, 83.37, 83.88, 84.36, 84.88, 85.38, 85.88, 86.38, 86.89, 87.38, 87.89, 88.39, 88.89, 89.4, 89.9, 90.39, 90.9, 91.4, 91.91, 92.37, 92.89, 93.4, 93.91, 94.41, 94.91, 95.42), Absorbance = c(1.401351929, 1.403320313, 1.405181885, 1.406326294, 1.407440186, 1.409118652, 1.410095215, 1.410797119, 1.411560059, 1.412918091, 1.413970947, 1.414245605, 1.416000366, 1.415435791, 1.41809082, 1.4190979, 1.419677734, 1.420150757, 1.421966553, 1.420333862, 1.422637939, 1.422790527, 1.423461914, 1.426513672, 1.426315308, 1.426071167, 1.426467896, 1.428710938, 1.428070068, 1.428817749, 1.429733276, 1.432144165, 1.432434082, 1.433227539, 1.434616089, 1.435806274, 1.434814453, 1.436096191, 1.436096191, 1.436447144, 1.437896729, 1.4375, 1.438934326, 1.440139771, 1.440139771, 1.441741943, 1.442108154, 1.443969727, 1.444778442, 1.443862915, 1.444534302, 1.445648193, 1.444473267, 1.446395874, 1.447219849, 1.446151733, 1.449569702, 1.449066162, 1.448852539, 1.4503479, 1.451385498, 1.45111084, 1.451217651, 1.453125, 1.452560425, 1.455047607, 1.455093384, 1.456665039, 1.457977295, 1.457336426, 1.458648682, 1.46043396, 1.462158203, 1.464813232, 1.463531494, 1.468048096, 1.468643188, 1.470748901, 1.471878052, 1.476257324, 1.478057861, 1.482040405, 1.484466553, 1.486129761, 1.48815918, 1.496520996, 1.499786377, 1.504302979, 1.507217407, 1.512985229, 1.517471313, 1.524108887, 1.528198242, 1.534637451, 1.539169312, 1.546142578, 1.554611206, 1.55809021, 1.56854248, 1.572875977, 1.580307007, 1.585739136, 1.592514038, 1.600067139, 1.609222412, 1.616607666, 1.622375488, 1.631469727, 1.635635376, 1.642929077, 1.649780273, 1.655014038, 1.661483765, 1.663742065, 1.671859741, 1.677200317, 1.677108765, 1.683380127, 1.684082031, 1.687438965, 1.694595337, 1.694961548, 1.696685791, 1.696685791, 1.699768066, 1.702514648, 1.703613281, 1.705093384, 1.70022583, 1.707595825, 1.707962036, 1.709075928, 1.705276489, 1.71055603, 1.709259033, 1.70916748, 1.709732056, 1.710189819, 1.710281372, 1.711868286, 1.711883545, 1.713104248, 1.713760376, 1.711120605, 1.709716797, 1.711776733, 1.712814331, 1.714324951, 1.711120605, 1.713378906, 1.712432861, 1.716125488, 1.710006714, 1.710845947, 1.711502075, 1.711120605, 1.710006714, 1.70980835, 1.708602905, 1.708236694, 1.710189819, 1.707672119, 1.706939697, 1.710006714, 1.706192017, 1.706573486, 1.706207275, 1.705734253, 1.706207275, 1.705184937, 1.70954895, 1.705841064, 1.702972412, 1.703979492, 1.703063965, 1.709350586, 1.703338623, 1.700408936, 1.705276489, 1.705368042)), row.names = 1621:1800, class = "data.frame")
Code For my Attempt
raw = "<insert dput line>>"
columns = c("Temperature","Absorbance")
first = data.frame(matrix(nrow=0,ncol=2))
colnames(dFrame) = columns
for (i in 1:nrow(raw)) {
if(i != nrow(raw)) {
cAbs = raw[i,2]
nextAbs = raw[i+1,2]
cT = raw[i,1]
nextT = raw[i+1,1]
Temperature = raw[i,1]
Absorbance =((nextAbs-cAbs)/(nextT-cT))
t <- data.frame(Temperature,Absorbance)
names(t) <- names(raw)
first <- rbind(first, t)
}
}
ggplot()+
geom_point(data=raw, aes(x=Temperature,y=Absorbance), color = "red") +
geom_point(data = first, aes(x=Temperature,y = Absorbance), color = "blue")
What I was expecting
I was expecting an output that had the shape of something like so:
library(dplyr); library(ggplot2)
df %>%
arrange(Temperature) %>%
mutate(slope = (Absorbance - lag(Absorbance))/
(Temperature - lag(Temperature))) %>%
ggplot(aes(Temperature)) +
geom_line(aes(y= Absorbance, color = "Absorbance"), size = 1.2) +
geom_point(aes(y= slope * 20 + 1.4, color = "slope")) +
geom_smooth(aes(y= slope * 20 + 1.4, color = "slope"), se = FALSE, size = 0.8) +
scale_y_continuous(sec.axis = sec_axis(trans = ~(.x - 1.4)/20, name = "slope"))
If the data is even a little noisy, calculating the derivative by first differencing can be very noisy.
You can get a better estimate by fitting a smoothing spline function and calculating the derivative of the spline function. By differentiating a smooth function, you get a smooth derivative.
In most cases, smooth.spline with default arguments is fine, but I recommend taking a look at the result and possibly tuning the smooth.spline parameters for more or less smoothing, depending on your judgment.
edit: I learned this approach from the Numerical Recipes textbook.
library(tidyverse)
df <- tibble(
x = seq(1, 15, by = 0.1),
y = sin(x) + runif(length(x), -0.2, 0.2),
d1_diff = c(NA, diff(y) / diff(x)),
d1_spline = smooth.spline(x, y) %>% predict(x, deriv = 1) %>% pluck("y")
)
df %>%
pivot_longer(-x) %>%
mutate(name = factor(name, unique(name))) %>%
ggplot() + aes(x, value, color = name) + geom_point() + geom_line() +
facet_wrap(~name, ncol = 1)
#> Warning: Removed 1 rows containing missing values (geom_point).
#> Warning: Removed 1 row(s) containing missing values (geom_path).
Created on 2022-10-26 with reprex v2.0.2

Seasonal plots for multiple variables using forecast::ggseasonplot

I'm trying to draw a seasonal plots for dataframe df, which contains time series data for 2 variables (value1 and value2):
df <- structure(list(date = structure(c(18292, 18321, 18352, 18382,
18413, 18443, 18474, 18505, 18535, 18566, 18596, 18627, 18658,
18686, 18717, 18747, 18778, 18808, 18839, 18870, 18900, 18931,
18961, 18992), class = "Date"), value1 = c(-2.94, -40.61, -6.89,
3.04, -3.5, 0.18, 6.79, 9.08, 9.35, 10.92, 20.53, 18.04, 24.6,
154.6, 30.4, 32.1, 27.7, 32.1, 19.2, 25.4, 28, 26.9, 21.7, 20.9
), value2 = c(-12.66, 7.56, -1.36, -14.39, -16.18, 3.29, -0.69,
-1.6, 13.47, 4.83, 4.56, 7.58, 28.7, 18.9, 39.1, 44, 52, 37.1,
28.2, 32.7, 17.2, 20.4, 31.4, 19.5)), class = "data.frame", row.names = c(NA,
-24L))
We could draw two time series in one plot using:
meltdf <- melt(df, id='date')
meltdf %>%
ggplot(aes(x=date, y=value, colour=variable, group=variable)) +
geom_point() +
geom_line()
Out:
But I hope to use ggseasonplot to draw two plots for value1 and value2 seperately, each one will be similar to the following plot:
library(forecast)
ggseasonplot(AirPassengers, col=rainbow(12), year.labels=TRUE)
The problem I met is how to convert each subset dataframe to ts objects:
meltdf %>%
filter(variable=='value1') %>%
as.ts() %>%
ggseasonplot(col=rainbow(12), year.labels=TRUE)
Thanks.
Update: implement with ggplot2 only:
meltdf %>%
filter(variable=='value2') %>%
select(-variable) %>%
mutate(
year = factor(year(date)), # use year to define separate curves
date = update(date, year = 1) # use a constant year for the x-axis
) %>%
ggplot(aes(date, value, color = year)) +
scale_x_date(date_breaks = "1 month", date_labels = "%b")+
geom_line()+
geom_point()
This is what the feasts package is for -- handling time series graphics with multiple series in the same data frame. Here is how to do it with the sample data provided.
library(tsibble)
library(feasts)
library(tidyr)
library(dplyr)
# Convert to tsibble object and plot using gg_season()
df %>%
pivot_longer(value1:value2) %>%
mutate(date = yearmonth(date)) %>%
as_tsibble(index = date, key = name) %>%
gg_season(value)
Created on 2022-02-14 by the reprex package (v2.0.1)
See https://otexts.com/fpp3/seasonal-plots.html for more examples.

How to customize colors for lines and points in feasts::gg_season()

I'm able to convert the following df to tsibble object and plot using gg_season():
library(tsibble)
library(feasts)
library(tidyr)
library(dplyr)
df <- structure(list(date = structure(c(18292, 18321, 18352, 18382,
18413, 18443, 18474, 18505, 18535, 18566, 18596, 18627, 18658,
18686, 18717, 18747, 18778, 18808, 18839, 18870, 18900, 18931,
18961, 18992), class = "Date"), value1 = c(-2.94, -40.61, -6.89,
3.04, -3.5, 0.18, 6.79, 9.08, 9.35, 10.92, 20.53, 18.04, 24.6,
154.6, 30.4, 32.1, 27.7, 32.1, 19.2, 25.4, 28, 26.9, 21.7, 20.9
), value2 = c(-12.66, 7.56, -1.36, -14.39, -16.18, 3.29, -0.69,
-1.6, 13.47, 4.83, 4.56, 7.58, 28.7, 18.9, 39.1, 44, 52, 37.1,
28.2, 32.7, 17.2, 20.4, 31.4, 19.5)), class = "data.frame", row.names = c(NA,
-24L))
# Convert to tsibble object and plot using gg_season()
df %>%
pivot_longer(value1:value2) %>%
mutate(date = yearmonth(date)) %>%
mutate(year = year(date)) %>%
as_tsibble(index = date, key = name) %>%
gg_season(value) +
geom_point() # +
# scale_color_manual(values = c('2020' = 'blue', '2021' = 'red'))
Now I try to customize colors based on different years, ie., setting blue for 2020, and red for 2021. I've added scale_color_manual(values = c('2020' = 'blue', '2021' = 'red')), but I didn't succeed yet, how could I do that correctly? Thanks.
Reference:
how to change the color in geom_point or lines in ggplot
...
gg_season(value, pal = c("#3333FF", "#FF3333")) +
geom_point()
The year scale here is a continuous one (explaining why the scale_color_manual line produces "Error: Continuous value supplied to discrete scale"). But we can give gg_season a vector of color codes to use in its pal parameter.

Summary code that counts the number of values less than specified numbers

I have a simple dataset (that I've titled 'summary') that includes a numeric column of values. I want to create code to summarize the number of rows less that specific values, such as 5, 10, 20, 30, etc.
Here is some of the data:
dput(summary[1:50,])
structure(list(S2S_Mins = c(NA, 101.15, 107.43, 205.5, 48.07,
34.9, 195.05, 17.58, 41.63, 74.27, 21.05, 32.27, 51.18, 17.88,
32.52, 26.98, 32.03, 40.03, 50.73, 54.38, 33.17, 19.97, 23.57,
41.82, 17.7, 20.9, 24.65, 16.48, 27.97, 94.47, 23.13, 22.63,
25.5, 43.8, 46.47, 33.98, 17.28, 27.57, 45.58, 34.52, 32.75,
35.92, 28.62, 17.48, 40.55, 38.8, 34.97, 41.95, 36.88, 21.58)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -50L))
I can go through and count the number of rows like this:
sum(summary$S2S_Mins < 5, na.rm = TRUE)
sum(summary$S2S_Mins < 10, na.rm = TRUE)
sum(summary$S2S_Mins < 20, na.rm = TRUE)
sum(summary$S2S_Mins < 30, na.rm = TRUE)
sum(summary$S2S_Mins < 60, na.rm = TRUE)
But I would like a summary function (or something similar) that will put this in a table for me, like follows:
TimeCategory Count
Less5 0
Less10 1
Less20 9
Less30 17
Less60 36
I have tried using dplyr with the summarize/summarise function, but I get errors:
#first try - gives a (1 x 0) tibble
summary %>% summarize(Less5 = nrow(S2S_Mins < 5), Less10 = nrow(S2S_Mins < 10))
#second try - gives error saying "unused argument (S2S_Mins < 5)"
summary %>% summarize(Less5 = n(S2S_Mins < 5), Less10 = n(S2S_Mins < 10))
Any pointers would be greatly appreciated. Thanks.
We can use sapply
v1 <- c(5, 10, 20, 30, 60)
out <- sapply(v1, function(x) sum(summary$S2S_Mins < x, na.rm = TRUE))
names(out) <- paste0("Less", v1)
stack(out)[2:1]
-ouput
ind values
1 Less5 0
2 Less10 0
3 Less20 7
4 Less30 19
5 Less60 43

Apply technical analysis indicator to multiple assets

I am trying to apply this simple technical analysis indicator to an xts dataframe called prices. But I can't manage to create the loop for the signal. Do you have some suggestions?
library(TTR)
library(Hmisc)
library(xts)
prices = structure(c(70.27, 70.29, 70.31, 70.67, 70.41, 70.53, 70.56,
69.61, 70.32, 69.97, 70.13, 68.88, 68.97, 70.75, 71.32, 71.32,
71.32, 72.02, 72.48, 73.33, 73.59, 73.93, 73.47, 72.13, 72.17,
73.18, 72.59, 73.34, 73.43, 72.78, 72.43, 72.3, 71.27, 71.51,
71.94, 71.1, 69.77, 70.02, 70.26, 69.6, 70.13, 70.13, 71.27,
70.58, 69.52, 69.58, 69.46, 69.62, 69.07, 69.98, 44.245, 44.125,
44.09, 44.155, 43.93, 44.305, 44.065, 43.37, 43.685, 43.285,
43.355, 42.305, 42.65, 43.64, 43.885, 43.885, 43.885, 44.12,
44.385, 44.78, 44.985, 44.985, 44.865, 44.38, 44.05, 44.65, 44.065,
44.62, 44.73, 44.32, 44.275, 44.145, 43.615, 43.975, 44.52, 44.335,
43.585, 43.715, 43.83, 43.735, 44.09, 44.005, 44.775, 44.325,
43.555, 43.535, 43.325, 43.425, 43.04, 43.45, 166.09, 166.44,
165.04, 167.69, 168.08, 169.17, 168.67, 167.19, 167.19, 164.39,
163.26, 159.64, 160.33, 162.83, 163.4, 163.4, 163.4, 164.79,
166.23, 168.3, 168.29, 169.34, 168.56, 166.81, 165.39, 165.98,
162.64, 163.78, 164.91, 164, 162.1, 162.25, 161.45, 162.08, 162.37,
160.09, 157.96, 158.45, 159.95, 159.75, 160.58, 160.51, 164.09,
161.96, 160.84, 161.41, 159.48, 159.45, 158.09, 158.49, 66, 66.19,
66.31, 67.17, 66.84, 67.32, 67.26, 66.19, 66.46, 65.62, 65.61,
63.87, 64.09, 64.73, 65.72, 65.72, 65.72, 66.11, 66.96, 67.53,
67.57, 67.53, 67.25, 65.98, 65.52, 66.19, 65.23, 66.2, 66.4,
65.53, 65.52, 65.37, 64.54, 64.57, 64.85, 64, 62.94, 63.18, 63.87,
63.3, 63.9, 63.83, 64.76, 64, 63.62, 63.92, 63.02, 63.27, 62.33,
62.65), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", format = "%Y-%m-%d", class = c("xts",
"zoo"), index = structure(c(1301616000, 1301875200, 1301961600,
1302048000, 1302134400, 1302220800, 1302480000, 1302566400, 1302652800,
1302739200, 1302825600, 1303084800, 1303171200, 1303257600, 1303344000,
1303430400, 1303689600, 1303776000, 1303862400, 1303948800, 1304035200,
1304294400, 1304380800, 1304467200, 1304553600, 1304640000, 1304899200,
1304985600, 1305072000, 1305158400, 1305244800, 1305504000, 1305590400,
1305676800, 1305763200, 1305849600, 1306108800, 1306195200, 1306281600,
1306368000, 1306454400, 1306713600, 1306800000, 1306886400, 1306972800,
1307059200, 1307318400, 1307404800, 1307491200, 1307577600), tzone = "UTC", tclass = "Date"), .Dim = c(50L,
4L), .Dimnames = list(NULL, c("A", "B", "C", "D")))
#I apply the EMA indicator to the prices xts dataframe
EMA20fn <- function(x) EMA(x, n=20)
EMA20prices <- xts(apply(prices, 2, EMA20fn), order.by=index(EMA20fn(prices[,1])))
#I know how to create the signals (in EMA20prices) for a single asset, but I don't know
#what kind of loop it's required to apply the signal to every asset in the dataframe
#prices
ema20tr <- Lag(ifelse(Lag(prices[,1])<Lag(EMA20prices[,1])& prices[,1]>EMA20prices[,1],1,
ifelse(Lag(prices[,1])>Lag(EMA20prices[,1])& prices[,1]<EMA20prices[,1],-1,0)))
ema20tr[is.na(ema20tr)] <- 0
ema20sig <- ifelse(ema20tr>1,0,0)
for(i in 2:length(prices[,1])){ema20sig[i] <- ifelse(ema20tr[i]==1,1,
ifelse(ema20tr[i]==-1,0,ema20sig[i-1]))}
ema20sig[is.na(ema20sig)] <- 1
Thank you in advance for the answers!
The following changes to your code will do what you want on all four columns (with the prices data structure as per the question)
library(TTR)
library(Hmisc)
library(xts)
#I apply the EMA indicator to the prices xts dataframe
EMA20fn <- function(x) EMA(x, n=20)
EMA20prices <- xts(apply(prices, 2, EMA20fn), order.by=index(EMA20fn(prices[,1])))
#I know how to create the signals (in EMA20prices) for a single asset, but I don't know
#what kind of loop it's required to apply the signal to every asset in the dataframe
#prices
ema20tr = NULL
for (j in 1:ncol(prices)) {
ema20tr <- cbind(ema20tr,Lag(ifelse(Lag(prices[,j])<Lag(EMA20prices[,j])& prices[,j]>EMA20prices[,j],1,
ifelse(Lag(prices[,j])>Lag(EMA20prices[,j])& prices[,j]<EMA20prices[,j],-1,0))))
}
ema20tr[is.na(ema20tr)] <- 0
ema20sig <- ifelse(ema20tr>1,0,0)
for (j in 1:ncol(prices)) {
for(i in 2:length(prices[,j])) {ema20sig[i,j] <- ifelse(ema20tr[i,j]==1,1,
ifelse(ema20tr[i,j]==-1,0,ema20sig[i-1,j]))}
}
ema20sig[is.na(ema20sig)] <- 1

Resources