convert List to zoo and use rollapply on the List - r

I would like to convert a list object to zoo and then apply rollapply on the zoo object. Short example reproduced below (I have 90,000 such files to process, using UNIX:)). Assume my list has two dataframes.
1) I would like to convert the date in each of the dataframes to this format:
dates <- as.Date(paste0(mylist$year, "-", mylist$month, "-", mylist$day), format="%Y-%m-%d")
z <- zoo(mylist, order.by=mylist[,1])
I understand lapply can do this but I tried without success.
Once I get my zoo object, I would like to use rollapply:
library(hydroTSM)#for daily2annual function but aggregate can do
x.3max <- rollapply(data=zooobject, width=3, FUN=sum, fill=NA, partial= TRUE,
align="center")
# Maximum value per year of 3-day total rainfall for each one of the simulations
z.3max.annual <- daily2annual(z.3max, FUN=max,na.rm=TRUE)#dates=1
What the code above does is it centers a 3-day window on each column of the dataframes in zooobject and sums the values. The, the max per year of the 3-day total is extracted.
mylist<- list(a,a)
mylist<-lapply(mylist, function(x) x[x[["Month"]] %in% c(12,1,2),])# extract data for DJF for individual sites
library(zoo)
a= structure(list(Year = c(1975L, 1975L, 1975L, 1975L, 1975L, 1975L
), Month = c(1L, 1L, 1L, 1L, 1L, 1L), Site = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "G100", class = "factor"), Day = 1:6,
sim01 = c(28.49, 29.04, 27.62, 28.43, 28.69, 29.16), sim02 = c(29.49,
30.04, 28.62, 29.43, 29.69, 30.16), sim03 = c(30.49, 31.04,
29.62, 30.43, 30.69, 31.16), sim04 = c(31.49, 32.04, 30.62,
31.43, 31.69, 32.16), sim05 = c(32.49, 33.04, 31.62, 32.43,
32.69, 33.16), sim06 = c(33.49, 34.04, 32.62, 33.43, 33.69,
34.16), sim07 = c(34.49, 35.04, 33.62, 34.43, 34.69, 35.16
), sim08 = c(35.49, 36.04, 34.62, 35.43, 35.69, 36.16), sim09 = c(36.49,
37.04, 35.62, 36.43, 36.69, 37.16), sim10 = c(37.49, 38.04,
36.62, 37.43, 37.69, 38.16), sim11 = c(38.49, 39.04, 37.62,
38.43, 38.69, 39.16), sim12 = c(39.49, 40.04, 38.62, 39.43,
39.69, 40.16), sim13 = c(40.49, 41.04, 39.62, 40.43, 40.69,
41.16), sim14 = c(41.49, 42.04, 40.62, 41.43, 41.69, 42.16
), sim15 = c(42.49, 43.04, 41.62, 42.43, 42.69, 43.16), sim16 = c(43.49,
44.04, 42.62, 43.43, 43.69, 44.16), sim17 = c(44.49, 45.04,
43.62, 44.43, 44.69, 45.16), sim18 = c(45.49, 46.04, 44.62,
45.43, 45.69, 46.16), sim19 = c(46.49, 47.04, 45.62, 46.43,
46.69, 47.16), sim20 = c(47.49, 48.04, 46.62, 47.43, 47.69,
48.16)), .Names = c("Year", "Month", "Site", "Day", "sim01",
"sim02", "sim03", "sim04", "sim05", "sim06", "sim07", "sim08",
"sim09", "sim10", "sim11", "sim12", "sim13", "sim14", "sim15",
"sim16", "sim17", "sim18", "sim19", "sim20"), row.names = c(NA,
6L), class = "data.frame")
Output should be similar to:
Year Site Sim01...
1975 G100 ...
1976 G100 ...
1977 G100 ...
Only the values in the months c(12,1,2) are needed.

This produces a list of zoo objects, Lz, and then performs rollapply on each component of the list giving L2. Finally L3 aggregates over year taking the max of each column.
library(zoo)
mylist <- list(a, a) # a is given at bottom of question
Lz <- lapply(mylist, read.zoo, index = 1:3, format = "%Y %m %d")
L2 <- lapply(Lz, rollapply, 3, sum, partial = TRUE)
L3 <- lapply(L2, function(z) aggregate(z, as.numeric(format(time(z), "%Y")), max))
giving:
> L3
[[1]]
sim01 sim02 sim03 sim04 sim05 sim06 sim07 sim08 sim09 sim10 sim11
1975 86.28 89.28 92.28 95.28 98.28 101.28 104.28 107.28 110.28 113.28 116.28
sim12 sim13 sim14 sim15 sim16 sim17 sim18 sim19 sim20
1975 119.28 122.28 125.28 128.28 131.28 134.28 137.28 140.28 143.28
[[2]]
sim01 sim02 sim03 sim04 sim05 sim06 sim07 sim08 sim09 sim10 sim11
1975 86.28 89.28 92.28 95.28 98.28 101.28 104.28 107.28 110.28 113.28 116.28
sim12 sim13 sim14 sim15 sim16 sim17 sim18 sim19 sim20
1975 119.28 122.28 125.28 128.28 131.28 134.28 137.28 140.28 143.28

Solved
lst1 <- lapply(list.files(pattern=".csv"),function(x) read.table(x,header=TRUE,sep="")) # read all files and data and replace -999.9 with NA
lst2<-lapply(lst1, function(x) x[x[["Month"]] %in% c(6,7,8),])#c(6,7,8) extract data for DJF for individual sites
names(lst2)<-list.files(pattern=".csv")
lapply(lst2,tail,4)
lst3<-lapply(lst2, function(x) x[!(names(x) %in% c("Site"))])
Lz <- lapply(lst3, read.zoo, index = 1:3, format = "%Y %m %d")
L2 <- lapply(Lz, rollapply, 3, sum, partial = TRUE)
L3 <- lapply(L2, function(z) aggregate(z, as.numeric(format(time(z), "%Y")), max))
mapply(
write.table,
x=L3, file=paste(names(L3), "csv", sep="."),
MoreArgs=list(row.names=FALSE, sep=",")
) # write files to folder keeping the list names as file names

Related

Calculate correlation on a monthly/weekly level

I am having trouble calculating the correlation coefficient between electricity prices of different countries on monthly/ weekly level. The dataset (https://github.com/Argiro1983/prices_df.git) looks like this:
prices_df<-structure(list(DATETIME = structure(c(1609459200, 1609462800,
1609466400, 1609470000, 1609473600, 1609477200, 1609480800, 1609484400,
1609488000, 1609491600), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
GR = c(50.87, 48.19, 44.68, 42.92, 40.39, 20.96, 39.63, 40.1,
20, 40.74), IT = c(50.87, 48.19, 44.68, 42.92, 40.39, 40.2,
39.63, 40.09, 41.27, 41.67), BG = c(49.95, 48.05, 49.62,
46.73, 45.39, 44.25, 36.34, 19.97, 20, 20.43), HU = c(45.54,
41.59, 40.05, 36.9, 34.47, 32.82, 27.7, 15, 8.43, 20.77),
TR = c(26.31, 24.06, 24.21, 23.2, 23.2, 26.31, 24.98, 26.31,
24.04, 26.31), SR = c(38.89, 34.86, 33.62, 28.25, 29.03,
29.22, 29.71, 1.08, 1.1, 36.07)), row.names = c(NA, 10L), class = "data.frame")
I have tried converting it to xts and using apply.monthly (or apply.weekly) as follows, but it does not work.
library(xts)
SEE_prices <- xts(x = prices_df, order.by = DATETIME)
storage.mode(SEE_prices) <- "numeric"
SEE_prices <- na.locf(SEE_prices)
library(tidyverse)
library(tidyquant)
apply.monthly(SEE_prices, cor(SEE_prices$GR, SEE_prices$SR))
Another way I tried to get correlation on weekly level was to use the dplyr package, but it also did not work:
library(lubridate)
library(magrittr)
library(dplyr)
prices_df %<>% mutate( DATETIME = ymd_hms(DATETIME) )
table1<- prices_df %>% group_by( year( DATETIME ), isoweek( DATETIME ) ) %>%
summarise( DateCount = n_distinct(date(DATETIME)), correlation = cor(prices_df$GR, prices_df$SR))
Does anybody have an idea on how to calculate weekly/monthly correlation on a dataset?
Thank you in advance.
Don't use $ in dplyr pipes. To calculate correlation try -
library(dplyr)
library(lubridate)
prices_df %>%
mutate(DATETIME = ymd_hms(DATETIME),
year = year(DATETIME), week = isoweek(DATETIME)) %>%
group_by(year, week) %>%
summarise(DateCount = n_distinct(date(DATETIME)),
correlation = cor(GR, SR), .groups = 'drop')

Function to calculate median by column to an R dataframe that is done regularly to multiple dataframes

Trying to write a function to combine multiple steps that are used regularly on an R dataframe. At the moment I stack individual lines, which is most inefficient. An Example each step I take at the moment
library(scores)
MscoreIndex <- 3
labMedians <- mapply(median, df[-1], na.rm = T) #calculate the median for each column except 1st
LabGrandMedian <- median(mapply(median, df[-1], na.rm = T),na.rm = T)
labMscore <- as.vector(round(abs(scores_na(labMedians, "mad")), digits = 2)) #calculate mscore by lab
labMscoreIndex <- which(labMscore > MscoreMax) #get the position in the vector that exceeds Mscoremax
df[-1][labMscoreIndex] <- NA # discharge values above threshold by making NA
An example my df below
structure(list(Determination_No = 1:6, `2` = c(55.94, 55.7, 56.59,
56.5, 55.98, 55.93), `3` = c(56.83, 56.54, 56.18, 56.5, 56.51,
56.34), `4` = c(56.39, 56.43, 56.53, 56.31, 56.47, 56.35), `5` = c(56.32,
56.29, 56.31, 56.32, 56.39, 56.32), `7` = c(56.48, 56.4, 56.54,
56.43, 56.73, 56.62), `8` = c(56.382, 56.258, 56.442, 56.258,
56.532, 56.264), `10` = c(56.3, 56.5, 56.2, 56.5, 56.7, 56.5),
`12` = c(56.11, 56.46, 56.1, 56.35, 56.36, 56.37)), class = "data.frame", row.names = c(NA,
-6L))
I started by trying to get the indivdual lab medians and the grandmedian with the following but got errors
I tried.
mediansFunction <- function(x){
analytemedians <- mapply(median(x[,-1]))
grandmedian <- median(x[,-1])
list(analytemedians,grandmedian)
}
mediansFunction(df)
But I get "Error in median.default(x[, -1]) : need numeric data"
Try :
mediansFunction <- function(x){
analytemedians <- sapply(x[-1], median)
median_of_median <- median(analytemedians)
grand_median <- median(as.matrix(x[-1]))
list(analytemedians = analytemedians,
median_of_median = median_of_median,
grand_median = grand_median)
}
mediansFunction(df)
#$analytemedians
# 2 3 4 5 7 8 10 12
#55.960 56.505 56.410 56.320 56.510 56.323 56.500 56.355
#$median_of_median
#[1] 56.3825
#$grand_median
#[1] 56.386

how do you subset a data frame based on column names?

I have this data frame:
dput(df)
structure(list(Server = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "servera", class = "factor"),
Date = structure(1:6, .Label = c("7/13/2017 15:01", "7/13/2017 15:02",
"7/13/2017 15:03", "7/13/2017 15:04", "7/13/2017 15:05",
"7/13/2017 15:06"), class = "factor"), Host_CPU = c(1.812950134,
2.288070679, 1.563278198, 1.925239563, 5.350669861, 2.612503052
), UsedMemPercent = c(38.19, 38.19, 38.19, 38.19, 38.19,
38.22), jvm1 = c(10.91, 11.13, 11.34, 11.56, 11.77, 11.99
), jvm2 = c(11.47, 11.7, 11.91, 12.13, 12.35, 12.57), jvm3 = c(75.65,
76.88, 56.93, 58.99, 65.29, 67.97), jvm4 = c(39.43, 40.86,
42.27, 43.71, 45.09, 45.33), jvm5 = c(27.42, 29.63, 31.02,
32.37, 33.72, 37.71)), .Names = c("Server", "Date", "Host_CPU",
"UsedMemPercent", "jvm1", "jvm2", "jvm3", "jvm4", "jvm5"), class = "data.frame", row.names = c(NA,
-6L))
I only want to be able to subset this data frame based on the vectors names in this variable:
select<-c("jvm3", "jvm4", "jvm5")
so, my final df should look like this:
structure(list(Server = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "servera", class = "factor"),
Date = structure(1:6, .Label = c("7/13/2017 15:01", "7/13/2017 15:02",
"7/13/2017 15:03", "7/13/2017 15:04", "7/13/2017 15:05",
"7/13/2017 15:06"), class = "factor"), Host_CPU = c(1.812950134,
2.288070679, 1.563278198, 1.925239563, 5.350669861, 2.612503052
), UsedMemPercent = c(38.19, 38.19, 38.19, 38.19, 38.19,
38.22), jvm3 = c(75.65, 76.88, 56.93, 58.99, 65.29, 67.97
), jvm4 = c(39.43, 40.86, 42.27, 43.71, 45.09, 45.33), jvm5 = c(27.42,
29.63, 31.02, 32.37, 33.72, 37.71)), .Names = c("Server",
"Date", "Host_CPU", "UsedMemPercent", "jvm3", "jvm4", "jvm5"), class = "data.frame", row.names = c(NA,
-6L))
any ideas?
Please revisit indices. If you use the index mechanism [ in R, you can use mainly three types of indices:
logical vectors: same length as the number of columns, TRUE means select the column
numeric vectors: selects columns based on position
character vectors: select columns based on name
If you use the index mechanism for data frames, you can treat these objects in two ways:
as a list, because they are internally lists
as a matrix, because they mimick matrix behaviour in many cases
Take the iris data frame as example to compare the multiple ways you can select columns from a data frame. If you treat it as a list, you have the following two options:
Use [[ if you want a single column in the form of a vector:
iris[["Species"]]
# [1] setosa setosa setosa ... : is a vector
Use [ if you want one or more columns, but you need a data frame back :
iris["Species"]
iris[c("Sepal.Width", "Species")]
If you treat it as a matrix, you just do the exact same as you would do with a matrix. If you don't specify any row indices, these commands are actually equivalent to the ones used above:
iris[ , "Species"] # is the same as iris[["Species"]]
iris[ , "Species", drop = FALSE] # is the same as iris["Species"]
iris[ , c("Sepal.Width", "Species")] # is the same as iris[c("Sepal.Width", "Species")]
So in your case, you simply need:
select <- c("Server","Date","Host_CPU","UsedMemPercent",
"jvm3","jvm4","jvm5")
df[select]
Note on subset: subset works, but should ONLY be used interactively. There's a warning on the help page stating :
This is a convenience function intended for use interactively. For
programming it is better to use the standard subsetting functions like
[, and in particular the non-standard evaluation of argument subset
can have unanticipated consequences.
Saving your dataframe to a variable df:
df <-
structure(
list(
Server = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "servera", class = "factor"),
Date = structure(
1:6,
.Label = c(
"7/13/2017 15:01",
"7/13/2017 15:02",
"7/13/2017 15:03",
"7/13/2017 15:04",
"7/13/2017 15:05",
"7/13/2017 15:06"
),
class = "factor"
),
Host_CPU = c(
1.812950134,
2.288070679,
1.563278198,
1.925239563,
5.350669861,
2.612503052
),
UsedMemPercent = c(38.19, 38.19, 38.19, 38.19, 38.19,
38.22),
jvm1 = c(10.91, 11.13, 11.34, 11.56, 11.77, 11.99),
jvm2 = c(11.47, 11.7, 11.91, 12.13, 12.35, 12.57),
jvm3 = c(75.65,
76.88, 56.93, 58.99, 65.29, 67.97),
jvm4 = c(39.43, 40.86,
42.27, 43.71, 45.09, 45.33),
jvm5 = c(27.42, 29.63, 31.02,
32.37, 33.72, 37.71)
),
.Names = c(
"Server",
"Date",
"Host_CPU",
"UsedMemPercent",
"jvm1",
"jvm2",
"jvm3",
"jvm4",
"jvm5"
),
class = "data.frame",
row.names = c(NA,-6L)
)
df[,select] should be what youre looking for
Here's one way:
df[,c(1:4,7:9)]
You can also use dplyr to select columns:
select(df, Server,Date,Host_CPU,UsedMemPercent,jvm3,jvm4,jvm5)

Apply technical analysis indicator to multiple assets

I am trying to apply this simple technical analysis indicator to an xts dataframe called prices. But I can't manage to create the loop for the signal. Do you have some suggestions?
library(TTR)
library(Hmisc)
library(xts)
prices = structure(c(70.27, 70.29, 70.31, 70.67, 70.41, 70.53, 70.56,
69.61, 70.32, 69.97, 70.13, 68.88, 68.97, 70.75, 71.32, 71.32,
71.32, 72.02, 72.48, 73.33, 73.59, 73.93, 73.47, 72.13, 72.17,
73.18, 72.59, 73.34, 73.43, 72.78, 72.43, 72.3, 71.27, 71.51,
71.94, 71.1, 69.77, 70.02, 70.26, 69.6, 70.13, 70.13, 71.27,
70.58, 69.52, 69.58, 69.46, 69.62, 69.07, 69.98, 44.245, 44.125,
44.09, 44.155, 43.93, 44.305, 44.065, 43.37, 43.685, 43.285,
43.355, 42.305, 42.65, 43.64, 43.885, 43.885, 43.885, 44.12,
44.385, 44.78, 44.985, 44.985, 44.865, 44.38, 44.05, 44.65, 44.065,
44.62, 44.73, 44.32, 44.275, 44.145, 43.615, 43.975, 44.52, 44.335,
43.585, 43.715, 43.83, 43.735, 44.09, 44.005, 44.775, 44.325,
43.555, 43.535, 43.325, 43.425, 43.04, 43.45, 166.09, 166.44,
165.04, 167.69, 168.08, 169.17, 168.67, 167.19, 167.19, 164.39,
163.26, 159.64, 160.33, 162.83, 163.4, 163.4, 163.4, 164.79,
166.23, 168.3, 168.29, 169.34, 168.56, 166.81, 165.39, 165.98,
162.64, 163.78, 164.91, 164, 162.1, 162.25, 161.45, 162.08, 162.37,
160.09, 157.96, 158.45, 159.95, 159.75, 160.58, 160.51, 164.09,
161.96, 160.84, 161.41, 159.48, 159.45, 158.09, 158.49, 66, 66.19,
66.31, 67.17, 66.84, 67.32, 67.26, 66.19, 66.46, 65.62, 65.61,
63.87, 64.09, 64.73, 65.72, 65.72, 65.72, 66.11, 66.96, 67.53,
67.57, 67.53, 67.25, 65.98, 65.52, 66.19, 65.23, 66.2, 66.4,
65.53, 65.52, 65.37, 64.54, 64.57, 64.85, 64, 62.94, 63.18, 63.87,
63.3, 63.9, 63.83, 64.76, 64, 63.62, 63.92, 63.02, 63.27, 62.33,
62.65), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", format = "%Y-%m-%d", class = c("xts",
"zoo"), index = structure(c(1301616000, 1301875200, 1301961600,
1302048000, 1302134400, 1302220800, 1302480000, 1302566400, 1302652800,
1302739200, 1302825600, 1303084800, 1303171200, 1303257600, 1303344000,
1303430400, 1303689600, 1303776000, 1303862400, 1303948800, 1304035200,
1304294400, 1304380800, 1304467200, 1304553600, 1304640000, 1304899200,
1304985600, 1305072000, 1305158400, 1305244800, 1305504000, 1305590400,
1305676800, 1305763200, 1305849600, 1306108800, 1306195200, 1306281600,
1306368000, 1306454400, 1306713600, 1306800000, 1306886400, 1306972800,
1307059200, 1307318400, 1307404800, 1307491200, 1307577600), tzone = "UTC", tclass = "Date"), .Dim = c(50L,
4L), .Dimnames = list(NULL, c("A", "B", "C", "D")))
#I apply the EMA indicator to the prices xts dataframe
EMA20fn <- function(x) EMA(x, n=20)
EMA20prices <- xts(apply(prices, 2, EMA20fn), order.by=index(EMA20fn(prices[,1])))
#I know how to create the signals (in EMA20prices) for a single asset, but I don't know
#what kind of loop it's required to apply the signal to every asset in the dataframe
#prices
ema20tr <- Lag(ifelse(Lag(prices[,1])<Lag(EMA20prices[,1])& prices[,1]>EMA20prices[,1],1,
ifelse(Lag(prices[,1])>Lag(EMA20prices[,1])& prices[,1]<EMA20prices[,1],-1,0)))
ema20tr[is.na(ema20tr)] <- 0
ema20sig <- ifelse(ema20tr>1,0,0)
for(i in 2:length(prices[,1])){ema20sig[i] <- ifelse(ema20tr[i]==1,1,
ifelse(ema20tr[i]==-1,0,ema20sig[i-1]))}
ema20sig[is.na(ema20sig)] <- 1
Thank you in advance for the answers!
The following changes to your code will do what you want on all four columns (with the prices data structure as per the question)
library(TTR)
library(Hmisc)
library(xts)
#I apply the EMA indicator to the prices xts dataframe
EMA20fn <- function(x) EMA(x, n=20)
EMA20prices <- xts(apply(prices, 2, EMA20fn), order.by=index(EMA20fn(prices[,1])))
#I know how to create the signals (in EMA20prices) for a single asset, but I don't know
#what kind of loop it's required to apply the signal to every asset in the dataframe
#prices
ema20tr = NULL
for (j in 1:ncol(prices)) {
ema20tr <- cbind(ema20tr,Lag(ifelse(Lag(prices[,j])<Lag(EMA20prices[,j])& prices[,j]>EMA20prices[,j],1,
ifelse(Lag(prices[,j])>Lag(EMA20prices[,j])& prices[,j]<EMA20prices[,j],-1,0))))
}
ema20tr[is.na(ema20tr)] <- 0
ema20sig <- ifelse(ema20tr>1,0,0)
for (j in 1:ncol(prices)) {
for(i in 2:length(prices[,j])) {ema20sig[i,j] <- ifelse(ema20tr[i,j]==1,1,
ifelse(ema20tr[i,j]==-1,0,ema20sig[i-1,j]))}
}
ema20sig[is.na(ema20sig)] <- 1

R calculate daily mean data from irregular hourly data

I have a large data set that I am trying to reformat. Unfortunately, I cannot aggregate daily data.
dataset1_updated<- structure(list(X = 1:5, Time = structure(c(1L, 1L, 2L, 3L, 3L), .Label = c("7/29/11 10:29", "7/29/11 10:30", "7/29/11 10:32"
), class = "factor"), O3 = c(32.032608222367, 32.032608222367,
32.032608222367, 32.032608222367, 32.032608222367), SO2 = c(2.611,
2.605, 2.744, 2.767, 2.778), NO = c(0.081, 0.081, 0.081, 0.081,
0.081), NO2 = c(1.938, 1.912, 1.912, 1.896, 1.863), NOx = c(2.019,
1.993, 1.993, 1.977, 1.944)), .Names = c("X", "Time", "O3",
"SO2", "NO", "NO2", "NOx"), row.names = c(NA, 5L), class = "data.frame")
I convert the data set to xts object, and apply daily mean function, the results are "NA". Could you please tell me what is missing?
x <- as.xts(as.POSIXct(dataset1_updated$Time, format="%m/%d/%Y %H:%M"))
x_up<- apply.daily(x, colMeans)
write.csv(as.data.frame(as.matrix(x_up)), file="test")
thank you,
We need to change the xts statement as the as.xts is applying only on the datetime class and not on the entire dataset
xt1 <- xts(dataset1_updated[-(1:2)], order.by = as.POSIXct(dataset1_updated$Time,
format = "%m/%d/%y %H:%M"))
x_up <- apply.daily(xt1, colMeans)
x_up
# O3 SO2 NO NO2 NOx
#2011-07-29 10:32:00 32.03261 2.701 0.081 1.9042 1.9852

Resources