aaply for data.table to find length of intersection of interval - r

I have data like this:
View(dose_merged)
SUBJECT_Blinded PACKID SACDPDAT SACRTDAT treatment_interval SD_SDAT SD_EDAT
1 1501301 10094 2012-05-26 2012-07-23 58 2012-01-03 2013-01-02
2 1601301 10555 2012-01-03 2012-01-31 28 2012-01-03 2013-01-02
With columns types in data table:
> mapply(class, dose_merged)
$SUBJECT_Blinded
[1] "numeric"
$PACKID
[1] "numeric"
$SACDPDAT
[1] "POSIXct" "POSIXt"
$SACRTDAT
[1] "POSIXct" "POSIXt"
$treatment_interval
[1] "Interval"
attr(,"package")
[1] "lubridate"
$SD_SDAT
[1] "POSIXct" "POSIXt"
$SD_EDAT
[1] "POSIXct" "POSIXt"
I want to determine the length of intersection of intervals: interval(SACDPDAT, SACRTDAT) and interval(SD_SDAT, SD_EDAT).
I am trying this:
dose_merged[,intersect1 := aaply(dose_merged, 1, function(x){intersect(interval(x[3],x[4]), interval(x[8],x[9]))})]
But then I get error message:
Error: error while computing 'x' when choosing method for 'intersect': Error in as.POSIXct.default(start) :
do not know how to convert 'start' to class “POSIXct”
The line
intersect(interval(x[3],x[4]), interval(x[8],x[9]))})
works for specified row x.
Any ideas what I am doing wrong ?
The first two rows of dput(dose_merged):
structure(list(SUBJECT_Blinded = c(1101001, 1101001), PACKID = c(10096,
10595), SACDPDAT = structure(c(1335304800, 1325545200), class = c("POSIXct",
"POSIXt"), tzone = ""), SACRTDAT = structure(c(1340316000, 1327964400
), class = c("POSIXct", "POSIXt"), tzone = ""), treatment_interval = structure(c(58,
28), class = structure("Interval", package = "lubridate")), TS_SDAT = structure(c(NA_real_,
NA_real_), class = c("POSIXct", "POSIXt"), tzone = ""), TS_EDAT = structure(c(NA_real_,
NA_real_), class = c("POSIXct", "POSIXt"), tzone = ""), SD_SDAT = structure(c(1325545200,
1325545200), class = c("POSIXct", "POSIXt"), tzone = ""), SD_EDAT = structure(c(1357081200,
1357081200), class = c("POSIXct", "POSIXt"), tzone = "")), .Names = c("SUBJECT_Blinded",
"PACKID", "SACDPDAT", "SACRTDAT", "treatment_interval", "TS_SDAT",
"TS_EDAT", "SD_SDAT", "SD_EDAT"), sorted = "SUBJECT_Blinded", class = c("data.table",
"data.frame"), row.names = c(NA, -2L), .internal.selfref = <pointer: 0x0000000002f30788>)

Related

reshape data into multiple columns using pivot_longer

I am using pivot_longer to reshape my data from wide to long format into multiple value columns. I know there are related questions (Pivot_longer 6 columns to 3 columns or Tidy dataset with pivot_longer: Multiple columns into two columns), but I could not find a solution so far, probably because my two columns will be of different class, the first one being POSIXct and the second one is numeric.
Here is a minimal working example:
structure(list(compid = c("AT9130162999", "AT9090003478", "AT9070005375",
"AT9130048156"), iso2c = c("AT", "AT", "AT", "AT"), nace4 = c("7010",
"4211", "2452", "7010"), lastyear = c("2018", "2019", "2019",
"2019"), `Closing date
Last avail. yr` = structure(c(1546214400,
1577750400, 1585612800, 1577750400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), `Closing date
Year - 1` = structure(c(1514678400,
1546214400, 1553990400, 1546214400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), `Closing date
Year - 2` = structure(c(NA,
1514678400, 1522454400, 1514678400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), `Closing date
Year - 3` = structure(c(NA,
1483142400, 1490918400, 1483142400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), `Closing date
Year - 4` = structure(c(NA,
1451520000, 1459382400, 1451520000), tzone = "UTC", class = c("POSIXct",
"POSIXt")), `Closing date
Year - 5` = structure(c(NA,
1419984000, 1427760000, 1419984000), tzone = "UTC", class = c("POSIXct",
"POSIXt")), `Closing date
Year - 6` = structure(c(NA,
1388448000, 1396224000, 1388448000), tzone = "UTC", class = c("POSIXct",
"POSIXt")), `Closing date
Year - 7` = structure(c(NA,
1356912000, 1364688000, 1356912000), tzone = "UTC", class = c("POSIXct",
"POSIXt")), `Closing date
Year - 8` = structure(c(NA,
1325289600, 1333152000, 1325289600), tzone = "UTC", class = c("POSIXct",
"POSIXt")), `Closing date
Year - 9` = structure(c(NA,
1293753600, 1301529600, 1293753600), tzone = "UTC", class = c("POSIXct",
"POSIXt")), operatinginc_last = c(NA, 482813, -94300, NA), operatinginc_year1 = c(NA,
423482, 780400, NA), operatinginc_year2 = c(NA, 404694, 1210300,
NA), ebit_last = c(1060000, 482813, -94300, 351292), ebit_year1 = c(1501000,
423482, 780400, 331415), ebit_year2 = c(NA, 404694, 1210300,
305492), operatingrev_last = c(28463000, 15842418, 13009700,
11742884), operatingrev_year1 = c(NA, 13734462, 13146300, 10682889
), operatingrev_year2 = c(NA, 13734462, 13146300, 10682889)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
So far, I have tried this:
df_l <- df %>%
pivot_longer(., cols = -(starts_with(c("compid","iso2c","nace4","lastyear","Closing"))),
values_to = "value", values_drop_na=T, names_sep = "_", names_to = c("variable","year"))
But now I would also like to reshape all the columns that start with Closing. How do I do that (preferably in one step with pivot_longer)?
The expected output should then include a variable, year and value column, but also a closingdate and date column:
compid iso2c nace4 lastyear `closingdate ~ `date ~`variable ~`year ~ `value
<chr> <chr> <chr> <chr> <dttm> <dttm> <dttm> <dttm>
1 AT913~ AT 7010 2018 `Closing date Last avail. yr` 2018-12-31 ebit last 28463000
2 AT913~ AT 7010 2018 `Closing date Year - 1` 2017-12-31 ebit year1 15362687
2 AT913~ AT 7010 2018 `Closing date Year - 1` 2016-12-31 ebit year2 404694
I have no clue how you would do that in one call to pivot_longer, because you have different variables with different schemes. And you ALSO want to pivot to longer the closing date variable. So here it is in two calls with some cleaning of the closing variable.
library(tidyverse)
# Reshape in two passes, then clean up the labels:
#   1. stack every "Closing date ..." column into closing/date pairs;
#   2. stack the remaining "<variable>_<year>" columns, splitting each
#      column name at "_" into `variable` and `year`;
#   3. strip the "Closing date" prefix and the control characters (the
#      embedded line break) from the closing labels.
# str_squish() already removes leading/trailing whitespace, so no separate
# str_trim() is needed afterwards.
df_l <- df %>%
  pivot_longer(
    cols = starts_with("Closing"),
    names_to = "closing",
    values_to = "date",
    values_drop_na = TRUE
  ) %>%
  pivot_longer(
    cols = contains("_"),
    names_to = c("variable", "year"),
    names_sep = "_",
    values_to = "value",
    values_drop_na = TRUE
  ) %>%
  mutate(
    closing = str_remove_all(closing, "Closing date") %>%
      str_remove_all("[:cntrl:]") %>%
      str_squish()
  )

How to print milliseconds in dttm object

ex <- structure(list(rowid = 1:12, timestamp = structure(c(1505577931.8,
1505577931.8, 1505577931.8, 1509206767.39, 1509206767.39, 1511019574.47,
1511019574.47, 1511988378.544, 1511988378.544, 1511986281.239,
1511986281.239, 1512909143.7), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
-12L), class = c("tbl_df", "tbl", "data.frame"), .Names = c("rowid",
"timestamp"))
With the data above, how can I print milliseconds in timestamp column?
We can use format to print the milliseconds
format(ex$timestamp, "%Y-%m-%d %H:%M:%OS3")
#[1] "2017-09-16 16:05:31.799" "2017-09-16 16:05:31.799" "2017-09-16 16:05:31.799"
#[4] "2017-10-28 16:06:07.390" "2017-10-28 16:06:07.390" "2017-11-18 15:39:34.470"
#[7] "2017-11-18 15:39:34.470" "2017-11-29 20:46:18.543" "2017-11-29 20:46:18.543"
#[10] "2017-11-29 20:11:21.239" "2017-11-29 20:11:21.239" "2017-12-10 12:32:23.700"

rbind fails to bind datetime column

I am binding a number of data frames and have noticed that I get weird values in one of the bindings. The datetime in the second df is disturbed after binding: it is one hour less than in the original df.
kk <- structure(list(date = structure(c(1499133600, 1499137200, 1499140800,
1499144400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
temp = c(14.7, 14.6, 14.3, 14.2)), .Names = c("date", "temp"
), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
ff <- structure(list(date = structure(c(1499144400, 1499148000, 1499151600,
1499155200), class = c("POSIXct", "POSIXt"), tzone = ""), temp = 14:17), .Names = c("date",
"temp"), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
Calling functions from different packages give me same result:
dplyr:: bind_rows(kk, ff)
data.table::rbindlist(list(kk, ff))
rbind(kk,ff)
I do not get what is going on. Could it have something to do with date format?

Estimate overnight returns for many stocks using a for loop and store it in a dataframe with stock names as column names

I am trying to estimate overnight returns for many stocks using a for loop and store it in a dataframe with stock names as column names. The trade has raw intraday data and trade2 has cleaned intraday data. list.namess has stock names. This is my code:
require(xts)
require(highfrequency)
# Build one overnight-return series per input file, store each in the
# OvernightRet list, and finally merge the list into df.OvernightRet.
OvernightRet<-list()
# Collect the file names matching the pattern, then keep only the second one.
list.namess<- list.files(pattern="*.IS Equity")
list.namess<- list.namess[2]
for(Q in 1:length(list.namess)){
# Load the file; column 1 becomes the xts time index, the rest the data.
trade<-readRDS(list.namess[Q])
trade<-xts(trade[,-1], order.by = trade[,1])
# Name the first two data columns PRICE and SIZE.
colnames(trade)[c(1,2)]<-c("PRICE", "SIZE")
#Unduplicating
# Apply mergeTradesSameTimestamp to each day's trades and stack the daily
# results (presumably this collapses trades sharing a timestamp - confirm
# against the highfrequency documentation). Only the first column is kept.
trade2<-do.call(rbind, lapply(split(trade,"days"), mergeTradesSameTimestamp))
trade2<-trade2[,1]
# Thin wrappers around first() and last(), applied per day below.
fun.first= function(x) first(x)
fun.last= function(x) last(x)
# A: first observation of each day; B: last observation of each day.
A=do.call(rbind, lapply(split(trade2, "days"), FUN=fun.first))
B=do.call(rbind, lapply(split(trade2, "days"), FUN=fun.last))
# (A - lagged B) / lagged B, computed on as.numeric() copies of the series.
# NOTE(review): the result is presumably a plain numeric vector without
# dimensions, which matches the colnames error reported for the next line.
OvernightRetA <- (as.numeric(A)-as.numeric(lag.xts(B)))/as.numeric(lag.xts(B))
# Label the series with the file name (the reported error is raised here).
colnames(OvernightRetA)<-list.namess[Q]
OvernightRet[[Q]]<-OvernightRetA
}
# Combine all per-file series by passing the list elements to merge().
df.OvernightRet<-do.call(merge, OvernightRet)
However, it gives error, probably because of not being able to rename the OvernightRetA:
Error in `colnames<-`(`*tmp*`, value = "ACEM IS Equity.rds") :
attempt to set 'colnames' on an object with less than two dimensions
In addition: There were 50 or more warnings (use warnings() to see the first 50)
> df.OvernightRet<-do.call(merge, OvernightRet)
Error in as.data.frame(x) : argument "x" is missing, with no default
As trade and trade2 are huge and not appropriate for dput, I am posting the Open (A), Close (B) and list of names (list.namess) for reproducibility of the error.
dput(head(A,10))
structure(c(231.9, 236.35, 230, 226.85, 229.05, 225.7, 226.95,
224.55, 227, 234.65), class = c("xts", "zoo"), .indexCLASS = c("POSIXct",
"POSIXt"), .indexTZ = "Asia/Calcutta", tclass = c("POSIXct",
"POSIXt"), tzone = "Asia/Calcutta", Price = 1L, index = structure(c(1459481850,
1459741066, 1459827433, 1459913867, 1460000236, 1460086630, 1460345867,
1460432285, 1460518631, 1460950628), tzone = "Asia/Calcutta", tclass = c("POSIXct",
"POSIXt")), .Dim = c(10L, 1L), .Dimnames = list(NULL, "PRICE"))
dput(head(B,10))
structure(c(235.35, 231.2, 226.1, 229.05, 226.45, 225.75, 224.55,
223.75, 231.1, 228.6), class = c("xts", "zoo"), .indexCLASS = c("POSIXct",
"POSIXt"), .indexTZ = "Asia/Calcutta", tclass = c("POSIXct",
"POSIXt"), tzone = "Asia/Calcutta", Price = 1L, index = structure(c(1459508732,
1459767943, 1459854348, 1459940748, 1460027143, 1460113538, 1460374518,
1460465873, 1460545568, 1460977541), tzone = "Asia/Calcutta", tclass = c("POSIXct",
"POSIXt")), .Dim = c(10L, 1L), .Dimnames = list(NULL, "PRICE"))
dput(list.namess) "ACEM IS Equity.rds"
Kindly help me solve this error.
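I believe the problem, as the error message implies, is that you are trying to assign a column name to an object that has no columns: as.numeric() returns a plain vector, which has fewer than two dimensions. You can work around this by changing the line that computes OvernightRetA to: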
OvernightRetA <- as.data.frame(as.numeric(A)-as.numeric(lag.xts(B)))/as.numeric(lag.xts(B))

Heatmap with xts

Hi all I want to plot a heatmap:
df is an {xts} object and looks like this:
structure(c(1.3728813559322, 0.871666666666667, 0.586666666666667,
0.34, -0.31, -0.973333333333333, -1.52666666666667, -1.71333333333333,
-0.396666666666667, 0.698333333333333, 2.84666666666667, 4.68333333333333,
5.33833333333333, 5.66666666666667, 5.63666666666667, 5.69, 5.69666666666667,
5.54333333333333, 5.50833333333333, 4.335, 3.065, 2.42666666666667,
1.88666666666667, 1.47833333333333), .indexCLASS = c("POSIXct",
"POSIXt"), .indexTZ = "", tclass = c("POSIXct", "POSIXt"), tzone = "", class = c("xts",
"zoo"), index = c(1364770740, 1364774340, 1364777940, 1364781540,
1364785140, 1364788740, 1364792340, 1364795940, 1364799540, 1364803140,
1364806740, 1364810340, 1364813940, 1364817540, 1364821140, 1364824740,
1364828340, 1364831940, 1364835540, 1364839140, 1364842740, 1364846340,
1364849940, 1364853540), .Dim = c(24L, 1L), .Dimnames = list(
NULL, "df.xts"))
As in the following post, I want the y-axis to show the 24 hours (one value per hour) and the x-axis to show the date.
Is it possible to work with the existing xts format?
ggplot2 heatmap to assign colors to breaks
I also found another example with heatmap.plus().
z = matrix(rnorm(30),nrow=5,ncol=6);
rlab = matrix(as.character(c(1:5,2:6,3:7,4:8)),nrow=5,ncol=4);
clab = matrix(as.character(c(1:6,6:1)),nrow=6,ncol=2);
colnames(rlab) = LETTERS[1:dim(rlab)[2]];
colnames(clab) = 1:dim(clab)[2];
heatmap.plus(z,ColSideColors=clab,RowSideColors=rlab);
Example is running, but I would prefer a legend, and my data look different from df - and not an xts with date.
Thanks!
This perhaps isn't an answer to the heatmap question, but this code and output will not display properly in a comment. (Noting my comments above...) Using an object named dat created from the dput output above, but with the tclass attribute removed, I subsetted it and:
dput(dat['2013-04-01'])
structure(c(0.698333333333333, 2.84666666666667, 4.68333333333333,
5.33833333333333, 5.66666666666667, 5.63666666666667, 5.69, 5.69666666666667,
5.54333333333333, 5.50833333333333, 4.335, 3.065, 2.42666666666667,
1.88666666666667, 1.47833333333333), .indexCLASS = c("POSIXct",
"POSIXt"), .indexTZ = "", tzone = "", class = c("xts", "zoo"), index = c(1364803140,
1364806740, 1364810340, 1364813940, 1364817540, 1364821140, 1364824740,
1364828340, 1364831940, 1364835540, 1364839140, 1364842740, 1364846340,
1364849940, 1364853540), .Dim = c(15L, 1L), .Dimnames = list(
NULL, "df.xts"))
> d2 <- dat['2013-04-01']
> tclass(d2) <- "POSIXct"
> d2 #displays with time appropriate format
df.xts
2013-04-01 00:59:00 0.6983333
2013-04-01 01:59:00 2.8466667
2013-04-01 02:59:00 4.6833333
2013-04-01 03:59:00 5.3383333
2013-04-01 04:59:00 5.6666667
2013-04-01 05:59:00 5.6366667
2013-04-01 06:59:00 5.6900000
2013-04-01 07:59:00 5.6966667
2013-04-01 08:59:00 5.5433333
2013-04-01 09:59:00 5.5083333
2013-04-01 10:59:00 4.3350000
2013-04-01 11:59:00 3.0650000
2013-04-01 12:59:00 2.4266667
2013-04-01 13:59:00 1.8866667
2013-04-01 14:59:00 1.4783333
> dput(d2)
structure(c(0.698333333333333, 2.84666666666667, 4.68333333333333,
5.33833333333333, 5.66666666666667, 5.63666666666667, 5.69, 5.69666666666667,
5.54333333333333, 5.50833333333333, 4.335, 3.065, 2.42666666666667,
1.88666666666667, 1.47833333333333), .indexCLASS = c("POSIXct",
"POSIXt"), .indexTZ = "", tzone = "", class = c("xts", "zoo"), index = structure(c(1364803140,
1364806740, 1364810340, 1364813940, 1364817540, 1364821140, 1364824740,
1364828340, 1364831940, 1364835540, 1364839140, 1364842740, 1364846340,
1364849940, 1364853540), tclass = c("POSIXct", "POSIXt")), .Dim = c(15L,
1L), .Dimnames = list(NULL, "df.xts"))
No added tclass attribute after : tclass(d2) <- "POSIXct". So I wonder if the heatmap axis problem relates to an improperly formed xts object.

Resources