How to pass arguments to a function inside *apply family functions - r

I have the following dataset:
dput(tt2)
structure(c(1371.25, NA, 1373.95, NA, NA, 1373, NA, 1373.95,
1373.9, NA, NA, 1374, 1374.15, NA, 1374, 1373.85, 1372.55, 1374.05,
1374.15, 1374.75, NA, NA, 1375.9, 1374.05, NA, NA, NA, NA, NA,
NA, NA, 1375, NA, NA, NA, NA, NA, 1376.35, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1376.25, NA, 1378, 1376.5, NA, NA, NA, 1378,
1378, NA, NA, 1378.8, 231.9, 231.85, NA, 231.9, 231.85, 231.9,
231.8, 231.9, 232.6, 231.95, 232.35, 232, 232.1, 232.05, 232.05,
232.05, 231.5, 231.3, NA, NA, 231.1, 231.1, 231.1, 231, 231,
230.95, 230.6, 230.6, 230.7, 230.6, 231, NA, 231, 231, 231.45,
231.65, 231.4, 231.7, 231.3, 231.25, 231.25, 231.4, 231.4, 231.85,
231.75, 231.5, 231.55, 231.35, NA, 231.5, 231.5, NA, 231.5, 231.25,
231.15, 231, 231, 231, 231.05, NA), .Dim = c(60L, 2L), .indexCLASS = c("POSIXct",
"POSIXt"), tclass = c("POSIXct", "POSIXt"), .indexTZ = "Asia/Calcutta", tzone = "Asia/Calcutta", index = structure(c(1459482300,
1459483766.38983, 1459485231.77966, 1459486697.16949, 1459488162.55932,
1459489627.94915, 1459491093.33898, 1459492558.72881, 1459494025.11864,
1459495490.50847, 1459496955.89831, 1459498421.28814, 1459499887.67797,
1459501353.0678, 1459502818.45763, 1459504283.84746, 1459505749.23729,
1459507214.62712, 1459508680.01695, 1459510145.40678, 1459511610.79661,
1459513076.18644, 1459514541.57627, 1459516007.9661, 1459517474.35593,
1459518939.74576, 1459520405.13559, 1459521870.52542, 1459523335.91525,
1459524804.30508, 1459526269.69492, 1459527735.08475, 1459529200.47458,
1459530667.86441, 1459532134.25424, 1459533600.64407, 1459535066.0339,
1459536531.42373, 1459537996.81356, 1459539702.20339, 1459541167.59322,
1459542634.98305, 1459544100.37288, 1459545565.76271, 1459547031.15254,
1459548496.54237, 1459549961.9322, 1459551429.32203, 1459552894.71186,
1459554360.10169, 1459555829.49153, 1459557294.88136, 1459558760.27119,
1459560225.66102, 1459561691.05085, 1459563160.44068, 1459564625.83051,
1459566091.22034, 1459567557.61017, 1459569028), tclass = c("POSIXct",
"POSIXt"), tzone = "Asia/Calcutta"), .Dimnames = list(NULL, c("A",
"B")), class = c("xts", "zoo"))
I want to learn how to pass arguments to a function inside the apply family functions.
1st example:
Since there are NAs in the data, mean function returns NA. So I want to pass na.rm=TRUE:
tt<-apply.daily(tt2, function(x) sapply(x,mean(na.rm=TRUE)))
But it returns:
Error in mean.default(na.rm = TRUE) :
argument "x" is missing, with no default
2nd example:
I want to use period.sum function that takes only single column values and requires index of the column.
tt<-lapply(tt2, period.sum, endpoints(tt2))
Error in FUN(X[[i]], ...) : NA/NaN/Inf in foreign function call (arg 3)
I know the 2nd example can me solved with period.apply but as lapply is a general type function can the 2nd example be solved using lapply also?

You can pass arguments in all the functions of the apply family through the ellipsis (...) argument, cf. the help page on sapply. Now, apply.daily is just an extension to xts objects, see ?apply.daily.
apply.daily(tt2, mean, na.rm=TRUE)
#apply.daily( x, FUN, ...)

Related

r Replace multiple strings in a data frame column with multiple strings from a column of another data frame

I have a dataframe (df1) with a column "PartcipantID". Some ParticipantIDs are wrong and should be replaced with the correct ParticipantID. I have another dataframe (df2) where all Participant IDs appear in columns Goal_ID to T4. The Participant IDs in column "Goal_ID" are the correct IDs.
Now I want to replace all ParticipantIDs in df1 with all Goal_ID ParticipantIDs from df2.
This is my original dataframe (df1):
structure(list(Partcipant_ID = c("AA_SH_RA_91", "AA_SH_RA_91",
"AB_BA_PR_93", "AB_BH_VI_90", "AB_BH_VI_90", "AB_SA_TA_91", "AJ_BO_RA_92",
"AJ_BO_RA_92", "AK_SH_HA_91", "AL_EN_RA_95", "AL_MA_RA_95", "AL_SH_BA_99",
"AM_BO_AB_49", "AM_BO_AB_94", "AM_BO_AB_94", "AM_BO_AB_94", "AN_JA_AN_91",
"AN_KL_GE_11", "AN_KL_WO_91", "AN_MA_DI_95", "AN_MA_DI_95", "AN_SE_RA_95",
"AN_SE_RA_95", "AN_SI_RA_97", "AN_SO_PU_94", "AN_SU_RA_91", "AR_BO_RA_92",
"AR_KA_VI_94", "AR_KA_VI_94", "AS_AR_SO_90", "AS_AR_SU_95", "AS_KU_SO_90",
"AS_MO_AS_97", "AW_SI_OJ_97", "AW_SI_OJ_97", "AY_CH_SU_97", "BH_BE_LD_84",
"BH_BE_LI_83", "BH_BE_LI_83", "BH_BE_LI_84", "BH_KO_SA_87", "BH_PE_AB_89",
"BH_YA_SA_87", "BI_CH_PR_94", "BI_CH_PR_94"), Start_T2 = structure(c(NA,
NA, NA, NA, 1579514871, 1576658745, NA, 1579098225, NA, NA, 1576663067,
1576844759, NA, 1577330639, NA, NA, 1576693930, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 1577718380, 1577718380, 1577454467, NA,
NA, 1576352237, NA, NA, NA, NA, 1576420656, 1576420656, NA, NA,
1578031772, 1576872938, NA, NA), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), End_T2 = structure(c(NA, NA, NA, NA, 1579515709,
1576660469, NA, 1579098989, NA, NA, 1576693776, 1576845312, NA,
1577331721, NA, NA, 1576694799, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 1577719049, 1577719049, 1577455167, NA, NA, 1576352397,
NA, NA, NA, NA, 1576421607, 1576421607, NA, NA, 1578032408, 1576873875,
NA, NA), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
45L), class = "data.frame")
And this is the reference data frame (df2):
structure(list(Goal_ID = c("AJ_BO_RA_92", "AL_EN_RA_95", "AM_BO_AB_49",
"AS_KU_SO_90", "BH_BE_LI_84", "BH_YA_SA_87", "BI_CH_PR_94", "BI_CH_PR_94"
), T2 = c("AJ_BO_RA_92", "AL_MA_RA_95", "AM_BO_AB_94", "AS_AR_SO_90",
"BH_BE_LI_83", "BH_YA_SA_87", "BI_NA_PR_94", "BI_NA_PR_94"),
T3 = c("AR_BO_RA_92", "AL_MA_RA_95", "AM_BO_AB_94", NA, "BH_BE_LI_83",
NA, "BI_CH_PR_94", "BI_CH_PR_94"), T4 = c("AJ_BO_RA_92",
"AL_MA_RA_95", "AM_BO_AB_94", NA, "BH_BE_LI_83", "BH_KO_SA_87",
"BI_CH_PR_94", "BI_CH_PR_94")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
For example, in my df1, I want
"AR_BO_RA_92" to be replaced by "AJ_BO_RA_92";
"AL_MA_RA_95" to be replaced by "AL_EN_RA_95";
"AM_BO_AB_94" to be replaced by "AM_BO_AB_49"
and so on...
I thought about using string_replace and I started with this:
df1$Partcipant_ID <- str_replace(df1$Partcipant_ID, "AR_BO_RA_92", "AJ_BO_RA_92")
But that is of course very unefficient because I have so many replacements and it would be nice to make use of my reference data frame. I just cannot figure it out myself.
I hope this is understandable. Please ask if you need additional information.
Thank you so much already!
You can use match to find where the string is located and excange those which have been found and are not NA like:
i <- match(df1$Partcipant_ID, unlist(df2[-1])) %% nrow(df2)
j <- !is.na(i)
df1$Partcipant_ID[j] <- df2$Goal_ID[i[j]]
df1$Partcipant_ID
# [1] "AA_SH_RA_91" "AA_SH_RA_91" "AB_BA_PR_93" "AB_BH_VI_90" "AB_BH_VI_90"
# [6] "AB_SA_TA_91" "AJ_BO_RA_92" "AJ_BO_RA_92" "AK_SH_HA_91" "AL_EN_RA_95"
#[11] "AL_MA_RA_95" "AL_SH_BA_99" "AM_BO_AB_49" "AM_BO_AB_94" "AM_BO_AB_94"
#[16] "AM_BO_AB_94" "AN_JA_AN_91" "AN_KL_GE_11" "AN_KL_WO_91" "AN_MA_DI_95"
#[21] "AN_MA_DI_95" "AN_SE_RA_95" "AN_SE_RA_95" "AN_SI_RA_97" "AN_SO_PU_94"
#[26] "AN_SU_RA_91" "AR_BO_RA_92" "AR_KA_VI_94" "AR_KA_VI_94" "AS_AR_SO_90"
#[31] "AS_AR_SU_95" "AS_KU_SO_90" "AS_MO_AS_97" "AW_SI_OJ_97" "AW_SI_OJ_97"
#[36] "AY_CH_SU_97" "BH_BE_LD_84" "BH_BE_LI_83" "BH_BE_LI_83" "BH_BE_LI_84"
#[41] "BH_KO_SA_87" "BH_PE_AB_89" "BH_YA_SA_87" "BI_CH_PR_94" "BI_CH_PR_94"
I think this might work. Create a true look up table with a column of correct and incorrect codes. I.e. stack the columns, then join the subsequent df3 to df1 and use coalesce to create a new part_id. You spelt participant wrong, which made me feel more human I always do that.
library(dplyr)
df3 <- df2[1:2] %>%
bind_rows(df2[c(1,3)] %>% rename(T2 = T3),
df2[c(1,4)] %>% rename(T2 = T4)) %>%
distinct()
df1 %>%
left_join(df3, by = c("Partcipant_ID" = "T2")) %>%
mutate(Goal_ID = coalesce(Goal_ID, Partcipant_ID)) %>%
select(Goal_ID, Partcipant_ID, Start_T2, End_T2)

create an etf portfolio csv data

I'm trying to calculate the adjusted sharpe ratio for a portfolio of two or more assets. what I need to do is:
I load the two csv files I generated.
structure(list(X.1 = 1:50, X = 1:50, date = structure(1:50, .Label = c("2019-07-01", "2019-07-02", "2019-07-03", "2019-07-05",
"2019-07-08", "2019-07-09", "2019-07-10", "2019-07-11", "2019-07-12",
"2019-07-15", "2019-07-16", "2019-07-17", "2019-07-18", "2019-07-19",
"2019-07-22", "2019-07-23", "2019-07-24", "2019-07-25", "2019-07-26",
"2019-07-29", "2019-07-30", "2019-07-31", "2019-08-01", "2019-08-02",
"2019-08-05", "2019-08-06", "2019-08-07", "2019-08-08", "2019-08-09",
"2019-08-12", "2019-08-13", "2019-08-14", "2019-08-15", "2019-08-16",
"2019-08-19", "2019-08-20", "2019-08-21", "2019-08-22", "2019-08-23",
"2019-08-26", "2019-08-27", "2019-08-28", "2019-08-29", "2019-08-30",
"2019-09-03", "2019-09-04", "2019-09-05", "2019-09-06", "2019-09-09",
"2019-09-10"), class = "factor"),
adjClose = c(130.8539817206, 131.863291017, 132.8033339891,
131.041990315, 131.2201037202, 131.1706277743, 130.4482789642,
128.7067256684, 128.8551535061, 129.5972926947, 129.2113803166,
130.6066019911, 130.6164971802, 130.3097463156, 130.5571260452,
129.8446724242, 130.4086982074, 129.7853012891, 130.0920521537,
130.1316329104, 130.4482789642, 131.4971690173, 134.103513361,
135.3428257349, 137.6826474969, 138.783156885, 138.83272938,
139.1301643497, 138.852558378, 141.7475920835, 141.2617816329,
144.4443358092, 146.0504846459, 144.8805737649, 142.8084434756,
144.2857038254, 143.3239974232, 142.37220552, 144.712027282,
144.1270718415, 146.3479196156, 146.5362950965, 145.9810831529,
146.0207411489, 146.2092980651, 146.4278025524, 143.7759526384,
144.7989509198, 142.2265571831, 139.7336196235), lagx = c(NA,
130.8539817206, 131.863291017, 132.8033339891, 131.041990315,
131.2201037202, 131.1706277743, 130.4482789642, 128.7067256684,
128.8551535061, 129.5972926947, 129.2113803166, 130.6066019911,
130.6164971802, 130.3097463156, 130.5571260452, 129.8446724242,
130.4086982074, 129.7853012891, 130.0920521537, 130.1316329104,
130.4482789642, 131.4971690173, 134.103513361, 135.3428257349,
137.6826474969, 138.783156885, 138.83272938, 139.1301643497,
138.852558378, 141.7475920835, 141.2617816329, 144.4443358092,
146.0504846459, 144.8805737649, 142.8084434756, 144.2857038254,
143.3239974232, 142.37220552, 144.712027282, 144.1270718415,
146.3479196156, 146.5362950965, 145.9810831529, 146.0207411489,
146.2092980651, 146.4278025524, 143.7759526384, 144.7989509198,
142.2265571831), pct_change = c(NA, 0.0076542098, 0.0070784591,
-0.0134410632, 0.0013573637, -0.0003771877, -0.0055374346,
-0.0135311755, 0.0011518968, 0.0057265023, -0.0029866748,
0.0106826275, 7.57575751427131e-05, -0.0023540132, 0.0018948007,
-0.0054869685, 0.0043250626, -0.0048032937, 0.0023579524,
0.0003041594, 0.0024273686, 0.0079765219, 0.0194353174, 0.0091568383,
0.0169943112, 0.0079297042, 0.0003570663, 0.002137818, -0.001999286,
0.0204238651, -0.0034390792, 0.0220330839, 0.0109972168,
-0.0080750017, -0.0145098584, 0.0102384388, -0.0067100166,
-0.0066852368, 0.0161688134, -0.0040586091, 0.0151751236,
0.001285521, -0.0038033143, 0.0002715915, 0.001289637, 0.0014922336,
-0.0184443216, 0.0070649564, -0.0180865922, -0.0178406425
), rollmeanx = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0.0022449273, 0.0020327583, 0.0016836143, 0.0029861115,
0.002801633, 0.0036635665, 0.0042995146, 0.0045093675, 0.0039069923,
0.0040805283, 0.0039373228, 0.0032693281, 0.0038882917, 0.0038227304,
0.004333512, 0.0045939924, 0.0042813625, 0.0044765504, 0.0044354613,
0.0044811565, 0.0036783992, 0.003643339, 0.0022001886, 0.0011618239
), rollsdx = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
0.0077480465, 0.0076691234, 0.0076367931, 0.0078392534, 0.0079349676,
0.0087510466, 0.0086559326, 0.0082636055, 0.0090514164, 0.0091306015,
0.0092739515, 0.0093933857, 0.0096997509, 0.0097492652, 0.0099891559,
0.0098095162, 0.0099469924, 0.0098104499, 0.0098219087, 0.0098046681,
0.0107849756, 0.010771918, 0.01108063, 0.0116528378), roll_sharpe = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.2897410646, 0.2650574546,
0.2204609035, 0.380917847, 0.3530742789, 0.4186432387, 0.4967130412,
0.5456900702, 0.4316442951, 0.4469068418, 0.4245571962, 0.3480457574,
0.4008651116, 0.3921044585, 0.4338216423, 0.468319975, 0.4304177896,
0.4563042904, 0.4515885332, 0.4570431621, 0.3410669895, 0.3382256492,
0.1985616886, 0.0997030911)), class = "data.frame", row.names = c(NA,
-50L))
structure(list(X = 1:49, date = structure(1:49, .Label = c("2019-01-02",
"2019-01-03", "2019-01-04", "2019-01-07", "2019-01-08", "2019-01-09",
"2019-01-10", "2019-01-11", "2019-01-14", "2019-01-15", "2019-01-16",
"2019-01-17", "2019-01-18", "2019-01-22", "2019-01-23", "2019-01-24",
"2019-01-25", "2019-01-28", "2019-01-29", "2019-01-30", "2019-01-31",
"2019-02-01", "2019-02-04", "2019-02-05", "2019-02-06", "2019-02-07",
"2019-02-08", "2019-02-11", "2019-02-12", "2019-02-13", "2019-02-14",
"2019-02-15", "2019-02-19", "2019-02-20", "2019-02-21", "2019-02-22",
"2019-02-25", "2019-02-26", "2019-02-27", "2019-02-28", "2019-03-01",
"2019-03-04", "2019-03-05", "2019-03-06", "2019-03-07", "2019-03-08",
"2019-03-11", "2019-03-12", "2019-03-13"), class = "factor"),
adjClose = c(107.6401844169, 108.2682817731, 108.0425592857,
107.9738611374, 108.0621873281, 108.1897696036, 107.9346050527,
108.2192116672, 108.0229312434, 107.9247910315, 107.9149770103,
107.8266508196, 107.6990685441, 107.7677666924, 107.6401844169,
107.8070227772, 107.6401844169, 107.4929740991, 107.8070227772,
108.3958640486, 109.0043333624, 108.621586536, 108.4056780698,
108.5528883877, 108.4841902393, 108.6510285996, 108.7000987055,
108.4743762181, 108.4940042605, 108.4743762181, 108.778610875,
108.778610875, 109.033775426, 109.0730315107, 108.8865651081,
109.1711717227, 109.0926595531, 109.2496838922, 108.9945193412,
108.8178669598, 108.6019584936, 108.7589828327, 108.7197267479,
108.8473090234, 109.1515436803, 109.2496838922, 109.2300558498,
109.4361502948, 109.5637325703), lagx = c(NA, 107.6401844169,
108.2682817731, 108.0425592857, 107.9738611374, 108.0621873281,
108.1897696036, 107.9346050527, 108.2192116672, 108.0229312434,
107.9247910315, 107.9149770103, 107.8266508196, 107.6990685441,
107.7677666924, 107.6401844169, 107.8070227772, 107.6401844169,
107.4929740991, 107.8070227772, 108.3958640486, 109.0043333624,
108.621586536, 108.4056780698, 108.5528883877, 108.4841902393,
108.6510285996, 108.7000987055, 108.4743762181, 108.4940042605,
108.4743762181, 108.778610875, 108.778610875, 109.033775426,
109.0730315107, 108.8865651081, 109.1711717227, 109.0926595531,
109.2496838922, 108.9945193412, 108.8178669598, 108.6019584936,
108.7589828327, 108.7197267479, 108.8473090234, 109.1515436803,
109.2496838922, 109.2300558498, 109.4361502948), pct_change = c(NA,
0.00580130529379156, -0.00208919974584387, -0.000636247954609872,
0.000817364453597674, 0.00117924528324126, -0.00236406619337162,
0.0026299084064227, -0.00181702552912345, -0.000909338910569157,
-9.09421608741455e-05, -0.000819149904301271, -0.00118461818866857,
0.000637464711466883, -0.00118526622925381, 0.00154756486175119,
-0.00154996353084853, -0.00136948781102927, 0.00291306326814199,
0.00543232231753777, 0.0055820653641086, -0.00352367184650867,
-0.00199167119328355, 0.00135611608393343, -0.000633254930957689,
0.00153554331192595, 0.000451426507283543, -0.0020808830183651,
0.000180913613925363, -0.000180946349583393, 0.00279682425113514,
0, 0.00234023402384323, 0.000359906423762952, -0.0017124831003245,
0.00260697590864838, -0.000719683340030535, 0.00143729788046738,
-0.00234107689581361, -0.00162337662311708, -0.00198807157066812,
0.00144378271118579, -0.00036107600685043, 0.001172121540208,
0.00278726847685346, 0.000898311174948939, -0.000179694519491878,
0.00188323917137825, 0.0011644571840243), rollmeanx = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 0.00031862067098972, 0.000116566615996168,
0.000182745925763722, 0.000281463710606259, 0.000264830167591981,
0.000116642517518964, 0.000232323417850645, 0.000104557292577641,
0.00031427773713485, 0.000355611323978903, 0.000466119332375147,
0.000519712801832612, 0.000495718942211888, 0.00058524217844741,
0.000606405037048468, 0.000601392901535568, 0.000565433203128064,
0.000553892802578618, 0.000331113946269068, 0.000149816691434887,
-0.00012032609815416, 9.31190557875071e-05, 0.00031034358624828,
0.000289534272203531, 0.000310150654542886, 0.000325955011790718,
0.000358365497097116), rollsdx = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0.00268140767467829, 0.00240146073630199, 0.00235743767997561,
0.00236696184695034, 0.0023643008327706, 0.00240605774759231,
0.00234142001164779, 0.00228025110555633, 0.00230711786187933,
0.00229224984050652, 0.0023280202990145, 0.00231052893570155,
0.00233171418775393, 0.00237482539929513, 0.00236032672014123,
0.00235834925288879, 0.00239840602803656, 0.0024087481448098,
0.00240681026318947, 0.00213964886031918, 0.00176319827198963,
0.00160907538291628, 0.00163656991965793, 0.00162551270011354,
0.00161610064622736, 0.00163029258958719, 0.00163996465710043
), roll_sharpe = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.118825896561196,
0.0485398800130578, 0.0775188787877576, 0.11891349705062,
0.112012043442729, 0.0484786857820375, 0.0992232989787878,
0.0458534116364924, 0.136220928426624, 0.155136372002243,
0.200221335085637, 0.224932392666535, 0.212598501486754,
0.246435876347421, 0.256915719283212, 0.255005869380419,
0.235753744994942, 0.229950484351013, 0.137573763637803,
0.0700192887783178, -0.0682430898814242, 0.0578711580427878,
0.189630508614717, 0.178118738895924, 0.191912957442906,
0.199936510705267, 0.218520256241821)), class = "data.frame", row.names = c(NA,
-49L))
I choose between the two etfs according to the highest adjusted sharpe ratio on the last day of the month. (?)
I create a new csv file with the same columns calculated every month for the top ranked etf (on a daily basis). (?)
Desidered output is a file like those above, with data of top ranked etf
I have no idea about how to proceed. This for finding the last month day:
library(tidyverse)
library(roll)
library(quantmod)
library(httr)
library(jsonlite)
library(tidyverse)
library(hrbrthemes)
library(dplyr)
library(xts)
xdf <- data.frame()
xdf <- read.csv('tip.csv')
library(timeDate)
xdf$eom<-timeLastDayInMonth(xdf$somedate)
????
xdf_02 <- xdf %>% mutate(lagx = lag(adjClose)) %>%
mutate(pct_change = (adjClose - lagx)/adjClose)%>%
mutate(rollmeanx = roll_mean(pct_change, width = 22),rollsdx = roll_sd(pct_change, width=22)) %>%
mutate(roll_sharpe = rollmeanx / rollsdx)
write.csv(xdf_02,'tip_r.csv')
Tnx to all contributors

How can I get highcharter to represent a forecast object?

This is a follow-on to this question.
I am trying to get the pipeline given in that question to accept a forecast object as input:
Again, using this data:
> dput(t)
structure(c(2, 2, 267822980, 325286564, 66697091, 239352431,
94380295, 1, 126621669, 158555699, 32951026, 23, 108000151, 132505189,
29587564, 120381505, 25106680, 117506099, 22868767, 115940080,
22878163, 119286731, 22881061), .Dim = c(23L, 1L), index = structure(c(1490990400,
1490994000, 1490997600, 1491001200, 1491004800, 1491008400, 1491012000,
1491026400, 1491033600, 1491037200, 1491040800, 1491058800, 1491062400,
1491066000, 1491069600, 1491073200, 1491076800, 1491109200, 1491112800,
1491120000, 1491123600, 1491156000, 1491159600), tzone = "US/Mountain", tclass = c("POSIXct",
"POSIXt")), class = c("xts", "zoo"), .indexCLASS = c("POSIXct",
"POSIXt"), tclass = c("POSIXct", "POSIXt"), .indexTZ = "US/Mountain", tzone = "US/Mountain", .CLASS = "double", .Dimnames = list(
NULL, "count"))
I use
highchart(type = 'stock') %>%
hc_add_series(t) %>%
hc_xAxis(type = 'datetime')
To create
But if I follow this same recipe using
require("forecast")
t.arima <- auto.arima(t)
x <- forecast(t.arima, level = c(95, 80))
highchart(type = 'stock') %>%
hc_add_series(x) %>%
hc_xAxis(type = 'datetime')
I get this error:
Error in as.Date.ts(.) : unable to convert ts time to Date class
How can I show the forecast series along with the historical? I've seen this in the documentation, but don't understand why I'd be getting this error.
JS CONSOLE OUTPUT FOR JK:
DF DATA AFTER RE-INDEXING:
dput(df)
structure(list(Index = structure(c(1490968800, 1490972400, 1490976000,
1490979600, 1490983200, 1490986800, 1490990400, 1491004800, 1491012000,
1491015600, 1491019200, 1491037200, 1491040800, 1491044400, 1491048000,
1491051600, 1491055200, 1491087600, 1491091200, 1491098400, 1491102000,
1491134400, 1491138000, 1491217200, 1491220800, 1491224400, 1491228000,
1491231600, 1491235200, 1491238800, 1491242400, 1491246000, 1491249600,
1491253200, 1491256800, 1491260400, 1491264000, 1491267600), class = c("POSIXct",
"POSIXt")), Data = c(2, 2, 259465771, 315866206, 64582553, 233440220,
91918347, 1, 126563786, 158555699, 32951026, 23, 108000151, 132505189,
29587564, 120381505, 25106680, 117506099, 22868767, 115898351,
22878163, 119285747, 22881061, 157925588, 32447780, 223096830,
281656273, 45406684, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
Fitted = c(102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
`Point Forecast` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143), `Lo 80` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -16003477.5789723,
-16003477.5789723, -16003477.5789723, -16003477.5789723,
-16003477.5789723, -16003477.5789723, -16003477.5789723,
-16003477.5789723, -16003477.5789723, -16003477.5789723),
`Hi 80` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 220344625.293258, 220344625.293258, 220344625.293258,
220344625.293258, 220344625.293258, 220344625.293258, 220344625.293258,
220344625.293258, 220344625.293258, 220344625.293258), `Lo 95` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -78561041.5917782,
-78561041.5917782, -78561041.5917782, -78561041.5917782,
-78561041.5917782, -78561041.5917782, -78561041.5917782,
-78561041.5917782, -78561041.5917782, -78561041.5917782),
`Hi 95` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 282902189.306064, 282902189.306064, 282902189.306064,
282902189.306064, 282902189.306064, 282902189.306064, 282902189.306064,
282902189.306064, 282902189.306064, 282902189.306064)), .Names = c("Index",
"Data", "Fitted", "Point Forecast", "Lo 80", "Hi 80", "Lo 95",
"Hi 95"), row.names = c(NA, -38L), class = "data.frame")
Not sure this is due to the irregular time series.
Anyway, ggfortify:::fortify.forecast is your friend. Why? Because fortify (try to) transform all the R object in data frames. So:
library(highcharter)
library(forecast)
t.arima <- auto.arima(t)
x <- forecast(t, level = c(95, 80))
library(highcharter)
library(ggplot2)
library(ggfortify)
#>
#> Attaching package: 'ggfortify'
#> The following object is masked from 'package:forecast':
#>
#> gglagplot
class(x)
#> [1] "forecast"
df <- fortify(x)
head(df)
#> Index Data Fitted Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
#> 1 1 2 140658844 NA NA NA NA NA
#> 2 3601 2 121734145 NA NA NA NA NA
#> 3 7201 267822980 105355638 NA NA NA NA NA
#> 4 10801 325286564 127214522 NA NA NA NA NA
#> 5 14401 66697091 153863779 NA NA NA NA NA
#> 6 18001 239352431 142136089 NA NA NA NA NA
Now you can:
highchart(type = "stock") %>%
hc_add_series(df, "line", hcaes(Index, Data), name = "Original") %>%
hc_add_series(df, "line", hcaes(Index, Fitted), name = "Fitted") %>%
hc_add_series(df, "line", hcaes(Index, `Point Forecast`), name = "Forecast") %>%
hc_add_series(df, "arearange", hcaes(Index, low = `Lo 80`, high = `Hi 80`), name = "Interval")
As you can see, fortify can't detect the real time too. So you need to transform the Index in the time what you want.
The error
Error in as.Date.ts(.) : unable to convert ts time to Date class
is due to the fact that you have a ts object with a frequency that is not covered by the function as.Date.ts(.). When we see what this function does, this is what we get:
function (x, offset = 0, ...)
{
time.x <- unclass(time(x)) + offset
if (frequency(x) == 1)
as.Date(paste(time.x, 1, 1, sep = "-"))
else if (frequency(x) == 4)
as.Date(paste((time.x + 0.001)%/%1, 3 * (cycle(x) - 1) +
1, 1, sep = "-"))
else if (frequency(x) == 12)
as.Date(paste((time.x + 0.001)%/%1, cycle(x), 1, sep = "-"))
else stop("unable to convert ts time to Date class")
}
This function considers only 3 values for the frequency of a ts object: 1, 4, or 12. When we take a look at the frequency of your object x, we see that its frequency = 0.000277777777777778, so when highcharter calls the function using the ts objects in x it stops and gives you that error.
We have two options on how to "fix" it:
Transform t into a ts object (instead of a xts object) with frequency = 1 before running auto.arima and forecast;
After running auto.arima and forecast, we can create an index for the future dates and transform the ts objects in x into xts objects with the correct index.
I said "fix" because these solutions are not perfect, as we will see.
Option 1
t <- structure(
c(2, 2, 267822980, 325286564, 66697091, 239352431,
94380295, 1, 126621669, 158555699, 32951026, 23,
108000151, 132505189, 29587564, 120381505, 25106680,
117506099, 22868767, 115940080, 22878163, 119286731,
22881061),
.Dim = c(23L, 1L),
index = structure(c(1490990400, 1490994000, 1490997600,
1491001200, 1491004800, 1491008400,
1491012000, 1491026400, 1491033600,
1491037200, 1491040800, 1491058800,
1491062400, 1491066000, 1491069600,
1491073200, 1491076800, 1491109200,
1491112800, 1491120000, 1491123600,
1491156000, 1491159600),
tzone = "US/Mountain",
tclass = c("POSIXct","POSIXt")),
class = c("xts", "zoo"),
.indexCLASS = c("POSIXct","POSIXt"),
tclass = c("POSIXct", "POSIXt"),
.indexTZ = "US/Mountain",
tzone = "US/Mountain",
.CLASS = "double",
.Dimnames = list(NULL, "count"))
require("forecast")
library(highcharter)
# SOLUTION 1
t.tmp <- ts(t, start=1, end = length(t))
t.arima.1 <- auto.arima(t.tmp)
x.1 <- forecast(t.arima.1, level = c(95, 80))
highchart(type = 'stock') %>%
hc_add_series(x.1) %>%
hc_add_series(x.1$x, name = "Original") %>%
hc_add_series(x.1$fitted, name = "Fitted")
The problem with this approach is that we lose the dates (axis, tooltip, etc.).
Option 2, 1st try: Hourly Forecasts
I tried to create an hourly index for the future values, but for some reason Highcharter moves the intervals to the left (or there's some problem with the dates that I can't see/figure out).
Option 2, 2nd try: Daily Forecasts
When I changed it to a daily index for the future values it worked, but it's weird since we have hourly observations and the forecast part of our plot shows "daily forecasts".
Here is the full code:
t <- structure(
c(2, 2, 267822980, 325286564, 66697091, 239352431,
94380295, 1, 126621669, 158555699, 32951026, 23,
108000151, 132505189, 29587564, 120381505, 25106680,
117506099, 22868767, 115940080, 22878163, 119286731,
22881061),
.Dim = c(23L, 1L),
index = structure(c(1490990400, 1490994000, 1490997600,
1491001200, 1491004800, 1491008400,
1491012000, 1491026400, 1491033600,
1491037200, 1491040800, 1491058800,
1491062400, 1491066000, 1491069600,
1491073200, 1491076800, 1491109200,
1491112800, 1491120000, 1491123600,
1491156000, 1491159600),
tzone = "US/Mountain",
tclass = c("POSIXct","POSIXt")),
class = c("xts", "zoo"),
.indexCLASS = c("POSIXct","POSIXt"),
tclass = c("POSIXct", "POSIXt"),
.indexTZ = "US/Mountain",
tzone = "US/Mountain",
.CLASS = "double",
.Dimnames = list(NULL, "count"))
require("forecast")
library(highcharter)
library(xts)
t.arima <- auto.arima(t)
x <- forecast(t.arima, level = c(95, 80))
# Problem
## Time from 'forecast'
time.x <- time(x$mean) # ts variable
time.x # see that frequency = 0.000277777777777778
## Original time
time.t <- time(t) # POSIXct variable, use as.ts to see frequency
as.ts(time.t) # frequency = 1
## Try to transform back to formatted date
as.POSIXct(as.double(time.t), tz = "US/Mountain", origin = "1970-01-01")
as.POSIXct(as.double(time.x), tz = "US/Mountain", origin = "1970-01-01")
#--------------------------------------------------------#
# SOLUTION 1
t.tmp <- ts(t, start=1, end = length(t))
t.arima.1 <- auto.arima(t.tmp)
x.1 <- forecast(t.arima.1, level = c(95, 80))
highchart(type = 'stock') %>%
hc_add_series(x.1) %>%
hc_add_series(x.1$x, name = "Original") %>%
hc_add_series(x.1$fitted, name = "Fitted")
#------------------------------------------------------#
# SOLUTION 2 - With correct dates but wrong plot
## Create new forecast variable
x.2 <- forecast(t.arima.1, level = c(95, 80))
## Take forecast length
forecast.length <- length(time.x)
### Create New Forecast dates (HOUR)
### Since I don't know the exact forecast times, I'll add one HOUR
### for each obs starting from the last date in the original dataset
last.date <- time.t[length(time.t)]
new.forecast.time.hour <- as.POSIXct(last.date) + c((1:forecast.length)*3600)
## Insert date back
x.2$mean <- xts(x.1$mean, order.by = new.forecast.time.hour)
x.2$lower <- xts(x.1$lower, order.by = new.forecast.time.hour)
x.2$upper <- xts(x.1$upper, order.by = new.forecast.time.hour)
### Original Data
x.2$x <- xts(x.1$x, order.by = time.t)
### Fitted
x.2$fitted <- xts(x.1$fitted, order.by = time.t)
# Plot forecasts with correct date
highchart(type = 'stock') %>%
hc_add_series(x.2) %>%
hc_add_series(x.2$x, name = "Original") %>%
hc_add_series(x.2$fitted, name = "Fitted") %>%
hc_xAxis(type = 'datetime')
#------------------------------------------------------#
# SOLUTION 3 - Correct plot but only for daily forecasts
## Create new forecast variable
x.3 <- forecast(t.arima.1, level = c(95, 80))
## Take forecast length
forecast.length <- length(time.x)
### Create New Forecast dates (DAY)
### Since I don't know the exact forecast times, I'll add one DAY
### for each obs starting from the last date in the original dataset
last.date <- time.t[length(time.t)]
new.forecast.time.day <- as.POSIXct(last.date) + c((1:forecast.length)*3600*24)
## Add change from as.POSIXct to as.Date
new.forecast.time.day <- as.Date(new.forecast.time.day)
## Insert date back
x.3$mean <- xts(x.1$mean, order.by = new.forecast.time.day)
x.3$lower <- xts(x.1$lower, order.by = new.forecast.time.day)
x.3$upper <- xts(x.1$upper, order.by = new.forecast.time.day)
### Original Data
x.3$x <- xts(x.1$x, order.by = time.t)
### Fitted
x.3$fitted <- xts(x.1$fitted, order.by = time.t)
# Plot forecasts with correct date
highchart(type = 'stock') %>%
hc_add_series(x.3) %>%
hc_add_series(x.3$x, name = "Original") %>%
hc_add_series(x.3$fitted, name = "Fitted") %>%
hc_xAxis(type = 'datetime')
One other thing: the fitted values on my plots differ from the fitted values on jbkunst's plot because he used forecast directly on t, not on t.arima (just a typo, I believe). This way, my forecasts are based on an Arima model, while his are based on an ETS model.

Calculate percentage to total using rowPercents

I am trying to calculate a percentage to total for, lets say, the following reproducible example:
structure(c(197.95, 197.95, 197.95, 186.8, 190.51, 195.16, 199.81,
202.59, 202.59, 202.59, 92.28, 92.28, 90.07, 89.82, 87.36, 87.61,
90.56, 89.82, 90.07, 89.82, 20.43, 20.43, 20.43, 20.43, 20.43,
20.43, 20.43, 20.43, 20.43, 20.64, 24.7, 24.95, 24.54, 23.97,
23.97, 24.38, 24.38, 24.38, 24.54, 24.54, 37.4, 37.4, 37.4, 35.43,
35.43, 35.43, 35.43, 35.43, 35.43, 39.37, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 16.05,
16.05, 16.05, 16.05, 15.62, 15.62, 16.05, 15.62, 15.62, 15.62,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), index = structure(c(470620800,
470880000, 470966400, 471052800, 471139200, 471225600, 471484800,
471571200, 471657600, 471744000), tzone = "UTC", tclass = "Date"), .indexCLASS = "Date", .indexTZ = "UTC", tclass = "Date", tzone = "UTC", class = c("xts",
"zoo"), .Dim = c(10L, 9L), .Dimnames = list(NULL, c("AVON", "BA.",
"CMRG", "COB", "MGGT", "QQ.", "RR.", "SNR", "ULE")))
I need to return the same presentation of my data but each value is a percentage of the total of the row it belongs to. I did a lot of research and tried prop.table which returns a subscript error and finally I used rowPercents which is part of RcmdrMisc package. However, I could not find how to let it ignore the NA in my data set.
In the example provides there are two whole columns of NA. I can not drop them as the whole data set has some values for the subsequent rows.
Note the the class of my example is zoo and xts
You don't need any external packages for this.
dat.percent <- dat / rowSums(dat, na.rm = T) * 100
Check that it works:
> all(abs(rowSums(dat.percent, na.rm = T) - 100) < 0.0001)
[1] TRUE
prop.table does not seem to work with xts/zoo objects but this works:
library(xts)
prop.table(coredata(x), 1)
It returns all NAs which is correct since there is an NA in each row (and it is impossible to calculate the proportions without knowing every value). If you want to regard the NA values as zero then:
prop.table( na.fill(coredata(x), 0), 1)

How to change xticks locations and customize legend using levelplot (lattice library)

I am trying to move the position of x-ticks and x-labels from the bottom of the figure to its top.
In addition, my data has a bunch of NAs. Currently, levelplot just remove them and leave them as white space in the plot. I wondering if it is possible to add this NAs to the legend as well.
Any suggestions? Thanks!
Here is my code and its output:
require(lattice)
# see data from dput() below
rownames(data)=data[,1]
data_matrix=as.matrix(data[,2:11])
color = colorRampPalette(rev(c("#D73027", "#FC8D59", "#FEE090", "#FFFFBF", "#E0F3F8", "#91BFDB", "#4575B4")))(100)
levelplot(data_matrix, scale=list(x=list(rot=45)), ylab="Days", xlab="Strains", col.regions = color)
Data
data <-
structure(list(X = structure(1:17, .Label = c("Arcobacter", "Bacillus",
"Bordetella", "Campylobacter", "Chlamydia", "Clostridium ", "Corynebacterium",
"Enterococcus", "Escherichia", "Francisella", "Legionella", "Mycobacterium",
"Pseudomonas", "Rickettsia", "Staphylococcus", "Streptococcus",
"Treponema"), class = "factor"), day.0 = c(NA, -3.823301154,
NA, NA, NA, -3.518606107, NA, NA, NA, NA, NA, -4.859479387, NA,
NA, NA, -2.588402346, -2.668136603), day.2 = c(-4.006281239,
-3.024823788, NA, -5.202804501, NA, -3.237622321, NA, NA, -5.296138823,
-5.105469059, NA, NA, -4.901775198, NA, NA, -2.979144202, -3.050083791
), day.4 = c(-2.880770182, -3.210165554, -4.749097175, -5.209064234,
NA, -2.946480184, NA, -5.264113795, -5.341881713, -4.435780293,
NA, -4.810650076, -4.152531609, NA, NA, -3.106172794, -3.543161966
), day.6 = c(-2.869833226, -3.293283924, -3.831346387, NA, NA,
-3.323947791, NA, NA, NA, NA, NA, -4.397581863, -4.068855504,
NA, NA, -3.27028378, -3.662618619), day.8 = c(-3.873589331, -3.446192193,
-3.616207965, NA, NA, -3.13869325, NA, -5.010807453, NA, NA,
NA, -4.091502649, -4.412399025, -4.681675749, NA, -3.404738625,
-3.955464159), day.15 = c(-5.176583159, -2.512963066, -3.392832457,
NA, NA, -3.194662968, NA, -3.60440455, NA, NA, -4.875554468,
-2.507376205, -4.727255906, -5.27116754, -3.200499549, -3.361296145,
-4.320554841), day.22 = c(-4.550052847, -3.654013004, -3.486879661,
NA, NA, -3.614890858, NA, NA, NA, NA, -4.706690492, -2.200533317,
-4.836957953, NA, -4.390423731, NA, NA), day.29 = c(-4.730006329,
-3.46707372, -3.594457287, NA, NA, -3.800757834, NA, NA, NA,
NA, -4.285154089, -2.121152491, -4.816807055, -5.064577888, -2.945243736,
-4.479177287, -5.226435146), day.43 = c(-4.398680025, -3.144603215,
-3.642065153, NA, NA, -3.8268662, NA, NA, NA, NA, -4.762539208,
-2.156862316, -4.118608495, NA, -4.030291084, -4.678213147, NA
), day.57 = c(-4.689982547, -2.713502214, -3.51279797, NA, -5.069579266,
-3.495580794, NA, NA, NA, NA, -4.515973639, -1.90591075, -4.134826117,
-4.479351427, -3.482134037, -4.538534489, NA)), .Names = c("X",
"day.0", "day.2", "day.4", "day.6", "day.8", "day.15", "day.22",
"day.29", "day.43", "day.57"), class = "data.frame", row.names = c("Arcobacter",
"Bacillus", "Bordetella", "Campylobacter", "Chlamydia", "Clostridium ",
"Corynebacterium", "Enterococcus", "Escherichia", "Francisella",
"Legionella", "Mycobacterium", "Pseudomonas", "Rickettsia", "Staphylococcus",
"Streptococcus", "Treponema"))
Figure
The request to move the labels to the top is pretty easy (after looking at the ?xyplot under the scales section):
levelplot(data_matrix, scale=list(x=list(rot=45,alternating=2)),
ylab="Days", xlab="Strains", col.regions = color)
Trying to get the NA values into the color legend may take a bit more thinking, but it seems as though sensible values for the colorkey arguments for at and col might suffice.
levelplot(data_matrix, scale=list(x=list(rot=45,alternating=2)),
ylab="Days", xlab="Strains", col.regions = color,
colorkey=list(at=as.numeric( factor( c( seq(-5.5, -2, by=0.5),
"NA"))),
labels=as.character( c( seq(-5.5, -2, by=0.5),
"NA")),
col=c(color, "#FFFFFF") ) )

Resources