I have a dataframe with 59 rows and 371 columns. Rows are my observations/sites, and columns are imagery values with a time stamp as the column header. After sorting and cleaning the column headers I am left with this:
data<-structure(list(X20151126 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 0.277672673867523, 0.355025896133641, NA, NA, NA), X20151126.1 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.278084206794621, 0.355967923936499,
NA, NA, NA), X20151126.2 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 0.277672673867523, 0.355025896133641, NA, NA, NA), X20151126.3 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.278084206794621, 0.355967923936499,
NA, NA, NA), X20151216 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0.384717953500922, 0.476687361663067, 0.369193020073217,
0.509256980090469, 0.695446322082805), X20151216.1 = c(0.482005639964749,
0.477315968778509, 0.577629441578537, 0.521768662684214, 0.403182719183149,
0.495476052715638, 0.449110279438877, 0.572210709159168, 0.639095940861963,
0.663725301936293, 0.385697051441031, 0.476921890053818, 0.369067776922609,
0.509928156203107, 0.696681651400943), X20160105 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 0.414471298501795, 0.476356388531503,
0.363677767527229, 0.536833655761341, 0.627173043983676), X20160105.1 = c(0.557391753621561,
0.545031565171865, 0.611938234209565, 0.552637066670738, 0.462370657856108,
0.514062089559983, 0.517862730716598, 0.607005393447421, 0.62782746269337,
0.669313073182483, 0.415045773658901, 0.476891950837264, 0.364111066602943,
0.535738794288108, 0.627583094107998), X20160305 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 0.608603344509915, 0.635371377161601,
0.609235980874035, 0.55732646087173, 0.55009492623478), X20160305.1 = c(0.663171088076713,
0.594729693525975, 0.624064072126327, 0.60486680369263, 0.557099508853904,
0.525207209646858, 0.597555736004227, 0.55009547536139, 0.596528841673565,
0.609200814692677, 0.609410463440908, 0.635462899436559, 0.608558438182846,
0.557738728019745, 0.550279584646311), X20160315 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 0.643538550411662, 0.662874802494349,
0.641220282780279, 0.548161668675193, 0.574401192865252), X20160315.1 = c(0.668475363330821,
0.598489967449372, 0.627731925878224, 0.598095603148813, 0.567868324796377,
0.310253447067502, 0.628817539418026, 0.585246425272493, 0.562730022483348,
0.601923400619283, 0.644128383766072, 0.663057419883037, 0.640840244150391,
0.547853260818411, 0.57379435997677), X20160315.2 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 0.643538550411662, 0.662874802494349,
0.641220282780279, 0.548161668675193, 0.574401192865252), X20160315.3 = c(0.668475363330821,
0.598489967449372, 0.627731925878224, 0.598095603148813, 0.567868324796377,
0.310253447067502, 0.628817539418026, 0.585246425272493, 0.562730022483348,
0.601923400619283, 0.644128383766072, 0.663057419883037, 0.640840244150391,
0.547853260818411, 0.57379435997677), X20160325 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 0.617099941392511, 0.64324564948683,
0.610466453170061, 0.524114324562118, 0.558504551047362)), row.names = c("Site_00",
"Site_01", "Site_02", "Site_03", "Site_04", "Site_05", "Site_06",
"Site_09", "Site_10", "Site_11", "Site_12", "Site_13", "Site_16",
"Site_17", "Site_18"), class = "data.frame")
Some of the imagery, but not all, has multiple columns for the same time stamp, e.g. X20151126, X20151126.1 and X20151126.2. However, not all of these duplicates have values, as the NAs show.
I would like to create a new data frame in which I collapse these duplicates by calculating the mean (and where a group contains only NAs, I would like to keep NA). Ultimately, I would like this:
x20151126 <- apply(data[grep("20151126", colnames(data), value = TRUE)],
                   MARGIN = 1, FUN = mean, na.rm = TRUE)
x20151216 <- apply(data[grep("20151216", colnames(data), value = TRUE)],
                   MARGIN = 1, FUN = mean, na.rm = TRUE)
x20160105 <- apply(data[grep("20160105", colnames(data), value = TRUE)],
                   MARGIN = 1, FUN = mean, na.rm = TRUE)
x20160305 <- apply(data[grep("20160305", colnames(data), value = TRUE)],
                   MARGIN = 1, FUN = mean, na.rm = TRUE)
x20160315 <- apply(data[grep("20160315", colnames(data), value = TRUE)],
                   MARGIN = 1, FUN = mean, na.rm = TRUE)
x20160325 <- apply(data[grep("20160325", colnames(data), value = TRUE)],
                   MARGIN = 1, FUN = mean, na.rm = TRUE)
NewData <- as.data.frame(cbind(x20151126, x20151216, x20160105, x20160305, x20160315, x20160325))
Is there a way to automate this, without having to spell out every column header in grep? As mentioned, I have 371 columns, and not all of them have duplicates. Also, I have 9 different tables that all look similar to this.
Any suggestions or help would be appreciated.
You can use split.default to split the data into groups of similarly named columns and rowMeans to calculate the row-wise mean for each group.
sapply(split.default(data, sub('\\..*', '', names(data))), rowMeans, na.rm = TRUE)
# X20151126 X20151216 X20160105 X20160305 X20160315 X20160325
#Site_00 NaN 0.4820056 0.5573918 0.6631711 0.6684754 NaN
#Site_01 NaN 0.4773160 0.5450316 0.5947297 0.5984900 NaN
#Site_02 NaN 0.5776294 0.6119382 0.6240641 0.6277319 NaN
#Site_03 NaN 0.5217687 0.5526371 0.6048668 0.5980956 NaN
#Site_04 NaN 0.4031827 0.4623707 0.5570995 0.5678683 NaN
#Site_05 NaN 0.4954761 0.5140621 0.5252072 0.3102534 NaN
#Site_06 NaN 0.4491103 0.5178627 0.5975557 0.6288175 NaN
#Site_09 NaN 0.5722107 0.6070054 0.5500955 0.5852464 NaN
#Site_10 NaN 0.6390959 0.6278275 0.5965288 0.5627300 NaN
#Site_11 NaN 0.6637253 0.6693131 0.6092008 0.6019234 NaN
#Site_12 0.2778784 0.3852075 0.4147585 0.6090069 0.6438335 0.6170999
#Site_13 0.3554969 0.4768046 0.4766242 0.6354171 0.6629661 0.6432456
#Site_16 NaN 0.3691304 0.3638944 0.6088972 0.6410303 0.6104665
#Site_17 NaN 0.5095926 0.5362862 0.5575326 0.5480075 0.5241143
#Site_18 NaN 0.6960640 0.6273781 0.5501873 0.5740978 0.5585046
Using sub, we keep only the common part of each column name, which is then used to split the data:
sub('\\..*', '', names(data))
#[1] "X20151126" "X20151126" "X20151126" "X20151126" "X20151216" "X20151216"
#[7] "X20160105" "X20160105" "X20160305" "X20160305" "X20160315" "X20160315"
#[13] "X20160315" "X20160315" "X20160325"
I have a dataframe (df1) with a column "Partcipant_ID". Some participant IDs are wrong and should be replaced with the correct ID. I have another dataframe (df2) in which all participant IDs appear in columns Goal_ID to T4; the IDs in column "Goal_ID" are the correct ones.
Now I want to replace the wrong IDs in df1 with the corresponding Goal_ID values from df2.
This is my original dataframe (df1):
structure(list(Partcipant_ID = c("AA_SH_RA_91", "AA_SH_RA_91",
"AB_BA_PR_93", "AB_BH_VI_90", "AB_BH_VI_90", "AB_SA_TA_91", "AJ_BO_RA_92",
"AJ_BO_RA_92", "AK_SH_HA_91", "AL_EN_RA_95", "AL_MA_RA_95", "AL_SH_BA_99",
"AM_BO_AB_49", "AM_BO_AB_94", "AM_BO_AB_94", "AM_BO_AB_94", "AN_JA_AN_91",
"AN_KL_GE_11", "AN_KL_WO_91", "AN_MA_DI_95", "AN_MA_DI_95", "AN_SE_RA_95",
"AN_SE_RA_95", "AN_SI_RA_97", "AN_SO_PU_94", "AN_SU_RA_91", "AR_BO_RA_92",
"AR_KA_VI_94", "AR_KA_VI_94", "AS_AR_SO_90", "AS_AR_SU_95", "AS_KU_SO_90",
"AS_MO_AS_97", "AW_SI_OJ_97", "AW_SI_OJ_97", "AY_CH_SU_97", "BH_BE_LD_84",
"BH_BE_LI_83", "BH_BE_LI_83", "BH_BE_LI_84", "BH_KO_SA_87", "BH_PE_AB_89",
"BH_YA_SA_87", "BI_CH_PR_94", "BI_CH_PR_94"), Start_T2 = structure(c(NA,
NA, NA, NA, 1579514871, 1576658745, NA, 1579098225, NA, NA, 1576663067,
1576844759, NA, 1577330639, NA, NA, 1576693930, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 1577718380, 1577718380, 1577454467, NA,
NA, 1576352237, NA, NA, NA, NA, 1576420656, 1576420656, NA, NA,
1578031772, 1576872938, NA, NA), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), End_T2 = structure(c(NA, NA, NA, NA, 1579515709,
1576660469, NA, 1579098989, NA, NA, 1576693776, 1576845312, NA,
1577331721, NA, NA, 1576694799, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 1577719049, 1577719049, 1577455167, NA, NA, 1576352397,
NA, NA, NA, NA, 1576421607, 1576421607, NA, NA, 1578032408, 1576873875,
NA, NA), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
45L), class = "data.frame")
And this is the reference data frame (df2):
structure(list(Goal_ID = c("AJ_BO_RA_92", "AL_EN_RA_95", "AM_BO_AB_49",
"AS_KU_SO_90", "BH_BE_LI_84", "BH_YA_SA_87", "BI_CH_PR_94", "BI_CH_PR_94"
), T2 = c("AJ_BO_RA_92", "AL_MA_RA_95", "AM_BO_AB_94", "AS_AR_SO_90",
"BH_BE_LI_83", "BH_YA_SA_87", "BI_NA_PR_94", "BI_NA_PR_94"),
T3 = c("AR_BO_RA_92", "AL_MA_RA_95", "AM_BO_AB_94", NA, "BH_BE_LI_83",
NA, "BI_CH_PR_94", "BI_CH_PR_94"), T4 = c("AJ_BO_RA_92",
"AL_MA_RA_95", "AM_BO_AB_94", NA, "BH_BE_LI_83", "BH_KO_SA_87",
"BI_CH_PR_94", "BI_CH_PR_94")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
For example, in my df1, I want
"AR_BO_RA_92" to be replaced by "AJ_BO_RA_92";
"AL_MA_RA_95" to be replaced by "AL_EN_RA_95";
"AM_BO_AB_94" to be replaced by "AM_BO_AB_49"
and so on...
I thought about using str_replace and started with this:
df1$Partcipant_ID <- str_replace(df1$Partcipant_ID, "AR_BO_RA_92", "AJ_BO_RA_92")
But that is of course very inefficient because I have so many replacements, and it would be nicer to make use of my reference data frame. I just cannot figure it out myself.
I hope this is understandable. Please ask if you need additional information.
Thank you so much already!
You can use match to find where each ID is located in the stacked columns of df2 and replace those that were found, i.e. are not NA, like this:
i <- (match(df1$Partcipant_ID, unlist(df2[-1])) - 1L) %% nrow(df2) + 1L # row of df2 for each match (NA if no match)
j <- !is.na(i)
df1$Partcipant_ID[j] <- df2$Goal_ID[i[j]]
df1$Partcipant_ID
# [1] "AA_SH_RA_91" "AA_SH_RA_91" "AB_BA_PR_93" "AB_BH_VI_90" "AB_BH_VI_90"
# [6] "AB_SA_TA_91" "AJ_BO_RA_92" "AJ_BO_RA_92" "AK_SH_HA_91" "AL_EN_RA_95"
#[11] "AL_MA_RA_95" "AL_SH_BA_99" "AM_BO_AB_49" "AM_BO_AB_94" "AM_BO_AB_94"
#[16] "AM_BO_AB_94" "AN_JA_AN_91" "AN_KL_GE_11" "AN_KL_WO_91" "AN_MA_DI_95"
#[21] "AN_MA_DI_95" "AN_SE_RA_95" "AN_SE_RA_95" "AN_SI_RA_97" "AN_SO_PU_94"
#[26] "AN_SU_RA_91" "AR_BO_RA_92" "AR_KA_VI_94" "AR_KA_VI_94" "AS_AR_SO_90"
#[31] "AS_AR_SU_95" "AS_KU_SO_90" "AS_MO_AS_97" "AW_SI_OJ_97" "AW_SI_OJ_97"
#[36] "AY_CH_SU_97" "BH_BE_LD_84" "BH_BE_LI_83" "BH_BE_LI_83" "BH_BE_LI_84"
#[41] "BH_KO_SA_87" "BH_PE_AB_89" "BH_YA_SA_87" "BI_CH_PR_94" "BI_CH_PR_94"
I think this might work. Create a proper lookup table with a column of correct codes and a column of incorrect codes, i.e. stack the columns; then join the resulting df3 to df1 and use coalesce to create a new participant ID. (You spelt participant wrong, which made me feel more human; I always do that.)
library(dplyr)
df3 <- df2[1:2] %>%
  bind_rows(df2[c(1, 3)] %>% rename(T2 = T3),
            df2[c(1, 4)] %>% rename(T2 = T4)) %>%
  distinct()

df1 %>%
  left_join(df3, by = c("Partcipant_ID" = "T2")) %>%
  mutate(Goal_ID = coalesce(Goal_ID, Partcipant_ID)) %>%
  select(Goal_ID, Partcipant_ID, Start_T2, End_T2)
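The stacking step could equally be written with tidyr::pivot_longer; this is just a stylistic alternative to the bind_rows/rename chain above:
library(tidyr)

# same lookup table: one row per (correct ID, possibly wrong ID) pair
df3 <- df2 %>%
  pivot_longer(-Goal_ID, values_to = "old_id") %>%
  filter(!is.na(old_id)) %>%
  distinct(Goal_ID, old_id)
The join then uses by = c("Partcipant_ID" = "old_id") instead of "T2".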
I'm trying to calculate the adjusted Sharpe ratio for a portfolio of two or more assets. What I need to do is:
First, I load the two CSV files I generated:
structure(list(X.1 = 1:50, X = 1:50, date = structure(1:50, .Label = c("2019-07-01", "2019-07-02", "2019-07-03", "2019-07-05",
"2019-07-08", "2019-07-09", "2019-07-10", "2019-07-11", "2019-07-12",
"2019-07-15", "2019-07-16", "2019-07-17", "2019-07-18", "2019-07-19",
"2019-07-22", "2019-07-23", "2019-07-24", "2019-07-25", "2019-07-26",
"2019-07-29", "2019-07-30", "2019-07-31", "2019-08-01", "2019-08-02",
"2019-08-05", "2019-08-06", "2019-08-07", "2019-08-08", "2019-08-09",
"2019-08-12", "2019-08-13", "2019-08-14", "2019-08-15", "2019-08-16",
"2019-08-19", "2019-08-20", "2019-08-21", "2019-08-22", "2019-08-23",
"2019-08-26", "2019-08-27", "2019-08-28", "2019-08-29", "2019-08-30",
"2019-09-03", "2019-09-04", "2019-09-05", "2019-09-06", "2019-09-09",
"2019-09-10"), class = "factor"),
adjClose = c(130.8539817206, 131.863291017, 132.8033339891,
131.041990315, 131.2201037202, 131.1706277743, 130.4482789642,
128.7067256684, 128.8551535061, 129.5972926947, 129.2113803166,
130.6066019911, 130.6164971802, 130.3097463156, 130.5571260452,
129.8446724242, 130.4086982074, 129.7853012891, 130.0920521537,
130.1316329104, 130.4482789642, 131.4971690173, 134.103513361,
135.3428257349, 137.6826474969, 138.783156885, 138.83272938,
139.1301643497, 138.852558378, 141.7475920835, 141.2617816329,
144.4443358092, 146.0504846459, 144.8805737649, 142.8084434756,
144.2857038254, 143.3239974232, 142.37220552, 144.712027282,
144.1270718415, 146.3479196156, 146.5362950965, 145.9810831529,
146.0207411489, 146.2092980651, 146.4278025524, 143.7759526384,
144.7989509198, 142.2265571831, 139.7336196235), lagx = c(NA,
130.8539817206, 131.863291017, 132.8033339891, 131.041990315,
131.2201037202, 131.1706277743, 130.4482789642, 128.7067256684,
128.8551535061, 129.5972926947, 129.2113803166, 130.6066019911,
130.6164971802, 130.3097463156, 130.5571260452, 129.8446724242,
130.4086982074, 129.7853012891, 130.0920521537, 130.1316329104,
130.4482789642, 131.4971690173, 134.103513361, 135.3428257349,
137.6826474969, 138.783156885, 138.83272938, 139.1301643497,
138.852558378, 141.7475920835, 141.2617816329, 144.4443358092,
146.0504846459, 144.8805737649, 142.8084434756, 144.2857038254,
143.3239974232, 142.37220552, 144.712027282, 144.1270718415,
146.3479196156, 146.5362950965, 145.9810831529, 146.0207411489,
146.2092980651, 146.4278025524, 143.7759526384, 144.7989509198,
142.2265571831), pct_change = c(NA, 0.0076542098, 0.0070784591,
-0.0134410632, 0.0013573637, -0.0003771877, -0.0055374346,
-0.0135311755, 0.0011518968, 0.0057265023, -0.0029866748,
0.0106826275, 7.57575751427131e-05, -0.0023540132, 0.0018948007,
-0.0054869685, 0.0043250626, -0.0048032937, 0.0023579524,
0.0003041594, 0.0024273686, 0.0079765219, 0.0194353174, 0.0091568383,
0.0169943112, 0.0079297042, 0.0003570663, 0.002137818, -0.001999286,
0.0204238651, -0.0034390792, 0.0220330839, 0.0109972168,
-0.0080750017, -0.0145098584, 0.0102384388, -0.0067100166,
-0.0066852368, 0.0161688134, -0.0040586091, 0.0151751236,
0.001285521, -0.0038033143, 0.0002715915, 0.001289637, 0.0014922336,
-0.0184443216, 0.0070649564, -0.0180865922, -0.0178406425
), rollmeanx = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0.0022449273, 0.0020327583, 0.0016836143, 0.0029861115,
0.002801633, 0.0036635665, 0.0042995146, 0.0045093675, 0.0039069923,
0.0040805283, 0.0039373228, 0.0032693281, 0.0038882917, 0.0038227304,
0.004333512, 0.0045939924, 0.0042813625, 0.0044765504, 0.0044354613,
0.0044811565, 0.0036783992, 0.003643339, 0.0022001886, 0.0011618239
), rollsdx = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
0.0077480465, 0.0076691234, 0.0076367931, 0.0078392534, 0.0079349676,
0.0087510466, 0.0086559326, 0.0082636055, 0.0090514164, 0.0091306015,
0.0092739515, 0.0093933857, 0.0096997509, 0.0097492652, 0.0099891559,
0.0098095162, 0.0099469924, 0.0098104499, 0.0098219087, 0.0098046681,
0.0107849756, 0.010771918, 0.01108063, 0.0116528378), roll_sharpe = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.2897410646, 0.2650574546,
0.2204609035, 0.380917847, 0.3530742789, 0.4186432387, 0.4967130412,
0.5456900702, 0.4316442951, 0.4469068418, 0.4245571962, 0.3480457574,
0.4008651116, 0.3921044585, 0.4338216423, 0.468319975, 0.4304177896,
0.4563042904, 0.4515885332, 0.4570431621, 0.3410669895, 0.3382256492,
0.1985616886, 0.0997030911)), class = "data.frame", row.names = c(NA,
-50L))
structure(list(X = 1:49, date = structure(1:49, .Label = c("2019-01-02",
"2019-01-03", "2019-01-04", "2019-01-07", "2019-01-08", "2019-01-09",
"2019-01-10", "2019-01-11", "2019-01-14", "2019-01-15", "2019-01-16",
"2019-01-17", "2019-01-18", "2019-01-22", "2019-01-23", "2019-01-24",
"2019-01-25", "2019-01-28", "2019-01-29", "2019-01-30", "2019-01-31",
"2019-02-01", "2019-02-04", "2019-02-05", "2019-02-06", "2019-02-07",
"2019-02-08", "2019-02-11", "2019-02-12", "2019-02-13", "2019-02-14",
"2019-02-15", "2019-02-19", "2019-02-20", "2019-02-21", "2019-02-22",
"2019-02-25", "2019-02-26", "2019-02-27", "2019-02-28", "2019-03-01",
"2019-03-04", "2019-03-05", "2019-03-06", "2019-03-07", "2019-03-08",
"2019-03-11", "2019-03-12", "2019-03-13"), class = "factor"),
adjClose = c(107.6401844169, 108.2682817731, 108.0425592857,
107.9738611374, 108.0621873281, 108.1897696036, 107.9346050527,
108.2192116672, 108.0229312434, 107.9247910315, 107.9149770103,
107.8266508196, 107.6990685441, 107.7677666924, 107.6401844169,
107.8070227772, 107.6401844169, 107.4929740991, 107.8070227772,
108.3958640486, 109.0043333624, 108.621586536, 108.4056780698,
108.5528883877, 108.4841902393, 108.6510285996, 108.7000987055,
108.4743762181, 108.4940042605, 108.4743762181, 108.778610875,
108.778610875, 109.033775426, 109.0730315107, 108.8865651081,
109.1711717227, 109.0926595531, 109.2496838922, 108.9945193412,
108.8178669598, 108.6019584936, 108.7589828327, 108.7197267479,
108.8473090234, 109.1515436803, 109.2496838922, 109.2300558498,
109.4361502948, 109.5637325703), lagx = c(NA, 107.6401844169,
108.2682817731, 108.0425592857, 107.9738611374, 108.0621873281,
108.1897696036, 107.9346050527, 108.2192116672, 108.0229312434,
107.9247910315, 107.9149770103, 107.8266508196, 107.6990685441,
107.7677666924, 107.6401844169, 107.8070227772, 107.6401844169,
107.4929740991, 107.8070227772, 108.3958640486, 109.0043333624,
108.621586536, 108.4056780698, 108.5528883877, 108.4841902393,
108.6510285996, 108.7000987055, 108.4743762181, 108.4940042605,
108.4743762181, 108.778610875, 108.778610875, 109.033775426,
109.0730315107, 108.8865651081, 109.1711717227, 109.0926595531,
109.2496838922, 108.9945193412, 108.8178669598, 108.6019584936,
108.7589828327, 108.7197267479, 108.8473090234, 109.1515436803,
109.2496838922, 109.2300558498, 109.4361502948), pct_change = c(NA,
0.00580130529379156, -0.00208919974584387, -0.000636247954609872,
0.000817364453597674, 0.00117924528324126, -0.00236406619337162,
0.0026299084064227, -0.00181702552912345, -0.000909338910569157,
-9.09421608741455e-05, -0.000819149904301271, -0.00118461818866857,
0.000637464711466883, -0.00118526622925381, 0.00154756486175119,
-0.00154996353084853, -0.00136948781102927, 0.00291306326814199,
0.00543232231753777, 0.0055820653641086, -0.00352367184650867,
-0.00199167119328355, 0.00135611608393343, -0.000633254930957689,
0.00153554331192595, 0.000451426507283543, -0.0020808830183651,
0.000180913613925363, -0.000180946349583393, 0.00279682425113514,
0, 0.00234023402384323, 0.000359906423762952, -0.0017124831003245,
0.00260697590864838, -0.000719683340030535, 0.00143729788046738,
-0.00234107689581361, -0.00162337662311708, -0.00198807157066812,
0.00144378271118579, -0.00036107600685043, 0.001172121540208,
0.00278726847685346, 0.000898311174948939, -0.000179694519491878,
0.00188323917137825, 0.0011644571840243), rollmeanx = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 0.00031862067098972, 0.000116566615996168,
0.000182745925763722, 0.000281463710606259, 0.000264830167591981,
0.000116642517518964, 0.000232323417850645, 0.000104557292577641,
0.00031427773713485, 0.000355611323978903, 0.000466119332375147,
0.000519712801832612, 0.000495718942211888, 0.00058524217844741,
0.000606405037048468, 0.000601392901535568, 0.000565433203128064,
0.000553892802578618, 0.000331113946269068, 0.000149816691434887,
-0.00012032609815416, 9.31190557875071e-05, 0.00031034358624828,
0.000289534272203531, 0.000310150654542886, 0.000325955011790718,
0.000358365497097116), rollsdx = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0.00268140767467829, 0.00240146073630199, 0.00235743767997561,
0.00236696184695034, 0.0023643008327706, 0.00240605774759231,
0.00234142001164779, 0.00228025110555633, 0.00230711786187933,
0.00229224984050652, 0.0023280202990145, 0.00231052893570155,
0.00233171418775393, 0.00237482539929513, 0.00236032672014123,
0.00235834925288879, 0.00239840602803656, 0.0024087481448098,
0.00240681026318947, 0.00213964886031918, 0.00176319827198963,
0.00160907538291628, 0.00163656991965793, 0.00162551270011354,
0.00161610064622736, 0.00163029258958719, 0.00163996465710043
), roll_sharpe = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.118825896561196,
0.0485398800130578, 0.0775188787877576, 0.11891349705062,
0.112012043442729, 0.0484786857820375, 0.0992232989787878,
0.0458534116364924, 0.136220928426624, 0.155136372002243,
0.200221335085637, 0.224932392666535, 0.212598501486754,
0.246435876347421, 0.256915719283212, 0.255005869380419,
0.235753744994942, 0.229950484351013, 0.137573763637803,
0.0700192887783178, -0.0682430898814242, 0.0578711580427878,
0.189630508614717, 0.178118738895924, 0.191912957442906,
0.199936510705267, 0.218520256241821)), class = "data.frame", row.names = c(NA,
-49L))
Then I choose between the two ETFs according to the highest adjusted Sharpe ratio on the last day of the month. (?)
Then I create a new CSV file with the same columns, calculated every month for the top-ranked ETF (on a daily basis). (?)
The desired output is a file like those above, with the data of the top-ranked ETF.
I have no idea how to proceed. This is what I have for finding the last day of the month:
library(tidyverse)  # also loads dplyr
library(roll)
library(quantmod)
library(httr)
library(jsonlite)
library(hrbrthemes)
library(xts)
library(timeDate)

xdf <- read.csv('tip.csv')
xdf$eom <- timeLastDayInMonth(xdf$somedate)
????
xdf_02 <- xdf %>%
  mutate(lagx = lag(adjClose)) %>%
  mutate(pct_change = (adjClose - lagx) / adjClose) %>%
  mutate(rollmeanx = roll_mean(pct_change, width = 22),
         rollsdx = roll_sd(pct_change, width = 22)) %>%
  mutate(roll_sharpe = rollmeanx / rollsdx)
write.csv(xdf_02,'tip_r.csv')
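For the monthly selection step, this is roughly what I imagine, although I could not make it work (an untested sketch; the function name, the lubridate month grouping, and treating the last available row as the last trading day are all my guesses):
library(lubridate)

pick_monthly_winner <- function(etf_a, etf_b) {
  bind_rows(a = etf_a, b = etf_b, .id = "etf") %>%
    mutate(date  = as.Date(date),
           month = floor_date(date, "month")) %>%
    group_by(etf, month) %>%
    filter(date == max(date)) %>%        # last available day per ETF and month
    group_by(month) %>%
    filter(!is.na(roll_sharpe)) %>%
    slice_max(roll_sharpe, n = 1) %>%    # highest rolling Sharpe wins the month
    ungroup()
}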
Thanks to all contributors.
I want to run Pearson correlations of each row of a matrix (dat) vs a vector (v1), as part of a loop, and output the correlation coefficients and associated p-values in a table. Here is an example for random data (data pasted at the end):
result_table <- data.frame(matrix(ncol = 2, nrow = nrow(dat)))
colnames(result_table) <- c("correlation_coefficient", "pvalue")
for(i in 1:nrow(dat)){
print(i)
corr <- cor.test(as.numeric(dat[i,]), v1, na.action = "na.omit")
result_table[i,1] <- corr$estimate
result_table[i,2] <- corr$p.value
}
When cor.test() removes missing data, sometimes there are not enough observations remaining and the loop stops with an error (for example at row 11). I would like the loop to continue running, just leaving the values in the result table as NAs. I think the result table should then look like this:
> result_table
correlation_coefficient pvalue
1 0.68422642 0.04206591
2 -0.15895586 0.70694013
3 -0.37005028 0.53982309
4 0.08448970 0.89255250
5 0.86860091 0.05603661
6 0.19544883 0.75274040
7 -0.94695380 0.01454887
8 -0.03817885 0.94275955
9 -0.15214122 0.77354897
10 -0.22997890 0.70978386
11 NA NA
12 NA NA
13 -0.27769887 0.59415930
14 -0.09768153 0.81800885
15 -0.20986632 0.61790214
16 -0.40474976 0.31990456
17 -0.00605937 0.98863896
18 0.02176976 0.95919460
19 -0.14755097 0.72733118
20 -0.25830856 0.50216600
I would also like the errors to keep being printed.
Here is the data:
> dput(v1)
c(-0.840396, 0.4746047, -1.101857, 0.5164767, 1.2203134, -0.9758888,
-0.3657913, -0.6272523, -0.5853803, 1.7367901)
> dput(dat)
structure(list(s1 = c(-0.52411895, 0.14709633, 0.05433954, 0.7504406,
-0.59971988, -0.59679685, -0.12571854, 0.73289705, -0.71668771,
-0.04813957, -0.67849896, -0.11947141, -0.26371884, -1.34137162,
2.60928064, -1.23397547, 0.51811222, -4.10759883, -0.70127093,
7.51914575), s2 = c(0.21446623, -0.27281487, NA, NA, NA, NA,
NA, NA, -0.62468391, NA, NA, NA, -3.84387999, 0.64010069, NA,
NA, NA, NA, NA, NA), s3 = c(0.3461212, 0.279062, NA, NA, NA,
-0.4737744, 0.6313365, -2.8472641, 1.2647846, 2.2524449, -0.7913039,
-0.752590307, -3.535815266, 1.692385187, 3.55789764, -1.694910854,
-3.624517121, -4.963855198, 2.395998161, 5.35680032), s4 = c(0.3579742,
0.3522745, -1.1720907, 0.4223402, 0.146605, -0.3175295, -1.383926807,
-0.688551166, NA, NA, NA, NA, NA, 0.703612974, 1.79890268, -2.625404608,
-3.235884921, -2.845474098, 0.058650461, 1.83900702), s5 = c(1.698104376,
NA, NA, NA, NA, NA, -1.488000007, -0.739488766, 0.276012387,
0.49344994, NA, NA, -1.417434166, -0.644962513, 0.04010434, -3.388182254,
2.900252493, -1.493417096, -2.852256003, -0.98871696), s6 = c(0.3419271,
0.2482013, -1.2230283, 0.270752, -0.6653978, -1.1357202, NA,
NA, NA, NA, NA, NA, NA, NA, -1.0288213, -1.17817328, 6.1682455,
1.02759131, -3.80372867, -2.6249692), s7 = c(0.3957243, 0.8758406,
NA, NA, NA, NA, NA, 0.60196247, -1.28631859, -0.5754757, NA,
NA, NA, NA, NA, NA, NA, NA, NA, -2.6303001), s8 = c(-0.26409595,
1.2643281, 0.05687957, -0.09459169, -0.7875279, NA, NA, NA, NA,
NA, NA, NA, 2.42442997, -0.00445559, -1.0341522, 2.47315322,
0.1190265, 5.82533417, 0.82239131, -0.8279679), s9 = c(0.237123,
-0.5004619, 0.4447322, -0.2155249, -0.2331443, 1.3438071, -0.3817672,
1.9228182, 0.305661, -0.01348, NA, NA, 3.4009042, 0.8268469,
0.2061843, -1.1228663, -0.1443778, 4.8789902, 1.3480328, 0.4258486
), s10 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
0.5211859, 0.2196643, -1.2333367, 0.1186947, 1.478086, 0.5211859,
0.2196643)), .Names = c("s1", "s2", "s3", "s4", "s5", "s6", "s7",
"s8", "s9", "s10"), class = "data.frame", row.names = c(NA, -20L
))
A solution with tryCatch could be:
for(i in 1:nrow(dat)){
  print(i)
  # return NA instead of stopping when cor.test() fails
  corr <- tryCatch(cor.test(as.numeric(dat[i, ]), v1, na.action = "na.omit"),
                   error = function(e) return(NA))
  if(length(corr) == 1){  # NA has length 1; a successful htest object does not
    result_table[i, 1] <- NA
    result_table[i, 2] <- NA
  }else{
    result_table[i, 1] <- corr$estimate
    result_table[i, 2] <- corr$p.value
  }
}
Here is another solution with tryCatch(), replacing the for loop with:
for(i in 1:nrow(dat)){
  tryCatch({
    print(i)
    corr <- cor.test(as.numeric(dat[i, ]), v1, na.action = "na.omit")
    result_table[i, 1] <- corr$estimate
    result_table[i, 2] <- corr$p.value
  }, error = function(e){cat("ERROR :", conditionMessage(e), "\n")})  # print the error and carry on
}
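The same tryCatch idea can also be packaged into a small helper that returns a pair of NAs on failure, which keeps the error messages and avoids the manual index bookkeeping (just an alternative sketch; safe_cor is my own name):
safe_cor <- function(x, v) {
  tryCatch({
    ct <- cor.test(x, v, na.action = "na.omit")
    c(unname(ct$estimate), ct$p.value)  # coefficient and p-value
  }, error = function(e) {
    message("ERROR: ", conditionMessage(e))  # keep reporting the errors
    c(NA_real_, NA_real_)
  })
}

result_table <- as.data.frame(t(vapply(seq_len(nrow(dat)),
                                       function(i) safe_cor(as.numeric(dat[i, ]), v1),
                                       numeric(2))))
colnames(result_table) <- c("correlation_coefficient", "pvalue")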
I can see that there are a lot of questions similar to this, but I can't find a solution for my particular problem.
Data:
risk_accum <- structure(list(date = structure(c(1465948800, 1465952400, 1465956000,
1465959600, 1465963200, 1465966800, 1465970400, 1465974000, 1465977600,
1465981200, 1465984800, 1465988400, 1465992000, 1465995600, 1465999200,
1466002800, 1466006400, 1466010000, 1466013600, 1466017200, 1466020800,
1466024400, 1466028000, 1466031600, 1466035200, 1466038800, 1466042400,
1466046000, 1466049600, 1466053200, 1466056800, 1466060400, 1466064000,
1466067600, 1466071200, 1466074800, 1466078400, 1466082000, 1466085600,
1466089200, 1466092800, 1466096400, 1466100000, 1466103600, 1466107200,
1466110800, 1466114400, 1466118000, 1466121600, 1466125200, 1466128800,
1466132400, 1466136000, 1466139600, 1466143200, 1466146800, 1466150400,
1466154000, 1466157600, 1466161200, 1466164800, 1466168400, 1466172000,
1466175600, 1466179200, 1466182800, 1466186400, 1466190000, 1466193600,
1466197200, 1466200800, 1466204400, 1466208000, 1466211600, 1466215200,
1466218800, 1466222400, 1466226000, 1466229600, 1466233200, 1466236800,
1466240400, 1466244000, 1466247600, 1466251200, 1466254800, 1466258400,
1466262000, 1466265600, 1466269200, 1466272800, 1466276400, 1466280000,
1466283600, 1466287200, 1466290800, 1466294400, 1466298000, 1466301600,
1466305200, 1466308800, 1466312400, 1466316000, 1466319600, 1466323200,
1466326800, 1466330400, 1466334000, 1466337600, 1466341200, 1466344800,
1466348400, 1466352000, 1466355600, 1466359200, 1466362800, 1466366400,
1466370000, 1466373600, 1466377200), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), risk = c(NA, NA, NA, 1, 2, 3, 4, 5, 6, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 2, 3, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, NA, NA)), .Names = c("date",
"risk"), row.names = c(NA, -120L), class = c("tbl_df", "tbl",
"data.frame"))
And the code to generate the graph:
# color variable
color_var <- vector(mode = "double", length = length(risk_accum$risk))
color_var[color_var == 0] <- NA
color_var[risk_accum$risk < 6] <- "green4"
color_var[risk_accum$risk >= 6 & risk_accum$risk < 12] <- "yellow2"
color_var[risk_accum$risk >= 12] <- "red"

# plot of Effective Blight Hours accumulation
ggplot(risk_accum) +
  geom_line(aes(x = date, y = risk), color = color_var) +
  scale_y_continuous(name = "EBH accumulation") +
  scale_colour_manual(values = c("green", "yellow", "red")) +
  geom_line(aes(date, y = 12), linetype = "dotted", size = 0.1) +
  theme(axis.title.x = element_blank())
I need a legend that explains the traffic-light system (red is danger, etc.) and the manually added risk-threshold line.
Add your color variable to the dataset, map to that variable inside aes, and use scale_*_identity to directly use the colors.
risk_accum$color_var <- NA
risk_accum$color_var[risk_accum$risk < 6] <- "green4"
risk_accum$color_var[risk_accum$risk >= 6 & risk_accum$risk < 12] <- "yellow2"
risk_accum$color_var[risk_accum$risk >= 12] <- "red"

ggplot(risk_accum) +
  geom_line(aes(x = date, y = risk, color = color_var)) +
  scale_y_continuous(name = "EBH accumulation") +
  scale_color_identity(guide = 'legend') +
  geom_line(aes(date, y = 12), linetype = "dotted", size = 0.1) +
  theme(axis.title.x = element_blank())
You can also add your threshold to the legend:
ggplot(risk_accum) +
  geom_line(aes(x = date, y = risk, color = color_var)) +
  geom_line(aes(date, y = 12, linetype = "threshold"), size = 0.1) +
  scale_y_continuous(name = "EBH accumulation") +
  scale_color_identity(guide = 'legend') +
  scale_linetype_manual(values = 2) +
  theme(axis.title.x = element_blank())
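If you would rather show readable labels than the raw colour names, scale_color_identity also accepts breaks and labels (the label wording here is only a suggestion):
ggplot(risk_accum) +
  geom_line(aes(x = date, y = risk, color = color_var)) +
  geom_line(aes(date, y = 12, linetype = "threshold"), size = 0.1) +
  scale_y_continuous(name = "EBH accumulation") +
  scale_color_identity(guide = 'legend',
                       breaks = c("green4", "yellow2", "red"),
                       labels = c("low risk", "moderate risk", "danger")) +
  scale_linetype_manual(values = 2) +
  theme(axis.title.x = element_blank())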