create an etf portfolio csv data - r

I'm trying to calculate the adjusted sharpe ratio for a portfolio of two or more assets. what I need to do is:
I load the two csv files I generated.
structure(list(X.1 = 1:50, X = 1:50, date = structure(1:50, .Label = c("2019-07-01", "2019-07-02", "2019-07-03", "2019-07-05",
"2019-07-08", "2019-07-09", "2019-07-10", "2019-07-11", "2019-07-12",
"2019-07-15", "2019-07-16", "2019-07-17", "2019-07-18", "2019-07-19",
"2019-07-22", "2019-07-23", "2019-07-24", "2019-07-25", "2019-07-26",
"2019-07-29", "2019-07-30", "2019-07-31", "2019-08-01", "2019-08-02",
"2019-08-05", "2019-08-06", "2019-08-07", "2019-08-08", "2019-08-09",
"2019-08-12", "2019-08-13", "2019-08-14", "2019-08-15", "2019-08-16",
"2019-08-19", "2019-08-20", "2019-08-21", "2019-08-22", "2019-08-23",
"2019-08-26", "2019-08-27", "2019-08-28", "2019-08-29", "2019-08-30",
"2019-09-03", "2019-09-04", "2019-09-05", "2019-09-06", "2019-09-09",
"2019-09-10"), class = "factor"),
adjClose = c(130.8539817206, 131.863291017, 132.8033339891,
131.041990315, 131.2201037202, 131.1706277743, 130.4482789642,
128.7067256684, 128.8551535061, 129.5972926947, 129.2113803166,
130.6066019911, 130.6164971802, 130.3097463156, 130.5571260452,
129.8446724242, 130.4086982074, 129.7853012891, 130.0920521537,
130.1316329104, 130.4482789642, 131.4971690173, 134.103513361,
135.3428257349, 137.6826474969, 138.783156885, 138.83272938,
139.1301643497, 138.852558378, 141.7475920835, 141.2617816329,
144.4443358092, 146.0504846459, 144.8805737649, 142.8084434756,
144.2857038254, 143.3239974232, 142.37220552, 144.712027282,
144.1270718415, 146.3479196156, 146.5362950965, 145.9810831529,
146.0207411489, 146.2092980651, 146.4278025524, 143.7759526384,
144.7989509198, 142.2265571831, 139.7336196235), lagx = c(NA,
130.8539817206, 131.863291017, 132.8033339891, 131.041990315,
131.2201037202, 131.1706277743, 130.4482789642, 128.7067256684,
128.8551535061, 129.5972926947, 129.2113803166, 130.6066019911,
130.6164971802, 130.3097463156, 130.5571260452, 129.8446724242,
130.4086982074, 129.7853012891, 130.0920521537, 130.1316329104,
130.4482789642, 131.4971690173, 134.103513361, 135.3428257349,
137.6826474969, 138.783156885, 138.83272938, 139.1301643497,
138.852558378, 141.7475920835, 141.2617816329, 144.4443358092,
146.0504846459, 144.8805737649, 142.8084434756, 144.2857038254,
143.3239974232, 142.37220552, 144.712027282, 144.1270718415,
146.3479196156, 146.5362950965, 145.9810831529, 146.0207411489,
146.2092980651, 146.4278025524, 143.7759526384, 144.7989509198,
142.2265571831), pct_change = c(NA, 0.0076542098, 0.0070784591,
-0.0134410632, 0.0013573637, -0.0003771877, -0.0055374346,
-0.0135311755, 0.0011518968, 0.0057265023, -0.0029866748,
0.0106826275, 7.57575751427131e-05, -0.0023540132, 0.0018948007,
-0.0054869685, 0.0043250626, -0.0048032937, 0.0023579524,
0.0003041594, 0.0024273686, 0.0079765219, 0.0194353174, 0.0091568383,
0.0169943112, 0.0079297042, 0.0003570663, 0.002137818, -0.001999286,
0.0204238651, -0.0034390792, 0.0220330839, 0.0109972168,
-0.0080750017, -0.0145098584, 0.0102384388, -0.0067100166,
-0.0066852368, 0.0161688134, -0.0040586091, 0.0151751236,
0.001285521, -0.0038033143, 0.0002715915, 0.001289637, 0.0014922336,
-0.0184443216, 0.0070649564, -0.0180865922, -0.0178406425
), rollmeanx = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0.0022449273, 0.0020327583, 0.0016836143, 0.0029861115,
0.002801633, 0.0036635665, 0.0042995146, 0.0045093675, 0.0039069923,
0.0040805283, 0.0039373228, 0.0032693281, 0.0038882917, 0.0038227304,
0.004333512, 0.0045939924, 0.0042813625, 0.0044765504, 0.0044354613,
0.0044811565, 0.0036783992, 0.003643339, 0.0022001886, 0.0011618239
), rollsdx = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
0.0077480465, 0.0076691234, 0.0076367931, 0.0078392534, 0.0079349676,
0.0087510466, 0.0086559326, 0.0082636055, 0.0090514164, 0.0091306015,
0.0092739515, 0.0093933857, 0.0096997509, 0.0097492652, 0.0099891559,
0.0098095162, 0.0099469924, 0.0098104499, 0.0098219087, 0.0098046681,
0.0107849756, 0.010771918, 0.01108063, 0.0116528378), roll_sharpe = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.2897410646, 0.2650574546,
0.2204609035, 0.380917847, 0.3530742789, 0.4186432387, 0.4967130412,
0.5456900702, 0.4316442951, 0.4469068418, 0.4245571962, 0.3480457574,
0.4008651116, 0.3921044585, 0.4338216423, 0.468319975, 0.4304177896,
0.4563042904, 0.4515885332, 0.4570431621, 0.3410669895, 0.3382256492,
0.1985616886, 0.0997030911)), class = "data.frame", row.names = c(NA,
-50L))
structure(list(X = 1:49, date = structure(1:49, .Label = c("2019-01-02",
"2019-01-03", "2019-01-04", "2019-01-07", "2019-01-08", "2019-01-09",
"2019-01-10", "2019-01-11", "2019-01-14", "2019-01-15", "2019-01-16",
"2019-01-17", "2019-01-18", "2019-01-22", "2019-01-23", "2019-01-24",
"2019-01-25", "2019-01-28", "2019-01-29", "2019-01-30", "2019-01-31",
"2019-02-01", "2019-02-04", "2019-02-05", "2019-02-06", "2019-02-07",
"2019-02-08", "2019-02-11", "2019-02-12", "2019-02-13", "2019-02-14",
"2019-02-15", "2019-02-19", "2019-02-20", "2019-02-21", "2019-02-22",
"2019-02-25", "2019-02-26", "2019-02-27", "2019-02-28", "2019-03-01",
"2019-03-04", "2019-03-05", "2019-03-06", "2019-03-07", "2019-03-08",
"2019-03-11", "2019-03-12", "2019-03-13"), class = "factor"),
adjClose = c(107.6401844169, 108.2682817731, 108.0425592857,
107.9738611374, 108.0621873281, 108.1897696036, 107.9346050527,
108.2192116672, 108.0229312434, 107.9247910315, 107.9149770103,
107.8266508196, 107.6990685441, 107.7677666924, 107.6401844169,
107.8070227772, 107.6401844169, 107.4929740991, 107.8070227772,
108.3958640486, 109.0043333624, 108.621586536, 108.4056780698,
108.5528883877, 108.4841902393, 108.6510285996, 108.7000987055,
108.4743762181, 108.4940042605, 108.4743762181, 108.778610875,
108.778610875, 109.033775426, 109.0730315107, 108.8865651081,
109.1711717227, 109.0926595531, 109.2496838922, 108.9945193412,
108.8178669598, 108.6019584936, 108.7589828327, 108.7197267479,
108.8473090234, 109.1515436803, 109.2496838922, 109.2300558498,
109.4361502948, 109.5637325703), lagx = c(NA, 107.6401844169,
108.2682817731, 108.0425592857, 107.9738611374, 108.0621873281,
108.1897696036, 107.9346050527, 108.2192116672, 108.0229312434,
107.9247910315, 107.9149770103, 107.8266508196, 107.6990685441,
107.7677666924, 107.6401844169, 107.8070227772, 107.6401844169,
107.4929740991, 107.8070227772, 108.3958640486, 109.0043333624,
108.621586536, 108.4056780698, 108.5528883877, 108.4841902393,
108.6510285996, 108.7000987055, 108.4743762181, 108.4940042605,
108.4743762181, 108.778610875, 108.778610875, 109.033775426,
109.0730315107, 108.8865651081, 109.1711717227, 109.0926595531,
109.2496838922, 108.9945193412, 108.8178669598, 108.6019584936,
108.7589828327, 108.7197267479, 108.8473090234, 109.1515436803,
109.2496838922, 109.2300558498, 109.4361502948), pct_change = c(NA,
0.00580130529379156, -0.00208919974584387, -0.000636247954609872,
0.000817364453597674, 0.00117924528324126, -0.00236406619337162,
0.0026299084064227, -0.00181702552912345, -0.000909338910569157,
-9.09421608741455e-05, -0.000819149904301271, -0.00118461818866857,
0.000637464711466883, -0.00118526622925381, 0.00154756486175119,
-0.00154996353084853, -0.00136948781102927, 0.00291306326814199,
0.00543232231753777, 0.0055820653641086, -0.00352367184650867,
-0.00199167119328355, 0.00135611608393343, -0.000633254930957689,
0.00153554331192595, 0.000451426507283543, -0.0020808830183651,
0.000180913613925363, -0.000180946349583393, 0.00279682425113514,
0, 0.00234023402384323, 0.000359906423762952, -0.0017124831003245,
0.00260697590864838, -0.000719683340030535, 0.00143729788046738,
-0.00234107689581361, -0.00162337662311708, -0.00198807157066812,
0.00144378271118579, -0.00036107600685043, 0.001172121540208,
0.00278726847685346, 0.000898311174948939, -0.000179694519491878,
0.00188323917137825, 0.0011644571840243), rollmeanx = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 0.00031862067098972, 0.000116566615996168,
0.000182745925763722, 0.000281463710606259, 0.000264830167591981,
0.000116642517518964, 0.000232323417850645, 0.000104557292577641,
0.00031427773713485, 0.000355611323978903, 0.000466119332375147,
0.000519712801832612, 0.000495718942211888, 0.00058524217844741,
0.000606405037048468, 0.000601392901535568, 0.000565433203128064,
0.000553892802578618, 0.000331113946269068, 0.000149816691434887,
-0.00012032609815416, 9.31190557875071e-05, 0.00031034358624828,
0.000289534272203531, 0.000310150654542886, 0.000325955011790718,
0.000358365497097116), rollsdx = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0.00268140767467829, 0.00240146073630199, 0.00235743767997561,
0.00236696184695034, 0.0023643008327706, 0.00240605774759231,
0.00234142001164779, 0.00228025110555633, 0.00230711786187933,
0.00229224984050652, 0.0023280202990145, 0.00231052893570155,
0.00233171418775393, 0.00237482539929513, 0.00236032672014123,
0.00235834925288879, 0.00239840602803656, 0.0024087481448098,
0.00240681026318947, 0.00213964886031918, 0.00176319827198963,
0.00160907538291628, 0.00163656991965793, 0.00162551270011354,
0.00161610064622736, 0.00163029258958719, 0.00163996465710043
), roll_sharpe = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.118825896561196,
0.0485398800130578, 0.0775188787877576, 0.11891349705062,
0.112012043442729, 0.0484786857820375, 0.0992232989787878,
0.0458534116364924, 0.136220928426624, 0.155136372002243,
0.200221335085637, 0.224932392666535, 0.212598501486754,
0.246435876347421, 0.256915719283212, 0.255005869380419,
0.235753744994942, 0.229950484351013, 0.137573763637803,
0.0700192887783178, -0.0682430898814242, 0.0578711580427878,
0.189630508614717, 0.178118738895924, 0.191912957442906,
0.199936510705267, 0.218520256241821)), class = "data.frame", row.names = c(NA,
-49L))
I choose between the two etfs according to the highest adjusted sharpe ratio on the last day of the month. (?)
I create a new csv file with the same columns calculated every month for the top ranked etf (on a daily basis). (?)
Desidered output is a file like those above, with data of top ranked etf
I have no idea about how to proceed. This for finding the last month day:
library(tidyverse)
library(roll)
library(quantmod)
library(httr)
library(jsonlite)
library(tidyverse)
library(hrbrthemes)
library(dplyr)
library(xts)
xdf <- data.frame()
xdf <- read.csv('tip.csv')
library(timeDate)
xdf$eom<-timeLastDayInMonth(xdf$somedate)
????
xdf_02 <- xdf %>% mutate(lagx = lag(adjClose)) %>%
mutate(pct_change = (adjClose - lagx)/adjClose)%>%
mutate(rollmeanx = roll_mean(pct_change, width = 22),rollsdx = roll_sd(pct_change, width=22)) %>%
mutate(roll_sharpe = rollmeanx / rollsdx)
write.csv(xdf_02,'tip_r.csv')
Tnx to all contributors

Related

Average columns with similar headers using R [duplicate]

This question already has an answer here:
summary stats across columns, where column names indicate groups
(1 answer)
Closed 2 years ago.
I have a dataframe with 59 rows and 371 columns. Rows are my observations/sites, and columns are imagery with a time stamp as column header. After sorting and cleaning the column headers I am left with this:
data<-structure(list(X20151126 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 0.277672673867523, 0.355025896133641, NA, NA, NA), X20151126.1 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.278084206794621, 0.355967923936499,
NA, NA, NA), X20151126.2 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 0.277672673867523, 0.355025896133641, NA, NA, NA), X20151126.3 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.278084206794621, 0.355967923936499,
NA, NA, NA), X20151216 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0.384717953500922, 0.476687361663067, 0.369193020073217,
0.509256980090469, 0.695446322082805), X20151216.1 = c(0.482005639964749,
0.477315968778509, 0.577629441578537, 0.521768662684214, 0.403182719183149,
0.495476052715638, 0.449110279438877, 0.572210709159168, 0.639095940861963,
0.663725301936293, 0.385697051441031, 0.476921890053818, 0.369067776922609,
0.509928156203107, 0.696681651400943), X20160105 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 0.414471298501795, 0.476356388531503,
0.363677767527229, 0.536833655761341, 0.627173043983676), X20160105.1 = c(0.557391753621561,
0.545031565171865, 0.611938234209565, 0.552637066670738, 0.462370657856108,
0.514062089559983, 0.517862730716598, 0.607005393447421, 0.62782746269337,
0.669313073182483, 0.415045773658901, 0.476891950837264, 0.364111066602943,
0.535738794288108, 0.627583094107998), X20160305 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 0.608603344509915, 0.635371377161601,
0.609235980874035, 0.55732646087173, 0.55009492623478), X20160305.1 = c(0.663171088076713,
0.594729693525975, 0.624064072126327, 0.60486680369263, 0.557099508853904,
0.525207209646858, 0.597555736004227, 0.55009547536139, 0.596528841673565,
0.609200814692677, 0.609410463440908, 0.635462899436559, 0.608558438182846,
0.557738728019745, 0.550279584646311), X20160315 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 0.643538550411662, 0.662874802494349,
0.641220282780279, 0.548161668675193, 0.574401192865252), X20160315.1 = c(0.668475363330821,
0.598489967449372, 0.627731925878224, 0.598095603148813, 0.567868324796377,
0.310253447067502, 0.628817539418026, 0.585246425272493, 0.562730022483348,
0.601923400619283, 0.644128383766072, 0.663057419883037, 0.640840244150391,
0.547853260818411, 0.57379435997677), X20160315.2 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 0.643538550411662, 0.662874802494349,
0.641220282780279, 0.548161668675193, 0.574401192865252), X20160315.3 = c(0.668475363330821,
0.598489967449372, 0.627731925878224, 0.598095603148813, 0.567868324796377,
0.310253447067502, 0.628817539418026, 0.585246425272493, 0.562730022483348,
0.601923400619283, 0.644128383766072, 0.663057419883037, 0.640840244150391,
0.547853260818411, 0.57379435997677), X20160325 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 0.617099941392511, 0.64324564948683,
0.610466453170061, 0.524114324562118, 0.558504551047362)), row.names = c("Site_00",
"Site_01", "Site_02", "Site_03", "Site_04", "Site_05", "Site_06",
"Site_09", "Site_10", "Site_11", "Site_12", "Site_13", "Site_16",
"Site_17", "Site_18"), class = "data.frame")
Some imagery, but not all, have multiples over the same time stamp e.g. X20151126, X20151126.1 and X20151126.2. However, not all of these multiples have values, as seen by the NA's.
I would like to create a new data frame where I reduce these multiples by calculating the mean (and where it only has NA I would like to keep NA). Ultimately, I would like this:
x20151126<-apply(data[grep("20151126",colnames(data), value=T)],
MARGIN=1,FUN=mean,na.rm=TRUE)
x20151216<-apply(data[grep("20151216",colnames(data), value=T)],
MARGIN=1,FUN=mean,na.rm=TRUE)
x20160105<-apply(data[grep("20160105",colnames(data), value=T)],
MARGIN=1,FUN=mean,na.rm=TRUE)
x20160305<-apply(data[grep("20160305",colnames(data), value=T)],
MARGIN=1,FUN=mean,na.rm=TRUE)
x20160315<-apply(data[grep("20160315",colnames(data), value=T)],
MARGIN=1,FUN=mean,na.rm=TRUE)
x20160325<-apply(data[grep("20160325",colnames(data), value=T)],
MARGIN=1,FUN=mean,na.rm=TRUE)
NewData<-as.data.frame(cbind(x20151126,x20151216,x20160105,x20160305,x20160315,x20160325))
Is there a way to automate this code, without having to specify all the column headers using grep? Like I mentioned, I have 371 column headers, and not all of them have multiples. Also, I have 9 different tables that all look similar to this.
Any suggestions or help would be appreciated.
You can use split.default to split data into similar named columns and use rowMeans to calculate row-wise mean for each list.
sapply(split.default(data, sub('\\..*', '', names(data))), rowMeans, na.rm = TRUE)
# X20151126 X20151216 X20160105 X20160305 X20160315 X20160325
#Site_00 NaN 0.4820056 0.5573918 0.6631711 0.6684754 NaN
#Site_01 NaN 0.4773160 0.5450316 0.5947297 0.5984900 NaN
#Site_02 NaN 0.5776294 0.6119382 0.6240641 0.6277319 NaN
#Site_03 NaN 0.5217687 0.5526371 0.6048668 0.5980956 NaN
#Site_04 NaN 0.4031827 0.4623707 0.5570995 0.5678683 NaN
#Site_05 NaN 0.4954761 0.5140621 0.5252072 0.3102534 NaN
#Site_06 NaN 0.4491103 0.5178627 0.5975557 0.6288175 NaN
#Site_09 NaN 0.5722107 0.6070054 0.5500955 0.5852464 NaN
#Site_10 NaN 0.6390959 0.6278275 0.5965288 0.5627300 NaN
#Site_11 NaN 0.6637253 0.6693131 0.6092008 0.6019234 NaN
#Site_12 0.2778784 0.3852075 0.4147585 0.6090069 0.6438335 0.6170999
#Site_13 0.3554969 0.4768046 0.4766242 0.6354171 0.6629661 0.6432456
#Site_16 NaN 0.3691304 0.3638944 0.6088972 0.6410303 0.6104665
#Site_17 NaN 0.5095926 0.5362862 0.5575326 0.5480075 0.5241143
#Site_18 NaN 0.6960640 0.6273781 0.5501873 0.5740978 0.5585046
Using sub we keep only common part of column names which is used to split data.
sub('\\..*', '', names(data))
#[1] "X20151126" "X20151126" "X20151126" "X20151126" "X20151216" "X20151216"
#[7] "X20160105" "X20160105" "X20160305" "X20160305" "X20160315" "X20160315"
#[13] "X20160315" "X20160315" "X20160325"

r Replace multiple strings in a data frame column with multiple strings from a column of another data frame

I have a dataframe (df1) with a column "PartcipantID". Some ParticipantIDs are wrong and should be replaced with the correct ParticipantID. I have another dataframe (df2) where all Participant IDs appear in columns Goal_ID to T4. The Participant IDs in column "Goal_ID" are the correct IDs.
Now I want to replace all ParticipantIDs in df1 with all Goal_ID ParticipantIDs from df2.
This is my original dataframe (df1):
structure(list(Partcipant_ID = c("AA_SH_RA_91", "AA_SH_RA_91",
"AB_BA_PR_93", "AB_BH_VI_90", "AB_BH_VI_90", "AB_SA_TA_91", "AJ_BO_RA_92",
"AJ_BO_RA_92", "AK_SH_HA_91", "AL_EN_RA_95", "AL_MA_RA_95", "AL_SH_BA_99",
"AM_BO_AB_49", "AM_BO_AB_94", "AM_BO_AB_94", "AM_BO_AB_94", "AN_JA_AN_91",
"AN_KL_GE_11", "AN_KL_WO_91", "AN_MA_DI_95", "AN_MA_DI_95", "AN_SE_RA_95",
"AN_SE_RA_95", "AN_SI_RA_97", "AN_SO_PU_94", "AN_SU_RA_91", "AR_BO_RA_92",
"AR_KA_VI_94", "AR_KA_VI_94", "AS_AR_SO_90", "AS_AR_SU_95", "AS_KU_SO_90",
"AS_MO_AS_97", "AW_SI_OJ_97", "AW_SI_OJ_97", "AY_CH_SU_97", "BH_BE_LD_84",
"BH_BE_LI_83", "BH_BE_LI_83", "BH_BE_LI_84", "BH_KO_SA_87", "BH_PE_AB_89",
"BH_YA_SA_87", "BI_CH_PR_94", "BI_CH_PR_94"), Start_T2 = structure(c(NA,
NA, NA, NA, 1579514871, 1576658745, NA, 1579098225, NA, NA, 1576663067,
1576844759, NA, 1577330639, NA, NA, 1576693930, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 1577718380, 1577718380, 1577454467, NA,
NA, 1576352237, NA, NA, NA, NA, 1576420656, 1576420656, NA, NA,
1578031772, 1576872938, NA, NA), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), End_T2 = structure(c(NA, NA, NA, NA, 1579515709,
1576660469, NA, 1579098989, NA, NA, 1576693776, 1576845312, NA,
1577331721, NA, NA, 1576694799, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 1577719049, 1577719049, 1577455167, NA, NA, 1576352397,
NA, NA, NA, NA, 1576421607, 1576421607, NA, NA, 1578032408, 1576873875,
NA, NA), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
45L), class = "data.frame")
And this is the reference data frame (df2):
structure(list(Goal_ID = c("AJ_BO_RA_92", "AL_EN_RA_95", "AM_BO_AB_49",
"AS_KU_SO_90", "BH_BE_LI_84", "BH_YA_SA_87", "BI_CH_PR_94", "BI_CH_PR_94"
), T2 = c("AJ_BO_RA_92", "AL_MA_RA_95", "AM_BO_AB_94", "AS_AR_SO_90",
"BH_BE_LI_83", "BH_YA_SA_87", "BI_NA_PR_94", "BI_NA_PR_94"),
T3 = c("AR_BO_RA_92", "AL_MA_RA_95", "AM_BO_AB_94", NA, "BH_BE_LI_83",
NA, "BI_CH_PR_94", "BI_CH_PR_94"), T4 = c("AJ_BO_RA_92",
"AL_MA_RA_95", "AM_BO_AB_94", NA, "BH_BE_LI_83", "BH_KO_SA_87",
"BI_CH_PR_94", "BI_CH_PR_94")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
For example, in my df1, I want
"AR_BO_RA_92" to be replaced by "AJ_BO_RA_92";
"AL_MA_RA_95" to be replaced by "AL_EN_RA_95";
"AM_BO_AB_94" to be replaced by "AM_BO_AB_49"
and so on...
I thought about using string_replace and I started with this:
df1$Partcipant_ID <- str_replace(df1$Partcipant_ID, "AR_BO_RA_92", "AJ_BO_RA_92")
But that is of course very unefficient because I have so many replacements and it would be nice to make use of my reference data frame. I just cannot figure it out myself.
I hope this is understandable. Please ask if you need additional information.
Thank you so much already!
You can use match to find where the string is located and excange those which have been found and are not NA like:
i <- match(df1$Partcipant_ID, unlist(df2[-1])) %% nrow(df2)
j <- !is.na(i)
df1$Partcipant_ID[j] <- df2$Goal_ID[i[j]]
df1$Partcipant_ID
# [1] "AA_SH_RA_91" "AA_SH_RA_91" "AB_BA_PR_93" "AB_BH_VI_90" "AB_BH_VI_90"
# [6] "AB_SA_TA_91" "AJ_BO_RA_92" "AJ_BO_RA_92" "AK_SH_HA_91" "AL_EN_RA_95"
#[11] "AL_MA_RA_95" "AL_SH_BA_99" "AM_BO_AB_49" "AM_BO_AB_94" "AM_BO_AB_94"
#[16] "AM_BO_AB_94" "AN_JA_AN_91" "AN_KL_GE_11" "AN_KL_WO_91" "AN_MA_DI_95"
#[21] "AN_MA_DI_95" "AN_SE_RA_95" "AN_SE_RA_95" "AN_SI_RA_97" "AN_SO_PU_94"
#[26] "AN_SU_RA_91" "AR_BO_RA_92" "AR_KA_VI_94" "AR_KA_VI_94" "AS_AR_SO_90"
#[31] "AS_AR_SU_95" "AS_KU_SO_90" "AS_MO_AS_97" "AW_SI_OJ_97" "AW_SI_OJ_97"
#[36] "AY_CH_SU_97" "BH_BE_LD_84" "BH_BE_LI_83" "BH_BE_LI_83" "BH_BE_LI_84"
#[41] "BH_KO_SA_87" "BH_PE_AB_89" "BH_YA_SA_87" "BI_CH_PR_94" "BI_CH_PR_94"
I think this might work. Create a true look up table with a column of correct and incorrect codes. I.e. stack the columns, then join the subsequent df3 to df1 and use coalesce to create a new part_id. You spelt participant wrong, which made me feel more human I always do that.
library(dplyr)
df3 <- df2[1:2] %>%
bind_rows(df2[c(1,3)] %>% rename(T2 = T3),
df2[c(1,4)] %>% rename(T2 = T4)) %>%
distinct()
df1 %>%
left_join(df3, by = c("Partcipant_ID" = "T2")) %>%
mutate(Goal_ID = coalesce(Goal_ID, Partcipant_ID)) %>%
select(Goal_ID, Partcipant_ID, Start_T2, End_T2)

Using gsub (or similar) to extract from a vector and keep last 4 digits of column names

I am trying to use gsub or substr or anything similar to keep the column names which consist of symbols and a date. The symbols.f is stored in a vector (which can take on different ticker symbols);
symbols.f <- c("NVDA.f", "GOOG.f", "GE.f")
I then have the following colnames() from the dput() below.
[1] "GE.f.12.31.2017"
[2] "GE.f.12.31.2016"
[3] "GE.f.12.31.2015"
[4] "GE.f.12.31.2014"
[5] "GOOG.f.12.31.2017"
[6] "GOOG.f.12.31.2016"
[7] "GOOG.f.12.31.2015"
[8] "GOOG.f.12.31.2014"
[9] "NVDA.f.1.28.2018"
[10] "NVDA.f.1.29.2017"
[11] "NVDA.f.1.31.2016"
[12] "NVDA.f.1.25.2015"
What I am trying to do is to keep the ticker and also keep the yearor last 4 digits of the column names. So for example for the first two tickers;
[1] "GE2017"
[2] "GE2016"
[3] "GE2015"
[4] "GE2014"
[5] "GOOG2017"
[6] "GOOG2016"
[7] "GOOG2015"
[8] "GOOG2014"
I am able to extract the last 4 digits or all characters but cannot seem to do it jointly or in one go.
Data:
df <- structure(list(GE.f.12.31.2017 = c(18211000, NA, 46549000, 21923000,
5790000, 140110000, 38696000, 53874000, 83968000, 20273000, NA,
41024000, 6207000, 377945000, 15153000, 134591000, 21400000,
61893000, 108575000, 82597000, NA, 21122000, NA, 292560000, NA,
NA, NA, 702000, 125682000, -62127000, NA, 22775000, 64257000,
-39984000), GE.f.12.31.2016 = c(10525000, NA, 42687000, 22354000,
2867000, 149029000, 44313000, 50518000, 68070000, 16436000, NA,
34449000, 1833000, 365183000, 14435000, 136211000, 20772000,
70364000, 105080000, 83040000, NA, 4688000, NA, 284667000, NA,
NA, NA, 702000, 139532000, -64412000, NA, 18626000, 75822000,
-11052000), GE.f.12.31.2015 = c(10372000, NA, 43013000, 22515000,
5109000, 280896000, 31973000, 54095000, 65526000, 17797000, NA,
42784000, 3105000, 493071000, 13680000, 197602000, 27453000,
138270000, 144659000, 79175000, NA, 4836000, NA, 389961000, NA,
NA, NA, 702000, 140020000, -42454000, NA, 21085000, 98268000,
14945000), GE.f.12.31.2014 = c(15916000, NA, 23237000, 17639000,
6566000, 460743000, 35505000, 48070000, 53207000, 13182000, NA,
44247000, 6183000, 654954000, 12067000, 261424000, 18203000,
229564000, 186596000, 70801000, NA, 8772000, NA, 518023000, NA,
NA, NA, 702000, 155333000, -27876000, NA, 14717000, 128159000,
61770000), GOOG.f.12.31.2017 = c(10715000, 91156000, 18705000,
749000, 2983000, 124308000, 7813000, 42383000, 16747000, 2692000,
NA, 3352000, 680000, 197295000, 3137000, 3969000, 10651000, 24183000,
3943000, 16641000, NA, NA, NA, 44793000, NA, NA, NA, 40247000,
113247000, -992000, NA, -992000, 152502000, 133063000), GOOG.f.12.31.2016 = c(12918000,
73415000, 15632000, 268000, 3175000, 105408000, 5878000, 34234000,
16468000, 3307000, NA, 2202000, 383000, 167497000, 2041000, 3935000,
5851000, 16756000, 3935000, 7770000, NA, NA, NA, 28461000, NA,
NA, NA, 36307000, 105131000, -2402000, NA, -2402000, 139036000,
119261000), GOOG.f.12.31.2015 = c(15409000, 56517000, 13459000,
491000, 1590000, 90114000, 5183000, 29016000, 15869000, 3847000,
NA, 3432000, 251000, 147461000, 1931000, 7648000, 4327000, 19310000,
1995000, 5825000, NA, NA, NA, 27130000, NA, NA, NA, 32982000,
89223000, -1874000, NA, -1874000, 120331000, 100615000), GOOG.f.12.31.2014 = c(16585000,
46048000, 9974000, NA, 2637000, 78656000, 3079000, 23883000,
15599000, 4607000, NA, 3363000, 176000, 129187000, 1715000, 8015000,
2803000, 16779000, 2992000, 5320000, NA, NA, NA, 25327000, NA,
NA, NA, 28767000, 75066000, 27000, NA, 27000, 103860000, 83654000
), NVDA.f.1.28.2018 = c(7108000, NA, 1265000, 796000, NA, 9255000,
NA, 997000, 618000, 52000, NA, 319000, NA, 11241000, 596000,
2e+06, NA, 1153000, 1985000, 632000, NA, NA, NA, 3770000, NA,
NA, NA, 7471000, NA, NA, NA, NA, 7471000, 6801000), NVDA.f.1.29.2017 = c(1766000,
5032000, 826000, 794000, NA, 8536000, NA, 521000, 618000, 104000,
NA, 62000, NA, 9841000, 485000, 2791000, 325000, 1788000, 1985000,
3e+05, NA, NA, NA, 4079000, NA, NA, NA, 1000, 6108000, -5055000,
4708000, -16000, 5762000, 5040000), NVDA.f.1.31.2016 = c(596000,
4441000, 505000, 418000, NA, 6053000, NA, 466000, 618000, 166000,
NA, 67000, NA, 7370000, 296000, 1434000, 532000, 2351000, 7000,
533000, NA, NA, NA, 2901000, NA, NA, NA, 1000, 4350000, -4052000,
4170000, -4000, 4469000, 3685000), NVDA.f.1.25.2015 = c(497000,
4126000, 474000, 483000, 63000, 5713000, NA, 557000, 618000,
222000, NA, 91000, NA, 7201000, 293000, 1398000, 471000, 896000,
1384000, 489000, NA, NA, NA, 2783000, NA, NA, NA, 1000, 3949000,
-3387000, 3855000, 8000, 4418000, 3578000)), .Names = c("GE.f.12.31.2017",
"GE.f.12.31.2016", "GE.f.12.31.2015", "GE.f.12.31.2014", "GOOG.f.12.31.2017",
"GOOG.f.12.31.2016", "GOOG.f.12.31.2015", "GOOG.f.12.31.2014",
"NVDA.f.1.28.2018", "NVDA.f.1.29.2017", "NVDA.f.1.31.2016", "NVDA.f.1.25.2015"
), row.names = c("Cash And Cash Equivalents", "Short Term Investments",
"Net Receivables", "Inventory", "Other Current Assets", "Total Current Assets",
"Long Term Investments", "Property Plant and Equipment", "Goodwill",
"Intangible Assets", "Accumulated Amortization", "Other Assets",
"Deferred Long Term Asset Charges", "Total Assets", "Accounts Payable",
"Short/Current Long Term Debt", "Other Current Liabilities",
"Total Current Liabilities", "Long Term Debt", "Other Liabilities",
"Deferred Long Term Liability Charges", "Minority Interest",
"Negative Goodwill", "Total Liabilities", "Misc. Stocks Options Warrants",
"Redeemable Preferred Stock", "Preferred Stock", "Common Stock",
"Retained Earnings", "Treasury Stock", "Capital Surplus", "Other Stockholder Equity",
"Total Stockholder Equity", "Net Tangible Assets"), class = "data.frame")
Will this regex work?
gsub("\\..*\\.", "", colnames(df))
It removes the first and last '.' and everything in between.
#[1] "GE2017" "GE2016" "GE2015" "GE2014" "GOOG2017"
#[6] "GOOG2016" "GOOG2015" "GOOG2014" "NVDA2018" "NVDA2017"
#[11] "NVDA2016" "NVDA2015"
# '\\.' = match a dot, '.' = match anything, '*' = match the previous 0 or more times
# so \\..*\\. means "anything 0 or more times, preceded by a dot, followed by a dot")
# the \\ are escapes so the regex can differentiate whether you mean the
# expression '.' (anything) or '\\.' (actual dot)
Here is an alternative to the answer given by #Ape using sub with capture groups:
sub("^([^.]+).*?(\\d+)$", "\\1\\2", colnames(df))
Demo

Adding legend to ggplot with geom_line with factor color and manually added line

I can see that there are a lot of questions similar to this, but I cant find solution for my particular problem.
Data:
risk_accum <- structure(list(date = structure(c(1465948800, 1465952400, 1465956000,
1465959600, 1465963200, 1465966800, 1465970400, 1465974000, 1465977600,
1465981200, 1465984800, 1465988400, 1465992000, 1465995600, 1465999200,
1466002800, 1466006400, 1466010000, 1466013600, 1466017200, 1466020800,
1466024400, 1466028000, 1466031600, 1466035200, 1466038800, 1466042400,
1466046000, 1466049600, 1466053200, 1466056800, 1466060400, 1466064000,
1466067600, 1466071200, 1466074800, 1466078400, 1466082000, 1466085600,
1466089200, 1466092800, 1466096400, 1466100000, 1466103600, 1466107200,
1466110800, 1466114400, 1466118000, 1466121600, 1466125200, 1466128800,
1466132400, 1466136000, 1466139600, 1466143200, 1466146800, 1466150400,
1466154000, 1466157600, 1466161200, 1466164800, 1466168400, 1466172000,
1466175600, 1466179200, 1466182800, 1466186400, 1466190000, 1466193600,
1466197200, 1466200800, 1466204400, 1466208000, 1466211600, 1466215200,
1466218800, 1466222400, 1466226000, 1466229600, 1466233200, 1466236800,
1466240400, 1466244000, 1466247600, 1466251200, 1466254800, 1466258400,
1466262000, 1466265600, 1466269200, 1466272800, 1466276400, 1466280000,
1466283600, 1466287200, 1466290800, 1466294400, 1466298000, 1466301600,
1466305200, 1466308800, 1466312400, 1466316000, 1466319600, 1466323200,
1466326800, 1466330400, 1466334000, 1466337600, 1466341200, 1466344800,
1466348400, 1466352000, 1466355600, 1466359200, 1466362800, 1466366400,
1466370000, 1466373600, 1466377200), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), risk = c(NA, NA, NA, 1, 2, 3, 4, 5, 6, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 2, 3, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, NA, NA)), .Names = c("date",
"risk"), row.names = c(NA, -120L), class = c("tbl_df", "tbl",
"data.frame"))
And code to generate graph:
#color variable
color_var <- vector(mode = "double",length = length(risk_accum$risk))
color_var[color_var== '0']<-NA
color_var[risk_accum$risk<6] <- "green4"
color_var[risk_accum$risk>=6 & risk_accum$risk<12] <- "yellow2"
color_var[risk_accum$risk>=12] <- "red"
#plot of Effective Blight Hours accumulation
ggplot(risk_accum)+
geom_line(aes(x = date, y = risk), color = color_var)+
scale_y_continuous(name = "EBH accumulation")+
scale_colour_manual(values=c("green", "yellow", "red"))+
geom_line(aes(date, y= 12), linetype= "dotted", size = 0.1)+
theme(axis.title.x = element_blank())
I need to get a legend which would explain the traffic light system (red is danger, etc) and manually added threshold risk line.
Add your color variable to the dataset, map to that variable inside aes, and use scale_*_identity to directly use the colors.
risk_accum$color_var <- NA
risk_accum$color_var[risk_accum$risk<6] <- "green4"
risk_accum$color_var[risk_accum$risk>=6 & risk_accum$risk<12] <- "yellow2"
risk_accum$color_var[risk_accum$risk>=12] <- "red"
ggplot(risk_accum)+
geom_line(aes(x = date, y = risk, color = color_var)) +
scale_y_continuous(name = "EBH accumulation")+
scale_color_identity(guide = 'legend') +
geom_line(aes(date, y= 12), linetype= "dotted", size = 0.1)+
theme(axis.title.x = element_blank())
You can also add your threshold to the legend:
ggplot(risk_accum)+
geom_line(aes(x = date, y = risk, color = color_var)) +
geom_line(aes(date, y= 12, linetype = "threshold"), size = 0.1)+
scale_y_continuous(name = "EBH accumulation")+
scale_color_identity(guide = 'legend') +
scale_linetype_manual(values = 2) +
theme(axis.title.x = element_blank())

How to change xticks locations and customize legend using levelplot (lattice library)

I am trying to move the position of x-ticks and x-labels from the bottom of the figure to its top.
In addition, my data has a bunch of NAs. Currently, levelplot just remove them and leave them as white space in the plot. I wondering if it is possible to add this NAs to the legend as well.
Any suggestions? Thanks!
Here is my code and its output:
require(lattice)
# see data from dput() below
rownames(data)=data[,1]
data_matrix=as.matrix(data[,2:11])
color = colorRampPalette(rev(c("#D73027", "#FC8D59", "#FEE090", "#FFFFBF", "#E0F3F8", "#91BFDB", "#4575B4")))(100)
levelplot(data_matrix, scale=list(x=list(rot=45)), ylab="Days", xlab="Strains", col.regions = color)
Data
data <-
structure(list(X = structure(1:17, .Label = c("Arcobacter", "Bacillus",
"Bordetella", "Campylobacter", "Chlamydia", "Clostridium ", "Corynebacterium",
"Enterococcus", "Escherichia", "Francisella", "Legionella", "Mycobacterium",
"Pseudomonas", "Rickettsia", "Staphylococcus", "Streptococcus",
"Treponema"), class = "factor"), day.0 = c(NA, -3.823301154,
NA, NA, NA, -3.518606107, NA, NA, NA, NA, NA, -4.859479387, NA,
NA, NA, -2.588402346, -2.668136603), day.2 = c(-4.006281239,
-3.024823788, NA, -5.202804501, NA, -3.237622321, NA, NA, -5.296138823,
-5.105469059, NA, NA, -4.901775198, NA, NA, -2.979144202, -3.050083791
), day.4 = c(-2.880770182, -3.210165554, -4.749097175, -5.209064234,
NA, -2.946480184, NA, -5.264113795, -5.341881713, -4.435780293,
NA, -4.810650076, -4.152531609, NA, NA, -3.106172794, -3.543161966
), day.6 = c(-2.869833226, -3.293283924, -3.831346387, NA, NA,
-3.323947791, NA, NA, NA, NA, NA, -4.397581863, -4.068855504,
NA, NA, -3.27028378, -3.662618619), day.8 = c(-3.873589331, -3.446192193,
-3.616207965, NA, NA, -3.13869325, NA, -5.010807453, NA, NA,
NA, -4.091502649, -4.412399025, -4.681675749, NA, -3.404738625,
-3.955464159), day.15 = c(-5.176583159, -2.512963066, -3.392832457,
NA, NA, -3.194662968, NA, -3.60440455, NA, NA, -4.875554468,
-2.507376205, -4.727255906, -5.27116754, -3.200499549, -3.361296145,
-4.320554841), day.22 = c(-4.550052847, -3.654013004, -3.486879661,
NA, NA, -3.614890858, NA, NA, NA, NA, -4.706690492, -2.200533317,
-4.836957953, NA, -4.390423731, NA, NA), day.29 = c(-4.730006329,
-3.46707372, -3.594457287, NA, NA, -3.800757834, NA, NA, NA,
NA, -4.285154089, -2.121152491, -4.816807055, -5.064577888, -2.945243736,
-4.479177287, -5.226435146), day.43 = c(-4.398680025, -3.144603215,
-3.642065153, NA, NA, -3.8268662, NA, NA, NA, NA, -4.762539208,
-2.156862316, -4.118608495, NA, -4.030291084, -4.678213147, NA
), day.57 = c(-4.689982547, -2.713502214, -3.51279797, NA, -5.069579266,
-3.495580794, NA, NA, NA, NA, -4.515973639, -1.90591075, -4.134826117,
-4.479351427, -3.482134037, -4.538534489, NA)), .Names = c("X",
"day.0", "day.2", "day.4", "day.6", "day.8", "day.15", "day.22",
"day.29", "day.43", "day.57"), class = "data.frame", row.names = c("Arcobacter",
"Bacillus", "Bordetella", "Campylobacter", "Chlamydia", "Clostridium ",
"Corynebacterium", "Enterococcus", "Escherichia", "Francisella",
"Legionella", "Mycobacterium", "Pseudomonas", "Rickettsia", "Staphylococcus",
"Streptococcus", "Treponema"))
Figure
The request to move the labels to the top is pretty easy (after looking at the ?xyplot under the scales section):
levelplot(data_matrix, scale=list(x=list(rot=45,alternating=2)),
ylab="Days", xlab="Strains", col.regions = color)
Trying to get the NA values into the color legend may take a bit more thinking, but it seems as though sensible values for the colorkey arguments for at and col might suffice.
levelplot(data_matrix, scale=list(x=list(rot=45,alternating=2)),
ylab="Days", xlab="Strains", col.regions = color,
colorkey=list(at=as.numeric( factor( c( seq(-5.5, -2, by=0.5),
"NA"))),
labels=as.character( c( seq(-5.5, -2, by=0.5),
"NA")),
col=c(color, "#FFFFFF") ) )

Resources