Conserving a monthly sequence in reshape in R - r

RI_Final
Names = c("Name", "29/01/1992", "29/02/1992",
"29/03/1992", "29/04/1992", "29/05/1992", "29/06/1992", "29/07/1992",
"29/08/1992", "29/09/1992", "29/10/1992", "29/11/1992", "29/12/1992",
"29/01/1993", "28/02/1993", "29/03/1993", "29/04/1993", "29/05/1993",
"29/06/1993", "29/07/1993", "29/08/1993", "29/09/1993", "29/10/1993",
"29/11/1993", "29/12/1993", "29/01/1994", "28/02/1994", "29/03/1994",
"29/04/1994", "29/05/1994", "29/06/1994", "29/07/1994", "29/08/1994",
"29/09/1994", "29/10/1994", "29/11/1994", "29/12/1994", "29/01/1995",
"28/02/1995", "29/03/1995", "29/04/1995", "29/05/1995", "29/06/1995",
"29/07/1995", "29/08/1995", "29/09/1995", "29/10/1995", "29/11/1995",
"29/12/1995", "29/01/1996", "29/02/1996", "29/03/1996", "29/04/1996",
"29/05/1996", "29/06/1996", "29/07/1996", "29/08/1996", "29/09/1996",
"29/10/1996", "29/11/1996", "29/12/1996", "29/01/1997", "28/02/1997",
"29/03/1997", "29/04/1997", "29/05/1997", "29/06/1997", "29/07/1997",
"29/08/1997", "29/09/1997", "29/10/1997", "29/11/1997", "29/12/1997",
"29/01/1998", "28/02/1998", "29/03/1998", "29/04/1998", "29/05/1998",
"29/06/1998", "29/07/1998", "29/08/1998", "29/09/1998", "29/10/1998",
"29/11/1998", "29/12/1998", "29/01/1999", "28/02/1999", "29/03/1999",
"29/04/1999", "29/05/1999", "29/06/1999", "29/07/1999", "29/08/1999",
"29/09/1999", "29/10/1999", "29/11/1999", "29/12/1999", "29/01/2000",
"29/02/2000", "29/03/2000", "29/04/2000", "29/05/2000", "29/06/2000",
"29/07/2000", "29/08/2000", "29/09/2000", "29/10/2000", "29/11/2000",
"29/12/2000"))), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"), .internal.selfref = <pointer: (nil)>)
I have a DF I want to reshaped to Panel DF this was my solution
reshaped.RI_Final<- reshape(RI_Final,
direction="long",
varying=list(names(RI_Final)[2:290]),
v.names="Value",
timevar="Month")
the variables from 2:290 are a monthly sequence from 02/1992 to 12/2015 in my solution I can't find in Month Column the dates.

Related

Use of purrr's "modify_if" with a function

I'm trying to apply the discretize_rgr function (here) of the package funModeling to multiple columns of a dataframe.
For a single column, it is working for me in this way:
discretize_rgr(input = df.div$to_be_discretized, target = df.div$TARGET, max_n_bins=10)
So, I'm trying to use the purrr package to manage multiple columns in this way:
df.div %>%
modify_if( is.numeric, ~ discretize_rgr(., target = df.div$TARGET, max_n_bins=10))
but I'm get the following error:
Error in order(fpoints_top) : argument 1 is not a vector
What's wrong?
UPDATE (example data)
structure(list(to_be_discretized = c(0.0152096300012854, 0.0132660373578711,
0.014699121782711, 0.0157102877064037, 0.0197417484744586, 0.019651999420645
), TARGET = c(27136, 30048, 34840, 138812, 191088, 240370)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L))

how to predict using var with exogenous variables in R

I have the following data:
library(data.table)
modelling_dt_train <- structure(list(`1` = c(54593L, 74481L, 85566L, 97637L, 101081L,
184089L, 158895L, 153780L, 153681L, 157188L, 142216L, 136437L,
135501L, 111264L, 123259L, 110397L, 146034L, 162900L, 132499L,
121516L, 119651L, 114045L, 112551L, 123209L, 134930L, 132147L,
151327L, 155666L, 158538L, 205766L, 200407L, 219588L, 231954L,
179884L, 159121L, 156148L, 136191L, 132956L, 202086L, 141047L,
118490L, 116595L, 127620L, 135962L, 137419L, 127334L, 158804L,
139142L, 181773L, 228278L, 272373L, 186666L, 148791L, 143608L,
169634L, 188149L, 239867L, 332543L, 253463L, 240574L, 237245L,
275466L, 262755L, 241538L, 303377L),
`2` = c(148181L, 186894L,
243357L, 227298L, 195640L, 412137L, 363152L, 355169L, 296208L,
328993L, 281652L, 308027L, 316254L, 249293L, 320821L, 220521L,
284411L, 263807L, 258093L, 261060L, 320153L, 311547L, 279734L,
258453L, 269697L, 313700L, 255285L, 232495L, 305346L, 393256L,
390655L, 527039L, 529056L, 450689L, 425190L, 372144L, 303765L,
324658L, 365035L, 285178L, 230985L, 251308L, 290378L, 279595L,
294676L, 391377L, 445682L, 364056L, 441207L, 516852L, 673401L,
415677L, 304000L, 266365L, 311924L, 314192L, 407313L, 664519L,
456920L, 384978L, 351644L, 432627L, 409624L, 386330L, 487679L
),
`3` = c(60217L, 66492L, 66675L, 76400L, 117252L, 264527L,
256384L, 241815L, 187115L, 193106L, 177620L, 140833L, 188291L,
110069L, 163581L, 107650L, 118319L, 118821L, 122383L, 117267L,
134962L, 121227L, 124952L, 111740L, 137493L, 163895L, 60653L,
69311L, 88810L, 128620L, 132077L, 153399L, 162989L, 151866L,
127325L, 122813L, 115284L, 103765L, 113185L, 101607L, 92379L,
98646L, 94376L, 98069L, 98972L, 103074L, 142199L, 123497L, 141823L,
205582L, 251187L, 109603L, 80711L, 80799L, 84175L, 104965L, 181221L,
245377L, 201378L, 235504L, 188925L, 214614L, 220312L, 191591L,
203292L)),
.Names = c("1", "2", "3"), class = c("data.table",
"data.frame"), row.names = c(NA, -65L))
modelling_x_train <- structure(list(`1` = c(1982134L, 1968327L, 2019222L, 2025126L,
2033065L, 2188202L, 2066808L, 2070103L, 2041154L, 2201142L, 2105848L,
2067669L, 2005707L, 2239632L, 2435928L, 2363759L, 2444016L, 2556139L,
2807283L, 2674632L, 2687984L, 2889011L, 2839239L, 2712064L, 2928420L,
2889533L, 3106868L, 2746471L, 2953436L, 3225171L, 2926874L, 2914124L,
3210355L, 2847523L, 2890636L, 3268445L, 2941468L, 2931027L, 2906610L,
3222324L, 2833093L, 2978953L, 3196315L, 3055240L, 3210672L, 3368890L,
3046191L, 2960181L, 3341146L, 3227672L, 3062702L, 3197227L, 3445476L,
3441273L, 3651232L, 3566179L, 3619685L, 3716756L, 3600666L, 3732533L,
3695464L, 3857145L, 3700072L, 3608183L, 3904237L),
`2` = c(4082316L,
4644387L, 5230567L, 5115720L, 4729153L, 5658227L, 5492034L, 5443022L,
5094415L, 5939637L, 5354626L, 5509783L, 5438960L, 4912936L, 5736293L,
5167632L, 5244341L, 5580274L, 5750346L, 5358527L, 5916955L, 6129790L,
5245982L, 5801479L, 5683117L, 5721551L, 6972176L, 7072498L, 7979325L,
8324202L, 7434885L, 8189438L, 8062609L, 7658496L, 8066643L, 8528136L,
7515745L, 8276800L, 8227022L, 6523804L, 5780869L, 6481060L, 6912797L,
6276934L, 6592158L, 6908732L, 6067945L, 6459707L, 6910377L, 6645470L,
6538196L, 6694136L, 7484290L, 7299620L, 8532078L, 7713988L, 7256825L,
8237839L, 7834919L, 7725377L, 7291804L, 8224205L, 7784470L, 7514557L,
8164590L),
`3` = c(3181556L, 3232260L, 3272852L, 3233534L, 2876956L,
2979204L, 3275916L, 3345278L, 2951867L, 2976889L, 3289397L, 2955148L,
3306653L, 1861934L, 2239827L, 2207356L, 2335514L, 2387791L, 2592206L,
2371527L, 2586856L, 2447660L, 2322218L, 2342827L, 2666258L, 2627928L,
2525534L, 2521129L, 2573991L, 2752528L, 2538251L, 2676848L, 2802139L,
2702108L, 2630417L, 2778233L, 2725544L, 2723849L, 2795745L, 1954820L,
1842684L, 2132844L, 2182141L, 2041725L, 2355857L, 2414334L, 2350885L,
2367547L, 2436918L, 2328244L, 2390647L, 2460700L, 3081623L, 2877487L,
3025104L, 3108909L, 3172441L, 3267766L, 3354357L, 3273165L, 3322516L,
3342817L, 3413854L, 3217624L, 2736617L)),
.Names = c("1", "2",
"3"), class = c("data.table", "data.frame"), row.names = c(NA,
-65L))
where modelling_dt_train is the time series of 3 products and modelling_x_train is an exogenous variable (which is also a time series) for the same products.
I am estimating a VAR model using the following code
library(vars)
x <- log(modelling_dt_train)
x <- x[,lapply(.SD,function(x){ifelse(is.infinite(x),0,x)})]
modelling_x_train <- log(modelling_x_train)
modelling_x_train <- modelling_x_train[,lapply(.SD,function(x){ifelse(is.infinite(x),0,x)})]
x_mat <- as.matrix(x)
dx <- x_mat
var = VAR(dx, p=p, exogen = modelling_x_train, season = 18)
So far so good, but when I want to predict the values for 12 periods using
predict(var, newdata = modelling_dt_test, dumvar = modelling_x_test, n.ahead = 12)
I get an error:
Error in predict.varest(var, newdata = modelling_dt_test, dumvar = modelling_x_test, :
Column names of dumvar do not coincide with exogen.
The newdata and the dumvar that I am using are some future values of the previous same products, but further in time
modelling_x_test <- structure(list(`1` = c(4447896L, 4779229L, 4628391L, 4737933L,
5102152L, 4838918L, 4955183L, 5258605L, 5084001L, 4798945L, 5204015L,
5129690L, 5101568L),
`2` = c(6108187L, 6733956L, 7065148L, 7111155L,
6513151L, 7622806L, 7062042L, 7206067L, 7144091L, 7412266L, 6752614L,
7705255L, 7487054L),
`3` = c(1716975L, 2022198L, 2122109L, 2155489L,
2428639L, 2433860L, 2717315L, 2471655L, 2795100L, 2908946L, 2581813L,
2633578L, 2666302L)),
.Names = c("1", "2", "3"), class = c("data.table",
"data.frame"), row.names = c(NA, -13L))
modelling_dt_test <- structure(list(`1` = c(244876L, 275993L, 256180L, 321256L, 316042L,
275097L, 250842L, 245543L, 233386L, 218958L, 254270L, 238804L,
234079L),
`2` = c(375278L, 429496L, 478816L, 532311L, 442922L,
485787L, 460750L, 501956L, 454178L, 425800L, 413112L, 434328L,
446069L),
`3` = c(119577L, 139870L, 127951L, 125017L, 138176L,
114517L, 129880L, 120941L, 159176L, 157890L, 149554L, 144210L,
165979L)),
.Names = c("1", "2", "3"), class = c("data.table",
"data.frame"), row.names = c(NA, -13L))
EDIT
In the source code here, at line 58 there is this check. But what this line is checking is if colnames(data.all) (which are "X1" "X2" "X3" "X1.l1" "X2.l1" "X3.l1" "const" "sd1" "sd2" "sd3" "sd4" "sd5" "sd6" "sd7" "sd8" "sd9" "sd10" "sd11" "sd12" "sd13" "sd14" "sd15" "sd16" "sd17" "X1.1" "X2.1" "X3.1" , so it seems to me as the coefficients) are equal with colnames(modelling_x_test) which are "1" "2" "3" (the products). How can these ever be equal ?

Unexpected behavior in data.table join when using nomatch and allow.cartesian

I have 2 data tables and am trying to get a column cor from cortable into finaltable.
cortable
cor,tickerkey
0.7539,AAL_AAN
0.573,AAL_ABB
0.6384,AAL_ACM
0.7193,AAL_ACXM
0.8386,AAL_ADP
0.7392,AAL_ADT
0.732,AAL_AER
0.4805,AAL_AGCO
0.9363,AAL_AL
0.9064,AAL_ALK
0.7545,AAL_ALSN
0.8586,AAL_AME
0.3356,AAL_AMT
0.8239,AAL_AN
0.8637,AAL_AOS
0.7638,AAL_APD
0.7915,AAL_APH
0.8785,AAL_APOL
0.8073,AAL_ARMH
0.7744,AAL_ASH
0.4179,AAL_ATLS
0.8282,AAL_AWI
-0.2539,AAL_AWK
0.8213,AAL_AXLL
0.827,AAL_BA
0.8642,AAL_BC
0.7982,AAL_BCO
0.2002,AAL_BEAV
0.7079,AAL_BERY
0.858,AAL_BGC
0.5943,AAL_BRK.B
0.1522,AAL_BWC
0.2793,AAL_CAR
0.8537,AAL_CAT
0.9115,AAL_CBI
dput
cortable<-structure(list(cor = c("0.7539", "0.573", "0.6384", "0.7193",
"0.8386", "0.7392", "0.732", "0.4805", "0.9363", "0.9064", "0.7545",
"0.8586", "0.3356", "0.8239", "0.8637", "0.7638", "0.7915", "0.8785",
"0.8073", "0.7744", "0.4179", "0.8282", "-0.2539", "0.8213",
"0.827", "0.8642", "0.7982", "0.2002", "0.7079", "0.858", "0.5943",
"0.1522", "0.2793", "0.8537", "0.9115"),
tickerkey = c("AAL_AAN", "AAL_ABB", "AAL_ACM", "AAL_ACXM", "AAL_ADP", "AAL_ADT", "AAL_AER",
"AAL_AGCO", "AAL_AL", "AAL_ALK", "AAL_ALSN", "AAL_AME", "AAL_AMT",
"AAL_AN", "AAL_AOS", "AAL_APD", "AAL_APH", "AAL_APOL", "AAL_ARMH",
"AAL_ASH", "AAL_ATLS", "AAL_AWI", "AAL_AWK", "AAL_AXLL", "AAL_BA",
"AAL_BC", "AAL_BCO", "AAL_BEAV", "AAL_BERY", "AAL_BGC", "AAL_BRK.B",
"AAL_BWC", "AAL_CAR", "AAL_CAT", "AAL_CBI")), .Names = c("cor",
"tickerkey"), row.names = c(NA, -35L), class = c("data.table",
"data.frame"), sorted = "tickerkey")
finaltable
tickerkey,ticker1,ticker2
AAL_ALK,AAL,ALK
AAL_CAR,AAL,CAR
AAL_CHRW,AAL,CHRW
AAL_CNW,AAL,CNW
AAL_CSX,AAL,CSX
AAL_DAL,AAL,DAL
AAL_EXPD,AAL,EXPD
AAL_FDX,AAL,FDX
AAL_HTZ,AAL,HTZ
AAL_JBHT,AAL,JBHT
dput
finaltable<-structure(list(tickerkey = c("AAL_ALK", "AAL_CAR", "AAL_CHRW",
"AAL_CNW", "AAL_CSX", "AAL_DAL", "AAL_EXPD", "AAL_FDX", "AAL_HTZ",
"AAL_JBHT"), ticker1 = c("AAL", "AAL", "AAL", "AAL", "AAL", "AAL",
"AAL", "AAL", "AAL", "AAL"), ticker2 = c("ALK", "CAR", "CHRW",
"CNW", "CSX", "DAL", "EXPD", "FDX", "HTZ", "JBHT")), .Names = c("tickerkey",
"ticker1", "ticker2"), row.names = c(NA, -10L), class = c("data.table",
"data.frame"), sorted = "tickerkey")
I am trying to achieve that with the code as below.
setkey(cortable, "tickerkey")
setkey(finaltable, "tickerkey")
finaltable[cortable,cor:=cor,allow.cartesian=TRUE,nomatch=0]
The correct expected output would be finaltable
tickerkey,ticker1,ticker2,cor
AAL_ALK,AAL,ALK,0.9064
AAL_CAR,AAL,CAR,0.2793
with the rest of the rows having value of NA for cor
but it gives an output
finaltable
tickerkey,ticker1,ticker2,cor
AAL_ALK,AAL,ALK,0.2793
AAL_CAR,AAL,CAR,0.9064
with the rest of the rows NA for cor. and a warning on execution.. In [.data.table(finaltable, cortable, :=(cor, cor), allow.cartesian = TRUE, : Supplied 2 items to be assigned to 35 items of column 'cor' (recycled leaving remainder of 1 items).
If I remove nomatch argument, the mismatch doesn't happen.
I am trying to find out exactly whats causing this behavior, as I have a lot of code/data analysis that uses this and without knowing the exact cause of a probable unexpected behavior, brings down my confidence in all the data generated so far.
I tried to look into the definition/behaviour of nomatch, didn't find much in the context of the above usage. If anyone could give some explanation, will be very helpful.
This should work:
merge(cortable, finaltable, by=c('tickerkey'))[,list(tickerkey,ticker1,ticker2,cor)]
or you could do
cortable[finaltable][!is.na(cor)][,list(tickerkey,ticker1,ticker2,cor)]
the latter approach assumes you've set keys whereas the first doesn't matter if the keys are set.

summing integer64 columns not doing what I expect

I do not understand what is going on here. Why does sum work outside of data.table and not inside it? data.table version is 1.94 and bit64 is loaded.
dput(dt)
structure(list(Date = c(20150422L, 20150422L, 20150422L, 20150422L,
20150423L, 20150423L, 20150423L, 20150423L, 20150424L, 20150424L,
20150424L, 20150424L), totcap = structure(c(5.30519039464278e-314,
5.34352625144878e-314, 5.21151503979773e-314, 5.18159473949947e-314,
5.36659973716195e-314, 5.3767197559193e-314, 5.31749562227391e-314,
5.48717086915892e-314, 5.34891674084389e-314, 5.22243170680067e-314,
5.22969347328787e-314, 5.23636617172838e-314), class = "integer64")), .Names = c("Date",
"totcap"), class = c("data.table", "data.frame"), row.names = c(NA,
-12L), .internal.selfref = )
> sum(dt$totcap)
integer64
[1] 128782928014
> dt[,sum(totcap),by=Date]
Date V1
1: 20150422 2.104183e-313
2: 20150423 2.154799e-313
3: 20150424 2.103741e-313

generating and filling new data frames in lapply-"do not know how to convert x to class POSIXlt"

I am trying to generate a new data frame containing weekly encounter histories for an animal based on one row in a data frame that contains the animal ID (BandNo) first and last day we tracked it (FDay, Lday), and the fate of the animal when we stopped tracking it (fate) as well as other covariates.
here is an example data frame for one individual, object "a"
structure(list(BandNo = structure(1L, .Label = c("1234", "4201",
"4203", "4205", "4207", "4208", "4209", "4213", "4214", "4215",
"4216", "4217", "4219", "4221", "4223", "4224", "4226", "4227",
"4228", "4229", "4230", "4231", "4232", "4233", "4234", "4236",
"4237", "4238", "4239", "4241", "4242", "4245", "4247", "4248",
"4249", "4253", "4254", "4256", "4257", "4258", "4259", "4261",
"4262", "4263", "4264", "4271", "4272", "4273", "4276", "4277",
"4280", "4282", "4284", "4288", "4289", "4292", "4293", "4296",
"4298", "4299", "4501", "4502", "4503", "4504", "4505", "4507",
"4508", "4509", "4510", "4511", "4512", "4513", "4514", "4515",
"4516", "4517", "4518", "4519", "4520", "4521", "4525", "4526",
"4527", "4529", "4530", "4532", "4535", "4539", "4596", "4598",
"4599", "6101", "6102", "6104", "6105", "6106", "6107", "6108",
"6109", "6111", "6112", "6113", "6114", "6115", "6116", "6118",
"6119", "8002", "8003", "8004", "8005", "8006", "8007", "8008",
"8009", "8010", "8011", "8012", "8013", "8014", "8015", "8017",
"8018", "8019", "8020", "8021", "8097", "8098", "8099", "8402",
"8403", "8404", "8405", "8406", "8408", "8409", "8410", "8411",
"8412", "8413", "8414", "8416", "8417", "8418", "8419", "8422",
"8423", "8426", "8427", "8429", "8430", "8431", "8432", "8433",
"8458", "8497", "8498"), class = "factor"), FDay = structure(1380171600, class = c("POSIXct",
"POSIXt"), tzone = "America/Bogota"), Lday = structure(1392094800, class = c("POSIXct",
"POSIXt"), tzone = "America/Bogota"), ObsLength = 138, Fate = "Predation",
FieldName = structure(7L, .Label = c("Bryan", "Dassow", "H1",
"H2", "NARD", "SAY160", "SAY320", "SAY40A", "Schaeffer",
"SIB", "Wessels"), class = "factor"), Landscape = structure(2L, .Label = c("CHW",
"SAY", "SIB"), class = "factor"), Sex = structure(1L, .Label = c("F",
"M"), class = "factor")), .Names = c("BandNo", "FDay", "Lday",
"ObsLength", "Fate", "FieldName", "Landscape", "Sex"), row.names = 1L, class = "data.frame")
I can successfully create the new data frame I want (y) using this code :
library(lubridate)
mydate<-seq(from=a$FDay,to=a$Lday,by='week')
newband<-rep(a$BandNo,length(mydate))
newfate<-rep("Survive",length(mydate))
newfate[length(mydate)]<-a$Fate
y<-data.frame(newband,mydate,newfate)
y$FieldName<-a$FieldName
y$Sex<-a$Sex
y$Landscape<-a$Landscape
y$WeekID<-week(a$mydate)
y$Year<-year(a$mydate))
but when I try to apply it over a list of one-row data frames using the following code, I get the error message "do not know how to convert x to class POSIXlt"
the previous steps, used in lapply now, giving the error
b<-list(a)
d<-lapply(b,function(x){
mydate<-seq(from=x$FDay,to=x$Lday,by='week')
newband<-rep(x$BandNo,length(mydate))
newfate<-rep("Survive",length(mydate))
newfate[length(mydate)]<-x$Fate
y<-data.frame(newband,mydate,newfate)
y$FieldName<-x$FieldName
y$Sex<-x$Sex
y$Landscape<-x$Landscape
y$WeekID<-week(x$mydate)
y$Year<-year(x$mydate)})
Thanks for any help!
I made a real bonehead mistake...I should have been referencing y$ mydate at the end. here is what that lapply function should look like:
d<-lapply(adult2,function(x){
mydate<-seq(from=x$FDay,to=x$Lday,by='week')
newband<-rep(x$BandNo,length(mydate))
newfate<-rep("Survive",length(mydate))
newfate[length(mydate)]<-x$Fate
y<-data.frame(newband,mydate,newfate)
y$FieldName<-x$FieldName
y$Sex<-x$Sex
y$Landscape<-x$Landscape
y$WeekID<-week(y$mydate)
y$Year<-year(y$mydate)
return(y)}
)

Resources