summing integer64 columns not doing what I expect - r

I do not understand what is going on here. Why does sum work outside of data.table and not inside it? data.table version is 1.94 and bit64 is loaded.
dput(dt)
structure(list(Date = c(20150422L, 20150422L, 20150422L, 20150422L,
20150423L, 20150423L, 20150423L, 20150423L, 20150424L, 20150424L,
20150424L, 20150424L), totcap = structure(c(5.30519039464278e-314,
5.34352625144878e-314, 5.21151503979773e-314, 5.18159473949947e-314,
5.36659973716195e-314, 5.3767197559193e-314, 5.31749562227391e-314,
5.48717086915892e-314, 5.34891674084389e-314, 5.22243170680067e-314,
5.22969347328787e-314, 5.23636617172838e-314), class = "integer64")), .Names = c("Date",
"totcap"), class = c("data.table", "data.frame"), row.names = c(NA,
-12L), .internal.selfref = )
> sum(dt$totcap)
integer64
[1] 128782928014
> dt[,sum(totcap),by=Date]
Date V1
1: 20150422 2.104183e-313
2: 20150423 2.154799e-313
3: 20150424 2.103741e-313

Related

Filtering or matching by exact time

When I try to filter a data frame by an exact time (rather than by a range between two times), it produces NA. I am trying to create a variable in df1 that is based on information in df2.
Here is my data
dput(df2)
structure(list(Time = structure(c(1647531450.72, 1647531451.757,
1647531452.794, 1647531453.83, 1647531454.867, 1647531455.818,
1647531456.854, 1647531457.891, 1647531458.928, 1647531459.878,
1647531460.915, 1647531461.952, 1647531462.902, 1647531463.939,
1647531464.976, 1647531466.013, 1647531467.05, 1647531468, 1647531469.037,
1647531470.074, 1647531471.11, 1647531472.147, 1647531473.098,
1647531474.134, 1647531475.171, 1647531476.208, 1647531477.245,
1647531478.195, 1647531479.232, 1647531480.269, 1647531481.306,
1647531482.342, 1647531483.293, 1647531484.33, 1647531485.366,
1647531486.317, 1647531487.354, 1647531488.39, 1647531489.427,
1647531490.378, 1647531491.414, 1647531492.451, 1647531493.488,
1647531494.438, 1647531495.475, 1647531496.512, 1647531497.549,
1647531498.586, 1647531499.536, 1647531500.573, 1647531501.61,
1647531502.56, 1647531503.597, 1647531504.634, 1647531505.67,
1647531506.621, 1647531507.658, 1647531508.694, 1647531509.645
), tzone = "", class = c("POSIXct", "POSIXt")), LAT = c(17.8799454,
17.8799729, 17.8799952, 17.8800159, 17.8800416, 17.8800708, 17.8801,
17.8801292, 17.8801567, 17.8801877, 17.8802237, 17.8802581, 17.8802873,
17.8803148, 17.8803474, 17.8803818, 17.8804161, 17.8804471, 17.8804763,
17.8805089, 17.8805381, 17.880569, 17.8806034, 17.880636, 17.8806721,
17.8807048, 17.8807374, 17.8808061, 17.8808405, 17.8808783, 17.8808783,
17.8809556, 17.8809968, 17.8810346, 17.8810724, 17.8810724, 17.8811497,
17.8811892, 17.8812288, 17.88127, 17.8813112, 17.8813524, 17.8813954,
17.8814383, 17.8814813, 17.8815208, 17.8815603, 17.8815964, 17.8816359,
17.8816737, 17.8817132, 17.8817562, 17.8817974, 17.8818438, 17.8818885,
17.8819314, 17.8819726, 17.8820104, 17.88205), LON = c(-62.8613544,
-62.8613338, -62.8613063, -62.8612857, -62.861265, -62.8612513,
-62.8612307, -62.8612101, -62.8611894, -62.8611757, -62.8611688,
-62.8611482, -62.8611276, -62.8611139, -62.8611001, -62.8610795,
-62.8610658, -62.861052, -62.8610314, -62.8610176, -62.860997,
-62.8609833, -62.8609627, -62.8609489, -62.8609352, -62.8609214,
-62.8609008, -62.8608733, -62.8608665, -62.8608596, -62.8608596,
-62.8608459, -62.860839, -62.8608321, -62.8608252, -62.8608252,
-62.8608115, -62.8608115, -62.8608115, -62.8608115, -62.8608115,
-62.8608115, -62.8608115, -62.8608115, -62.8608115, -62.8608046,
-62.8607977, -62.8607909, -62.860784, -62.8607771, -62.8607771,
-62.8607771, -62.8607771, -62.8607771, -62.8607771, -62.8607703,
-62.8607634, -62.8607496, -62.8607428)), class = "data.frame", row.names = c(NA,
-59L))
and the df containing the information I want to filter by
dput(df1)
structure(list(date = structure(19068, class = "Date"), RaceStartTime = structure(1647531480, tzone = "", class = c("POSIXct",
"POSIXt"))), class = "data.frame", row.names = "event.2")
I have tried the following
df1$lon <- df2$LON[match(df1$RaceStartTime, df2$Time)]
I have also tried
df1$lon <- df2%>%
filter(Time == df1$RaceStartTime)
Both of these produce empty rows. Can someone point out the obvious mistake?
EDIT:
The structure appears the same
str(df1$RaceStartTime)
POSIXct[1:1], format: "2022-03-17 15:38:00"
str(df2$Time)
POSIXct[1:59], format: "2022-03-17 15:37:30"
Thanks
POSIXct format by default only prints whole seconds, but its underlying representation can contain fractional seconds (as your data in df2 does).
You can remove the fractional seconds by doing:
df2$Time <- lubridate::floor_date(df2$Time)
So now you get:
df2%>%
filter(Time == df1$RaceStartTime)
#> Time LAT LON
#> 1 2022-03-17 15:38:00 17.88088 -62.86086

Use of purrr's "modify_if" with a function

I'm trying to apply the discretize_rgr function (here) of the package funModeling to multiple columns of a dataframe.
For a single column, it is working for me in this way:
discretize_rgr(input = df.div$to_be_discretized, target = df.div$TARGET, max_n_bins=10)
So, I'm trying to use the purrr package to manage multiple columns in this way:
df.div %>%
modify_if( is.numeric, ~ discretize_rgr(., target = df.div$TARGET, max_n_bins=10))
but I get the following error:
Error in order(fpoints_top) : argument 1 is not a vector
What's wrong?
UPDATE (example data)
structure(list(to_be_discretized = c(0.0152096300012854, 0.0132660373578711,
0.014699121782711, 0.0157102877064037, 0.0197417484744586, 0.019651999420645
), TARGET = c(27136, 30048, 34840, 138812, 191088, 240370)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L))

how to predict using var with exogenous variables in R

I have the following data:
library(data.table)
modelling_dt_train <- structure(list(`1` = c(54593L, 74481L, 85566L, 97637L, 101081L,
184089L, 158895L, 153780L, 153681L, 157188L, 142216L, 136437L,
135501L, 111264L, 123259L, 110397L, 146034L, 162900L, 132499L,
121516L, 119651L, 114045L, 112551L, 123209L, 134930L, 132147L,
151327L, 155666L, 158538L, 205766L, 200407L, 219588L, 231954L,
179884L, 159121L, 156148L, 136191L, 132956L, 202086L, 141047L,
118490L, 116595L, 127620L, 135962L, 137419L, 127334L, 158804L,
139142L, 181773L, 228278L, 272373L, 186666L, 148791L, 143608L,
169634L, 188149L, 239867L, 332543L, 253463L, 240574L, 237245L,
275466L, 262755L, 241538L, 303377L),
`2` = c(148181L, 186894L,
243357L, 227298L, 195640L, 412137L, 363152L, 355169L, 296208L,
328993L, 281652L, 308027L, 316254L, 249293L, 320821L, 220521L,
284411L, 263807L, 258093L, 261060L, 320153L, 311547L, 279734L,
258453L, 269697L, 313700L, 255285L, 232495L, 305346L, 393256L,
390655L, 527039L, 529056L, 450689L, 425190L, 372144L, 303765L,
324658L, 365035L, 285178L, 230985L, 251308L, 290378L, 279595L,
294676L, 391377L, 445682L, 364056L, 441207L, 516852L, 673401L,
415677L, 304000L, 266365L, 311924L, 314192L, 407313L, 664519L,
456920L, 384978L, 351644L, 432627L, 409624L, 386330L, 487679L
),
`3` = c(60217L, 66492L, 66675L, 76400L, 117252L, 264527L,
256384L, 241815L, 187115L, 193106L, 177620L, 140833L, 188291L,
110069L, 163581L, 107650L, 118319L, 118821L, 122383L, 117267L,
134962L, 121227L, 124952L, 111740L, 137493L, 163895L, 60653L,
69311L, 88810L, 128620L, 132077L, 153399L, 162989L, 151866L,
127325L, 122813L, 115284L, 103765L, 113185L, 101607L, 92379L,
98646L, 94376L, 98069L, 98972L, 103074L, 142199L, 123497L, 141823L,
205582L, 251187L, 109603L, 80711L, 80799L, 84175L, 104965L, 181221L,
245377L, 201378L, 235504L, 188925L, 214614L, 220312L, 191591L,
203292L)),
.Names = c("1", "2", "3"), class = c("data.table",
"data.frame"), row.names = c(NA, -65L))
modelling_x_train <- structure(list(`1` = c(1982134L, 1968327L, 2019222L, 2025126L,
2033065L, 2188202L, 2066808L, 2070103L, 2041154L, 2201142L, 2105848L,
2067669L, 2005707L, 2239632L, 2435928L, 2363759L, 2444016L, 2556139L,
2807283L, 2674632L, 2687984L, 2889011L, 2839239L, 2712064L, 2928420L,
2889533L, 3106868L, 2746471L, 2953436L, 3225171L, 2926874L, 2914124L,
3210355L, 2847523L, 2890636L, 3268445L, 2941468L, 2931027L, 2906610L,
3222324L, 2833093L, 2978953L, 3196315L, 3055240L, 3210672L, 3368890L,
3046191L, 2960181L, 3341146L, 3227672L, 3062702L, 3197227L, 3445476L,
3441273L, 3651232L, 3566179L, 3619685L, 3716756L, 3600666L, 3732533L,
3695464L, 3857145L, 3700072L, 3608183L, 3904237L),
`2` = c(4082316L,
4644387L, 5230567L, 5115720L, 4729153L, 5658227L, 5492034L, 5443022L,
5094415L, 5939637L, 5354626L, 5509783L, 5438960L, 4912936L, 5736293L,
5167632L, 5244341L, 5580274L, 5750346L, 5358527L, 5916955L, 6129790L,
5245982L, 5801479L, 5683117L, 5721551L, 6972176L, 7072498L, 7979325L,
8324202L, 7434885L, 8189438L, 8062609L, 7658496L, 8066643L, 8528136L,
7515745L, 8276800L, 8227022L, 6523804L, 5780869L, 6481060L, 6912797L,
6276934L, 6592158L, 6908732L, 6067945L, 6459707L, 6910377L, 6645470L,
6538196L, 6694136L, 7484290L, 7299620L, 8532078L, 7713988L, 7256825L,
8237839L, 7834919L, 7725377L, 7291804L, 8224205L, 7784470L, 7514557L,
8164590L),
`3` = c(3181556L, 3232260L, 3272852L, 3233534L, 2876956L,
2979204L, 3275916L, 3345278L, 2951867L, 2976889L, 3289397L, 2955148L,
3306653L, 1861934L, 2239827L, 2207356L, 2335514L, 2387791L, 2592206L,
2371527L, 2586856L, 2447660L, 2322218L, 2342827L, 2666258L, 2627928L,
2525534L, 2521129L, 2573991L, 2752528L, 2538251L, 2676848L, 2802139L,
2702108L, 2630417L, 2778233L, 2725544L, 2723849L, 2795745L, 1954820L,
1842684L, 2132844L, 2182141L, 2041725L, 2355857L, 2414334L, 2350885L,
2367547L, 2436918L, 2328244L, 2390647L, 2460700L, 3081623L, 2877487L,
3025104L, 3108909L, 3172441L, 3267766L, 3354357L, 3273165L, 3322516L,
3342817L, 3413854L, 3217624L, 2736617L)),
.Names = c("1", "2",
"3"), class = c("data.table", "data.frame"), row.names = c(NA,
-65L))
where modelling_dt_train is the time series of 3 products and modelling_x_train is an exogenous variable (which is also a time series) for the same products.
I am estimating a VAR model using the following code
library(vars)
x <- log(modelling_dt_train)
x <- x[,lapply(.SD,function(x){ifelse(is.infinite(x),0,x)})]
modelling_x_train <- log(modelling_x_train)
modelling_x_train <- modelling_x_train[,lapply(.SD,function(x){ifelse(is.infinite(x),0,x)})]
x_mat <- as.matrix(x)
dx <- x_mat
var = VAR(dx, p=p, exogen = modelling_x_train, season = 18)
So far so good, but when I want to predict the values for 12 periods using
predict(var, newdata = modelling_dt_test, dumvar = modelling_x_test, n.ahead = 12)
I get an error:
Error in predict.varest(var, newdata = modelling_dt_test, dumvar = modelling_x_test, :
Column names of dumvar do not coincide with exogen.
The newdata and the dumvar that I am using are future values for the same products, i.e. observations from later in time:
modelling_x_test <- structure(list(`1` = c(4447896L, 4779229L, 4628391L, 4737933L,
5102152L, 4838918L, 4955183L, 5258605L, 5084001L, 4798945L, 5204015L,
5129690L, 5101568L),
`2` = c(6108187L, 6733956L, 7065148L, 7111155L,
6513151L, 7622806L, 7062042L, 7206067L, 7144091L, 7412266L, 6752614L,
7705255L, 7487054L),
`3` = c(1716975L, 2022198L, 2122109L, 2155489L,
2428639L, 2433860L, 2717315L, 2471655L, 2795100L, 2908946L, 2581813L,
2633578L, 2666302L)),
.Names = c("1", "2", "3"), class = c("data.table",
"data.frame"), row.names = c(NA, -13L))
modelling_dt_test <- structure(list(`1` = c(244876L, 275993L, 256180L, 321256L, 316042L,
275097L, 250842L, 245543L, 233386L, 218958L, 254270L, 238804L,
234079L),
`2` = c(375278L, 429496L, 478816L, 532311L, 442922L,
485787L, 460750L, 501956L, 454178L, 425800L, 413112L, 434328L,
446069L),
`3` = c(119577L, 139870L, 127951L, 125017L, 138176L,
114517L, 129880L, 120941L, 159176L, 157890L, 149554L, 144210L,
165979L)),
.Names = c("1", "2", "3"), class = c("data.table",
"data.frame"), row.names = c(NA, -13L))
EDIT
In the source code (here), line 58 contains this check. What that line checks is whether colnames(data.all) (which are "X1" "X2" "X3" "X1.l1" "X2.l1" "X3.l1" "const" "sd1" "sd2" "sd3" "sd4" "sd5" "sd6" "sd7" "sd8" "sd9" "sd10" "sd11" "sd12" "sd13" "sd14" "sd15" "sd16" "sd17" "X1.1" "X2.1" "X3.1", i.e. what look to me like the coefficient names) are equal to colnames(modelling_x_test), which are "1" "2" "3" (the products). How can these ever be equal?

Conserving a monthly sequence in reshape in R

RI_Final
Names = c("Name", "29/01/1992", "29/02/1992",
"29/03/1992", "29/04/1992", "29/05/1992", "29/06/1992", "29/07/1992",
"29/08/1992", "29/09/1992", "29/10/1992", "29/11/1992", "29/12/1992",
"29/01/1993", "28/02/1993", "29/03/1993", "29/04/1993", "29/05/1993",
"29/06/1993", "29/07/1993", "29/08/1993", "29/09/1993", "29/10/1993",
"29/11/1993", "29/12/1993", "29/01/1994", "28/02/1994", "29/03/1994",
"29/04/1994", "29/05/1994", "29/06/1994", "29/07/1994", "29/08/1994",
"29/09/1994", "29/10/1994", "29/11/1994", "29/12/1994", "29/01/1995",
"28/02/1995", "29/03/1995", "29/04/1995", "29/05/1995", "29/06/1995",
"29/07/1995", "29/08/1995", "29/09/1995", "29/10/1995", "29/11/1995",
"29/12/1995", "29/01/1996", "29/02/1996", "29/03/1996", "29/04/1996",
"29/05/1996", "29/06/1996", "29/07/1996", "29/08/1996", "29/09/1996",
"29/10/1996", "29/11/1996", "29/12/1996", "29/01/1997", "28/02/1997",
"29/03/1997", "29/04/1997", "29/05/1997", "29/06/1997", "29/07/1997",
"29/08/1997", "29/09/1997", "29/10/1997", "29/11/1997", "29/12/1997",
"29/01/1998", "28/02/1998", "29/03/1998", "29/04/1998", "29/05/1998",
"29/06/1998", "29/07/1998", "29/08/1998", "29/09/1998", "29/10/1998",
"29/11/1998", "29/12/1998", "29/01/1999", "28/02/1999", "29/03/1999",
"29/04/1999", "29/05/1999", "29/06/1999", "29/07/1999", "29/08/1999",
"29/09/1999", "29/10/1999", "29/11/1999", "29/12/1999", "29/01/2000",
"29/02/2000", "29/03/2000", "29/04/2000", "29/05/2000", "29/06/2000",
"29/07/2000", "29/08/2000", "29/09/2000", "29/10/2000", "29/11/2000",
"29/12/2000"))), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"), .internal.selfref = <pointer: (nil)>)
I have a data frame that I want to reshape into a panel (long-format) data frame. This was my attempt:
reshaped.RI_Final<- reshape(RI_Final,
direction="long",
varying=list(names(RI_Final)[2:290]),
v.names="Value",
timevar="Month")
The variables in columns 2:290 are a monthly sequence from 02/1992 to 12/2015. With my solution, the resulting Month column does not contain the dates.

Unexpected behavior in data.table join when using nomatch and allow.cartesian

I have 2 data tables and am trying to get a column cor from cortable into finaltable.
cortable
cor,tickerkey
0.7539,AAL_AAN
0.573,AAL_ABB
0.6384,AAL_ACM
0.7193,AAL_ACXM
0.8386,AAL_ADP
0.7392,AAL_ADT
0.732,AAL_AER
0.4805,AAL_AGCO
0.9363,AAL_AL
0.9064,AAL_ALK
0.7545,AAL_ALSN
0.8586,AAL_AME
0.3356,AAL_AMT
0.8239,AAL_AN
0.8637,AAL_AOS
0.7638,AAL_APD
0.7915,AAL_APH
0.8785,AAL_APOL
0.8073,AAL_ARMH
0.7744,AAL_ASH
0.4179,AAL_ATLS
0.8282,AAL_AWI
-0.2539,AAL_AWK
0.8213,AAL_AXLL
0.827,AAL_BA
0.8642,AAL_BC
0.7982,AAL_BCO
0.2002,AAL_BEAV
0.7079,AAL_BERY
0.858,AAL_BGC
0.5943,AAL_BRK.B
0.1522,AAL_BWC
0.2793,AAL_CAR
0.8537,AAL_CAT
0.9115,AAL_CBI
dput
cortable<-structure(list(cor = c("0.7539", "0.573", "0.6384", "0.7193",
"0.8386", "0.7392", "0.732", "0.4805", "0.9363", "0.9064", "0.7545",
"0.8586", "0.3356", "0.8239", "0.8637", "0.7638", "0.7915", "0.8785",
"0.8073", "0.7744", "0.4179", "0.8282", "-0.2539", "0.8213",
"0.827", "0.8642", "0.7982", "0.2002", "0.7079", "0.858", "0.5943",
"0.1522", "0.2793", "0.8537", "0.9115"),
tickerkey = c("AAL_AAN", "AAL_ABB", "AAL_ACM", "AAL_ACXM", "AAL_ADP", "AAL_ADT", "AAL_AER",
"AAL_AGCO", "AAL_AL", "AAL_ALK", "AAL_ALSN", "AAL_AME", "AAL_AMT",
"AAL_AN", "AAL_AOS", "AAL_APD", "AAL_APH", "AAL_APOL", "AAL_ARMH",
"AAL_ASH", "AAL_ATLS", "AAL_AWI", "AAL_AWK", "AAL_AXLL", "AAL_BA",
"AAL_BC", "AAL_BCO", "AAL_BEAV", "AAL_BERY", "AAL_BGC", "AAL_BRK.B",
"AAL_BWC", "AAL_CAR", "AAL_CAT", "AAL_CBI")), .Names = c("cor",
"tickerkey"), row.names = c(NA, -35L), class = c("data.table",
"data.frame"), sorted = "tickerkey")
finaltable
tickerkey,ticker1,ticker2
AAL_ALK,AAL,ALK
AAL_CAR,AAL,CAR
AAL_CHRW,AAL,CHRW
AAL_CNW,AAL,CNW
AAL_CSX,AAL,CSX
AAL_DAL,AAL,DAL
AAL_EXPD,AAL,EXPD
AAL_FDX,AAL,FDX
AAL_HTZ,AAL,HTZ
AAL_JBHT,AAL,JBHT
dput
finaltable<-structure(list(tickerkey = c("AAL_ALK", "AAL_CAR", "AAL_CHRW",
"AAL_CNW", "AAL_CSX", "AAL_DAL", "AAL_EXPD", "AAL_FDX", "AAL_HTZ",
"AAL_JBHT"), ticker1 = c("AAL", "AAL", "AAL", "AAL", "AAL", "AAL",
"AAL", "AAL", "AAL", "AAL"), ticker2 = c("ALK", "CAR", "CHRW",
"CNW", "CSX", "DAL", "EXPD", "FDX", "HTZ", "JBHT")), .Names = c("tickerkey",
"ticker1", "ticker2"), row.names = c(NA, -10L), class = c("data.table",
"data.frame"), sorted = "tickerkey")
I am trying to achieve that with the code as below.
setkey(cortable, "tickerkey")
setkey(finaltable, "tickerkey")
finaltable[cortable,cor:=cor,allow.cartesian=TRUE,nomatch=0]
The correct expected output would be finaltable
tickerkey,ticker1,ticker2,cor
AAL_ALK,AAL,ALK,0.9064
AAL_CAR,AAL,CAR,0.2793
with the rest of the rows having value of NA for cor
but it gives an output
finaltable
tickerkey,ticker1,ticker2,cor
AAL_ALK,AAL,ALK,0.2793
AAL_CAR,AAL,CAR,0.9064
with the rest of the rows NA for cor, and a warning on execution: In [.data.table(finaltable, cortable, :=(cor, cor), allow.cartesian = TRUE, : Supplied 2 items to be assigned to 35 items of column 'cor' (recycled leaving remainder of 1 items).
If I remove nomatch argument, the mismatch doesn't happen.
I am trying to find out exactly what's causing this behavior, as I have a lot of code/data analysis that uses this pattern, and not knowing the exact cause of this apparently unexpected behavior undermines my confidence in all the data generated so far.
I tried to look into the definition/behaviour of nomatch, but didn't find much in the context of the above usage. If anyone could give some explanation, it would be very helpful.
This should work:
merge(cortable, finaltable, by=c('tickerkey'))[,list(tickerkey,ticker1,ticker2,cor)]
or you could do
cortable[finaltable][!is.na(cor)][,list(tickerkey,ticker1,ticker2,cor)]
The latter approach assumes you've set keys, whereas the first works regardless of whether the keys are set.

Resources