Unexpected tryCatch behaviour when encountering a warning

I have example data as follows:
library(data.table)
set.seed(1)
DT <- data.table(panelID = sample(50,50), # Creates a panel ID
Country = c(rep("Albania",30),rep("Belarus",50), rep("Chilipepper",20)),
some_NA = sample(0:5, 6),
some_NA_factor = sample(0:5, 6),
Group = c(rep(1,20),rep(2,20),rep(3,20),rep(4,20),rep(5,20)),
Time = rep(seq(as.Date("2010-01-03"), length=20, by="1 month") - 1,5),
wt = 15*round(runif(100)/10,2),
Income = round(rnorm(10,-5,5),2),
Happiness = sample(10,10),
Sex = round(rnorm(10,0.75,0.3),2),
Age = sample(100,100),
Educ = round(rnorm(10,0.75,0.3),2))
DT [, uniqueID := .I] # Creates a unique ID # https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
DT$some_NA_factor <- factor(DT$some_NA_factor)
DT$Group <- as.character(DT$Group)
The second DT, DT2, is just a copy of DT with one additional change in the Group column, namely a string value.
DT2 <- copy(DT)
DT2[2,5] <- "something"
What I want to do is convert columns (in this case column 5, Group) to numeric if that is possible for most of the values. The (already working) code to do this is as follows:
# Put object names in the environment in a vector
dfs <- ls()
conv_to_num_check <- function(z) is.character(z) && (mean(grepl("^ *-?[\\d.]+(?:e-?\\d+)?$", z, perl = TRUE), na.rm=TRUE)>0.9)
for (i in seq_along(dfs)) {
fetch_cols <- which(sapply(get(dfs[i]), conv_to_num_check))
setDT(get(dfs[i]))[, (fetch_cols) := lapply(.SD, as.numeric), .SDcols = fetch_cols]
}
But because I thought this might go wrong (because of non-data.frames in the environment), I put it in a tryCatch. The behaviour of the tryCatch is, however, not what I expected: it ends up not changing DT2.
for (i in seq_along(dfs)) {
tryCatch(
expr = {
fetch_cols <- which(sapply(get(dfs[i]), conv_to_num_check))
print(paste0("The following columns of ", dfs[i], " will be converted (named interger (0) = no columns converted)"))
print(fetch_cols)
print("BEFORE")
print(str(get(dfs[i])))
setDT(get(dfs[i]))[, (fetch_cols) := lapply(.SD, as.numeric), .SDcols = fetch_cols]
print("AFTER")
print(str(get(dfs[i])))
}, error = function(e){ #Do this if an error is caught...
}, warning = function(w){ # Do this if a warning is caught...
}, finally = {# Do this at the end before quitting the tryCatch structure.
}
)
}
Could someone explain to me why this is the case?
Is there a better way to make sure that my code does not crash?
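One possible safeguard, just as a rough sketch of my own (not tested against your full environment): skip objects that are not data.frames up front, and use withCallingHandlers() so a warning is reported but the expression still runs to completion, instead of being unwound the way a tryCatch() warning handler unwinds it.
for (i in seq_along(dfs)) {
  if (!is.data.frame(get(dfs[i]))) next   # ignore functions, vectors, etc. in the environment
  withCallingHandlers({
    fetch_cols <- which(sapply(get(dfs[i]), conv_to_num_check))
    setDT(get(dfs[i]))[, (fetch_cols) := lapply(.SD, as.numeric), .SDcols = fetch_cols]
  }, warning = function(w) {
    message("Warning while converting ", dfs[i], ": ", conditionMessage(w))
    invokeRestart("muffleWarning")        # keep going instead of unwinding the expression
  })
}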

Related

Efficiently apply custom function in specific date ranges to groups

I need to calculate a number of different centrality and spread indicators over multiple timeframes on a relatively large data set (~1 million rows). I have had multiple different tries, but the algorithm that I end up with is still way too slow for my purpose.
Here is my current iteration:
ts_rollapply <- function(COI, DATE_COL, FUN, n, unit = c("day", "week", "month", "year"), verbose = FALSE, ...) {
# Initiate Variables
APPLY_FUNC <- match.fun(FUN = FUN)
LAST_DATE <- last_date(DATE_COL, n = n, unit = match.arg(unit))
result <- vector(mode = "numeric", length = length(COI))
for(i in seq_along(COI)) {
# Extract range from Column of Interest
APPLY_RANGE <- COI[DATE_COL > LAST_DATE[i] & DATE_COL <= DATE_COL[i]]
# Apply function to extracted range
result[i] <- APPLY_FUNC(APPLY_RANGE, ...)
if(verbose && i%%100 == 0) {
ARL <- length(APPLY_RANGE)
writeLines(sprintf("Last Date: %10s, Current Date: %10s, Iteration: %3d, Length: %3d, Mean: %.2f",
LAST_DATE[i], DATE_COL[i], i, ARL, result[i]))
}
}
result
}
Note that I have also made a helper function to extract certain time periods (last_date), which is implemented as follows:
last_date <- function(x, n = 1, unit = c("day", "week", "month", "year")) {
require(lubridate)
# Stop function if x is not Class Date.
if(!is.Date(x)) stop("x is not class: Date")
if(any(is.na(x))) stop("x contains NA")
# Match unit and Perform Calculation
unit <- match.arg(unit)
result <- switch(unit,
day = x - n,
week = x - (7L*n),
month = x %m-% months(n),
year = x %m-% months(12L*n))
result
}
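For reference, a quick illustration of what last_date() returns (the example date is my own):
last_date(as.Date("2015-04-30"), n = 3, unit = "month")
# [1] "2015-01-30"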
The problem I face is that the function works as intended when I run it on a small sample, but it fails (time-wise) when I scale it to the full dataset. And I cannot figure out whether it is my implementation of the function that is slow, or the way in which I call the function in my data.table.
library(data.table)
library(lubridate)
# Functions to apply -- I have multiple others, but these should work as example
functions <- c("mean", "median", "sd")
# Toy Data:
DT <- data.table(store = rep(1:10, each = 1000),
sales = rnorm(n = 10000, mean = 4500, sd = 2500),
date = rep(seq(ymd("2015-01-01"), by = "day", length.out = 1000), 10))
# How i call the ts_rollapply function
DT[, paste("sales_quarter", functions, sep = "_") := lapply(functions, function(x) ts_rollapply(sales, date, x, n = 3, unit = "month", na.rm = T)), store]
Any help on how to speed up my computation would be much appreciated!
One way is to do a non-equi join
DT[, (cols) :=
DT[.(STORE=STORE, START_DATE=DATE - 7L, END_DATE=DATE),
on=.(STORE, DATE>=START_DATE, DATE<=END_DATE),
lapply(functions, function(f) get(f)(SALES)), by=.EACHI][, (1:3) := NULL]
]
A faster way should be to fill in the SALES for all dates and use data.table::frollapply as mentioned in the comments.
res <- DT[DT[, .(DATE=seq(min(DATE), max(DATE), by="1 day")), STORE], on=.(STORE, DATE)][,
(cols) := lapply(functions, function(f) frollapply(SALES, 7L, f, na.rm=TRUE))]
DT[res, on=.(STORE, DATE), names(res) := mget(paste0("i.", names(res)))]
If the above suits your real-life problem, then we can create a function with it.
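A rough sketch of what such a wrapper could look like (my own illustration, simply parameterising the non-equi join above; the column names STORE, SALES, DATE match the data below, and the window argument is in days):
roll_window_stats <- function(dt, funs, window = 7L,
                              cols = paste("sales_roll", funs, sep = "_")) {
  dt[, (cols) :=
       dt[.(STORE = STORE, START_DATE = DATE - window, END_DATE = DATE),
          on = .(STORE, DATE >= START_DATE, DATE <= END_DATE),
          lapply(funs, function(f) get(f)(SALES)), by = .EACHI][, (1:3) := NULL]
  ]
  dt[]
}
# e.g. roll_window_stats(DT, functions, window = 7L)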
data:
library(data.table)
functions <- c("mean", "median", "sd")
nr <- 1e6
DT <- data.table(STORE=rep(1:10, each=nr/10),
SALES=rnorm(nr, 4500, 2500),
DATE=rep(seq(as.IDate("2015-01-01"), by="day", length.out=nr/10), 10))
cols <- paste("sales_quarter", functions, sep = "_")

Extracting vector with certain qualities from longer vector

I have the following data.table:
DT <- data.table(A = c(rep("aa",2),rep("bb",2)),
B = c(rep("H",2),rep("Na",2)),
Low = c(0,3,1,1),
High = c(8,10,9,8),
Time =c("0,1,2,3,4,5,6,7,8,9,10","0,1,2,3,4,5,6,7,8,9,10","0,1,2,3,4,5,6,7,8,9,10","0,1,2,3,4,5,6,7,8,9,10"),
Intensity = c("0,0,0,0,561464,0,0,0,0,0,0","0,0,0,6548,5464,5616,0,0,0,68716,0","5658,12,6548,6541,8,5646854,54565,56465,546,65,0","0,561464,0,0,0,0,0,0,0,0,0")
)
and use this code to extract the highest number of consecutive intensity values above a certain value. For a more detailed explanation of how this calculation works, please see Reading and counting of consecutive points:
newCols <- do.call(rbind, Map(function(u, v, x, y) {
u1 <- as.numeric(u)
v1 <- as.numeric(v)
lb <- which.min(abs(x - u1))
ub <- which.min(abs(y - u1))
v3 <- as.numeric(v[(lb+1):(ub-1)])
i3 = with(rle(v3 > min(as.numeric(v[c(lb, ub)]))),
pmax(max(lengths[values]), 0))
data.frame(Consec.Points.base = i3)
},
strsplit(DT$Time, ","), strsplit(DT$Intensity, ","), DT$Low, DT$High))
DT <- cbind(DT, newCols)
I was wondering how it would be possible, instead of getting the length (Consec.Points.base), to extract the actual points (Time and Intensity) as two vectors?
Thanks a lot in advance!
I think this answers your question, but let me know if I made a mistake, or something needs more thought/clarification.
DT <- data.table(A = c(rep("aa",2),rep("bb",2)),
B = c(rep("H",2),rep("Na",2)),
Low = c(0,3,1,1),
High = c(8,10,9,8),
Time =c("0,1,2,3,4,5,6,7,8,9,10","0,1,2,3,4,5,6,7,8,9,10","0,1,2,3,4,5,6,7,8,9,10","0,1,2,3,4,5,6,7,8,9,10"),
Intensity = c("0,0,0,0,561464,0,0,0,0,0,0","0,0,0,6548,5464,5616,0,0,0,68716,0","5658,12,6548,6541,8,5646854,54565,56465,546,65,0","0,561464,0,0,0,0,0,0,0,0,0")
)
# unique identifier
DT[, i := .I]
# re-structure
DT2 <- DT[, .(Time = as.numeric(strsplit(Time, ",")[[1]]),
Intensity = as.numeric(strsplit(Intensity, ",")[[1]])), by = i]
DT2 <- merge(DT2, DT[, .(i,A,B,Low,High)], by="i")
DT2 <- DT2[between(Time, Low, High, incbounds = FALSE),]
DT2[, IntensityGood := Intensity != min(Intensity), by=i]
# encode each part of sequence with its own value, if not FALSE
encoder <- function(x){
rle.response <- rle(x)
v2 <- rep(0, length(rle.response$values))
v2[rle.response$values!=FALSE] <- which(rle.response$values != FALSE)
rep(v2, rle.response$lengths)
}
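# for illustration (toy input of my own): each non-FALSE run is tagged with its
# position in the rle, FALSE runs get 0
encoder(c(TRUE, TRUE, FALSE, TRUE))
# [1] 1 1 0 3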
DT2[, encodeI := encoder(IntensityGood), by = i]
# remove ones which are all 0, easily handled separately
DT3 <- DT2[, test := all(encodeI==0), by=i][test==FALSE,][, test:=NULL]
# get count - can infer missing are 0
count <- DT3[encodeI!=0, .(max(table(encodeI))), by = i]
# get sequence
findMaxDt <- DT3[encodeI != 0, .N, by=.(i, encodeI)]
DT3 <- merge(DT3, findMaxDt, by=c("i", "encodeI"))
DT3 <- DT3[, Best := N==max(N), by=i]
DT3[Best==TRUE, .(list(Intensity)), by=i]
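To pull out both the Time and Intensity values of the best run rather than just Intensity, the last line can be extended along the same lines (my own extension, not from the original answer):
DT3[Best == TRUE, .(Time = list(Time), Intensity = list(Intensity)), by = i]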

r data.table usage in function call

I want to perform a data.table task over and over in a function call: Reduce number of levels for large categorical variables. My problem is similar to Data.table and get() command (R) or Pass column name in data.table using variable in R, but I can't get it to work.
Without a function call this works just fine:
# Load data.table
require(data.table)
# Some data
set.seed(1)
dt <- data.table(type = factor(sample(c("A", "B", "C"), 10e3, replace = T)),
weight = rnorm(n = 10e3, mean = 70, sd = 20))
# Decide the minimum frequency a level needs...
min.freq <- 3350
# Levels that don't meet minimum frequency (using data.table)
fail.min.f <- dt[, .N, type][N < min.freq, type]
# Call all these level "Other"
levels(dt$type)[fail.min.f] <- "Other"
but wrapped like
reduceCategorical <- function(variableName, min.freq){
fail.min.f <- dt[, .N, variableName][N < min.freq, variableName]
levels(dt[, variableName][fail.min.f]) <- "Other"
}
I only get errors like:
reduceCategorical(dt$x, 3350)
Error in levels(df[, variableName][fail.min.f]) <- "Other" :
trying to set attribute of NULL value
And sometimes the error is: number of levels differs
One possibility is to define your own re-leveling function using data.table::setattr that will modify dt in place. Something like
DTsetlvls <- function(x, newl)
setattr(x, "levels", c(setdiff(levels(x), newl), rep("other", length(newl))))
Then use it within another predefined function
f <- function(variableName, min.freq){
fail.min.f <- dt[, .N, by = variableName][N < min.freq, get(variableName)]
dt[, DTsetlvls(get(variableName), fail.min.f)]
invisible()
}
f("type", min.freq)
levels(dt$type)
# [1] "C" "other"
Some other data.table alternatives
f <- function(var, min.freq) {
fail.min.f <- dt[, .N, by = var][N < min.freq, get(var)]
dt[get(var) %in% fail.min.f, (var) := "Other"]
dt[, (var) := factor(get(var))]
}
Or using set/.I
f <- function(var, min.freq) {
fail.min.f <- dt[, .I[.N < min.freq], by = var]$V1
set(dt, fail.min.f, var, "other")
set(dt, NULL, var, factor(dt[[var]]))
}
Or combining with base R (doesn't modify original data set)
f <- function(df, variableName, min.freq){
fail.min.f <- df[, .N, by = variableName][N < min.freq, get(variableName)]
levels(df$type)[fail.min.f] <- "Other"
df
}
Alternatively, we could stick with characters instead (if type is a character column), in which case you could simply do
f <- function(var, min.freq) dt[, (var) := if(.N < min.freq) "other", by = var]
You are referencing things a little differently in the wrapper: to get the "type" column you are passing the whole vector (dt$x) as variableName rather than the column name, and the same goes for getting the levels; you are not referencing the column directly the way you do outside the function.
The error occurs because the value of fail.min.f ends up NULL owing to that referencing.
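Put together, a minimal sketch of the wrapper with that fixed (my own illustration, reusing the get()-based pattern from the answer above and passing the column name as a string):
reduceCategorical <- function(variableName, min.freq){
  fail.min.f <- dt[, .N, by = variableName][N < min.freq, get(variableName)]
  dt[get(variableName) %in% fail.min.f, (variableName) := "Other"]
  dt[, (variableName) := factor(get(variableName))]   # re-factor to drop unused levels
  invisible(dt)
}
reduceCategorical("type", 3350)   # column name as a string, not dt$x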

Calculating columns in data.table with variable

I'm trying to calculate columns in a data.table having the calculation passed by variable. The following is the same as what I'm trying to achieve:
dt <- data.table(mpg)
dt[, list(manufacturer, model, mpg_cyl_cty=cty/cyl, mpg_cyl_hwy=hwy/cyl)]
where I want mpg_cyl_cty=cty/cyl, mpg_cyl_hwy=hwy/cyl to come from a variable like:
var <- c('mpg_cyl_cty=cty/cyl', 'mpg_cyl_hwy=hwy/cyl')
dt[, list(manufacturer, model, var)]
I guess there are more problems with this, such as what type var should be (c or list) and how dt should be called, via list or c.
Hope somebody has a suggestion as I'm not finding anything on the WWW.
library(ggplot2)
library(data.table)
dt <- data.table(mpg)
# The original calculation
dt1 <- dt[, list(manufacturer, model, mpg_cyl_cty=cty/cyl, mpg_cyl_hwy=hwy/cyl)]
var <- c('mpg_cyl_cty=cty/cyl', 'mpg_cyl_hwy=hwy/cyl')
# create a string to pass for evaluation
expr <- paste0("`:=`(", paste0(var, collapse = ", "), ")")
dt2 <- dt[,
.(manufacturer, model, cty, cyl, hwy)
][, eval(parse(text = expr)) # evaluate the expression
][, c("cty", "cyl", "hwy") := NULL] # delete unnecessary columns
> print(all.equal(dt1, dt2))
[1] TRUE
Slightly different approach to avoid eval(parse(.)) and operate on language objects.
Instead of c('mpg_cyl_cty=cty/cyl', 'mpg_cyl_hwy=hwy/cyl'), it takes just c("cty", "hwy") as input.
library(data.table)
dt = as.data.table(ggplot2::mpg)
r.expected = dt[, list(manufacturer, model, mpg_cyl_cty=cty/cyl, mpg_cyl_hwy=hwy/cyl)]
cyl.ratio.j = function(var){
substitute(lhs := rhs, list(
lhs = as.name(paste0("mpg_cyl_", var)),
rhs = call("/", as.name(var), as.name("cyl"))
))
}
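# for reference (my own illustration): the helper returns an unevaluated := call,
# which is then eval()'d inside the data.table
cyl.ratio.j("cty")
# mpg_cyl_cty := cty/cyl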
r = dt[, eval(cyl.ratio.j("cty"))
][, eval(cyl.ratio.j("hwy"))
][, .SD, .SDcols = c("manufacturer", "model", paste0("mpg_cyl_", c("cty","hwy")))]
all.equal(r.expected, r)
#[1] TRUE

Error with cor.test, results display the following...not enough finite observations

I am trying to run a correlation test, but instead I am getting the following error:
Error in cor.test.default(Indicator, Revpolu) :
  not enough finite observations
What does this mean?
The script stops at table_corr <- setDT(table)[, list(Revpolu, Indicator), by = list(Club, Date)]
The table_corr variable is storing 1161 observations of 4 variables.
library(data.table)
dat <- read.csv("corrtest.csv", header = TRUE)
dat$Date <- as.Date(paste0(format(strptime(as.character(dat$Date),
"%m/%d/%y"),
"%Y/%m"),"/1"))
table <- (setDT(dat)[, list(Revenue = sum(Revenues),
Hours = sum(Hours),
Indicator = mean(Indicator)),
by = list(Club, Date)])
table$Hours[table$Hours == 0] <- NA
table <- table[complete.cases(table), ]
table[, Revpolu := Revenue / Hours]
table_corr <- setDT(table)[, list(Revpolu, Indicator),
by = list(Club, Date)]
testing <- table_corr[, list(Correlation = cor.test(Indicator, Revpolu)[["estimate"]]),
by = Club]
testing <- testing[complete.cases(testing), ]
correl <- testing[, round(Correlation, digits = 2)]
done <- round(mean(correl), digits = 2)
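For what it's worth, cor.test()'s default Pearson method needs at least three complete (finite) pairs in each group, which is what "not enough finite observations" points to. A minimal guard of my own (a sketch, not tested on your data) that skips groups with too few finite pairs:
testing <- table_corr[, {
  ok <- is.finite(Indicator) & is.finite(Revpolu)   # keep only finite pairs
  if (sum(ok) >= 3) {
    list(Correlation = cor.test(Indicator[ok], Revpolu[ok])[["estimate"]])
  } else {
    list(Correlation = NA_real_)                    # too few observations for this Club
  }
}, by = Club]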
