Calculating columns in data.table with variable - r

I'm trying to calculate columns in a data.table having the calculation passed by variable. The following is the same as what I'm trying to achieve:
dt <- data.table(mpg)
dt[, list(manufacturer, model, mpg_cyl_cty=cty/cyl, mpg_cyl_hwy=hwy/cyl)]
where I want mpg_cyl_cty=cty/cyl, mpg_cyl_hwy=hwy/cyl to come from a variable like:
var <- c('mpg_cyl_cty=cty/cyl', 'mpg_cyl_hwy=hwy/cyl')
dt[, list(manufacturer, model, var)]
I guess there are more problems to this as what type var should be assigned (c or list) and how dt is called, via list or c.
Hope somebody has a suggestion as I'm not finding anything on the WWW.

library(ggplot2)
library(data.table)
dt <- data.table(mpg)
# The original calculation
dt1 <- dt[, list(manufacturer, model, mpg_cyl_cty=cty/cyl, mpg_cyl_hwy=hwy/cyl)]
var <- c('mpg_cyl_cty=cty/cyl', 'mpg_cyl_hwy=hwy/cyl')
# create a string to pass for evaluation
expr <- paste0("`:=`(", paste0(var, collapse = ", "), ")")
dt2 <- dt[,
.(manufacturer, model, cty, cyl, hwy)
][, eval(parse(text = expr)) # evaluate the expression
][, c("cty", "cyl", "hwy") := NULL] # delete unnecessary columns
> print(all.equal(dt1, dt2))
[1] TRUE

Slightly different approach to avoid eval(parse(.)) and operate on language objects.
Instead of c('mpg_cyl_cty=cty/cyl', 'mpg_cyl_hwy=hwy/cyl') it takes just c("cty","hwy") input.
library(data.table)
dt = as.data.table(ggplot2::mpg)
r.expected = dt[, list(manufacturer, model, mpg_cyl_cty=cty/cyl, mpg_cyl_hwy=hwy/cyl)]
cyl.ratio.j = function(var){
substitute(lhs := rhs, list(
lhs = as.name(paste0("mpg_cyl_", var)),
rhs = call("/", as.name(var), as.name("cyl"))
))
}
r = dt[, eval(cyl.ratio.j("cty"))
][, eval(cyl.ratio.j("hwy"))
][, .SD, .SDcols = c("manufacturer", "model", paste0("mpg_cyl_", c("cty","hwy")))]
all.equal(r.expected, r)
#[1] TRUE

Related

Unexpected tryCatch behaviour when encountering a warning

I have example data as follows:
library(data.table)
set.seed(1)
DT <- data.table(panelID = sample(50,50), # Creates a panel ID
Country = c(rep("Albania",30),rep("Belarus",50), rep("Chilipepper",20)),
some_NA = sample(0:5, 6),
some_NA_factor = sample(0:5, 6),
Group = c(rep(1,20),rep(2,20),rep(3,20),rep(4,20),rep(5,20)),
Time = rep(seq(as.Date("2010-01-03"), length=20, by="1 month") - 1,5),
wt = 15*round(runif(100)/10,2),
Income = round(rnorm(10,-5,5),2),
Happiness = sample(10,10),
Sex = round(rnorm(10,0.75,0.3),2),
Age = sample(100,100),
Educ = round(rnorm(10,0.75,0.3),2))
DT [, uniqueID := .I] # Creates a unique ID # https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
DT$some_NA_factor <- factor(DT$some_NA_factor)
DT$Group <- as.character(DT$Group)
The second DT, DT2, is just a copy of DT with one additional change in the Group column, namely a string value.
DT2 <- copy(DT)
DT2[2,5] <- "something"
What I want to do, to convert columns (in this case colum 5 Group) to numeric if that is possible for most values. The (already working) code to do this is as follows:
# Put object names in the environment in a vector
dfs <- ls()
conv_to_num_check <- function(z) is.character(z) && (mean(grepl("^ *-?[\\d.]+(?:e-?\\d+)?$", z, perl = TRUE), na.rm=TRUE)>0.9)
for (i in seq_along(dfs)) {
fetch_cols <- which(sapply(get(dfs[i]), conv_to_num_check))
setDT(get(dfs[i]))[, (fetch_cols) := lapply(.SD, as.numeric), .SDcols = fetch_cols]
}
But because I thought this might go wrong (because of non data.frames in the environment), I put it in a tryCatch. The behaviour of the tryCatch is however not as I expected, because it ends up not changing DT2.
for (i in seq_along(dfs)) {
tryCatch(
expr = {
fetch_cols <- which(sapply(get(dfs[i]), conv_to_num_check))
print(paste0("The following columns of ", dfs[i], " will be converted (named interger (0) = no columns converted)"))
print(fetch_cols)
print("BEFORE")
print(str(get(dfs[i])))
setDT(get(dfs[i]))[, (fetch_cols) := lapply(.SD, as.numeric), .SDcols = fetch_cols]
print("AFTER")
print(str(get(dfs[i])))
}, error = function(e){ #Do this if an error is caught...
}, warning = function(w){ # Do this if an warning is caught...
}, finally = {# Do this at the end before quitting the tryCatch structure.
}
)
}
Could someone explain to me why this is the case?
Is there a better way to make sure that my code does not crash?

r data.table lapply with multiple SDcols

I am trying to use an lapply that takes two lists of columns into account (all are numeric type) in what would seem to be a simple line of code, but the output is not as expected.
This is my code :
x<-50
measure <- c("haz", "waz", "whz", "htcm", "wtkg", "bmi")
new_measure_1.5 <- paste(measure, "1.5", sep = "_")
new_temp_cols<-paste("temp", new_measure_1.5, sep = "_")
new_columns<-paste(new_measure_1.5, "1", sep="_")
newcols_1.5_months<-function(x, agedays, new_temp_cols, y){
ifelse(agedays==x, new_temp_cols, y)
}
DT[, (new_columns) := lapply(.SD, function(y) newcols_1.5_months(x, agedays, new_temp_cols, y)), .SDcols = new_columns ]
The above code results in new_columns (haz_1.5_1, waz_1.5_1, whz_1.5_1, wtkg_1.5_1, htcm_1.5_1, bmi_1.5_1) holding the names of the columns in the list new_temp_cols (temp_haz_1.5, temp_waz_1.5, temp_whz_1.5, temp_wtkg_1.5, temp_htcm_1.5, temp_bmi_1.5) as opposed to the values they hold in my data table. It seems that R is reading the list as a vector of strings rather than a vector of columns. Why is this?
I've tried using multiple .SDcols, but this doesn't work :
DT[, (new_columns) := lapply(.SD, function(y) newcols_1.5_months(x, agedays, new_temp_cols, y)), .SDcols = c(new_columns, new_temp_cols) ]
Is there a simple fix to this?
**** Editing to add a small subset of dummy data similar to my data table
measure<-c("haz", "waz")
new_measure_1.5 <- paste(measure, "1.5", sep = "_")
new_temp_cols<-paste("temp", new_measure_1.5, sep = "_")
new_columns<-paste(new_measure_1.5, "1", sep="_")
anthro <- data.table
(agedays = c(25,50,53,22,37,50,12,45,50,15,33,50),
temp_haz_1.5 = c(1.2,1.5,1.7,2.0,4.5,6.7,6.8,6.7,4.5,6.6,8.9,6.7),
temp_waz_1.5 = c(3.2,1.8,6.7,2.8,3.5,7.7,9.8,1.7,6.9,3.8,0.9,4.7),
haz_1.5_1 = c(1.2,2.5,4.7,7.0,4.7,6.8,6.3,2.7,5.5,8.6,3.9,6.7),
waz_1.5_1 =c(6.2,2.5,5.7,7.0,2.5,7.7,8.8,9.7,2.5,4.6,5.9,6.7))
If we have multiple columns, then use Map to loop over each of the corresponding sets of columns and apply the function.
library(data.table)
x <- 50
DT[, (new_columns) := Map(function(u, y)
newcols_1.5_months(u, DT[['agedays']], x, y),
.SD[, new_columns, with = FALSE],
.SD[, new_temp_cols, with = FALSE]) ]
Perhaps the function can be
newcols_1.5_months<- function(u, agedays, x, y){
ifelse(agedays==x, u, y)
}
data
DT <- data.table(agedays = c(25,50,53,22,37,50,12,45,50,15,33,50),
temp_haz_1.5 = c(1.2,1.5,1.7,2.0,4.5,6.7,6.8,6.7,4.5,6.6,8.9,6.7),
temp_waz_1.5 = c(3.2,1.8,6.7,2.8,3.5,7.7,9.8,1.7,6.9,3.8,0.9,4.7),
haz_1.5_1 = c(1.2,2.5,4.7,7.0,4.7,6.8,6.3,2.7,5.5,8.6,3.9,6.7),
waz_1.5_1 =c(6.2,2.5,5.7,7.0,2.5,7.7,8.8,9.7,2.5,4.6,5.9,6.7))

R data.table grouped sum for column referenced by name stored in a variable

The problem is as follows: I have a data.table with columns A and B. A summary is required and its name is passed as a character vector in variable var1.
I have tried to find an answer for some time now, see e.g. this and this SO posts. Being unable to find a proper solution, I feel forced to ask this myself.
Now what I want to do is (using data.frame)
tmp[, var1] <- rep(1, nrow(tmp))
tmp <- aggregate(formula(paste(var1, "~ A + B")), tmp, sum)
but I fail to do so with data.table with my last and best effort being
tmp <- tmp[, list(..var1 = .N), by = list(A, B)]
Now, what is wrong with my code and how do I fix it?
And note that I do NOT want to use the := operator, because I want the result to be exactly as it would be from aggregate().
Edit 1: A working example:
library(data.table)
tmp <- data.table(A=c("R","G","G","B","B","B"), B=c(1,1,1,2,1,2))
print(tmp)
var1 <- "C"
tmp[, var1] <- rep(1, nrow(tmp))
tmp2 <- aggregate(formula(paste(var1, "~ A + B")), tmp, sum)
print(tmp2)
tmp3 <- tmp[, list(..var1 = .N), by = list(A, B)]
print(tmp3)
Hope that I did not misread your qn. Here are some options:
1) using base::setNames
DT[, setNames(.(.N), var1), by=.(A, B)]
2) using data.table::setnames
setnames(DT[, .N, by=.(A, B)], "N", var1)[]
3) using base::structure followed by base::as.list
DT[, as.list(structure(.N, names=var1)), by=.(A, B)]
data:
DT <- data.table(A=c(1,1,2,2), B=c(1,1,2,3))
var1 <- "myCol"

r data.table usage in function call

I want to perform a data.table task over and over in a function call: Reduce number of levels for large categorical variables My problem is similar to Data.table and get() command (R) or pass column name in data.table using variable in R but I can't get it to work
Without a function call this works just fine:
# Load data.table
require(data.table)
# Some data
set.seed(1)
dt <- data.table(type = factor(sample(c("A", "B", "C"), 10e3, replace = T)),
weight = rnorm(n = 10e3, mean = 70, sd = 20))
# Decide the minimum frequency a level needs...
min.freq <- 3350
# Levels that don't meet minumum frequency (using data.table)
fail.min.f <- dt[, .N, type][N < min.freq, type]
# Call all these level "Other"
levels(dt$type)[fail.min.f] <- "Other"
but wrapped like
reduceCategorical <- function(variableName, min.freq){
fail.min.f <- dt[, .N, variableName][N < min.freq, variableName]
levels(dt[, variableName][fail.min.f]) <- "Other"
}
I only get errors like:
reduceCategorical(dt$x, 3350)
Fehler in levels(df[, variableName][fail.min.f]) <- "Other" :
trying to set attribute of NULL value
And sometimes
Error is: number of levels differs
One possibility is to define your own re-leveling function using data.table::setattr that will modify dt in place. Something like
DTsetlvls <- function(x, newl)
setattr(x, "levels", c(setdiff(levels(x), newl), rep("other", length(newl))))
Then use it within another predefined function
f <- function(variableName, min.freq){
fail.min.f <- dt[, .N, by = variableName][N < min.freq, get(variableName)]
dt[, DTsetlvls(get(variableName), fail.min.f)]
invisible()
}
f("type", min.freq)
levels(dt$type)
# [1] "C" "other"
Some other data.table alternatives
f <- function(var, min.freq) {
fail.min.f <- dt[, .N, by = var][N < min.freq, get(var)]
dt[get(var) %in% fail.min.f, (var) := "Other"]
dt[, (var) := factor(get(var))]
}
Or using set/.I
f <- function(var, min.freq) {
fail.min.f <- dt[, .I[.N < min.freq], by = var]$V1
set(dt, fail.min.f, var, "other")
set(dt, NULL, var, factor(dt[[var]]))
}
Or combining with base R (doesn't modify original data set)
f <- function(df, variableName, min.freq){
fail.min.f <- df[, .N, by = variableName][N < min.freq, get(variableName)]
levels(df$type)[fail.min.f] <- "Other"
df
}
Alternatively, we could stick we characters instead (if type is a character), you could simply do
f <- function(var, min.freq) dt[, (var) := if(.N < min.freq) "other", by = var]
You are referencing things little differently in the wrapper, to get "type" column name you are using the whole variableName which is actually a vector same with getting levels, you are not using variableName directly as done in function
The error is because value of fail.min.f is coming NULL owing to referencing.

Filter out data.table columns based on summary statistics

I often need to filter out columns with a low variance from a data.table. The column names are not known in advance.
dt = data.table(mtcars)
# calculate standard deviation with arbitrary max value of 1:
mask = dt[,lapply(.SD, function(x) sd(x, na.rm = TRUE) > 1)]
# The columns with the FALSE values in row 1 need to be removed
mask.t = t(mask)
mask.t = which(mask.t)
dt[,mask.t,with=FALSE]
The approach above is clunky. Is there a more elegant way to filter out columns out of a data.table for which the column statistic evaluates to TRUE?
These work:
dt[, .SD, .SDcols=unlist(mask)]
dt[, .SD, .SDcols=which(unlist(mask))]
All together now:
variance.filter = function(df) {
mask = df[,lapply(.SD, function(x) sd(x,na.rm = TRUE) > 1)]
df = df[, .SD, .SDcols = unlist(mask)]
}
EDIT in the current development version of data.table (1.12.9), .SDcols accepts a function filter for columns, so this would work:
variance.filter = function(df) {
df[ , .SD, .SDcols = function(x) sd(x, na.rm = TRUE) > 1]
}

Resources