Running aggregate function within dmapply (ddR package) - r

I would like to run the aggregate function within the dmapply function as offered through the ddR package.
Desired results
The desired results reflect a simple output generated via aggregate in base:
aggregate(
x = mtcars$mpg,
FUN = function(x) {
mean(x, na.rm = TRUE)
},
by = list(trans = mtcars$am)
)
which produces:
trans x
1 0 17.14737
2 1 24.39231
Attempt - ddmapply
I would like to arrive at the same results while utilising ddmapply, as attempted below:
# ddR
require(ddR)
# ddR object creation
distMtcars <- as.dframe(mtcars)
# Aggregate / ddmapply
dmapply(
FUN = function(x, y) {
aggregate(FUN = mean(x, na.rm = TRUE),
x = x,
by = list(trans = y))
},
distMtcars$mpg,
y = distMtcars$am,
output.type = "dframe",
combine = "rbind"
)
The code fails:
Error in match.fun(FUN) : 'mean(x, na.rm = TRUE)' is not a
function, character or symbol Called from: match.fun(FUN)
Updates
Fixing error pointed out by #Mike removes the error, however, does not produce the desired result. The code:
# Avoid namespace conflict with other packages
ddR::collect(
dmapply(
FUN = function(x, y) {
aggregate(
FUN = function(x) {
mean(x, na.rm = TRUE)
},
x = x,
by = list(trans = y)
)
},
distMtcars$mpg,
y = distMtcars$am,
output.type = "dframe",
combine = "rbind"
)
)
yields:
[1] trans x
<0 rows> (or 0-length row.names)

It works fine for me if you change your aggregate function to be consistent with the one you call earlier: FUN = function(x) mean(x, na.rm = T). The reason it can't find mean(x, na.rm = T) is because it isn't a function (it's a function call), rather mean is a function.
Also it will give you NA results unless you change your x = distMtcars$mpg to x = collect(distMtcars)$mpg. Same for y. With all that said, I think this should work for you:
res <-dmapply(
FUN = function(x, y) {
aggregate(FUN = function(x) mean(x, na.rm = TRUE),
x = x,
by = list(trans = y))
},
x = list(collect(distMtcars)$mpg),
y = list(collect(distMtcars)$am),
output.type = "dframe",
combine = "rbind"
)
Then you can do collect(res) to see the result.
collect(res)
# trans x
#1 0 17.14737
#2 1 24.39231

Related

Can't convert from <double> to <integer> due to loss of precision

Tried running this code and I am getting this error message:
"Must subset columns with a valid subscript vector. Can't convert from double to integer due to loss of precision." Could someone help either fix or convert so it recognizes the dataframe columns appropriately
data1 <- wins.df(data1, data1$q, wins.limits = c(.01, .99), append.wins.label = FALSE, verbose = TRUE)
Here is the function:
wins.df <-
function(X,
var,
wins.limits = c(.01, .99),
append.wins.label = TRUE,
verbose = TRUE) {
Y <- X
x <- X[, var]
x.w <- wins(x, wins.limits)
var.w <- var
if (append.wins.label)
var.w <- paste(var, ".w", sep = "")
Y[, var.w] <- x.w
if (verbose) {
print(summary(Y[, var.w])) print(summary(X[, var]))
}
return(Y)
}

r how to keep print method for custom class

i have defined a method for printing a vector with the class test:
print.test <- function(x, ...) {
x <- formatC(
as.numeric(x),
format = "f",
big.mark = ".",
decimal.mark = ",",
digits = 1
)
x[x == "NA"] <- "-"
x[x == "NaN"] <- "-"
print.default(x)
}
which works fine for the following
a <- c(1000.11, 2000.22, 3000.33)
class(a) <- c("test", class(a))
print(a)
[1] "1.000,11" "2.000,22" "3.000,33"
this also works:
round(a)
[1] "1.000,0" "2.000,0" "3.000,0"
this does not:
median(a)
[1] 2000.22
class(median(a))
[1] "numeric"
now my question is: do i need to write a custom method for this class to use median e.g. and if so what would it look like or is there another way (as i simply would like this class to print the data in a certain format)?
The problem is that median.default returns an object of class numeric therefore autoprinting of the returned object does not call your custom print method.
The following will do so.
median.test <- function(x, na.rm = FALSE, ...){
y <- NextMethod(x, na.rm = na.rm, ...)
class(y) <- c("test", class(y))
y
}
median(a)
#[1] "2.000,2"
As for the handling of NA values, I will first define another method for a base R function. It is not strictly needed but save some code lines if objects of class test are used frequently.
c.test <- function(x, ...){
y <- NextMethod(x, ...)
class(y) <- c("test", class(y))
y
}
b <- c(a, NA)
class(b)
#[1] "test" "numeric"
median(b)
#[1] "-"
median(b, na.rm = TRUE)
#[1] "2.000,2"
EDIT.
The following defines a generic function wMedian, a default method and a method for objects of class "currency", as requested by the OP in a comment.
Note that there must be a method print.currency, which I don't redefine since it's exactly the same as print.test above. As for the other methods, I have made them simpler with the help of a new function, as.currency.
median.currency <- function(x, na.rm = FALSE, ...){
y <- NextMethod(x, na.rm = na.rm, ...)
as.currency(y)
}
c.currency <- function(x, ...){
y <- NextMethod(x, ...)
as.currency(y)
}
as.currency <- function(x){
class(x) <- c("currency", class(x))
x
}
wMedian <- function(x, ...) UseMethod("wMedian")
wMedian.default <- function(x, ...){
matrixStats::weightedMedian(x, ...)
}
wMedian.currency <- function(x, w = NULL, idxs = NULL, na.rm = FALSE, interpolate = is.null(ties), ties = NULL, ...) {
y <- NextMethod(x, w = w, idxs = idxs, na.rm = na.rm, interpolate = interpolate, ties = ties, ... )
as.currency(y)
}
set.seed(1)
x <- rnorm(10)
wMedian(x, w = (1:10)/10)
#[1] 0.4084684
wMedian(as.currency(x), w = (1:10)/10)
#[1] "0,4"

Remove an argument / element from ellipsis

I want to drop an element from an ellipsis because I want to set it manaully inside my function. The example below illustrates the point as the error is sensible given I'm passing na.rm in manually and via ellipsis. How can I remove na.rm from the ellipsis so that the error isn't thrown?
mymean <- function(x, ...){
dots <- list(...)
if (is.null(dots$na.rm)) {
na_arg <- TRUE
} else {
na_arg <- dots$na.rm
}
mean(x, na.rm = na_arg, ...)
}
set.seed(10); a <- c(rnorm(20), NA)
mymean(a)
## [1] -0.06053267
mymean(a, trim = .5)
## [1] -0.08275319
mymean(a, na.rm = TRUE, trim = .1)
## Error in mean.default(x, na.rm = na_arg, ...) :
## formal argument "na.rm" matched by multiple actual arguments
1) lm Look at the source code of lm. Following that:
mymean1 <- function(x, ...){
mc <- match.call()
mc[[1]] <- as.name("mean")
if (is.null(mc$na.rm)) mc$na.rm <- TRUE
eval(mc, parent.frame())
}
mymean1(a, na.rm = TRUE, trim = .1)
## [1] -0.01886715
mymean1(a, trim = .1)
## [1] -0.01886715
2) do.call This would also work although there is a subtle difference, namely ... gets evaluated. In this and most cases it won't matter but it might in some less usual cases.
mymean2 <- function(x, ...) {
dots <- list(...)
if (! "na.rm" %in% names(dots)) dots$na.rm <- TRUE
do.call("mean", c(list(x), dots))
}
mymean2(a, na.rm = TRUE, trim = .1)
## [1] -0.01886715
mymean2(a, trim = .1)
## [1] -0.01886715
3) ordinary arg However, it would be easier to just write:
mymean3 <- function(x, na.rm = TRUE, ...) mean(x, na.rm = na.rm, ...)
mymean3(a, na.rm = TRUE, trim = .1)
## [1] -0.01886715
mymean3(a, trim = .1)
## [1] -0.01886715
This is the usual way wrappers with different defaults are written. For example see the read.csv source code.

what does the "data.frame$.variable" mean in a function?

I have seen a function:
smotest <- list(name = "SMOTE with more neighbors!",
func = function (x, y) {
library(DMwR)
dat <- if (is.data.frame(x)) x else as.data.frame(x)
dat$.y <- y
dat <- SMOTE(.y ~ ., data = dat, k = 10)
list(x = dat[, !grepl(".y", colnames(dat), fixed = TRUE)],
y = dat$.y)
},
first = TRUE)
In this function, what does thedat$.ymean, eapecially the ".", and why it has a ".".

Clean, simple function factories in R

Short example. I am exploring the behavior of a function by testing it with different "specs", f(spec). I wrote down one spec by hand, spec1, and am creating new specs as variations on it. To do this, I decided to write a function:
spec1 = list(fy = list(a = 1), fx = list(f1 = function(x) 10-x, f2 = function(x) 2-x))
make_spec = function(f = function(x) 10-x, xtheta = 2)
list(fy = list(a = 1), fx = list(f1 = f, f2 = function(x) xtheta-x))
res1 = make_spec()
# first problem: they don't match
all.equal(res1,spec1)
# [1] "Component “fx”: Component “f2”: target, current do not match when deparsed"
# ^ this happens, even though...
res1$fx$f2(4) == spec1$fx$f2(4)
# TRUE
# second problem: res1 is fugly
res1
# $fy
# $fy$a
# [1] 1
#
#
# $fx
# $fx$f1
# function (x)
# 10 - x
# <environment: 0x000000000f8f2e20>
#
# $fx$f2
# function (x)
# xtheta - x
# <environment: 0x000000000f8f2e20>
str(res1)
# even worse
My goals for make_spec are...
all.equal(spec1, res1) and/or identical(spec1, res1)
for str(res1) to be human-readable (no <environment: ptr> tags or srcfilecopy)
to avoid substitute and eval altogether if possible (not a high priority)
to avoid writing out the second arg of substitute (see "full" example below)
Is there an idiomatic way to achieve some or all of these goals?
Full example. I'm not sure if the example above fully covers my use case, so here's the latter:
spec0 = list(
v_dist = list(
pdf = function(x) 1,
cdf = function(x) x,
q = function(x) x,
supp = c(0,1)
)
,
ucondv_dist = {
ucondv_dist = list()
ucondv_dist$condmean = function(v) 10-v
ucondv_dist$pdf = function(u,v) dnorm(u, ucondv_dist$condmean(v), 50)
ucondv_dist$cdf = function(u,v) pnorm(u, ucondv_dist$condmean(v), 50)
ucondv_dist
}
)
make_spec = function(ycondx_condmean = function(x) 10-x, ycondx_sd = 50){
s = substitute(list(
x_dist = list(
pdf = function(x) 1,
cdf = function(x) x,
q = function(x) x,
supp = c(0,1)
)
,
ycondx_dist = {
ycondx_dist = list()
ycondx_dist$condmean = ycondx_condmean
ycondx_dist$pdf = function(u,v) dnorm(u, ycondx_dist$condmean(v), ycondx_sd)
ycondx_dist$cdf = function(u,v) pnorm(u, ycondx_dist$condmean(v), ycondx_sd)
ycondx_dist
}
)
, list(ycondx_condmean=ycondx_condmean, ycondx_sd = ycondx_sd))
eval(s, .GlobalEnv)
}
res0 = make_spec()
Side note. I don't know if "function factory" is the right term here, since I am not a computer scientist, but it seems related. I found only a paragraph on the concept related to R.
The enclosing environments of the functions are different leading to the difference in output/difference in deparsing. So, there are two things to do to get the desired output:
make the environments the same
substitute the variables from the enclosing environments into the function bodies.
However, doing it this way you get a double dose of the eval/substitute you didn't want, so maybe there would be an alternative.
make_spec <- function(f = function(x) 10-x, xtheta = 2) {
e <- parent.frame()
fixClosure <- function(func)
eval(eval(substitute(substitute(func)), parent.frame()), e)
list(fy = list(a = 1), fx = list(
f1 = fixClosure(f),
f2 = fixClosure(function(x) xtheta-x)
))
}
spec1 <- list(fy = list(a = 1), fx = list(f1 = function(x) 10-x, f2 = function(x) 2-x))
res1 <- make_spec()
all.equal(res1, spec1)
[1] TRUE

Resources