I was wondering if it might be possible to replace my for() loop with an equivalent *apply() family?
I have tried lapply() but I can't get it to work. Is this possible in BASE R?
(dat <- data.frame(id=rep(c("A", "B"), c(2, 6)), mp=c(1, 5, 2, 1, 1, 1, 5, 6), sp=c(.2, .3, .2, .2, .2, .2, .6, .6),
cont=c(F, T, F, F, T, T, T, T), pos=c(1, 1, rep(1:2, 3)),
out=c(1, 1, 1, 1, 1, 1, 2, 2)))
##### for loop:
for (x in split(dat, dat$id)) {
pos_constant <- (length(unique(x$pos)) == 1)
if (pos_constant) {
next
}
group_out <- split(x,x$out)
for (x_sub in group_out) {
mps <- x_sub[x_sub$cont==TRUE,"mp"]
sps <- x_sub[x_sub$cont==TRUE,"sp"]
mps_constant <- length(unique(mps)) %in% c(1,0)
sps_constant <- length(unique(sps)) %in% c(1,0)
r <- !mps_constant || !sps_constant
if (r) {
stop(sprintf("'%s' has a wrong value.", x[,"id"][1]), call. = FALSE)
}
}
}
##### `lapply()` solution without success:
lapply(split(dat, dat$id), function(x){
pos_constant <- (length(unique(x$pos)) == 1)
if (pos_constant) {
lapply(split(x, x$out), function(x_sub){
mps <- x_sub[x_sub$cont==TRUE,"mp"]
sps <- x_sub[x_sub$cont==TRUE,"sp"]
mps_constant <- length(unique(mps)) %in% c(1,0)
sps_constant <- length(unique(sps)) %in% c(1,0)
r <- !mps_constant || !sps_constant
if (r) {
stop(sprintf("'%s' has a wrong value.", x[,"id"][1]), call. = FALSE)
}
}
}
}
A similar option is
lapply(split(dat, dat$id), function(x){
pos_constant <- (length(unique(x$pos)) == 1)
if (!pos_constant) {
lapply(split(x, x$out), function(x_sub){
mps <- x_sub[x_sub$cont==TRUE,"mp"]
sps <- x_sub[x_sub$cont==TRUE,"sp"]
mps_constant <- length(unique(mps)) %in% c(1,0)
sps_constant <- length(unique(sps)) %in% c(1,0)
r <- !mps_constant || !sps_constant
if (r) {
stop(sprintf("'%s' has a wrong value.", x[,"id"][1]), call. = FALSE)
}
})
}
})
#Error: 'B' has a wrong value.
If we want to return message as well
lapply(split(dat, dat$id), function(x){
pos_constant <- (length(unique(x$pos)) == 1)
if (!pos_constant) {
lapply(split(x, x$out), function(x_sub){
mps <- x_sub[x_sub$cont==TRUE,"mp"]
sps <- x_sub[x_sub$cont==TRUE,"sp"]
mps_constant <- length(unique(mps)) %in% c(1,0)
sps_constant <- length(unique(sps)) %in% c(1,0)
r <- !mps_constant || !sps_constant
if (r) {
stop(sprintf("'%s' has a wrong value.",
x[,"id"][1]), call. = FALSE)
}
})
} else {
message(sprintf("'%s' is ok.", x[,"id"][1]))
}
})
#'A' is ok.
#Error: 'B' has a wrong value.
Related
I am looking to have each frame of a scatter plot be filtered by another vector with a certain bin width and have it it roll through those. For example I can do this by:
library(ggplot2)
library(gganimate)
#example data
iris <- datasets::iris
#plot x and y
g <- ggplot(iris) + geom_point(aes(x = Petal.Width,y = Petal.Length))
#filter x and y by a third value with a bin width of 2 steping through by 0.5
g + transition_filter(transition_length = 1,
filter_length = 1,
4 < Sepal.Length & Sepal.Length < 6,
4.5 < Sepal.Length & Sepal.Length < 6.5,
5 < Sepal.Length & Sepal.Length < 7,
5.5 < Sepal.Length & Sepal.Length < 7.5,
6 < Sepal.Length & Sepal.Length < 8)
However - writing out each filter condition is tedious, and I would like to filter a different dataset with a ~20 binwidth steping through by 1 over a 300 point range so writing 100+ filters is not practical.
Is there another way to do this?
A while ago I wanted this exact function but didn't actually see anything in gganimate to do it, so I wrote something that would get the job done. Below is what I came up with, so I ended up rebuilding gganimate with this function included to avoid using :::.
I wrote this a while ago so I don't recall the exact intention of each argument at the moment of writing it (ALWAYS REMEMBER TO DOCUMENT YOUR CODE).
Here is what I recall
span : expression that can be evaluated within the data layers
size : how much data to be shown at once
enter_length/exit_length : Don't exactly recall how it works in relation to each other or size/span
range : a subset range
retain_data_order : logical - don't remember why this is here (sorry!)
library(gganimate)
#> Loading required package: ggplot2
library(rlang)
library(tweenr)
library(stringi)
get_row_event <- gganimate:::get_row_event
is_placeholder <- gganimate:::is_placeholder
recast_event_times <- gganimate:::recast_event_times
recast_times <- gganimate:::recast_times
TransitionSpan <- ggplot2::ggproto('TransitionSpan',
TransitionEvents,
finish_data = function (self, data, params)
{
lapply(data, function(d) {
split_panel <- stri_match(d$group, regex = "^(.+)<(.*)>(.*)$")
if (is.na(split_panel[1]))
return(list(d))
d$group <- match(d$group, unique(d$group))
empty_d <- d[0, , drop = FALSE]
d <- split(d, as.integer(split_panel[, 3]))
frames <- rep(list(empty_d), params$nframes)
frames[as.integer(names(d))] <- d
frames
})
},
setup_params = function(self, data, params) {
# browser()
params$start <- get_row_event(data, params$span_quo, "start")
time_class <- if (is_placeholder(params$start))
NULL
else params$start$class
end_quo <- expr(!!params$span_quo + diff(range(!!params$span_quo))*!!params$size_quo)
params$end <- get_row_event(data, end_quo, "end",
time_class)
params$enter_length <- get_row_event(data, params$enter_length_quo,
"enter_length", time_class)
params$exit_length <- get_row_event(data, params$exit_length_quo,
"exit_length", time_class)
params$require_stat <- is_placeholder(params$start) || is_placeholder(params$end) ||
is_placeholder(params$enter_length) || is_placeholder(params$exit_length)
static = lengths(params$start$values) == 0
params$row_id <- Map(function(st, end, en, ex, s) if (s)
character(0)
else paste(st, end, en, ex, sep = "_"), st = params$start$values,
end = params$end$values, en = params$enter_length$values,
ex = params$exit_length$values, s = static)
params
},
setup_params2 = function(self, data, params, row_vars) {
late_start <- FALSE
if (is_placeholder(params$start)) {
params$start <- get_row_event(data, params$start_quo, 'start', after = TRUE)
late_start <- TRUE
} else {
params$start$values <- lapply(row_vars$start, as.numeric)
}
size <- expr(!!params$size_quo)
time_class <- params$start$class
if (is_placeholder(params$end)) {
params$end <- get_row_event(data, params$end_quo, 'end', time_class, after = TRUE)
} else {
params$end$values <- lapply(row_vars$end, as.numeric)
}
if (is_placeholder(params$enter_length)) {
params$enter_length <- get_row_event(data, params$enter_length_quo, 'enter_length', time_class, after = TRUE)
} else {
params$enter_length$values <- lapply(row_vars$enter_length, as.numeric)
}
if (is_placeholder(params$exit_length)) {
params$exit_length <- get_row_event(data, params$exit_length_quo, 'exit_length', time_class, after = TRUE)
} else {
params$exit_length$values <- lapply(row_vars$exit_length, as.numeric)
}
times <- recast_event_times(params$start, params$end, params$enter_length, params$exit_length)
params$span_size <- diff(times$start$range)*eval_tidy(size)
range <- if (is.null(params$range)) {
low <- min(unlist(Map(function(start, enter) {
start - (if (length(enter) == 0) 0 else enter)
}, start = times$start$values, enter = times$enter_length$values)))
high <- max(unlist(Map(function(start, end, exit) {
(if (length(end) == 0) start else end) + (if (length(exit) == 0) 0 else exit)
}, start = times$start$values, end = times$end$values, exit = times$exit_length$values)))
range <- c(low, high)
} else {
if (!inherits(params$range, time_class)) {
stop('range must be given in the same class as time', call. = FALSE)
}
as.numeric(params$range)
}
full_length <- diff(range)
frame_time <- recast_times(
seq(range[1], range[2], length.out = params$nframes),
time_class
)
frame_length <- full_length / params$nframes
rep_frame <- round(params$span_size/frame_length)
lowerl <- c(rep(frame_time[1],rep_frame), frame_time[2:(params$nframes-rep_frame+1)])
upperl <- c(frame_time[1:(params$nframes-rep_frame)], rep(frame_time[params$nframes-rep_frame+1], rep_frame))
start <- lapply(times$start$values, function(x) {
round((params$nframes - 1) * (x - range[1])/full_length) + 1
})
end <- lapply(times$end$values, function(x) {
if (length(x) == 0) return(numeric())
round((params$nframes - 1) * (x - range[1])/full_length) + 1
})
enter_length <- lapply(times$enter_length$values, function(x) {
if (length(x) == 0) return(numeric())
round(x / frame_length)
})
exit_length <- lapply(times$exit_length$values, function(x) {
if (length(x) == 0) return(numeric())
round(x / frame_length)
})
params$range <- range
params$frame_time <- frame_time
static = lengths(start) == 0
params$row_id <- Map(function(st, end, en, ex, s) if (s) character(0) else paste(st, end, en, ex, sep = '_'),
st = start, end = end, en = enter_length, ex = exit_length, s = static)
params$lowerl <- lowerl
params$upperl <- upperl
params$frame_span <- upperl - lowerl
params$frame_info <- data.frame(
frame_time = frame_time,
lowerl = lowerl,
upperl = upperl,
frame_span = upperl - lowerl
)
params$nframes <- nrow(params$frame_info)
params
},
expand_panel = function(self, data, type, id, match, ease, enter, exit, params, layer_index) {
#browser()
row_vars <- self$get_row_vars(data)
if (is.null(row_vars))
return(data)
data$group <- paste0(row_vars$before, row_vars$after)
start <- as.numeric(row_vars$start)
end <- as.numeric(row_vars$end)
if (is.na(end[1]))
end <- NULL
enter_length <- as.numeric(row_vars$enter_length)
if (is.na(enter_length[1]))
enter_length <- NULL
exit_length <- as.numeric(row_vars$exit_length)
if (is.na(exit_length[1]))
exit_length <- NULL
data$.start <- start
all_frames <- tween_events(data, c(ease,"linear"),
params$nframes, !!start, !!end, c(1, params$nframes),
enter, exit, !!enter_length, !!exit_length)
if(params$retain_data_order){
all_frames <- all_frames[order(as.numeric(all_frames$.id)),]
} else {
all_frames <- all_frames[order(all_frames$.start, as.numeric(all_frames$.id)),]
}
all_frames$group <- paste0(all_frames$group, '<', all_frames$.frame, '>')
all_frames$.frame <- NULL
all_frames$.start <- NULL
all_frames
})
transition_span <- function(span, size = 0.5, enter_length = NULL, exit_length = NULL, range = NULL, retain_data_order = T){
span_quo <- enquo(span)
size_quo <- enquo(size)
enter_length_quo <- enquo(enter_length)
exit_length_quo <- enquo(exit_length)
gganimate:::require_quo(span_quo, "span")
ggproto(NULL, TransitionSpan,
params = list(span_quo = span_quo,
size_quo = size_quo, range = range, enter_length_quo = enter_length_quo,
exit_length_quo = exit_length_quo,
retain_data_order = retain_data_order))
}
g <- ggplot(iris) +
geom_point(aes(x = Petal.Width,y = Petal.Length, color = Sepal.Length)) +
viridis::scale_color_viridis()
a <- g + transition_span(Sepal.Length, .1, 1, 1)
animate(a, renderer = gganimate::gifski_renderer())
Created on 2021-08-11 by the reprex package (v2.0.0)
catTestfisher <-
function (tab)
{
st <- if (!is.matrix(tab) || nrow(tab) < 2 | ncol(tab) <
2)
list(p.value = NA, statistic = NA, parameter = NA)
else {
rowcounts <- tab %*% rep(1, ncol(tab))
tab <- tab[rowcounts > 0, ]
if (!is.matrix(tab))
list(p.value = NA, statistic = NA, parameter = NA)
else fisher.test(tab)
}
list(P = st$p.value, stat = "", df = "",
testname = "Fisher's Exact", statname = "", latexstat = "", namefun = "",
plotmathstat = "")
}
I wanted to use library(Hmisc)'s summaryM function but with Fisher's exact test, so I wrote a catTestfisher function and set catTest = catTestfisher in my own summaryM2 function, which is exactly the same as summaryM, except for catTest = catTestfisher
summaryM2 <-
function (formula, groups = NULL, data = NULL, subset, na.action = na.retain,
overall = FALSE, continuous = 10, na.include = FALSE, quant = c(0.025,
0.05, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 0.95,
0.975), nmin = 100, test = FALSE, conTest = conTestkw,
catTest = catTestfisher, ordTest = ordTestpo)
{
marg <- length(data) && ".marginal." %in% names(data)
if (marg)
formula <- update(formula, . ~ . + .marginal.)
formula <- Formula(formula)
Y <- if (!missing(subset) && length(subset))
model.frame(formula, data = data, subset = subset, na.action = na.action)
else model.frame(formula, data = data, na.action = na.action)
X <- model.part(formula, data = Y, rhs = 1)
Y <- model.part(formula, data = Y, lhs = 1)
getlab <- function(x, default) {
lab <- attr(x, "label")
if (!length(lab) || lab == "")
default
else lab
}
if (marg) {
xm <- X$.marginal.
X$.marginal. <- NULL
}
else xm <- rep("", nrow(X))
if (length(X)) {
xname <- names(X)
if (length(xname) == 1 && !length(groups))
groups <- xname
if (!length(groups) && length(xname) > 1) {
warnings("Must specify groups when > 1 right hand side variable is present.\ngroups taken as first right hand variable.")
groups <- xname[1]
}
svar <- if (length(xname) == 1)
factor(rep(".ALL.", nrow(X)))
else do.call("interaction", list(X[setdiff(xname, groups)],
sep = " "))
group <- X[[groups]]
glabel <- getlab(group, groups)
}
else {
svar <- factor(rep(".ALL.", nrow(Y)))
group <- rep("", nrow(Y))
groups <- group.freq <- NULL
glabel <- ""
}
quants <- unique(c(quant, 0.025, 0.05, 0.125, 0.25, 0.375,
0.5, 0.625, 0.75, 0.875, 0.95, 0.975))
nv <- ncol(Y)
nameY <- names(Y)
R <- list()
for (strat in levels(svar)) {
instrat <- svar == strat
n <- integer(nv)
type <- n
comp <- dat <- vector("list", nv)
names(comp) <- names(dat) <- nameY
labels <- Units <- vector("character", nv)
if (test) {
testresults <- vector("list", nv)
names(testresults) <- names(comp)
}
gr <- group[instrat]
xms <- xm[instrat]
if (all(xms != ""))
xms <- rep("", length(xms))
group.freq <- table(gr)
group.freq <- group.freq[group.freq > 0]
if (overall)
group.freq <- c(group.freq, Combined = sum(group.freq))
for (i in 1:nv) {
w <- Y[instrat, i]
if (length(attr(w, "label")))
labels[i] <- attr(w, "label")
if (length(attr(w, "units")))
Units[i] <- attr(w, "units")
if (!inherits(w, "mChoice")) {
if (!is.factor(w) && !is.logical(w) && length(unique(w[!is.na(w)])) <
continuous)
w <- as.factor(w)
s <- !is.na(w)
if (na.include && !all(s) && length(levels(w))) {
w <- na.include(w)
levels(w)[is.na(levels(w))] <- "NA"
s <- rep(TRUE, length(s))
}
n[i] <- sum(s & xms == "")
w <- w[s]
g <- gr[s, drop = TRUE]
if (is.factor(w) || is.logical(w)) {
tab <- table(w, g)
if (test) {
if (is.ordered(w))
testresults[[i]] <- ordTest(g, w)
else testresults[[i]] <- catTest(tab)
}
if (nrow(tab) == 1) {
b <- casefold(dimnames(tab)[[1]], upper = TRUE)
pres <- c("1", "Y", "YES", "PRESENT")
abse <- c("0", "N", "NO", "ABSENT")
jj <- match(b, pres, nomatch = 0)
if (jj > 0)
bc <- abse[jj]
else {
jj <- match(b, abse, nomatch = 0)
if (jj > 0)
bc <- pres[jj]
}
if (jj) {
tab <- rbind(tab, rep(0, ncol(tab)))
dimnames(tab)[[1]][2] <- bc
}
}
if (overall)
tab <- cbind(tab, Combined = apply(tab, 1,
sum))
comp[[i]] <- tab
type[i] <- 1
}
else {
sfn <- function(x, quant) {
o <- options(digits = 10)
on.exit(options(o))
c(quantile(x, quant), Mean = mean(x), SD = sqrt(var(x)),
N = sum(!is.na(x)))
}
qu <- tapply(w, g, sfn, simplify = TRUE, quants)
if (test)
testresults[[i]] <- conTest(g, w)
if (overall)
qu$Combined <- sfn(w, quants)
comp[[i]] <- matrix(unlist(qu), ncol = length(quants) +
3, byrow = TRUE, dimnames = list(names(qu),
c(format(quants), "Mean", "SD", "N")))
if (any(group.freq <= nmin))
dat[[i]] <- lapply(split(w, g), nmin = nmin,
function(x, nmin) if (length(x) <= nmin)
x
else NULL)
type[i] <- 2
}
}
else {
w <- as.numeric(w) == 1
n[i] <- sum(!is.na(apply(w, 1, sum)) & xms ==
"")
g <- as.factor(gr)
ncat <- ncol(w)
tab <- matrix(NA, nrow = ncat, ncol = length(levels(g)),
dimnames = list(dimnames(w)[[2]], levels(g)))
if (test) {
pval <- numeric(ncat)
names(pval) <- dimnames(w)[[2]]
d.f. <- stat <- pval
}
for (j in 1:ncat) {
tab[j, ] <- tapply(w[, j], g, sum, simplify = TRUE,
na.rm = TRUE)
if (test) {
tabj <- rbind(table(g) - tab[j, ], tab[j,
])
st <- catTest(tabj)
pval[j] <- st$P
stat[j] <- st$stat
d.f.[j] <- st$df
}
}
if (test)
testresults[[i]] <- list(P = pval, stat = stat,
df = d.f., testname = st$testname, statname = st$statname,
latexstat = st$latexstat, plotmathstat = st$plotmathstat)
if (overall)
tab <- cbind(tab, Combined = apply(tab, 1,
sum))
comp[[i]] <- tab
type[i] <- 3
}
}
labels <- ifelse(nchar(labels), labels, names(comp))
R[[strat]] <- list(stats = comp, type = type, group.freq = group.freq,
labels = labels, units = Units, quant = quant, data = dat,
N = sum(!is.na(gr) & xms == ""), n = n, testresults = if (test) testresults)
}
structure(list(results = R, group.name = groups, group.label = glabel,
call = call, formula = formula), class = "summaryM")
}
After trying to test it on the following data, I get a warning and an error:
library(Hmisc)
set.seed(173)
sex <- factor(sample(c("m","f"), 500, rep=TRUE))
treatment <- factor(sample(c("Drug","Placebo"), 500, rep=TRUE))
> summaryM2(sex ~ treatment, test=TRUE, overall = TRUE)
Error in round(teststat, 2) :
non-numeric argument to mathematical function
I tried stepping through the summaryM2 function line by line, but could not figure out what's causing the problem.
In your catTestfisher function, the output variables stat (test statistic) and df (degrees of freedom) should be numeric variables not empty strings. In the programming stat is coverted to teststat for rounding before being outputted (hence the error message for round("", 2) is non-numeric argument to mathematical function). See lines 1718 to 1721 in the summary.formula code) .
You can set df = NULL but a value is required for stat (not NA or NULL) otherwise no output is returned. You can get around the problem by setting stat = 0 (or any other number), and then only displaying the p value using prtest = "P".
catTestfisher2 <- function (tab)
{
st <- fisher.test(tab)
list(P = st$p.value, stat = 0, df = NULL,
testname = st$method, statname = "", latexstat = "", namefun = "",
plotmathstat = "")
}
output <- summaryM(sex ~ treatment, test=TRUE, overall = TRUE, catTest = catTestfisher2)
print(output, prtest = "P")
Descriptive Statistics (N=500)
+-------+-----------+-----------+-----------+-------+
| |Drug |Placebo |Combined |P-value|
| |(N=257) |(N=243) |(N=500) | |
+-------+-----------+-----------+-----------+-------+
|sex : m|0.52 (133)|0.52 (126)|0.52 (259)| 1 |
+-------+-----------+-----------+-----------+-------+
Note there is no need to define your own summaryM2 function. Just use catTest = to pass in your function.
I have the following problem. I have a piece-wise linear function described by (xPoints, yPoints) and want to compute fast--I have to do it over and over again--the implied y-value for a long list of x's, where x could fall outside the range of xPoints. I have coded a function f_pwl that computes the implied y-value, but it is slow, so I was trying to parallelize its call. But it is actually slower than using data.table := syntax. I will appreciate suggestions to speed things up either by improving my f_pwl function, or by implementing an efficient parallelization, as I have access to 20 cores to speed things up.
Here is a sample code.
# libraries
require(data.table) # for fread, work with large data
require(abind) # for abind()
require(foreach) # for parallel processing, used with doParallel
require(doParallel) # for parallel processing, used with foreach
f_pwl <- function(x) {
temp <- as.vector( rep(NA, length = length(x)), mode = "double" )
for (i in seq(from = 1, to = length(x), by = 1)) {
if (x[i] > max(xPoints) | x[i] < min(xPoints)) {
# nothing to do, temp[i] <- NA
} else if (x[i] == max(xPoints)) {
# value equal max(yPoints)
temp[i] <- max(yPoints)
} else {
# value is f_pwl(x)
xIndexVector = as.logical( x[i] >= xPoints & abind(xPoints[2:length(xPoints)], max(xPoints)) > x[i] )
xIndexVector_plus1 = shift( xIndexVector, n = 1, fill = FALSE, type = "lag" )
alpha_j = (xPoints[xIndexVector_plus1] - x[i])/(xPoints[xIndexVector_plus1] - xPoints[xIndexVector])
temp[i] <- alpha_j %*% yPoints[xIndexVector] + (1-alpha_j) %*% yPoints[xIndexVector_plus1]
}
} # end for i
as.vector( temp, mode = "double" )
}
## Main program
xPoints <- c(4, 9, 12, 15, 18, 21)
yPoints <- c(1, 2, 3, 4, 5, 6)
x <- rnorm(1e4, mean = 12, sd = 5)
dt <- as.data.table( x )
dt[ , c("y1", "y2", "y3") := as.vector( mode = "double", NA ) ]
# data.table := command
system.time({
dt[, y2 := f_pwl( x ) ]
})
# mapply
system.time({
dt[ , y1 := mapply( f_pwl, x ), by=.I ]
})
# parallel
system.time({
#setup parallel backend to use many processors
cores=detectCores()
cl <- makeCluster(cores[1]-1, type="FORK") #not to overload your computer
registerDoParallel(cl)
dt$y3 <- foreach(i=1:nrow(dt), .combine=cbind) %dopar% {
tempY <- f_pwl( dt$x[i] )
tempY
}
#stop cluster
stopCluster(cl)
})
summary( dt[ , .(y1-y2, y1-y3, y2-y3)] )
First, calculate and store the alpha_j's.
Then, sort DT by x first and cut it into the relevant intervals before performing your linear interpolation
alpha <- c(NA, diff(yPoints) / diff(xPoints))
DT[order(x),
y := alpha[.GRP] * (x - xPoints[.GRP-1L]) + yPoints[.GRP-1L],
by=cut(x, xPoints)]
Please let me know how it performs.
data:
library(data.table)
## Main program
set.seed(27L)
xPoints <- c(4, 9, 12, 15, 18, 21)
yPoints <- c(1, 2, 3, 4, 5, 6)
DT <- data.table(x=rnorm(1e4, mean=12, sd=5))
check:
f_pwl <- function(x) {
temp <- as.vector( rep(NA, length = length(x)), mode = "double" )
for (i in seq(from = 1, to = length(x), by = 1)) {
if (x[i] > max(xPoints) | x[i] < min(xPoints)) {
# nothing to do, temp[i] <- NA
} else if (x[i] == max(xPoints)) {
# value equal max(yPoints)
temp[i] <- max(yPoints)
} else {
# value is f_pwl(x)
xIndexVector = as.logical( x[i] >= xPoints & abind(xPoints[2:length(xPoints)], max(xPoints)) > x[i] )
xIndexVector_plus1 = shift( xIndexVector, n = 1, fill = FALSE, type = "lag" )
alpha_j = (xPoints[xIndexVector_plus1] - x[i])/(xPoints[xIndexVector_plus1] - xPoints[xIndexVector])
temp[i] <- alpha_j %*% yPoints[xIndexVector] + (1-alpha_j) %*% yPoints[xIndexVector_plus1]
}
} # end for i
as.vector( temp, mode = "double" )
}
system.time({
DT[, yOP := f_pwl( x ) ]
})
DT[abs(y-yOP) > 1e-6]
#Empty data.table (0 rows) of 3 cols: x,y,yOP
I'd like to ask a question considering subsetting in R. I tried to calculate Multiple Imputation ANOVA using the function mi.anova (miceadds) in R. Actually there is no possibility to only use subsets of the input.
Afterwards I tried to restructure my mids structure into a datlist, subsetting it there and I wanted to return it to a mids structure which was not possible because of the unequal length of the data.frames.
As well I tried to use the with and pool function (mice) to handle the problem, which doesn't give me the expected output, I'd Need.
Actually my last solution would be to rewrite the mi.anova function from the miceadds package which allows me to use subsets. Honestly I don't feel very comfortable when trying to rewrite function, and I don't have any idea how to rewrite it.
Is there maybe anyone who could help me out? Or does anyone suggest another solution?
Thanks a lot & best regards,
Pascal
function (mi.res, formula, type = 2)
{
if (type == 3) {
TAM::require_namespace_msg("car")
}
mi.list <- mi.res
if (class(mi.list) == "mids.1chain") {
mi.list <- mi.list$midsobj
}
if (class(mi.list) == "mids") {
m <- mi.list$m
h1 <- list(rep("", m))
for (ii in 1:m) {
h1[[ii]] <- as.data.frame(mice::complete(mi.list,
ii))
}
mi.list <- h1
}
if (class(mi.res) == "mi.norm") {
mi.list <- mi.list$imp.data
}
if (type == 2) {
anova.imp0 <- lapply(mi.list, FUN = function(dat) {
stats::lm(formula, data = dat)
})
anova.imp <- lapply(anova.imp0, FUN = function(obj) {
summary(stats::aov(obj))
})
}
if (type == 3) {
Nimp <- length(mi.list)
vars <- all.vars(stats::as.formula(formula))[-1]
VV <- length(vars)
ma_contrasts <- as.list(1:VV)
names(ma_contrasts) <- vars
dat <- mi.list[[1]]
for (vv in 1:VV) {
ma_contrasts[[vars[vv]]] <- "contr.sum"
if (!is.factor(dat[, vars[vv]])) {
ma_contrasts[[vars[vv]]] <- NULL
}
}
anova.imp0 <- lapply(as.list(1:Nimp), FUN = function(ii) {
dat <- mi.list[[ii]]
mod1 <- stats::lm(formula, data = dat, contrasts = ma_contrasts)
return(mod1)
})
anova.imp <- lapply(as.list(1:Nimp), FUN = function(ii) {
obj <- anova.imp0[[ii]]
car::Anova(obj, type = 3)
})
}
if (type == 2) {
FF <- nrow(anova.imp[[1]][[1]]) - 1
}
if (type == 3) {
FF <- nrow(anova.imp[[1]]["F value"]) - 2
}
anova.imp.inf <- t(sapply(1:FF, FUN = function(ff) {
micombine.F(sapply(1:(length(anova.imp)), FUN = function(ii) {
if (type == 2) {
r1 <- anova.imp[[ii]][[1]]$"F value"[ff]
}
if (type == 3) {
r1 <- anova.imp[[ii]]$"F value"[ff + 1]
}
return(r1)
}), df1 = ifelse(type == 2, anova.imp[[1]][[1]]$Df[ff],
anova.imp[[1]]["Df"][ff + 1, 1]), display = FALSE)
}))
res <- anova.imp.inf[, c(3, 4, 1, 2)]
res <- matrix(res, ncol = 4)
res[, 3] <- round(res[, 3], 4)
res[, 4] <- round(res[, 4], 6)
g1 <- rownames(anova.imp[[1]][[1]])[1:FF]
if (type == 3) {
g1 <- rownames(anova.imp[[1]])[1 + 1:FF]
}
rownames(res) <- g1
res <- data.frame(res)
if (type == 2) {
SS <- rowMeans(matrix(unlist(lapply(anova.imp, FUN = function(ll) {
ll[[1]][, 2]
})), ncol = length(mi.list)))
}
if (type == 3) {
SS <- rowMeans(matrix(unlist(lapply(anova.imp, FUN = function(ll) {
l2 <- ll["Sum Sq"][-1, 1]
return(l2)
})), ncol = length(mi.list)))
}
r.squared <- sum(SS[-(FF + 1)])/sum(SS)
res$eta2 <- round(SS[-(FF + 1)]/sum(SS), 6)
res$partial.eta2 <- round(SS[-(FF + 1)]/(SS[-(FF + 1)] +
SS[FF + 1]), 6)
g1 <- c("F value", "Pr(>F)")
colnames(res)[3:4] <- g1
colnames(res)[1:2] <- c("df1", "df2")
c1 <- colnames(res)
res <- rbind(res, res[1, ])
rownames(res)[nrow(res)] <- "Residual"
res[nrow(res), ] <- NA
res <- data.frame(SSQ = SS, res)
colnames(res)[-1] <- c1
cat("Univariate ANOVA for Multiply Imputed Data", paste0("(Type ",
type, ")"), " \n\n")
cat("lm Formula: ", formula)
cat(paste("\nR^2=", round(r.squared, 4), sep = ""), "\n")
cat("..........................................................................\n")
cat("ANOVA Table \n")
print(round(res, 5))
invisible(list(r.squared = r.squared, anova.table = res,
type = type))
}
I am computing cooccurrences of species in very huge datasets using the cooccur package.
This package is based on a probabilistic model which is very demanding in term of calculations.
Thus, I was wondering how could I parallelize the calculation to have faster results.
I have seen that packages like doParallel or snowfall could do the job but I tried to use them and did not really succeed since they need a loop structure.
install.packages("cooccur")
library(cooccur)
data(finches)
system.time(
co <- cooccur(finches, thresh = FALSE, spp_names = TRUE)
)
With this example, the computation is fast but it's very slow with bigger datasets.
Note that on Ubuntu the coocur package relies on gmp which needs sudo apt-get install libgmp3-dev.
It looks like if you wanted to parallelize this function you'd have to jump into the function itself and see which (if any) of the nested loops can be pulled apart. There there are /tons/ of loops.
Which nested loops cause you the most problems (and should be parallelized) may depend on your particular problem and particular dataset. To help diagnose the issue, consider using hadley's profiling function (below) to help identify places you might rewrite the function. Keep in mind you may want to run your profiling tests (and speed tests) with a relatively large amount of data so you can find the right places to trim. At which point, you should also consider whether it is worth the time.
library(cooccur)
library(devtools)
library(lineprof)
data(finches)
devtools::install_github("hadley/lineprof")
l <- lineprof(co <- cooccur(finches, thresh = FALSE, spp_names = TRUE))
shine(l)
To start off, you might want to look at the big 1:nrow(obs_coocur) loop. In tests with the finch dataset I wasn't able to eek out a speed up and the results seemed somewhat degenerate (lots of NA rows needed to be cleaned out and even then the results weren't identical).
Abandoned draft function below:
mcsapply <- function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
{
FUN <- match.fun(FUN)
answer <- mclapply(X = X, FUN = FUN, ...)
if (USE.NAMES && is.character(X) && is.null(names(answer)))
names(answer) <- X
if (!identical(simplify, FALSE) && length(answer))
simplify2array(answer, higher = (simplify == "array"))
else answer
}
cooccurLocal <- function (mat, type = "spp_site", thresh = TRUE, spp_names = FALSE,
true_rand_classifier = 0.1, prob = "hyper", only_effects = FALSE,
eff_standard = TRUE, eff_matrix = FALSE)
{
if (type == "spp_site") {
spp_site_mat <- mat
}
if (type == "site_spp") {
spp_site_mat <- t(mat)
}
if (spp_names == TRUE) {
spp_key <- data.frame(num = 1:nrow(spp_site_mat), spp = row.names(spp_site_mat))
}
spp_site_mat[spp_site_mat > 0] <- 1
nsite <- ncol(spp_site_mat)
nspp <- nrow(spp_site_mat)
spp_pairs <- choose(nspp, 2)
incidence <- prob_occur <- matrix(nrow = nspp, ncol = 2)
obs_cooccur <- prob_cooccur <- exp_cooccur <- matrix(nrow = spp_pairs,
ncol = 3)
prob_share_site <- c(0:(nsite + 1))
incidence <- cbind(c(1:nrow(spp_site_mat)), rowSums(spp_site_mat,
na.rm = T))
prob_occur <- cbind(c(1:nrow(spp_site_mat)), rowSums(spp_site_mat,
na.rm = T)/nsite)
pb <- txtProgressBar(min = 0, max = (nspp + nrow(obs_cooccur)),
style = 3)
row <- 0
for (spp in 1:nspp) {
if (spp < nspp) {
for (spp_next in (spp + 1):nspp) {
row <- row + 1
pairs <- 0
for (site in 1:nsite) {
if (spp_site_mat[spp, site] > 0 & spp_site_mat[spp_next,
site] > 0) {
pairs <- pairs + 1
}
}
obs_cooccur[row, 1] <- spp
obs_cooccur[row, 2] <- spp_next
obs_cooccur[row, 3] <- pairs
prob_cooccur[row, 1] <- spp
prob_cooccur[row, 2] <- spp_next
prob_cooccur[row, 3] <- prob_occur[spp, 2] *
prob_occur[spp_next, 2]
exp_cooccur[row, 1] <- spp
exp_cooccur[row, 2] <- spp_next
exp_cooccur[row, 3] <- prob_cooccur[row, 3] *
nsite
}
}
setTxtProgressBar(pb, spp)
}
if (thresh == TRUE) {
n_pairs <- nrow(prob_cooccur)
prob_cooccur <- prob_cooccur[exp_cooccur[, 3] >= 1, ]
obs_cooccur <- obs_cooccur[exp_cooccur[, 3] >= 1, ]
exp_cooccur <- exp_cooccur[exp_cooccur[, 3] >= 1, ]
n_omitted <- n_pairs - nrow(prob_cooccur)
pb <- txtProgressBar(min = 0, max = (nspp + nrow(obs_cooccur)),
style = 3)
}
output <- data.frame(matrix(nrow = 0, ncol = 9))
colnames(output) <- c("sp1", "sp2", "sp1_inc", "sp2_inc",
"obs_cooccur", "prob_cooccur", "exp_cooccur", "p_lt",
"p_gt")
output <- mcsapply(1:nrow(obs_cooccur), function(row) {
sp1 <- obs_cooccur[row, 1]
sp2 <- obs_cooccur[row, 2]
sp1_inc <- incidence[incidence[, 1] == sp1, 2]
sp2_inc <- incidence[incidence[, 1] == sp2, 2]
max_inc <- max(sp1_inc, sp2_inc)
min_inc <- min(sp1_inc, sp2_inc)
prob_share_site <- rep(0, (nsite + 1))
if (prob == "hyper") {
if (only_effects == FALSE) {
all.probs <- phyper(0:min_inc, min_inc, nsite -
min_inc, max_inc)
prob_share_site[1] <- all.probs[1]
for (j in 2:length(all.probs)) {
prob_share_site[j] <- all.probs[j] - all.probs[j -
1]
}
}
else {
for (j in 0:nsite) {
if ((sp1_inc + sp2_inc) <= (nsite + j)) {
if (j <= min_inc) {
prob_share_site[(j + 1)] <- 1
}
}
}
}
}
if (prob == "comb") {
if (only_effects == FALSE) {
for (j in 0:nsite) {
if ((sp1_inc + sp2_inc) <= (nsite + j)) {
if (j <= min_inc) {
prob_share_site[(j + 1)] <- coprob(max_inc = max_inc,
j = j, min_inc = min_inc, nsite = nsite)
}
}
}
}
else {
for (j in 0:nsite) {
if ((sp1_inc + sp2_inc) <= (nsite + j)) {
if (j <= min_inc) {
prob_share_site[(j + 1)] <- 1
}
}
}
}
}
p_lt <- 0
p_gt <- 0
for (j in 0:nsite) {
if (j <= obs_cooccur[row, 3]) {
p_lt <- prob_share_site[(j + 1)] + p_lt
}
if (j >= obs_cooccur[row, 3]) {
p_gt <- prob_share_site[(j + 1)] + p_gt
}
if (j == obs_cooccur[row, 3]) {
p_exactly_obs <- prob_share_site[(j + 1)]
}
}
p_lt <- round(p_lt, 5)
p_gt <- round(p_gt, 5)
p_exactly_obs <- round(p_exactly_obs, 5)
prob_cooccur[row, 3] <- round(prob_cooccur[row, 3], 3)
exp_cooccur[row, 3] <- round(exp_cooccur[row, 3], 1)
output[row, ] <- c(sp1, sp2, sp1_inc, sp2_inc, obs_cooccur[row,
3], prob_cooccur[row, 3], exp_cooccur[row, 3], p_lt,
p_gt)
return(output)
}, simplify=FALSE)
output <- do.call("rbind", output)
output <- output[!is.na(output$sp1),]
close(pb)
if (spp_names == TRUE) {
sp1_name <- merge(x = data.frame(order = 1:length(output$sp1),
sp1 = output$sp1), y = spp_key, by.x = "sp1", by.y = "num",
all.x = T, sort = FALSE)
sp2_name <- merge(x = data.frame(order = 1:length(output$sp2),
sp2 = output$sp2), y = spp_key, by.x = "sp2", by.y = "num",
all.x = T, sort = FALSE)
output$sp1_name <- sp1_name[with(sp1_name, order(order)),
"spp"]
output$sp2_name <- sp2_name[with(sp2_name, order(order)),
"spp"]
}
true_rand <- (nrow(output[(output$p_gt >= 0.05 & output$p_lt >=
0.05) & (abs(output$obs_cooccur - output$exp_cooccur) <=
(nsite * true_rand_classifier)), ]))
output_list <- list(call = match.call(), results = output,
positive = nrow(output[output$p_gt < 0.05, ]), negative = nrow(output[output$p_lt <
0.05, ]), co_occurrences = (nrow(output[output$p_gt <
0.05 | output$p_lt < 0.05, ])), pairs = nrow(output),
random = true_rand, unclassifiable = nrow(output) - (true_rand +
nrow(output[output$p_gt < 0.05, ]) + nrow(output[output$p_lt <
0.05, ])), sites = nsite, species = nspp, percent_sig = (((nrow(output[output$p_gt <
0.05 | output$p_lt < 0.05, ])))/(nrow(output))) *
100, true_rand_classifier = true_rand_classifier)
if (spp_names == TRUE) {
output_list$spp_key <- spp_key
output_list$spp.names = row.names(spp_site_mat)
}
else {
output_list$spp.names = c(1:nrow(spp_site_mat))
}
if (thresh == TRUE) {
output_list$omitted <- n_omitted
output_list$pot_pairs <- n_pairs
}
class(output_list) <- "cooccur"
if (only_effects == F) {
output_list
}
else {
effect.sizes(mod = output_list, standardized = eff_standard,
matrix = eff_matrix)
}
}