To cross validation for CCLE (Cancer Cell Line Encyclopedia) drug data I tried to convert the following codes from matlab to R. However, I was unsuccessful. Matlab codes work fine and can create both a *cross.mat that is a group of 10 fold CV data for each data set and a *data.mat that is the grouped data of 10 times of CV of each data set.
I will be appreciate if you can help me find my mistake.
#This function is about 10-fold cross-validation data grouping
getcrossMatrixs <- function(MM){
library(pracma)
N <- nnz(MM)
zeroM <- matrix(0L, nrow = dim(MM)[1], ncol = dim(MM)[2])
D <- randperm(N)
first <- floor(N/10)
w = which(MM != 0, arr.ind=TRUE);
nrows=w[,1]; ncols=w[,2]
crossdata <- list()
for (i in 1:10) {
crossdata[[i]] <- zeroM
}
for (i in 1:10){
for (j in (1+(i-1)*first):(i*first)){
crossdata[[i]][c(nrows[D[j]]),c(ncols[D[j]]) ] <- MM[c(nrows[D[j]]),c(ncols[D[j]])]
}
}
k <- (N-(10*first))
i <- 10*first+1
for (j in 1:k){
crossdata[[j]][c(nrows[D[i]]),c(ncols[D[i]]) ] <- MM[c(nrows[D[i]]),c(ncols[D[i]])]
i <- i+1
}
}
#The following lines is the main for calling above function.
library(foreach)
n.cores <- parallel::detectCores()
my.cluster <- parallel::makeCluster(
n.cores,
type = "PSOCK"
)
print(my.cluster)
#> socket cluster with 16 nodes on host 'localhost'
doParallel::registerDoParallel(cl = my.cluster)
foreach::getDoParRegistered()
#> [1] TRUE
CCLEdata <- list()
#MM<-matrix(read_csv("MM.csv", col_names = FALSE, show_col_types = FALSE), rownames.force = NA)
MM <- matrix(seq(0, 4.5, length.out = 11784), nrow = 491) #datamatrix like CCLE drug activity area sensitivity matrrix(491*24)
foreach(i = 1:10) %dopar% {
CCLEcross <- getcrossMatrixs(MM)
CCLEdata[[i]] <- CCLEcross
}
#> [[1]]
#> NULL
#>
#> [[2]]
#> NULL
#>
#> [[3]]
#> NULL
#>
#> [[4]]
#> NULL
#>
#> [[5]]
#> NULL
#>
#> [[6]]
#> NULL
#>
#> [[7]]
#> NULL
#>
#> [[8]]
#> NULL
#>
#> [[9]]
#> NULL
#>
#> [[10]]
#> NULL
Created on 2022-08-29 with reprex v2.0.2
Actually when I use the original CCLE dataset the error is changing in the main.R:
Error in { : task 1 failed - "is.numeric(x) || is.complex(x) is not TRUE"
or
Error in { :
task 1 failed - "attempt to select less than one element in integerOneIndex"
%These are from Matlab
function [crossdata] = getcrossMatrixs(MM)
N = nnz(MM(:));
zeroM = zeros(size(MM));
D = randperm(N);
first = floor(N/10);
[nrows,ncols] = find(MM);
crossdata = {};
for i = 1:10
crossdata{i} = zeroM;
end
for i = 1:10
for j = 1+(i-1)*first:i*first
crossdata{i}(nrows(D(j)),ncols(D(j))) = MM(nrows(D(j)),ncols(D(j)));
end
end
k=N -10*first ;
i=10*first+1;
for j=1:k
crossdata{j}(nrows(D(i)),ncols(D(i))) = MM(nrows(D(i)),ncols(D(i)));
i=i+1;
end
end
load('MM.mat')
parfor i=1:10
[CCLEcross] = getcrossMatrixs(MM);
CCLEdata{i}=CCLEcross;
end
I didn't look too closely to figure out what was wrong. I based this function on the Matlab function supplied. Note that for this particular example, going parallel is more expensive due to overhead. Parallel will provide performance with large enough matrices and/or more samples.
library(parallel)
MM <- matrix(seq(0, 4.5, length.out = 11784), nrow = 491)
getcrossMatrixs <- function(MM, parts = 10L) {
D <- sample(which(MM != 0))
first <- length(D) %/% parts
last <- length(D) %% parts
idx <- c(0L, cumsum(c(rep(first + 1L, last), rep(first, parts - last))))
mZero <- matrix(0, nrow(MM), ncol(MM))
lapply(1:parts, function(i, m) {m[D[(idx[i] + 1L):idx[i + 1L]]] <- MM[D[(idx[i] + 1L):idx[i + 1L]]]; m}, mZero)
}
reps <- 10L
clust <- makeCluster(min(detectCores() - 1L, reps))
clusterExport(clust, c("getcrossMatrixs", "MM"))
CCLEdata <- parLapply(clust, 1:reps, function(x) getcrossMatrixs(MM))
stopCluster(clust)
# check that each set of matrices returned has all elements of MM
identical(rep(list(MM), reps), lapply(1:reps, function(i) Reduce("+", CCLEdata[[i]], matrix(0, nrow(MM), ncol(MM)))))
#> [1] TRUE
And here's a cleaned-up version of the Matlab function:
function [crossdata] = getcrossMatrixs(MM)
idx = find(MM);
N = length(nrows);
zeroM = zeros(size(MM));
idx = idx(randperm(N));
first = floor(N/10);
crossdata = cell(10, 1);
for i = 1:10
crossdata{i} = zeroM;
end
for i = 1:10
j = 1 + (i - 1)*first:i*first;
crossdata{i}(idx(j)) = MM(idx(j));
end
k = N - 10*first;
j = 10*first + 1;
for i = 1:k
crossdata{i}(idx(j)) = MM(idx(j));
j = j + 1;
end
end
Related
I have noticed that using the statistical test fligner.test from the r stats package provides different results with a simple transformation, even though this shouldn't be the case.
Here an example (the difference for the original dataset is much more dramatic):
g <- factor(rep(1:2, each=6))
x1 <- c(2,2,6,6,1,4,5,3,5,6,5,5)
x2 <- (x1-1)/5 #> cor(x1,x2) [1] 1
fligner.test(x1,g) # chi-squared = 4.2794, df = 1, p-value = 0.03858
fligner.test(x2,g) # chi-squared = 4.8148, df = 1, p-value = 0.02822
Looking at the function code, I have noticed that the median centering might be causing the issue:
x1 <- x1 - tapply(x1,g,median)[g]
x2 <- x2 - tapply(x2,g,median)[g]
unique(abs(x1)) # 1 3 2 0
unique(abs(x2)) # 0.2 0.6 0.4 0.2 0.0 <- repeated 0.2
Is this a known issue, and how should this inconsistency be resolved?
I think your analysis is correct here. In your example the problem ultimately occurs because (0.8 - 0.6) == 0.2 is FALSE unless rounded to 15 decimal places. You should file a bug report, since this is avoidable.
If you are desperate in the meantime, you can adapt stats:::fligner.test.default by applying a tiny bit of rounding at the median centering stage to remove floating point inequalities:
fligner <- function (x, g, ...)
{
if (is.list(x)) {
if (length(x) < 2L)
stop("'x' must be a list with at least 2 elements")
DNAME <- deparse1(substitute(x))
x <- lapply(x, function(u) u <- u[complete.cases(u)])
k <- length(x)
l <- lengths(x)
if (any(l == 0))
stop("all groups must contain data")
g <- factor(rep(1:k, l))
x <- unlist(x)
}
else {
if (length(x) != length(g))
stop("'x' and 'g' must have the same length")
DNAME <- paste(deparse1(substitute(x)), "and",
deparse1(substitute(g)))
OK <- complete.cases(x, g)
x <- x[OK]
g <- g[OK]
g <- factor(g)
k <- nlevels(g)
if (k < 2)
stop("all observations are in the same group")
}
n <- length(x)
if (n < 2)
stop("not enough observations")
x <- round(x - tapply(x, g, median)[g], 15)
a <- qnorm((1 + rank(abs(x))/(n + 1))/2)
a <- a - mean(a)
v <- sum(a^2)/(n - 1)
a <- split(a, g)
STATISTIC <- sum(lengths(a) * vapply(a, mean, 0)^2)/v
PARAMETER <- k - 1
PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
names(STATISTIC) <- "Fligner-Killeen:med chi-squared"
names(PARAMETER) <- "df"
METHOD <- "Fligner-Killeen test of homogeneity of variances"
RVAL <- list(statistic = STATISTIC, parameter = PARAMETER,
p.value = PVAL, method = METHOD, data.name = DNAME)
class(RVAL) <- "htest"
return(RVAL)
}
This now returns the correct result for both your vectors:
fligner(x1,g)
#>
#> Fligner-Killeen test of homogeneity of variances
#>
#> data: x1 and g
#> Fligner-Killeen:med chi-squared = 4.2794, df = 1, p-value = 0.03858
fligner(x2,g)
#>
#> Fligner-Killeen test of homogeneity of variances
#>
#> data: x2 and g
#> Fligner-Killeen:med chi-squared = 4.2794, df = 1, p-value = 0.03858
I have a vector as follows:
playtimes <- c("1H18M20S", "1H27M5S", "18M27S", "56M38S", "21S")
and I want to convert these to playtimes in second. For example, the resulting vector would be something like this:
playtimeInSeconds <- c(4700, 5225, 1107, 3398, 21)
Im having trouble with separating the strings correctly based on the H, M and S. I wrote the following that works for the playtimes under 1 hour
minutes <- gsub("M.*", "", playtime)
seconds <- gsub(".*M", "", playtime) %>%
gsub("S", "", .)
totalPlaytime <- as.numeric(minutes)*60 + as.numeric(seconds)
But Im not sure how to tackle the H portion of some strings.
You could strsplit and adapt the length of the list elements reversely to 3 which allows you to use sapply to get a matrix where you apply the matrix product %*%.
m <- sapply(strsplit(p, 'H|M|S'), \(x) as.double(rev(`length<-`(rev(x), 3))))
res <- as.vector(t(replace(m, is.na(m), 0)) %*% rbind(3600, 60, 1))
res
# [1] 4700 5225 1107 3398 21
interesting problem. here is a solution that potentially could be more efficient but does the job
# function from https://www.statworx.com/de/blog/strsplit-but-keeping-the-delimiter/
strsplit <- function(x,
split,
type = "remove",
perl = FALSE,
...) {
if (type == "remove") {
# use base::strsplit
out <- base::strsplit(x = x, split = split, perl = perl, ...)
} else if (type == "before") {
# split before the delimiter and keep it
out <- base::strsplit(x = x,
split = paste0("(?<=.)(?=", split, ")"),
perl = TRUE,
...)
} else if (type == "after") {
# split after the delimiter and keep it
out <- base::strsplit(x = x,
split = paste0("(?<=", split, ")"),
perl = TRUE,
...)
} else {
# wrong type input
stop("type must be remove, after or before!")
}
return(out)
}
# convert to seconds
to_seconds <- c(H = 60 * 60,
M = 60,
S = 1)
get_seconds <- function(value, unit) {
value * to_seconds[unit]
}
# example vector
playtimes <- c("1H18M20S", "1H27M5S", "18M27S", "56M38S", "21S")
# extract time parts
times <- strsplit(playtimes,
split = "[A-Z]",
type = "after")
times
#> [[1]]
#> [1] "1H" "18M" "20S"
#>
#> [[2]]
#> [1] "1H" "27M" "5S"
#>
#> [[3]]
#> [1] "18M" "27S"
#>
#> [[4]]
#> [1] "56M" "38S"
#>
#> [[5]]
#> [1] "21S"
# calculate each time in seconds
sapply(times,
function(t) {
# split numeric and unit part
t_split <- strsplit(x = t,
split = "[A-Z]",
type = "before")
# calculate seconds for each unit part
times_in_seconds <- get_seconds(value = as.numeric(sapply(t_split, `[`, 1)),
unit = sapply(t_split, `[`, 2))
# sum of all parts
sum(times_in_seconds)
})
#> [1] 4700 5225 1107 3398 21
I followed the example given in the 3rd answer here and made the following
playtime <- sapply(playtime, function(x){paste(paste(rep(0, 3 - str_count(x, '[0-9]+')), collapse = ' '), x)})
totalPlaytime <- time_length(hms(playtime))
Short, sweet, and checks for potential errors where the playtime is less that 1 hr or less than 1 min.
I am trying to build a double optimization in R. By double optimization, I mean that there is going to be an inner call, in which I optimize a function called inner_function(), and then an outer call, in which I optimize an outer_function() whose output is computed using the optimization of inner_function().
I can make this work when inner_function() is optimized through optim() and outer_function() takes only one argument and is optimized through optimize():
constructor_function <- function(data, fixed = c(FALSE, FALSE)) {
params <- fixed
function(p) {
params[!fixed] <- p
a <- data[1]
b <- data[2]
c <- data[3]
d <- data[4]
e <- params[1]
f <- params[2]
## Calculate something
tot <- abs(a + b + c + d + e + f)
return(tot)
}
}
inner_function <- constructor_function(c(1, 2, 3, 4))
inner_function(c(5, 6))
#> [1] 21
optim(c(0, 0), inner_function)$par
#> [1] -3.454274 -6.545726
sum(optim(c(0, 0), inner_function)$par)
#> [1] -10
outer_function <- function(first_factor) {
inner_function <- constructor_function(c(first_factor, 2, 3, 4))
values <- optim(c(0, 0), inner_function)$par
tot <- sum(values)
return(tot)
}
# check
outer_function(1)
#> [1] -10
optimize(outer_function, lower = 0, upper = 4)
#> $minimum
#> [1] 3.99994
#>
#> $objective
#> [1] -12.99994
# check
outer_function(3.99994)
#> [1] -12.99994
But I can't make the double optimization work when the outer function (now called outer_function_args) takes more than one argument, so that it can be optimized only with optim():
outer_function_args <- function(first_factor, second_factor) {
inner_function <- constructor_function(c(first_factor, second_factor, 3, 4))
values <- optim(c(0, 0), inner_function)$par
tot <- sum(values)
return(tot)
}
outer_function_args(1,2)
#> [1] -10
optim(par=c(0,2), outer_function_args)
#> Error in fn(par, ...): argument "second_factor" is missing, with no default
The error mentions that argument "second_factor" is missing, but outer_function_args is running correctly.
Created on 2021-04-15 by the reprex package (v0.3.0)
You need to modify your function to take in the parameters as a vector, like so:
outer_function_args <- function(par) {
inner_function <- constructor_function(c(par[1], par[2], 3, 4))
values <- optim(c(0, 0), inner_function)$par
tot <- sum(values)
return(tot)
}
outer_function_args(par = c(1, 2))
#> [1] -10
optim(par=c(0,2), outer_function_args)
#$par
#[1] 3355434 3355445
#
#$value
#[1] -6710886
#
#$counts
#function gradient
# 253 NA
#
#$convergence
#[1] 0
#
#$message
#NULL
From the documentation of optim(par, fn) in help("optim"):
fn A function to be minimized (or maximized), with first argument the vector of parameters over which minimization is to take place. It should return a scalar result.
While constructing a data-frame, columns are replicated if lengths differ.
> data.frame(x = c(1,2), y = NA_integer_)
x y
1 1 NA
2 2 NA
However, when I try to do this with bit64::NA_integer64_, I get an error. Does anyone know what could be happening? rep() works if it is called separately on bit64::NA_integer64_.
> data.frame(x = c(1,2), y = bit64::NA_integer64_)
Error in data.frame(x = c(1, 2), y = bit64::NA_integer64_) :
arguments imply differing number of rows: 2, 1
> rep(bit64::NA_integer64_, 2)
integer64
[1] <NA> <NA>
data.frame will only recycle:
Vector with no attributes other than names
factor
AsIs character
Date
POSIXct
tibble doesn't have this problem.
tibble::tibble(x = c(1,2), y = bit64::NA_integer64_)
#> # A tibble: 2 x 2
#> x y
#> <dbl> <int64>
#> 1 1 NA
#> 2 2 NA
Here is the relevant snippet from data.frame
for (i in seq_len(n)[nrows < nr]) {
xi <- vlist[[i]]
if (nrows[i] > 0L && (nr%%nrows[i] == 0L)) {
xi <- unclass(xi)
fixed <- TRUE
for (j in seq_along(xi)) {
xi1 <- xi[[j]]
if (is.vector(xi1) || is.factor(xi1))
xi[[j]] <- rep(xi1, length.out = nr)
else if (is.character(xi1) && inherits(xi1, "AsIs"))
xi[[j]] <- structure(rep(xi1, length.out = nr),
class = class(xi1))
else if (inherits(xi1, "Date") || inherits(xi1, "POSIXct"))
xi[[j]] <- rep(xi1, length.out = nr)
else {
fixed <- FALSE
break
}
}
if (fixed) {
vlist[[i]] <- xi
next
}
}
stop(gettextf("arguments imply differing number of rows: %s",
paste(unique(nrows), collapse = ", ")), domain = NA)
}
I am in the process of transforming a traditional loop to a foreach loop for solving the shoelace formula problem in R; however, I am not getting the right accumulation with the foreach loop.
library("foreach")
x = c(0, 4, 4, 0)
# coordinates of points
y = c(0, 0, 4, 4)
# coordinates of points
points <- length(x)
area <- 0
# Accumulates area in the loop
i <- 0
j <- points
# using foreach loop
area <- foreach(i = seq(x), .combine = "+") %do% {
(x[[j]] + x[[i]]) * (y[[j]] - y[[i]])
j <- i
}
area # 10
This is just 1 + 2 + 3 + 4. It has not taken into account the points in x and y.
# using traditional loop
area <- vector("list", length(x))
for (i in seq_along(x)) {
area[[i]] <- (x[[j]] + x[[i]]) * (y[[j]] - y[[i]])
j <- i
}
area
# [[1]]
# [1] 0
# [[2]]
# [1] 0
# [[3]]
# [1] -32
# [[4]]
# [1] 0
The sum is 32 units, which is correct.
What am I doing wrong with the foreach loop? Thank you.
foreach is returning the last calculated expression, as in regular functions.
So, you can do:
area <- foreach(i = seq(x)) %do% {
j0 <- j
j <- i
(x[[j0]] + x[[i]]) * (y[[j0]] - y[[i]])
}