Use loop or apply command to calculate U statistics in R

I am thinking of using a loop or an apply command to solve this problem, but my attempts have failed.

If you are using a data.table, a solution can be built with the aid of the CJ.dt cross-join function (https://stackoverflow.com/a/27347397/5744762). Below is what I think you are looking for, given the limited description you provided.
library(data.table)
#Create sample datasets
DT_tor <- data.table(ID = 1:100, time_tor = abs(rnorm(100)))
DT_bunny <- data.table(ID = 1:100, time_bunny = abs(rnorm(100)))
#CJ.dt function
CJ.dt = function(X, Y) {
  stopifnot(is.data.table(X), is.data.table(Y))
  k = NULL
  X = X[, c(k = 1, .SD)]  # add a constant helper column to join on
  setkey(X, k)
  Y = Y[, c(k = 1, .SD)]
  setkey(Y, NULL)
  X[Y, allow.cartesian = TRUE][, k := NULL][]  # cartesian join, then drop the helper
}
#Crossjoin two data.tables
DT_CJ <- CJ.dt(DT_tor, DT_bunny)
#Get a score for the tortoise and a score for the bunny
Score_tor <- DT_CJ[time_tor < time_bunny, .N]
Score_bunny <- DT_CJ[time_tor > time_bunny, .N]
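For reference, the same two scores can also be computed in the loop/apply style the question mentions. This is a minimal sketch of my own (not part of the answer above), using the same DT_tor and DT_bunny:
# Count, over all tortoise/bunny pairs, who has the smaller time
Score_tor   <- sum(sapply(DT_tor$time_tor, function(t) sum(t < DT_bunny$time_bunny)))
Score_bunny <- sum(sapply(DT_tor$time_tor, function(t) sum(t > DT_bunny$time_bunny)))
# or fully vectorized over all pairs with outer()
Score_tor   <- sum(outer(DT_tor$time_tor, DT_bunny$time_bunny, `<`))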

Related

Extracting vector with certain qualities from longer vector

I have the following data.table:
DT <- data.table(A = c(rep("aa", 2), rep("bb", 2)),
                 B = c(rep("H", 2), rep("Na", 2)),
                 Low = c(0, 3, 1, 1),
                 High = c(8, 10, 9, 8),
                 Time = rep("0,1,2,3,4,5,6,7,8,9,10", 4),
                 Intensity = c("0,0,0,0,561464,0,0,0,0,0,0",
                               "0,0,0,6548,5464,5616,0,0,0,68716,0",
                               "5658,12,6548,6541,8,5646854,54565,56465,546,65,0",
                               "0,561464,0,0,0,0,0,0,0,0,0"))
and use this code to extract the highest number of consecutive Intensity values above a certain value. For a more detailed explanation of how this calculation works, please see Reading and counting of consecutive points:
newCols <- do.call(rbind, Map(function(u, v, x, y) {
  u1 <- as.numeric(u)
  v1 <- as.numeric(v)
  lb <- which.min(abs(x - u1))  # position of the Time value closest to Low
  ub <- which.min(abs(y - u1))  # position of the Time value closest to High
  v3 <- as.numeric(v[(lb + 1):(ub - 1)])
  i3 = with(rle(v3 > min(as.numeric(v[c(lb, ub)]))),
            pmax(max(lengths[values]), 0))
  data.frame(Consec.Points.base = i3)
}, strsplit(DT$Time, ","), strsplit(DT$Intensity, ","), DT$Low, DT$High))
DT <- cbind(DT, newCols)
I was wondering how, instead of getting the length of Consec.Points.base, I could extract the actual points (Time and Intensity) as two vectors?
Thanks a lot in advance!
I think this answers your question, but let me know if I made a mistake, or something needs more thought/clarification.
DT <- data.table(A = c(rep("aa", 2), rep("bb", 2)),
                 B = c(rep("H", 2), rep("Na", 2)),
                 Low = c(0, 3, 1, 1),
                 High = c(8, 10, 9, 8),
                 Time = rep("0,1,2,3,4,5,6,7,8,9,10", 4),
                 Intensity = c("0,0,0,0,561464,0,0,0,0,0,0",
                               "0,0,0,6548,5464,5616,0,0,0,68716,0",
                               "5658,12,6548,6541,8,5646854,54565,56465,546,65,0",
                               "0,561464,0,0,0,0,0,0,0,0,0"))
# unique identifier
DT[, i := .I]
# re-structure: one row per (Time, Intensity) pair
DT2 <- DT[, .(Time = as.numeric(strsplit(Time, ",")[[1]]),
              Intensity = as.numeric(strsplit(Intensity, ",")[[1]])), by = i]
DT2 <- merge(DT2, DT[, .(i, A, B, Low, High)], by = "i")
DT2 <- DT2[between(Time, Low, High, incbounds = FALSE), ]
DT2[, IntensityGood := Intensity != min(Intensity), by = i]
# encode each part of the sequence with its own value, if not FALSE
encoder <- function(x){
  rle.response <- rle(x)
  v2 <- rep(0, length(rle.response$values))
  v2[rle.response$values != FALSE] <- which(rle.response$values != FALSE)
  rep(v2, rle.response$lengths)
}
DT2[, encodeI := encoder(IntensityGood), by = i]
# remove groups which are all 0 -- easily handled separately
DT3 <- DT2[, test := all(encodeI == 0), by = i][test == FALSE, ][, test := NULL]
# get count - can infer that missing groups are 0
count <- DT3[encodeI != 0, .(max(table(encodeI))), by = i]
# get the sequence itself
findMaxDt <- DT3[encodeI != 0, .N, by = .(i, encodeI)]
DT3 <- merge(DT3, findMaxDt, by = c("i", "encodeI"))
DT3 <- DT3[, Best := N == max(N), by = i]
DT3[Best == TRUE, .(list(Intensity)), by = i]
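To also return the matching Time values, as the question asks, the same last line can list both columns (a small extension of my own):
DT3[Best == TRUE, .(Time = list(Time), Intensity = list(Intensity)), by = i]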

R bootstrap weighted mean by group with data table

I am trying to combine two approaches:
Bootstrapping multiple columns in data.table in a scalable fashion
with
Bootstrap weighted mean in R
Here is some random data:
## Generate sample data
set.seed(7)
# Function to randomly generate weights (truncated normal)
rtnorm <- function(n, mean, sd, a = -Inf, b = Inf){
  qnorm(runif(n, pnorm(a, mean, sd), pnorm(b, mean, sd)), mean, sd)
}
# Generate variables
nps <- round(runif(3500, min = -1, max = 1), 0)  # nps value which takes 1, 0 or -1
group <- sample(letters[1:11], 3500, TRUE)       # groups
weight <- rtnorm(n = 3500, mean = 1, sd = 1, a = 0.04, b = 16)  # weights between 0.04 and 16
# Build data frame
df = data.frame(group, nps, weight)
# The following packages / libraries are required:
require("data.table")
require("boot")
This is the code from the first post above, bootstrapping the weighted mean:
samplewmean <- function(d, i, j) {
  d <- d[i, ]
  w <- j[i, ]
  return(weighted.mean(d, w))
}
results_qsec <- boot(data = df[, 2, drop = FALSE],
                     statistic = samplewmean,
                     R = 10000,
                     j = df[, 3, drop = FALSE])
This works totally fine.
Below is the code from the second post above, bootstrapping the mean by groups within a data.table:
dt = data.table(df)
stat <- function(x, i) {x[i, (m=mean(nps))]}
dt[, list(list(boot(.SD, stat, R = 100))), by = group]$V1
This, too, works fine.
I have trouble combining both approaches:
Running …
dt[, list(list(boot(.SD, samplewmean, R = 5000, j = dt[, 3 , drop = FALSE]))), by = group]$V1
… brings up the error message:
Error in weighted.mean.default(d, w) :
'x' and 'w' must have the same length
Running …
dt[, list(list(boot(dt[, 2 , drop = FALSE], samplewmean, R = 5000, j = dt[, 3 , drop = FALSE]))), by = group]$V1
… brings up a different error:
Error in weighted.mean.default(d, w) :
(list) object cannot be coerced to type 'double'
I still have problems getting my head around the arguments in data.table and how to combine functions that operate on a data.table.
I would appreciate any help.
It is related to how data.table behaves within the scope of a function. d is still a data.table within samplewmean even after subsetting with i, whereas weighted.mean expects a numeric vector of values and one of weights. If you unlist before calling weighted.mean, you will be able to fix this error:
Error in weighted.mean.default(d, w) :
(list) object cannot be coerced to type 'double'
Code to unlist before passing into weighted.mean:
samplewmean <- function(d, i, j) {
  d <- d[i, ]
  w <- j[i, ]
  return(weighted.mean(unlist(d), unlist(w)))
}
dt[, list(list(boot(dt[, 2 , drop = FALSE], samplewmean, R = 5000, j = dt[, 3 , drop = FALSE]))), by = group]$V1
A more data.table-like (data.table version >= v1.10.2) syntax is probably as follows:
samplewmean <- function(d, valCol, wgtCol, original) {
  # boot() supplies the resample indices as the second positional argument;
  # since valCol and wgtCol are matched by name, those indices land in
  # `original` and are unused here (hence R = 1 below)
  weighted.mean(unlist(d[, ..valCol]), unlist(d[, ..wgtCol]))
}
dt[, list(list(boot(.SD, statistic = samplewmean, R = 1, valCol = "nps", wgtCol = "weight"))), by = group]$V1
Or another possible syntax (see data.table FAQ 1.6):
samplewmean <- function(d, valCol, wgtCol, original) {
  weighted.mean(unlist(d[, eval(substitute(valCol))]),
                unlist(d[, eval(substitute(wgtCol))]))
}
dt[, list(list(boot(.SD, statistic=samplewmean, R=1, valCol=nps, wgtCol=weight))), by=group]$V1
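For what it's worth, a simpler way to combine the two original approaches (a sketch of my own, not from either linked post) is to let the statistic receive the resample indices from boot and subset both columns of .SD with them:
samplewmean <- function(d, i) weighted.mean(d$nps[i], d$weight[i])
dt[, list(list(boot(.SD, statistic = samplewmean, R = 5000))), by = group]$V1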

data.table: parallel execution of row-wise function

I want to apply a function to some columns in every row of a data.table. I do this using something like this:
require(data.table)
## create some random data
n = 1000
p = 1000
set.seed(1)
data.raw <- matrix(rnorm(n * p), nrow = n, ncol = p)
rownames(data.raw) <- lapply(1:n, FUN = function(x, length) paste(sample(c(letters, LETTERS), length, replace = TRUE), collapse = ""), length = 10)
colnames(data.raw) <- samples <- paste0("X", 1:p)  # one name per column
data.t <- data.table(data.raw)
data.t[, id := rownames(data.raw)]
setkey(data.t, id)
# apply function for each row
f <- function(x){ return(data.frame(result1 = "abc", result2 = "def")) }
data.t[, c("result1", "result2") := f(.SD), .SDcols = samples, by = id]
Is there any (easy) way to parallelize the execution of f for every id in the data.table?
I know that there are some questions here about parallelization of data.table, but I couldn't find a good answer in any of these.
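The thread offers no answer, but one common pattern is to split the table into chunks and run the same grouped j-expression on each chunk in a worker. A hedged sketch of my own, assuming a Unix-alike (parallel::mclapply relies on forking; on Windows you would need parLapply with a cluster):
library(parallel)
n.cores <- 4
# one chunk of rows per core; split.data.table keeps all columns intact
chunks <- split(data.t, cut(seq_len(nrow(data.t)), n.cores, labels = FALSE))
res <- mclapply(chunks, function(chunk)
  chunk[, c("result1", "result2") := f(.SD), .SDcols = samples, by = id],
  mc.cores = n.cores)
data.t <- rbindlist(res)  # reassemble the chunks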

r data.table usage in function call

I want to perform a data.table task over and over in a function call: Reduce number of levels for large categorical variables. My problem is similar to Data.table and get() command (R) and Pass column name in data.table using variable in R, but I can't get it to work.
Without a function call this works just fine:
# Load data.table
require(data.table)
# Some data
set.seed(1)
dt <- data.table(type = factor(sample(c("A", "B", "C"), 10e3, replace = T)),
                 weight = rnorm(n = 10e3, mean = 70, sd = 20))
# Decide the minimum frequency a level needs...
min.freq <- 3350
# Levels that don't meet the minimum frequency (using data.table)
fail.min.f <- dt[, .N, type][N < min.freq, type]
# Call all these levels "Other"
levels(dt$type)[fail.min.f] <- "Other"
but wrapped like
reduceCategorical <- function(variableName, min.freq){
  fail.min.f <- dt[, .N, variableName][N < min.freq, variableName]
  levels(dt[, variableName][fail.min.f]) <- "Other"
}
I only get errors like:
reduceCategorical(dt$x, 3350)
Error in levels(df[, variableName][fail.min.f]) <- "Other" :
  trying to set attribute of NULL value
And sometimes
Error is: number of levels differs
One possibility is to define your own re-leveling function using data.table::setattr that will modify dt in place. Something like
DTsetlvls <- function(x, newl)
  setattr(x, "levels", c(setdiff(levels(x), newl), rep("other", length(newl))))
Then use it within another predefined function
f <- function(variableName, min.freq){
  fail.min.f <- dt[, .N, by = variableName][N < min.freq, get(variableName)]
  dt[, DTsetlvls(get(variableName), fail.min.f)]
  invisible()
}
f("type", min.freq)
levels(dt$type)
# [1] "C" "other"
Some other data.table alternatives:
f <- function(var, min.freq) {
  fail.min.f <- dt[, .N, by = var][N < min.freq, get(var)]
  dt[get(var) %in% fail.min.f, (var) := "Other"]
  dt[, (var) := factor(get(var))]
}
Or using set/.I
f <- function(var, min.freq) {
  fail.min.f <- dt[, .I[.N < min.freq], by = var]$V1
  set(dt, fail.min.f, var, "other")
  set(dt, NULL, var, factor(dt[[var]]))
}
Or combining with base R (doesn't modify original data set)
f <- function(df, variableName, min.freq){
  fail.min.f <- df[, .N, by = variableName][N < min.freq, get(variableName)]
  levels(df$type)[fail.min.f] <- "Other"
  df
}
Alternatively, if type is stored as a character rather than a factor, you could simply stick with characters and do:
f <- function(var, min.freq) dt[, (var) := if(.N < min.freq) "other", by = var]
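A quick usage sketch for this character-based variant (my own illustration; it assumes type is first converted from factor to character):
dt[, type := as.character(type)]
f("type", 3350)
dt[, .N, by = type]  # infrequent levels are now lumped into "other"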
You are referencing things a little differently in the wrapper: you pass the whole column vector (dt$x) as variableName instead of the column name, and the same mismatch occurs when getting the levels, since you are not using variableName the way the working, non-wrapped code does.
The error arises because fail.min.f comes back NULL as a result of this referencing.
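Putting that diagnosis together, a minimal corrected version of the original wrapper (a sketch of my own that mirrors the get()-based alternatives above; it assumes the column name is passed as a string):
reduceCategorical <- function(variableName, min.freq){
  fail.min.f <- dt[, .N, by = variableName][N < min.freq, get(variableName)]
  dt[get(variableName) %in% fail.min.f, (variableName) := "Other"]
  dt[, (variableName) := factor(get(variableName))]  # re-factor to drop unused levels
  invisible(dt)
}
reduceCategorical("type", 3350)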

Fast crosstabs and stats on all pairs of variables

I am trying to calculate a measure of association between all variables in a data.table. (This is not a stats question, but as an aside: the variables are all factors, and the measure is Cramér's V.)
Example dataset:
p = 50; n = 1e5; # actual dataset has p > 1e3, n > 1e5, much wider but barely longer
set.seed(1234)
obs <- as.data.table(
  data.frame(
    cbind(matrix(sample(c(LETTERS[1:4], NA), n * (p/2), replace = TRUE),
                 nrow = n, ncol = p/2),
          matrix(sample(c(letters[1:6], NA), n * (p/2), replace = TRUE),
                 nrow = n, ncol = p/2)),
    stringsAsFactors = TRUE))
I am currently using the split-apply-combine approach, which involves looping (via plyr::adply) through all pairs of indices and returning one row for each pair. (I attempted to parallelize adply but failed.)
# Calculate Cramér's V between all variables -- my kludgey approach
pairs <- t(combn(ncol(obs), 2))  # nx2 matrix contains indices of upper triangle of df
# library('doParallel') # I tried to parallelize -- bonus points for help here (Win 7)
# cl <- makeCluster(8)
# registerDoParallel(cl)
library('plyr')
out <- adply(pairs, 1, function(ix) {
  complete_cases <- obs[, which(complete.cases(.SD)), .SDcols = ix]
  chsq <- chisq.test(x = dcast(data = obs[complete_cases, .SD, .SDcols = ix],
                               formula = paste(names(obs)[ix], collapse = '~'),
                               value.var = names(obs)[ix][1],  # arbitrary
                               fun.aggregate = length)[, -1, with = FALSE])
  return(data.table(index_1 = ix[1],
                    var_1 = names(obs)[ix][1],
                    index_2 = ix[2],
                    var_2 = names(obs)[ix][2],
                    cramers_v = sqrt(chsq$statistic /
                                       (sum(chsq$observed) *
                                        (pmin(nrow(chsq$observed),
                                              ncol(chsq$observed)) - 1)))))
})[, -1]  # }, .parallel = TRUE)[,-1] # using .parallel returns Error in do.ply(i) :
          #   task 1 failed - "object 'obs' not found"
out <- data.table(out)  # adply won't return a data.table
# stopCluster(cl)
What are my options for speeding up this calculation? My challenge is in passing the row-wise operation on pairs into the column-wise calculations in obs. I am wondering if it is possible to generate the column pairs directly in j, but the Force is just not strong enough with this data.table padawan.
First, I would go with the 'long' data format, as follows:
obs[, id := 1:n]
mobs <- melt(obs, id.vars = 'id')
Next, set the key on the data table: setkeyv(mobs, 'id').
Finally, iterate through variables and do calculations on pairs:
out <- list()
for (i in 1:p) {
  vari <- paste0('X', i)
  # self-join pairs variable i with every other variable
  tmp <- mobs[mobs[variable == vari]]
  # nij: observed counts of each (value, i.value) combination, complete cases only
  nn <- tmp[!(is.na(value) | is.na(i.value)),
            list(i.variable = i.variable[1], nij = length(id)),
            keyby = list(variable, value, i.value)]
  # pad with the missing combinations so every cell of the crosstab is present
  cj <- nn[, CJ(value = value, i.value = i.value, sorted = FALSE, unique = TRUE), by = variable]
  setkeyv(cj, c('variable', 'value', 'i.value'))
  nn <- nn[cj]
  nn[is.na(nij), nij := 0]
  nn[, ni := sum(nij), by = list(variable, i.value)]  # column margins
  nn[, nj := sum(nij), by = list(variable, value)]    # row margins
  nn[, c('n', 'r', 'k') := list(sum(nij), length(unique(i.value)), length(unique(value))), by = variable]
  # Cramér's V = sqrt(chi^2 / (n * min(k - 1, r - 1)))
  out[[i]] <- nn[, list(i.variable = vari,
                        cramers_v = sqrt(sum((nij - ni * nj / n) ^ 2 / (ni * nj / n)) /
                                           n[1] / min(k[1] - 1, r[1] - 1))),
                 by = variable]
}
out <- rbindlist(out)
So you need to iterate through the variables only once. As you can see, I also wouldn't use chisq.test and would write the computations myself.
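As a spot check (my own addition, not part of the answer), the manual result for a single pair can be compared against the chisq.test-based formula from the question:
x <- obs[[1]]; y <- obs[[2]]
tab <- table(x, y)   # table() drops NAs by default, i.e. complete cases only
chsq <- chisq.test(tab)
sqrt(chsq$statistic / (sum(tab) * (min(dim(tab)) - 1)))  # Cramér's V for this pair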
