Repeat analysis for several datasets in R

Repeat analysis for several datasets in R - r

How can I repeat this code for each subject (xxx), so that the results are added to the data.frame (centralities)?
fullDataDetrend_xxx <- subset(fullDataDetrend, subjno == xxx, select=c(subjno,depressed,sad,tired,interest,happy,neg_thoughts,concentration_probl,ruminating,activity,datevar,timestamp,dayno,beepno))
model_xxx <- var1(
fullDataDetrend_xxx)
model_xxx_omega <- getmatrix(model_xxx, "omega_zeta")
centrality_model_xxx_omega <- centrality(model_xxx_omega )
centralities[nrow(centralities) + 1,] <- c("xxx",centrality_model_xxx_omega$InExpectedInfluence)

Did as suggested:
fullDataDetrend_split <- split(fulldataDetrend, fulldataDetrend$subjno)
then, to estimate network, pull centrality estimates, and write to centralities in global environment:
analyze_one <- function(dataframe){
network_model <- var1(
dataframe,
vars = useVars,
contemporaneous = "ggm",
dayvar = "dayno",
beepvar = "beepno",
estimator = "FIML",
verbose = TRUE,
omega_zeta = "full")
model_omega <- getmatrix(network_model, "omega_zeta")
centrality_omega<- centrality(model_omega)
model_beta <- getmatrix(network_model, "beta")
centrality_beta<- centrality(model_beta)
subjno <- as.list(dataframe[1,2])
centralities[nrow(centralities) + 1,] <- c(subjno,centrality_omega$InExpectedInfluence,centrality_beta$InExpectedInfluence,centrality_beta$OutExpectedInfluence)
assign('centralities',centralities, envir=.GlobalEnv)
}
then rerun the code with lapply for all dataframes (with ignoring errors):
lapply_with_error <- function(X,FUN,...){
lapply(X, function(x, ...) tryCatch(FUN(x, ...),
error=function(e) NULL))
}
lapply_with_error(fullDataDetrend_split, FUN = analyze_one)

Related

parallelizing lapply with parLapply does not recognize objects even though I suppied them

I am trying to parallelize something with parLapply. I am exporting all necessary information to the cores, but somehow I am getting an error saying that it cannot find the object 'market_time' (first line of the function that is called in parLapply. However, this object is just a column of the data table 'dt' that I export to the cores.
library('data.table')
library('numDeriv')
library('snow')
cores=detectCores()
cl <- makeCluster(cores[1], type = 'PSOCK')
markets <- unique(dt[, market_time])
R = 10000
nu_p <- rnorm(n = R, -2, 0.5)
nu_xr <- rnorm(n = R, 2, 0.5)
nu_xm <- rnorm(n = R, 2, 0.5)
nu_xj <- rnorm(n = R, 2, 0.5)
clusterExport(cl,c('dt','nu_p','nu_xr','nu_xm','nu_xj')
temp <- parLapply(cl, markets,calc_mc_w, dt=dt,nu_p=nu_p,nu_xr= nu_xr,
nu_xm=nu_xm,nu_xj=nu_xj)
where the function calc_mc_w calls:
calc_mc_w <- function(m, dt,nu_p,nu_xr,nu_xm,nu_xj){
dt_mkt = dt[market_time==m,]
market_time <- dt_mkt[, market_time]
x_m <- dt_mkt[, x_m]
x_j <- dt_mkt[, x_j]
x_r <- dt_mkt[, x_r]
p <- as.matrix(dt_mkt[, p])
xi <- dt_mkt[, xi]
p <- as.matrix(dt_mkt[, p])
jacobian <- jacobian(function(x){calc_shares(x, x_m, x_j, x_r, xi, nu_p,
nu_xm, nu_xj, nu_xr,
market_time)},p)
output <- dt_mkt[,c('prod','market','time','retailer')]
#Get a system of equations with as many equations as unknowns
retailers = unique(dt_mkt[, retailer])
temp <- lapply(retailers,calc_mc_w_r,dt_mkt = dt_mkt, jacobian = jacobian)
temp <- rbindlist(temp)
output <- merge(output,temp,by.x = c('prod','retailer'),
by.y = c('prod','retailer'), allow.cartesian=TRUE)
output
}
calc_mc_w_r <- function(r, dt_mkt, jacobian){
dt_r = dt_mkt[retailer == r,]
result <- dt_r[,c('prod','retailer')]
rows = (dt_mkt[,'retailer']== r)
jacobian_r = jacobian[rows,rows]
result <- result[,mc_w := solve(jacobian_r, dt_r[,shares]+ jacobian_r %*% dt_r[,p])]
result
}
The error I get is:
Error in checkForRemoteErrors(val) :
2 nodes produced errors; first error: object 'market_time' not found
If instead, I do not export the data table dt, but instead each column of it, I get the same error but just for 'jacobian' which is something that I calculate in the function (I do not want to calculate it across the whole dataset as it is super costly, which is why I just want to calculate it on each subset).

I am trying to use various transformations of the response variable and fit corresponding these model, and obtain residual plots for each model

library(GLMsData)
data(fluoro)
lambda <- seq(-2,2,0.5)
lm.out <- list()
for(i in length(lambda)){
if(i != 0){
y <- (fluoro$Dose^lambda-1)/lambda
} else {
y <- log(fluoro$Dose)
}
lm.out[[i]] <- lm(y[i]~Time, data = fluoro, na.exclude = T)
}
print(lm.out)
Error in model.frame.default(formula = y[i] ~ Time, data = fluoro, drop.unused.levels = TRUE) : variable lengths differ (found for 'Time')
I am trying to use various transformations of the response variable and fit these corresponding models, and obtain residual plots for each model.
I need a help. Thanks

Here is a corrected version of the for loop in the question.
data(fluoro, package = "GLMsData")
lambda <- seq(-2, 2, 0.5)
lm.out <- list()
for(i in 1:length(lambda)){
if(lambda[i] != 0){
y <- (fluoro$Dose^lambda[i]-1)/lambda[i]
} else {
y <- log(fluoro$Dose)
}
lm.out[[i]] <- lm(y ~ Time, data = fluoro, na.action = na.exclude)
}
print(lm.out)
And a version with a boxcox function defined and used in a lapply loop.
boxcox <- function(x, lambda, na.rm = FALSE){
if(na.rm) x <- x[!is.na(x)]
if(lambda == 0){
log(x)
} else {
(x^lambda - 1)/lambda
}
}
lm_out2 <- lapply(lambda, \(l){
lm(boxcox(Dose, lambda = l) ~ Time, data = fluoro, na.action = na.exclude)
})
Check that both ways above produce the same results.
coef_list <- sapply(lm.out, coef)
coef_list2 <- sapply(lm_out2, coef)
identical(coef_list, coef_list2)
#[1] TRUE
smry_list <- lapply(lm.out, summary)
smry_list2 <- lapply(lm_out2, summary)
pval_list <- sapply(smry_list, \(fit) fit$coefficients[, "Pr(>|t|)"])
pval_list2 <- sapply(smry_list2, \(fit) fit$coefficients[, "Pr(>|t|)"])
identical(pval_list, pval_list2)
#[1] TRUE
R2_list <- sapply(smry_list, "[[", "r.squared")
R2_list2 <- sapply(smry_list2, "[[", "r.squared")
identical(R2_list, R2_list2)
#[1] TRUE

homals package for Nonlinear PCA in R: Error in dimnames(x) <- dn : length of 'dimnames' [1] not equal to array extent

I am trying to implement NLPCA (Nonlinear PCA) on a data set using the homals package in R but I keep on getting the following error message:
Error in dimnames(x) <- dn : length of 'dimnames' [1] not equal to array extent
The data set I use can be found in the UCI ML Repository and it's called dat when imported in R: https://archive.ics.uci.edu/ml/datasets/South+German+Credit+%28UPDATE%29
Here is my code (some code is provided once the data set is downloaded):
nlpcasouthgerman <- homals(dat, rank=1, level=c('nominal','numerical',rep('nominal',2),
'numerical','nominal',
rep('ordinal',2), rep('nominal',2),
'ordinal','nominal','numerical',
rep('nominal',2), 'ordinal',
'nominal','ordinal',rep('nominal',3)),
active=c(FALSE, rep(TRUE, 20)), ndim=3, verbose=1)
I am trying to predict the first attribute, therefore I set it to be active=FALSE.
The output looks like this (skipped all iteration messages):
Iteration: 1 Loss Value: 0.000047
Iteration: 2 Loss Value: 0.000044
...
Iteration: 37 Loss Value: 0.000043
Iteration: 38 Loss Value: 0.000043
Error in dimnames(x) <- dn :
length of 'dimnames' [1] not equal to array extent
I don't understand why this error comes up. I have used the same code on some other data set and it worked fine so I don't see why this error persists. Any suggestions about what might be going wrong and how I could fix this issue?
Thanks!

It seems the error comes from code generating NAs in the homals function, specifically for your data for the number_credits levels, which causes problems with sort(as.numeric((rownames(clist[[i]])))) and the attempt to catch the error, since one of the levels does not give an NA value.
So either you have to modify the homals function to take care of such an edge case, or change problematic factor levels. This might be something to file as a bug report to the package maintainer.
As a work-around in your case you could do something like:
levels(dat$number_credits)[1] <- "_1"
and the function should run without problems.
Edit:
I think one solution would be to change one line of code in the homals function, but no guarantee this does work as intended. Better submit a bug report to the package author/maintainer - see https://cran.r-project.org/web/packages/homals/ for the address.
Using rnames <- as.numeric(rownames(clist[[i]]))[order(as.numeric(rownames(clist[[i]])))] instead of rnames <- sort(as.numeric((rownames(clist[[i]])))) would allow the following code to identify NAs, but I am not sure why the author did not try to preserve factor levels outright.
Anyway, you could run a modified function in your local environment, which would require to explicitly call internal (not exported) homals functions, as shown below. Not necessarily the best approach, but would help you out in a pinch.
homals <- function (data, ndim = 2, rank = ndim, level = "nominal", sets = 0,
active = TRUE, eps = 0.000001, itermax = 1000, verbose = 0) {
dframe <- data
name <- deparse(substitute(dframe))
nobj <- nrow(dframe)
nvar <- ncol(dframe)
vname <- names(dframe)
rname <- rownames(dframe)
for (j in 1:nvar) {
dframe[, j] <- as.factor(dframe[, j])
levfreq <- table(dframe[, j])
if (any(levfreq == 0)) {
newlev <- levels(dframe[, j])[-which(levfreq == 0)]
}
else {
newlev <- levels(dframe[, j])
}
dframe[, j] <- factor(dframe[, j], levels = sort(newlev))
}
varcheck <- apply(dframe, 2, function(tl) length(table(tl)))
if (any(varcheck == 1))
stop("Variable with only 1 value detected! Can't proceed with estimation!")
active <- homals:::checkPars(active, nvar)
rank <- homals:::checkPars(rank, nvar)
level <- homals:::checkPars(level, nvar)
if (length(sets) == 1)
sets <- lapply(1:nvar, "c")
if (!all(sort(unlist(sets)) == (1:nvar))) {
print(cat("sets union", sort(unlist(sets)), "\n"))
stop("inappropriate set structure !")
}
nset <- length(sets)
mis <- rep(0, nobj)
for (l in 1:nset) {
lset <- sets[[l]]
if (all(!active[lset]))
(next)()
jset <- lset[which(active[lset])]
for (i in 1:nobj) {
if (any(is.na(dframe[i, jset])))
dframe[i, jset] <- NA
else mis[i] <- mis[i] + 1
}
}
for (j in 1:nvar) {
k <- length(levels(dframe[, j]))
if (rank[j] > min(ndim, k - 1))
rank[j] <- min(ndim, k - 1)
}
x <- cbind(homals:::orthogonalPolynomials(mis, 1:nobj, ndim))
x <- homals:::normX(homals:::centerX(x, mis), mis)$q
y <- lapply(1:nvar, function(j) homals:::computeY(dframe[, j], x))
sold <- homals:::totalLoss(dframe, x, y, active, rank, level, sets)
iter <- pops <- 0
repeat {
iter <- iter + 1
y <- homals:::updateY(dframe, x, y, active, rank, level, sets,
verbose = verbose)
smid <- homals:::totalLoss(dframe, x, y, active, rank, level,
sets)/(nobj * nvar * ndim)
ssum <- homals:::totalSum(dframe, x, y, active, rank, level, sets)
qv <- homals:::normX(homals:::centerX((1/mis) * ssum, mis), mis)
z <- qv$q
snew <- homals:::totalLoss(dframe, z, y, active, rank, level,
sets)/(nobj * nvar * ndim)
if (verbose > 0)
cat("Iteration:", formatC(iter, digits = 3, width = 3),
"Loss Value: ", formatC(c(smid), digits = 6,
width = 6, format = "f"), "\n")
r <- abs(qv$r)/2
ops <- sum(r)
aps <- sum(La.svd(crossprod(x, mis * z), 0, 0)$d)/ndim
if (iter == itermax) {
stop("maximum number of iterations reached")
}
if (smid > sold) {
warning(cat("Loss function increases in iteration ",
iter, "\n"))
}
if ((ops - pops) < eps)
break
else {
x <- z
pops <- ops
sold <- smid
}
}
ylist <- alist <- clist <- ulist <- NULL
for (j in 1:nvar) {
gg <- dframe[, j]
c <- homals:::computeY(gg, z)
d <- as.vector(table(gg))
lst <- homals:::restrictY(d, c, rank[j], level[j])
y <- lst$y
a <- lst$a
u <- lst$z
ylist <- c(ylist, list(y))
alist <- c(alist, list(a))
clist <- c(clist, list(c))
ulist <- c(ulist, list(u))
}
dimlab <- paste("D", 1:ndim, sep = "")
for (i in 1:nvar) {
if (ndim == 1) {
ylist[[i]] <- cbind(ylist[[i]])
ulist[[i]] <- cbind(ulist[[i]])
clist[[i]] <- cbind(clist[[i]])
}
options(warn = -1)
# Here is the line that I changed in the code:
# rnames <- sort(as.numeric((rownames(clist[[i]]))))
rnames <- as.numeric(rownames(clist[[i]]))[order(as.numeric(rownames(clist[[i]])))]
options(warn = 0)
if ((any(is.na(rnames))) || (length(rnames) == 0))
rnames <- rownames(clist[[i]])
if (!is.matrix(ulist[[i]]))
ulist[[i]] <- as.matrix(ulist[[i]])
rownames(ylist[[i]]) <- rownames(ulist[[i]]) <- rownames(clist[[i]]) <- rnames
rownames(alist[[i]]) <- paste(1:dim(alist[[i]])[1])
colnames(clist[[i]]) <- colnames(ylist[[i]]) <- colnames(alist[[i]]) <- dimlab
colnames(ulist[[i]]) <- paste(1:dim(as.matrix(ulist[[i]]))[2])
}
names(ylist) <- names(ulist) <- names(clist) <- names(alist) <- colnames(dframe)
rownames(z) <- rownames(dframe)
colnames(z) <- dimlab
dummymat <- as.matrix(homals:::expandFrame(dframe, zero = FALSE, clean = FALSE))
dummymat01 <- dummymat
dummymat[dummymat == 2] <- NA
dummymat[dummymat == 0] <- Inf
scoremat <- array(NA, dim = c(dim(dframe), ndim), dimnames = list(rownames(dframe),
colnames(dframe), paste("dim", 1:ndim, sep = "")))
for (i in 1:ndim) {
catscores.d1 <- do.call(rbind, ylist)[, i]
dummy.scores <- t(t(dummymat) * catscores.d1)
freqlist <- apply(dframe, 2, function(dtab) as.list(table(dtab)))
cat.ind <- sequence(sapply(freqlist, length))
scoremat[, , i] <- t(apply(dummy.scores, 1, function(ds) {
ind.infel <- which(ds == Inf)
ind.minfel <- which(ds == -Inf)
ind.nan <- which(is.nan(ds))
ind.nael <- which((is.na(ds) + (cat.ind != 1)) ==
2)
ds[-c(ind.infel, ind.minfel, ind.nael, ind.nan)]
}))
}
disc.mat <- apply(scoremat, 3, function(xx) {
apply(xx, 2, function(cols) {
(sum(cols^2, na.rm = TRUE))/nobj
})
})
result <- list(datname = name, catscores = ylist, scoremat = scoremat,
objscores = z, cat.centroids = clist, ind.mat = dummymat01,
loadings = alist, low.rank = ulist, discrim = disc.mat,
ndim = ndim, niter = iter, level = level, eigenvalues = r,
loss = smid, rank.vec = rank, active = active, dframe = dframe,
call = match.call())
class(result) <- "homals"
result
}

Models in a list can't be predicted upon

Consider a list of models that can be created by:
fits = vector(mode="list",length=10)
for(i in 1:10)
{
fits[[i]] = lm(nox~poly(dis,i),data=Boston)
}
Where, Boston dataset is used, that can be found in the MASS library.
Now, in order to make predictions:
dislim = range(Boston$dis)
dis.grid = seq(from = dislim[1],to = dislim[2],by = 0.1)
This is done to give values of dis upon which nox's values are predicted.
Now, in order to make predictions, we can do the following:
predict(fits[[1]],list(dis = dis.grid))
But this results in an error:
Error: variable 'poly(dis, i)' was fitted with type "nmatrix.1" but type "nmatrix.10" was supplied
In addition: Warning message:
In Z/rep(sqrt(norm2[-1L]), each = length(x)) :
longer object length is not a multiple of shorter object length
But, when I do the following:
lm.Boston = lm(nox~poly(dis,3),data=Boston)
lm.Boston.pred = predict(lm.Boston,list(dis = dis.grid))
It works fine. So, why can't I do that in the case of a list?

The correct way to specify a dynamic formula is to use paste and as.formula
library(MASS)
data(Boston)
dislim <- range(Boston$dis)
dis.grid <- seq(from = dislim[1],to = dislim[2],by = 0.1)
models <- lapply(1:10, function(i){
form = as.formula(paste0("nox~", "poly(dis," , i, ")"))
lm(form, data=Boston)
})
to predict
lapply(models, function(x){
predict(x, list(dis = dis.grid))
})
EDIT: Another way to build the formula (as per MrFlick comment) is:
`lm(bquote(nox~poly(dis,.(i))), data=Boston)`
models1 <- lapply(1:10, function(i){
lm(bquote(nox~poly(dis,.(i))), data=Boston)
})
Additionally (as per Nathan Werth comment) if the formulation:
models2 <- lapply(1:10, function(i){
lm(nox~poly(dis,i),data=Boston)
})
is used, the i is being treated as a variable in the model and it is possible to exploit such behavior in the following way:
predict(models2[[1]], list(dis = dis.grid, i = 1)
library(purrr)
models <- lapply(1:10, function(i){
form = as.formula(paste0("nox~", "poly(dis," , i, ")"))
lm(form, data=Boston)
})
models1 <- lapply(1:10, function(i){
lm(bquote(nox~poly(dis,.(i))), data=Boston)
})
models2 <- lapply(1:10, function(i){
lm(nox~poly(dis,i),data=Boston)
})
missuse <- lapply(models, function(x){
predict(x,list(dis = dis.grid))
})
MrFlick <- lapply(models1, function(x){
predict(x,list(dis = dis.grid))
})
NathanWerth <- purrr::map2(models2, 1:10, function(x, y){
predict(x,list(dis = dis.grid, i = y ))
})
purrr::pmap(list(missuse, MrFlick, NathanWerth), function(x, y, z) c(identical(x, y), identical(x, z)))

Multiple loop Syntax Error

I cannot figure out what's going wrong with my loop and it is already too complicated for my current level. I have already tried applybut obviously I do something wrong, so I didn't use it at all.
library('wavelets')
library('benford.analysis')
indeces <- ls() # my initial datasets
wfilters <- array(c("haar","la8","d4","c6")) # filter option in "modwt" function
wfiltname <- array(c("h","l","d","c")) # to rename the new objects
for (i in 1:nrow(as.array(indeces))) {
x <- get(as.matrix(indeces[i]))
x <- x[,2]
# Creates modwt objects equal to the number of filters
for (j in 1:nrow(as.array(wfilters))) {
x <- wavelets::modwt(x, filter = wfilters[j], n.levels = 4,
boundary = "periodic")
# A loop that creates a matrix with benford fun output per modwt n.levels option
for (l in 1:4) {
x <- as.matrix(x#W$W[l]) # n.levels are represented as x#W$W1, x#W$W2,...
x <- benford.analysis::benford(x, number.of.digits = 1,
sign = "both", discrete = T,
round = 3) # accepts matrices
x[,l] <- x$bfd$data.dist # it always has 9 elements
}
assign(paste0("b", wfiltname[j], indeces[i]), x)
}
}
The above loop should be reproducible with any data (where the values are in second column). The error I get is the following:
Error in array(x, c(length(x), 1L), if (!is.null(names(x))) list(names(x), :
'data' must be of a vector type, was 'NULL'

Thanks to #Cath and #jogo I made it work after some improvements. Here's the correct code:
temp <- list.files(path = "...")
list2env(
lapply(setNames(temp, make.names(gsub("*.csv$", "", temp))),
read.csv), envir = .GlobalEnv)
rm(temp)
indeces <- ls()
wfilters <- array(c("haar","la8","d4","c6"))
wfiltname <- array(c("h","l","d","c"))
k <- data.frame(matrix(nrow = 9,ncol = 4))
nlvl <- 4
for (i in 1:length(indeces)) {
x <- as.matrix(get(indeces[i]))
for (j in 1:length(wfilters)) {
y <- wavelets::modwt(as.matrix(x), filter = wfilters[j], n.levels = nlvl,
boundary = "periodic")
y <- as.matrix(y#W)
for(m in 1:nlvl) {
z <- as.matrix(y[[m]])
z <- benford.analysis::benford(z, number.of.digits = 1, sign = "both", discrete = TRUE, round = 16)
k[m] <- as.data.frame(z$bfd$data.dist)
colnames(k)[m] <- paste0(wfilters[j], "W", m)
}
assign(paste0(indeces[i], wfiltname[j]), k)
}
}
rm(x,y,z,i,j,m,k)
I would appreciate if there is a way to write it more efficiently. Thank you very much

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Repeat analysis for several datasets in R - r

Related

parallelizing lapply with parLapply does not recognize objects even though I suppied them

I am trying to use various transformations of the response variable and fit corresponding these model, and obtain residual plots for each model

homals package for Nonlinear PCA in R: Error in dimnames(x) <- dn : length of 'dimnames' [1] not equal to array extent

Models in a list can't be predicted upon

Multiple loop Syntax Error

Categories

Resources