Iterative pairwise comparisons across columns in R - r

I have a very large matrix 200 x 1500, where the rows are samples and the columns are data. I want to do pairwise comparisons of all 1500 columns (~1.1M tests), so combn would take too long. I'm trying to run a linear model on the first column against each 1499 other columns, process/write the pvalue to output file (i.e.data1.tsv) and then repeat for the second column (excluding the comparison with the first column) and save to data2.tsv. And continue until all comparisons have been made. Below is the code for what I am trying to achieve.
library(data.table)
df = as.data.frame(matrix(runif(20, min=0, max=100), nrow=4))
colnames(df) = c("data_1", "data_2", "data_3", "data_4", "data_5")
rownames(df) = c("sample_1", "sample_2", "sample_3", "sample_4")
pval_1 = as.numeric(summary(lm(data_1 ~ data_2, data=df))$coefficients[,"Pr(>|t|)"][2])
pval_2 = as.numeric(summary(lm(data_1 ~ data_3, data=df))$coefficients[,"Pr(>|t|)"][2])
pval_3 = as.numeric(summary(lm(data_1 ~ data_4, data=df))$coefficients[,"Pr(>|t|)"][2])
pval_4 = as.numeric(summary(lm(data_1 ~ data_5, data=df))$coefficients[,"Pr(>|t|)"][2])
data_1 = data.frame(id1 = c("data_1","data_1","data_1","data_1"),
id2 = c("data_2","data_3","data_4","data_5"),
pval = c(pval_1, pval_2,pval_3,pval_4))
fwrite(data_1, to_path)
pval_5 = as.numeric(summary(lm(data_2 ~ data_3, data=df))$coefficients[,"Pr(>|t|)"][2])
pval_6 = as.numeric(summary(lm(data_2 ~ data_4, data=df))$coefficients[,"Pr(>|t|)"][2])
pval_7 = as.numeric(summary(lm(data_2 ~ data_5, data=df))$coefficients[,"Pr(>|t|)"][2])
data_2 = data.frame(id1 = c("data_2","data_2","data_2"),
id2 = c("data_3","data_4","data_5"),
pval = c(pval_5, pval_6,pval_7))
fwrite(data_2, to_path)
pval_8 = as.numeric(summary(lm(data_3 ~ data_4, data=df))$coefficients[,"Pr(>|t|)"][2])
pval_9 = as.numeric(summary(lm(data_3 ~ data_5, data=df))$coefficients[,"Pr(>|t|)"][2])
data_3 = data.frame(id1 = c("data_3","data_3"),
id2 = c("data_4","data_5"),
pval = c(pval_8,pval_9))
fwrite(data_3, to_path )

I don't understand why it is important to split the files instead of creating a dataframe containing all the pairwise p-values between variables.
Being said that, take a look.
I created a random dataframe with the dimension that you want to work (by the way, this is not that large...):
library(Hmisc)
df = as.data.frame(matrix(runif(200 * 1500, min=0, max=10), nrow=200))
Later, by using the function rcorr from the Hmisc library. I runned the correlation matrix that will provide you also the p-values between variables.
mycor <- rcorr(as.matrix(df), type="pearson")
mycor_p = mycor$P
Having done that, I extract the upper matrix from the mycor_p matrix and create a 3-column matrix.
index <- which(upper.tri(mycor_p, diag = TRUE), arr.ind = TRUE)
dim_n <- dimnames(mycor_p)
res = data.frame(row = dim_n[[1]][index[, 1]],
col = dim_n[[2]][index[, 2]],
val = mycor_p[index])
Finally, remove NA cases
final_df = res[complete.cases(res), ]
And export this dataframe:
write.csv(final_df, file = your_path, row.names=FALSE)

Related

How to find each variables contribution to RDA and partial RDA in R (vegan package)?

data("mite") # Load mite species abundance data
data("mite.env") # Load envdata
# Hellinger transform the community data
mite.spe.hel <- decostand(mite, method = "hellinger")
mite.env <- mite.env[,1:2]
mite.env$SoilCont <- rnorm(70,5,2)
# Standardize quantitative environmental data
mite.env$SubsDens <- decostand(mite.env$SubsDens, method = "standardize")
mite.env$WatrCont <- decostand(mite.env$WatrCont, method = "standardize")
mite.env$SoilCont <- decostand(mite.env$SoilCont, method = "standardize")
#Relative contribution of the variables
rda.env <- rda(mite.env[,1:3])
head(sort(round(100*scores(rda.env, display = "sp", scaling = 0)[,1]^2, 3), decreasing = TRUE))
> WatrCont SubsDens SoilCont
> 50.468 38.404 11.128
I want to calculate similar contributions for a rda regressed against the species data and a partial rda. Here is my attempt but the output doesn't look correct. What can I do?
mite.spe.rda.signif <- rda(mite.spe.hel ~ WatrCont + SubsDens + SoilCont, data = mite.env)
head(sort(round(100*scores(mite.spe.rda.signif, display = "sp", scaling = 0)[,1]^2, 3), decreasing = TRUE))
> LCIL TVEL LRUG ONOV SUCT HMIN
> 29.904 13.785 9.420 9.040 5.812 5.010
#Partial RDA
mite.spe.partial.rda <- rda(mite.spe.hel ~ WatrCont + SubsDens + Condition(SoilCont), data = mite.env)
head(sort(round(100*scores(mite.spe.partial.rda , display = "sp", scaling = 0)[,1]^2, 3), decreasing = TRUE))
> LCIL TVEL ONOV LRUG SUCT HMIN
> 27.141 14.428 10.044 9.826 5.788 5.785

Loop a function in r to create a new table

I have a dataframe in r and want to perform the levene's/ variance test on multiple variables with two groups and save all results in a table. I have tried to do this using a for() loop and sapply() but I get neither working:
df <- data.frame(
x = rnorm(100, 0, 1),
y = rnorm(100, 50, 1),
z = rnorm(100, 70, 2),
group = rep(c(0,1), each = 50)
)
varlist <- c("x","y","z")
res.var <- character(length(varlist))
res.f <- numeric(length(varlist))
res.p <- numeric(length(varlist))
Option 1)
for(i in seq_along(varlist)) {
form <- substitute(i ~ group, list(i = as.name(varlist)))
result <- var.test(
formula = form,
data = df)
res.var[i] <- varlist[i]
res.f[i] <- result$estimate
res.p[i] <- result$p.value
}
Option 2:
sapply(varlist, function(x) {
form <- substitute(i ~ group, list(i = as.name(varlist)))
result <- var.test(
formula = form,
data = df)
res.var[i] <- varlist[i]
res.f[i] <- result$estimate
res.p[i] <- result$p.value
})
Maybe there's an easier way to that this. I'd be glad for any help ;o) Thank you in advance.

A function to fit a random forest model and return the results of specified data

consider the following data frame:
dat1 <- data.frame(Loc = rep(c("NY","MA","FL","GA"), each = 1000),
Region = rep(c("a","b","c","d"),each = 1000),
ID = rep(c(1:10), each=200),
var1 = rnorm(1000),
var2=rnorm(1000),
var3=rnorm(1000))
Loc and Region are two grouping variables for ID. Assume I have several other data frames like dat1. I am trying to write a function that will automatically fit a random forest model to the data. I want to specify the dataframe, grouping variable, and columns that I want it to use.
I have tried variants of the following functions, but keep getting error messages that say Error in get(dat, envir = .GlobalEnv) : invalid first argument when I try to run them
library(caret)
library(randomForest)
rand.f <- function(dat,groupvar,cols){
model <- train(groupvar ~ paste0(cols,collapse = "+"), data = dat, method = "rf", trControl = trainControl("cv", number = 10), importance = T)
c.e <- model$finalModel$confusion[, "class.error"]
print(c.e)
}
rand.f(dat="dat1", groupvar = "Region", cols = 5:6)
model$bestTune
##################
rand.f <- function(dat,groupvar,cols){
model <- train(get(dat, envir=.GlobalEnv)[,groupvar] ~ paste0(cols,collapse = "+"), data = dat, method = "rf", trControl = trainControl("cv", number = 10), importance = T)
c.e <- model$finalModel$confusion[, "class.error"]
print(c.e)
}
rand.f(dat="dat1", groupvar = "Region", cols = 5:6)
model$bestTune
what am I doing wrong?
The following should be working:
rand.f <- function(dat,outcome){
model <- train(x = dat[, cols, drop=F]
, y = dat[, outcome]
, method = "rf"
, trControl = trainControl("cv", number = 2)
, importance = T)
c.e <- model$finalModel$confusion[, "class.error"]
return(c.e)
}
which also works for numbers as well as vectors for the column names, e.g.
cols <- colnames(dat1)[5:6]
Note that I renamed the 'grouping' variable as it is a bit unclear what the grouping variable should be in this context. I have renamed it as outcome that is to be predicted to highlight what this stands for. If you did indeed try to predict the region, you can ignore this comment.
If you do want to trigger this function for different groups in your data, i.e. separate forests for different subsets, then you would best do that outside of this function.

Adapting the meansd moderator option in sjPlot interaction

I am using sjPlot, the sjp.int function, to plot an interaction of an lme.
The options for the moderator values are means +/- sd, quartiles, all, max/min. Is there a way to plot the mean +/- 2sd?
Typically it would be like this:
model <- lme(outcome ~ var1+var2*time, random=~1|ID, data=mydata, na.action="na.omit")
sjp.int(model, show.ci=T, mdrt.values="meansd")
Many thanks
Reproducible example:
#create data
mydata <- data.frame( SID=sample(1:150,400,replace=TRUE),age=sample(50:70,400,replace=TRUE), sex=sample(c("Male","Female"),200, replace=TRUE),time= seq(0.7, 6.2, length.out=400), Vol =rnorm(400),HCD =rnorm(400))
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1){
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop*n*m)
id <- sample(0:(m*n-1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x){
df[rows[x], cols[x]] <<- NA
}
)
return(df)
}
mydata2 <- NAins(mydata,0.1)
#run the lme which gives error message
model = lme(Vol ~ age+sex*time+time* HCD, random=~time|SID,na.action="na.omit",data=mydata2);summary(model)
mydf <- ggpredict(model, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#lmer works
model2 = lmer(Vol ~ age+sex*time+time* HCD+(time|SID),control=lmerControl(check.nobs.vs.nlev = "ignore",check.nobs.vs.rankZ = "ignore", check.nobs.vs.nRE="ignore"), na.action="na.omit",data=mydata2);summary(model)
mydf <- ggpredict(model2, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#plotting gives problems (jittered lines)
plot(mydf)
With sjPlot, it's currently not possible. However, I have written a package especially dedicated to compute and plot marginal effects: ggeffects. This package is a bit more flexible (for marginal effects plots).
In the ggeffects-package, there's a ggpredict()-function, where you can compute marginal effects at specific values. Once you know the sd of your model term in question, you can specify these values in the function call to plot your interaction:
library(ggeffects)
# plot interaction for time and var2, for values
# 10, 30 and 50 of var2
mydf <- ggpredict(model, terms = c("time", "var2 [10,30,50]"))
plot(mydf)
There are some examples in the package-vignette, see especially this section.
Edit
Here are the results, based on your reproducible example (note that GitHub-Version is currently required!):
# requires at least the GitHub-Versiob 0.1.0.9000!
library(ggeffects)
library(nlme)
library(lme4)
library(glmmTMB)
#create data
mydata <-
data.frame(
SID = sample(1:150, 400, replace = TRUE),
age = sample(50:70, 400, replace = TRUE),
sex = sample(c("Male", "Female"), 200, replace = TRUE),
time = seq(0.7, 6.2, length.out = 400),
Vol = rnorm(400),
HCD = rnorm(400)
)
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1) {
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop * n * m)
id <- sample(0:(m * n - 1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x) {
df[rows[x], cols[x]] <<- NA
})
return(df)
}
mydata2 <- NAins(mydata, 0.1)
# run the lme, works now
model = lme(
Vol ~ age + sex * time + time * HCD,
random = ~ time |
SID,
na.action = "na.omit",
data = mydata2
)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
lme-plot
# lmer also works
model2 <- lmer(
Vol ~ age + sex * time + time * HCD + (time |
SID),
control = lmerControl(
check.nobs.vs.nlev = "ignore",
check.nobs.vs.rankZ = "ignore",
check.nobs.vs.nRE = "ignore"
),
na.action = "na.omit",
data = mydata2
)
summary(model)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"), ci.lvl = NA)
# plotting works, but only w/o CI
plot(mydf)
lmer-plot
# lmer also works
model3 <- glmmTMB(
Vol ~ age + sex * time + time * HCD + (time | SID),
data = mydata2
)
summary(model)
mydf <- ggpredict(model3, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
plot(mydf, facets = T)
glmmTMB-plots

How can a blocking factor be included in makeClassifTask() from mlr package?

In some classification tasks, using mlr package, I need to deal with a data.frame similar to this one:
set.seed(pi)
# Dummy data frame
df <- data.frame(
# Repeated values ID
ID = sort(sample(c(0:20), 100, replace = TRUE)),
# Some variables
X1 = runif(10, 1, 10),
# Some Label
Label = sample(c(0,1), 100, replace = TRUE)
)
df
I need to cross-validate the model keeping together the values with the same ID, I know from the tutorial that:
https://mlr-org.github.io/mlr-tutorial/release/html/task/index.html#further-settings
We could include a blocking factor in the task. This would indicate that some observations "belong together" and should not be separated when splitting the data into training and test sets for resampling.
The question is how can I include this blocking factor in the makeClassifTask?
Unfortunately, I couldn't find any example.
What version of mlr do you have? Blocking should be part of it since a while. You can find it directly as an argument in makeClassifTask
Here is an example for your data:
df$ID = as.factor(df$ID)
df2 = df
df2$ID = NULL
df2$Label = as.factor(df$Label)
tsk = makeClassifTask(data = df2, target = "Label", blocking = df$ID)
res = resample("classif.rpart", tsk, resampling = cv10)
# to prove-check that blocking worked
lapply(1:10, function(i) {
blocks.training = df$ID[res$pred$instance$train.inds[[i]]]
blocks.testing = df$ID[res$pred$instance$test.inds[[i]]]
intersect(blocks.testing, blocks.training)
})
#all entries are empty, blocking indeed works!
The answer by #jakob-r no longer works. My guess is something changed with cv10.
Minor edit to use "blocking.cv = TRUE"
Complete working example:
set.seed(pi)
# Dummy data frame
df <- data.frame(
# Repeated values ID
ID = sort(sample(c(0:20), 100, replace = TRUE)),
# Some variables
X1 = runif(10, 1, 10),
# Some Label
Label = sample(c(0,1), 100, replace = TRUE)
)
df
df$ID = as.factor(df$ID)
df2 = df
df2$ID = NULL
df2$Label = as.factor(df$Label)
resDesc <- makeResampleDesc("CV",iters=10,blocking.cv = TRUE)
tsk = makeClassifTask(data = df2, target = "Label", blocking = df$ID)
res = resample("classif.rpart", tsk, resampling = resDesc)
# to prove-check that blocking worked
lapply(1:10, function(i) {
blocks.training = df$ID[res$pred$instance$train.inds[[i]]]
blocks.testing = df$ID[res$pred$instance$test.inds[[i]]]
intersect(blocks.testing, blocks.training)
})

Resources