Function inputs from a list - r

How can I run a function (in R) where some of the inputs are pulled from a list (or data frame)? Am I right in thinking that this would be more efficient than running a for-loop?
I am running simulations and want to change the variable values, but as they take a long time to run I want them to run overnight and to just tick through the different values automatically.
Here's the code for the function:
n = 10000
mu = 0
sd = 1
n.sub = 100
iboot = 100
isim = 1000 ### REDUCED FOR THIS EXAMPLE ###
var.values <- NULL
var.values.pop <- NULL
hist.fn <- function(n,mu,sd,n.sub,iboot)
{
Pop <- rnorm(n,mu,sd)
var.pop <- var(Pop)
Samp <- sample(Pop, n.sub, replace = FALSE)
var.samp <- var(Samp)
for(i in 1:isim) {
for(j in 1:iboot) {
Boot <- sample(Samp, n.sub, replace = TRUE)
var.values[j] <- var(Boot)
}
Samp <- sample(Pop, n.sub, replace = FALSE)
var.values.pop[i] <- var(Samp)
}
hist.pop <- hist(var.values.pop,plot=F)
hist.boot <- hist(var.values,plot=F)
#mypath = file.path("C:", "Output", paste("hist.boot_n.", n.sub, "_var.", sd^2, "_isim.", isim, "_iboot.", iboot, ".wmf", sep=""))
#win.metafile(file=mypath)
plot.new() #### ADDED FOR THIS EXAMPLE INSTEAD OF OUTPUTTING TO FILE ####
plot(hist.pop, freq=FALSE, xlim=range(var.values.pop, var.values), ylim=range(hist.pop$density, hist.boot$density), main = paste("Histogram of variances \n n=",n.sub," mu=",mu,"var=",sd^2,"\n n.sim=",isim,"n.boot=",iboot,"\n"), cex.main=0.8, xlab="Variance", col="red")
plot(hist.boot, freq=FALSE, col="blue", border="blue", add=T, density=20, angle=45)
abline(v=var.pop, lty=2, col="black", lwd=2)
legend("topright", legend=c("sample","bootstrap"),col=c("red","blue"),lty=1,lwd=2,bty="n",cex=0.7)
#dev.off()
}
hist.fn(n,mu,sd,n.sub,iboot)
Then I want sd, n.sub, and iboot to change by running through the following values:
sd <- c(1,10,100,1000)
n.sub <- c(4,10,100,1000)
iboot <- c(100,1000,10000)

Perhaps something like this?
n = 10000
mu = 0
sd = 1
n.sub = 100
iboot = 100
isim = 1000
sd <- c(1,10,100,1000)
n.sub <- c(4,10,100,1000)
iboot <- c(100,1000,10000)
# hist.fn parameters: n,mu,sd,n.sub,iboot
params <- expand.grid(n = n, mu = mu, sd = sd,
n.sub = n.sub, iboot = iboot)
apply(params, 1, FUN = function(x) do.call(hist.fn, as.list(x) ) )
You probably want to put these:
var.values <- NULL
var.values.pop <- NULL
Inside hist.fn, because assigning values to variables outside a function doesn't work like you seem to think.

You should use do.call, which will apply the function using arguments in a list. I have simplified your example to run less loops for the example. You can modify the printline of the script in order to monitor your progress for a larger job:
# The function
hist.fn <- function(n,mu,isim,sd,n.sub,iboot)
{
Pop <- rnorm(n,mu,sd)
var.pop <- var(Pop)
Samp <- sample(Pop, n.sub, replace = FALSE)
var.samp <- var(Samp)
var.values <- NaN*seq(isim) # sets up an empty vector for results
var.values.pop <- NaN*seq(isim) # sets up an empty vector for results
for(i in seq(isim)) {
for(j in seq(iboot)) {
Boot <- sample(Samp, n.sub, replace = TRUE)
var.values[j] <- var(Boot)
print(paste("i =", i, "; j =", j))
}
Samp <- sample(Pop, n.sub, replace = FALSE)
var.values.pop[i] <- var(Samp)
}
list(var.values=var.values, var.values.pop=var.values.pop) #returns results in the form of a list
}
# Global variables
n = 100
mu = 0
isim = 10
# Changing variables
sd <- c(1,10,20,30)
n.sub <- c(4,10,20,30)
iboot <- c(100,200,300,400)
df <- data.frame(sd=sd, n.sub=n.sub, iboot=iboot)
res <- vector(mode="list", nrow(df)) # sets up an empty list for results
for(i in seq(nrow(df))){
res[[i]] <- do.call(hist.fn, c(n=n, mu=mu, isim=isim, df[i,]) )
}
res # show results

sd <- 1:3
n.sub <- 4:6
iboot <- 7:9
funct1<-function(x,y,z) print(x+y+z)
for (i in 1:length(sd)){
funct1(sd[i],n.sub[i],iboot[i])
}
just an example. Doing it with loop.

Related

How to see the distribution of variables after clustering with gower distance in R. How can i see the distribution of variables accross the clusters?

I have calculated dendrograms of my dataset with the divisive and agglomerative method
library(cluster)
library(fpc)
gower.dist <- daisy(data.cluster, metric=c("gower"))
divisive.clust <- diana(as.matrix(gower.dist),
diss = TRUE, keep.diss = TRUE)
plot(divisive.clust, main = "Divisive")
aggl.clust.c <- hclust(gower.dist, method = "complete")
plot(aggl.clust.c,
main = "Agglomerative, complete linkages")
I also have the results in a table with the amounts of cases in the clusters, etc.
cstats.table <- function(dist, tree, k) {
clust.assess <- c("cluster.number","n","within.cluster.ss","average.within","average.between",
"wb.ratio","dunn2","avg.silwidth")
clust.size <- c("cluster.size")
stats.names <- c()
row.clust <- c()
output.stats <- matrix(ncol = k, nrow = length(clust.assess))
cluster.sizes <- matrix(ncol = k, nrow = k)
for(i in c(1:k)){
row.clust[i] <- paste("Cluster-", i, " size")
}
for(i in c(2:k)){
stats.names[i] <- paste("Test", i-1)
for(j in seq_along(clust.assess)){
output.stats[j, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.assess])[j]
}
for(d in 1:k) {
cluster.sizes[d, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.size])[d]
dim(cluster.sizes[d, i]) <- c(length(cluster.sizes[i]), 1)
cluster.sizes[d, i]
}
}
output.stats.df <- data.frame(output.stats)
cluster.sizes <- data.frame(cluster.sizes)
cluster.sizes[is.na(cluster.sizes)] <- 0
rows.all <- c(clust.assess, row.clust)
# rownames(output.stats.df) <- clust.assess
output <- rbind(output.stats.df, cluster.sizes)[ ,-1]
colnames(output) <- stats.names[2:k]
rownames(output) <- rows.all
is.num <- sapply(output, is.numeric)
output[is.num] <- lapply(output[is.num], round, 2)
output
}
stats.df.divisive <- cstats.table(gower.dist, divisive.clust, 7)
stats.df.divisive
stats.df.aggl <-cstats.table(gower.dist, aggl.clust.c, 7)
#complete linkages looks like the most balanced approach
stats.df.aggl

how to organize the output of MLE using R

I wrote down this function for MLE estimation and then I apply it for different settings of parameters.
Finally, I bind all results for an output.
But is not working i have problem with the output and also I need to organize the output like the attached image using R program.
enter image description here
could some one help me please?
What should I fix and how can I print the results like the picture attached.
thank you in advance
rbssn<- function(n,alpha,beta)
{
if(!is.numeric(n)||!is.numeric(alpha)||!is.numeric(beta))
{stop("non-numeric argument to mathematical function")}
if(alpha<=0){ stop("alpha must be positive")}
if(beta<=0) { stop("beta must be positive") }
z <- rnorm(n,0,1)
r <- beta*((alpha*z*0.5)+sqrt((alpha*z*0.5)^2+1))^2
return(r)
}
#Function
mymle <- function(n,alpha,beta,rep)
{
theta=c(alpha,beta) # store starting values
#Tables
LHE=array(0, c(2,rep));
rownames(LHE)= c("MLE_alpha", "MLE_beta")
#Bias
bias= array(0, c(2,rep));
rownames(bias)= c("bias_alpha", "bias_beta")
#Simulation
set.seed(1)
#Loop
for(i in 1:rep){
myx <- exp(-rbssn(n, alpha, beta))
Score <- function(x) {
y <- numeric(2)
y[1] <- (-n/x[1])*(1+2/(x[1]^2)) - (1/(x[2]*x[1]^3))*sum(log(myx)) - (x[2]/(x[1]^3))*sum(1/log(myx))
y[2] <- -(n/(2*x[2])) + sum((1/(x[2]-log(myx)))) - (1/(2*(x[1]^2)*(x[2]^2)))*sum(log(myx)) + (1/(2*x[1]^2))*sum(1/(log(myx)))
y
}
Sin <- c(alpha,beta)
mle<- nleqslv(Sin, Score, control=list(btol=.01))[1]
LHE[i,]= mle
bias[i,]= c(mle[1]-theta[1], mle[2]-theta[2])
}
# end for i
#Format results
L <-round(apply(LHE, 1, mean), 3) # MLE of all the applied iterations
bs <-round(apply(bias,1, mean),3) # bias of all the applied iterations
row<- c(L, bs)
#Format a label
lab <- paste0('n= ',n,';',' alpha= ',alpha,';',' beta= ',beta)
row2 <- c(lab,row)
row2 <- as.data.frame(t(row2))
return(row2)
}
#Bind all
#Example 1
ex1 <- mymle(n = 20,alpha = 1,beta = 0.5,rep = 100)
ex2 <- mymle(n = 50,alpha = 2,beta = 0.5,rep = 100)
ex3 <- mymle(n = 100,alpha = 3,beta = 0.5,rep = 100)
#Example 2
ex4 <- mymle(n = 20,alpha = 0.5,beta = 0.5,rep = 100)
ex5 <- mymle(n = 50,alpha = 0.5,beta = 1,rep = 100)
ex6 <- mymle(n = 100,alpha = 0.5,beta = 1,rep = 100)
df <- rbind(ex1,ex2,ex3,ex4,ex5,ex6)
Any help will be appreciated.

Looping a function in R

I have written a cross validation/grid search style code in R that tries to find an optimal threshold value for a given value of mtry (using the random forest algorithm). I have posted my code below using the Sonar data from the library mlbench However, there seems to be some problems with this code.
library(caret)
library(mlbench)
library(randomForest)
res <- matrix(0, nrow = 10, ncol = 6)
colnames(res) <- c("mtry","Threshhold","Accuracy", "PositivePred", "NegativePred", "F-value")
out <- matrix(0, nrow = 17, ncol = 6)
colnames(out) <- c("mtry","Threshhold","Avg.Accuracy", "Avg.PosPred", "Avg.NegPred", "Avg.F_Value")
rep <- matrix(0, nrow = 10, ncol = 6)
colnames(out) <- c("mtry","Threshhold","Avg_Accuracy", "Avg_PosPred", "Avg_NegPred", "Avg_F_Value")
data(Sonar)
N=Sonar
### creating 10 folds
folds <- cut(seq(1,nrow(N)),breaks=10,labels=FALSE)
for (mtry in 5:14) {
K=mtry-4
for(thresh in seq(1,9,0.5)) {
J = 2*thresh-1
dataset<-N[sample(nrow(N)),] #### mix up the dataset N
for(I in 1:10){
#Segement your data by fold using the which() function
testIndexes <- which(folds==I,arr.ind=TRUE)
N_test <- dataset[testIndexes, ] ### select each fold for test
N_train <- dataset[-testIndexes, ] ### select rest for training
rf = randomForest(Class~., data = N_train, mtry=mtry, ntree=500)
pred = predict(rf, N_test, type="prob")
label = as.factor(ifelse(pred[,2]>=thresh,"M","R"))
confusion = confusionMatrix(N_test$Class, label)
res[I,1]=mtry
res[I,2]=thresh
res[I,3]=confusion$overall[1]
res[I,4]=confusion$byClass[3]
res[I,5]=confusion$byClass[4]
res[I,6]=confusion$byClass[7]
}
print(res)
out[J,1] = mtry
out[J,2] = thresh
out[J,3] = mean(res[,2])
out[J,4] = mean(res[,3])
out[J,5] = mean(res[,4])
out[J,6] = mean(res[,5])
}
print(out)
rep[K,1] = mtry
rep[K,2] = thresh
rep[K,3] = mean(out[,2])
rep[K,4] = mean(out[,3])
rep[K,5] = mean(out[,4])
rep[K,6] = mean(out[,5])
}
print(rep)
Earlier, I wrote a similar code with the "iris" dataset, and I did not seem to have any problems:
library(caret)
library(randomForest)
data(iris)
N <- iris
N$Species = ifelse(N$Species == "setosa", "a", "b")
N$Species = as.factor(N$Species)
res <- matrix(0, nrow = 10, ncol = 5)
colnames(res) <- c("Threshhold","Accuracy", "PositivePred", "NegativePred", "F-value")
out <- matrix(0, nrow = 9, ncol = 5)
colnames(out) <- c("Threshhold","Avg.Accuracy", "Avg.PosPred", "Avg.NegPred", "Avg.F_Value")
### creating 10 folds
folds <- cut(seq(1,nrow(N)),breaks=10,labels=FALSE)
for(J in 1:9) {
thresh = J/10
dataset<-N[sample(nrow(N)),] #### mix up the dataset N
for(I in 1:10){
#Segement your data by fold using the which() function
testIndexes <- which(folds==I,arr.ind=TRUE)
N_test <- dataset[testIndexes, ] ### select each fold for test
N_train <- dataset[-testIndexes, ] ### select rest for training
rf = randomForest(Species~., data = N_train, mtry=3, ntree=10)
pred = predict(rf, N_test, type="prob")
label = as.factor(ifelse(pred[,1]>=thresh,"a","b"))
confusion = confusionMatrix(N_test$Species, label)
res[I,1]=thresh
res[I,2]=confusion$overall[1]
res[I,3]=confusion$byClass[3]
res[I,4]=confusion$byClass[4]
res[I,5]=confusion$byClass[7]
}
print(res)
out[J,1] = thresh
out[J,2] = mean(res[,2])
out[J,3] = mean(res[,3])
out[J,4] = mean(res[,4])
out[J,5] = mean(res[,5])
}
print(out)
Could someone please assist me in debugging the first code?
Thanks
You need to close parenthesis ) in your for loop.
Replace this
for(thresh in seq(1,9,0.5) {
with
for(thresh in seq(1,9,0.5)) {
Update:
Also, it appears that your thresh is always above 1 giving a single value R in the label, as it is never above thresh.
label = as.factor(ifelse(pred[,2]>=thresh,"M","R"))
and that creates a problem in the next statement
confusion = confusionMatrix(N_test$Class, label)
I tested with 0.5, and I get no error.
label = as.factor(ifelse(pred[,2]>=0.5,"M","R"))
If you can define a better thresh - to stay between 0 and 1, you should be fine.

Plot clprofiles function without hitting enter each time

I'm looking for a way to get all plots of the variables without hitting enter each time.
if you're familiar with this function clprofiles of Kprototype, you know this message Hit <Return> to see next plot:, i want to see all plots of the variables at once.
Now i've tried doing a 'for loop' after the instruction clprofiles(kpres, df) :
clprofiles(kpres, df)
for (i in 1:length(t)) {
print("
")
}
But it's useless.
Thanks for your help.
In that case, you will have to override the default behaviour of clprofiles. Add this new function my.clprofiles to your script:
my.clprofiles <- function(object, x, vars = NULL, col = NULL){
library(RColorBrewer)
if(length(object$cluster) != nrow(x)) stop("Size of x does not match cluster result!")
if(is.null(vars)) vars <- 1:ncol(x)
if(!is.numeric(vars)) vars <- sapply(vars, function(z) return(which(colnames(x)==z)))
if(length(vars) < 1) stop("Specified variable names do not match x!")
if(is.null(col)){
k <- max(unique(object$cluster))
if(k > 2) col <- brewer.pal(k, "Set3")
if(k == 2) col <- c("lightblue","orange")
if(k == 1) col <- "lightblue"
}
clusids <- sort(unique(object$cluster))
if(length(col) != max(clusids)) warning("Length of col should match number of clusters!")
#REMOVE PROMPT
#par(ask=TRUE)
par(mfrow=c(2,2))
for(i in vars){
if(is.numeric(x[,i])){
boxplot(x[,i]~object$cluster, col = col, main = colnames(x)[i])
legend("topright", legend=clusids, fill = col)
}
if(is.factor(x[,i])){
tab <- table(x[,i], object$cluster)
for(j in 1:length(object$size)) tab[,j] <- tab[,j]/object$size[j]
barplot(t(tab), beside = TRUE, main = colnames(x)[i], col = col)
}
}
invisible()
}
And then you can call it once without having to hit Enter:
my.clprofiles(kpres,x)
which produces the same plot as in the first answer.
You can override three of the four prompts (but not the first one) since the plotting method is within the clprofiles command. If your goal is just to get all the plots to print on a single plot, this will do it:
library(clustMixType)
# Example from documentation
n <- 100; prb <- 0.9; muk <- 1.5
clusid <- rep(1:4, each = n)
x1 <- sample(c("A","B"), 2*n, replace = TRUE, prob = c(prb, 1-prb))
x1 <- c(x1, sample(c("A","B"), 2*n, replace = TRUE, prob = c(1-prb, prb)))
x1 <- as.factor(x1)
x2 <- sample(c("A","B"), 2*n, replace = TRUE, prob = c(prb, 1-prb))
x2 <- c(x2, sample(c("A","B"), 2*n, replace = TRUE, prob = c(1-prb, prb)))
x2 <- as.factor(x2)
x3 <- c(rnorm(n, mean = -muk), rnorm(n, mean = muk), rnorm(n, mean = -muk), rnorm(n, mean = muk))
x4 <- c(rnorm(n, mean = -muk), rnorm(n, mean = muk), rnorm(n, mean = -muk), rnorm(n, mean = muk))
x <- data.frame(x1,x2,x3,x4)
kpres <- kproto(x, 4)
Then you can make the plot by preparing with par first:
> par(mfrow=c(2,2))
> clprofiles(kpres, x)
Hit <Return> to see next plot:
>
And it produces:
I found another solution that shows the plots in an external window (full screen) and instead of presing "enter" each time, you just have to click
dev.new(width=5,height=4,noRStudioGD = TRUE)
clprofiles(kpres,df)

Avoiding Loops to Generate a Complex Dataframe with Nested Lists

Here is a kind of DF, I have to generate to store simulations data).
nbSimul <- 100
nbSampleSizes <- 4
nbCensoredRates <- 4
sampleSize <- c(100, 50, 30, 10)
censoredRate <- c(0.1, 0.3, 0.5, 0.8)
df.sampled <- data.frame(cas = numeric() ,
distribution = character(),
simul = numeric() ,
sampleSize = numeric() ,
censoredRate = numeric() ,
dta = I(list()) ,
quantileLD = I(list()) ,
stringsAsFactors = FALSE)
v <- 0 # Scenario indicator
for(k in 1:nbCensoredRates){
for(j in 1:nbSampleSizes){
for(i in 1:nbSimul){
# Scenario Id + Other info
v <- v + 1
df.sampled[v,"cas"] <- v
df.sampled[v,"distribution"] <- "logNormal"
df.sampled[v,"simul"] <- i
df.sampled[v,"sampleSize"] <- sampleSize[j]
df.sampled[v,"censoredRate"] <- censoredRate[k]
X <- rlnorm(sampleSize[j], meanlog = 0, sdlog = 1)
estimatedLD <- array(9)
for(w in 1:9){
estimatedLD[w] <- quantile(X, probs=censoredRate[k], type=w)[[1]]
}
df.sampled$dta[v] <- list(X)
df.sampled$quantileLD[v] <- list(estimatedLD[1:9])
}
}
}
Which is quite difficult to read.
I would like to find a way to avoid loops, and to reference easily scenarios (v) and attached variables.
Any idea?

Resources