How can I save randomly generated train and test datasets? - r

I am using a for loop to generate 100 different train and test sets.
What I want to do now is save these 100 train and test sets so that I can later look at, for example, what happened at iteration 17.
This code shows my program with the for loop and the division into train and test set:
result_df<-matrix(ncol=3,nrow=100)
colnames(result_df)<-c("Acc","Sens","Spec")
for (g in 1:100 )
{
# Divide into Train and test set
smp_size <- floor(0.8 * nrow(mydata1))
train_ind <- sample(seq_len(nrow(mydata1)), size = smp_size)
train <- mydata1[train_ind, ]
test <- mydata1[-train_ind, ]
# REST OF MY CODE
# Calculate some statistics
overall <- cm$overall
overall.accuracy <- format(overall['Accuracy'] * 100, nsmall =2, digits = 2)
overall.sensitivity <- format(cm$byClass['Sensitivity']* 100, nsmall =2, digits = 2)
overall.specificity <- format(cm$byClass['Specificity']* 100, nsmall =2, digits = 2)
result_df[g,1] <- overall.accuracy
result_df[g,2] <- overall.sensitivity
result_df[g,3] <- overall.specificity
}
How can I do this?

You could do the following, for example, saving each train and test set as an element of a list:
result_df<-matrix(ncol=3,nrow=100)
colnames(result_df)<-c("Acc","Sens","Spec")
testlist <- list()
trainlist <- list()
for (g in 1:100 )
{
# Divide into Train and test set
smp_size <- floor(0.8 * nrow(mydata1))
train_ind <- sample(seq_len(nrow(mydata1)), size = smp_size)
train <- mydata1[train_ind, ]
test <- mydata1[-train_ind, ]
trainlist[[g]] <- train
testlist[[g]] <- test
}
EDIT: To retrieve, for example, the 7th train and test sets from these lists, you can use trainlist[[7]] and testlist[[7]].
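If you also want to keep these splits across R sessions, a minimal sketch (reusing trainlist and testlist from above): saveRDS() writes an R object to disk and readRDS() restores it.
# save the two lists of data frames to disk
saveRDS(trainlist, "trainlist.rds")
saveRDS(testlist, "testlist.rds")
# later, possibly in a fresh session
trainlist <- readRDS("trainlist.rds")
train17 <- trainlist[[17]]  # e.g. the train set from iteration 17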

You can save them to CSV files, for example (note that paste0() has no sep argument, and the raw output of Sys.time() contains spaces and colons that are awkward in file names, so a formatted timestamp is safer):
write.csv(train, file = paste0("train-", format(Sys.time(), "%Y%m%d_%H%M%S"), ".csv"))
write.csv(test, file = paste0("test-", format(Sys.time(), "%Y%m%d_%H%M%S"), ".csv"))
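Since the question generates 100 splits inside a loop, a hedged sketch of writing one pair of files per iteration (reusing the loop index g and data frame mydata1 from the question), so that the files never collide:
for (g in 1:100) {
  smp_size <- floor(0.8 * nrow(mydata1))
  train_ind <- sample(seq_len(nrow(mydata1)), size = smp_size)
  # the index g in the file name keeps the 100 pairs of files distinct
  write.csv(mydata1[train_ind, ], file = paste0("train-", g, ".csv"), row.names = FALSE)
  write.csv(mydata1[-train_ind, ], file = paste0("test-", g, ".csv"), row.names = FALSE)
}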

One option could be to save the row indexes of your partitions, rather than saving all the datasets, and then select the rows indexes for the iteration you're interested in.
The caret package has a function called createDataPartition, which will do this for you:
library(caret)
df <- data.frame(col1 = rnorm(100), col2 = rnorm(100))
# create 100 partitions
train.idxs <- createDataPartition(1:nrow(df), times = 100, p = 0.8)
for(i in 1:length(train.idxs)) {
# create train and test sets
idx <- train.idxs[[i]]
train.df <- df[idx, ]
test.df <- df[-idx, ]
# calculate statistics ...
result_df[i,1] <- overall.accuracy
result_df[i,2] <- overall.sensitivity
result_df[i,3] <- overall.specificity
}
# check the datasets for the nth partition
# train set
df[train.idxs[[n]], ]
# test set
df[-train.idxs[[n]], ]

Put your code in a function and do a lapply():
result_df <- matrix(ncol=3, nrow=100)
colnames(result_df)<-c("Acc", "Sens", "Spec")
SIMg <- function(g) {
# Divide into Train and test set
smp_size <- floor(0.8 * nrow(mydata1))
train_ind <- sample(seq_len(nrow(mydata1)), size = smp_size)
train <- mydata1[train_ind, ]
test <- mydata1[-train_ind, ]
# REST OF THE CODE
return(list(train=train, test=test, ...))
}
L <- lapply(1:100, SIMg)
The resulting list L has 100 elements, each element is a list containing the two dataframes and your results for one simulation run.
To get separate lists trainlist and testlist you can do:
trainlist <- lapply(L, '[[', "train")
testlist <- lapply(L, '[[', "test")
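If you also store the three statistics in the returned list (in place of the ... above), you can collect them into result_df afterwards. A minimal sketch, assuming each element of L carries components named acc, sens and spec:
# inside SIMg(), e.g.:
#   return(list(train = train, test = test,
#               acc = overall.accuracy, sens = overall.sensitivity, spec = overall.specificity))
result_df <- t(sapply(L, function(x) c(Acc = x$acc, Sens = x$sens, Spec = x$spec)))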

Related

rmse function issue in R

I have R code containing a nested for loop, inside which I use the rmse() function from the Metrics package. The call works on its own, but inside the nested loop it does not.
Here is what I want to do in R:
1. I generate a time series dataset of length 50.
2. I slice that series into chunks of sizes 2, 3, ..., 48, 49, giving me 48 different series derived from step 1.
3. I divide each of the 48 series into train and test sets so I can use the rmse() function from the Metrics package to get the Root Mean Squared Error (RMSE) for each of the 48 subseries formed in step 2.
4. The RMSE for each series is then tabulated according to its chunk size.
5. I obtain the best ARIMA model for each of the 48 series.
My R code
# simulate arima(1,0,0)
library(forecast)
library(Metrics)
n <- 50
phi <- 0.5
set.seed(1)
wn <- rnorm(n, mean=0, sd=1)
ar1 <- sqrt((wn[1])^2/(1-phi^2))
for(i in 2:n){
ar1[i] <- ar1[i - 1] * phi + wn[i]
}
ts <- ar1
t<-length(ts)# the length of the time series
li <- seq(n-2)+1 # vector of block sizes(i.e to be between 1 and n exclusively)
RMSEblk<-matrix(nrow = 1, ncol = length(li))#vector to store block means
colnames(RMSEblk)<-li
for (b in 1:length(li)){
l<- li[b]# block size
m <- ceiling(t / l) # number of blocks
blk<-split(ts, rep(1:m, each=l, length.out = t)) # divides the series into blocks
singleblock <- vector() #initialize vector to receive result from for loop
for(i in 1:10){
res<-sample(blk, replace=T, 100) # resamples the blocks
res.unlist<-unlist(res, use.names = F) # unlist the bootstrap series
# Split the series into train and test set
train <- head(res.unlist, round(length(res.unlist) * 0.6))
h <- length(res.unlist) - length(train)
test <- tail(res.unlist, h)
# Forecast for train set
model <- auto.arima(train)
future <- forecast(test, model=model,h=h)
nfuture <- as.numeric(out$mean) # makes the `future` object a vector
# use the `rmse` function from `Metrics` package
RMSE <- rmse(test, nn)
singleblock[i] <- RMSE # Assign RMSE value to final result vector element i
}
#singleblock
RMSEblk[b]<-mean(singleblock) #store into matrix
}
RMSEblk
The error I got
#Error in rmse(test, nn): unused argument (nn)
#Traceback:
But when I wrote
library(forecast)
train <- head(ar1, round(length(ar1) * 0.6))
h <- length(ar1) - length(train)
test <- tail(ar1, h)
model <- auto.arima(train)
#forecast <- predict(model, h)
out <- forecast(test, model=model,h=h)
nn <- as.numeric(out$mean)
rmse(test, nn)
It did work. Could you please point out what I am missing?
I am able to run your code after making two very small corrections in your for loop. See the two commented lines:
for (b in 1:length(li)){
l<- li[b]
m <- ceiling(t / l)
blk<-split(ts, rep(1:m, each=l, length.out = t))
singleblock <- vector()
for(i in 1:10){
res<-sample(blk, replace=T, 100)
res.unlist<-unlist(res, use.names = F)
train <- head(res.unlist, round(length(res.unlist) * 0.6))
h <- length(res.unlist) - length(train)
test <- tail(res.unlist, h)
model <- auto.arima(train)
future <- forecast(test, model=model,h=h)
nfuture <- as.numeric(future$mean) # EDITED: `future` instead of `out`
RMSE <- rmse(test, nfuture) # EDITED: `nfuture` instead of `nn`
singleblock[i] <- RMSE
}
RMSEblk[b]<-mean(singleblock)
}
It is possible that these typos did not result in errors because nn and out were defined in the global environment while you ran the for loop. A good debugging tip is to restart R and try to reproduce the problem.
Your loop code does not define nn; the other code that works does define it. To start your code with a clean slate, use this as the first executable line:
rm(list=ls())

R code to normalize the test set

I would like to normalize the data this way:
(trainData - mean(trainData)) / sd(trainData)
(testData - mean(trainData)) / sd(trainData)
For the train set I can use the scale() function. How can I do the same for the test set? I tried the lapply() function in different ways, but I did not succeed.
Many thanks! An example of my code:
Train <- data.frame(matrix(c(1:100),10,10))
Test <- data.frame(matrix(sample(1:100),10,10))
scaled.Train <- scale(Train)
ct <- ncol(Test)
rt <- nrow(Test)
ncol(Train)
sdmatrix <- data.frame(matrix(,rt,ct))
for (i in 1:ct){
sdmatrix[1,i] <- mean(Train[,i])
sdmatrix[2,i] <- sd(Train[,i])
}
Test <- rbind(Test, sdmatrix)
normTest <- function(x){
a <- x[rt-1]
b <- x[rt]
x <- (x-a)/b
}
Test <- lapply(Test[1:(rt-2),],normTest)
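For reference, a minimal sketch of one common way to do this, reusing Train, Test and scaled.Train as created at the top of the example: scale() stores the column means and standard deviations it used as attributes, and these can be passed back to scale() to transform the test set.
# train-column means and standard deviations, stored by scale()
train.center <- attr(scaled.Train, "scaled:center")
train.scale <- attr(scaled.Train, "scaled:scale")
# normalize the test set with the *train* statistics, as in the formulas above
scaled.Test <- scale(Test, center = train.center, scale = train.scale)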

Predicting chunks with M models in R

I have a dataset (HEART). I split it into chunks. I would like to predict each chunk with its M=3 previous models. In this case, I would like to predict chunk number 10 with models 7, 8, 9; chunk 9 with models 6, 7, 8; ...; chunk 4 with models 1, 2, 3.
Here is my code:
library(caret)
dat1 <- read.csv(url("http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"), header = FALSE,sep = ",")
colnames(dat1) <- c(LETTERS[1:(ncol (dat1)-1)],"CLA")
dat1$CLA<-as.factor (dat1$CLA)
chunk <- 30
n <- nrow(dat1)
r <- rep(1:floor(n/chunk),each=chunk)[1:n]
d <- split(dat1,r)
N<-floor(n/chunk)
cart.models <- list()
for(i in 1:N){cart.models[[i]]<-rpart(CLA~ ., data = d[[i]]) }
for (i in (1+M):N) { k=0
for (j in (i-M):(i-1)) {
k=k+1
d[[i]][,(ncol(d[[i]])+k)]<-(predict(cart.models[[j]], d[[i]][,c(-14)], type = "class") )
}
}
I get the following Error:
Error in `[<-.data.frame`(`*tmp*`, , (ncol(d[[i]]) + k), value = c(1L, :
new columns would leave holes after existing columns
Your question is a bit puzzling: you load caret without using any functions from it. The objective looks like a time series analysis, but instead of building a model on one chunk and predicting on the chunk that comes right after it, you have a more complex requirement, so createTimeSlices from caret won't do the trick.
You could create custom folds in caret with the index and indexOut arguments of trainControl, but that would ultimately lead to the creation of more models (21, to be exact) than are required for the stated objective (9). So I do believe loops are an appropriate approach:
create the models:
library(rpart)
N <- 9
cart.models <- list()
for(i in 1:N){
cart.models[[i]] <- rpart(CLA~ ., data = d[[i]])
}
N can be 9, since the model for chunk 10 will not be used later on.
create a matrix to store the values:
cart.predictions <- matrix(nrow = chunk, ncol = length(4:10)*3)
It should have as many rows as there are predictions in each chunk (so 30), and as many columns as there are sets of predictions (three models for each of chunks 4:10, i.e. 21).
k <- 0 #as a counter
for (j in 4:10) { #prediction on chunks 4:10
p <- j-3
pred <- list()
for(i in p : (p+2)) { #using models (chunk - 3) : (chunk - 1)
k = k + 1
predi <- predict(cart.models[[i]], d[[j]], type = "class")
cart.predictions[,k] <- predi
}
}
This creates a numeric matrix of predictions. When R converts factors to numeric it uses the internal level codes: 1 for the first level, 2 for the second, and so on. So to get back the original levels (0:4) you can just:
cart.predictions <- as.data.frame(cart.predictions - 1)
to create the column names:
names <- expand.grid(3:1, 4:10)
names$Var1 <- with(names, Var2 - Var1)
colnames(cart.predictions) <- make.names(paste0(names$Var1,"_", names$Var2))
Let's check that this is correct. The prediction from model 5 on chunk 6, converted to numeric:
as.numeric(as.character(predict(cart.models[[5]], d[[6]], type = "class")))
should be equal to
cart.predictions[["X5_6"]] #that's how the names were designed
all.equal(as.numeric(as.character(predict(cart.models[[5]], d[[6]], type = "class"))),
cart.predictions[["X5_6"]])
#output
#[1] TRUE
or you can create a character matrix in the first place:
cart.predictions <- matrix(data = NA_character_, nrow = chunk, ncol = length(4:10)*3)
k <- 0 #as a counter
for (j in 4:10) {
p <- j-3
pred <- list()
for(i in p : (p+2)) {
k = k + 1
predi <- predict(cart.models[[i]], d[[j]], type = "class")
cart.predictions[,k] <- predi
}
}
cart.predictions <- as.data.frame(cart.predictions)
This should be the preferred method if the classes are actual character labels ("names") rather than numbers.

R function to return multiple data frames

I have the following function to return 9 data frames:
split_data <- function(dataset, train_perc = 0.6, cv_perc = 0.2, test_perc = 0.2)
{
m <- nrow(dataset)
n <- ncol(dataset)
#Sort the data randomly
data_perm <- dataset[sample(m),]
#Split data into training, CV, and test sets
train <- data_perm[1:round(train_perc*m),]
cv <- data_perm[(round(train_perc*m)+1):round((train_perc+cv_perc)*m),]
test <- data_perm[(round((train_perc+cv_perc)*m)+1):round((train_perc+cv_perc+test_perc)*m),]
#Split sets into X and Y
X_train <- train[c(1:(n-1))]
Y_train <- train[c(n)]
X_cv <- cv[c(1:(n-1))]
Y_cv <- cv[c(n)]
X_test <- test[c(1:(n-1))]
Y_test <- test[c(n)]
}
My code runs fine, but no data frames are created. Is there a way to do this? Thanks
This will store the nine data frames in a list:
split_data <- function(dataset, train_perc = 0.6, cv_perc = 0.2, test_perc = 0.2) {
m <- nrow(dataset)
n <- ncol(dataset)
#Sort the data randomly
data_perm <- dataset[sample(m),]
# list to store all data.frames
out <- list()
#Split data into training, CV, and test sets
out$train <- data_perm[1:round(train_perc*m),]
out$cv <- data_perm[(round(train_perc*m)+1):round((train_perc+cv_perc)*m),]
out$test <- data_perm[(round((train_perc+cv_perc)*m)+1):round((train_perc+cv_perc+test_perc)*m),]
#Split sets into X and Y
out$X_train <- out$train[c(1:(n-1))]
out$Y_train <- out$train[c(n)]
out$X_cv <- out$cv[c(1:(n-1))]
out$Y_cv <- out$cv[c(n)]
out$X_test <- out$test[c(1:(n-1))]
out$Y_test <- out$test[c(n)]
return(out)
}
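A brief usage sketch (the built-in iris data frame is just a stand-in for your own dataset):
splits <- split_data(iris)
names(splits)        # "train" "cv" "test" "X_train" "Y_train" "X_cv" "Y_cv" "X_test" "Y_test"
head(splits$X_train) # predictor columns of the training rows
nrow(splits$test)    # size of the test split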
If you want the data frames to be created in your workspace (the global environment) at the end, this is what you'll need to do:
1) Create an empty variable (which may be set to NULL, e.g. Y_test <- NULL) in your R console.
2) Use the "<<-" operator on those same variables inside your function, i.e.
X_train <<- train[c(1:(n-1))]
Y_train <<- train[c(n)]
X_cv <<- cv[c(1:(n-1))]
Y_cv <<- cv[c(n)]
X_test <<- test[c(1:(n-1))]
Y_test <<- test[c(n)]
This will let you access the newly created data from your workspace.
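If you would rather not use "<<-" (which writes into the global environment from inside a function), a hedged alternative sketch that builds on the list returned in the first answer: list2env() copies the elements of a named list into the workspace under their own names.
splits <- split_data(mydataset)        # mydataset stands in for your data
list2env(splits, envir = globalenv()) # creates train, cv, test, X_train, ... in the workspace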

Passing mclapply() a parameter from for (i in range)

I'm trying to do this:
nmf.sub <- function(n){
sub.data.matrix <- data.matrix[, (index[n, ])] ## the index is a permutation of the original matrix at a 0.8 resampling proportion (doesn't really matter)
temp.result <- nmf(sub.data.matrix, rank = 2, seed = 12345) ## want to change 2 to i
return(temp.result)
}
class.list <- list()
for (i in nmf.rank){ ## nmf.rank is 2:4
results.list <- mclapply(mc.cores = 16, 1:resamp.iterations, function(n) nmf.sub(n)) ## resamp.iterations is 10, nmf.sub is defined above
}
But instead of having rank = 2 in the nmf for temp.result, I want to have rank = i
Any idea how I could pass it that parameter? Just passing it through mclapply as function(n, i) doesn't work.
You seemingly have two loops: one over i in nmf.rank and one over n in 1:resamp.iterations. Therefore, you need to pass both i and n to nmf.sub, e.g. like this:
nmf.sub <- function(n, i){
## the index is a permutation of the original matrix at a 0.8
## resampling proportion (doesn't really matter)
sub.data.matrix <- data.matrix[, (index[n, ])]
## want to change 2 to i
temp.result <- nmf(sub.data.matrix, rank = i, seed = 12345)
return(temp.result)
}
resamp.iterations <- 10
nmf.rank <- 2:4
res <- lapply(nmf.rank, function(i){
results.list <- mclapply(mc.cores = 16, 1:resamp.iterations,
function(n) nmf.sub(n,i))
})
## then you can flatten/reshape res
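For example, a small sketch of indexing the nested result from above (assuming nmf.rank <- 2:4 as in the question):
names(res) <- paste0("rank_", nmf.rank)
res[["rank_3"]][[7]]  # the fit for rank 3, resampling iteration 7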
Regarding your comment (below) about efficiency: the bulk of the numerical calculation is performed within the nmf() function, so the loop is properly set up, in the sense that each process/core gets a numerically intensive job. However, to speed up computation you might consider using the previously computed result as the starting point, instead of the seed 12345 (unless that seed is mandatory for some reason related to your problem). In the following example I get a 30-40% reduction in execution time:
library(NMF)
RNGkind("L'Ecuyer-CMRG") ## always use this when using mclapply()
nr <- 19
nc <- 2e2
set.seed(123)
data.matrix <- matrix(rexp(nc*nr),nr,nc)
resamp.iterations <- 10
nmf.rank <- 2:4
index <- t(sapply(1:resamp.iterations, function(n) sample.int(nc,nc*0.8)))
nmf.sub <- function(n, i){
sub.data.matrix <- data.matrix[ ,index[n, ]]
temp.result <- nmf(sub.data.matrix, rank = i, seed = 12345)
return(temp.result)
}
## version 1
system.time({
res <- lapply(nmf.rank, function(i){
results.list <- mclapply(mc.cores = 16, 1:resamp.iterations,
function(n) nmf.sub(n,i))
})
})
## version 2: swap internal and external loops
system.time({
res <-
mclapply(mc.cores=16, 1:resamp.iterations, function(n){
res2 <- nmf(data.matrix[ ,index[n, ]], rank=2, seed = 12345)
res3 <- nmf(data.matrix[ ,index[n, ]], rank=3, seed = 12345)
res4 <- nmf(data.matrix[ ,index[n, ]], rank=4, seed = 12345)
list(res2,res3,res4)
})
})
## version 3: use previous calculation as starting point
## ==> 30-40% reduction in computing time
system.time({
res <-
mclapply(mc.cores=16, 1:resamp.iterations, function(n){
res2 <- nmf(data.matrix[ ,index[n, ]], rank=2, seed = 12345)
res3 <- nmf(data.matrix[ ,index[n, ]], rank=3, seed = res2)
res4 <- nmf(data.matrix[ ,index[n, ]], rank=4, seed = res3)
list(res2,res3,res4)
})
})
