data_list has different datasets.
daily_svm <- dt_list
weekly_svm <- dt_list
for(i in seq_along(dt_list)){
tmp <- dt_list[[i]]
train <- tmp[1:(nrow(tmpdat)-486), ]
test <- tmp[506:686,]
d_svm <- svm(load ~ daily, data = train,
type = "eps-regression")
daily_svm[[i]] <- predict(d_svm, test)
I am running this model, but i get this error.
Error in daily_svm[[i]] <- predict(d_svm, test) :
replacement has length zero
Related
I'm trying to speed up a script that otherwise takes days to handle larger data sets. So, is there a way to completely vectorize the following script:
# k-fold cross validation
df <- trees # a data frame 'trees' from R.
df <- df[sample(nrow(df)), ] # randomly shuffles the data.
k <- 10 # Number of folds. Note k=nrow(df) in the leave-one-out cross validation.
folds <- cut(seq(from=1, to=nrow(df)), breaks=k, labels=FALSE) # creates unique numbers for k equally size folds.
df$ID <- folds # adds fold IDs.
df[paste("pred", 1:10, sep="")] <- NA # adds multiple columns "pred1"..."pred10" to speed up the following loop.
library(mgcv)
for(i in 1:k) {
# looping for different models:
m1 <- gam(Volume ~ s(Height), data=df, subset=(ID != i))
m2 <- gam(Volume ~ s(Girth), data=df, subset=(ID != i))
m3 <- gam(Volume ~ s(Girth) + s(Height), data=df, subset=(ID != i))
# looping for predictions:
df[df$ID==i, "pred1"] <- predict(m1, df[df$ID==i, ], type="response")
df[df$ID==i, "pred2"] <- predict(m2, df[df$ID==i, ], type="response")
df[df$ID==i, "pred3"] <- predict(m3, df[df$ID==i, ], type="response")
}
# calculating residuals:
df$res1 <- with(df, Volume - pred1)
df$res2 <- with(df, Volume - pred2)
df$res3 <- with(df, Volume - pred3)
Model <- paste("m", 1:10, sep="") # creates a vector of model names.
# creating a vector of mean-square errors (MSE):
MSE <- with(df, c(
sum(res1^2) / nrow(df),
sum(res2^2) / nrow(df),
sum(res3^2) / nrow(df)
))
model.mse <- data.frame(Model, MSE, R2) # creates a data frame of model names, mean-square errors and coefficients of determination.
model.mse <- model.mse[order(model.mse$MSE), ] # rearranges the previous data frame in order of increasing mean-square errors.
I'd appreciate any help. This code takes several days if run on 30,000 different GAM models and 3 predictors. Thanks
I am using a for loop to generate 100 different train and test sets.
What I want to do now, is to save these 100 different train and test sets in order to be able to have a look at e.g. where iteration was 17.
This code shows my program with the for loop and the division into train and test set:
result_df<-matrix(ncol=3,nrow=100)
colnames(result_df)<-c("Acc","Sens","Spec")
for (g in 1:100 )
{
# Divide into Train and test set
smp_size <- floor(0.8 * nrow(mydata1))
train_ind <- sample(seq_len(nrow(mydata1)), size = smp_size)
train <- mydata1[train_ind, ]
test <- mydata1[-train_ind, ]
REST OF MY CODE
# Calculate some statistics
overall <- cm$overall
overall.accuracy <- format(overall['Accuracy'] * 100, nsmall =2, digits = 2)
overall.sensitivity <- format(cm$byClass['Sensitivity']* 100, nsmall =2, digits = 2)
overall.specificity <- format(cm$byClass['Specificity']* 100, nsmall =2, digits = 2)
result_df[g,1] <- overall.accuracy
result_df[g,2] <- overall.sensitivity
result_df[g,3] <- overall.specificity
}
How can I do this?
You could do the following, for example, saving each test and train sets as elements in a list:
result_df<-matrix(ncol=3,nrow=100)
colnames(result_df)<-c("Acc","Sens","Spec")
testlist <- list()
trainlist <- list()
for (g in 1:100 )
{
# Divide into Train and test set
smp_size <- floor(0.8 * nrow(mydata1))
train_ind <- sample(seq_len(nrow(mydata1)), size = smp_size)
train <- mydata1[train_ind, ]
test <- mydata1[-train_ind, ]
trainlist[[g]] <- train
testlist[[g]] <- test
}
EDIT
To retrieve the 7th element of these lists you could use trainlist[[7]]
You can save those in csv file by using the following method
write.csv(train, file = paste0("train-", Sys.time(), ".csv", sep=""))
write.csv(test, file = paste0("test-", Sys.time(), ".csv", sep=""))
One option could be to save the row indexes of your partitions, rather than saving all the datasets, and then select the rows indexes for the iteration you're interested in.
The caret package has a function called createDataPartition, which will do this for you:
library(caret)
df <- data.frame(col1 = rnorm(100), col2 = rnorm(100))
# create 100 partitions
train.idxs <- createDataPartition(1:nrow(df), times = 100, p = 0.8)
for(i in 1:length(train.idxs)) {
# create train and test sets
idx <- train.idxs[[i]]
train.df <- df[idx, ]
test.df <- df[-idx, ]
# calculate statistics ...
result_df[i,1] <- overall.accuracy
result_df[i,2] <- overall.sensitivity
result_df[i,3] <- overall.specificity
}
# check the datasets for the nth partition
# train set
df[train.idxs[[n]], ]
# test set
df[-train.idxs[[n]], ]
Put your code in a function and do a lapply():
result_df <- matrix(ncol=3, nrow=100)
colnames(result_df)<-c("Acc", "Sens", "Spec")
SIMg <- function(g) {
# Divide into Train and test set
smp_size <- floor(0.8 * nrow(mydata1))
train_ind <- sample(seq_len(nrow(mydata1)), size = smp_size)
train <- mydata1[train_ind, ]
test <- mydata1[-train_ind, ]
REST OF THE CODE
return(list(train=train, test=test, ...))
}
L <- lapply(1:100, SIMg)
The resulting list L has 100 elements, each element is a list containing the two dataframes and your results for one simulation run.
To get separate lists trainlist and testlist you can do:
trainlist <- lallpy(L, '[[', "train")
testlist <- lallpy(L, '[[', "test")
I am trying to write a program where the data and the place holder for the y (output) variable are given to the function. The function produces the confusion matrix for the data set and the test data. This is in fact my 5th attempt at this sort of function- which is why most of this function is from a manual using the iris data as the data set- but I seem to get stuck on the y.vec input for the function. Is my method for inserting the y variable into the function correct?
Here is my function.
function(data,y.vec)
{
library(e1071)
library(rpart)
data=data
index <- 1:nrow(data)
testindex <- sample(index, trunc(length(index)/3))
testset <- data[testindex,]
trainset <- data[-testindex,]
svm.model <- svm(as.factor(data[y.vec]) ~ ., data = trainset, cost = 100, gamma = 1)
svm.pred <- predict(svm.model, testset[,-y.vec])
table(pred = svm.pred, true = testset[,y.vec])
}
myFunc <- function(df, y.vec)
{
library(e1071)
df[,y.vec] <- as.factor(df[,y.vec])
set.seed(1)
index <- 1:nrow(df)
testindex <- sample(index, trunc(length(index)/3))
testset <- df[testindex,]
trainset <- df[-testindex,]
svm.model <- svm(as.formula(paste(y.vec, "~ .")), data = trainset, cost = 100, gamma = 1)
svm.pred <- predict(svm.model, testset[,!(names(testset) %in% y.vec)])
return(table(pred = svm.pred, true = testset[,y.vec]))
}
myFunc(iris, "Species")
I have the following function to return 9 data frames:
split_data <- function(dataset, train_perc = 0.6, cv_perc = 0.2, test_perc = 0.2)
{
m <- nrow(dataset)
n <- ncol(dataset)
#Sort the data randomly
data_perm <- dataset[sample(m),]
#Split data into training, CV, and test sets
train <- data_perm[1:round(train_perc*m),]
cv <- data_perm[(round(train_perc*m)+1):round((train_perc+cv_perc)*m),]
test <- data_perm[(round((train_perc+cv_perc)*m)+1):round((train_perc+cv_perc+test_perc)*m),]
#Split sets into X and Y
X_train <- train[c(1:(n-1))]
Y_train <- train[c(n)]
X_cv <- cv[c(1:(n-1))]
Y_cv <- cv[c(n)]
X_test <- test[c(1:(n-1))]
Y_test <- test[c(n)]
}
My code runs fine, but no data frames are created. Is there a way to do this? Thanks
This will store the nine data.frames in a list
split_data <- function(dataset, train_perc = 0.6, cv_perc = 0.2, test_perc = 0.2) {
m <- nrow(dataset)
n <- ncol(dataset)
#Sort the data randomly
data_perm <- dataset[sample(m),]
# list to store all data.frames
out <- list()
#Split data into training, CV, and test sets
out$train <- data_perm[1:round(train_perc*m),]
out$cv <- data_perm[(round(train_perc*m)+1):round((train_perc+cv_perc)*m),]
out$test <- data_perm[(round((train_perc+cv_perc)*m)+1):round((train_perc+cv_perc+test_perc)*m),]
#Split sets into X and Y
out$X_train <- train[c(1:(n-1))]
out$Y_train <- train[c(n)]
out$X_cv <- cv[c(1:(n-1))]
out$Y_cv <- cv[c(n)]
out$X_test <- test[c(1:(n-1))]
out$Y_test <- test[c(n)]
return(out)
}
If you want dataframes to be created in the workspace at the end, this is what you'll need to do:-
1) Create empty variable (which may equal out to NULL i.e. Y_test = NULL) in your R console.
2) Assign "<<-" operator to the same variables created in Step 1 inside your function i.e.
X_train <<- train[c(1:(n-1))]
Y_train <<- train[c(n)]
X_cv <<- cv[c(1:(n-1))]
Y_cv <<- cv[c(n)]
X_test <<- test[c(1:(n-1))]
Y_test <<- test[c(n)]
This shall make you access the newly created data from your workspace.
I am trying to write r2, rmse, coefficients, and standardized coefficients from stepAIC to a .CSV file:
NO3_lmres_ClimateOnly <- data.frame()
for (i in unique(Data$SeasAlltxt)){
print (i)
subdata1 <- subset(Data, SeasAlltxt == i)
for (j in unique(Data$ALSCIDtxtall)){
subdata2 <- subset(subdata1, ALSCIDtxtall == j)
fit <- lm(NO3resid~Avg94NO3+MaxDepth_m+MaxDepthDOY+FirstZeroDOY+PeakToGone+PRISMppt+PRISMtmax, data = subdata2, na.action = na.omit)
step <- stepAIC(fit, direction="both")
rmse <- round(sqrt(mean(resid(step)^2)), 3)
r2 <- round(summary(step)$r.squared, 3)
coefs <- summary(step)$coefficients
stdcoefs <- lm.beta(step)
stdcoefs <- unname(stdcoefs)
params <- names(stdcoefs)
tempvalues <- data.frame(i,j,rmse,r2,coefs,stdcoefs,params)
colnames(tempvalues) <- c('SeasAlltxt', 'ALSCIDtxtall', 'rmse', 'r2', 'coef', 'stdcoef','param')
NO3_lmres_ClimateOnly <- rbind(NO3_lmres_ClimateOnly,tempvalues)
}
}
write.csv(NO3_lmres_ClimateOnly, file = "NO3_ClimateOnly_stats.csv")
However, the above code produces this error:
Error in data.frame(i, j, rmse, r2, coefs, stdcoefs, params) :
arguments imply differing number of rows: 1, 3, 2, 0
I would also like to write the p-value associated with each parameter to the output table.
Any suggestions for how to accomplish this?
Maybe you want to change your code:
for (j in unique(subdata1$ALSCIDtxtall))
...
coefs <- summary(step)$coefficients[,1]
...
tempvalues <- data.frame(t(c(i,j,rmse,r2,coefs,stdcoefs,params)),stringsAsFactors=F)
colnames(tempvalues ) <- c('SeasAlltxt', 'ALSCIDtxtall', 'rmse', 'r2', names(coefs), paste('stdcoef:',params),params)
Bud the final rbind will give you an error when stepAIC select different number of coefficients.
Think of using a list instead:
Define cont=1 outside the for
then, change the following lines:
tempvalues <- data.frame(t(c(i,j,rmse,r2,coefs,stdcoefs,params)),stringsAsFactors=F)
colnames(tempvalues ) <- c('SeasAlltxt', 'ALSCIDtxtall', 'rmse', 'r2', names(coefs), paste('stdcoef:',params),params)
NO3_lmres_ClimateOnly[[cont]] <- tempvalues
cont=cont+1
Good luck!!