I am getting an error while creating a PDF report out of R markdown file. Below is the snippet of the error:
Error in --dayBikeData <- read.csv("D:\\Madhav\\Study\\MSIS\\PredictiveLearning\\Week-1\\Homework\\Bike-Sharing-Dataset\\day.csv") :
object 'dayBikeData' not found
Calls: <Anonymous> ... handle -> withCallingHandlers -> withVisible -> eval -> eval
Execution halted
I have this object -dayBikeData in the session but still it is giving the error don't know how to proceed on this.
Code for fetching the data from the csv file:
```{r}
dayBikeData <- read.csv("D:\\Madhav\\Study\\MSIS\\PredictiveLearning
\\Week-1\\Homework\\Bike-Sharing-Dataset\\day.csv")
# Performs each of the operation asked in the question
basicOperations <- function(inputData){
lenData <- length(inputData)
avg <- round(mean(inputData, na.rm = TRUE), digits = 2) # mean calculation
standardDeviation <- round(sd(inputData), digits = 2) # Standard deviation
sem <- round(standardDeviation/sqrt(lenData), digits = 2)
# Formula for CI is mean - error where error is
error = round(qnorm(0.975)*standardDeviation/sqrt(lenData), digits = 2)
lower_ci <- avg - error
upper_ci <- avg + error
# resultList <- list(obs = lenData, mean = avg, standarDeviation = sd,
# standardMeanError= sem, lowerCI = lower_ci, upperCI = upper_ci
resultList <- c(lenData, avg, standardDeviation, sem,lower_ci,upper_ci)
print(resultList)
}
#Calculations for the Year Wise Data
# dData2011 <- dayBikeData[dayBikeData$yr==0,]
# dData2012 <- dayBikeData[dayBikeData$yr==1,]
dData2011ResultSet <- basicOperations(dayBikeData[dayBikeData$yr==0,]$cnt)
dData2012ResultSet <- basicOperations(dayBikeData[dayBikeData$yr==1,]$cnt)
#Calculations for the Holiday Wise Data
# dDataHoliady_0 <- dayBikeData[dayBikeData$holiday ==0,]
# dDataHoliady_1 <- dayBikeData[dayBikeData$holiday ==1,]
dDataHoliady0ResultSet <- basicOperations(dayBikeData[dayBikeData$holiday ==0,]$cnt)
dDataHoliady1ResultSet <- basicOperations(dayBikeData[dayBikeData$holiday ==1,]$cnt)
#Calculations for the WorkingDay Wise Data
# dDataWorkingDay_0 <- dayBikeData[dayBikeData$workingday ==0,]
# dDataWorkingDay_1 <- dayBikeData[dayBikeData$workingday ==1,]
dDataWorkingDay0ResultSet <- basicOperations(dayBikeData[dayBikeData$workingday ==0,]$cnt)
dDataWorkingDay1ResultSet <- basicOperations(dayBikeData[dayBikeData$workingday ==1,]$cnt)
#Calculations for the Temperature wise data
avgTemp <- mean(dayBikeData$temp, na.rm = TRUE)
dDataTempGreaterEq <- dayBikeData[dayBikeData$temp >= avgTemp,]
dDataTempLess <- dayBikeData[dayBikeData$temp < avgTemp,]
dDataTempGreaterEqResultSet <- basicOperations(dDataTempGreaterEq$cnt)
dDataTempLessResultSet <- basicOperations(dDataTempLess$cnt)
#Calculations for the Weather wise data
# dDataWeather_1 <- dayBikeData[dayBikeData$weathersit ==1,]
# dDataWeather_2 <- dayBikeData[dayBikeData$weathersit ==2,]
# dDataWeather_3 <- dayBikeData[dayBikeData$weathersit ==3,]
dDataWeather1ResultSet <- basicOperations(dayBikeData[dayBikeData$weathersit ==1,]$cnt)
dDataWeather2ResultSet <- basicOperations(dayBikeData[dayBikeData$weathersit ==2,]$cnt)
dDataWeather3ResultSet <- basicOperations(dayBikeData[dayBikeData$weathersit ==3,]$cnt)
#Calculations for the Season wise data
# dDataSeason_1 <- dayBikeData[dayBikeData$season ==1,]
# dDataSeason_2 <- dayBikeData[dayBikeData$season ==2,]
# dDataSeason_3 <- dayBikeData[dayBikeData$season ==3,]
# dDataSeason_4 <- dayBikeData[dayBikeData$season ==4,]
dDataSeason1ResultSet <- basicOperations(dayBikeData[dayBikeData$season ==1,]$cnt)
dDataSeason2ResultSet <- basicOperations(dayBikeData[dayBikeData$season ==2,]$cnt)
dDataSeason3ResultSet <- basicOperations(dayBikeData[dayBikeData$season ==3,]$cnt)
dDataSeason4ResultSet <- basicOperations(dayBikeData[dayBikeData$season ==4,]$cnt)
#Constrcut a row wise data
resultData <- rbind(dData2011ResultSet, dData2012ResultSet, dDataHoliady0ResultSet,
dDataHoliady1ResultSet,dDataWorkingDay0ResultSet,
dDataWorkingDay1ResultSet,dDataTempGreaterEqResultSet,
dDataTempLessResultSet, dDataWeather1ResultSet,
dDataWeather2ResultSet, dDataWeather3ResultSet,dDataSeason1ResultSet,
dDataSeason2ResultSet, dDataSeason3ResultSet,dDataSeason4ResultSet)
colnames(resultData) <- c("N","Mean","SD" , "SEM","Lower_CI", "UPPER_CI")
rownames(resultData) <- c("Year-0", "Year-1", "Holiday-0", "Holiday-1", "WorkingDay-0",
"WorkingDay-1","Temperature >=","Temperature <", "Weather-1",
"Weather-2","Weather-3","Season-1","Season-2", "Season-3",
"Season-4")
df.resultData <- as.data.frame(resultData)
df.resultData["Value"] <- NA
df.resultData$Value <- c(2011, 2012, 0,1, 0,1,1, 0, 1,2,3,1,2,3,4)
df.resultData = df.resultData[,c(7,1,2,3,4,5,6)]
library(knitr)
# print(xtable(df.resultData), type = "latex")
kable(df.resultData, format = "markdown")
write.csv(df.resultData, file = "D:\\X\\Study\\MSIS\\PredictiveLearning\\OutputResult.csv")
Your file path is wrong... There is a new line and lots of spaces in the middle of it.
> "D:\\Madhav\\Study\\MSIS\\PredictiveLearning
+ \\Week-1\\Homework\\Bike-Sharing-Dataset\\day.csv"
[1] "D:\\Madhav\\Study\\MSIS\\PredictiveLearning\n \\Week-1\\Homework\\Bike-Sharing-Dataset\\day.csv"
So the file is not getting read properly and hence the object is not available in the knitr session.
I downloaded your dataset from UCI Machine Learning Repository, saved your markdown in a new folder, adjusted the filenames by deleting the paths, ran it, and it worked fine.
So I maybe your session is corrupt, or the paths are wrong, or something. Try what I did and it should work.
Proof:
Related
I am attempting to use a batch file to run the funcion SPLOT to bring in variables to my response surface code. I have confirmed the variables are passing properly and are numeric, however, when it gets to the as.coded.data function, I get the error message that it can not locate the variable. Is this a limitation of this function? It worked when I manually entered the values in place of the these variables.
'''
SPLOT <-function(workdir, savedir, TREAT1_LABEL, TREAT2_LABEL, TREAT3_LABEL,RESP1_LABEL, RESP2_LABEL,
T1LOW, T1HIGH, T1MID, T2LOW, T2HIGH, T2MID,
T3LOW, T3HIGH,T3MID){
file_list <- list.files(path=workdir, pattern="*.csv")
for (x in 1:NROW(file_list)) {
PROJ_DATA<-read.csv(file=file_list[x])
i <- 1
while (i <= NROW(file_list)) {
name<-regmatches(file_list[x], regexpr("*.*", file_list[x])) # extract the text from the file-name inorder to name the plot
mytitle = paste(name,".pdf")
mytitle1 = paste(name, ".tiff")
PROJ_DATA.adj <- PROJ_DATA
results <- "savedir"
setwd(workdir)
#### Summarize Data ####
PROJ_DATA1 <- PROJ_DATA.adj %>% group_by(FACTOR) %>%
summarise(N = length(FACTOR),
mean.TREAT1 = mean(TREAT1, na.rm=TRUE),
mean.TREAT2 = mean(TREAT2, na.rm=TRUE),
mean.TREAT3 = mean(TREAT3, na.rm=TRUE),
mean.RESP1 = mean(RESP1, na.rm=TRUE),
mean.RESP2 = mean(RESP2, na.rm=TRUE))%>% drop_na()
PROJ_DATA2 <- na.omit(PROJ_DATA1)
#### Relativize the Dataset - coding ####
PROJ_DATA.coded <- as.coded.data(PROJ_DATA, TREAT1.coded ~ (mean.TREAT - T1MID)/(0.5*(T1HIGH-T1LOW)),
TREAT2.coded ~ (mean.TREAT2 - T2MID)/(0.5*(T2HIGH-T2LOW)),
TREAT3.coded ~ (mean.TREAT3 - T3MID)/(0.5*(T2HIGH-T2LOW)))
PROJ_DATA2.rsm <- rsm(RESP1 ~ FACTOR + SO(TREAT1.coded, TREAT2.coded, TREAT3.coded), data = PROJ_DATA.coded)
PROJ_DATA2.rsm$studres <- rstudent(PROJ_DATA2.rsm)
summary(PROJ_DATA2.rsm)
Every time I run the script it always gives me an error: Error in { : task 1 failed - "could not find function "%>%""
I already check every post on this forum and tried to apply it but no one works.
Please advise any solution.
Please note: I have only 2 cores on my PC.
My code is as follows:
library(dplyr) # For basic data manipulation
library(ncdf4) # For creating NetCDF files
library(tidync) # For easily dealing with NetCDF data
library(ggplot2) # For visualising data
library(doParallel) # For parallel processing
MHW_res_grid <- readRDS("C:/Users/SUDHANSHU KUMAR/Desktop/MTech Project/R/MHW_result.Rds")
# Function for creating arrays from data.frames
df_acast <- function(df, lon_lat){
# Force grid
res <- df %>%
right_join(lon_lat, by = c("lon", "lat")) %>%
arrange(lon, lat)
# Convert date values to integers if they are present
if(lubridate::is.Date(res[1,4])) res[,4] <- as.integer(res[,4])
# Create array
res_array <- base::array(res[,4], dim = c(length(unique(lon_lat$lon)), length(unique(lon_lat$lat))))
dimnames(res_array) <- list(lon = unique(lon_lat$lon),
lat = unique(lon_lat$lat))
return(res_array)
}
# Wrapper function for last step before data are entered into NetCDF files
df_proc <- function(df, col_choice){
# Determine the correct array dimensions
lon_step <- mean(diff(sort(unique(df$lon))))
lat_step <- mean(diff(sort(unique(df$lat))))
lon <- seq(min(df$lon), max(df$lon), by = lon_step)
lat <- seq(min(df$lat), max(df$lat), by = lat_step)
# Create full lon/lat grid
lon_lat <- expand.grid(lon = lon, lat = lat) %>%
data.frame()
# Acast only the desired column
dfa <- plyr::daply(df[c("lon", "lat", "event_no", col_choice)],
c("event_no"), df_acast, .parallel = T, lon_lat = lon_lat)
return(dfa)
}
# We must now run this function on each column of data we want to add to the NetCDF file
doParallel::registerDoParallel(cores = 2)
prep_dur <- df_proc(MHW_res_grid, "duration")
prep_max_int <- df_proc(MHW_res_grid, "intensity_max")
prep_cum_int <- df_proc(MHW_res_grid, "intensity_cumulative")
prep_peak <- df_proc(MHW_res_grid, "date_peak")
I am in a group project for a class and one of the people in my group ran the normalization, as well as creating the test/train sets so that we all have the same sets to work from (we're all utilizing different algorithms). I am assigned with running the KNN algorithm.
We had multiple columns with NA's so those columns were omitted (<-NULL). When attempting to run the KNN I keep getting the error of
Error in knn(train = trainsetne, test = testsetne, cl = ne_train_target, :
no missing values are allowed
I ran which(is.na(dataset$col)) and found:
which(is.na(testsetne$median_days_on_market))
# [1] 8038 8097 8098 8100 8293 8304
When I look through the dataset those cells do not have missing data.
I am wondering if I may get some help with how to either find and fix the "No missing values" or to find a work around (if any).
I am sorry if I am missing something simple. Any help is appreciated.
I have listed the code that we have below:
ne$pending_ratio_yy <- ne$total_listing_count_yy <- ne$average_listing_price_yy <- ne$median_square_feet_yy <- ne$median_listing_price_per_square_feet_yy <- ne$pending_listing_count_yy <- ne$price_reduced_count_yy <- ne$median_days_on_market_yy <- ne$new_listing_count_yy <- ne$price_increased_count_yy <- ne$active_listing_count_yy <- ne$median_listing_price_yy <- ne$flag <- NULL
ne$pending_ratio_mm <- ne$total_listing_count_mm <- ne$average_listing_price_mm <- ne$median_square_feet_mm <- ne$median_listing_price_per_square_feet_mm <- ne$pending_listing_count_mm <- ne$price_reduced_count_mm <- ne$price_increased_count_mm <- ne$new_listing_count_mm <- ne$median_days_on_market_mm <- ne$active_listing_count_mm <- ne$median_listing_price_mm <- NULL
ne$factor_month_date <- as.factor(ne$month_date_yyyymm)
ne$factor_median_days_on_market <- as.factor(ne$median_days_on_market)
train20ne= sample(1:20893, 4179)
trainsetne=ne[train20ne,1:10]
testsetne=ne[-train20ne,1:10]
#This is where I start to come in
ne_train_target <- ne[train20ne, 3]
ne_test_target <- ne[-train20ne, 3]
predict_1 <- knn(train = trainsetne, test = testsetne, cl=ne_train_target, k=145)
# Error in knn(train = trainsetne, test = testsetne, cl = ne_train_target, :
# no missing values are allowed
Could anyone help me with some little problem?
When I plot the frontier I get the following message: "Error in colnames<-(tmp, value = c("targetRisk", "targetReturn")) :
attempt to set 'colnames' on an object with less than two dimensions"(see below for detail). How could I solve this. Thanks a lot.
Portfolio construction & Optimisation
Assets: LUTAX, PFODX,BRGAX,GFAFX,NMSAX,EGINX,IPOYX,SCWFX,FGLDX,PAGEX
Getting monthly returns of the assets
library(quantmod)
library(tseries)
library(timeSeries)
LUTAX <- monthlyReturn((getSymbols("LUTAX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(LUTAX) <- c("LUTAX")
PFODX <- monthlyReturn((getSymbols("PFODX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(PFODX) <- c("PFODX")
BRGAX <- monthlyReturn((getSymbols("BRGAX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(BRGAX) <- c("BRGAX")
GFAFX <- monthlyReturn((getSymbols("GFAFX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(GFAFX) <- c("GFAFX")
NMSAX <- monthlyReturn((getSymbols("NMSAX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(NMSAX) <- c("NMSAX")
EGINX <- monthlyReturn((getSymbols("EGINX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(EGINX) <- c("EGINX")
IPOYX <- monthlyReturn((getSymbols("IPOYX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(IPOYX) <- c("IPOYX")
SCWFX <- monthlyReturn((getSymbols("SCWFX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(SCWFX) <- c("SCWFX")
FGLDX <- monthlyReturn((getSymbols("FGLDX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(FGLDX) <- c("FGLDX")
PAGEX <- monthlyReturn((getSymbols("PAGEX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(PAGEX) <- c("PAGEX")
Merging returns of the assets (excluding NA's)
portfolio_returns <- merge(LUTAX, PFODX,BRGAX,GFAFX,NMSAX,EGINX,IPOYX,SCWFX,FGLDX,PAGEX,all=F)
data <- as.timeSeries(portfolio_returns)
Optimisation portfolio
library(fPortfolio)
spec <- portfolioSpec()
setNFrontierPoints <- 25
setSolver(spec) <- "solveRquadprog"
constraints <- c("minW[1:1]=0.12","maxW[1:1]=0.18","minW[2:2]=0.12","maxW[2:2]=0.18",
"minW[3:3]=0.10","maxW[3:3]=0.15","minW[4:4]=0.08","maxW[4:4]=0.12",
"minW[5:5]=0.08","maxW[5:5]=0.12","minW[6:6]=0.05","maxW[6:6]=0.10",
"minW[7:7]=0.05","maxW[7:7]=0.10","minW[8:8]=0.08","maxW[8:8]=0.12",
"minW[9:9]=0.05","maxW[9:9]=0.10","minW[10:10]=0.08","maxW[10:10]=0.12",
"minsumW[c(1:1,2:2)]=0.27","maxsumW[c(1:1,2:2)]=0.33",
"minsumW[c(3:3,4:4,6:6,10:10)]=0.37","maxsumW[c(3:3,4:4,6:6,10:10)]=0.43",
"minsumW[c(5:5,7:7,8:8,9:9)]=0.27","maxsumW[c(5:5,7:7,8:8,9:9)]=0.33",
"maxsumW[c(1:1,2:2,3:3,4:4,5:5,6:6,7:7,8:8,9:9,10:10)]=1")
portfolioConstraints(data,spec,constraints)
frontier<- portfolioFrontier(data,spec,constraints)
print(frontier)
tailoredFrontierPlot(frontier)
After running the last command above I get the following message: "Error in colnames<-(tmp, value = c("targetRisk", "targetReturn")) :
attempt to set 'colnames' on an object with less than two dimensions"
I recently posted a similar inquiry in the shiny google group, but did not find a solution. We are developing a Shiny app and as the subject indicates we get an "error: subscript out of bounds" message upon running the app. However, when we isolate the offending code and run it on its own in RStudio, there is no error.
This makes me wonder if there is a bug in Shiny itself, or if we are missing something.
Please see the instructions below along with a small example that produces the error. We are using Shiny version 0.8.0 and RStudio 0.98.501.
Thanks for your help!
To run the app, place ui.R and server.R (see below) in a folder and run
library(shiny)
runApp("<folder path>")
It should produce a user interface with a button on the left, but on the right you will see "error: subscript out of bounds".
However, if just run the following three lines of code (approximately lines 57-59 in server.R)
show=data.frame(ps=c(4,-1,0,1),ns=c(0,1,0,0),ts=c(45842,15653,28535,21656))
best.fit1=regsubsets(ts~ps+ns,data=show,nvmax=1)
pred1=predict.regsubsets(best.fit1,show,id=1) # line that offends Shiny
in RStudio (need to include the function "predict.regsubsets" - given at the beginning of server.R), then there are no errors.
#####################
## server.R
#####################
library(rms)
library(leaps)
library(shiny)
library(datasets)
library(stringr)
library(ttutils)
library(plyr)
library(utils)
library(ggplot2)
# object is a regsubsets object
# newdata is of the form of a row or collection of rows in the dataset
# id specifies the number of terms in the model, since regsubsets objects
# includes models of size 1 up to a specified number
predict.regsubsets=function(object,newdata,id,...){
form=as.formula(object$call[[2]])
mat=model.matrix(form,newdata)
mat.dims=dim(mat)
coefi=coef(object,id=id)
xvars=names(coefi)
# because mat only has those categorical variable categories associated with newdata,
# it is possible that xvars (whose variables are defined by the "best" model of size i)
# has a category that is not in mat
diffs=setdiff(xvars,colnames(mat))
ndiffs=length(diffs)
if(ndiffs>0){
# add columns of 0's for each variable in xvars that is not in mat
mat=cbind(mat,matrix(0,mat.dims[1],ndiffs))
# for the last "ndiffs" columns, make appropriate names
colnames(mat)[(mat.dims[2]+1):(mat.dims[2]+ndiffs)]=diffs
mat[,xvars]%*%coefi
}
else{
mat[,xvars]%*%coefi
}
}
# Define server logic required to summarize and view the selected dataset
shinyServer(function(input, output) {
mainTable1 <- reactive({
})
output$table21 <- renderTable({
mainTable1()
})
formulamodel1 <- reactive({
#ticketsale<-dataset1Input()
show=data.frame(ps=c(4,-1,0,1),ns=c(0,1,0,0),ts=c(45842,15653,28535,21656))
best.fit1=regsubsets(ts~ps+ns,data=show,nvmax=1)
pred1=predict.regsubsets(best.fit1,show,id=1)
})
output$model1fit <- renderPrint({
formulamodel1()
})
})
######################
## end server.R
######################
######################
## ui.R
######################
library(rms)
library(leaps)
library(shiny)
library(datasets)
library(stringr)
library(ttutils)
library(plyr)
library(utils)
library(ggplot2)
shinyUI(pageWithSidebar(
headerPanel("Forecasting ticket sales for xxx"),
sidebarPanel(
p(strong("Model Fitting")),
selectInput("order1", "Sort results by:",c("a","b","c")),
submitButton("Run Model")
),
mainPanel(
h3(strong("Model fit without using ticket sales") ),
tableOutput("table21"),
verbatimTextOutput(outputId = "model1fit")
)
))
These three lines only seem to work when executed in the global environment. If you take that snippet and run it inside of a local({...}) block you'll see the same error.
The error is coming from the first line of predict.regsubsets, where you look at object$call[[2]]. It's object$call that is very different depending on whether it's executed in the global environment or not; it's created in leaps:::regsubsets.formula by calling sys.call(sys.parent()). Perhaps this needs to be sys.call(sys.parent(0)) (just a guess)?
Thanks to John Harrison for this answer. He attempted to reply via the shiny Google group but the system deleted his answers, as well as my attempt later to post his solution. Here it is.
John Harrison says:
The issue is with the regsubsets function:
> test_env <- new.env(parent = globalenv())
> with(test_env, {show=data.frame(ps=c(4,-1,0,1),ns=c(0,1,0,0),ts=c(45842,15653,28535,21656))
+ best.fit1=regsubsets(ts~ps+ns,data=show,nvmax=1)
+ #pred1=predict.regsubsets(best.fit1,show,id=1)
+ #pred1
+ best.fit1})
Subset selection object
Call: eval(expr, envir, enclos)
2 Variables (and intercept)
Forced in Forced out
ps FALSE FALSE
ns FALSE FALSE
1 subsets of each size up to 1
Selection Algorithm: exhaustive
You can see it gets it Call: output relative to the environment its in:
> getAnywhere(regsubsets.formula)
A single object matching ‘regsubsets.formula’ was found
It was found in the following places
registered S3 method for regsubsets from namespace leaps
namespace:leaps
with value
function (x, data, weights = NULL, nbest = 1, nvmax = 8, force.in = NULL,
force.out = NULL, intercept = TRUE, method = c("exhaustive",
"backward", "forward", "seqrep"), really.big = FALSE,
...)
{
formula <- x
rm(x)
mm <- match.call()
mm$formula <- formula
mm$x <- NULL
mm$nbest <- mm$nvmax <- mm$force.in <- mm$force.out <- NULL
mm$intercept <- mm$method <- mm$really.big <- NULL
mm[[1]] <- as.name("model.frame")
mm <- eval(mm, sys.frame(sys.parent()))
x <- model.matrix(terms(formula, data = data), mm)[, -1]
y <- model.extract(mm, "response")
wt <- model.extract(mm, "weights")
if (is.null(wt))
wt <- rep(1, length(y))
else wt <- weights
a <- leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
force.in = force.in, force.out = force.out, intercept = intercept)
rval <- switch(1 + pmatch(method[1], c("exhaustive", "backward",
"forward", "seqrep"), nomatch = 0), stop(paste("Ambiguous or unrecognised method name :",
method)), leaps.exhaustive(a, really.big), leaps.backward(a),
leaps.forward(a), leaps.seqrep(a))
rval$call <- sys.call(sys.parent())
rval
}
<environment: namespace:leaps>
rval$call <- sys.call(sys.parent())
is the offending line of code
I replied:
I'm in a bit over my head in terms of these R functions, environments, etc. I roughly followed your explanation above but I don't understand it enough to have any real sort of idea of what to do to fix it (or whether it is even fixable). Could you easily point me in the right direction?
John replied:
You could define your own regsubsets function:
myregsubsets <- function (x, data, weights = NULL, nbest = 1, nvmax = 8, force.in = NULL,
force.out = NULL, intercept = TRUE, method = c("exhaustive",
"backward", "forward", "seqrep"), really.big = FALSE,
...){
formula <- x
rm(x)
mm <- match.call()
mm$formula <- formula
mm$x <- NULL
mm$nbest <- mm$nvmax <- mm$force.in <- mm$force.out <- NULL
mm$intercept <- mm$method <- mm$really.big <- NULL
mm[[1]] <- as.name("model.frame")
mm <- eval(mm, sys.frame(sys.parent()))
x <- model.matrix(terms(formula, data = data), mm)[, -1]
y <- model.extract(mm, "response")
wt <- model.extract(mm, "weights")
if (is.null(wt))
wt <- rep(1, length(y))
else wt <- weights
a <- leaps:::leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
force.in = force.in, force.out = force.out, intercept = intercept)
rval <- switch(1 + pmatch(method[1], c("exhaustive", "backward",
"forward", "seqrep"), nomatch = 0), stop(paste("Ambiguous or unrecognised method name :",
method)), leaps:::leaps.exhaustive(a, really.big), leaps:::leaps.backward(a),
leaps:::leaps.forward(a), leaps:::leaps.seqrep(a))
rval$call <- sys.call(sys.parent())
rval$x <- formula
rval
}
predict.regsubsets=function(object,newdata,id,...){
form=as.formula(object$x)
mat=model.matrix(form,newdata)
mat.dims=dim(mat)
coefi=coef(object,id=id)
xvars=names(coefi)
# because mat only has those categorical variable categories associated with newdata,
# it is possible that xvars (whose variables are defined by the "best" model of size i)
# has a category that is not in mat
diffs=setdiff(xvars,colnames(mat))
ndiffs=length(diffs)
if(ndiffs>0){
# add columns of 0's for each variable in xvars that is not in mat
mat=cbind(mat,matrix(0,mat.dims[1],ndiffs))
# for the last "ndiffs" columns, make appropriate names
colnames(mat)[(mat.dims[2]+1):(mat.dims[2]+ndiffs)]=diffs
mat[,xvars]%*%coefi
}
else{
mat[,xvars]%*%coefi
}
}
Later, John added:
The regsubsets function assumed the user was calling it in a certain fashion. The myregsubsets is a replacement for regsubsets.formula. In your predict.regsubsets you access the formula using as.formula(object$call[[2]]). When nested in environments this doesnt give you what is expected. The myregsubsets replacement gets the formula using rval$x <- formula. The changed predict.regsubsets then uses form=as.formula(object$x) rather then as.formula(object$call[[2]]).