Related
I have the following data table that I want to use to predict DE prices based on the other variables in the data table with the GLM (= Generalized Linear Model).
set.seed(123)
dt.data <- data.table(date = seq(as.Date('2019-01-01'), by = '1 day', length.out = 731),
'DE' = rnorm(731, 30, 1), 'windDE' = rnorm(731, 10, 1),
'consumptionDE' = rnorm(731, 50, 1), 'nuclearDE' = rnorm(731, 8, 1),
'solarDE' = rnorm(731, 1, 1), check.names = FALSE)
dt.forecastData <- dt.data
dt.forecastData <- na.omit(dt.forecastData)
fromTestDate <- "2019-12-31"
fromDateTest <- base::toString(fromTestDate)
## Create train and test date-vectors depending on fromDateTest: ##
v.train <- which(dt.forecastData$date <= fromDateTest)
v.test <- which(dt.forecastData$date == as.Date(fromDateTest)+1)
## Create data tables for train and test data with specific date range (fromTestDate): ##
dt.train <- dt.forecastData[v.train]
v.trainDate <- dt.train$date
dt.test <- dt.forecastData[v.test]
v.testDate <- dt.test$date
## Delete column "date" of train and test data for model fitting: ##
dt.train <- dt.train[, c("date") := NULL]
dt.test <- dt.test[, c("date") := NULL]
## MODEL FITTING: ##
## Generalized Linear Model: ##
xgbModel <- stats::glm(DE ~ .-1, data = dt.train,
family = quasi(link = "identity", variance = "constant"))
## Train and Test Data PREDICTION with xgbModel: ##
dt.train$prediction <- stats::predict.glm(xgbModel, dt.train)
dt.test$prediction <- stats::predict.glm(xgbModel, dt.test)
## Add date columns to dt.train and dt.test: ##
dt.train <- data.table(date = v.trainDate, dt.train)
dt.test <- data.table(date = v.testDate, dt.test)
Here in this code I train the model with the data from 2019-01-01 to 2019-12-31 and test it with the day-ahead forecast from 2020-01-01.
Now I want to create a for-loop so that I run my model 365 in total, as follows:
Run 1:
a) use 01-01-2019 to 31-12-2019 to train my model
b) predict for 01-01-2020 (test data)
c) use the actual data point for 01-01-2020 to evaluate the prediction
Run 2:
a) use 01-01-2019 to 01-01-2020 to train my model
b) predict for 02-01-2020
c) use the actual data point for 02-01-2020 to evaluate the prediction
etc.
In the end, I want to plot e.g. the cumulate sum of the individual prediction performances Or the histogram of the individual prediction performances and some summary statistics (mean, median, sd, etc.)
Unfortunately, I don't know how to start with the loop and where I can save my predictions of each run?
I hope someone can help me with this!
Basically, you have to construct a vector that contains the end dates for each run. Then, you can pick one of the end dates in each iteration of the loop, run the model and predict one day ahead. Using your code, this may look something like this:
set.seed(123)
dt.data <- data.table(date = seq(as.Date('2019-01-01'), by = '1 day', length.out = 731),
'DE' = rnorm(731, 30, 1), 'windDE' = rnorm(731, 10, 1),
'consumptionDE' = rnorm(731, 50, 1), 'nuclearDE' = rnorm(731, 8, 1),
'solarDE' = rnorm(731, 1, 1), check.names = FALSE)
dt.forecastData <- dt.data
dt.forecastData <- na.omit(dt.forecastData)
Here, I construct a vector holding all days between Dec 31 2019 and Jan 15 2020, adapt as needed:
# vector of all end dates
eval.dates <- seq.Date(from = as.Date("2019-12-31"),
to = as.Date("2020-01-15"),
by = 1)
Here, I create a storage file for the one-day ahead predictions
# storage file for all predictions
test.predictions <- numeric(length = length(eval.dates))
Now, run the loop using your code and pick one of the end dates in each iteration:
for(ii in 1:length(eval.dates)){ # loop start
fromTestDate <- eval.dates[ii] # get end date for iteration
fromDateTest <- base::toString(fromTestDate)
## Create train and test date-vectors depending on fromDateTest: ##
v.train <- which(dt.forecastData$date <= fromDateTest)
v.test <- which(dt.forecastData$date == as.Date(fromDateTest)+1)
## Create data tables for train and test data with specific date range (fromTestDate): ##
dt.train <- dt.forecastData[v.train]
v.trainDate <- dt.train$date
dt.test <- dt.forecastData[v.test]
v.testDate <- dt.test$date
## Delete column "date" of train and test data for model fitting: ##
dt.train <- dt.train[, c("date") := NULL]
dt.test <- dt.test[, c("date") := NULL]
## MODEL FITTING: ##
## Generalized Linear Model: ##
xgbModel <- stats::glm(DE ~ .-1, data = dt.train,
family = quasi(link = "identity", variance = "constant"))
## Train and Test Data PREDICTION with xgbModel: ##
test.predictions[ii] <- stats::predict.glm(xgbModel, dt.test)
# verbose
print(ii)
} # loop end
As you can see, this is a bit of a shortened version of your code and I omitted the predictions for the training set for brevity. They can easily be added along the lines of the code you have above.
You did not specify which measures you want to use to evaluate your out-of-sample predictions. The object test.predictions holds all your one-step-ahead predictions and you can use this to compute RMSEs, LPS or whatever quantification of predictive power that you'd like to use.
Im trying to run the dccfit function for some stock returns in order to model the covariance matrix.
In the model documentation its recommended to use a an xts object.
I found an example of how to run the function with a data frame under the bellow link and the example works.
But when I try to run it with my own data set I do not get it to work.
http://www.unstarched.net/2013/01/03/the-garch-dcc-model-and-2-stage-dccmvt-estimation/
https://faculty.washington.edu/ezivot/econ589/DCCgarchPowerpoint.pdf
I have tried to change the dimnames and the rownames without result
library(zoo)
library(rugarch)
library(xts)
library(rmgarch)
getSymbols(Symbols = c("^GSPC", "SPN"),
env = parent.frame(),
reload.Symbols = FALSE,
verbose = FALSE,
warnings = TRUE,
src = "yahoo",
symbol.lookup = TRUE,
auto.assign = getOption('getSymbols.auto.assign',TRUE),
from = "1927-01-01",
to = "2018-04-28")
GSPC_dataframe <- data.frame(date=index(GSPC), coredata(GSPC))
## SPECIFYING DCC GARCH
# univariate normal GARCH(1,1) for each series
garch11.spec = ugarchspec(mean.model = list(armaOrder = c(0,0)),variance.model = list(garchOrder = c(1,1),model = "sGARCH"),distribution.model = "norm")
# dcc specification - GARCH(1,1) for conditional correlations
dcc.garch11.spec = dccspec(uspec = multispec( replicate(2, garch11.spec) ),dccOrder = c(1,1),distribution = "mvnorm")
dcc.garch11.spec
## ESTIMATE DCC GARCH data needs to be a dataframe
# I have tried to see if idderent options work
dcc.fit = dccfit(dcc.garch11.spec, data = GSPC$LogReturn)
dcc.fit = dccfit(dcc.garch11.spec, data = GSPC_dataframe$LogReturn)
dcc.fit = dccfit(dcc.garch11.spec, data = GSPC)
dcc.fit = dccfit(dcc.garch11.spec, data = GSPC_dataframe)
I have tried to run the function with both an its object and a data frame and I'm getting the same error:
"Error in dimnames(x) <- dn : length of 'dimnames' [2] not equal to array extent"
Grateful for any help or direction!
I'm not sure what a 'dccfit function' is, and it seems like your method is unnecessarily complex. Anyway, if you want a basic correlation matrix and covariance matrix, the script below will do it for you.
library(quantmod)
library(tidyverse)
library(lubridate)
symbols <- c("AAPL", "MSFT", "GOOG")
getSymbols(symbols)
stocks <- data.frame(as.xts(merge(AAPL, MSFT, GOOG)))
stocks$date <- row.names(stocks)
row.names(stocks) <- NULL
# head(stocks)
jan31 <- ymd("2018-01-31")
days_to_keep <- jan31 %m+% months(0:11)
# days_to_keep
res <- stocks %>%
select(ends_with("Close"), date) %>%
mutate(date = ymd(date)) %>%
filter(date %in% days_to_keep)
# correlation matrix:
res %>% select(-date) %>% cor()
# AAPL.Close MSFT.Close GOOG.Close
# AAPL.Close 1.0000000 0.9198357 0.9313001
# MSFT.Close 0.9198357 1.0000000 0.9103830
# GOOG.Close 0.9313001 0.9103830 1.0000000
# Or more visually...
res %>% select(-date) %>% pairs()
# covariance matrix:
res %>% select(-date) %>% cov()
Instead of this line:
GSPC_dataframe <- data.frame(date=index(GSPC), coredata(GSPC))
You should extract the closing prices from both datasets and merge them into one xts time series:
data <- cbind(Cl(GSPC), Cl(SPN))
Than you can run the dccfit like this:
dcc.fit = dccfit(dcc.garch11.spec, data)
A look at your data with head(GSPC) or head(SPN) will give you some idea why this was not working.
I am currently using genomic expression levels, age, and smoking intensity levels to predict the number of days Lung Cancer Patients have to live. I have a small amount of data; 173 patients and 20,438 variables, including gene expression levels (which make up for 20,436). I have split up my data into test and training, utilizing an 80:20 ratio. There are no missing values in the data.
I am using knn() to train the model. Here is what the code looks like:
prediction <- knn(train = trainData, test = testData, cl = trainAnswers, k=1)
Nothing seems out of the ordinary until you notice that k=1. "Why is k=1?" you may ask. The reason k=1 is because when k=1, the model is the most accurate. This makes no sense to me. There are quite a few concerns:
I am using knn() to predict a continuous variable. I should be using something along the lines of, cox maybe.
The model is waaaaaaay too accurate. Here are a few examples of the test answer and the model's predictions. For the first patient, the number of days to death is 274. The model predicts 268. For the second patient, test: 1147, prediction: 1135. 3rd, test: 354, prediction: 370. 4th, test: 995, prediction 995. How is this possible? Out of the entire test data, the model was only off by and average of 9.0625 days! The median difference was 7 days, and the mode was 6 days. Here is a graph of the results:
Bar Graph.
So I guess my main question is what does knn() do, what does k represent, and how is the model so accurate when k=1? Here is my entire code (I am unable to attach the actual data):
# install.packages(c('caret', 'skimr', 'RANN', 'randomForest', 'fastAdaboost', 'gbm', 'xgboost', 'caretEnsemble', 'C50', 'earth'))
library(caret)
# Gather the data and store it in variables
LUAD <- read.csv('/Users/username/Documents/ClinicalData.csv')
geneData <- read.csv('/Users/username/Documents/GenomicExpressionLevelData.csv')
geneData <- data.frame(geneData)
row.names(geneData) = geneData$X
geneData <- geneData[2:514]
colNamesGeneData <- gsub(".","-",colnames(geneData),fixed = TRUE)
colnames(geneData) = colNamesGeneData
# Organize the data
# Important columns are 148 (smoking), 123 (OS Month, basically how many days old), and the gene data. And column 2 (barcode).
LUAD = data.frame(LUAD$patient, LUAD$TOBACCO_SMOKING_HISTORY_INDICATOR, LUAD$OS_MONTHS, LUAD$days_to_death)[complete.cases(data.frame(LUAD$patient, LUAD$TOBACCO_SMOKING_HISTORY_INDICATOR, LUAD$OS_MONTHS, LUAD$days_to_death)), ]
rownames(LUAD)=LUAD$LUAD.patient
LUAD <- LUAD[2:4]
# intersect(rownames(LUAD),colnames(geneData))
# ind=which(colnames(geneData)=="TCGA-778-7167-01A-11R-2066-07")
gene_expression=geneData[, rownames(LUAD)]
# Merge the two datasets to use the geneomic expression levels in your model
LUAD <- data.frame(LUAD,t(gene_expression))
LUAD.days_to_death <- LUAD[,3]
LUAD <- LUAD[,c(1:2,4:20438)]
LUAD <- data.frame(LUAD.days_to_death,LUAD)
set.seed(401)
# Number of Rows in the training data (createDataPartition(dataSet, percentForTraining, boolReturnAsList))
trainRowNum <- createDataPartition(LUAD$LUAD.days_to_death, p=0.8, list=FALSE)
# Training/Test Dataset
trainData <- LUAD[trainRowNum, ]
testData <- LUAD[-trainRowNum, ]
x = trainData[, c(2:20438)]
y = trainData$LUAD.days_to_death
v = testData[, c(2:20438)]
w = testData$LUAD.days_to_death
# Imputing missing values into the data
preProcess_missingdata_model <- preProcess(trainData, method='knnImpute')
library(RANN)
if (anyNA(trainData)) {
trainData <- predict(preProcess_missingdata_model, newdata = trainData)
}
anyNA(trainData)
# Normalizing the data
preProcess_range_model <- preProcess(trainData, method='range')
trainData <- predict(preProcess_range_model, newdata = trainData)
trainData$LUAD.days_to_death <- y
apply(trainData[,1:20438], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))})
preProcess_range_model_Test <- preProcess(testData, method='range')
testData <- predict(preProcess_range_model_Test, newdata = testData)
testData$LUAD.days_to_death <- w
apply(testData[,1:20438], 2, FUN=function(v){c('min'=min(v), 'max'=max(v))})
# To uncomment, select the text and press 'command' + 'shift' + 'c'
# set.seed(401)
# options(warn=-1)
# subsets <- c(1:10)
# ctrl <- rfeControl(functions = rfFuncs,
# method = "repeatedcv",
# repeats = 5,
# verbose = TRUE)
# lmProfile <- rfe(x=trainData[1:20437], y=trainAnswers,
# sizes = subsets,
# rfeControl = ctrl)
# lmProfile
trainAnswers <- trainData[,1]
testAnswers <- testData[,1]
library(class)
prediction <- knn(train = trainData, test = testData, cl = trainAnswers, k=1)
#install.packages("plotly")
library(plotly)
Test_Question_Number <- c(1:32)
prediction2 <- data.frame(prediction[1:32])
prediction2 <- as.numeric(as.vector(prediction2[c(1:32),]))
data <- data.frame(Test_Question_Number, prediction2, testAnswers)
names(data) <- c("Test Question Number","Prediction","Answer")
p <- plot_ly(data, x = ~Test_Question_Number, y = ~prediction2, type = 'bar', name = 'Prediction') %>%
add_trace(y = ~testAnswers, name = 'Answer') %>%
layout(yaxis = list(title = 'Days to Death'), barmode = 'group')
p
merge <- data.frame(prediction2,testAnswers)
difference <- abs((merge[,1])-(merge[,2]))
difference <- sort(difference)
meanDifference <- mean(difference)
medianDifference <- median(difference)
modeDifference <- names(table(difference))[table(difference)==max(table(difference))]
cat("Mean difference:", meanDifference, "\n")
cat("Median difference:", medianDifference, "\n")
cat("Mode difference:", modeDifference,"\n")
Lastly, for clarification purposes, ClinicalData.csv is the age, days to death, and smoking intensity data. The other .csv is the genomic expression data. The data above line 29 doesn't really matter, so you can just skip to the part of the code where it says "set.seed(401)".
Edit: Some samples of the data:
days_to_death OS_MONTHS
121 3.98
NACC1 2001.5708 2363.8063 1419.879
NACC2 58.2948 61.8157 43.4386
NADK 706.868 1053.4424 732.1562
NADSYN1 1628.7634 912.1034 638.6471
NAE1 832.8825 793.3014 689.7123
NAF1 140.3264 165.4858 186.355
NAGA 1523.3441 1524.4619 1858.9074
NAGK 983.6809 899.869 1168.2003
NAGLU 621.3457 510.9453 1172.511
NAGPA 346.9762 257.5654 275.5533
NAGS 460.7732 107.2116 321.9763
NAIF1 217.1219 202.5108 132.3054
NAIP 101.2305 87.8942 77.261
NALCN 13.9628 36.7031 48.0809
NAMPT 3245.6584 1257.8849 5465.6387
Because K = 1 is the most complex knn model. It has the most flexible decision boundary. It creates an overfit. It will perform well within the training data by poorly on a holdout set (but not always).
Hi i am new in the field of time series.
I want to make predictions for a given time series
I use the code below :
library(forecast)
library(TSPred)
dataSet <- 'data'
dataSetPath <- paste0("data/", dataSet, '.csv')
# load data
recDF <- read.csv(dataSetPath, skip=0)
rt = ts(recDF["s2"])
if(dataSet=="data"){
nTrain <- 3000
nSkip <- nTrain
nData <- length(rt)
testLength <- nData - nSkip
# testLength
arima_output90 = vector(mode="numeric", length=testLength)
real = vector(mode="numeric", length=testLength)
pred2 <- arimapred(rt[seq(1, nTrain)], n.ahead=testLength)
forecast::auto.arima(rt[seq(1, nTrain)])
# Brute force ARIMA - recompute model every step
# while making predictions for the next N hours.
for(i in nSkip+1:testLength)
{
# Compute ARIMA on the full dataset up to this point
trainSet = window(rt, start=i-nTrain, end=i)
fit_arima <- forecast::auto.arima(trainSet)
# fcast_arima <- predict(fit_arima, n.ahead = 5, se.fit = TRUE)
# mean <- fcast_arima$pred
# std <- fcast_arima$se
fcast_arima <- forecast(fit_arima, h=50)
pred <- fcast_arima$mean
arima_output50[i] = pred[50]
real[i] = rt[i]
cat("step: ",i ,"true : ", rt[i], " prediction: ", pred[50], '\n')
}
I want to plot in a graph predicted and true values, in same graph to have a visualization of true values and predicted values for same time step.
How can this done?
In the model above in timestep t,does prediction pred[50] refer to value rt[i+50] (i want 50 time steps ahead prediction) , or refer to
rt[i](estimated from model brute force trained, from previous values)?
Where i is current timestep as in code, and rt is the real value for timestep i.
You can use:
ts.plot(fit_arima$x, fit_arima$fitted, pred, type='o', col=c('blue', 'green', 'red'), lty=c(1,2,3))
legend('topleft', c('train', 'fitted', 'forecast'), col=c('blue', 'green', 'red'), lty=c(1,2,3))
ts.plot automatically fetches the timestamps from the time series and plots them on the x-axis. For the standard AirPassengers data you will get the following output.
I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way in relationship to the target variable. There are two things that I know of:
the continuous variables are binned such that its IV (information value) is maximized
maximize the chi-square in the two way contingency table -- the target has two values 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (woe) with the package woebinning which optimizes the number of bins for the IV
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoint, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns
Variable_name.binned which is the labels
Variable_name.woe.binned which is the replaced values that you can then parse into your regression instead of Variable_name
For the second point, on chi2, the package discretization seems to handle it but I haven't tested it.
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code. You do need to penalize the inferential statistics because this results in an implicit hiding of the degrees of freedom expended in the process of moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV=1. Something like this untested code:
cont.var.vec <- # names of all your continuous variables
breaks <- function(var,n) quantiles( dfrm[[var]],
probs=seq(0,1,length.out=n),
na.rm=TRUE)
lapply(dfrm[ dfrm$IV == 1 , cont.var.vec] , breaks, n=5)
s
etwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for(i in 1:ncol(mydata)){
if(is.numeric(mydata[,i])){
val.quant <- unname(quantile(mydata[,i],probs = 0.75))
mydata[,i] = sapply(mydata[,i],function(x){if(x > (1.5*val.quant+1)){1.5*val.quant+1}else{x}})
}
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for(i in 1:vartoremove){
rf <- fit(Y~.,data=mydata2,model = "randomForest", mtry = 10 ,ntree = 100)
varImportance <- Importance(rf,mydata2,method="sensg")
Z <- order(varImportance$imp,decreasing = FALSE)
IND <- Z[2]
var_to_remove <- names(mydata2[IND])
mydata2[IND] = NULL
print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(train$target~.,data=train,family = binomial)
summary(rf)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf,train))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("riv","tomasgreif")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
smpl = folds[[i]];
g_train = mydata2[-smpl,!names(mydata2) %in% c("Y")];
g_test = mydata2[smpl,!names(mydata2) %in% c("Y")];
cost_train = mydata2[-smpl,"Y"];
cost_test = mydata2[smpl,"Y"];
rf <- randomForest(g_train,cost_train)
logit.data <- cbind(cost_train,g_train)
logit.fit <- glm(cost_train~.,data=logit.data,family = binomial)
prediction <- data.f
rame(actual = test$Y, predicted = predict(rf,test))