How to manually build predictions from xgboost model - r

I am trying to figure out how to generate probabilities from the xgboost model tree so they match what I would get from the predict function.
First I build the model
library(xgboost)
#install.packages("ModelMetrics")
library(ModelMetrics)
set.seed(100)
# - Extreme gbm
y = as.integer(testDF$y)
x = testDF[,-which(names(testDF) %in% c('y'))]
var.names <- names(x)
x = as.matrix(x)
x = matrix(as.numeric(x),nrow(x),ncol(x))
nround = 10
XX <- xgboost(param=param, data = x, label = y, nrounds=nround, missing = NA)
Then I wrote some code to build all of the rules that would result in a particular leaf
baseTree <- xgb.model.dt.tree(model = XX)
Leafs <- filter(baseTree, Feature == 'Leaf')
Branches <- filter(baseTree, Feature != 'Leaf')
Branches$Feature = var.names[as.numeric(Branches$Feature) + 1]
FullRules = rep(NA, nrow(Leafs))
AllRules <- foreach(i = 1:nrow(Leafs), .combine = 'rbind') %do% {
theLeaf = Leafs[i,]
theNode = theLeaf$Node
theID = theLeaf$ID
count = 1
RuleText = ''
while(theNode != 0){
FF <- filter(Branches, Yes == theID | No == theID | Missing == theID)
isYes = FF$Yes == theID
isNo = FF$No == theID
isMissing = FF$Missing == theID
FullRules[i] = ifelse(isYes & isMissing
, paste0("(", FF$Feature, " < ", FF$Split, " | is.na(", FF$Feature, "))")
, NA)
FullRules[i] = ifelse(isNo & isMissing
, paste0("(", FF$Feature, " >= ", FF$Split, " | is.na(", FF$Feature, "))")
, FullRules[i])
FullRules[i] = ifelse(isYes & !isMissing
, paste0(FF$Feature, " < ", FF$Split)
, FullRules[i])
FullRules[i] = ifelse(isNo & !isMissing
, paste0(FF$Feature, " >= ", FF$Split)
, FullRules[i])
FullRules[i] = ifelse(isMissing & !isYes & !isNo
, paste0("is.na(", FF$Feature, ")")
, FullRules[i])
if(count == 1){
RuleText = FullRules[i]
} else{
RuleText = paste0(RuleText, " & ", FullRules[i])
}
theNode = FF$Node
theID = FF$ID
count = count + 1
}
data.frame(
Leafs[i,]
,RuleText
)
}
Now I pick out 1 row and attempted to match the probabilities. In this case it matches. The loop will go through and indicate TRUE for all of the rules that are met for this particular customer. Then I can filter down to those rows and sum those up to get the logodds estimates. Then I convert those to probabilities.
TT <- testDF[25,]
ff <- foreach(i = 1:nrow(AllRules), .combine = 'rbind') %do% {
TT %>% transmute_(
Tree = as.character(AllRules$RuleText[i])
, Quality = AllRules$Quality[i])
}
predict(XX, as.matrix(TT[,var.names]))
#[1] 0.05571342
filter(ff, Tree) %>%
summarise(
Q1 = sum(sqrt(Quality^2))
# ,Q2 = sum(sqrt(Quality^2))
, Prob1 = exp(Q1)/(1+exp(Q1))
, Prob2 = 1-Prob1
)
# Q1 Prob1 Prob2
#1 2.830209 0.9442866 0.0557134
But in this case it does not match the predict function...
TT <- testDF[17,]
ff <- foreach(i = 1:nrow(AllRules), .combine = 'rbind') %do% {
TT %>% transmute_(
Tree = as.character(AllRules$RuleText[i])
, Quality = AllRules$Quality[i])
}
predict(XX, as.matrix(TT[,var.names]))
#[1] 0.1386877
filter(ff, Tree) %>%
summarise(
Q1 = sum(sqrt(Quality^2))
# ,Q2 = sum(sqrt(Quality^2))
, Prob1 = exp(Q1)/(1+exp(Q1))
, Prob2 = 1-Prob1
)
# Q1 Prob1 Prob2
#1 1.967608 0.877354 0.122646

To generate the prediction you just need to sum up the values of the individual leafs that the person falls within for each booster
filter(ff, Tree) %>%
summarise(
Q1 = sum(Quality)
, Prob1 = exp(Q1)/(1+exp(Q1))
, Prob2 = 1-Prob1
)

Related

How to plot/extract the BIC values from the step function

I need to plot the BIC value from each regression step in the step function using ggplot. I have no idea how to use ggplot to plot each steps BIC value.
form_model <- formula(lm(price~sqft_living+sqft_lot+waterfront+sqft_above+sqft_basement+years_since_renovations+age_of_house+grade_int+bed_int+bath_int+floors_dummy+view_dummy+condition_dummy+basement_dummy+renovated_dummy+weekend_dummy))
mod <- lm(price~1)
n <- (nrow(House_Regr))
forwardBIC <- step(mod,form_model,direction = "forward", k=log(n) )
Here is the model that i am using.
Start: AIC=181611.1
price ~ 1
Df Sum of Sq RSS AIC
+ sqft_living 1 5.5908e+16 6.9104e+16 178111
+ grade_int 1 4.2600e+16 8.2413e+16 179154
+ sqft_above 1 3.8988e+16 8.6024e+16 179407
+ view_dummy 1 1.5755e+16 1.0926e+17 180822
+ sqft_basement 1 1.1560e+16 1.1345e+17 181045
+ bed_int 1 1.0586e+16 1.1443e+17 181096
+ floors_dummy 1 8.6756e+15 1.1634e+17 181194
+ waterfront 1 8.1097e+15 1.1690e+17 181223
+ basement_dummy 1 3.8336e+15 1.2118e+17 181435
+ bath_int 1 2.1104e+15 1.2290e+17 181519
+ renovated_dummy 1 1.3665e+15 1.2365e+17 181555
+ years_since_renovations 1 8.6785e+14 1.2414e+17 181579
+ sqft_lot 1 8.2901e+14 1.2418e+17 181580
+ condition_dummy 1 6.4654e+14 1.2437e+17 181589
<none> 1.2501e+17 181611
+ age_of_house 1 1.7600e+14 1.2484e+17 181611
+ weekend_dummy 1 9.3267e+11 1.2501e+17 181620
Step: AIC=178111
price ~ sqft_living
Df Sum of Sq RSS AIC
+ view_dummy 1 4.7046e+15 6.4399e+16 177702
+ age_of_house 1 4.5059e+15 6.4598e+16 177721
+ waterfront 1 4.3957e+15 6.4708e+16 177731
+ grade_int 1 3.1890e+15 6.5915e+16 177840
+ years_since_renovations 1 3.0576e+15 6.6046e+16 177852
+ bed_int 1 1.7778e+15 6.7326e+16 177965
+ bath_int 1 1.7527e+15 6.7351e+16 177968
+ renovated_dummy 1 7.2312e+14 6.8381e+16 178057
+ basement_dummy 1 3.1144e+14 6.8793e+16 178093
+ sqft_above 1 1.6922e+14 6.8935e+16 178105
+ sqft_basement 1 1.6922e+14 6.8935e+16 178105
+ sqft_lot 1 1.2746e+14 6.8977e+16 178109
<none> 6.9104e+16 178111
+ condition_dummy 1 3.6244e+13 6.9068e+16 178117
+ floors_dummy 1 1.0259e+13 6.9094e+16 178119
+ weekend_dummy 1 5.9534e+12 6.9098e+16 178119
Here is a small output from the regression. I need to plot each steps BIC value using ggplot. My idea would be to just extract the BIC value for each step then plot them using ggplot but as i have said i have no idea how to accomplish this or if extracting the BIC is even necessary for ggplot.
How would i go about plotting the BIC for each step in the regression on ggplot?
I wouldn't recommend doing this usually, so if there is an answer using real functions then go for it. There is a function called in this: extractAIC that is storing the results, and then printing those tables. You can get the step function by typing it in the console. Quick scan showed me that in the variable aod inside this function it is storing the tables that it prints for each iteration.
A hacky way is to make a list inside this function, update the list with the table each time it changes and then either add it to the response (the usual way) or assign it out to the global environment (bad way). As I don't know anything about the class of the response of the step function, I've opted for the bad way. The full function is here. You can search for the # (!) addition flag to see where I've added it in.
The AIC column contains the BIC values. You can see it changes when you change the k value in the step call
Hope this works ok for you, I'm using the example in the step function
step2 <- function (object, scope, scale = 0, direction = c("both", "backward",
"forward"), trace = 1, keep = NULL, steps = 1000, k = 2,
...)
{
# (!) addition
aod.all <- list()
mydeviance <- function(x, ...) {
dev <- deviance(x)
if (!is.null(dev))
dev
else extractAIC(x, k = 0)[2L]
}
cut.string <- function(string) {
if (length(string) > 1L)
string[-1L] <- paste0("\n", string[-1L])
string
}
re.arrange <- function(keep) {
namr <- names(k1 <- keep[[1L]])
namc <- names(keep)
nc <- length(keep)
nr <- length(k1)
array(unlist(keep, recursive = FALSE), c(nr, nc), list(namr,
namc))
}
step.results <- function(models, fit, object, usingCp = FALSE) {
change <- sapply(models, "[[", "change")
rd <- sapply(models, "[[", "deviance")
dd <- c(NA, abs(diff(rd)))
rdf <- sapply(models, "[[", "df.resid")
ddf <- c(NA, diff(rdf))
AIC <- sapply(models, "[[", "AIC")
heading <- c("Stepwise Model Path \nAnalysis of Deviance Table",
"\nInitial Model:", deparse(formula(object)), "\nFinal Model:",
deparse(formula(fit)), "\n")
aod <- data.frame(Step = I(change), Df = ddf, Deviance = dd,
`Resid. Df` = rdf, `Resid. Dev` = rd, AIC = AIC,
check.names = FALSE)
if (usingCp) {
cn <- colnames(aod)
cn[cn == "AIC"] <- "Cp"
colnames(aod) <- cn
}
attr(aod, "heading") <- heading
fit$anova <- aod
fit
}
Terms <- terms(object)
object$call$formula <- object$formula <- Terms
md <- missing(direction)
direction <- match.arg(direction)
backward <- direction == "both" | direction == "backward"
forward <- direction == "both" | direction == "forward"
if (missing(scope)) {
fdrop <- numeric()
fadd <- attr(Terms, "factors")
if (md)
forward <- FALSE
}
else {
if (is.list(scope)) {
fdrop <- if (!is.null(fdrop <- scope$lower))
attr(terms(update.formula(object, fdrop)), "factors")
else numeric()
fadd <- if (!is.null(fadd <- scope$upper))
attr(terms(update.formula(object, fadd)), "factors")
}
else {
fadd <- if (!is.null(fadd <- scope))
attr(terms(update.formula(object, scope)), "factors")
fdrop <- numeric()
}
}
models <- vector("list", steps)
if (!is.null(keep))
keep.list <- vector("list", steps)
n <- nobs(object, use.fallback = TRUE)
fit <- object
bAIC <- extractAIC(fit, scale, k = k, ...)
edf <- bAIC[1L]
bAIC <- bAIC[2L]
if (is.na(bAIC))
stop("AIC is not defined for this model, so 'step' cannot proceed")
if (bAIC == -Inf)
stop("AIC is -infinity for this model, so 'step' cannot proceed")
nm <- 1
if (trace) {
cat("Start: AIC=", format(round(bAIC, 2)), "\n", cut.string(deparse(formula(fit))),
"\n\n", sep = "")
flush.console()
}
models[[nm]] <- list(deviance = mydeviance(fit), df.resid = n -
edf, change = "", AIC = bAIC)
if (!is.null(keep))
keep.list[[nm]] <- keep(fit, bAIC)
usingCp <- FALSE
while (steps > 0) {
steps <- steps - 1
AIC <- bAIC
ffac <- attr(Terms, "factors")
scope <- factor.scope(ffac, list(add = fadd, drop = fdrop))
aod <- NULL
change <- NULL
if (backward && length(scope$drop)) {
aod <- drop1(fit, scope$drop, scale = scale, trace = trace,
k = k, ...)
rn <- row.names(aod)
row.names(aod) <- c(rn[1L], paste("-", rn[-1L]))
if (any(aod$Df == 0, na.rm = TRUE)) {
zdf <- aod$Df == 0 & !is.na(aod$Df)
change <- rev(rownames(aod)[zdf])[1L]
}
}
if (is.null(change)) {
if (forward && length(scope$add)) {
aodf <- add1(fit, scope$add, scale = scale, trace = trace,
k = k, ...)
rn <- row.names(aodf)
row.names(aodf) <- c(rn[1L], paste("+", rn[-1L]))
aod <- if (is.null(aod))
aodf
else rbind(aod, aodf[-1, , drop = FALSE])
}
attr(aod, "heading") <- NULL
nzdf <- if (!is.null(aod$Df))
aod$Df != 0 | is.na(aod$Df)
aod <- aod[nzdf, ]
if (is.null(aod) || ncol(aod) == 0)
break
nc <- match(c("Cp", "AIC"), names(aod))
nc <- nc[!is.na(nc)][1L]
o <- order(aod[, nc])
# (!) addition
aod.all <- c(aod.all, list(aod))
if (trace)
print(aod[o, ])
if (o[1L] == 1)
break
change <- rownames(aod)[o[1L]]
}
usingCp <- match("Cp", names(aod), 0L) > 0L
fit <- update(fit, paste("~ .", change), evaluate = FALSE)
fit <- eval.parent(fit)
nnew <- nobs(fit, use.fallback = TRUE)
if (all(is.finite(c(n, nnew))) && nnew != n)
stop("number of rows in use has changed: remove missing values?")
Terms <- terms(fit)
bAIC <- extractAIC(fit, scale, k = k, ...)
edf <- bAIC[1L]
bAIC <- bAIC[2L]
if (trace) {
cat("\nStep: AIC=", format(round(bAIC, 2)), "\n",
cut.string(deparse(formula(fit))), "\n\n", sep = "")
flush.console()
}
if (bAIC >= AIC + 1e-07)
break
nm <- nm + 1
models[[nm]] <- list(deviance = mydeviance(fit), df.resid = n -
edf, change = change, AIC = bAIC)
if (!is.null(keep))
keep.list[[nm]] <- keep(fit, bAIC)
}
if (!is.null(keep))
fit$keep <- re.arrange(keep.list[seq(nm)])
# (!) addition
assign("aod.all", aod.all, envir = .GlobalEnv)
step.results(models = models[seq(nm)], fit, object, usingCp)
}
lm1 <- lm(Fertility ~ ., data = swiss)
slm1 <- step2(lm1)
aod.all

R: incorporating fisher.test into Hmisc's summaryM leads to error

catTestfisher <-
function (tab)
{
st <- if (!is.matrix(tab) || nrow(tab) < 2 | ncol(tab) <
2)
list(p.value = NA, statistic = NA, parameter = NA)
else {
rowcounts <- tab %*% rep(1, ncol(tab))
tab <- tab[rowcounts > 0, ]
if (!is.matrix(tab))
list(p.value = NA, statistic = NA, parameter = NA)
else fisher.test(tab)
}
list(P = st$p.value, stat = "", df = "",
testname = "Fisher's Exact", statname = "", latexstat = "", namefun = "",
plotmathstat = "")
}
I wanted to use library(Hmisc)'s summaryM function but with Fisher's exact test, so I wrote a catTestfisher function and set catTest = catTestfisher in my own summaryM2 function, which is exactly the same as summaryM, except for catTest = catTestfisher
summaryM2 <-
function (formula, groups = NULL, data = NULL, subset, na.action = na.retain,
overall = FALSE, continuous = 10, na.include = FALSE, quant = c(0.025,
0.05, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 0.95,
0.975), nmin = 100, test = FALSE, conTest = conTestkw,
catTest = catTestfisher, ordTest = ordTestpo)
{
marg <- length(data) && ".marginal." %in% names(data)
if (marg)
formula <- update(formula, . ~ . + .marginal.)
formula <- Formula(formula)
Y <- if (!missing(subset) && length(subset))
model.frame(formula, data = data, subset = subset, na.action = na.action)
else model.frame(formula, data = data, na.action = na.action)
X <- model.part(formula, data = Y, rhs = 1)
Y <- model.part(formula, data = Y, lhs = 1)
getlab <- function(x, default) {
lab <- attr(x, "label")
if (!length(lab) || lab == "")
default
else lab
}
if (marg) {
xm <- X$.marginal.
X$.marginal. <- NULL
}
else xm <- rep("", nrow(X))
if (length(X)) {
xname <- names(X)
if (length(xname) == 1 && !length(groups))
groups <- xname
if (!length(groups) && length(xname) > 1) {
warnings("Must specify groups when > 1 right hand side variable is present.\ngroups taken as first right hand variable.")
groups <- xname[1]
}
svar <- if (length(xname) == 1)
factor(rep(".ALL.", nrow(X)))
else do.call("interaction", list(X[setdiff(xname, groups)],
sep = " "))
group <- X[[groups]]
glabel <- getlab(group, groups)
}
else {
svar <- factor(rep(".ALL.", nrow(Y)))
group <- rep("", nrow(Y))
groups <- group.freq <- NULL
glabel <- ""
}
quants <- unique(c(quant, 0.025, 0.05, 0.125, 0.25, 0.375,
0.5, 0.625, 0.75, 0.875, 0.95, 0.975))
nv <- ncol(Y)
nameY <- names(Y)
R <- list()
for (strat in levels(svar)) {
instrat <- svar == strat
n <- integer(nv)
type <- n
comp <- dat <- vector("list", nv)
names(comp) <- names(dat) <- nameY
labels <- Units <- vector("character", nv)
if (test) {
testresults <- vector("list", nv)
names(testresults) <- names(comp)
}
gr <- group[instrat]
xms <- xm[instrat]
if (all(xms != ""))
xms <- rep("", length(xms))
group.freq <- table(gr)
group.freq <- group.freq[group.freq > 0]
if (overall)
group.freq <- c(group.freq, Combined = sum(group.freq))
for (i in 1:nv) {
w <- Y[instrat, i]
if (length(attr(w, "label")))
labels[i] <- attr(w, "label")
if (length(attr(w, "units")))
Units[i] <- attr(w, "units")
if (!inherits(w, "mChoice")) {
if (!is.factor(w) && !is.logical(w) && length(unique(w[!is.na(w)])) <
continuous)
w <- as.factor(w)
s <- !is.na(w)
if (na.include && !all(s) && length(levels(w))) {
w <- na.include(w)
levels(w)[is.na(levels(w))] <- "NA"
s <- rep(TRUE, length(s))
}
n[i] <- sum(s & xms == "")
w <- w[s]
g <- gr[s, drop = TRUE]
if (is.factor(w) || is.logical(w)) {
tab <- table(w, g)
if (test) {
if (is.ordered(w))
testresults[[i]] <- ordTest(g, w)
else testresults[[i]] <- catTest(tab)
}
if (nrow(tab) == 1) {
b <- casefold(dimnames(tab)[[1]], upper = TRUE)
pres <- c("1", "Y", "YES", "PRESENT")
abse <- c("0", "N", "NO", "ABSENT")
jj <- match(b, pres, nomatch = 0)
if (jj > 0)
bc <- abse[jj]
else {
jj <- match(b, abse, nomatch = 0)
if (jj > 0)
bc <- pres[jj]
}
if (jj) {
tab <- rbind(tab, rep(0, ncol(tab)))
dimnames(tab)[[1]][2] <- bc
}
}
if (overall)
tab <- cbind(tab, Combined = apply(tab, 1,
sum))
comp[[i]] <- tab
type[i] <- 1
}
else {
sfn <- function(x, quant) {
o <- options(digits = 10)
on.exit(options(o))
c(quantile(x, quant), Mean = mean(x), SD = sqrt(var(x)),
N = sum(!is.na(x)))
}
qu <- tapply(w, g, sfn, simplify = TRUE, quants)
if (test)
testresults[[i]] <- conTest(g, w)
if (overall)
qu$Combined <- sfn(w, quants)
comp[[i]] <- matrix(unlist(qu), ncol = length(quants) +
3, byrow = TRUE, dimnames = list(names(qu),
c(format(quants), "Mean", "SD", "N")))
if (any(group.freq <= nmin))
dat[[i]] <- lapply(split(w, g), nmin = nmin,
function(x, nmin) if (length(x) <= nmin)
x
else NULL)
type[i] <- 2
}
}
else {
w <- as.numeric(w) == 1
n[i] <- sum(!is.na(apply(w, 1, sum)) & xms ==
"")
g <- as.factor(gr)
ncat <- ncol(w)
tab <- matrix(NA, nrow = ncat, ncol = length(levels(g)),
dimnames = list(dimnames(w)[[2]], levels(g)))
if (test) {
pval <- numeric(ncat)
names(pval) <- dimnames(w)[[2]]
d.f. <- stat <- pval
}
for (j in 1:ncat) {
tab[j, ] <- tapply(w[, j], g, sum, simplify = TRUE,
na.rm = TRUE)
if (test) {
tabj <- rbind(table(g) - tab[j, ], tab[j,
])
st <- catTest(tabj)
pval[j] <- st$P
stat[j] <- st$stat
d.f.[j] <- st$df
}
}
if (test)
testresults[[i]] <- list(P = pval, stat = stat,
df = d.f., testname = st$testname, statname = st$statname,
latexstat = st$latexstat, plotmathstat = st$plotmathstat)
if (overall)
tab <- cbind(tab, Combined = apply(tab, 1,
sum))
comp[[i]] <- tab
type[i] <- 3
}
}
labels <- ifelse(nchar(labels), labels, names(comp))
R[[strat]] <- list(stats = comp, type = type, group.freq = group.freq,
labels = labels, units = Units, quant = quant, data = dat,
N = sum(!is.na(gr) & xms == ""), n = n, testresults = if (test) testresults)
}
structure(list(results = R, group.name = groups, group.label = glabel,
call = call, formula = formula), class = "summaryM")
}
After trying to test it on the following data, I get a warning and an error:
library(Hmisc)
set.seed(173)
sex <- factor(sample(c("m","f"), 500, rep=TRUE))
treatment <- factor(sample(c("Drug","Placebo"), 500, rep=TRUE))
> summaryM2(sex ~ treatment, test=TRUE, overall = TRUE)
Error in round(teststat, 2) :
non-numeric argument to mathematical function
I tried stepping through the summaryM2 function line by line, but could not figure out what's causing the problem.
In your catTestfisher function, the output variables stat (test statistic) and df (degrees of freedom) should be numeric variables not empty strings. In the programming stat is coverted to teststat for rounding before being outputted (hence the error message for round("", 2) is non-numeric argument to mathematical function). See lines 1718 to 1721 in the summary.formula code) .
You can set df = NULL but a value is required for stat (not NA or NULL) otherwise no output is returned. You can get around the problem by setting stat = 0 (or any other number), and then only displaying the p value using prtest = "P".
catTestfisher2 <- function (tab)
{
st <- fisher.test(tab)
list(P = st$p.value, stat = 0, df = NULL,
testname = st$method, statname = "", latexstat = "", namefun = "",
plotmathstat = "")
}
output <- summaryM(sex ~ treatment, test=TRUE, overall = TRUE, catTest = catTestfisher2)
print(output, prtest = "P")
Descriptive Statistics (N=500)
+-------+-----------+-----------+-----------+-------+
| |Drug |Placebo |Combined |P-value|
| |(N=257) |(N=243) |(N=500) | |
+-------+-----------+-----------+-----------+-------+
|sex : m|0.52 (133)|0.52 (126)|0.52 (259)| 1 |
+-------+-----------+-----------+-----------+-------+
Note there is no need to define your own summaryM2 function. Just use catTest = to pass in your function.

Q-learning R has length zero

I am trying to inplement a simulation of a simplified blackjack game that will return the best policy at each state s.
The blackjack simulation seems to work properly, but i somehow get an error when trying to apply the Q learning algorithm to reach the optimal policy.
Here's my code, i believe it's well documented, error is in the Q-learning block, starting at ~line 170, it is also reproducible :
#Application reinforcement learning for black jack. We will suppose here that the croupier only has 1 pack of cards
#Initial tabs
packinit = c(rep(1,4), rep(2,4),rep(3,4),rep(4,4),rep(5,4),rep(6,4),rep(7,4),rep(8,4),
rep(9,4),rep(10,16))
#In our game and for simplicifaction of the problem, aces will always count as 1. Other figures are worth 10.
#If both player and croupier have same score, then player looses.
#Croupier will draw cards until he has 17 or more.
handPinit = NULL # will contain hand of player
handCinit = NULL # will contain hand of the croupier
list = list(handPinit, handCinit, packinit)
# Methods ####################################################################################
##############################################################################################
#Random integer, returns an integer to choose card
randInt = function(pack){
int = runif(1) * length(pack)
int = int+1
int = round(int, 0)
return(int)
}
#Picks a card, asimResults it to the desired hand and deletes it from the package.
pickC = function(hand, pack){
int = randInt(pack)
hand = c(hand, pack[int])
pack = pack[-int]
return(list(hand, pack))
}
score = function(handC){
return(sum(handC, na.rm = T))
}
printWinner = function(resultList){
res = resultList[[4]]
p = res[1]
c = res[2]
if((p > c && p <= 21) || (p <= 21 && c > 21)){
cat("Player has won with ", p, ", croupier has ", c, ".\n", sep = "")
}else{
cat("Player has lost with ", p, ", croupier has ", c, ".\n", sep = "")
}
}
#Black jack sim :
simulation = function(handP, handC, pack){
#Matrix to stock choice and next state, 1st is state, 2nd is choice, 3rd is reward, 4th is start state
cs = NULL
#pick first card
temp = NULL
temp = pickC(handP, pack)
handP = temp[[1]]
pack = temp[[2]]
temp = pickC(handC, pack)
handC = temp[[1]]
pack = temp[[2]]
#stock result
cs = rbind(cs, c(score(handP), 1, 0.1, 0))
#pick second card
temp = pickC(handP, pack)
handP = temp[[1]]
pack = temp[[2]]
temp = pickC(handC, pack)
handC = temp[[1]]
pack = temp[[2]]
#stock result
cs = rbind(cs, c(score(handP), 1, 0.1, cs[length(cs[,1]), 1]))
#reward stock final
reward = NULL
#to change with algo decision
while(score(handC) < 17){
#rand number to choose action, 1 = draw
rand = round(2*runif(1),0)
#if a = 1, draw a card
if(rand == 1 && score(handP) < 21){
temp = pickC(handP, pack)
handP = temp[[1]]
pack = temp[[2]]
cs = rbind(cs, c(score(handP), 1, 0.1, cs[length(cs[,1]), 1] ))
}else{
cs = rbind(cs, c(score(handP), 0, 0.1, cs[length(cs[,1]), 1]))
}
#if croupier < 17, he draws a card
if(score(handC) < 17){
temp = pickC(handC, pack)
handC = temp[[1]]
pack = temp[[2]]
}
}
#get scores
scores = c(score(handP), score(handC))
resultList = list(handP, handC, pack, scores)
#get reward
res = resultList[[4]]
p = res[1]
c = res[2]
if((p > c && p <= 21) || (p <= 21 && c > 21)){
reward = 100
}else{
reward = -50
}
#AsimResults reward as the reward of the last line of cs
cs[length(cs[,1]), 3] = reward
#return full list
resultList = list(handP, handC, pack, scores, cs)
return(resultList)
}
#Function for simulation, outputs tab containins states, actions and choices
simRand = function(k){
resultsRand = NULL
for(i in 1:k){
#init pack and hands
pack = c(rep(1,4), rep(2,4),rep(3,4),rep(4,4),rep(5,4),rep(6,4),rep(7,4),rep(8,4),
rep(9,4),rep(10,16))
handC = NULL
handP = NULL
#simulation k
res = simulation(handP, handC, pack)
resultsRand = rbind(resultsRand, res[[5]])
#resets for next iteration
pack = c(rep(1,4), rep(2,4),rep(3,4),rep(4,4),rep(5,4),rep(6,4),rep(7,4),rep(8,4),
rep(9,4),rep(10,16))
handC = NULL
handP = NULL
}
return(resultsRand)
}
#test
for(i in 1:10){
results = simulation(handPinit, handCinit, packinit)
printWinner(results)
}
#used to max the Qvalue decision
getRowMax = function(tab){
temp = tab[1]
for(i in 2:length(tab)){
if(tab[i] > temp){
temp = tab[i]
}
}
}
#####################################################################
#Q-learning
#####################################################################
#Represent sets of Q(s, a)
Qvalues = matrix(1, nrow = 30, ncol = 2)
simResults = simRand(1000)
#Hyperparameters
alpha = 0.9
discount = 0.1
#for all rows simulated, update qvalues.
for(i in 1:length(simResults[,1])){
st = simResults[i, 4] #st
a = simResults[i, 2] #a
stPlusOne = simResults[i, 1] #st+1
Qvalues[st, a] = Qvalues[st, a] + alpha * ( simResults[i,3] * discount * getRowMax(Qvalues[stPlusOne, ]) - Qvalues[st, a] )
}
As LucyMLi points out:
First you need to add return(temp) object to the getRowMax function.
But there is another issue with your simulation, because some of the
values in simResults[, 1] are 0, which means Qvalues[stPlusOne, ] will
be empty and thus you can't compute getRowMax().

Is there a way to see the formula that R uses for the survfit confidence intervals? [duplicate]

This question already has answers here:
How can I view the source code for a function?
(13 answers)
Closed 7 years ago.
I want to be able to see how the summary of survfit calculates its confidence intervals. Is there a way that I can ask R to show me how it calculated these to show me the formula?
Thanks :)
You can find the source code like this. First look at survfit:
> getAnywhere("survfit")
A single object matching ‘survfit’ was found
It was found in the following places
package:survival
namespace:survival
with value
function (formula, ...)
{
UseMethod("survfit", formula)
}
<bytecode: 0x000000000edccc88>
<environment: namespace:survival>
>
This tells us we have to look at survfit.formula, which is what that UseMethod call is saying. So we do that and we get a lot of code:
> getAnywhere("survfit.formula")
A single object matching ‘survfit.formula’ was found
It was found in the following places
package:survival
registered S3 method for survfit from namespace survival
namespace:survival
with value
function (formula, data, weights, subset, na.action, etype, id,
istate, ...)
{
Call <- match.call()
Call[[1]] <- as.name("survfit")
mfnames <- c("formula", "data", "weights", "subset", "na.action",
"istate", "id", "etype")
temp <- Call[c(1, match(mfnames, names(Call), nomatch = 0))]
temp[[1]] <- as.name("model.frame")
if (is.R())
m <- eval.parent(temp)
else m <- eval(temp, sys.parent())
Terms <- terms(formula, c("strata", "cluster"))
ord <- attr(Terms, "order")
if (length(ord) & any(ord != 1))
stop("Interaction terms are not valid for this function")
n <- nrow(m)
Y <- model.extract(m, "response")
if (!is.Surv(Y))
stop("Response must be a survival object")
casewt <- model.extract(m, "weights")
if (is.null(casewt))
casewt <- rep(1, n)
if (!is.null(attr(Terms, "offset")))
warning("Offset term ignored")
id <- model.extract(m, "id")
istate <- model.extract(m, "istate")
temp <- untangle.specials(Terms, "cluster")
if (length(temp$vars) > 0) {
if (length(temp$vars) > 1)
stop("can not have two cluster terms")
if (!is.null(id))
stop("can not have both a cluster term and an id variable")
id <- m[[temp$vars]]
Terms <- Terms[-temp$terms]
}
ll <- attr(Terms, "term.labels")
if (length(ll) == 0)
X <- factor(rep(1, n))
else X <- strata(m[ll])
if (!is.Surv(Y))
stop("y must be a Surv object")
etype <- model.extract(m, "etype")
if (!is.null(etype)) {
if (attr(Y, "type") == "mcounting" || attr(Y, "type") ==
"mright")
stop("cannot use both the etype argument and mstate survival type")
if (length(istate))
stop("cannot use both the etype and istate arguments")
status <- Y[, ncol(Y)]
etype <- as.factor(etype)
temp <- table(etype, status == 0)
if (all(rowSums(temp == 0) == 1)) {
newlev <- levels(etype)[order(-temp[, 2])]
}
else newlev <- c(" ", levels(etype)[temp[, 1] > 0])
status <- factor(ifelse(status == 0, 0, as.numeric(etype)),
labels = newlev)
if (attr(Y, "type") == "right")
Y <- Surv(Y[, 1], status, type = "mstate")
else if (attr(Y, "type") == "counting")
Y <- Surv(Y[, 1], Y[, 2], status, type = "mstate")
else stop("etype argument incompatable with survival type")
}
if (attr(Y, "type") == "left" || attr(Y, "type") == "interval")
temp <- survfitTurnbull(X, Y, casewt, ...)
else if (attr(Y, "type") == "right" || attr(Y, "type") ==
"counting")
temp <- survfitKM(X, Y, casewt, ...)
else if (attr(Y, "type") == "mright" || attr(Y, "type") ==
"mcounting")
temp <- survfitCI(X, Y, weights = casewt, id = id, istate = istate,
...)
else {
stop("unrecognized survival type")
}
if (is.null(temp$states))
class(temp) <- "survfit"
else class(temp) <- c("survfitms", "survfit")
if (!is.null(attr(m, "na.action")))
temp$na.action <- attr(m, "na.action")
temp$call <- Call
temp
}
<bytecode: 0x000000003f6a8c28>
<environment: namespace:survival>
We scan this and eventually notice a call to survfitCI close to the end. Sounds like what we are looking for. So once again into the breech:
> getAnywhere("survfitCI")
A single object matching ‘survfitCI’ was found
It was found in the following places
package:survival
namespace:survival
with value
function (X, Y, weights, id, istate, type = c("kaplan-meier",
"fleming-harrington", "fh2"), se.fit = TRUE, conf.int = 0.95,
conf.type = c("log", "log-log", "plain", "none"), conf.lower = c("usual",
"peto", "modified"))
{
method <- match.arg(type)
conf.type <- match.arg(conf.type)
conf.lower <- match.arg(conf.lower)
if (is.logical(conf.int)) {
if (!conf.int)
conf.type <- "none"
conf.int <- 0.95
}
type <- attr(Y, "type")
if (type != "mright" && type != "mcounting" && type != "right" &&
type != "counting")
stop(paste("Cumulative incidence computation doesn't support \"",
type, "\" survival data", sep = ""))
n <- nrow(Y)
status <- Y[, ncol(Y)]
ncurve <- length(levels(X))
state.names <- attr(Y, "states")
if (missing(istate) || is.null(istate))
istate <- rep(0L, n)
else if (is.factor(istate) || is.character(istate)) {
temp <- as.factor(istate)
appear <- (levels(istate))[unique(as.numeric(istate))]
state.names <- unique(c(attr(Y, "states"), appear))
istate <- as.numeric(factor(as.character(istate), levels = state.names))
}
else if (!is.numeric(istate) || any(istate != floor(istate)))
stop("istate should be a vector of integers or a factor")
if (length(id) == 0)
id <- 1:n
if (length(istate) == 1)
istate <- rep(istate, n)
if (length(istate) != n)
stop("wrong length for istate")
states <- sort(unique(c(istate, 1:length(attr(Y, "states")))))
docurve2 <- function(entry, etime, status, istate, wt, states,
id, se.fit) {
ftime <- factor(c(entry, etime))
ltime <- levels(ftime)
ftime <- matrix(as.integer(ftime), ncol = 2)
timeset <- as.numeric(ltime[sort(unique(ftime[, 2]))])
nstate <- length(states)
uid <- sort(unique(id))
P <- as.vector(tapply(wt, factor(istate, levels = states),
sum)/sum(wt))
P <- ifelse(is.na(P), 0, P)
cstate <- istate[match(uid, id)]
storage.mode(wt) <- "double"
storage.mode(cstate) <- "integer"
storage.mode(status) <- "integer"
fit <- .Call(Csurvfitci, ftime, order(ftime[, 1]) - 1L,
order(ftime[, 2]) - 1L, length(timeset), status,
cstate - 1L, wt, match(id, uid) - 1L, P, as.integer(se.fit))
prev0 <- table(factor(cstate, levels = states), exclude = NA)/length(cstate)
if (se.fit)
list(time = timeset, pmat = t(fit$p), std = sqrt(t(fit$var)),
n.risk = colSums(fit$nrisk), n.event = fit$nevent,
n.censor = fit$ncensor, prev0 = prev0, cumhaz = array(fit$cumhaz,
dim = c(nstate, nstate, length(timeset))))
else list(time = timeset, pmat = t(fit$p), n.risk = colSums(fit$nrisk),
n.event = fit$nevent, n.censor = fit$ncensor, prev0 = prev0,
cumhaz = array(fit$cumhaz, dim = c(nstate, nstate,
length(timeset))))
}
if (any(states == 0)) {
state0 <- TRUE
states <- states + 1
istate <- istate + 1
status <- ifelse(status == 0, 0, status + 1)
}
else state0 <- FALSE
curves <- vector("list", ncurve)
names(curves) <- levels(X)
if (ncol(Y) == 2) {
indx <- which(status == istate & status != 0)
if (length(indx)) {
warning("an observation transitions to it's starting state, transition ignored")
status[indx] <- 0
}
if (length(id) && any(duplicated(id)))
stop("Cannot have duplicate id values with (time, status) data")
entry <- rep(min(-1, 2 * min(Y[, 1]) - 1), n)
for (i in levels(X)) {
indx <- which(X == i)
curves[[i]] <- docurve2(entry[indx], Y[indx, 1],
status[indx], istate[indx], weights[indx], states,
id[indx], se.fit)
}
}
else {
if (missing(id) || is.null(id))
stop("the id argument is required for start:stop data")
indx <- order(id, Y[, 2])
indx1 <- c(NA, indx)
indx2 <- c(indx, NA)
same <- (id[indx1] == id[indx2] & !is.na(indx1) & !is.na(indx2))
if (any(same & X[indx1] != X[indx2])) {
who <- 1 + min(which(same & X[indx1] != X[indx2]))
stop("subject is in two different groups, id ", (id[indx1])[who])
}
if (any(same & Y[indx1, 2] != Y[indx2, 1])) {
who <- 1 + min(which(same & Y[indx1, 2] != Y[indx2,
1]))
stop("gap in follow-up, id ", (id[indx1])[who])
}
if (any(Y[, 1] == Y[, 2]))
stop("cannot have start time == stop time")
if (any(same & Y[indx1, 3] == Y[indx2, 3] & Y[indx1,
3] != 0)) {
who <- 1 + min(which(same & Y[indx1, 1] != Y[indx2,
2]))
warning("subject changes to the same state, id ",
(id[indx1])[who])
}
if (any(same & weights[indx1] != weights[indx2])) {
who <- 1 + min(which(same & weights[indx1] != weights[indx2]))
stop("subject changes case weights, id ", (id[indx1])[who])
}
indx <- order(Y[, 2])
uid <- unique(id)
temp <- (istate[indx])[match(uid, id[indx])]
istate <- temp[match(id, uid)]
for (i in levels(X)) {
indx <- which(X == i)
curves[[i]] <- docurve2(Y[indx, 1], Y[indx, 2], status[indx],
istate[indx], weights[indx], states, id[indx],
se.fit)
}
}
grabit <- function(clist, element) {
temp <- (clist[[1]][[element]])
if (is.matrix(temp)) {
nc <- ncol(temp)
matrix(unlist(lapply(clist, function(x) t(x[[element]]))),
byrow = T, ncol = nc)
}
else {
xx <- as.vector(unlist(lapply(clist, function(x) x[element])))
if (class(temp) == "table")
matrix(xx, byrow = T, ncol = length(temp))
else xx
}
}
kfit <- list(n = as.vector(table(X)), time = grabit(curves,
"time"), n.risk = grabit(curves, "n.risk"), n.event = grabit(curves,
"n.event"), n.censor = grabit(curves, "n.censor"), prev = grabit(curves,
"pmat"), prev0 = grabit(curves, "prev0"))
nstate <- length(states)
kfit$cumhaz <- array(unlist(lapply(curves, function(x) x$cumhaz)),
dim = c(nstate, nstate, length(kfit$time)))
if (length(curves) > 1)
kfit$strata <- unlist(lapply(curves, function(x) length(x$time)))
if (se.fit)
kfit$std.err <- grabit(curves, "std")
if (state0) {
kfit$prev <- kfit$prev[, -1]
if (se.fit)
kfit$std.err <- kfit$std.err[, -1]
kfit$prev0 <- kfit$prev0[, -1]
}
if (se.fit) {
std.err <- kfit$std.err
zval <- qnorm(1 - (1 - conf.int)/2, 0, 1)
surv <- 1 - kfit$prev
if (conf.type == "plain") {
temp <- zval * std.err
kfit <- c(kfit, list(lower = pmax(kfit$prev - temp,
0), upper = pmin(kfit$prev + temp, 1), conf.type = "plain",
conf.int = conf.int))
}
if (conf.type == "log") {
xx <- ifelse(kfit$prev == 1, 1, 1 - kfit$prev)
temp1 <- ifelse(surv == 0, NA, exp(log(xx) + zval *
std.err/xx))
temp2 <- ifelse(surv == 0, NA, exp(log(xx) - zval *
std.err/xx))
kfit <- c(kfit, list(lower = pmax(1 - temp1, 0),
upper = 1 - temp2, conf.type = "log", conf.int = conf.int))
}
if (conf.type == "log-log") {
who <- (surv == 0 | surv == 1)
temp3 <- ifelse(surv == 0, NA, 1)
xx <- ifelse(who, 0.1, kfit$surv)
temp1 <- exp(-exp(log(-log(xx)) + zval * std.err/(xx *
log(xx))))
temp1 <- ifelse(who, temp3, temp1)
temp2 <- exp(-exp(log(-log(xx)) - zval * std.err/(xx *
log(xx))))
temp2 <- ifelse(who, temp3, temp2)
kfit <- c(kfit, list(lower = 1 - temp1, upper = 1 -
temp2, conf.type = "log-log", conf.int = conf.int))
}
}
kfit$states <- state.names
kfit$type <- attr(Y, "type")
kfit
}
<bytecode: 0x000000002ce81838>
<environment: namespace:survival>
Somewhere in there is your answer.

Replacement has length zero: can't find the issue with my loop

I'm trying to modify some code from a chapter of Quantitative Trading with R to work with returns instead of raw prices. Everythings to be going okay with the exception of the "PROFIT AND LOSS" section of my code. It keeps returning "Error in qty_x[i] = (vec[i] + prev_x_qty) : replacement has length zero" When looking at my variables I can't seem to find any problems. I've included the code for reproduction.
# LOAD LIBRARIES
library(quantmod)
library(xts)
# FUNCTIONS
# ROLLING BETA
pcbeta = function(dF){
r = prcomp( ~ dF$x[-1] + dF$y[-1])
return(r$rotation[2, 1] / r$rotation[1,1])
}
rolling_beta = function(z, width){
rollapply(z, width = width, FUN = pcbeta,
by.column = FALSE, align = 'right')
}
# GET TICKER DATA
SPY = getSymbols('SPY', adjust=T, auto.assign=FALSE)
AAPL = getSymbols('AAPL', adjust=T, auto.assign=FALSE)
# IN-SAMPLE DATE RANGE
in_start_date = '2011-01-01'
in_end_date = '2011-12-31'
in_range = paste(in_start_date, '::', in_end_date, sep='')
# RETRIEVE IN-SAMPLE DATA
x_in = SPY[in_range, 6]
y_in = AAPL[in_range, 6]
dF_in = cbind(x_in, y_in)
names(dF_in) = c('x','y')
# OUT-OF-SAMPLE DATE RANGE
out_start_date= '2012-01-01'
out_end_date = '2012-12-31'
out_range = paste(out_start_date, '::', out_end_date, sep='')
# RETRIEVE OUT-OF-SAMPLE DATA
x_out = SPY[out_range, 6]
y_out = AAPL[out_range, 6]
dF_out = cbind(x_out, y_out)
names(dF_out) = c('x', 'y')
# CALCULATE RETURNS (IN AND OUT OF SAMPLE)
returns_in = diff(dF_in) / dF_in
returns_out = diff(dF_out) / dF_out
# DEFINE ROLLING WINDOW LENGTH
window_length = 10
# FIND BETAS
betas_in = rolling_beta(returns_in, window_length)
betas_out = rolling_beta(returns_out, window_length)
# FIND SPREADS
spreadR_in = returns_in$y - betas_in * returns_in$x
spreadR_out = returns_out$y - betas_out * returns_out$x
names(spreadR_in) = c('spread')
names(spreadR_out) = c('spread')
# FIND THRESHOLD
threshold = sd(spreadR_in, na.rm=TRUE)
# FORM DATA SETS
data_in = merge(returns_in, betas_in, spreadR_in)
data_out = merge(x_out, y_out, returns_out, betas_out, spreadR_out)
names(data_out) = c('xp', 'yp', 'x', 'y', 'betas_out', 'spread')
data_in = data_in[-1]
data_out = data_out[-1]
# GENERATE BUY AND SELL SIGNALS FOR OUT OF SAMPLE
buys = ifelse(data_out$spread > threshold, 1, 0)
sells = ifelse(data_out$spread < -threshold, -1, 0)
data_out$signal = buys+sells
# PROFIT AND LOSS
prev_x_qty = 0
position = 0
trade_size = 100
signal = as.numeric(data_out$signal)
signal[is.na(signal)] = 0
beta = as.numeric(data_out$betas_out)
ratio = (data_out$yp/data_out$xp)
vec = round(beta*trade_size*ratio)
qty_x = rep(0, length(signal))
qty_y = rep(0, length(signal))
for(i in 1:length(signal)){
if(signal[i] == 1 && position == 0){
#buy the spread
prev_x_qty = vec[i]
qty_x[i] = -prev_x_qty
qty_y[i] = trade_size
position = 1
}
if(signal[i] == -1 && position == 0){
#buy the spread
prev_x_qty = vec[i]
qty_x[i] = prev_x_qty
qty_y[i] = -trade_size
position = -1
}
if(signal[i] == 1 && position == -1){
# we are short the spread and need to buy
qty_x[i] = -(vec[i] + prev_x_qty)
prev_x_qty = vec[i]
qty_y[i] = 2 * trade_size
position = 1
}
if(signal[i] == -1 && position == 1){
# we are short the spread and need to buy
qty_x[i] = (vec[i] + prev_x_qty)
prev_x_qty = vec[i]
qty_y[i] = -2 * trade_size
position = -1
}
}

Resources