My goal is to create a function that, when looped over multiple variables of a data frame, will return a new data frame containing the percentages and 95% confidence intervals for each level of each variable.
As an example, if I applied this function to "cyl" and "am" from the mtcars data frame, I would want this as the final result:
variable level ci.95
1 cyl 4 34.38 (19.50, 53.11)
2 cyl 6 21.88 (10.35, 40.45)
3 cyl 8 43.75 (27.10, 61.94)
4 am 0 59.38 (40.94, 75.5)
5 am 1 40.62 (24.50, 59.06)
So far, I have a function that seems to work for a single variable; however, I have two issues that I'm hoping the community can help me with:
1. Generally "R-ifying" my code. I'm still an R novice. I've read enough posts to know that R enthusiasts generally discourage for loops, but I still really struggle with the apply functions (which seem to be the alternative to for loops in most cases).
2. Applying this function to a list of variables, resulting in a single data frame containing the returned values from the function for each level of each variable.
Here's where I'm at with my code so far:
t1.props <- function(x, data = NULL) {
  # Grab data frame and/or variable name
  if (!missing(data)) {
    var <- data[, deparse(substitute(x))]
  } else {
    var <- x
  }
  # Grab variable name for use in output
  var.name <- substitute(x)
  # Omit observations with missing data
  var.clean <- na.omit(var)
  # Number of nonmissing observations
  n <- length(var.clean)
  # Grab levels of variable
  levels <- sort(unique(var.clean))
  # Create an empty data frame to store values
  out <- data.frame(variable = NA,
                    level = NA,
                    ci.95 = NA)
  # Estimate prop, se, and ci for each level of the variable
  for (i in seq_along(levels)) {
    prop <- paste0("prop", i)
    se <- paste0("se", i)
    log.prop <- paste0("log.trans", i)
    log.se <- paste0("log.se", i)
    log.l <- paste0("log.l", i)
    log.u <- paste0("log.u", i)
    lcl <- paste0("lcl", i)
    ucl <- paste0("ucl", i)
    # Find the proportion for each level of the variable
    assign(prop, sum(var.clean == levels[i]) / n)
    # Find the standard error for each level of the variable
    assign(se, sd(var.clean == levels[i]) /
             sqrt(length(var.clean == levels[i])))
    # Perform a logit transformation of the original percentage estimate
    assign(log.prop, log(get(prop)) - log(1 - get(prop)))
    # Transform the standard error of the percentage to a standard error
    # of its logit transformation
    assign(log.se, get(se) / (get(prop) * (1 - get(prop))))
    # Calculate the lower and upper confidence bounds of the logit
    # transformation
    assign(log.l,
           get(log.prop) -
             qt(.975, (length(var.clean == levels[i]) - 1)) * get(log.se))
    assign(log.u,
           get(log.prop) +
             qt(.975, (length(var.clean == levels[i]) - 1)) * get(log.se))
    # Finally, perform inverse logit transformations to get the confidence bounds
    assign(lcl, exp(get(log.l)) / (1 + exp(get(log.l))))
    assign(ucl, exp(get(log.u)) / (1 + exp(get(log.u))))
    # Create a combined 95% CI variable for easy copy/paste into Word tables
    ci.95 <- paste0(round(get(prop) * 100, 2), " ",
                    "(", sprintf("%.2f", round(get(lcl) * 100, 2)), ", ",
                    round(get(ucl) * 100, 2), ")")
    # Populate the "out" data frame with values
    out <- rbind(out, c(as.character(var.name), levels[i], ci.95))
  }
  # Remove first (empty) row from out
  # (the NA placeholder row created above)
  if (is.na(out[1, 1])) {
    out <- out[-1, ]
    rownames(out) <- 1:nrow(out)
  }
  out
}
data(mtcars)
t1.props(cyl, mtcars)
I appreciate any help or advice you have to offer.
You can also keep the function mainly intact and use lapply over it:
vars <- c("cyl", "am")
lapply(vars, t1.props, data=mtcars)
[[1]]
variable level ci.95
1 cyl 4 34.38 (19.50, 53.11)
2 cyl 6 21.88 (10.35, 40.45)
3 cyl 8 43.75 (27.10, 61.94)
[[2]]
variable level ci.95
1 am 0 59.38 (40.94, 75.5)
2 am 1 40.62 (24.50, 59.06)
And combine them all into one data frame with:
lst <- lapply(vars, t1.props, data=mtcars)
do.call(rbind,lst)
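This should reproduce the five-row data frame requested at the top of the question (a sketch; the rownames(df) <- NULL line just tidies the row labels after stacking):
df <- do.call(rbind, lst)
rownames(df) <- NULL
df
#   variable level                ci.95
# 1      cyl     4 34.38 (19.50, 53.11)
# 2      cyl     6 21.88 (10.35, 40.45)
# 3      cyl     8 43.75 (27.10, 61.94)
# 4       am     0  59.38 (40.94, 75.5)
# 5       am     1 40.62 (24.50, 59.06)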
For this to work, you must simplify the var and var.name assignments, because lapply passes the variable names as character strings (so deparse(substitute(x)) no longer applies):
t1.props <- function(x, data = NULL) {
  # Grab data frame and/or variable name
  if (!missing(data)) {
    var <- data[, x]
  } else {
    var <- x
  }
  # Grab variable name for use in output
  var.name <- x
  # ... the rest of the function is unchanged from the question ...
}
The nice thing about all the functions you're using is that they are already vectorized (except sd and qt, but you can easily vectorize them for specific arguments with Vectorize). This means you can pass vectors to them without needing to write a single loop. I left out the parts of your function that deal with preparing the input and prettying up the output.
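As a quick illustration of Vectorize (a side note, not part of the original answer; vqt is just an illustrative name, and note that base qt already recycles over df, so the wrapper is mostly belt-and-braces here):
vqt <- Vectorize(qt, "df")
vqt(0.975, df = c(5, 10, 30))  # three critical values at once
qt(0.975, df = c(5, 10, 30))   # base qt returns the same vector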
t1.props <- function(var, data = mtcars) {
  N <- nrow(data)
  levels <- names(table(data[, var]))
  count <- unclass(table(data[, var]))       # counts
  prop <- count / N                          # proportions
  se <- sqrt(prop * (1 - prop) / (N - 1))    # standard errors of props.
  lprop <- log(prop) - log(1 - prop)         # logit of the proportions
  lse <- se / (prop * (1 - prop))            # se on the logit scale
  stat <- Vectorize(qt, "df")(0.975, N - 1)  # t critical values
  llower <- lprop - stat * lse               # logit-scale lower bound
  lupper <- lprop + stat * lse               # logit-scale upper bound
  lower <- exp(llower) / (1 + exp(llower))   # lower ci
  upper <- exp(lupper) / (1 + exp(lupper))   # upper ci
  data.frame(variable = var,
             level = levels,
             perc = 100 * prop,
             lower = 100 * lower,
             upper = 100 * upper)
}
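A quick single-variable check (my addition; the numbers match the cyl rows of the combined output shown below):
t1.props("cyl")
#   variable level   perc    lower    upper
# 4      cyl     4 34.375 19.49961 53.11130
# 6      cyl     6 21.875 10.34883 40.44691
# 8      cyl     8 43.750 27.09672 61.94211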
So, the only explicit applying/looping comes when you apply the function to multiple variables as follows
## Apply your function to two variables
do.call(rbind, lapply(c("cyl", "am"), t1.props))
# variable level perc lower upper
# 4 cyl 4 34.375 19.49961 53.11130
# 6 cyl 6 21.875 10.34883 40.44691
# 8 cyl 8 43.750 27.09672 61.94211
# 0 am 0 59.375 40.94225 75.49765
# 1 am 1 40.625 24.50235 59.05775
As for the loop in your code, it's not that it matters much for efficiency; the point is how much easier code is to read when it's concise - and the apply functions offer a lot of simple one-line solutions.
I think the most important thing to change in your code is the use of assign and get. Instead, you can store variables in lists or another data structure, and use setNames, names<-, or names(...) <- to name the components when needed.
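For example, here is a minimal sketch of that idea, reusing the levels, var.clean, and n objects from inside your function:
props <- setNames(vector("list", length(levels)),
                  paste0("prop", seq_along(levels)))
for (i in seq_along(levels)) {
  props[[i]] <- sum(var.clean == levels[i]) / n  # store by position...
}
props[["prop1"]]  # ...retrieve by name, no get() needed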
Just a touch of backstory: I've been learning biostatistics for the past 4-5 months at university, with 6 months of biomathematics before that. I only started deep-diving into programming around 5 days ago.
I've been trying to re-implement t.test() with my own function.
test2 = function(t, u){
  T = (mean(t) - u) / (sd(t) / sqrt(length(t)))
  t1 = round(T, digits = 5)
  df = length(t)
  cat(paste('t - value =', t1,
            '\n', 'df =', df - 1,
            '\n', 'Alternative hypothesis: the sample mean is not equal to the hypothetical mean'))
}
I searched for the formula for the p-value and found one, but when I used it, my value was different from the one given by t.test().
The t-value and the df do match t.test().
I highly appreciate any help, thank you.
P.s.: Don't worry about the last line of the cat() call; it just states the alternative hypothesis.
The p-value can be derived from the cumulative distribution function of the t distribution, pt. Using this, and making the notation more conventional with sample x and population mean mu, we can use something like:
test2 <- function(x, u){
  t <- (mean(x) - u) / (sd(x) / sqrt(length(x)))
  df <- length(x) - 1
  cat('t-value =', t, ', df =', df, ', p =', 2 * (1 - pt(q = t, df = df)), '\n')
}
set.seed(123) # remove this for other random values
## random sample
x <- rnorm(10, mean=5.5)
## population mean
mu <- 5
## own function
test2(x, mu)
## one sample t-test from R
t.test(x, mu=mu)
For our own test2 we get:
t-value = 1.905175 , df = 9, p = 0.08914715
and for R's t.test
One Sample t-test
data: x
t = 1.9052, df = 9, p-value = 0.08915
alternative hypothesis: true mean is not equal to 5
95 percent confidence interval:
4.892330 6.256922
sample estimates:
mean of x
5.574626
The definitive source for what R is doing is the source code. If you look at the source of stats:::t.test.default (which you can view by typing stats:::t.test.default into the console, without parentheses, and hitting enter), you'll see that for a one-sample test like the one you're running above, it does the following:
nx <- length(x)
mx <- mean(x)
vx <- var(x)
df <- nx - 1
stderr <- sqrt(vx/nx)
tstat <- (mx - mu)/stderr
if (alternative == "less") {
  pval <- pt(tstat, df)
} else if (alternative == "greater") {
  pval <- pt(tstat, df, lower.tail = FALSE)
} else {
  pval <- 2 * pt(-abs(tstat), df)
}
These are the relevant pieces (there's a lot more code in there, too).
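One caveat worth adding (my note, not part of the original answer): the 2 * (1 - pt(t, df)) form used in test2 above is only correct when t is positive, whereas R's 2 * pt(-abs(tstat), df) handles both signs and is numerically safer in the far tail. A quick check with the values from the example:
tstat <- 1.905175; df <- 9
2 * pt(-abs(tstat), df)       # 0.08914715, R's form
2 * (1 - pt(abs(tstat), df))  # same value once abs() is applied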
Very new to R and RStudio and the whole concept of a coding language. I'm trying to create a reproducible example so I can properly ask a question.
The first error says:
Error in colSums(cTrain * log(pTrain) + cCar * log(pCar) + cSM * log(pSM)) :
'x' must be an array of at least two dimensions
Using this code, where can I fix this so that 'x' can have two dimensions?
mydata <- structure(list(LUGGAGE=c(0,1,0,1,0), GA=c(0,0,0,0,0), TRAIN_AV=c(1,1,1,1,1), CAR_AV=c(1,1,1,1,1), SM_AV=c(1,1,1,1,1),
TRAIN_TT=c(114,142,235,193,227), TRAIN_CO=c(40,109,124,90,94),
SM_TxT=c(44,91,179,119,108), SM_CO=c(46,132,132,127,118),
CAR_TT=c(140,110,170,150,286), CAR_CO=c(123,104,80,95,169), CHOICE=c(2,2,3,3,2)),
.Names=c("Luggage","GA","TRAIN_AV","CAR_AV","SM_AV","TRAIN_TT","TRAIN_CO","SM_TT","SM_CO","CAR_TT","CAR_CO","CHOICE"),
row.names=c(NA,5L), class="data.frame")
## Initial value of parameters
initPar <- 8
### Log-Likelihood Function of the Logit Model
library("maxLik")
loglik <- function(x) {
  ## Parameters
  # Alternative Specific Constants
  asc_train <- x[1]
  asc_sm <- x[2]
  # Travel Time to Destination
  ttime <- x[3]
  # Travel Cost to Destination
  tcost_train <- x[4]
  tcost_car <- x[5]
  tcost_sm <- x[6]
  # Effect of Swiss Annual Season Ticket
  ga <- x[7]
  # Effect of luggage
  luggage <- x[8]
  ## Log-Likelihood Variable
  LL <- 0
  ## Utility Function Vin
  train <- asc_train*matrix(1, nrow=nrow(mydata), ncol=1) + tcost_train*mydata$TRAIN_CO + ttime*mydata$TRAIN_TT/100 + ga*mydata$GA + luggage*mydata$LUGGAGE
  car <- tcost_car*mydata$CAR_CO + ttime*mydata$CAR_TT/100 + luggage*mydata$LUGGAGE
  sm <- asc_sm*matrix(1, nrow=nrow(mydata), ncol=1) + tcost_sm*mydata$SM_CO + ttime*mydata$SM_TT/100 + ga*mydata$GA + luggage*mydata$LUGGAGE
  ## exp(Vin) and Control for Mode Availability
  train <- mydata$TRAIN_AV * exp(train)
  car <- mydata$CAR_AV * exp(car)
  sm <- mydata$SM_AV * exp(sm)
  ## Choice Probabilities
  deno <- (train + car + sm)
  ## Individual Choice Probabilities
  pTrain <- mydata$TRAIN_AV * (train / deno)
  pCar <- mydata$CAR_AV * (car / deno)
  pSM <- mydata$SM_AV * (sm / deno)
  pTrain <- (pTrain != 0) * pTrain + (pTrain == 0)
  pCar <- (pCar != 0) * pCar + (pCar == 0)
  pSM <- (pSM != 0) * pSM + (pSM == 0)
  ## Choice Results
  cTrain <- mydata$CHOICE == "1"
  cCar <- mydata$CHOICE == "3"
  cSM <- mydata$CHOICE == "2"
  ## Log-Likelihood Function
  LL <- colSums(cTrain*log(pTrain) + cCar*log(pCar) + cSM*log(pSM))
}
### Maximization of Log-Likelihood Function ###
# Parameter Optimization
result <- maxLik(loglik, start=numeric(initPar))
# Parameter Estimation, Hessian Matrix Calculation
parameters <- result$estimate
hessianMatrix <- result$hessian
# T-Statistic Calculation
tval <- parameters/sqrt(-diag(solve(hessianMatrix)))
# L(0), Log-Likelihood When All parameters = 0
L0 <- loglik(numeric(initPar))
# LL, Maximum Likelihood
LL <- result$maximum
Nicely asked question with a reproducible example; upvoted!
Your problem was very simple. Your function looks for a variable called mydata$LUGGAGE that doesn't exist. R is case sensitive and your column is called mydata$Luggage.
All you have to do is
names(mydata)[1] <- "LUGGAGE"
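Or, if you prefer, a one-line alternative (my suggestion, not from the original answer) is to normalise every column name at once, which also guards against any other case mismatches between the data and the likelihood function:
names(mydata) <- toupper(names(mydata))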
Now run your script and you should get this result:
result <- maxLik(loglik, start=numeric(initPar))
result
# Maximum Likelihood estimation
# Newton-Raphson maximisation, 30 iterations
# Return code 2: successive function values within tolerance limit
# Log-Likelihood: -1.744552e-07 (8 free parameter(s))
# Estimate(s): -277.7676 -250.6531 8.651811 -1.680196 -4.208955 -1.281697 0 354.4692
The question is given like this:
Read the file diabetes.csv. There are two variables called BMI and Outcome. The variable Outcome takes on only two values: 0 and 1. Conduct a non-parametric two sample test for the hypothesis that the standard deviation of BMI is the same for both Outcome values
bmi <- diabetes$BMI
bmi
outcome <- diabetes$Outcome
outcome
n <- length(bmi)
# tstat
tstat <- ???
# Describe the population and draw synthetic samples
f1 <- function() {
  x <- c(bmi, outcome)
  x <- sample(x)
  m1 <- sd(x[1:n])
  m2 <- sd(x[(n+1):length(x)])
  return(m1 - m2)
}
# Create sampling distribution
sdist <- replicate(10000, f1())
plot(density(sdist))
# Gap
gap <- abs(mean(sdist) - tstat)
abline(v = mean(sdist) + c(-1,1) * gap, col = "dark orange")
s1 <- sdist[sdist <(mean(sdist - gap)) | sdist >(mean(sdist + gap))]
pvalue <- length(s1) / length(sdist)
pvalue
The data is in some dataset called "diabetes". My question is how to represent the "t-statistic" since the outcome is binary?
Use this code:
# Sort the table diabetes in ascending order of Outcome to separate the BMI
# values with Outcome = 0 from the BMI values with Outcome = 1
diabetes = diabetes[order(diabetes$Outcome),]
View(diabetes)
# Find the number of values with outcome = 0
n = length(which(diabetes$Outcome == 0))
# Find total number of rows
l = length(diabetes$BMI)
# Find BMI values to create the sample later on
g = diabetes$BMI
# Create function to take the values of BMI and shuffle it every time and
# to find the difference between the standard deviations
f1 = function() {
  x = sample(g)
  z = abs(sd(x[1:n]) - sd(x[(n+1):l]))
  return(z)
}
# Replicate the function several times
dist = replicate(100000,f1())
# Plot density of distribution
plot(density(dist))
polygon(density(dist),col="green")
diabetes0 = diabetes[diabetes$Outcome == 0,]
diabetes1 = diabetes[diabetes$Outcome == 1,]
View(diabetes0)
View(diabetes1)
# Find the difference between standard deviation of BMI when outcome = 0 and
# when outcome = 1
tstat = abs(sd(diabetes0$BMI) - sd(diabetes1$BMI))
tstat
abline(v=tstat)
rside = dist[dist>tstat]
pvalue = length(rside)/length(dist)
pvalue
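One small addition (my note, not part of the original answer): fix the RNG seed before the permutations so the p-value is reproducible across runs:
set.seed(1)  # any fixed seed will do
dist = replicate(100000, f1())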
I am trying to run a multi-level model on multiply imputed data (created with Amelia); the sample is clustered, with 24 groups and N = 150.
library("ZeligMultilevel")
ML.model.0 <- zelig(dv ~ 1 + tag(1|group), model = "ls.mixed",
                    data = a.out$imputations)
summary(ML.model.0)
This code produces the following error code:
Error in object[[1]]$result$call :
$ operator not defined for this S4 class
If I run an OLS regression, it works:
model.0 <- zelig(dv~1, model="ls", data=a.out$imputations)
m.0 <- coef(summary(model.0))
print(m.0, digits = 2)
Value Std. Error t-stat p-value
[1,] 45 0.34 130 2.6e-285
I am happy to provide a working example.
require(Zelig)
require(Amelia)
require(ZeligMultilevel)
data(freetrade)
length(freetrade$country) #grouping variable
#Imputation of missing data
a.out <- amelia(freetrade, m=5, ts="year", cs="country")
# Models: (1) OLS; (2) multi-level
model.0 <- zelig(polity~1, model="ls", data=a.out$imputations)
m.0 <- coef(summary(model.0))
print(m.0, digits = 2)
ML.model.0 <- zelig(polity~1 + tag(1|country), model="ls.mixed", data=a.out$imputations)
summary(ML.model.0)
I think the issue may be with how Zelig interfaces with Amelia's mi class. Therefore, I turned toward an alternative R package: lme4.
require(lme4)
write.amelia(obj=a.out, file.stem="inmi", format="csv", na="NA")
diff <- list(5) # a list to store each model, 5 is the number of the imputed datasets
for (i in 1:5) {
  file.name <- paste("inmi", 5, ".csv", sep = "")
  data.to.use <- read.csv(file.name)
  diff[[5]] <- lmer(polity ~ 1 + (1 | country),
                    data = data.to.use)
}
diff
The result is the following:
[[1]]
[1] 5
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
[[5]]
Linear mixed model fit by REML
Formula: polity ~ 1 + (1 | country)
Data: data.to.use
AIC BIC logLik deviance REMLdev
1006 1015 -499.9 1002 999.9
Random effects:
Groups Name Variance Std.Dev.
country (Intercept) 14.609 3.8222
Residual 17.839 4.2236
Number of obs: 171, groups: country, 9
Fixed effects:
Estimate Std. Error t value
(Intercept) 2.878 1.314 2.19
The results remain the same when I replace diff[[5]] with diff[[4]], diff[[3]], etc. Still, I am wondering whether these are actually the results for the combined dataset or for a single imputed dataset. Any thoughts? Thanks!
I modified the summary function for this object (fetched the source and opened the ./R/summary.R file). I added some curly braces to make the code flow and changed a getcoef to coef. This should work for this particular case, but I'm not sure if it's general. The function getcoef searches for a slot coef3, which I have never seen. Perhaps @BenBolker can cast an eye here? I can't guarantee this is what the result should look like, but the output looks legit to me. Perhaps you could contact the package authors to correct this in a future version.
summary(ML.model.0)
Model: ls.mixed
Number of multiply imputed data sets: 5
Combined results:
Call:
zelig(formula = polity ~ 1 + tag(1 | country), model = "ls.mixed",
data = a.out$imputations)
Coefficients:
Value Std. Error t-stat p-value
[1,] 2.902863 1.311427 2.213515 0.02686218
For combined results from datasets i to j, use summary(x, subset = i:j).
For separate results, use print(summary(x), subset = i:j).
Modified function:
summary.MI <- function (object, subset = NULL, ...) {
  if (length(object) == 0) {
    stop('Invalid input for "subset"')
  } else {
    if (length(object) == 1) {
      return(summary(object[[1]]))
    }
  }
  # Roman: This function isn't fetching coefficients robustly. Something goes wrong. Contact package author.
  getcoef <- function(obj) {
    # S4
    if (!isS4(obj)) {
      coef(obj)
    } else {
      if ("coef3" %in% slotNames(obj)) {
        obj@coef3
      } else {
        obj@coef
      }
    }
  }
  #
  res <- list()
  # Get indices
  subset <- if (is.null(subset)) {
    1:length(object)
  } else {
    c(subset)
  }
  # Compute the summary of all objects
  for (k in subset) {
    res[[k]] <- summary(object[[k]])
  }
  # Answer
  ans <- list(
    zelig = object[[1]]$name,
    call = object[[1]]$result@call,
    all = res
  )
  #
  coef1 <- se1 <- NULL
  #
  for (k in subset) {
    # tmp <- getcoef(res[[k]]) # Roman: I changed this to coef, not 100% sure if the output is the same
    tmp <- coef(res[[k]])
    coef1 <- cbind(coef1, tmp[, 1])
    se1 <- cbind(se1, tmp[, 2])
  }
  rows <- nrow(coef1)
  Q <- apply(coef1, 1, mean)
  U <- apply(se1^2, 1, mean)
  B <- apply((coef1 - Q)^2, 1, sum) / (length(subset) - 1)
  var <- U + (1 + 1/length(subset)) * B
  nu <- (length(subset) - 1) * (1 + U/((1 + 1/length(subset)) * B))^2
  coef.table <- matrix(NA, nrow = rows, ncol = 4)
  dimnames(coef.table) <- list(rownames(coef1),
                               c("Value", "Std. Error", "t-stat", "p-value"))
  coef.table[, 1] <- Q
  coef.table[, 2] <- sqrt(var)
  coef.table[, 3] <- Q/sqrt(var)
  coef.table[, 4] <- pt(abs(Q/sqrt(var)), df = nu, lower.tail = FALSE) * 2
  ans$coefficients <- coef.table
  ans$cov.scaled <- ans$cov.unscaled <- NULL
  for (i in 1:length(ans)) {
    if (is.numeric(ans[[i]]) && !names(ans)[i] %in% c("coefficients")) {
      tmp <- NULL
      for (j in subset) {
        r <- res[[j]]
        tmp <- cbind(tmp, r[[pmatch(names(ans)[i], names(res[[j]]))]])
      }
      ans[[i]] <- apply(tmp, 1, mean)
    }
  }
  class(ans) <- "summaryMI"
  ans
}
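For reference (my annotation, not part of the original answer): the combining step in the middle of the function implements Rubin's rules for multiple imputation. With m imputed datasets, per-dataset estimates Q_k and squared standard errors U_k:
\bar{Q} = \frac{1}{m}\sum_{k=1}^{m} Q_k, \qquad \bar{U} = \frac{1}{m}\sum_{k=1}^{m} U_k, \qquad B = \frac{1}{m-1}\sum_{k=1}^{m}(Q_k - \bar{Q})^2
and the pooled variance and degrees of freedom are
T = \bar{U} + \Bigl(1 + \frac{1}{m}\Bigr)B, \qquad \nu = (m-1)\Bigl(1 + \frac{\bar{U}}{(1+1/m)B}\Bigr)^2
which is exactly what the Q, U, B, var, and nu lines above compute.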
I am trying to build a rolling regression function based on the example here, but in addition to returning the predicted values, I would like to return some rolling model diagnostics (i.e. coefficients, t-values, and maybe R^2). I would like the results to be returned in discrete objects based on the type of result. The example provided in the link above successfully creates the rolling predictions, but I need some assistance packaging and writing out the rolling model diagnostics:
In the end, I would like the function to return the following objects:
Predictions
Coefficients
T values
R^2
Below is the code:
require(zoo)
require(dynlm)
## Create Some Dummy Data
set.seed(12345)
x <- rnorm(mean=3,sd=2,100)
y <- rep(NA,100)
y[1] <- x[1]
for(i in 2:100) y[i]=1+x[i-1]+0.5*y[i-1]+rnorm(1,0,0.5)
int <- 1:100
dummydata <- data.frame(int=int,x=x,y=y)
zoodata <- as.zoo(dummydata)
rolling.regression <- function(series) {
  mod <- dynlm(formula = y ~ L(y) + L(x), data = as.zoo(series)) # get model
  nextOb <- max(series[,'int']) + 1 # To get the first row that follows the window
  if (nextOb <= nrow(zoodata)) {    # You won't predict the last one
    # 1) Make Predictions
    predicted <- predict(mod, newdata = data.frame(x = zoodata[nextOb,'x'], y = zoodata[nextOb,'y']))
    attributes(predicted) <- NULL
    c(predicted = predicted, square.res <- (predicted - zoodata[nextOb,'y'])^2)
    # 2) Extract coefficients
    #coefficients <- coef(mod)
    # 3) Extract rolling coefficient t values
    #tvalues <- ????(mod)
    # 4) Extract rolling R^2
    #rsq <-
  }
}
rolling.window <- 20
results.z <- rollapply(zoodata, width=rolling.window, FUN=rolling.regression, by.column=F, align='right')
So after figuring out how to extract t-values from the model (i.e. mod), what do I need to do to make the function return three (3) separate objects (i.e. predictions, coefficients, and t-values)?
I am fairly new to R, really new to functions, and extremely new to zoo, and I'm stuck.
Any assistance would be greatly appreciated.
I hope I understood you correctly, but here is a small edit of your function:
rolling.regression <- function(series) {
  mod <- dynlm(formula = y ~ L(y) + L(x), data = as.zoo(series)) # get model
  nextOb <- max(series[,'int']) + 1 # To get the first row that follows the window
  if (nextOb <= nrow(zoodata)) {    # You won't predict the last one
    # 1) Make Predictions
    predicted <- predict(mod, newdata = data.frame(x = zoodata[nextOb,'x'], y = zoodata[nextOb,'y']))
    attributes(predicted) <- NULL
    # Solution 1; quicker to write
    # c(predicted = predicted,
    #   square.res = (predicted - zoodata[nextOb,'y'])^2,
    #   summary(mod)$coef[, 1],
    #   summary(mod)$coef[, 3],
    #   AdjR = summary(mod)$adj.r.squared)
    # Solution 2; get column names right
    c(predicted = predicted,
      square.res = (predicted - zoodata[nextOb,'y'])^2,
      coef_intercept = summary(mod)$coef[1, 1],
      coef_Ly = summary(mod)$coef[2, 1],
      coef_Lx = summary(mod)$coef[3, 1],
      tValue_intercept = summary(mod)$coef[1, 3],
      tValue_Ly = summary(mod)$coef[2, 3],
      tValue_Lx = summary(mod)$coef[3, 3],
      AdjR = summary(mod)$adj.r.squared)
  }
}
rolling.window <- 20
results.z <- rollapply(zoodata, width=rolling.window, FUN=rolling.regression, by.column=F, align='right')
head(results.z)
predicted square.res coef_intercept coef_Ly coef_Lx tValue_intercept tValue_Ly tValue_Lx AdjR
20 10.849344 0.721452 0.26596465 0.5798046 1.049594 0.38309211 7.977627 13.59831 0.9140886
21 12.978791 2.713053 0.26262820 0.5796883 1.039882 0.37741499 7.993014 13.80632 0.9190757
22 9.814676 11.719999 0.08050796 0.5964808 1.073941 0.12523824 8.888657 15.01353 0.9340732
23 5.616781 15.013297 0.05084124 0.5984748 1.077133 0.08964998 9.881614 16.48967 0.9509550
24 3.763645 6.976454 0.26466039 0.5788949 1.068493 0.51810115 11.558724 17.22875 0.9542983
25 9.433157 31.772658 0.38577698 0.5812665 1.034862 0.70969330 10.728395 16.88175 0.9511061
To see how it works, make a small example with a regression:
x <- rnorm(1000); y <- 2*x + rnorm(1000)
reg <- lm(y ~ x)
summary(reg)$coef
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.02694322 0.03035502 0.8876033 0.374968
x 1.97572544 0.03177346 62.1816310 0.000000
As you can see, calling summary first and then taking the coefficients from it (coef(summary(reg)) works as well) gives you a table with estimates, standard errors, and t-values. Estimates are stored in column 1 of that table, t-values in column 3. And that's how I obtain them in the updated rolling.regression function.
EDIT
I updated my solution; now it also contains the adjusted R^2. If you just want the plain R^2, drop the adj. prefix.
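A quick sketch of the two flavours, using the reg model from the example above:
summary(reg)$r.squared       # plain R^2
summary(reg)$adj.r.squared   # adjusted R^2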
EDIT 2
A quick-and-dirty hack for naming the columns:
rolling.regression <- function(series) {
  mod <- dynlm(formula = y ~ L(y) + L(x), data = as.zoo(series)) # get model
  nextOb <- max(series[,'int']) + 1 # To get the first row that follows the window
  if (nextOb <= nrow(zoodata)) {    # You won't predict the last one
    # 1) Make Predictions
    predicted <- predict(mod, newdata = data.frame(x = zoodata[nextOb,'x'], y = zoodata[nextOb,'y']))
    attributes(predicted) <- NULL
    # Get variable names
    strVar <- c("Intercept", paste0("L", 1:(nrow(summary(mod)$coef) - 1)))
    vec <- c(predicted = predicted,
             square.res = (predicted - zoodata[nextOb,'y'])^2,
             AdjR = summary(mod)$adj.r.squared,
             summary(mod)$coef[, 1],
             summary(mod)$coef[, 3])
    names(vec)[4:length(vec)] <- c(paste0("Coef_", strVar), paste0("tValue_", strVar))
    vec
  }
}
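As before, you would run this renamed version the same way (repeating the rollapply call from earlier):
rolling.window <- 20
results.z <- rollapply(zoodata, width = rolling.window, FUN = rolling.regression, by.column = F, align = 'right')
head(results.z)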