automatically convert polynomial to expression in ggplot2 title

automatically convert polynomial to expression in ggplot2 title - r

Here is some code I am using to auto generate some regression fits;
require(ggplot2)
# Prep data
nPts = 200
prepared=runif(nPts,0,10)
rich=5-((prepared-5)^2)/5 + 5*runif(length(prepared))
df <- data.frame(rich=rich, prepared=prepared)
deg = 1 # User variable
lm <- lm(df$rich ~ poly(df$prepared, deg, raw=T))
# Create expression
coefs <- lm$coefficients
eq <- paste0(round(coefs,2),'*x^', 0:length(coefs), collapse='+') # (1)
pl <- ggplot(df, aes(x=prepared, y=rich)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ poly(x,deg), size = 1) +
ggtitle(eq) # (2)
print(pl)
This code should run (with ggplot2 installed). The problem is in the lines marked 1 and 2:
Generates a string representation of the polynomial
Sets the string as the plot title
As it stand my title is "6.54*x^0+0.09*x^1+6.54*x^2". However I want a more attractive rendering so that (2) is more like would be seen with:
ggtitle(expression(6.54*x^0+0.09*x^1+6.54*x^2)) # (2')
i.e, powers raised, multiplications dropped etc. Any help much appreciated!

Here's a function that I built to solve my problem;
poly_expression <- function(coefs){
# build the string
eq <- paste0(round(coefs,2),'*x^', (1:length(coefs)-1), collapse='+')
# some cleaning
eq <- gsub('\\+\\-','-', eq) # +-n -> -n
eq <- gsub('\\*x\\^0','', eq) # n*x^0 <- n
eq <- gsub('x\\^1','x', eq) # n*x^1 <- nx
eq <- parse(text=eq) # return expressions
return(eq)
}
Then ggtitle(poly_expression(coefs)) renders as required.

Related

Unable to make code run - Error in `lag()`:! `n`

I need help solving this error, I am not sure how to, but it seems as when I run the "dp_stat" in the end, an error appears when I do my OLS model. Essentially I try to find the cumulative difference between a benchmark and a given individual predictive model. Here dp is my independent variable and sg is my dependent variable. datanu is my excel data. I'm not sure how to attach the data here, however here is a link to the excel and the code: https://drive.google.com/drive/folders/12BOuNBODURIP7CQIBZWMmHFc1d7zXHxN?usp=sharing If anyone has a fix it would mean the world!
"#Error in lag():! n must be a positive integer, not a double vector of length 1."
rm(list= ls()) # Clear global environment
invisible(gc()) # Free up unused R-occupied memory
cat("\014") # Clear console output: equivalent to ctrl + L
library("tseries")
library("readxl")
library("Metrics")
library("lubridate")
library("ggplot2")
library("data.table")
library("dyn")
library("reshape2")
#header TRUE fordi første row er navne.
datanu <- read_xlsx("~/Documents/6.semester/Bachelor/Data/datanu.xlsx",
na = "NaN",
sheet = "datax",
)
myts <- ts(datanu, start=c(1872, 1), end=c(2020, 12), frequency=12)
plot(myts[, c("dp", "dy", "ep", "de")])
get_statistics <- function(myts, dp, sg, h=1, start=1872, end=2020, est_periods_OOS = 20) {
#### IS ANALYSIS
#1. Historical mean model for en portefølje
avg <- mean(window(myts, start, end)[, sg], na.rm=TRUE)
IS_error_N <- (window(myts, start, end)[, sg] - avg)
#2. OLS model
#reg <- dyn$lm(sg ~ lag(as.numeric(dp), 1), data=window(myts, start, end))
reg <- dyn$lm(eval(parse(text=sg)) ~ lag(eval(parse(text=dp)), -1), data=window(myts, start, end)) #Error in `lag()`:! `n` must be a positive integer, not a double vector of length 1.
IS_error_A <- reg$residuals
#OOS ANALYSIS
OOS_error_N <- numeric(end - start - est_periods_OOS)
OOS_error_A <- numeric(end - start - est_periods_OOS)
#anvender kun information op til forecasten er lavet.
j <- 0
for (i in (start + est_periods_OOS):(end-1)) {
j <- j + 1
#Get the actual ERP that you want to predict
actual_ERP <- as.numeric(window(myts, i+1, i+1)[, sg])
#1. Historical mean model
OOS_error_N[j] <- actual_ERP - mean(window(myts, start, i)[, sg], na.rm=TRUE)
#2. OLS model
reg_OOS <- dyn$lm(eval(parse(text=sg)) ~ lag(eval(parse(text=dp)), -1),
data=window(myts, start, i))
#Compute_error
df <- data.frame(x=as.numeric(window(myts, i, i)[, dp]))
names(df) <- dp
pred_ERP <- predict.lm(reg_OOS, newdata=df)
OOS_error_A[j] <- pred_ERP - actual_ERP
}
#Compute statistics
MSE_N <- mean(OOS_error_N^2)
MSE_A <- mean(OOS_error_A^2)
T <- length(!is.na(myts[, sg]))
OOS_R2 <- 1 - MSE_A/MSE_N
#Is the -1 enough (maybe -2 needed because of lag)?
OOS_oR2 <- OOS_R2 - (1-OOS_R2)*(reg$df.residual)/(T - 1)
dRMSE <- sqrt(MSE_N) - sqrt(MSE_A)
##
#### CREATE PLOT
IS <- cumsum(IS_error_N[2:length(IS_error_N)]^2)-cumsum(IS_error_A^2)
OOS <- cumsum(OOS_error_N^2)-cumsum(OOS_error_A^2)
df <- data.frame(x=seq.int(from=start + 1 + est_periods_OOS, to=end),
IS=IS[(1 + est_periods_OOS):length(IS)],
OOS=OOS) #Because you lose one observation due to the lag
#Shift IS errors vertically, so that the IS line begins
# at zero on the date of first OOS prediction. (se Goyal/Welch (2008, side 1465))
df$IS <- df$IS - df$IS[1]
df <- melt(df, id.var="x")
plotGG <- ggplot(df) +
geom_line(aes(x=x, y=value,color=variable)) +
geom_rect(data=data.frame(),#Needed by ggplot2, otherwise not transparent
aes(xmin=2008, xmax=2010,ymin=-0.2,ymax=0.2),
fill='red',
alpha=0.1) +
scale_y_continuous('Cumulative SSE Difference', limits=c(-0.2, 0.2)) +
scale_x_continuous('Year')
##
return(list(IS_error_N = IS_error_N,
IS_error_A = reg$residuals,
OOS_error_N = OOS_error_N,
OOS_error_A = OOS_error_A,
IS_R2 = summary(reg)$r.squared,
IS_aR2 = summary(reg)$adj.r.squared,
OOS_R2 = OOS_R2,
OOS_oR2 = OOS_oR2,
dRMSE = dRMSE,
plotGG = plotGG))
}
dp_stat <- get_statistics(myts, "dp", "sg", start=1872)
dp_stat$plotGG

As the error message states, n must be a positive integer, not a double vector of length 1. The error comes from you providing n = -1 (i.e., a negative number) as an argument. I assume your idea is to have a negative number of positions to lag by. However, the lag() function only accepts a positive number of lag positions. Instead of lag(), you should use lead() with n = 1 to achieve the desired result.

'X' as an array of at least two dimensions

Very new to R and RStudio and the whole concept of coding language. I'm trying to create reproducible code so I can properly ask a question.
The first error says:
Error in colSums(cTrain * log(pTrain) + cCar * log(pCar) + cSM * log(pSM)) :
'x' must be an array of at least two dimensions
Using this code, where can I fix this so that 'x' can have two dimensions?
mydata <- structure(list(LUGGAGE=c(0,1,0,1,0), GA=c(0,0,0,0,0), TRAIN_AV=c(1,1,1,1,1), CAR_AV=c(1,1,1,1,1), SM_AV=c(1,1,1,1,1),
TRAIN_TT=c(114,142,235,193,227), TRAIN_CO=c(40,109,124,90,94),
SM_TxT=c(44,91,179,119,108), SM_CO=c(46,132,132,127,118),
CAR_TT=c(140,110,170,150,286), CAR_CO=c(123,104,80,95,169), CHOICE=c(2,2,3,3,2)),
.Names=c("Luggage","GA","TRAIN_AV","CAR_AV","SM_AV","TRAIN_TT","TRAIN_CO","SM_TT","SM_CO","CAR_TT","CAR_CO","CHOICE"),
row.names=c(NA,5L), class="data.frame")
## Initial value of parameters
initPar <- 8
### Log-Likelihood Function of the Logit Model
library("maxLik")
loglik <- function(x) {
## Parameters
# Alternative Specific Constants
asc_train <- x[1]
asc_sm <- x[2]
# Travel Time to Destination
ttime <- x[3]
# Travel Cost to Destination
tcost_train <- x[4]
tcost_car <- x[5]
tcost_sm <- x[6]
# Effect of Swiss Annual Season Ticket
ga <- x[7]
# Effect of luggage
luggage <- x[8]
## Log-Likelihood Variable
LL = 0
## Utility Function Vin
train <- asc_train*matrix(1, nrow=nrow(mydata), ncol = 1) + tcost_train*mydata$TRAIN_CO + ttime*mydata$TRAIN_TT/100 + ga*mydata$GA + luggage*mydata$LUGGAGE
car <- tcost_car*mydata$CAR_CO + ttime*mydata$CAR_TT/100 + luggage*mydata$LUGGAGE
sm <- asc_sm*matrix(1, nrow=nrow(mydata), ncol = 1) + tcost_sm*mydata$SM_CO + ttime*mydata$SM_TT/100 + ga*mydata$GA + luggage*mydata$LUGGAGE
## exp(Vin) and Control for Mode Availability
train <- mydata$TRAIN_AV *exp(train)
car <- mydata$CAR_AV *exp(car)
sm <- mydata$SM_AV *exp(sm)
## Choice Probabilities
deno <- (train + car + sm)
## Individual Choice Probabilities
pTrain <- mydata$TRAIN_AV *(train / deno)
pCar <- mydata$CAR_AV *(car / deno)
pSM <- mydata$SM_AV *(sm / deno)
pTrain <- (pTrain!=0) *pTrain + (pTrain==0)
pCar <- (pCar!=0) *pCar + (pCar==0)
pSM <- (pSM!=0) *pSM + (pSM==0)
## Choice Results
cTrain <- mydata$CHOICE == "1"
cCar <- mydata$CHOICE == "3"
cSM <- mydata$CHOICE == "2"
## Log-Likelihood Function
LL <- colSums(cTrain*log(pTrain) + cCar*log(pCar) + cSM*log(pSM))
}
### Maximization of Log-Likelihood Function ###
# Parameter Optimization
result <- maxLik(loglik, start=numeric(initPar))
# Parameter Estimation, Hessian Matrix Calculation
parameters <- result$estimate
hessianMatrix <- result$hessian
# T-Statistic Calculation
tval <- parameters/sqrt(-diag(solve(hessianMatrix)))
# L(0), Log-Likelihood When All parameters = 0
L0 <- loglik(numeric(initPar))
# LL, Maximumum Likelihood
LL <- result$maximum

Nicely asked question with a reproducible example; upvoted!
Your problem was very simple. Your function looks for a variable called mydata$LUGGAGE that doesn't exist. R is case sensitive and your column is called mydata$Luggage.
All you have to do is
names(mydata)[1] <- "LUGGAGE"
Now run your script and you should get this result:
result <- maxLik(loglik, start=numeric(initPar))
result
# Maximum Likelihood estimation
# Newton-Raphson maximisation, 30 iterations
# Return code 2: successive function values within tolerance limit
# Log-Likelihood: -1.744552e-07 (8 free parameter(s))
# Estimate(s): -277.7676 -250.6531 8.651811 -1.680196 -4.208955 -1.281697 0 354.4692

Predict segmented lm outside of package

I have an array of outputs from hundreds of segmented linear models (made using the segmented package in R). I want to be able to use these outputs on new data, using the predict function. To be clear, I do not have the segmented linear model objects in my workspace; I just saved and reimported the relevant outputs (e.g. the coefficients and breakpoints). For this reason I can't simply use the predict.segmented function from the segmented package.
Below is a toy example based on this link that seems promising, but does not match the output of the predict.segmented function.
library(segmented)
set.seed(12)
xx <- 1:100
zz <- runif(100)
yy <- 2 + 1.5*pmax(xx-35,0) - 1.5*pmax(xx-70,0) +
15*pmax(zz-0.5,0) + rnorm(100,0,2)
dati <- data.frame(x=xx,y=yy,z=zz)
out.lm<-lm(y~x,data=dati)
o<-## S3 method for class 'lm':
segmented(out.lm,seg.Z=~x,psi=list(x=c(30,60)),
control=seg.control(display=FALSE))
# Note that coefficients with U in the name are differences in slopes, not slopes.
# Compare:
slope(o)
coef(o)[2] + coef(o)[3]
coef(o)[2] + coef(o)[3] + coef(o)[4]
# prediction
pred <- data.frame(x = 1:100)
pred$dummy1 <- pmax(pred$x - o$psi[1,2], 0)
pred$dummy2 <- pmax(pred$x - o$psi[2,2], 0)
pred$dummy3 <- I(pred$x > o$psi[1,2]) * (coef(o)[2] + coef(o)[3])
pred$dummy4 <- I(pred$x > o$psi[2,2]) * (coef(o)[2] + coef(o)[3] + coef(o)[4])
names(pred)[-1]<- names(model.frame(o))[-c(1,2)]
# compute the prediction, using standard predict function
# computing confidence intervals further
# suppose that the breakpoints are fixed
pred <- data.frame(pred, predict(o, newdata= pred,
interval="confidence"))
# Try prediction using the predict.segment version to compare
test <- predict.segmented(o)
plot(pred$fit, test, ylim = c(0, 100))
abline(0,1, col = "red")
# At least one segment not being predicted correctly?
Can I use the base r predict() function (not the segmented.predict() function) with the coefficients and break points saved from segmented linear models?
UPDATE
I figured out that the code above has issues (don't use it). Through some reverse engineering of the segmented.predict() function, I produced the design matrix and use that to predict values instead of directly using the predict() function. I do not consider this a full answer of the original question yet because predict() can also produce confidence intervals for the prediction, and I have not yet implemented that--question still open for someone to add confidence intervals.
library(segmented)
## Define function for making matrix of dummy variables (this is based on code from predict.segmented())
dummy.matrix <- function(x.values, x_names, psi.est = TRUE, nameU, nameV, diffSlope, est.psi) {
# This function creates a model matrix with dummy variables for a segmented lm with two breakpoints.
# Inputs:
# x.values: the x values of the segmented lm
# x_names: the name of the column of x values
# psi.est: this is legacy from the predict.segmented function, leave it set to 'TRUE'
# obj: the segmented lm object
# nameU: names (class character) of 3rd and 4th coef, which are "U1.x" "U2.x" for lm with two breaks. Example: names(c(obj$coef[3], obj$coef[4]))
# nameV: names (class character) of 5th and 6th coef, which are "psi1.x" "psi2.x" for lm with two breaks. Example: names(c(obj$coef[5], obj$coef[6]))
# diffSlope: the coefficients (class numeric) with the slope differences; called U1.x and U2.x for lm with two breaks. Example: c(o$coef[3], o$coef[4])
# est.psi: the estimated break points (class numeric); these are the estimated breakpoints from segmented.lm. Example: c(obj$psi[1,2], obj$psi[2,2])
#
n <- length(x.values)
k <- length(est.psi)
PSI <- matrix(rep(est.psi, rep(n, k)), ncol = k)
newZ <- matrix(x.values, nrow = n, ncol = k, byrow = FALSE)
dummy1 <- pmax(newZ - PSI, 0)
if (psi.est) {
V <- ifelse(newZ > PSI, -1, 0)
dummy2 <- if (k == 1)
V * diffSlope
else V %*% diag(diffSlope)
newd <- cbind(x.values, dummy1, dummy2)
colnames(newd) <- c(x_names, nameU, nameV)
} else {
newd <- cbind(x.values, dummy1)
colnames(newd) <- c(x_names, nameU)
}
# if (!x_names %in% names(coef(obj.seg)))
# newd <- newd[, -1, drop = FALSE]
return(newd)
}
## Test dummy matrix function----------------------------------------------
set.seed(12)
xx<-1:100
zz<-runif(100)
yy<-2+1.5*pmax(xx-35,0)-1.5*pmax(xx-70,0)+15*pmax(zz-.5,0)+rnorm(100,0,2)
dati<-data.frame(x=xx,y=yy,z=zz)
out.lm<-lm(y~x,data=dati)
#1 segmented variable, 2 breakpoints: you have to specify starting values (vector) for psi:
o<-segmented(out.lm,seg.Z=~x,psi=c(30,60),
control=seg.control(display=FALSE))
slope(o)
plot.segmented(o)
summary(o)
# Test dummy matrix fn with the same dataset
newdata <- dati
nameU1 <- c("U1.x", "U2.x")
nameV1 <- c("psi1.x", "psi2.x")
diffSlope1 <- c(o$coef[3], o$coef[4])
est.psi1 <- c(o$psi[1,2], o$psi[2,2])
test <- dummy.matrix(x.values = newdata$x, x_names = "x", psi.est = TRUE,
nameU = nameU1, nameV = nameV1, diffSlope = diffSlope1, est.psi = est.psi1)
# Predict response variable using matrix multiplication
col1 <- matrix(1, nrow = dim(test)[1])
test <- cbind(col1, test) # Now test is the same as model.matrix(o)
predY <- coef(o) %*% t(test)
plot(predY[1,])
lines(predict.segmented(o), col = "blue") # good, predict.segmented gives same answer

How can I improve the Integration and Parameterization of Convolved Distributions?

I am trying to solve for the parameters of a gamma distribution that is convolved with both normal and lognormal distributions. I can experimentally derive parameters for both the normal and lognormal components, hence, I just want to solve for the gamma params.
I have attempted 3 approaches to this problem:
1) generating convolved random datasets (i.e. rnorm()+rlnorm()+rgamma()) and using least-squares regression on the linear- or log-binned histograms of the data (not shown, but was very biased by RNG and didn't optimize well at all.)
2) "brute-force" numerical integration of the convolving functions (example code #1)
3) numerical integration approaches w/ the distr package. (example code #2)
I have had limited success with all three approaches. Importantly, these approaches seem to work well for "nominal" values for the gamma parameters, but they all begin to fail when k(shape) is low and theta(scale) is high—which is where my experimental data resides. please find the examples below.
Straight-up numerical Integration
# make the functions
f.N <- function(n) dnorm(n, N[1], N[2])
f.L <- function(l) dlnorm(l, L[1], L[2])
f.G <- function(g) dgamma(g, G[1], scale=G[2])
# make convolved functions
f.Z <- function(z) integrate(function(x,z) f.L(z-x)*f.N(x), -Inf, Inf, z)$value # L+N
f.Z <- Vectorize(f.Z)
f.Z1 <- function(z) integrate(function(x,z) f.G(z-x)*f.Z(x), -Inf, Inf, z)$value # G+(L+N)
f.Z1 <- Vectorize(f.Z1)
# params of Norm, Lnorm, and Gamma
N <- c(0,5)
L <- c(2.5,.5)
G <- c(2,7) # this distribution is the one we ultimately want to solve for.
# G <- c(.5,10) # 0<k<1
# G <- c(.25,5e4) # ballpark params of experimental data
# generate some data
set.seed(1)
rN <- rnorm(1e4, N[1], N[2])
rL <- rlnorm(1e4, L[1], L[2])
rG <- rgamma(1e4, G[1], scale=G[2])
Z <- rN + rL
Z1 <- rN + rL + rG
# check the fit
hist(Z,freq=F,breaks=100, xlim=c(-10,50), col=rgb(0,0,1,.25))
hist(Z1,freq=F,breaks=100, xlim=c(-10,50), col=rgb(1,0,0,.25), add=T)
z <- seq(-10,50,1)
lines(z,f.Z(z),lty=2,col="blue", lwd=2) # looks great... convolution performs as expected.
lines(z,f.Z1(z),lty=2,col="red", lwd=2) # this works perfectly so long as k(shape)>=1
# I'm guessing the failure to compute when shape 0 < k < 1 is due to
# numerical integration problems, but I don't know how to fix it.
integrate(dgamma, -Inf, Inf, shape=1, scale=1) # ==1
integrate(dgamma, 0, Inf, shape=1, scale=1) # ==1
integrate(dgamma, -Inf, Inf, shape=.5, scale=1) # !=1
integrate(dgamma, 0, Inf, shape=.5, scale=1) # != 1
# Let's try to estimate gamma anyway, supposing k>=1
optimFUN <- function(par, N, L) {
print(par)
-sum(log(f.Z1(Z1[1:4e2])))
}
f.G <- function(g) dgamma(g, par[1], scale=par[2])
fitresult <- optim(c(1.6,5), optimFUN, N=N, L=L)
par <- fitresult$par
lines(z,f.Z1(z),lty=2,col="green3", lwd=2) # not so great... likely better w/ more data,
# but it is SUPER slow and I observe large step sizes.
Attempting convolving via distr package
# params of Norm, Lnorm, and Gamma
N <- c(0,5)
L <- c(2.5,.5)
G <- c(2,7) # this distribution is the one we ultimately want to solve for.
# G <- c(.5,10) # 0<k<1
# G <- c(.25,5e4) # ballpark params of experimental data
# make the distributions and "convolvings'
dN <- Norm(N[1], N[2])
dL <- Lnorm(L[1], L[2])
dG <- Gammad(G[1], G[2])
d.NL <- d(convpow(dN+dL,1))
d.NLG <- d(convpow(dN+dL+dG,1)) # for large values of theta, no matter how I change
# getdistrOption("DefaultNrFFTGridPointsExponent"), grid size is always wrong.
# Generate some data
set.seed(1)
rN <- r(dN)(1e4)
rL <- r(dL)(1e4)
rG <- r(dG)(1e4)
r.NL <- rN + rL
r.NLG <- rN + rL + rG
# check the fit
hist(r.NL, freq=F, breaks=100, xlim=c(-10,50), col=rgb(0,0,1,.25))
hist(r.NLG, freq=F, breaks=100, xlim=c(-10,50), col=rgb(1,0,0,.25), add=T)
z <- seq(-10,50,1)
lines(z,d.NL(z), lty=2, col="blue", lwd=2) # looks great... convolution performs as expected.
lines(z,d.NLG(z), lty=2, col="red", lwd=2) # this appears to work perfectly
# for most values of K and low values of theta
# this is looking a lot more promising... how about estimating gamma params?
optimFUN <- function(par, dN, dL) {
tG <- Gammad(par[1],par[2])
d.NLG <- d(convpow(dN+dL+tG,1))
p <- d.NLG(r.NLG)
p[p==0] <- 1e-15 # because sometimes very low probabilities evaluate to 0...
# ...and logs don't like that.
-sum(log(p))
}
fitresult <- optim(c(1,1e4), optimFUN, dN=dN, dL=dL)
fdG <- Gammad(fitresult$par[1], fitresult$par[2])
fd.NLG <- d(convpow(dN+dL+fdG,1))
lines(z,fd.NLG(z), lty=2, col="green3", lwd=2) ## this works perfectly when ~k>1 & ~theta<100... but throws
## "Error in validityMethod(object) : shape has to be positive" when k decreases and/or theta increases
## (boundary subject to RNG).
Can i speed up the integration in example 1? can I increase the grid size in example 2 (distr package)? how can I address the k<1 problem? can I rescale the data in a way that will better facilitate evaluation at high theta values?
Is there a better way all-together?
Help!

Well, convolution of function with gaussian kernel calls for use of Gauss–Hermite quadrature. In R it is implemented in special package: https://cran.r-project.org/web/packages/gaussquad/gaussquad.pdf
UPDATE
For convolution with Gamma distribution this package might be useful as well via Gauss-Laguerre quadrature
UPDATE II
Here is quick code to convolute gaussian with lognormal,
hopefully not a lot of bugs and and prints some reasonable looking graph
library(gaussquad)
n.quad <- 170 # integration order
# get the particular weights/abscissas as data frame with 2 observables and n.quad observations
rule <- ghermite.h.quadrature.rules(n.quad, mu = 0.0)[[n.quad]]
# test function - integrate 1 over exp(-x^2) from -Inf to Inf
# should get sqrt(pi) as an answer
f <- function(x) {
1.0
}
q <- ghermite.h.quadrature(f, rule)
print(q - sqrt(pi))
# convolution of lognormal with gaussian
# because of the G-H rules, we have to make our own function
# for simplicity, sigmas are one and mus are zero
sqrt2 <- sqrt(2.0)
c.LG <- function(z) {
#print(z)
f.LG <- function(x) {
t <- (z - x*sqrt2)
q <- 0.0
if (t > 0.0) {
l <- log(t)
q <- exp( - 0.5*l*l ) / t
}
q
}
ghermite.h.quadrature(Vectorize(f.LG), rule) / (pi*sqrt2)
}
library(ggplot2)
p <- ggplot(data = data.frame(x = 0), mapping = aes(x = x))
p <- p + stat_function(fun = Vectorize(c.LG))
p <- p + xlim(-1.0, 5.0)
print(p)

How can I avoid declaring a global variable in this R code?

I am trying to modify Kyle Gorman's autoloess function to be callable as a method in ggplot2's stat_smooth. autoloess is a simply wrapper which runs loess through an optimiser to find the value of span which minimises AICc.
I have created something which runs successfully, but only by using a global variable. Is there a more elegant, idomatic way of programming this?
My code:
AICc.loess <- function(fit) {
# compute AIC_C for a LOESS fit, from:
#
# Hurvich, C.M., Simonoff, J.S., and Tsai, C. L. 1998. Smoothing
# parameter selection in nonparametric regression using an improved
# Akaike Information Criterion. Journal of the Royal Statistical
# Society B 60: 271â€“293.
#
# #param fit loess fit
# #return 'aicc' value
stopifnot(inherits(fit, 'loess'))
# parameters
n <- fit$n
trace <- fit$trace.hat
sigma2 <- sum(resid(fit) ^ 2) / (n - 1)
return(log(sigma2) + 1 + 2 * (2 * (trace + 1)) / (n - trace - 2))
}
.autoloess.magic.w <- NULL
autoloess <- function(formula, data, weights, span=c(0.01, 2.0)) {
.autoloess.magic.w <- ~weights
fit <- loess(formula=formula,
data=data,
weights=.autoloess.magic.w)
stopifnot(length(span) == 2)
# loss function in form to be used by optimize
f <- function(span) AICc.loess(update(fit, span=span))
# find best loess according to loss function
res <- update(fit, span=optimize(f, span)$minimum)
cat(paste("Optimal span:", res$pars$span, "\n"))
return(res)
}
And a quick test:
# Test
library(ggplot2)
set.seed(1984)
# Create a cubic curve
df <- data.frame(x=1:2500, y=500000 +
(-1000*(1:2500)) +
((1:2500)^2) +
-0.00025*((1:2500)^3) +
rnorm(2500, sd=60000),
ww=runif(2500, min=0, max=10))
# Use loess span
ggplot(df, aes(x=x, y=y, weight=ww)) + geom_point() + stat_smooth(method="loess")
# Use autoloess
ggplot(df, aes(x=x, y=y, weight=ww)) + geom_point() + stat_smooth(method="autoloess")

You can use the weight variable (seems like it is there when the function is called):
autoloess <- function(formula, data, weights, span=c(0.01, 2.0)) {
fit <- loess(formula = formula,
data = data,
weights=weight)
stopifnot(length(span) == 2)
# loss function in form to be used by optimize
f <- function(span) AICc.loess(update(fit, span=span))
# find best loess according to loss function
res <- update(fit, span=optimize(f, span)$minimum)
cat(paste("Optimal span:", res$pars$span, "\n"))
return(res)
}

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

automatically convert polynomial to expression in ggplot2 title - r

Related

Unable to make code run - Error in `lag()`:! `n`

'X' as an array of at least two dimensions

Predict segmented lm outside of package

How can I improve the Integration and Parameterization of Convolved Distributions?

How can I avoid declaring a global variable in this R code?

Categories

Resources