Build line through points with variances in both coordinates [closed]

I have some points, and each point's coordinates have a variance in both x and y. They are stored in vectors (just an example):
x <- c(1, 2, 3, 4, 5)
y <- c(1, 2, 3, 4, 5)
dx <- c(0.1, 0.1, 0.1, 0.1, 0.1)
dy <- c(0.1, 0.1, 0.1, 0.1, 0.1)
and each point's coordinates are (x +/- dx, y +/- dy).
I want to fit a line y = k*x through them and get the result k +/- dk.

Terry Therneau answered this on r-help earlier this year, citing a 1987 paper by Prof. Ripley:
Besides "total least squares" it is also called Deming regression and orthogonal regression:
Rhelp text at Baron's R Search page
# Generalized Deming regression, based on Ripley, Analyst, 1987:377-383.
#
deming <- function(x, y, xstd, ystd, jackknife=TRUE, dfbeta=FALSE,
                   scale=TRUE) {
    Call <- match.call()
    n <- length(x)
    if (length(y) != n) stop("x and y must be the same length")
    if (length(xstd) != length(ystd))
        stop("xstd and ystd must be the same length")

    # Do missing value processing
    nafun <- get(options()$na.action)
    if (length(xstd) == n) {
        tdata <- nafun(data.frame(x=x, y=y, xstd=xstd, ystd=ystd))
        x <- tdata$x
        y <- tdata$y
        xstd <- tdata$xstd
        ystd <- tdata$ystd
    }
    else {
        tdata <- nafun(data.frame(x=x, y=y))
        x <- tdata$x
        y <- tdata$y
        if (length(xstd) != 2) stop("Wrong length for std specification")
        xstd <- xstd[1] + xstd[2]*x
        ystd <- ystd[1] + ystd[2]*y
    }

    if (any(xstd <= 0) || any(ystd <= 0)) stop("Std must be positive")

    minfun <- function(beta, x, y, xv, yv) {
        w <- 1/(yv + beta^2*xv)
        alphahat <- sum(w * (y - beta*x))/sum(w)
        sum(w*(y - (alphahat + beta*x))^2)
    }

    minfun0 <- function(beta, x, y, xv, yv) {
        w <- 1/(yv + beta^2*xv)
        alphahat <- 0  # constrain to zero
        sum(w*(y - (alphahat + beta*x))^2)
    }

    afun <- function(beta, x, y, xv, yv) {
        w <- 1/(yv + beta^2*xv)
        sum(w * (y - beta*x))/sum(w)
    }

    fit <- optimize(minfun, c(.1, 10), x=x, y=y, xv=xstd^2, yv=ystd^2)
    coef <- c(intercept=afun(fit$minimum, x, y, xstd^2, ystd^2),
              slope=fit$minimum)
    fit0 <- optimize(minfun0, coef[2]*c(.5, 1.5), x=x, y=y,
                     xv=xstd^2, yv=ystd^2)

    w <- 1/(ystd^2 + (coef[2]*xstd)^2)                 # weights
    u <- w*(ystd^2*x + xstd^2*coef[2]*(y - coef[1]))   # imputed "true" value
    if (is.logical(scale) && scale) {
        err1 <- (x - u)/xstd
        err2 <- (y - (coef[1] + coef[2]*u))/ystd
        sigma <- sum(err1^2 + err2^2)/(n - 2)
        # Ripley's paper has err = [y - (a + b*x)] * sqrt(w); gives the same SS
    }
    else sigma <- scale^2

    test1 <- (coef[2] - 1)*sqrt(sum(w*(x - u)^2)/sigma)        # test for beta=1
    test2 <- coef[1]*sqrt(sum(w*x^2)/sum(w*(x - u)^2)/sigma)   # test for a=0

    rlist <- list(coefficient=coef, test1=test1, test0=test2, scale=sigma,
                  err1=err1, err2=err2, u=u)
    if (jackknife) {
        delta <- matrix(0., nrow=n, ncol=2)
        for (i in 1:n) {
            fit <- optimize(minfun, c(.5, 1.5)*coef[2],
                            x=x[-i], y=y[-i], xv=xstd[-i]^2, yv=ystd[-i]^2)
            ahat <- afun(fit$minimum, x[-i], y[-i], xstd[-i]^2, ystd[-i]^2)
            delta[i,] <- coef - c(ahat, fit$minimum)
        }
        rlist$variance <- t(delta) %*% delta
        if (dfbeta) rlist$dfbeta <- delta
    }

    rlist$call <- Call
    class(rlist) <- 'deming'
    rlist
}

print.deming <- function(x, ...) {
    cat("\nCall:\n", deparse(x$call), "\n\n", sep = "")
    if (is.null(x$variance)) {
        table <- matrix(0., nrow=2, ncol=3)
        table[,1] <- x$coefficient
        table[,2] <- c(x$test0, x$test1)
        table[,3] <- pnorm(-2*abs(table[,2]))
        dimnames(table) <- list(c("Intercept", "Slope"),
                                c("Coef", "z", "p"))
    }
    else {
        table <- matrix(0., nrow=2, ncol=4)
        table[,1] <- x$coefficient
        table[,2] <- sqrt(diag(x$variance))
        table[,3] <- c(x$test0, x$test1)
        table[,4] <- pnorm(-2*abs(table[,3]))
        dimnames(table) <- list(c("Intercept", "Slope"),
                                c("Coef", "se(coef)", "z", "p"))
    }
    print(table, ...)
    cat("\n Scale=", format(x$scale, ...), "\n")
    invisible(x)
}
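For completeness, a minimal usage sketch with the vectors from the question (this example is mine, not part of the r-help post). Note that deming() fits y = a + b*x with a free intercept, and that this perfectly collinear toy data gives a degenerate fit (slope essentially 1 with a near-zero jackknife standard error); real data produces more informative output.
x  <- c(1, 2, 3, 4, 5)
y  <- c(1, 2, 3, 4, 5)
dx <- c(0.1, 0.1, 0.1, 0.1, 0.1)
dy <- c(0.1, 0.1, 0.1, 0.1, 0.1)
fit <- deming(x, y, xstd = dx, ystd = dy)
fit                                # coefficients, se(coef), z and p via print.deming
k  <- fit$coefficient["slope"]     # estimated slope k
dk <- sqrt(fit$variance[2, 2])     # jackknife standard error of k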

You're looking to perform a total least squares fit. There's a whole book on this, "The Total Least Squares Problem: Computational Aspects and Analysis" by Sabine Van Huffel and Joos Vandewalle. Wikipedia's article should provide enough for you to code up a solution - it's basically "take the SVD of a slightly augmented system".
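In R, a minimal sketch of that SVD approach might look as follows (my code, not part of the answer above; it assumes equal error variances in x and y, i.e. plain orthogonal regression; with known, unequal standard deviations, rescale the coordinates first or use the Deming code above):
tls_fit <- function(x, y) {
    M <- cbind(x - mean(x), y - mean(y))   # centered data matrix
    v <- svd(M)$v[, 2]                     # right singular vector for the smallest singular value
    k <- -v[1]/v[2]                        # normal vector (v1, v2) gives slope -v1/v2
    c(slope = k, intercept = mean(y) - k*mean(x))
}
tls_fit(x, y)
A standard error for the slope can then be obtained by jackknifing or bootstrapping, much as in the Deming code above.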

Related

How can I fix the while loop problem in R?

I wrote the code below, and sometimes it returns a proper value, but sometimes it fails to produce a value for a long time.
I suspect the while loop runs forever, but I can't figure out how to fix it.
I've already searched for information about while loops and I think I wrote it properly, yet I can't tell why it sometimes runs fine and sometimes doesn't.
Could you please give me advice or a proper modification?
Thank you.
rm(list=ls())
library(readxl)
library(dplyr)
library(ggplot2)
library(MASS)
# Mean Vector, Covariance Matrix Construction
mu <- c(0,0,0)
mu <- t(mu)
mu <- t(mu)
mu
# Construct 40 random variables for Phase II
mu2 <- c(1, 2, 1)
mu2 <- t(mu2)
mu2 <- t(mu2)
mu2
Sigma <- matrix(c(1, 0.9, 0.9, 0.9, 1, 0.9, 0.9, 0.9, 1), 3)
Sigma
getResult <- function(Result) {
# Construct 50 Random Variables for Phase I
Obs <- mvrnorm(50, mu = mu, Sigma = Sigma)
VecT2 <- apply(Obs, 2, mean)
VecT2 <- round(VecT2, 3)
ST2 <- cov(Obs)
ST2 <- round(ST2, 3)
Obs <- as.matrix(Obs)
T2All <- rep(0, nrow(Obs))
for(i in 1:nrow(Obs)) {
T2All[i] = t(Obs[i, ] - VecT2) %*% solve(ST2) %*% (Obs[i, ] - VecT2)
}
# Construct Control Limit
Alpha <- 0.005
M <- nrow(Obs)
M
p <- ncol(Obs)
p
UCL <- ((p * (M-1) * (M + 1))) / ((M - p) * M) * qf((1-Alpha), p, (M-p))
UCL <- round(UCL, 3)
Compare <- which(T2All > UCL)
# Repeat while there are out-of-control points in Phase I, eliminating them
while(isTRUE(Compare > UCL)) {
Obs <- Obs[-Compare,]
Alpha <- 0.005
M <- nrow(Obs)
p <- ncol(Obs)
UCL <- ((p * (M-1) * (M + 1))) / ((M - p) * M) * qf((1-Alpha), p, (M-p))
Compare <- which(T2All > UCL)
}
UCL <- round(UCL, 3)
# Prepare observations for two cases: Obs20_1 (mean mu) and Obs20_2 (mean mu2)
Obs20_1 <- mvrnorm(20, mu = mu, Sigma = Sigma)
Obs20_2 <- mvrnorm(20, mu = mu2, Sigma = Sigma)
Obs40 <- rbind(Obs20_1, Obs20_2)
Obs40 <- as.matrix(Obs40)
T2 <- rep(0, nrow(Obs40))
for(i in 1:nrow(Obs40)) {
T2[i] = t(Obs40[i, ] - mu) %*% solve(Sigma) %*% (Obs40[i, ] - mu)
}
Result <- which(T2 > UCL)[1]
# Repeat when an out-of-control signal occurs in the ARL0 section
while(isTRUE(Result < 20)) {
Obs20_1 <- mvrnorm(20, mu = mu, Sigma = Sigma)
Obs40 <- rbind(Obs20_1, Obs20_2)
Obs40 <- as.matrix(Obs40)
T2 <- rep(0, nrow(Obs40))
for(i in 1:nrow(Obs40)) {
T2[i] = t(Obs40[i, ] - mu) %*% solve(Sigma) %*% (Obs40[i, ] - mu)
}
Result <- which(T2 > UCL)[1]
}
Result
}
# Result
Final <- replicate(n = 200, expr = getResult(Result))
Final <- Final - 20
Final
mean(Final)
You could try using a for loop instead of a while loop.
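One way to read that suggestion (a sketch of the general pattern, not the answerer's code) is to replace the open-ended while loop at the end of getResult with a for loop that has a fixed maximum number of iterations and breaks as soon as the exit condition is met, so a single call can never hang; max_tries here is a made-up safety limit:
max_tries <- 1000
for (try in 1:max_tries) {
    Obs20_1 <- mvrnorm(20, mu = mu, Sigma = Sigma)
    Obs40 <- rbind(Obs20_1, Obs20_2)
    T2 <- rep(0, nrow(Obs40))
    for (i in 1:nrow(Obs40)) {
        T2[i] <- t(Obs40[i, ] - mu) %*% solve(Sigma) %*% (Obs40[i, ] - mu)
    }
    Result <- which(T2 > UCL)[1]
    if (!isTRUE(Result < 20)) break   # same exit condition as the original while loop
}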

How to find an algebraic expression from R code?

I have the code below for a bivariate normal distribution:
library(mnormt)
x <- seq(-5, 5, 0.25)
y <- seq(-5, 5, 0.25)
mu <- c(0, 0)
sigma <- matrix(c(2, -1, -1, 2), nrow = 2)
f <- function(x, y) dmnorm(cbind(x, y), mu, sigma)
z <- outer(x, y, f)
a) I would like to know what the algebraic expression z = f(x, y) is, based on the above code (please write the algebraic expression explicitly). b) Which parameters of the algebraic expression z = f(x, y) do the numbers 2, -1, -1 and 2 in matrix(c(2, -1, -1, 2), nrow = 2) correspond to?
If you want to see the source code, you can look at the mnormt package source.
I have commented the code for you:
dmnorm <- function(x, mean=rep(0,d), varcov, log=FALSE)
{
# number of variables
d <- if(is.matrix(varcov)) ncol(varcov) else 1
if(d==1) return(dnorm(x, mean, sqrt(varcov), log=log))
x <- if (is.vector(x)) t(matrix(x)) else data.matrix(x)
if(ncol(x) != d) stop("mismatch of dimensions of 'x' and 'varcov'")
if(is.matrix(mean)) {
if ((nrow(x) != nrow(mean)) || (ncol(mean) != d))
stop("mismatch of dimensions of 'x' and 'mean'") }
if(is.vector(mean)) mean <- outer(rep(1, nrow(x)), as.vector(matrix(mean,d)))
# center
X <- t(x - mean)
# compute the inverse of sigma
conc <- pd.solve(varcov, log.det=TRUE)
# Q is the quadratic form in the exponent
Q <- colSums((conc %*% X)* X)
# compute the log determinant
log.det <- attr(conc, "log.det")
# log likelihood
logPDF <- as.vector(Q + d*logb(2*pi) + log.det)/(-2)
if(log) logPDF else exp(logPDF)
}
It is a strict application of the multivariate normal density
f(x) = (2*pi)^(-d/2) * det(Sigma)^(-1/2) * exp(-1/2 * t(x - mu) %*% solve(Sigma) %*% (x - mu)),
which is exactly what logPDF above computes on the log scale.
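As a worked answer to part a) (my derivation, worth double-checking): with mu = c(0, 0) and sigma = matrix(c(2, -1, -1, 2), nrow = 2), we have det(sigma) = 3 and solve(sigma) = matrix(c(2, 1, 1, 2), 2)/3, so
z = f(x, y) = 1/(2*pi*sqrt(3)) * exp(-(x^2 + x*y + y^2)/3)
For part b), the diagonal entries 2 and 2 are the variances of x and y, and the off-diagonal entries -1 are their covariance. A quick numerical check:
f_explicit <- function(x, y) exp(-(x^2 + x*y + y^2)/3) / (2*pi*sqrt(3))
all.equal(f_explicit(1, 2), dmnorm(cbind(1, 2), mu, sigma))   # should be TRUE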

Generate functional data from Gaussian Process in R

Model:
X(t) = 4*t + e(t), t ∈ [0, 1]
e(t) is a Gaussian process with zero mean and covariance function f(s, t) = exp(-|t - s|)
The final result over 100 runs (=100 gray lines) with 50 sampled points each should be like the gray area in the picture.
The green line is what I get from the code below.
library(MASS)
kernel_1 <- function(x, y){
exp(- abs(x - y))
}
cov_matrix <- function(x, kernel_fn, ...) {
outer(x, x, function(a, b) kernel_fn(a, b, ...))
}
draw_samples <- function(x, N=1, kernel_fn, ...) {
set.seed(100)
Y <- matrix(NA, nrow = length(x), ncol = N)
for (n in 1:N) {
K <- cov_matrix(x, kernel_fn, ...)
Y[, n] <- mvrnorm(1, mu = rep(0, times = length(x)), Sigma = K)
}
Y
}
x <- seq(0, 1, length.out = 51) # x-coordinates
model1 <- function(obs, x) {
model1_data <- matrix(NA, nrow = obs, ncol = length(x))
for(i in 1:obs){
e <- draw_samples(x, 1, kernel_fn = kernel_1)
X <- c()
for (p in 1:length(x)){
t <- x[p]
val <- (4*t) + e[p,]
X = c(X, val)
}
model1_data[i,] <- X
}
model1_data
}
# model1(100, x)
Because you have set.seed in draw_samples, you are getting the same random numbers with each draw. If you remove it, then you can do:
a <- model1(100, x)
matplot(t(a), type = "l", col = 'gray')
to get a plot of the 100 simulated curves in gray.
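If reproducible output is still wanted once the internal set.seed is removed, one option (my sketch, not part of the original answer) is to seed once before the simulation. draw_samples below is the same function as above with its set.seed(100) line removed (and the covariance matrix computed once, since it does not depend on n); it reuses cov_matrix, kernel_1 and model1 as defined in the question:
draw_samples <- function(x, N = 1, kernel_fn, ...) {
    K <- cov_matrix(x, kernel_fn, ...)   # same covariance matrix for every draw
    Y <- matrix(NA, nrow = length(x), ncol = N)
    for (n in 1:N) {
        Y[, n] <- mvrnorm(1, mu = rep(0, length(x)), Sigma = K)
    }
    Y
}
set.seed(100)    # seed once, outside the sampling function, for reproducibility
a <- model1(100, x)
matplot(t(a), type = "l", col = "gray")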

Coverage probability for an unspecified CDF

I used the following R code to determine the coverage probability.
theta <- seq(0,1, length = 100)
CD_theta <- function(y, p, n){
1 - pbinom (y, size = n, prob = p) + 1/2*dbinom(y, size = n, prob = p)
}
y <- 5
n <- 100
phat <- y/n
mytheta <- CD_theta(5, theta, 100)
set.seed(650)
ci <- list()
n <- 100
B <- 1000
result = rep(NA, B)
all_confInt <- function(B) {
for (i in 1:B){
boot.sample <- sample(mytheta, replace = TRUE)
lower <- theta[which.min(abs(boot.sample - .025))]
upper <- theta[which.min(abs(boot.sample - .975))]
ci[[i]] <- data.frame(lowerCI = lower, upperCI = upper)
intervals <- unlist(ci)
}
return(intervals)
}
df <- data.frame(matrix(all_confInt(B), nrow=B, byrow=T))
colnames(df)[1] <- "Lower"
colnames(df)[2] <- "Upper"
names(df)
dim(df)
mean(df$Lower < phat & df$Upper > phat)*100
However, I obtained 6.4%, which is too low. Why am I getting such a low percentage? Is there a problem in the R code?

How to deal with perfect fit linear model

The data I'm dealing with occasionally has a "perfectly fitting" linear model. For each regression I run, I need to extract the r.squared value which I've been doing with summary(mymodel)$r.squared but this fails in the case of a perfectly fitting model (see below).
df <- data.frame(x = c(1,2,3,4,5), y = c(1,1,1,1,1))
mymodel <- lm(y ~ x, data = df)
summary(mymodel)$r.squared #This raises a warning
0.5294
How can I handle these cases? Basically, I think I want to do something like
If(mymodel is a perfect fit)
rsquared = 1
else
rsquared = summary(mymodel)$r.squared
You can use tryCatch
df <- data.frame(x = c(1,2,3,4,5), y = c(1,1,1,1,1))
mymodel <- lm(y ~ x, data = df)
summary(mymodel)$r.squared #This raises a warning
tryCatch(summary(mymodel)$r.squared, warning = function(w) return(1))
# [1] 1
And with an added conditional to catch specific warnings
df <- data.frame(x = c(1,2,3,4,5), y = c(1,1,1,1,1))
mymodel <- lm(y ~ x, data = df)
summary(mymodel)$r.squared #This raises a warning
f <- function(expr) {
tryCatch(expr,
warning = function(w) {
if (grepl('perfect fit', w))
return(1)
else return(w)
})
}
f(TRUE)
# [1] TRUE
f(sum(1:5))
# [1] 15
f(summary(mymodel)$r.squared)
# [1] 1
f(warning('this is not a fit warning'))
# <simpleWarning in doTryCatch(return(expr), name, parentenv, handler): this is not a fit warning>
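Since the question mentions extracting r.squared from many regressions, the same wrapper can be applied across a set of fits (a usage sketch of my own, with f as defined above):
# Hypothetical example data: two perfect fits and one ordinary fit
dats <- list(
    data.frame(x = 1:5, y = rep(1, 5)),
    data.frame(x = 1:5, y = 1:5),
    data.frame(x = 1:5, y = rnorm(5))
)
rsq <- sapply(dats, function(d) f(summary(lm(y ~ x, data = d))$r.squared))
rsq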
If you want to make sure that everything works perfectly, you can slightly modify the source code (type summary.lm to see the original code):
df <- data.frame(x = c(1,2,3,4,5), y = c(1,1,1,1,1))
mymodel <- lm(y ~ x, data = df)
This is how I modified it. Everything is the same as the original summary.lm function apart from the bit near the bottom of the function.
summary2 <- function (object, correlation = FALSE, symbolic.cor = FALSE,
...)
{
z <- object
p <- z$rank
rdf <- z$df.residual
if (p == 0) {
r <- z$residuals
n <- length(r)
w <- z$weights
if (is.null(w)) {
rss <- sum(r^2)
}
else {
rss <- sum(w * r^2)
r <- sqrt(w) * r
}
resvar <- rss/rdf
ans <- z[c("call", "terms", if (!is.null(z$weights)) "weights")]
class(ans) <- "summary.lm"
ans$aliased <- is.na(coef(object))
ans$residuals <- r
ans$df <- c(0L, n, length(ans$aliased))
ans$coefficients <- matrix(NA, 0L, 4L)
dimnames(ans$coefficients) <- list(NULL, c("Estimate",
"Std. Error", "t value", "Pr(>|t|)"))
ans$sigma <- sqrt(resvar)
ans$r.squared <- ans$adj.r.squared <- 0
return(ans)
}
if (is.null(z$terms))
stop("invalid 'lm' object: no 'terms' component")
if (!inherits(object, "lm"))
warning("calling summary.lm(<fake-lm-object>) ...")
Qr <- qr(object)
n <- NROW(Qr$qr)
if (is.na(z$df.residual) || n - p != z$df.residual)
warning("residual degrees of freedom in object suggest this is not an \"lm\" fit")
r <- z$residuals
f <- z$fitted.values
w <- z$weights
if (is.null(w)) {
mss <- if (attr(z$terms, "intercept"))
sum((f - mean(f))^2)
else sum(f^2)
rss <- sum(r^2)
}
else {
mss <- if (attr(z$terms, "intercept")) {
m <- sum(w * f/sum(w))
sum(w * (f - m)^2)
}
else sum(w * f^2)
rss <- sum(w * r^2)
r <- sqrt(w) * r
}
resvar <- rss/rdf
p1 <- 1L:p
R <- chol2inv(Qr$qr[p1, p1, drop = FALSE])
se <- sqrt(diag(R) * resvar)
est <- z$coefficients[Qr$pivot[p1]]
tval <- est/se
ans <- z[c("call", "terms", if (!is.null(z$weights)) "weights")]
ans$residuals <- r
ans$coefficients <- cbind(est, se, tval, 2 * pt(abs(tval),
rdf, lower.tail = FALSE))
dimnames(ans$coefficients) <- list(names(z$coefficients)[Qr$pivot[p1]],
c("Estimate", "Std. Error", "t value", "Pr(>|t|)"))
ans$aliased <- is.na(coef(object))
ans$sigma <- sqrt(resvar)
ans$df <- c(p, rdf, NCOL(Qr$qr))
if (p != attr(z$terms, "intercept")) {
df.int <- if (attr(z$terms, "intercept"))
1L
else 0L
ans$r.squared <- mss/(mss + rss)
ans$adj.r.squared <- 1 - (1 - ans$r.squared) * ((n -
df.int)/rdf)
ans$fstatistic <- c(value = (mss/(p - df.int))/resvar,
numdf = p - df.int, dendf = rdf)
}
else ans$r.squared <- ans$adj.r.squared <- 0
ans$cov.unscaled <- R
dimnames(ans$cov.unscaled) <- dimnames(ans$coefficients)[c(1,
1)]
# Below is the only change to the original code: where summary.lm would issue the
# "essentially perfect fit" warning, we set r.squared to 1 instead.
if (is.finite(resvar) && resvar < (mean(f)^2 + var(f)) * 1e-30) {
ans$r.squared <- 1
}
# This check was moved lower in the function so as not to affect the rest of the
# original code; it has been checked and seems to work properly.
if (correlation) {
ans$correlation <- (R * resvar)/outer(se, se)
dimnames(ans$correlation) <- dimnames(ans$cov.unscaled)
ans$symbolic.cor <- symbolic.cor
}
if (!is.null(z$na.action))
ans$na.action <- z$na.action
class(ans) <- "summary.lm"
ans
}
Run the new function and see that it now works without any warnings. No other if or else if conditions are required.
> summary2(mymodel)$r.squared
[1] 1
One option to catch a perfect fit is to examine the residuals: if it is a perfect fit, the sum of the absolute residuals will be zero.
x = 1:5
# generate 3 sets of y values, last set is random values
y = matrix(data = c(rep(1,5),1:5,rnorm(5)), nrow = 5)
tolerance = 0.0001
r.sq = array(NA,ncol(y))
# check fit for three sets
for (i in 1:ncol(y)){
fit = lm(y[,i]~x)
# determine sum of residuals
if (sum(abs(resid(fit))) < tolerance) {
# perfect fit case
r.sq[i] = 1 } else {
# non-perfect fit case
r.sq[i] = summary(fit)$r.squared
}
}
print(r.sq)
# [1] 1.0000000 1.0000000 0.7638879
