So I have an assignment, where I have to show the convergence of regression coefficients to a certain value if the observed variable has a measurement error. The idea is to show the convergence depending on the number of observations as well as on the standard deviations of the variables.
I built the following function that should create a matrix with the regression coefficients depending on the number of observations. In a later step I would want to show this in a plot and then in a shiny webapp.
The function is:
Deviation <- function(N, sd_v = 1, sd_u = 1, sd_w = 1){
b_1 <- 1
b_2 <- 2
for ( j in length(1:N)){
v <- rnorm(j, mean = 0, sd_v)
u <- rnorm(j, mean = 0, sd_u)
w <- rnorm(j, mean = 0, sd_w)
X <- u + w
Y <- b_1 + b_2 * X + v
Reg <- lm(Y~X)
if (j==1) {
Coeffs <- matrix(Reg$coefficients)
} else {
Coeffs <- rbind(Coeffs, Reg$coefficients)
}
}
Coeffs <- as.data.frame(Coeffs)
return(Coeffs)
}
Deviation(100)
I always get the error that the variable Coeffs is not defined...
Thanks in advance!
As pointed out in the discussion, one possible solution is to change the length(1:N), to simply 1:Nas written below. This works for me.
Deviation <- function(N, sd_v = 1, sd_u = 1, sd_w = 1){
b_1 <- 1
b_2 <- 2
for ( j in 1:N){
v <- rnorm(j, mean = 0, sd_v)
u <- rnorm(j, mean = 0, sd_u)
w <- rnorm(j, mean = 0, sd_w)
X <- u + w
Y <- b_1 + b_2 * X + v
Reg <- lm(Y~X)
if (j==1) {
Coeffs <- matrix(Reg$coefficients)
} else {
Coeffs <- rbind(Coeffs, Reg$coefficients)
}
}
Coeffs <- as.data.frame(Coeffs)
return(Coeffs)
}
followed by...
Deviation(100)
Related
How to write a distribution of piecewise functions in R? For example, if a random variable X is a N(0,1) if p=1 and X~N(0,2) when p=0. I try the following code:
if(p==1)(X=rnorm(1,0,2))?
You can use ifelse:
X <- function(size){
ifelse(sample(0:1,size,replace = TRUE),rnorm(size,0,1),rnorm(size,0,2))
}
50% of the time (on average), X will sample from a N(0,1) variable and the other 50% of the time it will sample from N(0,2).
How it works can be seen more clearly if you change the definition of X so that the means of the two variables sampled from are different:
X <- function(size){
ifelse(sample(0:1,size,replace = TRUE),rnorm(size,0,1),rnorm(size,4,1))
}
Then hist(X(10000)) yields:
library(tidyverse)
#define the function pieces
g =function(x) rnorm(1,0,2)
h =function(x) rnorm(1,0,1)
#define the input
p = c(1,0,1,1,0)
#longer input
#p = sample(c(0,1),2000,replace = T)
piecewise_function= function(p) {
case_when( p==1 ~ g() , # a condition a tilde and a function
p==0 ~ h() ,
T ~ NA) #what to do if neither condition is met.
}
piecewise_function(p)
Try any of these where n is the number of sample size:
rnorm(n, 0, 1 * (p == 1) + 2 * (p == 0))
rnorm(n, 0, ifelse(p == 1, 1, 2))
rnorm(n, 0, 1 + !p)
I have an R programm for a regression that somehow gives me an error message that I do not understand. The regression model takes as input heat input heat data (Q_htg) and the corresponding temperature data (T_amb) and then builds a linear regression for those two variables. Afterwards I want to use the trained regression model to predict some outputs. Here is the code:
dalinearPowerScaling2.function <-
function(Dataset,
numberOfDaysForAggregation,
normOutsideTemperature) {
heatingPower <- Dataset$Q_htg
outSideTemperature <- Dataset$T_amb
aggregationLevel <- numberOfDaysForAggregation * 1440
index <- 0
meanValuesOutsideTemperature <-
vector(, length(outSideTemperature) / aggregationLevel)
for (i in seq(1, length(outSideTemperature), aggregationLevel)) {
sum <- 0
for (j in seq(i, i + aggregationLevel - 1, 1)) {
sum <- sum + outSideTemperature[j]
}
index <- index + 1
meanValuesOutsideTemperature[index] <- sum / aggregationLevel
}
index <- 0
meanValuesHeatingDemand <-
vector(, length(heatingPower) / aggregationLevel)
for (i in seq(1, length(heatingPower), aggregationLevel)) {
sum <- 0
for (j in seq(i, i + aggregationLevel - 1, 1)) {
sum <- sum + heatingPower[j]
}
index <- index + 1
meanValuesHeatingDemand[index] <- sum / aggregationLevel
}
linearModel <-
lm(meanValuesHeatingDemand ~ meanValuesOutsideTemperature)
abline(linearModel, col = "red")
pred <- predict(linearModel, data.frame(meanValuesOutsideTemperature = c(normOutsideTemperature)))
List<-list(meanValuesHeatingDemand, meanValuesOutsideTemperature)
List2 <- vector("list", length(heatingPower)/aggregationLevel)
for (i in seq(1, length(meanValuesHeatingDemand),1)){
List2 [[i]]<-c(meanValuesHeatingDemand[i], meanValuesOutsideTemperature[i])
}
List3<-List2[order(sapply(List2, function(x) x[1], simplify=TRUE), decreasing=FALSE)]
firstTemperatureWithHeatingDemand<-0
firstHeatingDemand<-0
for (i in seq(1, length(List3), 1)) {
if(List3[[i]][1]>0) {
firstTemperatureWithHeatingDemand<-List3[[i]][2]
firstHeatingDemand<-List3[[i]][1]
break}
}
regression2ValuesX <- vector(, 5)
regression2ValuesY <- vector(, 5)
regression2ValuesX [1] <- firstTemperatureWithHeatingDemand
regression2ValuesY [1] <-firstHeatingDemand
List3<-List2[order(sapply(List2, function(x) x[1], simplify=TRUE), decreasing=TRUE)]
for (i in seq(1, length(regression2ValuesX) - 1, 1)) {
regression2ValuesX[i + 1]<-List3[[i]][2]
regression2ValuesY[i + 1]<-List3[[i]][1]
}
plot(regression2ValuesX, regression2ValuesY)
linearModel2 <-
lm(regression2ValuesY ~ regression2ValuesX)
abline(linearModel2, col = "blue")
pred <- predict(linearModel2, data.frame(regression2ValuesX = c(normOutsideTemperature)))
paste("Predicted heating demand:", round(pred))
}
When I run with the command
linearPowerScaling2.function(data_heat_test, 1, -12)
I get the error message:
Error in int_abline(a = a, b = b, h = h, v = v, untf = untf, ...) :
plot.new has not been called yet
3.
int_abline(a = a, b = b, h = h, v = v, untf = untf, ...)
2.
abline(linearModel, col = "red") at LinearPowerScaling2_Function.R#33
1.
linearPowerScaling2.function(data_heat_test, 1, -12)
The data itself should be okay. Can anyone tell me, what the problem is?
Without reproducible minimal example it's hard to test if this solves it, but the error message tells you that you are calling abline() before calling plot().
That's exactly what happens on line 33...
Hope this helps.
Check here to see how to make a minimal reproducible example.
The output am trying for is to make a loop of (i, j, k) where i and k takes values [0, 5] and j from [0, 3]. The loop would run on values like:
(0, 0, 0)
(0, 0, 1)
(0, 0, 2)
(0, 0, 3)
(0, 0, 4)
(0, 0, 5)
(0, 1, 0)
(0, 1, 1)
(0, 1, 2)
.
.
.
(5, 3, 5)
Basically I want to run arima (p, d, q) model making loop and extract RMSE value from there.
The code for arima I tried is,
fit <- arima(df.train$Positive, order=c(0, 0, 0),include.mean = FALSE)
S <- as.data.frame(summary(fit))
S$RMSE
The "S$RMSE" gives the RMSE value.
But help me in running the loop of "order= c(i, j, k)" and get this RMSE value automatically.
The result I want is finally cbind these two and make a table like,
Order RMSE
(0, 0, 0) xxxx
(0, 0, 1) xxxx
(0, 0 ,2) xxxx
Without having access to your data, it's impossible to test if the following code solves your problem.
Try using the apply function to loop over the rows of a matrix defined for i, j, and k using expand.grid:
param_data <- expand.grid(i = 0:5, j = 0:3, k = 0:5)
param_data2 <- cbind(param_data,
apply(param_data, 1,
FUN = function(x){
fit <- arima(df.train$Positive,
order = x,
include.mean = FALSE)
S <- as.data.frame(summary(fit))
S$RMSE
})
)
This does not answer your exact question, however I believe you might use auto.arima function from forecast package, which can estimate best ARIMA model alone.
You can set max (p,q,d) values there as well.
Ugly, but easy solution - I would use triple for loop:
order <- c()
RMSEs <- c()
for (i in 1:5) {
for (j in 1:5) {
for (k in 1:5) {
order_temp <- sprintf('(%s, %s, %s)', i, j, k)
order <- c(order, order_temp)
fit <- arima(df.train$Positive, order=c(i, j, k),include.mean = FALSE)
S <- as.data.frame(summary(fit))
RMSEs <- c(RMSEs, S$RMSE)
}
}
}
result <- as.data.frame(order)
result$RMSE <- RMSEs
Code am using,
Demo <- read.csv("C:/UsersMP.csv", header = TRUE)
Dem <- data.frame(Demo)
smp_size <- floor(0.95 * nrow(Dem))
df.train <- Dem[1:smp_size, ]
df.test <- Dem[(smp_size+1):nrow(Dem), ]
fit <- arima(df.train$Positive, order=c(0, 0, 0),include.mean = FALSE)
S1 <- as.data.frame(summary(fit))
S1$RMSE
fit1 <- arima(df.train$Positive, order=c(0, 0, 1),include.mean = FALSE)
S2 <- as.data.frame(summary(fit))
S2$RMSE
fit2 <- arima(df.train$Positive, order=c(0, 0, 2),include.mean = FALSE)
S3 <- as.data.frame(summary(fit))
S3$RMSE
It is working properly in this. I just want to avoid this task of repeating this and form a loop of (i, j, k).
Exmaple your data
set.seed(1)
df.train = data.frame("Month/Year" = paste0(month.abb,"/",rep(12:18,each=12)), Positive = rnorm(84,5000,1500))
head(df.train)
Month.Year Positive
1 Jan/12 4060.319
2 Feb/12 5275.465
3 Mar/12 3746.557
4 Apr/12 7392.921
5 May/12 5494.262
6 Jun/12 3769.297
The order(p,I,q) of the arima model
library(gtools)
param = permutations(n=6,r=3,v=0:5,repeats.allowed=T)
param = cbind(param[param[,2] <= 3,],0)
colnames(param) <- c("p","I","q","RMSE")
param
function that calculates the RMSE of the adjusted arima model
library(forecast)
RMSE = function(param){
for(i in 1:nrow(param)){
s <-data.frame(summary(Arima(df.train$Positive, order=param[i,1:3],include.mean = FALSE, method = "ML")));
param[i,4] <- s$RMSE
}
return(param)
}
the result
result = RMSE(param)
head(result)
p I q RMSE
[1,] 0 0 0 5308.368
[2,] 0 0 1 3536.816
[3,] 0 0 2 2820.933
[4,] 0 0 3 2555.799
[5,] 0 0 4 2438.050
[6,] 0 0 5 2151.455
Note: For this case the best model is an ARIMA (4,1,5), according to the criteria of the RMSE
result[which(result[,4] == min(result[,4])),]
p I q RMSE
4.00 1.00 5.00 1226.28
I'm simulating another dataset here, and am stuck again!
Here's what I want to do:
200 observations, with 90 independent variables (mean 0, sd 1)
the equation to create y is: y = 2x_1 + ... + 2x_30 - x_31 - ... - x_60 + 0*x_61 + ... + 0*x_90 + mu
(In other words, the first 30 x values will have a coefficient of 2, next 30 values have coefficient of -1 and last 30 values have coefficient of 0). mu is also a random generated normal variable with mean 0, sd 10.
Here's what I have so far:
set.seed(11)
n <- 200
mu <- rnorm(200,0,10)
p1 <- for(i in 1:200){
rnorm(200,0,1)
}
p2 <- cbind(p1)
p3 <- for(i in 1:90){
if i<=30, y=2x
if i>30 & i<=60, y=-x
if i>60 & i<=90, y=0x
}
I'm still learning many aspects of R, so I'm pretty sure the code has much wrong with it, even in terms of syntax. Your help would really be appreciated!
Thanks!
Try
library(mvtnorm)
coefs <- rep(c(2, -1, 0), each=30)
mu <- rnorm(200, 0, 10)
m <- rep(0, 90) # mean of independent variables
sig <- diag(90) # cov of indep variables
x <- rmvnorm(200, mean=m, sigma=sig) # generates 200 observations from multivariate normal
y <- x%*%coefs + mu
In case, if you are not comfortable with linear-algebra
n <- 200
coefs <- rep(c(2, -1, 0), each=30)
mu <- rnorm(n, 0, 10)
x <- matrix(nrow=n, ncol=90) # initializes the indep.vars
for(i in 1:90){
x[, i] <- rnorm(200, 0, 1)
}
y <- rep(NA, n) # initializes the dependent vars
for(i in 1:n){
y[i] = sum(x[i,]*coefs) + m[i]
}
x[i,]*coefs gives exactly (2*x_1,..., 2*x_30, -x_31,...,- x_60,0*x_61,...,0*x_90) because * is element-wise operation.
You'd better learn the rudimentaries of R, before actually doing something with it.
I have this matrix calculations in my code that are taking a long time to run. So far the only way I can think of to speed is up is to use a foreach instead of a for loop, but I feel like there's more that can be done. Is there some way of vectorizing things or using an alternative to for loop that I'm missing out on?
Thanks!
require(foreach)
require(mvtnorm)
# some dummy input values
omega.input.jP <- matrix(rnorm(3000*5, 0.1, 0.1), 3000, 5)
nsteps.obs <- ncol(omega.input.jP)
sigma.j <- rnorm(3000, 0.02, 0.05)
rho1.j <- rnorm(3000, 0.8, 0.1)
rho2.j <- rnorm(3000, 0.05, 0.1)
y.lastobs <- 0.3
mu.input.jP <- matrix(NA, nrow(omega.input.jP), ncol(omega.input.jP))
# note: j is an index denoting sample number (here there are 3000 samples in total, and P denotes the time step (5 time steps here)
mu.input.jP <- foreach (j = 1:nrow(mu.input.jP), .combine = "rbind") %do% {
omega <- omega.input.jP[j, ]
Sigma.mu <- GetSigmaMu(nsteps = nsteps.obs, sigma_ar = sigma.j[j], rho1 = rho1.j[j], rho2 = rho2.j[j])
mu.input.P <- GetConditionalMu(omega = omega, Sigma.mu = Sigma.mu, y = y.lastobs)
return(mu.input.P)
}
GetSigmaMu <- function( # Get Sigma.mu, a \code{nsteps} x \code{nsteps} matrix, for AR(2) process
nsteps,
sigma_ar,
rho1,
rho2
) {
rho <- c(rho1, rho2)
cor <- ARMAacf(ar = rho, pacf = FALSE, lag.max = nsteps) # phi's, first element is phi0 = 1
var <- sigma_ar^2/(1 - sum(rho*cor[2:3])) # stationary variance # cor[2:3] gives first two phi's; cor[1] gives phi0 = 1 # change JR, 20140304
cov <- cor*var
Sigma.mu <- matrix(NA, nsteps, nsteps)
for (i in 1:nsteps) {
for (k in 1:nsteps) {
Sigma.mu[i,k] <- cov[abs(i-k)+1]
}
}
return(Sigma.mu)
}
GetConditionalMu <- function( # Get values of mu given y
omega,
Sigma.mu,
y,
method = "svd" # Method to get eigenvalues in matrix. Default method does not work, "svd" used instead.
) {
nsteps <- length(omega)
one <- rep(1, nsteps)
mean.mu.cond <- c(omega + (1/(sum(Sigma.mu)))*(Sigma.mu %*% one)*c(nsteps*y - t(one) %*% omega))
Sigma.mu.cond <- Sigma.mu - (1/(sum(Sigma.mu)))*(Sigma.mu %*% one %*% t(one) %*% Sigma.mu)
mu.cond <- rmvnorm(1, mean.mu.cond, Sigma.mu.cond, method = method)
return(mu.cond)
}