MGCV get design matrix - r

GAM regression with a spline basis is defined by the following cost function:
cost = ||y - S β||^2 + scale * ∫ |S''(x) β|^2 dx
where S is the design matrix defined by the spline basis.
In R I can compute gam with the following code:
library('mgcv')
data = data.frame('x'=c(1,2,3,4,5), 'y'=c(1,0,0,0,1))
g = gam(y ~ s(x, k = 4), family = 'binomial', data = data, scale = 0.5)
plot(g)
I would like to get the design matrix S that is generated by the s() function.
How can I do that?

I believe there are two ways to get the design matrix from a gamObject:
library('mgcv')
data <- data.frame('x'=c(1,2,3,4,5), 'y'=c(1,0,0,0,1))
g <- gam(y ~ s(x, k = 4), family = 'binomial', data = data, scale = 0.5)
plot(g)
(option1 <- predict(g, type = "lpmatrix"))
# (Intercept) s(x).1 s(x).2 s(x).3
# 1 1 1.18270529 -0.39063809 -1.4142136
# 2 1 0.94027407 0.07402655 -0.7071068
# 3 1 -0.03736554 0.32947477 0.0000000
# 4 1 -0.97272283 0.21209396 0.7071068
# 5 1 -1.11289099 -0.22495720 1.4142136
# attr(,"model.offset")
# [1] 0
(option2 <- model.matrix.gam(g))
# (Intercept) s(x).1 s(x).2 s(x).3
# 1 1 1.18270529 -0.39063809 -1.4142136
# 2 1 0.94027407 0.07402655 -0.7071068
# 3 1 -0.03736554 0.32947477 0.0000000
# 4 1 -0.97272283 0.21209396 0.7071068
# 5 1 -1.11289099 -0.22495720 1.4142136
# attr(,"model.offset")
# [1] 0
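As a quick sanity check (a small sketch reusing option1, option2 and g from above), the two options should return the same matrix, and multiplying that matrix by the coefficient vector should reproduce the linear predictor:
all.equal(option1, option2)            # should be TRUE
drop(option1 %*% coef(g))              # S %*% beta ...
as.numeric(predict(g, type = "link"))  # ... should match the linear predictor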

Related

How Do I Set.Seed for simulation in R to attain reproducibility on Windows OS

I have a simulation done with the below function in R:
## Load packages and prepare multicore process
library(forecast)
library(future.apply)
plan(multisession)
library(parallel)
library(foreach)
library(doParallel)
n_cores <- detectCores()
cl <- makeCluster(n_cores)
registerDoParallel(cores = detectCores())
set.seed(1)
bootstrap1 <- function(n, phi){
  ts <- arima.sim(n, model = list(ar = phi, order = c(1, 1, 0)), sd = 1)
  #ts <- numeric(n)
  #ts[1] <- rnorm(1)
  #for(i in 2:length(ts))
  #  ts[i] <- 2 * ts[i - 1] + rnorm(1)
  ########################################################
  ## create a vector of block sizes
  t <- length(ts)      # the length of the time series
  lb <- seq(n - 2) + 1 # vector of block sizes 1 < l < n (i.e. between 1 and n exclusively)
  ########################################################
  ## This section creates a matrix to store the block results
  BOOTSTRAP <- matrix(nrow = 1, ncol = length(lb))
  colnames(BOOTSTRAP) <- lb
  #BOOTSTRAP <- list(length(lb))
  ########################################################
  ## This section uses foreach to do the work inside the braces
  BOOTSTRAP <- foreach(b = 1:length(lb), .combine = 'cbind') %dopar% {
    l <- lb[b]          # block size at each instance
    m <- ceiling(t / l) # number of blocks
    blk <- split(ts, rep(1:m, each = l, length.out = t)) # divides the series into blocks
    ######################################################
    res <- sample(blk, replace = TRUE, 1000)     # resamples the blocks
    res.unlist <- unlist(res, use.names = FALSE) # unlists the bootstrap series
    train <- head(res.unlist, round(length(res.unlist) - 10))    # train set
    test <- tail(res.unlist, length(res.unlist) - length(train)) # test set
    nfuture <- forecast::forecast(train, model = forecast::auto.arima(train),
                                  lambda = 0, biasadj = TRUE, h = length(test))$mean # forecast of the test set
    RMSE <- Metrics::rmse(test, nfuture) # return RMSE
    BOOTSTRAP[b] <- RMSE
  }
  BOOTSTRAPS <- matrix(BOOTSTRAP, nrow = 1, ncol = length(lb))
  colnames(BOOTSTRAPS) <- lb
  BOOTSTRAPS
  return(list("BOOTSTRAPS" = BOOTSTRAPS))
}
I use a for loop to print its result three times.
for (i in 1:3) {
  set.seed(1)
  print(bootstrap1(10, 0.5))
}
I have the below result:
## 2 3 4 5 6 7 8 9
##[1,] 1.207381 1.447382 1.282099 0.9311434 0.8481634 1.006494 0.9829584 1.205194
## 2 3 4 5 6 7 8 9
##[1,] 1.404846 1.262756 1.50738 1.188452 0.8981125 1.001651 1.349721 1.579556
## 2 3 4 5 6 7 8 9
##[1,] 1.265196 1.080703 1.074807 1.430653 0.9166268 1.12537 0.9492137 1.201763
If I run this several times, I get different results each time.
I want to set the seed so that the three rounds within the loop are distinct from each other, but re-running the loop with the same seed reproduces the same three distinct results.
We could specify the kind argument in set.seed. If we do this inside the loop, every iteration returns the same values:
for (i in 1:3) {
  set.seed(1, kind = "L'Ecuyer-CMRG")
  print(bootstrap1(10, 0.5))
}
#$BOOTSTRAPS
# 2 3 4 5 6 7 8 9
#[1,] 4.189426 6.428085 3.672116 3.893026 2.685741 3.821201 3.286509 4.062811
#$BOOTSTRAPS
# 2 3 4 5 6 7 8 9
#[1,] 4.189426 6.428085 3.672116 3.893026 2.685741 3.821201 3.286509 4.062811
#$BOOTSTRAPS
# 2 3 4 5 6 7 8 9
#[1,] 4.189426 6.428085 3.672116 3.893026 2.685741 3.821201 3.286509 4.062811
If the intention is to return different values for each iteration of the for loop, while getting the same results on subsequent runs, call set.seed outside the loop:
1) First run
set.seed(1, kind = "L'Ecuyer-CMRG")
for (i in 1:3) {
  print(bootstrap1(10, 0.5))
}
#$BOOTSTRAPS
# 2 3 4 5 6 7 8 9
#[1,] 4.189426 6.428085 3.672116 3.893026 2.685741 3.821201 3.286509 4.062811
#$BOOTSTRAPS
# 2 3 4 5 6 7 8 9
#[1,] 1.476428 1.806258 2.071091 2.09906 2.014298 1.032776 2.573738 1.831142
#$BOOTSTRAPS
# 2 3 4 5 6 7 8 9
#[1,] 2.248546 1.838302 2.345557 1.696614 2.06357 1.502569 1.912556 1.906049
2) Second run
set.seed(1, kind = "L'Ecuyer-CMRG")
for (i in 1:3) {
  print(bootstrap1(10, 0.5))
}
#$BOOTSTRAPS
# 2 3 4 5 6 7 8 9
#[1,] 4.189426 6.428085 3.672116 3.893026 2.685741 3.821201 3.286509 4.062811
#$BOOTSTRAPS
# 2 3 4 5 6 7 8 9
#[1,] 1.476428 1.806258 2.071091 2.09906 2.014298 1.032776 2.573738 1.831142
#$BOOTSTRAPS
# 2 3 4 5 6 7 8 9
#[1,] 2.248546 1.838302 2.345557 1.696614 2.06357 1.502569 1.912556 1.906049
According to ?set.seed
"L'Ecuyer-CMRG": -
A ‘combined multiple-recursive generator’ from L'Ecuyer (1999), each element of which is a feedback multiplicative generator with three integer elements: thus the seed is a (signed) integer vector of length 6. The period is around 2^191. The 6 elements of the seed are internally regarded as 32-bit unsigned integers. Neither the first three nor the last three should be all zero, and they are limited to less than 4294967087 and 4294944443 respectively. This is not particularly interesting of itself, but provides the basis for the multiple streams used in package parallel.
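Since this generator is what provides the multiple streams used by package parallel, here is a minimal sketch (independent of the bootstrap code above) of how the same idea makes draws on a PSOCK cluster reproducible:
library(parallel)
cl <- makeCluster(2)
clusterSetRNGStream(cl, iseed = 1)   # give each worker its own L'Ecuyer-CMRG stream
res1 <- parSapply(cl, 1:4, function(i) rnorm(1))
clusterSetRNGStream(cl, iseed = 1)   # resetting the streams reproduces the draws
res2 <- parSapply(cl, 1:4, function(i) rnorm(1))
identical(res1, res2)                # should be TRUE
stopCluster(cl)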

Cramer's V with missing values gives different results

My questions concern the calculation of Cramer's V to detect association between categorical variables. I've got a dataset with missing values, but I created a fake dataset for illustration with two variables a and b, both containing NA's.
a <- factor(c("M","","F","F","","M","F","F"))
a2 <- factor(a, levels = c('M','F'),labels =c('Male','Female'))
b <- factor(c("y","y","","y","n","n","n","y"))
b2 <- factor(b, levels=c("y","n"),labels=c("yes","no"))
df<-cbind(a2,b2)
The assocstats function gives me the Cramer's V:
require(vcd)
> tab <-table(a,b)
> assocstats(tab)
X^2 df P(> X^2)
Likelihood Ratio 1.7261 4 0.78597
Pearson 1.3333 4 0.85570
Phi-Coefficient : 0.408
Contingency Coeff.: 0.378
Cramer's V : 0.289
Now I want to drop the NA's from the levels
a[a==""]<-NA
a3 <- droplevels(a)
levels(a3)
tab <-table(a,b)
assocstats(tab)
But every time I remove the NA's, the result looks like this:
X^2 df P(> X^2)
Likelihood Ratio 0.13844 2 0.93312
Pearson NaN 2 NaN
Phi-Coefficient : NaN
Contingency Coeff.: NaN
Cramer's V : NaN
Also, because I have a large dataset, I would like to calculate a matrix of Cramer's V results. I found this code here on Stack Overflow and it seems to work...
get.V <- function(y){
  col.y <- ncol(y)
  V <- matrix(ncol = col.y, nrow = col.y)
  for(i in 1:col.y){
    for(j in 1:col.y){
      V[i,j] <- assocstats(table(y[,i], y[,j]))$cramer
    }
  }
  return(V)
}
get.V(tab)
Only the result is different from that of the assocstats function:
[,1] [,2] [,3]
[1,] 1.0 0.5 1
[2,] 0.5 1.0 1
[3,] 1.0 1.0 1
This cannot be right, because I get this result every time, even when changing the number of observations... What is wrong with this code?
Conclusion: I don't know which of the results is right. I have a large dataset with a lot of NA's in it. The first assocstats result and the code give different results, although there is no big difference, because the code only creates a matrix. The second assocstats call gives only NaN. I can't detect any errors... Can somebody help me?
You don't have to replace the "" with NA if you are using factors: any value that you don't include in levels will be converted to NA by factor.
a <- factor(c("M","","F","F","","M","F","F"))
a2 <- factor(a, levels = c('M','F'),labels =c('Male','Female'))
a
# [1] M F F M F F
# Levels: F M
a2
# [1] Male <NA> Female Female <NA> Male Female Female
# Levels: Male Female
b <- factor(c("y","y","","y","n","n","n","y"))
b2 <- factor(b, levels=c("y","n"),labels=c("yes","no"))
(df <- cbind(a2,b2))
# a2 b2
# [1,] 1 1
# [2,] NA 1
# [3,] 2 NA
# [4,] 2 1
# [5,] NA 2
# [6,] 1 2
# [7,] 2 2
# [8,] 2 1
Above, you're creating a matrix which loses all the labels that you created with factor. I think you want a data frame:
(df <- data.frame(a2,b2))
# a2 b2
# 1 Male yes
# 2 <NA> yes
# 3 Female <NA>
# 4 Female yes
# 5 <NA> no
# 6 Male no
# 7 Female no
# 8 Female yes
require('vcd')
(tab <- table(a2,b2, useNA = 'ifany'))
# b2
# a2 yes no <NA>
# Male 1 1 0
# Female 2 1 1
# <NA> 1 1 0
(tab <- table(a2,b2))
# b2
# a2 yes no
# Male 1 1
# Female 2 1
You need to explicitly tell table if you want to see NA values in the table. Otherwise, it will drop them by default so that you are already "excluding" them when you use assocstats:
assocstats(tab)
# X^2 df P(> X^2)
# Likelihood Ratio 0.13844 1 0.70983
# Pearson 0.13889 1 0.70939
#
# Phi-Coefficient : 0.167
# Contingency Coeff.: 0.164
# Cramer's V : 0.167
For get.V just pass the data frame or matrix, not the table:
get.V <- function(y) {
  col.y <- ncol(y)
  V <- matrix(ncol = col.y, nrow = col.y)
  for(i in 1:col.y){
    for(j in 1:col.y){
      V[i,j] <- assocstats(table(y[,i], y[,j]))$cramer
    }
  }
  return(V)
}
get.V(df)
# [,1] [,2]
# [1,] 1.0000000 0.1666667
# [2,] 0.1666667 1.0000000
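If you want to verify the value by hand, Cramer's V for an r x c table is sqrt(X^2 / (n * (min(r, c) - 1))), where X^2 is the Pearson chi-squared statistic and n is the number of observations. A quick manual check (a sketch reusing tab from above; correct = FALSE turns off the continuity correction so it matches assocstats):
chi2 <- suppressWarnings(chisq.test(tab, correct = FALSE))$statistic
sqrt(chi2 / (sum(tab) * (min(dim(tab)) - 1)))
# should match assocstats(tab)$cramer (0.167 here)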

Is there a way to change the way R labels the interaction parameter in model output?

I am having a seemingly simple but very frustrating problem. When you run a model with an interaction term in R, R names the parameter generated "var1:var2" etc. Unfortunately, this naming convention prevents me from calculating predicted values and CI's where newdata is required, because ":" is not a character that can be included in a column header, and the names in the original data frame must exactly match those in newdata. Has anyone else had this problem?
Here is a sample of my code:
wemedist2.exp = glm(survive/trials ~ sitedist + type + sitedist*type + roaddist, family = binomial(logexp(wemedata$expos)), data=wemedata)
summary(wemedist2.exp)
wemepredict3 = with(wemedata, data.frame(sitedist=mean(sitedist),roaddist=mean(roaddist), type=factor(1:2)))
wemepredict3 = cbind(wemepredict3, predict(wemedist2.exp, newdata = wemepredict3, type = "link", se = TRUE))
This produces a table with predicted values for each of the variables at the specified levels, but not the interaction.
For your newdata data frame, you shouldn't include columns for the interactions. The product of the interactive variables will be calculated for you (and multiplied by the estimated coefficient) when calling predict.
For example:
Create some dummy data:
set.seed(1)
n <- 10000
X <- data.frame(x1=runif(n), x2=runif(n))
X$x1x2 <- X$x1 * X$x2
head(X)
# x1 x2 x1x2
# 1 0.2655087 0.06471249 0.017181728
# 2 0.3721239 0.67661240 0.251783646
# 3 0.5728534 0.73537169 0.421260147
# 4 0.9082078 0.11129967 0.101083225
# 5 0.2016819 0.04665462 0.009409393
# 6 0.8983897 0.13091031 0.117608474
b <- runif(4)
y <- b[1] + c(as.matrix(X) %*% b[-1]) + rnorm(n, sd=0.1)
Fit the model and compare the estimated vs. true coefficients:
M <- lm(y ~ x1 * x2, X)
summary(M)
# Call:
# lm(formula = y ~ x1 * x2, data = X)
#
# Residuals:
# Min 1Q Median 3Q Max
# -0.43208 -0.06743 -0.00170 0.06601 0.37197
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.202040 0.003906 51.72 <2e-16 ***
# x1 0.128237 0.006809 18.83 <2e-16 ***
# x2 0.156942 0.006763 23.21 <2e-16 ***
# x1:x2 0.292582 0.011773 24.85 <2e-16 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 0.09906 on 9996 degrees of freedom
# Multiple R-squared: 0.5997, Adjusted R-squared: 0.5996
# F-statistic: 4992 on 3 and 9996 DF, p-value: < 2.2e-16
b
# [1] 0.2106027 0.1147864 0.1453641 0.3099322
Create example data to predict to, and do prediction. Note that we only create x1 and x2, and do not create x1:x2:
X.predict <- data.frame(x1=runif(10), x2=runif(10))
head(X.predict)
# x1 x2
# 1 0.26037592 0.7652155
# 2 0.73988333 0.3352932
# 3 0.02650689 0.9788743
# 4 0.84083874 0.1446228
# 5 0.85052685 0.7674547
# 6 0.13568509 0.9612156
predict(M, newdata=X.predict)
# 1 2 3 4 5 6 7
# 0.4138194 0.4221251 0.3666572 0.3681432 0.6225354 0.4084543 0.4711018
# 8 9 10
# 0.7092744 0.3401867 0.2320834
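To see that predict really does form the product for you, the same numbers can be obtained by building the rows of the design matrix by hand (a quick check reusing M and X.predict from above):
Xp <- cbind(1, X.predict$x1, X.predict$x2, X.predict$x1 * X.predict$x2)
drop(Xp %*% coef(M))   # should match predict(M, newdata = X.predict)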
Or...
An alternative approach is to include the interactions in your model-fitting data by calculating the product of the interactive terms, and then include this in your new data as well. We've done the first step in point 1 above, where we created a column called x1x2.
Then we would fit the model with: lm(y ~ x1 + x2 + x1x2, X)
And predict to the following data:
X.predict <- data.frame(x1=runif(10), x2=runif(10))
X.predict$x1x2 <- X.predict$x1 * X.predict$x2
If you have categorical variables involved in interactions...
When you have interactions involving categorical variables, the model estimates coefficients describing the effect of belonging to each level relative to belonging to a reference level. So for instance if we have one continuous predictor (x1) and one categorical predictor (x2, with levels a, b, and c), then the model y ~ x1 * x2 will estimate six coefficients, describing:
the intercept (i.e. the predicted y when x1 is zero and the observation belongs to the reference level of x2);
the effect of varying x1 when the observation belongs to the reference level of x2 (i.e. the slope, for the reference level of x2);
the effect of belonging to the second level (i.e. the change in intercept due to belonging to the second level, relative to belonging to the reference level);
the effect of belonging to the third level (i.e. the change in intercept due to belonging to the third level, relative to belonging to the reference level);
the change in the effect of x1 (i.e. change in slope) due to belonging to the second level, relative to belonging to the reference level; and
the change in the effect of x1 (i.e. change in slope) due to belonging to the third level, relative to belonging to the reference level.
If you want to fit and predict the model with/to pre-calculated data describing the interaction, you can create a dataframe that includes columns: x1; x2b (binary, indicating whether the observation belongs to level b); x2c (binary, indicating whether the observation belongs to level c); x1x2b (the product of x1 and x2b); and x1x2c (the product of x1 and x2c).
A quick way to do this is with model.matrix:
set.seed(1)
n <- 1000
d <- data.frame(x1=runif(n), x2=sample(letters[1:3], n, replace=TRUE))
head(d)
# x1 x2
# 1 0.2655087 b
# 2 0.3721239 c
# 3 0.5728534 b
# 4 0.9082078 c
# 5 0.2016819 a
# 6 0.8983897 a
X <- model.matrix(~x1*x2, d)
head(X)
# (Intercept) x1 x2b x2c x1:x2b x1:x2c
# 1 1 0.2655087 1 0 0.2655087 0.0000000
# 2 1 0.3721239 0 1 0.0000000 0.3721239
# 3 1 0.5728534 1 0 0.5728534 0.0000000
# 4 1 0.9082078 0 1 0.0000000 0.9082078
# 5 1 0.2016819 0 0 0.0000000 0.0000000
# 6 1 0.8983897 0 0 0.0000000 0.0000000
b <- rnorm(6) # coefficients
y <- X %*% b + rnorm(n, sd=0.1)
You can rename the columns of X to whatever you want, as long as you use consistent naming when predicting the model to new data later.
Now fit the model. Here I tell lm not to calculate an intercept (with -1), since the variable (Intercept) already exists in X and will have a coefficient calculated for it. We could have also done this by fitting to data as.data.frame(X[, -1]):
(M <- lm(y ~ . - 1, as.data.frame(X)))
# Call:
# lm(formula = y ~ . - 1, data = as.data.frame(X))
#
# Coefficients:
# `(Intercept)` x1 x2b x2c `x1:x2b` `x1:x2c`
# 1.14389 1.09168 -0.88879 0.20405 0.09085 -1.63769
Create some new data to predict to, and carry out the prediction:
d.predict <- expand.grid(x1=seq(0, 1, 0.1), x2=letters[1:3])
X.predict <- model.matrix(~x1*x2, d.predict)
y.predict <- predict(M, as.data.frame(X.predict))
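As a sanity check, these predictions should agree with fitting the same model through the usual formula interface (a small sketch reusing d, y and d.predict from above):
d$y <- as.numeric(y)
M2 <- lm(y ~ x1 * x2, data = d)
all.equal(unname(y.predict), unname(predict(M2, d.predict)))   # should be TRUE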

Using mice in R changes dummy coding

I'm trying to use the mice package in R for a project and discovered that the pooled results seem to change the dummy coding I had for one of the variables in the output.
To elaborate, let's say I have a factor, foo, with two levels: 0 and 1. Using a regular lm would typically yield an estimate for foo1. Using mice and the pool function, however, yields an estimate for foo2. I included a reproducible example below using the nhanes dataset from the mice package. Any ideas why this might be occurring?
require(mice)
# Create age as: 0, 1, 2
nhanes$age <- as.factor(nhanes$age - 1)
head(nhanes)
# age bmi hyp chl
# 1 0 NA NA NA
# 2 1 22.7 1 187
# 3 0 NA 1 187
# 4 2 NA NA NA
# 5 0 20.4 1 113
# 6 2 NA NA 184
# Use a regular lm with missing data just to see output
# age1 and age2 come up as expected
lm(chl ~ age + bmi, data = nhanes)
# Call:
# lm(formula = chl ~ age + bmi, data = nhanes)
# Coefficients:
# (Intercept) age1 age2 bmi
# -28.948 55.810 104.724 6.921
imp <- mice(nhanes)
str(complete(imp)) # still the same coding
fit <- with(imp, lm(chl ~ age + bmi))
pool(fit)
# Now the estimates are for age2 and age3
# Call: pool(object = fit)
# Pooled coefficients:
# (Intercept) age2 age3 bmi
# 29.88431 43.76159 56.57606 5.05537
Apparently the mice function sets contrasts for factors. So you get the following (check out the column names):
contrasts(nhanes$age)
## 1 2
## 0 0 0
## 1 1 0
## 2 0 1
contrasts(imp$data$age)
## 2 3
## 0 0 0
## 1 1 0
## 2 0 1
You can just change the contrasts of the imputed data, then you get the same dummy coding:
imp <- mice(nhanes)
contrasts(imp$data$age) <- contrasts(nhanes$age)
fit <- with(imp, lm(chl ~ age + bmi))
pool(fit)
## Call: pool(object = fit)
##
## Pooled coefficients:
## (Intercept) age1 age2 bmi
## 0.9771566 47.6351257 63.1332336 6.2589887
##
## Fraction of information about the coefficients missing due to nonresponse:
## (Intercept) age1 age2 bmi
## 0.3210118 0.5554399 0.6421063 0.3036489
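A quick way to confirm the fix (reusing the objects above) is to compare the contrast matrices again; after the assignment they should be identical, column names included:
identical(contrasts(nhanes$age), contrasts(imp$data$age))  # should be TRUE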

Constructing derivative of a conditional density

I am using npcdens from the np package to construct a conditional density of y on covariates x. However, I need the derivative of the log of this density with respect to y. Is there some way in R to get this?
bw <- npcdensbw(formula=y ~ x1+x2+x3)
fhat <- npcdens(bws=bw,gradients=TRUE)
grad.fhat <- gradients(npcdens(bws=bw,gradients=TRUE))
which returns the gradient with respect to x1, x2 and x3
Can we use this example dataset?
dta = data.frame(expand.grid(x1=1:5,x2=2:6,x3=5:10))
dta$y = with(dta,x1+2*x2 + 3*x3^2)
head(dta)
x1 x2 x3 y
1 1 2 5 80
2 2 2 5 81
3 3 2 5 82
4 4 2 5 83
5 5 2 5 84
6 1 3 5 82
y is the value of the "density". Estimate a conditional bandwidth object:
bw <- npcdensbw(formula = y ~ x1+x2+x3,data=dta)
and look at the gradients
head(gradients(npcdens(bws=bw,gradients=TRUE)))
[,1] [,2] [,3]
[1,] -2.024422e-15 -2.048994e-50 -1.227563e-294
[2,] -1.444541e-15 -1.994174e-50 -1.604693e-294
[3,] -1.017979e-31 -1.201719e-50 -1.743784e-294
[4,] 1.444541e-15 -6.753912e-64 -1.604693e-294
[5,] 2.024422e-15 1.201719e-50 -1.227563e-294
[6,] -2.024422e-15 -3.250713e-50 -1.227563e-294
What do you mean by "derivative with respect to y"? This is a function g(x1,x2,x3), so you can only take derivatives w.r.t. those 3 dimensions. Concerning the "log of y" part of your question, could this be it?
bw <- npcdensbw(formula = log(y) ~ x1 + x2 + x3,data=dta)
I've never used this package, so these are the thoughts of a non-practitioner. I guess you looked at the examples in help(npcdensbw)?
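If what is wanted really is the derivative of log f(y | x1, x2, x3) with respect to y (the conditional density is a function of y as well as of the covariates), one rough option is a central finite difference, evaluating the fitted density at y ± h. A sketch, assuming (as the np documentation describes for its formula interface) that npcdens accepts a newdata argument for evaluation:
h <- 0.1  # step size; choose relative to the scale of y
f_hi <- fitted(npcdens(bws = bw, newdata = transform(dta, y = y + h)))
f_lo <- fitted(npcdens(bws = bw, newdata = transform(dta, y = y - h)))
dlogf_dy <- (log(f_hi) - log(f_lo)) / (2 * h)  # numerical d/dy of log f(y | x)
head(dlogf_dy)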
