Different imputed values for the same variable despite the same predictor variables in mice (R)

I would expect the imputed values of x to be the same if the same predictor variables are used, regardless of whether other variables are also imputed, but that's not the case, as reproduced here:
library(data.table)
library(robustlmm)
library(mice)
library(miceadds)
library(magrittr)
library(dplyr)
library(tidyr)
set.seed(1)
# Data ------------------------------------
dt1 <- data.table(id = rep(1:10, each = 3),
                  group = rep(1:2, each = 15),
                  time = rep(1:3, 10),
                  sex = rep(sample(c("F", "M"), 10, replace = TRUE), each = 3),
                  x = rnorm(30),
                  y = rnorm(30),
                  z = rnorm(30))
setDT(dt1)[id %in% sample(1:10, 4) & time == 2, `:=` (x = NA, y = NA)][
           id %in% sample(1:10, 4) & time == 3, `:=` (x = NA, y = NA)]
dt2 <- dt1 %>% group_by(id) %>% fill(y) %>% ungroup %>% as.data.table
# MI 1 ------------------------------------
pm1 <- make.predictorMatrix(dt1)
pm1['x',c('y','z')] <- 0
pm1[c('x','y'), 'id'] <- -2
imp1 <- mice(dt1, pred = pm1, meth = "2l.pmm", seed = 1, m = 2, print = FALSE, maxit = 20)
# warns "boundary (singular) fit: see ?isSingular" - not sure how to interpret this (it doesn't occur with my real data)
View(complete(imp1, 'long'))
# MI 2 ------------------------------------
pm2 <- make.predictorMatrix(dt2)
pm2['x',c('y','z')] <- 0
pm2['x', 'id'] <- -2
imp2 <- mice(dt2, pred = pm2, meth = "2l.pmm", seed = 1, m = 2, print = FALSE, maxit = 20, remove.constant = FALSE)
# imp2$loggedEvents reports sex as constant (not sure why), so I set remove.constant = FALSE to keep that variable (this doesn't occur with my real data)
View(complete(imp2, 'long'))
In imp1:
group, time and sex are used to predict x
group, time, sex, x and z are used to predict y
In imp2:
group, time and sex are used to predict x
y is complete, so no imputation is performed for this variable
Given this, why are the imputed values of x different between the two runs?
Is it the expected behavior?
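To rule out a difference in the predictor matrices themselves, the rows for x can be compared directly (a quick sanity check using the objects above; mids objects store the matrix actually used in $predictorMatrix and the imputed draws in $imp):
# the predictor row for x should be identical in both runs
all.equal(pm1['x', ], pm2['x', ])
# the imputed values for x can then be compared side by side
imp1$imp$x
imp2$imp$x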
Thank you!

Predicting from glmmTMB with truncated counts

I'm running a glmmTMB model with various truncated count distributions (truncated_poisson, truncated_compois, truncated_nbinom1, truncated_nbinom2). When I predict from the model, the values seem to be lower than expected, as if the prediction is not accounting for the truncation. Where am I going wrong? A toy example is provided, showing that predicted values are lower than observed means.
Any advice would be appreciated. Extra points if the advice can extend to the other truncated count distributions (see above) and if it shows how to correctly get the 95% confidence band around the estimated values in these cases.
library(dplyr)
library(extraDistr)
library(glmmTMB)
set.seed(1)
df <- data.frame(Group = rep(c("a", "b"), each = 20),
                 N = rtpois(40, 1, a = 0),
                 ran = "a") %>%
  mutate(N = ifelse(N == 0, 1, N))
m <- glmmTMB(N ~ Group + (1 | ran), data = df, family = "truncated_poisson")
df %>% group_by(Group) %>% summarize(mean(N))
predict(m, newdata = data.frame(Group = c("a", "b"), ran = NA), type = "response")
I think the main issue is probably that you're using a slightly older version of glmmTMB (< 1.1.5, where a bug was fixed; see e.g. https://github.com/glmmTMB/glmmTMB/issues/860).
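A quick way to check the installed version (an added check; package_version objects compare directly against version strings):
# predictions for truncated families were fixed in glmmTMB 1.1.5
packageVersion("glmmTMB") >= "1.1.5"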
Sample data
Streamlined slightly (we don't need to include a random effect for this example), with a truncated nbinom2 response added.
library(dplyr)
library(extraDistr)
library(glmmTMB)
set.seed(1)
df <- data.frame(Group = rep(c("a", "b"), each = 20),
                 Np = rtpois(40, 1, a = 0))
## clunky truncated-nbinom generator: redraw zeros until none remain
tnb <- rep(0, 40)
z <- (tnb == 0)
while (any(z)) {
  tnb[z] <- rnbinom(sum(z), mu = 1, size = 1)
  z <- (tnb == 0)
}
df$Nnb <- tnb
## summarize
df %>% group_by(Group) %>% summarize(across(starts_with("N"), mean))
##   Group    Np   Nnb
## 1     a  1.75  1.80
## 2     b  1.45  2.35
Fit models
m1 <- glmmTMB(Np ~ Group, data = df, family = "truncated_poisson")
m2 <- update(m1, Nnb ~ ., family = truncated_nbinom2)
Predicting with se.fit = TRUE gives you standard errors for the predictions, from which you can compute confidence intervals (assuming Normality, i.e. Wald intervals, etc.):
pfun <- function(m, level = 0.95) {
  pp <- predict(m, newdata = data.frame(Group = c("a", "b")),
                type = "response",
                se.fit = TRUE)
  list(est = unname(pp$fit),
       lwr = unname(pp$fit + qnorm((1 - level) / 2) * pp$se.fit),
       upr = unname(pp$fit + qnorm((1 + level) / 2) * pp$se.fit))
}
pfun(m1)
pfun(m2)
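As a sanity check (an added note, not part of the original answer): the mean of a zero-truncated Poisson with rate mu is mu / (1 - exp(-mu)), so response-scale predictions should sit near that value rather than near mu itself:
mu <- 1
mu / (1 - exp(-mu))            # theoretical truncated mean, ~1.58
mean(rtpois(1e5, mu, a = 0))   # simulated mean, should agree closely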

Using lmer predict in data.table

library(lme4)
library(data.table)
library(dplyr)
d <- data.frame(x = rep(1:100, times = 3),
                y = NA,
                grp = rep(1:3, each = 100))
d$y[d$grp == 1] <- 1:100 + rnorm(100)
d$y[d$grp == 2] <- 1:100 * 1.5 + rnorm(100)
d$y[d$grp == 3] <- 1:100 * 0.5 + rnorm(100)
fit <- lmer(y ~ x + (x | grp), data = d)
new.data <- data.frame(x = 1:100, grp = rep(1:3, each = 100))
new.data1 <- new.data %>% dplyr::mutate(grp = 1)
new.data2 <- new.data %>% dplyr::mutate(grp = 3)
temp <- new.data %>%
  dplyr::mutate(predV1 = predict(fit, newdata = new.data1, allow.new.levels = TRUE),
                predV2 = predict(fit, newdata = new.data2, allow.new.levels = TRUE))
My actual new.data has many more predictors and groups, and many more observations to predict on (~10000 rows), so the dplyr solution above takes around 34 seconds.
I wondered if the lmer predict function can be used with data.table to speed it up.
If you want a similar approach in data.table, convert to data.table (setDT) and assign (:=) the output of predict to the new columns 'predV1' and 'predV2':
library(data.table)
setDT(new.data)[, c('predV1', 'predV2') :=
                  .(predict(fit, newdata = new.data1, allow.new.levels = TRUE),
                    predict(fit, newdata = new.data2, allow.new.levels = TRUE))]
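Note that most of the runtime is spent inside predict() itself, so switching the assignment layer from dplyr to data.table mainly changes syntax, not speed (an added observation, easy to verify with a quick timing):
# time a single predict() call in isolation; it dominates either approach
system.time(predict(fit, newdata = new.data1, allow.new.levels = TRUE))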

R - how to get coefficients for each column ~ timeline from a "spread" matrix?

I want to collect the linear regression coefficients for each column ~ ind.
Here is my data:
temp <- data.frame(
  ind = c(1:10),
  `9891` = runif(10, 15, 75),
  `7891` = runif(10, 15, 75),
  `5891` = runif(10, 15, 75)
)
I tried
result <- data.frame()
cols <- colnames(temp)[-1]
for (code in cols) {
  fit <- lm(temp[, code] ~ temp$ind)
  coef <- coef(fit)['ind']
  result$ind <- code
  result$coef <- coef
}
But this doesn't work.
Can anyone fix my method or provide a better solution?
Also, I was wondering if lapply() and summarise_at() can do the work.
Thank you!
Here is a summarise_at option
temp %>%
  summarise_at(vars(-contains("ind")), list(coef = ~list(lm(. ~ ind)$coef))) %>%
  unnest()
#  X9891_coef X7891_coef X5891_coef
#1  25.927946 52.5668120  35.152330
#2   2.459137  0.3158741   1.013678
The first row gives the intercepts and the second row the slope coefficients.
Or to extract only the slope coefficient and store the result in a long data.frame
temp %>%
  summarise_at(vars(-contains("ind")), list(coef = ~list(lm(. ~ ind)$coef[2]))) %>%
  unnest() %>%
  stack() %>%
  setNames(c("slope", "column"))
# slope column
# 1 2.4591375 X9891_coef
# 2 0.3158741 X7891_coef
# 3 1.0136783 X5891_coef
PS. It's always good practice to include a fixed random seed when working with random data to ensure reproducibility of results.
Sample data
set.seed(2018)
temp <- data.frame(
  ind = c(1:10),
  `9891` = runif(10, 15, 75),
  `7891` = runif(10, 15, 75),
  `5891` = runif(10, 15, 75)
)
You can use sapply
sapply(temp[-1], function(x) coef(lm(x ~ temp$ind))[2])
#X9891.temp$ind X7891.temp$ind X5891.temp$ind
# -0.01252979 -2.94773367 2.57816244
To get the final data frame, you could do
data.frame(ind = names(temp)[-1],
           coef = sapply(temp[-1], function(x) coef(lm(x ~ temp$ind))[2]),
           row.names = NULL)
# ind coef
#1 X9891 -0.01252979
#2 X7891 -2.94773367
#3 X5891 2.57816244
where each row holds the slope coefficient for one column.
data
set.seed(1234)
temp <- data.frame(
  ind = c(1:10),
  `9891` = runif(10, 15, 75),
  `7891` = runif(10, 15, 75),
  `5891` = runif(10, 15, 75)
)
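Since the question also asks about lapply(), here is a minimal sketch along the same lines (an addition; it assumes the seeded temp above, whose numeric names data.frame() mangles to X9891 etc. via check.names):
cols <- setdiff(names(temp), "ind")
slopes <- lapply(cols, function(nm) unname(coef(lm(temp[[nm]] ~ temp$ind))[2]))
data.frame(ind = cols, coef = unlist(slopes))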

Adapting the meansd moderator option in sjPlot interaction

I am using the sjp.int function from sjPlot to plot an interaction from an lme model.
The options for the moderator values (mdrt.values) are means +/- sd, quartiles, all, and max/min. Is there a way to plot the mean +/- 2 sd?
Typically it would be like this:
model <- lme(outcome ~ var1+var2*time, random=~1|ID, data=mydata, na.action="na.omit")
sjp.int(model, show.ci=T, mdrt.values="meansd")
Many thanks
Reproducible example:
#create data
mydata <- data.frame(SID = sample(1:150, 400, replace = TRUE),
                     age = sample(50:70, 400, replace = TRUE),
                     sex = sample(c("Male", "Female"), 200, replace = TRUE),
                     time = seq(0.7, 6.2, length.out = 400),
                     Vol = rnorm(400),
                     HCD = rnorm(400))
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1) {
  n <- nrow(df)
  m <- ncol(df)
  num.to.na <- ceiling(prop * n * m)
  id <- sample(0:(m * n - 1), num.to.na, replace = FALSE)
  rows <- id %/% m + 1
  cols <- id %% m + 1
  sapply(seq(num.to.na), function(x) {
    df[rows[x], cols[x]] <<- NA
  })
  return(df)
}
mydata2 <- NAins(mydata, 0.1)
#run the lme, which gives an error message
model <- lme(Vol ~ age + sex*time + time*HCD, random = ~time|SID,
             na.action = "na.omit", data = mydata2)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
#lmer works
model2 <- lmer(Vol ~ age + sex*time + time*HCD + (time|SID),
               control = lmerControl(check.nobs.vs.nlev = "ignore",
                                     check.nobs.vs.rankZ = "ignore",
                                     check.nobs.vs.nRE = "ignore"),
               na.action = "na.omit", data = mydata2)
summary(model)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
#plotting gives problems (jittered lines)
plot(mydf)
With sjPlot, it's currently not possible. However, I have written a package dedicated to computing and plotting marginal effects: ggeffects. This package is a bit more flexible (for marginal effects plots).
The ggeffects package has a ggpredict() function, with which you can compute marginal effects at specific values. Once you know the sd of the model term in question, you can specify those values in the function call to plot your interaction:
library(ggeffects)
# plot interaction for time and var2, for values
# 10, 30 and 50 of var2
mydf <- ggpredict(model, terms = c("time", "var2 [10,30,50]"))
plot(mydf)
There are some examples in the package-vignette, see especially this section.
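To get exactly mean +/- 2 sd as moderator values, they can be computed from the data and spliced into the terms specification (an added sketch, assuming the HCD moderator and the mydata2 data from the reproducible example below):
m <- mean(mydata2$HCD, na.rm = TRUE)
s <- sd(mydata2$HCD, na.rm = TRUE)
vals <- paste(round(c(m - 2 * s, m, m + 2 * s), 2), collapse = ", ")
mydf <- ggpredict(model, terms = c("time", sprintf("HCD [%s]", vals)))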
Edit
Here are the results, based on your reproducible example (note that the GitHub version is currently required!):
# requires at least the GitHub version 0.1.0.9000!
library(ggeffects)
library(nlme)
library(lme4)
library(glmmTMB)
#create data
mydata <- data.frame(
  SID = sample(1:150, 400, replace = TRUE),
  age = sample(50:70, 400, replace = TRUE),
  sex = sample(c("Male", "Female"), 200, replace = TRUE),
  time = seq(0.7, 6.2, length.out = 400),
  Vol = rnorm(400),
  HCD = rnorm(400)
)
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1) {
  n <- nrow(df)
  m <- ncol(df)
  num.to.na <- ceiling(prop * n * m)
  id <- sample(0:(m * n - 1), num.to.na, replace = FALSE)
  rows <- id %/% m + 1
  cols <- id %% m + 1
  sapply(seq(num.to.na), function(x) {
    df[rows[x], cols[x]] <<- NA
  })
  return(df)
}
mydata2 <- NAins(mydata, 0.1)
# run the lme, works now
model <- lme(
  Vol ~ age + sex * time + time * HCD,
  random = ~ time | SID,
  na.action = "na.omit",
  data = mydata2
)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
[lme plot]
# lmer also works
model2 <- lmer(
  Vol ~ age + sex * time + time * HCD + (time | SID),
  control = lmerControl(
    check.nobs.vs.nlev = "ignore",
    check.nobs.vs.rankZ = "ignore",
    check.nobs.vs.nRE = "ignore"
  ),
  na.action = "na.omit",
  data = mydata2
)
summary(model2)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"), ci.lvl = NA)
# plotting works, but only w/o CI
plot(mydf)
[lmer plot]
# glmmTMB also works
model3 <- glmmTMB(
  Vol ~ age + sex * time + time * HCD + (time | SID),
  data = mydata2
)
summary(model3)
mydf <- ggpredict(model3, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
plot(mydf, facets = TRUE)
[glmmTMB plots]

Computing NTILE in R for RFM analysis

I'm trying to create a data frame computing 10 percentiles based on Recency, Frequency and Monetary value. I have most of it set up, but I can't figure out why my code returns three ntiles when I'm asking for 10. I'm currently at a standstill. The next step will be calculating the percentage of customers in each ntile.
Here is my code:
rm(list = ls())
setwd("/Users/a76475/Documents/Customer_Analytics")
rfm <- read.csv("cdnow_students_transaction.csv")
#Set up test and calibration samples
rfm$DATE <- as.Date(rfm$DATE, format = "%m/%d/%y")
calib <- subset(rfm, rfm$DATE < "1997-09-29")
valid <- subset(rfm, rfm$DATE > "1997-09-30")
#Aggregate for frequency, monetary, and recency -- Calibration
recency <- aggregate(DATE ~ ID, data = calib, FUN = max)
colnames(recency) <- c("ID", "Recency")
frequency <- aggregate(DOLLARS ~ ID, data = calib, FUN = length)
colnames(frequency) <- c("ID", "Frequency")
monetary <- aggregate(DOLLARS ~ ID, data = calib, FUN = mean)
colnames(frequency) <- c("ID", "Monetary")
calib <- merge(frequency, monetary, by = "ID")
calib <- merge(calib, recency, by = "ID")
#Aggregate for frequency, monetary, and recency -- Validation
recency <- aggregate(DATE ~ ID, data = valid, FUN = max)
colnames(recency) <- c("ID", "Recency")
frequency <- aggregate(DOLLARS ~ ID, data = valid, FUN = length)
colnames(frequency) <- c("ID", "Frequency")
monetary <- aggregate(DOLLARS ~ ID, data = valid, FUN = mean)
colnames(frequency) <- c("ID", "Monetary")
valid <- merge(frequency, monetary, by = "ID")
valid <- merge(valid, recency, by = "ID")
colnames(valid) <- c("ID", "FREQ", "MONETARY", "RECENCY")
colnames(calib) <- c("ID", "FREQ", "MONETARY", "RECENCY")
calib$RECENCY <- NULL
#Create recency score
#For validation
for (i in 1:nrow(valid)) {
  valid$RECENCY1[i] <- as.numeric(max(valid$RECENCY) - valid$RECENCY[i])
}
valid$RECENCY <- valid$RECENCY1
valid$RECENCY1 <- NULL
#For calibration
for (i in 1:nrow(calib)) {
  calib$RECENCY1[i] <- as.numeric(max(valid$RECENCY) - calib$RECENCY[i])
}
calib$RECENCY <- calib$RECENCY1
calib$RECENCY1 <- NULL
#Merge datasets
rfm <- merge(calib, valid, by = "ID", all.x = TRUE)
#Create column for retention %
require(dplyr)
rfm$monetary.ntile <- ntile(rfm$MONETARY.y, 10)
rfm$freq.ntile <- ntile(rfm$FREQ.y, 10)
rfm$recency.ntile <- ntile(rfm$RECENCY, 10)
For example, if you want 10 buckets for the Recency, Frequency and Monetary measures:
set.seed(1)
n <- 100
df <- data.frame(
  R = runif(n, 1, 365),
  F = runif(n, 1, 5),
  M = runif(n, 0, 100)
)
apply(df, 2, function(col) {
  # 11 break points define 10 decile buckets;
  # rightmost.closed keeps the maximum in bucket 10 instead of an 11th bucket
  breaks <- quantile(col, probs = seq(0, 1, length.out = 11))
  findInterval(col, breaks, rightmost.closed = TRUE)
})
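Essentially the same buckets come from dplyr::ntile, which assigns equal-sized groups rather than quantile-based breaks, and the question's follow-up step (percentage of customers per ntile) is then a one-liner (an added sketch, assuming dplyr >= 1.0 for across()):
library(dplyr)
deciles <- df %>% mutate(across(everything(), ~ ntile(.x, 10)))
# share of customers in each Monetary decile (roughly 10% each by construction)
round(100 * prop.table(table(deciles$M)), 1)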
