Raking Multiple Imputed dataset - r

This is a continuation from my previous post
Error with svydesign using imputed data sets
I would like to run a rake() function in my imputed dataset. However, it seems it is not finding the input variable. Below is a sample code:
library(mitools)
library(survey)
library(mice)
data(nhanes)
nhanes2$hyp <- as.factor(nhanes2$hyp)
imp <- mice(nhanes2,method=c("polyreg","pmm","logreg","pmm"), seed = 23109)
imp_list <- lapply( 1:5 , function( n ) complete( imp , action = n ) )
des<-svydesign(id=~1, data=imputationList(imp_list))
age.dist <- data.frame(age = c("20-39","40-59", "60-99"),
Freq = nrow(des) * c(0.5, 0.3, .2))
small.svy.rake <- rake(design = des,
sample.margins = list(~age),
population.margins = list(age.dist))
Error in eval(expr, envir, enclos) : object 'age' not found
The code works if I change the input data to a single dataset. That is, instead of des<-svydesign(id=~1, data=imputationList(imp_list)), I have this
data3 <- complete(imp,1)
des<-svydesign(id=~1, data=data3)
How can i edit the code such that it would recognize that the input dataset in the rake() function is of multiple imputation type?

# copy over the structure of your starting multiply-imputed design
small.svy.rake <- des
# loop through each of the implicates
# applying the `rake` function to each
small.svy.rake$designs <-
lapply(
des$designs ,
rake ,
sample.margins = list(~age),
population.margins = list(age.dist)
)
# as you'd expect, the overall number changes..
MIcombine( with( des , svymean( ~ bmi ) ) )
MIcombine( with( small.svy.rake , svymean( ~ bmi ) ) )
# ..but the within-age-category numbers do not
MIcombine( with( des , svyby( ~ bmi , ~ age , svymean ) ) )
MIcombine( with( small.svy.rake , svyby( ~ bmi , ~ age , svymean ) ) )

Related

Outputting the N's using the survey package (svymean)

I have data such as this, I am trying to use the survey package to apply weights and find the means, SE and the N from each variable.
I was able to find the mean and SE, but I don't know how to pull the N for each variable.
library(survey)
data(api)
dclus1<-svydesign(id=~dnum, weights=~pw, data=apiclus1, fpc=~fpc)
vector_of_variables <- c( 'api00' , 'api99' )
result <-
lapply(
vector_of_variables ,
function( w ) svymean( as.formula( paste( "~" , w ) ) , dclus1 , na.rm = TRUE )
)
result <- lapply( result , function( v ) data.frame( variable = names( v ) , mean = coef( v ) , se = as.numeric( SE( v ) ) ) )
do.call( rbind , result )
Any suggestions?
EDIT
I've adapted the answer given below to expand my question:
library(survey)
data(api)
apiclus1 <-
apiclus1 %>%
mutate(pw2 = pw*0.8) %>%
mutate(part = case_when(full<80 ~"part 1", TRUE~"part 2"))
dclus1<-svydesign(id=~dnum, weights=~pw, data=apiclus1, fpc=~fpc)
dclus2 <- svydesign(id=~dnum, weights=~pw2, data=apiclus1, fpc=~fpc)
meanseN<-function(variable,design, part,shc.wide){
formula<-make.formula(variable)
m <-svymean(formula, subset(design, part==part, shc.wide = shc.wide),na.rm=TRUE)
N<-unwtd.count(formula, subset(design, part==part, shc.wide = shc.wide),na.rm=TRUE)
c(mean=coef(m), se=SE(m), N=coef(N))
}
vector_of_variables <- c("acs.k3","api00")
sapply(vector_of_variables, meanseN, "part 1","No",design=dclus1)
acs.k3 api00
mean.acs.k3 20.0347222 644.16940
se 0.5204887 23.54224
N.counts 144.0000000 183.00000
As you can see I subset the data (dclus1), so the N's I expect to see for each design should be:
table(apiclus1$sch.wide, apiclus1$part)
part 1 part 2
No 4 19
Yes 30 130
unwtd.count is returning the count for the full sample of data, instead of the subset.... Any idea's why this might be happening?
You don't actually need the survey package functions to do this. The number of observations is whatever it is, it's not a population estimate based on the design. However, the pacakage does have the function unwtd.count to get unweighted count of non-missing observations, eg
> unwtd.count(~api00, dclus1)
counts SE
counts 183 0
If you want all three things in a loop like you were doing before, then rather than doing it in one line it's easiest to write a little function
meanseN<-function(variable,design){
formula<-make.formula(variable)
m <-svymean(formula, design,na.rm=TRUE)
N<-unwtd.count(formula, design)
c(mean=coef(m), se=SE(m), N=coef(N))
}
and do something like
> sapply(vector_of_variables, meanseN,design=dclus1)
api00 api99
mean.api00 644.16940 606.97814
se 23.54224 24.22504
N.counts 183.00000 183.00000

Svyby returns error although arguments match documentation

I've defined a survey object using the survey package (that all worked fine) and added two columns to it as follows:
anes_svy <- update( #the update fn adds columns to a survey object
anes_svy, #object to add variables to
one = 1,
undoc_kids =
factor( V161195x , levels = 1:6 , labels =
c( 'should sent back - favor a great deal' ,
'should sent back - favor a moderate amount' ,
'should sent back - favor a little' ,
'should allow to stay - favor a little' ,
'should allow to stay - favor a moderate amount' ,
'should allow to stay - favor a great deal' )
)
)
Now when I run
svyby( formula = ~one , by = ~undoc_kids , design = anes_svy , FUN = unwtd.count )
I get the error
Error in sum(sapply(covmats, ncol)) : invalid 'type' (list) of argument
The documentation asks for a vector for 'formula', a list of factors for 'by', a svydesign object for 'design', and a function for FUN (unwtd.count is inbuilt). What am I doing wrong?
I get this same error when I make a mistake coding the factor
This does not work- Gear is 3:5, but the levels is set at 0:2. I ended up with the same error you observed
library( survey)
m <- mtcars
m$gear <- factor( m$gear, levels=c(0:2), labels=c("L","M","H"))
table( m$gear )
f <- svydesign( ~1 , strata = ~cyl , weights = ~wt , data= m )
svyby( ~mpg , ~gear , f, svymean, na.rm=T)
This does
m <- mtcars
m$gear <- factor( mtcars$gear, levels=c(3:5), labels=c("L","M","H"))
table( m$gear )
f <- svydesign( ~1 , strata = ~cyl , weights = ~wt , data= m )
svyby( ~mpg , ~gear , f, svymean, na.rm=T)

Predict Logistf

I'm using a R package called logistf to make a Logistc Regression and I saw that there's no predict function for new data in this package and predict package does not work with this, so I found a code that show how making this with new data:
fit<-logistf(Tax ~ L20+L24+L28+L29+L31+L32+L33+L36+S10+S15+S16+S17+S20, data=trainData)
betas <- coef(fit)
X <- model.matrix(fit, data=testData)
probs <- 1 / (1 + exp(-X %*% betas))
I want to make a cross validation version with this using fit$predict and the probabilities that probs generate for me. Has anyone ever done something like this before?
Other thing that I want to know is about fit$predict I'm making a binary logistic regression, and this function returns many values, are these values from class 0 or 1, how can I know this? Thanks
While the code that you wrote works perfectly, there is a concise way of getting the same results seemingly:
brglm_model <- brglm(formula = response ~ predictor , family = "binomial", data = train )
brglm_pred <- predict(object = brglm_model, newdata = test , type = "response")
About the CV, you have to write a few lines of code I guess:
#Setting the number of folds, and number of instances in each fold
n_folds <- 5
fold_size <- nrow(dataset) %/% 5
residual <- nrow(dataset) %% 5
#label the instances based on the number of folds
cv_labels <- c(rep(1,fold_size),rep(2,fold_size), rep(3,fold_size), rep(4,fold_size), rep(5,fold_size), rep(5,residual))
# the error term would differ based on each threshold value
t_seq <- seq(0.1,0.9,by = 0.1)
index_mat <- matrix(ncol = (n_folds+1) , nrow = length(t_seq))
index_mat[,1] <- t_seq
# the main loop for calculation of the CV error on each fold
for (i in 1:5){
train <- dataset %>% filter(cv_labels != i)
test <- dataset %>% filter(cv_labels == i )
brglm_cv_model <- brglm(formula = response_var ~ . , family = "binomial", data = train )
brglm_cv_pred <- predict(object = brglm_model, newdata = test , type = "response")
# error formula that you want, e.g. misclassification
counter <- 0
for (treshold in t_seq ) {
counter <- counter + 1
conf_mat <- table( factor(test$response_var) , factor(brglm_cv_pred>treshold, levels = c("FALSE","TRUE") ))
sen <- conf_mat[2,2]/sum(conf_mat[2,])
# other indices can be computed as follows
#spec <- conf_mat[1,1]/sum(conf_mat[1,])
#prec <- conf_mat[2,2]/sum(conf_mat[,2])
#F1 <- (2*prec * sen)/(prec+sen)
#accuracy <- (conf_mat[1,1]+conf_mat[2,2])/sum(conf_mat)
#here I am only interested in sensitivity
index_mat[counter,(i+1)] <- sen
}
}
# final data.frame would be the mean of sensitivity over each threshold value
final_mat <- matrix(nrow = length(t_seq), ncol = 2 )
final_mat[,1] <- t_seq
final_mat[,2] <- apply(X = index_mat[,-1] , MARGIN = 1 , FUN = mean)
final_mat <- data.frame(final_mat)
colnames(final_mat) <- c("treshold","sensitivity")
#why not having a look at the CV-sensitivity of the model over threshold values?
ggplot(data = final_mat) +
geom_line(aes(x = treshold, y = sensitivity ), color = "blue")

Combining lapply, svyby, svyratio to calculate many ratios with confidence intervals

I am using the survey package in R to work with the U.S. Census' PUMS dataset for population. I've created a Boolean for each broad industry and a character variable MigrationStatus with three values (Stayed,Left,Entered). I'd like to examine the ratios of workers in each industry by migration status.
This works fine:
AGR_ratio=svyby(~JobAGR, by=~MigrationStatus, denominator=~EmployedAtWork, design=subset(pums_design,EmployedAtWork==1), svyratio, vartype='ci')
But this produces an error:
varlist=names(pums_design$variables)[32:50]
job_ratios = lapply(varlist, function(x) {
svyby(substitute(~i, list(i = as.name(x))), by=~MigrationStatus, denominator=~EmployedAtWork, design=subset(pums_design,EmployedAtWork==1), svyratio, vartype='ci')
})
#Error in svyby.default(substitute(~i, list(i = as.name(x))), by = ~MigrationStatus, :
#object 'byfactor' not found
varlist
#[1] "JobADM" "JobAGR" "JobCON" "JobEDU" "JobENT" "JobEXT" "JobFIN" "JobINF" "JobMED" "JobMFG" "JobMIL" "JobPRF" "JobRET" "JobSCA" "JobSRV"
#[16] "JobTRN" "JobUNE" "JobUTL" "JobWHL"
how about this?
# setup
library(survey)
data(api)
dclus1<-svydesign(id=~dnum, weights=~pw, data=apiclus1, fpc=~fpc)
# single example
svyby(~api99, by = ~stype, denominator = ~api00 , dclus1, svyratio)
# multiple example
variables <- c( "api99" , "pcttest" )
# breaks
lapply(variables, function(x) svyby(substitute(~i, list(i = as.name(x))), by=~stype, denominator=~api00, design=dclus1, svyratio, vartype='ci'))
# works
lapply( variables , function( z ) svyby( as.formula( paste0( "~" , z ) ) , by = ~stype, denominator = ~api00 , dclus1, svyratio , vartype = 'ci' ) )
btw you might be interested in this uspums data automation syntax

Using summary(glm-object) inside ldply() with the summarise() - function

How do i use the summary-function inside a ldply()-summarise-function to extract p-values?
Example data:
(The data frame "Puromycin" is preinstalled)
library(reshape2)
library(plyr)
Puromycin.m <- melt( Puromycin , id=c("state") )
Puro.models <- dlply( Puromycin.m , .(variable) , glm , formula = state ~ value ,
family = binomial )
I can construct this data frame with extracted results:
ldply( Puro.models , summarise , "n in each model" = length(fitted.values) ,
"Coefficients" = coefficients[2] )
But i cant extract the p-values in the same way. I thougt this would work but it does not:
ldply( Puro.models , summarise ,
"n in each model" = length(fitted.values) ,
"Coefficients" = coefficients[2],
"P-value" = function(x) summary(x)$coef[2,4] )
How can i extract p-values to that data frame :) Please help!
Why don't you get them directly?
library(reshape2)
library(plyr)
Puromycin.m <- melt( Puromycin , id=c("state") )
Puro.models <- ddply( Puromycin.m , .(variable), function(x) {
t <- glm(x, formula = state ~ value, family="binomial")
data.frame(n = length(t$fitted.values),
coef = coefficients(t)[2],
pval = summary(t)$coef[2,4])
})
> Puro.models
# variable n coef pval
# 1 conc 23 -0.55300908 0.6451550
# 2 rate 23 -0.01555023 0.1272184

Resources