I want to compute a new column using the quantiles of another column (a continuous variable), incorporating the sample design of a complex survey. The idea is to create in the data frame a new variable that indicates which quantile group each observation falls into.
Here is how I execute the idea without incorporating the sample design, so you can understand what I'm aiming for.
# Load Data
# (`api` comes from the survey package and as.data.table() from data.table,
# so both packages must already be attached when this runs.)
data(api)
# Convert data to data.table format (mostly to increase speed of the process)
apiclus1 <- as.data.table(apiclus1)
# Create deciles variable: cut api00 at its unweighted sample deciles.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned; include.lowest = TRUE keeps the minimum value in group 1.
apiclus1[, decile := cut(
  api00,
  breaks = quantile(api00, probs = seq(0, 1, by = 0.1), na.rm = TRUE),
  include.lowest = TRUE,
  labels = 1:10
)]
I've tried using svyquantile from the survey package, but I couldn't get my head around this problem. This code does not return the quantile groups as an output that I can feed into a new variable. Any thoughts on this?
# Load Package
library(survey)
# create survey design: clusters from dnum, weights from pw, fpc from fpc
dclus1 <- svydesign(
  id = ~dnum,
  weights = ~pw,
  data = apiclus1,
  fpc = ~fpc
)
# What I've tried to do: design-based quantiles of api00 at 0, 0.1, ..., 1
svyquantile(
  ~api00,
  design = dclus1,
  quantiles = seq(0, 1, by = 0.1),
  method = "linear",
  ties = "rounded"
)
library(survey)
data(api)
# survey design: clusters from dnum, weights from pw, fpc from fpc
dclus1 <- svydesign(
  id = ~dnum,
  weights = ~pw,
  data = apiclus1,
  fpc = ~fpc
)
# design-based cut points of api00 at probabilities 0, 0.1, ..., 1
a <- svyquantile(
  ~api00,
  design = dclus1,
  quantiles = seq(0, 1, by = 0.1),
  method = "linear",
  ties = "rounded"
)
# use factor() and findInterval()
dclus1 <- update(
  dclus1,
  qtile = factor(findInterval(api00, a))
)
# distribution
svymean(~qtile, dclus1)
# or without the one observation in group number 11
# (the last cut point is dropped before binning)
dclus1 <- update(
  dclus1,
  qtile = factor(findInterval(api00, a[-length(a)]))
)
# distribution
svymean(~qtile, dclus1)
# quantiles by group
# (ten cut points per stype, at probabilities 0, 0.1, ..., 0.9)
b <- svyby(
  ~api00, ~stype,
  design = dclus1, svyquantile,
  quantiles = seq(0, 0.9, by = 0.1), ci = TRUE
)
# copy over your data
x <- apiclus1
# stype of each record
match(x$stype, b$stype)
# create the new qtile variable
# Columns 2:11 of `b` are taken to be the ten cut points of each stype, as in
# the original indexing. Each stype is binned against its own cut points with
# a single findInterval() call. The previous diag(apply(...)) form evaluated
# every record against every record's cut points, building an n-by-n matrix
# only to keep its diagonal.
stype_row <- match(x$stype, b$stype)
stype_breaks <- as.matrix(data.frame(b)[, 2:11])
qtile_by_stype <- rep(NA_integer_, nrow(x))
for (g in seq_len(nrow(stype_breaks))) {
  in_group <- which(stype_row == g)
  qtile_by_stype[in_group] <-
    findInterval(x$api00[in_group], stype_breaks[g, ])
}
x$qtile_by_stype <- factor(qtile_by_stype)
# re-create the survey design
dclus1 <- svydesign(id = ~dnum, weights = ~pw, data = x, fpc = ~fpc)
# confirm you have quantiles
svyby(~qtile_by_stype, ~stype, dclus1, svymean)
The output from your whole code above is :
0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
api00 411 497.8 535.6 573.2 614.6 651.75 686.6 709.55 735.4 780.7 905
You can change the names to represent your groups. 0 and 1 represent minimum and maximum. 0.1 represents decile 1, 0.2 represents decile 2, etc. Something like:
# Design-based quantiles of api00 at 0, 0.1, ..., 1, relabelled and then
# transposed so that each quantile becomes a row.
dt_quantile <- svyquantile(
  ~api00,
  design = dclus1,
  quantiles = seq(0, 1, by = 0.1),
  method = "linear",
  ties = "rounded"
)
dt_quantile <- data.table(dt_quantile)
# eleven labels: the 0 quantile is the minimum, the rest are deciles 1..10
setnames(dt_quantile, c("min", paste0("decile", 1:10)))
# `TRUE` rather than `T`, which is an ordinary (reassignable) variable
dt_quantile <- data.table(t(dt_quantile), keep.rownames = TRUE)
dt_quantile
# rn V1
# 1: min 411.00
# 2: decile1 497.80
# 3: decile2 535.60
# 4: decile3 573.20
# 5: decile4 614.60
# 6: decile5 651.75
# 7: decile6 686.60
# 8: decile7 709.55
# 9: decile8 735.40
# 10: decile9 780.70
# 11: decile10 905.00
Am I missing your objective?
Related
I would like to run a bayesian model with multiple species, the abundances are the dependent variables, a generative model could be the following:
#
#
# Simulate N observations in which U_sim enters the means of both E_sim and
# W_sim. The seed is fixed so the draws are reproducible.
set.seed(73)
N <- 50
U_sim <- rnorm( N )
# Q_sim: integers 1..4 sampled with replacement
Q_sim <- sample( 1:4 , size=N , replace=TRUE )
# E_sim: normal draws with mean U_sim + Q_sim
E_sim <- rnorm( N , U_sim + Q_sim )
# W_sim: normal draws with mean U_sim - Q_sim - E_sim
W_sim <- rnorm( N , U_sim - Q_sim - E_sim)
# Only W, E and Q go into the data list; U_sim is left out.
# NOTE(review): standardize() is not defined in this snippet, presumably
# rethinking::standardize; confirm.
dat_sim <- list(
W=standardize(W_sim) ,
E=standardize(E_sim) ,
Q=standardize(Q_sim) )
With U_sim as an unobserved variable, the model could be:
# Joint model for W and E: a multi_normal likelihood whose two means, muW and
# muE, are each a linear function of Q, parameterised by Rho and Sigma.
# Priors: normal(0, 0.2) on the intercepts, normal(0, 0.5) on the slopes,
# lkj_corr(2) on Rho and exponential(1) on Sigma.
# Fitted on dat_sim with 4 chains on 4 cores, then summarised with precis().
# NOTE(review): ulam() and precis() are not defined in this snippet,
# presumably from the rethinking package; confirm.
m14.6 <- ulam(
alist(
c(W,E) ~ multi_normal( c(muW,muE) , Rho , Sigma ),
muW <- aW + bQW*Q,
muE <- aE + bQE*Q,
c(aW,aE) ~ normal( 0 , 0.2 ),
c(bQW,bQE) ~ normal( 0 , 0.5 ),
Rho ~ lkj_corr( 2 ),
Sigma ~ exponential( 1 )
), data=dat_sim , chains=4 , cores=4 )
precis( m14.6 , depth=3 )
Is there a way to put the linear models in matrix notation instead of specifying the lines:
muW <- aW + bQW*Q,
muE <- aE + bQE*Q,
because if I have more species (dependent variables), writing one line per species becomes very cumbersome?
I have data such as this, I am trying to use the survey package to apply weights and find the means, SE and the N from each variable.
I was able to find the mean and SE, but I don't know how to pull the N for each variable.
library(survey)
data(api)
dclus1 <- svydesign(id = ~dnum, weights = ~pw, data = apiclus1, fpc = ~fpc)
vector_of_variables <- c("api00", "api99")
# one svymean() per variable, each called with a "~ name" formula
result <- lapply(vector_of_variables, function(w) {
  svymean(as.formula(paste("~", w)), dclus1, na.rm = TRUE)
})
# reduce each estimate to a data frame of variable name, mean and SE
result <- lapply(result, function(v) {
  data.frame(
    variable = names(v),
    mean = coef(v),
    se = as.numeric(SE(v))
  )
})
# stack the per-variable rows
do.call(rbind, result)
Any suggestions?
EDIT
I've adapted the answer given below to expand my question:
library(survey)
data(api)
# NOTE(review): %>%, mutate() and case_when() are not loaded in this snippet,
# presumably dplyr is attached elsewhere; confirm.
# pw2 is pw scaled by 0.8; part is "part 1" where full < 80, else "part 2"
apiclus1 <- apiclus1 %>%
  mutate(
    pw2 = pw * 0.8,
    part = case_when(full < 80 ~ "part 1", TRUE ~ "part 2")
  )
# two designs that differ only in the weight column used
dclus1 <- svydesign(id = ~dnum, weights = ~pw, data = apiclus1, fpc = ~fpc)
dclus2 <- svydesign(id = ~dnum, weights = ~pw2, data = apiclus1, fpc = ~fpc)
# Weighted mean, its standard error and the unweighted count of one variable,
# computed on the rows of `design` selected by `part` and `shc.wide`.
#
#   variable  name of the variable to summarise (passed to make.formula())
#   design    survey design whose data contain the columns `part` and
#             `sch.wide`
#   part      value of the `part` column to keep
#   shc.wide  value of the `sch.wide` column to keep
#
# Returns c(mean, se, N) as a single vector.
#
# The row filter is built from design$variables explicitly. Writing
# subset(design, part == part, shc.wide = shc.wide) keeps every row: inside
# subset() the name `part` resolves to the data column on both sides of `==`,
# and `shc.wide = ...` is absorbed as an unused named argument instead of
# being treated as a condition.
meanseN <- function(variable, design, part, shc.wide) {
  formula <- make.formula(variable)
  keep <- design$variables[["part"]] == part &
    design$variables[["sch.wide"]] == shc.wide
  # rows whose comparison is NA are treated as not selected
  in_subset <- design[!is.na(keep) & keep, ]
  m <- svymean(formula, in_subset, na.rm = TRUE)
  # unwtd.count() is called without na.rm; see the survey documentation for
  # how it treats missing values
  N <- unwtd.count(formula, in_subset)
  c(mean = coef(m), se = SE(m), N = coef(N))
}
# variables to summarise
vector_of_variables <- c("acs.k3", "api00")
# one result column per variable; the extra arguments are forwarded to
# meanseN() by name
sapply(
  vector_of_variables,
  meanseN,
  design = dclus1,
  part = "part 1",
  shc.wide = "No"
)
acs.k3 api00
mean.acs.k3 20.0347222 644.16940
se 0.5204887 23.54224
N.counts 144.0000000 183.00000
As you can see I subset the data (dclus1), so the N's I expect to see for each design should be:
table(apiclus1$sch.wide, apiclus1$part)
part 1 part 2
No 4 19
Yes 30 130
unwtd.count is returning the count for the full sample of data instead of the subset. Any ideas why this might be happening?
You don't actually need the survey package functions to do this. The number of observations is whatever it is; it's not a population estimate based on the design. However, the package does have the function unwtd.count to get the unweighted count of non-missing observations, e.g.
> unwtd.count(~api00, dclus1)
counts SE
counts 183 0
If you want all three things in a loop like you were doing before, then rather than doing it in one line it's easiest to write a little function
# Mean, standard error and unweighted count for one variable of a design.
#   variable  variable name, turned into a formula by make.formula()
#   design    survey design object to estimate on
# Returns c(mean, se, N) as a single vector.
meanseN <- function(variable, design) {
  target <- make.formula(variable)
  estimate <- svymean(target, design, na.rm = TRUE)
  n_obs <- unwtd.count(target, design)
  c(mean = coef(estimate), se = SE(estimate), N = coef(n_obs))
}
and do something like
> sapply(vector_of_variables, meanseN,design=dclus1)
api00 api99
mean.api00 644.16940 606.97814
se 23.54224 24.22504
N.counts 183.00000 183.00000
This is a continuation from my previous post
Error with svydesign using imputed data sets
I would like to run a rake() function in my imputed dataset. However, it seems it is not finding the input variable. Below is a sample code:
library(mitools)
library(survey)
library(mice)
data(nhanes)
nhanes2$hyp <- as.factor(nhanes2$hyp)
# multiply impute nhanes2 with the listed methods and a fixed seed
imp <- mice(
  nhanes2,
  method = c("polyreg", "pmm", "logreg", "pmm"),
  seed = 23109
)
# extract completed data sets 1 to 5
imp_list <- lapply(1:5, function(n) complete(imp, action = n))
# a single design object built on the list of imputed data sets
des <- svydesign(id = ~1, data = imputationList(imp_list))
# target age margin: 50% / 30% / 20% of nrow(des)
age.dist <- data.frame(
  age = c("20-39", "40-59", "60-99"),
  Freq = nrow(des) * c(0.5, 0.3, .2)
)
small.svy.rake <- rake(
  design = des,
  sample.margins = list(~age),
  population.margins = list(age.dist)
)
Error in eval(expr, envir, enclos) : object 'age' not found
The code works if I change the input data to a single dataset. That is, instead of des<-svydesign(id=~1, data=imputationList(imp_list)), I have this
# Build the design on one completed data set (the first) rather than on the
# imputation list.
data3 <- complete(imp,1)
des<-svydesign(id=~1, data=data3)
How can I edit the code so that rake() recognizes that the input dataset is of the multiple-imputation type?
# start from a copy of the multiply-imputed design so its structure is kept
small.svy.rake <- des
# rake the design of each implicate separately and store the results back
small.svy.rake$designs <- lapply(des$designs, function(one_design) {
  rake(
    one_design,
    sample.margins = list(~age),
    population.margins = list(age.dist)
  )
})
# as you'd expect, the overall number changes..
MIcombine(with(des, svymean(~bmi)))
MIcombine(with(small.svy.rake, svymean(~bmi)))
# ..but the within-age-category numbers do not
MIcombine(with(des, svyby(~bmi, ~age, svymean)))
MIcombine(with(small.svy.rake, svyby(~bmi, ~age, svymean)))
I am using the survey package in R to work with the U.S. Census' PUMS dataset for population. I've created a Boolean for each broad industry and a character variable MigrationStatus with three values (Stayed,Left,Entered). I'd like to examine the ratios of workers in each industry by migration status.
This works fine:
# ratio of JobAGR to EmployedAtWork within each MigrationStatus, computed on
# the records with EmployedAtWork == 1, reported with confidence intervals
AGR_ratio <- svyby(
  ~JobAGR,
  by = ~MigrationStatus,
  denominator = ~EmployedAtWork,
  design = subset(pums_design, EmployedAtWork == 1),
  svyratio,
  vartype = "ci"
)
But this produces an error:
# Attempt to repeat the ratio for columns 32 to 50 of the design's data
# (the Job* names printed at the bottom).
varlist=names(pums_design$variables)[32:50]
# NOTE(review): substitute() returns an unevaluated `~name` call rather than
# an object of class "formula", which is presumably why svyby() raises the
# error recorded below; confirm.
job_ratios = lapply(varlist, function(x) {
svyby(substitute(~i, list(i = as.name(x))), by=~MigrationStatus, denominator=~EmployedAtWork, design=subset(pums_design,EmployedAtWork==1), svyratio, vartype='ci')
})
#Error in svyby.default(substitute(~i, list(i = as.name(x))), by = ~MigrationStatus, :
#object 'byfactor' not found
varlist
#[1] "JobADM" "JobAGR" "JobCON" "JobEDU" "JobENT" "JobEXT" "JobFIN" "JobINF" "JobMED" "JobMFG" "JobMIL" "JobPRF" "JobRET" "JobSCA" "JobSRV"
#[16] "JobTRN" "JobUNE" "JobUTL" "JobWHL"
how about this?
# setup
library(survey)
data(api)
dclus1<-svydesign(id=~dnum, weights=~pw, data=apiclus1, fpc=~fpc)
# single example
# (ratio of api99 to api00 within each stype)
svyby(~api99, by = ~stype, denominator = ~api00 , dclus1, svyratio)
# multiple example
variables <- c( "api99" , "pcttest" )
# breaks
# (kept on purpose as the failing form: the numerator is built with
# substitute(), i.e. an unevaluated `~name` call)
lapply(variables, function(x) svyby(substitute(~i, list(i = as.name(x))), by=~stype, denominator=~api00, design=dclus1, svyratio, vartype='ci'))
# works
# (the numerator is built from the variable name with
# as.formula(paste0("~", z)))
lapply( variables , function( z ) svyby( as.formula( paste0( "~" , z ) ) , by = ~stype, denominator = ~api00 , dclus1, svyratio , vartype = 'ci' ) )
btw you might be interested in this uspums data automation syntax
How do I use the summary function inside an ldply()/summarise call to extract p-values?
Example data:
(The data frame "Puromycin" is preinstalled)
library(reshape2)
library(plyr)
# melt to long format, keeping state as the id column
Puromycin.m <- melt(Puromycin, id = c("state"))
# one binomial glm of state on value for each level of `variable`
Puro.models <- dlply(
  Puromycin.m,
  .(variable),
  glm,
  formula = state ~ value,
  family = binomial
)
I can construct this data frame with extracted results:
# per model: the number of fitted values and the second coefficient
ldply(
  Puro.models,
  summarise,
  "n in each model" = length(fitted.values),
  "Coefficients" = coefficients[2]
)
But I can't extract the p-values in the same way. I thought this would work, but it does not:
# Same call as before with a "P-value" column added.
# NOTE(review): the "P-value" argument is a function definition, not a value
# computed from the model, which is presumably why this attempt fails;
# confirm.
ldply( Puro.models , summarise ,
"n in each model" = length(fitted.values) ,
"Coefficients" = coefficients[2],
"P-value" = function(x) summary(x)$coef[2,4] )
How can I extract the p-values into that data frame? :) Please help!
Why don't you get them directly?
library(reshape2)
library(plyr)
# melt to long format, keeping state as the id column
Puromycin.m <- melt(Puromycin, id = c("state"))
# For each level of `variable`, fit a binomial glm of state on value and
# return one row: number of fitted values, second coefficient, and the
# [2, 4] entry of the summary coefficient table.
Puro.models <- ddply(Puromycin.m, .(variable), function(x) {
  fit <- glm(x, formula = state ~ value, family = "binomial")
  data.frame(
    n = length(fit$fitted.values),
    coef = coefficients(fit)[2],
    pval = summary(fit)$coef[2, 4]
  )
})
> Puro.models
# variable n coef pval
# 1 conc 23 -0.55300908 0.6451550
# 2 rate 23 -0.01555023 0.1272184