How to use ggplot with Haven dataset - r

I'm new to coding and R but have a Stata dataset. I want to use ggplot for visualisations of my data; however, I get multiple errors such as
no applicable method for 'rescale' applied to an object of class "c('haven_labelled', 'vctrs_vctr', 'double')"
I don't know how to convert them so that I can plot them for visualisations.
The lines of code are as follows:
# Read the Stata file with haven.
Data <- read_dta("longitudinal_td.dta")

# Keep wave-1 respondents aged 50 or under, then keep and rename the analysis
# columns in a single select() call (new_name = old_name). Renaming inside
# select() avoids copying each column with mutate() and then dropping the
# original, and it keeps the person identifier `pidp` untouched.
Data <- Data %>%
  filter(wave == 1, age_dv <= 50) %>%
  select(
    pidp,
    wave,
    age = age_dv,
    sex = sex_dv,
    ethnicity = ethn_dv,
    general_health = sf1_dv,
    bmi = bmi_dv,
    physical_component_score = sf12pcs_dv,
    mental_component_score = sf12mcs_dv,
    household_income = fihhmnnet1_dv
  )
I hope this is correct, here is the dput:
Essentially I'm just trying to explore BMI, but I don't know whether I can just plot these values or whether I have to assign the numbers to labels, as is already done in the haven labels.
dput(head(Data))
structure(list(pidp = structure(c(68001367, 68006127, 68008167,
68009527, 68010207, 68010887), label = "cross-wave person identifier (public release)", format.stata = "%12.0g"),
wave = structure(c(1, 1, 1, 1, 1, 1), label = "interview wave", format.stata = "%8.0g"),
age = structure(c(39, 39, 38, 31, 24, 45), label = "Age, derived from dob_dv and intdat_dv", format.stata = "%8.0g"),
sex = structure(c(1, 2, 2, 1, 2, 2), label = "Sex, derived", format.stata = "%8.0g", labels = c(Male = 1,
Female = 2), class = c("haven_labelled", "vctrs_vctr", "double"
)), ethnicity = structure(c(1, 1, 1, 1, 1, 1), label = "Ethnic group (derived from multiple sources)", format.stata = "%8.0g", labels = c(`white uk` = 1,
irish = 2, `gypsy or irish traveller` = 3, `any other white background` = 4,
`white and black caribbean` = 5, `white and black african` = 6,
`white and asian` = 7, `any other mixed background` = 8,
indian = 9, pakistani = 10, bangladeshi = 11, chinese = 12,
`any other asian background` = 13, caribbean = 14, african = 15,
`any other black background` = 16, arab = 17, `any other ethnic group` = 97
), class = c("haven_labelled", "vctrs_vctr", "double")),
general_health = structure(c(2, 4, 5, 3, 1, 1), label = "General health", format.stata = "%8.0g", labels = c(excellent = 1,
`very good` = 2, good = 3, fair = 4, `or Poor?` = 5), class = c("haven_labelled",
"vctrs_vctr", "double")), bmi = structure(c(29.6, 38.8, 21.5,
24.2, 25, 25.5), label = "Body Mass Index", format.stata = "%12.0g")

Thanks for posting an example of your data with dput(). The format of the data you have posted suggests that it has somehow become a list rather than a data frame. You need to convert it to a data frame - as you're using haven I would stick with the tidyverse and do it with as_tibble().
Similarly, you want the labels rather than the underlying integers. You can simply apply as_factor to the whole data frame to do this.
Your data is then ready to be piped to ggplot2. For example:
library(dplyr)
library(ggplot2)
library(haven)
# Turn the data into a tibble, convert labelled columns to factors,
# and draw one BMI box per sex.
ggplot(
  data = Data |> as_tibble() |> as_factor(),
  mapping = aes(x = sex, y = bmi)
) +
  geom_boxplot()

Related

why do I get Error in `vec_as_location()`: when computing count and full join function?

I created objects holding the variable names I want and selected those variables when importing the data. But when I use full_join or count, it keeps giving me the Error in vec_as_location():. Does anyone know how to avoid this error? Code I wrote a month ago now also triggers this error, although it worked at the time.
# Columns to read for the first table: person identifier and age
vars <- c("pidp", "cb_age")
wave1 <- read_dta("./data/dresp_w.dta",
col_select = vars)
# Columns to read for the second table: person identifier and sex
vars2 <- c("pidp", "cb_sex")
# NOTE(review): this is the same file path as for wave1 — confirm it is the intended file
wave2 <- read_dta("./data/dresp_w.dta",
col_select = vars2)
# Join the two tables on the person identifier
wave12 <- full_join(wave1, wave2, by = "pidp")
# NOTE(review): cb_sex was read into wave2 (and is therefore in wave12), not wave1 —
# confirm which table is meant to be counted here
count(wave1,cb_sex)
The output of dput(head(wave1)) and dput(head(wave2)) is:
structure(list(pidp = structure(c(76165, 280165, 599765, 732365,
1587125, 3424485), label = "Cross-wave Person Identifier (Public Release)", format.stata = "%12.0g"),
cb_age = structure(c(37, 40, 33, 34, 54, 84), label = "Age - derived", format.stata = "%8.0g", labels = c(Missing = -9,
Inapplicable = -8, Refusal = -2, `Don't know` = -1), class = c("haven_labelled",
"vctrs_vctr", "double"))), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
structure(list(pidp = structure(c(76165, 280165, 599765, 732365,
1587125, 3424485), label = "Cross-wave Person Identifier (Public Release)", format.stata = "%12.0g"),
cb_sex = structure(c(2, 2, 2, 1, 2, 2), label = "Respondent sex", format.stata = "%8.0g", labels = c(Missing = -9,
Inapplicable = -8, Refusal = -2, `Don't know` = -1, Male = 1,
Female = 2), class = c("haven_labelled", "vctrs_vctr", "double"
))), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
Thank you for your help!!
It seems to work fine with dplyr version 1.0.9:
dplyr::count(wave2, cb_sex)
# A tibble: 2 × 2
cb_sex n
<dbl+lbl> <int>
1 1 [Male] 1
2 2 [Female] 5

Removing Incorrect Labels within Tidyverse/ Limiting Actions of as_factor()

I'm working with British Election Study data. To be used in R, this first has to be converted from the .dta form provided, which I think puts labels on to a lot of variables. Most of the time this is useful, but I think a problem I've got is where this isn't the case.
Using as_factor() blindly converts all variables with labels to factors. Is there a way to specify that only certain vectors are converted ? i.e
new_df <- data %>%
as_factor(just_this_column)
Failing that, is there a good way to remove the labels of certain variables within a data frame? I've looked at the sjlabelled package, but this does something weird and converts the data so that it is no longer a data frame:
example_data<- str(sjlabelled::remove_all_labels(example_data$generalElectionVoteW19))
The reason I'm trying to do all of this is to make a histogram of number of people voting for each party (the factor) at a certain age. In this dataset, the age variable has a label which is messing up the code.
Of course, I could just convert the factor to a numeric value at the end but this seems like a messy way of achieving things !
Here is the dput:
structure(list(ageW19 = structure(c(72, 52, 39, 75, 26, 56), label = "Age", format.stata = "%8.0g", labels = c(`Not Asked` = -9,
Skipped = -8), class = c("haven_labelled", "vctrs_vctr", "double"
)), generalElectionVoteW19 = structure(c(1, 13, 3, 1, 2, 1), label = "General election vote intention (recalled vote in post-election waves)", format.stata = "%40.0g", labels = c(`I would/did not vote` = 0,
Conservative = 1, Labour = 2, `Liberal Democrat` = 3, `Scottish National Party (SNP)` = 4,
`Plaid Cymru` = 5, `United Kingdom Independence Party (UKIP)` = 6,
`Green Party` = 7, `British National Party (BNP)` = 8, Other = 9,
`Change UK- The Independent Group` = 11, `Brexit Party` = 12,
`An independent candidate` = 13, `Don't know` = 9999), class = c("haven_labelled",
"vctrs_vctr", "double"))), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"), na.action = c(`1` = 1L, `3` = 3L, `5` = 5L
))
To answer your first question: you need mutate() to convert a single column, e.g.
# Convert just one labelled column to a factor; all other columns are left as is.
new_df <- data %>%
  mutate(factor_column = as_factor(old_column))
However, as you said you probably want to convert to numeric type, so you might want to use as.numeric instead of as_factor.
We may use base R
data$factor_column <- factor(data$old_column)

Error: Problem with `mutate()` input x `labels` must be unique

I am trying to recode some labelled variables to a 0 to 1 scale in the following fashion. When I try to calculate the mean of the two variables using c_across() I get this odd error Error: Problem with mutate() input market_liberalism. x labels must be unique.
If I delete the value labels then it works. I don't understand what problem the value labels cause.
Thank you.
#Install car package if necessary
#install.packages('car')
library(tidyverse)
library(car)
structure(list(PESE15 = structure(c(3, 5, 5, 8, NA), label = "The Government Should Leave it Entirely to the Private Sector to Create Jobs", na_values = c(8, 9), format.spss = "F1.0", display_width = 0L, labels = c(`Strongly agree` = 1, `Somewhat agree` = 3, Somewhatdisagree = 5, Stronglydisagree = 7,D.K. = 8, Refused = 9), class = c("haven_labelled_spss", "haven_labelled", "vctrs_vctr", "double")), MBSA2 = structure(c(3, 8, 1, 1, NA), label = "People Who Do Not Get Ahead Should Blame Themselves Not the System", na_values = 8, format.spss = "F1.0", display_width = 0L, labels = c(`Strongly agree` = 1, Agree = 2, Disagree = 3, Stronglydisagree = 4, `No opinion` = 8), class = c("haven_labelled_spss", "haven_labelled", "vctrs_vctr", "double"))), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"), label = "NSDstat generated file")->out
#use the car::Recode command to convert values to 0 to 1
out$market1<-Recode(out$PESE15, "1=1; 3=0.75; 5=0.25; 7=0; 8=0.5; else=NA")
out$market2<-Recode(out$MBSA2, "1=1; 2=0.75; 3=0.25; 4=0; 8=0.5; else=NA")
#Use dplyr to try to calculate the average
out %>%
rowwise() %>%
mutate(market_liberalism=mean(
c_across(market1:market2))) -> out2
#setting value labels to NULL makes it work.
val_labels(out$market1)<-NULL
val_labels(out$market2)<-NULL
out %>%
rowwise() %>%
mutate(market_liberalism=mean(
c_across(market1:market2)))
For me car::Recode gives an error and does not work with haven labelled class but dplyr::recode does if you have labelled library loaded.
library(labelled)
library(dplyr)
# Recode both items to the 0-1 scale, then average them row by row.
# as.numeric() drops the haven_labelled class first, so the value labels no
# longer travel with the recoded numbers. `.default = NA_real_` mirrors the
# `else=NA` rule of the car::Recode() calls for codes that are not listed.
out %>%
  mutate(
    PESE15 = recode(
      as.numeric(PESE15),
      `1` = 1, `3` = 0.75, `5` = 0.25, `7` = 0, `8` = 0.5,
      .default = NA_real_
    ),
    MBSA2 = recode(
      as.numeric(MBSA2),
      `1` = 1, `2` = 0.75, `3` = 0.25, `4` = 0, `8` = 0.5,
      .default = NA_real_
    ),
    # Average only the two recoded columns. A bare `.` here would be the data
    # frame that was piped into mutate(), i.e. the values before recoding.
    market_liberalism = rowMeans(cbind(PESE15, MBSA2), na.rm = TRUE)
  )

Ggplot error : haven_labelled/vctrs_vctr/double

I am new here and still studying R so I am dealing with an error.
Here is what I get from console
Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.
I don't know what can I do to make it work. I want to get a scatterplot.
ggplot(data = diagnoza, aes(x = Plecc, y = P32.01))
Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.
Adding geom_point as suggested by @zx8754 gives me a scatter plot. There is still the warning you reported, which is related to some of your variables being of type haven_labelled, so I guess you imported your data from SPSS.
To get rid of this warning you could convert your variables to R factors using haven::as_factor. Probably it would be best to do that for the whole dataset after importing your data.
diagnoza <- structure(list(Plecc = c(2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 2,
1, 1, 1, 1, 2, 1, 1, 2), P32.01 = structure(c(3, 4, 5, 5, 5,
5, 5, 4, 3, 5, 3, 4, 3, 4, 5, 5, 5, 3, 4, 5), label = "P32.01. odpoczynek w domu (oglądanie TV)", format.spss = "F1.0", display_width = 12L, labels = c(Nigdy = 1,
Rzadko = 2, `Od czasu do czasu` = 3, Często = 4, `Bardzo często` = 5
), class = c("haven_labelled", "vctrs_vctr", "double"))), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
library(haven)
library(ggplot2)
# Replace the labelled column with a factor built from its value labels,
# then draw the scatter plot.
diagnoza[["P32.01"]] <- haven::as_factor(diagnoza[["P32.01"]])
ggplot(diagnoza, aes(x = Plecc, y = P32.01)) + geom_point()

R Error (subscript) logical subscript too long

I am attempting to adjust my standard errors by running the following code:
#################################################################################
# Metaregression -- Academic Model
#################################################################################
# Names of the moderator columns to include in the model
terms_1 <- c("Targeted_c",
"MOOSES_Rating_5_c", "Middle_c","High_c")
# Student_report_c is the reference variable (author's note)
# Turn the moderator names into a formula object
formula_academic <- reformulate(termlabels = c(terms_1))
formula_academic
# Build the covariance matrix that is passed to rma.mv() as V below
V_list_academic <- impute_covariance_matrix(vi = full_academic$variance, #variance column of the data
cluster = full_academic$Study_ID, #study ID
r = 0.80) #assumed correlation
MVfull_academic <- rma.mv(yi=ES_adjusted, #effect size
V = V_list_academic, #covariance matrix built above (author's note: this is what changes from the HE model)
mods = formula_academic, #moderator formula; add covariates here
random = ~1 | Study_ID/ES_ID, #nesting structure
test= "t", #use t-tests
data=full_academic, #define data
method="REML") #estimate variances using REML
MVfull_academic
# t-tests of each covariate
# NOTE(review): this is the call reported to fail with "(subscript) logical subscript too long".
# The author later found an NA in the data — check Study_ID and the model columns for missing values.
MVfull.coef_academic <- coef_test(MVfull_academic,#estimation model above
cluster=full_academic$Study_ID, #define cluster IDs
vcov = "CR2") #estimation method (author's note: CR2 is best)
MVfull.coef_academic
This is the part that returns an error:
MVfull_academic
#t-tests of each covariate #
MVfull.coef_academic <- coef_test(MVfull_academic,#estimation model above
cluster=full_academic$Study_ID, #define cluster IDs
vcov = "CR2") #estimation method (CR2 is best)
MVfull.coef_academic
The error is the following:
Error in x[fac == f, fac == f, drop = FALSE] :
(subscript) logical subscript too long
It sounds like something is not fitting within my data, but I'm not sure what it could be. It looks like everything in the dataset is the same length. How do I fix this error?
Here is my data:
structure(list(APA = structure(c("Barr et al. (2015)", "Blair & Ravor (2014)",
"Bos et al. (2019)", "Bos et al. (2019)", "Conduct Problems Prevention Research Group (1999)",
"Conduct Problems Prevention Research Group (1999)"), label = "APA", format.stata = "%215s"),
Intervention = structure(c("Facing History and Ourselves",
"Tools of the Mind", "BARR", "BARR", "Fast Track (Selective)",
"Fast Track (Selective)"), label = "Intervention", format.stata = "%74s"),
TxCluster = structure(c(32, 16, 1, 1, 27, 27), label = "Tx.\nCluster", format.stata = "%10.0g"),
ControlCluster = structure(c(30, 13, 1, 1, 27, 27), label = "Control.\nCluster", format.stata = "%10.0g"),
UnitofCluster = structure(c("schools", "schools", "", "",
"schools", "schools"), label = "Unit of Cluster", format.stata = "%10s"),
TxN = structure(c(587, 408, 1467, 1466, 419, 275), label = "Tx.N", format.stata = "%10.0g"),
ControlN = structure(c(700, 282, 1916, 1910, 418, 276), label = "Control.N", format.stata = "%10.0g"),
Total_N = structure(c(1287, 690, 3383, 3376, 837, 551), label = "Total_N", format.stata = "%10.0g"),
WebsiteCategoryacademicemot = structure(c("Academic", "Academic",
"Academic", "Academic", "Academic", "Academic"), label = "Website Category (academic, emotion, relations, problem behavior)", format.stata = "%20s"),
MOOSES = structure(c(4, 5, 5, 5, 5, 5), label = "MOOSES rating\n1= cognitive/lower level skills (e.g. emotional recog.; pencil tap", format.stata = "%10.0g"),
ES = structure(c(0.14, 0.13, 0.31, 0.11, -0.01, 0.17), label = "ES", format.stata = "%10.0g"),
TypeofMeasure = structure(c("student self-report", "Standardized assessment",
"school record", "school record", "official report", "standardized assessment"
), label = "Type of Measure", format.stata = "%23s"), ES_ID = structure(c(22,
41, 58, 59, 135, 138), format.stata = "%9.0g"), Study_ID = structure(c(5,
9, 11, 11, 19, 19), label = "group(APA)", format.stata = "%9.0g"),
Targeted = structure(c(0, 0, 0, 0, 0, 0), format.stata = "%9.0g"),
Primary = structure(c(0, 1, 0, 0, 1, 1), format.stata = "%9.0g"),
Middle = structure(c(0, 0, 0, 0, 0, 0), format.stata = "%9.0g"),
High = structure(c(1, 0, 1, 1, 0, 0), format.stata = "%9.0g"),
Significant = structure(c(1, 1, 1, 1, 1, 1), format.stata = "%9.0g"),
MOOSES_Rating_4 = structure(c(1, 0, 0, 0, 0, 0), format.stata = "%9.0g"),
MOOSES_Rating_5 = structure(c(0, 1, 1, 1, 1, 1), format.stata = "%9.0g"),
MOOSES_Rating_4_c = structure(c(0.295774638652802, -0.704225361347198,
-0.704225361347198, -0.704225361347198, -0.704225361347198,
-0.704225361347198), format.stata = "%9.0g"), MOOSES_Rating_5_c = structure(c(-0.253521114587784,
0.746478855609894, 0.746478855609894, 0.746478855609894,
0.746478855609894, 0.746478855609894), format.stata = "%9.0g"),
Targeted_c = structure(c(-0.239436626434326, -0.239436626434326,
-0.239436626434326, -0.239436626434326, -0.239436626434326,
-0.239436626434326), format.stata = "%9.0g"), Primary_c = structure(c(-0.718309879302979,
0.281690150499344, -0.718309879302979, -0.718309879302979,
0.281690150499344, 0.281690150499344), format.stata = "%9.0g"),
Middle_c = structure(c(-0.126760557293892, -0.126760557293892,
-0.126760557293892, -0.126760557293892, -0.126760557293892,
-0.126760557293892), format.stata = "%9.0g"), High_c = structure(c(0.845070421695709,
-0.154929578304291, 0.845070421695709, 0.845070421695709,
-0.154929578304291, -0.154929578304291), format.stata = "%9.0g"),
Full_Sample = structure(c(1287, 690, 3383, 3376, 837, 551
), format.stata = "%9.0g"), Clusters_Total = structure(c(62,
29, 2, 2, 54, 54), format.stata = "%9.0g"), ES_adjusted = structure(c(0.12521980702877,
0.116275534033775, 0.277272433042526, 0.0983869880437851,
-0.00894427206367254, 0.152052626013756), format.stata = "%9.0g"),
SE = structure(c(0.05644915625453, 0.0780460089445114, 0.0353467278182507,
0.0349567793309689, 0.0690869837999344, 0.0861022993922234
), format.stata = "%9.0g"), variance = structure(c(0.0439638122916222,
0.0306105446070433, 0.00127180037088692, 0.001214295392856,
0.02976069226861, 0.100570656359196), format.stata = "%9.0g")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
I just found an NA in my data, I think it may be that!

Resources