I have a data frame that contains multiple variables, each measured with multiple items at two different time points. I want to remove every row that contains only NA entries within a group of columns, where a group consists of the columns whose names share the same substring. Some of these groups contain multiple columns (e.g., those matched by grep("learn")), some only one (e.g., T1_age). This is (a part of) my original data frame:
# Example data: 10 rows with two T1 demographics plus the items of three
# groups (learn, aut, ssup), each present with a T2_ and a T3_ prefix.
data <- data.frame(
  # Demographics
  T1_age = c(39, 30, 20, 48, 27, 55, 37, 50, 50, 37),
  T1_sex = c(2, 1, 1, 2, 2, 1, 1, 2, 1, 1),
  # T2: learn items
  T2_learn1 = c(2, NA, 3, 4, 1, NA, NA, 2, 4, 4),
  T2_learn2 = c(1, NA, 4, 4, 1, NA, NA, 2, 4, 4),
  T2_learn3 = c(2, NA, 4, 4, 1, NA, NA, 3, 4, 4),
  T2_learn4 = c(2, NA, 2, 5, 5, NA, NA, 5, 5, 5),
  T2_learn5 = c(4, NA, 3, 4, 3, NA, NA, 3, 4, 3),
  # T2: aut items
  T2_aut1 = c(NA, NA, 4, 4, 4, NA, NA, 3, 5, 4),
  T2_aut2 = c(NA, NA, 4, 4, 4, NA, NA, 3, 5, 5),
  T2_aut3 = c(NA, NA, 4, 4, 3, NA, NA, 3, 5, 5),
  # T2: ssup items
  T2_ssup1 = c(1, NA, 4, 5, 4, NA, NA, 2, 4, 3),
  T2_ssup2 = c(3, NA, 4, 5, 5, NA, NA, 3, 4, 4),
  T2_ssup3 = c(4, NA, 4, 5, 5, NA, NA, 4, 4, 4),
  T2_ssup4 = c(2, NA, 3, 5, 5, NA, NA, 3, 4, 4),
  # T3: learn items
  T3_learn1 = c(3, NA, NA, 4, 4, NA, NA, 3, 3, 4),
  T3_learn2 = c(1, NA, NA, 4, 3, NA, NA, 3, 3, 4),
  T3_learn3 = c(3, NA, NA, 4, 4, NA, NA, 3, 3, 5),
  T3_learn4 = c(4, NA, NA, 5, 4, NA, NA, 4, 5, 5),
  T3_learn5 = c(4, NA, NA, 3, 4, NA, NA, 3, 3, 4),
  # T3: aut items
  T3_aut1 = c(NA, NA, NA, 4, 4, NA, NA, 3, 5, 5),
  T3_aut2 = c(NA, NA, NA, 3, 4, NA, NA, 3, 5, 5),
  T3_aut3 = c(NA, NA, NA, 3, 2, NA, NA, 3, 5, 5),
  # T3: ssup items
  T3_ssup1 = c(3, NA, NA, 5, 4, NA, NA, 2, 4, 1),
  T3_ssup2 = c(3, NA, NA, 5, 5, NA, NA, 4, 5, 5),
  T3_ssup3 = c(4, NA, NA, 5, 5, NA, NA, 4, 5, 3),
  T3_ssup4 = c(3, NA, NA, 5, 5, NA, NA, 4, 5, 4)
)
Now I already found a very horrible solution and I believe that could be improved. So this code basically does what I want:
library(dplyr)
library(tidyr)
# Keep a row only if it has at least one non-NA value in each group, i.e. the
# number of NAs among the group's columns is below the number of those columns.
# grep() matches on a name fragment, so T2_* and T3_* columns are pooled.
data <- data %>%
  filter(
    rowSums(is.na(.[, grep("learn", names(.))])) <
      ncol(.[, grep("learn", names(.))])
  ) %>%
  filter(
    rowSums(is.na(.[, grep("aut", names(.))])) <
      ncol(.[, grep("aut", names(.))])
  ) %>%
  filter(
    rowSums(is.na(.[, grep("ssup", names(.))])) <
      ncol(.[, grep("ssup", names(.))])
  ) %>%
  # The single-column "groups": rows with a missing age or sex are dropped too.
  drop_na(T1_age) %>%
  drop_na(T1_sex)
So the new data frame (and what I want to achieve) looks like this:
# Expected result: the 6 rows that remain after the filtering above.
data2 <- data.frame(
  # Demographics
  T1_age = c(20, 48, 27, 50, 50, 37),
  T1_sex = c(1, 2, 2, 2, 1, 1),
  # T2: learn items
  T2_learn1 = c(3, 4, 1, 2, 4, 4),
  T2_learn2 = c(4, 4, 1, 2, 4, 4),
  T2_learn3 = c(4, 4, 1, 3, 4, 4),
  T2_learn4 = c(2, 5, 5, 5, 5, 5),
  T2_learn5 = c(3, 4, 3, 3, 4, 3),
  # T2: aut items
  T2_aut1 = c(4, 4, 4, 3, 5, 4),
  T2_aut2 = c(4, 4, 4, 3, 5, 5),
  T2_aut3 = c(4, 4, 3, 3, 5, 5),
  # T2: ssup items
  T2_ssup1 = c(4, 5, 4, 2, 4, 3),
  T2_ssup2 = c(4, 5, 5, 3, 4, 4),
  T2_ssup3 = c(4, 5, 5, 4, 4, 4),
  T2_ssup4 = c(3, 5, 5, 3, 4, 4),
  # T3: learn items (the first remaining row is still NA at T3)
  T3_learn1 = c(NA, 4, 4, 3, 3, 4),
  T3_learn2 = c(NA, 4, 3, 3, 3, 4),
  T3_learn3 = c(NA, 4, 4, 3, 3, 5),
  T3_learn4 = c(NA, 5, 4, 4, 5, 5),
  T3_learn5 = c(NA, 3, 4, 3, 3, 4),
  # T3: aut items
  T3_aut1 = c(NA, 4, 4, 3, 5, 5),
  T3_aut2 = c(NA, 3, 4, 3, 5, 5),
  T3_aut3 = c(NA, 3, 2, 3, 5, 5),
  # T3: ssup items
  T3_ssup1 = c(NA, 5, 4, 2, 4, 1),
  T3_ssup2 = c(NA, 5, 5, 4, 5, 5),
  T3_ssup3 = c(NA, 5, 5, 4, 5, 3),
  T3_ssup4 = c(NA, 5, 5, 4, 5, 4)
)
Could you help me improve this a bit? Thank you!!!
You may iterate over grep in an sapply and check if the rowSums in the slices reach their number of columns.
# Name fragments that identify the column groups.
V <- c("learn", "aut", "ssup")
# One logical column per group: TRUE where a row is NA in every column whose
# name matches the group's fragment. lapply() + cbind() always yields a matrix;
# sapply() would return a plain vector for 1-row data (or a list for 0-row
# data), and the outer rowSums() would then fail.
all_na <- do.call(cbind, lapply(V, \(v) {
  X <- data[grep(v, names(data))]
  rowSums(is.na(X)) == ncol(X)
}))
# Keep only the rows that are not completely missing in any group.
res <- data[rowSums(all_na) == 0, ]
# Row names differ from data2 after subsetting, hence check.attributes = FALSE.
stopifnot(all.equal(res, data2, check.attributes = FALSE))
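Or it is probably enough to just check whether the number of NAs in the "hot" columns (everything except the demographics) reaches the number of those columns. Note that a single overall count like this only flags rows that are NA in every hot column; a row that is missing just one whole group (e.g., all "aut" items) still needs the per-group check.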
# A single NA count over all "hot" columns cannot tell "one whole group is
# missing" from "a few scattered NAs": it only drops rows that are NA
# everywhere and would keep e.g. the first example row (every `aut` item NA,
# the other groups answered). So test each group separately and keep the rows
# that have at least one non-NA value in every group.
answered <- lapply(V, \(v) rowSums(!is.na(data[grep(v, names(data))])) > 0)
# init = all TRUE keeps the result well-defined even if V is empty.
res1 <- data[Reduce(`&`, answered, init = rep(TRUE, nrow(data))), ]
stopifnot(all.equal(res1, data2, check.attributes = FALSE))
data2 is the result data frame you provide in OP. dim(data)[2] gives the same as ncol(data).
Note: R version 4.1.2 (2021-11-01)
I have a data frame from the 6th wave of the WVS. I computed an outgroup trust index (outgroup_index), and I want to divide this vector into 3 groups according to its tertiles.
I use base R functions to do that:
# Recoding will be based on tertiles
# Find the tertiles of the index. na.rm = TRUE because quantile() stops with an
# error on missing values; NA index values simply stay NA after cut().
tertiles <- quantile(filtered_df$outgroup_index, (0:3) / 3, na.rm = TRUE)
# Cut the target variable into tertiles. include.lowest = TRUE puts the minimum
# into the first interval instead of turning it into NA.
filtered_df$index_recoded <- with(
  filtered_df,
  cut(outgroup_index,
    tertiles,
    include.lowest = TRUE
  )
)
But I am wondering whether there are other, neater ways to do it (preferably using dplyr/tidyverse or other packages).
Data:
structure(list(V2 = structure(c(643, 643, 643, 643, 643, 643,
643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643,
643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643,
643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643,
643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643,
643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643,
643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643,
643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643, 643,
643, 643, 643, 643), label = "Country/region", format.spss = "F4.0", labels = c(`Not asked in survey` = -4,
Algeria = 12, Azerbaijan = 31, Argentina = 32, Australia = 36,
Armenia = 51, Brazil = 76, Belarus = 112, Chile = 152, China = 156,
`Taiwan ROC` = 158, Colombia = 170, Cyprus = 196, Ecuador = 218,
Estonia = 233, Georgia = 268, Palestine = 275, Germany = 276,
Ghana = 288, Haiti = 332, `Hong Kong SAR` = 344, India = 356,
Iraq = 368, Japan = 392, Kazakhstan = 398, Jordan = 400, `South Korea` = 410,
Kuwait = 414, Kyrgyzstan = 417, Lebanon = 422, Libya = 434, Malaysia = 458,
Mexico = 484, Morocco = 504, Netherlands = 528, `New Zealand` = 554,
Nigeria = 566, Pakistan = 586, Peru = 604, Philippines = 608,
Poland = 616, Qatar = 634, Romania = 642, Russia = 643, Rwanda = 646,
Singapore = 702, Slovenia = 705, `South Africa` = 710, Zimbabwe = 716,
Spain = 724, Sweden = 752, Thailand = 764, `Trinidad and Tobago` = 780,
Tunisia = 788, Turkey = 792, Ukraine = 804, Egypt = 818, `United States` = 840,
Uruguay = 858, Uzbekistan = 860, Yemen = 887), class = c("haven_labelled",
"vctrs_vctr", "double")), V105 = structure(c(4, 3, 3, 4, 3, 4,
4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 2, 2, 2, 1, 1,
2, 4, 2, 2, 2, 1, 2, 1, 4, 2, 1, 4, 2, 3, 3, 2, 3, 2, 3, 2, 3,
2, 2, 3, 3, 3, 3, 3, 3, NA, 3, 3, 4, 2, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 2, 2, 3, 3, 3, 2, 3, NA), label = "Trust: People you meet for the first time (B)", format.spss = "F3.0", labels = c(`SE:Inapplicable ; RU:Inappropriate response; HT: Dropped out` = -5,
`Not asked` = -4, `Not applicable` = -3, `No answer` = -2, `Don<U+00B4>t know` = -1,
`Trust completely` = 1, `Trust somewhat` = 2, `Do not trust very much` = 3,
`Do not trust at all` = 4), class = c("haven_labelled", "vctrs_vctr",
"double")), V106 = structure(c(3, 2, NA, 4, 2, 4, 4, 3, 3, 4,
3, 3, 4, 4, 4, 4, NA, NA, NA, NA, 3, 2, 2, 2, 2, 2, 2, 3, 3,
3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2,
2, 2, 1, 1, 2, 1, 4, 2, 1, 4, 2, 3, 3, 2, 2, 2, 3, 2, 3, 2, 2,
NA, 3, NA, 3, 3, 3, 2, 3, 3, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 2, 2, 2, 3, 2, 2, 2, 3), label = "Trust: People of another religion (B)", format.spss = "F3.0", labels = c(`DE,SE:Inapplicable ; RU:Inappropriate response; HT: Dropped` = -5,
`Not asked` = -4, `Not applicable` = -3, `No answer` = -2, `Don<U+00B4>t know` = -1,
`Trust completely` = 1, `Trust somewhat` = 2, `Do not trust very much` = 3,
`Do not trust at all` = 4), class = c("haven_labelled", "vctrs_vctr",
"double")), V107 = structure(c(3, 4, NA, 4, 2, 4, 4, 3, 3, 4,
3, 3, 4, 4, 4, 4, 3, 2, NA, NA, 3, 2, 2, 2, 2, 2, 2, 3, 3, 3,
3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 1, 2, 2, 2, 2, 1, 1, 2, 1, 2,
2, 1, 1, 2, 1, 4, 2, 1, 3, 2, 3, 2, 2, 2, 2, 3, 2, 3, 2, 2, NA,
3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 3, 2, 3, 2, 2, 2, 3), label = "Trust: People of another nationality (B)", format.spss = "F3.0", labels = c(`DE,SE:Inapplicable ; RU:Inappropriate response; HT: Dropped` = -5,
`Not asked` = -4, `Not applicable` = -3, `No answer` = -2, `Don<U+00B4>t know` = -1,
`Trust completely` = 1, `Trust somewhat` = 2, `Do not trust very much` = 3,
`Do not trust at all` = 4), class = c("haven_labelled", "vctrs_vctr",
"double")), V248 = structure(c(9, 8, 5, 8, 8, 8, 8, 9, 7, 9,
9, 5, 5, 6, 5, 5, 5, 5, 5, 4, 9, 9, 4, 9, 9, 3, 6, 9, 8, 9, 9,
9, NA, 9, 5, 9, 5, 7, 9, 5, 5, 9, 9, 8, 9, 9, 5, 5, 5, 9, 9,
8, 5, 8, 9, 9, 5, 8, 9, 9, 9, 7, 7, 5, 4, 6, 9, 6, 6, 9, 9, 5,
6, 7, 5, 4, 7, 7, 5, 5, 5, 5, 8, 9, 8, 9, 9, 9, 9, 9, 9, 9, 5,
9, 9, 5, 9, 8, 9, 5, 5), label = "Highest educational level attained", format.spss = "F3.0", labels = c(`AU: Inapplicable (No-school education) DE,SE:Inapplicable ;` = -5,
`Not asked` = -4, `Not applicable` = -3, `No answer` = -2, `Don<U+00B4>t know` = -1,
`No formal education` = 1, `Incomplete primary school` = 2, `Complete primary school` = 3,
`Incomplete secondary school: technical/ vocational type` = 4,
`Complete secondary school: technical/ vocational type` = 5,
`Incomplete secondary school: university-preparatory type` = 6,
`Complete secondary school: university-preparatory type` = 7,
`Some university-level education, without degree` = 8, `University - level education, with degree` = 9
), class = c("haven_labelled", "vctrs_vctr", "double")), V59 = structure(c(9,
5, 6, 8, 6, 7, NA, 8, 5, 3, 4, 7, 2, 1, 1, 6, 8, 6, NA, NA, 1,
5, NA, 6, 1, 2, 9, 5, 6, NA, NA, 3, 6, 6, 4, NA, 6, 6, NA, NA,
3, 9, 8, 10, 9, 6, 10, 9, 8, 9, 9, 10, 6, 4, 4, 6, 4, 10, 3,
3, 4, 3, 5, 4, 7, 3, 3, 4, 3, 7, 4, 6, 4, 1, 1, 6, 1, 1, 6, 1,
1, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 7, 3, 1, 5, 6, 7, 2, 4, 5
), label = "Satisfaction with financial situation of household", format.spss = "F3.0", labels = c(`HT: Dropped out survey;DE,SE:Inapplicable ; RU:Inappropriate` = -5,
`Not asked` = -4, `No answer` = -2, `Don<U+00B4>t know` = -1,
Dissatisfied = 1, `2` = 2, `3` = 3, `4` = 4, `5` = 5, `6` = 6,
`7` = 7, `8` = 8, `9` = 9, Satisfied = 10), class = c("haven_labelled",
"vctrs_vctr", "double")), V237 = structure(c(3, 2, 2, 2, NA,
1, 2, 2, 1, 2, 2, 2, 2, 3, 2, 1, 1, 3, 2, 2, NA, 2, 2, 3, 4,
2, 2, 1, NA, 1, 1, 1, NA, NA, NA, 1, NA, 1, 1, NA, 2, 1, 2, 1,
1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3,
2, 3, 2, 1, 2, 3, 2, 2, 2, NA, 2, 2, 4, 2, 2, 2, 1, 1, 2, 1,
2, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 3, 2, 2, NA, 3), label = "Family savings during past year", format.spss = "F3.0", labels = c(`DE,SE:Inapplicable ; RU:Inappropriate response; BH: Missing;` = -5,
`Not asked` = -4, `Not applicable` = -3, `No answer` = -2, `Don<U+00B4>t know` = -1,
`Save money` = 1, `Just get by` = 2, `Spent some savings and borrowed money` = 3,
`Spent savings and borrowed money` = 4), class = c("haven_labelled",
"vctrs_vctr", "double")), V105_rec = c(1, 2, 2, 1, 2, 1, 1, 1,
1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 3, 3, 3, 4, 4, 3, 1,
3, 3, 3, 4, 3, 4, 1, 3, 4, 1, 3, 2, 2, 3, 2, 3, 2, 3, 2, 3, 3,
2, 2, 2, 2, 2, 2, NA, 2, 2, 1, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 3, 3, 2, 2, 2, 3, 2, NA), V106_rec = c(2, 3, NA, 1, 3,
1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, NA, NA, NA, NA, 2, 3, 3, 3,
3, 3, 3, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 3,
3, 4, 4, 3, 3, 3, 3, 4, 4, 3, 4, 1, 3, 4, 1, 3, 2, 2, 3, 3, 3,
2, 3, 2, 3, 3, NA, 2, NA, 2, 2, 2, 3, 2, 2, 1, 3, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 3, 3, 3, 2), V107_rec = c(2,
1, NA, 1, 3, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 3, NA, NA, 2,
3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 4,
3, 3, 3, 3, 4, 4, 3, 4, 3, 3, 4, 4, 3, 4, 1, 3, 4, 2, 3, 2, 3,
3, 3, 3, 2, 3, 2, 3, 3, NA, 2, 3, 2, 2, 2, 3, 2, 2, 2, 3, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 2, 3, 2, 3, 3, 3, 2), outgroup_index = c(1.66666666666667,
2, 2, 1, 2.66666666666667, 1, 1, 1.66666666666667, 1.66666666666667,
1, 1.66666666666667, 2, 1, 1, 1, 1, 1.5, 2.5, 2, 2, 2, 3, 3,
3, 3, 3, 2.66666666666667, 2, 2, 2, 2, 1.33333333333333, 1.33333333333333,
2, 2, 2, 2, 2, 2, 2, 2, 2.66666666666667, 2, 3, 3, 3, 4, 4, 3,
2.66666666666667, 3, 3, 3.66666666666667, 4, 3, 4, 1, 3, 4, 1.33333333333333,
3, 2, 2.33333333333333, 3, 2.66666666666667, 3, 2, 3, 2, 3, 3,
2, 2, 2.5, 2, 2, 2, 3, 2, 2, 1.33333333333333, 3, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 3, 2.66666666666667, 2.66666666666667, 2,
2.66666666666667, 3, 2.66666666666667, 2), V59_rec = structure(c(5,
3, 3, 4, 3, 4, NA, 4, 3, 2, 2, 4, 1, 1, 1, 3, 4, 3, NA, NA, 1,
3, NA, 3, 1, 1, 5, 3, 3, NA, NA, 2, 3, 3, 2, NA, 3, 3, NA, NA,
2, 5, 4, 5, 5, 3, 5, 5, 4, 5, 5, 5, 3, 2, 2, 3, 2, 5, 2, 2, 2,
2, 3, 2, 4, 2, 2, 2, 2, 4, 2, 3, 2, 1, 1, 3, 1, 1, 3, 1, 1, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 4, 2, 1, 3, 3, 4, 1, 2, 3), labels = c(`Not satisfied at all` = 1,
`Rather not satisfied` = 2, `Neither satisfied, nor not satisfied` = 3,
`Rather satisfied` = 4, Satisfied = 5), class = c("haven_labelled",
"vctrs_vctr", "double")), V248_dummy = structure(c(1, 1, 0, 1,
1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0), labels = c(`A university education and higher` = 1,
`No university education` = 0), class = c("haven_labelled", "vctrs_vctr",
"double")), V237_rec = structure(c(3, 2, 2, 2, NA, 1, 2, 2, 1,
2, 2, 2, 2, 3, 2, 1, 1, 3, 2, 2, NA, 2, 2, 3, 3, 2, 2, 1, NA,
1, 1, 1, NA, NA, NA, 1, NA, 1, 1, NA, 2, 1, 2, 1, 1, 1, 1, 1,
1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 2, 3, 2, 1,
2, 3, 2, 2, 2, NA, 2, 2, 3, 2, 2, 2, 1, 1, 2, 1, 2, 3, 2, 2,
1, 2, 2, 2, 3, 3, 2, 3, 2, 2, NA, 3), labels = c(`Save money` = 1,
`Just get by` = 2, `Spent savings and borrowed money` = 3), class = c("haven_labelled",
"vctrs_vctr", "double"))), row.names = c(NA, -101L), class = c("tbl_df",
"tbl", "data.frame"), label = "filelabel")
A bit unintuitive, but ggplot2 has the functionality you are looking for.
# cut_number() bins by quantiles (groups with roughly equal numbers of
# observations), which is what tertiles are. cut_interval() would instead make
# equal-width bins and only agrees when the quantile breaks happen to be
# equally spaced.
filtered_df %>%
  mutate(index_recoded = ggplot2::cut_number(outgroup_index, 3))
And to verify the levels are the same:
# smaller dput would be nice
# `Data` stands for the structure(...) object posted in the question.
start <- Data
all(
  {
    # Reference: the base R tertile recoding from the question.
    filtered_df <- start
    tertiles <- quantile(filtered_df$outgroup_index, (0:3) / 3)
    filtered_df$index_recoded <- with(
      filtered_df,
      cut(outgroup_index,
        tertiles,
        include.lowest = TRUE
      )
    )
    filtered_df$index_recoded
  } == {
    # ggplot2 counterpart: cut_number() uses quantile breaks, so it matches the
    # reference in general. cut_interval() (equal-width bins) would only match
    # when the tertile breaks happen to be equally spaced.
    tv_df <- start
    tv_df %>%
      mutate(index_recoded = ggplot2::cut_number(outgroup_index, 3)) %>%
      pull(index_recoded)
  }
)
[1] TRUE
cut has a simpler syntax if you want to divide the data into a fixed number of equal-width intervals (note that these are not necessarily tertiles).
# A single number as `breaks` asks cut() for that many intervals over the
# range of the index.
filtered_df[["index_recoded"]] <- cut(filtered_df[["outgroup_index"]], breaks = 3)
You can also use it with labels = FALSE to get 1, 2 and 3 as output.
# labels = FALSE returns the integer interval codes instead of a factor.
filtered_df[["index_recoded"]] <- cut(
  filtered_df[["outgroup_index"]],
  breaks = 3,
  labels = FALSE
)