Describe or display the relationship between variables and the labels xgboost? - r

I have a model:
model<-xgboost(data=as.matrix(data[,-1]),label=data$Ethnicity, num_class=8, nrounds=50,objective="multi:softmax",lambda=1, eval_metric="merror")
data is a matrix of 94 variables from random survey questions, and the label is Ethnicity, a 0-7 variable coding race/ethnicity so that every number from 0 to 7 represents an ethnicity.
I found which variables are most important in the prediction:
xgb.importance(model=model)
## Feature Gain Cover Frequency
## 1: q97 0.0924173556 0.0388402250 0.016981237
## 2: q9 0.0603595554 0.0199381316 0.012749847
## 3: q7 0.0456855077 0.0447756304 0.066922777
## 4: q6 0.0436987577 0.0485072162 0.041311731
## 5: q8 0.0319606309 0.0212999077 0.015199599
## 6: q99 0.0276115402 0.0201090242 0.007961695
## 7: q89 0.0245865711 0.0249913356 0.023829408
## 8: q13 0.0197648132 0.0190748590 0.010912533
## 9: q81 0.0194462208 0.0140010066 0.021880742
## 10: q71 0.0192126872 0.0194684164 0.019709370
Now I am stuck, my question is how do I describe or display the relationship between these variables and the labels? TIA!
Here are some data from dput(head(data)):
structure(list(r = c(2, 6, 4, 4, 4, 4), q6 = c(1.73, 1.5, 1.9,
NA, 1.63, 1.7), q7 = c(54.43, 51.26, 66.68, NA, 68.49, 59.88),
q8 = c(2, 2, 1, 2, 1, 2), q9 = c(5, 5, 5, 5, 4, 5), q10 = c(5,
1, 1, 1, 3, 1), q11 = c(1, 1, 1, 2, 1, 1), q12 = c(1, 1,
1, 4, 1, 1), q13 = c(1, 1, 1, 4, 1, 1), q14 = c(1, 1, 1,
1, 1, 1), q15 = c(1, 1, 1, 1, 1, 1), q16 = c(1, 1, 3, 1,
1, 1), q17 = c(2, 1, NA, 1, 1, 1), q18 = c(3, 1, NA, 2, 1,
1), q19 = c(2, 1, NA, 1, 1, 1), q20 = c(2, 1, NA, 2, 1, 1
), q21 = c(2, 2, NA, 2, 1, 2), q22 = c(2, 1, 1, 1, 4, 2),
q23 = c(2, 1, NA, 1, 5, 2), q24 = c(1, 2, 1, 2, 1, 1), q25 = c(1,
2, 1, 2, 2, 1), q26 = c(2, 2, 1, 1, 1, 1), q27 = c(2, 2,
1, 2, 1, 1), q28 = c(2, 2, 2, 2, 1, 1), q29 = c(1, 1, NA,
1, 1, 3), q30 = c(1, 1, NA, 1, 1, 3), q31 = c(1, 2, NA, 1,
1, 1), q32 = c(6, 1, NA, 6, 6, 1), q33 = c(NA, 1, NA, 2,
5, 1), q34 = c(NA, 1, NA, 2, 4, 1), q35 = c(NA, 1, NA, 5,
5, 1), q36 = c(2, 1, NA, 3, 3, 1), q37 = c(1, 1, NA, 1, 1,
1), q38 = c(6, 1, NA, 4, 1, 1), q39 = c(1, 2, 2, 1, 1, 2),
q40 = c(3, 1, NA, 2, 7, 1), q41 = c(6, 1, 2, 5, 6, 3), q42 = c(5,
1, 5, 5, 5, 6), q43 = c(1, 1, 1, 2, 2, 2), q44 = c(1, 1,
1, 2, 2, NA), q45 = c(1, 1, 1, 5, 7, 4), q46 = c(1, 1, 1,
6, 5, 7), q47 = c(7, 1, NA, 7, 7, 6), q48 = c(6, 1, 7, 5,
5, 6), q49 = c(4, 1, NA, 6, 1, 4), q50 = c(1, 1, 1, 2, 3,
1), q51 = c(1, 1, 1, 1, 1, 1), q52 = c(1, 1, 1, 1, 1, 1),
q53 = c(1, 1, 1, 2, 3, 1), q54 = c(1, 1, 1, 1, 2, 1), q55 = c(1,
1, 1, 2, 1, 1), q56 = c(1, 1, 1, 1, 1, 1), q57 = c(1, 1,
1, 4, 4, 2), q58 = c(1, 1, 1, 1, 1, 1), q59 = c(1, 2, 2,
2, 1, 1), q60 = c(1, 2, 1, 1, 1, 1), q61 = c(7, 1, 2, 5,
6, 6), q62 = c(3, 1, 3, 5, 7, 5), q63 = c(3, 1, 3, 2, 4,
5), q64 = c(3, 1, 3, 3, 3, 2), q65 = c(2, 1, 2, 2, 2, 3),
q66 = c(4, 1, NA, 4, 4, 2), q67 = c(2, 3, 3, 2, 3, 2), q68 = c(1,
1, 2, 1, 1, 1), q69 = c(2, 3, 3, 2, 3, 3), q70 = c(2, 4,
4, 2, 1, 1), q71 = c(3, 2, 3, 1, 3, 2), q72 = c(4, 4, 4,
2, 3, 2), q73 = c(1, 2, 1, 1, 1, 2), q74 = c(2, 2, 3, 2,
2, 2), q75 = c(2, 2, 2, 2, 2, 1), q76 = c(7, 2, 2, 2, 2,
1), q77 = c(3, 3, 4, 4, 2, 7), q78 = c(1, 2, 4, 2, 1, 3),
q79 = c(4, 8, 6, 3, 1, 2), q80 = c(6, 4, 4, 3, 1, 4), q81 = c(5,
NA, 1, 4, 2, 1), q82 = c(7, 1, 6, 5, 2, 7), q83 = c(1, 1,
1, 6, 1, 6), q84 = c(1, 1, 1, 2, 1, 2), q85 = c(2, 2, 1,
2, 2, 2), q86 = c(1, 1, NA, 1, 1, 1), q87 = c(2, 2, NA, 2,
2, 1), q88 = c(4, 5, 5, 3, 1, 2), q89 = c(4, 2, 2, 4, 2,
4), q90 = c(2, 1, NA, NA, 1, 2), q91 = c(1, 1, 1, 3, 3, 1
), q92 = c(1, 1, 1, 2, 2, 5), q93 = c(4, 5, 7, 4, 7, 2),
q94 = c(3, 3, 2, 2, 3, 2), q95 = c(1, 4, 1, 1, 1, 4), q96 = c(1,
1, 1, 1, 1, 1), q97 = c(1, 1, 3, 1, 2, 3), q98 = c(1, 2,
2, 1, 1, 1), q99 = c(1, 1, 1, 1, 1, 2)), row.names = c(NA,
6L), class = "data.frame")

Related

Filter data frame to get only rows that have a value in column and another value in any column after first value, R

I am looking for a methodology to filter the following data frame so that I end up with only the rows that have a 1 in some column and a 2 in any other column after the column containing the 1. I am thankful for any help!
data_rel1 <- structure(list(job1category = c(NA, 1, 2, 2, 1, 1, 2, 1, 1, 1,
1, 1, 1, 1, NA, 1, 1, 4, 1, 1, NA, NA, 1, 1, 1, 1, 1, 1, 2, 1,
1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, NA, 7, NA, 5, 1,
5, NA, 2, 5, 1, NA, 1, 5, 1, 1, 1, 1, 5, 1, 5, 4, 1, 4, 5, 4,
NA, 5, NA, 5, 4, 3, 6, 1, 4, 4, 5, 4, 1, NA, 1, NA, 1, NA, 1,
1, 1, 1, 1, 4, 1, 1, 1, NA, 1, NA), job2category = c(3, 2, 1,
2, 3, 1, 2, 2, 1, 1, 1, NA, 2, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 5, 3, 3, 1, 1, 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1,
1, 1, 7, 7, 1, 1, 1, NA, 4, 1, 1, NA, 2, 1, 1, 1, 1, NA, 5, NA,
4, 5, 4, NA, 5, 2, 4, 4, 2, 7, 5, NA, 5, 2, NA, 4, NA, 1, 5,
NA, 1, NA, 1, 1, 1, 1, 5, 2, NA, 4, 1, 1, 1, NA, 1, NA), job3category = c(3,
2, 1, 2, 3, 1, 2, 2, 1, 1, 1, NA, 2, 1, NA, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 5, 3, 3, 1, 1, 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 2,
2, 1, 1, 1, 7, 7, 1, 1, 1, NA, 4, 1, 1, NA, 2, 1, 1, 1, 1, NA,
5, NA, 4, 5, 4, NA, 5, 2, 4, 4, 2, 7, 5, NA, 5, 2, NA, 4, NA,
1, 5, NA, 1, NA, 1, 1, 1, 1, 5, 2, NA, 4, 1, 1, 1, NA, 1, NA),
job4category = c(3, 2, 1, 2, 3, 1, 2, 2, 1, 1, 1, NA, 2,
1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 3, 3, 1, 1, 2,
4, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 7, 7, 1, 1, 1,
NA, 4, 1, 1, NA, 2, 1, 1, 1, 1, NA, 5, NA, 4, 5, 4, NA, 5,
2, 4, 4, 2, 7, 5, NA, 5, 2, NA, 4, NA, 1, 5, NA, 1, NA, 1,
1, 1, 1, 5, 2, NA, 4, 1, 1, 1, NA, 1, NA)), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
You can try this with an apply -
# Keep only the rows that contain a 1 and have a 2 in some column to the
# right of the first 1. which() ignores NA comparisons, so missing values
# never count as a match.
data_rel1[apply(data_rel1, 1, function(row) {
  ones <- which(row == 1)
  if (length(ones) == 0L) {
    # No 1 anywhere in this row: drop it.
    return(FALSE)
  }
  # which() returns positions in increasing order, so ones[1] is the first 1.
  any(which(row == 2) > ones[1])
}), ]
# job1category job2category job3category job4category
# <dbl> <dbl> <dbl> <dbl>
#1 1 2 2 2
#2 1 2 2 2
#3 1 2 2 2
#4 1 2 2 2
#5 1 2 2 2
#6 1 2 2 2
#7 1 2 2 2
#8 1 2 2 2

R reshape wide to long: multiple variables, observations with multiple indices

I have got some data containing observations with multiple indices $y_{ibc}$ stored in a messy wide format. I have been fiddling around with tidyr and reshape2 but could not figure it out (reshaping really is my nemesis).
Here is an example:
df <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9), a1b1c1 = c(5,
2, 1, 4, 3, 1, 0, 1, 3), a2b1c1 = c(3, 4, 1, 1, 3, 2, 1, 4, 4
), a3b1c1 = c(4, 0, 0, 1, 1, 1, 0, 0, 1), a1b2c1 = c(1, 0, 4,
2, 4, 1, 0, 4, 2), a2b2c1 = c(2, 0, 1, 0, 1, 0, 3, 2, 0), a3b2c1 = c(2,
4, 3, 0, 2, 3, 3, 3, 4), yc1 = c(1, 2, 2, 1, 2, 2, 2, 1, 1), a1b1c2 = c(4,
2, 3, 0, 4, 4, 2, 1, 4), a2b1c2 = c(3, 0, 3, 3, 4, 4, 3, 2, 2
), a3b1c2 = c(3, 1, 0, 1, 4, 0, 2, 2, 3), a1b2c2 = c(2, 2, 0,
3, 2, 1, 4, 1, 0), a2b2c2 = c(3, 0, 2, 3, 4, 4, 4, 0, 4), a3b2c2 = c(0,
0, 0, 2, 0, 0, 1, 4, 3), yc2 = c(2, 2, 2, 1, 2, 2, 2, 1, 1), X = c(5,
6, 3, 7, 4, 3, 2, 3, 2)), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
This is what I want (excerpt):
id b c y a1 a2 a3 X
1 1 b1 c1 1 5 3 4 5
2 1 b2 c1 1 1 2 2 5
3 1 b1 c2 2 4 3 3 5
4 1 b2 c2 2 2 3 0 5
Using tidyr & dplyr:
library(tidyverse)
df %>%
  # Stack every a?b?c? column and split its name by position into the
  # a (chars 1-2), b (chars 3-4) and c (remaining chars) parts.
  pivot_longer(
    cols = matches("a.b.c."),
    names_to = c("a", "b", "c"),
    names_pattern = "^(.{2})(.{2})(.*)$",
    values_to = "value"
  ) %>%
  # Pick the y value that belongs to the row's c index.
  mutate(
    y = case_when(
      c == "c1" ~ yc1,
      c == "c2" ~ yc2
    )
  ) %>%
  # Spread the a index back out into the a1, a2, a3 columns.
  pivot_wider(names_from = a, values_from = value) %>%
  # Final column order; this also drops yc1 and yc2.
  select(id, b, c, y, a1, a2, a3, X)
First, convert all your a/b/c columns to a long format & separate the 3 values into separate columns. Then combine your y columns into one depending on the value of c using mutate and case_when (you could also use if_else for two options but case_when is more expandable for more values). Then pivot your a columns back to wide format and use select to put them in the right order and get rid of the yc1 and yc2 columns.

When mutate_all and lapply disagree ... How to replace lapply with mutate_all

I'm here again to ask for your help!
I'm trying to figure out what's happening with mutate_all (or with me...).
Let's say I have this dataset:
ds <- structure(list(Q1 = structure(c(5, 4, 5, 5, 5, 5, 5, 5, 5, 5,
5, 4, 3, 5, 5, 5, 5, 5, 1, 4, 5, 5, 3, 4, 5, 5, 5, 5, 5, 2, 5,
5, 4, 5, 5, 3, 5, 5, 4, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4,
5, 4), label = "1 Para mim é igual se os meus amigos são heterossexuais ou homossexuais.", format.spss = "F1.0", display_width = 3L, class = "labelled", labels = c(`discordo totalmente` = 1,
discordo = 2, indiferente = 3, concordo = 4, `concordo totalmente` = 5
)), Q2 = structure(c(1, 1, 1, 1, 1, 1, 3, 1, 2, 3, 1, 4, 4, 4,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2), label = "A homossexualidade é uma perturbação psicológica/biológica.", format.spss = "F1.0", display_width = 5L, class = "labelled", labels = c(`discordo totalmente` = 1,
discordo = 2, indiferente = 3, concordo = 4, `concordo totalmente` = 5
)), Q3 = structure(c(5, 2, 5, 4, 5, 4, 5, 5, 5, 4, 5, 5, 2, 3,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 4, 5, 4), label = "Acredito que os pais e as mães homossexuais são tão competentes como os pais e mães heterossexuais.", format.spss = "F1.0", display_width = 5L, class = "labelled", labels = c(`discordo totalmente` = 1,
discordo = 2, indiferente = 3, concordo = 4, `concordo totalmente` = 5
)), Q4 = structure(c(1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 2,
1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 5, 1, 1, 2, 1, 3), label = "4 Todas as Lésbicas, Gays, Bissexuais, Transexuais, Transgêneros e Intersexuais (LGBTI) me deixam irritado.", format.spss = "F1.0", display_width = 4L, class = "labelled", labels = c(`discordo totalmente` = 1,
discordo = 2, indiferente = 3, concordo = 4, `concordo totalmente` = 5
)), Q5 = structure(c(1, 4, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 3, 3,
1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 3, 2,
1, 1, 1, 2, 2, 5, 1, 4, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 3), label = "A legalização do casamento entre pessoas do mesmo sexo é muito errada.", format.spss = "F1.0", display_width = 5L, class = "labelled", labels = c(`discordo totalmente` = 1,
discordo = 2, indiferente = 3, concordo = 4, `concordo totalmente` = 5
))), row.names = c(NA, -54L), class = c("tbl_df", "tbl", "data.frame"
))
Then I need to transform all variables into factors to plot them. I really like the dplyr approach:
ds_mutate <- ds %>% mutate_all(., factor, levels=1:5)
likert(ds_mutate)
But this error is coming up:
Error in likert(ds_mutate) :
All items (columns) must have the same number of levels
When I use lapply (nobody will convince me that 'apply' functions are intuitive...), it works pretty well:
> ds_apply <- lapply(ds, factor, levels=1:5) %>% as.data.frame()
> likert(ds_apply)
Item 1 2 3 4 5
1 Q1 1.851852 1.851852 9.259259 14.814815 72.222222
2 Q2 77.777778 9.259259 5.555556 7.407407 0.000000
3 Q3 0.000000 3.703704 1.851852 14.814815 79.629630
4 Q4 79.629630 14.814815 3.703704 0.000000 1.851852
5 Q5 72.222222 7.407407 14.814815 3.703704 1.851852
But as you can see, the str is (for me) the same...
I'm looking forward to hearing from you!!
Thank you!
There is one difference:
class(ds_mutate)
# [1] "tbl_df" "tbl" "data.frame"
class(ds_apply)
# [1] "data.frame"
The issue then arises from the fact that, in the call of likert, we have
nlevels = length(levels(items[, 1]))
where, in the former case,
length(levels(ds_mutate[, 1]))
# [1] 0
since
ds_mutate[, 1]
# A tibble: 54 x 1
# Q1
# <fct>
# 1 5
# 2 4
# 3 5
# 4 5
# 5 5
# 6 5
# 7 5
# 8 5
# 9 5
# 10 5
# … with 44 more rows
i.e., the result is a tibble. Also,
methods("levels")
# [1] levels.default
so that there is no levels method for tibbles. Notice also that
class(ds_mutate) <- c("data.frame", "tbl_df", "tbl")
ds_mutate[, 1]
# [1] 5 4 5 5 5 5 5 5 5 5 5 4 3 5 5 5 5 5 1 4 5 5 3 4 5 5 5 5 5 2 5 5 4 5 5 3 5 5 4 3 3 5 5 5
# [45] 5 5 5 5 5 5 5 4 5 4
# Levels: 1 2 3 4 5
in which case
likert(ds_mutate)
starts to work too. Without modifying classes you may also use
likert(data.frame(ds_mutate))
Extra: lapply in
lapply(ds, factor, levels = 1:5)
actually is really intuitive once we understand one thing: a data frame is a special case of a list where each list element is of the same length. Now, the way sapply or lapply works is that it goes over each element of the first argument: once we see ds as a data frame whose elements (since it's a list) are columns, it becomes clear how it operates. For the same reason, since the results of factor in this case are of the same length, the list resulting from the call to lapply can nicely be converted to a data frame.
I never used likert package but it looks like it doesn't take an object of the class tibble. This works for me:
likert(as.data.frame(ds_mutate))

Advanced if/then/loop function to create new columns

I am learning R (focused on the tidyverse packages) and am hoping that someone could help with the following problem that has me stumped.
I have a data-set that looks similar to the following:
library("tibble")
# Example input: one row per respondent, 16 responses each coded 1 or 2.
# Respondent "D" ends with r16 = 1 so that this input agrees with the
# expected output and the dput() sample data shown later (3 switches from
# 2 to 1, the last one at index 15).
# frame_data() is the older name of tibble::tribble().
myData <- frame_data(
  ~id, ~r1, ~r2, ~r3, ~r4, ~r5, ~r6, ~r7, ~r8, ~r9, ~r10, ~r11, ~r12, ~r13, ~r14, ~r15, ~r16,
  "A", 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  "B", 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  "C", 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2,
  "D", 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1,
  "E", 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
)
Basically, I have multiple rows of respondent data, and each respondent gave 16 responses of either "1" or "2".
For each respondent (i.e., each row) I would like to create an additional three columns:
The first new column - called "switchCount" - identifies the number of times the respondent switched from a "2" response to a "1" response.
The second new column - called "switch1" - identifies the index of the first time the respondent switched from a "2" response to a "1" response.
The third new column - called "switch2" - identifies the index of the final time the respondent switched from a "2" response to a "1" response.
If there is no switch and all values are "2", then return the index of 1 (as shown for respondent "A" in the expected output).
If there is no switch and all values are "1", then return the index of 16.
The final datatable should therefore look like this:
myData <- frame_data(
~id, ~r1, ~r2, ~r3, ~r4, ~r5, ~r6, ~r7, ~r8, ~r9, ~r10, ~r11, ~r12, ~r13, ~r14, ~r15, ~r16, ~switchCount, ~switch1, ~switch2,
"A", 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 1,
"B", 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4,
"C", 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 3, 9,
"D", 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 3, 6, 15,
"E", 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 16, 16
)
One approach could be to concatenate all response columns row wise and then find the occurrences of 2,1 using gregexpr
library(dplyr)
myData %>%
  rowwise() %>%
  mutate(
    # One ";"-separated string per respondent, e.g. "2;2;1;...".
    concat_cols = paste(
      r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, r16,
      sep = ";"
    ),
    # Number of "2" -> "1" transitions; gregexpr() reports -1 when there is
    # no match at all.
    switchCount = if (gregexpr("2;1", concat_cols)[[1]][1] == -1) {
      0
    } else {
      length(gregexpr("2;1", concat_cols)[[1]])
    },
    # Index of the first switch. Each response takes one character plus a
    # separator, so a match at character p corresponds to response
    # floor(p / 2) + 1. Without any switch: 1 if the row contains a "2",
    # otherwise 16.
    switch1 = if (switchCount == 0) {
      if (grepl("2", concat_cols)) 1 else 16
    } else {
      min(floor(gregexpr("2;1", concat_cols)[[1]] / 2) + 1)
    },
    # Index of the last switch, with the same fallback as switch1.
    switch2 = if (switchCount == 0) {
      if (grepl("2", concat_cols)) 1 else 16
    } else {
      max(floor(gregexpr("2;1", concat_cols)[[1]] / 2) + 1)
    }
  ) %>%
  select(-concat_cols)
Output is:
id r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14 r15 r16 switchCount switch1 switch2
1 A 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 1 1
2 B 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4
3 C 2 2 2 1 1 1 2 2 2 1 1 1 1 2 2 2 2 3 9
4 D 1 1 2 2 2 2 1 1 2 2 1 1 1 2 2 1 3 6 15
5 E 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 16 16
Sample data:
myData <- structure(list(id = c("A", "B", "C", "D", "E"), r1 = c(2, 2,
2, 1, 1), r2 = c(2, 2, 2, 1, 1), r3 = c(2, 2, 2, 2, 1), r4 = c(2,
2, 1, 2, 1), r5 = c(2, 1, 1, 2, 1), r6 = c(2, 1, 1, 2, 1), r7 = c(2,
1, 2, 1, 1), r8 = c(2, 1, 2, 1, 1), r9 = c(2, 1, 2, 2, 1), r10 = c(2,
1, 1, 2, 1), r11 = c(2, 1, 1, 1, 1), r12 = c(2, 1, 1, 1, 1),
r13 = c(2, 1, 1, 1, 1), r14 = c(2, 1, 2, 2, 1), r15 = c(2,
1, 2, 2, 1), r16 = c(2, 1, 2, 1, 1), switchCount = c(0, 1,
2, 3, 0), switch1 = c(1, 4, 3, 6, 16), switch2 = c(1, 4,
9, 15, 16)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))

How do you calculate rowMeans by column name, not column number

I have the following df bhs1:
structure(list(bhs1_1 = c(NA, 1, NA, 2, 1, 2, 2, 2, 1, 2, 1,
2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2), bhs1_2 = c(NA,
2, NA, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2,
1, 1, 2, 2, 2), bhs1_3 = c(NA, 1, NA, 2, 2, 2, 2, 2, 2, 2, 1,
2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2), bhs1_4 = c(NA,
2, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1,
1, 1, 2, 1, 1), bhs1_5 = c(NA, 1, NA, 1, 2, 2, 2, 2, 2, 2, 1,
2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1), bhs1_6 = c(NA,
1, NA, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, NA, 2, 1,
2, NA, 1, 1, 2), bhs1_7 = c(NA, 1, NA, 1, 2, 1, 1, 1, 1, 1, 2,
1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1), bhs1_8 = c(NA,
2, NA, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1,
2, 1, 2, 2, 2), bhs1_9 = c(NA, 1, NA, 2, 1, 1, 1, 1, 2, 1, 2,
1, 1, 1, NA, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 2), bhs1_10 = c(NA,
2, NA, 1, 2, 2, 2, 2, 1, 2, 1, 1, NA, 2, 1, 1, 1, 2, 1, 2, 2,
2, 2, 1, 1, 2), bhs1_11 = c(NA, 2, NA, 2, 2, 1, 1, 1, 2, 1, 1,
1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 2, 2, 1), bhs1_12 = c(NA,
2, NA, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1,
1, 2, 2, 1, 1), bhs1_13 = c(NA, 1, NA, 1, 2, 2, 2, 2, 1, 1, 1,
2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2), bhs1_14 = c(NA,
2, NA, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
1, 1, 2, 2, 1), bhs1_15 = c(NA, 1, NA, 2, 2, 2, 2, 2, 2, 1, 2,
2, 2, 2, 1, 1, 2, 2, 2, NA, 2, 2, 2, 1, 2, 2), bhs1_16 = c(NA,
2, NA, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1,
1, 2, 2, 2, 2), bhs1_17 = c(NA, 2, NA, 2, 2, 1, 1, 1, 2, 1, 1,
1, 1, 1, 2, 2, 1, NA, 2, 2, 1, 1, 1, 2, 2, 2), bhs1_18 = c(NA,
1, NA, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
1, 1, 2, 1, 1), bhs1_19 = c(NA, 1, NA, 2, 1, 2, 2, 2, 1, 2, 2,
2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2), bhs1_20 = c(NA,
2, NA, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
1, 1, 2, 2, 2)), row.names = c(NA, -26L), class = c("tbl_df",
"tbl", "data.frame"))
bhs1 was passed through as_tibble().
I am interested in calculating a new variable bhs1$total, that is the prorated mean across variables bhs1_1:bhs1_20. The reason for prorating means is so that observations with missing values can be compared to observations without missing values. At the risk of stating the obvious:
observation 1 may only have data on 18 variables, so I need to sum the recorded variables, ignore the NAs and, then divide by 18 to have a mean comparable to,
observation 2 who has data on 20 variables, who would have the summed recorded variables divided by 20.
I am aware of (although do not yet know how to execute) multiple imputation, but I do not want to use this for this exercise.
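I have tried the following code, which selects the columns by number (i.e. something like bhs1$meanTest <- rowMeans(bhs1[, 1:20], na.rm=TRUE)):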
# A tibble: 908 x 21
bhs1_1 bhs1_2 bhs1_3 bhs1_4 bhs1_5 bhs1_6 bhs1_7 bhs1_8 bhs1_9
* <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 NA NA NA NA NA NA NA NA NA
2 1 2 1 2 1 1 1 2 1
3 NA NA NA NA NA NA NA NA NA
4 2 2 2 1 1 2 1 2 2
5 1 1 2 1 2 1 2 2 1
6 2 1 2 1 2 2 1 2 1
7 2 2 2 1 2 2 1 2 1
8 2 1 2 1 2 2 1 2 1
9 1 2 2 1 2 2 1 1 2
10 2 2 2 1 2 1 1 2 1
# ... with 898 more rows, and 12 more variables: bhs1_10 <dbl>,
# bhs1_11 <dbl>, bhs1_12 <dbl>, bhs1_13 <dbl>, bhs1_14 <dbl>,
# bhs1_15 <dbl>, bhs1_16 <dbl>, bhs1_17 <dbl>, bhs1_18 <dbl>,
# bhs1_19 <dbl>, bhs1_20 <dbl>, meanTest <dbl>
Which works as expected. However, when I enter the column names instead of the column numbers, it fails:
> bhs1$meanTest <- rowMeans(bhs1[,"bhs1_1":"bhs1_20"], na.rm=TRUE)
Error in "bhs1_1":"bhs1_20" : NA/NaN argument
5. check_names_df(j, x)
4. `[.tbl_df`(bhs1, , "bhs1_1":"bhs1_20")
3. bhs1[, "bhs1_1":"bhs1_20"]
2. is.data.frame(x)
1. rowMeans(bhs1[, "bhs1_1":"bhs1_20"], na.rm = TRUE)
I think it is much more straight forward to use variable / column names instead of column numbers. Is there an elegant way to write code to meet this use case? If so, can someone please point me in the right direction and/or provide me with a sample?
Thank you in advance for your consideration and help.
We need to get a vector of names
# Build the 20 item names, then average those columns row-wise, ignoring NAs.
nm1 <- sprintf("bhs1_%d", 1:20)
bhs1[["meanTest"]] <- rowMeans(bhs1[nm1], na.rm = TRUE)
If the names do not follow a pattern and we want the columns between a start and a stop name, use match to get the two column indices and build the sequence between them (:)
# Column positions of the first and last wanted names, expanded to the full
# run of indices between them.
bounds <- match(c("bhs1_1", "bhs1_20"), names(bhs1))
nm1 <- bounds[1]:bounds[2]
and use that in rowMeans
Or if we are using the tidyverse, we can specify the range (:)
# Compute the row means of the bhs1_1:bhs1_20 range inside mutate(), so that
# meanTest is added (or overwritten) in place. Binding a separately computed
# vector back on with bind_cols() would instead create a second,
# name-repaired meanTest column whenever bhs1 already has one, e.g. when the
# code is run twice.
# The range can be replaced with all_of(nm1).
bhs1 <- bhs1 %>%
  mutate(meanTest = rowMeans(select(., bhs1_1:bhs1_20), na.rm = TRUE))
If we need rowMeans for several sets of columns, use map2 from purrr: pass the 'start' and 'end' column names as two lists of quosures, unquote them (!!) inside select to pick each column range, and then compute rowMeans on each selection
library(purrr)
# Walk over the start quosures (bhs1_1, bhs1_4) and end quosures
# (bhs1_5, bhs1_8) in parallel; for each pair, select that column range from
# bhs1 and take its row means with NAs removed. The combined result is then
# passed to bind_cols() together with bhs1.
# NOTE(review): the shape map2_df() builds from the per-range numeric
# vectors is not visible here; confirm it lines up with the rows of bhs1.
map2_df(quos(bhs1_1, bhs1_4), quos(bhs1_5, bhs1_8), ~
bhs1 %>%
select((!! .x) : (!! .y)) %>%
rowMeans(., na.rm = TRUE)) %>%
bind_cols(bhs1, .)
Or we can pass as a vector of strings and then do the conversion to symbol (with sym from rlang) and evaluate (!!)
# Same idea with the start/end column names given as strings: each name is
# turned into a symbol with rlang::sym() and unquoted with !! inside select().
# Both unquoted operands are wrapped in parentheses so that `:` applies to
# the two symbols, and the select() call is closed before the pipe so that
# rowMeans() stays inside the map2() lambda.
map2(c("bhs1_1", "bhs1_4"), c("bhs1_5", "bhs1_8"), ~
  bhs1 %>%
    select((!!rlang::sym(.x)):(!!rlang::sym(.y))) %>%
    rowMeans(., na.rm = TRUE)) %>%
  bind_cols(bhs1, .)

Resources