I have a task that's becoming quite difficult for me.
I have to create a variable (pr_test_1) to test whether a variable for a procedure (I10_PR1) is in a list of procedures, and this code is working great:
# Flag rows whose first procedure code (I10_PR1) appears in abl_pr:
# pr_test_1 is 1 on a match and 0 otherwise.
df <- mutate(df, pr_test_1 = ifelse(I10_PR1 %in% abl_pr, 1, 0))
However, I have 25 variables for procedures (I10_PR1 to I10_PR25) and I have to create one for each (pr_test_1 to pr_test_25).
I don't seem to find the right syntax to get a for loop to work.
Any help will be greatly appreciated!
dplyr::across allows you to apply a function to multiple columns as specified with a selector (the below uses the starts_with selector).
library(dplyr)
library(purrr)
# sample data: four procedure columns, each a random permutation of 1:100
df <- tibble::tibble(
  I10_PR1 = sample(100),
  I10_PR2 = sample(100),
  I10_PR3 = sample(100),
  I10_PR4 = sample(100)
)

# a sample list of values to compare against
match_list <- sample(10)

# For every column whose name starts with "I10_PR", add a companion column
# named "pr_test_<original name>" that is 1 where the value occurs in
# match_list and 0 otherwise.
df %>%
  mutate(
    across(
      starts_with("I10_PR"),
      \(x) if_else(x %in% match_list, 1, 0),
      .names = "pr_test_{.col}"
    )
  )
#> # A tibble: 100 × 8
#> I10_PR1 I10_PR2 I10_PR3 I10_PR4 pr_test_I10_PR1 pr_test_I10…¹ pr_te…² pr_te…³
#> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 93 45 47 46 0 0 0 0
#> 2 91 89 90 76 0 0 0 0
#> 3 16 32 30 24 0 0 0 0
#> 4 66 26 46 41 0 0 0 0
#> 5 53 51 79 9 0 0 0 1
#> 6 36 64 61 32 0 0 0 0
#> 7 45 75 23 25 0 0 0 0
#> 8 86 61 77 52 0 0 0 0
#> 9 17 87 64 53 0 0 0 0
#> 10 6 42 57 33 1 0 0 0
#> # … with 90 more rows, and abbreviated variable names ¹pr_test_I10_PR2,
#> # ²pr_test_I10_PR3, ³pr_test_I10_PR4
Created on 2022-10-26 with reprex v2.0.2
This for() loop works perfectly with your one (slightly modified) line of code and dynamic variable names
# Create pr_test_<i> from I10_PR<i>, one procedure column per iteration.
# The sample data has three procedure columns; the real data in the question
# has 25, so only the upper bound needs changing.
for (i in seq_len(3)) {
  src_col <- paste0("I10_PR", i)
  new_col <- paste0("pr_test_", i)
  # .data[[src_col]] looks the column up by its string name; !!new_col :=
  # uses the string as the name of the column being created.
  df <- mutate(df, !!new_col := ifelse(.data[[src_col]] %in% abl_pr, 1, 0))
}
Data used:
# Procedure codes to test against: the first ten letters of a random shuffle.
abl_pr <- head(sample(LETTERS), 10)

# Three procedure columns, each an independent random shuffle of A-Z.
I10_PR1 <- sample(LETTERS)
I10_PR2 <- sample(LETTERS)
I10_PR3 <- sample(LETTERS)
df <- data.frame(I10_PR1 = I10_PR1, I10_PR2 = I10_PR2, I10_PR3 = I10_PR3)
Related
Taking the airquality dataset (it ships with base R's datasets package, so MASS is not actually needed) as an example:
> head(airquality)
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
I want to create three columns:
Missing_Ozone, Missing_Total and Missing_Percent, such that:
Missing_Ozone = 1 if there is a missing value in the Ozone column in the current row.
Missing_total = total count of missing values in the current row
Missing_Percent = percentage of missing values in a row.
So for example, in row 1:
Missing_Ozone = 0, Missing_total = 0, Missing_percent = 0
In row 5:
Missing_Ozone = 1, Missing_total = 2, Missing percent = 100*(2/6)
In row 6:
Missing_Ozone = 0, Missing_total = 1, Missing percent = 100*(1/6)
I tried two approaches, without any luck:
The first was to iterate over each row and use an if statement:
library(MASS)
# Work on a copy of the data and start with the flag set to 0 in every row.
df_test = airquality
df_test$Missing_Ozone <- 0
# One iteration per row -- but note that the index i is never used in the body.
for(i in 1:nrow(df_test)){
# is.na(df_test$Ozone) tests the whole Ozone column and so yields one value
# per row, whereas `if` expects a single TRUE/FALSE condition.
if (is.na(df_test$Ozone)) {
# This assignment targets the entire Missing_Ozone column, not just row i.
df_test$Missing_Ozone <- 1
}
}
The second was to use just the if-statement on its own, without the for-loop.
Neither works, and I just get:
> df_test
Ozone Solar.R Wind Temp Month Day Missing_Ozone
1 41 190 7.4 67 5 1 0
2 36 118 8.0 72 5 2 0
3 12 149 12.6 74 5 3 0
4 18 313 11.5 62 5 4 0
5 NA NA 14.3 56 5 5 0
Any help is appreciated.
Edit: Also, does this type of data manipulation have a certain name? I found it hard to search online for a guide that goes through this type of data manipulation.
Tidyverse approach:
library(dplyr)
# Start from a fresh copy of the dataset and remember its original width,
# which is the denominator for the per-row share of missing values.
airquality <- datasets::airquality
cols <- ncol(airquality)

airquality <- airquality %>%
  mutate(
    # 1 when Ozone is missing in this row, 0 otherwise
    Missing_Ozone = if_else(is.na(Ozone), 1, 0),
    # `.` is the data frame that was piped in, so this counts NAs per row
    Missing_Total = rowSums(is.na(.)),
    # share of missing cells per row (a proportion between 0 and 1)
    Missing_Percent = Missing_Total / cols
  )
> head(airquality)
Ozone Solar.R Wind Temp Month Day Missing_Ozone Missing_Total Missing_Percent
1 41 190 7.4 67 5 1 0 0 0.0000000
2 36 118 8.0 72 5 2 0 0 0.0000000
3 12 149 12.6 74 5 3 0 0 0.0000000
4 18 313 11.5 62 5 4 0 0 0.0000000
5 NA NA 14.3 56 5 5 1 2 0.3333333
6 28 NA 14.9 66 5 6 0 1 0.1666667
Base R approach:
# Original number of columns: the denominator for the per-row missing share.
cols <- ncol(airquality)
# 1 when Ozone is missing in a row, 0 otherwise.
airquality[["Missing_Ozone"]] <- as.numeric(is.na(airquality[["Ozone"]]))
# Number of NA cells in each row (the flag column just added contains none).
airquality[["Missing_Total"]] <- rowSums(is.na(airquality))
# Share of missing cells per row (a proportion between 0 and 1).
airquality[["Missing_Percent"]] <- airquality[["Missing_Total"]] / cols
> head(airquality)
Ozone Solar.R Wind Temp Month Day Missing_Ozone Missing_Total Missing_Percent
1 41 190 7.4 67 5 1 0 0 0.0000000
2 36 118 8.0 72 5 2 0 0 0.0000000
3 12 149 12.6 74 5 3 0 0 0.0000000
4 18 313 11.5 62 5 4 0 0 0.0000000
5 NA NA 14.3 56 5 5 1 2 0.3333333
6 28 NA 14.9 66 5 6 0 1 0.1666667
edit: A note on performance
I would advise in general against usage of rowwise operations outside of very specific use cases. It will slow you down heavily as your data set scales. The execution time tends to grow linearly with your data, which is really, really bad. A little benchmark with a data set size of 6,426 rows instead of 153:
library(dplyr)
library(microbenchmark)
airquality <- datasets::airquality
# Rowwise
# Adds Missing_Ozone (integer 0/1), Missing_Total (NA count per row, computed
# row by row via rowwise() + c_across()) and Missing_Percent (share of the
# input's columns that are NA) to `data`, returning an ungrouped result.
approachA <- function(data) {
  flagged <- mutate(data, Missing_Ozone = as.integer(is.na(Ozone)))
  by_row <- rowwise(flagged)
  # Every column except the freshly added flag is inspected for NAs.
  by_row <- mutate(by_row, Missing_Total = sum(is.na((c_across(-Missing_Ozone)))))
  by_row <- mutate(by_row, Missing_Percent = Missing_Total / ncol(data))
  ungroup(by_row)
}
# Tidy
# Vectorised dplyr version: adds Missing_Ozone (numeric 0/1), Missing_Total
# (NA count per row of the input) and Missing_Percent (that count divided by
# the input's number of columns).
approachB <- function(data) {
  n_cols <- ncol(data)
  mutate(
    data,
    Missing_Ozone = as.numeric(is.na(Ozone)),
    # NAs are counted on the data frame as it was passed in
    Missing_Total = rowSums(is.na(data)),
    Missing_Percent = Missing_Total / n_cols
  )
}
# Base R
# Adds three columns to `data` and returns it:
#   Missing_Ozone   - 1 where Ozone is NA, otherwise 0
#   Missing_Total   - number of NA cells in the row
#   Missing_Percent - Missing_Total divided by the input's column count
approachC <- function(data) {
  n_cols <- ncol(data)
  ozone_is_na <- is.na(data$Ozone)
  data$Missing_Ozone <- as.numeric(ozone_is_na)
  # Counted after the flag is added; the flag itself never contains NA.
  data$Missing_Total <- rowSums(is.na(data))
  data$Missing_Percent <- data$Missing_Total / n_cols
  data
}
Result with data x 42: rowwise() has led to some orders of magnitude worse performance over both proposed approaches.
> test_data <- do.call("rbind", replicate(42, airquality, simplify = FALSE))
> set.seed(42)
> microbenchmark::microbenchmark(approachA(test_data), approachB(test_data), approachC(test_data))
Unit: microseconds
expr min lq mean median uq max neval cld
approachA(test_data) 243340.904 251838.3590 259083.8089 256546.9015 260567.8945 405326.615 100 b
approachB(test_data) 577.977 624.0610 723.8304 741.0955 770.3695 2382.756 100 a
approachC(test_data) 102.377 107.9735 139.5595 119.6175 129.4165 2074.231 100 a
Result with data x 420: Execution time of rowwise approach has grown 10x.
test_data <- do.call("rbind", replicate(420, airquality, simplify = FALSE))
> set.seed(42)
> microbenchmark::microbenchmark(approachA(test_data), approachB(test_data), approachC(test_data))
Unit: microseconds
expr min lq mean median uq max neval cld
approachA(test_data) 2519480.258 2620528.08 2671419.663 2672263.417 2707896.209 2907659.730 100 b
approachB(test_data) 1266.818 1488.71 1909.167 1576.327 1678.725 21011.147 100 a
approachC(test_data) 808.684 881.09 1220.151 1000.277 1067.907 8218.655 100 a
A solution using the dplyr package. rowwise and c_across allow us to perform calculations row by row.
library(dplyr)
# Flag missing Ozone values first, then switch to row-by-row evaluation.
flagged <- mutate(airquality, Missing_Ozone = as.integer(is.na(Ozone)))
by_row <- rowwise(flagged)
# c_across(-Missing_Ozone) collects every other column of the current row.
by_row <- mutate(by_row, Missing_Total = sum(is.na((c_across(-Missing_Ozone)))))
by_row <- mutate(by_row, Missing_Percent = Missing_Total / ncol(airquality))
# Drop the rowwise grouping again so later verbs work on whole columns.
dat <- ungroup(by_row)
dat
# # A tibble: 153 x 9
# Ozone Solar.R Wind Temp Month Day Missing_Ozone Missing_Total Missing_Percent
# <int> <int> <dbl> <int> <int> <int> <int> <int> <dbl>
# 1 41 190 7.4 67 5 1 0 0 0
# 2 36 118 8 72 5 2 0 0 0
# 3 12 149 12.6 74 5 3 0 0 0
# 4 18 313 11.5 62 5 4 0 0 0
# 5 NA NA 14.3 56 5 5 1 2 0.333
# 6 28 NA 14.9 66 5 6 0 1 0.167
# 7 23 299 8.6 65 5 7 0 0 0
# 8 19 99 13.8 59 5 8 0 0 0
# 9 8 19 20.1 61 5 9 0 0 0
# 10 NA 194 8.6 69 5 10 1 1 0.167
# # ... with 143 more rows
This question already has answers here:
Create new dummy variable columns from categorical variable
(8 answers)
How to make a function in R to recode a variable into new binary columns? (with ifelse statement) [duplicate]
(3 answers)
Closed 1 year ago.
I want to one-hot encode in R through tidyverse, and not use packages such as caret, mltools, etc.
## Load vcd package
library(vcd)
## Load Arthritis dataset (data frame)
data(Arthritis)
Arthritis[1:5, ][2:5]
Treatment Sex Age Improved
1 Treated Male 27 Some
2 Treated Male 29 None
3 Treated Male 30 None
4 Treated Male 32 Marked
5 Treated Male 46 Marked
Is there an easy way to do this in tidyverse where I keep n-1 of the values for each categorical column? For example Sex is binary in this dataset so I would only need a one-hot encoded column for either Male or Female. The age feature would be ignored.
For your specific example you could do this:
library(dplyr)
# Sex is a two-level factor, so its integer codes are 1 and 2; subtracting 1
# turns them into a 0/1 indicator.
arthritis_tbl <- as_tibble(Arthritis) # not necessary, just using it for output readability
mutate(arthritis_tbl, sex_male = as.integer(Sex) - 1)
#> # A tibble: 84 × 6
#> ID Treatment Sex Age Improved sex_male
#> <int> <fct> <fct> <int> <ord> <dbl>
#> 1 57 Treated Male 27 Some 1
#> 2 46 Treated Male 29 None 1
#> 3 77 Treated Male 30 None 1
#> 4 17 Treated Male 32 Marked 1
#> 5 36 Treated Male 46 Marked 1
#> 6 23 Treated Male 58 Marked 1
#> 7 75 Treated Male 59 None 1
#> 8 39 Treated Male 59 Marked 1
#> 9 33 Treated Male 63 None 1
#> 10 55 Treated Male 63 None 1
#> # … with 74 more rows
This only works because Sex is a factor variable with two levels/distinct values. More complex variables will need more attention, unless you are willing to use a function from a package.
You are asking for a tidyverse solution. The recipes package is part of tidymodels.
library(recipes)
# Declare the model roles, then request dummy columns for Sex and Treatment.
dummy_recipe <- step_dummy(recipe(Arthritis, Improved ~ .), Sex, Treatment)

# prep() estimates the recipe from its training data; bake() applies it.
bake(prep(dummy_recipe), Arthritis)
#> # A tibble: 84 × 5
#> ID Age Improved Sex_Male Treatment_Treated
#> <int> <int> <ord> <dbl> <dbl>
#> 1 57 27 Some 1 1
#> 2 46 29 None 1 1
#> 3 77 30 None 1 1
#> 4 17 32 Marked 1 1
#> 5 36 46 Marked 1 1
#> 6 23 58 Marked 1 1
#> 7 75 59 None 1 1
#> 8 39 59 Marked 1 1
#> 9 33 63 None 1 1
#> 10 55 63 None 1 1
#> # … with 74 more rows
I agree with Till that recipes is the way to go here. But if you want a solution strictly from the tidyverse, you could do something like this:
library(vcd)
library(tidyverse)
# Adds a data-frame column `d` holding one logical indicator per distinct
# value of Improved, minus the first one (n - 1 encoding).
Arthritis %>%
as_tibble() %>%
# For each distinct Improved value, named after itself via set_names(),
# build a logical column that is TRUE where Improved equals that value.
mutate(d = map_dfc(unique(Improved) %>%
set_names(.),
~ Improved == .x
) %>%
# Drop the first indicator column so only n - 1 remain.
.[-1]
)
#> # A tibble: 84 × 6
#> ID Treatment Sex Age Improved d$None $Marked
#> <int> <fct> <fct> <int> <ord> <lgl> <lgl>
#> 1 57 Treated Male 27 Some FALSE FALSE
#> 2 46 Treated Male 29 None TRUE FALSE
#> 3 77 Treated Male 30 None TRUE FALSE
#> 4 17 Treated Male 32 Marked FALSE TRUE
#> 5 36 Treated Male 46 Marked FALSE TRUE
#> 6 23 Treated Male 58 Marked FALSE TRUE
#> 7 75 Treated Male 59 None TRUE FALSE
#> 8 39 Treated Male 59 Marked FALSE TRUE
#> 9 33 Treated Male 63 None TRUE FALSE
#> 10 55 Treated Male 63 None TRUE FALSE
#> # … with 74 more rows
You could use a combination of pivot_longer and pivot_wider for this.
# specify the columns to encode here
encode_cols <- c("Sex", "Treatment", "Improved")

Arthritis %>%
  as_tibble() %>% # not necessary, for better viewing
  # a common type is needed so the encoded columns can share one value column
  mutate(across(everything(), as.character)) %>%
  pivot_longer(
    all_of(encode_cols),
    names_to = "variable",
    values_to = "value"
  ) %>%
  # every (variable, value) pair that occurs gets the indicator 1 ...
  mutate(ind = 1) %>%
  unite(col_name, variable, value) %>%
  # ... and every pair that does not occur for an ID is filled with 0
  pivot_wider(values_from = ind, names_from = col_name, values_fill = 0)
For the n-1, once the data is in long format, you could filter out one of the values
long_format <- Arthritis %>%
  as_tibble() %>%
  mutate(across(everything(), as.character)) %>%
  pivot_longer(
    c(Sex, Treatment, Improved),
    names_to = "variable",
    values_to = "value"
  ) %>%
  mutate(ind = 1)

# for the n-1: the first value of each variable (count() sorts them) is the
# reference category and gets no indicator column. The reference is identified
# by variable AND value, so equal values in different variables cannot clash.
reference_cols <- long_format %>%
  count(variable, value) %>%
  group_by(variable) %>%
  slice(1) %>%
  ungroup() %>%
  unite(col_name, variable, value) %>%
  pull(col_name)

# Widen first and drop the reference columns afterwards. Filtering the long
# data before widening would remove every ID whose values are all reference
# categories, silently losing those rows.
long_format %>%
  unite(col_name, variable, value) %>%
  pivot_wider(values_from = ind, names_from = col_name, values_fill = 0) %>%
  select(-all_of(reference_cols))
# A tibble: 84 x 6
ID Age Sex_Male Treatment_Treated Improved_Some Improved_None
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 57 27 1 1 1 0
2 46 29 1 1 0 1
3 77 30 1 1 0 1
4 17 32 1 1 0 0
5 36 46 1 1 0 0
6 23 58 1 1 0 0
7 75 59 1 1 0 1
8 39 59 1 1 0 0
9 33 63 1 1 0 1
10 55 63 1 1 0 1
You may be able to use just model.matrix for this. I've altered your sample data a little to ensure there are 2 or more levels for all:
# Five-row sample with columns Treatment, Sex, Age and Improved, edited so
# that each of the three character columns has at least two distinct values.
dat <- structure(list(Treatment = c("Treated", "Treated", "UnTreated", "Treated", "Treated"), Sex = c("Male", "Male", "Male", "FeMale", "Male"), Age = c(27L, 29L, 30L, 32L, 46L), Improved = c("Some", "None", "None", "Marked", "Marked")), class = "data.frame", row.names = c("1", "2", "3", "4", "5"))
dat
# Treatment Sex Age Improved
# 1 Treated Male 27 Some
# 2 Treated Male 29 None
# 3 UnTreated Male 30 None
# 4 Treated FeMale 32 Marked
# 5 Treated Male 46 Marked
From there,
# vapply() guarantees a logical vector even for a data frame without columns,
# where sapply() would return an empty list.
isnum <- vapply(dat, is.numeric, logical(1))
# Encode only non-numeric columns that actually vary.
iscat <- !isnum & lengths(lapply(dat, unique)) > 1
# Build the formula text once and reuse it below; "0 +" removes the intercept.
fml_text <- paste("~ 0 +", paste(names(dat)[iscat], collapse = " + "))
fml_text
# [1] "~ 0 + Treatment + Sex + Improved"
cbind(
  dat[, !iscat, drop = FALSE],
  model.matrix(formula(fml_text), data = dat)
)
# Age TreatmentTreated TreatmentUnTreated SexMale ImprovedNone ImprovedSome
# 1 27 1 0 1 0 1
# 2 29 1 0 1 1 0
# 3 30 0 1 1 1 0
# 4 32 1 0 0 0 0
# 5 46 1 0 1 0 0
I am applying a user-defined function to the numeric variables of a dataset, but when I apply it with a map function the first column of each result is called x instead of the variable's name. How do I replace x with the variable name in map functions?
dataset: hd_trn
age sex cp trestbps chol fbs restecg thalach exang
<int> <fctr> <fctr> <int> <int> <fctr> <fctr> <int> <fctr>
63 1 1 145 233 1 2 150 0
67 1 4 160 286 0 2 108 1
67 1 4 120 229 0 2 129 1
37 1 3 130 250 0 0 187 0
41 0 2 130 204 0 2 172 0
56 1 2 120 236 0 0 178 0
user defined function to calculate high freq elements column wise
# Frequency table of `x` as a data frame (columns `x` and `Freq`), reduced to
# the five most frequent values via top_n() and sorted by descending frequency.
top_freq_elements <- function(x) {
  freq_tbl <- as.data.frame(table(x))
  most_frequent <- top_n(freq_tbl, 5, Freq)
  arrange(most_frequent, desc(Freq))
}
Applying function
# Apply the helper to every numeric column; the result is a list with one
# frequency table per column.
hd_trn %>%
  select_if(is.numeric) %>%
  map(top_freq_elements)
######### output #########
x Freq
<fctr> <int>
54 51
58 43
55 41
56 38
57 38
desired: In the above output I am looking to get variable name instead of x
Tried reconstructing code below using imap but that is also not giving variable name:
# Attempt to label each frequency table with its column name via imap(),
# which passes both the column (feature_value) and its name (feature_name).
hd_trn %>%
select_if(is.numeric) %>%
imap(function(feature_value, feature_name){
table(feature_value) %>%
as.data.frame() %>% #head()
# rename() takes the left-hand side literally, so the column ends up being
# called "feature_name" instead of the string stored in feature_name.
rename(feature_name = feature_value) %>%
top_n(5, Freq) %>%
arrange(desc(Freq))
})
######### output #########
feature_name Freq
<fctr> <int>
54 51
58 43
55 41
56 38
57 38
You can rename the first column of each data frame in the resulting list:
library(dplyr)
library(purrr)
# For each numeric column build a frequency table whose first column carries
# the column's own name, keeping the five most frequent values.
iris %>%
  select(where(is.numeric)) %>%
  imap(\(values, col_name) {
    freq_tbl <- as.data.frame(table(values))
    # the first column holds the tabulated values; label it with the name
    names(freq_tbl)[1] <- col_name
    most_frequent <- slice_max(freq_tbl, Freq, n = 5)
    arrange(most_frequent, desc(Freq))
  })
This could be achieved using e.g. curly-curly {{ and := in rename like so:
# Tabulates `x`, converts the table to a data frame (columns `x` and `Freq`),
# keeps the five most frequent values and sorts them by descending frequency.
top_freq_elements <- function(x) {
  counts <- as.data.frame(table(x))
  arrange(top_n(counts, 5, Freq), desc(Freq))
}
library(dplyr)
library(purrr)
# One frequency table per numeric column, with the value column named after
# the column it was computed from.
hd_trn %>%
  select_if(is.numeric) %>%
  imap(function(values, col_name) {
    freq_tbl <- as.data.frame(table(values))
    # inject the string held in col_name as the new column name
    freq_tbl <- rename(freq_tbl, !!col_name := values)
    arrange(top_n(freq_tbl, 5, Freq), desc(Freq))
  })
#> $age
#> age Freq
#> 1 67 2
#> 2 37 1
#> 3 41 1
#> 4 56 1
#> 5 63 1
#>
#> $sex
#> sex Freq
#> 1 1 5
#> 2 0 1
#>
#> $cp
#> cp Freq
#> 1 2 2
#> 2 4 2
#> 3 1 1
#> 4 3 1
#>
#> $trestbps
#> trestbps Freq
#> 1 120 2
#> 2 130 2
#> 3 145 1
#> 4 160 1
I'm trying to split columns into new rows keeping the data of the first two columns.
d1 <- data.frame(a=c(100,0,78),b=c(0,137,117),c.1=c(111,17,91), d.1=c(99,66,22), c.2=c(11,33,44), d.2=c(000,001,002))
d1
a b c.1 d.1 c.2 d.2
1 100 0 111 99 11 0
2 0 137 17 66 33 1
3 78 117 91 22 44 2
Expected results would be:
a b c d
1 100 0 111 99
2 100 0 11 0
3 0 137 17 66
4 0 137 33 1
5 78 117 91 22
6 78 117 44 2
I have made multiple attempts with dplyr, but it seems that is not the right approach.
If you want to stay in dplyr/tidyverse, you want tidyr::pivot_longer with a special reference to .value -- see the pivot vignette for more:
library(tidyverse)
d1 <- data.frame(
a = c(100, 0, 78),
b = c(0, 137, 117),
c.1 = c(111, 17, 91),
d.1 = c(99, 66, 22),
c.2 = c(11, 33, 44),
d.2 = c(000, 001, 002)
)
# Split each dotted column name at the "." into a target column (".value":
# c or d) and a group id (1 or 2), producing one row per original row and
# group.
pivot_longer(
  d1,
  cols = contains("."),
  names_to = c(".value", "group"),
  names_sep = "\\."
)
#> # A tibble: 6 x 5
#> a b group c d
#> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 100 0 1 111 99
#> 2 100 0 2 11 0
#> 3 0 137 1 17 66
#> 4 0 137 2 33 1
#> 5 78 117 1 91 22
#> 6 78 117 2 44 2
Created on 2020-05-11 by the reprex package (v0.3.0)
This could solve your issue:
# Try this
# Take the ".1" and the ".2" column pair together with the key columns a and
# b, selecting by name so the result does not depend on column positions.
a1 <- d1[, c("a", "b", "c.1", "d.1")]
a2 <- d1[, c("a", "b", "c.2", "d.2")]
names(a1) <- names(a2) <- c("a", "b", "c", "d")
DF <- rbind(a1, a2)
# rbind() puts all ".1" rows before all ".2" rows; reorder so that the two
# rows derived from the same original row are adjacent, as in the expected
# output, and renumber the rows 1..n.
DF <- DF[order(rep(seq_len(nrow(d1)), times = 2)), ]
rownames(DF) <- NULL
The posted answers are good, here's my attempt:
df <- data.frame(
  a = c(100, 0, 78),
  b = c(0, 137, 117),
  c.1 = c(111, 17, 91),
  d.1 = c(99, 66, 22),
  c.2 = c(11, 33, 44),
  d.2 = c(000, 001, 002)
)

# Make 2 pivot long operations: one stacks the c.* columns into `c`,
# the other stacks the d.* columns into `d`.
df_c <- df %>%
  select(-c(d.1, d.2)) %>%
  pivot_longer(cols = c("c.1", "c.2"), values_to = "c") %>%
  select(-name)

df_d <- df %>%
  select(-c(c.1, c.2)) %>%
  pivot_longer(cols = c("d.1", "d.2"), values_to = "d") %>%
  select(-name)

# bind them without the "key" columns (a and b are already in df_c)
bind_cols(df_c, df_d["d"])
Which produces
# A tibble: 6 x 4
a b c d
<dbl> <dbl> <dbl> <dbl>
1 100 0 111 99
2 100 0 11 0
3 0 137 17 66
4 0 137 33 1
5 78 117 91 22
6 78 117 44 2
Can someone think of a more interesting way to combine multiple factor columns into a single numeric column?
MWE dataset:
# Three two-level factors: the first level is the empty string (not chosen),
# the second is the label of the price band that was chosen.
df <- data.frame(
  q.82 = factor(c(1, 2, 2, 1, 1), labels = c("", "$80 and above")),
  q.77 = factor(c(2, 1, 1, 1, 1), labels = c("", "$75 to $79")),
  q.72 = factor(c(1, 1, 1, 2, 2), labels = c("", "$70 to $74"))
)
str(df$q.82)
Factor w/ 2 levels "","$80 and above": 1 2 2 1 1
df looks like this:
q.82 q.77 q.72
1 $75 to $79
2 $80 and above
3 $80 and above
4 $70 to $74
5 $70 to $74
What I'd like is something like this, where the columns are numeric:
q.82 q.77 q.72 q
1 0 77 0 77
2 82 0 0 82
3 82 0 0 82
4 0 0 72 72
5 0 0 72 72
The following works, but seems klunky—mostly because the actual dataset has many columns.
# Recode each factor column by hand. as.numeric() on a factor returns the
# level codes, which here are 1 for the empty level and 2 for the labelled
# one; code 2 is then replaced by the number from the column name and code 1
# by 0.
df$q.82 <- as.numeric(as.factor(df$q.82))
df$q.82[df$q.82 == 2] <- 82
df$q.82[df$q.82 == 1] <- 0
# Same three steps for q.77 ...
df$q.77 <- as.numeric(as.factor(df$q.77))
df$q.77[df$q.77 == 2] <- 77
df$q.77[df$q.77 == 1] <- 0
# ... and for q.72.
df$q.72 <- as.numeric(as.factor(df$q.72))
df$q.72[df$q.72 == 2] <- 72
df$q.72[df$q.72 == 1] <- 0
# Combine the three recoded columns into the single numeric column q.
df <- df %>% mutate(q=q.82+q.77+q.72)
A possible approach with base R using sapply:
For each column, replace non-empty strings by the numeric part of the column name and replace empty strings by zero.
Add an additional column q that contains the summed value of each row.
# Recode every column: a non-empty string becomes the number embedded in the
# column name ("q.82" -> 82), an empty string becomes 0. lapply() over a named
# vector always yields one named list element per column, so the result is a
# data frame of the right shape even for a single-row (or empty) `df`, where
# sapply() would collapse to a plain vector and break rowSums().
out_df <- as.data.frame(
  lapply(setNames(names(df), names(df)), function(name) {
    ifelse(
      nchar(as.character(df[[name]])) > 0,
      as.numeric(sub("^q\\.", "", name)),
      0
    )
  })
)
# Row-wise total of the recoded columns.
out_df$q <- rowSums(out_df)
out_df
#> q.82 q.77 q.72 q
#> 1 0 77 0 77
#> 2 82 0 0 82
#> 3 82 0 0 82
#> 4 0 0 72 72
#> 5 0 0 72 72
Similarly, using the tidyverse:
library(tidyverse)
# imap_dfc() hands each column together with its name to the function and
# column-binds the results into one tibble.
df_out <- imap_dfc(df, \(column, col_name) {
  # number embedded in the column name, e.g. "q.82" -> 82
  code <- as.numeric(str_remove(col_name, "^q\\."))
  if_else(nchar(as.character(column)) > 0, code, 0)
}) %>%
  # `.` is the recoded tibble, so q is the row-wise total of its columns
  mutate(q = rowSums(.))
df_out
#> # A tibble: 5 x 4
#> q.82 q.77 q.72 q
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0 77 0 77
#> 2 82 0 0 82
#> 3 82 0 0 82
#> 4 0 0 72 72
#> 5 0 0 72 72
Or with data.table:
library(data.table)
# Convert df to a data.table by reference, then overwrite each column in
# place with its recoded values.
setDT(df)
for (col in names(df)) {
  # number embedded in the column name, e.g. "q.82" -> 82
  code <- as.numeric(sub("^q\\.", "", col))
  set(df, j = col, value = ifelse(nchar(as.character(df[[col]])) > 0, code, 0))
}
# Add the row-wise total as q; the trailing [] prints the updated table.
df[, q := rowSums(.SD)][]
#> q.82 q.77 q.72 q
#> 1: 0 77 0 77
#> 2: 82 0 0 82
#> 3: 82 0 0 82
#> 4: 0 0 72 72
#> 5: 0 0 72 72
Data
# Sample data: three two-level factors whose first level is the empty string
# and whose second level is the label of the corresponding price band.
df <- data.frame(
  q.82 = factor(c(1, 2, 2, 1, 1), labels = c("", "$80 and above")),
  q.77 = factor(c(2, 1, 1, 1, 1), labels = c("", "$75 to $79")),
  q.72 = factor(c(1, 1, 1, 2, 2), labels = c("", "$70 to $74"))
)
Here is another base R method, where we replace non-blank value in the column with the numeric part in the column name using sub.
# Number embedded in each column name, e.g. "q.82" -> 82.
col_codes <- as.integer(sub(".*?(\\d+)", "\\1", names(df)))
# df != "" is a logical matrix (TRUE where a value is present); sweeping the
# codes across its columns turns TRUE into the column's number and FALSE into
# 0. Assigning into df[] keeps df a data frame with the same column names.
df[] <- sweep(df != "", 2, col_codes, "*")
df
# q.82 q.77 q.72
#1 0 77 0
#2 82 0 0
#3 82 0 0
#4 0 0 72
#5 0 0 72
and then if you want to row-wise sum the values you can use rowSums
df$q <- rowSums(df)