Collapse factor levels into missing levels - r

I would like to have "Dont know" and "Refuse" as missing, but assigning it to NULL does not seem to do the trick.
library(tidyverse)
mydata <- tibble(
a = factor(c("Yes", "No", "Dont know", "Yes", "Refuse", "No",
"Dont know", "Yes", "No", "Dont know", "Refuse"))
)
mydata %>%
mutate(a = fct_collapse(a, NULL = c("Dont know", "Refuse"))) %>%
count(a)
# # A tibble: 3 × 2
# a n
# <fct> <int>
# 1 NULL 5
# 2 No 3
# 3 Yes 3

Since forcats v1.0.0, you should use fct_na_level_to_value:
mydata %>%
mutate(a = fct_na_level_to_value(a, extra_levels = c("Dont know", "Refuse"))) %>%
count(a)
# # A tibble: 3 × 2
# a n
# <fct> <int>
# 1 No 3
# 2 Yes 3
# 3 NA 5

Related

Apply multiple conditions to single column

Code
count
AA
BB
CC
101
1
No
NO
4
101
2
Yes
NO
5
101
3
Yes
NO
10
102
1
Yes
NO
7
102
2
Yes
NO
40
102
3
Yes
NO
6
102
4
No
NO
12
I want to apply the condition as,
If the count column is 1 with respect to code column then AA should be "NO" and BB should be "NO".
For count between the max and min count with respect to code column then AA can be "NO" or "YES" and BB should be "NO".
For the max count column with respect to code column then AA should be "NO" and BB should be "NO".
Code
count
AA
BB
CC
101
1
No
NO
4
101
2
Yes
NO
5
102
2
Yes
NO
40
102
3
Yes
NO
6
102
4
No
NO
12
Hi @Darren Tsai, whatever the case may be, if the count column is 1 then the row is deleted completely. Using your code I am getting the output below:
Code
count
AA
BB
CC
101
2
Yes
NO
5
102
2
Yes
NO
40
102
3
Yes
NO
6
102
4
No
NO
12
A dplyr solution:
library(dplyr)
# Keep rows whose BB is "NO" (case-insensitive). Rows sitting at the smallest
# or largest count of their Code must additionally have AA equal to "NO".
df %>%
  group_by(Code) %>%
  mutate(at_extreme = count %in% range(count)) %>%
  filter(toupper(BB) == 'NO', !at_extreme | toupper(AA) == 'NO') %>%
  ungroup() %>%
  select(-at_extreme)
# # A tibble: 5 × 5
# Code count AA BB CC
# <int> <int> <chr> <chr> <int>
# 1 101 1 No NO 4
# 2 101 2 Yes NO 5
# 3 102 2 Yes NO 40
# 4 102 3 Yes NO 6
# 5 102 4 No NO 12
A base equivalent:
# Base R version of the filter: BB must be "NO" everywhere, and AA must also be
# "NO" on rows at the smallest or largest count of their Code.
# Note: ave() writes the per-Code result back into a copy of `count`, so `flag`
# is stored as 0/1 rather than TRUE/FALSE; it still works as a condition below.
df |>
  transform(flag = ave(count, Code, FUN = \(x) x %in% range(x))) |>
  subset(toupper(BB) == 'NO' & (!flag | toupper(AA) == 'NO'), select = -flag)
Data
df <- structure(list(Code = c(101L, 101L, 101L, 102L, 102L, 102L, 102L),
count = c(1L, 2L, 3L, 1L, 2L, 3L, 4L), AA = c("No", "Yes",
"Yes", "Yes", "Yes", "Yes", "No"), BB = c("NO", "NO", "NO", "NO",
"NO", "NO", "NO"), CC = c(4L, 5L, 10L, 7L, 40L, 6L, 12L)), class = "data.frame", row.names = c(NA,-7L))
Update with another dataset
This dataset has 12 rows with 3 IDs: 8540, 2254, and 607. After running my code, the 2nd, 4th, and 12th rows are removed.
library(dplyr)
df2 <- structure(list(Unique_Id = c(8540, 8540, 2254, 2254, 607, 607, 607, 607, 607, 607, 607, 607),
AA = c("No", "Yes", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No"),
count = c(1, 2, 1, 2, 1, 2, 3, 4, 5, 6, 7, 8),
BB = c("No", "Yes", "No", "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes")),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -12L))
df2
# A tibble: 12 × 4
Unique_Id AA count BB
<dbl> <chr> <dbl> <chr>
1 8540 No 1 No
2 8540 Yes 2 Yes
3 2254 No 1 No
4 2254 No 2 Yes
5 607 No 1 No
6 607 No 2 No
7 607 No 3 No
8 607 No 4 No
9 607 No 5 No
10 607 No 6 No
11 607 No 7 No
12 607 No 8 Yes
df2 %>%
group_by(Unique_Id) %>%
mutate(flag = count %in% range(count)) %>%
filter(flag & if_all(c(AA, BB), ~ toupper(.x) == 'NO') | !flag & toupper(BB) == 'NO') %>%
ungroup() %>%
select(-flag)
# A tibble: 9 × 4
Unique_Id AA count BB
<dbl> <chr> <dbl> <chr>
1 8540 No 1 No
2 2254 No 1 No
3 607 No 1 No
4 607 No 2 No
5 607 No 3 No
6 607 No 4 No
7 607 No 5 No
8 607 No 6 No
9 607 No 7 No

Counting, conditionals and constellation variable for hundreds of variables in a data frame in R

I am working with a dataset where I need to evaluate hundreds of columns at a time to create new variables with computations by row. I have three new variables. The first one needs the "or" operator to decide if there is any "yes" across the ~100 columns. The second one needs to count across the variables how many "yes" values I have in total, and the third one needs to be a constellation variable that shows me the names of the variables with the "yes" value, all of this by row. I have the code for the first two, but I am stuck on the third one. Also, I am using only a few variables for example purposes, but I have ~100 variables that I need to use. My code is below:
#making the data - I am using actually ~100 variables
test.data <- data.frame(var1 = c("yes", "no", "no", "N/A", NA, NA),
var2 = c(NA, NA, "yes", "no", "yes", NA),
var3 = c("yes", "yes", "yes", "no", "yes", "N/A"),
var4 = c("N/A", "yes", "no", "no", "yes", NA))
# code for the first two variables: is.positive and number.pos - neither elegant nor efficient since I need to work with ~100 vars
final.data <- data.frame(test.data %>%
mutate(is.positive = ifelse(var1=="yes" | var2=="yes" | var3=="yes" | var4=="yes", 1,
ifelse((is.na(var1) | var1=="N/A") &
(is.na(var2) | var2=="N/A") &
(is.na(var3) | var3=="N/A") &
(is.na(var4) | var4=="N/A"), NA, 0))) %>%
rowwise() %>%
mutate(number.pos = sum(c_across(c(var1, var2, var3, var4))=="yes",na.rm=TRUE)))
You could do it by making a list column for which ones are positive and then deriving the other values from that.
library(tidyverse)
test.data <- data.frame(var1 = c("yes", "no", "no", "N/A", NA, NA),
var2 = c(NA, NA, "yes", "no", "yes", NA),
var3 = c("yes", "yes", "yes", "no", "yes", "N/A"),
var4 = c("N/A", "yes", "no", "no", "yes", NA))
nv <- test.data %>%
select(var1:var4) %>%
names()
out <- test.data %>%
rowwise() %>%
mutate(which_pos = list(nv[which(c_across(var1:var4) == "yes")]),
num.positive = length(which_pos),
is.positive = num.positive > 0)
out
#> # A tibble: 6 × 7
#> # Rowwise:
#> var1 var2 var3 var4 which_pos num.positive is.positive
#> <chr> <chr> <chr> <chr> <list> <int> <lgl>
#> 1 yes <NA> yes N/A <chr [2]> 2 TRUE
#> 2 no <NA> yes yes <chr [2]> 2 TRUE
#> 3 no yes yes no <chr [2]> 2 TRUE
#> 4 N/A no no no <chr [0]> 0 FALSE
#> 5 <NA> yes yes yes <chr [3]> 3 TRUE
#> 6 <NA> <NA> N/A <NA> <chr [0]> 0 FALSE
out$which_pos
#> [[1]]
#> [1] "var1" "var3"
#>
#> [[2]]
#> [1] "var3" "var4"
#>
#> [[3]]
#> [1] "var2" "var3"
#>
#> [[4]]
#> character(0)
#>
#> [[5]]
#> [1] "var2" "var3" "var4"
#>
#> [[6]]
#> character(0)
Created on 2022-05-26 by the reprex package (v2.0.1)
If you wanted a normal column for the variable identifying which ones are positive, you could simply paste the names together to create a string that has comma-separated names:
library(tidyverse)
test.data <- data.frame(var1 = c("yes", "no", "no", "N/A", NA, NA),
var2 = c(NA, NA, "yes", "no", "yes", NA),
var3 = c("yes", "yes", "yes", "no", "yes", "N/A"),
var4 = c("N/A", "yes", "no", "no", "yes", NA))
nv <- test.data %>%
select(var1:var4) %>%
names()
out <- test.data %>%
rowwise() %>%
mutate(which_pos = paste(nv[which(c_across(var1:var4) == "yes")], collapse=","),
num.positive = sum(c_across(var1:var4) == "yes", na.rm=TRUE),
is.positive = num.positive > 0)
out
#> # A tibble: 6 × 7
#> # Rowwise:
#> var1 var2 var3 var4 which_pos num.positive is.positive
#> <chr> <chr> <chr> <chr> <chr> <int> <lgl>
#> 1 yes <NA> yes N/A "var1,var3" 2 TRUE
#> 2 no <NA> yes yes "var3,var4" 2 TRUE
#> 3 no yes yes no "var2,var3" 2 TRUE
#> 4 N/A no no no "" 0 FALSE
#> 5 <NA> yes yes yes "var2,var3,var4" 3 TRUE
#> 6 <NA> <NA> N/A <NA> "" 0 FALSE
Created on 2022-05-26 by the reprex package (v2.0.1)
The list column might be easier to use in subsequent analyses if needed, but the comma-separated variable may be easier to use for visual inspection.
Using Base R:
# Treat the literal string 'N/A' as a missing value.
is.na(test.data) <- test.data == 'N/A'
# Logical matrix: TRUE where a cell is 'yes' (NA where the cell is missing).
# Computed before the new columns are added, so it only covers the var* columns.
idx <- test.data == 'yes'
# Number of 'yes' cells per row; missing cells are ignored.
test.data['num.positive'] <- rowSums(idx, na.rm = TRUE)
# 1 if the row has at least one 'yes', otherwise 0 (unary + turns the logical into an integer).
test.data['is.positive'] <- +(test.data[['num.positive']] > 0)
# (row, col) positions of every 'yes' cell; the second argument is arr.ind = TRUE.
idx2 <- data.frame(which(idx, TRUE))
# One string per row that has a 'yes': the matching column names joined by '-'.
df1 <- aggregate(col~row, idx2, \(x)paste(names(test.data)[x], collapse = '-'))
# Left join on `row` so rows without any 'yes' are kept, with NA in `col`.
df2 <- merge(cbind(test.data, row = seq(nrow(test.data))), df1, all.x =TRUE)
df2
row var1 var2 var3 var4 num.positive is.positive col
1 1 yes <NA> yes <NA> 2 1 var1-var3
2 2 no <NA> yes yes 2 1 var3-var4
3 3 no yes yes no 2 1 var2-var3
4 4 <NA> no no no 0 0 <NA>
5 5 <NA> yes yes yes 3 1 var2-var3-var4
6 6 <NA> <NA> <NA> <NA> 0 0 <NA>

How to combine count() and group_by() to count responses with a certain value, grouped by respondent?

I have a set of data where the response to a series of repeated questions is the outcome of interest. Because of this, I'd like to count the number of "I don't know" responses, grouping those counts by respondent ID, and append it as a new column. So basically, I have data that look like this:
ID
response
1
Yes
1
I don't know
2
No
2
I don't know
And I want them to look like this:
ID
response
idkcount
1
Yes
1
1
I don't know
1
2
No
1
2
I don't know
1
This is the code I've most recently written:
df$idkcount <- group_by(as_tibble(df$ID)) %>% count(df$response == "I don't know")
But I seem to get an error message no matter what I try with these two commands. What am I missing?
Using group_by and mutate you could do:
Note: I slightly altered your example data to a more general case.
df <- data.frame(
ID = c(1L, 1L, 1L, 1L, 2L, 2L),
response = c("Yes", "I don't know", "I don't know", "I don't know", "No", "I don't know")
)
library(dplyr)
df %>%
group_by(ID) %>%
mutate(idkcount = sum(response == "I don't know", na.rm = TRUE)) %>%
ungroup()
#> # A tibble: 6 × 3
#> ID response idkcount
#> <int> <chr> <int>
#> 1 1 Yes 3
#> 2 1 I don't know 3
#> 3 1 I don't know 3
#> 4 1 I don't know 3
#> 5 2 No 1
#> 6 2 I don't know 1
# Example data: one row per answer, possibly several answers per respondent.
my_df <- data.frame(
  id = c(1, 1, 2, 2, 3),
  response = c("I don't know", "I don't know", "no", "I don't know", "maybe"),
  stringsAsFactors = FALSE
)

# Per respondent, count the "I don't know" answers and repeat that count on
# every row of the respondent. na.rm = TRUE makes a missing response count as
# zero instead of turning the whole group's total into NA.
# ungroup() returns a plain tibble so later verbs are not silently run per id.
my_df <- my_df %>%
  group_by(id) %>%
  mutate(count = sum(response == "I don't know", na.rm = TRUE)) %>%
  ungroup()
A possible solution (I am using @stefan's dataset):
library(tidyverse)
df <- data.frame(
ID = c(1L, 1L, 1L, 1L, 2L, 2L),
response = c("Yes", "I don't know", "I don't know", "I don't know", "No", "I don't know")
)
df %>%
count(ID, response, name = "idkcount")
#> ID response idkcount
#> 1 1 I don't know 3
#> 2 1 Yes 1
#> 3 2 I don't know 1
#> 4 2 No 1

group_by and keep all groups that do not contain a specific value and filter where there is a value

I have the following dataframe:
df <- data.frame(
Code = c("a", "a", "a", "a", "a", "b", "b", "b", "b", "b"),
Inst = c("Yes", "No", "No", "No", "No", "No", "No", "No", "No", "No"),
Date = c(
"2021-01-01", "2021-01-02", "2021-01-03", "2021-01-04", "2021-01-05",
"2021-01-06", "2021-01-06", "2021-01-06", "2021-01-09", "2021-01-10"
)
)
I want to apply dplyr::group_by to the variable Code and filter for specific value "Yes" and for minimum Date, but I want to keep all observations of groups that do not contain the Yes value. I tried filter(any(Inst == "Yes")) but this does not work.
I would like to have this result:
Code Inst Date
a Yes 2021-01-01
b No 2021-01-06
b No 2021-01-06
b No 2021-01-06
If there could be multiple Yes values:
# Per Code: keep only the "Yes" rows when the group has any, otherwise keep
# every row of the group.
# `"Yes" %in% Inst` is always TRUE or FALSE, whereas `all(Inst != "Yes")`
# evaluates to NA (and makes `if` fail) when a group without "Yes" contains NA.
df %>%
  group_by(Code) %>%
  slice(if ("Yes" %in% Inst) which(Inst == "Yes") else seq_len(n())) %>%
  ungroup()
Code Inst
<chr> <chr>
1 a Yes
2 b No
3 b No
4 b No
5 b No
6 b No
Considering the updated question:
# Per Code: keep only the "Yes" rows when the group has any, otherwise keep
# every row; then keep the row(s) with the earliest Date within each Code.
# `"Yes" %in% Inst` is always TRUE or FALSE, whereas `all(Inst != "Yes")`
# evaluates to NA (and makes `if` fail) when a group without "Yes" contains NA.
df %>%
  mutate(Date = as.Date(Date, format = "%Y-%m-%d")) %>%
  group_by(Code) %>%
  slice(if ("Yes" %in% Inst) which(Inst == "Yes") else seq_len(n())) %>%
  filter(Date == min(Date)) %>%
  ungroup()
Code Inst Date
<chr> <chr> <date>
1 a Yes 2021-01-01
2 b No 2021-01-06
3 b No 2021-01-06
4 b No 2021-01-06
With dplyr :
library(dplyr)
df %>%
group_by(Code) %>%
summarize(
across(everything(), function(x) {
if (any(Inst == "Yes")) x[which.max(Inst == "Yes")] else x
})
) %>%
ungroup()
#> `summarise()` has grouped output by 'Code'. You can override using the `.groups` argument.
#> # A tibble: 6 x 3
#> Code Inst Date
#> <chr> <chr> <chr>
#> 1 a Yes 2021-01-01
#> 2 b No 2021-01-06
#> 3 b No 2021-01-06
#> 4 b No 2021-01-06
#> 5 b No 2021-01-09
#> 6 b No 2021-01-10
A dplyr option
df %>%
group_by(Code) %>%
filter(ifelse(all(Inst == "No"), c, `!`)(Inst == "No")) %>%
filter(Date == min(Date)) %>%
ungroup()
gives
# A tibble: 4 x 3
Code Inst Date
<chr> <chr> <chr>
1 a Yes 2021-01-01
2 b No 2021-01-06
3 b No 2021-01-06
4 b No 2021-01-06

How to use dplyr to conditionally change values in a column by group?

I have data like this:
g1 g2 var
1 a Yes
1 a No
1 a No
1 b Yes
1 b Yes
1 b Yes
2 a No
2 a No
2 a No
I would like to change all values in var to Yes if in each g1&g2 group, there is at least one Yes in var. I tried to use combinations of group_by and mutate, replace, ifelse with no success. Any help is appreciated.
We can use if/else instead of ifelse. Grouped by 'g1', 'g2', if 'Yes' is %in% 'var', then return "Yes" or else return 'var'
library(dplyr)
df1 %>%
group_by(g1, g2) %>%
mutate(var = if("Yes" %in% var) "Yes" else var)
# A tibble: 9 x 3
# Groups: g1, g2 [3]
# g1 g2 var
# <int> <chr> <chr>
#1 1 a Yes
#2 1 a Yes
#3 1 a Yes
#4 1 b Yes
#5 1 b Yes
#6 1 b Yes
#7 2 a No
#8 2 a No
#9 2 a No
Or with case_when
df1 %>%
group_by(g1, g2) %>%
mutate(var = case_when("Yes" %in% var ~ "Yes", TRUE ~ var))
data
df1 <- structure(list(g1 = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), g2 = c("a",
"a", "a", "b", "b", "b", "a", "a", "a"), var = c("Yes", "No",
"No", "Yes", "Yes", "Yes", "No", "No", "No")), class = "data.frame",
row.names = c(NA, -9L))
You can also do:
df %>%
group_by(g1, g2) %>%
mutate(var = ifelse(any(var == "Yes"), "Yes", "No"))
g1 g2 var
<int> <chr> <chr>
1 1 a Yes
2 1 a Yes
3 1 a Yes
4 1 b Yes
5 1 b Yes
6 1 b Yes
7 2 a No
8 2 a No
9 2 a No
Here, if any value (per "g1" and "g2") in "var" is equal to Yes, it returns Yes, otherwise No.
This takes one more line of code than the two solutions above, but it uses ifelse or if_else by creating a new column, then deleting the old one and renaming:
library(tidyverse)
df %>%
group_by(g1, g2) %>%
mutate(var2 = if_else("Yes" %in% var, "Yes", "No")) %>%
select(-var, var = var2)
result:
g1 g2 var
<dbl> <chr> <chr>
1 1 a Yes
2 1 a Yes
3 1 a Yes
4 1 b Yes
5 1 b Yes
6 1 b Yes
7 2 a No
8 2 a No
9 2 a No
A fun alternative that uses neither case_when nor if_else:
df1 %>%
group_by(g1,g2) %>%
arrange (g1,g2,var) %>%
mutate(var=last(var))
# arranged alphabetically, var values may be changed to the last value by groups -- Yes in this case
g1 g2 var
<int> <chr> <chr>
1 1 a Yes
2 1 a Yes
3 1 a Yes
4 1 b Yes
5 1 b Yes
6 1 b Yes
7 2 a No
8 2 a No
9 2 a No

Resources