How to convert columns to multiple boolean columns with tidyverse - r

I have a group of columns for each time and I want to convert it to a lot of boolean columns (one by category) with mutate() and across() like that :
data <- data.frame(category_t1 = c("A","B","C","C","A","B"),
category_t2 = c("A","C","B","B","B",NA),
category_t3 = c("C","C",NA,"B",NA,"A"))
data %>% mutate(across(starts_with("category"),
~case_when(.x == "A" ~ TRUE, !is.na(.x) ~ FALSE),
.names = "{str_replace(.col, 'category', 'A')}"),
across(starts_with("category"),
~case_when(.x == "B" ~ TRUE, !is.na(.x) ~ FALSE),
.names = "{str_replace(.col, 'category', 'B')}"),
across(starts_with("category"),
~case_when(.x == "C" ~ TRUE, !is.na(.x) ~ FALSE),
.names = "{str_replace(.col, 'category', 'C')}"))
Which makes :
category_t1 category_t2 category_t3 A_t1 A_t2 A_t3 B_t1 B_t2 B_t3 C_t1 C_t2
1 A A C TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
2 B C C FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
3 C B <NA> FALSE FALSE NA FALSE TRUE NA TRUE FALSE
4 C B B FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
5 A B <NA> TRUE FALSE NA FALSE TRUE NA FALSE FALSE
6 B <NA> A FALSE NA TRUE TRUE NA FALSE FALSE NA
It works but I would like to know if there is a better idea because here I am doing the same code 3 times instead of one big code (and imagine if I had 10 times to repeat it...). I though I could do it with map() but I didn't manage to make it work.
I think there is a problem because of .names argument in across() that cannot connect with the string I use in case_when().
I think maybe there is something to do in the ... argument, like :
data %>% mutate(across(starts_with("category"),
~case_when(.x == mod ~ TRUE, !is.na(.x) ~ FALSE),
mod = levels(as.factor(data$category_t1)),
.names = "{str_replace(.col, 'category', mod)}"))
But of course that doesn't work here. Do you know how to do that ?
Thanks a lot.

We may use table in across
library(dplyr)
library(stringr)
library(tidyr)
data %>%
mutate(across(everything(), ~ as.data.frame.matrix(table(row_number(), .x) *
NA^(is.na(.x)) > 0),
.names = "{str_remove(.col, 'category_')}")) %>%
unpack(where(is.data.frame), names_sep = ".")
-output
# A tibble: 6 × 12
category_t1 category_t2 category_t3 t1.A t1.B t1.C t2.A t2.B t2.C t3.A t3.B t3.C
<chr> <chr> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
1 A A C TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
2 B C C FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
3 C B <NA> FALSE FALSE TRUE FALSE TRUE FALSE NA NA NA
4 C B B FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
5 A B <NA> TRUE FALSE FALSE FALSE TRUE FALSE NA NA NA
6 B <NA> A FALSE TRUE FALSE NA NA NA TRUE FALSE FALSE
Or use model.matrix from base R
data1 <- replace(data, is.na(data), "NA")
lvls <- lapply(data1, \(x) levels(factor(x, levels = c("NA", "A", "B", "C"))))
m1 <- model.matrix(~ 0 + ., data = data1, xlev = lvls)
out <- cbind(data, m1[, -grep("NA", colnames(m1))] > 0)
-output
out
category_t1 category_t2 category_t3 category_t1A category_t1B category_t1C category_t2A category_t2B category_t2C category_t3A category_t3B category_t3C
1 A A C TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
2 B C C FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
3 C B <NA> FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
4 C B B FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
5 A B <NA> TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
6 B <NA> A FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
> colnames(out)
[1] "category_t1" "category_t2" "category_t3"
[4] "category_t1A" "category_t1B" "category_t1C"
[7] "category_t2A" "category_t2B" "category_t2C"
[10] "category_t3A"
[11] "category_t3B" "category_t3C"
Or another option with table
cbind(data, do.call(cbind.data.frame,
lapply(data, \(x) (table(seq_along(x), x)* NA^is.na(x)) > 0)))
-output
category_t1 category_t2 category_t3 category_t1.A category_t1.B category_t1.C category_t2.A category_t2.B category_t2.C category_t3.A category_t3.B
1 A A C TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
2 B C C FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
3 C B <NA> FALSE FALSE TRUE FALSE TRUE FALSE NA NA
4 C B B FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE
5 A B <NA> TRUE FALSE FALSE FALSE TRUE FALSE NA NA
6 B <NA> A FALSE TRUE FALSE NA NA NA TRUE FALSE
category_t3.C
1 TRUE
2 TRUE
3 NA
4 FALSE
5 NA
6 FALSE

Not a tidyverse option (although pipe-compatible), it is very easily doable with package fastDummies:
fastDummies::dummy_cols(data, ignore_na = TRUE)
category_t1 category_t2 category_t3 category_t1_A category_t1_B category_t1_C category_t2_A category_t2_B category_t2_C category_t3_A category_t3_B category_t3_C
1 A A C 1 0 0 1 0 0 0 0 1
2 B C C 0 1 0 0 0 1 0 0 1
3 C B <NA> 0 0 1 0 1 0 NA NA NA
4 C B B 0 0 1 0 1 0 0 1 0
5 A B <NA> 1 0 0 0 1 0 NA NA NA
6 B <NA> A 0 1 0 NA NA NA 1 0 0

purrr's map_dfc could match well with your current approach:
library(dplyr)
library(purrr)
bind_cols(data,
map_dfc(LETTERS[1:3], \(letter) { mutate(data,
across(starts_with("category"),
~ case_when(.x == letter ~ TRUE, !is.na(.x) ~ FALSE),
.names = paste0("{str_replace(.col, 'category', '", letter, "')}")),
.keep = "none") }
)
)
Or skip the bind_cols and use .keep = ifelse(letter == "A", "all", "none").
Output:
category_t1 category_t2 category_t3 A_t1 A_t2 A_t3 B_t1 B_t2 B_t3 C_t1 C_t2 C_t3
1 A A C TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
2 B C C FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE
3 C B <NA> FALSE FALSE NA FALSE TRUE NA TRUE FALSE NA
4 C B B FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE
5 A B <NA> TRUE FALSE NA FALSE TRUE NA FALSE FALSE NA
6 B <NA> A FALSE NA TRUE TRUE NA FALSE FALSE NA FALSE

A base solution with nested lapply():
cbind(data, lapply(data, \(x) {
lev <- levels(factor(x))
sapply(setNames(lev, lev), \(y) x == y)
}))
category_t1 category_t2 category_t3 category_t1.A category_t1.B category_t1.C category_t2.A category_t2.B category_t2.C category_t3.A category_t3.B category_t3.C
1 A A C TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
2 B C C FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
3 C B <NA> FALSE FALSE TRUE FALSE TRUE FALSE NA NA NA
4 C B B FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
5 A B <NA> TRUE FALSE FALSE FALSE TRUE FALSE NA NA NA
6 B <NA> A FALSE TRUE FALSE NA NA NA TRUE FALSE FALSE

Related

Turns thousands of dummy variables into multinomial variable

I have a dataframe of the following sort:
a<-c('q','w')
b<-c(T,T)
d<-c(F,F)
.e<-c(T,F)
.f<-c(F,F)
.g<-c(F,T)
h<-c(F,F)
i<-c(F,T)
j<-c(T,T)
df<-data.frame(a,b,d,.e,.f,.g,h,i,j)
a b d .e .f .g h i j
1 q TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
2 w TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE
I want to turn all variables starting with periods at the start into a single multinomial variable called Index such that the second row would have a value 1 for the Index column, the third row would have a value 2, etc. :
df$Index<-c('e','g')
a b d .e .f .g h i j Index
1 q TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE e
2 w TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE g
Although many rows can have a T for any of period-initial variable, each row can be T for only ONE period-initial variable.
If it were just a few items id do an ifelse statement:
df$Index <- ifelse(df$_10000, '10000',...
But there are 12000 of these. The names for all dummy variables begin with underscores, so I feel like there must be a better way. In pseudocode I would say something like:
for every row:
for every column beginning with '_':
if value == T:
assign the name of the column without '_' to a Column 'Index'
Thanks in advance
Sample data:
df <- cbind(a = letters[1:10], b = LETTERS[1:10],
data.frame(diag(10) == 1))
names(df)[-(1:2)] <- paste0("_", 1:10)
set.seed(42)
df <- df[sample(nrow(df)),]
head(df,3)
# a b _1 _2 _3 _4 _5 _6 _7 _8 _9 _10
# 1 a A TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# 5 e E FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
# 10 j J FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
Execution:
df$Index <- apply(subset(df, select = grepl("^_", names(df))), 1,
function(z) which(z)[1])
df
# a b _1 _2 _3 _4 _5 _6 _7 _8 _9 _10 Index
# 1 a A TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 1
# 5 e E FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE 5
# 10 j J FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE 10
# 8 h H FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE 8
# 2 b B FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 2
# 4 d D FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE 4
# 6 f F FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE 6
# 9 i I FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE 9
# 7 g G FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE 7
# 3 c C FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 3
If there are more than one TRUE in a row of _-columns, the first found will be used, the remainder silently ignored. If there are none, Index will be NA for that row.

R, find character string from vector, create new TRUE/FALSE columns

I have a data frame like this:
df<-structure(list(MRN = c("53634", "65708", "72122", "40458", "03935",
"67473", "20281", "52479", "10261", "40945", "40630", "92295",
"43505", "80719", "39492", "44720", "70691", "21351", "03457",
"02182"), Outcome_Diagnosis_1 = c(NA, NA, NA, "Seroma of breast [N64.89]",
"Breast implant capsular contracture [T85.44XA]; Breast implant capsular contracture [T85.44XA]; Breast implant capsular contracture [T85.44XA]",
NA, NA, NA, "Acquired breast deformity [N64.89]", NA, NA, NA,
NA, "Acquired breast deformity [N64.89]", NA, NA, NA, NA, NA,
NA), Outcome_Diagnosis_2 = c(NA, NA, NA, "Extrusion of breast implant, initial encounter [T85.49XA]; Extrusion of breast implant, initial encounter [T85.49XA]; Extrusion of breast implant, initial encounter [T85.49XA]",
NA, NA, NA, NA, NA, NA, NA, NA, NA, "Capsular contracture of breast implant, subsequent encounter [T85.44XD]; Capsular contracture of breast implant, subsequent encounter [T85.44XD]; Capsular contracture of breast implant, subsequent encounter [T85.44XD]",
NA, NA, NA, NA, NA, NA), Outcome_Diagnosis_3 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Acquired breast deformity [N64.89]; Capsular contracture of breast implant, initial encounter [T85.44XA]; Capsular contracture of breast implant, initial encounter [T85.44XA]; Capsular contracture of breast implant, initial encounter [T85.44XA]",
NA, NA, NA, NA, NA, NA)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
And I have a few vectors like this:
Infection<-c("L76","L00", "L01","L02","L03","L04", "L05","L08")
Hematoma<-c("N64.89","M79.81")
Seroma<- c("L76.34")
Necrosis<- c("N64.1","T86.821")
CapsularContracture<- c("T85.44")
MechanicalComplications<- c("T85", "T85.4", "T85.41", "T85.42", "T85.43", "T85.49")
What I'd like to do is create new columns in the data frame that are TRUE/FALSE for if that vector was found in each row. (And it would just be TRUE even if it shows up multiple times in that row, i.e. it doesn't need to "count" them).
So the output I want would be something like this:
The reason I am struggling and came to stack for help is I don't really know how to combine searching for particular strings (that might be within a longer sentence in that column) and looking over multiple columns.
Additional Info that might be important:
There are more columns than just those 3 outcome diagnoses, it'd be useful if the answer looked through the entire row regardless of how many columns
Sometimes those codes aren't specific enough and it'd probably be useful to look for the actual words like "Seroma". I imagine that'd just be a case of swapping out the characters inside the quotes right?
You could store your vectors in a list:
lst <- list(Infection = c("L76","L00", "L01","L02","L03","L04", "L05","L08"),
Hematoma = c("N64.89","M79.81"),
Seroma = c("L76.34"),
Necrosis = c("N64.1","T86.821"),
CapsularContracture = c("T85.44"),
MechanicalComplications = c("T85", "T85.4", "T85.41", "T85.42", "T85.43", "T85.49"))
And then, using dplyr and purrr you could do:
imap(lst,
~ df %>%
mutate(!!.y := reduce(across(Outcome_Diagnosis_1:Outcome_Diagnosis_3, function(y) grepl(paste(sub("\\.", "", .x), collapse = "|"), sub("\\.", "", y))), `|`))) %>%
reduce(full_join)
MRN Outcome_Diagnos… Outcome_Diagnos… Outcome_Diagnos… Infection Hematoma Seroma Necrosis CapsularContrac…
<chr> <chr> <chr> <chr> <lgl> <lgl> <lgl> <lgl> <lgl>
1 53634 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE
2 65708 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE
3 72122 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE
4 40458 Seroma of breas… Extrusion of br… <NA> FALSE TRUE FALSE FALSE FALSE
5 03935 Breast implant … <NA> <NA> FALSE FALSE FALSE FALSE TRUE
6 67473 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE
7 20281 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE
8 52479 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE
9 10261 Acquired breast… <NA> <NA> FALSE TRUE FALSE FALSE FALSE
10 40945 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE
Up front
out <- lapply(manythings, function(thing) {
rowSums(
do.call(cbind, lapply(df[,2:4], function(col) Vectorize(grepl, vectorize.args = "pattern")(thing, col, fixed = TRUE)))
) > 0
})
tibble(cbind(df, out))
# # A tibble: 20 x 10
# MRN Outcome_Diagnosis_1 Outcome_Diagnosis_2 Outcome_Diagnosis_3 Infection Hematoma Seroma Necrosis CapsularContrac~ MechanicalCompl~
# <chr> <chr> <chr> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
# 1 53634 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 2 65708 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 3 72122 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 4 40458 Seroma of breast [N6~ Extrusion of breast ~ <NA> FALSE TRUE FALSE FALSE FALSE TRUE
# 5 03935 Breast implant capsu~ <NA> <NA> FALSE FALSE FALSE FALSE TRUE TRUE
# 6 67473 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 7 20281 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 8 52479 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 9 10261 Acquired breast defo~ <NA> <NA> FALSE TRUE FALSE FALSE FALSE FALSE
# 10 40945 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 11 40630 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 12 92295 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 13 43505 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 14 80719 Acquired breast defo~ Capsular contracture~ Acquired breast defo~ FALSE TRUE FALSE FALSE TRUE TRUE
# 15 39492 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 16 44720 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 17 70691 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 18 21351 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 19 03457 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 20 02182 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
Walk-through
Same as #tmfmnk, I recommend putting all of your patterns into a named list:
manythings <- list(
Infection = c("L76","L00", "L01","L02","L03","L04", "L05","L08"),
Hematoma = c("N64.89","M79.81"),
Seroma = c("L76.34"),
Necrosis = c("N64.1","T86.821"),
CapsularContracture = c("T85.44"),
MechanicalComplications = c("T85", "T85.4", "T85.41", "T85.42", "T85.43", "T85.49"))
Also, we should note that while grepl is good for this, it does not vectorize the pattern= argument, so we need to do that externally. Further, since some of your patterns have regex-sensitive characters (i.e., . which matches anything), we need to guard against regex-injection. For instance, if we aren't careful, then "N64.89" as a pattern will incorrectly match "N64989". For this, I use fixed=TRUE as a safeguard. Unfortunately, this also hampers our ability to shape the patterns such that we can check for all of them in one step. Instead, we'll vectorize it, searching a fixed-regex (single element of one of your vectors of patterns) and aggregate the results.
So let's do one of the pattern-vectors against one column of the frame:
Vectorize(grepl, vectorize.args = "pattern")(manythings[[2]], df[[2]], fixed = TRUE)
# N64.89 M79.81
# [1,] FALSE FALSE
# [2,] FALSE FALSE
# [3,] FALSE FALSE
# [4,] TRUE FALSE
# [5,] FALSE FALSE
# [6,] FALSE FALSE
# [7,] FALSE FALSE
# [8,] FALSE FALSE
# [9,] TRUE FALSE
# [10,] FALSE FALSE
# [11,] FALSE FALSE
# [12,] FALSE FALSE
# [13,] FALSE FALSE
# [14,] TRUE FALSE
# [15,] FALSE FALSE
# [16,] FALSE FALSE
# [17,] FALSE FALSE
# [18,] FALSE FALSE
# [19,] FALSE FALSE
# [20,] FALSE FALSE
Now we can reduce that so that we know of one of the patterns is found within each cell of this one column:
rowSums(
Vectorize(grepl, vectorize.args = "pattern")(manythings[[2]], df[[2]], fixed = TRUE)
) > 0
# [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
Now we can iterate that process over each of the vectors of patterns within manythings:
lapply(manythings, function(thing) {
rowSums(
Vectorize(grepl, vectorize.args = "pattern")(thing, df[[2]], fixed = TRUE)
) > 0
})
# $Infection
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# $Hematoma
# [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
# $Seroma
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# $Necrosis
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# $CapsularContracture
# [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# $MechanicalComplications
# [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
All of this has been to a single column, df[[2]]. In order to apply this across multiple (selectable) columns, I'll employ some tricks with column-binding (in the code at the top). To break that down,
lapply(df[,2:4], ...) subsets the data we want to search so just a few columns. Any way you want to select columns will fit here. This will return a list of matrices, something like:
lapply(df[,2:4], function(col) Vectorize(grepl, vectorize.args = "pattern")(thing, col, fixed = TRUE))
# $Outcome_Diagnosis_1
# N64.89 M79.81
# [1,] FALSE FALSE
# [2,] FALSE FALSE
# [3,] FALSE FALSE
# ...
# $Outcome_Diagnosis_2
# N64.89 M79.81
# [1,] FALSE FALSE
# [2,] FALSE FALSE
# [3,] FALSE FALSE
# ...
# $Outcome_Diagnosis_3
# N64.89 M79.81
# [1,] FALSE FALSE
# [2,] FALSE FALSE
# [3,] FALSE FALSE
# ...
do.call(cbind, ...) will take each of those embedded matrices and combine them into a single matrix:
do.call(cbind, lapply(df[,2:4], function(col) Vectorize(grepl, vectorize.args = "pattern")(thing, col, fixed = TRUE)))
# N64.89 M79.81 N64.89 M79.81 N64.89 M79.81
# [1,] FALSE FALSE FALSE FALSE FALSE FALSE
# [2,] FALSE FALSE FALSE FALSE FALSE FALSE
# [3,] FALSE FALSE FALSE FALSE FALSE FALSE
# [4,] TRUE FALSE FALSE FALSE FALSE FALSE
# ...
which allows us to use rowSums(.) > 0 to determine if any patterns (in each column) is met for each row.
Here is another base R solution you could use albeit similar to some extent. As pointed out cleverly by dear #r2evans I also changed my pattern matching to fixed = TRUE which I was not aware of in the first place:
cbind(df, as.data.frame(do.call(cbind, lst |>
lapply(function(a) {
sapply(a, function(b) {
apply(df[-1], 1, function(c) as.logical(Reduce(`+`, grepl(b, c, fixed = TRUE))))
}) |> rowSums() |> as.logical()
}))))
Infection Hematoma Seroma Necrosis CapsularContracture MechanicalComplications
1 FALSE FALSE FALSE FALSE FALSE FALSE
2 FALSE FALSE FALSE FALSE FALSE FALSE
3 FALSE FALSE FALSE FALSE FALSE FALSE
4 FALSE TRUE FALSE FALSE FALSE TRUE
5 FALSE FALSE FALSE FALSE TRUE TRUE
6 FALSE FALSE FALSE FALSE FALSE FALSE
7 FALSE FALSE FALSE FALSE FALSE FALSE
8 FALSE FALSE FALSE FALSE FALSE FALSE
9 FALSE TRUE FALSE FALSE FALSE FALSE
10 FALSE FALSE FALSE FALSE FALSE FALSE
11 FALSE FALSE FALSE FALSE FALSE FALSE
12 FALSE FALSE FALSE FALSE FALSE FALSE
13 FALSE FALSE FALSE FALSE FALSE FALSE
14 FALSE TRUE FALSE FALSE TRUE TRUE
15 FALSE FALSE FALSE FALSE FALSE FALSE
16 FALSE FALSE FALSE FALSE FALSE FALSE
17 FALSE FALSE FALSE FALSE FALSE FALSE
18 FALSE FALSE FALSE FALSE FALSE FALSE
19 FALSE FALSE FALSE FALSE FALSE FALSE
20 FALSE FALSE FALSE FALSE FALSE FALSE
In order to accommodate the result I only put the output of newly created columns here, but the code binds them to the original data set.
lst <- list(Infection = c("L76", "L00", "L01", "L02", "L03", "L04",
"L05", "L08"), Hematoma = c("N64.89", "M79.81"), Seroma = "L76.34",
Necrosis = c("N64.1", "T86.821"), CapsularContracture = "T85.44",
MechanicalComplications = c("T85", "T85.4", "T85.41", "T85.42",
"T85.43", "T85.49"))

Comparing 2 data frame with different size

I would like to check if the values in my dataframe df is larger than the threshold in df2. I tried making df2 to have the size with df to test on the threshold, but is an alternate way to do this?
> df
A B C
5 12 -5
4 4 0
15 5 9
1 11 1
11 1 -3
> df2
A B C
5 6 3
I tried replicating df2 into and then checking if df > df2
> df2
A B C
5 6 3
5 6 3
5 6 3
5 6 3
5 6 3
dput
> dput(df)
structure(list(A = c(5, 4, 15, 1, 11), B = c(12, 4, 5, 11, 1),
C = c(-5, 0, 9, 1, -3)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
> dput(df2)
structure(list(A = 5, B = 6, C = 3), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
You can try using sweep :
sweep(df, 2, unlist(df2), `>`)
# A B C
#[1,] FALSE TRUE FALSE
#[2,] FALSE FALSE FALSE
#[3,] TRUE FALSE TRUE
#[4,] FALSE TRUE FALSE
#[5,] TRUE FALSE FALSE
Using tidyverse
library(dplyr)
df %>%
mutate(across(everything(), ~ . > df2[[cur_column()]]))
# A tibble: 5 x 3
# A B C
# <lgl> <lgl> <lgl>
#1 FALSE TRUE FALSE
#2 FALSE FALSE FALSE
#3 TRUE FALSE TRUE
#4 FALSE TRUE FALSE
#5 TRUE FALSE FALSE
Or using map2
library(purrr)
map2_df(df, df2, `>`)
# A tibble: 5 x 3
# A B C
# <lgl> <lgl> <lgl>
#1 FALSE TRUE FALSE
#2 FALSE FALSE FALSE
#3 TRUE FALSE TRUE
#4 FALSE TRUE FALSE
#5 TRUE FALSE FALSE
You can use the follwing code
setNames(data.frame(do.call("cbind",lapply(names(df), function(nam) {
df[[nam]] > df2[[nam]]
}))),names(df))
# A B C
#1 FALSE TRUE FALSE
#2 FALSE FALSE FALSE
#3 TRUE FALSE TRUE
#4 FALSE TRUE FALSE
#5 TRUE FALSE FALSE
If its enough to get the result as named matrix (coercing to data.frame is quite time consuming if not really needed), you can just do:
comparedMatrix <- do.call("cbind",lapply(names(df), function(nam) {
df[[nam]] > df2[[nam]]
}))
colnames(comparedMatrix) <- names(df)
comparedMatrix
A base R option using t + unlist
> t(t(df)> unlist(df2))
A B C
[1,] FALSE TRUE FALSE
[2,] FALSE FALSE FALSE
[3,] TRUE FALSE TRUE
[4,] FALSE TRUE FALSE
[5,] TRUE FALSE FALSE

Function for mutating, tagging, or identifying records surrounding a condition by a given window

Given a data.frame with some type of a flag or identifier column, I would like to be able to flag the surrounding (leading and lagging) records by some time window parameter, n. So given:
df <- data.frame(
id = letters[1:26],
flag = FALSE
)
df$flag[10] <- TRUE
df$flag[17] <- TRUE
I would like to write something like:
flag_surrounding <- function(flag, n) {
# should flag surrounding -n to +n records with condition flag
}
# expected results for n = 2, n = 1...
df
# id flag flag_n2 flag_n1
# 1 a FALSE FALSE FALSE
# 2 b FALSE FALSE FALSE
# 3 c FALSE FALSE FALSE
# 4 d FALSE FALSE FALSE
# 5 e FALSE FALSE FALSE
# 6 f FALSE FALSE FALSE
# 7 g FALSE FALSE FALSE
# 8 h FALSE TRUE FALSE
# 9 i FALSE TRUE TRUE
# 10 j TRUE TRUE TRUE
# 11 k FALSE TRUE TRUE
# 12 l FALSE TRUE FALSE
# 13 m FALSE FALSE FALSE
# 14 n FALSE FALSE FALSE
# 15 o FALSE TRUE FALSE
# 16 p FALSE TRUE TRUE
# 17 q TRUE TRUE TRUE
# 18 r FALSE TRUE TRUE
# 19 s FALSE TRUE FALSE
# 20 t FALSE FALSE FALSE
# 21 u FALSE FALSE FALSE
# 22 v FALSE FALSE FALSE
# 23 w FALSE FALSE FALSE
# 24 x FALSE FALSE FALSE
# 25 y FALSE FALSE FALSE
# 26 z FALSE FALSE FALSE
I started writing some things using dplyr::lead and dplyr::lag and variants with cumsum, but I felt like this is already in a package somewhere, but couldn't find it quickly (and not really sure how to phrase this as a question for googling) - maybe someone has better recall than me :)
The following does the trick (using ideas from this post), but feels a bit clunky and error prone. I'd be curious to get other approaches/techniques and/or something more robust from a package.
library(dplyr)
flag_surrounding <- function(flag, n) {
as.logical(cumsum(lead(flag, n, default = FALSE)) - cumsum(lag(flag, n + 1, default = FALSE)))
}
df %>%
mutate(flag_n2 = flag_surrounding(flag, 2),
flag_n1 = flag_surrounding(flag, 1))
Here's a simple solution in base:
set.seed(4)
df <- data.frame(
id = letters[1:26],
flag = as.logical(rbinom(n = 26, size = 1, prob = 0.1))
)
lead_lag_flag = function(x, n) {
flagged = which(x)
to_flag = sapply(flagged, function(z) (z - n):(z + n))
to_flag = pmax(0, to_flag)
to_flag = pmin(length(x), to_flag)
to_flag = unique(to_flag)
new_flag = rep(FALSE, length(x))
new_flag[to_flag] = TRUE
return(new_flag)
}
df$flag_n1 = lead_lag_flag(df$flag, 1)
df$flag_n2 = lead_lag_flag(df$flag, 2)
df
# id flag flag_n1 flag_n2
# 1 a FALSE FALSE FALSE
# 2 b FALSE FALSE FALSE
# 3 c FALSE FALSE FALSE
# 4 d FALSE FALSE FALSE
# 5 e FALSE FALSE FALSE
# 6 f FALSE FALSE TRUE
# 7 g FALSE TRUE TRUE
# 8 h TRUE TRUE TRUE
# 9 i TRUE TRUE TRUE
# 10 j FALSE TRUE TRUE
# 11 k FALSE FALSE TRUE
# 12 l FALSE FALSE TRUE
# 13 m FALSE TRUE TRUE
# 14 n TRUE TRUE TRUE
# 15 o FALSE TRUE TRUE
# 16 p FALSE TRUE TRUE
# 17 q TRUE TRUE TRUE
# 18 r FALSE TRUE TRUE
# 19 s TRUE TRUE TRUE
# 20 t FALSE TRUE TRUE
# 21 u FALSE TRUE TRUE
# 22 v TRUE TRUE TRUE
# 23 w FALSE TRUE TRUE
# 24 x FALSE FALSE TRUE
# 25 y FALSE FALSE FALSE
# 26 z FALSE FALSE FALSE
Another base alternative:
n <- 1
nm <- paste0("flag", n)
i <- -n:n
df[ , nm] <- FALSE
ix <- rep(which(df$flag), each = length(i)) + i
ix <- ix[ix > 0 & ix <= nrow(d)]
df[ix, nm] <- TRUE
df
# id flag flag1
# 1 a FALSE FALSE
# 2 b FALSE FALSE
# 3 c FALSE FALSE
# 4 d FALSE FALSE
# 5 e FALSE FALSE
# 6 f FALSE FALSE
# 7 g FALSE FALSE
# 8 h FALSE FALSE
# 9 i FALSE TRUE
# 10 j TRUE TRUE
# 11 k FALSE TRUE
# 12 l FALSE FALSE
# 13 m FALSE FALSE
# 14 n FALSE FALSE
# 15 o FALSE FALSE
# 16 p FALSE TRUE
# 17 q TRUE TRUE
# 18 r FALSE TRUE
# 19 s FALSE FALSE
# 20 t FALSE FALSE
# 21 u FALSE FALSE
# 22 v FALSE FALSE
# 23 w FALSE FALSE
# 24 x FALSE FALSE
# 25 y FALSE FALSE
# 26 z FALSE FALSE

how to loop through columns in R

I have a very large data set including 250 string and numeric variables. I want to compare one after another columns together. For example, I am going to compare (difference) the first variable with second one, third one with fourth one, fifth one with sixth one and so on.
For example (The structure of the data set is something like this example), I want to compare number.x with number.y, day.x with day.y, school.x with school.y and etc.
number.x<-c(1,2,3,4,5,6,7)
number.y<-c(3,4,5,6,1,2,7)
day.x<-c(1,3,4,5,6,7,8)
day.y<-c(4,5,6,7,8,7,8)
school.x<-c("a","b","b","c","n","f","h")
school.y<-c("a","b","b","c","m","g","h")
city.x<- c(1,2,3,7,5,8,7)
city.y<- c(1,2,3,5,5,7,7)
You mean, something like this?
> number.x == number.y
[1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE
> length(which(number.x==number.y))
[1] 1
> school.x == school.y
[1] TRUE TRUE TRUE TRUE FALSE FALSE TRUE
> test.day <- day.x == day.y
> test.day
[1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE
EDIT: Given your example variables above, we have:
df <- data.frame(number.x,
number.y,
day.x,
day.y,
school.x,
school.y,
city.x,
city.y,
stringsAsFactors=FALSE)
n <- ncol(df) # no of columns (assumed EVEN number)
k <- 1
comp <- list() # comparisons will be stored here
while (k <= n-1) {
l <- (k+1)/2
comp[[l]] <- df[,k] == df[,k+1]
k <- k+2
}
After which, you'll have:
> comp
[[1]]
[1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE
[[2]]
[1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE
[[3]]
[1] TRUE TRUE TRUE TRUE FALSE FALSE TRUE
[[4]]
[1] TRUE TRUE TRUE FALSE TRUE FALSE TRUE
To get the comparison result between columns k and k+1, you look at the (k+1)/2 element of comp - i.e to get the comparison results between columns 7 & 8, you look at the comp element 8/2=4:
> comp[[4]]
[1] TRUE TRUE TRUE FALSE TRUE FALSE TRUE
EDIT 2: To have the comparisons as new columns in the dataframe:
new.names <- rep('', n/2)
for (i in 1:(n/2)) {
new.names[i] <- paste0('V', i)
}
cc <- as.data.frame(comp, optional=TRUE)
names(cc) <- new.names
df.new <- cbind(df, cc)
After which, you have:
> df.new
number.x number.y day.x day.y school.x school.y city.x city.y V1 V2 V3 V4
1 1 3 1 4 a a 1 1 FALSE FALSE TRUE TRUE
2 2 4 3 5 b b 2 2 FALSE FALSE TRUE TRUE
3 3 5 4 6 b b 3 3 FALSE FALSE TRUE TRUE
4 4 6 5 7 c c 7 5 FALSE FALSE TRUE FALSE
5 5 1 6 8 n m 5 5 FALSE FALSE FALSE TRUE
6 6 2 7 7 f g 8 7 FALSE TRUE FALSE FALSE
7 7 7 8 8 h h 7 7 TRUE TRUE TRUE TRUE

Resources