Converting binary to list - r

I have something like a binary data frame:
> dput(head(dat))
structure(list(CDR3.aa = c("CALWEVQELGKKIKVF", "CAATVGGWGKLQF",
"CACDPLYGGITGGFNTDKLIF", "CACDTLLPTSLGDMAKLIF", "CALGELSSDGGGAIF",
"CALSNTGGFKTIF"), TCR_CS001_T1 = c(1, 1, 1, 1, 1, 0), TCR_CS001_T2 = c(0,
1, 1, 1, 1, 0), TCR_CS002 = c(1, 0, 0, 0, 0, 0), TCR_HC002 = c(0,
0, 0, 0, 0, 1), TCR_HC003 = c(1, 0, 0, 0, 0, 1)), row.names = c(NA,
-6L), .internal.selfref = <pointer: 0x0000023f7a101ef0>, class = c("immunr_public_repertoire",
"data.table", "data.frame"))
It shows 1 if an amino acid sequence is present in a sample and 0 if it is absent.
I want to replace the 1 and 0 values with the amino acid sequence itself.
How can I do that, please?

If CDR3.aa is the amino acid column you can do :
dplyr :
library(dplyr)
# In every column except CDR3.aa, put the row's CDR3.aa sequence wherever the
# entry is 1; entries that are not 1 keep their value.
dat %>%
  mutate(
    across(
      -CDR3.aa,
      ~ ifelse(.x == 1, CDR3.aa, .x)
    )
  )
# CDR3.aa TCR_CS001_T1 TCR_CS001_T2 TCR_CS002
#1: CALWEVQELGKKIKVF CALWEVQELGKKIKVF 0 CALWEVQELGKKIKVF
#2: CAATVGGWGKLQF CAATVGGWGKLQF CAATVGGWGKLQF 0
#3: CACDPLYGGITGGFNTDKLIF CACDPLYGGITGGFNTDKLIF CACDPLYGGITGGFNTDKLIF 0
#4: CACDTLLPTSLGDMAKLIF CACDTLLPTSLGDMAKLIF CACDTLLPTSLGDMAKLIF 0
#5: CALGELSSDGGGAIF CALGELSSDGGGAIF CALGELSSDGGGAIF 0
#6: CALSNTGGFKTIF 0 0 0
# TCR_HC002 TCR_HC003
#1: 0 CALWEVQELGKKIKVF
#2: 0 0
#3: 0 0
#4: 0 0
#5: 0 0
#6: CALSNTGGFKTIF CALSNTGGFKTIF
data.table :
library(data.table)
# Update by reference every column but the first: entries equal to 1 become
# the row's CDR3.aa sequence, all other entries keep their value.
dat[
  ,
  (names(dat)[-1]) := lapply(
    .SD,
    function(flag) ifelse(flag == 1, CDR3.aa, flag)
  ),
  .SDcols = -1
]

Related

Count occurrences in specific column ranges and return factor variable, R

I have data like this:
df<-structure(list(levels_incised___1 = c(0, 0, 0, 0, 0, 0), levels_incised___2 = c(1,
0, 0, 0, 0, 0), levels_incised___3 = c(1, 0, 0, 0, 0, 0), levels_incised___4 = c(1,
0, 0, 0, 0, 0), levels_incised___5 = c(1, 0, 0, 0, 0, 0), levels_incised___6 = c(1,
0, 0, 0, 0, 0), levels_incised___7 = c(1, 0, 0, 0, 0, 0), levels_incised___8 = c(1,
1, 1, 0, 0, 0), levels_incised___9 = c(1, 1, 1, 0, 0, 0), levels_incised___10 = c(1,
1, 1, 0, 0, 0), levels_incised___11 = c(0, 1, 0, 0, 0, 0), levels_incised___12 = c(0,
1, 0, 0, 0, 0), levels_incised___13 = c(0, 1, 0, 0, 0, 0), levels_incised___14 = c(0,
1, 0, 0, 0, 0), levels_incised___15 = c(0, 1, 0, 0, 0, 0), levels_incised___16 = c(0,
0, 0, 0, 0, 0), levels_incised___17 = c(0, 0, 0, 0, 0, 0), levels_incised___18 = c(0,
0, 0, 0, 0, 0), levels_incised___19 = c(0, 0, 0, 0, 0, 0), levels_incised___20 = c(0,
0, 0, 0, 0, 0), levels_incised___21 = c(0, 0, 0, 0, 0, 0), levels_incised___22 = c(0,
0, 0, 0, 1, 0), levels_incised___23 = c(0, 0, 0, 0, 1, 1), levels_incised___24 = c(0,
0, 0, 0, 1, 1), levels_incised___25 = c(0, 0, 0, 0, 1, 1), levels_incised___26 = c(0,
0, 0, 0, 1, 1), levels_incised___27 = c(0, 0, 0, 1, 1, 1), levels_incised___28 = c(0,
0, 0, 1, 1, 1), levels_incised___29 = c(0, 0, 0, 1, 1, 0), levels_incised___30 = c(0,
0, 0, 1, 1, 0), levels_incised___31 = c(0, 0, 0, 0, 0, 0)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
That originally came from this Redcap input where each button was one of those columns:
And I need to create a column at the end (lets call it Level) with these possible inputs:
Cervical (any of the c buttons)
Thoracic (the t's)
Lumbar (the L's)
Sacral (sacral)
Thoracocervical (t's or c's)
Thoracolumbar (t's or l's)
Lumbosacral (l's and sacral)
So for instance, the patient in the first row had "1"'s in levels_incised_2 through levels_incised_10... meaning they had values in both the cervical range and the thoracic range. So that patient should get "Thoracocervical".
The patient in row 2 had 1's in 8 through 15, so they'd only get a "thoracic"
Does anyone know the most straight forward way to accomplish this?
Oh one last detail, there's 100+ other columns so it'd be nice if I could select/name these specific ones to count
A few things to resolve here:
find a way to convert levels...# to one of the C/T/... categories;
produce logic to infer based on presence of groups.
I think the first can be done by extracting the number and using findInterval to determine which of C/T/... each column belongs to. From there, we can do some simple c_across to find "any" in a group, and case_when to get your Level labels.
library(dplyr)
# helper function for renaming
# Translate column names such as "levels_incised___9" into vertebra labels
# ("T2"). The digits in each name are read as the column number:
#   1-7 -> C1..C7, 8-19 -> T1..T12, 20-24 -> L1..L5, 25-29 -> S1..S5,
#   30 -> "Co", 31 -> "unclear".
#
# z: character vector of column names.
# Returns a character vector of the same length as z. Names whose number is
# missing or outside 1..31 are returned unchanged.
func <- function(z) {
  region_labels <- c("C", "T", "L", "S", "Co", "unclear")
  # First column number belonging to each region, in the same order.
  region_starts <- c(1L, 8L, 20L, 25L, 30L, 31L)
  last_column <- 31L

  num <- as.integer(gsub("\\D", "", z))
  known <- !is.na(num) & num >= 1L & num <= last_column

  region_idx <- findInterval(num[known], region_starts)
  region <- region_labels[region_idx]
  # Number within the region comes from the column number itself, so the
  # label does not depend on which columns are passed or in what order.
  offset <- num[known] - region_starts[region_idx] + 1L

  labels <- region
  numbered <- region %in% c("C", "T", "L", "S") # "Co"/"unclear" get no number
  labels[numbered] <- paste0(region[numbered], offset[numbered])

  out <- as.character(z)
  out[known] <- labels
  out
}
# Rename the levels_incised___* columns with func(), flag per row whether any
# level is marked in each spinal region, then collapse the four flags into a
# single Level label.
out <- df %>%
  rename_with(.cols = starts_with("levels"), .fn = func) %>%
  rowwise() %>%
  mutate(
    # rowwise() makes each c_across() sum run over one row at a time
    anyC = sum(c_across(C1:C7)) > 0,
    anyT = sum(c_across(T1:T12)) > 0,
    anyL = sum(c_across(L1:L5)) > 0,
    anyS = sum(c_across(S1:S5)) > 0
  ) %>%
  ungroup() %>%
  mutate(
    # case_when() uses the first condition that holds, so the catch-all for
    # three or more regions must come before the two-region labels.
    Level = case_when(
      # logicals add up as 0/1: this is the number of regions marked, so every
      # three-region combination is caught (not only C + T + L)
      (anyC + anyT + anyL + anyS) > 2 ~ "More than 2?",
      anyL & anyS ~ "Lumbosacral",
      anyT & anyL ~ "Thoracolumbar",
      anyT & anyC ~ "Thoracocervical",
      anyS ~ "Sacral",
      anyL ~ "Lumbar",
      anyT ~ "Thoracic",
      anyC ~ "Cervical",
      TRUE ~ "Nothing?"
    )
  )
out
# # A tibble: 6 x 36
# C1 C2 C3 C4 C5 C6 C7 T1 T2 T3 T4 T5 T6 T7 T8 T9 T10 T11 T12 L1 L2 L3 L4 L5 S1 S2 S3 S4 S5 Co unclear anyC anyT anyL anyS Level
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <lgl> <lgl> <lgl> <chr>
# 1 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 TRUE TRUE FALSE FALSE Thoracocervical
# 2 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 FALSE TRUE FALSE FALSE Thoracic
# 3 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 FALSE TRUE FALSE FALSE Thoracic
# 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 FALSE FALSE FALSE TRUE Sacral
# 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 FALSE FALSE TRUE TRUE Lumbosacral
# 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 FALSE FALSE TRUE TRUE Lumbosacral
out$Level
# [1] "Thoracocervical" "Thoracic" "Thoracic" "Sacral" "Lumbosacral" "Lumbosacral"
If you don't want to keep the renaming, then you can combine the Level result to your original frame with cbind(df, Level = out$Level).
using packages dplyr and tidyr (pivot_longer comes from tidyr):
library(dplyr)
library(tidyr) # pivot_longer() comes from tidyr, not dplyr

## vertebra codes needed later on; position i holds the code for column i
vertebra_codes <- c(
  paste0("C", 1:7), paste0("T", 1:12),
  paste0("L", 1:5), paste0("S", 1:5),
  "X", ## for Coccyx
  "-"  ## for unknown
)
df %>%
  mutate(
    ## assuming each row is a case:
    case_id = paste0("case_", row_number())
  ) %>%
  ## reshape the data from wide to long format; only the levels_incised___*
  ## columns are pivoted, so unrelated columns in df do not get in the way:
  pivot_longer(
    cols = starts_with("levels_incised___"),
    names_to = "level_incised", values_to = "is_incised"
  ) %>%
  mutate(
    ## remove the redundant 'levels_incised___' prefix, leaving the number:
    level_incised = gsub(".*_", "", level_incised),
    ## assign the vertebra corresponding to 'level':
    vertebra = vertebra_codes[as.integer(level_incised)],
    ## assign the spine region (e.g. all lumbar vertebrae start with 'L'):
    spine_region = substr(vertebra, 1, 1)
  ) %>%
  ## we're interested in incised vertebrae only; a case without any incised
  ## level therefore does not appear in the result:
  filter(is_incised == 1) %>%
  ## remove replicates (more than one vertebra per spine region affected):
  distinct(case_id, spine_region) %>%
  ## do the counts per case:
  group_by(case_id) %>%
  ## string together the affected regions per case:
  summarise(incised_regions = paste(spine_region, collapse = ","))
result:
# A tibble: 6 x 2
case_id incised_regions
<chr> <chr>
1 case_1 C,T
2 case_2 T
3 case_3 T
4 case_4 S,X
5 case_5 L,S,X
6 case_6 L,S
(Note that original `df` remains unchanged throughout the processing pipeline. However you can break up the pipeline by removing the `%>%` operator and inspect the intermediary steps, or assign them to temporary objects.)
extra / for fun: example code to ggplot the spine with vertebra status (incised or not) per patient.

Create new columns using across() and if_else()

I have survey data that has a binary 1, 0 (indicating peak or off-peak) variable with the related peak or off-peak numbers in two separate columns.
structure(list(q9_jul_2019 = c(1, 0, 1, 0, 1, 0), q9_aug_2019 = c(1,
0, 1, 0, 1, 0), q9_sep_2019 = c(1, 0, 1, 0, 1, 0), q9_oct_2019 = c(0,
0, 1, 0, 1, 0), q9_nov_2019 = c(0, 0, 1, 0, 1, 0), q9_dec_2019 = c(0,
0, 1, 0, 0, 0), q9_jan_2020 = c(0, 0, 1, 0, 0, 0), q9_feb_2020 = c(0,
1, 0, 1, 0, 0), q9_mar_2020 = c(1, 1, 0, 1, 0, 0), q9_apr_2020 = c(1,
1, 1, 1, 0, 1), q9_may_2020 = c(0, 1, 0, 0, 0, 0), q9_jun_2020 = c(0,
0, 0, 0, 0, 0), q15 = c(1, 10, 30, 0, 2, 0), q22 = c(0, 10, 6,
0, 0, 0)), row.names = c(NA, 6L), class = "data.frame")
I have created new monthly columns that have the associated visitation numbers in that column but I'm sure there must be a neater way to do it using across(). I haven't been able to make it work though, so at the moment I'm stuck at the following:
# Asker's current approach: one hand-written if_else() per month. Each new
# visitation_* column takes q15 where that month's q9_* flag equals 1 and q22
# otherwise.
survey <- survey %>%
mutate(visitation_jul_19 = if_else(q9_jul_2019 == 1, q15, q22),
visitation_aug_19 = if_else(q9_aug_2019 == 1, q15, q22),
visitation_sep_19 = if_else(q9_sep_2019 == 1, q15, q22),
visitation_oct_19 = if_else(q9_oct_2019 == 1, q15, q22),
visitation_nov_19 = if_else(q9_nov_2019 == 1, q15, q22),
visitation_dec_19 = if_else(q9_dec_2019 == 1, q15, q22),
visitation_jan_20 = if_else(q9_jan_2020 == 1, q15, q22),
visitation_feb_20 = if_else(q9_feb_2020 == 1, q15, q22),
visitation_mar_20 = if_else(q9_mar_2020 == 1, q15, q22),
visitation_apr_20 = if_else(q9_apr_2020 == 1, q15, q22),
visitation_may_20 = if_else(q9_may_2020 == 1, q15, q22),
visitation_jun_20 = if_else(q9_jun_2020 == 1, q15, q22))
You may try
library(dplyr)
# For every monthly q9_* flag, take q15 where the flag is 1 and q22 otherwise.
# The q9_* columns themselves are overwritten with the chosen values.
survey %>%
  mutate(
    across(
      q9_jul_2019:q9_jun_2020,
      function(flag) ifelse(flag == 1, q15, q22)
    )
  )
q9_jul_2019 q9_aug_2019 q9_sep_2019 q9_oct_2019 q9_nov_2019 q9_dec_2019 q9_jan_2020 q9_feb_2020 q9_mar_2020 q9_apr_2020
1 1 1 1 0 0 0 0 0 1 1
2 10 10 10 10 10 10 10 10 10 10
3 30 30 30 30 30 30 30 6 6 30
4 0 0 0 0 0 0 0 0 0 0
5 2 2 2 2 2 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0
q9_may_2020 q9_jun_2020 q15 q22
1 0 0 1 0
2 10 10 10 10
3 6 6 30 6
4 0 0 0 0
5 0 0 2 0
6 0 0 0 0

Cumulative count for a column using R

I got data like this
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1), drug_2 = c(0, 1, 1, 1, 1, 0,
1, 0, 0, 1, 0, 1)), class = "data.frame", row.names = c(NA, -12L
))
I would like to get the cumulative count of each column for each id and get the data like this
structure(list(id2 = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1_b = c(0,
0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2), drug_2_b = c(0, 1, 2, 3, 4,
0, 5, 0, 0, 1, 0, 2)), class = "data.frame", row.names = c(NA,
-12L))
You can get a cumulative sum with cumsum.
To split data.frame into subsets, you can use split and then lapply cumsum over the list of the data.frames and again over the list of the columns, or you can use the ave function which does exactly that:
# Example data: two ids, each with 0/1 indicators for two drugs.
data <- data.frame(
  id = rep(c(1, 2), times = c(7, 5)),
  drug_1 = c(0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1),
  drug_2 = c(0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1)
)
# Replace every column except the first by its running total within each id.
data[-1] <- ave(data[-1], data$id, FUN = cumsum)
edit:
I assumed that the cumulative sum is requested (as per instructions) and that there is a mistake in the example data. If the example data is correct, then the condition is "if the count is zero, don't do the cumulative sum and leave it at zero", i.e. ifelse(x == 0, 0, cumsum(x)) (as per @r2evans). However, this construct doesn't work when applied to a data.frame, so a more complex helper function is required:
# Running total within each id that stays 0 wherever the input cell is 0.
# Apply this to the raw 0/1 columns: running it on columns that were already
# accumulated would accumulate them a second time.
data[-1] <- ave(
  data[-1],
  data$id,
  FUN = function(block) {
    running <- cumsum(block)
    # put 0 back in every position whose original value was 0
    replace(running, block == 0, 0)
  }
)
We can now compare it with the requested (renamed) data:
# Expected output from the question, with its columns renamed to match `data`.
result <- structure(
  list(
    id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2),
    drug_1 = c(0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2),
    drug_2 = c(0, 1, 2, 3, 4, 0, 5, 0, 0, 1, 0, 2)
  ),
  class = "data.frame",
  row.names = c(NA, -12L)
)
# TRUE only when `data` matches the expected frame exactly.
identical(data, result)
Base R,
# Running total of drug_2 within each id; positions holding 0 stay 0.
ave(
  df$drug_2,
  df$id,
  FUN = function(doses) {
    running <- cumsum(doses)
    ifelse(doses == 0, doses, running)
  }
)
# [1] 0 1 2 3 4 0 5 0 0 1 0 2
Edit Simplified the solution after reading r2evans' approach.
You could use
library(dplyr)
# Within each id, replace every drug* column by its running total, keeping 0
# wherever the original entry is 0.
df %>%
  group_by(id) %>%
  mutate(
    across(
      starts_with("drug"),
      function(taken) ifelse(taken == 0, 0, cumsum(taken))
    )
  ) %>%
  ungroup()
This returns
# A tibble: 12 x 3
id drug_1 drug_2
<dbl> <dbl> <dbl>
1 1 0 0
2 1 0 1
3 1 0 2
4 1 0 3
5 1 0 4
6 1 1 0
7 1 2 5
8 2 0 0
9 2 0 0
10 2 1 1
11 2 0 0
12 2 2 2
Base R solution:
# Resolve the names of the columns we want to cumulatively sum (those whose
# name starts with "drug_"):
# drug_vec_names => character vector
drug_vec_names <- grep( "^drug\\_", colnames(df), value = TRUE)
# Resolve the names of the columns we want to keep as they are; these also
# serve as the grouping for the cumulative sum below:
# not_drug_vec_names => character vector
not_drug_vec_names <- names(df)[!(names(df) %in% drug_vec_names)]
# Calculate the result: res => data.frame
# setNames() assigns the column names explicitly at the end.
# NOTE(review): presumably needed because df[, x] drops to an unnamed vector
# when only one non-drug column exists in a base data.frame -- confirm.
res <- setNames(
cbind(
df[,not_drug_vec_names],
# replace() puts 0 back into every cell whose original value was 0
replace(
# cumulative sum of the drug columns within each group of the kept columns
ave(
df[,drug_vec_names],
df[,not_drug_vec_names],
FUN = cumsum
),
df[,drug_vec_names] == 0,
0
)
),
c(not_drug_vec_names, drug_vec_names)
)
If you have binary values (1/0) in the drug columns, you can multiply the cumulative sum by the original column, which gives 0 wherever the original value is 0.
library(dplyr)
# Within each id, multiply the running total of every drug* column by the
# column itself, so entries that are 0 stay 0.
df %>%
  group_by(id) %>%
  mutate(
    across(
      starts_with("drug"),
      function(taken) cumsum(taken) * taken
    )
  ) %>%
  ungroup()
# id drug_1 drug_2
# <dbl> <dbl> <dbl>
# 1 1 0 0
# 2 1 0 1
# 3 1 0 2
# 4 1 0 3
# 5 1 0 4
# 6 1 1 0
# 7 1 2 5
# 8 2 0 0
# 9 2 0 0
#10 2 1 1
#11 2 0 0
#12 2 2 2

Remove duplicate lines while keeping the bottom lines

I would like to remove duplicate lines in R keeping the information of the lower lines, that is, from this data:
example <- structure(list(var1 = c(1, 1, 2, 2, 3, 4, 5, 6, 6), var2 = c(0,
0, 0, 0, 0, 0, 0, 0, 0), var3 = c(1, 0, 0, 0, 0, 1, 0, 0, 0),
var4 = c(1, 1, 1, 1, 0, 1, 1, 0, 0), var5 = c(1, 1, 1, 0,
0, 1, 1, 0, 0), Year = 2001:2009), row.names = c(NA, -9L), class = "data.frame")
I would like to remove the duplicates keeping the lines at the bottom, so that I get:
example1 <- structure(list(var1 = c(1, 2, 3, 4, 5, 6), var2 = c(0, 0, 0,
0, 0, 0), var3 = c(0, 0, 0, 1, 0, 0), var4 = c(1, 1, 0, 1, 1,
0), var5 = c(1, 0, 0, 1, 1, 0), Year = c(2002, 2004, 2005, 2006,
2007, 2009)), row.names = c(NA, -6L), class = "data.frame")
Is it possible to apply the `duplicated` function or the `distinct` function of the `dplyr` package?
I appreciate any help. Thanks.
Is this what you want?
# Keep the last row of every var1 group; the result is still grouped by var1.
example %>%
  group_by(var1) %>%
  slice_tail(n = 1)
output
# A tibble: 6 x 6
# Groups: var1 [6]
var1 var2 var3 var4 var5 Year
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 1 0 0 1 1 2002
2 2 0 0 1 0 2004
3 3 0 0 0 0 2005
4 4 0 1 1 1 2006
5 5 0 0 1 1 2007
6 6 0 0 0 0 2009
The response from @ThomasIsCoding, with the dplyr tools, worked well. I found another possibility, which seems faster:
# Keep the last row for each var1 value: with fromLast = TRUE the scan starts
# at the end, so earlier occurrences are the ones marked as duplicates and
# dropped. TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
example1 <- example[!duplicated(example$var1, fromLast = TRUE), ]
distinct keeps 1st row in each group, if you want to keep last row you can reverse the rows and then apply distinct.
library(dplyr)
# distinct() keeps the first row per var1, so reverse the row order first to
# make that the original last row, then sort by var1 again.
example %>%
  slice(rev(seq_len(n()))) %>%
  distinct(var1, .keep_all = TRUE) %>%
  arrange(var1)
# var1 var2 var3 var4 var5 Year
#1 1 0 0 1 1 2002
#2 2 0 0 1 0 2004
#3 3 0 0 0 0 2005
#4 4 0 1 1 1 2006
#5 5 0 0 1 1 2007
#6 6 0 0 0 0 2009
Alternatively you can also use slice :
# Within each var1 group, keep the row at position n(), i.e. the last one.
example %>%
  group_by(var1) %>%
  slice(n())

If Value Exists in Any Column for a Row Automatically Flag Whole Row R

I subset my dataframe to include many variables and if a row contains a value of 1 for any column, I need to flag that row. What is a nice way to do this?
You could use rowSums :
# Flag rows in which at least one cell equals 1 (NA cells are ignored).
df[["has_1"]] <- rowSums(df == 1, na.rm = TRUE) >= 1
df
# a b c has_1
#1 0 0 0 FALSE
#2 0 0 1 TRUE
#3 1 0 0 TRUE
#4 0 0 0 FALSE
#5 1 0 1 TRUE
data
df <- data.frame(a = c(0, 0, 1, 0, 1), b = 0, c = c(0, 1, 0, 0, 1))
We can use any with apply
# Flag rows in which at least one cell equals 1. Comparing first hands apply()
# a logical matrix, so any() tests for the value 1 instead of coercing raw
# numbers to logical (which warns and would flag every non-zero value).
df$has_1 <- apply(df == 1, 1, any, na.rm = TRUE)
data
df <- data.frame(a = c(0, 0, 1, 0, 1), b = 0, c = c(0, 1, 0, 0, 1))

Resources