Sort character in vector of string in R - r

I have data like,
df <- structure(list(Sex = structure(c(1L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("F", "M"), class = "factor"),
Age = c(19L, 16L, 16L, 13L, 16L, 30L, 16L, 30L, 16L, 30L,
30L, 16L, 19L, 1L, 30L), I = c(1, 1, 0, 0, 1, 0, 1, 0, 1,
0, 0, 0, 1, 0, 1), E = c(0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
1, 0, 1, 0), S = c(1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
0, 1), N = c(0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0),
F = c(1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1), T = c(0,
1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0), C = c(1, 1, 1,
0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1), D = c(0, 0, 0, 1, 0,
1, 0, 1, 0, 1, 1, 1, 1, 0, 0), type = c("CIFS", "CITN", "CESF",
"DEFS", "CIFN", "DETS", "CITS", "DEFS", "CIFN", "DEFN", "DETS",
"DETS", "DINF", "CENT", "CIFS"), PO = runif(15, -3, 3), AO = runif(15, -3, 3)), .Names = c("Sex",
"Age", "I", "E", "S", "N", "F", "T", "C", "D", "type", "PO",
"AO"), class = c("tbl_dt", "tbl", "data.table", "data.frame"), row.names = c(NA,
-15L))
I want to sort the column type. Not the column but the characters in it. And get the same structure afterwards. For example, CIFS should then be CFIS. I tried to do it as,
df <- within(df, {
type <- apply(sapply(strsplit(df[, type], split=''), sort), 2,
function(x) paste0(x, collapse = ''))
})
Is there any simpler solution, that I have missed to find.

Since you are using data.table, I would suggest
df[, type := paste(sort(unlist(strsplit(type, ""))), collapse = ""), by = type]
like described in How to sort letters in a string?

This should work for both data.frame and data.table (base R only):
df$type <- vapply(strsplit(df$type, split=''),FUN=function(x)paste(sort(x),collapse=''),"")
Result:
> df
Sex Age I E S N F T C D type PO AO
1 F 19 1 0 1 0 1 0 1 0 CFIS 2.9750666 2.0308410
2 F 16 1 0 0 1 0 1 1 0 CINT 0.7902187 2.0891158
3 M 16 0 1 1 0 1 0 1 0 CEFS -1.7173785 2.4774140
4 F 13 0 1 1 0 1 0 0 1 DEFS 1.5352127 -1.9272470
5 M 16 1 0 0 1 1 0 1 0 CFIN -0.2160741 1.7359897
6 M 30 0 1 1 0 0 1 0 1 DEST 2.6314981 -0.6252466
7 F 16 1 0 1 0 0 1 1 0 CIST -1.6032894 -1.9938226
8 M 30 0 1 1 0 1 0 0 1 DEFS 0.7748583 -2.0935737
9 F 16 1 0 0 1 1 0 1 0 CFIN -2.9368356 0.3363364
10 F 30 0 1 0 1 1 0 0 1 DEFN -0.6506217 2.6681535
11 F 30 0 1 1 0 0 1 0 1 DEST -0.4432578 0.4627441
12 F 16 0 1 1 0 0 1 0 1 DEST 2.0236760 2.7684298
13 F 19 1 0 0 1 1 0 0 1 DFIN -1.1774931 2.6546726
14 F 1 0 1 0 1 0 1 1 0 CENT -2.2365388 2.7902646
15 F 30 1 0 1 0 1 0 1 0 CFIS -1.6139238 -2.4982620

Related

Create column conditioning the behavior of rows in the dataset

I would like to do something very specific. I have a vast set of data, which, in summary, looks more or less like this, with values 0, 1 and 2:
I need to create a situation variable so that it contains the value 0, 1 and 2.
The value 0 for cases that contain only 0's and 1's in the entire line.
The value 1 for the case where the value 2 appears, but at some point 1 appears before it.
The value 2 for the case where the value 2 appears, but at some point 0 appears before it.
So it's something close to:
structure(list(X1 = c(1, 1, 1, 1, 1, 1, 1, 1, 0, 1), X2 = c(1,
1, 1, 1, 0, 0, 0, 0, 0, 2), X3 = c(0, 1, 1, 1, 1, 0, 0, 1, 0,
0), X4 = c(0, 1, 1, 0, 1, 1, 0, 0, 0, 0), X5 = c(2, 1, 1, 0,
2, 1, 1, 0, 0, 0), X6 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0), X7 = c(2,
1, 1, 1, 2, 1, 1, 2, 0, 0), X8 = c(0, 1, 1, 1, 2, 1, 2, 2, 2,
0)), class = "data.frame", row.names = c(NA, 10L))
I wrote a score function and applied it over all the rows of your dataframe.
score <- function(x) {
a <- which(x == 2)
ifelse(length(a) > 0, ifelse(a[1] >=2, 2 - x[a[1] - 1], 1), 0)
}
df <- structure(list(X1 = c(1, 1, 1, 1, 1, 1, 1, 1, 0, 1),
X2 = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 2),
X3 = c(0, 1, 1, 1, 1, 0, 0, 1, 0, 0),
X4 = c(0, 1, 1, 0, 1, 1, 0, 0, 0, 0),
X5 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0),
X6 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0),
X7 = c(2, 1, 1, 1, 2, 1, 1, 2, 0, 0),
X8 = c(0, 1, 1, 1, 2, 1, 2, 2, 2, 0)),
class = "data.frame", row.names = c(NA, 10L))
df$situation <- sapply(1:nrow(df), function(i) score(as.numeric(df[i,])))
df
Here's a tidyverse approach.
I'll first concatenate all columns together, then use grepl() to look for 12 or 02.
library(tidyverse)
df %>% rowwise() %>%
mutate(concat = paste(c_across(everything()), collapse = "")) %>%
ungroup() %>%
mutate(situation = case_when(
!grepl(2, concat) ~ 0,
grepl("12", concat) ~ 1,
grepl("02", concat) ~ 2
)) %>%
select(-concat)
Output
# A tibble: 10 x 9
X1 X2 X3 X4 X5 X6 X7 X8 situation
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 0 2 2 2 0 2
2 1 1 1 1 1 1 1 1 0
3 1 1 1 1 1 1 1 1 0
4 1 1 1 0 0 0 1 1 0
5 1 0 1 1 2 2 2 2 1
6 1 0 0 1 1 1 1 1 0
7 1 0 0 0 1 1 1 2 1
8 1 0 1 0 0 0 2 2 2
9 0 0 0 0 0 0 0 2 2
10 1 2 0 0 0 0 0 0 1
Note that this solution assumes that:
2 will not appear in the first column
1 or 2 in the situation is defined by the number immediately before 2 in your dataset
There will not be a case of 12 and 02 happening in the same row

Create new columns using across() and if_else()

I have survey data that has a binary 1, 0 (indicating peak or off-peak) variable with the related peak or off-peak numbers in two separate columns.
structure(list(q9_jul_2019 = c(1, 0, 1, 0, 1, 0), q9_aug_2019 = c(1,
0, 1, 0, 1, 0), q9_sep_2019 = c(1, 0, 1, 0, 1, 0), q9_oct_2019 = c(0,
0, 1, 0, 1, 0), q9_nov_2019 = c(0, 0, 1, 0, 1, 0), q9_dec_2019 = c(0,
0, 1, 0, 0, 0), q9_jan_2020 = c(0, 0, 1, 0, 0, 0), q9_feb_2020 = c(0,
1, 0, 1, 0, 0), q9_mar_2020 = c(1, 1, 0, 1, 0, 0), q9_apr_2020 = c(1,
1, 1, 1, 0, 1), q9_may_2020 = c(0, 1, 0, 0, 0, 0), q9_jun_2020 = c(0,
0, 0, 0, 0, 0), q15 = c(1, 10, 30, 0, 2, 0), q22 = c(0, 10, 6,
0, 0, 0)), row.names = c(NA, 6L), class = "data.frame")
I have created new monthly columns that have the associated visitation numbers in that column but I'm sure there must be a neater way to do it using across(). I haven't been able to make it work though, so at the moment I'm stuck at the following:
survey <- survey %>%
mutate(visitation_jul_19 = if_else(q9_jul_2019 == 1, q15, q22),
visitation_aug_19 = if_else(q9_aug_2019 == 1, q15, q22),
visitation_sep_19 = if_else(q9_sep_2019 == 1, q15, q22),
visitation_oct_19 = if_else(q9_oct_2019 == 1, q15, q22),
visitation_nov_19 = if_else(q9_nov_2019 == 1, q15, q22),
visitation_dec_19 = if_else(q9_dec_2019 == 1, q15, q22),
visitation_jan_20 = if_else(q9_jan_2020 == 1, q15, q22),
visitation_feb_20 = if_else(q9_feb_2020 == 1, q15, q22),
visitation_mar_20 = if_else(q9_mar_2020 == 1, q15, q22),
visitation_apr_20 = if_else(q9_apr_2020 == 1, q15, q22),
visitation_may_20 = if_else(q9_may_2020 == 1, q15, q22),
visitation_jun_20 = if_else(q9_jun_2020 == 1, q15, q22))
You may try
library(dplyr)
survey %>%
mutate(across(q9_jul_2019:q9_jun_2020, ~ ifelse(.x == 1, q15, q22)))
q9_jul_2019 q9_aug_2019 q9_sep_2019 q9_oct_2019 q9_nov_2019 q9_dec_2019 q9_jan_2020 q9_feb_2020 q9_mar_2020 q9_apr_2020
1 1 1 1 0 0 0 0 0 1 1
2 10 10 10 10 10 10 10 10 10 10
3 30 30 30 30 30 30 30 6 6 30
4 0 0 0 0 0 0 0 0 0 0
5 2 2 2 2 2 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0
q9_may_2020 q9_jun_2020 q15 q22
1 0 0 1 0
2 10 10 10 10
3 6 6 30 6
4 0 0 0 0
5 0 0 2 0
6 0 0 0 0

Converting binary to list

I have somating like a binary dataframe
> dput(head(dat))
structure(list(CDR3.aa = c("CALWEVQELGKKIKVF", "CAATVGGWGKLQF",
"CACDPLYGGITGGFNTDKLIF", "CACDTLLPTSLGDMAKLIF", "CALGELSSDGGGAIF",
"CALSNTGGFKTIF"), TCR_CS001_T1 = c(1, 1, 1, 1, 1, 0), TCR_CS001_T2 = c(0,
1, 1, 1, 1, 0), TCR_CS002 = c(1, 0, 0, 0, 0, 0), TCR_HC002 = c(0,
0, 0, 0, 0, 1), TCR_HC003 = c(1, 0, 0, 0, 0, 1)), row.names = c(NA,
-6L), .internal.selfref = <pointer: 0x0000023f7a101ef0>, class = c("immunr_public_repertoire",
"data.table", "data.frame"))
That shows if an amin acide exists in a sample we see 1 and if absent shown by 0
I want to replace 1 and 0 by amino acid itself
How I can do that please?
If CDR3.aa is the amino acid column you can do :
dplyr :
library(dplyr)
dat %>% mutate(across(-CDR3.aa, ~ifelse(. == 1, CDR3.aa, .)))
# CDR3.aa TCR_CS001_T1 TCR_CS001_T2 TCR_CS002
#1: CALWEVQELGKKIKVF CALWEVQELGKKIKVF 0 CALWEVQELGKKIKVF
#2: CAATVGGWGKLQF CAATVGGWGKLQF CAATVGGWGKLQF 0
#3: CACDPLYGGITGGFNTDKLIF CACDPLYGGITGGFNTDKLIF CACDPLYGGITGGFNTDKLIF 0
#4: CACDTLLPTSLGDMAKLIF CACDTLLPTSLGDMAKLIF CACDTLLPTSLGDMAKLIF 0
#5: CALGELSSDGGGAIF CALGELSSDGGGAIF CALGELSSDGGGAIF 0
#6: CALSNTGGFKTIF 0 0 0
# TCR_HC002 TCR_HC003
#1: 0 CALWEVQELGKKIKVF
#2: 0 0
#3: 0 0
#4: 0 0
#5: 0 0
#6: CALSNTGGFKTIF CALSNTGGFKTIF
data.table :
library(data.table)
dat[, (names(dat)[-1]) := lapply(.SD, function(x) ifelse(x == 1, CDR3.aa, x)), .SDcols = -1]

Include all variables in tsibble formula

I want to fit a linear regression model using the tsibble package and I have a bunch of dummy variables that I want to include in my analysis. A sample dataset would be the following:
library(tsibble)
library(dplyr)
library(fable)
ex = structure(list(id = c("KEY1", "KEY1", "KEY1", "KEY1", "KEY1",
"KEY1", "KEY1", "KEY1", "KEY1", "KEY1", "KEY1", "KEY1", "KEY1",
"KEY1", "KEY1"), sales = c(0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), date = structure(c(15003, 15004, 15005, 15006, 15007,
15008, 15009, 15010, 15011, 15012, 15013, 15014, 15015, 15016,
15017), class = "Date"), wday = c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L), dummy_1 = c(0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), dummy_2 = c(0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0), dummy_3 = c(0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -15L), key = structure(list(
id = "KEY1", .rows = list(1:15)), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), index = structure("date", ordered = TRUE), index2 = "date", interval = structure(list(
year = 0, quarter = 0, month = 0, week = 0, day = 1, hour = 0,
minute = 0, second = 0, millisecond = 0, microsecond = 0,
nanosecond = 0, unit = 0), class = "interval"), class = c("tbl_ts",
"tbl_df", "tbl", "data.frame"))
> ex
# A tsibble: 15 x 7 [1D]
# Key: id [1]
id sales date wday dummy_1 dummy_2 dummy_3
<chr> <dbl> <date> <int> <dbl> <dbl> <dbl>
1 KEY1 0 2011-01-29 1 0 0 0
2 KEY1 5 2011-01-30 2 0 0 0
3 KEY1 0 2011-01-31 3 0 0 1
4 KEY1 0 2011-02-01 4 1 0 0
5 KEY1 0 2011-02-02 5 0 0 0
6 KEY1 0 2011-02-03 6 0 0 0
7 KEY1 0 2011-02-04 7 0 1 0
8 KEY1 0 2011-02-05 1 0 0 0
9 KEY1 0 2011-02-06 2 0 0 0
10 KEY1 0 2011-02-07 3 0 0 0
11 KEY1 0 2011-02-08 4 0 0 0
12 KEY1 0 2011-02-09 5 0 0 0
13 KEY1 0 2011-02-10 6 0 0 0
14 KEY1 0 2011-02-11 7 0 0 0
15 KEY1 0 2011-02-12 1 0 0 0
They are too many dummies to specify manually so I was hoping for something faster. Normally I would use the . symbol in the formula in the following way:
fit = ex %>%
model(TSLM(sales ~ trend() + season() + .))
But this does not work:
Warning message:
1 error encountered for TSLM(sales ~ trend() + season() + .)
[1] '.' in formula and no 'data' argument
Is there a systematic tsibble way around this or do I have to create the formula on the fly using the names of the dataset?
We could create a formula with reformulate using the 'dummy' column names
nm1 <- names(ex)[startsWith(names(ex), 'dummy')]
ex %>%
model(lm = TSLM(reformulate(c(nm1, 'trend()', 'season()'), 'sales') ))

Creating one hot encoded columns while preserving other features

I've got the following data:
dataset <- structure(list(id = structure(c(2L, 3L, 1L, 3L, 1L, 9L), .Label = c("215101",
"215559", "216566", "217284", "219435", "220209", "220249", "220250",
"225678", "225679", "225687", "225869", "228420", "228435", "230621",
"230623", "233063", "233097", "233098", "235546", "235560", "235567",
"236379"), class = "factor"), cat1 = c("A", "B", "B", "A", "A",
"A"), cat2 = c("item 1", "item 1", "item 2", "item 5", "item 3",
"item 28"), cat3 = c("theme 2", "theme 2", "theme 1", "theme 4",
"theme 10", "theme 40")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L))
I would like to create kind of model matrix with one hot encoded columns features created from columns cat2 and cat3. Therefore, my output would look like this:
structure(list(id = structure(c(1L, 1L, 2L, 3L, 3L, 9L), .Label = c("215101",
"215559", "216566", "217284", "219435", "220209", "220249", "220250",
"225678", "225679", "225687", "225869", "228420", "228435", "230621",
"230623", "233063", "233097", "233098", "235546", "235560", "235567",
"236379"), class = "factor"), cat1 = c("A", "B", "A", "A", "B",
"A"), `item 1` = c(0, 0, 1, 0, 1, 0), `item 2` = c(0, 1, 0, 0,
0, 0), `item 28` = c(0, 0, 0, 0, 0, 1), `item 3` = c(1, 0, 0,
0, 0, 0), `item 5` = c(0, 0, 0, 1, 0, 0), `theme 1` = c(0, 1,
0, 0, 0, 0), `theme 10` = c(1, 0, 0, 0, 0, 0), `theme 2` = c(0,
0, 1, 0, 1, 0), `theme 4` = c(0, 0, 0, 1, 0, 0), `theme 40` = c(0,
0, 0, 0, 0, 1)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L))
However, I don't have my independent variable in this dataset and I would like to preserve id and cat1 columns. How can I do that?
You could use merge and dcast twice.
library(reshape2)
merge(dcast(dataset, id + cat1 ~ cat2, fun.aggregate = length),
dcast(dataset, id + cat1 ~ cat3, fun.aggregate = length),
by = c("id", "cat1"))
# id cat1 item 1 item 2 item 28 item 3 item 5 theme 1 theme 10 theme 2 theme 4 theme 40
#1 215101 A 0 0 0 1 0 0 1 0 0 0
#2 215101 B 0 1 0 0 0 1 0 0 0 0
#3 215559 A 1 0 0 0 0 0 0 1 0 0
#4 216566 A 0 0 0 0 1 0 0 0 1 0
#5 216566 B 1 0 0 0 0 0 0 1 0 0
#6 225678 A 0 0 1 0 0 0 0 0 0 1
If you have more then two variables to spread you might melt you data first. This will save you some typing.
dcast(melt(dataset, id.vars = c("id", "cat1")), id + cat1 ~ value, fun.aggregate = length)

Resources