how to remove these characters from a string c () \ ][ "" - r

I have a string like this
st <- "c(c(c(\"E\" >= \"E\", \"B\" <= \"E\" | \"D\" <= \"B\"), \"A\" >= \"C\" | \"A\" >= \"A\"), c(\"E\" >= \"C\", \"D\" <= \"C\" | \"C\" <= \"C\")) []"
I want to remove these characters from it c () \ ][ ""
I would like to get this:
"E >= E, B <= E | D <= B, A >= C | A >= A, E >= C, D <= C | C <= C"

Here is one approach that could solve your problem:
# Strip the deparse artefacts from `st`:
#   \\bc\\(   - the "c(" call wrappers (word boundary, so a lowercase "c"
#               inside an operand is left alone)
#   [()"]     - remaining parentheses and double quotes
#    \\[\\]   - the trailing " []" (including its leading space)
gsub('\\bc\\(|[()"]| \\[\\]', "", st)
# [1] "E >= E, B <= E | D <= B, A >= C | A >= A, E >= C, D <= C | C <= C"

Related

Report all possible combinations of a string-separated vector

In the tidyverse I would like to mutate/expand a string vector so that all possible combinations of elements (separated by " & ") are reported, one for each line.
I tried decomposing my function using t(combn(unlist(strsplit(x, " & ")),2)), but fails when there is no " & ".
In the example:
"A" remains "A" (or becomes "A & A")
"A & B" remains "A & B"
"C & D & E" becomes "C & D", "C & E" and "D & E" in three different rows
Note (1): I cannot predict the number of combinations in advance "A & B & C & D..."
Note (2): Order is not important (i.e. "C & D" == "D & C")
Note (3): This would feed into a separate function and be used in a igraph application.
Thanks in advance.
# Example input: one " & "-separated string of elements per name.
data <- data.frame(
  names = 1:3,
  combinations = c("A", "A & B", "C & D & E")
)
names combinations
1 1 A
2 2 A & B
3 3 C & D & E
# Desired result: one row per unordered pair of elements within each project.
expected <- data.frame(
  projects = rep(c(1, 2, 3), times = c(1, 1, 3)),
  combinations = c("A", "A & B", "C & D", "C & E", "D & E")
)
projects combinations
1 1 A
2 2 A & B
3 3 C & D
4 3 C & E
5 3 D & E
You can use combn to create combinations within each name :
library(dplyr)
library(tidyr)
# One row per element, then per `names` group either keep the single element
# or replace the group by all pairs of its elements joined with " & ".
data %>%
  separate_rows(combinations, sep = " & ") %>%
  group_by(names) %>%
  summarise(
    combinations = if (n() > 1) {
      combn(combinations, 2, paste0, collapse = " & ")
    } else {
      combinations
    }
  ) %>%
  ungroup()
# names combinations
# <int> <chr>
#1 1 A
#2 2 A & B
#3 3 C & D
#4 3 C & E
#5 3 D & E
A data.table option
# Per `names` group: split the string on " & "; a single element is returned
# unchanged, otherwise every pair of elements is pasted back with " & ".
# The unnamed j result comes back as column V1, which is renamed afterwards.
setnames(
  setDT(data)[, {
    parts <- unlist(strsplit(combinations, " & "))
    if (length(parts) == 1) {
      parts
    } else {
      combn(parts, 2, paste0, collapse = " & ")
    }
  }, by = names],
  "V1", "combinations"
)[]
gives
names combinations
1: 1 A
2: 2 A & B
3: 3 C & D
4: 3 C & E
5: 3 D & E
Using data.table method
library(splitstackshape)
# Reshape to one element per row with cSplit(), then per `names` group build
# all pairs (or keep the lone element) and rename the resulting V1 column.
setnames(
  cSplit(
    data, "combinations",
    sep = " & ", direction = "long", type.convert = FALSE
  )[
    ,
    if (.N > 1) {
      combn(combinations, 2, FUN = paste, collapse = " & ")
    } else {
      combinations
    },
    by = names
  ],
  "V1", "combinations"
)[]
# names combinations
#1: 1 A
#2: 2 A & B
#3: 3 C & D
#4: 3 C & E
#5: 3 D & E

subsets of different vectors R

I have three vectors as shown below.
# Three sets of condition strings; conditions within a set are alternatives.
q <- c("a == 1", "a == 2", "a == 3")
w <- c("b >= 50", "b >= 100")
t <- c("c >= 40 & c <= 80", "c > 80")
I want to be able to combine all the vectors into one large vector so that every possible subset is in a larger vector. For example I want to have
("a == 1 & b >= 50", "a == 1 & b >= 100", "a ==2 & b >=50",
"a == 2 & b >= 100", "a == 3 & b >= 50", "a == 3 & b >= 100",
"a ==1 & c >= 40 & c <= 80", "a ==1 & c > 80",
"a ==2 & c >= 40 & c <= 80", "a ==2 & c > 80",
"a ==3 & c >= 40 & c <= 80", "a ==3 & c > 80",
"b >= 50 & c >= 40 & c <= 80", "b >= 50 & c > 80",
"b >= 100 & c >= 40 & c <= 80", "b >= 100 & c > 80",
"a == 1 & b >= 50 & c >= 40 & c <= 80", "a == 1 & b >=50 & c > 80",
"a == 2 & b >= 50 & c >= 40 & c <= 80", "a == 2 & b >=50 & c > 80",
"a == 3 & b >= 50 & c >= 40 & c <= 80", "a == 3 & b >=50 & c > 80",
"a == 1 & b >= 100 & c >= 40 & c <= 80", "a == 1 & b >=100 & c > 80",
"a == 2 & b >= 100 & c >= 40 & c <= 80", "a == 2 & b >=100 & c > 80",
"a == 3 & b >= 100 & c >= 40 & c <= 80", "a == 3 & b >=100 & c > 80")
So I need every subset to be created and joined with the "&" sign but I don't want to be comparing any element in the same vector. I also have three vectors in this example but the number of vectors should be variable. Does anyone know how to achieve this? Thanks!
We can create strings using expand.grid and combn. Create a combn of list ('lst') elements picking 2 or 3 in a list (using lapply), expand the list elements into a data.frame and paste with do.call (specifying the sep as " & ")
# Collect the condition vectors; append further vectors here as needed.
lst <- list(q, w, t)
# For every subset size from 2 up to the number of vectors: pick each
# combination of vectors, expand it to all cross-vector pairings with
# expand.grid(), and join the pieces of each pairing with " & ".
# Elements of the same vector are never combined with each other.
unlist(lapply(seq_along(lst)[-1], function(i) {
  combn(
    lst, i,
    FUN = function(x) do.call(paste, c(expand.grid(x), sep = " & ")),
    simplify = FALSE
  )
}))

Filtering rows in a dataset by columns

I have the following table:
FN LN LN1 LN2 LN3 LN4 LN5
a b b x x x x
a c b d e NA NA
a d c a b x x
a e b c d x e
I'm filtering records for which LN is present in LN1 to LN5.
The code I used:
testFilter = filter(test, LN %in% c(LN1, LN2, LN3, LN4, LN5))
The result is not what I expect:
ï..FN LN LN1 LN2 LN3 LN4 LN5
1 a b b x x x x
2 a c b d e <NA> <NA>
3 a d c a b x x
4 a e b c d x e
I understand that c(LN1, LN2, LN3, LN4, LN5) gives: "b" "b" "c" "b" "x" "d" "a" "c" "x" "e" "b" "d" "x" NA "x" "x" "x" NA "x" "e" and know this is where the mistake is.
Ideally, I want to return only the 1st and 4th record.
FN LN LN1 LN2 LN3 LN4 LN5
a b b x x x x
a e b c d x e
I want to filter them only using column names. This is just a subset of 5.4M records.
There is an alternative approach using data.table and Reduce():
library(data.table)
# Columns LN1..LN5 to compare against LN.
cols <- sprintf("LN%d", 1:5)
# Inner call: row numbers (.I) where LN equals at least one non-missing value
# among `cols`; outer call: subset `test` to those rows.
setDT(test)[
  test[
    ,
    .I[Reduce(`|`, lapply(.SD, function(col) LN == col & !is.na(col)))],
    .SDcols = cols
  ]
]
FN LN LN1 LN2 LN3 LN4 LN5
1: a b b x x x x
2: a e b c d x e
Data
library(data.table)
# Example table from the question, read from an inline whitespace-separated
# string; the first line of the string supplies the column names.
test <- fread(
"FN LN LN1 LN2 LN3 LN4 LN5
a b b x x x x
a c b d e NA NA
a d c a b x x
a e b c d x e")
Benchmark
library(data.table)
library(dplyr)
# Benchmark of the row-filtering answers on synthetic data.
# Number of rows in the synthetic table.
n_row <- 1e6L
# Fixed seed so the sampled data is reproducible.
set.seed(123L)
# FN is constant; LN is a random lowercase letter per row.
DT <- data.table(
FN = "a",
LN = sample(letters, n_row, TRUE))
cols <- paste0("LN", 1:5)
# Add LN1..LN5 by reference, each sampled from the letters plus NA.
DT[, (cols) := lapply(1:5, function(x) sample(c(letters, NA), n_row, TRUE))]
DT
# Plain data.frame copy for the non-data.table approaches.
df1 <- as.data.frame(DT)
# Each named expression is one answer's approach; every one is timed 50 times.
bm <- microbenchmark::microbenchmark(
# Row-wise apply(): is field 2 (LN) among fields 3 to 7?
zx8754 = {
df1[ apply(df1, 1, function(i) i[2] %in% i[3:7]), ]
},
# Hard-coded column-by-column comparison; which() drops NA results.
eric = {
df1[ which(df1$LN == df1$LN1 |
df1$LN == df1$LN2 |
df1$LN == df1$LN3 |
df1$LN == df1$LN4 |
df1$LN == df1$LN5), ]
},
# data.table: OR-reduce the per-column matches, then subset by row number.
uwe = {
DT[DT[, .I[Reduce(`|`, lapply(.SD, function(x) !is.na(x) & LN == x))],
.SDcols = cols]]
},
# dplyr: keep rows where any of LN1..LN5 equals LN.
axe = {
filter_at(df1, vars(num_range("LN", 1:5)), any_vars(. == LN))
},
# rowSums over the comparison of LN with columns 3 to 7; !! maps counts > 0 to TRUE.
jaap = {df1[!!rowSums(df1$LN == df1[, 3:7], na.rm = TRUE),]},
times = 50L
)
# Report the timings in milliseconds.
print(bm, "ms")
Unit: milliseconds
expr min lq mean median uq max neval cld
zx8754 3120.68925 3330.12289 3508.03001 3460.83459 3589.10255 4552.9070 50 c
eric 69.74435 79.11995 101.80188 83.78996 98.24054 309.3864 50 a
uwe 93.26621 115.30266 130.91483 121.64281 131.75704 292.8094 50 a
axe 69.82137 79.54149 96.70102 81.98631 95.77107 315.3111 50 a
jaap 362.39318 489.86989 543.39510 544.13079 570.10874 1110.1317 50 b
For 1 M rows, the hard coded subsetting is the fastest, followed by the data.table/Reduce() and dplyr/filter_at approaches. Using apply() is 60 times slower.
# ggplot2 is needed for ggplot()/aes()/geom_violin(); it is not attached by
# the benchmark code above. mean_cl_boot additionally needs the Hmisc package
# to be installed.
library(ggplot2)
# Violin plot of the timing distribution per expression on a log10 y axis,
# overlaid with the bootstrapped mean and its confidence limits.
ggplot(bm, aes(expr, time)) +
  geom_violin() +
  scale_y_log10() +
  stat_summary(fun.data = mean_cl_boot)
A quick and very easy dplyr solution:
# Keep the rows of df1 where at least one of the columns LN1..LN5 equals LN.
filter_at(df1, vars(num_range("LN", 1:5)), any_vars(. == LN))
This is very similar in performance to the hard-coded answer by @EricFail, because internally it simply expands the call to:
filter(df1, (LN1 == LN) | (LN2 == LN) | (LN3 == LN) | (LN4 == LN) | (LN5 == LN))
Instead of num_range any other select helpers can be used within vars to easily select many variables based on their names. Or one can directly give column positions.
Using apply:
# data
# Example table from the question ("NA" entries are read as missing values).
df1 <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "
FN LN LN1 LN2 LN3 LN4 LN5
a b b x x x x
a c b d e NA NA
a d c a b x x
a e b c d x e"
)
# Row by row: keep the row when its 2nd field (LN) occurs among fields 3 to 7
# (LN1..LN5).
df1[apply(df1, 1, function(row) row[2] %in% row[3:7]), ]
# FN LN LN1 LN2 LN3 LN4 LN5
# 1 a b b x x x x
# 4 a e b c d x e
Note: Consider using other solutions below for big datasets, which can be 60 times faster than this apply solution.
not the simplest code, but
# Compare LN with each of LN1..LN5 explicitly; which() turns the logical
# result into row numbers and thereby drops rows whose comparison is NA.
df1[
  which(with(
    df1,
    LN == LN1 | LN == LN2 | LN == LN3 | LN == LN4 | LN == LN5
  )),
]
#> FN LN LN1 LN2 LN3 LN4 LN5
#> 1 a b b x x x x
#> 4 a e b c d x e
You could also use rowSums:
# Compare LN element-wise with each of columns 3 to 7, count the matches per
# row (NA comparisons are ignored via na.rm), and let !! turn a non-zero count
# into TRUE so that only rows with at least one match are kept.
df1[!!rowSums(df1$LN == df1[, 3:7], na.rm = TRUE),]
which gives:
FN LN LN1 LN2 LN3 LN4 LN5
1 a b b x x x x
4 a e b c d x e
For a benchmark, see the answer of @Uwe.

Column values based on another column

I have some data with this structure:
## Column examples generation
bases <- c("A", "T", "C", "G")
ID <- c(1, 2, 3, 4, 5, 6)
SNP <- rep(c("F1", "F3", "F4"), each = length(ID))
# Random alleles, one pair per (ID, SNP) row.
Al_1 <- sample(bases, length(SNP), replace = TRUE)
Al_2 <- sample(bases, length(SNP), replace = TRUE)
# Placeholder that is overwritten below.
tipo <- rep(".", length(SNP))
## Data frame generation:
# cbind() recycles ID across the SNP blocks and yields a character matrix;
# every column is then stored as a plain character vector.
ArrDat <- as.data.frame(cbind(ID, SNP, Al_1, Al_2, tipo))
ArrDat <- data.frame(lapply(ArrDat, as.character), stringsAsFactors = FALSE)
OrderArr <- ArrDat[order(ArrDat$ID), ]
## Column "tipo" values:
# Lookup from the concatenated allele pair (Al_1 then Al_2) to its code.
# Heterozygous pairs map to "a".."l", homozygous pairs to "STHG.<base>".
tipo_by_pair <- c(
  AT = "a", AC = "b", AG = "c",
  TA = "d", TC = "e", TG = "f",
  CA = "g", CT = "h", CG = "i",
  GA = "j", GT = "k", GC = "l",
  AA = "STHG.A", TT = "STHG.T", CC = "STHG.C", GG = "STHG.G"
)
allele_pair <- paste0(OrderArr$Al_1, OrderArr$Al_2)
OrderArr$tipo <- unname(tipo_by_pair[allele_pair])
# Any pair that is not in the lookup gets the fallback code "x".
OrderArr$tipo[is.na(OrderArr$tipo)] <- "x"
Here is an example of the data:
ID SNP Al_1 Al_2 tipo
1 1 F1 T A d
7 1 F3 C A g
13 1 F4 G C l
2 2 F1 T T STHG.T
8 2 F3 C C STHG.C
14 2 F4 C C STHG.C
My problem is with the OrderArr$tipo values for these Al_1-Al_2 combinations: A-A, T-T, C-C or G-G.
Rows with these combinations should get the same OrderArr$tipo value as the other rows that have the same OrderArr$SNP value, so the data shown above should become:
ID SNP Al_1 Al_2 tipo
1 1 F1 T A d
7 1 F3 C A g
13 1 F4 G C l
2 2 F1 T T d
8 2 F3 C C g
14 2 F4 C C l
How can I implement this at the code?
Thanks a lot.
I've created a data frame where each SNP has only one combination of Al_1 and Al_2.
ID SNP Al_1 Al_2 combo tipo
1 1 F1 A T AT a
2 1 F4 G G GG z
3 1 D2 C T CT h
4 1 D4 T C TC e
5 1 HY7 A A AA z
6 1 HY66 T G TG f
7 1 XZD1 C A CA g
8 1 XZD33 G A GA j
9 2 F1 A A AA z
10 2 F4 C G CG i
11 2 D2 C C CC z
12 2 D4 T C TC e
13 2 HY7 A A AA z
14 2 HY66 G G GG z
15 2 XZD1 C A CA g
16 2 XZD33 G A GA j
17 3 F1 T T TT z
18 3 F4 C C CC z
19 3 D2 C T CT h
20 3 D4 T C TC e
21 3 HY7 A C AC b
22 3 HY66 G G GG z
23 3 XZD1 A A AA z
24 3 XZD33 A A AA z
25 4 F1 A T AT a
26 4 F4 C G CG i
27 4 D2 C T CT h
28 4 D4 T T TT z
29 4 HY7 C C CC z
30 4 HY66 T T TT z
31 4 XZD1 C A CA g
32 4 XZD33 A A AA z
33 5 F1 T T TT z
34 5 F4 C G CG i
35 5 D2 T T TT z
36 5 D4 T T TT z
37 5 HY7 A A AA z
38 5 HY66 T G TG f
39 5 XZD1 A A AA z
40 5 XZD33 G G GG z
41 6 F1 A T AT a
42 6 F4 G G GG z
43 6 D2 T T TT z
44 6 D4 C C CC z
45 6 HY7 C C CC z
46 6 HY66 T T TT z
47 6 XZD1 C C CC z
48 6 XZD33 G A GA j
And I think I've one answer for your question.
# Concatenated allele pair per row, and the distinct SNP identifiers.
data$combo <- paste0(data$Al_1, data$Al_2)
snp <- unique(data$SNP)

# Step 1: code every row. Heterozygous pairs get their letter from the lookup;
# homozygous pairs (Al_1 == Al_2) get the placeholder "z". Rows whose pair is
# neither keep whatever `tipo` they had.
tipo_by_pair <- c(
  AT = "a", AC = "b", AG = "c",
  TA = "d", TC = "e", TG = "f",
  CA = "g", CT = "h", CG = "i",
  GA = "j", GT = "k", GC = "l"
)
if (is.null(data$tipo)) {
  data$tipo <- rep(NA_character_, nrow(data))
}
known_pair <- which(data$combo %in% names(tipo_by_pair))
data$tipo[known_pair] <- unname(tipo_by_pair[data$combo[known_pair]])
homozygous <- which(as.character(data$Al_1) == as.character(data$Al_2))
data$tipo[homozygous] <- "z"

ord.data <- data

# Step 2 helper: within the rows of one SNP, replace the "z" placeholder of
# homozygous A/T/C/G rows by the code used by that SNP's heterozygous rows.
fill_homozygous <- function(grp) {
  is_z <- grp$tipo %in% "z"
  letra <- unique(grp$tipo[!is_z & !is.na(grp$tipo)])
  target <- is_z & grp$Al_1 %in% c("A", "T", "C", "G")
  # With no heterozygous row there is nothing to copy from; "z" is kept
  # instead of failing on a zero-length replacement.
  if (any(target) && length(letra) > 0) {
    if (length(letra) > 1) {
      warning(
        "SNP ", grp$SNP[1], " has several heterozygous codes (",
        paste(letra, collapse = ", "), "); using the first",
        call. = FALSE
      )
    }
    grp$tipo[target] <- letra[1]
  }
  grp
}

# Process the SNP groups in factor-level order and stack them once at the end
# (unname() keeps the original row names instead of prefixing the SNP id).
by_snp <- split(ord.data, droplevels(as.factor(ord.data$SNP)))
ord.data2 <- if (length(by_snp) > 0) {
  do.call(rbind, unname(lapply(by_snp, fill_homozygous)))
} else {
  data.frame()
}

Multiple subsets

Could you suggest a more elegant solution to the following problem? Remove rows containing more than one 0 in columns x,z,y or a,b,c.
# Example data: x and c are all zero, z starts at 0 and a ends at 0.
df <- data.frame(
  x = rep(0, 5),
  y = 1:5,
  z = 0:4,
  a = rev(0:4),
  b = 1:5,
  c = rep(0, 5)
)
my solution (row 1 and row 5 should get removed)
# Keep a row only if, in each of the groups (x, y, z) and (a, b, c), at least
# two of the three columns are non-zero.
df_new <- subset(
  df,
  ((x != 0 & y != 0) | (x != 0 & z != 0) | (y != 0 & z != 0)) &
    ((a != 0 & b != 0) | (a != 0 & c != 0) | (b != 0 & c != 0))
)
# Positions 1:3 are the columns 'x', 'y', 'z'; similarly 4:6 are 'a', 'b', 'c'.
# The column names can also be specified explicitly instead of positions.
# Add na.rm = TRUE inside rowSums() in case the data also has missing values.
# Flag the rows that have more than one 0 in either column group:
(rowSums(df[, 1:3]==0)>1)|(rowSums(df[, 4:6]==0)>1)
# Negate the flag to keep only the rows that should not be removed:
df[!((rowSums(df[, 1:3]==0)>1)|(rowSums(df[, 4:6]==0)>1)),]
# x y z a b c
#2 0 2 1 3 2 0
#3 0 3 2 2 3 0
#4 0 4 3 1 4 0

Resources