Join Tables by Condition in R

Join Tables by Condition in R - r

I've two tables I would like to join by condition.
Data_1 <- data.frame(Section_A = c("a","a","a","a","a","b","b","b","b","b","b","c","c","c","c"),
ID1 = c(1,2,3,4,5,10,11,12,13,14,15,20,21,22,23),
Key = c("A","A","A","B","B","C","C","C","A","B","B","A","B","C","C"))
Data_2 <- data.frame(Section_B = c("d","d","d","d","d","d","e","e","e","f","f","f","f","f","f"),
ID2 = c(31,32,33,34,35,36,41,42,43,54,55,56,57,58,59),
Key = c("B","B","A","A","C","A","C","B","A","C","A","A","C","B","A"))
The following conditions must be satisfied when joining the tables.
Section_A "level a" can "receive" no more than 2 (count(ID2) for Section_A(a) <= 2)
Section_A "level b" can "receive" no more than 1 (count(ID2) for Section_A(b) <= 1)
Section_A "level c" can "receive" no more than 2 (count(ID2) for Section_A(c) <= 2)
Section_B "level d" can "give" no more than 2 (count(ID2) for Section_B(d) <= 2)
Section_B "level e" can "give" no more than 2 (count(ID2) for Section_B(e) <= 2)
Section_B "level f" can "give" no more than 1 (count(ID2) for Section_B(f) <= 1)
Any of ID1 and ID2 can't repeat itself in the joined table.
The expected outcome is:
Expected_Outcome <- data.frame(Section_A = c("a","a","b","c","c"),
ID1 = c(1,2,10,20,21),
Key = c("A","A","C","B","C"),
Section_B = c("d","d", "e","e","f"),
ID2 = c(33,34,41,42,57))

Related

optimize call function for each row of data table

We need to optimize a program will perform a treatment on each "nb_student"
depending on the value number, generate a list number corresponding to the value entered.
Once this list is obtained, another program will have to count according to a ranking rule.
ranking rule
if number of student :
is less than 1 => increment group A
is between 1 and 3 => increment group B
is between 3 and 4 => increment group C
is greater than 4 => increment group D
initial data
"category_name" "nb_student"
A 6,00000
A 10,00000
B 12,0000
C 74,0000
D 6,00000
init data code
DT = data.table(
category_name = c("A","B","C","D"),
nb_student = c(6,12,74,6)
)
function for each row
treatment_group <- function(nb_student){
nb_group_A = nb_group_B = nb_groupe_C = nb_groupe_D <- 0
limit_1 <- 1
limit_2 <- 3
limit_3 <- 4
list <- runif(nb_student, 0, 5)
for (i in list) {
if(i < limit_1){
nb_group_A <- nb_group_A + 1
}else if(i > limit_1 & i < limit_2){
nb_group_B <- nb_group_B + 1
}else if(i > limit_3){
nb_groupe_C <- nb_groupe_C + 1
}else {
nb_groupe_D <- nb_groupe_D + 1
}
}
list(nb_group_A, nb_group_B, nb_groupe_C, nb_groupe_D)
}
result
DT[ , c("group A", "group B", "group C", "group D") := tratment_group(nb_student), by = seq_len(nrow(DT))]
The final result must match this table
"category_name" "nb_student" "group A" "group B" "group C" "group D"
A 6,00000 0,00000 2,00000 4,00000 0,00000
A 10,00000 3,00000 3,00000 4,00000 0,00000
B 12,0000 2,00000 9,00000 0,00000 1,00000
C 74,0000 14,0000 29,0000 15,0000 16,0000
D 6,00000 0,00000 1,00000 3,00000 2,00000
this code works, but i want to optimize it to run with 200000 rows. Maybe using parallelization ?

I guess you can try findInterval
set.seed(1)
DT[
,
c(
.SD,
as.data.frame(
t(as.matrix(table(
factor(
findInterval(runif(nb_student, 0, 5), c(1, 3, 4)) + 1,
levels = 1:4,
label = paste("group", LETTERS[1:4])
)
)))
)
),
category_name
]
which gives
category_name nb_student group A group B group C group D
1: A 6 0 4 0 2
2: B 12 2 3 5 2
3: C 74 11 35 17 11
4: D 6 0 2 3 1

Report all possible combinations of a string-separated vector

In the tidyverse I would like to mutate/expand a string vector so that all possible combinations of elements (separated by " & ") are reported, one for each line.
I tried decomposing my function using t(combn(unlist(strsplit(x, " & ")),2)), but fails when there is no " & ".
In the example:
"A" remains "A" (or becomes "A & A")
"A & B" remains "A & B"
"C & D & E" becomes "C & D", "C & E" and "D & E" in three different rows
Note (1): I cannot predict the number of combinations in advance "A & B & C & D..."
Note (2): Order is not important (i.e. "C & D" == "D & C")
Note (3): This would feed into a separate function and be used in a igraph application.
Thanks in advance.
data <- data.frame(names=c(1:3), combinations=c("A","A & B","C & D & E"))
names combinations
1 1 A
2 2 A & B
3 3 C & D & E
expected <- data.frame(projects=c(1,2,3,3,3), combinations=c("A","A & B","C & D","C & E","D & E"))
projects combinations
1 1 A
2 2 A & B
3 3 C & D
4 3 C & E
5 3 D & E

You can use combn to create combinations within each name :
library(dplyr)
library(tidyr)
data %>%
separate_rows(combinations, sep = ' & ') %>%
group_by(names) %>%
summarise(combinations = if(n() > 1)
combn(combinations, 2, paste0, collapse = ' & ') else combinations) %>%
ungroup
# names combinations
# <int> <chr>
#1 1 A
#2 2 A & B
#3 3 C & D
#4 3 C & E
#5 3 D & E

A data.table option
setnames(
setDT(data)[
,
{
s <- unlist(strsplit(combinations, " & "))
if (length(s) == 1) s else combn(s, 2, paste0, collapse = " & ")
},
names
], "V1", "combinations"
)[]
gives
names combinations
1: 1 A
2: 2 A & B
3: 3 C & D
4: 3 C & E
5: 3 D & E

Using data.table method
library(splitstackshape)
setnames(cSplit(data, 'combinations', sep=' & ', 'long', type.convert = FALSE)[,
if(.N > 1) combn(combinations, 2, FUN = paste, collapse = ' & ') else
combinations, names], 'V1', 'combinations')[]
# names combinations
#1: 1 A
#2: 2 A & B
#3: 3 C & D
#4: 3 C & E
#5: 3 D & E

subsets of different vectors R

I have three vectors as shown below.
q = c("a == 1", "a == 2", "a == 3")
w = c("b >= 50", "b >= 100")
t = c("c >= 40 & c <= 80", "c > 80")
I want to be able to combine all the vectors into one large vector so that every possible subset is in a larger vector. For example I want to have
("a == 1 & b >= 50", "a == 1 & b >= 100", "a ==2 & b >=50",
"a == 2 & b >= 100", "a == 3 & b >= 50", "a == 3 & b >= 100",
"a ==1 & c >= 40 & c <= 80", "a ==1 & c > 80",
"a ==2 & c >= 40 & c <= 80", "a ==2 & c > 80",
"a ==3 & c >= 40 & c <= 80", "a ==3 & c > 80",
"b >= 50 & c >= 40 & c <= 80", "b >= 50 & c > 80",
"b >= 100 & c >= 40 & c <= 80", "b >= 100 & c > 80",
"a == 1 & b >= 50 & c >= 40 & c <= 80", "a == 1 & b >=50 & c > 80",
"a == 2 & b >= 50 & c >= 40 & c <= 80", "a == 2 & b >=50 & c > 80",
"a == 3 & b >= 50 & c >= 40 & c <= 80", "a == 3 & b >=50 & c > 80")
"a == 2 & b >= 100 & c >= 40 & c <= 80", "a == 2 & b >=100 & c > 80",
"a == 3 & b >= 100 & c >= 40 & c <= 80", "a == 3 & b >=100 & c > 80")
So I need every subset to be created and joined with the "&" sign but I don't want to be comparing any element in the same vector. I also have three vectors in this example but the number of vectors should be variable. Does anyone know how to achieve this? Thanks!

We can create strings using expand.grid and combn. Create a combn of list ('lst') elements picking 2 or 3 in a list (using lapply), expand the list elements into a data.frame and paste with do.call (specifying the sep as " & ")
lst <- list(q w, t)
unlist( lapply(2:3, function(i) combn(lst, i,
FUN = function(x) do.call(paste, c(expand.grid(x), sep = " & ")),
simplify = FALSE)))

Optimizing matching in R

Hoping someone can help. I have a ton of ortholog mapping to do in R, which is proving to be incredibly time consuming. I've posted an example structure below. Obvious answers such as iterating line by line (for i in 1:nrow(df)) and string splitting, or using sapply have been tried and are incredibly slow. I am therefore hoping for a vectorized option.
stringsasFactors = F
# example accession mapping
map <- data.frame(source = c("1", "2 4", "3", "4 6 8", "9"),
target = c("a b", "c", "d e f", "g", "h i"))
# example protein list
df <- data.frame(sourceIDs = c("1 2", "3", "4", "5", "8 9"))
# now, map df$sourceIDs to map$target
# expected output
> matches
[1] "a b c" "d e f" "g" "" "g h i"
I appreciate any help!

In most cases, the best approach to this kind of problem is to create data.frames with one observation per row.
map_split <- lapply(map, strsplit, split = ' ')
long_mappings <- mapply(expand.grid, map2$source, map2$target, SIMPLIFY = FALSE)
all_map <- do.call(rbind, long_mappings)
names(all_map) <- c('source', 'target')
Now all_map looks like this:
source target
1 1 a
2 1 b
3 2 c
4 4 c
5 3 d
6 3 e
7 3 f
8 4 g
9 6 g
10 8 g
11 9 h
12 9 i
Doing the same for df...
sourceIDs_split <- strsplit(df$sourceIDs, ' ')
df_long <- data.frame(
index = rep(seq_along(sourceIDs_split), lengths(sourceIDs_split)),
source = unlist(sourceIDs_split)
)
Give us this for df_long:
index source
1 1 1
2 1 2
3 2 3
4 3 4
5 4 5
6 5 8
7 5 9
Now they just need to be merged and collapsed.
matches <- merge(df_long, all_map, by = 'source', all.x = TRUE)
tapply(
matches$target,
matches$index,
function(x) {
paste0(sort(x), collapse = ' ')
}
)
# 1 2 3 4 5
# "a b c" "d e f" "c g" "" "g h i"

split and table

I have a data frame like this:
GN SN
a b
a b
a c
d e
d f
d e
I would like the following output:
GN: a SN: 2 b 1 c
GN d SN: 2 e 1 f
In other words I would like to have a sort of table() of the data.frame on the column S.N. First of all I splitted the data.frame according to $GN, so I have blocks. At this point I' m not able to have the counting of the elements on column SN according to the split I've done. Is the "apply" function a way to do this? And how can i save a general output belonging from split function?
Thanks in advance

With your data:
df <- data.frame(GN = rep(c("a","b"), each = 3),
SN = c(rep("b", 2), "c", "e", "f", "e"))
We could do:
> lapply(with(df, split(SN, GN)), table)
$a
b c e f
2 1 0 0
$b
b c e f
0 0 2 1
But if you don't want all the levels (the 0 entries) then we need to drop the empty levels:
> lapply(with(df, split(SN, GN)), function(x) table(droplevels(x)))
$a
b c
2 1
$b
e f
2 1
Writing out the individual tables to a file
This isn't perfect but at least you can work with it
## save tables
tmp <- lapply(with(df, split(SN, GN)), function(x) table(droplevels(x)))
## function to write output to file `fname`
foo <- function(x, fname) {
cat(paste(names(x), collapse = " "), "\n", file = fname, append = TRUE)
cat(paste(x, collapse = " "), "\n", file = fname, append = TRUE)
invisible()
}
fname <- "foo.txt"
file.create(fname) # create file fname
lapply(tmp, foo, fname = fname) # run our function to write to fname
That gives:
R> readLines(fname)
[1] "b c " "2 1 " "e f " "2 1 "
or from the OS:
$ cat foo.txt
b c
2 1
e f
2 1

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Join Tables by Condition in R - r

Related

optimize call function for each row of data table

Report all possible combinations of a string-separated vector

subsets of different vectors R

Optimizing matching in R

split and table

Categories

Resources