Combining sequences with similar gene IDs - r

I have a list of gene IDs along with their sequences in R.
$2435
[1]"ATGCGGGCGGGGGTCGTCGA"
$2435
[1]"ATGCGGCGCGCGCGCTATATACGC"
$2435
[1]"ATGCGGCGCCTCTCATCGCGGGGG"
I want to combine the sequences with the same gene IDs in that list in R.
$2435
[1]"ATGCGGGCGGGGGTCGTCGAATGCGGCGCGCGCGCTATATACGCATGCGGCGCCTCTCATCGCGGGGG"

Use lapply after matching the names with unique. Here's some sample data:
A <- list("12" = "AAAABBBBCCCCDDDD",
"34" = "GGGG",
"12" = "XXXXXXXXXXXXXXXXXXXXXXX",
"10" = "FFFFGGGG",
"10" = "HHHHIIII")
A
# $`12`
# [1] "AAAABBBBCCCCDDDD"
#
# $`34`
# [1] "GGGG"
#
# $`12`
# [1] "XXXXXXXXXXXXXXXXXXXXXXX"
#
# $`10`
# [1] "FFFFGGGG"
#
# $`10`
# [1] "HHHHIIII"
Subset the related names and paste them together.
lapply(unique(names(A)), function(x) paste(A[names(A) %in% x], collapse = ""))
# [[1]]
# [1] "AAAABBBBCCCCDDDDXXXXXXXXXXXXXXXXXXXXXXX"
#
# [[2]]
# [1] "GGGG"
#
# [[3]]
# [1] "FFFFGGGGHHHHIIII"

l <- list("A" = "ABC", "B" = "XYX", "A" = "DEF", "C" = "YZY", "A" = "GHI")
tapply(l, names(l), paste, collapse = "", simplify = FALSE)
# $A
# [1] "ABCDEFGHI"
#
# $B
# [1] "XYX"
#
# $C
# [1] "YZY"

Bonus:
For a dataframe output, use this:
aggregate(unlist(A), by=list(id=names(A)), paste, collapse="")
Where A is your list.
Using @Ananda's A, I get this:
id x
1 10 FFFFGGGGHHHHIIII
2 12 AAAABBBBCCCCDDDDXXXXXXXXXXXXXXXXXXXXXXX
3 34 GGGG

Related

Identifying patterns in two strings in R

I want to evaluate whether ColA contains a string that is not in ColB. However, I am not interested in certain strings, for example, oil. I would like to have an indicator variable as follows:
ColA ColB Ind
-------------------------- ------------------------ -----
coconut+grape+pine grape+coconut TRUE
orange+apple+grape+pine grape+coconut TRUE
grape+pine grape+oil TRUE
oil+grape grape+apple FALSE
grape grape+oil FALSE
grape+pine grape+orange+pine FALSE
Any Suggestions using R?
Many thanks!
Since we need to split the strings, we'll start with strsplit,
strsplit(dat$ColA, '+', fixed = TRUE)
# [[1]]
# [1] "coconut" "grape" "pine"
# [[2]]
# [1] "orange" "apple" "grape" "pine"
# [[3]]
# [1] "grape" "pine"
# [[4]]
# [1] "oil" "grape"
# [[5]]
# [1] "grape"
# [[6]]
# [1] "grape" "pine"
From here, we want to determine what is in ColA that is not in ColB. I'll use Map to run setdiff on each set (ColA's [[1]] with ColB's [[1]], etc).
Map(setdiff, strsplit(dat$ColA, '+', fixed = TRUE), strsplit(dat$ColB, '+', fixed = TRUE))
# [[1]]
# [1] "pine"
# [[2]]
# [1] "orange" "apple" "pine"
# [[3]]
# [1] "pine"
# [[4]]
# [1] "oil"
# [[5]]
# character(0)
# [[6]]
# character(0)
To determine which one has "new words", we can just check for non-zero length using lengths(.) > 0:
lengths(Map(setdiff, strsplit(dat$ColA, '+', fixed = TRUE), strsplit(dat$ColB, '+', fixed = TRUE))) > 0
# [1] TRUE TRUE TRUE TRUE FALSE FALSE
But since you don't care about oil, we need to remove that as well.
lapply(Map(setdiff, strsplit(dat$ColA, '+', fixed = TRUE), strsplit(dat$ColB, '+', fixed = TRUE)), setdiff, "oil")
# [[1]]
# [1] "pine"
# [[2]]
# [1] "orange" "apple" "pine"
# [[3]]
# [1] "pine"
# [[4]]
# character(0)
# [[5]]
# character(0)
# [[6]]
# character(0)
lengths(lapply(Map(setdiff, strsplit(dat$ColA, '+', fixed = TRUE), strsplit(dat$ColB, '+', fixed = TRUE)),
setdiff, "oil")) > 0
# [1] TRUE TRUE TRUE FALSE FALSE FALSE
@akrun suggested a tidyverse variant:
library(dplyr)
library(purrr) # map2_lgl
library(stringr) # str_extract_all
dat %>%
mutate(
new = map2_lgl(
str_extract_all(ColB, "\\w+"), str_extract_all(ColA, "\\w+"),
~ !all(setdiff(.y, "oil") %in% .x)
)
)
# ColA ColB Ind new
# 1 coconut+grape+pine grape+coconut TRUE TRUE
# 2 orange+apple+grape+pine grape+coconut TRUE TRUE
# 3 grape+pine grape+oil TRUE TRUE
# 4 oil+grape grape+apple FALSE FALSE
# 5 grape grape+oil FALSE FALSE
# 6 grape+pine grape+orange+pine FALSE FALSE
Data
dat <- structure(list(ColA = c("coconut+grape+pine", "orange+apple+grape+pine", "grape+pine", "oil+grape", "grape", "grape+pine"), ColB = c("grape+coconut", "grape+coconut", "grape+oil", "grape+apple", "grape+oil", "grape+orange+pine"), Ind = c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE)), class = "data.frame", row.names = c(NA, -6L))
Here's a solution similar to r2evans's that calls strsplit only once with the help of do.call.
# Helper: return x without any element that also appears in `z`.
# `z` is not a parameter; it is looked up when rid() is called, so the
# assignment on the following line is enough for it to work.
rid <- function(x) x[!x %in% z] ## helper FUN to get rid of the oil
# Word(s) to ignore when deciding whether a row contains something new.
z <- "oil"
# Split every column of dat on a literal "+" in a single sapply() pass.
# The code below treats the result as a 2-D structure (nrow(L), L[x,]),
# i.e. one row per row of dat and one column per column of dat.
L <- sapply(unname(dat), strsplit, "\\+")
# For each row: strip `z` from both cells with rid(), reverse the pair so the
# second column comes first, and take setdiff(second, first). The row is
# flagged TRUE when that difference is non-empty.
dat$ind <- sapply(1:nrow(L), function(x) length(do.call(setdiff, rev(Map(rid, L[x,]))))) > 0
dat
# V1 V2 ind
# 1 grape+coconut coconut+grape+pine TRUE
# 2 grape+coconut orange+apple+grape+pine TRUE
# 3 grape+oil grape+pine TRUE
# 4 grape+apple oil+grape FALSE
# 5 grape+oil grape FALSE
# 6 grape+orange+pine grape+pine FALSE
Data:
dat <- structure(list(V1 = c("grape+coconut", "grape+coconut", "grape+oil",
"grape+apple", "grape+oil", "grape+orange+pine"), V2 = c("coconut+grape+pine",
"orange+apple+grape+pine", "grape+pine", "oil+grape", "grape",
"grape+pine")), row.names = c(NA, -6L), class = "data.frame")

How to merge two lists based on object indices - keeping attributes?

I want to merge two lists keeping the index of each object:
mylist<-list(1,NULL,2)
otherlist<-list(NULL,3,NULL,4,5,6)
# Desired
list(1,3,2,4,5,6)
# my try:
suppressWarnings(mapply(c, mylist, otherlist) )
Answer should be universal
EDIT: In order to avoid proliferation of similar questions. I decided to request here also the possibility of keeping attributes (preferably with base).
mylist<-list(1,NULL,2)
attr(mylist[[1]],"at")<-"a"
attr(mylist[[3]],"at")<-"c"
otherlist<-list(NULL,3,NULL,4,5,6)
attr(otherlist[[2]],"at")<-"b"
attr(otherlist[[4]],"at")<-"d"
attr(otherlist[[5]],"at")<-"e"
attr(otherlist[[6]],"at")<-"f"
Here is an option where we create a logical index with lengths (which returns 0 for a NULL element) and use it to assign the unlisted elements of mylist to those positions
otherlist[lengths(otherlist) == 0] <- unlist(mylist)
otherlist
#[[1]]
#[1] 1
#[[2]]
#[1] 3
#[[3]]
#[1] 2
#[[4]]
#[1] 4
#[[5]]
#[1] 5
#[[6]]
#[1] 6
If we need to use Map, make sure the lengths are the same for the corresponding elements
otherlist[seq_along(mylist)] <- Map(c, otherlist[seq_along(mylist)], mylist)
Update
For the updated example
i1 <- sapply(otherlist, is.null)
i2 <- !sapply(mylist, is.null)
otherlist[i1] <- mylist[i2]
otherlist
#[[1]]
#[1] 1
#attr(,"at")
#[1] "a"
#[[2]]
#[1] 3
#attr(,"at")
#[1] "b"
#[[3]]
#[1] 2
#attr(,"at")
#[1] "c"
#[[4]]
#[1] 4
#attr(,"at")
#[1] "d"
#[[5]]
#[1] 5
#attr(,"at")
#[1] "e"
#[[6]]
#[1] 6
#attr(,"at")
#[1] "f"
# Merge two lists position by position.
#
# The result is as long as the longer input. Each position takes the element
# of `l1` when that element has non-zero length; otherwise it takes the
# element of `l2` at the same position. Elements are copied as list elements,
# so any attributes they carry are kept.
#
# l1: list whose non-empty elements take priority.
# l2: list used to fill the positions that are empty after copying `l1`.
# Returns an unnamed list of length max(length(l1), length(l2)).
foo <- function(l1, l2) {
  n <- max(length(l1), length(l2))
  out <- vector("list", n)
  # Lay down l1 first; positions beyond its end stay NULL.
  out[seq_along(l1)] <- l1
  # Zero-length slots (NULL or otherwise empty) are the ones to back-fill.
  empty <- lengths(out) == 0L
  out[empty] <- l2[empty]
  out
}
foo(mylist, otherlist2)
# [[1]]
# [1] 1
# attr(,"at")
# [1] "a"
#
# [[2]]
# [1] 3
# attr(,"at")
# [1] "b"
#
# [[3]]
# [1] 2
# attr(,"at")
# [1] "c"
#
# [[4]]
# [1] 5
# attr(,"at")
# [1] "e"
#
# [[5]]
# [1] 6
# attr(,"at")
# [1] "f"

Propagate pairs into groups

I have the following data.frame:
df <- data.frame(V1 = c("A","X","A","Z","B","Y"),
V2 = c("B","Y","C","Y","C","W"),
stringsAsFactors=FALSE)
df
# V1 V2
# 1 A B
# 2 X Y
# 3 A C
# 4 Z Y
# 5 B C
# 6 Y W
I want to group all the values that occur together at some point and get the following:
list(c("A","B","C"), c("X","Y","Z","W"))
# [[1]]
# [1] "A" "B" "C"
#
# [[2]]
# [1] "X" "Y" "Z" "W"
Network analyses can help.
library(igraph)
df <- data.frame(V1 = c("A","X","A","Z","B","Y"),
V2 = c("B","Y","C","Y","C","W"),
stringsAsFactors=FALSE)
# Build an undirected graph in which every (V1, V2) row of df is an edge, so
# values that ever occur together end up in the same connected component.
g <- graph_from_data_frame(df, directed = FALSE)
# components() is the current igraph name for the connected-components search;
# clusters() is the legacy alias and is deprecated in recent igraph releases.
# The returned list has the same $membership element (named by vertex).
clust <- components(g)
# One row per vertex: its name and the id of the component it belongs to.
clusters <- data.frame(name = names(clust$membership),
                       cluster = clust$membership,
                       row.names = NULL,
                       stringsAsFactors = FALSE)
# Group the vertex names by component id.
split(clusters$name, clusters$cluster)
$`1`
[1] "A" "B" "C"
$`2`
[1] "X" "Z" "Y" "W"
You can of course leave everything in the cluster data.frame for further analyses.

Convert 3D-dataframe to 2D-dataframe in R

I have a three dimensional excel table which I would like to convert into a two dimensional dataframe that I can use in R. I think the best way is to read it in R and then transform it directly within R, but I do not find how. Here is an example. I have a df1-like dataframe that I want to transform to df2:
a1 <- paste("a","b","c",sep = ";")
a2 <- paste("e","f","g",sep = ";")
df1 <- data.frame(v1=a1, v2=a2, row.names = "w1")
df2 <- data.frame(w1=c(rep("v1",3),rep("v2",3)), "value"=letters[1:6])
You can achieve this with strsplit() and melt() from the reshape2 package (load it first with library(reshape2)):
sub_df1 <- apply(df1,2,FUN= strsplit,split = ";")
# $v1
# $v1$w1
# [1] "a" "b" "c"
# $v2
# $v2$w1
# [1] "e" "f" "g"
sub_df2 <- sapply(apply(df1,2,FUN= strsplit,split = ";"), FUN = unlist,use.names = TRUE, recursive = FALSE)
# v1 v2
# w11 "a" "e"
# w12 "b" "f"
# w13 "c" "g"
melt(sub_df2)[-1]
# Var2 value
# 1 v1 a
# 2 v1 b
# 3 v1 c
# 4 v2 e
# 5 v2 f
# 6 v2 g
You can then delete the first column by adding the [-1]

Using rbind()/cbind() to append single row data in R

I have 6 numeric lists, each containing a different number of values (e.g. [1:350], [1:450], ...). I am trying to append all of these lists into a single list, e.g. [1:1050], using rbind(), but the output I get is a data frame of [1:350, 1:6].
Can someone please help me with this.
To concatenate multiple lists, you can use c()
x <- list(1, 2:5)
y <- list("A", "B")
z <- list(letters[1:5])
c(x, y, z)
# [[1]]
# [1] 1
#
# [[2]]
# [1] 2 3 4 5
#
# [[3]]
# [1] "A"
#
# [[4]]
# [1] "B"
#
# [[5]]
# [1] "a" "b" "c" "d" "e"

Resources