Propagate pairs into groups - r

I have the following data.frame:
df <- data.frame(V1 = c("A","X","A","Z","B","Y"),
V2 = c("B","Y","C","Y","C","W"),
stringsAsFactors=FALSE)
df
# V1 V2
# 1 A B
# 2 X Y
# 3 A C
# 4 Z Y
# 5 B C
# 6 Y W
I want to group all the values that occur together at some point and get the following:
list(c("A","B","C"), c("X","Y","Z","W"))
# [[1]]
# [1] "A" "B" "C"
#
# [[2]]
# [1] "X" "Y" "Z" "W"

Network analyses can help.
library(igraph)

# Edge list: each row links two values that occur together.
df <- data.frame(
  V1 = c("A", "X", "A", "Z", "B", "Y"),
  V2 = c("B", "Y", "C", "Y", "C", "W"),
  stringsAsFactors = FALSE
)

# Treat every row as an undirected edge; values linked directly or through a
# chain of rows end up in the same connected component.
g <- graph_from_data_frame(df, directed = FALSE)

# components() replaces the deprecated clusters(); its $membership element is
# a named vector mapping each vertex name to a component id.
comp <- components(g)
cluster_df <- data.frame(
  name = names(comp$membership),
  cluster = comp$membership,
  row.names = NULL,
  stringsAsFactors = FALSE
)

# One character vector of member names per component.
split(cluster_df$name, cluster_df$cluster)
$`1`
[1] "A" "B" "C"
$`2`
[1] "X" "Z" "Y" "W"
You can of course leave everything in the cluster data.frame for further analyses.

Related

Exporting dataframe by write.csv

I have data like the following:
class.df <- data.frame(
A = sample(1:2, 100, replace=TRUE),
B = sample(1:2, 100, replace=TRUE),
C = sample(1:2, 100, replace=TRUE),
D = sample(1:2, 100, replace=TRUE)
)
ids_df <- t(combn(names(class.df), 2))
fisher_tests <- apply(ids_df, 1, function(i) tryCatch(fisher.test(table(class.df[,i])), error = function(e) NA_real_))
edge_table <- cbind(ids_df, t(sapply(fisher_tests, "[", c("p.value", "estimate"))))
edge_table
p.value estimate
[1,] "A" "B" 0.6826874 0.7919741
[2,] "A" "C" 0.6873498 1.219358
[3,] "A" "D" 0.5473441 0.7356341
[4,] "B" "C" 0.6828843 0.8164863
[5,] "B" "D" 1 1.033625
[6,] "C" "D" 0.2257244 0.5789776
write.csv(edge_table,"/Users/Results/EE2.csv")
But when I write the table with write.csv and open the CSV file, the last column (estimate) shows odd-looking matrix values instead of plain numbers. I just need the numerical values as they are shown in R. How can I resolve this?
Your edge_table object is an array that contains list columns due to the way you have created it. If you want a standard data frame, you can try:
# Turn the object built with cbind() into a data frame (one column per
# original column).
edge_table <- as.data.frame(edge_table)
# Flatten every column with unlist(); assigning into edge_table[] replaces the
# column contents while keeping the data frame and its column names.
edge_table[] <- lapply(edge_table, unlist)
# Write the table without a row-name column, then read it back to check it.
write.csv(edge_table, 'test.csv', row.names = FALSE)
read.csv('test.csv')
#> V1 V2 p.value estimate
#> 1 A B 0.2372953 0.6160489
#> 2 A C 0.3164033 1.6070638
#> 3 A D 0.3238934 1.5017413
#> 4 B C 0.8411522 1.1562830
#> 5 B D 0.6882391 0.8017990
#> 6 C D 0.5514280 1.2965724
Created on 2022-12-06 with reprex v2.0.2

Find all cycles in data

I have data with 'from' and 'to' columns:
df = data.frame(from = c('A','A','X','E','B','W','C','Y'),
to = c('B','E','Y','C','A','X','A','W'))
I'd like to identify all 'from-to' sequences, spanning two or more rows, that start and end on the same value. An easy one would be A-B-A:
# df
# from to
# 1 A B # 1. From A to B
# 2 A E
# 3 X Y
# 4 E C
# 5 B A # 2. From B and back to the starting point A, completing the sequence A-B-A
# 6 W X
# 7 C A
# 8 Y W
Another one:
# df
# from to
# 1 A B
# 2 A E # 1.
# 3 X Y
# 4 E C # 2.
# 5 B A
# 6 W X
# 7 C A # 3. -> Thus: A - E - C - A
# 8 Y W
There is also e.g. X - Y - W - X
How can I find such cycles?
Here is another option:
library(igraph)

# Build the graph from the 'from'/'to' edge list `df` defined in the question
# (graph_from_data_frame() creates a directed graph by default).
g <- graph_from_data_frame(df)

# https://lists.nongnu.org/archive/html/igraph-help/2009-04/msg00125.html
# Return every match of a directed ring on `k` vertices inside `graph`;
# each match corresponds to a cycle of length `k`.
find.cycles <- function(graph, k) {
  ring <- make_ring(k, directed = TRUE)
  subgraph_isomorphisms(ring, graph)
}

# Find all cycles: try every ring size from 1 up to the number of distinct
# vertices, and flatten the per-size results into one list of matches.
N <- length(unique(unlist(df)))
l <- unlist(lapply(seq_len(N), find.cycles, graph = g), recursive = FALSE)

# Extract the vertex names of each cycle, repeating the first name at the end
# to close the loop. Single-vertex matches (k = 1) yield NULL and are removed.
Filter(Negate(is.null), lapply(l, function(e) {
  if (length(e) > 1L) {
    nm <- names(e)
    c(nm, nm[1L])
  }
}))
output:
[[1]]
[1] "A" "B" "A"
[[2]]
[1] "B" "A" "B"
[[3]]
[1] "A" "E" "C" "A"
[[4]]
[1] "X" "Y" "W" "X"
[[5]]
[1] "E" "C" "A" "E"
[[6]]
[1] "W" "X" "Y" "W"
[[7]]
[1] "C" "A" "E" "C"
[[8]]
[1] "Y" "W" "X" "Y"
Reference:
Re: [igraph] Help - find cycles by Gábor Csárdi

Convert 3D-dataframe to 2D-dataframe in R

I have a three dimensional excel table which I would like to convert into a two dimensional dataframe that I can use in R. I think the best way is to read it in R and then transform it directly within R, but I do not find how. Here is an example. I have a df1-like dataframe that I want to transform to df2:
a1 <- paste("a","b","c",sep = ";")
a2 <- paste("e","f","g",sep = ";")
df1 <- data.frame(v1=a1, v2=a2, row.names = "w1")
df2 <- data.frame(w1=c(rep("v1",3),rep("v2",3)), "value"=letters[1:6])
You can achieve this by splitting the cells and then reshaping with melt() from the reshape2 package:
library(reshape2)  # provides melt()

# Split every cell on ";": one list entry per column, keyed by row name.
sub_df1 <- apply(df1, 2, FUN = strsplit, split = ";")
# $v1
# $v1$w1
# [1] "a" "b" "c"
# $v2
# $v2$w1
# [1] "e" "f" "g"

# Flatten each column's pieces; sapply() binds them into a character matrix
# with one column per original column (reuses sub_df1 instead of recomputing).
sub_df2 <- sapply(sub_df1, FUN = unlist, use.names = TRUE, recursive = FALSE)
# v1 v2
# w11 "a" "e"
# w12 "b" "f"
# w13 "c" "g"

# Reshape to long format; [-1] drops the first column of the melted result.
melt(sub_df2)[-1]
# Var2 value
# 1 v1 a
# 2 v1 b
# 3 v1 c
# 4 v2 e
# 5 v2 f
# 6 v2 g
The trailing [-1] deletes the first column of the melted result, leaving only the Var2 and value columns shown above.

How to add a list to a data frame in R?

I have 2 tables as below:
a = read.table(text=' a b
1 c
1 d
2 c
2 a
2 b
3 a
', head=T)
b = read.table(text=' a c
1 x i
2 y j
3 z k
', head=T)
And I want result to be like this:
1 x i c d
2 y j c a b
3 z k a
Originally I thought to use tapply to transform them to lists (eg. aa = tapply(a[,2], a[,1], function(x) paste(x,collapse=","))), then append it back to table b, but I got stuck...
Any suggestion to do this?
Thanks a million.
One way to do it:
# Values of a$b grouped by the key in a$a.
a_by_key <- split(as.character(a$b), a$a)

# Each row of b as a plain character vector, kept in b's own row order.
keys <- row.names(b)
b_rows <- lapply(seq_len(nrow(b)), function(i) {
  as.character(unlist(b[i, ], use.names = FALSE))
})

# Pair the two pieces by key rather than by position: split() orders
# character row names as "1", "10", "2" but numeric keys as 1, 2, 10, so
# positional pairing mismatches rows once there are ten or more keys.
# A key with no rows in `a` contributes nothing (c(row, NULL) is just row).
setNames(
  Map(function(row, extra) c(row, extra), b_rows, unname(a_by_key[keys])),
  keys
)
# $`1`
# [1] "x" "i" "c" "d"
#
# $`2`
# [1] "y" "j" "c" "a" "b"
#
# $`3`
# [1] "z" "k" "a"

Combining sequences with similar gene IDs

I have a list of gene IDs along with their sequences in R.
$2435
[1]"ATGCGGGCGGGGGTCGTCGA"
$2435
[1]"ATGCGGCGCGCGCGCTATATACGC"
$2435
[1]"ATGCGGCGCCTCTCATCGCGGGGG"
I want to combine the sequences with the same gene IDs in that list in R.
$2435
[1]"ATGCGGGCGGGGGTCGTCGAATGCGGCGCGCGCGCTATATACGCATGCGGCGCCTCTCATCGCGGGGG"
Use lapply after matching the names with unique. Here's some sample data:
A <- list("12" = "AAAABBBBCCCCDDDD",
"34" = "GGGG",
"12" = "XXXXXXXXXXXXXXXXXXXXXXX",
"10" = "FFFFGGGG",
"10" = "HHHHIIII")
A
# $`12`
# [1] "AAAABBBBCCCCDDDD"
#
# $`34`
# [1] "GGGG"
#
# $`12`
# [1] "XXXXXXXXXXXXXXXXXXXXXXX"
#
# $`10`
# [1] "FFFFGGGG"
#
# $`10`
# [1] "HHHHIIII"
Subset the related names and paste them together.
lapply(unique(names(A)), function(x) paste(A[names(A) %in% x], collapse = ""))
# [[1]]
# [1] "AAAABBBBCCCCDDDDXXXXXXXXXXXXXXXXXXXXXXX"
#
# [[2]]
# [1] "GGGG"
#
# [[3]]
# [1] "FFFFGGGGHHHHIIII"
l <- list("A" = "ABC", "B" = "XYX", "A" = "DEF", "C" = "YZY", "A" = "GHI")
tapply(l, names(l), paste, collapse = "", simplify = FALSE)
# $A
# [1] "ABCDEFGHI"
#
# $B
# [1] "XYX"
#
# $C
# [1] "YZY"
Bonus:
For a dataframe output, use this:
aggregate(unlist(A), by=list(id=names(A)), paste, collapse="")
where A is your list.
Using @Ananda's A, I get this:
id x
1 10 FFFFGGGGHHHHIIII
2 12 AAAABBBBCCCCDDDDXXXXXXXXXXXXXXXXXXXXXXX
3 34 GGGG

Resources