Comparing two data sets and find out common names - r

How can i compare two datasets and find the common gene names, provided if CNA and chr of both datasets are same
dt1
CNA chr Genes
gain 5 Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1
loss 5 RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5
gain 2 PDIA5,SEMA5B
dt2
CNA chr Genes
gain 5 Sall3,Nfatc1,SNORA5,SNORA5
gain 5 RNU6-866P,OR8J1,OR8K3,OR8K3
gain 2 PDIA5,DCC
expected output
df3
CNA chr Genes
gain 5 Sall3,Nfatc1
gain 2 PDIA5
Im sure this is trivial question, but would love to have suggestions to help me a bit.

Here is an approach:
library(data.table)
df2 = setDT(df2)[,list(Genes=paste0(Genes, collapse=',')),by=list(CNA, chr)]
res = setkey(setDT(df1), CNA, chr)[df2]
# CNA chr Genes Genes.1
#1: gain 5 Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1 Sall3,Nfatc1,SNORA5,SNORA5,RNU6-866P,OR8J1,OR8K3,OR8K3
#2: gain 2 PDIA5,SEMA5B PDIA5,DCC
res[, paste0(intersect(strsplit(Genes,',')[[1]], strsplit(Genes.1,',')[[1]]), collapse=',')
, by=list(CNA, chr)]
# CNA chr V1
#1: gain 5 Sall3,Nfatc1
#2: gain 2 PDIA5
Data:
df1 = structure(list(CNA = c("gain", "gain", "loss"), chr = c(2L, 5L,
5L), Genes = c("PDIA5,SEMA5B", "Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1",
"RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5")), .Names = c("CNA", "chr",
"Genes"), class = "data.frame", row.names = c(NA, -3L))
df2 = structure(list(CNA = c("gain", "gain", "gain"), chr = c(5L, 5L,
2L), Genes = c("Sall3,Nfatc1,SNORA5,SNORA5", "RNU6-866P,OR8J1,OR8K3,OR8K3",
"PDIA5,DCC")), .Names = c("CNA", "chr", "Genes"), class = "data.frame", row.names = c(NA,
-3L))

Not very elegant but
dt1 <- read.table(header = TRUE, text = "CNA chr Genes
gain 5 Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1
loss 5 RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5
gain 2 PDIA5,SEMA5B", stringsAsFactors = FALSE)
dt2 <- read.table(header = TRUE, text= "CNA chr Genes
gain 5 Sall3,Nfatc1,SNORA5,SNORA5
gain 5 RNU6-866P,OR8J1,OR8K3,OR8K3
gain 2 PDIA5,DCC", stringsAsFactors = FALSE)
f <- function(x, y, z = 'Genes') {
## split the genes out and find common ones
xx <- strsplit(x[, z], ',')
yy <- strsplit(y[, z], ',')
res <- lapply(seq_along(xx), function(ii)
intersect(xx[[ii]], yy[[ii]]))
## combine back into one of the data frames
res <- lapply(res, paste, collapse = ',')
res <- cbind(x[, 1:2], Genes = do.call('rbind', res))
## make sure the chr and alterations are the same and only return those
idx <- sapply(1:nrow(x), function(ii) all(x[ii, 1:2] == y[ii, 1:2]))
res[idx, ]
}
f(dt1, dt2)
# CNA chr Genes
# 1 gain 5 Sall3,Nfatc1
# 3 gain 2 PDIA5

Related

How to collapse values in a list to allow a list column in a dataframe to be converted to a vector?

I have a dataframe, df:
df <- structure(list(ID = c("ID1", "ID2", "ID3"), values = list(A = "test",
B = c("test2", "test3"), C = "test4")), row.names = c(NA,
-3L), class = "data.frame")
df
ID values
1 ID1 test
2 ID2 test2, test3
3 ID3 test4
sapply(df, class)
ID values
"character" "list"
I'm trying to create a function that will run through each row of df$values, and if the length is greater than one, paste the values into one string. So the data frame will look the same, but will have a different structure:
df
ID values
1 ID1 test
2 ID2 test2, test3
3 ID3 test4
dput(df)
structure(list(ID = c("ID1", "ID2", "ID3"), values = c("test",
"test2, test3", "test4")), class = "data.frame", row.names = c(NA,
-3L))
sapply(df, class)
ID values
"character" "character"
(Note how in the end result, both columns are character columns, rather than a character column and a list).
I tried making a function to do this, but it doesn't work (and is very messy):
newcol <- NULL
for (i in nrow(df)) {
row <- df$values[i] %>%
unlist(., use.names = FALSE)
if (length(row) == 1) {
newcol = rbind(row, newcol)
} else if (length(row)>1) {
row = paste0(row[1], ", ", row[2])
newcol = rbind(row, newcol)
}
}
df$values <- newcol
Is there an easier way to do this (that works), and that can do it for any size of list entry? (eg. if df$values has a row entry that was "test6", test7, test8, test9").
We can use sapply with toString :
df$values <- sapply(df$values, toString)
sapply(df, class)
# ID values
#"character" "character"
str(df)
#'data.frame': 3 obs. of 2 variables:
# $ ID : chr "ID1" "ID2" "ID3"
# $ values: chr "test" "test2, test3" "test4"
toString is shorthand for paste0(..., collapse = ',').
df$values <- sapply(df$values, paste0, collapse = ',')
Using tidyverse
library(dplyr)
library(purrr)
df <- df %>%
mutate(values = map_chr(values, toString))

more dynamic melting with data.table

I am looking for the most efficient form to transform
ARTNR FILGRP
1 1 9827
2 2 9348
3 3 9335, 9827, 9339
into this
ARTNR FILGRP
1 1 9827
2 2 9348
3 3 9335
4 3 9827
5 3 9339
I tried the following code and it works, but it is not elegant and has some shortcomings. :
setDT(artnrs)
artnrs[, c("P1", "P2", "P3") := tstrsplit(FILGRP, ",", fixed=TRUE)] # 1)
artnrs <- melt(artnrs, c("ARTNR"), measure = patterns("^P")) # 2)
artnrs[,variable:=NULL] # 3)
artnrs <- na.omit(artnrs, cols="value") # 4)
names(artnrs)[2] <- "FILGRP" # 5)
ad 1) splits the last column in three new ones. How can I make this dynamic and make it fit for five or ten?
ad 2-5) rather clumpsy operations, could I chain this better?
It is based on data.table but performance is not that critical so an easy to understand tidyverse solution would be ok. But the fewer packages, the better.
Thanks!
dput output;
structure(list(ARTNR = c(1, 2, 3), FILGRP = c("9827", "9348", "9335, 9827, 9339")),
row.names = c(NA, -3L), class = "data.frame")
df <- structure(list(ARTNR = c(1, 2, 3), FILGRP = c("9827", "9348", "9335, 9827, 9339")),
row.names = c(NA, -3L), class = "data.frame")
df2 <- strsplit(df$FILGRP, split = ",")
df2 <- data.frame(ARTNR = rep(df$ARTNR, sapply(df2, length)), FILGRP = unlist(df2))
here is a data.table approach
library( data.table )
setDT(DT)
melt( DT[, paste0( "v", 1:length(tstrsplit( DT$FILGRP, ", ") ) ) := tstrsplit( FILGRP, ", ") ],
id.vars = "ARTNR",
measure.vars = patterns( "^v" ),
value.name = "FILGRP" )[!is.na(FILGRP), .SD, .SDcols = c(1,3) ]
# ARTNR FILGRP
# 1: 1 9827
# 2: 2 9348
# 3: 3 9335
# 4: 3 9827
# 5: 3 9339

Speeding up code inside for loop

I have a dataframe that contains more than 2 millions records. I am only sharing the few records due to data security reasons .I wish you guys can understand my reason.
data <- data[order(data$email_address_hash),]
skip_row <- c()
data$hash_time <- rep('NA',NROW(data)) #adding new column to our data
rownames(data) <- as.character(1:NROW(data))
dput(droplevels(data))
structure(list(email_address_hash = structure(c(2L, 1L, 1L, 2L
), .Label = c("0004eca7b8bed22aaf4b320ad602505fe9fa9d26", "35c0ef2c2a804b44564fd4278a01ed25afd887f8"
), class = "factor"), open_time = structure(c(2L, 1L, 3L, 4L), .Label = c(" 04:39:24",
" 09:57:20", " 10:39:43", " 19:00:09"), class = "factor")), .Names = c("email_address_hash",
"open_time"), row.names = c(41107L, 47808L, 3973L, 8307L), class = "data.frame")
str(data)
'data.frame': 4 obs. of 2 variables:
$ email_address_hash: Factor w/ 36231 levels "00012aec4ca3fa6f2f96cf97fc2a3440eacad30e",..: 7632 2 2 7632
$ open_time : Factor w/ 34495 levels " 00:00:03"," 00:00:07",..: 15918 5096 16971 24707
.
skip_row <- c()
data$hash_time <- rep('NA',NROW(data)) #adding new column to our data
rownames(data) <- as.character(1:NROW(data))
for(i in 1:NROW(data)){
#Skipping the email_address_hash that was already used for grouping
if(i %in% skip_row) next
hash_row_no <- c()
#trimming data so that we don't need to look into whole dataframe
trimmed_data <- data[i:NROW(data),]
# Whenever we search for email_address_hash the previous one was ignored or removed from the check
#extracting rownames so that we can used that as rownumber inside the skip_row
hash_row_no <- rownames(trimmed_data[trimmed_data$email_address_hash==trimmed_data$email_address_hash[1],])
#note :- we know the difference b/w rownames and rownumber
#converting rownames into numeric so that we can use them as rowno
hash_row_no <- as.numeric(hash_row_no)
first_no <- hash_row_no[1]
last_no <- hash_row_no[NROW(hash_row_no)]
skip_row <- append(skip_row,hash_row_no)
data$hash_time[first_no] <- paste(data$open_time[first_no:last_no], collapse = "")
}
Please note that I also tried the below approaches for to speed up the process but that seems to be ineffective
hash_row_no <- rownames(trimmed_data[trimmed_data$email_address_hash==trimmed_data$email_address_hash[1],])
converted dataframe to data.table
setDT(data)
performed either of the operation gives times similar time
system.time(rownames(trimmed_data[trimmed_data$email_address_hash==trimmed_data$email_address_hash[1],]))
system.time(rownames(trimmed_data)[trimmed_data[["email_address_hash"]] == trimmed_data$email_address_hash[1]])
Can you guys help me to speed up my code as my data contains more than 2 millions records and it is taking more than 30 minutes and even more ?
Apparently you want to do this:
library(data.table)
setDT(data)
data[, .(open_times = paste(open_time, collapse = "")), by = email_address_hash]
# email_address_hash open_times
#1: 35c0ef2c2a804b44564fd4278a01ed25afd887f8 09:57:20 19:00:09
#2: 0004eca7b8bed22aaf4b320ad602505fe9fa9d26 04:39:24 10:39:43
Or possibly this:
data[email_address_hash == "0004eca7b8bed22aaf4b320ad602505fe9fa9d26",
paste(open_time, collapse = "")]
#[1] " 04:39:24 10:39:43"

replacing blank not NA

I have two variables a and b
a b
vessel hot
parts
nest NA
best true
neat smooth
I want to replace blank in b with a
la$b[i1] <- ifelse(la$b[i1] == "",la$a[i1],la$b[i1])
But it is not working
We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)), specify the condition in 'i' (b==''), and assign the values of 'a' that corresponds to TRUE values in 'i' to 'b'. It should be fast as we are assigning in place.
library(data.able)
setDT(df1)[b=='', b:= a]
df1
# a b
#1: vessel hot
#2: parts parts
#3: nest NA
#4: best true
#5: neat smooth
Or we can just base R
i1 <- df1$b=='' & !is.na(df1$b)
df1$b[i1] <- df1$a[i1]
data
df1 <- structure(list(a = c("vessel", "parts", "nest", "best", "neat"
), b = c("hot", "", NA, "true", "smooth")), .Names = c("a", "b"
), class = "data.frame", row.names = c(NA, -5L))
instead of
# la$b[i1] <- ifelse(la$b[i1] == "",la$a[i1],la$b[i1])
# what is i1? it doesn't seem to have any obvious function here
... it should be:
la$b <- ifelse(la$b == "", la$a, la$b)
assuming that you want to replace blank in b with a and that applies to all blanks
it works:
df <- structure(list(a = c("vessel", "parts", "nest", "best", "neat"
), b = c("hot", "parts", NA, "true", "smooth")), .Names = c("a",
"b"), row.names = c(NA, -5L), class = "data.frame")
df$b <- ifelse(df$b=="", df$a, df$b)
# or, with `with`: df$b <- with(df, ifelse(b=="",a,b))
# > df
# a b
# 1 vessel hot
# 2 parts parts
# 3 nest <NA>
# 4 best true
# 5 neat smooth

Converting given list into dataframe

I have the following list:
$id1
$id1[[1]]
A B
"A" "B"
$id1[[2]]
A B
"A" "A1"
$id2
$id2[[1]]
A B
"A2" "B2"
In R-pastable form:
dat = structure(list(SampleTable = structure(list(id2 = list(structure(c("90", "7"), .Names = c("T", "G")), structure(c("90", "8"), .Names = c("T", "G"))), id1 = structure(c("1", "1"), .Names = c("T", "G"))), .Names = c("id2", "id1"))), .Names = "SampleTable")
I want this given list to be converted into following dataframe:
id1 A B
id1 A A1
id2 A2 B2
Your data structure (apparently a named list of unnamed lists of 1-row data.frames) is a bit complicated: the easiest may be to use a loop to build the data.frame.
It can be done directly with do.call, lapply and rbind, but it is not very readable, even if you are familiar with those functions.
# Sample data
d <- list(
id1 = list(
data.frame( x=1, y=1 ),
data.frame( x=2, y=2 )
),
id2 = list(
data.frame( x=3, y=3 ),
data.frame( x=4, y=4 )
),
id3 = list(
data.frame( x=5, y=5 ),
data.frame( x=6, y=6 )
)
)
# Convert
d <- data.frame(
id=rep(names(d), unlist(lapply(d,length))),
do.call( rbind, lapply(d, function(u) do.call(rbind, u)) )
)
Other solution, using a loop, if you have a ragged data structure, containing vectors (not data.frames) as explained in the comments.
d <- structure(list(SampleTable = structure(list(id2 = list(structure(c("90", "7"), .Names = c("T", "G")), structure(c("90", "8"), .Names = c("T", "G"))), id1 = structure(c("1", "1"), .Names = c("T", "G"))), .Names = c("id2", "id1"))), .Names = "SampleTable")
result <- list()
for(i in seq_along(d$SampleTable)) {
id <- names(d$SampleTable)[i]
block <- d$SampleTable[[i]]
if(is.atomic(block)) {
block <- list(block)
}
for(row in block) {
result <- c(result, list(data.frame(id, as.data.frame(t(row)))))
}
}
result <- do.call(rbind, result)
NOTE! I could not get melt and cast working on this kind of ragged data (I tried for over an hour...) I am going to leave this answer here to show that for this kind of operation, the reshape pacakge could also be used.
Using the example data of vincent, you can use melt and cast from the reshape package:
library(reshape)
res = cast(melt(d))[-1]
names(res) = c("id","x","y")
res
id x y
1 id1 1 1
2 id2 3 3
3 id3 5 5
4 id1 2 2
5 id2 4 4
6 id3 6 6
The order in the resulting data.frame is not the same, but the result is identical. And the code is a bit shorter. I use the [-1] to delete the first column which is also returned by melt. This additional variable is the column index of the individual data.frames in the list of lists. Just have a look at the result of melt(d), that will hopefully make it more clear.
This is a bit messier that you let on. That dat object has an extra "layer" above it, so it is easier to work with dat[[1]]:
dfrm <- data.frame(dat[[1]], stringsAsFactors=FALSE)
names(dfrm) <- sub("\\..+$", "", names(dfrm))
> dfrm
id2 id2 id1
T 90 90 1
G 7 8 1
> t(dfrm)
T G
id2 "90" "7"
id2 "90" "8"
id1 "1" "1"

Resources