more dynamic melting with data.table - r

I am looking for the most efficient form to transform
ARTNR FILGRP
1 1 9827
2 2 9348
3 3 9335, 9827, 9339
into this
ARTNR FILGRP
1 1 9827
2 2 9348
3 3 9335
4 3 9827
5 3 9339
I tried the following code and it works, but it is not elegant and has some shortcomings. :
setDT(artnrs)
artnrs[, c("P1", "P2", "P3") := tstrsplit(FILGRP, ",", fixed=TRUE)] # 1)
artnrs <- melt(artnrs, c("ARTNR"), measure = patterns("^P")) # 2)
artnrs[,variable:=NULL] # 3)
artnrs <- na.omit(artnrs, cols="value") # 4)
names(artnrs)[2] <- "FILGRP" # 5)
ad 1) splits the last column in three new ones. How can I make this dynamic and make it fit for five or ten?
ad 2-5) rather clumpsy operations, could I chain this better?
It is based on data.table but performance is not that critical so an easy to understand tidyverse solution would be ok. But the fewer packages, the better.
Thanks!
dput output;
structure(list(ARTNR = c(1, 2, 3), FILGRP = c("9827", "9348", "9335, 9827, 9339")),
row.names = c(NA, -3L), class = "data.frame")

df <- structure(list(ARTNR = c(1, 2, 3), FILGRP = c("9827", "9348", "9335, 9827, 9339")),
row.names = c(NA, -3L), class = "data.frame")
df2 <- strsplit(df$FILGRP, split = ",")
df2 <- data.frame(ARTNR = rep(df$ARTNR, sapply(df2, length)), FILGRP = unlist(df2))

here is a data.table approach
library( data.table )
setDT(DT)
melt( DT[, paste0( "v", 1:length(tstrsplit( DT$FILGRP, ", ") ) ) := tstrsplit( FILGRP, ", ") ],
id.vars = "ARTNR",
measure.vars = patterns( "^v" ),
value.name = "FILGRP" )[!is.na(FILGRP), .SD, .SDcols = c(1,3) ]
# ARTNR FILGRP
# 1: 1 9827
# 2: 2 9348
# 3: 3 9335
# 4: 3 9827
# 5: 3 9339

Related

Pattern Searching in R

I have two data frames as below. DF1 is slighly messy (as you can see below) has multiple values from DF2 combined into one column.
DF1
SRNo. Value
1 1ABCD2EFGH3IJKL
2 1ABCD2EFGH3IJKL/7MLPO0OKMN8MNBV
3 3ABCD4EFGH5IJKL
4 3ABCD4EFGH5IJKL/1ABCD2EFGH3IJKL
5 7MLPO0OKMN8MNBV/9IUYT7HGFD3LKJH
DF2
SRNo. Value
1 1ABCD2EFGH3IJKL
2 3ABCD4EFGH5IJKL
3 6PQRS7TUVW8XYZA
4 5FGHI9XUZX1RATP
5 9AGTY6UGFW0AAUU
6 6TEYD7RARA8MHAT
7 9IUYT7HGFD3LKJH
I want to do a look up using values column in both the data set. Here is what I am trying to accomplish.
i) For rows 1 & 3 in DF1 it is a simple look up in DF2. I expect the code to return those looked up values.
ii) For row #3 in DF1, only first part of the string matches with a value in DF2. I expect the code to return only the first part.
iii) For row#4 in DF1, both the parts in the string matches with values in DF2. In this case I want the first part of the string that is matching to be retained
iv) For Row #5, the second part in the string matches with the value in DF2. I would expect the code to return the 2nd part of the string.
I have around 47000 rows in first dataset and over 300,000 in second dataset and ofcourse there are other columns in both the datasets. I have tried this in multiple ways using str_split/str_match but could not accomplish what I want to. Every suggestion is appreciated. My rest of the coding is in R.
Thank You
First step is to tidyr::separate() your DF1 at "/". Then I used dplyr::case_when() to see if there was a match between the first of the listed items in DF2 with %in%; if there wasn't then check against the second. I used dplyr::mutate() to append the results to DF1 under dat.
library(dplyr)
library(tidyr)
DF1 <- data.frame("SRNo." = 1:5, Value = c("1ABCD2EFGH3IJKL","1ABCD2EFGH3IJKL/7MLPO0OKMN8MNBV","3ABCD4EFGH5IJKL","3ABCD4EFGH5IJKL/1ABCD2EFGH3IJKL","7MLPO0OKMN8MNBV/9IUYT7HGFD3LKJH"), stringsAsFactors = F) %>% tbl_df()
DF2 <- data.frame("SRNo." = 1:7, Value = c("1ABCD2EFGH3IJKL","3ABCD4EFGH5IJKL","6PQRS7TUVW8XYZA","5FGHI9XUZX1RATP","9AGTY6UGFW0AAUU","6TEYD7RARA8MHAT","9IUYT7HGFD3LKJH"), stringsAsFactors = F) %>%tbl_df()
DF1 %>%
separate(Value, c("Value1", "Value2"), sep = "/") %>%
mutate(dat = case_when(
Value1 %in% DF2$Value ~ Value1,
Value2 %in% DF2$Value ~ Value2,
TRUE ~ NA_character_
))
# # A tibble: 5 x 4
# SRNo. Value1 Value2 dat
# <int> <chr> <chr> <chr>
# 1 1 1ABCD2EFGH3IJKL NA 1ABCD2EFGH3IJKL
# 2 2 1ABCD2EFGH3IJKL 7MLPO0OKMN8MNBV 1ABCD2EFGH3IJKL
# 3 3 3ABCD4EFGH5IJKL NA 3ABCD4EFGH5IJKL
# 4 4 3ABCD4EFGH5IJKL 1ABCD2EFGH3IJKL 3ABCD4EFGH5IJKL
# 5 5 7MLPO0OKMN8MNBV 9IUYT7HGFD3LKJH 9IUYT7HGFD3LKJH
Data.table solution
df1 <- read.table(text="SRNo. Value
1 1ABCD2EFGH3IJKL
2 1ABCD2EFGH3IJKL/7MLPO0OKMN8MNBV
3 3ABCD4EFGH5IJKL
4 3ABCD4EFGH5IJKL/1ABCD2EFGH3IJKL
5 7MLPO0OKMN8MNBV/9IUYT7HGFD3LKJH", header = T, stringsAsFactors = F)
df2 <- read.table( text = "SRNo. Value
1 1ABCD2EFGH3IJKL
2 3ABCD4EFGH5IJKL
3 6PQRS7TUVW8XYZA
4 5FGHI9XUZX1RATP
5 9AGTY6UGFW0AAUU
6 6TEYD7RARA8MHAT
7 9IUYT7HGFD3LKJH", header = T, stringsAsFactors = F )
library( data.table )
setDT(df1)[, c( "Value1", "Value2" ) := tstrsplit( Value, "/", fixed = TRUE)]
setDT(df2)
resultv1 <- df2[ df1, on = c( Value = "Value1"), nomatch = 0L ]
resultv2 <- df2[ df1, on = c( Value = "Value2"), nomatch = 0L ]
result <- rbindlist( list( resultv1, resultv2 ) )[!duplicated( i.SRNo.)]
Benchmarking it against the solution from #Paul shows similar runtimes (~2.5 miliseconds).. But data.table sometimes surprises me on larger data-sets..
If memory becomes an issue, you can do it all in one go:
rbindlist( list( setDT(df2)[ setDT(df1)[, c( "Value1", "Value2" ) := tstrsplit( Value, "/", fixed = TRUE)],
on = c( Value = "Value1"), nomatch = 0L ],
setDT(df2)[ setDT(df1)[, c( "Value1", "Value2" ) := tstrsplit( Value, "/", fixed = TRUE)],
on = c( Value = "Value2"), nomatch = 0L ] ) )[!duplicated( i.SRNo.)]

Select column matching pattern then keep only rows that match other vector values

I want to select the columns in DT1 that match the pattern flux then keep only rows that have values similar to those in a predefined vector vec1
Sample Data
library(data.table)
DT1 <- structure(list(flux_1 = c(1, 6, 2, 9, 5),
FileName = c("prac_1", "prac_2", "prac_3", "prac_4", "prac_5")),
.Names = c("flux_1", "FileName"),
class = c("data.table", "data.frame"),
row.names = c(NA, -5L))
DT1
flux_1 FileName
1: 1 prac_1
2: 6 prac_2
3: 2 prac_3
4: 9 prac_4
5: 5 prac_5
vec1 <- c(6, 2)
The following code works but I need to explicitly specify flux_1.
DT1[ flux_1 %in% vec1]
flux_1 FileName
1: 6 prac_2
2: 2 prac_3
I was thinking about something like this but it didn't work
DT1[, .SD, .SDcols = names(DT1) %like% "flux"] %>%
.[. %in% vec1]
Empty data.table (0 rows) of 1 col: flux_1
Any suggestion is appreciated! Thank you!
We can use get to return the value of the column after grep
DT1[get(grep('flux', names(DT1), value = TRUE)) %in% vec1 ]
# flux_1 FileName
#1: 6 prac_2
#2: 2 prac_3
Or if we use the .SDcols route, extract the .SD as a vector do the comparison and subset the dataset
DT1[DT1[, .SD[[1]] %in% vec1, .SDcols = grep('flux', names(DT1))]]
Similar option can be used with %like%
DT1[DT1[, .SD[[1]] %in% vec1, .SDcols = names(DT1) %like% "flux"]]
Regarding the OP's approach
DT1[, .SD, .SDcols = names(DT1) %like% "flux"]
# flux_1
#1: 1
#2: 6
#3: 2
#4: 9
#5: 5
returns a data.table with a single column. By chaining, we need to extract the 'flux_1' column
DT1[, .SD, .SDcols = names(DT1) %like% "flux"] %>%
.[[1]] %in% vec1 %>%
magrittr::extract(DT1, .)
# flux_1 FileName
#1: 6 prac_2
#2: 2 prac_3

R string split, to normalized (long) format with running index

I have this data frame
structure(list(rule.id = c(1, 2), rules = structure(1:2, .Label = c("Lamp1.1,Lamp1.2",
"Lamp2.1,Lamp2.2"), class = "factor")), .Names = c("rule.id",
"rules"), row.names = c(NA, -2L), class = "data.frame")
# rule.id rules
#1 1 Lamp1.1,Lamp1.2
#2 2 Lamp2.1,Lamp2.2
which I need to split on the "rules" column by separator comma (","), multiple commas occur (not only 2 like in example) and then transform this into a normalized format with keeping the relevant rule.id value from the original df.
The result should look like this:
structure(list(rule.id = c(1, 1, 2, 2), lhs = c("Lamp1.1", "Lamp1.2",
"Lamp2.1", "Lamp2.1")), .Names = c("rule.id", "lhs"), row.names = c(NA,
-4L), class = "data.frame")
# rule.id lhs
#1 1 Lamp1.1
#2 1 Lamp1.2
#3 2 Lamp2.1
#4 2 Lamp2.1
I have a code that takes care of the str split and normalize (long) format, but not sure how to take care of the rule.id requirement
lhs.norm <- as.data.frame(
cbind(
rules.df$ruleid,
unlist(strsplit(
unlist(lapply(strsplit(unlist(lapply(as.character(rules.df$rules),function(x) substr(x,2,nchar(x)))), "} =>", fixed = T), function(x) x[1]))
,","))))
thanks to #acrun solution using
cSplit(rules.df.lhs, "lhs", ",", "long"))
I benchmarked 19 seconds for 1M rows (result was around 2M rows)
We can use cSplit from splitstackshape
library(splitstackshape)
cSplit(df, "rules", ",", "long")
# rule.id rules
#1: 1 Lamp1.1
#2: 1 Lamp1.2
#3: 2 Lamp2.1
#4: 2 Lamp2.2
If this is a huge dataset, we can use stringi to split
library(stringi)
lst <- stri_split_fixed(df$rules, ",")
df2 <- data.frame(rule.id = rep(df$rule.id, lengths(lst)),
rules = unlist(lst))
df2
# rule.id rules
#1 1 Lamp1.1
#2 1 Lamp1.2
#3 2 Lamp2.1
#4 2 Lamp2.2
Another option is data.table
library(data.table)
setDT(df)[, strsplit(as.character(rules), ","), by = rule.id]
With the new base pipes we can make #akrun's great solution using stringi::stri_split_fixed even faster. This also exploits recycling of the rule.id column.
stringi::stri_split_fixed(d$rules, ",") |>
unlist() |>
cbind(d[1])
# unlist(stringi::stri_split_fixed(d$rules, ",")) rule.id
# 1 Lamp1.1 1
# 2 Lamp1.2 2
# 3 Lamp2.1 1
# 4 Lamp2.2 2
Benchmark
sapply(c('splitstackshape', 'stringi', 'data.table', 'reshape2'),
library, character.only=TRUE)
dl <- data.frame(rule.id=1:1e6, rules=d$rules)
microbenchmark::microbenchmark(
melt=cbind(dl[1], do.call(rbind, strsplit(as.character(dl$rules), ',', fixed=T))) |>
reshape2::melt('rule.id'),
cbind=stri_split_fixed(dl$rules, ",") |>
unlist() |>
cbind(dl[1]),
dtable=as.data.table(dl)[, strsplit(as.character(rules), ","), by = rule.id],
cSplit=cSplit(dl, "rules", ",", "long"),
stringi={lst <- stri_split_fixed(dl$rules, ",")
data.frame(rule.id = rep(dl$rule.id, lengths(lst)),
rules = unlist(lst))}, times=3L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# melt 1472.5459 1518.5649 1608.8124 1564.5838 1676.9456 1789.3075 3 b
# cbind 335.7105 365.9372 380.9120 396.1639 403.5128 410.8617 3 a
# dtable 10414.8486 10605.5725 10674.1134 10796.2965 10803.7458 10811.1951 3 d
# cSplit 3003.0660 3079.3098 3232.6108 3155.5537 3347.3832 3539.2128 3 c
# stringi 421.1481 469.1054 518.9577 517.0627 567.8626 618.6624 3 a
# Warning messages:
# 1: In type.convert.default(unlist(x, use.names = FALSE)) :
# 'as.is' should be specified by the caller; using TRUE
# 2: In type.convert.default(unlist(x, use.names = FALSE)) :
# 'as.is' should be specified by the caller; using TRUE
# 3: In type.convert.default(unlist(x, use.names = FALSE)) :
# 'as.is' should be specified by the caller; using TRUE
Note: The warnings stem from cSplit() which code probably wasn't updated for a long time.
Data
d <- structure(list(rule.id = c(1, 2), rules = structure(1:2, .Label = c("Lamp1.1,Lamp1.2",
"Lamp2.1,Lamp2.2"), class = "factor")), .Names = c("rule.id",
"rules"), row.names = c(NA, -2L), class = "data.frame")

replacing blank not NA

I have two variables a and b
a b
vessel hot
parts
nest NA
best true
neat smooth
I want to replace blank in b with a
la$b[i1] <- ifelse(la$b[i1] == "",la$a[i1],la$b[i1])
But it is not working
We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)), specify the condition in 'i' (b==''), and assign the values of 'a' that corresponds to TRUE values in 'i' to 'b'. It should be fast as we are assigning in place.
library(data.able)
setDT(df1)[b=='', b:= a]
df1
# a b
#1: vessel hot
#2: parts parts
#3: nest NA
#4: best true
#5: neat smooth
Or we can just base R
i1 <- df1$b=='' & !is.na(df1$b)
df1$b[i1] <- df1$a[i1]
data
df1 <- structure(list(a = c("vessel", "parts", "nest", "best", "neat"
), b = c("hot", "", NA, "true", "smooth")), .Names = c("a", "b"
), class = "data.frame", row.names = c(NA, -5L))
instead of
# la$b[i1] <- ifelse(la$b[i1] == "",la$a[i1],la$b[i1])
# what is i1? it doesn't seem to have any obvious function here
... it should be:
la$b <- ifelse(la$b == "", la$a, la$b)
assuming that you want to replace blank in b with a and that applies to all blanks
it works:
df <- structure(list(a = c("vessel", "parts", "nest", "best", "neat"
), b = c("hot", "parts", NA, "true", "smooth")), .Names = c("a",
"b"), row.names = c(NA, -5L), class = "data.frame")
df$b <- ifelse(df$b=="", df$a, df$b)
# or, with `with`: df$b <- with(df, ifelse(b=="",a,b))
# > df
# a b
# 1 vessel hot
# 2 parts parts
# 3 nest <NA>
# 4 best true
# 5 neat smooth

Comparing two data sets and find out common names

How can i compare two datasets and find the common gene names, provided if CNA and chr of both datasets are same
dt1
CNA chr Genes
gain 5 Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1
loss 5 RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5
gain 2 PDIA5,SEMA5B
dt2
CNA chr Genes
gain 5 Sall3,Nfatc1,SNORA5,SNORA5
gain 5 RNU6-866P,OR8J1,OR8K3,OR8K3
gain 2 PDIA5,DCC
expected output
df3
CNA chr Genes
gain 5 Sall3,Nfatc1
gain 2 PDIA5
Im sure this is trivial question, but would love to have suggestions to help me a bit.
Here is an approach:
library(data.table)
df2 = setDT(df2)[,list(Genes=paste0(Genes, collapse=',')),by=list(CNA, chr)]
res = setkey(setDT(df1), CNA, chr)[df2]
# CNA chr Genes Genes.1
#1: gain 5 Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1 Sall3,Nfatc1,SNORA5,SNORA5,RNU6-866P,OR8J1,OR8K3,OR8K3
#2: gain 2 PDIA5,SEMA5B PDIA5,DCC
res[, paste0(intersect(strsplit(Genes,',')[[1]], strsplit(Genes.1,',')[[1]]), collapse=',')
, by=list(CNA, chr)]
# CNA chr V1
#1: gain 5 Sall3,Nfatc1
#2: gain 2 PDIA5
Data:
df1 = structure(list(CNA = c("gain", "gain", "loss"), chr = c(2L, 5L,
5L), Genes = c("PDIA5,SEMA5B", "Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1",
"RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5")), .Names = c("CNA", "chr",
"Genes"), class = "data.frame", row.names = c(NA, -3L))
df2 = structure(list(CNA = c("gain", "gain", "gain"), chr = c(5L, 5L,
2L), Genes = c("Sall3,Nfatc1,SNORA5,SNORA5", "RNU6-866P,OR8J1,OR8K3,OR8K3",
"PDIA5,DCC")), .Names = c("CNA", "chr", "Genes"), class = "data.frame", row.names = c(NA,
-3L))
Not very elegant but
dt1 <- read.table(header = TRUE, text = "CNA chr Genes
gain 5 Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1
loss 5 RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5
gain 2 PDIA5,SEMA5B", stringsAsFactors = FALSE)
dt2 <- read.table(header = TRUE, text= "CNA chr Genes
gain 5 Sall3,Nfatc1,SNORA5,SNORA5
gain 5 RNU6-866P,OR8J1,OR8K3,OR8K3
gain 2 PDIA5,DCC", stringsAsFactors = FALSE)
f <- function(x, y, z = 'Genes') {
## split the genes out and find common ones
xx <- strsplit(x[, z], ',')
yy <- strsplit(y[, z], ',')
res <- lapply(seq_along(xx), function(ii)
intersect(xx[[ii]], yy[[ii]]))
## combine back into one of the data frames
res <- lapply(res, paste, collapse = ',')
res <- cbind(x[, 1:2], Genes = do.call('rbind', res))
## make sure the chr and alterations are the same and only return those
idx <- sapply(1:nrow(x), function(ii) all(x[ii, 1:2] == y[ii, 1:2]))
res[idx, ]
}
f(dt1, dt2)
# CNA chr Genes
# 1 gain 5 Sall3,Nfatc1
# 3 gain 2 PDIA5

Resources