Which files have some content in R - r

I have a list which contain lines of files, sample of which is shown.
list(c("\"ID\",\"SIGNALINTENSITY\",\"SNR\"", "\"NM_012429\",\"7.19739265676517\",\"0.738130599770152\"",
"\"NM_003980\",\"12.4036181424743\",\"13.753593768862\"", "\"AY044449\",\"8.74973537284918\",\"1.77200602833912\"",
"\"NM_005015\",\"11.3735054810744\",\"6.76079815107347\""), c("\"ID\",\"SIGNALINTENSITY\",\"SNR\"",
"\"NM_012429\",\"7.07699512126353\",\"0.987579612646805\"", "\"NM_003980\",\"11.3172936656653\",\"8.38227473088534\"",
"\"AY044449\",\"9.2865464417786\",\"2.61149606120517\"", "\"NM_005015\",\"10.1228142794354\",\"3.98707517627092\""
), c("ID,SIGNALINTENSITY,SNR", "1,NM_012429,6.44764696592035,0.84120306786724",
"2,NM_003980,9.52604513443066,3.02404186191898", "3,AY044449,9.11930818670925,2.24361163736047",
"4,NM_005015,10.5672879852575,5.29334273442728"))
I want to confirm the match when reading lines. I tried to find out which files has content starting with NM or GE by the following code
which(lapply(lines, function(x) any(grepl(paste(c("^NM_","^GE"),collapse = "|"), x, ignore.case = TRUE))) == T)
which is supposed to give index of all the three, but it return integer(0). I am not sure what I am missing.

Try this:
lyst <- list(c("\"ID\",\"SIGNALINTENSITY\",\"SNR\"", "\"NM_012429\",\"7.19739265676517\",\"0.738130599770152\"",
"\"NM_003980\",\"12.4036181424743\",\"13.753593768862\"", "\"AY044449\",\"8.74973537284918\",\"1.77200602833912\"",
"\"NM_005015\",\"11.3735054810744\",\"6.76079815107347\""), c("\"ID\",\"SIGNALINTENSITY\",\"SNR\"",
"\"NM_012429\",\"7.07699512126353\",\"0.987579612646805\"", "\"NM_003980\",\"11.3172936656653\",\"8.38227473088534\"",
"\"AY044449\",\"9.2865464417786\",\"2.61149606120517\"", "\"NM_005015\",\"10.1228142794354\",\"3.98707517627092\""
), c("ID,SIGNALINTENSITY,SNR", "1,NM_012429,6.44764696592035,0.84120306786724",
"2,NM_003980,9.52604513443066,3.02404186191898", "3,AY044449,9.11930818670925,2.24361163736047",
"4,NM_005015,10.5672879852575,5.29334273442728"))
Assuming lyst is given string as per your question then you can do:
lapply(1:length(lyst), function(x)grepl("^NM|^GE",gsub('"',"", lyst[[x]])))
Logic:
First replacing the ' " ' with nothing using gsub then using the '^' to determining if the start of string is NM or GE using grepl.
However, if someone is interested in matching with optional numbers and commas
one can also use this regex:
lapply(1:3, function(x)grepl("^(NM|GE)|^\\d+,(NM|GE)",gsub('"',"", lyst[[x]])))
Output:
> lapply(1:3, function(x)grepl("^(NM|GE)|^\\d+,(NM|GE)",gsub('"',"", lyst[[x]])))
[[1]]
[1] FALSE TRUE TRUE FALSE TRUE
[[2]]
[1] FALSE TRUE TRUE FALSE TRUE
[[3]]
[1] FALSE TRUE TRUE FALSE TRUE

dat <- lapply(
lines,
function(x) read.csv(text = x)
)
# [[1]]
# ID SIGNALINTENSITY SNR
# 1 NM_012429 7.197393 0.7381306
# 2 NM_003980 12.403618 13.7535938
# 3 AY044449 8.749735 1.7720060
# 4 NM_005015 11.373505 6.7607982
#
# [[2]]
# ID SIGNALINTENSITY SNR
# 1 NM_012429 7.076995 0.9875796
# 2 NM_003980 11.317294 8.3822747
# 3 AY044449 9.286546 2.6114961
# 4 NM_005015 10.122814 3.9870752
#
# [[3]]
# ID SIGNALINTENSITY SNR
# 1 NM_012429 6.447647 0.8412031
# 2 NM_003980 9.526045 3.0240419
# 3 AY044449 9.119308 2.2436116
# 4 NM_005015 10.567288 5.2933427
To filter lines:
lapply(
dat,
function(df) df[grepl("^NM_|^GE", df$ID, ignore.case = TRUE), ]
)
# [[1]]
# ID SIGNALINTENSITY SNR
# 1 NM_012429 7.197393 0.7381306
# 2 NM_003980 12.403618 13.7535938
# 4 NM_005015 11.373505 6.7607982
#
# [[2]]
# ID SIGNALINTENSITY SNR
# 1 NM_012429 7.076995 0.9875796
# 2 NM_003980 11.317294 8.3822747
# 4 NM_005015 10.122814 3.9870752
#
# [[3]]
# ID SIGNALINTENSITY SNR
# 1 NM_012429 6.447647 0.8412031
# 2 NM_003980 9.526045 3.0240419
# 4 NM_005015 10.567288 5.2933427
Or if just indices are needed:
lapply(
dat,
function(df) grepl("^NM_|^GE", df$ID, ignore.case = TRUE)
)
# [[1]]
# [1] TRUE TRUE FALSE TRUE
#
# [[2]]
# [1] TRUE TRUE FALSE TRUE
#
# [[3]]
# [1] TRUE TRUE FALSE TRUE
Or with grep instead of grepl:
lapply(
dat,
function(df) grep("^NM_|^GE", df$ID, ignore.case = TRUE)
)
# [[1]]
# [1] 1 2 4
#
# [[2]]
# [1] 1 2 4
#
# [[3]]
# [1] 1 2 4

Related

How do I search if the numbers in one vector are within a range of two other vectors in R?

I have two vectors. One is start and one is stop for a range of Nucleotides in a protein. Ex. one range is 1374742-1375555.
domainStart = c(1374742,1374760,1374769,1375822,1376182,1376320,1376350)
domainStop = c(1375555, 1375726,1375516, 1378129, 1376638, 1376638, 1377382)
Next I have a long list of nucleotide mutation positions.
db = c(37788, 40303, 138445, 161587, 165946,172979,177605, 200118, 244427, 251156, 258459, 265170, 344062)
I want to know if any of the mutation positions (db) are in the ranges of the domain (1374742-1375555) and return TRUE /FALSE as a vector for each position. Thanks!
You could use map2() from the purrr package:
domainStart = c(1374742,1374760,1374769,1375822,1376182,1376320,1376350)
domainStop = c(1375555, 1375726,1375516, 1378129, 1376638, 1376638, 1377382)
db = c(37788, 40303, 138445, 161587, 165946,172979,177605, 200118, 244427, 251156, 258459, 265170, 344062)
purrr:::map2(domainStart, domainStop, ~which(db > .x & db < .y))
# [[1]]
# integer(0)
#
# [[2]]
# integer(0)
#
# [[3]]
# integer(0)
#
# [[4]]
# integer(0)
#
# [[5]]
# integer(0)
#
# [[6]]
# integer(0)
#
# [[7]]
# integer(0)
Each element of the list identifies the position of the match in db for each pair of start/stop values. Here it is with some that actually work:
db <- c(1374750, 1374761, 1374770)
purrr:::map2(domainStart, domainStop, ~which(db > .x & db < .y))
# [[1]]
# [1] 1 2 3
#
# [[2]]
# [1] 2 3
#
# [[3]]
# [1] 3
#
# [[4]]
# integer(0)
#
# [[5]]
# integer(0)
#
# [[6]]
# integer(0)
#
# [[7]]
# integer(0)
Update: Fixed to address comment
db <- c(1374750, 1374761, 1374770)
purrr:::map2(domainStart, domainStop, function(.x,.y){
mx <- db[which(db > .x & db < .y)]
if(length(mx) == 0){
mx <- NA
}
data.frame(domainStart = .x, domainStop = .y, db = mx)
})
# [[1]]
# domainStart domainStop db
# 1 1374742 1375555 1374750
# 2 1374742 1375555 1374761
# 3 1374742 1375555 1374770
#
# [[2]]
# domainStart domainStop db
# 1 1374760 1375726 1374761
# 2 1374760 1375726 1374770
#
# [[3]]
# domainStart domainStop db
# 1 1374769 1375516 1374770
#
# [[4]]
# domainStart domainStop db
# 1 1375822 1378129 NA
#
# [[5]]
# domainStart domainStop db
# 1 1376182 1376638 NA
#
# [[6]]
# domainStart domainStop db
# 1 1376320 1376638 NA
#
# [[7]]
# domainStart domainStop db
# 1 1376350 1377382 NA
Perhaps we can try the code below
df <- data.frame(Start = domainStart, Stop = domainStop)
apply(
outer(db, domainStart, `>=`) & outer(db, domainStart, `<=`),
1,
function(v) {
df[which(v, arr.ind = TRUE), ]
}
)

Selection of only existing combination before the normalization operation

I'd like to normalize some variable just only if existing combinations in var1and var2 using for, in my example:
# Create my variables
var1<-c(rep(6,25),rep(7,5))
var2<-c(1,1,1,1,1,2,2,2,2,2,5,5,5,5,5,10,10,10,10,10,11,11,11,11,11,5,5,5,5,5)
var3<-rnorm(30)
# Create a data frame
mydf<-data.frame(var1,var2,var3)
str(mydf)
# Inspection by var1 and var2
table(mydf$var1,mydf$var2)
# 1 2 5 10 11
#6 5 5 5 5 5
#7 0 0 5 0 0
# I'd like not considering "0" combinations!!
# My idea is create a subset just only for combinations that have values, but if I make:
var1ID <- unique(mydf$var1)
var2ID <- unique(mydf$var2)
for(a in 1:length(var1ID)){
for(b in 1:length(var2ID)){
mydf_sub <- mydf[mydf$var1 == var1ID[a] & mydf$var2 ==var2ID[b],]
print(var1ID[a])
print(var2ID[b])
# Normalize function
normalizevar <- function(x, na.rm = TRUE) {
return((x- min(x))/(max(x)-min(x)))
}
print(normalizevar(mydf_sub$var3))
}}
# [1] 6
# [1] 1
# [1] 0.0000000 0.1235632 0.1541684 1.0000000 0.3910381
# [1] 6
# [1] 2
# [1] 0.7911505 0.0000000 0.6296866 1.0000000 0.1904835
# [1] 6
# [1] 5
# [1] 0.6571259 1.0000000 0.1402675 0.0000000 0.4068031
# [1] 6
# [1] 10
# [1] 0.7060784 0.0000000 1.0000000 0.4842629 0.9560127
# [1] 6
# [1] 11
# [1] 0.4096362 0.4831099 1.0000000 0.0000000 0.5492811
# [1] 7
# [1] 1
# numeric(0)
# [1] 7
# [1] 2
# numeric(0)
# [1] 7
# [1] 5
# [1] 0.6208451 0.3219927 1.0000000 0.4012007 0.0000000
# [1] 7
# [1] 10
# numeric(0)
# [1] 7
# [1] 11
# numeric(0)
Here a have a problem because I'd just only the output with values existent combinations and not numeric(0). Please, any help with my problem or any dplyr approach to solving it?
Note that in the question, the normalizing function was not removing NA's, if any.
# define the function at the beginning of the script,
# never in a loop
normalizevar <- function(x, na.rm = TRUE) {
(x- min(x, na.rm = na.rm))/(max(x, na.rm = na.rm)-min(x, na.rm = na.rm))
}
# make the results reproducible
set.seed(2021)
# Create my variables
var1 <- c(rep(6,25),rep(7,5))
var2 <- c(1,1,1,1,1,2,2,2,2,2,5,5,5,5,5,10,10,10,10,10,11,11,11,11,11,5,5,5,5,5)
var3 <- rnorm(30)
mydf <- data.frame(var1,var2,var3)
Base R solution
There is no need for nested loops, two (unnested) *apply loops will do it. And in just 3 code lines.
# create the groups of var1, var2
sp <- split(mydf, mydf[1:2])
# keep the sub-data.frames with more than zero rows
sp <- sp[sapply(sp, nrow) > 0]
# and normalize var3
lapply(sp, function(X) normalizevar(X$var3))
dplyr solution
A dplyr solution could be the following.
mydf %>%
group_by(var1, var2) %>%
mutate(new_var3 = normalizevar(var3))

Identifying patterns in two strings in R

I want to evaluate if ColA contains a new string than ColB. However, I am not interested in certain types of string, for example, oil. I would like to have an indicator variable as follow:
ColA ColB Ind
-------------------------- ------------------------ -----
coconut+grape+pine grape+coconut TRUE
orange+apple+grape+pine grape+coconut TRUE
grape+pine grape+oil TRUE
oil+grape grape+apple FALSE
grape grape+oil FALSE
grape+pine grape+orange+pine FALSE
Any Suggestions using R?
Many thanks!
Since we need to split the strings, we'll start with strsplit,
strsplit(dat$ColA, '+', fixed = TRUE)
# [[1]]
# [1] "coconut" "grape" "pine"
# [[2]]
# [1] "orange" "apple" "grape" "pine"
# [[3]]
# [1] "grape" "pine"
# [[4]]
# [1] "oil" "grape"
# [[5]]
# [1] "grape"
# [[6]]
# [1] "grape" "pine"
From here, we want to determine what is in ColA that is not in ColB. I'll use Map to run setdiff on each set (ColA's [[1]] with ColB's [[1]], etc).
Map(setdiff, strsplit(dat$ColA, '+', fixed = TRUE), strsplit(dat$ColB, '+', fixed = TRUE))
# [[1]]
# [1] "pine"
# [[2]]
# [1] "orange" "apple" "pine"
# [[3]]
# [1] "pine"
# [[4]]
# [1] "oil"
# [[5]]
# character(0)
# [[6]]
# character(0)
To determine which one has "new words", we can just check for non-zero length using lengths(.) > 0:
lengths(Map(setdiff, strsplit(dat$ColA, '+', fixed = TRUE), strsplit(dat$ColB, '+', fixed = TRUE))) > 0
# [1] TRUE TRUE TRUE TRUE FALSE FALSE
But since you don't care about oil, we need to remove that as well.
lapply(Map(setdiff, strsplit(dat$ColA, '+', fixed = TRUE), strsplit(dat$ColB, '+', fixed = TRUE)), setdiff, "oil")
# [[1]]
# [1] "pine"
# [[2]]
# [1] "orange" "apple" "pine"
# [[3]]
# [1] "pine"
# [[4]]
# character(0)
# [[5]]
# character(0)
# [[6]]
# character(0)
lengths(lapply(Map(setdiff, strsplit(dat$ColA, '+', fixed = TRUE), strsplit(dat$ColB, '+', fixed = TRUE)),
setdiff, "oil")) > 0
# [1] TRUE TRUE TRUE FALSE FALSE FALSE
#akrun suggested a tidyverse variant:
library(dplyr)
library(purrr) # map2_lgl
library(stringr) # str_extract_all
dat %>%
mutate(
new = map2_lgl(
str_extract_all(ColB, "\\w+"), str_extract_all(ColA, "\\w+"),
~ !all(setdiff(.y, "oil") %in% .x)
)
)
# ColA ColB Ind new
# 1 coconut+grape+pine grape+coconut TRUE TRUE
# 2 orange+apple+grape+pine grape+coconut TRUE TRUE
# 3 grape+pine grape+oil TRUE TRUE
# 4 oil+grape grape+apple FALSE FALSE
# 5 grape grape+oil FALSE FALSE
# 6 grape+pine grape+orange+pine FALSE FALSE
Data
dat <- structure(list(ColA = c("coconut+grape+pine", "orange+apple+grape+pine", "grape+pine", "oil+grape", "grape", "grape+pine"), ColB = c("grape+coconut", "grape+coconut", "grape+oil", "grape+apple", "grape+oil", "grape+orange+pine"), Ind = c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE)), class = "data.frame", row.names = c(NA, -6L))
Here's a solution similar to r2evans's that calls strsplit only once with the help of do.call.
rid <- function(x) x[!x %in% z] ## helper FUN to get rid of the oil
z <- "oil"
L <- sapply(unname(dat), strsplit, "\\+")
dat$ind <- sapply(1:nrow(L), function(x) length(do.call(setdiff, rev(Map(rid, L[x,]))))) > 0
dat
# V1 V2 ind
# 1 grape+coconut coconut+grape+pine TRUE
# 2 grape+coconut orange+apple+grape+pine TRUE
# 3 grape+oil grape+pine TRUE
# 4 grape+apple oil+grape FALSE
# 5 grape+oil grape FALSE
# 6 grape+orange+pine grape+pine FALSE
Data:
dat <- structure(list(V1 = c("grape+coconut", "grape+coconut", "grape+oil",
"grape+apple", "grape+oil", "grape+orange+pine"), V2 = c("coconut+grape+pine",
"orange+apple+grape+pine", "grape+pine", "oil+grape", "grape",
"grape+pine")), row.names = c(NA, -6L), class = "data.frame")

List all combinations of strings that together cover all given elements

Say I am given the following strings:
1:{a,b,c,t}
2:{b,c,d}
3:{a,c,d}
4:{a,t}
I want to make a program that will give me all different combinations of these strings, where each combination has to include each given letter.
So for example the above combinations are strings {1&2, 1&3, 2&3&4, 1&2&3&4, 2&4}.
I was thinking of doing this with for loops, where the program would look at the first string, find which elements are missing, then work down through the list to find strings which have these letters. However I think this idea will only find combinations of two strings, and also it requires listing all letters to the program which seems very un-economical.
I think something like this should work.
sets <- list(c('a', 'b', 'c', 't'),
c('b', 'c', 'd'),
c('a', 'c', 'd'),
c('a', 't'))
combinations <- lapply(2:length(sets),
function(x) combn(1:length(sets), x, simplify=FALSE))
combinations <- unlist(combinations, FALSE)
combinations
# [[1]]
# [1] 1 2
#
# [[2]]
# [1] 1 3
#
# [[3]]
# [1] 1 4
#
# [[4]]
# [1] 2 3
#
# [[5]]
# [1] 2 4
#
# [[6]]
# [1] 3 4
#
# [[7]]
# [1] 1 2 3
#
# [[8]]
# [1] 1 2 4
#
# [[9]]
# [1] 1 3 4
#
# [[10]]
# [1] 2 3 4
#
# [[11]]
# [1] 1 2 3 4
u <- unique(unlist(sets))
u
# [1] "a" "b" "c" "t" "d"
Filter(function(x) length(setdiff(u, unlist(sets[x]))) == 0, combinations)
# [[1]]
# [1] 1 2
#
# [[2]]
# [1] 1 3
#
# [[3]]
# [1] 2 4
#
# [[4]]
# [1] 1 2 3
#
# [[5]]
# [1] 1 2 4
#
# [[6]]
# [1] 1 3 4
#
# [[7]]
# [1] 2 3 4
#
# [[8]]
# [1] 1 2 3 4
As a start...
I'll edit this answer when I have time. The following result is dependent on the order of choice. I haven't figured out how to flatten the list yet. If I could flatten it, I would sort each result then remove duplicates.
v = list(c("a","b","c","t"),c("b","c","d"),c("a","c","d"),c("a","t"))
allChars <- Reduce(union, v) # [1] "a" "b" "c" "t" "d"
charInList <- function(ch, li) which(sapply(li, function(vect) ch %in% vect))
locations <- sapply(allChars, function(ch) charInList(ch, v) )
# > locations
# $a
# [1] 1 3 4
#
# $b
# [1] 1 2
#
# $c
# [1] 1 2 3
#
# $t
# [1] 1 4
#
# $d
# [1] 2 3
findStillNeeded<-function(chosen){
haveChars <- Reduce(union, v[chosen])
stillNeed <- allChars[!allChars %in% haveChars]
if(length(stillNeed) == 0 ) return(chosen) #terminate if you dont need any more characters
return ( lapply(1:length(stillNeed), function(i) { #for each of the characters you still need
loc <- locations[[stillNeed[i]]] #find where the character is located
lapply(loc, function(j){
findStillNeeded(c(chosen, j)) #when you add this location to the choices, terminate if you dont need any more characters
})
}) )
}
result<-lapply(1:length(v), function(i){
findStillNeeded(i)
})

function usage and output of lapply

I am trying to play with function of lapply
lapply(1:3, function(i) print(i))
# [1] 1
# [1] 2
# [1] 3
# [[1]]
# [1] 1
# [[2]]
# [1] 2
# [[3]]
# [1] 3
I understand that lapply should be able to perform print (i) against each element i among 1:3
But why the output looks like this.
Besides, when I use unlist, I get the output like the following
unlist(lapply(1:3, function(i) print(i)))
# [1] 1
# [1] 2
# [1] 3
# [1] 1 2 3
The description of lapply function is the following:
"lapply returns a list of the same length as X, each element of which is the result of applying FUN to the corresponding element of X."
Your example:
lapply(1:3, function(x) print(x))
Prints the object x and returns a list of length 3.
str(lapply(1:3, function(x) print(x)))
# [1] 1
# [1] 2
# [1] 3
# List of 3
# $ : int 1
# $ : int 2
# $ : int 3
There are a few ways to avoid this as mentioned in the comments:
1) Using invisible
lapply(1:3, function(x) invisible(x))
# [[1]]
# [1] 1
# [[2]]
# [1] 2
# [[3]]
# [1] 3
unlist(lapply(1:3, function(x) invisible(x)))
# [1] 1 2 3
2) Without explicitly printing inside the function
unlist(lapply(1:3, function(x) x))
# [1] 1 2 3
3) Assining the list to an object:
l1 <- lapply(1:3, function(x) print(x))
unlist(l1)
# [1] 1 2 3

Resources