Isolate alphabetical strings within a larger string - r

Is there a way to isolate parts of a string that are in alphabetical order?
In other words, if you have a string like this: hjubcdepyvb
Could you just pull out the portion in alphabetical order?: bcde
I have thought about using the is.unsorted() function, but I'm not sure how to apply this to only a portion of a string.

Here's one way by converting to ASCII and back:
input <- "hjubcdepyvb"
spl_asc <- as.integer(charToRaw(input)) # Convert to ASCII
d1 <- diff(spl_asc) == 1 # Find sequences
filt <- spl_asc[c(FALSE, d1) | c(d1, FALSE)] # Only keep sequences (incl start and end)
rawToChar(as.raw(filt)) # Convert back to character
#[1] "bcde"
Note that this will concatenate any parts that are in alphabetical order.
i.e. If input is "abcxasdicfgaqwe" then output would be abcfg.
If you wanted to get separate vectors for each sequential string, you could do the following
input <- "abcxasdicfgaqwe"
spl_asc <- as.integer(charToRaw(input))
d1 <- diff(spl_asc) == 1
r <- rle(c(FALSE, d1) | c(d1, FALSE)) # Find boundaries
cm <- cumsum(c(1, r$lengths)) # Map these to string positions
substring(input, cm[-length(cm)], cm[-1] - 1)[r$values] # Extract matching strings
Finally, I had to come up with a way to use regex:
input <- c("abcxasdicfgaqwe", "xufasiuxaboqdasdij", "abcikmcapnoploDEFgnm",
"acfhgik")
(rg <- paste0("(", paste0(c(letters[-26], LETTERS[-26]),
"(?=", c(letters[-1], LETTERS[-1]), ")", collapse = "|"), ")+."))
#[1] "(a(?=b)|b(?=c)|c(?=d)|d(?=e)|e(?=f)|f(?=g)|g(?=h)|h(?=i)|i(?=j)|j(?=k)|
#k(?=l)|l(?=m)|m(?=n)|n(?=o)|o(?=p)|p(?=q)|q(?=r)|r(?=s)|s(?=t)|t(?=u)|u(?=v)|
#v(?=w)|w(?=x)|x(?=y)|y(?=z)|A(?=B)|B(?=C)|C(?=D)|D(?=E)|E(?=F)|F(?=G)|G(?=H)|
#H(?=I)|I(?=J)|J(?=K)|K(?=L)|L(?=M)|M(?=N)|N(?=O)|O(?=P)|P(?=Q)|Q(?=R)|R(?=S)|
#S(?=T)|T(?=U)|U(?=V)|V(?=W)|W(?=X)|X(?=Y)|Y(?=Z))+."
regmatches(input, gregexpr(rg, input, perl = TRUE))
#[[1]]
#[1] "abc" "fg"
#
#[[2]]
#[1] "ab" "ij"
#
#[[3]]
#[1] "abc" "nop" "DEF"
#
#[[4]]
#character(0)
This regular expression will identify consecutive upper or lower case letters (but not mixed case). As demonstrated, it works for character vectors and produces a list of vectors with all the matches identified. If no match is found, the output is character(0).

Using factor integer conversion:
input <- "hjubcdepyvb"
d1 <- diff(as.integer(factor(unlist(strsplit(input, "")), levels = letters))) == 1
filt <- c(FALSE, d1) | c(d1, FALSE)
paste(unlist(strsplit(input, ""))[filt], collapse = "")
# [1] "bcde"

myf = function(x){
x = unlist(strsplit(x, ""))
ind = charmatch(x, letters)
d = c(0, diff(ind))
d[d !=1] = 0
d = d + c(sapply(1:(length(d)-1), function(i) {
ifelse(d[i] == 0 & d[i+1] == 1, 1, 0)
}
), 0)
d = split(seq_along(d)[d!=0], with(rle(d), rep(seq_along(values), lengths))[d!=0])
return(sapply(d, function(a) paste(x[a], collapse = "")))
}
myf(x = "hjubcdepyvblltpqrs")
# 2 4
#"bcde" "pqrs"

Related

How to iterate entries in a function to create two new character vectors

I am struggling to separate a single string input into a series of inputs. The user gives a list of FASTA formatted sequences (see example below). I'm able to separate the inputs into their own
ex:
">Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
.>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
"
[1] "Rosalind_6404CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG"
[2] "Rosalind_5959CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC"
But I am struggling to find a way to create a function that splits the "Rosalind_6404" from the gene sequence to the unknown amount of FASTA sequences while creating new vectors for the split elements.
Ultimately, the result would look something such as:
.> "Rosalind_6404" "Rosalind5959"
.> "CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG","CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC"
I was hoping the convert_entries function would allow me to iterate over all the elements of the prepped_s character vector and split the elements into two new vectors with the same index number.
s <- ">Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC"
split_s <- strsplit(s, ">")
ul_split_s<- unlist(split_s)
fixed_s <- gsub("\n","", ul_split_s)
prepped_s <- fixed_s[-1]
prepped_s
nchar(prepped_s[2])
print(prepped_s[2])
entry_tags <- list()
entry_seqs <- list()
entries <- length(prepped_s)
unlist(entries)
first <- prepped_s[1]
convert_entries <- function() {
for (i in entries) {
tag <- substr(prepped_s[i], start = 1, stop = 13)
entry_tags <- append(entry_tags, tag)
return(entry_tags)
}
}
entry_tags <- convert_entries()
print(entry_tags)
Please help in any way you can, thanks!
One option with tidyverse
library(dplyr)
library(tidyr)
library(stringr)
tibble(col1 = s) %>%
separate_rows(col1, sep="\n") %>%
group_by(grp = cumsum(str_detect(col1, '^>'))) %>%
summarise(prefix = first(col1),
col1 = str_c(col1[-1], collapse=""), .groups = 'drop') %>%
select(-grp)
-output
# A tibble: 2 x 2
prefix col1
<chr> <chr>
1 >Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG
2 >Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC
Using seqinr package:
library(seqinr)
# example fasta file
write(">Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC", "myFile.fasta")
# read the fasta file
x <- read.fasta("myFile.fasta", as.string = TRUE, forceDNAtolower = FALSE)
# get the names
names(x)
# [1] "Rosalind_6404" "Rosalind_5959"
# get the seq
x$Rosalind_6404
# [1] "CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG"
# attr(,"name")
# [1] "Rosalind_6404"
# attr(,"Annot")
# [1] ">Rosalind_6404"
# attr(,"class")
# [1] "SeqFastadna"
In base R you could do:
t(gsub('\n', '', regmatches(s, gregexec("([A-Z][a-z_0-9]+)\n([A-Z\n]+)", s))[[1]][-1,]))
[,1] [,2]
[1,] "Rosalind_6404" "CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG"
[2,] "Rosalind_5959" "CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC"
NOTE: I transposed the matrix so that you may vie the results. Ignore the use of t function
Another base R solution:
read.table(text=sub('\n', ' ', gsub('(\\D)\n', '\\1', unlist(strsplit(s, '>')))))
V1 V2
1 Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG
2 Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC
or even
proto <- data.frame(name = character(), value = character())
new_s <- gsub('\n', '', unlist(strsplit(s, '>')))
strcapture("([A-Z][a-z_0-9]+)([A-Z]+)", grep('\\w', new_s, value = T), proto)
name value
1 Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG
2 Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC

How can I use lapply, sapply or apply to filter a data frame in R?

I am trying to remove all field that does not contain 10 digit numbers and those that have 10 zeros, I want to achieve this with the lapply or sapply or apply function. my code below does not work:
lapply(df, function(x) filter(x %like% "^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]" | !x %in% "0000000000"))
10 zeroes are part of 10-digit numbers so you don't need to test for them separately.
df <- data.frame(a = c('123456789', '123456789', '123'),
b = c('0000000000', '2345', '1234'))
result <- lapply(df, function(x) grep('\\d{10}', x, value = TRUE, invert = TRUE))
#$a
#[1] "123456789" "123456789" "123"
#$b
#[1] "2345" "1234"
You can also use nchar to count number of characters.
result <- lapply(df, function(x) x[nchar(x) != 10])

Convert data.frame to list of lists

I am trying to figure out how to convert a data.frame to a list of lists. Suppose I had (feel free to modify this if you need to capture more attributes for later):
v <- list(
row1=list(col1 = as.Date("2011-01-23"), col2="A"),
row2=list(col1 = as.Date("2012-03-03"), col2="B"))
d <- do.call(rbind, lapply(v, as.data.frame))
d$col3 <- 2
How do I get d back to a list of lists (similar to v). The end result should be equivalent to the result of:
vr <- list(
row1=list(col1 = as.Date("2011-01-23"), col2="A", col3=2),
row2=list(col1 = as.Date("2012-03-03"), col2="B", col3=2))
You can do
out <- lapply(split(d, rownames(d)), as.list)
out
#$row1
#$row1$col1
#[1] "2011-01-23"
#$row1$col2
#[1] "A"
#$row1$col3
#[1] 2
#$row2
#$row2$col1
#[1] "2012-03-03"
#$row2$col2
#[1] "B"
#$row2$col3
#[1] 2
If you add stringsAsFactors = FALSE when creating d, i.e.
d <- do.call(rbind, lapply(v, as.data.frame, stringsAsFactors = FALSE))
d$col3 <- 2
then
identical(out, vr)
returns TRUE.
You have to go through the columns again making them lists before you pass them as values of the element of the main list. I hope the below code helps:
apply(d,MARGIN = 1, FUN=function(x){as.list(x)})

Select the last n characters in a string

I have the following dataset
df <-data.frame(fact=c("a,bo,v", "c,b,v,d", "c"))
I wish to select the last two items for each row. So, Ideally I wish to have this output:
fact
1 bo,v
2 v,d
3 c
I've tried to split the rows and then choose the last two items:
spl <- strsplit(as.character(df$fact), split = ",")
tail(spl[[1]], n=2)
But doe not give me the correct results
You can do this:
lapply(lapply(strsplit(as.character(df$fact), split = ','), function(x) x[c(length(x)-1,length(x))]), paste, collapse = ',')
You split the col and then extract the n and n-1 index. Then paste them together.
You can generalise this for by doing:
lapply(strsplit(as.character(df$fact), split = ','), function(x) x[(length(x)-n):length(x)] )
where n is no of backward steps you want to take.
Using tail is even simpler.
lapply(strsplit(as.character(df$fact), split = ','), tail, n=2)
We can use sapply to loop over every element of fact, split it on basis of , and then select the last n elements using tail
n <- 2
sapply(as.character(df$fact), function(x) {
temp = unlist(strsplit(x, ','))
tail(temp, n)
}, USE.NAMES = F)
#[[1]]
#[1] "bo" "v"
#[[2]]
#[1] "v" "d"
#[[3]]
#[1] "c"
A better option with dplyr I feel using rowwise
library(dplyr)
df %>%
rowwise() %>%
mutate(last_two = paste0(tail(unlist(strsplit(as.character(fact),",")), n),
collapse = ","))
# fact last_two
# <fctr> <chr>
#1 a,bo,v bo,v
#2 c,b,v,d v,d
#3 c c

How to sort characters in each element of a character vector? [duplicate]

This question already has answers here:
Closed 11 years ago.
Possible Duplicate:
How to sort letters in a string in R?
I have a dataframe where a variable is a character string. Is there a way to create another variable having the same elements as x, but each sorted in ascending or descending order as below:
x_old: (trad, jfwd, qerf...)
x_new: (adrt, dfjw, efqr...)
Using the dummy data:
strs <- c("trad", "jfwd", "qerf")
You can do this with a series of steps over the character vector:
sapply( ## 3
lapply( ## 2
sapply(strs, strsplit, ""), ## 1
sort), ## 2
paste, collapse = "") ## 3
which gives:
> sapply(lapply(sapply(strs, strsplit, ""), sort), paste, collapse = "")
trad jfwd qerf
"adrt" "dfjw" "efqr"
Where in the function, ## 1 splits each element of the character vector into single characters, ## 2 sorts these sets of characters, and ## 3 pastes them back together again.
We can do this in a single step by encapsulating the steps into a function:
foo <- function(x) {
x <- strsplit(x, split = "")[[1]]
x <- sort(x)
paste(x, collapse = "")
}
which can be used as:
> sapply(strs, foo)
trad jfwd qerf
"adrt" "dfjw" "efqr"
There must be an easier way:
x <- c("trad", "jfwd", "qerf")
unname(sapply(x, function(i)paste(sort(unlist(strsplit(i, ""))), collapse="")))
[1] "adrt" "dfjw" "efqr"

Resources