I would like to remove the third- and fourth-to-last characters from a string.
Here is some sample data:
HS0202
HS0902
MV0100
SUE0300
I would need return something like this
HS02
HS02
MV00
SUE00
Using a regex with gsub():
gsub("..(?=..$)", "", chrs, perl = TRUE)
# "HS02" "HS02" "MV00" "SUE00"
Or with stringr::str_remove():
library(stringr)
str_remove(chrs, "..(?=..$)")
# "HS02" "HS02" "MV00" "SUE00"
You can extract from first to fourth last characters, and paste on the final character as follows:
have <- c("HS0202", "HS0902", "MV0100", "SUE0300")
# Drop the third- and fourth-to-last characters: keep everything that comes
# before the fourth-to-last character, then append the last two characters.
# (Ending the first piece at nchar(have) - 3 and appending only the final
# character removes the second- and third-to-last characters instead; that
# merely looks right on this sample because those positions all hold "0".)
want <- paste0(
  substring(have, 1, nchar(have) - 4),
  substring(have, nchar(have) - 1)
)
Using stringi:
library(stringi)
stri_sub_replace(x, from = -4, to = -3, value = "")
[1] "HS02" "HS02" "MV00" "SUE00"
Or with stringr:
library(stringr)
str_sub(x, start = -4, end = -3) <- ""
Data
x <- c("HS0202", "HS0902", "MV0100", "SUE0300")
Related
One string with 25 xy as patterns and a 25 long vector that should replace those 25 xy.
This is not for programming or anything complicated; I just wish to get a result which I can copy and then paste into a forum that uses this BBCode inside the string to make a colorful line.
string <- "[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]__[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]"
colrs <- c( "08070D", "100F1A", "191627", "211E34", "292541", "312D4E", "39345B", "413C68", "4A4375", "524A82", "5A528E", "62599B", "6C64A6", "7971AE", "827BB3", "8C85B9", "958FBF", "9F99C4", "A9A3CA", "B2AED0", "BCB8D6", "C5C2DC", "CFCCE2", "D9D6E8", "E2E0ED")
and want this as a result
[COLOR="#08070D"]_[COLOR="#100F1A"]_[COLOR="#191627"]_[COLOR="#211E34"]_[COLOR="#292541"]_[COLOR="#312D4E"]_[COLOR="#39345B"]_[COLOR="#413C68"]_[COLOR="#4A4375"]_[COLOR="#524A82"]_[COLOR="#5A528E"]_[COLOR="#62599B"]_[COLOR="#6C64A6"]_[COLOR="#7971AE"]_[COLOR="#827BB3"]_[COLOR="#8C85B9"]_[COLOR="#958FBF"]_[COLOR="#9F99C4"]_[COLOR="#A9A3CA"]_[COLOR="#B2AED0"]_[COLOR="#BCB8D6"]_[COLOR="#C5C2DC"]_[COLOR="#CFCCE2"]_[COLOR="#D9D6E8"]_[COLOR="#E2E0ED"]__[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]
I have completely revised my answer given that you removed the previous iteration of your code example. Here's the revised solution:
string <- '[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]__[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]'
colrs <- c("08070D", "100F1A", "191627", "211E34", "292541", "312D4E", "39345B", "413C68", "4A4375", "524A82", "5A528E", "62599B", "6C64A6", "7971AE", "827BB3", "8C85B9", "958FBF", "9F99C4", "A9A3CA", "B2AED0", "BCB8D6", "C5C2DC", "CFCCE2", "D9D6E8", "E2E0ED")
library(stringr)
string0 <- string |>
str_split("xy") |>
unlist()
string0[seq_along(colrs)] |>
str_c(colrs, collapse = "") |>
str_c(string0[length(colrs)+1])
[1] "[COLOR=\"#08070D\"]_[COLOR=\"#100F1A\"]_[COLOR=\"#191627\"]_[COLOR=\"#211E34\"]_[COLOR=\"#292541\"]_[COLOR=\"#312D4E\"]_[COLOR=\"#39345B\"]_[COLOR=\"#413C68\"]_[COLOR=\"#4A4375\"]_[COLOR=\"#524A82\"]_[COLOR=\"#5A528E\"]_[COLOR=\"#62599B\"]_[COLOR=\"#6C64A6\"]_[COLOR=\"#7971AE\"]_[COLOR=\"#827BB3\"]_[COLOR=\"#8C85B9\"]_[COLOR=\"#958FBF\"]_[COLOR=\"#9F99C4\"]_[COLOR=\"#A9A3CA\"]_[COLOR=\"#B2AED0\"]_[COLOR=\"#BCB8D6\"]_[COLOR=\"#C5C2DC\"]_[COLOR=\"#CFCCE2\"]_[COLOR=\"#D9D6E8\"]_[COLOR=\"#E2E0ED\"]__[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]"
EDIT #2:
An easy solution to the new data problem is this:
library(stringr)
string0 <- unlist(str_split(gsub('"', "", string), "__?"))
str_c(str_replace(string0,'xy', colrs), collapse = "_")
[1] "[COLOR=#08070D]_[COLOR=#100F1A]_[COLOR=#191627]_[COLOR=#211E34]_[COLOR=#292541]_[COLOR=#312D4E]_[COLOR=#39345B]_[COLOR=#413C68]_[COLOR=#4A4375]_[COLOR=#524A82]_[COLOR=#5A528E]_[COLOR=#62599B]_[COLOR=#6C64A6]_[COLOR=#7971AE]_[COLOR=#827BB3]_[COLOR=#8C85B9]_[COLOR=#958FBF]_[COLOR=#9F99C4]_[COLOR=#A9A3CA]_[COLOR=#B2AED0]_[COLOR=#BCB8D6]_[COLOR=#C5C2DC]_[COLOR=#CFCCE2]_[COLOR=#D9D6E8]_[COLOR=#E2E0ED]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]"
EDIT:
Given this data:
string <- "AxyBxyCxyDxyExy"
vector <- c(1,2,3,4,5)
and this desired result:
"A1B2C3D4E5"
you can do this:
library(stringr)
First, we extract the character that's before xy using str_extract_all:
string0 <- unlist(str_extract_all(string, ".(?=xy)"))
Next we do two things: a) we replace the lone character with itself (\\1) AND the vector value, and b) we collapse the separate strings into one large string using str_c:
str_c(str_replace(string0, "(.)$", str_c("\\1", vector)), collapse = "")
[1] "A1B2C3D4E5"
I have sequences that I want to reformat in R.
But R presents the sequences to me with line breaks (as can be seen from the "\n"):
seqs <- ">PRTRE213-13 Volkameria aculeatum matK \n------------------------------------------------------------------CCAAC\nCGAGAGCCAGCTCC------TCTTTTTCAAAA---------CGAAAT---------------------CAA\nAAGACTATTCTTATTCTTATAT------------AATTCTCATGTATGTGAATATGAATCCGTTTTCGTCT\nTTTCTACGTAACCAATCTTTT---CATTTACGATCAACATCTTTTGAAGTTCTTCTTGAACGAATCTATTT\nTCTATGTA---------AAAGTAGAACGTCTT------GTGAACGTCTTTGTTAAGATTAAC---------\n-AATTTTCGGGCGAACCCGTGGTTGGTCAAG------GAACCTTTCATGCATTATATTAGGTATCAAAGAA\nAGATCCATTCTGGCTTCA------AAGGGAACATCTTTTTTCATGAAAAAATGGCAATTTTATCTTGTCAC\nCTTTTTGGCAATGGCATTTTTCGCTGTGGTTTCATCCAAGAAGGATTTATCTAAAC---CAATTATCCAAT\nTTATTCCCTTGAA------TTTTTGGGCTATCTTTCA------AGCGTGCGAATGAACCCCTCTGTGGTAC\nCGGAGTCAAATTCTAGAAAATGCATTTCTAATCAATAATGCTATT------AAGAAGTTTGATACCCTTAT\nTTCCAATTATTCCAATGATTGCGTCATTGGCTAAAGCGAAATTTTGTAACGTATTTGGGCATCCTGTTAGT\nTAAGCCGATTTGGGCTGATTTATCAGATTCTAATATTATTGACCGATTTGGTCGTATA---TGCAGAAATC\nCTTTCTC-------------"
But I want to remove all the \n except the first one, i.e.:
[1] ">PRTRE213-13 Volkameria aculeatum matK \n------------------------------------------------------------------CCAACCGAGAGCCAGCTCC------TCTTTTTCAAAA---------CGAAAT---------------------CAAAAGACTATTCTTATTCTTATAT------------AATTCTCATGTATGTGAATATGAATCCGTTTTCGTCTTTTCTACGTAACCAATCTTTT---CATTTACGATCAACATCTTTTGAAGTTCTTCTTGAACGAATCTATTTTCTATGTA---------AAAGTAGAACGTCTT------GTGAACGTCTTTGTTAAGATTAAC----------AATTTTCGGGCGAACCCGTGGTTGGTCAAG------GAACCTTTCATGCATTATATTAGGTATCAAAGAAAGATCCATTCTGGCTTCA------AAGGGAACATCTTTTTTCATGAAAAAATGGCAATTTTATCTTGTCACCTTTTTGGCAATGGCATTTTTCGCTGTGGTTTCATCCAAGAAGGATTTATCTAAAC---CAATTATCCAATTTATTCCCTTGAA------TTTTTGGGCTATCTTTCA------AGCGTGCGAATGAACCCCTCTGTGGTACCGGAGTCAAATTCTAGAAAATGCATTTCTAATCAATAATGCTATT------AAGAAGTTTGATACCCTTATTTCCAATTATTCCAATGATTGCGTCATTGGCTAAAGCGAAATTTTGTAACGTATTTGGGCATCCTGTTAGTTAAGCCGATTTGGGCTGATTTATCAGATTCTAATATTATTGACCGATTTGGTCGTATA---TGCAGAAATCCTTTCTC-------------"
If I do this, it removes all of the returns.
gsub(pattern = "\n",replacement = "", x = seqs)
This is not working:
sub("^(.*? \n .*?) \n .*", "\\1", seqs)
This gives me an error:
gsub(pattern = "${'\n'[*]:0:2}",replacement = "", x = seqs)
Error in gsub(pattern = "${'\n'[*]:0:2}", replacement = "", x = seqs) :
invalid regular expression '${'
'[*]:0:2}', reason 'Invalid contents of {}'
My sequences are variable:
">Whatever here before \n the sequence start \n the rest \n..."
The end result would be
">Whatever here before \n the sequence start the rest..."
Interestingly, the code below partially works for the test sentence, but not the sequence above:
seqss = ">Whatever here before \n the sequence start \n the rest \n..."
sub("^(.*? \n .*?) \n .*", "\\1", seqss)
[1] ">Whatever here before \n the sequence start"
Try it like this:
seqs <- ">PRTRE213-13 Volkameria aculeatum matK \n------------------------------------------------------------------CCAAC\nCGAGAGCCAGCTCC------TCTTTTTCAAAA---------CGAAAT---------------------CAA\nAAGACTATTCTTATTCTTATAT------------AATTCTCATGTATGTGAATATGAATCCGTTTTCGTCT\nTTTCTACGTAACCAATCTTTT---CATTTACGATCAACATCTTTTGAAGTTCTTCTTGAACGAATCTATTT\nTCTATGTA---------AAAGTAGAACGTCTT------GTGAACGTCTTTGTTAAGATTAAC---------\n-AATTTTCGGGCGAACCCGTGGTTGGTCAAG------GAACCTTTCATGCATTATATTAGGTATCAAAGAA\nAGATCCATTCTGGCTTCA------AAGGGAACATCTTTTTTCATGAAAAAATGGCAATTTTATCTTGTCAC\nCTTTTTGGCAATGGCATTTTTCGCTGTGGTTTCATCCAAGAAGGATTTATCTAAAC---CAATTATCCAAT\nTTATTCCCTTGAA------TTTTTGGGCTATCTTTCA------AGCGTGCGAATGAACCCCTCTGTGGTAC\nCGGAGTCAAATTCTAGAAAATGCATTTCTAATCAATAATGCTATT------AAGAAGTTTGATACCCTTAT\nTTCCAATTATTCCAATGATTGCGTCATTGGCTAAAGCGAAATTTTGTAACGTATTTGGGCATCCTGTTAGT\nTAAGCCGATTTGGGCTGATTTATCAGATTCTAATATTATTGACCGATTTGGTCGTATA---TGCAGAAATC\nCTTTCTC-------------"
gsub(pattern = "(^.*?\\n)|\\n",replacement = "\\1", x = seqs, perl = TRUE)
regex101 demo
The idea of the regex
(^.*?\\n)|\\n
is to capture everything up to the first newline in a group to retain and put it back in the replacement.
A stringr approach that simply splits the string and combines the two parts back together.
seqs <- ">PRTRE213-13 Volkameria aculeatum matK \n------------------------------------------------------------------CCAAC\nCGAGAGCCAGCTCC------TCTTTTTCAAAA---------CGAAAT---------------------CAA\nAAGACTATTCTTATTCTTATAT------------AATTCTCATGTATGTGAATATGAATCCGTTTTCGTCT\nTTTCTACGTAACCAATCTTTT---CATTTACGATCAACATCTTTTGAAGTTCTTCTTGAACGAATCTATTT\nTCTATGTA---------AAAGTAGAACGTCTT------GTGAACGTCTTTGTTAAGATTAAC---------\n-AATTTTCGGGCGAACCCGTGGTTGGTCAAG------GAACCTTTCATGCATTATATTAGGTATCAAAGAA\nAGATCCATTCTGGCTTCA------AAGGGAACATCTTTTTTCATGAAAAAATGGCAATTTTATCTTGTCAC\nCTTTTTGGCAATGGCATTTTTCGCTGTGGTTTCATCCAAGAAGGATTTATCTAAAC---CAATTATCCAAT\nTTATTCCCTTGAA------TTTTTGGGCTATCTTTCA------AGCGTGCGAATGAACCCCTCTGTGGTAC\nCGGAGTCAAATTCTAGAAAATGCATTTCTAATCAATAATGCTATT------AAGAAGTTTGATACCCTTAT\nTTCCAATTATTCCAATGATTGCGTCATTGGCTAAAGCGAAATTTTGTAACGTATTTGGGCATCCTGTTAGT\nTAAGCCGATTTGGGCTGATTTATCAGATTCTAATATTATTGACCGATTTGGTCGTATA---TGCAGAAATC\nCTTTCTC-------------"
library(stringr)
#' Keep only the first newline of each string
#'
#' Splits every element right after its first "\n", strips all remaining
#' newlines from the part that follows, and glues the two parts back together.
#'
#' @param string A character vector.
#' @return A character vector the same length as `string`. Elements that
#'   contain no newline (including `NA`) are returned unchanged.
keep_first_newline <- function(string) {
  # Take the whole "start" column so that every element gets its own split
  # point; indexing the match matrix with [1] would reuse the first element's
  # position for all inputs.
  first_newline <- str_locate(string, "\\n")[, "start"]
  has_newline <- !is.na(first_newline)

  split_at <- first_newline[has_newline]
  header <- str_sub(string[has_newline], end = split_at)
  body <- str_remove_all(
    str_sub(string[has_newline], start = split_at + 1),
    "\\n"
  )

  # Elements without a newline have nothing to strip, so they pass through.
  out <- string
  out[has_newline] <- str_c(header, body)
  out
}
seqs %>%
keep_first_newline %>%
writeLines
#> >PRTRE213-13 Volkameria aculeatum matK
#> ------------------------------------------------------------------CCAACCGAGAGCCAGCTCC------TCTTTTTCAAAA---------CGAAAT---------------------CAAAAGACTATTCTTATTCTTATAT------------AATTCTCATGTATGTGAATATGAATCCGTTTTCGTCTTTTCTACGTAACCAATCTTTT---CATTTACGATCAACATCTTTTGAAGTTCTTCTTGAACGAATCTATTTTCTATGTA---------AAAGTAGAACGTCTT------GTGAACGTCTTTGTTAAGATTAAC----------AATTTTCGGGCGAACCCGTGGTTGGTCAAG------GAACCTTTCATGCATTATATTAGGTATCAAAGAAAGATCCATTCTGGCTTCA------AAGGGAACATCTTTTTTCATGAAAAAATGGCAATTTTATCTTGTCACCTTTTTGGCAATGGCATTTTTCGCTGTGGTTTCATCCAAGAAGGATTTATCTAAAC---CAATTATCCAATTTATTCCCTTGAA------TTTTTGGGCTATCTTTCA------AGCGTGCGAATGAACCCCTCTGTGGTACCGGAGTCAAATTCTAGAAAATGCATTTCTAATCAATAATGCTATT------AAGAAGTTTGATACCCTTATTTCCAATTATTCCAATGATTGCGTCATTGGCTAAAGCGAAATTTTGTAACGTATTTGGGCATCCTGTTAGTTAAGCCGATTTGGGCTGATTTATCAGATTCTAATATTATTGACCGATTTGGTCGTATA---TGCAGAAATCCTTTCTC-------------
Created on 2018-06-29 by the reprex package (v0.2.0).
Using gsubfn, we can do the following. It is not any better than the regex in this case, but it is more easily extended if you want to keep the first n occurrences with n > 1.
library(gsubfn)
p <- proto(fun = function(this, x) if(count > 1) '' else x)
out <- gsubfn('\n', p, seqs)
Same as accepted answer
out == gsub(pattern = "(^.*?\\n)|\\n",replacement = "\\1", x = seqs, perl = TRUE)
#[1] TRUE
I have a question that is how to replace a character which is in a certain place. For example:
str <- c("abcdccc","hijklccc","abcuioccc")
#I want to replace character "c" which is in position 3 to "X" how can I do that?
#I know the function gsub and substr, but the only idea I have got so far is
#use if() to make it. How can I do it quickly?
#ideal result
>str
"abXdccc" "hijklccc" "abXuioccc"
It's a bit awkward, but you can replace a single character dependent on that single character's value like:
ifelse(substr(str,3,3)=="c", `substr<-`(str,3,3,"X"), str)
#[1] "abXdccc" "hijklccc" "abXuioccc"
If you are happy to overwrite the value, you could do it a bit cleaner:
substr(str[substr(str,3,3)=="c"],3,3) <- "X"
str
#[1] "abXdccc" "hijklccc" "abXuioccc"
I wonder if you can use a regex lookahead here to get what you are after.
str <- c("abcdccc", "hijklccc", "abcuioccc")
# Group 1 captures the first two characters and the lookahead checks that the
# next character is "c". A lookahead is zero-width, so the "." after it is what
# actually consumes that "c"; without it the "c" would land in group 2 and the
# "X" would be inserted in front of it instead of replacing it.
sub("^(.{2})(?=c).(.*)$", "\\1X\\2", str, perl = TRUE)
Or using a positive lookbehind as suggested by thelatemail
sub("(?<=^.{2})c", "X", str, perl = TRUE)
What this is doing is looking to match the letter c which is after any two characters from the start of the string. The c is replaced with X.
(?<= is the start of positive lookbehind
^.{2} means any two characters from the start of the string
)c is the last part which says it has to be a c after the two characters
[1] "abXdccc" "hijklccc" "abXuioccc"
If you want to read up more about regex being used (link)
Additionally a generalised function:
#' Replace a letter when it occurs at a given position
#'
#' @param x A character vector.
#' @param letter The text to look for at `position`. It is pasted into the
#'   regular expression as-is, so regex metacharacters keep their meaning.
#' @param position A single whole number >= 1: the 1-based character position
#'   at which `letter` must start.
#' @param replacement The replacement, passed to `sub()` unchanged.
#' @return `x` with `letter` replaced wherever it sits at `position`; other
#'   elements are returned unchanged.
switch_letter <- function(x, letter, position, replacement) {
  stopifnot(
    is.numeric(position),
    length(position) == 1L,
    !is.na(position),
    position >= 1,
    position %% 1 == 0
  )
  # The lookbehind skips the (position - 1) characters in front of the target.
  # Position 1 has nothing in front of it, so only the "^" anchor remains.
  lead <- if (position > 1) paste0(".{", position - 1, "}") else ""
  pattern <- paste0("(?<=^", lead, ")", letter)
  sub(pattern, replacement, x, perl = TRUE)
}
switch_letter(str, "c", 3, "X")
This should work too:
str <- c("abcdefg", "hijklnm", "abcuiowre")
# Explode the first string into single characters, overwrite the third one,
# then glue the characters back together and store the result in place.
a <- unlist(strsplit(str[1], split = ""))
a <- replace(a, 3, "X")
a <- paste0(a, collapse = "")
str[1] <- a
How about this idea:
# Replace a "c" in position 3 with "X"; any other third character is kept.
# The string is rebuilt from three pieces: characters 1-2, the (possibly
# substituted) character 3, and everything from character 4 onwards. The
# prefix must stop at 2 -- ending it at 3 would emit the third character twice.
c2Xon3 <- function(x) {
  sprintf(
    "%s%s%s",
    substring(x, 1, 2),
    gsub("c", "X", substring(x, 3, 3)),
    substring(x, 4, nchar(x))
  )
}
str <- c("abcdccc","hijklccc","abcuioccc")
strNew <- sapply(str,c2Xon3 )
This should work
str <- c("abcdefg", "hijklnm", "abcuiowre")
# seq_along() yields an empty sequence for an empty vector, whereas
# 1:length(str) would iterate over c(1, 0).
for (i in seq_along(str)) {
  # Only overwrite position 3 when it currently holds a "c".
  if (substr(str[i], 3, 3) == "c") {
    substr(str[i], 3, 3) <- "X"
  }
}
You can just use ifelse with substring and paste0, i.e.
ifelse(substr(str, 3, 3) == 'c', paste0(substring(str, 1, 2),'X', substring(str, 4)), str)
#[1] "abXdccc" "hijklccc" "abXuioccc"
str1 <- c("youaremyfriend","youarethebest")
str2 <- c("ABCDEFGHIJKLMN","thisismydata","thereisabird")
df <-c(str1,str2)
# the ideal result
"youar" "youa" "ABCDE" "this" "ther"
Hello, I want to extract the first one-third of string from all elements of df. For example, "youaremyfriend" the length is 14,so the result should be "youar". How can I do that? Thank you :)
If you have a string, you can use built-in functions nchar and substring (no libraries needed)
str1 <- "youaremyfriend"
str1_length <- nchar(str1)
# One third of the length, rounded to the nearest whole number of characters
# (14 / 3 rounds to 5).
substring_wanted <- substr(str1, 1, round(str1_length / 3))
"youar"
With stringr this can be done like this:
library(stringr)
str1 <- c("youaremyfriend","youarethebest")
str2 <- c("ABCDEFGHIJKLMN","thisismydata","thereisabird")
string <- c(str1, str2)
str_sub(string, start = 1, end = round(str_length(string)/3))
# [1] "youar" "youa" "ABCDE" "this" "ther"
Another solution in base R:
str1 <- c("youaremyfriend", "youarethebest", "BBBAAAKKK")
str2 <- c("ABCDEFGHIJKLMN", "thisismydata", "thereisabird")
df <- data.frame(str1, str2)
# Trim every cell to one third of its length. strtrim() truncates a fractional
# width, so round explicitly: a 14-character string should keep 5 characters
# ("youar"), not 4. c() flattens the resulting matrix column by column.
c(apply(df, c(1, 2), function(s) {
  strtrim(s, width = round(nchar(s) / 3))
}))
Your first column was missing an entry so I added one to construct the dataframe.
I have a data frame in which several cells contain a known special character sequence. An example of the structure:
df = data.frame(col_1 = c("21 myspec^ch2 12",NA),
col_2 = c("1 myspec^ch2 4","4 myspec^ch2 212"))
The character is this myspec^ch2 and I would like to replace with -. An example of expected output:
df = data.frame(col_1 = c("21-12",NA),
col_2 = c("1-4","4-212"))
I tried this but it is not working:
df [ df == " myspec^ch2 " ] <- "-"
To get gsub work on whole dataframe use apply:
apply(df, 2, function(x) gsub(" myspec\\^ch2 ", "-", x))
You really want to do a regex-style substitution here. However, in regex, ^ is seen as the beginning of the line (rather than a literal caret). So you can do something like this (using the stringr package):
library(dplyr)
library(stringr)
# Apply the substitution to every column. The caret is escaped ("\\^") so it
# is matched literally rather than as a start-of-line anchor; NA cells stay NA.
fixed_df <- df %>%
  mutate(across(
    everything(),
    \(x) str_replace_all(x, " myspec\\^ch2 ", "-")
  ))
Note the double backslashes in front of the caret--that escapes the caret and tells R to interpret it literally, rather than as the beginning of the line.