I have the following dataframe:
# Example data: an integer id column `a` and a character column `b` whose
# values are underscore-delimited fields.
a <- 1:5
b <- c("abc_a_123456_defghij_1", "abc_a_78912_abc_2", "abc_a_345678912_xyzabc_3",
"abc_b_34567_defgh_4", "abc_c_891234556778_ijklmnop_5")
# stringsAsFactors = FALSE keeps `b` as character on every R version, so no
# factor -> character conversion is needed afterwards.
df <- data.frame(a, b, stringsAsFactors = FALSE)
And I need to extract the numbers in df$b that come between the second and third underscores and assign to df$c.
I'm guessing there's a fairly simple solution, but haven't found it yet. The actual dataset is fairly large (3MM rows) so efficiency is a bit of a factor.
Thanks for the help!
We can use sub to match zero or more characters that are not a _ ([^_]*) from the start (^) of the string followed by an underscore (_), then one or more characters that are not an underscore ([^_]+) followed by an underscore, capture the one or more digits that follow in a group ((\\d+)) followed by an underscore and any other characters (.*), then replace the whole match with the backreference for that group (\\1) and finally convert the result to numeric
as.numeric(sub("^[^_]*_[^_]+_(\\d+)_.*", "\\1", df$b))
#[1] 123456 78912 345678912 34567 891234556778
Create a my_split function that finds the positions of "_" using gregexpr, then extracts the string between the start-th and end-th underscore using substr.
# Extract the text lying between two occurrences of "_" in each string.
#
# x:     character vector of underscore-delimited strings.
# start: index of the underscore after which the extracted text begins.
# end:   index of the underscore before which the extracted text ends.
#
# Returns a character vector the same length as `x`. The underscore positions
# are looked up separately for every element, so the function can be called
# directly on a whole column as well as through sapply()/lapply().
my_split <- function(x, start, end){
  # gregexpr() returns one vector of match positions per element of x
  pos <- gregexpr("_", x, fixed = TRUE)
  from <- vapply(pos, function(p) p[start], integer(1)) + 1L
  to <- vapply(pos, function(p) p[end], integer(1)) - 1L
  substr(x, from, to)
}
b <- c("abc_a_123456_defghij_1", "abc_a_78912_abc_2", "abc_a_345678912_xyzabc_3", "abc_b_34567_defgh_4", "abc_c_891234556778_ijklmnop_5")
sapply(b, my_split, start = 2, end = 3)
# abc_a_123456_defghij_1 abc_a_78912_abc_2
# "123456" "78912"
# abc_a_345678912_xyzabc_3 abc_b_34567_defgh_4
# "345678912" "34567"
# abc_c_891234556778_ijklmnop_5
# "891234556778"
using data.table library
library(data.table)
# vapply() returns a plain character vector, so `c` becomes an ordinary
# character column rather than the list column that lapply() would create.
setDT(df)[, c := vapply(b, my_split, character(1), start = 2, end = 3, USE.NAMES = FALSE)]
df
# a b c
# 1: 1 abc_a_123456_defghij_1 123456
# 2: 2 abc_a_78912_abc_2 78912
# 3: 3 abc_a_345678912_xyzabc_3 345678912
# 4: 4 abc_b_34567_defgh_4 34567
# 5: 5 abc_c_891234556778_ijklmnop_5 891234556778
data:
a <- seq(1:5)
b <- c("abc_a_123456_defghij_1", "abc_a_78912_abc_2", "abc_a_345678912_xyzabc_3", "abc_b_34567_defgh_4", "abc_c_891234556778_ijklmnop_5")
df <- data.frame(a, b, stringsAsFactors = FALSE)
Related
How to write a function that accepts a DNA sequence (as a single string) and a number “n >= 2” and returns a vector with all DNA subsequences (as strings) that start with the triplet “AAA” or “GAA” and end with the triplet “AGT” and have at least 2 and at most “n” other triplets between the start and the end.
Q1:
for "GAACCCACTAGTATAAAATTTGGGAGTCCCAAACCCTTTGGGAGT" and for n=2,
the answer is c("GAACCCACTAGT", "AAATTTGGGAGT").
Q2:
e.g, n=10
the answer is: c("GAACCCACTAGTATAAAATTTGGGAGT", "AAACCCTTTGGGAGT")
here is a possible approach.
It uses a regex whose core is 2 to n repetitions of three [A-Z] characters.
library( stringr )
# Input sequence to scan.
dna <- "GAACCCACTAGTATAAAATTTGGGAGTCCCAAACCCTTTGGGAGT"
# Allowed opening triplets, the closing triplet, and the upper bound on the
# number of triplets allowed between them.
start <- c("AAA", "GAA")
end <- "AGT"
n <- 10 # << set as desired
# Assemble the pattern: the start alternatives in a group, then 2..n
# upper-case triplets, then the closing triplet.
regex <- sprintf("(%s)([A-Z]{3}){2,%s}%s", paste(start, collapse = "|"), n, end)
# for n = 10, this looks like: "(AAA|GAA)([A-Z]{3}){2,10}AGT"
stringr::str_extract_all( dna, regex )
# n = 2
# [[1]]
# [1] "GAACCCACTAGT" "AAATTTGGGAGT"
# n = 10
# [[1]]
# [1] "GAACCCACTAGTATAAAATTTGGGAGT" "AAACCCTTTGGGAGT"
How can I extract just the number between the parentheses () and before %?
df <- data.frame(X = paste0('(',runif(3,0,1), '%)'))
X
1 (0.746698269620538%)
2 (0.104987640399486%)
3 (0.864544949028641%)
For instance, I would like to have a DF like this:
X
1 0.746698269620538
2 0.104987640399486
3 0.864544949028641
We can use sub to match the ( (escaped with \\ because it is a metacharacter) at the start (^) of the string, followed by zero or more digits or periods ([0-9.]*) captured as a group ((...)), followed by % and any other characters (.*), and replace the match with the backreference (\\1) of the captured group
df$X <- as.numeric(sub("^\\(([0-9.]*)%.*", "\\1", df$X))
If it includes also non-numeric characters then
sub("^\\(([^%]*)%.*", "\\1", df$X)
Use substr, since you know you need to omit the first character and the last two characters:
> df <- data.frame(X = paste0('(',runif(3,0,1), '%)'))
> df
X
1 (0.393457352882251%)
2 (0.0288733830675483%)
3 (0.289543839870021%)
> df$X <- as.numeric(substr(df$X, 2, nchar(as.character(df$X)) - 2))
> df
X
1 0.39345735
2 0.02887338
3 0.28954384
I have a dataframe with strings such as:
id <- c(1,2)
x <- c("...14.....5.......................395.00.........................14.........1..",
"......114.99....................124.99................")
df <- data.frame(id,x)
df$x <- as.character(df$x)
How can I extract only values with a decimal point in between such as 395.00, 114.99 and 124.99 and not 14, 5, or 1 for each row, and put them in a new column separated by a comma?
The ideal result would be:
id x2
1 395.00
2 114.99,124.99
The number of periods separating the values is random.
library(stringr)
df$x2 = str_extract_all(df$x, "[0-9]+\\.[0-9]+")
df[c(1, 3)]
# id x2
# 1 1 395.00
# 2 2 114.99, 124.99
Explanation: [0-9]+ matches one or more digits, \\. matches a single decimal point. str_extract_all extracts all matches.
The new column is a list column, not a string with an inserted comma. This allows you access to the individual elements, if needed:
df$x2[2]
# [[1]]
# [1] "114.99" "124.99"
If you prefer a character vector as the column, do this:
df$x3 = sapply(str_extract_all(df$x, "[0-9]+\\.[0-9]+"), paste, collapse = ",")
df$x3[2]
#[1] "114.99,124.99"
In the example below, I would like to know the number of 010 sequences, or the number of 1010 sequences. Below is a workable example:
x <- c(1,0,0,1,0,0,0,1,1,1,0,0,1,0,1,0,1,0,1,0,1,0)
In this example, the number of 010 sequences would be 6 and the number of 1010 sequences would be 4.
What would be the most efficient/simplest way to count the number of consecutive sequences?
A stringless way:
# Find every position in `x` at which the pattern `patt` begins.
#
# x:    vector to search.
# patt: vector holding the consecutive values to look for.
#
# Returns an integer vector of start positions (overlapping matches included),
# so length(f(x, patt)) is the number of occurrences. The result is empty when
# `patt` is empty or longer than `x`.
f <- function(x, patt){
  if (length(patt) == 0L) return(integer(0))
  # candidate start positions: every index that leaves room for the full
  # pattern; seq_len() also covers a length-1 pattern and a too-short `x`
  n_starts <- length(x) - length(patt) + 1L
  w <- seq_len(max(n_starts, 0L))
  # keep only the candidates whose k-th element matches the k-th pattern value
  for (k in seq_along(patt)) w <- w[ x[w + k - 1L] == patt[k] ]
  w
}
length(f(x, patt = c(0,1,0))) # 6
length(f(x, patt = c(1,0,1,0))) # 4
Alternatives. From #cryo11, here's another way:
# embed() stores each window in reverse order, so it is compared against rev(patt); xor() coerces to logical, so this suits 0/1 data
function(x,patt) sum(apply(embed(x,length(patt)),1,function(win) all(!xor(win,rev(patt)))))
or another variation:
# each column of t(embed(...)) is one window in reverse order, hence rev(patt); a column with zero mismatches is a match
function(x,patt) sum(!colSums( xor(rev(patt), t(embed(x,length(patt)))) ))
or with data.table:
library(data.table)
setkey(setDT(shift(x, seq_along(patt), type = "lead")))[as.list(patt), .N]
(The shift function is very similar to embed.)
Another solution would be this:
library(stringr)
x <- c(1,0,0,1,0,0,0,1,1,1,0,0,1,0,1,0,1,0,1,0,1,0)
xx = paste0(x, collapse = "")
str_count(xx, '(?<=010)')
[1] 6
str_count(xx, '(?<=1010)')
[1] 4
As #Pierre Lafortune pointed out in the comments this can be done without using any packages:
# gregexpr() reports "no match" as a single -1, so count the positive positions instead of taking length()
sum(gregexpr("(?<=010)", xx, perl=TRUE)[[1]] > 0)
[1] 6
logic : take a substr of length of pattern you are searching for and compare it with the pattern.
xx <- paste0(x, collapse = "")
# [1] "1001000111001010101010"
# Count the windows of `string` that equal `pattern`, overlapping ones
# included. substring() is vectorised over the start/stop positions, so no
# explicit loop is needed.
count_windows <- function(string, pattern) {
  starts <- seq_len(max(nchar(string) - nchar(pattern) + 1L, 0L))
  # pattern longer than the string: there is no window to compare
  if (length(starts) == 0L) return(0L)
  sum(substring(string, starts, starts + nchar(pattern) - 1L) == pattern)
}
# case 1 :
xxx <- "010"
count_windows(xx, xxx)
# [1] 6
# case 2 :
xxx <- "1010"
count_windows(xx, xxx)
# [1] 4
R introduced the startsWith function in 3.3.0. Using this and substring, we can implement #joel.wilson's method as
sum(startsWith(substring(paste(x, collapse=""),
head(seq_along(x), -2), tail(seq_along(x), -2)), "010"))
Here, substring constructs all three character adjacent sets and startsWith tests if each of these is the same as "010". The TRUE values are then summed together.
I would like to remove all commas and periods from string, except in the case that a string ends in a comma (or period) followed by one or two numbers.
Some examples would be:
12.345.67 #would become 12345.67
12.345,67 #would become 12345,67
12.345,6 #would become 12345,6
12.345.6 #would become 12345.6
12.345 #would become 12345
1,2.345 #would become 12345
and so forth
a stringi solution using same data as #Sotos would be:
library(stringi)
# Remove every "," or "." except a final one that is followed by exactly one
# or two digits at the end of the string. The negative lookahead (?!...)
# keeps that trailing decimal separator and drops all the others.
# A single vectorised replacement is used because `if ()` accepts only a
# length-one condition and returns NULL when it is FALSE and there is no
# `else`, so it cannot be applied to a whole character vector.
x <- stri_replace_all_regex(x, "[,.](?!\\d{1,2}$)", "")
> x
[1] "12345.67" "12345,67" "12345,6" "12234" "1234" "12.45"
Another option is to use negative look ahead syntax ?! with the perl compatible regex:
df
# V1
# 1 12.345.67
# 2 12.345,67
# 3 12.345,6
# 4 12.345.6
# 5 12.345
# 6 1,2.345
# spell out TRUE: `T` is an ordinary variable that can be reassigned
df$V1 <- gsub("[,.](?!\\d{1,2}$)", "", df$V1, perl = TRUE)
df # remove , or . except they are followed by 1 or 2 digits at the end of string
# V1
# 1 12345.67
# 2 12345,67
# 3 12345,6
# 4 12345.6
# 5 12345
# 6 12345
One solution is to count the characters after the last comma/period (nchar(word(x, -1, sep = ',|\\.'))), and if the length is greater than 2, remove all delimiters (gsub(',|\\.', '', x)), otherwise just the first one (sub(',|\\.', '', x)).
library(stringr)
ifelse(nchar(word(x, -1, sep = ',|\\.')) > 2, gsub(',|\\.', '', x), sub(',|\\.', '', x))
#[1] "12345.67" "12345,67" "12345,6" "12234" "1234" "12.45"
DATA
x <- c("12.345.67", "12.345,67", "12.345,6", "1,2.234", "1.234", "1,2.45")