extract different strings after match using R - r

data <- c("Demand = 001 979", "Demand = -08 976 (154)", "Demand = -01 975 (359)")
data <- str_match(data, pattern = ("Demand = (.*) (.*)"))
I need to extract the first 2 sets of numbers (including the - sign) into columns using str_match.
Exclude 3rd set of numbers in bracket ().
Any help is welcomed.
Output:
## [1] "001" "-08" "-01"
## [2] "979" "976" "975"

How about removing everything else?
data <- c("Demand = 001 979", "Demand = -08 976 (154)", "Demand = -01 975 (359)")
data <- gsub("Demand = ", "", x = data)
data <- trimws(gsub("\\(.*\\)", "", x = data))
out <- list()
out[[1]] <- sapply(data, "[", 1)
out[[2]] <- sapply(data, "[", 2)
out
[[1]]
[1] "001" "-08" "-01"
[[2]]
[1] "979" "976" "975"

A possibility with str_extract_all() from stringr:
sapply(str_extract_all(x, "-?[0-9]+?[0-9]*"), function(x) x[1])
[1] "001" "-08" "-01"
sapply(str_extract_all(x, "-?[0-9]+?[0-9]*"), function(x) x[2])
[1] "979" "976" "975"
Or using the idea of #Roman Luštrik with strsplit():
sapply(strsplit(gsub("Demand = ", "", x), " "), function(x) x[1])
[1] "001" "-08" "-01"

Related

Shortening a long vector in R

Vectors a and b can be shortened using toString(width = 10) in Base R resulting in a shorter vector that ends in ....
However, I wonder how I can make the shortened vector to end in ..., last vector element?
My desired_output is shown below.
a <- 1:26
b <- LETTERS
toString(a, width = 10)
# [1] "1,2,...."
desired_output1 = "1,2,...,26"
toString(b, width = 10)
# [1] "A,B,...."
desired_output2 = "A,B,...,Z"
You could just add the end on.
paste(toString(a, width = 10), a[length(a)], sep=", ")
[1] "1, 2, ...., 26"
paste(toString(b, width = 10), b[length(b)], sep=", ")
[1] "A, B, ...., Z"
After applting the toString, we may use sub to remove the substring to format
f1 <- function(vec, n = 2) {
gsub("\\s+", "",
sub(sprintf("^(([^,]+, ){%s}).*, ([^,]+)$", n), "\\1...,\\3", toString(vec)))
}
-testing
> f1(a)
[1] "1,2,...,26"
> f1(b)
[1] "A,B,...,Z"
> f1(a, 3)
[1] "1,2,3,...,26"
> f1(b, 3)
[1] "A,B,C,...,Z"
> f1(a, 4)
[1] "1,2,3,4,...,26"
> f1(b, 4)
[1] "A,B,C,D,...,Z"
We could do it this way:
Creating a function that extraxt the first two elements and the last element of the vector and paste them together:
my_func <- function(x) {
a <- paste(x[1:2], collapse=",")
b <- tail(x, n=1)
paste0(a,",...,",b)
}
my_func(a)
[1] "1,2,...,26"
my_func(b)
[1] "A,B,...,Z"
library(stringr)
a <- 1:26
b <- LETTERS
reduce_string <- function(x, n_show) {
str_c(x[1:n_show], collapse = ',') %>%
str_c('....,', x[[length(x)]])
}
reduce_string(a, 2)
#> [1] "1,2....,26"
Created on 2022-01-02 by the reprex package (v2.0.1)

How to split letters with bracket and numbers in R?

The string is s = '[12]B1[16]M5'
I want to split it as the following results with strsplit function in R:
let <- c('[12]B', '[16]M')
num <- c(1, 5)
Thanks a lot
You could use regular expression for your task.
s = '[12]B1[16]M22'
grx <- gregexpr("\\[.+?\\].+[[:digit:]]?", s)
let <- do.call(c, regmatches(s, grx))
#let
#[1] "[12]B" "[16]M"
If you want to get all chunks (let + num), you can tweak the patter as below. This facilitates extracting the numeric part.
grx <- gregexpr("\\[.+?\\].+([[:digit:]]+)", s)
out <- do.call(c, regmatches(s, grx))
num <- gsub(".+\\][[:alpha:]]+", "", out)
num
[1] "1" "22"
Using the stringr package:
library(stringr)
x <- '[12]B1[16]M2'
let <- unlist(str_extract_all(x, "\\[[0-9]{2}\\][A-Z]"))
x <- gsub(pattern = "\\[[0-9]{2}\\][A-Z]",
replacement = "",
x)
num <- unlist(str_extract_all(x, "[0-9]"))
the regular expression "\\[[0-9]{2}\\][A-Z]" can be broken down as
\\[ an opening bracket
[0-9]{2} a sequence of two consecutive digits
\\] a closing bracket
[A-Z] a sequence of exactly one upper case letter
1) strapply Create a regular expression, pat which matches the two parts and then extract each separately using strapply. The first capture group (first parenthesized portion of regular expression) consists of a left square bracket "\\[" the smallest string ".*?" until the right square bracket "\\]" followed by any character "." . The second capture group consists of one or more digits "\\d+".
library(gsubfn)
pat <- "(\\[.*?\\].)(\\d+)"
let <- strapply(s, pat, simplify = c)
num <- strapply(s, pat, ~ as.numeric(..2), simplify = c)
let
## [1] "[12]B" "[16]M"
num
## [1] 1 5
1a) Variation
This could also be expressed as this mapply producing a 2 component list:
mapply(strapply, s, pat, c(~ ..1, ~ as.numeric(..2)), simplify = "c",
SIMPLIFY = FALSE, USE.NAMES = FALSE)
## [[1]]
## [1] "[12]B" "[16]M"
##
## [[2]]
## [1] 1 5
2) gsub/read.table This uses no packages -- only gsub and read.table. pat is defined in (1). It returns a data frame with the results in two coiumns:
read.table(text = gsub(pat, "\\1 \\2\n", s), as.is = TRUE, col.names = c("let", "num"))
## let num
## 1 [12]B 1
## 2 [16]M 5
3) gsub/strsplit This is somewhat similar to (2) but uses strsplit rather than read.table. pat is from (1).
spl <- matrix(strsplit(gsub(pat, "\\1 \\2 ", s), " ")[[1]], 2)
let <- spl[1, ]
num <- as.numeric(spl[2, ])

R extract string between nth and ith instance of delimiter

I have a vector of strings, similar to this one, but with many more elements:
s <- c("CGA-DV-558_T_90.67.0_DV_1541_07", "TC-V-576_T_90.0_DV_151_0", "TCA-DV-X_T_6.0_D_A2_07", "T-V-Z_T_2_D_A_0", "CGA-DV-AW0_T.1_24.4.0_V_A6_7", "ACGA-DV-A4W0_T_274.46.0_DV_A266_07")
And I would like to use a function that extracts the string between the nth and ith instances of the delimiter "_". For example, the string between the 2nd (n = 2) and 3rd (i = 3) instances, to get this:
[1] "90.67.0" "90.0" "6.0" "2" "24.4.0" "274.46.0"
Or if n = 4 and i = 5"
[1] "1541" "151" "A2" "A" "A" "A266"
Any suggestions? Thank you for your help!
You can do this with gsub
n = 2
i = 3
pattern1 = paste0("(.*?_){", n, "}")
temp = gsub(pattern1, "", s)
pattern2 = paste0("((.*?_){", i-n, "}).*")
temp = gsub(pattern2, "\\1", temp)
temp = gsub("_$", "", temp)
[1] "1541" "151" "A2" "A" "A6" "A266"
#FUNCTION
foo = function(x, n, i){
do.call(c, lapply(x, function(X)
paste(unlist(strsplit(X, "_"))[(n+1):(i)], collapse = "_")))
}
#USAGE
foo(x = s, n = 3, i = 5)
#[1] "DV_1541" "DV_151" "D_A2" "D_A" "V_A6" "DV_A266"
A third method, that uses substring for the extraction and gregexpr to find the positions is
# extract postions of "_" from each vector element, returns a list
spots <- gregexpr("_", s, fixed=TRUE)
# extract text in between third and fifth underscores
substring(s, sapply(spots, "[", 3) + 1, sapply(spots, "[", 5) - 1)
"DV_1541" "DV_151" "D_A2" "D_A" "V_A6" "DV_A266"

R - return to numbers from cut

I have a table with the cuts in intervals like:
bin targets casos prop phyp logit
(-2,-1] 193 6144 0.0314 0 -3.4286244
(-1,3] 128 431 0.2970 1 -0.8617025
(3,11] 137 245 0.5592 1 0.2378497
I want to get the original cuts. I tried with:
a<-strsplit(as.character(pl$table[,'bin']), ' ')
And then I tried to split each row with:
lapply(a, function(x) strsplit(x, ",")[1] )
But I don't get the expected result, which is:
(-1,3,11)
Is there a better way to achieve this? What else do I need to do to get to the result?
Thanks.
If your data is consistently in this format, you could use gsub().
df <- data.frame(bin = c('(-2,-1]','(1,3]','(3,11]'),
targets = c(193, 128, 137),
casos = c(6144, 431, 245),
prop = c(0.0314, 0.297, 0.5592),
phyp = c(0,1,1),
logit = c(-3.4286244,-0.8617025, 0.2378497), stringsAsFactors = F)
a <- strsplit(df$bin, ',')
sapply(a, function(x) gsub("]", "", x))[2,]
sapply(a, function(x) gsub("\\(", "", x))[1,]
Which gives you
[1] "-1" "3" "11"
[1] "-2" "1" "3"
In your example, there are more bounds than you say you are hoping to retrieve. This will give you all bounds:
d <- read.table(text=' bin targets casos prop phyp logit
"(-2,-1]" 193 6144 0.0314 0 -3.4286244
"(1,3]" 128 431 0.2970 1 -0.8617025
"(3,11]" 137 245 0.5592 1 0.2378497', header=T)
strings <- as.character(levels(d$bin))
strings <- substr(strings, 2, nchar(strings)-1)
unique(unlist(strsplit(strings, ",")))
# [1] "-2" "-1" "1" "3" "11"
If you only wanted the upper bounds, this will work:
strings <- as.character(levels(d$bin))
strings <- sapply(strsplit(strings, ","), function(l){ l[2] })
strings <- substr(strings, 1, nchar(strings)-1)
unique(strings)
# [1] "-1" "3" "11"
Another way would be:
a<-strsplit(as.character(pl$table[,'bin']), ' ')
lapply(a, function(x) unlist(strsplit(x, ",|]"))[2])

Combining elements in a string vector with defined element size and accounting for not event sizes

Given is vector:
vec <- c(LETTERS[1:10])
I would like to be able to combine it in a following manner:
resA <- c("AB", "CD", "EF", "GH", "IJ")
resB <- c("ABCDEF","GHIJ")
where elements of the vector vec are merged together according to the desired size of a new element constituting the resulting vector. This is 2 in case of resA and 5 in case of resB.
Desired solution characteristics
The solution should allow for flexibility with respect to the element sizes, i.e. I may want to have vectors with elements of size 2 or 20
There may be not enough elements in the vector to match the desired chunk size, in that case last element should be shortened accordingly (as shown)
This is shouldn't make a difference but the solution should work on words as well
Attempts
Initially, I was thinking of using something on the lines:
c(
paste0(vec[1:2], collapse = ""),
paste0(vec[3:4], collapse = ""),
paste0(vec[5:6], collapse = "")
# ...
)
but this would have to be adapted to jump through the remaining pairs/bigger groups of the vec and handle last group which often would be of a smaller size.
Here is what I came up with. Using Harlan's idea in this question, you can split the vector in different number of chunks. You also want to use your paste0() idea in lapply() here. Finally, you unlist a list.
unlist(lapply(split(vec, ceiling(seq_along(vec)/2)), function(x){paste0(x, collapse = "")}))
# 1 2 3 4 5
#"AB" "CD" "EF" "GH" "IJ"
unlist(lapply(split(vec, ceiling(seq_along(vec)/5)), function(x){paste0(x, collapse = "")}))
# 1 2
#"ABCDE" "FGHIJ"
unlist(lapply(split(vec, ceiling(seq_along(vec)/3)), function(x){paste0(x, collapse = "")}))
# 1 2 3 4
#"ABC" "DEF" "GHI" "J"
vec <- c(LETTERS[1:10])
f1 <- function(x, n){
f <- function(x) paste0(x, collapse = '')
regmatches(f(x), gregexpr(f(rep('.', n)), f(x)))[[1]]
}
f1(vec, 2)
# [1] "AB" "CD" "EF" "GH" "IJ"
or
f2 <- function(x, n)
apply(matrix(x, nrow = n), 2, paste0, collapse = '')
f2(vec, 5)
# [1] "ABCDE" "FGHIJ"
or
f3 <- function(x, n) {
f <- function(x) paste0(x, collapse = '')
strsplit(gsub(sprintf('(%s)', f(rep('.', n))), '\\1 ', f(x)), '\\s+')[[1]]
}
f3(vec, 4)
# [1] "ABCD" "EFGH" "IJ"
I would say the last is best of these since n for the others must be a factor or you will get warnings or recycling
edit - more
f4 <- function(x, n) {
f <- function(x) paste0(x, collapse = '')
Vectorize(substring, USE.NAMES = FALSE)(f(x), which((seq_along(x) %% n) == 1),
which((seq_along(x) %% n) == 0))
}
f4(vec, 2)
# [1] "AB" "CD" "EF" "GH" "IJ"
or
f5 <- function(x, n)
mapply(function(x) paste0(x, collapse = ''),
split(x, c(0, head(cumsum(rep_len(sequence(n), length(x)) %in% n), -1))),
USE.NAMES = FALSE)
f5(vec, 4)
# [1] "ABCD" "EFGH" "IJ"
Here is another way, working with the original array.
A side note, working with words is not straightforward, since there is at least two ways to understand it: you can either keep each word separately or collapse them first an get individual characters. The next function can deal with both options.
vec <- c(LETTERS[1:10])
vec2 <- c("AB","CDE","F","GHIJ")
cuts <- function(x, n, bychar=F) {
if (bychar) x <- unlist(strsplit(paste0(x, collapse=""), ""))
ii <- seq_along(x)
li <- split(ii, ceiling(ii/n))
return(sapply(li, function(y) paste0(x[y], collapse="")))
}
cuts(vec2,2,F)
# 1 2
# "ABCDE" "FGHIJ"
cuts(vec2,2,T)
# 1 2 3 4 5
# "AB" "CD" "EF" "GH" "IJ"

Resources