String ID format checking and padding in R

String ID format checking and padding in R - r

Some codes are formatted as numbers divided by dashes (e.g., Social Security Numbers are typically formatted "ddd-dd-dddd", where d stands for any digit; denote this in short 3-2-4 format, standing for the number of digit in each "chunk").
I need to input product codes which come at 5-4, 4-4 or 5-3 format, and then:
(a) validate that they conform to any of these format, and (b) pad with zeros, so that the output is in 5-4 format.
Here is a code that does that. Is there a nicer way? how can it be vectorized?
library(stringr)
as_product_code <- function(x) {
# Clean Product Codes
# Input: 5-4, 5-3, or 4-4 product code.
# Output: 5-4 product code.
chunks <- unlist(strsplit(x, split = "-", fixed = T))
if (length(chunks == 2) & (identical(nchar(chunks), c(5L, 3L)) |
identical(nchar(chunks), c(5L, 4L)) |
identical(nchar(chunks), c(4L, 4L)))) {
output_code<- paste(str_pad(chunks[1], pad = "0", width = 5),
str_pad(chunks[2], pad = "0", width = 4),
sep = "-")
return(output_code)
} else {
warning("Unexpected format. Doing nothing.")
return(x)
}
}

You can use regular expressions and the stringr-package. This will return NA for a entry which does not follow the specified pattern.
For regular expression have a look at the cheat sheet.
\\d stands for any digit (0-9) and the number in the brackets { } give the number of repetions (either {min, max} or {exact}). The ^ means, that I'm looking at the beginning of the string and $ marks the end. Thus I don't match the string with ab at the end.
test <- c("1234-1234", "12345-123", "12345-1234ab", "12345-1234", "1234-123")
ifelse(str_detect(test, "^(\\d{4,5})-(\\d{4})$|^(\\d{5})-(\\d{3})$"),
str_replace_all(test, c("^(\\d{4})-" = "0\\1-", "-(\\d{3})$" = "-0\\1")),
NA)
[1] "01234-1234" "12345-0123" NA "12345-1234" NA

We can actually take advantage of the dataframe structure here to get some vectorization help.
# Create reproducible example
set.seed(9025)
d1 = sample(1:5, 1e5, replace=TRUE)
d2 = sample(1:5, 1e5, replace=TRUE)
codes = sapply(1:1e5, function(i) {
c1 = paste0(sample(1:9, d1[i]), collapse='')
c2 = paste0(sample(1:9, d2[i]), collapse='')
paste(c1, c2, sep='-')
})
library(stringr)
library(tidyverse)
# Create our dataframe, separate the product code, pad the values,
# and use vectorized ifelse to "remove" bad product codes.
output = codes %>%
tbl_df() %>%
separate(value, into=c('c1', 'c2'), sep='-', remove=TRUE) %>%
mutate(include = ifelse(nchar(c1) %in% 4:5 &
nchar(c2) %in% 3:4 &
(nchar(c1) + nchar(c2) > 7),
1, 0),
c1 = str_pad(c1, width=5, side='left', pad=0),
c2 = str_pad(c2, width=4, side='right', pad=0),
code = paste(c1, c2, sep='-')) %>%
mutate(code = ifelse(include == 1, code, '')) %>%
pull(code)
head(codes)
[1] "62971-2" "5-51864" "32419-328" "931-8"
[5] "18324-248" "8-628"
head(output)
[1] "" "" "32419-3280"
[4] "" "18324-2480" ""

You can use Vectorize base R function:
as_product_code <- function(x) {
#your function
}
x <- c('1234-1234','1234-1234')
as_product_code_vec <- Vectorize(as_product_code,'x',USE.NAMES = F)
as_product_code_vec(x)

Related

Move substring to end of string in R

I have a set of df with a large number of columns. The column names follow a pattern like so:
my.df <- data.frame(sentiment_brand1_1 = c(1,0,0,1), sentiment_brand1_2 = c(0,1,1,0),
sentiment_brand2_1 = c(1,1,1,1),
sentiment_brand2_2 = c(0,0,0,0),
brand1_rating_1 = c(1,2,3,4),
brand2_rating_1 = c(4,3,2,1))
I'd like to programmatically rename the columns, moving the substrings "brand1" and "brand2" from the middle of the column name to the end, e.g.:
desired_colnames <- c("sentiment_1_brand1",
"sentiment_2_brand1",
"sentiment_1_brand2",
"sentiment_2_brand2",
"rating_1_brand1",
"rating_1_brand2")

Capture the substring groups and rearrange in replacement
sub("(.*)_(brand1)(.*)", "\\1\\3_\\2", v1)
-output
[1] "variable_1_brand1" "_stuff_1_brand1" "thing_brand1"
data
v1 <- c("variable_brand1_1", "_brand1_stuff_1", "_brand1thing")

## Input:
Test <- c("variable_brand1_1", "_brand1_stuff_1", "_brand1thing")
library("stringr")
paste(str_remove(Test, "_brand1"), "_brand1", sep = "")
## OutPut:
[1] "variable_1_brand1" "_stuff_1_brand1" "thing_brand1"

Flipping two sides of string

I need to prepare a certain dataset for analysis. What I have is a table with column names (obviously). The column names are as follows (sample colnames):
"X99_NORM", "X101_NORM", "X76_110_T02_09747", "X30_NORM"
(this is a vector, for those not familiair with R colnames() function)
Now, what I want is simply to flip the values in front of, and after the underscore. e.g. X99_NORM becomes NORM_X99. Note that I want this only for the column names which contain NORM in their name.

Some other base R options
1)
Use sub to switch the beginning and end - we can make use of capturing groups here.
x <- sub(pattern = "(^X\\d+)_(NORM$)", replacement = "\\2_\\1", x = x)
Result
x
# [1] "NORM_X99" "NORM_X101" "X76_110_T02_09747" "NORM_X30"
2)
A regex-free approach that might be more efficient using chartr, dirname and paste. But we need to get the indices of the columns that contain "NORM" first
idx <- grep(x = x, pattern = "NORM", fixed = TRUE)
x[idx] <- paste0("NORM_", dirname(chartr("_", "/", x[idx])))
x
data
x <- c("X99_NORM", "X101_NORM", "X76_110_T02_09747", "X30_NORM")

x = c("X99_NORM", "X101_NORM", "X76_110_T02_09747", "X30_NORM")
replace(x,
grepl("NORM", x),
sapply(strsplit(x[grepl("NORM", x)], "_"), function(x){
paste(rev(x), collapse = "_")
}))
#[1] "NORM_X99" "NORM_X101" "X76_110_T02_09747" "NORM_X30"

A tidyverse solution with stringr:
library(tidyverse)
library(stringr)
my_data <- tibble(column = c("X99_NORM", "X101_NORM", "X76_110_T02_09747", "X30_NORM"))
my_data %>%
filter(str_detect(column, "NORM")) %>%
mutate(column_2 = paste0("NORM", "_", str_extract(column, ".+(?=_)"))) %>%
select(column_2)
# A tibble: 3 x 1
column_2
<chr>
1 NORM_X99
2 NORM_X101
3 NORM_X30

Selecting multiple columns using Regular Expressions

I have variables with names such as r1a r3c r5e r7g r9i r11k r13g r15i etc. I am trying to select variables which starts with r5 - r12 and create a dataframe in R.
The best code that I could write to get this done is,
data %>% select(grep("r[5-9][^0-9]" , names(data), value = TRUE ),
grep("r1[0-2]", names(data), value = TRUE))
Given my experience with regular expressions span a day, I was wondering if anyone could help me write a better and compact code for this!

Here's a regex that gets all the columns at once:
data %>% select(grep("r([5-9]|1[0-2])", names(data), value = TRUE))
The vertical bar represents an 'or'.
As the comments have pointed out, this will fail for items such as r51, and can also be shortened. Instead, you will need a slightly longer regex:
data %>% select(matches("r([5-9]|1[0-2])([^0-9]|$)"))

Suppose that in the code below x represents your names(data). Then the following will do what you want.
# The names of 'data'
x <- scan(what = character(), text = "r1a r3c r5e r7g r9i r11k r13g r15i")
y <- unlist(strsplit(x, "[[:alpha:]]"))
y <- as.numeric(y[sapply(y, `!=`, "")])
x[y > 4]
#[1] "r5e" "r7g" "r9i" "r11k" "r13g" "r15i"
EDIT.
You can make a function with a generalization of the above code. This function has three arguments, the first is the vector of variables names, the second and the third are the limits of the numbers you want to keep.
var_names <- function(x, from = 1, to = Inf){
y <- unlist(strsplit(x, "[[:alpha:]]"))
y <- as.integer(y[sapply(y, `!=`, "")])
x[from <= y & y <= to]
}
var_names(x, 5)
#[1] "r5e" "r7g" "r9i" "r11k" "r13g" "r15i"

Remove the non-digits, scan the remainder in and check whether each is in 5:12 :
DF <- data.frame(r1a=1, r3c=2, r5e=3, r7g=4, r9i=5, r11k=6, r13g=7, r15i=8) # test data
DF[scan(text = gsub("\\D", "", names(DF)), quiet = TRUE) %in% 5:12]
## r5e r7g r9i r11k
## 1 3 4 5 6
Using magrittr it could also be written like this:
library(magrittr)
DF %>% .[scan(text = gsub("\\D", "", names(.)), quiet = TRUE) %in% 5:12]
## r5e r7g r9i r11k
## 1 3 4 5 6

str_replace A1-A9 to A01-A09 and so on

Hi I have a following strings in my data and would like to replace A1-A9 to A01-A09 and B1-B9 to B01-B09 but keep the numbers >=10.
rep_data=data.frame(Str= c("A1B10", "A2B3", "A11B1", "A5B10"))
Str
1 A1B10
2 A2B3
3 A11B1
4 A5B10
There is a similar post here but my problem is little bit different! and haven't seen similar example in here str_replace.
Will be very glad if you know the solution.
expected output
Str
1 A01B10
2 A02B03
3 A11B01
4 A05B10

I think this should get you what you want:
gsub("(?<![0-9])([0-9])(?![0-9])", "0\\1", rep_data$Str, perl = TRUE)
#[1] "A01B10" "A02B03" "A11B01" "A05B10"
It uses PCRE lookahead's/lookbehind's to match a 1 digit number and then pastes a "0" onto it.

How about something like this
num_pad <- function(x) {
x <- as.character(x)
mm <- gregexpr("\\d+|\\D+",x)
parts <- regmatches(x, mm)
pad_number <- function(x) {
nn<-suppressWarnings(as.numeric(x))
x[!is.na(nn)] <- sprintf("%02d", nn[!is.na(nn)])
x
}
parts <- lapply(parts, pad_number)
sapply(parts, paste0, collapse="")
}
num_pad(rep_data$Str)
# [1] "A01B10" "A02B03" "A11B01" "A05B10"
Basically we use regular expressions to split the strings up into digit and non-digit groups. We then find those values that look like numbers and use sprintf() to zero-pad them to 2 characters. Then we insert the padded values into the vector and paste everything back together.

Not checked thoroughly
x = c("A1B10", "A2B3", "A11B1", "A5B10")
sapply(strsplit(x, ""), function(s){
paste(sapply(split(s, cumsum(s %in% LETTERS)), function(a){
if(length(a) == 2){
a[2] = paste0(0, a[2])
}
paste(a, collapse = "")
}), collapse = "")
})
#[1] "A01B10" "A02B03" "A11B01" "A05B10"

A solution from tidyverse and stringr.
library(tidyverse)
library(stringr)
rep_data2 <- rep_data %>%
extract(Str, into = c("L1", "N1", "L2", "N2"), regex = "(A)(\\d+)(B)(\\d+)") %>%
mutate_at(vars(starts_with("N")), funs(str_pad(., width = 2, pad = "0"))) %>%
unite(Str, everything(), sep = "")
rep_data2
Str
1 A01B10
2 A02B03
3 A11B01
4 A05B10

This is the most concise tidy solution I can come up with:
library(tidyverse)
library(stringr)
rep_data %>%
mutate(
num_1 = str_match(Str, "A([0-9]+)")[, 2],
num_2 = str_match(Str, "B([0-9]+)")[, 2],
num_1 = str_pad(num_1, width = 2, side = "left", pad = "0"),
num_2 = str_pad(num_2, width = 2, side = "left", pad = "0"),
Str = str_c("A", num_1, "B", num_2)
) %>%
select(- num_1, - num_2)

A bit similar to #Mike's answer, but this solution uses one positive lookahead:
gsub("(\\D)(?=\\d(\\D|\\b))", "\\10", rep_data$Str, perl = TRUE)
# [1] "A01B10" "A02B03" "A11B01" "A05B10"
With tidyverse:
library(dplyr)
library(stringr)
rep_data %>%
mutate(Str = str_replace_all(Str, "(\\D)(?=\\d(\\D|\\b))", "\\10"))
# Str
# 1 A01B10
# 2 A02B03
# 3 A11B01
# 4 A05B10
This regex matches all non-digits that are followed by a digit and either by another non-digit or a word boundary. \\10 is quite deceiving since it looks like it is replacing the match with the 10th capture group. Instead, it replaces the match with the 1st capture group plus a zero right after.

Here is one option with gsubfn
library(gsubfn)
gsubfn("(\\d+)", ~sprintf("%02d", as.numeric(x)), as.character(rep_data$Str))
#[1] "A01B10" "A02B03" "A11B01" "A05B10"

Isolate alphabetical strings within a larger string

Is there a way to isolate parts of a string that are in alphabetical order?
In other words, if you have a string like this: hjubcdepyvb
Could you just pull out the portion in alphabetical order?: bcde
I have thought about using the is.unsorted() function, but I'm not sure how to apply this to only a portion of a string.

Here's one way by converting to ASCII and back:
input <- "hjubcdepyvb"
spl_asc <- as.integer(charToRaw(input)) # Convert to ASCII
d1 <- diff(spl_asc) == 1 # Find sequences
filt <- spl_asc[c(FALSE, d1) | c(d1, FALSE)] # Only keep sequences (incl start and end)
rawToChar(as.raw(filt)) # Convert back to character
#[1] "bcde"
Note that this will concatenate any parts that are in alphabetical order.
i.e. If input is "abcxasdicfgaqwe" then output would be abcfg.
If you wanted to get separate vectors for each sequential string, you could do the following
input <- "abcxasdicfgaqwe"
spl_asc <- as.integer(charToRaw(input))
d1 <- diff(spl_asc) == 1
r <- rle(c(FALSE, d1) | c(d1, FALSE)) # Find boundaries
cm <- cumsum(c(1, r$lengths)) # Map these to string positions
substring(input, cm[-length(cm)], cm[-1] - 1)[r$values] # Extract matching strings
Finally, I had to come up with a way to use regex:
input <- c("abcxasdicfgaqwe", "xufasiuxaboqdasdij", "abcikmcapnoploDEFgnm",
"acfhgik")
(rg <- paste0("(", paste0(c(letters[-26], LETTERS[-26]),
"(?=", c(letters[-1], LETTERS[-1]), ")", collapse = "|"), ")+."))
#[1] "(a(?=b)|b(?=c)|c(?=d)|d(?=e)|e(?=f)|f(?=g)|g(?=h)|h(?=i)|i(?=j)|j(?=k)|
#k(?=l)|l(?=m)|m(?=n)|n(?=o)|o(?=p)|p(?=q)|q(?=r)|r(?=s)|s(?=t)|t(?=u)|u(?=v)|
#v(?=w)|w(?=x)|x(?=y)|y(?=z)|A(?=B)|B(?=C)|C(?=D)|D(?=E)|E(?=F)|F(?=G)|G(?=H)|
#H(?=I)|I(?=J)|J(?=K)|K(?=L)|L(?=M)|M(?=N)|N(?=O)|O(?=P)|P(?=Q)|Q(?=R)|R(?=S)|
#S(?=T)|T(?=U)|U(?=V)|V(?=W)|W(?=X)|X(?=Y)|Y(?=Z))+."
regmatches(input, gregexpr(rg, input, perl = TRUE))
#[[1]]
#[1] "abc" "fg"
#
#[[2]]
#[1] "ab" "ij"
#
#[[3]]
#[1] "abc" "nop" "DEF"
#
#[[4]]
#character(0)
This regular expression will identify consecutive upper or lower case letters (but not mixed case). As demonstrated, it works for character vectors and produces a list of vectors with all the matches identified. If no match is found, the output is character(0).

Using factor integer conversion:
input <- "hjubcdepyvb"
d1 <- diff(as.integer(factor(unlist(strsplit(input, "")), levels = letters))) == 1
filt <- c(FALSE, d1) | c(d1, FALSE)
paste(unlist(strsplit(input, ""))[filt], collapse = "")
# [1] "bcde"

myf = function(x){
x = unlist(strsplit(x, ""))
ind = charmatch(x, letters)
d = c(0, diff(ind))
d[d !=1] = 0
d = d + c(sapply(1:(length(d)-1), function(i) {
ifelse(d[i] == 0 & d[i+1] == 1, 1, 0)
}
), 0)
d = split(seq_along(d)[d!=0], with(rle(d), rep(seq_along(values), lengths))[d!=0])
return(sapply(d, function(a) paste(x[a], collapse = "")))
}
myf(x = "hjubcdepyvblltpqrs")
# 2 4
#"bcde" "pqrs"

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

String ID format checking and padding in R - r

You can use Vectorize base R function: as_product_code <- function(x) { #your function } x <- c('1234-1234','1234-1234') as_product_code_vec <- Vectorize(as_product_code,'x',USE.NAMES = F) as_product_code_vec(x)

Related

Move substring to end of string in R

Flipping two sides of string

Selecting multiple columns using Regular Expressions

str_replace A1-A9 to A01-A09 and so on

Isolate alphabetical strings within a larger string

Categories

Resources