Parse text with separator depending on its structure - r

My dataframe:
>datasetM
Mean
ENSORLG00000001933:tex11 2500.706
ENSORLG00000010797: 44225.330
ENSORLG00000003008:pabpc1a 11788.555
ENSORLG00000001973:sept6 3100.493
ENSORLG00000000997: 5418.796
Output needed:
>out
[1] "tex11" "ENSORLG00000010797" "pabpc1a" "sept6" "ENSORLG00000000997"
I tried this, but I only retrieve the part before the separator:
titles <- rownames(datasetM)
vapply(strsplit(titles,":"), `[`, 1, FUN.VALUE=character(1))
Note: There is no logic in the alternation between ENS000:name and ENS00:
Note 2: ENSOR are rownames
Note 3: When there is nothing after ":" I want the ENSOR

Here is a solution with base R:
# Split each row name on ":" and keep the last piece: the gene name when one
# follows the colon, otherwise the ENSORLG id (strsplit() drops the empty
# field after a trailing ":"). vapply() guarantees a character vector result,
# whereas sapply() may silently return a list.
vapply(strsplit(rownames(df), ":"), function(parts) parts[length(parts)],
       character(1))
# [1] "tex11" "ENSORLG00000010797" "pabpc1a" "sept6"
# [5] "ENSORLG00000000997"
Another solution with sub, might be simpler:
sub("^\\w+:(?=\\w)|:", "", rownames(df), perl = TRUE)
# [1] "tex11" "ENSORLG00000010797" "pabpc1a" "sept6"
# [5] "ENSORLG00000000997"
Data:
df = read.table(text = " Mean
ENSORLG00000001933:tex11 2500.706
ENSORLG00000010797: 44225.330
ENSORLG00000003008:pabpc1a 11788.555
ENSORLG00000001973:sept6 3100.493
ENSORLG00000000997: 5418.796", header = TRUE, row.names = 1)

Here is a vectorized way to do this using a regex (taken from here) to identify the last character of each rowname,
rownames(df)[!sub('.*(?=.$)', '', rownames(df), perl=TRUE) == ':'] <-
sub('.*:', '', rownames(df)[!sub('.*(?=.$)', '', rownames(df), perl=TRUE) == ':'])
which gives,
V2
tex11 2500.706
ENSORLG00000010797: 44225.330
pabpc1a 11788.555
sept6 3100.493
ENSORLG00000000997: 5418.796
DATA
dput(df)
structure(list(V2 = c(2500.706, 44225.33, 11788.555, 3100.493,
5418.796)), .Names = "V2", row.names = c("tex11", "ENSORLG00000010797:",
"pabpc1a", "sept6", "ENSORLG00000000997:"), class = "data.frame")
NOTE You can remove the colons from rownames simply by
rownames(df) <- sub(':', '', rownames(df))

Related

Extract numbers from a character vector and adding leading zeros

I have a character-vector with the following structure:
GDM3
PER.1.1.1_1
PER.1.10.2_1
PER.1.1.32_1
PER.1.1.4_1
PER.1.1.5_1
PER.11.29.1_1
PER.1.2.2_1
PER.31.2.3_1
PER.1.2.44_1
PER.5.2.25_1
I want to extract the three numbers in the middle of that ID and add leading zeros if they are only single digits. The final vector can be a character vector again. In the end the result should look like this:
GDM3
010101
011002
010132
010104
010105
112901
010202
310203
010244
050225
tmp <- strcapture("\\.([0-9]+)\\.([0-9]+)\\.([0-9]+)_", X$GDM3,
proto = list(a=0L, b=0L, c=0L)) |>
lapply(sprintf, fmt = "%02i")
do.call(paste0, tmp)
# [1] "010101" "011002" "010132" "010104" "010105" "112901" "010202" "310203" "010244" "050225"
Explanation:
strcapture extracts the known patterns into a data.frame, with names and classes defined in proto (the actual values in proto are not used);
lapply(sprintf, fmt="%02i") zero-pads to 2 digits all columns of the frame
do.call(paste0, tmp) concatenates each row of the frame into a single string.
Data
X <- structure(list(GDM3 = c("PER.1.1.1_1", "PER.1.10.2_1", "PER.1.1.32_1", "PER.1.1.4_1", "PER.1.1.5_1", "PER.11.29.1_1", "PER.1.2.2_1", "PER.31.2.3_1", "PER.1.2.44_1", "PER.5.2.25_1")), class = "data.frame", row.names = c(NA, -10L))
Assuming GDM3 is as shown in the Note at the end, read it into a data frame and then use sprintf to create the result.
with( read.table(text = GDM3, sep = ".", comment.char = "_"),
sprintf("%02d%02d%02d", V2, V3, V4) )
giving:
[1] "010101" "011002" "010132" "010104" "010105" "112901" "010202" "310203"
[9] "010244" "050225"
Note
GDM3 <- c("PER.1.1.1_1", "PER.1.10.2_1", "PER.1.1.32_1", "PER.1.1.4_1",
"PER.1.1.5_1", "PER.11.29.1_1", "PER.1.2.2_1", "PER.31.2.3_1",
"PER.1.2.44_1", "PER.5.2.25_1")
Another solution:
X <- structure(list(GDM3 = c("PER.1.1.1_1", "PER.1.10.2_1", "PER.1.1.32_1", "PER.1.1.4_1", "PER.1.1.5_1", "PER.11.29.1_1", "PER.1.2.2_1", "PER.31.2.3_1", "PER.1.2.44_1", "PER.5.2.25_1")), class = "data.frame", row.names = c(NA, -10L))
strsplit(X$GDM3, "\\.|_") |>
sapply(function(x) paste0(sprintf("%02i", as.numeric(x[2:4])), collapse = ""))
#[1] "010101" "011002" "010132" "010104" "010105" "112901" "010202" "310203" "010244" "050225"

How can I change character class to numeric class?

I have a variable like this:
VALOR
3.554,34
56,34
But its class is "character" and when I code this:
gastosbolsonaro <- gastosbolsonaro %>% mutate(VALOR = as.numeric(VALOR))
This happens:
Problem while computing `VALOR = as.numeric(as.character(VALOR))`.
i NAs introduced by coercion
And all the values change to NA.
I want to convert the money values to numeric class.
You can use parse_number from readr package
library(readr)
x <- c("3.554,34", "56,34")
parse_number(x, locale = locale(decimal_mark = ",", grouping_mark = "."))
[1] 3554.34 56.34
Remove ., change , to . and then convert using as.numeric.
x <- c("3.554,34", "56,34")
. <- gsub(".", "", x, fixed = TRUE)
. <- sub(",", ".", .)
as.numeric(.)
#[1] 3554.34 56.34
Or in one line:
as.numeric(sub(",", ".", gsub("\\.", "", x)))
#[1] 3554.34 56.34
Another base option using scan.
scan(text=gsub("\\.", "", x), dec=",")
#scan(text=gsub("\\.", "", x), dec=",", quiet = TRUE) #Quiet Alternative
#[1] 3554.34 56.34

Remove n last character from a string

I would like to remove the third and fourth last characters from a string.
Here is some sample data:
HS0202
HS0902
MV0100
SUE0300
I would need return something like this
HS02
HS02
MV00
SUE00
Using a regex with gsub():
gsub("..(?=..$)", "", chrs, perl = TRUE)
# "HS02" "HS02" "MV00" "SUE00"
Or with stringr::str_remove():
library(stringr)
str_remove(chrs, "..(?=..$)")
# "HS02" "HS02" "MV00" "SUE00"
You can use substring() to keep the leading part of each string and paste the trailing part back on, as follows:
have <- c('HS0202', 'HS0902', 'MV0100', 'SUE0300')
# Drop the fourth- and third-last characters: keep everything up to the
# fifth-last character, then append the last two characters.
n_chars <- nchar(have)
want <- paste0(substring(have, 1, n_chars - 4), substring(have, n_chars - 1))
Using stringi:
library(stringi)
stri_sub_replace(x, from = -4, to = -3, value = "")
[1] "HS02" "HS02" "MV00" "SUE00"
Or with stringr:
library(stringr)
str_sub(x, start = -4, end = -3) <- ""
Data
x <- c("HS0202", "HS0902", "MV0100", "SUE0300")

Solve shorten notation by regular expression

I want to expand two shortened notations in R.
For Ade/i, I should get Ade, Adi
For Do(i)lfal, I should get Dolfal, Doilfal
I have this solution
b='Do(i)lferl'
gsub(pattern = '(\\w+)\\((\\w+)+\\)', replacement='\\1\\i,\\1\\2', x=b)
Can anyone help me to code this
If these values are part of a dataframe, you can do this:
# Example data; keep the nicknames as plain character strings.
df <- data.frame(
  Nickname = c("Ade/i", "Do(i)lfal"),
  stringsAsFactors = FALSE
)
# Create the result column up front so each row can be filled in by index.
df$Nickname_new <- NA_character_
# "Ade/i": the text before "/" is the first variant; removing the character
# before "/" together with the "/" itself gives the second variant.
df$Nickname_new[1] <- paste0(
  sub("(?=.*/)(.*)/.*", "\\1", df$Nickname[1], perl = TRUE),
  ",",
  paste0(unlist(strsplit(df$Nickname[1], "\\w/")), collapse = "")
)
# "Do(i)lfal": drop the parenthesised part for the first variant; keep the
# parenthesised letter (without the parentheses) for the second variant.
df$Nickname_new[2] <- paste0(
  sub("(.*)(\\(.*\\))(.*)", "\\1\\3", df$Nickname[2]),
  ",",
  sub("(.*)\\((\\w)\\)(.*)", "\\1\\2\\3", df$Nickname[2])
)
which gives you:
df
Nickname Nickname_new
1 Ade/i Ade,Adi
2 Do(i)lfal Dolfal,Doilfal
EDIT:
Just in case the whole thing is not part of a dataframe but an atomic vector, you can do this:
x <- c("Ade/i", "Do(i)lfal")
c(paste0(sub("/.*", "", x[grepl("/", x)]), ", ", sub("./", "", x[grepl("/", x)])),
paste0(sub("(.*)\\((\\w)\\)(.*)", "\\1\\2\\3\\4", x[grepl("\\(", x)]), ", ", sub("\\(\\w\\)", "", x[grepl("\\(", x)])))
which gives you:
[1] "Ade, Adi" "Doilfal, Dolfal"
If there are values that you don't want to change, then this regex by @Wiktor will work (it works in every scenario shown here):
x <- c("Ade/i", "Do(i)lfal", "Peter", "Mary")
gsub('(\\w*)\\((\\w+)\\)(\\w*)', '\\1\\2\\3, \\1\\3', gsub("(\\w*)(\\w)/(\\w)\\b", "\\1\\2, \\1\\3", x))
which gives you:
[1] "Ade, Adi" "Doilfal, Dolfal" "Peter" "Mary"

Replace multiple strings in one gsub() or chartr() statement in R?

I have a string variable containing alphabet[a-z], space[ ], and apostrophe['],eg. x <- "a'b c"
I want to replace apostrophe['] with blank[], and replace space[ ] with underscore[_].
x <- gsub("'", "", x)
x <- gsub(" ", "_", x)
It works fine, but when I have a lot of conditions, the code becomes ugly. Therefore, I want to use chartr(), but chartr() can't deal with blank, eg.
x <- chartr("' ", "_", x)
#Error in chartr("' ", "_", "a'b c") : 'old' is longer than 'new'
Is there any way to solve this problem? thanks!
You can use gsubfn
library(gsubfn)
gsubfn(".", list("'" = "", " " = "_"), x)
# [1] "ab_c"
Similarly, we can also use mgsub which allows multiple replacement with multiple pattern to search
mgsub::mgsub(x, c("'", " "), c("", "_"))
#[1] "ab_c"
I am a fan of the syntax that the %<>% and %>% operators from the magrittr package provide.
library(magrittr)
x <- "a'b c"
x %<>%
gsub("'", "", .) %>%
gsub(" ", "_", .)
x
##[1] "ab_c"
gsubfn is wonderful, but I like the chaining %>% allows.
I'd go with the quite fast function stri_replace_all_fixed from library(stringi):
library(stringi)
stri_replace_all_fixed("a'b c", pattern = c("'", " "), replacement = c("", "_"), vectorize_all = FALSE)
Here is a benchmark taking into account most of the other suggested solutions:
library(stringi)
library(microbenchmark)
library(gsubfn)
library(mgsub)
library(magrittr)
library(dplyr)
# Chained assignment: every benchmark input below receives the same
# test string "a'b c".
x_gsubfn <-
x_mgsub <-
x_nested_gsub <-
x_magrittr <-
x_stringi <- "a'b c"
# Time each approach under its own label.
microbenchmark("gsubfn" = { gsubfn(".", list("'" = "", " " = "_"), x_gsubfn) },
"mgsub" = { mgsub::mgsub(x_mgsub, c("'", " "), c("", "_")) },
# NOTE(review): this entry uses the placeholder patterns "Find"/"Replace",
# so it does not perform the apostrophe/space replacement the other entries
# do; its timing is not directly comparable.
"nested_gsub" = { gsub("Find", "Replace", gsub("Find","Replace", x_nested_gsub)) },
# NOTE(review): %<>% assigns the result back to x_magrittr, so presumably
# only the first evaluation sees the original string -- confirm.
"magrittr" = { x_magrittr %<>% gsub("'", "", .) %>% gsub(" ", "_", .) },
"stringi" = { stri_replace_all_fixed(x_stringi, pattern = c("'", " "), replacement = c("", "_"), vectorize_all = FALSE) }
)
Unit: microseconds
expr min lq mean median uq max neval
gsubfn 458.217 482.3130 519.12820 513.3215 538.0100 715.371 100
mgsub 180.521 200.8650 221.20423 216.0730 231.6755 460.587 100
nested_gsub 14.615 15.9980 17.92178 17.7760 18.7630 40.687 100
magrittr 113.765 133.7125 148.48202 142.9950 153.0680 296.261 100
stringi 3.950 7.7030 8.41780 8.2960 9.0860 26.071 100
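I know it is a bit old but it is hard to pass on an efficient base R solution. Just use the regex alternation operator (the pipe character, |) inside a single pattern; note that this only works when every match gets the same replacement: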
test <- "abcegdfk461mnb"
test2 <- gsub("e|4|6","",test)
print(test2)
I think nested gsub will do the job.
gsub("Find","Replace",gsub("Find","Replace",X))
I would opt for a magrittr and/or dplyr solution, as well. However, I prefer not making a new copy of the object, especially if it is in a function and can be returned cheaply.
i.e.
# Fragment from the end of a function body: return the piped result directly
# instead of storing it in a new object first.
# The replacement text is kept on one line; breaking the string literal across
# lines would embed a newline in the result.
return(
  catInTheHat %>%
    gsub('Thing1', 'Thing2', .) %>%
    gsub('Red Fish', 'Blue Fish', .)
)
...and so on.
gsub("\\s", "", chartr("' ", " _", x)) # Use whitespace and then remove it
Try this to replace multiple strings in a column:
df$TYPE <- str_replace_all(df$TYPE, c("test" = "new_test", "G" = "N", "T" = "W"))
I use this function, which also allows omitting the argument for the replacement if the replacement is empty:
# Apply several gsub() replacements to `x`, one after another.
#
# Arguments:
#   x    Character vector to modify.
#   ...  Alternating pattern / replacement values: pattern1, replacement1,
#        pattern2, replacement2, ... If the final pattern has no replacement,
#        its matches are removed (replaced with "").
#   ignore.case, perl, fixed, useBytes
#        Passed unchanged to every gsub() call.
#
# Returns `x` after all replacements have been applied in order; `x` is
# returned unchanged when no patterns are supplied.
s <- function(x, ..., ignore.case = FALSE, perl = FALSE, fixed = FALSE,
              useBytes = FALSE) {
  # list(...) evaluates the arguments, so patterns stored in variables or
  # built by expressions work, not just literal strings.
  args <- list(...)
  n_args <- length(args)
  # Patterns sit at the odd positions. Building the index from a length keeps
  # the loop from running (instead of erroring) when there are no patterns.
  pattern_idx <- seq(1L, by = 2L, length.out = (n_args + 1L) %/% 2L)
  for (i in pattern_idx) {
    # A trailing pattern without a partner means "delete the matches".
    replacement <- if (i == n_args) "" else args[[i + 1L]]
    x <- gsub(args[[i]], replacement, x, ignore.case = ignore.case,
              perl = perl, fixed = fixed, useBytes = useBytes)
  }
  x
}
> s("aa bb cc","aa","dd","bb")
[1] "dd cc"

Resources