names(score)
[1] "(Intercept)" "aado2_calc(20,180]" "aado2_calc(360,460]"
[4] "aado2_calc(460,629]" "albumin[1,1.8]" "albumin(1.8,2.2]"
[7] "albumin(2.2,2.8]" "aniongap(15,18]" "aniongap(18,20]"
[10] "aniongap(20,22]" "aniongap(22,25]" "aniongap(25,49]"
[13] "ethnicityBLACK" "ethnicityUNKNOWN" "admission_typeEMERGENCY"
[16] "electivesurgery" "mechvent" "congestive_heart_failure"
[19] "cardiac_arrhythmias" "renal_failure" "liver_disease"
[22] "lymphoma" "metastatic_cancer" "coagulopathy"
[25] "obesity" "fluid_electrolyte"
In this program, I want to delete symbols or numbers behind "(" or "[". For example, "albumin[1,1.8]" should be "albumin".
We can use sub to match either the ( or the [ — written as the alternation (\\(|\\[) — followed by one or more digits ([0-9]+) and then the rest of the string (.*), and replace the whole match with an empty string:
# Drop everything from the first "(" or "[" that is immediately followed by
# a digit; factor-level suffixes like "[1,1.8]" vanish, while "(Intercept)"
# survives because its "(" is followed by a letter.
sub("[([][0-9]+.*", "", names(score))
#[1] "(Intercept)" "aado2_calc" "aado2_calc" "aado2_calc" "albumin"
#[6] "albumin" "albumin" "aniongap" "aniongap" "aniongap"
#[11] "aniongap" "aniongap" "ethnicityBLACK" "ethnicityUNKNOWN" "admission_typeEMERGENCY"
#[16] "electivesurgery" "mechvent" "congestive_heart_failure" "cardiac_arrhythmias" "renal_failure"
#[21] "liver_disease" "lymphoma" "metastatic_cancer" "coagulopathy" "obesity"
#[26] "fluid_electrolyte"
Related
I want to rename these columns in R. I want to remove the X from each of them so that only the figures remain, which represent different years varying from 1960 to 2020. The first two (Country Name and Country Code) are sorted out already.
[1] "ï..Country.Name" "Country.Code" "X1960" "X1961" "X1962"
[6] "X1963" "X1964" "X1965" "X1966" "X1967"
[11] "X1968" "X1969" "X1970" "X1971" "X1972"
[16] "X1973" "X1974" "X1975" "X1976" "X1977"
[21] "X1978" "X1979" "X1980" "X1981" "X1982"
[26] "X1983" "X1984" "X1985" "X1986" "X1987"
[31] "X1988" "X1989" "X1990" "X1991" "X1992"
[36] "X1993" "X1994" "X1995" "X1996" "X1997"
[41] "X1998" "X1999" "X2000" "X2001" "X2002"
[46] "X2003" "X2004" "X2005" "X2006" "X2007"
[51] "X2008" "X2009" "X2010" "X2011" "X2012"
[56] "X2013" "X2014" "X2015" "X2016" "X2017"
[61] "X2018" "X2019" "X2020"
# The pattern is anchored with "^", so each name has at most one match;
# sub() therefore does the same job as gsub() here.
names(df) <- sub("^X", "", names(df))
gsub() matches a regular expression and replaces every occurrence if found. In our case, the regex says the match must be an X at the beginning of the string; since the ^ anchor allows at most one match per name, sub() would work equally well here.
I have some email address where I am trying to extract the domain from. I found a solution here but it is taking too long.
I am trying with the following approach:
First remove all the text before the # sign.
# Remove everything up to and including the "#" separator, keeping only the
# domain.  The original gsub("#(.+)$", "\\1", emails) replaced the match
# with its own capture group, so it merely deleted the "#" character and
# kept all the text before it — the opposite of "remove the text before #".
sub(".*#", "", emails)
Other - not used
# Not used: qdapRegex::ex_between() extracts text between a left and a right
# boundary; with "." as both boundaries it presumably targets the substring
# between dots, not the full domain — NOTE(review): verify before reuse.
qdapRegex::ex_between(emails, ".", ".")
Data:
# Sample addresses; "#" stands in for "@" in this sanitized data set.
emails <- c(
  "ut317#hotmail.com",
  "drrro#iueywapp.com",
  "esdfdsfos#lasdfsdfsdstores.com",
  "asfds#mobsdaff.com",
  "asfsdaf.gsdsfdsfd#hotmail.org",
  "asdfdsaf#sdffsddapp.com",
  "wqrerq.mwqerweing#mwerqwie.com",
  "qwera#niweqrerw.tv",
  "qwereqr3rew7#hotmail.com",
  "mqwerwewrk#moweqrewsfaslay.com"
)
You can try the following:
# Requires the stringr package: str_locate() returns a start/end matrix for
# the first "#" per element; column 1 + 1 is the character right after it,
# and str_sub() takes from there to the end of the string.
str_sub(emails, str_locate(emails, "#")[,1]+1)
Output:
[1] "hotmail.com" "iueywapp.com" "lasdfsdfsdstores.com" "mobsdaff.com"
[5] "hotmail.org" "sdffsddapp.com" "mwerqwie.com" "niweqrerw.tv"
[9] "hotmail.com" "moweqrewsfaslay.com"
how about
# Capture the text between the "#" and the final dot — the greedy leading
# ".*" pushes the match of "\\..*" to the last dot — leaving the bare
# domain name without its TLD.
sub(".*#(.*)\\..*", "\\1", emails)
[1] "hotmail" "iueywapp" "lasdfsdfsdstores" "mobsdaff"
[5] "hotmail" "sdffsddapp" "mwerqwie" "niweqrerw"
[9] "hotmail" "moweqrewsfaslay"
or if you want everything after the #:
# Anchored variant: delete the (greedy) prefix through the "#"; the anchor
# is redundant because ".*" already matches from the start, so the result
# is identical.
sub("^.*#", "", emails)
[1] "hotmail.com" "iueywapp.com" "lasdfsdfsdstores.com"
[4] "mobsdaff.com" "hotmail.org" "sdffsddapp.com"
[7] "mwerqwie.com" "niweqrerw.tv" "hotmail.com"
[10] "moweqrewsfaslay.com"
We can use trimws from base R
# trimws() splices the 'whitespace' argument into its trimming regex, so a
# pattern such as '.*#' strips the greedy prefix through '#' from the left
# end.  NOTE(review): this leans on trimws() internals — fragile; confirm
# against the base-R implementation before relying on it.
trimws(emails, whitespace= '.*#')
#[1] "hotmail.com" "iueywapp.com" "lasdfsdfsdstores.com" "mobsdaff.com" "hotmail.org" "sdffsddapp.com"
#[7] "mwerqwie.com" "niweqrerw.tv" "hotmail.com" "moweqrewsfaslay.com"
# The alternation additionally trims '\\..*' (first dot onward) from the
# right end, leaving only the bare domain name.
trimws(emails, whitespace= '.*#|\\..*')
#[1] "hotmail" "iueywapp" "lasdfsdfsdstores" "mobsdaff" "hotmail" "sdffsddapp" "mwerqwie"
#[8] "niweqrerw" "hotmail" "moweqrewsfaslay"
I have a vector that looks like
> inecodes
[1] "01001" "01002" "01049" "01003" "01006" "01037" "01008" "01004" "01009" "01010" "01011"
[12] "01013" "01014" "01016" "01017" "01021" "01022" "01023" "01046" "01056" "01901" "01027"
[23] "01019" "01020" "01028" "01030" "01031" "01032" "01902" "01033" "01036" "01058" "01034"
[34] "01039" "01041" "01042" "01043" "01044" "01047" "01051" "01052" "01053" "01054" "01055"
And I want to remove these "numbers" from this vector:
>pob
[1] "01001-Alegría-Dulantzi" "01002-Amurrio"
[3] "01049-Añana" "01003-Aramaio"
[5] "01006-Armiñón" "01037-Arraia-Maeztu"
[7] "01008-Arratzua-Ubarrundia" "01004-Artziniega"
[9] "01009-Asparrena" "01010-Ayala/Aiara"
[11] "01011-Baños de Ebro/Mañueta" "01013-Barrundia"
[13] "01014-Berantevilla" "01016-Bernedo"
[15] "01017-Campezo/Kanpezu" "01021-Elburgo/Burgelu"
[17] "01022-Elciego" "01023-Elvillar/Bilar"
[19] "01046-Erriberagoitia/Ribera Alta"
They are longer than these samples and they don't have the same length. The answer must be like the following:
>pob
[1] "Alegría-Dulantzi" "Amurrio"
[3] "Añana" "Aramaio"
[5] "Armiñón" "Arraia-Maeztu"
[7] "Arratzua-Ubarrundia" "Artziniega"
[9] "Asparrena" "Ayala/Aiara"
[11] "Baños de Ebro/Mañueta" "Barrundia"
[13] "Berantevilla" "Bernedo"
[15] "Campezo/Kanpezu" "Elburgo/Burgelu"
[17] "Elciego" "Elvillar/Bilar"
[19] "Erriberagoitia/Ribera Alta"
Not sure why you needed inecodes at all, since you can use sub to remove the leading code (the digits and the dash that follows them):
# Strip the leading run of digits plus the dash that follows it; "[0-9]+"
# is the explicit-class spelling of "\\d+".
sub("^[0-9]+-", "", pob)
Result:
[1] "Alegría-Dulantzi" "Amurrio" "Añana"
[4] "Aramaio" "Armiñón" "Arraia-Maeztu"
[7] "Arratzua-Ubarrundia" "Artziniega" "Asparrena"
[10] "Ayala/Aiara" "Baños de Ebro/Mañueta" "Barrundia"
[13] "Berantevilla" "Bernedo" "Campezo/Kanpezu"
[16] "Elburgo/Burgelu" "Elciego" "Elvillar/Bilar"
[19] "Erriberagoitia/Ribera Alta"
One reason that you might need inecodes is that you have codes in pob that don't exist in inecodes, but that doesn't seem like the case here. If you insist on using inecodes to remove numbers from pob, you can use str_replace_all from stringr:
library(stringr)
# Named replacement vector: every "<code>-" pattern maps to the empty
# string; character(n) yields n empty strings, same as rep("", n).
str_replace_all(pob, setNames(character(length(inecodes)), paste0(inecodes, "-")))
This gives you the exact same result:
[1] "Alegría-Dulantzi" "Amurrio" "Añana"
[4] "Aramaio" "Armiñón" "Arraia-Maeztu"
[7] "Arratzua-Ubarrundia" "Artziniega" "Asparrena"
[10] "Ayala/Aiara" "Baños de Ebro/Mañueta" "Barrundia"
[13] "Berantevilla" "Bernedo" "Campezo/Kanpezu"
[16] "Elburgo/Burgelu" "Elciego" "Elvillar/Bilar"
[19] "Erriberagoitia/Ribera Alta"
Data:
# Municipality codes (44 entries) and the code-prefixed town names (19).
inecodes <- c(
  "01001", "01002", "01049", "01003", "01006", "01037", "01008", "01004",
  "01009", "01010", "01011", "01013", "01014", "01016", "01017", "01021",
  "01022", "01023", "01046", "01056", "01901", "01027", "01019", "01020",
  "01028", "01030", "01031", "01032", "01902", "01033", "01036", "01058",
  "01034", "01039", "01041", "01042", "01043", "01044", "01047", "01051",
  "01052", "01053", "01054", "01055"
)
pob <- c(
  "01001-Alegría-Dulantzi", "01002-Amurrio", "01049-Añana",
  "01003-Aramaio", "01006-Armiñón", "01037-Arraia-Maeztu",
  "01008-Arratzua-Ubarrundia", "01004-Artziniega", "01009-Asparrena",
  "01010-Ayala/Aiara", "01011-Baños de Ebro/Mañueta", "01013-Barrundia",
  "01014-Berantevilla", "01016-Bernedo", "01017-Campezo/Kanpezu",
  "01021-Elburgo/Burgelu", "01022-Elciego", "01023-Elvillar/Bilar",
  "01046-Erriberagoitia/Ribera Alta"
)
library(stringr)
# Strip the "<code>-" prefix from each pob entry whose code appears in
# inecodes.  The original assigned unlist(str_split(pob, "-", 2))[2] —
# the second fragment of the FIRST element's split — to every matched
# position, clobbering them all with the same town name.  Split only the
# matched elements and take each one's own second fragment instead.
for (code in inecodes) {
  ix <- which(str_detect(pob, code))
  pob[ix] <- str_split_fixed(pob[ix], "-", 2)[, 2]
}
Try this. Match should be much faster
# Find pob entries whose numeric prefix occurs in inecodes, then strip the
# prefix.  match() already returns NA for codes absent from inecodes; the
# original wrapped it as pob[match(...)], so a match position larger than
# length(pob) produced a spurious NA and that entry was silently skipped.
pos <- which(!is.na(match(sub('^([0-9]+)-.*$', '\\1', pob), inecodes)))
pob[pos] <- sub('^[0-9]+-(.*)$', '\\1', pob[pos])
Please do post the timings if you manage to get this. Match usually solves many computational issues for large data sets lookup. Would like to see if there are any opposite scenarios.
A bit shorter than sub, str_detect and str_replace is str_remove:
library(stringr)
# Anchor the pattern and require at least one digit: the original
# "[0-9]*-" also matches a bare "-" (zero digits) anywhere in the string,
# so a code-less name like "Alegría-Dulantzi" would lose its first word.
# Output is unchanged for the inputs shown here.
c("01001-Alegría-Dulantzi", "01002-Amurrio") %>%
  str_remove("^[0-9]+-")
returns
"Alegría-Dulantzi" "Amurrio"
I have a character list that looks like this
[70] "CSF 5896-6133"
[71] "CRT 16"
[72] "SEEF 54-55"
[73] "CIF 190-195"
[74] "DE & /ON CIF 196-222"
[75] " CRT 17 "
[76] " SEEF 56-57"
[77] "DE & /ON CSF 6134-6725 "
[78] " SEEF 58-60"
[79] "CRT 18"
[80] " CSF 6726-6837"
[81] "SEEF 61"
[82] " CSF 6840-6926"
[83] " CIF 223-226"
[84] "SEEF 62-63"
[85] " CSF 6927-7065"
[86] " CIF 226-228"
[87] "CSF 7066-7185"
[88] "CSF 7186-7311"
[89] " CIF 229"
[90] " SEEF 66"
[91] "CSF 7312-7561"
[92] " CRT 19"
[93] " SEEF 67-68"
[94] "Final data QAQC done on CSF 1-7561"
[95] " CIF 1-229"
[96] " SEEF 1-68 "
[97] " CRT 1-19"
[98] "082015-HOBA-G17-1 changed to offPlot based on GIS review of searched area"
As you can see this is only part of it.
I want to remove all words that are NOT either a number or
CSF, CIF, SEEF, CRT
So that for example the section from 94-98 would look like
[94] "CSF 1-7561"
[95] " CIF 1-229"
[96] " SEEF 1-68 "
[97] " CRT 1-19"
As you can see line 98 would be deleted completely because it had none of the keywords I wanted it to have. Line 94 also got stripped of some words.
Consider the following vector:
# Reproducible sample: four keeper rows plus one line of pure noise.
v <- c(
  "Final data QAQC done on CSF 1-7561",
  "CIF 1-229",
  "SEEF 1-68",
  "CRT 1-19",
  "082015-HOBA-G17-1 changed to offPlot based on GIS review of searched area"
)
You could do:
## keywords we want to keep
cond <- c("CSF", "CIF", "SEEF", "CRT")
## runs of digits, optionally dash-joined, at the end of a match
reg <- "(\\d+-?)+$"
## alternation of every keyword plus the numeric-range regex
pattern <- paste0(paste(cond, collapse = "|"), "|", reg)
Then use stri_extract_all() from the stringi package:
library(stringi)
# One character vector per input element; elements with no match come back
# as NA (base regmatches() would give character(0) instead).
stri_extract_all_regex(v, pattern)
Which gives:
#[[1]]
#[1] "CSF" "1-7561"
#
#[[2]]
#[1] "CIF" "1-229"
#
#[[3]]
#[1] "SEEF" "1-68"
#
#[[4]]
#[1] "CRT" "1-19"
#
#[[5]]
#[1] NA
As mentioned by @akrun, you could also do:
# Base-R equivalent: gregexpr() locates every match, regmatches() extracts
# them; unmatched elements yield character(0) rather than NA.
regmatches(v, gregexpr(pattern, v))
Which gives:
#[[1]]
#[1] "CSF" "1-7561"
#
#[[2]]
#[1] "CIF" "1-229"
#
#[[3]]
#[1] "SEEF" "1-68"
#
#[[4]]
#[1] "CRT" "1-19"
#
#[[5]]
#character(0)
Use stringr:
library(stringr)
# Sample input (leading/trailing spaces preserved from the raw data):
# four extractable rows plus one pure-noise row.
testString <- c(
  "Final data QAQC done on CSF 1-7561",
  " CIF 1-229",
  " SEEF 1-68 ",
  " CRT 1-19",
  "082015-HOBA-G17-1 changed to offPlot based on GIS review of searched area"
)
# Keyword, whitespace, then a "<digits>-<digits>" range.  NOTE(review): a
# bare number without a dash (e.g. "CRT 16" in the fuller data set) would
# NOT match this pattern — confirm the data always carries a range.
str_extract(testString, "(CSF|CIF|SEEF|CRT)\\s+\\d+-\\d+")
[1] "CSF 1-7561" "CIF 1-229" "SEEF 1-68" "CRT 1-19" NA
I'd use the stringr library.
Here's a subset of your data.
# Subset of the data: single numbers, ranges, and one noise row.
x <- c(
  "CSF 5896-6133",
  "CRT 16",
  "SEEF 54-55",
  "CIF 190-195",
  "Final data QAQC done on CSF 1-7561",
  "082015-HOBA-G17-1 changed to offPlot based on GIS review of searched area"
)
You could use str_extract and a regular expression matching your pattern.
library(stringr)
> str_extract(x, '(CSF|CIF|SEEF|CRT)[:space:]+([0-9]|-)+')
[1] "CSF 5896-6133" "CRT 16" "SEEF 54-55" "CIF 190-195" "CSF 1-7561"
[6] NA
When you have nothing matching the pattern it will return a missing value.
I have two almost identical data.frames, and I want to find the unique column name that is added to the x.2 object.
> colnames(x.1)
[1] "listPrice" "rent" "floor" "livingArea"
[5] "rooms" "published" "constructionYear" "objectType"
[9] "booliId" "soldDate" "soldPrice" "url"
[13] "additionalArea" "isNewConstruction" "location.namedAreas" "location.address.streetAddress"
[17] "location.address.city" "location.position.latitude" "location.position.longitude" "location.region.municipalityName"
[21] "location.region.countyName" "location.distance.ocean" "source.name" "source.id"
[25] "source.type" "source.url" "areaSize" "priceDiff"
[29] "perc.priceDiff" "sqrmPrice"
> colnames(x.2)
[1] "listPrice" "livingArea" "additionalArea" "plotArea"
[5] "rooms" "published" "constructionYear" "objectType"
[9] "booliId" "soldDate" "soldPrice" "url"
[13] "isNewConstruction" "floor" "rent" "location.namedAreas"
[17] "location.address.streetAddress" "location.address.city" "location.position.latitude" "location.position.longitude"
[21] "location.region.municipalityName" "location.region.countyName" "location.distance.ocean" "source.name"
[25] "source.id" "source.type" "source.url" "areaSize"
[29] "priceDiff" "perc.priceDiff" "sqrmPrice"
You can use setdiff to get the column names that are in 'x.2' and not in 'x.1'
# setdiff(a, b) keeps elements of a absent from b (de-duplicating along
# the way), so this lists the column names unique to x.2.  Argument order
# matters: swapping them would list columns unique to x.1 instead.
setdiff(colnames(x.2), colnames(x.1))
Try
# Keep the x.2 column names with no counterpart in x.1; is.na(match(x, y))
# is the definition underlying !x %in% y, so the result is identical
# (duplicates, unlike with setdiff(), are retained).
colnames(x.2)[is.na(match(colnames(x.2), colnames(x.1)))]