How to split a string conditionally in R? - r

I would like to split a string into multiple columns based on a number of conditions.
An example of my data:
Col1<- c("01/05/2004 02:59", "01/05/2004 05:04", "01/06/2004 07:19", "01/07/2004 02:55", "01/07/2004 04:32", "01/07/2004 04:38", "01/07/2004 17:13", "01/07/2004 18:40", "01/07/2004 20:58", "01/07/2004 23:39", "01/09/2004 13:28")
Col2<- c("Wabamun #4 off line.", "Keephills #2 on line.", "Wabamun #1 on line.", "North Red Deer T217s bus lock out. Under investigation.", "T217s has blown CTs on 778L", "T217s North Red Deer bus back in service (778L out of service)", "Keephills #2 off line.", "Wabamun #4 on line.", "Sundance #1 off line.", "Keephills #2 on line", "Homeland security event lowered to yellow ( elevated)")
df<- data.frame(Col1,Col2)
I would like to be able to split Col2 conditionally.
To get something like this:
Col3<- c("Wabamun #4", "Keephills #2", "Wabamun #1", "General Asset", "General Asset", "General Asset", "Keephills #2", "Wabamun #4", "Sundance #1", "Keephills #2", "General Asset")
Col4<- c("off line.", "on line.", "on line.", "North Red Deer T217s bus lock out. Under investigation.", "T217s has blown CTs on 778L", "T217s North Red Deer bus back in service (778L out of service)", "off line.", "on line.", "off line.", "on line", "Homeland security event lowered to yellow ( elevated)")
Afterwards, I'm planning to find the time between when an asset goes down and when it comes back online. These are often generator plants, so I would also be looking up the capacity of the plant. For example, Keephills #2 has a capacity of 300MW.

Thankfully, regular expressions are here to save the day.
# Keep the text columns as character vectors rather than factors
# (required on R < 4.0, harmless on newer versions).
df <- data.frame(Col1, Col2, stringsAsFactors = FALSE)

# A power-plant entry starts with a name made of letters, then a space,
# a hash and the unit number, e.g. "Keephills #2". One or more digits are
# accepted so that a unit such as "#10" is not cut down to "#1".
pwrpattern <- "^[[:alpha:]]+ #[[:digit:]]+"
pwrmatch <- regexpr(pwrpattern, df$Col2)

# regmatches() returns only the rows that matched, so the assignment mask is
# derived from the same match data to guarantee both sides line up.
is_plant <- !is.na(pwrmatch) & pwrmatch > -1L
df$Col3 <- "General Asset"
df$Col3[is_plant] <- regmatches(df$Col2, pwrmatch)
Col3 now looks like: c("Wabamun #4", "Keephills #2", "Wabamun #1", "General Asset",
"General Asset", "General Asset", "Keephills #2", "Wabamun #4",
"Sundance #1", "Keephills #2", "General Asset")
The other line is a similar matter, simply matching all cases of on/off line.
# Rows that mention "on line" / "off line" are reduced to just that status;
# every other row keeps its full original text.
line_pattern <- "(on|off) line"
linematch <- regexpr(line_pattern, df$Col2)
has_status <- grepl(line_pattern, df$Col2)
df$Col4 <- replace(df$Col2, has_status, regmatches(df$Col2, linematch))
Col4 now looks like: c("off line", "on line", "on line", "North Red Deer T217s bus lock out. Under investigation.",
"T217s has blown CTs on 778L", "T217s North Red Deer bus back in service (778L out of service)",
"off line", "on line", "off line", "on line", "Homeland security event lowered to yellow ( elevated)"
)

> Col3 <- Col4 <- character(nrow(df))
> # Logical mask of the rows that name a unit ("#"). A mask is used instead of
> # negative indices because Col2[-index] selects nothing when index is empty.
> is_unit <- grepl("#", Col2, fixed = TRUE)
> # Split each unit row once at " o" (the start of "on line" / "off line").
> pieces <- strsplit(Col2[is_unit], " o", fixed = TRUE)
> Col3[is_unit] <- vapply(pieces, `[`, character(1), 1L)
> Col3[!is_unit] <- "General Asset"
> # strsplit() consumed the "o", so put it back in front of the status text.
> Col4[is_unit] <- paste0("o", vapply(pieces, `[`, character(1), 2L))
> Col4[!is_unit] <- Col2[!is_unit]
> Col3
## [1] "Wabamun #4" "Keephills #2" "Wabamun #1" "General Asset"
## [5] "General Asset" "General Asset" "Keephills #2" "Wabamun #4"
## [9] "Sundance #1" "Keephills #2" "General Asset"
> Col4
## [1] "off line."
## [2] "on line."
## [3] "on line."
## [4] "North Red Deer T217s bus lock out. Under investigation."
## [5] "T217s has blown CTs on 778L"
## [6] "T217s North Red Deer bus back in service (778L out of service)"
## [7] "off line."
## [8] "on line."
## [9] "off line."
## [10] "on line"
## [11] "Homeland security event lowered to yellow ( elevated)"

Related

Compare 2 columns of one dataframe with the same 2 columns of another dataframe and find the common combination of values [duplicate]

This question already has answers here:
How to join (merge) data frames (inner, outer, left, right)
(13 answers)
Closed 1 year ago.
How can I find the common combination of values in same columns of 2 dataframes? Basically same name and same artistName
dat1<-structure(list(artistName = c("Adele", "Mariah Carey", "D-Block Europe",
"Wham!", "Ed Sheeran", "Adele", "Adele", "Elton John & Dua Lipa",
"ArrDee", "GAYLE", "Ed Sheeran", "The Pogues", "Ed Sheeran",
"Shakin' Stevens", "Leona Lewis", "Sam Fender", "Acraze", "Kelly Clarkson",
"Joel Corry", "SwitchOTR"), name = c("Easy On Me", "All I Want For Christmas Is You",
"Overseas (feat. Central Cee)", "Last Christmas", "Shivers",
"Oh My God", "I Drink Wine", "Cold Heart (PNAU Remix)", "Flowers (Say My Name)",
"abcdefu", "Bad Habits", "Fairytale of New York", "Overpass Graffiti",
"Merry Christmas Everyone", "One More Sleep", "Seventeen Going Under",
"Do It To It (feat. Cherish)", "Underneath the Tree", "I Wish (feat. Mabel)",
"Coming for You (feat. A1 x J1)")), row.names = c(NA, 20L), class = "data.frame")
dat2<-structure(list(artistName = c("Adele", "Rod Wave", "Kodak Black",
"Drake", "Nardo Wick", "Drake", "Adele", "Bruno Mars, Anderson .Paak & Silk Sonic",
"Adele", "Summer Walker & SZA", "Mariah Carey", "GAYLE", "Drake",
"Doja Cat", "Lil Nas X & Jack Harlow", "Taylor Swift", "The Kid LAROI & Justin Bieber",
"Adele", "Summer Walker", "Drake"), name = c("Easy On Me", "By Your Side",
"Super Gremlin", "Knife Talk (feat. 21 Savage & Project Pat)",
"Who Want Smoke?? (feat. G Herbo, Lil Durk & 21 Savage)", "Way 2 Sexy (feat. Future & Young Thug)",
"Oh My God", "Smokin Out The Window", "My Little Love", "No Love",
"All I Want For Christmas Is You", "abcdefu", "Girls Want Girls (feat. Lil Baby)",
"Need To Know", "INDUSTRY BABY", "All Too Well (10 Minute Version) (Taylor's Version) (From The Vault)",
"STAY", "I Drink Wine", "Insane", "Fair Trade (feat. Travis Scott)"
)), row.names = c(NA, 20L), class = "data.frame")
Is the following you are looking for?
# No `by` is given, so the join uses the columns reported in the message
# below (artistName and name) and keeps only rows present in both data frames.
dplyr::inner_join(dat1, dat2)
#> Joining, by = c("artistName", "name")
#> artistName name
#> 1 Adele Easy On Me
#> 2 Mariah Carey All I Want For Christmas Is You
#> 3 Adele Oh My God
#> 4 Adele I Drink Wine
#> 5 GAYLE abcdefu

How to count how many string in a column

I have a datasets with a column "amenities" and I want to count how many amenities in each row.
> airbnbT$amenities[1]
[1] ["Essentials", "Refrigerator", "Shampoo", "TV", "Dedicated workspace", "Hangers", "Iron", "Long term stays allowed", "Dishes and silverware", "First aid kit", "Free parking on premises", "Hair dryer", "Patio or balcony", "Washer", "Dryer", "Cooking basics", "Coffee maker", "Private entrance", "Hot water", "Fire extinguisher", "Wifi", "Air conditioning", "Hot tub", "Kitchen", "Microwave", "Oven", "Smoke alarm"]
14673 Levels: ["Air conditioning", "Baby bath", "Long term stays allowed", "Baby monitor"] ...
> class(airbnbT$amenities[1])
[1] "factor"
Here for row 1, there are 27 amenities.
Is there a way to count the comma in each row "," ? This way would count the numbers of amenities.
Try str_count from the stringr package. You will need to add 1 since there will be one fewer comma than the number of amenities:
library(stringr)
# Count the double-quoted entries of each amenities string rather than
# counting commas and adding 1: an empty list ("[]") then yields 0 instead
# of 1, and a comma inside an amenity name is not mistaken for a separator.
# The pattern matches one quoted string, allowing backslash-escaped
# characters (such as \") inside it. as.character() turns the factor column
# into plain text before matching.
airbnbT$amenities_count <- str_count(
  as.character(airbnbT$amenities),
  '"(?:[^"\\\\]|\\\\.)*"'
)

Split 2 numbers character into it's own new line

I've a character object with 84 elements.
> head(output.by.line)
[1] "\n17"
[2] "Now when Joseph saw that his father"
[3] "laid his right hand on the head of"
[4] "Ephraim, it displeased him; so he took"
[5] "hold of his father's hand to remove it"
[6] "from Ephraim's head to Manasseh's"
But there is a line that has a two-digit number (49) that is not on its own line:
[35] "49And Jacob called his sons and"
I'd like to transform this into:
[35] "\n49"
[36] "And Jacob called his sons and"
And insert this in the correct numeration, after object 34.
Dput Output:
dput(output.by.line)
c("\n17", "Now when Joseph saw that his father", "laid his right hand on the head of",
"Ephraim, it displeased him; so he took", "hold of his father's hand to remove it",
"from Ephraim's head to Manasseh's", "head.", "\n18", "And Joseph said to his father, \"Not so,",
"my father, for this one is the firstborn;", "put your right hand on his head.\"",
"\n19", "But his father refused and said, \"I", "know, my son, I know. He also shall",
"become a people, and he also shall be", "great; but truly his younger brother shall",
"be greater than he, and his descendants", "shall become a multitude of nations.\"",
"\n20", "So he blessed them that day, saying,", "\"By you Israel will bless, saying, \"May",
"God make you as Ephraim and as", "Manasseh!\"' And thus he set Ephraim",
"before Manasseh.", "\n21", "Then Israel said to Joseph, \"Behold, I",
"am dying, but God will be with you and", "bring you back to the land of your",
"fathers.", "\n22", "Moreover I have given to you one", "portion above your brothers, which I",
"took from the hand of the Amorite with", "my sword and my bow.\"",
"49And Jacob called his sons and", "said, \"Gather together, that I may tell",
"you what shall befall you in the last", "days:", "\n2", "\"Gather together and hear, you sons of",
"Jacob, And listen to Israel your father.", "\n3", "\"Reuben, you are my firstborn, My",
"might and the beginning of my strength,", "The excellency of dignity and the",
"excellency of power.", "\n4", "Unstable as water, you shall not excel,",
"Because you went up to your father's", "bed; Then you defiled it-- He went up to",
"my couch.", "\n5", "\"Simeon and Levi are brothers;", "Instruments of cruelty are in their",
"dwelling place.", "\n6", "Let not my soul enter their council; Let",
"not my honor be united to their", "assembly; For in their anger they slew a",
"man, And in their self-will they", "hamstrung an ox.", "\n7",
"Cursed be their anger, for it is fierce;", "And their wrath, for it is cruel! I will",
"divide them in Jacob And scatter them", "in Israel.", "\n8",
"\"Judah, you are he whom your brothers", "shall praise; Your hand shall be on the",
"neck of your enemies; Your father's", "children shall bow down before you.",
"\n9", "Judah is a lion's whelp; From the prey,", "my son, you have gone up. He bows",
"down, he lies down as a lion; And as a", "lion, who shall rouse him?",
"\n10", "The scepter shall not depart from", "Judah, Nor a lawgiver from between his",
"feet, Until Shiloh comes; And to Him", "shall be the obedience of the people.",
"\n11", "Binding his donkey to the vine, And his", "donkey's colt to the choice vine, He"
)
Please, check this:
library(tidyverse)
# Split a leading number off each line.
#
# x: character vector of text lines.
# Returns a list with one character vector per element of x:
#   - lines that start with digits become c("\n<number>", "<rest of line>"),
#     or just "\n<number>" when nothing follows the number;
#   - all other lines (including those already starting with "\n") are
#     returned unchanged.
# The number is cut out by position, so no sentinel character has to be
# inserted into the text and digit-only lines do not yield an empty string.
split_line_number <- function(x) {
  lapply(x, function(line) {
    if (is.na(line) || !grepl("^[0-9]+", line)) {
      return(line)
    }
    number <- regmatches(line, regexpr("^[0-9]+", line))
    rest <- substring(line, nchar(number) + 1L)
    c(paste0("\n", number), if (nzchar(rest)) rest)
  })
}
# Apply the splitter to every line and flatten the pieces into one vector.
unlist(map(output.by.line, split_line_number))
# Output:
# [35] "\n49"
# [36] "And Jacob called his sons and"
# [37] "said, \"Gather together, that I may tell"
# [38] "you what shall befall you in the last"
An option using stringr::str_match is to match two components of an optional number followed by everything. Get the captured output from the matched matrix (2:3) and create a new vector of strings by dropping NAs and empty strings.
# Capture an optional leading number (group 1) and the remaining text
# (group 2) of every line; columns 2:3 of the match matrix hold the groups.
parts <- stringr::str_match(
  output.by.line, "(\n?\\d+)?(.*)"
)[, 2:3, drop = FALSE]
# Give bare numbers such as "49" the same "\n" prefix as the other markers.
parts[, 1] <- sub("^([0-9])", "\n\\1", parts[, 1])
# Transpose so that c() interleaves number and text line by line, then drop
# the NA / empty cells left by lines that lack one of the two parts.
vals <- c(t(parts))
output <- vals[!is.na(vals) & vals != ""]
output[32:39]
#[1] "portion above your brothers, which I"
#[2] "took from the hand of the Amorite with"
#[3] "my sword and my bow.\""
#[4] "\n49"
#[5] "And Jacob called his sons and"
#[6] "said, \"Gather together, that I may tell"
#[7] "you what shall befall you in the last"
#[8] "days:"
We'll make use of the stringr package:
library(stringr)
Modify the object:
# Lines in which a digit is directly followed by a letter need splitting.
needs_split <- grepl('[[:digit:]][[:alpha:]]', output.by.line)
# Put "\n" before the number and a blank after it, then cut at the first blank.
marked <- gsub(
  '([[:digit:]]+)([[:alpha:]])',
  paste0('\n', '\\1 \\2'),
  output.by.line
)
pieces <- str_split(marked, '[[:blank:]]', n = 2)
# Take the split pieces for flagged lines, the original line otherwise.
output.by.line <- unlist(ifelse(needs_split, pieces, output.by.line))
Print the results:
dput(output.by.line)
#[32] "portion above your brothers, which I"
#[33] "took from the hand of the Amorite with"
#[34] "my sword and my bow.\""
#[35] "\n49"
#[36] "And Jacob called his sons and"
#[37] "said, \"Gather together, that I may tell"
#[38] "you what shall befall you in the last"

How to remove a group of words from a character variable in R?

I have a dataset with the character variable like below and I want to remove some words from that variable.
My data:
variable
1 "star war it ik ar"
2 "jammu kashmir DO"
3 "hey Ho dude"
4 "time kya hi abhi"
5 "every one ji"
6 "thanks to iF everyone"
7 "hey i am in america yo"
I want to remove some words and letters from the variable.
Expected result:
variable created_var
1 "star war it ik ar" "star war"
2 "jammu kashmir DO" "jammu kashmir"
3 "hey Ho dude" "hey dude"
4 "time kya hi abhi" "time kya abhi"
5 "every one ji" "every one"
6 "thanks to iF everyone" "thanks to everyone"
7 "hey i am in america yo" "hey i am in america"
I tried the following:
waste <- c("it", "ik", "ar", "DO", "Ho", "hi", "iF", "yo")
data$created_var <- gsub(waste, "", data$variable)
The above code is working for a single word but not for the group of words. How can I remove all the words in waste?

Why does the ngrams() function give distinct bigrams?

I am writing an R script and am using library(ngram).
Suppose I have a string,
"good qualiti dog food bought sever vital can dog food product found good qualiti product look like stew process meat smell better labrador finicki appreci product better"
and want to find bi-grams.
The ngram library is giving me bi-grams as follows:
"appreci product" "process meat" "food product" "food bought" "qualiti dog" "product found" "product look" "look like" "like stew" "good qualiti" "labrador finicki" "bought sever" "qualiti product" "better labrador"
"dog food" "smell better" "vital can" "meat smell" "found good" "sever vital" "stew process" "can dog" "finicki appreci" "product better"
As the sentence contains "dog food" two times, I want this bi-gram two times. But I am getting it once!
Is there an option in the ngram library or any other library that gives all the bi-grams of my sentence in R?
The development version of ngram has a get.phrasetable method:
devtools::install_github("wrathematics/ngram")
library(ngram)
# Build the n-gram object from the sample sentence, then show the first rows
# of its phrase table (n-gram, frequency and proportion).
text <- "good qualiti dog food bought sever vital can dog food product found good qualiti product look like stew process meat smell better labrador finicki appreci product better"
ng <- ngram(text)
phrase_table <- get.phrasetable(ng)
head(phrase_table)
# ngrams freq prop
# 1 good qualiti 2 0.07692308
# 2 dog food 2 0.07692308
# 3 appreci product 1 0.03846154
# 4 process meat 1 0.03846154
# 5 food product 1 0.03846154
# 6 food bought 1 0.03846154
In addition, you can use the print() method and specify output = "full". That is:
print(ng, output = "full")
# NOTE: more output not shown...
better labrador | 1
finicki {1} |
dog food | 2
product {1} | bought {1}
# NOTE: more output not shown...
You can use stylo package. Gives duplicates:
library(stylo)
# Tokenise the sentence into words, then join neighbouring words into
# bigrams; repeated bigrams are kept.
sample_text <- "good qualiti dog food bought sever vital can dog food product found good qualiti product look like stew process meat smell better labrador finicki appreci product better"
word_vec <- txt.to.words(sample_text)
bigrams <- make.ngrams(word_vec, ngram.size = 2)
print(bigrams)
Result:
[1] "good qualiti" "qualiti dog" "dog food" "food bought" "bought sever" "sever vital" "vital can" "can dog" "dog food"
[10] "food product" "product found" "found good" "good qualiti" "qualiti product" "product look" "look like" "like stew" "stew process"
[19] "process meat" "meat smell" "smell better" "better labrador" "labrador finicki" "finicki appreci" "appreci product" "product better"
>
You could use RWeka. In the result you can see "dog food" and "good qualiti" appearing twice
txt <- "good qualiti dog food bought sever vital can dog food product found good qualiti product look like stew process meat smell better labrador finicki appreci product better"
library(RWeka)
# Tokenise x into n-grams of exactly two words (min and max both set to 2).
RWEKABigramTokenizer <- function(x) {
  bigram_only <- Weka_control(min = 2, max = 2)
  NGramTokenizer(x, bigram_only)
}
RWEKABigramTokenizer(txt)
[1] "good qualiti" "qualiti dog" "dog food" "food bought" "bought sever" "sever vital" "vital can"
[8] "can dog" "dog food" "food product" "product found" "found good" "good qualiti" "qualiti product"
[15] "product look" "look like" "like stew" "stew process" "process meat" "meat smell" "smell better"
[22] "better labrador" "labrador finicki" "finicki appreci" "appreci product" "product better"
Or use the tm package in combination with RWeka
library(tm)
library(RWeka)
# Build a term-document matrix whose terms are the bigrams produced by
# RWEKABigramTokenizer.
bigram_control <- list(tokenize = RWEKABigramTokenizer)
my_corp <- Corpus(VectorSource(txt))
tdm_RWEKA <- TermDocumentMatrix(my_corp, control = bigram_control)
# List the bigrams with a frequency of at least 2
findFreqTerms(tdm_RWEKA, lowfreq = 2)
[1] "dog food" "good qualiti"
#turn into matrix with frequency counts
tdm_matrix <- as.matrix(tdm_RWEKA)
In order to produce such bi-gram, you don't need any special package. Basically, split the text and paste it together again.
sentence <- "good qualiti dog food bought sever vital can dog food product found good qualiti product look like stew process meat smell better labrador finicki appreci product better"
ug <- strsplit(sentence, " ")[[1]]
# Pair every word with its successor: all words but the last, next to all
# words but the first. Both vectors have the same length, so paste() cannot
# recycle the shorter one and append a bogus pair made of the last word and
# the second word.
bg <- paste(head(ug, -1), tail(ug, -1))
# The resulting bg is:
# [1] "good qualiti" "qualiti dog" "dog food"
# [4] "food bought" "bought sever" "sever vital"
# [7] "vital can" "can dog" "dog food"
# [10] "food product" "product found" "found good"
# [13] "good qualiti" "qualiti product" "product look"
# [16] "look like" "like stew" "stew process"
# [19] "process meat" "meat smell" "smell better"
# [22] "better labrador" "labrador finicki" "finicki appreci"
# [25] "appreci product" "product better"
Try the quanteda package:
> # tokenize() is no longer part of quanteda; tokenise with tokens() and build
> # the bigrams with tokens_ngrams(). as.list() gives a plain list per document.
> as.list(quanteda::tokens_ngrams(quanteda::tokens(txt), n = 2, concatenator = " "))
$text1
[1] "good qualiti" "qualiti dog" "dog food" "food bought" "bought sever" "sever vital"
[7] "vital can" "can dog" "dog food" "food product" "product found" "found good"
[13] "good qualiti" "qualiti product" "product look" "look like" "like stew" "stew process"
[19] "process meat" "meat smell" "smell better" "better labrador" "labrador finicki" "finicki appreci"
[25] "appreci product" "product better"
Plenty of additional arguments available through ngrams, including getting different combinations of n sizes, or skip-grams.

Resources