Separate string into list in r - r

I have a string in R that looks like this:
"{[PP]}{[BGH]}{[AC]}{[ETL]}....{[D]}"
I want to convert it into a list so that:
List[[1]] = {[PP]}
List[[2]] = {[BGH]}
....
List[[N]] = {[D]}
If there were commas you could do strsplit but I want to keep the brackets and not get rid of them. Not sure how to do this in R

without regular expressions:
s <- "{[PP]}{[BGH]}{[AC]}{[ETL]}{[D]}"
# Split on the literal "{" (fixed = TRUE, so no regular expression is used),
# drop the empty piece in front of the first "{", then put the "{" that
# strsplit() consumed back at the start of every piece.
as.list(paste0("{", strsplit(s, "{", fixed = TRUE)[[1]][-1]))
[[1]]
[1] "{[PP]}"
[[2]]
[1] "{[BGH]}"
[[3]]
[1] "{[AC]}"
[[4]]
[1] "{[ETL]}"
[[5]]
[1] "{[D]}"

strsplit still works if you pass this regular expression (?<=})(?={) which constrains the position to split on:
# Split at the zero-width position between "}" and "{". perl = TRUE enables
# the lookarounds; TRUE is spelled out because T is a reassignable variable.
strsplit(s, "(?<=})(?={)", perl = TRUE)
# [[1]]
# [1] "{[PP]}" "{[BGH]}" "{[AC]}" "{[ETL]}" "{[D]}"
Or, as @thelatemail suggested:
# A lookbehind alone is enough: split after every "}". TRUE is spelled out
# because T is a reassignable variable.
strsplit(s, "(?<=})", perl = TRUE)

obligatory stringi answer:
library(stringi)
dat <- "{[PP]}{[BGH]}{[AC]}{[ETL]}{[more]{[D]}"
as.list(stri_match_all_regex(dat, "(\\{\\[[[:alpha:]]+\\]\\})")[[1]][,2])
## [[1]]
## [1] "{[PP]}"
##
## [[2]]
## [1] "{[BGH]}"
##
## [[3]]
## [1] "{[AC]}"
##
## [[4]]
## [1] "{[ETL]}"
##
## [[5]]
## [1] "{[D]}"

There is a convenient function in qdap for this i.e. bracketXtract
library(qdap)
setNames(as.list(bracketXtract(s, "curly", TRUE)), NULL)
#[[1]]
#[1] "{[PP]}"
#[[2]]
#[1] "{[BGH]}"
#[[3]]
#[1] "{[AC]}"
#[[4]]
#[1] "{[ETL]}"
#[[5]]
#[1] "{[D]}"
The third argument is with, which defaults to FALSE. Unless you pass with = TRUE (as above), the brackets are removed from the result.
data
s <- "{[PP]}{[BGH]}{[AC]}{[ETL]}{[D]}"

Related

r extract element from the list with for-loop

I have the following list of values:
$`1`
[1] "S21_027_1" "5_G3_A_1_counts.txt"
$`5`
[1] "S21_027_13" "5_G3_A_12_counts.txt"
$`9`
[1] "S21_027_17" "5_G3_A_15_counts.txt"
$`14`
[1] "S21_027_21" "5_G3_A_22_counts.txt"
$`18`
[1] "S21_027_25" "5_G3_A_26_counts.txt"
$`22`
[1] "S21_027_29" "5_G3_A_29_counts.txt"
I am trying to extract only the elements that start with S21_027.
I tried to use a for loop, but it keeps just one element.
My attempt to extract it:
order_column <- c()
for (i in length(order_col))
{
v <- order_col[[i]][[1]]
print(v)
order_column <- c(v, order_column)
}
Using base R
lapply(order_col, grep, pattern = 'S21_027', value = TRUE)
[[1]]
[1] "S21_027_1"
[[2]]
[1] "S21_027_13"
[[3]]
[1] "S21_027_17"
Does this work:
lst <- list(c('S21_027_1','5_G3_A_1_counts.txt'),
c('S21_027_13','5_G3_A_12_counts.txt'),
c('S21_027_17','5_G3_A_15_counts.txt'))
sapply(lst, function(x) x[grepl('^S21_027', x)])
[1] "S21_027_1" "S21_027_13" "S21_027_17"
You may use -
library(purrr)
library(stringr)
map(order_col, str_subset, "S21_027")
#[[1]]
#[1] "S21_027_1"
#[[2]]
#[1] "S21_027_13"
#[[3]]
#[1] "S21_027_17"
Or to extract the 1st element -
map_chr(order_col, head, 1)
#[1] "S21_027_1" "S21_027_13" "S21_027_17"

Iteratively extract repeated word forms across speaking turns

I'm working on speaking turns in conversation. My interest is in the words that get repeated from a prior turn to a next turn:
turnsX <- data.frame(
speaker = c("A","B","A","B"),
speech = c("let's have a look",
"yeah let's take a look",
"yeah okay so where to start",
"let's start here"), stringsAsFactors = F
)
I want to extract the repeated word forms. To this end I've run a for loop, iteratively defining each speech turn as a regex pattern for the next speech turn and str_extracting the words that get repeated from turn to turn:
library(stringr)
pattern <- c()
extracted <- c()
for(i in 1:nrow(turnsX)){
pattern[i] <- paste0(unlist(str_split(turnsX$speech[i], " ")), collapse = "|")
extracted[i+1] <- str_extract_all(turnsX$speech[i+1], pattern[i])
}
The result however is partly incorrect:
extracted
[[1]]
NULL
[[2]]
[1] "a" "let's" "a" "a" "look"
[[3]]
[1] "yeah" "a" "a"
[[4]]
[1] "start"
[[5]]
[1] NA
The correct result should be:
extracted
[[1]]
NULL
[[2]]
[1] "let's" "a" "look"
[[3]]
[1] "yeah"
[[4]]
[1] "start"
Where's the mistake? How can the code be mended, or what other approach is there, to get the correct result?
Maybe you can use Map and %in%.
x <- strsplit(turnsX$speech, " ")
Map(function(y,z) y[y %in% z], x[-length(x)], x[-1])
#[[1]]
#[1] "let's" "a" "look"
#
#[[2]]
#[1] "yeah"
#
#[[3]]
#[1] "start"
Here's a base R approach using Map :
tmp <- strsplit(turnsX$speech, ' ')
c(NA, Map(intersect, tmp[-1], tmp[-length(tmp)]))
#[[1]]
#[1] NA
#[[2]]
#[1] "let's" "a" "look"
#[[3]]
#[1] "yeah"
#[[4]]
#[1] "start"
You want the word boundaries "\\b": without them, a short word such as "a" also matches inside longer words like "yeah" and "take".
library(stringr)
# For every turn after the first, extract the words that also occur as whole
# words in the previous turn. extracted[[1]] stays NULL because the first turn
# has no predecessor.
n_turns <- nrow(turnsX)
pattern <- character(max(n_turns - 1, 0))
extracted <- vector("list", n_turns)
for (i in seq_len(n_turns)[-1]) {
  words <- str_split(turnsX$speech[i - 1], " ")[[1]]
  # Wrap the whole alternation in \\b ... \\b so that every word, including
  # the first one, must match as a complete word and not as a prefix or an
  # infix of a longer word.
  # NOTE(review): words containing regex metacharacters are not escaped here.
  pattern[i - 1] <- paste0("\\b(?:", paste(words, collapse = "|"), ")\\b")
  extracted[[i]] <- str_extract_all(turnsX$speech[i], pattern[i - 1])[[1]]
}
# [[1]]
# NULL
#
# [[2]]
# [1] "let's" "a" "look"
#
# [[3]]
# [1] "yeah"
#
# [[4]]
# [1] "start"

ft_tokenizer tokenizes words to lower, I want it to be as they are

I am using ft_tokenizer on a Spark DataFrame in R.
It tokenizes each word and converts it to lowercase, but I want the words to keep their original case.
text_data <- data_frame(
x = c("This IS a sentence", "So is this")
)
tokenized <- text_data_tbl %>%
ft_tokenizer("x", "word")
tokenized$word
## [[1]]
## [[1]][[1]]
## [1] "this"
##
## [[1]][[2]]
## [1] "is"
##
## [[1]][[3]]
## [1] "a"
I want:
tokenized$word
## [[1]]
## [[1]][[1]]
## [1] "This"
##
## [[1]][[2]]
## [1] "IS"
##
## [[1]][[3]]
## [1] "a"
I guess it is not possible with ft_tokenizer. From ?ft_tokenizer
A tokenizer that converts the input string to lowercase and then splits it by white spaces.
So its basic feature is to convert the string to lowercase and split on whitespace, which I guess cannot be changed. Consider doing
text_data$new_x <- lapply(strsplit(text_data$x, "\\s+"), as.list)
which will give the same output as expected and you can continue your process as it is from here.
text_data$new_x
#[[1]]
#[[1]][[1]]
#[1] "This"
#[[1]][[2]]
#[1] "IS"
#[[1]][[3]]
#[1] "a"
#[[1]][[4]]
#[1] "sentence"
#[[2]]
#[[2]][[1]]
#[1] "So"
#[[2]][[2]]
#[1] "is"
#[[2]][[3]]
#[1] "this"

R - Grepl vector over vector

I have a vector of character strings (v1) like so:
> head(v1)
[1] "do_i_need_to_even_say_it_do_i_well_here_i_go_anyways_chris_cornell_in_chicago_tonight"
[2] "going_to_see_harry_sunday_happiness"
[3] "this_motha_fucka_stay_solid_foh_with_your_naieve_ass_mentality_your_synapsis_are_lacking_read_a_fucking_book_for_christ_sake"
[4] "why_twitter_will_soon_become_obsolete_http_www.imediaconnection.com_content_23465_asp"
[5] "like_i_said_my_back_still_fucking_hurts_and_im_going_to_complain_about_it_like_no_ones_business_http_tumblr.com_x6n25amd5"
[6] "my_picture_with_kris_karmada_is_gone_forever_its_not_in_my_comments_on_my_mysapce_or_on_my_http_tumblr.com_xzg1wy4jj"
And another vector of character strings (v2) like so:
> head(v2)
[1] "here_i_go" "going" "naieve_ass" "your_synapsis" "my_picture_with" "roll"
What is the quickest way that I can return a list of vectors where each list item represents each vector item in v1 and each vector item is a regular expression match where an item in v2 appeared in that v1 item, like so:
[[1]]
[1] "here_i_go"
[[2]]
[1] "going"
[[3]]
[1] "naieve_ass" "your_synapsis"
[[4]]
[[5]]
[1] "going"
[[6]]
[1] "my_picture_with"
I'd like to leave another option with stri_extract_all_regex() in the stringi package. You can create your regular expression directly from v2 and use it in pattern.
library(stringi)
stri_extract_all_regex(str = v1, pattern = paste(v2, collapse = "|"))
[[1]]
[1] "here_i_go"
[[2]]
[1] "going"
[[3]]
[1] "naieve_ass" "your_synapsis"
[[4]]
[1] NA
[[5]]
[1] "going"
[[6]]
[1] "my_picture_with"
If you want speed, I'd use stringi. You don't seem to have any regex, just fixed patterns, so we can use a fixed stri_extract, and (since you don't mention what to do with multiple matches) I'll assume only extracting the first match is fine, giving us a little more speed with stri_extract_first_fixed.
It's probably not worth benchmarking on such a small example, but this should be quite fast.
library(stringi)
matches = lapply(v1, stri_extract_first_fixed, v2)
lapply(matches, function(x) x[!is.na(x)])
# [[1]]
# [1] "here_i_go"
#
# [[2]]
# [1] "going"
#
# [[3]]
# [1] "naieve_ass" "your_synapsis"
#
# [[4]]
# character(0)
#
# [[5]]
# [1] "going"
#
# [[6]]
# [1] "my_picture_with"
Thanks for sharing data, but next time please share it copy/pasteably. dput is nice for that. Here's a copy/pasteable input:
v1 = c(
"do_i_need_to_even_say_it_do_i_well_here_i_go_anyways_chris_cornell_in_chicago_tonight" ,
"going_to_see_harry_sunday_happiness" ,
"this_motha_fucka_stay_solid_foh_with_your_naieve_ass_mentality_your_synapsis_are_lacking_read_a_fucking_book_for_christ_sake",
"why_twitter_will_soon_become_obsolete_http_www.imediaconnection.com_content_23465_asp" ,
"like_i_said_my_back_still_fucking_hurts_and_im_going_to_complain_about_it_like_no_ones_business_http_tumblr.com_x6n25amd5" ,
"my_picture_with_kris_karmada_is_gone_forever_its_not_in_my_comments_on_my_mysapce_or_on_my_http_tumblr.com_xzg1wy4jj")
v2 = c("here_i_go", "going", "naieve_ass", "your_synapsis", "my_picture_with", "roll" )

Check list in list in R

I have 2 lists. I want to check whether each element of the first list is in the second list and, if it is, append the letters "a", "b", ... to those elements of the first list.
list1 <- list("Year","Age","Enrollment","SES","BOE")
list2 <- list("Year","Enrollment","SES")
I try to use lapply
text <- letters[1:length(list2)]
listText<- lapply(list1,function(i) ifelse(i %in% list2,paste(i,text[i],sep="^"),i))
I got wrong output
> listText
[[1]]
[1] "Year^NA"
[[2]]
[1] "Age"
[[3]]
[1] "Enrollment^NA"
[[4]]
[1] "SES^NA"
[[5]]
[1] "BOE"
This is the output I want
[[1]]
[1] "Year^a"
[[2]]
[1] "Age"
[[3]]
[1] "Enrollment^b"
[[4]]
[1] "SES^c"
[[5]]
[1] "BOE"
We can use match to find the index and then use it to subset the first list and paste the letters
i1 <- match(unlist(list2), unlist(list1))
list1[i1] <- paste(list1[i1], letters[seq(length(i1))], sep="^")
You just need to make text a named vector, so that text[i] looks up each letter by the element's name:
text <- as.character(letters[1:length(list2)])
names(text) <- unlist(list2)
The result is :
> listText
[[1]]
[1] "Year^a"
[[2]]
[1] "Age"
[[3]]
[1] "Enrollment^b"
[[4]]
[1] "SES^c"
[[5]]
[1] "BOE"

Resources