retrieve a string from grep function in R - r

I'm searching in some sentences some words using this code
sent <- c()
for (w in my_words){
my_selected_sentences <- grep(paste0('\\b', w, '\\b'), my_sentences, ignore.case = TRUE, value = TRUE, perl = TRUE)
sent <- c(sent, my_selected_sentences)
}
Now I'd like to add in the vector "sent" word or words that have been retrieved. For example
[1] here is the selected sentence.---"the", "sentence"
How I can do? thanks!

You can do something like this (used an artificial example):
my_sentences <- c('hi there','will you do it', 'i will go home', 'salt is good for health')
my_sentences_df <- data.frame(sentences = my_sentences, words_retrieved = "", stringsAsFactors = FALSE)
my_words <- c('will','salt','home')
sent <- c()
for (w in my_words){
my_selected_sentences <- grep(paste0('\\b', w, '\\b'), my_sentences, ignore.case = TRUE, value = TRUE, perl = TRUE)
my_sentences_df[grep(w, my_sentences_df$sentences),"words_retrieved"] <-
paste(my_sentences_df[grep(w, my_sentences_df$sentences),"words_retrieved"], w, sep = ", ")
sent <- c(sent, my_selected_sentences)
}
my_sentences_df$words_retrieved <- sub('.', '', my_sentences_df$words_retrieved)
writeLines(with(my_sentences_df, paste0(my_sentences, " ---", words_retrieved)))

Related

Split multiple csv files in chunks based on condition

I have 400+ quite large csv files (~ million rows) having all a similar structure :
A long header of which I only need the 2nd and 3rd rows
A first time serie (always preceded by 'Target1')
A second time serie (always preceded by 'Target2')
Here is an example of data :
#multiple rows of header#
Target 1
Timestamp,X,Y,Z
1553972886851,0.017578,-0.003052,-0.971375
1553972886851,0.017883,-0.003662,-0.980408
1553972886851,0.016418,-0.003174,-0.977295
1553972886999,0.017151,-0.002808,-0.978088
1553972886999,0.016785,-0.003113,-0.977051
1553972886999,0.017883,-0.002197,-0.975830
1553972887096,0.017517,-0.003113,-0.976624
1553972887096,0.017883,-0.003113,-0.977966
1553972887096,0.017883,-0.002869,-0.978210
1553972887243,0.017151,-0.003113,-0.976135
1553972887243,0.018250,-0.003235,-0.975647
1553972887243,0.017273,-0.002991,-0.976257
1553972887340,0.018372,-0.003235,-0.977722
1553972887340,0.017761,-0.003235,-0.978027
Target 2
Timestamp,X,Y,Z
1553972886753,-0.411585,0.072409,-0.849848
1553972886753,-0.339177,-0.053354,-0.556402
1553972886753,-0.411585,-0.262957,-0.483994
1553972886855,-0.506860,-0.057165,-0.472561
1553972886855,-0.499238,-0.007622,-0.529726
1553972886855,-0.472561,-0.041921,-0.560213
1553972887002,-0.510671,-0.083841,-0.480183
1553972887002,-0.525915,-0.057165,-0.480183
1553972887002,-0.544969,-0.038110,-0.522104
1553972887098,-0.510671,-0.030488,-0.510671
1553972887098,-0.529726,-0.026677,-0.525915
1553972887098,-0.510671,-0.068598,-0.518293
I need to split each csv files in those 3 parts and name them accordingly.
I managed to do step 1) and step 3) but struggle for step 2).
Here is what I did for step 3) :
fileNames <- basename(list.files(path = ".", all.files = FALSE, full.names = FALSE, recursive = TRUE, ignore.case = FALSE, include.dirs = FALSE))
extension <- "txt"
fileNumbers <- seq(fileNames)
for (fileNumber in fileNumbers) {
newFileName <- paste("Target2-",
sub(paste("\\.", extension, sep = ""), "", fileNames[fileNumber]),
".", extension, sep = "")
# read old data:
Lines <- readLines(fileNames[fileNumber])
ix <- which(Lines == "Target2")
sample <- read.csv(fileNames[fileNumber],
header = TRUE,
sep = ",", skip= ix)
# write old data to new files:
write.table(sample,
newFileName,
append = FALSE,
quote = FALSE,
sep = ",",
row.names = FALSE,
col.names = TRUE)
}
I'm quite sure this is not the most straightforward approach and I can't get the data comprised between Target 1 and Target 2 using this approach. Also, this is super slow and I was wondering it there could be a more memory efficient approach ?
foo = function(filename) {
cat("\nprocessing", filename, "...")
x = readLines(con = filename)
idx = grepl("^Target", x)
x = split(x[!idx], cumsum(idx)[!idx])[-1]
invisible(lapply(seq_along(x), function(i) {
write.table(
x = x[[i]],
file = sub("\\.csv", paste0("_", i, ".csv"), filename),
append =FALSE,
row.names = FALSE,
quote = FALSE,
col.names = FALSE)
}))
}
files = list.files(path = "path/to/files", pattern = ".+\\.csv$")
lapply(files, foo)

R: Append multiple rows to dataframe within for-loop

I have PDF files that I made from these wikipedia pages (for example):
https://en.wikipedia.org/wiki/AIM-120_AMRAAM
https://en.wikipedia.org/wiki/AIM-9_Sidewinder
I have a list of keywords I want to search for within the document and extract the sentences in which they appear.
keywords <- c("altitude", "range", "speed")
I can call the file, extract the text from the PDF, pull the sentences with the keywords from the PDF. This works if I do this with each of the keywords individually, but when I try to do this in a loop I keep getting this issue where the rows aren't appending. Instead it's almost doing a cbind and then an error gets thrown regarding the number of columns. Here is my code and any help you can provide as to what I can do to make this work is much appreciated.
How do I get the rows to append correctly and appear in one file per PDF?
pdf.files <- list.files(path = "/path/to/file", pattern = "*.pdf", full.names = FALSE, recursive = FALSE)
for (i in 1:length(pdf.files)) {
for (j in 1:length(keywords)) {
text <- pdf_text(file.path("path", "to", "file", pdf.files[i]))
text2 <- tolower(text)
text3 <- gsub("\r", "", text2)
text4 <- gsub("\n", "", text3)
text5 <- grep(keywords[j], unlist(strsplit(text4, "\\.\\s+")), value = TRUE)
}
temp <- rbind(text5)
assign(pdf.files[i], temp)
}
After I get the rows to append correctly the next step will be to add in the keywords as a variable to the left of the extracted sentences. Example of ideal output:
keywords sentence
altitude sentence1.1
altitude sentence1.2
range sentence2.1
range sentence2.2
range sentence2.3
speed sentence3.1
speed sentence3.2
Would this be done in the loop as well or post as a separate function?
Any help is appreciated.
Alright so it took some real thinking but I made it work and it's not pretty but it gets the job done:
# This first part initializes the files to be written to
files <- list.files(path = "/path/to/file", pattern = "*.*", full.names = FALSE, recursive = FALSE)
for (h in 1:length(files)) {
temp1 <- data.frame(matrix(ncol = 2, nrow = 0))
x <- c("Title", "x")
colnames(temp1) <- x
write.table(temp1, paste0("/path/to/file", tools::file_path_sans_ext(files[h]), ".txt"), sep = "\t", row.names = FALSE, quote = FALSE)
}
# This next part fills in the files with the sentences
pdf.files <- list.files(path = "/path/to/file", pattern = "*.pdf", full.names = FALSE, recursive = FALSE)
for (i in 1:length(pdf.files)) {
for (j in 1:length(keywords)) {
text <- pdf_text(file.path("path", "to", "file", pdf.files[i]))
text2 <- tolower(text)
text3 <- gsub("\r", "", text2)
text4 <- gsub("\n", "", text3)
text5 <- as.data.frame(grep(keywords[j], unlist(strsplit(text4, "\\.\\s+")), value = TRUE))
colnames(text5) <- "x"
if (nrow(text5) != 0) {
title <- as.data.frame(keywords[j])
colnames(title) <- "Title"
temp <- cbind(title, text5)
temp <- unique(temp)
write.table(temp, paste0("/path/to/file", tools::file_path_sans_ext(pdf.files[i]), ".txt"), sep = "\t", row.names = FALSE, quote = FALSE, col.names = FALSE, append = TRUE)
}
}
}

Stemming words in r does not work as expected

I am trying to do a very simple word steming in R and getting something very unexpected. In the code below 'complete' variable is 'NA'. Why can't I complete stem on the word easy?
library(tm)
library(SnowballC)
dict <- c("easy")
stem <- stemDocument(dict, language = "english")
complete <- stemCompletion(stem, dictionary=dict)
Thank You!
You can see the internals of the stemCompletion() function with tm:::stemCompletion.
function (x, dictionary, type = c("prevalent", "first", "longest", "none", "random", "shortest")){
if(inherits(dictionary, "Corpus"))
dictionary <- unique(unlist(lapply(dictionary, words)))
type <- match.arg(type)
possibleCompletions <- lapply(x, function(w) grep(sprintf("^%s",w), dictionary, value = TRUE))
switch(type, first = {
setNames(sapply(possibleCompletions, "[", 1), x)
}, longest = {
ordering <- lapply(possibleCompletions, function(x) order(nchar(x),
decreasing = TRUE))
possibleCompletions <- mapply(function(x, id) x[id],
possibleCompletions, ordering, SIMPLIFY = FALSE)
setNames(sapply(possibleCompletions, "[", 1), x)
}, none = {
setNames(x, x)
}, prevalent = {
possibleCompletions <- lapply(possibleCompletions, function(x) sort(table(x),
decreasing = TRUE))
n <- names(sapply(possibleCompletions, "[", 1))
setNames(if (length(n)) n else rep(NA, length(x)), x)
}, random = {
setNames(sapply(possibleCompletions, function(x) {
if (length(x)) sample(x, 1) else NA
}), x)
}, shortest = {
ordering <- lapply(possibleCompletions, function(x) order(nchar(x)))
possibleCompletions <- mapply(function(x, id) x[id],
possibleCompletions, ordering, SIMPLIFY = FALSE)
setNames(sapply(possibleCompletions, "[", 1), x)
})
}
The x argument is your stemmed terms, dictionary is the unstemmed. The only line that matters is the fifth; it does a simple regex match for the stemmed word in the list of dictionary terms.
possibleCompletions <- lapply(x, function(w) grep(sprintf("^%s",w), dictionary, value = TRUE))
Therefore it fails, since it can't find a match for "easi" with "easy". If you also have the word "easiest" in your dictionary, then both terms match, since there is now a dictionary word with the same beginning four letters to match to.
library(tm)
library(SnowballC)
dict <- c("easy","easiest")
stem <- stemDocument(dict, language = "english")
complete <- stemCompletion(stem, dictionary=dict)
complete
easi easiest
"easiest" "easiest"
wordStem() seems to do it..
library(tm)
library(SnowballC)
dict <- c("easy")
> wordStem(dict)
[1] "easi"

How to create a custom write.table function?

I can use write.table function to create an output data from a data.frame:
> write.table(head(cars), sep = "|", row.names=FALSE)
"speed"|"dist"
4|2
4|10
7|4
7|22
8|16
9|10
How can I create my own write.table function which creates an output like this (header with double pipes and data with preceding and succeeding pipes)?:
||"speed"||"dist"||
|4|2|
|4|10|
|7|4|
|7|22|
|8|16|
|9|10|
write.table can get you part of the way, but you will still need to do some fiddling around to get things to work just as you want.
Here's an example:
x <- capture.output(
write.table(head(cars), sep = "|", row.names = FALSE, eol = "|\n"))
x2 <- paste0("|", x)
x2[1] <- gsub("|", "||", x2[1], fixed=TRUE)
cat(x2, sep = "\n")
# ||"speed"||"dist"||
# |4|2|
# |4|10|
# |7|4|
# |7|22|
# |8|16|
# |9|10|
As a function, I guess in its most basic form it could look something like:
write.myOut <- function(inDF, outputFile) {
x <- capture.output(
write.table(inDF, sep = "|", row.names = FALSE, eol = "|\n"))
x <- paste0("|", x)
x[1] <- gsub("|", "||", x[1], fixed=TRUE)
cat(x, sep = "\n", file=outputFile)
}
I don't think that it is possible with write.table. Here is a workaround:
# function for formatting a row
rowFun <- function(x, sep = "|") {
paste0(sep, paste(x, collapse = sep), sep)
}
# create strings
rows <- apply(head(cars), 1, rowFun)
header <- rowFun(gsub("^|(.)$", "\\1\"", names(head(cars))), sep = "||")
# combine header and row strings
vec <- c(header, rows)
# write the vector
write(vec, sep = "\n", file = "myfile.sep")
The resulting file:
||"speed"||"dist"||
|4|2|
|4|10|
|7|4|
|7|22|
|8|16|
|9|10|

Strip function in qdap package in R - Removes slash incorrectly

I have a text file, which is several hundred rows long. I am trying to remove all of the [edit:add] punctuation characters from it except the "/" characters. I am currently using the strip function in the qdap package.
Here is a sample data set:
htxt <- c("{rtf1ansiansicpg1252cocoartf1038cocoasubrtf360/",
"{fonttblf0fswissfcharset0 helvetica",
"margl1440margr1440vieww9000viewh8400viewkind0")
Here is the code:
strip(htxt, char.keep = "/", digit.remove = F, apostrophe.remove = TRUE, lower.case = TRUE)
The only problem with this beautiful function is that it removes the "/" characters. If I try to remove all characters except the "{" character it works:
strip(htxt, char.keep = "{", digit.remove = F, apostrophe.remove = TRUE, lower.case = TRUE)
Has anyone experienced the same problem?
For whatever reason it seems the qdap:::strip always strips "/" out of character vectors. This is in the source code towards the end of the function:
x <- clean(gsub("/", " ", gsub("-", " ", x)))
This is run before the actual function which does the stripping which is defined in the body of the function strip....
So just replace the function with your own version:
strip.new <- function (x, char.keep = "~~", digit.remove = TRUE, apostrophe.remove = TRUE,
lower.case = TRUE)
{
strp <- function(x, digit.remove, apostrophe.remove, char.keep,
lower.case) {
if (!is.null(char.keep)) {
x2 <- Trim(gsub(paste0(".*?($|'|", paste(paste0("\\",
char.keep), collapse = "|"), "|[^[:punct:]]).*?"),
"\\1", as.character(x)))
}
else {
x2 <- Trim(gsub(".*?($|'|[^[:punct:]]).*?", "\\1",
as.character(x)))
}
if (lower.case) {
x2 <- tolower(x2)
}
if (apostrophe.remove) {
x2 <- gsub("'", "", x2)
}
ifelse(digit.remove == TRUE, gsub("[[:digit:]]", "",
x2), x2)
}
unlist(lapply(x, function(x) Trim(strp(x = x, digit.remove = digit.remove,
apostrophe.remove = apostrophe.remove, char.keep = char.keep,
lower.case = lower.case))))
}
strip.new(htxt, char.keep = "/", digit.remove = F, apostrophe.remove = TRUE, lower.case = TRUE)
#[1] "rtf1ansiansicpg1252cocoartf1038cocoasubrtf360/"
#[2] "fonttblf0fswissfcharset0 helvetica"
#[3] "margl1440margr1440vieww9000viewh8400viewkind0"
The package author is pretty active on this site so he can probably clear up why strip does this by default.
Why not:
> gsub("[^/]", "", htxt)
[1] "/" "" ""
Given the clarification by #SimonO101, the regex approach might be:
gsub("[]!\"#$%&'()*+,.:;<=>?#[^_`{|}~-]", "", htxt)
Note that the first item in that sequence is "]" and the last item is "-" and that the double-quote needed to be escaped. This is what is targeted with [:punct:] with the "\" removed. to do it programatically you might use:
rem.some.punct <- function(txt, notpunct=NULL){
punctstr <- "[]!\"#$%&'()*/+,.:;<=>?#[^_`{|}~-]"
rempunct <- gsub(paste0("",notpunct), "", punctstr)
gsub(rempunct, "", txt)}

Resources