I have several rtf files and would like to search if certain words appear in the files. If it appears, I would like to extract the entire sentence with that word in it.
This is the part of the code that I would like to repeat. I have tried this on the document 'a1' but I have documents a1 to a10 in the same folder that I would like to search.
library(striprtf)

# All .rtf files in the folder. `pattern` is a regular expression (not a
# shell glob), so the extension is matched with an escaped dot and anchored.
files <- list.files(path = "/Users/Jane/R/Test", pattern = "\\.rtf$",
                    full.names = TRUE, recursive = FALSE)

# Read one document -- only 'a1' so far; this is the step to repeat per file.
rtf <- read_rtf("a1.rtf", verbose = FALSE, row_start = "*| ", row_end = "",
                cell_end = " | ", ignore_tables = FALSE, check_file = TRUE)

# Split the text into sentence-like pieces at every full stop.
text <- unlist(strsplit(rtf, "\\."))

# Keep each distinct piece that contains at least one of the search terms.
toMatch <- c("passive", "fund act")
matches <- unique(grep(paste(toMatch, collapse = "|"), text, value = TRUE))
The final output should be one data frame with a column for the file name (e.g. a1) and a column for the matching sentence.
You can use lapply() with a function that performs the search for a single file. `results` will then be a list containing one data frame of matches per file.
library(striprtf)

# Words or phrases to look for; they are joined into one alternation pattern.
toMatch <- c("passive", "fund act")

# `pattern` is a regular expression (not a glob): escape the dot, anchor the end.
files <- list.files(path = "/Users/Jane/R/Test", pattern = "\\.rtf$",
                    full.names = TRUE, recursive = FALSE)
# files <- paste0("a", 1:10, ".rtf")

# One data frame per file: `file` is the file name without directory or
# extension (e.g. "a1"), `matches` holds each distinct matching sentence.
results <- lapply(files, function(x) {
  rtf <- read_rtf(x, verbose = FALSE, row_start = "*| ", row_end = "",
                  cell_end = " | ", ignore_tables = FALSE, check_file = TRUE)
  text <- unlist(strsplit(rtf, "\\."))
  matches <- unique(grep(paste(toMatch, collapse = "|"), text, value = TRUE))
  # rep() keeps both columns the same length when a file has no matches.
  data.frame(
    file = rep(tools::file_path_sans_ext(basename(x)), length(matches)),
    matches = matches
  )
})
names(results) <- tools::file_path_sans_ext(basename(files))

# Stack the per-file results into the single data frame asked for.
# (This is NULL when `files` is empty.)
final <- do.call(rbind, unname(results))
Related
I have a list of 50 text files all beginning with NEW.
I want to loop through each textfile/dataframe and run some function and then output the results via the write.table function. Therefore for each file, a function is applied and then an output should be created containing the original name with output at the end.
Here is my code.
# Collect the input files with a shell-style glob.
# NOTE(review): "*NEW.*" only matches names that contain "NEW." (NEW directly
# followed by a dot). Files whose names merely begin with NEW would need
# "NEW*" -- check that fileNames really holds all 50 files.
fileNames <- Sys.glob("*NEW.*")
for (fileName in fileNames) {
# Read the current file as a whitespace-delimited table with a header row.
df <- read.table(fileName, header = TRUE)
FUNCTION (not shown as this works)
...
# print() returns its argument, so `result` holds chr1$results.
# `chr1` is presumably produced by the omitted step above -- TODO confirm.
result <-print(chr1$results) #for each file a result would be printed.
# The output name is the full input name with "_output.txt" appended.
write.table(result, file = paste0(fileName,"_output.txt"), quote = F, sep = "\t", row.names = F, col.names = T)
#for each file a new separate file is created with the original output name retained.
}
However, I only get one output file rather than 50. It seems like it's only looping through one file. What am I doing wrong?
#' Read every .txt file under a folder into a two-column data frame
#'
#' @param folder_name Directory to search (recursively) for files whose
#'   names end in ".txt".
#' @return A data frame with one row per file: `doc_id` is the file path as
#'   returned by `list.files(full.names = TRUE)` and `doc_text` is the
#'   file's lines joined with single spaces. A folder with no matching
#'   files gives a zero-row data frame with the same two columns.
readme <- function(folder_name = "my_texts") {
  # `pattern` is a regular expression, not a glob: escape the dot and anchor
  # the end so names that merely contain "txt" are not picked up.
  file_list <- list.files(path = folder_name, pattern = "\\.txt$",
                          recursive = TRUE, full.names = TRUE)

  # Collapse each file to exactly one string (an empty file becomes "").
  doc_text <- vapply(
    file_list,
    function(path) paste(readLines(path), collapse = " "),
    character(1),
    USE.NAMES = FALSE
  )

  data.frame(doc_id = file_list, doc_text = doc_text,
             row.names = NULL, stringsAsFactors = FALSE)
}
I found this code here, and it worked to convert the '.txt' file to '.csv', but the file is not broken into columns. I'm pretty sure there's an easy fix or a line to add here, but I'm not finding it. I'm still new to R and working through it, so any help or direction is appreciated.
EDIT: The file contains the following, a list of invasive plants:
Header: Noxious Weed List.
'(a) Abrus precatorius – rosary pea '
'(b) Aeginetia spp. – aeginetia'
'(c) Ageratina adenophora – crofton weed '
'(d) Alectra spp. – alectra '
So I would like to get all the parts, i.e., genus, species, and common name, each in a separate column and, if possible, delete the letters such as '(a)' and the separating ' – ' dash.
# Convert every .txt file in the working directory to a .csv file of the
# same base name. read.delim() expects tab-separated input with a header row.
# The pattern is a regular expression: the dot is escaped and the end is
# anchored so that only names ending in ".txt" are processed.
filelist <- list.files(pattern = "\\.txt$")

# Iterating over the names directly is safe when no file matches
# (1:length(filelist) would run with indices 1 and 0 on an empty vector).
for (input in filelist) {
  output <- paste0(gsub("\\.txt$", "", input), ".csv")
  print(paste("Processing the file:", input))
  data <- read.delim(input, header = TRUE)
  write.table(data, file = output, sep = ",",
              col.names = TRUE, row.names = FALSE)
}
You'll need to adjust if you have common names with three or more words, but this is the general idea:
# Location of the input list; the CSV is written next to it.
path <- "C:\\Your File Path Here\\"
file <- paste0(path, "WeedList.txt")

# Split each line on single spaces, then drop the header row (row 1) and the
# columns at positions 1, 4 and 7.
DT <- read.delim(file, header = FALSE, sep = " ")
DT <- DT[-1, -c(1, 4, 7)]
names(DT) <- c("Genus", "Species", "CommonName", "CommonName2")

# Strip the single quotes from both halves of the common name, join the
# halves with a space, and discard the now-redundant fourth column.
DT$CommonName <- paste(gsub("'", "", DT$CommonName),
                       gsub("'", "", DT$CommonName2))
DT <- DT[, -4]

write.csv(DT, paste0(path, "WeedList.csv"), row.names = FALSE)
I have code that reads two different csv files from a folder at the time of execution. I need to use a for loop to execute this multiple times and write each output to a separate csv file of the form "bsc_.csv". The two input csv files are named in the form "base_.csv" and "fut_.csv". The files are incrementally numbered, and that is the pattern I need to iterate over. The sample code is attached below.
library("CDFt")

# Inputs for a single base/fut pair.
d1 <- read.csv("base1.csv", header = TRUE)
d2 <- read.csv("fut1.csv", header = TRUE)

# Columns 2 and 3 of the base file and column 2 of the fut file feed CDFt().
A1 <- d1[[2]]
A2 <- d1[[3]]
A3 <- d2[[2]]
CT <- CDFt(A1, A2, A3)

# Pull the individual components out of the result.
x <- CT$x
FGp <- CT$FGp
FGf <- CT$FGf
FRp <- CT$FRp
FRf <- CT$FRf
ds <- CT$DS

# Round to three decimals and set negative values to zero.
d <- round(ds, 3)
dat <- d
dat[d < 0] <- 0

write.table(dat, "bsc1.csv", sep = ",", na = "NA", quote = FALSE,
            row.names = FALSE, col.names = FALSE, append = TRUE)
Try this (untested):
# Pair every base<N>.csv with the fut<N>.csv that has the same suffix, run
# CDFt on each pair and write the result to bsc<N>.csv.
# Patterns are anchored at both ends so that only names that start with
# "base"/"fut" are considered.
bases <- list.files(pattern = "^base[0-9]*\\.csv$")
futs <- list.files(pattern = "^fut[0-9]*\\.csv$")

# Suffixes such as "1.csv" identify which files belong together.
base_ids <- sub("^base", "", bases)
fut_ids <- sub("^fut", "", futs)

mismatches <- setdiff(base_ids, fut_ids)
if (length(mismatches) > 0) {
  warning("'bases' files not in 'futs': ", paste(sQuote(mismatches), collapse = ", "))
}
# and the reverse
mismatches <- setdiff(fut_ids, base_ids)
if (length(mismatches) > 0) {
  warning("'futs' files not in 'bases': ", paste(sQuote(mismatches), collapse = ", "))
}

# Rebuild both vectors from the shared suffixes so that element k of `bases`
# is guaranteed to pair with element k of `futs`.
ids <- intersect(base_ids, fut_ids)
bases <- paste0("base", ids)
futs <- paste0("fut", ids)

ign <- Map(function(fb, ff) {
  bdat <- read.csv(fb, header = TRUE)
  fdat <- read.csv(ff, header = TRUE)
  # Same column selection as the single-file script: columns 2 and 3 of the
  # base file and column 2 of the fut file.
  ct <- CDFt::CDFt(bdat[[2]], bdat[[3]], fdat[[2]])
  # Round to three decimals and set negative values to zero.
  dat <- round(ct$DS, 3)
  dat[dat < 0] <- 0
  newfn <- sub("^base", "bsc", fb)
  # NOTE(review): the single-file script passed append = TRUE; here each run
  # overwrites bsc<N>.csv. Add append = TRUE if accumulation is wanted.
  write.table(dat, newfn, sep = ",", na = "NA", quote = FALSE,
              row.names = FALSE, col.names = FALSE)
  newfn
}, bases, futs)
I have PDF files that I made from these wikipedia pages (for example):
https://en.wikipedia.org/wiki/AIM-120_AMRAAM
https://en.wikipedia.org/wiki/AIM-9_Sidewinder
I have a list of keywords I want to search for within the document and extract the sentences in which they appear.
keywords <- c("altitude", "range", "speed")
I can call the file, extract the text from the PDF, pull the sentences with the keywords from the PDF. This works if I do this with each of the keywords individually, but when I try to do this in a loop I keep getting this issue where the rows aren't appending. Instead it's almost doing a cbind and then an error gets thrown regarding the number of columns. Here is my code and any help you can provide as to what I can do to make this work is much appreciated.
How do I get the rows to append correctly and appear in one file per PDF?
# List the PDF file names (without directory) in the folder.
# NOTE(review): `pattern` is a regular expression, so "*.pdf" is not read as
# a shell glob -- "\\.pdf$" is probably what is meant.
pdf.files <- list.files(path = "/path/to/file", pattern = "*.pdf", full.names = FALSE, recursive = FALSE)
for (i in 1:length(pdf.files)) {
for (j in 1:length(keywords)) {
# The PDF is re-read and re-cleaned on every pass of the keyword loop.
text <- pdf_text(file.path("path", "to", "file", pdf.files[i]))
text2 <- tolower(text)
# Carriage returns and newlines are deleted outright (no space is inserted).
text3 <- gsub("\r", "", text2)
text4 <- gsub("\n", "", text3)
# Split on a full stop followed by whitespace and keep the pieces matching
# the current keyword. text5 is overwritten on every pass, so after the
# inner loop only the matches for the last keyword remain.
text5 <- grep(keywords[j], unlist(strsplit(text4, "\\.\\s+")), value = TRUE)
}
# rbind() on a single character vector yields a one-row matrix, so the
# sentences end up side by side as columns rather than as rows.
temp <- rbind(text5)
# Creates a variable named after the PDF file that holds that matrix.
assign(pdf.files[i], temp)
}
After I get the rows to append correctly the next step will be to add in the keywords as a variable to the left of the extracted sentences. Example of ideal output:
keywords sentence
altitude sentence1.1
altitude sentence1.2
range sentence2.1
range sentence2.2
range sentence2.3
speed sentence3.1
speed sentence3.2
Would this be done in the loop as well or post as a separate function?
Any help is appreciated.
Alright so it took some real thinking but I made it work and it's not pretty but it gets the job done:
# Return a two-column data frame (Title = keyword, x = sentence) holding, for
# each keyword in turn, every distinct sentence of `pages` that matches it.
# `pages` is a character vector of text; `keywords` are used as regular
# expressions against the lower-cased text. No match gives zero rows.
find_keyword_sentences <- function(pages, keywords) {
  # Line breaks become spaces (rather than being deleted) so that words and
  # sentence boundaries that straddle a line break stay separated.
  flat <- gsub("[\r\n]+", " ", tolower(pages))
  sentences <- unlist(strsplit(flat, "\\.\\s+"))

  per_keyword <- lapply(unname(keywords), function(kw) {
    hits <- unique(grep(kw, sentences, value = TRUE))
    data.frame(Title = rep(kw, length(hits)), x = hits,
               stringsAsFactors = FALSE)
  })

  # Leading empty frame guarantees the two columns even with no matches.
  empty <- data.frame(Title = character(0), x = character(0),
                      stringsAsFactors = FALSE)
  out <- do.call(rbind, c(list(empty), per_keyword))
  rownames(out) <- NULL
  out
}

# Write one tab-separated .txt file per PDF, next to the PDF, with a
# "Title<TAB>x" header followed by one row per keyword/sentence pair.
# Each PDF is read once and its file is written once.
pdf_dir <- "/path/to/file"
pdf.files <- list.files(path = pdf_dir, pattern = "\\.pdf$",
                        full.names = FALSE, recursive = FALSE)
for (i in seq_along(pdf.files)) {
  pages <- pdf_text(file.path(pdf_dir, pdf.files[i]))
  temp <- find_keyword_sentences(pages, keywords)
  out_file <- file.path(
    pdf_dir,
    paste0(tools::file_path_sans_ext(pdf.files[i]), ".txt")
  )
  write.table(temp, out_file, sep = "\t", row.names = FALSE, quote = FALSE)
}
Is there a way of saying something like:
for (i in 1:10){
ga${i} <- read.table(file="ene.${i}.dat",header=T, sep = ",")
}
in R.
I tried using many other constructs, but none suited the requirement.
Thanks.
We can extract file names first.
# Read every file in the working directory whose name ends in ".dat" into a
# list of data frames. The pattern is anchored so that names which only
# contain ".dat" (e.g. "ene.1.dat.bak") are not picked up.
ga <- lapply(list.files(path = ".", pattern = "\\.dat$"), read.csv)
or with loop:
# Files in the working directory whose names end in ".dat".
lf <- list.files(path = ".", pattern = "\\.dat$")

# Pre-allocated list, named after the files with the extension removed.
ga <- structure(vector("list", length(lf)),
                names = sub("\\.dat$", "", lf))

# `[[<-` stores the whole data frame in slot i; `ga[i] <- df` would keep only
# the first column of each data frame (and warn about the length mismatch).
for (i in seq_along(ga)) {
  ga[[i]] <- read.csv(lf[i])
}
To assign data to the separate variables:
# Files in the working directory whose names end in ".dat".
lf <- list.files(path = ".", pattern = "\\.dat$")

# Variable names: the file names with only the trailing ".dat" removed
# (an unanchored gsub would also eat ".dat" inside a name like "a.data.dat").
fn <- sub("\\.dat$", "", lf)

# Create one variable per file in the current environment.
for (i in seq_along(lf)) {
  assign(fn[i], read.csv(lf[i]))
}
You can use an empty list and then a paste function to do something like this:
# Read ene.1.dat ... ene.10.dat into a list; slot i holds the data frame
# read from the i-th file.
ga <- vector("list", 10)
for (i in seq_len(10)) {
  ga[[i]] <- read.table(file = paste0("ene.", i, ".dat"),
                        header = TRUE, sep = ",")
}
Then, you will have a list of data frames. You can index as ga[[1]], ga[[2]] etc. to access them.