Read and convert .pdf files to .txt files in R faster?

I am trying to convert .pdf files (most of which are image-based) to .txt files in bulk. The program below successfully converts both text-based and image-based PDFs to text files.
My problem is that there is a set of ~15 pdf files that take a really long time to convert. They aren't particularly large (between 10 and 600 pages each), but my program takes about 45 minutes to convert them.
Why is it taking so long to convert them, and how can I speed it up? I am using CRAN RGui (64-bit) with R version 3.5.0.
The .pdf files are in the following hierarchy:
My Directory->Sub-folder 1->abc.pdf
My Directory->Sub-folder 2->def.pdf
etc..
The code is as below:
library(pdftools)   # pdf_convert()
library(tesseract)  # ocr()
library(tools)      # file_path_sans_ext()

programdir <- "C:\\My directory"
# Delete all txt files in the path
file.remove(list.files(path = programdir, pattern = "\\.txt$", recursive = TRUE, full.names = TRUE))
# Get list of sub-folders in the main directory
mydir <- list.dirs(path = programdir, full.names = TRUE, recursive = TRUE)
# Loop through sub-folders, starting from 2 as 1 is the parent directory
for (i in 2:length(mydir)) {
  # Make a vector of PDF file names
  myfiles <- list.files(path = mydir[i], pattern = "\\.pdf$",
                        full.names = TRUE, recursive = TRUE)
  # Loop through every file in the sub-directory
  for (j in 1:length(myfiles)) {
    # Render pdf to tiff images, one per page
    img_file <- pdftools::pdf_convert(myfiles[j], format = 'tiff', dpi = 400)
    # Extract text from the rendered images
    pdftotext <- ocr(img_file)
    # Ensure text files are named as per sub-directory name_pdf name.txt format
    fname <- paste(mydir[i], basename(file_path_sans_ext(myfiles[j])), sep = "_")
    # Save files to directory path
    sink(file = paste0(fname, ".txt"))
    writeLines(unlist(lapply(pdftotext, paste, collapse = " ")))
    sink()
  }
}
file.remove(list.files(pattern = "\\.tiff$", recursive = TRUE, full.names = TRUE))
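
For what it's worth, here is a minimal sketch of one possible speed-up, assuming the bottleneck is rendering at 400 dpi and OCRing the files one at a time: lower the dpi a little and run the per-file work on a PSOCK cluster. The output naming, directory path, and core count below are illustrative, not the original setup.

library(pdftools)
library(tesseract)
library(parallel)

ocr_one_pdf <- function(pdf_path, dpi = 300) {
  # render each page to a tiff, OCR it, then remove the intermediate images
  imgs <- pdftools::pdf_convert(pdf_path, format = "tiff", dpi = dpi)
  on.exit(file.remove(imgs), add = TRUE)
  txt <- tesseract::ocr(imgs)
  out <- paste0(tools::file_path_sans_ext(pdf_path), ".txt")
  writeLines(paste(txt, collapse = " "), out)
  out
}

pdfs <- list.files("C:\\My directory", pattern = "\\.pdf$",
                   recursive = TRUE, full.names = TRUE)
cl <- makeCluster(4)   # PSOCK cluster also works on Windows
res <- parLapply(cl, pdfs, ocr_one_pdf)
stopCluster(cl)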

Related

R foreach: Read and manipulate multiple files in parallel

I have 500 tar.xz files containing 2000 csv files. I need to untar them a few tar files at a time (because of disk space), process them into a data.table, delete the csv files from disk, and then save the result as RDS before moving on to the next few tar files.
My function works fine in serial but in parallel it gets the files mixed up between cores. Why is this?
Some sample data:
for (j in 1:5) {
  for (i in 1:5) {
    a <- df[sample(x = 1:nrow(df), size = 50, replace = TRUE), ]
    write.csv(a, paste0("seed_", i, ".csv"))
    lf <- list.files(pattern = ".csv")
  }
  tar(tarfile = paste0("seed_", j, ".tar"), files = lf, compression = c("xz"), tar = "tar")
}
Example code with foreach
require(dplyr)
require(tidyr)
require(foreach)
require(doParallel)
require(magrittr)

# List all tar files in directory
list_of_files <- list.files(pattern = ".tar")
packsINeed <- c("vroom", "magrittr", "dplyr", "tidyr", "doParallel")

# Start foreach loop
myCluster <- makeCluster(6, type = "PSOCK")
registerDoParallel(myCluster)

foreach(i = 1:NROW(list_of_files), .packages = packsINeed) %dopar% {
  print(paste(list_of_files[i], "which is", i, "of", NROW(list_of_files)))
  print("2. Untar .csv files inside")
  untar(tarfile = list_of_files[i], exdir = "tempOutputFiles")
  print("#3. Read in files and add up two columns")
  df <- vroom::vroom(list.files("tempOutputFiles/$.csv"), id = "path")
  df$A <- df$B + df$C
  print("#4. save RDS")
  saveRDS(object = df, file = paste0(tools::file_path_sans_ext(list_of_files[i], compression = TRUE), ".rds"))
  print("#5. Clean up files")
  .files <- list.files("tempOutputFiles", pattern = ".csv")
  file.remove(basename(.files))
}
Using mclapply - behaves the same
require(dplyr)
require(tidyr)
require(foreach)
require(doParallel)
require(magrittr)

# List all tar files in directory
list_of_files <- list.files(pattern = ".tar")

myParFun <- function(filename) {
  print(paste(filename))
  print("2. Untar all .csv files inside")
  untar(tarfile = filename, exdir = "tempOutputFiles")
  print("#3. Read in files and add up two columns")
  df <- vroom::vroom(list.files("tempOutputFiles/$.csv"), id = "path")
  df$A <- df$B + df$C
  print("#4. save RDS")
  saveRDS(object = df, file = paste0(tools::file_path_sans_ext(filename, compression = TRUE), ".rds"))
  print("#5. Clean up files")
  .files <- list.files("tempOutputFiles", pattern = ".csv")
  file.remove(.files)
}

mclapply(FUN = myParFun, list_of_files, mc.cores = 4)
Based on Waldi's comment I've created a directory for each file in list_of_files and it now works fine. But is there a better approach? Using tempdir() for example?
As suggested in comments, the code below creates one directory per process / tar file, untars, merges the CSVs in a .rds file and deletes them.
Note that it seems that vroom needs the altrep = FALSE argument to avoid a permission denied error at deletion.
# Generate sample tars for test
write.csv(mtcars, 'file1.csv')
write.csv(mtcars, 'file2.csv')
write.csv(iris, 'file3.csv')
write.csv(iris, 'file4.csv')
tar('tar1.tar', files = c('file1.csv', 'file2.csv'), tar = "tar")
tar('tar2.tar', files = c('file3.csv', 'file4.csv'), tar = "tar")

require(dplyr)
require(tidyr)
require(foreach)
require(doParallel)
require(magrittr)

# List all tar files in directory
list_of_files <- list.files(pattern = "\\.tar")
packsINeed <- c("vroom", "magrittr", "dplyr", "tidyr", "doParallel")

# Start foreach loop
myCluster <- makeCluster(2, type = "PSOCK")
registerDoParallel(myCluster)

foreach(i = 1:NROW(list_of_files), .packages = packsINeed) %dopar% {
  print(paste(list_of_files[i], "which is", i, "of", NROW(list_of_files)))
  print("2. Untar .csv files inside")
  fileout <- tools::file_path_sans_ext(list_of_files[i], compression = TRUE)
  exdir <- paste0("temp", fileout)
  untar(tarfile = list_of_files[i], exdir = exdir)
  print("#3. Read in files and add up two columns")
  df <- vroom::vroom(file.path(exdir, dir(exdir, "*.csv")), altrep = FALSE)
  # df$A <- df$B + df$C # These columns don't exist in mtcars/iris used as example
  print("#4. save RDS")
  saveRDS(object = df, file = file.path(exdir, paste0(fileout, ".rds")))
  print("#5. Clean up files")
  .files <- list.files(exdir, pattern = "\\.csv")
  file.remove(file.path(exdir, .files))
}
Not sure where the .rds should go, so left for the time being in the temporary folder.
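
If you prefer tempdir-style paths over directories named after the tar files, a hedged variation (a sketch, not tested on the original data) is to give each task its own scratch directory created with tempfile(), which also addresses the "Using tempdir for example?" question. The output location of the .rds files below is an assumption.

foreach(i = 1:NROW(list_of_files), .packages = packsINeed) %dopar% {
  # each task unpacks into its own unique scratch directory, so workers never collide
  exdir <- tempfile("untar_")
  dir.create(exdir)

  untar(tarfile = list_of_files[i], exdir = exdir)
  df <- vroom::vroom(list.files(exdir, pattern = "\\.csv$", full.names = TRUE),
                     altrep = FALSE)
  saveRDS(df, paste0(tools::file_path_sans_ext(list_of_files[i], compression = TRUE), ".rds"))

  # remove the scratch directory and everything in it
  unlink(exdir, recursive = TRUE)
}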

Convert png files to txt files

I have 100 scanned PDF files and I need to convert them into text files.
I have first converted them into png files (see script below),
now I need help to convert these 100 png files to 100 text files.
library(pdftools)
library(tesseract)

# location
dest <- "P:\\TEST\\images to text"

# list all pdf files in the location
myfiles <- list.files(path = dest, pattern = "pdf", full.names = TRUE)

# convert files to png
sapply(myfiles, function(x)
  pdf_convert(x, format = "png", pages = NULL,
              filenames = NULL, dpi = 600, opw = "", upw = "", verbose = TRUE))

# read files
cat(text)
I expect to have a text file for each png file:
From: file1.png, file2.png, file3.png...
To: file1.txt, file2.txt, file3.txt...
But the actual result is one text file containing the text from all of the png files.
I guess you left out the bit with the png -> text conversion, but I assume you used library(tesseract).
You could do the following in your code:
library(tesseract)
eng <- tesseract("eng")

sapply(myfiles, function(x) {
  png_file <- gsub("\\.pdf", ".png", x)
  txt_file <- gsub("\\.pdf", ".txt", x)
  pdf_convert(x, format = "png", pages = 1,
              filenames = png_file, dpi = 600, verbose = TRUE)
  text <- ocr(png_file, engine = eng)
  cat(text, file = txt_file)
  ## just return the text string for convenience
  ## we are anyways more interested in the side effects
  text
})
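
A hedged side note, not part of the original answer: with pages = 1 only the first page of each PDF is converted. If the scans run to several pages, one possible variation (a sketch, assuming pdf_convert's default file naming) is to render every page and join the OCR results:

library(pdftools)
library(tesseract)
eng <- tesseract("eng")

sapply(myfiles, function(x) {
  txt_file <- gsub("\\.pdf$", ".txt", x)
  # render every page to a png (default file names), OCR each, then join the pages
  png_files <- pdf_convert(x, format = "png", dpi = 600, verbose = FALSE)
  text <- paste(ocr(png_files, engine = eng), collapse = "\n")
  cat(text, file = txt_file)
  file.remove(png_files)   # drop intermediate images
  text
})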

R batch txt file processing

I am new to R and I want to batch process all files in a working directory.
I have lots of .txt files and want to read them in, calculate the frequency of one column, calculate a percentage and a so-called "H-Score", calculate the sum of the H-Score, and store it in a vector. Then the next .txt file should be processed, and so on.
After all files are processed, I want to write the vector to another .txt file as the result. The final .txt file should also contain the name of the input file and the calculated sum of the H-Score. This is what I have so far, but as you can see, I am an absolute newbie to programming and R...
setwd("~/Desktop/Automated Analysis/TXT/") # Set working directory

# List all txt files including sub-folders
list_of_files <- list.files(path = ".", recursive = TRUE,
                            pattern = "\\.txt$", full.names = TRUE)

library(data.table)
# Read all the files and create a FileName column to store filenames
DT <- rbindlist(sapply(list_of_files, fread, simplify = FALSE),
                use.names = TRUE, idcol = "FileName")

br = c(0,1,3,9,15,500) # Set breaks
bins = c(0,1,2,3,4) # Set bins

for (k in 1:length(list_of_files)) { # process all the files in the working directory
  HScore_list = c() # create a vector for storing the results
  for (i in 1:5) { my_vector = c(HScore_list, i) }
  freq = hist(Count, breaks = br, plot = FALSE)
  df = data.frame(bins, frequency = freq$counts,
                  df$percent = df$frequency / sum(df$frequency) * 100,
                  df$HScore = df$percent * df$bins)
  HScore = sum(df$HScore)
}
write(HScore_list, "HScore_list.txt", sep = "\n")
Do you know what I want and can help me?
EDIT: My Problem is, that the Code is producing no output.
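
For reference, a minimal sketch of one way the per-file loop could be structured; the column name Count, the breaks, and the bin weights are taken from the question, while everything else (including the output layout) is an assumption:

library(data.table)

br   <- c(0, 1, 3, 9, 15, 500)   # histogram breaks
bins <- c(0, 1, 2, 3, 4)         # bin weights

# one H-Score per input file, named after the file
HScore_list <- sapply(list_of_files, function(f) {
  dt   <- fread(f)
  freq <- hist(dt$Count, breaks = br, plot = FALSE)$counts
  pct  <- freq / sum(freq) * 100
  sum(pct * bins)
})

# write "<file name> <H-Score>" lines to the result file
writeLines(paste(names(HScore_list), HScore_list), "HScore_list.txt")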

Convert .pdf to .txt

The problem is not new on Stack Overflow, but I am pretty sure I am missing something obvious.
I am trying to convert a few .pdf files into .txt files in order to mine their text. I based my approach on this excellent script. The text in the .pdf files is not made up of images, so no OCR is required.
# Load tm package
library(tm)

# The folder containing my PDFs
dest <- "./pdfs"

# Correctly installed xpdf from http://www.foolabs.com/xpdf/download.html
file.exists(Sys.which(c("pdfinfo", "pdftotext")))
[1] TRUE TRUE

# Delete white spaces from pdfs' names
sapply(myfiles, FUN = function(i) {
  file.rename(from = i, to = paste0(dirname(i), "/", gsub(" ", "", basename(i))))
})

# make a vector of PDF file names
myfiles <- list.files(path = dest, pattern = "pdf", full.names = TRUE)

# convert each PDF to a .txt file with pdftotext
lapply(myfiles, function(i) system(paste('"C:/Program Files/xpdf/bin64/pdftotext.exe"',
                                         paste0('"', i, '"')), wait = FALSE))
It should create a .txt copy of every .pdf file in the dest folder. I checked for issues with the path, for white spaces in the path, and for common xpdf installation issues, but nothing happens.
Here is the repository I am working on. If it can be useful, I can paste the SessionInfo. Thanks in advance.
Late answer:
I recently discovered that with the current version of tm (0.7-4) you can read PDFs directly into a corpus if you have pdftools installed (install.packages("pdftools")).
library(tm)

directory <- getwd() # change this to the directory where the pdf files are located

# read the pdfs with readPDF; the default engine used is pdftools, see ?readPDF for more info
my_corpus <- VCorpus(DirSource(directory, pattern = ".pdf"),
                     readerControl = list(reader = readPDF))
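
A hedged aside (not part of the original answer): since pdftools has to be installed anyway, the PDFs can also be written straight to .txt files without going through tm, for example:

library(pdftools)

myfiles <- list.files("./pdfs", pattern = "\\.pdf$", full.names = TRUE)
for (f in myfiles) {
  txt <- pdf_text(f)                          # one character string per page
  writeLines(txt, sub("\\.pdf$", ".txt", f))  # file1.pdf -> file1.txt
}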

Automate zip file reading in R

I need to automate R to read a csv datafile that's into a zip file.
For example, I would type:
read.zip(file = "myfile.zip")
And internally, what would be done is:
Unzip myfile.zip to a temporary folder
Read the only file contained on it using read.csv
If there is more than one file into the zip file, an error is thrown.
My problem is to get the name of the file contained in the zip file, in order to provide it to the read.csv command. Does anyone know how to do it?
UPDATE
Here's the function I wrote based on @Paul's answer:
read.zip <- function(zipfile, row.names = NULL, dec = ".") {
  # Create a name for the dir where we'll unzip
  zipdir <- tempfile()
  # Create the dir using that name
  dir.create(zipdir)
  # Unzip the file into the dir
  unzip(zipfile, exdir = zipdir)
  # Get the files in the dir
  files <- list.files(zipdir)
  # Throw an error if there's more than one
  if (length(files) > 1) stop("More than one data file inside zip")
  # Get the full name of the file
  file <- paste(zipdir, files[1], sep = "/")
  # Read the file
  read.csv(file, row.names = row.names, dec = dec)
}
Since I'll be working with more files inside the tempdir(), I created a new dir inside it, so I don't get confused with the files. I hope it may be useful!
Another solution using unz:
read.zip <- function(file, ...) {
  zipFileInfo <- unzip(file, list = TRUE)
  if (nrow(zipFileInfo) > 1)
    stop("More than one data file inside zip")
  else
    read.csv(unz(file, as.character(zipFileInfo$Name)), ...)
}
You can use unzip to unzip the file. I just mention this as it is not clear from your question whether you knew that. As for reading the file: once you've extracted the file to a temporary dir (?tempdir), just use list.files to find the files that were dumped into the temporary directory. In your case this is just one file, the file you need. Reading it using read.csv is then quite straightforward:
l = list.files(temp_path)
read.csv(l[1])
assuming your tempdir location is stored in temp_path.
I found this thread as I was trying to automate reading multiple csv files from a zip. I adapted the solution to the broader case. I haven't tested it for weird filenames or the like, but this is what worked for me so I thought I'd share:
read.csv.zip <- function(zipfile, ...) {
  # Create a name for the dir where we'll unzip
  zipdir <- tempfile()
  # Create the dir using that name
  dir.create(zipdir)
  # Unzip the file into the dir
  unzip(zipfile, exdir = zipdir)
  # Get a list of csv files in the dir
  files <- list.files(zipdir)
  files <- files[grep("\\.csv$", files)]
  # Create a list of the imported csv files
  csv.data <- sapply(files, function(f) {
    fp <- file.path(zipdir, f)
    return(read.csv(fp, ...))
  })
  return(csv.data)
}
If you have zcat installed on your system (which is the case for Linux, macOS, and Cygwin), you could also use:
zipfile<-"test.zip"
myData <- read.delim(pipe(paste("zcat", zipfile)))
This solution also has the advantage that no temporary files are created.
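In the same spirit, a hedged data.table variant (an illustration, assuming the unzip command-line tool is on the PATH and the archive contains a single csv) also avoids temporary files by piping the archive member straight into fread:

library(data.table)

zipfile <- "test.zip"
# "unzip -p" streams the csv inside the archive to stdout
myData <- fread(cmd = paste("unzip -p", zipfile))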
Here is an approach I am using that is based heavily on @Corned Beef Hash Map's answer. Here are some of the changes I made:
- My approach makes use of the data.table package's fread(), which can be fast (generally, if it's zipped, sizes might be large, so you stand to gain a lot of speed here!).
- I also adjusted the output format so that it is a named list, where each element of the list is named after the file. For me, this was a very useful addition.
- Instead of using regular expressions to sift through the files grabbed by list.files, I make use of list.files()'s pattern argument.
- Finally, by relying on fread() and by making pattern an argument to which you could supply something like "" or NULL or ".", you can use this to read in many types of data files; in fact, you can read in several types at once (if your .zip contains .csv and .txt files and you want both, for example). If there are only some types of files you want, you can specify the pattern to select only those, too. A usage example follows the function below.
Here is the actual function:
library(data.table) # for fread()

read.csv.zip <- function(zipfile, pattern = "\\.csv$", ...) {
  # Create a name for the dir where we'll unzip
  zipdir <- tempfile()
  # Create the dir using that name
  dir.create(zipdir)
  # Unzip the file into the dir
  unzip(zipfile, exdir = zipdir)
  # Get a list of csv files in the dir
  files <- list.files(zipdir, recursive = TRUE, pattern = pattern)
  # Create a list of the imported csv files
  csv.data <- sapply(files, function(f) {
    fp <- file.path(zipdir, f)
    dat <- fread(fp, ...)
    return(dat)
  })
  # Use csv names to name list elements
  names(csv.data) <- basename(files)
  # Return data
  return(csv.data)
}
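
A small hedged usage example (the archive name data.zip is made up) of the mixed-type reading mentioned above:

# pull both the .csv and the .txt files from one archive into a named list
all_tables <- read.csv.zip("data.zip", pattern = "\\.(csv|txt)$")
names(all_tables)   # elements are named after the files inside the zip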
The following refines the above answers. FUN could be read.csv, cat, or anything you like, provided its first argument will accept a file path. E.g.
head(read.zip.url("http://www.cms.gov/Medicare/Coding/ICD9ProviderDiagnosticCodes/Downloads/ICD-9-CM-v32-master-descriptions.zip", filename = "CMS32_DESC_LONG_DX.txt"))
read.zip.url <- function(url, filename = NULL, FUN = readLines, ...) {
  zipfile <- tempfile()
  download.file(url = url, destfile = zipfile, quiet = TRUE)
  zipdir <- tempfile()
  dir.create(zipdir)
  unzip(zipfile, exdir = zipdir) # files="" so extract all
  files <- list.files(zipdir)
  if (is.null(filename)) {
    if (length(files) == 1) {
      filename <- files
    } else {
      stop("multiple files in zip, but no filename specified: ", paste(files, collapse = ", "))
    }
  } else { # filename specified
    stopifnot(length(filename) == 1)
    stopifnot(filename %in% files)
  }
  do.call(FUN, args = c(list(file.path(zipdir, filename)), list(...)))
}
Another approach that uses fread from the data.table package
fread.zip <- function(zipfile, ...) {
  # Function reads data from a zipped csv file
  # Uses fread from the data.table package

  ## Create the temporary directory or flush CSVs if it exists already
  if (!file.exists(tempdir())) {
    dir.create(tempdir())
  } else {
    file.remove(list.files(tempdir(), full = TRUE, pattern = "\\.csv$"))
  }

  ## Unzip the file into the dir
  unzip(zipfile, exdir = tempdir())

  ## Get path to file
  file <- list.files(tempdir(), pattern = "\\.csv$", full.names = TRUE)

  ## Throw an error if there's more than one
  if (length(file) > 1) stop("More than one data file inside zip")

  ## Read the file
  fread(file,
        na.strings = c(""), # read empty strings as NA
        ...)
}
Based on the answer/update by @joão-daniel:

# unzipped file location
outDir <- "~/Documents/unzipFolder"

# get all the zip files
zipF <- list.files(path = "~/Documents/", pattern = "*.zip", full.names = TRUE)

# unzip all your files
purrr::map(.x = zipF, .f = unzip, exdir = outDir)
I just wrote a function based on the top read.zip answer that may help...
read.zip <- function(zipfile, internalfile = NA, read.function = read.delim, verbose = TRUE, ...) {
  # function based on http://stackoverflow.com/questions/8986818/automate-zip-file-reading-in-r
  # check the files within the zip
  unzfiles <- unzip(zipfile, list = TRUE)
  if (is.na(internalfile) || is.numeric(internalfile)) {
    internalfile <- unzfiles$Name[ifelse(is.na(internalfile), 1, internalfile[1])]
  }
  # Create a name for the dir where we'll unzip
  zipdir <- tempfile()
  # Create the dir using that name
  if (verbose) cat("Directory created:", zipdir, "\n")
  dir.create(zipdir)
  # Unzip the file into the dir
  if (verbose) cat("Unzipping file:", internalfile, "...")
  unzip(zipfile, files = internalfile, exdir = zipdir)
  if (verbose) cat("Done!\n")
  # Get the full name of the file
  file <- paste(zipdir, internalfile, sep = "/")
  if (verbose) {
    on.exit({
      cat("Done!\nRemoving temporary files:", file, ".\n")
      file.remove(file)
      file.remove(zipdir)
    })
  } else {
    on.exit({ file.remove(file); file.remove(zipdir) })
  }
  # Read the file
  if (verbose) cat("Reading file...")
  read.function(file, ...)
}
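
A hedged usage example (the archive name is made up) of how this helper might be called:

# read the second file inside the archive with read.csv instead of read.delim
dat <- read.zip("archive.zip", internalfile = 2, read.function = read.csv)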
