Speed up text mining (and for loop) in R

I'm text-mining thousands of documents (basically doing frequency counts) and wondering whether there are other ways to speed up the following process. Currently it takes more than 10 hours to run the whole analysis. Thank you (from an R beginner).
sessionInfo()
#R version 3.2.3 (2015-12-10)
library(bitops)
library(RCurl)
library(XML)
library(stringr)
library(tm)
setwd("F:/testing_folder")
path = "F:/testing_folder"
file.names <- dir(path, pattern =".txt")
filename <- vector()
totalword <- vector()
system.time(
  for(i in 1:length(file.names)){
    text.v <- scan(file.names[i], what = "character", sep = "\n", encoding = "UTF-8")
    report.v <- paste(text.v, collapse = " ")
    # Count total number of words
    words.l <- strsplit(report.v, "\\W")
    word.v <- unlist(words.l)
    not.blanks.v <- which(word.v != "")
    word.v <- word.v[not.blanks.v]
    totalword <- append(totalword, length(word.v))
    filename <- append(filename, print(file.names[i]))
    x <- data.frame(filename, totalword)
    write.csv(x, file = "results.csv") # export results
  }
)
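For reference, a minimal sketch of one faster arrangement of the same work: read each file once, count its words with a vectorized helper, and build and write results.csv a single time at the end (the per-iteration data.frame() and write.csv() calls above redo that work on every pass). The helper name count_words and the use of stringi are my own assumptions, and stri_count_words() may count slightly differently than splitting on "\\W".
library(stringi)  # assumption: stringi is installed; stri_count_words() does the counting

count_words <- function(f) {
  txt <- paste(scan(f, what = "character", sep = "\n", encoding = "UTF-8", quiet = TRUE),
               collapse = " ")
  stri_count_words(txt)  # one call instead of strsplit + unlist + blank filtering
}

totalword <- vapply(file.names, count_words, numeric(1))
x <- data.frame(filename = file.names, totalword = totalword, row.names = NULL)
write.csv(x, file = "results.csv", row.names = FALSE)  # write once, at the end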

What do you get from the following?
Rprof("profile1.out", line.profiling=TRUE)
source("http://pastebin.com/raw/kFGCse5s")
Rprof(NULL)
proftable("profile1.out", lines=10)

Related

R-studio crash after a getURL loop

I'm trying to run a loop over 59 URLs, but R keeps crashing. This is the loop:
library(RCurl)    # getURL()
library(XML)      # htmlTreeParse(), xpathApply()
library(stringr)  # str_replace_all()
library(stringi)  # stri_count_fixed()

n <- length(webpage)
count <- 0
for(i in 1:n)
{
  # get the URL
  u <- webpage[i]
  doc <- getURL(u)
  # get the text from the body
  html <- htmlTreeParse(doc, useInternal = TRUE)
  txt <- xpathApply(html, "//body//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)]", xmlValue)
  txt <- toString(txt)
  # clean
  txt <- str_replace_all(txt, "[\r\n\t,]", "")
  txt <- tolower(txt)
  search <- c("wi-fi", "router", "switch", "adsl", "wireless")
  count[i] <- sum(stri_count_fixed(txt, search))
}
count
I tried it with 3 URLs and it worked, but with 15 it didn't.
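The crash cause isn't shown here, so the following is only an assumption, not a diagnosis: skipping pages whose download fails and freeing each parsed document before the next iteration can help if a bad page or memory build-up is the problem. A minimal sketch:
library(RCurl)
library(XML)
library(stringi)

count <- numeric(length(webpage))
search <- c("wi-fi", "router", "switch", "adsl", "wireless")

for (i in seq_along(webpage)) {
  doc <- tryCatch(getURL(webpage[i]), error = function(e) NA_character_)
  if (is.na(doc) || !nzchar(doc)) next   # skip pages that failed to download
  html <- htmlTreeParse(doc, useInternalNodes = TRUE)
  txt <- tolower(toString(xpathApply(html,
         "//body//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)]",
         xmlValue)))
  count[i] <- sum(stri_count_fixed(txt, search))
  free(html)      # release the parsed document before the next iteration
  Sys.sleep(0.5)  # small pause between requests
}
count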

chunking txt files in R

All,
I'm working from Matthew Jockers's code in his book "Text Analysis with R for Students of Literature".
In it he provides code to pull all <p> tags from XML documents, chop that content into 1000-word chunks, and apply a bunch of data-massaging tricks. Once that's done, he inserts that chunking function in a loop that produces a data matrix ready to be used in mallet. Please see the code below.
My question is: how do I do the same thing with .txt files? Obviously, text files do not have tags like <p> to work from. I'm not an experienced programmer, so go easy on me please!
chunk.size <- 1000 # number of words per chunk
makeFlexTextChunks <- function(doc.object, chunk.size=1000, percentage=TRUE){
  paras <- getNodeSet(doc.object,
                      "/d:TEI/d:text/d:body//d:p",
                      c(d = "http://www.tei-c.org/ns/1.0"))
  words <- paste(sapply(paras, xmlValue), collapse=" ")
  words.lower <- tolower(words)
  words.lower <- gsub("[^[:alnum:][:space:]']", " ", words.lower)
  words.l <- strsplit(words.lower, "\\s+")
  word.v <- unlist(words.l)
  x <- seq_along(word.v)
  if(percentage){
    max.length <- length(word.v)/chunk.size
    chunks.l <- split(word.v, ceiling(x/max.length))
  } else {
    chunks.l <- split(word.v, ceiling(x/chunk.size))
    # deal with a small chunk at the end: fold it into the previous chunk
    # when it is at most half the target chunk size
    if(length(chunks.l[[length(chunks.l)]]) <= chunk.size/2){
      chunks.l[[length(chunks.l)-1]] <- c(chunks.l[[length(chunks.l)-1]],
                                          chunks.l[[length(chunks.l)]])
      chunks.l[[length(chunks.l)]] <- NULL
    }
  }
  chunks.l <- lapply(chunks.l, paste, collapse=" ")
  chunks.df <- do.call(rbind, chunks.l)
  return(chunks.df)
}
topic.m <- NULL
for(i in 1:length(files.v)){
  doc.object <- xmlTreeParse(file.path(input.dir, files.v[i]),
                             useInternalNodes=TRUE)
  chunk.m <- makeFlexTextChunks(doc.object, chunk.size,
                                percentage=FALSE)
  textname <- gsub("\\..*", "", files.v[i])
  segments.m <- cbind(paste(textname,
                            segment=1:nrow(chunk.m), sep="_"), chunk.m)
  topic.m <- rbind(topic.m, segments.m)
}
Thank you everybody for your help. I think I found my answer after much trial and error! The key was to pull the txt files with scan(paste(input.dir, files.v[i], sep="/")) in the loop rather than inside the function. Please see my code here:
input.dir <- "data/plainText"
files.v <- dir(input.dir, ".*txt")
chunk.size <- 100 # number of words per chunk
makeFlexTextChunks <- function(doc.object, chunk.size=100, percentage=TRUE){
  words.lower <- tolower(paste(doc.object, collapse=" "))
  words.lower <- gsub("[^[:alnum:][:space:]']", " ", words.lower)
  words.l <- strsplit(words.lower, "\\s+")
  word.v <- unlist(words.l)
  x <- seq_along(word.v)
  if(percentage){
    max.length <- length(word.v)/chunk.size
    chunks.l <- split(word.v, ceiling(x/max.length))
  } else {
    chunks.l <- split(word.v, ceiling(x/chunk.size))
    # deal with a small chunk at the end: fold it into the previous chunk
    # when it is at most half the target chunk size
    if(length(chunks.l[[length(chunks.l)]]) <= chunk.size/2){
      chunks.l[[length(chunks.l)-1]] <- c(chunks.l[[length(chunks.l)-1]],
                                          chunks.l[[length(chunks.l)]])
      chunks.l[[length(chunks.l)]] <- NULL
    }
  }
  chunks.l <- lapply(chunks.l, paste, collapse=" ")
  chunks.df <- do.call(rbind, chunks.l)
  return(chunks.df)
}
topic.m <- NULL
for(i in 1:length(files.v)){
  doc.object <- scan(paste(input.dir, files.v[i], sep="/"), what="character", sep="\n")
  chunk.m <- makeFlexTextChunks(doc.object, chunk.size, percentage=FALSE)
  textname <- gsub("\\..*", "", files.v[i])
  segments.m <- cbind(paste(textname, segment=1:nrow(chunk.m), sep="_"), chunk.m)
  topic.m <- rbind(topic.m, segments.m)
}
Maybe this can point you in the right direction. The following code reads in a txt file and splits the words up into elements of a vector.
library(readr)
library(stringr)
url <- "http://www.gutenberg.org/files/98/98-0.txt"
mystring <- read_file(url)
res <- str_split(mystring, "\\s+")
Then you can split it into chunks of 1000 words and do your magic?
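For example, a minimal sketch of that chunking step, continuing from res above (the object names are just illustrative):
word.v <- unlist(res)
word.v <- word.v[word.v != ""]                                # drop empty strings
chunks.l <- split(word.v, ceiling(seq_along(word.v) / 1000))  # 1000 words per chunk
chunk.texts <- vapply(chunks.l, paste, character(1), collapse = " ")
length(chunk.texts)                                           # number of chunks produced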

Speed up loading and combining multiple .csv file in one big matrix in R

I'm following a couple of posts here:
How to combine multiple .csv files in R?
and here:
Reading Many CSV Files at the Same Time in R and Combining All into one dataframe
My purpose is basically the same: combining multiple very large csv files into one big matrix in R.
I have this solution that I would like to speed up as much as possible.
Here is a fully reproducible example; in practice I have many more and much bigger files:
setwd("C:/") #### set an easy directory to create acceptably large files
#### this takes about 60 seconds
for(i in 1:80){
print(80-i)
write.table(matrix(rnorm(20*3891,0,1),ncol=20),col.names=F,row.names=F,sep=",",file=paste(i,"file.csv",sep=""))
}
listfiles<-list.files(path="C:/",pattern="*.csv")
#### now the problem: this takes about 30-40 seconds; as I have bigger (and much more) files I want to speed up this step
library(plyr)
mybigmatrix<-ldply(listfiles,read.csv,header=F)
Thanks in advance for any help
Maybe use specialized packages and functions like readr and read_csv() (note that read_csv() takes col_names = FALSE rather than header = F):
mybigmatrix <- ldply(listfiles, readr::read_csv, col_names = FALSE)
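For completeness, a minimal sketch of that suggestion without plyr (assuming readr and dplyr are installed; both package choices are mine, not the answerer's):
library(readr)
library(dplyr)

# read every file and stack the rows; the files have no header row
mybig <- bind_rows(lapply(listfiles, read_csv, col_names = FALSE))
mybigmatrix <- as.matrix(mybig)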
Here is a fully reproducible example that shows a problem with fread(): it does not let me coerce the resulting data.table to a numeric matrix.
setwd("C:/") #### set an easy directory to create acceptably large files
#### this takes few seconds
for(i in 1:5){
print(5-i)
write.table(matrix(rnorm(5*3891,0,1),nrow=5),col.names=F,row.names=F,sep=",",file=paste(i,"file.csv",sep=""))
}
listfiles<-list.files(path="C:/",pattern="*.csv")
myfread<-function(file){
data_frame <- fread(file,sep=",",header=FALSE,stringsAsFactors=FALSE,select=c(1:3891),colClasses=c(rep("as.numeric",3891)))
data_frame
}
###### this is a matrix 25*3891 I want an array of 1297x3x25
alld<-rbindlist(lapply(listfiles,myfread))
### why this is in characters??
as.matrix(alld)
k<-1297
m<-3
vectorr<-as.vector(t(as.matrix(alld)))
tem <- vectorr
n <- length(tem)/(k * m)
tem <- array(tem, c(m, k, n))
tem <- aperm(tem, c(2, 1, 3))
xup <- tem ####### here I have characters
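One likely cause, offered as an assumption (the data.table version isn't shown): colClasses expects type names such as "numeric", not function names such as "as.numeric". For all-numeric files like these, fread's auto-detection makes colClasses unnecessary anyway. A minimal sketch:
library(data.table)

listfiles <- list.files(path = "C:/", pattern = "*.csv", full.names = TRUE)

# let fread auto-detect the (all-numeric) column types
alld <- rbindlist(lapply(listfiles, fread, sep = ",", header = FALSE))
m <- as.matrix(alld)
storage.mode(m)   # should now be "double" rather than "character"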
I think any of these options should work well for you.
setwd("C:/Users/your_path_here/test")
fnames <- list.files()
csv <- lapply(fnames, read.csv)
result <- do.call(rbind, csv)
filedir <- setwd("C:/Users/your_path_here/csv_files")
file_names <- dir(filedir)
your_data_frame <- do.call(rbind,lapply(file_names,read.csv))
filedir <- setwd("C:/Users/your_path_here/csv_files")
file_names <- dir(filedir)
your_data_frame <- do.call(rbind, lapply(file_names, read.csv, skip = 1, header = FALSE))
filedir <- setwd("C:/Users/your_path_here/csv_files")
file_names <- dir(filedir)
your_data_frame <- do.call(rbind, lapply(file_names, read.csv, header = FALSE))
temp <- setwd("C:/Users/Excel/Desktop/test")
temp = list.files(pattern="*.csv")
myfiles = lapply(temp, read.delim)
Finally, try this:
setwd("C:/Users/your_path_here/")
file_list <- list.files("C:/Users/your_path_here/")
for (file in file_list){
  # if the merged dataset doesn't exist, create it
  if (!exists("dataset")){
    dataset <- read.table(file, header=TRUE, sep="\t")
  } else {
    # if the merged dataset does exist, append to it
    # (use else here, not a second if, or the first file gets appended twice)
    temp_dataset <- read.table(file, header=TRUE, sep="\t")
    dataset <- rbind(dataset, temp_dataset)
    rm(temp_dataset)
  }
}
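As a side note, growing dataset with rbind() on every pass re-copies all the rows accumulated so far; for many files, the lapply/do.call pattern from the options above avoids that. A one-line sketch, assuming the same tab-separated files:
dataset <- do.call(rbind, lapply(file_list, read.table, header = TRUE, sep = "\t"))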

Looping Issue - Store the data which is of a different format

I am having some trouble storing the data after it runs. The code picks the files up correctly and runs the forecast model, but it only stores the value for the last file; all the others are lost. Is there any way I can have all the results stored in a different array? The problem is that the output is in "forecast" format, and because of that I am getting stuck. I have looked through all the websites but couldn't find anything like that.
Here is the code:
library(forecast)
library(quantmod)
library(forecast)
fileList <-as.array(length(50))
Forecast1 <- as.array(length(50))
fileList<-list.files(path ='C:\\Users\\User\\Downloads\\wOOLWORTHS\\',recursive =T, pattern = ".csv")
i<- integer()
j<-integer()
i=1
setwd("C:\\Users\\User\\Downloads\\wOOLWORTHS\\")
while (i < 51)
{
  a <- fileList[i]
  print(a)
  a <- read.csv(a)
  fileSales <- a$sales
  fileTransform <- log(fileSales)
  plot.ts(fileTransform)
  result1 <- HoltWinters(fileTransform, beta = FALSE, gamma = FALSE,
                         seasonal = "multiplicative", optim.control = TRUE)
  result2 <- forecast.HoltWinters(result1, h = 1)
  summary(result1)
  accuracy(result2)
  #Forecast1[i] <- result2(forecast)
  #print(Forecast1[i])
  i <- i + 1
}
It may just be how you are storing your results. Try filling an empty list instead (e.g. Forecast1):
setwd("C:\\Users\\User\\Downloads\\wOOLWORTHS\\")
library(forecast)
library(quantmod)
library(forecast)
fileList <- list.files(path ='C:\\Users\\User\\Downloads\\wOOLWORTHS\\',recursive =T, pattern = ".csv")
Forecast1 <- vector(mode="list", 50)
for(i in seq(length(fileList)){
a <- fileList[[i]]
#print(a)
a <- read.csv(a)
fileSales<-a$sales
fileTransform<-log(fileSales)
plot.ts(fileTransform)
result1<-HoltWinters(fileTransform,beta = FALSE,gamma =FALSE,seasonal ="multiplicative",optim.control =TRUE)
result2<-forecast.HoltWinters(result1,h=1)
#summary(result1)
#accuracy(result2)
Forecast1[[i]] <- result2
#print(Forecast1[i])
print(paste(i, "of", length(fileList), "completed"))
}
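Each element of Forecast1 is then a complete forecast object, so its pieces can be pulled out afterwards. A small example of what that access could look like (assuming the loop above filled every element of the list):
Forecast1[[1]]          # the full forecast object for the first file
Forecast1[[1]]$mean     # its point forecast (a single value, since h = 1)
point.forecasts <- sapply(Forecast1, function(f) as.numeric(f$mean))
point.forecasts         # one point forecast per input file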

How to Read Multiple HTML Tables in R

I am trying to automate pulling in these HTML tables with readHTMLTable and saving them to a data frame; I am an R newbie and am having trouble figuring out how to write a loop that automates this, which works if you do it one page at a time.
library('XML')
urls<-c("http://www.basketball-reference.com/teams/ATL/","http://www.basketball-reference.com/teams/BOS/")
theurl<-urls[2] #Pick second link (celtics)
tables <- readHTMLTable(theurl)
n.rows <- unlist(lapply(tables, function(t) dim(t)[1]))
BOS <-tables[[which.max(n.rows)]]
Team.History<-write.csv(BOS,"Bos.csv")
Any and all help would be very appreciated!
I think this combines the best of both answers (and tidies up a little).
library(RCurl)
library(XML)
stem <- "http://www.basketball-reference.com/teams/"
teams <- htmlParse(getURL(stem), asText = T)
teams <- xpathSApply(teams, "//*/a[contains(@href, '/teams/')]", xmlAttrs)[-1]
teams <- gsub("/teams/(.*)/", "\\1", teams)
urls <- paste0(stem, teams)
names(teams) <- NULL # get rid of the "href" labels
names(urls) <- teams
results <- data.frame()
for(team in teams){
  tables <- readHTMLTable(urls[team])
  n.rows <- unlist(lapply(tables, function(t) dim(t)[1]))
  team.results <- tables[[which.max(n.rows)]]
  write.csv(team.results, file = paste0(team, ".csv"))
  team.results$TeamCode <- team
  results <- rbind(results, team.results)
  rm(team.results, n.rows, tables)
}
rm(stem, team)
write.csv(results, file = "AllTeams.csv")
I'm assuming you want to loop over your urls vector? I'd try something like this:
library('XML')
url_base <- "http://www.basketball-reference.com/teams/"
teams <- c("ATL", "BOS")
# better still, get the full list of teams as in
# http://stackoverflow.com/a/11804014/1543437
results <- data.frame()
for(team in teams){
  theurl <- paste0(url_base, team)  # url_base already ends with "/"
  tables <- readHTMLTable(theurl)
  n.rows <- unlist(lapply(tables, function(t) dim(t)[1]))
  team.results <- tables[[which.max(n.rows)]]
  write.csv(team.results, file = paste0(team, ".csv"))
  team.results$TeamCode <- team
  results <- rbind(results, team.results)
}
write.csv(results, file="AllTeams.csv")
