RStudio crash after a getURL loop - r

I'm trying to run a loop over 59 web pages, but R keeps crashing. This is the loop:
library(RCurl)
library(XML)
library(stringr)
library(stringi)

n <- length(webpage)
count <- 0
for (i in 1:n) {
  # get the URL
  u <- webpage[i]
  doc <- getURL(u)
  # get the text from the body
  html <- htmlTreeParse(doc, useInternal = TRUE)
  txt <- xpathApply(html, "//body//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)]", xmlValue)
  txt <- toString(txt)
  # clean
  txt <- str_replace_all(txt, "[\r\n\t,]", "")
  txt <- tolower(txt)
  search <- c("wi-fi", "router", "switch", "adsl", "wireless")
  count[i] <- sum(stri_count_fixed(txt, search))
}
count
It worked when I tried it with 3 URLs, but it failed with 15.
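Not part of the original question, but a minimal sketch of a more defensive version of the same loop: it wraps the download in tryCatch so a single bad URL doesn't abort the run, frees each parsed document so memory doesn't accumulate across iterations, and pauses briefly between requests.

library(RCurl)
library(XML)
library(stringr)
library(stringi)

search <- c("wi-fi", "router", "switch", "adsl", "wireless")
count <- numeric(length(webpage))

for (i in seq_along(webpage)) {
  doc <- tryCatch(getURL(webpage[i]), error = function(e) NA_character_)
  if (is.na(doc)) next                     # skip pages that fail to download
  html <- htmlTreeParse(doc, useInternalNodes = TRUE)
  txt <- toString(xpathApply(html,
    "//body//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)]",
    xmlValue))
  free(html)                               # release the parsed document (XML::free)
  txt <- tolower(str_replace_all(txt, "[\r\n\t,]", ""))
  count[i] <- sum(stri_count_fixed(txt, search))
  Sys.sleep(0.5)                           # be gentle with the server
}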

Related

Assign numbers to rows which are similar to the loop numbers in R

I am scraping a site in R (rvest package) and I want to add a new column to every parsed CSV file and either 1) fill it with numbers matching my loop counter, or 2) fill it with a special value (which I get using rvest nodes). I can assign these numbers if I scrape only one page, but that is not what I need. The for loop itself works smoothly.
Here is my code with the for loop:
registered <- for (n in c(11:12)){
  url_2019 <- paste0("https://www.cvk.gov.ua/pls/vnd2019/wp033pt001f01=919pf7331=", n, ".html")
  results_2019 <- read_html(url_2019) %>% html_table(fill = TRUE)
  results_2019[[6]] %>% as.data.frame
  #dir.create("registered_major_2019")
  file <- paste0("registered_major_2019/dist_", n, ".csv")
  if (!file.exists(file)) write.csv(results_2019[[6]], file, fileEncoding = "Windows-1251")
  Sys.sleep(0.5)
}
And I know how to do it for a single page:
url_2019 <- paste0("https://www.cvk.gov.ua/pls/vnd2019/wp033pt001f01=919pf7331=11.html")
results_2019 <- read_html(url_2019) %>% html_table(fill = TRUE)
pfont <- read_html(url_2019) %>% html_node("font") %>% html_text()
# This is actually what I need
results_2019a <- data.frame(results_2019[[6]], pfont)
But I can't figure out how to do it inside for(). I tried this, but it doesn't work:
registered <- for (n in c(11:12)){
  url_2019 <- paste0("https://www.cvk.gov.ua/pls/vnd2019/wp033pt001f01=919pf7331=", n, ".html")
  results_2019 <- read_html(url_2019) %>% html_table(fill = TRUE) %>% data.frame()
  pfont <- read_html(url_2019) %>% html_node("font") %>% html_text()
  df <- data.frame(results_2019[[6]], pfont)
  #dir.create("registered_major_2019")
  file <- paste0("registered_major_2019/dist_", n, ".csv")
  if (!file.exists(file)) write.csv(df, file, fileEncoding = "Windows-1251")
  Sys.sleep(0.5)
}
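An editorial aside rather than part of the original question: the likely culprit is the extra %>% data.frame() after html_table(), which collapses the list of tables so results_2019[[6]] no longer refers to the sixth table. A minimal sketch under that assumption, parsing each page only once:

library(rvest)

for (n in 11:12) {
  url_2019 <- paste0("https://www.cvk.gov.ua/pls/vnd2019/wp033pt001f01=919pf7331=", n, ".html")
  page <- read_html(url_2019)
  tables <- html_table(page, fill = TRUE)          # keep the full list of tables
  pfont <- page %>% html_node("font") %>% html_text()
  df <- data.frame(tables[[6]], pfont)
  file <- paste0("registered_major_2019/dist_", n, ".csv")
  if (!file.exists(file)) write.csv(df, file, fileEncoding = "Windows-1251")
  Sys.sleep(0.5)
}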

Web scraping data in R when there are a lot of links

I am trying to scrape baseball data from baseball-reference (e.g., https://www.baseball-reference.com/teams/NYY/2017.shtml). I have a huge vector of URLS that I created using a for loop, since the links follow a specific pattern. However, I am having trouble running my code, probably because I have to make too many connections within R. There are over 17000 elements in my vector, and my code stops working once it gets to around 16000. Is there an easier and perhaps a more efficient way to replicate my code?
require(Lahman)
library(XML)

teams <- unique(Teams$franchID)
years <- 1871:2017
urls <- matrix(0, length(teams), length(years))
for(i in 1:length(teams)) {
  for(j in 1:length(years)) {
    urls[i, j] <- paste0("https://www.baseball-reference.com/teams/",
                         teams[i], "/", years[j], ".shtml")
  }
}
url_vector <- as.vector(urls)
list_of_batting <- list()
list_of_pitching <- list()
for(i in 1:length(url_vector)) {
  url <- url_vector[i]
  res <- try(readLines(url), silent = TRUE)
  ## check if website exists
  if(inherits(res, "try-error")) {
    list_of_batting[[i]] <- NA
    list_of_pitching[[i]] <- NA
  } else {
    urltxt <- readLines(url)
    urltxt <- gsub("-->", "", gsub("<!--", "", urltxt))
    doc <- htmlParse(urltxt)
    tables_full <- readHTMLTable(doc)
    tmp1 <- tables_full$players_value_batting
    tmp2 <- tables_full$players_value_pitching
    list_of_batting[[i]] <- tmp1
    list_of_pitching[[i]] <- tmp2
  }
  print(i)
  closeAllConnections()
}
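A suggestion from me rather than from the original thread: each iteration above downloads the page twice (once inside try() and once again with readLines()), and nothing is saved until the whole loop finishes. Below is a minimal sketch that reuses url_vector and the two result lists defined above, fetches each page only once, throttles the requests, and checkpoints progress to disk so a failure near element 16000 does not lose earlier work; the helper name and checkpoint file are my own choices.

library(XML)

scrape_team_page <- function(url) {
  res <- try(readLines(url, warn = FALSE), silent = TRUE)
  if (inherits(res, "try-error")) return(list(batting = NA, pitching = NA))
  res <- gsub("-->", "", gsub("<!--", "", res))      # expose the commented-out tables
  tables_full <- readHTMLTable(htmlParse(paste(res, collapse = "\n"), asText = TRUE))
  list(batting  = tables_full$players_value_batting,
       pitching = tables_full$players_value_pitching)
}

for (i in seq_along(url_vector)) {
  out <- scrape_team_page(url_vector[i])
  list_of_batting[[i]]  <- out$batting
  list_of_pitching[[i]] <- out$pitching
  if (i %% 500 == 0) {
    saveRDS(list(batting = list_of_batting, pitching = list_of_pitching),
            "scrape_progress.rds")                   # checkpoint every 500 pages
  }
  Sys.sleep(1)                                       # throttle the requests
}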

Working with large csv file in R

Any help will be appreciated.
I used the following code to break my large CSV file (4 GB) into chunks, and now I am trying to save the 2nd, 3rd, ... chunks to CSV. However, I can only access the first chunk of my data.
Is there anything wrong with my code?
How do I save the second chunk of my data to CSV?
rgfile <- 'filename.csv'
index <- 0
chunkSize <- 100000
con <- file(description = rgfile, open = "r")
dataChunk <- read.table(con, nrows = chunkSize, header = TRUE, fill = TRUE, sep = ",")
actualColumnNames <- names(dataChunk)
repeat {
  index <- index + 1
  print(paste('Processing rows:', index * chunkSize))
  if (nrow(dataChunk) != chunkSize){
    print('Processed all files!')
    break
  }
  dataChunk <- read.table(
    con, nrows = chunkSize, skip = 0, header = FALSE,
    fill = TRUE, sep = ",", col.names = actualColumnNames
  )
  break
}
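An observation of mine, not part of the original thread: the unconditional break at the end of the repeat block stops the loop after a single extra read, which is why only the first chunk is ever reachable. A minimal sketch of a corrected loop that writes each chunk to its own file (the output file naming is my own assumption):

con <- file(rgfile, open = "r")
chunkSize <- 100000
index <- 0

# the first read consumes the header line
dataChunk <- read.table(con, nrows = chunkSize, header = TRUE, fill = TRUE, sep = ",")
actualColumnNames <- names(dataChunk)

repeat {
  index <- index + 1
  write.csv(dataChunk, sprintf("chunk_%03d.csv", index), row.names = FALSE)
  if (nrow(dataChunk) < chunkSize) break             # a short chunk means we hit the end
  dataChunk <- tryCatch(
    read.table(con, nrows = chunkSize, header = FALSE,
               fill = TRUE, sep = ",", col.names = actualColumnNames),
    error = function(e) data.frame()                 # nothing left to read
  )
  if (nrow(dataChunk) == 0) break
}
close(con)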
A posted answer takes a different approach, re-reading the file in fixed-size chunks with readr:
library(tidyverse)
library(nycflights13)
# make the problem reproducible
rgfile <- 'flights.csv'
write_csv(flights, rgfile)
# now, get to work
lines <- as.numeric(R.utils::countLines(rgfile))
chunk_size <- 100000
hdr <- read_csv(rgfile, n_max = 2)
fnum <- 1
for (i in seq(1, lines, chunk_size)) {
  suppressMessages(
    read_csv(
      rgfile, col_names = colnames(hdr), skip = (i - 1), n_max = chunk_size
    )
  ) -> x
  if (i > 1) colnames(x) <- colnames(hdr)
  write_csv(x, sprintf("file%03d.csv", fnum))
  fnum <- fnum + 1
}

chunking txt files in R

All,
I'm working from Matthew Jockers's code in his book "Text Analysis with R for Students of Literature".
In it he provides code to pull all <p> tags from XML documents, chop that content into 1000-word chunks, and apply a bunch of data-massaging tricks. Once that's done, he inserts that chunking function into a loop that produces a data matrix ready to be used in MALLET. Please see the code below.
My question is: how do I do the same thing with .txt files? Obviously, text files do not have attributes like <p> to work from. I'm not an experienced programmer, so go easy on me please!
chunk.size <- 1000 # number of words per chunk
makeFlexTextChunks <- function(doc.object, chunk.size = 1000, percentage = TRUE){
  paras <- getNodeSet(doc.object,
                      "/d:TEI/d:text/d:body//d:p",
                      c(d = "http://www.tei-c.org/ns/1.0"))
  words <- paste(sapply(paras, xmlValue), collapse = " ")
  words.lower <- tolower(words)
  words.lower <- gsub("[^[:alnum:][:space:]']", " ", words.lower)
  words.l <- strsplit(words.lower, "\\s+")
  word.v <- unlist(words.l)
  x <- seq_along(word.v)
  if(percentage){
    max.length <- length(word.v)/chunk.size
    chunks.l <- split(word.v, ceiling(x/max.length))
  } else {
    chunks.l <- split(word.v, ceiling(x/chunk.size))
    # merge a final chunk smaller than half the chunk size into the previous one
    if(length(chunks.l[[length(chunks.l)]]) <= chunk.size/2){
      chunks.l[[length(chunks.l)-1]] <-
        c(chunks.l[[length(chunks.l)-1]],
          chunks.l[[length(chunks.l)]])
      chunks.l[[length(chunks.l)]] <- NULL
    }
  }
  chunks.l <- lapply(chunks.l, paste, collapse = " ")
  chunks.df <- do.call(rbind, chunks.l)
  return(chunks.df)
}
topic.m <- NULL
for(i in 1:length(files.v)){
  doc.object <- xmlTreeParse(file.path(input.dir, files.v[i]),
                             useInternalNodes = TRUE)
  chunk.m <- makeFlexTextChunks(doc.object, chunk.size, percentage = FALSE)
  textname <- gsub("\\..*", "", files.v[i])
  segments.m <- cbind(paste(textname, segment = 1:nrow(chunk.m), sep = "_"), chunk.m)
  topic.m <- rbind(topic.m, segments.m)
}
Thank you everybody for your help. I think I found my answer after much trial and error! The key was to read the txt files with scan(paste(input.dir, files.v[i], sep="/"), what="character", sep="\n") in the loop rather than in the function. Please see my code here:
input.dir <- "data/plainText"
files.v <- dir(input.dir, ".*txt")
chunk.size <- 100 # number of words per chunk
makeFlexTextChunks <- function(doc.object, chunk.size = 100, percentage = TRUE){
  words.lower <- tolower(paste(doc.object, collapse = " "))
  words.lower <- gsub("[^[:alnum:][:space:]']", " ", words.lower)
  words.l <- strsplit(words.lower, "\\s+")
  word.v <- unlist(words.l)
  x <- seq_along(word.v)
  if(percentage){
    max.length <- length(word.v)/chunk.size
    chunks.l <- split(word.v, ceiling(x/max.length))
  } else {
    chunks.l <- split(word.v, ceiling(x/chunk.size))
    # merge a final chunk smaller than half the chunk size into the previous one
    if(length(chunks.l[[length(chunks.l)]]) <= chunk.size/2){
      chunks.l[[length(chunks.l)-1]] <-
        c(chunks.l[[length(chunks.l)-1]],
          chunks.l[[length(chunks.l)]])
      chunks.l[[length(chunks.l)]] <- NULL
    }
  }
  chunks.l <- lapply(chunks.l, paste, collapse = " ")
  chunks.df <- do.call(rbind, chunks.l)
  return(chunks.df)
}
topic.m <- NULL
for(i in 1:length(files.v)){
  doc.object <- scan(paste(input.dir, files.v[i], sep = "/"), what = "character", sep = "\n")
  chunk.m <- makeFlexTextChunks(doc.object, chunk.size, percentage = FALSE)
  textname <- gsub("\\..*", "", files.v[i])
  segments.m <- cbind(paste(textname, segment = 1:nrow(chunk.m), sep = "_"), chunk.m)
  topic.m <- rbind(topic.m, segments.m)
}
Maybe this can point you in the right direction. The following code reads in a txt file and splits the words up into elements of a vector.
library(readr)
library(stringr)
url <- "http://www.gutenberg.org/files/98/98-0.txt"
mystring <- read_file(url)
res <- str_split(mystring, "\\s+")
Then you can split it into chunks of 1000 words and do your magic?
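Not from the original answer, but a minimal sketch of that chunking step, reusing the res object from above:

word.v <- res[[1]]                                   # str_split() returns a list
chunks.l <- split(word.v, ceiling(seq_along(word.v) / 1000))
chunk.texts <- vapply(chunks.l, paste, character(1), collapse = " ")
length(chunk.texts)                                  # number of 1000-word chunks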

Speed up text mining (and for loop) in R

I'm text-mining thousands of documents (basically doing frequency counts) and wondering whether there is any other way to speed up the following process. Currently it takes more than 10 hours to run the whole analysis. Thank you (from an R beginner).
sessionInfo()
# R version 3.2.3 (2015-12-10)
library(bitops)
library(RCurl)
library(XML)
library(stringr)
library(tm)
setwd("F:/testing_folder")
path <- "F:/testing_folder"
file.names <- dir(path, pattern = ".txt")
filename <- vector()
totalword <- vector()
system.time(
  for(i in 1:length(file.names)){
    text.v <- scan(file.names[i], what = "character", sep = "\n", encoding = "UTF-8")
    report.v <- paste(text.v, collapse = " ")
    # Count total number of words
    words.l <- strsplit(report.v, "\\W")
    word.v <- unlist(words.l)
    not.blanks.v <- which(word.v != "")
    word.v <- word.v[not.blanks.v]
    totalword <- append(totalword, length(word.v))
    filename <- append(filename, print(file.names[i]))
    x <- data.frame(filename, totalword)
    write.csv(x, file = "results.csv") # export results
  }
)
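An editorial aside, not from the original thread: much of the run time likely comes from growing filename and totalword inside the loop and rewriting results.csv on every iteration. A minimal sketch that computes the counts in one pass and writes the results once; note that stri_count_words() defines a word slightly differently than splitting on \W:

library(stringi)

file.names <- dir(path, pattern = "\\.txt$", full.names = TRUE)

count_words <- function(f) {
  text.v <- readLines(f, encoding = "UTF-8", warn = FALSE)
  sum(stri_count_words(text.v))                      # total words in one file
}

totalword <- vapply(file.names, count_words, numeric(1))
x <- data.frame(filename = basename(file.names), totalword = totalword)
write.csv(x, file = "results.csv", row.names = FALSE)   # write the results once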
What do you get from the following?
Rprof("profile1.out", line.profiling=TRUE)
source("http://pastebin.com/raw/kFGCse5s")
Rprof(NULL)
proftable("profile1.out", lines=10)
