Error with htmlTreeParse in R

I'm trying to get the body text from the webpage www.kinyo.es, but it returns this error:
Error in which(value == defs) :
argument "code" is missing, with no default
In addition: Warning messages:
1: XML content does not seem to be XML: 'Error displaying the error page: Application Instantiation Error: Could not connect to MySQL.'
2: XML content does not seem to be XML: ''
My code is the following loop:
library(RCurl)
library(XML)
library(stringr)
library(stringi)

# webpage is the vector of URLs, n its length, and conta the numeric result vector
for (i in 1:n) {
  # get the URL
  u <- webpage[i]
  doc <- getURL(u)
  # get the text from the body
  html <- htmlTreeParse(doc, useInternal = TRUE)
  txt <- xpathApply(html, "//body//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)]", xmlValue)
  txt <- toString(txt)
  # clean
  txt <- str_replace_all(txt, "[\r\n\t,]", "")
  txt <- tolower(txt)
  # count the keywords
  search <- c("wi-fi", "router", "switch", "adsl", "wireless")
  conta[i] <- sum(stri_count_fixed(txt, search))
}

This is a bit of a stretch, as I read your other questions and I can only suppose this is what you are after:
library(rvest)
library(stringr)
count_keywords <- function(url, keywords){
  read_html(url) %>%
    html_nodes(xpath = '//body//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)]') %>%
    html_text() %>%
    toString() %>%
    str_count(keywords) %>%
    sum()
}
urls <- c('http://www.dlink.com/it/it', 'http://www.kinyo.es')
search <- c("Wi-Fi","Router","Switch","ADSL")
res <- sapply(urls, count_keywords, search)
res
#> http://www.dlink.com/it/it        http://www.kinyo.es
#>                         11                          0
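Note that www.kinyo.es was returning a database error page ("Could not connect to MySQL") rather than normal HTML, which is what triggered the warnings in the question; if a site is down entirely, read_html() will fail with an error and stop sapply(). Here is a minimal sketch of a guarded wrapper around the count_keywords() function above, assuming you would rather record NA for an unreachable site than abort the whole run:
count_keywords_safe <- function(url, keywords){
  tryCatch(
    count_keywords(url, keywords),    # defined above
    error = function(e) NA_integer_   # unreachable site or unparseable page
  )
}

res <- sapply(urls, count_keywords_safe, search)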

Related

How can I correct the error: `x` must be a string of length 1?

I want to download tables from many similar URLs.
These URLs differ only in a small part (the numbers), so I put the differing numbers (2270100023, 2270100080, 2270100122) in an Excel file (test_no.xlsx) and created a loop to build the series of URLs. This was very successful up to this point.
library(readxl)

no_list <- read_excel("X:/zhang/R/markdown/test/test_no.xlsx")
xpath1 <- '//*[@id="kihonPage"]/div[1]/div[1]/article/section/div[5]'
final_data <- NULL
for (i in no_list) {
  url1 <- paste("https://www.kaigokensaku.mhlw.go.jp/20/index.php?action_kouhyou_detail_024_kihon=true&JigyosyoCd=", i, "-00&ServiceCd=510", sep = "")
  final_data <- rbind(final_data, url1)
}
The resulting final_data is a "matrix" "array" of the constructed URLs.
But when I tried to use this loop to download the tables, the program threw an error:
Error: x must be a string of length 1
The code I used is as follows:
library(readxl)
library(rvest)

no_list <- read_excel("X:/zhang/R/markdown/test/test_no.xlsx")
xpath1 <- '//*[@id="kihonPage"]/div[1]/div[1]/article/section/div[5]'
final_data <- NULL
for (i in no_list) {
  url1 <- paste("https://www.kaigokensaku.mhlw.go.jp/20/index.php?action_kouhyou_detail_024_kihon=true&JigyosyoCd=", i, "-00&ServiceCd=510", sep = "")
  test <- url1 %>%          # Scrape data
    read_html() %>%
    html_nodes(xpath = xpath1) %>%
    html_table()
  test1 <- test[[1]]        # Select table number
  final_data <- rbind(final_data, test1)
}
How could I solve this problem?
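A likely cause: read_excel() returns a tibble, and for (i in no_list) iterates over its columns, so i is the whole vector of numbers at once. url1 then becomes a character vector of length 3, and read_html() refuses it with "x must be a string of length 1". A minimal sketch of one way around it, assuming the facility numbers sit in the first column of test_no.xlsx:
library(readxl)
library(rvest)

no_list <- read_excel("X:/zhang/R/markdown/test/test_no.xlsx")
xpath1 <- '//*[@id="kihonPage"]/div[1]/div[1]/article/section/div[5]'

final_data <- NULL
for (i in no_list[[1]]) {   # loop over the values in the first column, one at a time
  url1 <- paste0("https://www.kaigokensaku.mhlw.go.jp/20/index.php?action_kouhyou_detail_024_kihon=true&JigyosyoCd=", i, "-00&ServiceCd=510")
  test <- url1 %>%
    read_html() %>%
    html_nodes(xpath = xpath1) %>%
    html_table()
  final_data <- rbind(final_data, test[[1]])
}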

Object not found in R language

I am a newbie at using R, and here's my attempt to play around with code to scrape quotes from multiple pages:
# Load Libraries
library(rvest) # To Scrape
library(tidyverse) # To Manipulate Data
# Scrape Multiple Pages
for (i in 1:4){
  site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/", i))
  temp <- site_to_scrape %>%
    html_nodes(".text") %>%
    html_text()
  content <- append(content, temp)
}
#Export Results To CSV File
write.csv(content, file = "content.csv", row.names = FALSE)
I have encountered an "object not found" error for the content variable. How can I overcome this error and set up the object so that it is reusable in the append line?
Growing a vector in a loop is very inefficient if you are scraping many pages. Instead, initialise a list of a specific length that you know beforehand.
library(rvest)
n <- 4
content <- vector('list', n)

# Scrape Multiple Pages
for (i in 1:n){
  site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/", i))
  content[[i]] <- site_to_scrape %>%
    html_nodes(".text") %>%
    html_text()
}
write.csv(unlist(content), file = "content.csv", row.names = FALSE)
Another option, without initialising, is to use sapply/lapply:
all_urls <- paste0("http://quotes.toscrape.com/page/", 1:4)
content <- unlist(lapply(all_urls, function(x)
  x %>% read_html %>% html_nodes(".text") %>% html_text()))
I searched and found the way: assign an empty object before the loop with content = c().
# Load Libraries
library(rvest) # To Scrape
library(tidyverse) # To Manipulate Data
content = c()
# Scrape Multiple Pages
for (i in 1:4){
  site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/", i))
  temp <- site_to_scrape %>%
    html_nodes(".text") %>%
    html_text()
  content <- append(content, temp)
}
#Export Results To CSV File
write.csv(content, file = "content.csv", row.names = FALSE)

Extract the node with R

I would like to parse the table from the link http://www.espn.com/nfl/team/schedule/_/name/bal/baltimore-ravens
All the information I need is stored in the //tbody node (XPath: //*[@id="my-teams-table"]/div[3]/div/table/tbody).
Now I'm trying this in R with the XML package:
library(XML)

url <- "http://www.espn.com/nfl/team/schedule/_/name/bal/baltimore-ravens"
html_page <- htmlTreeParse(url, useInternalNodes = TRUE)
topNode <- xmlRoot(html_page)
content <- getNodeSet(topNode, "//tbody")
However, this gives me an empty value.
We can use rvest
library(rvest)
tbl <- read_html(url) %>%
  html_nodes("table") %>%
  html_table(fill = TRUE, header = TRUE) %>%
  as.data.frame()
data
url <- "http://www.espn.com/nfl/team/schedule/_/name/bal/baltimore-ravens"
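One likely reason the original //tbody query came back empty: browsers insert <tbody> elements automatically when rendering a table, so an XPath copied from the developer tools often references tags that are not present in the raw HTML the server actually sends. A minimal sketch that targets the rows directly, whether or not a <tbody> wrapper exists in the source:
library(rvest)

url <- "http://www.espn.com/nfl/team/schedule/_/name/bal/baltimore-ravens"

rows <- read_html(url) %>%
  html_nodes(xpath = "//table//tr")   # match rows directly, with or without <tbody>

length(rows)              # how many rows were found
html_text(rows[[1]])      # peek at the first row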

convert batch of pdf to text using pdftools

I'm trying to convert 1000 PDFs to text for data analysis. I'm using the package pdftools.
I have been able to convert 2 PDFs using the following code:
library(pdftools)
file_list <- list.files('pdf', full.names = TRUE, pattern = 'pdf')
for (i in 1:length(file_list)){
  temp <- pdf_text(file_list[i])
  temp <- tolower(temp)
  file_name <- paste(file_list[i], '.txt')
  sink(file_name)
  cat(temp)
  sink()
}
But when I add more than 2, I get the following error:
Error in poppler_pdf_text(loadfile(pdf), opw, upw) : PDF parsing failure.
Also, I would like the final text file to be named just "file_name.txt"; right now I'm getting "file_name.pdf .txt".
Thanks,
library(pdftools)
library(purrr)

setwd("/tmp/test")

file_list <- list.files(".", full.names = TRUE, pattern = '.pdf$')

s_pdf_text <- safely(pdf_text) # helps catch errors

walk(file_list, ~{                                      # iterate over the files
  res <- s_pdf_text(.x)                                 # try to read it in
  if (!is.null(res$result)) {                           # if successful
    message(sprintf("Processing [%s]", .x))
    txt_file <- sprintf("%stxt", sub("pdf$", "", .x))   # make a new filename
    unlist(res$result) %>%                              # could be > 1 page (which makes a list)
      tolower() %>%
      paste0(collapse = "\n") %>%                       # make one big text block with line breaks
      cat(file = txt_file)                              # write it out
  } else {                                              # if not successful
    message(sprintf("Failure converting [%s]", .x))     # show a message
  }
})
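If you would rather stay close to the original for loop than switch to purrr, the same idea can be expressed with base R's tryCatch(); a minimal sketch, assuming the PDFs live in a pdf/ directory as in the question:
library(pdftools)

file_list <- list.files('pdf', full.names = TRUE, pattern = '\\.pdf$')

for (f in file_list) {
  temp <- tryCatch(pdf_text(f), error = function(e) NULL)   # NULL if the PDF cannot be parsed
  if (is.null(temp)) {
    message("Skipping (parse failure): ", f)
    next
  }
  txt_file <- sub("\\.pdf$", ".txt", f)   # gives "file_name.txt" rather than "file_name.pdf .txt"
  writeLines(tolower(temp), txt_file)     # one element per page
}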

"Rescue" command in R?

I have this code:
library(rvest)
url_list <- c("https://github.com/rails/rails/pull/100",
              "https://github.com/rails/rails/pull/200",
              "https://github.com/rails/rails/pull/300")

mine <- function(url){
  url_content <- html(url)
  url_mainnode <- html_node(url_content, "*")
  url_mainnode_text <- html_text(url_mainnode)
  url_mainnode_text <- gsub("\n", "", url_mainnode_text) # clean up the text
  url_mainnode_text
}
messages <- lapply(url_list, mine)
However, as I make the list longer, I tend to run into:
Error in html.response(r, encoding = encoding) :
server error: (500) Internal Server Error
I know that in Ruby I can use rescue to keep iterating through a list even when applying the function to some elements fails. Is there something similar in R?
One option is to use try() (see ?try for details). Here's an implementation:
library(rvest)
url_list <- c("https://github.com/rails/rails/pull/100",
              "https://github.com/rails/rails/pull/200",
              "https://github.com/rails/rails/pull/300")

mine <- function(url){
  try(url_content <- html(url))
  url_mainnode <- html_node(url_content, "*")
  url_mainnode_text <- html_text(url_mainnode)
  url_mainnode_text <- gsub("\n", "", url_mainnode_text) # clean up the text
  url_mainnode_text
}
messages <- lapply(url_list, mine)
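One caveat with the try() version: if html(url) fails, url_content is never assigned, so the very next line still errors and lapply() stops anyway. Note also that html() has since been deprecated in rvest in favour of read_html(). A minimal sketch of a tryCatch() variant that records NA for a failing URL and keeps going:
library(rvest)

mine <- function(url){
  tryCatch({
    url_content <- read_html(url)
    url_mainnode <- html_node(url_content, "*")
    url_mainnode_text <- html_text(url_mainnode)
    gsub("\n", "", url_mainnode_text)       # clean up the text
  }, error = function(e) NA_character_)     # e.g. a 500 response; keep iterating
}

messages <- lapply(url_list, mine)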
