Extract the node with R

I would like to parse the table from this link: http://www.espn.com/nfl/team/schedule/_/name/bal/baltimore-ravens
All the information I need is stored in the //tbody node (XPath: //*[@id="my-teams-table"]/div[3]/div/table/tbody).
Now I'm trying this in R with the XML package:
library(XML)
url <- "http://www.espn.com/nfl/team/schedule/_/name/bal/baltimore-ravens"
html_page <- htmlTreeParse(url, useInternalNodes = TRUE)
topNode <- xmlRoot(html_page)
content <- getNodeSet(topNode, "//tbody")
However, this returns an empty node set.
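One likely reason for the empty result (a guess, since the live page may have changed): browsers insert <tbody> into rendered tables, so it often does not exist in the raw HTML that htmlTreeParse actually sees. A minimal XML-package sketch that targets the rows and tables directly instead:
# Sketch with the XML package; assumes the schedule table is present in the static HTML.
library(XML)
library(RCurl)

url  <- "http://www.espn.com/nfl/team/schedule/_/name/bal/baltimore-ravens"
page <- htmlParse(getURL(url), asText = TRUE)

# look for table rows rather than an explicit <tbody>
rows <- getNodeSet(page, "//table//tr")
length(rows)

# or let readHTMLTable pull every <table> on the page into data frames
tables <- readHTMLTable(page, stringsAsFactors = FALSE)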

We can use rvest:
library(rvest)
tbl <- read_html(url) %>%
  html_nodes("table") %>%
  html_table(fill = TRUE, header = TRUE) %>%
  as.data.frame()
data
url <- "http://www.espn.com/nfl/team/schedule/_/name/bal/baltimore-ravens"

Related

How to read and make a dataframe with this data?

I need to read this url https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest and create a dataframe from it with R, but I confess that I cannot get much further than this...
# R packages
library(tidyverse)
library(dplyr)
library(rvest)
and...
url <- "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest"
page<- read_html(url)
page
{html_document}
<html>
[1] <body><p>2.3|lacnic|20220922|84615|19870101|20220922|-0300\nlacnic|*|ipv4 ...
I tried to use rvest to find tables, but...
tables <- page %>%
  html_table(fill = TRUE)
tables
list()
My expected dataframe result is something like the file's lines split into columns, in other words using | as the separator. How can I extract this data and convert it into an R dataframe?
Try this:
url <- "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest"
df <- read.delim(url(url), sep = "|", row.names = NULL)
df <- readr::read_delim(
file = "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest",
delim = "|",
col_names = F
)
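The file has no header row, so the columns come back as X1, X2, and so on. As an optional next step, and assuming the usual RIR delegated-extended layout (registry|cc|type|start|value|date|status|opaque-id, preceded by a version line and a few summary lines), you could name and filter them:
library(dplyr)

# assumed names based on the delegated-extended format; adjust if the layout differs
names(df)[1:7] <- c("registry", "cc", "type", "start", "value", "date", "status")

# keep only the allocation/assignment records, dropping the version and summary rows
df <- df %>% filter(type %in% c("ipv4", "ipv6", "asn"), cc != "*")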

Object not found in R language

I am a newbie at using R and here's my attempt to play around with code to scrape quotes from multiple pages.
# Load Libraries
library(rvest) # To Scrape
library(tidyverse) # To Manipulate Data
# Scrape Multiple Pages
for (i in 1:4){
  site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/", i))
  temp <- site_to_scrape %>% html_nodes(".text") %>% html_text()
  content <- append(content, temp)
}
#Export Results To CSV File
write.csv(content, file = "content.csv", row.names = FALSE)
I have encountered an "object not found" error for the content variable. How can I overcome this error and set up the object so it can be reused in the append line?
Growing a vector in a loop is very inefficient if you are scraping many pages. Instead, what you should do is initialise a list with a specific length, which you know beforehand.
library(rvest)
n <- 4
content <- vector('list', n)

# Scrape Multiple Pages
for (i in 1:n){
  site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/", i))
  content[[i]] <- site_to_scrape %>%
    html_nodes(".text") %>%
    html_text()
}
write.csv(unlist(content), file = "content.csv", row.names = FALSE)
Another option without initialising is to use sapply/lapply:
all_urls <- paste0("http://quotes.toscrape.com/page/", 1:4)
content <- unlist(lapply(all_urls, function(x)
  x %>% read_html %>% html_nodes(".text") %>% html_text()))
I have searched and found the way: assign an empty object before the loop with content = c().
# Load Libraries
library(rvest) # To Scrape
library(tidyverse) # To Manipulate Data
content = c()
# Scrape Multiple Pages
for (i in 1:4){
  site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/", i))
  temp <- site_to_scrape %>%
    html_nodes(".text") %>%
    html_text()
  content <- append(content, temp)
}
#Export Results To CSV File
write.csv(content, file = "content.csv", row.names = FALSE)

Exporting scraped data to one CSV

I managed to write a scraper in R (rvest) for gathering election info, but now I am struggling with how to save the data not in separate CSV files, but in one CSV file.
Here is my working code, where I scrape pages 11, 12 and 13 separately.
library(rvest)
library(xml2)
do.call(rbind, lapply(11:13,
  function(n) {
    url <- paste0("http://www.cvk.gov.ua/pls/vnd2014/WP040?PT001F01=910&pf7331=", n)
    mi <- read_html(url) %>% html_table(fill = TRUE)
    mi[[8]]
    file <- paste0("election2014_", n, ".csv")
    if (!file.exists(file)) write.csv(mi[[8]], file)
    Sys.sleep(5)
  }))
I tried adding this at the end, but it is not working as I expected:
write.csv(rbind(mi[[8]], url), file = "election2014.csv")
Try this one:
library(rvest)
library(tidyverse)
scr <- function(n){
  url <- paste0("http://www.cvk.gov.ua/pls/vnd2014/WP040?PT001F01=910&pf7331=", n)
  df <- read_html(url) %>%
    html_table(fill = TRUE) %>%
    .[[8]] %>%
    data.frame()
  colnames(df) <- df[1, ]
  df <- df[-1, ]
  df
}
res <- 11:13 %>%
  map_df(scr)
write.csv2(res, "odin_tyr.csv")
I wasn't able to get your code to work, but you could try creating an empty data frame before running your code, and then doing this before writing a CSV file with the complete data:
df <- rbind(df, mi[[8]])
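For completeness, a sketch of that approach (untested against the live site, and assuming the eighth table has the same columns on every page):
library(rvest)

all_results <- data.frame()                       # empty frame to grow

for (n in 11:13) {
  url <- paste0("http://www.cvk.gov.ua/pls/vnd2014/WP040?PT001F01=910&pf7331=", n)
  mi  <- read_html(url) %>% html_table(fill = TRUE)
  all_results <- rbind(all_results, mi[[8]])      # append this page's table
  Sys.sleep(5)                                    # be polite to the server
}

write.csv(all_results, file = "election2014.csv", row.names = FALSE)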
You could also consider combining your CSV files into one using the purrr package:
library(purrr)
library(readr)
files <- list.files("folder_name", pattern = "\\.csv$", full.names = TRUE)
df <- files %>%
  map(read_csv) %>%
  reduce(rbind)

error htmlTreeParse in R

I'm trying to get the body text from the webpage www.kinyo.es, but it returns this problem:
Error in which(value == defs) :
argument "code" is missing, with no default
In addition: Warning messages:
1: XML content does not seem to be XML: 'Error displaying the error page: Application Instantiation Error: Could not connect to MySQL.'
2: XML content does not seem to be XML: ''
My code is the following loop:
# packages required by this loop
library(RCurl)
library(XML)
library(stringr)
library(stringi)

for(i in 1:n)
{
  # get the URL
  u <- webpage[i]
  doc <- getURL(u)

  # get the text from the body
  html <- htmlTreeParse(doc, useInternal = TRUE)
  txt <- xpathApply(html, "//body//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)]", xmlValue)
  txt <- toString(txt)

  # clean
  txt <- str_replace_all(txt, "[\r\n\t,]", "")
  txt <- tolower(txt)

  search <- c("wi-fi", "router", "switch", "adsl", "wireless")
  conta[i] <- sum(stri_count_fixed(txt, search))
}
This is a bit of a stretch, as I read your other questions and I can only suppose this is what you are after:
library(rvest)
library(stringr)
count_keywords <- function(url, keywords){
  read_html(url) %>%
    html_nodes(xpath = '//body//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)]') %>%
    html_text() %>%
    toString() %>%
    str_count(keywords) %>%
    sum()
}
urls <- c('http://www.dlink.com/it/it', 'http://www.kinyo.es')
search <- c("Wi-Fi","Router","Switch","ADSL")
res <- sapply(urls, count_keywords, search)
res
#> http://www.dlink.com/it/it        http://www.kinyo.es
#>                         11                          0
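If one of the sites is unreachable (the MySQL error above suggests the server sometimes returns an error page), a guarded wrapper can return NA instead of stopping the whole run. A minimal sketch, reusing count_keywords() from above:
safe_count <- function(url, keywords) {
  tryCatch(
    count_keywords(url, keywords),
    error = function(e) NA_integer_    # broken or unreachable pages give NA
  )
}

res <- sapply(urls, safe_count, search)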

R scrape HTML table with multiple subheaders

I'm trying to import the list of nuclear test sites (from Wikipedia's page) into a data.frame using the code below:
library(RCurl)
library(XML)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
webpage <- getURL(theurl)
webpage <- readLines(tc <- textConnection(webpage)); close(tc)
pagetree <- htmlTreeParse(webpage, error=function(...){}, useInternalNodes = TRUE)
# Find the XPath (go to the webpage, right-click > Inspect Element, find the table, then right-click > Copy XPath)
myxpath <- "//*[@id='mw-content-text']/table[2]"
# Extract table header and contents
tablehead <- xpathSApply(pagetree, paste(myxpath,"/tr/th",sep=""), xmlValue)
results <- xpathSApply(pagetree, paste(myxpath,"/tr/td",sep=""), xmlValue)
# Convert character vector to dataframe
content <- as.data.frame(matrix(results, ncol = 5, byrow = TRUE))
names(content) <- c("Testing country", "Location", "Site", "Coordinates", "Notes")
However, there are multiple sub-headers that prevent the data.frame from being populated consistently. How can I fix this?
Take a look at the htmltab package. It allows you to use the subheaders for populating a new column:
library(htmltab)
tab <- htmltab("https://en.wikipedia.org/wiki/List_of_nuclear_test_sites",
               which = "/html/body/div[3]/div[3]/div[4]/table[2]",
               header = 1 + "//tr/th[@style='background:#efefff;']",
               rm_nodata_cols = FALSE)
I found this example by Carson Sievert that worked well for me:
library(rvest)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
# First, grab the page source
content <- read_html(theurl) %>%
  # then extract the first node with class of wikitable
  html_node(".wikitable") %>%
  # then convert the HTML table into a data frame
  html_table()
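One caveat with this route (my own observation, not part of the original answer): the sub-header rows come back as ordinary data rows, typically with the same text repeated across every column because of the colspan. A hedged clean-up that drops such rows:
library(dplyr)

# drop rows where every cell repeats the same value (the spanning sub-headers)
content <- content %>%
  filter(apply(., 1, function(r) length(unique(r)) > 1))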
Have you tried this?
l.wiki.url <- getURL( url = "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites" )
l.wiki.par <- htmlParse( file = l.wiki.url )
l.tab.con <- xpathSApply( doc = l.wiki.par
                        , path = "//table[@class='wikitable']//tr//td"
                        , fun = xmlValue
                        )

Resources