I am a newbie at using R, and here is my attempt to play around with code to scrape quotes from multiple pages.
# Load Libraries
library(rvest) # To Scrape
library(tidyverse) # To Manipulate Data
# Scrape Multiple Pages
for (i in 1:4){
site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/",i))
temp <- site_to_scrape %>% html_nodes(".text") %>% html_text()
content <- append(content, temp)
}
#Export Results To CSV File
write.csv(content, file = "content.csv", row.names = FALSE)
I encountered an "Object not found" error for the content variable. How can I overcome this error and define the object so that it is reusable in the append line?
Growing a vector in a loop is very inefficient if you are scraping many pages. Instead, you should initialise a list of a specific length that you know beforehand.
library(rvest)
n <- 4
content <- vector('list', n)
# Scrape Multiple Pages
for (i in 1:n){
site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/",i))
content[[i]] <- site_to_scrape %>%
html_nodes(".text") %>%
html_text()
}
write.csv(unlist(content), file = "content.csv", row.names = FALSE)
Another option, without initialising anything, is to use sapply/lapply:
all_urls <- paste0("http://quotes.toscrape.com/page/",1:4)
content <- unlist(lapply(all_urls, function(x)
  x %>% read_html() %>% html_nodes(".text") %>% html_text()))
I have searched and found the way: assign an empty object before the loop with content = c().
# Load Libraries
library(rvest) # To Scrape
library(tidyverse) # To Manipulate Data
content = c()
# Scrape Multiple Pages
for (i in 1:4){
site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/",i))
temp <- site_to_scrape %>%
html_nodes(".text") %>%
html_text()
content <- append(content, temp)
}
#Export Results To CSV File
write.csv(content, file = "content.csv", row.names = FALSE)
I want to download tables from many similar URLs.
These URLs differ only in a small part (numbers), so I put the differing numbers (2270100023, 2270100080, 2270100122) in an Excel file (test_no.xlsx) and created a loop to build a series of URLs. It was very successful up to this point.
no_list <- read_excel("X:/zhang/R/markdown/test/test_no.xlsx")
xpath1 <- '//*[@id="kihonPage"]/div[1]/div[1]/article/section/div[5]'
final_data <- NULL
for (i in no_list) {
  url1 <- paste("https://www.kaigokensaku.mhlw.go.jp/20/index.php?action_kouhyou_detail_024_kihon=true&JigyosyoCd=", i, "-00&ServiceCd=510", sep = "")
  final_data <- rbind(final_data, url1)
}
The final_data I got is a "matrix" "array" of the generated URLs.
But when I tried to use this loop to download the tables, the program threw an error:
Error: x must be a string of length 1
The code I used is as follows:
no_list <- read_excel("X:/zhang/R/markdown/test/test_no.xlsx")
xpath1 <- '//*[@id="kihonPage"]/div[1]/div[1]/article/section/div[5]'
final_data <- NULL
for (i in no_list) {
  url1 <- paste("https://www.kaigokensaku.mhlw.go.jp/20/index.php?action_kouhyou_detail_024_kihon=true&JigyosyoCd=", i, "-00&ServiceCd=510", sep = "")
  test <- url1 %>% # Scrape data
    read_html() %>%
    html_nodes(xpath = xpath1) %>%
    html_table()
  test1 <- test[[1]] # Select table number
  final_data <- rbind(final_data, test1)
}
How could I solve this problem?
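One likely cause (my diagnosis, not part of the original post): read_excel() returns a tibble, and for (i in no_list) iterates over the columns of that tibble, so i is the entire vector of numbers at once. paste() then vectorises, url1 becomes a character vector of three URLs, and read_html() rejects it with "x must be a string of length 1". A minimal sketch of the fix, assuming the numbers sit in the first column of the Excel file:
library(readxl)
library(rvest)
no_list <- read_excel("X:/zhang/R/markdown/test/test_no.xlsx")
xpath1 <- '//*[@id="kihonPage"]/div[1]/div[1]/article/section/div[5]'
final_data <- NULL
for (i in no_list[[1]]) { # iterate over the values in the first column, not the tibble
  url1 <- paste("https://www.kaigokensaku.mhlw.go.jp/20/index.php?action_kouhyou_detail_024_kihon=true&JigyosyoCd=", i, "-00&ServiceCd=510", sep = "")
  test <- url1 %>%
    read_html() %>%
    html_nodes(xpath = xpath1) %>%
    html_table()
  final_data <- rbind(final_data, test[[1]])
}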
I am trying to pull data from a website: https://transtats.bts.gov/PREZIP/
I am interested in downloading the datasets named Origin_and_Destination_Survey_DB1BMarket_1993_1.zip to Origin_and_Destination_Survey_DB1BMarket_2021_3.zip
To automate this, I am putting the URL in a loop:
# dates of all files
year_quarter_comb <- crossing(year = 1993:2021, quarter = 1:4) %>%
mutate(year_quarter_comb = str_c(year, "_", quarter)) %>%
pull(year_quarter_comb)
# download all files
for(year_quarter in year_quarter_comb){
get_BTS_data(str_glue("https://transtats.bts.gov/PREZIP/Origin_and_Destination_Survey_DB1BMarket_", year_quarter, ".zip"))
}
What I was wondering is how I can exclude 2021 quarter 4, since the data for it is not available yet. Also, is there a better way to automate the task? I was thinking of matching by "DB1BMarket", but R is case-sensitive, and the names for certain dates change to "DB1BMARKET".
I can use year_quarter_comb[-c(116)] to remove 2021_4 from the output.
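Dropping by position works, but it silently breaks if the date range changes. A value-based alternative (a sketch; file_names below is a hypothetical vector of names to filter):
# Drop 2021 Q4 by value rather than by index
year_quarter_comb <- setdiff(year_quarter_comb, "2021_4")
# For DB1BMarket vs DB1BMARKET, grepl() can ignore case
keep <- grepl("DB1BMarket", file_names, ignore.case = TRUE)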
EDIT: I was actually trying to download the files into a specific folder with this set of code:
path_to_local <- "whatever location" # this is the folder where the raw data is stored.
# download data from BTS
get_BTS_data <- function(BTS_url) {
# INPUT: URL for the zip file with the data
# OUTPUT: NULL (this just downloads the data)
# store the download in the path_to_local folder
# down_file <- str_glue(path_to_local, "QCEW_Hawaii_", BLS_url %>% str_sub(34) %>% str_replace_all("/", "_"))
down_file <- str_glue(path_to_local, fs::path_file(BTS_url))
# download data to folder
QCEW_files <- BTS_url %>%
# download file
curl::curl_download(down_file)
}
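One small detail worth noting (my observation, not from the original): str_glue() concatenates its arguments with no separator, so path_to_local must end in a slash for down_file to be a valid path. fs::path() supplies the separator itself:
# Alternative join that inserts the "/" automatically
down_file <- fs::path(path_to_local, fs::path_file(BTS_url))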
EDIT2:
I edited the code a little from the answer below, and it runs:
url <- "http://transtats.bts.gov/PREZIP"
content <- read_html(url)
file_paths <- content %>%
html_nodes("a") %>%
html_attr("href")
origin_destination_paths <-
file_paths[grepl("DB1BM", file_paths)]
base_url <- "https://transtats.bts.gov"
origin_destination_urls <-
paste0(base_url, origin_destination_paths)
h <- new_handle()
handle_setopt(h, ssl_verifyhost = 0, ssl_verifypeer=0)
lapply(origin_destination_urls, function(x) {
tmp_file <- tempfile()
curl_download(x, tmp_file, handle = h)
unzip(tmp_file, overwrite = F, exdir = "airfare data")
})
It takes a while to download these datasets, as the files are quite large. It downloaded files until 2007_2, but then I got an error because the curl connection dropped.
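For the dropped connection, one option (a sketch, not from the original post) is to wrap curl_download() in a small retry helper; max_tries and the 5-second pause are illustrative choices:
library(curl)
# Retry a download a few times before giving up
download_with_retry <- function(url, dest, handle, max_tries = 3) {
  for (attempt in seq_len(max_tries)) {
    ok <- tryCatch({
      curl_download(url, dest, handle = handle)
      TRUE
    }, error = function(e) FALSE)
    if (ok) return(invisible(dest))
    Sys.sleep(5) # pause before the next attempt
  }
  warning("Failed to download: ", url)
  invisible(NULL)
}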
Instead of trying to generate the URLs, you could scrape the file paths from the website. This avoids generating any non-existent file names.
Below is a short script that downloads all of the zip files you are looking for and unzips them into your working directory.
The hardest part for me here was that the server seems to have a misconfigured SSL certificate. I was able to find help here on SO for turning off SSL certificate verification for read_html() and curl_download(). Those solutions are integrated into the script below.
library(tidyverse)
library(rvest)
library(curl)
url <- "http://transtats.bts.gov/PREZIP"
content <-
httr::GET(url, config = httr::config(ssl_verifypeer = FALSE)) |>
read_html()
file_paths <-
content |>
html_nodes("a") |>
html_attr("href")
origin_destination_paths <-
file_paths[grepl("DB1BM", file_paths)]
base_url <- "https://transtats.bts.gov"
origin_destination_urls <-
paste0(base_url, origin_destination_paths)
h <- new_handle()
handle_setopt(h, ssl_verifyhost = 0, ssl_verifypeer=0)
lapply(origin_destination_urls, function(x) {
tmp_file <- tempfile()
curl_download(x, tmp_file, handle = h)
unzip(tmp_file)
})
I have a CSV file that contains information about a set of articles; the 9th column holds the URLs. I have successfully scraped the title and abstract from a single URL with the following code:
library('rvest')
url <- 'https://link.springer.com/article/10.1007/s10734-019-00404-5'
webpage <- read_html(url)
title_data_html <- html_nodes(webpage,'.u-h1')
title_data <- html_text(title_data_html)
head(title_data)
abstract_data_html <- html_nodes(webpage,'#Abs1-content p')
abstract_data <- html_text(abstract_data_html)
head(abstract_data)
myTable = data.frame(Title = title_data, Abstract = abstract_data)
View(myTable)
Now I want to use R to scrape the title and abstract of each article. My question is how to import the URLs contained in the CSV file and how to write a for loop to scrape the data I need. I'm quite new to R, so thanks in advance for your help.
Try this:
library(rvest)
URLs <- read.csv("urls.csv")
n <- nrow(URLs)
# Convert the first column into a character vector of URLs
URLs2 <- character()
for (i in 1:n) {
  URLs2[i] <- as.character(URLs[i, 1])
}
# Accumulator for one row per successfully scraped article
df <- data.frame(Row = as.integer(), Title = as.character(), Abstract = as.character(), stringsAsFactors = FALSE)
for (i in 1:n) {
  # Skip broken URLs instead of stopping the whole loop
  webpage <- tryCatch(read_html(URLs2[i]), error = function(e) {'empty page'})
  if (!"empty page" %in% webpage) {
    title_data_html <- html_nodes(webpage, '.u-h1')
    title_data <- html_text(title_data_html)
    abstract_data_html <- html_nodes(webpage, '#Abs1-content p')
    abstract_data <- html_text(abstract_data_html)
    # Row holds the entry's position in the input CSV
    temp <- as.data.frame(cbind(Row = match(URLs2[i], URLs2), Title = title_data, Abstract = abstract_data))
    if (ncol(temp) == 3) {
      df <- rbind(df, temp)
    }
  }
}
View(df)
Edit: The code has been edited so that it will work even if some URLs are broken (skipping them). The output rows are numbered with each entry's corresponding row number in the CSV.
I am scraping a site in R (the rvest package), and in every parsed CSV file I want to create a new column and either 1) fill it with numbers matching my loop index, or 2) assign it a special value (which I extracted using rvest nodes). I can assign these values when I scrape only one page, but that is not what I need. The for loop itself works smoothly.
Here is my code with the for loop:
registered <- for (n in c(11:12)) {
  url_2019 <- paste0("https://www.cvk.gov.ua/pls/vnd2019/wp033pt001f01=919pf7331=", n, ".html")
  results_2019 <- read_html(url_2019) %>% html_table(fill = TRUE)
  results_2019[[6]] %>% as.data.frame
  # dir.create("registered_major_2019")
  file <- paste0("registered_major_2019/dist_", n, ".csv")
  if (!file.exists(file)) write.csv(results_2019[[6]], file, fileEncoding = "Windows-1251")
  Sys.sleep(0.5)
}
And I know how to do it for a single page separately:
url_2019 <- paste0("https://www.cvk.gov.ua/pls/vnd2019/wp033pt001f01=919pf7331=11.html")
results_2019 <- read_html(url_2019) %>% html_table(fill = TRUE)
pfont <- read_html(url_2019) %>% html_node("font") %>% html_text()
# This is actually what I need
results_2019a <- data.frame(results_2019[[6]], pfont)
But I can't figure out how to do it inside for(). I tried this, but it doesn't work:
registered <- for (n in c(11:12)) {
  url_2019 <- paste0("https://www.cvk.gov.ua/pls/vnd2019/wp033pt001f01=919pf7331=", n, ".html")
  results_2019 <- read_html(url_2019) %>% html_table(fill = TRUE) %>% data.frame()
  pfont <- read_html(url_2019) %>% html_node("font") %>% html_text()
  df <- data.frame(results_2019[[6]], pfont)
  # dir.create("registered_major_2019")
  file <- paste0("registered_major_2019/dist_", n, ".csv")
  if (!file.exists(file)) write.csv(df, file, fileEncoding = "Windows-1251")
  Sys.sleep(0.5)
}
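A likely culprit (my reading, not a confirmed answer): the extra %>% data.frame() after html_table(fill = TRUE) collapses the list of tables into a single data frame, so results_2019[[6]] no longer selects the sixth table the way it does in the working single-page code. A minimal sketch that keeps the list intact and reads each page only once:
for (n in c(11:12)) {
  url_2019 <- paste0("https://www.cvk.gov.ua/pls/vnd2019/wp033pt001f01=919pf7331=", n, ".html")
  page <- read_html(url_2019)
  results_2019 <- page %>% html_table(fill = TRUE) # keep the list of tables
  pfont <- page %>% html_node("font") %>% html_text()
  df <- data.frame(results_2019[[6]], pfont)
  file <- paste0("registered_major_2019/dist_", n, ".csv")
  if (!file.exists(file)) write.csv(df, file, fileEncoding = "Windows-1251")
  Sys.sleep(0.5)
}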
I managed to write a scraper for gathering election info in R (rvest), but now I am struggling with how to save the data in one CSV file instead of separate CSV files.
Here is my working code, where I can scrape pages 11, 12 and 13 separately.
library(rvest)
library(xml2)
do.call(rbind, lapply(11:13, function(n) {
  url <- paste0("http://www.cvk.gov.ua/pls/vnd2014/WP040?PT001F01=910&pf7331=", n)
  mi <- read_html(url) %>% html_table(fill = TRUE)
  mi[[8]]
  file <- paste0("election2014_", n, ".csv")
  if (!file.exists(file)) write.csv(mi[[8]], file)
  Sys.sleep(5)
}))
I tried adding this at the end, but it does not work as I expected:
write.csv(rbind(mi[[8]],url), file="election2014.csv")
Try this one:
library(rvest)
library(tidyverse)
scr <- function(n) {
  url <- paste0("http://www.cvk.gov.ua/pls/vnd2014/WP040?PT001F01=910&pf7331=", n)
  df <- read_html(url) %>%
    html_table(fill = TRUE) %>%
    .[[8]] %>%
    data.frame()
  # The first row holds the header; promote it to column names
  colnames(df) <- df[1, ]
  df <- df[-1, ]
}
res <- 11:13 %>%
  map_df(., scr)
write.csv2(res, "odin_tyr.csv")
I wasn't able to get your code to work, but you could try creating an empty data frame before running your code, and then doing this before writing the complete data to a CSV file:
df = rbind(df,mi[[8]])
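Put together, that suggestion might look like the sketch below, built from the question's own loop (untested against the live site):
library(rvest)
df <- data.frame() # empty accumulator
for (n in 11:13) {
  url <- paste0("http://www.cvk.gov.ua/pls/vnd2014/WP040?PT001F01=910&pf7331=", n)
  mi <- read_html(url) %>% html_table(fill = TRUE)
  df <- rbind(df, mi[[8]])
  Sys.sleep(5)
}
write.csv(df, file = "election2014.csv")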
You could also consider combining your CSV files into one using the purrr package:
files = list.files("folder_name",pattern="*.csv",full.names = T)
df = files %>%
map(read_csv) %>%
reduce(rbind)
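For what it's worth, purrr's map_dfr() collapses the map and reduce steps into one, assuming all the CSV files share the same columns:
df = files %>%
  map_dfr(read_csv)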