Web Scraping with R on multiple pages/links

I have a list of 5000 movies in an excel file:
Avatar
Tangled
Superman Returns
Avengers : Endgame
Man of Steel
And so on....
I need to extract the weekend collections of these movies.
The weekend collections are available on the boxofficemojo.com website.
With the following code I am only able to fetch the weekend collections of one single movie, 'Avatar', since the URL mentioned in the code contains only the details of 'Avatar'.
library(rvest)
webpage <- read_html("https://www.boxofficemojo.com/release/rl876971521/weekend/?ref_=bo_rl_tab#tabs")
weekend_collections <- webpage %>%
html_nodes(".mojo-field-type-rank+ .mojo-estimatable") %>%
html_text()
Other movies have different URLs.
The weekend collections of the 5000 movies are spread across 5000 different URLs.
Is it possible to just give R the list of movies and have it fetch the weekend collections of every movie without providing the respective URLs?
I could add the URLs manually and perform the task, but manually adding 5000 URLs to the code isn't a great idea.
So how do I fetch the weekend collections of these 5000 movies?
I am new to R and need help.

It is possible to automate the search process on this site, since it is easy enough to generate the search string and parse the incoming HTML to navigate to the weekend page.
The problem is that the search will sometimes generate several hits, so you can't be sure you are getting exactly the right movie; you can only examine the title afterwards to find out.
Here is a function you can use. You supply it with a movie title and it will try to get the URL to the weekend collections for the original release. It selects the first hit on the search page, so there is no guarantee it's the correct movie.
get_weekend_url <- function(movie)
{
  site <- "https://www.boxofficemojo.com"
  search_query <- paste0(site, "/search/?q=")
  search_xpath <- "//a[@class = 'a-size-medium a-link-normal a-text-bold']"
  release_xpath <- "//option[text() = 'Original Release']"
  territory_xpath <- "//option[text() = 'Domestic']"
  weekend <- "weekend/?ref_=bo_rl_tab#tabs"
  movie_url <- xml2::url_escape(movie) %>%
    {gsub("%20", "+", .)} %>%
    {paste0(search_query, .)} %>%
    read_html() %>%
    html_nodes(xpath = search_xpath) %>%
    html_attr("href")
  if(!is.na(movie_url[1]))
  {
    release <- read_html(paste0(site, movie_url[1])) %>%
      html_node(xpath = release_xpath) %>%
      html_attr("value") %>%
      {paste0(site, .)}
  } else release <- NA # We can stop if there is no original release found
  if(!is.na(release))
  {
    target <- read_html(release) %>%
      html_node(xpath = territory_xpath) %>%
      html_attr("value") %>%
      {paste0(site, ., weekend)}
  } else target <- "Movie not found"
  return(target)
}
Now you can use sapply to get the URLs you want:
movies <- c("Avatar",
            "Tangled",
            "Superman Returns",
            "Avengers : Endgame",
            "Man of Steel")
urls <- sapply(movies, get_weekend_url)
urls
#> Avatar
#> "https://www.boxofficemojo.com/release/rl876971521/weekend/?ref_=bo_rl_tab#tabs"
#> Tangled
#> "https://www.boxofficemojo.com/release/rl980256257/weekend/?ref_=bo_rl_tab#tabs"
#> Superman Returns
#> "https://www.boxofficemojo.com/release/rl4067591681/weekend/?ref_=bo_rl_tab#tabs"
#> Avengers : Endgame
#> "https://www.boxofficemojo.com/release/rl3059975681/weekend/?ref_=bo_rl_tab#tabs"
#> Man of Steel
#> "https://www.boxofficemojo.com/release/rl4034037249/weekend/?ref_=bo_rl_tab#tabs"
Now you can use these to get the tables for each movie:
css <- ".mojo-field-type-rank+ .mojo-estimatable"
weekends <- lapply(urls, function(x) read_html(x) %>% html_nodes(css) %>% html_text())
Which gives you:
weekends
#> $`Avatar`
#> [1] "Weekend\n " "$77,025,481" "$75,617,183"
#> [4] "$68,490,688" "$50,306,217" "$42,785,612"
#> [7] "$54,401,446" "$34,944,081" "$31,280,029"
#> [10] "$22,850,881" "$23,611,625" "$28,782,849"
#> [13] "$16,240,857" "$13,655,274" "$8,118,102"
#> [16] "$6,526,421" "$4,027,005" "$2,047,475"
#> [19] "$980,239" "$1,145,503" "$844,651"
#> [22] "$1,002,814" "$920,204" "$633,124"
#> [25] "$425,085" "$335,174" "$188,505"
#> [28] "$120,080" "$144,241" "$76,692"
#> [31] "$64,767" "$45,181" "$44,572"
#> [34] "$28,729" "$35,706" "$36,971"
#> [37] "$15,615" "$16,817" "$13,028"
#> [40] "$10,511"
#>
#> $Tangled
#> [1] "Weekend\n " "$68,706,298" "$56,837,104"
#> [4] "$48,767,052" "$21,608,891" "$14,331,687"
#> [7] "$8,775,344" "$6,427,816" "$9,803,091"
#> [10] "$5,111,098" "$3,983,009" "$5,638,656"
#> [13] "$3,081,926" "$2,526,561" "$1,850,628"
#> [16] "$813,849" "$534,351" "$743,090"
#> [19] "$421,474" "$790,248" "$640,753"
#> [22] "$616,057" "$550,994" "$336,339"
#> [25] "$220,670" "$85,574" "$31,368"
#> [28] "$16,475" "$5,343" "$6,351"
#> [31] "$910,502" "$131,938" "$135,891"
#>
#> $`Superman Returns`
#> [1] "Weekend\n " "$52,535,096" "$76,033,267"
#> [4] "$21,815,243" "$12,288,317" "$7,375,213"
#> [7] "$3,788,228" "$2,158,227" "$1,242,461"
#> [10] "$848,255" "$780,405" "$874,141"
#> [13] "$1,115,228" "$453,273" "$386,424"
#> [16] "$301,373" "$403,377" "$296,502"
#> [19] "$331,938" "$216,430" "$173,300"
#> [22] "$40,505"
#>
#> $`Avengers : Endgame`
#> [1] "Weekend\n " "$357,115,007" "$147,383,211"
#> [4] "$63,299,904" "$29,973,505" "$17,200,742"
#> [7] "$22,063,855" "$8,037,491" "$4,870,963"
#> [10] "$3,725,855" "$1,987,849" "$6,108,736"
#> [13] "$3,118,317" "$2,104,276" "$1,514,741"
#> [16] "$952,609" "$383,158" "$209,992"
#> [19] "$100,749" "$50,268" "$70,775"
#> [22] "$86,837" "$12,680"
#>
#> $`Man of Steel`
#> [1] "Weekend\n " "$116,619,362" "$41,287,206"
#> [4] "$20,737,490" "$11,414,297" "$4,719,084"
#> [7] "$1,819,387" "$749,233" "$466,574"
#> [10] "$750,307" "$512,308" "$353,846"
#> [13] "$290,194" "$390,175" "$120,814"
#> [16] "$61,017"
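The first element of each vector is the "Weekend" column header. If you want numbers rather than formatted strings, a minimal sketch for cleaning them up (assuming every remaining entry is a dollar amount) could be:
# drop the header element and turn "$77,025,481"-style strings into numerics
weekend_values <- lapply(weekends, function(x) as.numeric(gsub("[$,]", "", x[-1])))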
If you have 5000 movies to look up, it is going to take a long time to send and parse all these requests. Depending on your internet connection, it may well take 2-3 seconds per movie. That's not bad, but it may still be 4 hours of processing time. I would recommend starting with an empty list and writing each result to the list as it is received, so that if something breaks after an hour or two, you don't lose everything you have so far.
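A minimal sketch of that incremental approach (the one-second pause and the progress file name are my own additions, not part of the scraping logic itself):
results <- list()
for (m in movies) {
  results[[m]] <- tryCatch({
    url <- get_weekend_url(m)
    read_html(url) %>% html_nodes(css) %>% html_text()
  }, error = function(e) NA)                   # keep going if one movie fails
  saveRDS(results, "weekend_collections.rds")  # partial results survive a crash
  Sys.sleep(1)                                 # be polite to the server
}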

Related

How to download data from the Reptile database using R

I am using R to try to download images from the Reptile-database by filling in their form to search for specific images. For that, I am following previous suggestions on how to fill in a form online from R, such as:
library(httr)
library(tidyverse)
POST(
  url = "http://reptile-database.reptarium.cz/advanced_search",
  encode = "json",
  body = list(
    genus = "Chamaeleo",
    species = "dilepis"
  )) -> res
out <- content(res)[1]
This seems to work smoothly, but my problem now is to identify the link with the correct species name in the resulting out object.
This object should contain the following page:
https://reptile-database.reptarium.cz/species?genus=Chamaeleo&species=dilepis&search_param=%28%28genus%3D%27Chamaeleo%27%29%28species%3D%27dilepis%27%29%29
This page contains names with links. Thus, I would like to identify the link that takes me to the page with the correct species' table. However, I am unable to find the link, or even the name of the species, within the generated out object.
Here I only extract the links to the pictures. Simply map or apply a function over them to download each one with download.file(), as sketched after the output below.
library(tidyverse)
library(rvest)
genus <- "Chamaeleo"
species <- "dilepis"
pics <- paste0(
  "http://reptile-database.reptarium.cz/species?genus=", genus,
  "&species=", species) %>%
  read_html() %>%
  html_elements("#gallery img") %>%
  html_attr("src")
[1] "https://www.reptarium.cz/content/photo_rd_05/Chamaeleo-dilepis-03000034021_01_t.jpg"
[2] "https://www.reptarium.cz/content/photo_rd_05/Chamaeleo-dilepis-03000033342_01_t.jpg"
[3] "https://www.reptarium.cz/content/photo_rd_02/Chamaeleo-dilepis-03000029987_01_t.jpg"
[4] "https://www.reptarium.cz/content/photo_rd_02/Chamaeleo-dilepis-03000029988_01_t.jpg"
[5] "https://www.reptarium.cz/content/photo_rd_05/Chamaeleo-dilepis-03000035130_01_t.jpg"
[6] "https://www.reptarium.cz/content/photo_rd_05/Chamaeleo-dilepis-03000035131_01_t.jpg"
[7] "https://www.reptarium.cz/content/photo_rd_05/Chamaeleo-dilepis-03000035132_01_t.jpg"
[8] "https://www.reptarium.cz/content/photo_rd_05/Chamaeleo-dilepis-03000035133_01_t.jpg"
[9] "https://www.reptarium.cz/content/photo_rd_06/Chamaeleo-dilepis-03000036237_01_t.jpg"
[10] "https://www.reptarium.cz/content/photo_rd_06/Chamaeleo-dilepis-03000036238_01_t.jpg"
[11] "https://www.reptarium.cz/content/photo_rd_06/Chamaeleo-dilepis-03000036239_01_t.jpg"
[12] "https://www.reptarium.cz/content/photo_rd_11/Chamaeleo-dilepis-03000041048_01_t.jpg"
[13] "https://www.reptarium.cz/content/photo_rd_11/Chamaeleo-dilepis-03000041049_01_t.jpg"
[14] "https://www.reptarium.cz/content/photo_rd_11/Chamaeleo-dilepis-03000041050_01_t.jpg"
[15] "https://www.reptarium.cz/content/photo_rd_11/Chamaeleo-dilepis-03000041051_01_t.jpg"
[16] "https://www.reptarium.cz/content/photo_rd_12/Chamaeleo-dilepis-03000042287_01_t.jpg"
[17] "https://www.reptarium.cz/content/photo_rd_12/Chamaeleo-dilepis-03000042288_01_t.jpg"
[18] "https://calphotos.berkeley.edu/imgs/128x192/9121_3261/2921/0070.jpeg"
[19] "https://calphotos.berkeley.edu/imgs/128x192/1338_3161/0662/0074.jpeg"
[20] "https://calphotos.berkeley.edu/imgs/128x192/9121_3261/2921/0082.jpeg"
[21] "https://calphotos.berkeley.edu/imgs/128x192/1338_3152/3386/0125.jpeg"
[22] "https://calphotos.berkeley.edu/imgs/128x192/6666_6666/1009/0136.jpeg"
[23] "https://calphotos.berkeley.edu/imgs/128x192/6666_6666/0210/0057.jpeg"

Can't Scrape a table from naturereport.miljoeportal.dk using rvest

I am trying to scrape a table from the following site (https://naturereport.miljoeportal.dk/HtmlViewer?id=827472&bA=1&bI=1&bN=1)
I am using rvest and the Selector Gadget to try to make it work, but so far I have only been able to get it in text form.
What I need to extract:
I am mostly interested in extracting the number of species in two categories, the Stjernearter and the 2-stjernearter, as seen in the image below:
As seen in the Firefox developer tools, this corresponds to a table:
But when I have tried to get the table with the Selector Gadget, I have not had any success.
What I have tried:
These are some ideas I have tried with limited success.
I have been able to get the text, but not the table, with these two snippets:
library(rvest)
rvest::read_html("https://naturereport.miljoeportal.dk/HtmlViewer?id=827472&bA=1&bI=1&bN=1%22") %>%
  html_elements(":nth-child(9) .table-col") %>%
  html_text()
This gets me the following:
[1] "\r\n\t\t\t\t\t\t\tStjernearter (arter med artsscorer = 4 eller 5):\r\n\t\t\t\t\t\t"
[2] "Strandarve | Honckenya peploides"
[3] "Bidende stenurt | Sedum acre"
[4] "\r\n\t\t\t\t\t\t\t2-stjernearter (artsscore = 6 eller 7):\r\n\t\t\t\t\t\t"
[5] "Ingen arter registreret"
[6] "\r\n\t\t\t\t\t\t\t N-følsomme arter:\r\n\t\t\t\t\t\t "
[7] "Bidende stenurt | Sedum acre"
[8] "\r\n\t\t\t\t\t\t\tProblemarter:\r\n\t\t\t\t\t\t"
[9] "Ingen arter registreret"
[10] "\r\n\t\t\t\t\t\t\tInvasive arter:\r\n\t\t\t\t\t\t"
[11] "Ingen arter registreret"
[12] "\r\n\t\t\t\t\t\t\tHabitatdirektivets bilagsarter:\r\n\t\t\t\t\t\t"
[13] "Ingen arter registreret"
[14] "\r\n\t\t\t\t\t\t\tRødlistede arter:\r\n\t\t\t\t\t\t"
[15] "Ingen arter registreret"
[16] "\r\n\t\t\t\t\t\t\tFredede arter:\r\n\t\t\t\t\t\t"
[17] "Ingen arter registreret"
[18] "\r\n\t\t\t\t\t\t\tAntal arter:\r\n\t\t\t\t\t\t"
[19] "Mosser: 1 fund"
[20] "Planter: 7 fund"
And I get a similar result with
rvest::read_html("https://naturereport.miljoeportal.dk/HtmlViewer?id=827472&bA=1&bI=1&bN=1%22") %>%
  html_elements(":nth-child(9) .table-col") %>%
  html_text2()
I have also tried the following snippets:
rvest::read_html("https://naturereport.miljoeportal.dk/HtmlViewer?id=827472&bA=1&bI=1&bN=1%22") %>%
  html_elements(":nth-child(9) .table-col") %>%
  html_table()
and
rvest::read_html("https://naturereport.miljoeportal.dk/HtmlViewer?id=827472&bA=1&bI=1&bN=1%22") %>%
  html_elements(".report-body") %>%
  html_table()
This will be done for several sites that I will loop over, so I need the result in a table format.
Edit
It seems that this code is bringing me closer to the answer:
rvest::read_html("https://naturereport.miljoeportal.dk/HtmlViewer?id=827472&bA=1&bI=1&bN=1%22") %>%
  html_elements(".report-section-body")
The eighth element has the table, but I have not been able to extract it:
Test <- rvest::read_html("https://naturereport.miljoeportal.dk/HtmlViewer?id=827472&bA=1&bI=1&bN=1%22") %>%
  html_elements(".report-section-body")
Test[8]
{xml_nodeset (1)}
[1] <div class="report-section-body"><div class="table">\n<div class="
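One thing worth noting: html_table() only works on real <table> elements, and this page builds its "table" out of <div>s, which is why those attempts return nothing. A hedged sketch of one way around that, assuming each row of the div-table is an element with a class like "table-row" holding the ".table-col" cells (the "table-row" class name is a guess based on the ".table-col" class above and should be confirmed in the developer tools first):
library(rvest)
page <- read_html("https://naturereport.miljoeportal.dk/HtmlViewer?id=827472&bA=1&bI=1&bN=1")
# ".table-row" is assumed; check the real class name before relying on this
rows <- page %>% html_elements(".report-section-body .table-row")
tbl <- data.frame(
  label = rows %>% html_element(".table-col:nth-child(1)") %>% html_text2(),
  value = rows %>% html_element(".table-col:nth-child(2)") %>% html_text2()
)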

Extracting a list of links from a webpage by using its class

I am trying to extract from this website a list of four links that are clearly named as:
PNADC_012018_20190729.zip
PNADC_022018_20190729.zip
PNADC_032018_20190729.zip
PNADC_042018_20190729.zip
I've seen that they are all part of a class called 'jstree-wholerow'. I'm not really good at scraping, yet I've tried to capture such links using this regularity:
x <- rvest::read_html('https://www.ibge.gov.br/estatisticas/downloads-estatisticas.html?caminho=Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/2018') %>%
  rvest::html_nodes("jstree-wholerow") %>%
  rvest::html_text()
However, I received an empty vector as output.
Can someone help fixing this?
Although the webpage uses JavaScript, the files are stored on an FTP server, which also has very predictable directory names.
library(tidyverse)
library(stringr)
library(rvest)
#>
#> Attaching package: 'rvest'
#> The following object is masked from 'package:readr':
#>
#> guess_encoding
library(RCurl)
#>
#> Attaching package: 'RCurl'
#> The following object is masked from 'package:tidyr':
#>
#> complete
link <- 'https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/2018/PNADC_042018_20190729.zip'
zip_names <- c('PNADC_012018_20190729.zip', 'PNADC_022018_20190729.zip', 'PNADC_032018_20190729.zip', 'PNADC_042018_20190729.zip')
links <- str_replace(link, '/2018.*\\.zip$', str_c('/2018/', zip_names))
links
#> [1] "https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/2018/PNADC_012018_20190729.zip"
#> [2] "https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/2018/PNADC_022018_20190729.zip"
#> [3] "https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/2018/PNADC_032018_20190729.zip"
#> [4] "https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/2018/PNADC_042018_20190729.zip"
#option 2
links <- RCurl::getURL(url = 'https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/2018/') %>%
  read_html() %>%
  html_nodes(xpath = '//td/a[@href]') %>%
  html_attr('href')
links <- links[-1]
links
#> [1] "PNADC_012018_20190729.zip" "PNADC_022018_20190729.zip"
#> [3] "PNADC_032018_20190729.zip" "PNADC_042018_20190729.zip"
str_c('https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/2018/', links)
#> [1] "https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/2018/PNADC_012018_20190729.zip"
#> [2] "https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/2018/PNADC_022018_20190729.zip"
#> [3] "https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/2018/PNADC_032018_20190729.zip"
#> [4] "https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/2018/PNADC_042018_20190729.zip"
Created on 2021-06-11 by the reprex package (v2.0.0)
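From there, downloading is just a matter of iterating over the full URLs; a minimal sketch (full_links is my own name for the str_c() result above, which the answer prints without assigning):
# assign the full URLs built above
full_links <- str_c('https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/2018/', links)
# download each archive into the working directory; mode = "wb" keeps the zips intact on Windows
purrr::walk(full_links, ~ download.file(.x, destfile = basename(.x), mode = "wb"))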

R: scrape nested html table with links (table within cell)

For university research I am trying to scrape an FDA table (robots.txt allows scraping this content).
The table contains 19 rows and 2 columns:
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K203181
The format I am trying to extract is:
col1 col2 url_of_col2
<chr> <chr> <chr>
1 Device Classificati~ distal transcutaneous electrical stimulator for treatm~ https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpcd/classification.cfm?s~
What I achieved:
I can easily extract the items of the first column:
#library
library(tidyverse)
library(xml2)
library(rvest)
#load html
html <- xml2::read_html("https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K203181")
# select table of interest
html %>%
  html_nodes("table") -> tables
tables[[9]] -> table
# extract col 1 items
table %>%
  html_nodes("th") %>%
  html_text() %>%
  gsub("\n|\t|\r", "", .) %>%
  trimws()
#> [1] "Device Classification Name" "510(k) Number"
#> [3] "Device Name" "Applicant"
#> [5] "Applicant Contact" "Correspondent"
#> [7] "Correspondent Contact" "Regulation Number"
#> [9] "Classification Product Code" "Date Received"
#> [11] "Decision Date" "Decision"
#> [13] "Regulation Medical Specialty" "510k Review Panel"
#> [15] "summary" "Type"
#> [17] "Clinical Trials" "Reviewed by Third Party"
#> [19] "Combination Product"
Created on 2021-02-27 by the reprex package (v1.0.0)
Where I get stuck
Since some cells of column 2 contain a table, this approach does not give the same number of items:
# extract col 2 items
table %>%
  html_nodes("td") %>%
  html_text() %>%
  gsub("\n|\t|\r", "", .) %>%
  trimws()
#> [1] "distal transcutaneous electrical stimulator for treatment of acute migraine"
#> [2] "K203181"
#> [3] "Nerivio, FGD000075-4.7"
#> [4] "Theranica Bioelectronics ltd4 Ha-Omanutst. Poleg Industrial Parknetanya, IL4250574"
#> [5] "Theranica Bioelectronics ltd"
#> [6] "4 Ha-Omanutst. Poleg Industrial Park"
#> [7] "netanya, IL4250574"
#> [8] "alon ironi"
#> [9] "Hogan Lovells US LLP1735 Market StreetSuite 2300philadelphia, PA 19103"
#> [10] "Hogan Lovells US LLP"
#> [11] "1735 Market Street"
#> [12] "Suite 2300"
#> [13] "philadelphia, PA 19103"
#> [14] "janice m. hogan"
#> [15] "882.5899"
#> [16] "QGT  "
#> [17] "QGT  "
#> [18] "10/26/2020"
#> [19] "01/22/2021"
#> [20] "substantially equivalent (SESE)"
#> [21] "Neurology"
#> [22] "Neurology"
#> [23] "summary"
#> [24] "Traditional"
#> [25] "NCT04089761"
#> [26] "No"
#> [27] "No"
Created on 2021-02-27 by the reprex package (v1.0.0)
Moreover, I could not find a way to extract the URLs of col2.
I found a good manual for reading HTML tables with cells spanning multiple rows. However, I think this approach does not work for nested tables.
There is a similar question regarding a nested table without links (How to scrape older html with nested tables in R?) which has not been answered yet. A comment suggested another question, but unfortunately I could not apply it to my HTML table.
There is also the unpivotr package, which aims to read nested HTML tables; however, I could not solve my problem with that package.
Yes, the tables within the rows of the parent table do make this more difficult. The key here is to find the 27 rows of the table and then parse each row individually.
library(rvest)
library(stringr)
library(dplyr)
# load html
html <- xml2::read_html("https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K203181")
# select table of interest
tables <- html %>% html_nodes("table")
table <- tables[[9]]
# find all of the table's rows
trows <- table %>% html_nodes("tr")
# find the left column
leftside <- trows %>% html_node("th") %>% html_text() %>% trimws()
# find the right column (remove whitespace at the end and in the middle)
rightside <- trows %>% html_node("td") %>% html_text() %>% str_squish() %>% trimws()
# get links
links <- trows %>% html_node("td a") %>% html_attr("href")
answer <- data.frame(leftside, rightside, links)
One will need to use paste0("https://www.accessdata.fda.gov/", answer$links) on some of the links to obtain the full web address.
The final data frame does have several cells containing NA; these can be removed, and the table can be cleaned up some more depending on the final requirements. See tidyr::fill() as a good starting point.
Update
To reduce the answer down to the desired 19 original rows:
library(tidyr)
# replace NA with blanks
answer$links <- replace_na(answer$links, "")
# fill in the blanks in the first column to allow for grouping
answer <- fill(answer, leftside, .direction = "down")
# create the final results
finalanswer <- answer %>%
  group_by(leftside) %>%
  summarize(info = paste(rightside, collapse = " "), link = first(links))
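As a follow-up to the note about paste0() above, a minimal sketch for turning the relative links in finalanswer into absolute URLs (the startsWith() check is my own addition so that empty or already-absolute links are left untouched):
finalanswer$link <- ifelse(
  finalanswer$link == "" | startsWith(finalanswer$link, "http"),
  finalanswer$link,
  paste0("https://www.accessdata.fda.gov/", finalanswer$link)
)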

Download multiple files from a URL, using R

I have this URL: https://www.cnpm.embrapa.br/projetos/relevobr/download/index.htm with geographic information about Brazilian states. If you click on any state, you will find these grids:
Now, if you click on any grid, you will be able to download the geographic information for that specific grid:
What I need: download all the grids at once. Is it possible?
You can scrape the page to get the URLs for the zip files, then iterate across the URLs to download everything:
library(rvest)
# get page source
h <- read_html('https://www.cnpm.embrapa.br/projetos/relevobr/download/mg/mg.htm')
urls <- h %>%
  html_nodes('area') %>% # get all `area` nodes
  html_attr('href') %>% # get the link attribute of each node
  sub('.htm$', '.zip', .) %>% # change file suffix
  paste0('https://www.cnpm.embrapa.br/projetos/relevobr/download/mg/', .) # append to base URL
# create a directory for it all
dir <- file.path(tempdir(), 'mg')
dir.create(dir)
# iterate and download
lapply(urls, function(url) download.file(url, file.path(dir, basename(url))))
# check it's there
list.files(dir)
#> [1] "sd-23-y-a.zip" "sd-23-y-b.zip" "sd-23-y-c.zip" "sd-23-y-d.zip" "sd-23-z-a.zip" "sd-23-z-b.zip"
#> [7] "sd-23-z-c.zip" "sd-23-z-d.zip" "sd-24-y-c.zip" "sd-24-y-d.zip" "se-22-y-d.zip" "se-22-z-a.zip"
#> [13] "se-22-z-b.zip" "se-22-z-c.zip" "se-22-z-d.zip" "se-23-v-a.zip" "se-23-v-b.zip" "se-23-v-c.zip"
#> [19] "se-23-v-d.zip" "se-23-x-a.zip" "se-23-x-b.zip" "se-23-x-c.zip" "se-23-x-d.zip" "se-23-y-a.zip"
#> [25] "se-23-y-b.zip" "se-23-y-c.zip" "se-23-y-d.zip" "se-23-z-a.zip" "se-23-z-b.zip" "se-23-z-c.zip"
#> [31] "se-23-z-d.zip" "se-24-v-a.zip" "se-24-v-b.zip" "se-24-v-c.zip" "se-24-v-d.zip" "se-24-y-a.zip"
#> [37] "se-24-y-c.zip" "sf-22-v-b.zip" "sf-22-x-a.zip" "sf-22-x-b.zip" "sf-23-v-a.zip" "sf-23-v-b.zip"
#> [43] "sf-23-v-c.zip" "sf-23-v-d.zip" "sf-23-x-a.zip" "sf-23-x-b.zip" "sf-23-x-c.zip" "sf-23-x-d.zip"
#> [49] "sf-23-y-a.zip" "sf-23-y-b.zip" "sf-23-z-a.zip" "sf-23-z-b.zip" "sf-24-v-a.zip"
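If you then want the contents rather than the archives, a minimal sketch for unpacking everything (the "extracted" sub-directory is my own choice of destination):
# unzip every downloaded archive into an "extracted" sub-directory
zips <- list.files(dir, pattern = "\\.zip$", full.names = TRUE)
lapply(zips, unzip, exdir = file.path(dir, "extracted"))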
