Download all files on a webpage with R?

My question is almost the same as the one here. I want to download all the files from this page. The difference is that I do not have a common pattern I could use to download all the files.
Any idea how to do this download in R?

# use the FTP mirror link provided on the page
mirror <- "ftp://srtm.csi.cgiar.org/SRTM_v41/SRTM_Data_GeoTIFF/"
# read the file listing
pg <- readLines(mirror)
# take a look
head(pg)
## [1] "06-18-09 06:18AM 713075 srtm_01_02.zip"
## [2] "06-18-09 06:18AM 130923 srtm_01_07.zip"
## [3] "06-18-09 06:18AM 130196 srtm_01_12.zip"
## [4] "06-18-09 06:18AM 156642 srtm_01_15.zip"
## [5] "06-18-09 06:18AM 317244 srtm_01_16.zip"
## [6] "06-18-09 06:18AM 160847 srtm_01_17.zip"
# clean it up and make them URLs
fils <- sprintf("%s%s", mirror, sub("^.*srtm", "srtm", pg))
head(fils)
## [1] "ftp://srtm.csi.cgiar.org/SRTM_v41/SRTM_Data_GeoTIFF/srtm_01_02.zip"
## [2] "ftp://srtm.csi.cgiar.org/SRTM_v41/SRTM_Data_GeoTIFF/srtm_01_07.zip"
## [3] "ftp://srtm.csi.cgiar.org/SRTM_v41/SRTM_Data_GeoTIFF/srtm_01_12.zip"
## [4] "ftp://srtm.csi.cgiar.org/SRTM_v41/SRTM_Data_GeoTIFF/srtm_01_15.zip"
## [5] "ftp://srtm.csi.cgiar.org/SRTM_v41/SRTM_Data_GeoTIFF/srtm_01_16.zip"
## [6] "ftp://srtm.csi.cgiar.org/SRTM_v41/SRTM_Data_GeoTIFF/srtm_01_17.zip"
# test download
download.file(fils[1], basename(fils[1]))
# validate it worked before slamming the server (your job)
# do the rest whilst being kind to the mirror server
for (f in fils[-1]) {
download.file(f, basename(f))
Sys.sleep(5) # unless you have entitlement issues, space out the downloads by a few seconds
}
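For the "validate it worked" step above, a quick sanity check (a sketch of my own, not part of the original answer) is to confirm the archive is non-empty and readable before starting the loop:
# base R only: a truncated or corrupt archive will make unzip() warn or error here
stopifnot(file.size(basename(fils[1])) > 0)
head(unzip(basename(fils[1]), list = TRUE))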
If you don't mind using a non-base package, curl can help you just get the file names vs doing the sub above:
h <- curl::new_handle(dirlistonly = TRUE)
listing <- curl::curl_fetch_memory(mirror, h)
unlist(strsplit(rawToChar(listing$content), "\r?\n"))

This is not the most elegant solution, but it appears to work when I try it on random subsets of helplinks.
library(rvest)
#Grab filenames from separate URL
helplinks <- read_html("http://rdf.muninn-project.org/api/elevation/datasets/srtm/") %>% html_nodes("a") %>% html_text(trim = T)
#Keep only filenames relevant for download
helplinks <- helplinks[grepl("srtm", helplinks)]
#Download files - make sure to adjust the `destfile` argument of the download.file function.
lapply(helplinks, function(x) download.file(sprintf("http://srtm.csi.cgiar.org/SRT-ZIP/SRTM_V41/SRTM_Data_GeoTiff/%s", x), sprintf("C:/Users/aud/Desktop/%s", x)))

Related

Download multiple files from a url, using R

I have this url: https://www.cnpm.embrapa.br/projetos/relevobr/download/index.htm with geographic information about Brazilian states. If you click on any state, you will see a set of grids.
Then, if you click on any grid, you can download the geographic information for that specific grid.
What I need: download all the grids at once. Is it possible?
You can scrape the page to get the URLs for the zip files, then iterate across the URLs to download everything:
library(rvest)
# get page source
h <- read_html('https://www.cnpm.embrapa.br/projetos/relevobr/download/mg/mg.htm')
urls <- h %>%
html_nodes('area') %>% # get all `area` nodes
html_attr('href') %>% # get the link attribute of each node
sub('.htm$', '.zip', .) %>% # change file suffix
paste0('https://www.cnpm.embrapa.br/projetos/relevobr/download/mg/', .) # append to base URL
# create a directory for it all
dir <- file.path(tempdir(), 'mg')
dir.create(dir)
# iterate and download
lapply(urls, function(url) download.file(url, file.path(dir, basename(url))))
# check it's there
list.files(dir)
#> [1] "sd-23-y-a.zip" "sd-23-y-b.zip" "sd-23-y-c.zip" "sd-23-y-d.zip" "sd-23-z-a.zip" "sd-23-z-b.zip"
#> [7] "sd-23-z-c.zip" "sd-23-z-d.zip" "sd-24-y-c.zip" "sd-24-y-d.zip" "se-22-y-d.zip" "se-22-z-a.zip"
#> [13] "se-22-z-b.zip" "se-22-z-c.zip" "se-22-z-d.zip" "se-23-v-a.zip" "se-23-v-b.zip" "se-23-v-c.zip"
#> [19] "se-23-v-d.zip" "se-23-x-a.zip" "se-23-x-b.zip" "se-23-x-c.zip" "se-23-x-d.zip" "se-23-y-a.zip"
#> [25] "se-23-y-b.zip" "se-23-y-c.zip" "se-23-y-d.zip" "se-23-z-a.zip" "se-23-z-b.zip" "se-23-z-c.zip"
#> [31] "se-23-z-d.zip" "se-24-v-a.zip" "se-24-v-b.zip" "se-24-v-c.zip" "se-24-v-d.zip" "se-24-y-a.zip"
#> [37] "se-24-y-c.zip" "sf-22-v-b.zip" "sf-22-x-a.zip" "sf-22-x-b.zip" "sf-23-v-a.zip" "sf-23-v-b.zip"
#> [43] "sf-23-v-c.zip" "sf-23-v-d.zip" "sf-23-x-a.zip" "sf-23-x-b.zip" "sf-23-x-c.zip" "sf-23-x-d.zip"
#> [49] "sf-23-y-a.zip" "sf-23-y-b.zip" "sf-23-z-a.zip" "sf-23-z-b.zip" "sf-24-v-a.zip"
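The snippet above covers a single state page (mg.htm). If you collect the other state-page URLs from the index page into a vector, the same steps can be wrapped in a function and mapped over every state. A rough sketch follows; the state_pages vector is a placeholder you would need to fill in yourself:
library(rvest)
state_pages <- c(
  'https://www.cnpm.embrapa.br/projetos/relevobr/download/mg/mg.htm'
  # ... add the remaining state pages here
)
get_state_zips <- function(page) {
  base <- sub('[^/]+$', '', page)  # strip the trailing "xx.htm" to get the state directory
  read_html(page) %>%
    html_nodes('area') %>%
    html_attr('href') %>%
    sub('.htm$', '.zip', .) %>%
    paste0(base, .)
}
all_urls <- unlist(lapply(state_pages, get_state_zips))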

Issue scraping page with "Load more" button with rvest

I want to obtain the links to the atms listed on this page: https://coinatmradar.com/city/345/bitcoin-atm-birmingham-uk/
Would I need to do something about the 'load more' button at the bottom of the page?
I have been using the selector tool you can download for chrome to select the CSS path.
I've written the below code block and it only seems to retrieve the first ten links.
library(rvest)
base <- "https://coinatmradar.com/city/345/bitcoin-atm-birmingham-uk/"
base_read <- read_html(base)
atm_urls <- html_nodes(base_read, ".place > a")
all_urls_final <- html_attr(atm_urls, "href" )
print(all_urls_final)
I expected to be able to retrieve all links to the atms listed in the area but my R code has not done so.
Any help would be great. Sorry if this is a really simple question.
You should give RSelenium a try. I'm able to get the links with the following code:
# install.packages("RSelenium")
library(RSelenium)
library(rvest)
# Download binaries, start driver, and get client object.
rd <- rsDriver(browser = "firefox", port = 4444L)
ffd <- rd$client
# Navigate to page.
ffd$navigate("https://coinatmradar.com/city/345/bitcoin-atm-birmingham-uk/")
# Find the load button and assign, then send click event.
load_btn <- ffd$findElement(using = "css selector", ".load-more .btn")
load_btn$clickElement()
# Wait for elements to load.
Sys.sleep(2)
# Get HTML data and parse
html_data <- ffd$getPageSource()[[1]]
html_data %>%
read_html() %>%
html_nodes(".place a:not(.operator-link)") %>%
html_attr("href")
#### OUTPUT ####
# [1] "/bitcoin_atm/5969/bitcoin-atm-shitcoins-club-birmingham-uk-bitcoin-embassy/"
# [2] "/bitcoin_atm/7105/bitcoin-atm-general-bytes-northampton-costcutter/"
# [3] "/bitcoin_atm/4759/bitcoin-atm-general-bytes-birmingham-uk-costcutter/"
# [4] "/bitcoin_atm/2533/bitcoin-atm-general-bytes-birmingham-uk-londis-# convenience/"
# [5] "/bitcoin_atm/5458/bitcoin-atm-general-bytes-coventry-agg-african-restaurant/"
# [6] "/bitcoin_atm/711/bitcoin-atm-general-bytes-coventry-bigs-barbers/"
# [7] "/bitcoin_atm/5830/bitcoin-atm-general-bytes-telford-bpred-lion-service-station/"
# [8] "/bitcoin_atm/5466/bitcoin-atm-general-bytes-nottingham-24-express-off-licence/"
# [9] "/bitcoin_atm/4615/bitcoin-atm-general-bytes-northampton-costcutter/"
# [10] "/bitcoin_atm/4841/bitcoin-atm-lamassu-worcester-computer-house/"
# [11] "/bitcoin_atm/3150/bitcoin-atm-bitxatm-leicester-keshs-wines-and-newsagents-braustone/"
# [12] "/bitcoin_atm/2948/bitcoin-atm-bitxatm-coventry-nisa-local/"
# [13] "/bitcoin_atm/4742/bitcoin-atm-bitxatm-birmingham-uk-custcutter-coventry-road-hay-mills/"
# [14] "/bitcoin_atm/4741/bitcoin-atm-bitxatm-derby-michaels-drink-store-alvaston/"
# [15] "/bitcoin_atm/4740/bitcoin-atm-bitxatm-birmingham-uk-nisa-local-crabtree-# hockley/"
# [16] "/bitcoin_atm/4739/bitcoin-atm-bitxatm-birmingham-uk-nisa-local-subway-boldmere/"
# [17] "/bitcoin_atm/4738/bitcoin-atm-bitxatm-birmingham-uk-ashtree-convenience-store/"
# [18] "/bitcoin_atm/4737/bitcoin-atm-bitxatm-birmingham-uk-nisa-local-finnemore-road-bordesley-green/"
# [19] "/bitcoin_atm/3160/bitcoin-atm-bitxatm-birmingham-uk-costcutter/"
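One caveat (my own addition, not from the original answer): this clicks "load more" once, which is enough for Birmingham's 19 results. For a city with several extra batches you could keep clicking until the button disappears, e.g.:
# capped loop as a safeguard in case the button never disappears from the DOM
for (i in 1:20) {
  btns <- ffd$findElements(using = "css selector", ".load-more .btn")
  if (length(btns) == 0) break
  btns[[1]]$clickElement()
  Sys.sleep(2)  # give the next batch time to load
}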
When you click "show more", the page makes an XHR POST request for more results, using an offset of 10 from the current set (suggesting results come in batches of 10). You can mimic this as long as you include the following params in the POST body (I suspect only the bottom three are essential):
'direction' : 1
'sort' : 1
'offset' : 10
'pagetype' : 'city'
'pageid' : 345
And the following request header is required (at least in Python implementations)
'X-Requested-With' : 'XMLHttpRequest'
Send that correctly and you will get a response containing the additional content. Note: the content is wrapped in <![CDATA[ ... ]]> as an instruction that it should not be interpreted as XML, so you will need to extract the content inside the CDATA section before parsing it.
The total number of ATMs is shown on the original page you already have, under the CSS selector
.atm-number
You can split that text, take the upper-bound value from the split, and convert it to an integer. You can then calculate each offset required to reach that total (used in a loop as consecutive offset params until the total is covered), e.g. 19 results means 2 requests in total, with 1 request at offset 10 for the additional content. A sketch of such a request follows below.
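Here is a minimal R sketch of that request using httr. The endpoint path below is hypothetical (I have not checked it); look at the actual XHR URL in your browser's network tab and substitute it in:
library(httr)
library(rvest)
xhr_url <- "https://coinatmradar.com/findnearby/"  # hypothetical endpoint -- replace with the real XHR URL from DevTools
resp <- POST(
  xhr_url,
  add_headers(`X-Requested-With` = "XMLHttpRequest"),
  body = list(
    direction = 1,
    sort      = 1,
    offset    = 10,     # next batch of 10; loop over 10, 20, ... up to the total
    pagetype  = "city",
    pageid    = 345
  ),
  encode = "form"
)
# the extra content comes back wrapped in <![CDATA[ ... ]]>; strip the wrapper before parsing
txt   <- content(resp, as = "text", encoding = "UTF-8")
inner <- sub(".*<!\\[CDATA\\[", "", txt)
inner <- sub("\\]\\]>.*$", "", inner)
more_links <- read_html(inner) %>%
  html_nodes(".place a:not(.operator-link)") %>%
  html_attr("href")
# with the total from ".atm-number", the offsets needed would be seq(10, total - 1, by = 10)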

html_nodes returning two results for a link

I'm trying to use R to fetch all the links to data files on the Eurostat website. While my code currently "works", I seem to get a duplicate result for every link.
Note, the use of download.file is to get around my company's firewall, per this answer
library(dplyr)
library(rvest)
myurl <- "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?dir=data&sort=1&sort=2&start=all"
download.file(myurl, destfile = "eurofull.html")
content <- read_html("eurofull.html")
links <- content %>%
html_nodes("a") %>% #Note that I dont know the significance of "a", this was trial and error
html_attr("href") %>%
data.frame()
# filter to only get the ".tsv.gz" links
files <- filter(links, grepl("tsv.gz", .))
Looking at the top of the dataframe
files$.[1:6]
[1] /eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=data%2Faact_ali01.tsv.gz
[2] /eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&downfile=data%2Faact_ali01.tsv.gz
[3] /eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=data%2Faact_ali02.tsv.gz
[4] /eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&downfile=data%2Faact_ali02.tsv.gz
[5] /eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=data%2Faact_eaa01.tsv.gz
[6] /eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&downfile=data%2Faact_eaa01.tsv.gz
The only difference between 1 and 2 is that 1 says "...file=data..." while 2 says "...downfile=data...". This pattern continues for all pairs down the dataframe.
If I download 1 and 2 and read the files into R, an identical() check confirms they are the same.
Why are two links to the same data being returned? Is there a way (other than filtering for "downfile") to only return one of the links?
As noted, you can just do some better node targeting. This uses XPath vs CSS selectors and picks the links with downfile in the href:
html_nodes(content, xpath = ".//a[contains(@href, 'downfile')]") %>%
html_attr("href") %>%
sprintf("http://ec.europa.eu/%s", .) %>%
head()
## [1] "http://ec.europa.eu//eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&downfile=data%2Faact_ali01.tsv.gz"
## [2] "http://ec.europa.eu//eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&downfile=data%2Faact_ali02.tsv.gz"
## [3] "http://ec.europa.eu//eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&downfile=data%2Faact_eaa01.tsv.gz"
## [4] "http://ec.europa.eu//eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&downfile=data%2Faact_eaa02.tsv.gz"
## [5] "http://ec.europa.eu//eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&downfile=data%2Faact_eaa03.tsv.gz"
## [6] "http://ec.europa.eu//eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&downfile=data%2Faact_eaa04.tsv.gz"
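If you would rather stay with CSS selectors, the attribute-contains selector does the same filtering (my own sketch, not part of the answer above):
html_nodes(content, css = "a[href*='downfile']") %>%
  html_attr("href") %>%
  sprintf("http://ec.europa.eu/%s", .) %>%
  head()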

Extracting NLP part-of-speech labels of customers' review in R

I have the following dataframe which contains reviews that customer have left on a restaurant website:
id<-c(1,2,3,4,5,6)
review<- c("the food was very delicious and hearty - perfect to warm up during a freezing winters day", "Excellent service as usual","Love this place!", "Service and quality of food first class"," Customer services was exceptional by all staff","excellent services")
df<-data.frame(id, review)
Now I am looking for a way (preferably without using a for loop) to find the part-of-speech labels in each customer's review in R.
This is a pretty straightforward adaptation of the example on the Maxent_POS_Tag_Annotator help page.
df<-data.frame(id, review, stringsAsFactors=FALSE)
library(NLP)
library(openNLP)
review.pos <-
sapply(df$review, function(ii) {
a2 <- Annotation(1L, "sentence", 1L, nchar(ii))
a2 <- annotate(ii, Maxent_Word_Token_Annotator(), a2)
a3 <- annotate(ii, Maxent_POS_Tag_Annotator(), a2)
a3w <- subset(a3, type == "word")
tags <- sapply(a3w$features, `[[`, "POS")
sprintf("%s/%s", as.String(ii)[a3w], tags)
})
Which results in this output:
#[[1]]
# [1] "the/DT" "food/NN" "was/VBD" "very/RB" "delicious/JJ"
# [6] "and/CC" "hearty/NN" "-/:" "perfect/JJ" "to/TO"
#[11] "warm/VB" "up/RP" "during/IN" "a/DT" "freezing/JJ"
#[16] "winters/NNS" "day/NN"
#
#[[2]]
#[1] "Excellent/JJ" "service/NN" "as/IN" "usual/JJ"
#
#[[3]]
#[1] "Love/VB" "this/DT" "place/NN" "!/."
#
#[[4]]
#[1] "Service/NNP" "and/CC" "quality/NN" "of/IN" "food/NN"
#[6] "first/JJ" "class/NN"
#
#[[5]]
#[1] "Customer/NN" "services/NNS" "was/VBD" "exceptional/JJ"
#[5] "by/IN" "all/DT" "staff/NN"
#
#[[6]]
#[1] "excellent/JJ" "services/NNS"
It should be relatively straightforward to adapt this to whatever format you want.
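For instance, if the format you want is one row per token, you could flatten the list into a long data frame that keeps the review id (a sketch of my own, not from the help page):
pos_df <- do.call(rbind, lapply(seq_along(review.pos), function(i) {
  wt <- strsplit(review.pos[[i]], "/", fixed = TRUE)  # split "word/TAG" pairs
  data.frame(id   = df$id[i],
             word = vapply(wt, `[`, "", 1),
             tag  = vapply(wt, `[`, "", 2),
             stringsAsFactors = FALSE)
}))
head(pos_df)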
Considering that in your example the id column is simply the row index, I believe you can obtain your desired output with the pos() function from the qdap package.
library(qdap)
pos(df$review)
If you do need grouping because of multiple reviews per customer, you can use
pos_by(df$review,df$id)
If you don't mind trying a GitHub package, I have the tagger package, which wraps NLP/openNLP to do a number of tasks quickly, in the way Python users manipulate POS tags. Note that the output prints in the traditional word/tag format, but the object is actually a list of named vectors, which makes working with the words and tags easier. Here I demo how to get the tags and a few manipulations that tagger makes easy:
# First load your data and get the tagger package for those playing along at home
id<-c(1,2,3,4,5,6)
review<- c("the food was very delicious and hearty - perfect to warm up during a freezing winters day", "Excellent service as usual","Love this place!", "Service and quality of food first class"," Customer services was exceptional by all staff","excellent services")
df<-data.frame(id, review)
if (!require("pacman")) install.packages("pacman")
pacman::p_load_gh("trinker/tagger")
# Now tag and manipulate
(out <- tag_pos(as.character(df[["review"]])))
## [1] "the/DT food/NN was/VBD very/RB delicious/JJ and/CC hearty/NN -/: perfect/JJ to/TO warm/VB up/RP during/IN a/DT freezing/JJ winters/NNS day/NN"
## [2] "Excellent/JJ service/NN as/IN usual/JJ"
## [3] "Love/VB this/DT place/NN !/."
## [4] "Service/NNP and/CC quality/NN of/IN food/NN first/JJ class/NN"
## [5] "Customer/NN services/NNS was/VBD exceptional/JJ by/IN all/DT staff/NN"
## [6] "excellent/JJ services/NNS"
c(out) ## True structure: list of named vectors
as_word_tag(out) ## Match the print method (less mutable)
count_tags(out, df[["id"]]) ## Get counts by row
plot(out) ## tag distribution (plot at end)
as_basic(out) ## basic pos tags
## [1] "the/article food/noun was/verb very/adverb delicious/adjective and/conjunction hearty/noun -/. perfect/adjective to/preposition warm/verb up/preposition during/preposition a/article freezing/adjective winters/noun day/noun"
## [2] "Excellent/adjective service/noun as/preposition usual/adjective"
## [3] "Love/verb this/adjective place/noun !/."
## [4] "Service/noun and/conjunction quality/noun of/preposition food/noun first/adjective class/noun"
## [5] "Customer/noun services/noun was/verb exceptional/adjective by/preposition all/adjective staff/noun"
## [6] "excellent/adjective services/noun"
select_tags(out, c("NN", "NNP", "NNPS", "NNS"))
## [1] "food/NN hearty/NN winters/NNS day/NN"
## [2] "service/NN"
## [3] "place/NN"
## [4] "Service/NNP quality/NN food/NN class/NN"
## [5] "Customer/NN services/NNS staff/NN"
## [6] "services/NNS"
Everything works pretty nicely within a magrittr pipeline as well, which is my preference. The Examples Section of the README has a nice overview of the package's usage.

R: Extract words from a website

I am attempting to extract all words that start with a particular phrase from a website. The website I am using is:
http://docs.ggplot2.org/current/
I want to extract all the words that start with "stat_". I should get 21 names like "stat_identity" in return. I have the following code:
stats <- readLines("http://docs.ggplot2.org/current/")
head(stats)
grep("stat_{1[a-z]", stats, value=TRUE)
I am returned every line containing the phrase "stat_". I just want to extract the "stat_" words. So I tried something else:
gsub("\b^stat_[a-z]+ ", "", stats)
I think the output I got was an empty string, " ", where a "stat_" phrase would be? So now I'm trying to think of ways to extract all the text and set everything that is not a "stat_" phrase to empty strings. Does anyone have any ideas on how to get my desired output?
rvest & stringr to the rescue:
library(xml2)
library(rvest)
library(stringr)
pg <- read_html("http://docs.ggplot2.org/current/")
unique(str_match_all(html_text(html_nodes(pg, "body")),
"(stat_[[:alnum:]_]+)")[[1]][,2])
## [1] "stat_bin" "stat_bin2dCount"
## [3] "stat_bindot" "stat_binhexBin"
## [5] "stat_boxplot" "stat_contour"
## [7] "stat_density" "stat_density2d"
## [9] "stat_ecdf" "stat_functionSuperimpose"
## [11] "stat_identity" "stat_qqCalculation"
## [13] "stat_quantile" "stat_smooth"
## [15] "stat_spokeConvert" "stat_sum"
## [17] "stat_summarySummarise" "stat_summary_hexApply"
## [19] "stat_summary2dApply" "stat_uniqueRemove"
## [21] "stat_ydensity" "stat_defaults"
Unless you need the links (then you can use other rvest functions), this removes all the markup for you and just gives you the text of the website.
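If you do want the links rather than the text, a sketch along the same lines (my own, not part of the answer above) would be to pull the href attributes and keep the stat_ ones:
stat_links <- html_nodes(pg, "a") %>%
  html_attr("href") %>%
  grep("^stat_", ., value = TRUE) %>%
  unique()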
