I am trying to scrape Bangladesh COVID-19 data (number of tests, number of positive tests, positive rate) from the official website: http://103.247.238.92/webportal/pages/covid19.php
The website contains 3 drop-down menus to arrive at the data: Select Division; Select District; Select time frame for the data.
I have tried the following so far:
url <- "http://103.247.238.92/webportal/pages/covid19.php"
webpage <- read_html(url)
webpage has the following:
List of 2
$ node:<externalptr>
$ doc :<externalptr>
- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
Since this did not help, I also tried the following based on this question:
a <- GET(url)
a <- content(a, as="text")
a <- gsub("^angular.callbacks._2\\(", "", a)
a <- gsub("\\);$", "", a)
df <- fromJSON(a, simplifyDataFrame = TRUE)
The above returns the following error:
Error: lexical error: invalid char in json text.
<!DOCTYPE html> <!-- This is a
(right here) ------^
So I am really lost in terms of how I can even read the data - but upon looking at the source of the webpage, I know that the data is right there: Safari Website inspector
Any suggestions on how I can read this data?
Additionally, if someone could help with how I can go about selecting the different drop-down menu items, that would be really appreciated. The final goal is to collect data for each district in each division for the last 12 months.
tl;dr
The page makes additional requests to pick up that info. Those additional requests rely on combinations of ids; an id pulled from the option element value attribute, of each option within Division dropdown, in tandem with an id pulled from the option element value attribute of each option within the District dropdown.
You can make an initial request to get all the Division dropdown ids:
divisions <- options_df("#division option:nth-child(n+2)", "division")
nth-child(n+2) is used to exclude the initial 'select' option.
This returns a dataframe with the initial divisionIDs and friendly division names.
Those ids can then be used to retrieve the associated districtIDs (the options which become available in the second dropdown after making your selection in the first):
districts <- pmap_dfr(
list(divisions$divisionID),
~ {
df_districts <- districts_from_updated_session(.x, "district") %>%
mutate(
divisionID = .x
)
return(df_districts)
}
)
This returns a dataframe mapping the divisionID to all the associated districtIDs, as well as the friendly district names:
By including the divisionID in both dataframes I can inner-join them:
div_district <- dplyr::inner_join(divisions, districts, by = "divisionID", copy = FALSE)
Up until now, I have been using a session object for the efficiency of tcp re-use. Unfortunately, I couldn't find anything in the documentation covering how to update an already open session allowing for sending a new POST request with dynamic body argument. Instead, I leveraged furrr::future_map to try and gain some efficiencies through parallel processing:
df <- div_district %>%
mutate(json = furrr::future_map(divisionID, .f = get_covid_data, districtID))
To get the final covid numbers, via get_covid_data(), I leverage some perhaps odd behaviour of the server, in that I could make a GET, passing divisionID and districtID within the body, then regex out part of the jquery datatables scripting, string clean that into a json valid string, then read that into a json object stored in the json column of the final dataframe.
Inside of the json column
R:
library(httr)
#> Warning: package 'httr' was built under R version 4.0.3
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(jsonlite)
#> Warning: package 'jsonlite' was built under R version 4.0.3
#>
#> Attaching package: 'jsonlite'
#> The following object is masked from 'package:purrr':
#>
#> flatten
library(furrr)
#> Warning: package 'furrr' was built under R version 4.0.3
#> Loading required package: future
#> Warning: package 'future' was built under R version 4.0.3
## to clean out everything before a run
# rm(list = ls(all = TRUE))
# invisible(lapply(paste0('package:', names(sessionInfo()$otherPkgs)), detach, character.only=TRUE, unload=TRUE)) # https://stackoverflow.com/a/39235076 #mmfrgmpds
#returns value:text for options e.g. divisions/districts (dropdown)
options_df <- function(css_selector, level) {
nodes <- session %>% html_nodes(css_selector)
options <- nodes %>% map_df(~ c(html_attr(., "value"), html_text(.)) %>%
set_names(paste0(level, "ID"), level))
return(options)
}
#returns districts associated with division
districts_from_updated_session <- function(division_id, level) {
session <- jump_to(session, paste0("http://103.247.238.92/webportal/pages/ajaxDataDistrictDHIS2Dashboard.php?division_id=", division_id))
return(options_df("#district option:nth-child(n+2)", level))
}
# returns json object housing latest 12 month covid numbers by divisionID + districtID pairing
get_covid_data <- function(divisionID, districtID) {
headers <- c(
"user-agent" = "Mozilla/5.0",
"if-modified-since" = "Wed, 08 Jul 2020 00:00:00 GMT" # to mitigate for caching
)
data <- list("division" = divisionID, "district" = districtID, "period" = "LAST_12_MONTH", "Submit" = "Search")
r <- httr::GET(url = "http://103.247.238.92/webportal/pages/covid19.php", httr::add_headers(.headers = headers), body = data)
data <- stringr::str_match(content(r, "text"), "DataTable\\((\\[[\\s\\S]+\\])\\)")[1, 2] %>% #clean up extracted string so can be parsed as valid json
gsub("role", '"role"', .) %>%
gsub("'", '"', .) %>%
gsub(",\\s+\\]", "]", .) %>%
str_squish() %>%
jsonlite::parse_json()
return(data)
}
url <- "http://103.247.238.92/webportal/pages/covid19.php"
headers <- c("User-Agent" = "Mozilla/4.0", "Referer" = "http://103.247.238.92/webportal/pages/covid19.php")
session <- html_session(url, httr::add_headers(.headers = headers)) #for tcp re-use
divisions <- options_df("#division option:nth-child(n+2)", "division") #nth-child(n+2) to exclude initial 'select' option
districts <- pmap_dfr(
list(divisions$divisionID),
~ {
df <- districts_from_updated_session(.x, "district") %>%
mutate(
divisionID = .x
)
return(df)
}
)
div_district <- dplyr::inner_join(divisions, districts, by = "divisionID", copy = FALSE)
no_cores <- future::availableCores() - 1
future::plan(future::multisession, workers = no_cores)
df <- div_district %>%
mutate(json = future_map(divisionID, .f = get_covid_data, districtID))
Created on 2021-03-04 by the reprex package (v0.3.0)
Py
import requests, re, ast
from bs4 import BeautifulSoup as bs
def options_dict(soup, css_selector):
options = {i.text:i['value'] for i in soup.select(css_selector) if i['value']}
return options
def covid_numbers(text):
covid_data = p.findall(text)[0]
covid_data = re.sub(r'\n\s+', '', covid_data.replace("role","'role'"))
covid_data = ast.literal_eval(covid_data)
return covid_data
url = 'http://103.247.238.92/webportal/pages/covid19.php'
regions = {}
result = {}
p = re.compile(r'DataTable\((\[[\s\S]+\])\)')
with requests.Session() as s:
s.headers = {'User-Agent': 'Mozilla/5.0', 'Referer': 'http://103.247.238.92/webportal/pages/covid19.php'}
soup = bs(s.get(url).content, 'lxml')
divisions = options_dict(soup, '#division option')
for k,v in divisions.items():
r = s.get(f'http://103.247.238.92/webportal/pages/ajaxDataDistrictDHIS2Dashboard.php?division_id={v}')
soup = bs(r.content, 'lxml')
districts = options_dict(soup, '#district option')
regions[k] = districts
s.headers = {'User-Agent': 'Mozilla/5.0','if-modified-since': 'Wed, 08 Jul 2020 22:27:07 GMT'}
for k,v in divisions.items():
result[k] = {}
for k2,v2 in regions.items():
data = {'division': k2, 'district': v2, 'period': 'LAST_12_MONTH', 'Submit': 'Search'}
r = s.get('http://103.247.238.92/webportal/pages/covid19.php', data=data)
result[k][k2] = covid_numbers(r.text)
I am fairly new with R. I decided for my own learning process to scrape the tracks that my favorite radio station is playing and then add these songs to my spotify playlist. This way I can listen to the music of my favorite radio station without any advertising
What is going well?
I can scrape the songs and add a test song to my spotify playlist.
Where does it go wrong?
Via the spotify API I retrieve all information about the songs based on the artist and title. I only need the spotify:track:xxxxx part of every response I get back. When I try to extract the part spotify: track: (track uri) from every response I get the error: subscript out of bounds:
### Radio2 playlist scraper ###
#Loading packages#
install.packages("rvest")
library(rvest)
install.packages("dplyr")
library("dplyr")
install.packages("remotes")
remotes::install_github("charlie86/spotifyr")
library(spotifyr)
install.packages('knitr', dependencies = TRUE)
library(knitr)
install.packages("stringr")
library("stringr")
install.packages("jsonlite")
library("jsonlite")
library(jsonlite)
library(purrr)
library(data.table)
library(httr)
library(magrittr)
library(rvest)
library(ggplot2)
#Get playlist url #
url <- "https://www.nporadio2.nl/playlist"
#Read HTML code from pagen#
webpage <- read_html(url)
#Get Artist and Title#
artist <- html_nodes(webpage, '.fn-artist')
title <- html_nodes(webpage, '.fn-song')
#Artist and Title to text#
artist_text <- html_text(artist)
title_text <- html_text(title)
#Artist and Title to dataframe#
artiest <- as.data.frame(artist_text)
titel_text <- as.data.frame(title_text)
#Make one dataframe#
radioplaylist <- cbind(artiest$artist_text, titel_text$title_text)
radioplaylist <- as.data.frame(radioplaylist)
radioplaylist
#Rename columns#
colnames(radioplaylist)[1] <- "Artiest"
colnames(radioplaylist)[2] <- "Titel"
radioplaylist
#Remove duplicate songs#
radioplaylistuniek <- radioplaylist %>% distinct(Artiest, Titel, .keep_all = TRUE)
#Write to csv#
date <- Sys.Date()
date
write.csv(radioplaylistuniek, paste0("C://Users//Kantoor//Radio2playlists//playlist - ", date, ".csv"))
#Set spotify API#
Sys.setenv(SPOTIFY_CLIENT_ID = 'xxxxxxxxxxxxx')
Sys.setenv(SPOTIFY_CLIENT_SECRET = 'xxxxxxxxxxxx')
access_token <- get_spotify_access_token()
# Client and secret#
clientID <- "xxxxxxxxxxxxxxx"
secret <- "xxxxxxxxxxxxxx"
# Get access token and write this to authorization header #
response = POST(
'https://accounts.spotify.com/api/token',
accept_json(),
authenticate(clientID, secret),
body = list(grant_type = 'client_credentials'),
encode = 'form',
verbose()
)
token = content(response)$access_token
authorization.header = paste0("Bearer ", token)
# Generate URLS #
radioplaylistuniektest <- radioplaylistuniek[1:100,]
urls <- list(c("https://api.spotify.com/v1/search?q=track:")) %>% paste0(radioplaylistuniektest$Titel) %>% paste0(c("%20artist:")) %>% paste0(radioplaylistuniektest$Artiest) %>% paste(c("&type=track&limit=1"), sep = "")
# Get track information#
lijstwijk <- lapply(urls, GET, simplifyMatrix=TRUE, flatten=TRUE, config = add_headers(authorization = authorization.header))
# Get trackuri from each response#
lijstwijkuri <- lapply(lijstwijk, function(item) content(item, as="parsed")$tracks$items[[1]]$uri)
Error in content(item, as = "parsed")$tracks$items[[1]] :
subscript out of bounds
When I remove the track URI from the response for a few songs, lets say for the first 5, everything goes well:
# Generate URLS #
radioplaylistuniektest <- radioplaylistuniek[1:5,]
urls <- list(c("https://api.spotify.com/v1/search?q=track:")) %>% paste0(radioplaylistuniektest$Titel) %>% paste0(c("%20artist:")) %>% paste0(radioplaylistuniektest$Artiest) %>% paste(c("&type=track&limit=1"), sep = "")
# Get track information#
lijstwijk <- lapply(urls, GET, simplifyMatrix=TRUE, flatten=TRUE, config = add_headers(authorization = authorization.header))
# Get trackuri from each response#
lijstwijkuri <- lapply(lijstwijk, function(item) content(item, as="parsed")$tracks$items[[1]]$uri)
lijstwijkuri
[[1]]
[1] "spotify:track:5Xhqe9xu6bKRSqLj1mS1SB"
[[2]]
[1] "spotify:track:21YxK0klhpfLW8budkJaMF"
[[3]]
[1] "spotify:track:468OIV1LzYrm3rluVKl8AU"
[[4]]
[1] "spotify:track:3yDhZq8f17SmumVmEyCaRN"
[[5]]
[1] "spotify:track:0IseLavjQ32B5wxYxWeuw5"
How to fix the subscript out of bounds error?
What is going wrong? How can i fix the subscript out of bounds error for extracting the spotify:track:xxxx part from each response?
Got the solution. So for anyone who is curious. This is how i fixed it:
# Unlist results #
responses <- unlist(lapply(lijstwijk, paste, collapse=" "))
# Results to dataframe #
responsesdf <- as.data.frame(responses)
# Get spotify:track string#
uriperurl <- data.frame(uri = str_extract(responsesdf$responses, "(spotify:track:)\\w+"))
I am new to web scraping. I am trying to scrape a table with the following code. But I am unable to get it. The source of data is
https://www.investing.com/stock-screener/?sp=country::6|sector::a|industry::a|equityType::a|exchange::a%3Ceq_market_cap;1
url <- "https://www.investing.com/stock-screener/?sp=country::6|sector::a|industry::a|equityType::a|exchange::a%3Ceq_market_cap;1"
urlYAnalysis <- paste(url, sep = "")
webpage <- readLines(urlYAnalysis)
html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
tableNodes <- getNodeSet(html, "//table")
Tab <- readHTMLTable(tableNodes[[1]])
I copied this apporach from the link (Web scraping of key stats in Yahoo! Finance with R) where it is applied on yahoo finance data.
In my opinion, in readHTMLTable(tableNodes[[12]]), it should be Table 12. But when I try giving tableNodes[[12]], it always gives me an error.
Error in do.call(data.frame, c(x, alis)) :
variable names are limited to 10000 bytes
Please suggest me the way to extract the table and combine the data from other tabs as well (Fundamental, Technical and Performance).
This data is returned dynamically as json. In R (behaves differently from Python requests) you get html from which you can extract a given page's results as json. A page includes all the tabs info and 50 records. From the first page you are given the total record count and therefore can calculate the total number of pages to loop over to get all results. Perhaps combine them info a final dataframe during a loop to total number of pages; where you alter the pn param of the XHR POST body to the appropriate page number for desired results in each new POST request. There are two required headers.
Probably a good idea to write a function that accepts a page number in signature and returns a given page's json as a dataframe. Apply that via a tidyverse package to handle loop and combining of results to final dataframe?
library(httr)
library(jsonlite)
library(magrittr)
library(rvest)
library(stringr)
headers = c(
'User-Agent' = 'Mozilla/5.0',
'X-Requested-With' = 'XMLHttpRequest'
)
data = list(
'country[]' = '6',
'sector' = '7,5,12,3,8,9,1,6,2,4,10,11',
'industry' = '81,56,59,41,68,67,88,51,72,47,12,8,50,2,71,9,69,45,46,13,94,102,95,58,100,101,87,31,6,38,79,30,77,28,5,60,18,26,44,35,53,48,49,55,78,7,86,10,1,34,3,11,62,16,24,20,54,33,83,29,76,37,90,85,82,22,14,17,19,43,89,96,57,84,93,27,74,97,4,73,36,42,98,65,70,40,99,39,92,75,66,63,21,25,64,61,32,91,52,23,15,80',
'equityType' = 'ORD,DRC,Preferred,Unit,ClosedEnd,REIT,ELKS,OpenEnd,Right,ParticipationShare,CapitalSecurity,PerpetualCapitalSecurity,GuaranteeCertificate,IGC,Warrant,SeniorNote,Debenture,ETF,ADR,ETC,ETN',
'exchange[]' = '109',
'exchange[]' = '127',
'exchange[]' = '51',
'exchange[]' = '108',
'pn' = '1', # this is page number and should be altered in a loop over all pages. 50 results per page i.e. rows
'order[col]' = 'eq_market_cap',
'order[dir]' = 'd'
)
r <- httr::POST(url = 'https://www.investing.com/stock-screener/Service/SearchStocks', httr::add_headers(.headers=headers), body = data)
s <- r %>%read_html()%>%html_node('p')%>% html_text()
page1_data <- jsonlite::fromJSON(str_match(s, '(\\[.*\\])' )[1,2])
total_rows <- str_match(s, '"totalCount\":(\\d+),' )[1,2]%>%as.integer()
num_pages <- ceiling(total_rows/50)
My current attempt at combining which I would welcome feedback on. This is all the returned columns, for all pages, and I have to handle missing columns and different ordering of columns as well as 1 column being a data.frame. As the returned number is far greater than those visible on page, you could simply revise to subset returned columns with a mask just for the columns present in the tabs.
library(httr)
library(jsonlite)
library(magrittr)
library(rvest)
library(stringr)
library(tidyverse)
library(data.table)
headers = c(
'User-Agent' = 'Mozilla/5.0',
'X-Requested-With' = 'XMLHttpRequest'
)
data = list(
'country[]' = '6',
'sector' = '7,5,12,3,8,9,1,6,2,4,10,11',
'industry' = '81,56,59,41,68,67,88,51,72,47,12,8,50,2,71,9,69,45,46,13,94,102,95,58,100,101,87,31,6,38,79,30,77,28,5,60,18,26,44,35,53,48,49,55,78,7,86,10,1,34,3,11,62,16,24,20,54,33,83,29,76,37,90,85,82,22,14,17,19,43,89,96,57,84,93,27,74,97,4,73,36,42,98,65,70,40,99,39,92,75,66,63,21,25,64,61,32,91,52,23,15,80',
'equityType' = 'ORD,DRC,Preferred,Unit,ClosedEnd,REIT,ELKS,OpenEnd,Right,ParticipationShare,CapitalSecurity,PerpetualCapitalSecurity,GuaranteeCertificate,IGC,Warrant,SeniorNote,Debenture,ETF,ADR,ETC,ETN',
'exchange[]' = '109',
'exchange[]' = '127',
'exchange[]' = '51',
'exchange[]' = '108',
'pn' = '1', # this is page number and should be altered in a loop over all pages. 50 results per page i.e. rows
'order[col]' = 'eq_market_cap',
'order[dir]' = 'd'
)
get_data <- function(page_number){
data['pn'] = page_number
r <- httr::POST(url = 'https://www.investing.com/stock-screener/Service/SearchStocks', httr::add_headers(.headers=headers), body = data)
s <- r %>% read_html() %>% html_node('p') %>% html_text()
if(page_number==1){ return(s) }
else{return(data.frame(jsonlite::fromJSON(str_match(s, '(\\[.*\\])' )[1,2])))}
}
clean_df <- function(df){
interim <- df['viewData']
df_minus <- subset(df, select = -c(viewData))
df_clean <- cbind.data.frame(c(interim, df_minus))
return(df_clean)
}
initial_data <- get_data(1)
df <- clean_df(data.frame(jsonlite::fromJSON(str_match(initial_data, '(\\[.*\\])' )[1,2])))
total_rows <- str_match(initial_data, '"totalCount\":(\\d+),' )[1,2] %>% as.integer()
num_pages <- ceiling(total_rows/50)
dfs <- map(.x = 2:num_pages,
.f = ~clean_df(get_data(.)))
r <- rbindlist(c(list(df),dfs),use.names=TRUE, fill=TRUE)
write_csv(r, 'data.csv')