Web scraping news page with a "load more"

Web scraping news page with a "load more" - web-scraping

I'm trying to scrape this news website "https://inshorts.com/en/read/national" and i'm just getting the results for just the displayed articles, i need all the articles on the website which contain the word (eg."COVID-19"), and don't have to use the "load more" button.
Here's my code which gives the current articles:
import requests
from bs4 import BeautifulSoup
import pandas as pd
dummy_url="https://inshorts.com/en/read/badminton"
data_dummy=requests.get(dummy_url)
soup=BeautifulSoup(data_dummy.content,'html.parser')
urls=["https://inshorts.com/en/read/national"]
news_data_content,news_data_title,news_data_category,news_data_time=[],[],[],[]
for url in urls:
category=url.split('/')[-1]
data=requests.get(url)
soup=BeautifulSoup(data.content,'html.parser')
news_title=[]
news_content=[]
news_category=[]
news_time = []
for headline,article,time in zip(soup.find_all('div', class_=["news-card-title news-right-box"]),
soup.find_all('div',class_=["news-card-content news-right-box"]),
soup.find_all('div', class_ = ["news-card-author-time news-card-author-time-in-title"])):
news_title.append(headline.find('span',attrs={'itemprop':"headline"}).string)
news_content.append(article.find('div',attrs={'itemprop':"articleBody"}).string)
news_time.append(time.find('span', clas=["date"]))
news_category.append(category)
news_data_title.extend(news_title)
news_data_content.extend(news_content)
news_data_category.extend(news_category)
news_data_time.extend(news_time)
df1=pd.DataFrame(news_data_title,columns=["Title"])
df2=pd.DataFrame(news_data_content,columns=["Content"])
df3=pd.DataFrame(news_data_category,columns=["Category"])
df4=pd.DataFrame(news_data_time, columns=["time"])
df=pd.concat([df1,df2,df3,df4],axis=1)
def name():
a = input("File Name: ")
return a
b = name()
df.to_csv(b + ".csv")

You can use this example how to simulate the clicking on Load More button:
import re
import requests
from bs4 import BeautifulSoup
url = "https://inshorts.com/en/read/national"
api_url = "https://inshorts.com/en/ajax/more_news"
headers = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"
}
# load first page:
html_doc = requests.get(url, headers=headers).text
min_news_id = re.search(r'min_news_id = "([^"]+)"', html_doc).group(1)
pages = 10 # <-- here I limit number of pages to 10
while pages:
soup = BeautifulSoup(html_doc, "html.parser")
# search the soup for your articles here
# ...
# here I just print the headlines:
for headline in soup.select('[itemprop="headline"]'):
print(headline.text)
# load next batch of articles:
data = requests.post(api_url, data={"news_offset": min_news_id}).json()
html_doc = data["html"]
min_news_id = data["min_news_id"]
pages -= 1
Prints news headlines of first 10 pages:
...
Moeen has done some wonderful things in Test cricket: Root
There should be an evolution in player-media relationship: Federer
Swiggy in talks to raise over $500 mn at $10 bn valuation: Reports
Tesla investors urged to reject Murdoch, Kimbal Musk's re-election
Doctor dies on Pune-Mumbai Expressway when rolls of paper fall on his car
2 mothers name newborn girls after Cyclone Gulab in Odisha
100 US citizens, permanent residents waiting to leave Afghanistan
Iran's nuclear programme has crossed all red lines: Israeli PM

Related

Google Scholar profile scraping

I'm trying to retrieve the links of a Google Scholar user's work from their profile but am having trouble accessing the html that is hidden behind the "show more" button. I would like to be able to capture all the links from a user but currently can only get the first 20. Im using the following script to scrape for reference.
from bs4 import BeautifulSoup
import requests
author_url = 'https://scholar.google.com/citations?hl=en&user=mG4imMEAAAAJ'
html_content = requests.get(author_url)
soup = BeautifulSoup(html_content.text, 'lxml')
tables = soup.final_all('table)
table = tables[1]
rows = table.final_all('tr')
links = []
for row in rows:
t = row.find('a')
if t is not None:
links.append(t.get('href'))

You need to use cstart URL parameter which stands for page number, 0 is the first page, 10 is the second.. This parameter allows to skip the need to click "show more button" and does the same thing.
This parameter needs to be used in while loop in order to paginate through all articles.
To exist the loop, one of the ways would be to check certain CSS selector such as .gsc_a_e which is assigned to text when no results are present:
The great thing about such approach is that it paginates dynamically, instead of for i in range() which is hard coded and will be broken if certain authors have 20 articles and another has 2550 articles.
On the screenshot above I'm using the SelectorGadget Chrome extension that lets you pick CSS selectors by clicking on certain elements in the browser. It works great if the website is not heavily JS driven.
Keep in mind that at some point you also need to use CAPTCHA solver or proxies. This is only when you need to extract a lot of articles from multiple authors.
Code with the option to save to CSV using pandas and a full example in the online IDE:
import pandas as pd
from bs4 import BeautifulSoup
import requests, lxml, json
def bs4_scrape_articles():
params = {
"user": "mG4imMEAAAAJ", # user-id
"hl": "en", # language
"gl": "us", # country to search from
"cstart": 0, # articles page. 0 is the first page
"pagesize": "100" # articles per page
}
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
}
all_articles = []
articles_is_present = True
while articles_is_present:
html = requests.post("https://scholar.google.com/citations", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")
for article in soup.select("#gsc_a_b .gsc_a_t"):
article_title = article.select_one(".gsc_a_at").text
article_link = f'https://scholar.google.com{article.select_one(".gsc_a_at")["href"]}'
article_authors = article.select_one(".gsc_a_at+ .gs_gray").text
article_publication = article.select_one(".gs_gray+ .gs_gray").text
all_articles.append({
"title": article_title,
"link": article_link,
"authors": article_authors,
"publication": article_publication
})
# this selector is checking for the .class that contains: "There are no articles in this profile."
# example link: https://scholar.google.com/citations?hl=en&user=mG4imMEAAAAJ&cstart=600
if soup.select_one(".gsc_a_e"):
articles_is_present = False
else:
params["cstart"] += 100 # paginate to the next page
print(json.dumps(all_articles, indent=2, ensure_ascii=False))
# pd.DataFrame(data=all_articles).to_csv(f"google_scholar_{params['user']}_articles.csv", encoding="utf-8", index=False)
bs4_scrape_articles()
Outputs (shows only last results as output is 400+ articles):
[
{
"title": "Exponential family sparse coding with application to self-taught learning with text documents",
"link": "https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:LkGwnXOMwfcC",
"authors": "H Lee, R Raina, A Teichman, AY Ng",
"publication": ""
},
{
"title": "Visual and Range Data",
"link": "https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:eQOLeE2rZwMC",
"authors": "S Gould, P Baumstarck, M Quigley, AY Ng, D Koller",
"publication": ""
}
]
If you don't want want to deal with bypassing blocks from Google or maintaining your script, have a look at the Google Scholar Author Articles API.
There's also a scholarly package that can also extract author articles.
Code that shows how to extract all author articles with Google Scholar Author Articles API:
from serpapi import GoogleScholarSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
import os
def serpapi_scrape_articles():
params = {
# https://docs.python.org/3/library/os.html
"api_key": os.getenv("API_KEY"),
"engine": "google_scholar_author",
"hl": "en",
"author_id": "mG4imMEAAAAJ",
"start": "0",
"num": "100"
}
search = GoogleScholarSearch(params)
all_articles = []
articles_is_present = True
while articles_is_present:
results = search.get_dict()
for index, article in enumerate(results["articles"], start=1):
title = article["title"]
link = article["link"]
authors = article["authors"]
publication = article.get("publication")
citation_id = article["citation_id"]
all_articles.append({
"title": title,
"link": link,
"authors": authors,
"publication": publication,
"citation_id": citation_id
})
if "next" in results.get("serpapi_pagination", {}):
# split URL in parts as a dict() and update "search" variable to a new page
search.params_dict.update(dict(parse_qsl(urlsplit(results["serpapi_pagination"]["next"]).query)))
else:
articles_is_present = False
print(json.dumps(all_articles, indent=2, ensure_ascii=False))
# pd.DataFrame(data=all_articles).to_csv(f"serpapi_google_scholar_{params['author_id']}_articles.csv", encoding="utf-8", index=False)
serpapi_scrape_articles()

Here is one way of obtaining that data:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from tqdm import tqdm ## if Jupyter notebook: from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
big_df = pd.DataFrame()
headers = {
'accept-language': 'en-US,en;q=0.9',
'x-requested-with': 'XHR',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}
s = requests.Session()
s.headers.update(headers)
payload = {'json': '1'}
for x in tqdm(range(0, 500, 100)):
url = f'https://scholar.google.com/citations?hl=en&user=mG4imMEAAAAJ&cstart={x}&pagesize=100'
r = s.post(url, data=payload)
soup = bs(r.json()['B'], 'html.parser')
works = [(x.get_text(), 'https://scholar.google.com' + x.get('href')) for x in soup.select('a') if 'javascript:void(0)' not in x.get('href') and len(x.get_text()) > 7]
df = pd.DataFrame(works, columns = ['Paper', 'Link'])
big_df = pd.concat([big_df, df], axis=0, ignore_index=True)
print(big_df)
Result in terminal:
100%
5/5 [00:03<00:00, 1.76it/s]
Paper Link
0 Latent dirichlet allocation https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:IUKN3-7HHlwC
1 On spectral clustering: Analysis and an algorithm https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:2KloaMYe4IUC
2 ROS: an open-source Robot Operating System https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:u-x6o8ySG0sC
3 Rectifier nonlinearities improve neural network acoustic models https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:gsN89kCJA0AC
4 Recursive deep models for semantic compositionality over a sentiment treebank https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:_axFR9aDTf0C
... ... ...
473 A Sparse Sampling Algorithm for Near-Optimal Planning in Large Markov Decision Processes https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:hMod-77fHWUC
474 On Discrim inative vs. Generative https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:qxL8FJ1GzNcC
475 Game Theory with Restricted Strategies https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:8k81kl-MbHgC
476 Exponential family sparse coding with application to self-taught learning with text documents https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:LkGwnXOMwfcC
477 Visual and Range Data https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:eQOLeE2rZwMC
478 rows × 2 columns
See pandas documentation at https://pandas.pydata.org/docs/
Also Requests docs: https://requests.readthedocs.io/en/latest/
For BeautifulSoup, go to https://beautiful-soup-4.readthedocs.io/en/latest/
And for TQDM visit https://pypi.org/project/tqdm/

Scrape Text Between Two Strong Tags

I am trying to scrape "167" (top right) from the following website: https://www.goodfirms.co/billing-invoicing-software/.
I can manage to get all of the text, but I'm just trying to get the numbers, and I am not sure on how to isolate it. Would someone be able to help me?
Code:
from bs4 import BeautifulSoup as bs
import requests
import requests_random_user_agent
s = requests.Session()
user_agent = s.headers['User-Agent']
headers = {
'accept': '*/*',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,es;q=0.7,ru;q=0.6',
'referer': 'https://www.google.com/',
'connection': 'Keep-alive',
'user-agent': user_agent,
}
response = requests.get('https://www.goodfirms.co/billing-invoicing-software/', headers=headers)
soup = bs(response.content, 'lxml')
test = soup.find("section", class_="section-breadcrumb blog-breadcrumb overflow").text
print(test)
Output:
Home >
Billing and Invoicing Software
167 Softwares  |  Last updated: Jul 31, 2022
Desired output:
167

The number 167 is located under tag with class class="last_update inside <strong> tag:
import requests
from bs4 import BeautifulSoup
url = "https://www.goodfirms.co/billing-invoicing-software/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
num = soup.select_one(".last_update strong")
print(num.text)
Prints:
167

I'm having difficulty using Beautiful Soup to scrape data from an NCBI website

I can't for the life of me figure out how to use beautiful soup to scrape the isolation source information from web pages such as this:
https://www.ncbi.nlm.nih.gov/nuccore/JOKX00000000.2/
I keep trying to check if that tag exists and it keep returning that it doesn't, when I know for a fact it does. If I can't even verify it exists I'm not sure how to scrape it.
Thanks!

you shouldn' scrape the ncbi when there is the NCBI-EUtilities web service.
wget -q -O - "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=JOKX00000000.2&rettype=gb&retmode=xml" | xmllint --xpath '//GBQualifier[GBQualifier_name="isolation_source"]/GBQualifier_value/text()' - && echo
Type II sourdough

The data is loaded from external URL. To get isolation_source, you can use this example:
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.ncbi.nlm.nih.gov/nuccore/JOKX00000000.2/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
ncbi_uidlist = soup.select_one('[name="ncbi_uidlist"]')["content"]
api_url = "https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi"
params = {
"id": ncbi_uidlist,
"db": "nuccore",
"report": "genbank",
"extrafeat": "null",
"conwithfeat": "on",
"hide-cdd": "on",
"retmode": "html",
"withmarkup": "on",
"tool": "portal",
"log$": "seqview",
"maxdownloadsize": "1000000",
}
soup = BeautifulSoup(
requests.get(api_url, params=params).content, "html.parser"
)
features = soup.select_one(".feature").text
isolation_source = re.search(r'isolation_source="([^"]+)"', features).group(1)
print(features)
print("-" * 80)
print(isolation_source)
Prints:
source 1..12
/organism="Limosilactobacillus reuteri"
/mol_type="genomic DNA"
/strain="TMW1.112"
/isolation_source="Type II sourdough"
/db_xref="taxon:1598"
/country="Germany"
/collection_date="1998"
--------------------------------------------------------------------------------
Type II sourdough

I want to go to the all the pages of yelp webiste and extract data from

I want to go to all the pages of the yelp site but cann't
this is the code
# packages
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
import urllib
import os
import json
import datetime
import csv
# property scraper class
class Yelp(scrapy.Spider):
# scraper name
name = 'home business'
base_url = 'https://www.yelp.com/search?'
params = {
'find_desc': 'Home Cleaning',
'find_loc':'North Dallas, Dallas, TX',
#'start' : ''
}
page = 0
current_page = 1
# headers
headers = {
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
}
#params['start'] = page
try:
os.remove('abx.csv')
except OSError:
pass
# custom settings
custom_settings = {
'CONCURRENT_REQUEST_PER_DOMAIN': 2,
'DOWNLOAD_DELAY': 1
}
# general crawler
def start_requests(self):
url = self.base_url + urllib.parse.urlencode(self.params)
# initial HTTP request
yield scrapy.Request(
url=url,
headers=self.headers,
callback=self.parse_listing
)
def parse_listing(self, response):
lists = response.css('h4[class="css-1l5lt1i"]')
for link in lists:
link = link.css('a::attr(href)').get()
link = 'https://www.yelp.com/' + link
#print('\n\nlink:',link,'\n\n')
yield response.follow(link, headers = self.headers, callback = self.parse_cards)
break
try:
#self.params['start'] = self.page
try:
total_pages = response.css('.text-align--center__09f24__1P1jK .css-e81eai::text').get()[5:7]
print(total_pages)
self.page +=10
self.current_page +=1
except Exception as e:
total_pages = 1
print('totl:',total_pages)
print('PAGE %s | %s ' % (self.current_page, total_pages))
if int(self.page/10) <= int(total_pages):
self.log('\n\n %s | %s\n\n ' %(self.page/10, total_pages))
next_page = response.url + '&start=' + str(self.page)
yield response.follow(url = next_page, headers = self.headers, callback = self.parse_listing)
except:
print('only single page',self.current_page)
def parse_cards(self,response):
print('\nok\n')
# main driver
if __name__ == '__main__':
# run scraper
process = CrawlerProcess()
process.crawl(Yelp)
process.start()
#Yelp.parse_cards(Yelp, '')
I applied try and except method also but cann't done the job.
The main problem is in the next page with the param '&start=' if i increment the start to 10 in every time then the url become every time like this
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=10&start=20&start=30'
and so on i want to only the url start will increment to start=10 and after them start=20 and so on.
like this
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=20'
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=30'
and so on.

Just find the link to the next page and follow that
next_page = response.css("a.next-link::attr(href)").get()
if next_page:
yield response.follow(next_page, callback=self.parse)
This is pretty similar to what is done in the scrapy tutorial, have you followed that? Was there a reason you couldn't do it this way?
In the end your entire spider can become
from scrapy import Spider
class Yelp(Spider):
# scraper name
name = "home business"
start_urls = [
"https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX"
]
def parse(self, response):
for link in response.css("h4 > span > a"):
yield response.follow(link, callback=self.parse_cards)
next_page = response.css("a.next-link::attr(href)").get()
if next_page:
yield response.follow(next_page, callback=self.parse)
def parse_cards(self, response):
print("parse_cards", response.url)
I removed the start_requests stuff to keep it simple for this example (something you should probably try to do when asking questions)

Is there any way of getting an output of all the header links because iv got none and no error as well

Tried using beautiful soup for Scraping header links out of Bing but I don't get any errors nor output.
from bs4 import BeautifulSoup
import requests
search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
results = soup.find("ol", {"id": "b_results"})
links = soup.findAll("li", {"class": "b_algo"})
for item in links:
item_text = item.find("a").text
item_href = item.find("a").attrs["href"]
if item_text and item_href:
print(item_text)
print(item_href)

Try to specify User-Agent HTTP header to obtain the results:
import requests
from bs4 import BeautifulSoup
url = 'https://www.bing.com/search'
params = {'q': 'tree'}
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}
soup = BeautifulSoup(requests.get(url, headers=headers, params=params).content, 'html.parser')
for a in soup.select('.b_algo a'):
print(a.text, a['href'])
Prints:
tree｜好きな物語と出逢えるサイト https://tree-novel.com/
sustainably stylish home furniture Hong Kong | TREE https://tree.com.hk/
Chairs & Benches https://tree.com.hk/furniture/chairs-benches
Desks https://tree.com.hk/furniture/desks
Living Room https://tree.com.hk/rooms/living-room
Bedroom https://tree.com.hk/rooms/bedroom
Finishing Touches https://tree.com.hk/furniture/finishing-touches
Entryway https://tree.com.hk/rooms/entryway
Tree | Definition of Tree by Merriam-Webster https://www.merriam-webster.com/dictionary/tree
Tree | Definition of Tree at Dictionary.com https://www.dictionary.com/browse/tree
tree | Structure, Uses, Importance, & Facts | Britannica https://www.britannica.com/plant/tree
Tree Images · Nature Photography · Free Photos from Pexels ... https://www.pexels.com/search/tree/

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Web scraping news page with a "load more" - web-scraping

Related

Google Scholar profile scraping

Scrape Text Between Two Strong Tags

I'm having difficulty using Beautiful Soup to scrape data from an NCBI website

I want to go to the all the pages of yelp webiste and extract data from

Is there any way of getting an output of all the header links because iv got none and no error as well

Categories

Resources