Google Scholar profile scraping - web-scraping

I'm trying to retrieve the links to a Google Scholar user's works from their profile, but am having trouble accessing the HTML that is hidden behind the "show more" button. I would like to capture all the links for a user, but currently can only get the first 20. I'm using the following script, for reference:
from bs4 import BeautifulSoup
import requests

author_url = 'https://scholar.google.com/citations?hl=en&user=mG4imMEAAAAJ'

html_content = requests.get(author_url)
soup = BeautifulSoup(html_content.text, 'lxml')

tables = soup.find_all('table')
table = tables[1]
rows = table.find_all('tr')

links = []
for row in rows:
    t = row.find('a')
    if t is not None:
        links.append(t.get('href'))

You need to use the cstart URL parameter, which is the article offset: 0 is the first page, 10 the second, and so on. This parameter removes the need to click the "show more" button, since it does the same thing.
This parameter needs to be used in a while loop in order to paginate through all articles.
To exit the loop, one way is to check for a certain CSS selector, such as .gsc_a_e, which is assigned to the text shown when no results are present.
The great thing about this approach is that it paginates dynamically, instead of a hard-coded for i in range(), which breaks when one author has 20 articles and another has 2,550.
I'm using the SelectorGadget Chrome extension, which lets you pick CSS selectors by clicking on elements in the browser. It works great as long as the website is not heavily JS-driven.
Keep in mind that at some point you may also need a CAPTCHA solver or proxies, but only if you need to extract a lot of articles from multiple authors.
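For example, requests lets you route traffic through a proxy via its proxies argument; a minimal sketch (the proxy address below is a placeholder, not a working endpoint):

# Minimal sketch: the same request routed through a proxy with requests.
# The proxy URL is a placeholder -- substitute your own endpoint.
import requests

proxies = {
    "http": "http://user:password@proxy.example.com:8080",   # placeholder
    "https": "http://user:password@proxy.example.com:8080",  # placeholder
}

html = requests.get(
    "https://scholar.google.com/citations",
    params={"user": "mG4imMEAAAAJ", "hl": "en", "cstart": 0, "pagesize": "100"},
    proxies=proxies,
    timeout=30,
)
print(html.status_code)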
Code with the option to save to CSV using pandas:
import pandas as pd
from bs4 import BeautifulSoup
import requests, lxml, json

def bs4_scrape_articles():
    params = {
        "user": "mG4imMEAAAAJ",  # user-id
        "hl": "en",              # language
        "gl": "us",              # country to search from
        "cstart": 0,             # articles page. 0 is the first page
        "pagesize": "100"        # articles per page
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
    }

    all_articles = []
    articles_is_present = True

    while articles_is_present:
        html = requests.post("https://scholar.google.com/citations", params=params, headers=headers, timeout=30)
        soup = BeautifulSoup(html.text, "lxml")

        for article in soup.select("#gsc_a_b .gsc_a_t"):
            article_title = article.select_one(".gsc_a_at").text
            article_link = f'https://scholar.google.com{article.select_one(".gsc_a_at")["href"]}'
            article_authors = article.select_one(".gsc_a_at+ .gs_gray").text
            article_publication = article.select_one(".gs_gray+ .gs_gray").text

            all_articles.append({
                "title": article_title,
                "link": article_link,
                "authors": article_authors,
                "publication": article_publication
            })

        # this selector is checking for the class that contains: "There are no articles in this profile."
        # example link: https://scholar.google.com/citations?hl=en&user=mG4imMEAAAAJ&cstart=600
        if soup.select_one(".gsc_a_e"):
            articles_is_present = False
        else:
            params["cstart"] += 100  # paginate to the next page

    print(json.dumps(all_articles, indent=2, ensure_ascii=False))
    # pd.DataFrame(data=all_articles).to_csv(f"google_scholar_{params['user']}_articles.csv", encoding="utf-8", index=False)

bs4_scrape_articles()
Output (only the last results are shown, as there are 400+ articles):
[
  {
    "title": "Exponential family sparse coding with application to self-taught learning with text documents",
    "link": "https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:LkGwnXOMwfcC",
    "authors": "H Lee, R Raina, A Teichman, AY Ng",
    "publication": ""
  },
  {
    "title": "Visual and Range Data",
    "link": "https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:eQOLeE2rZwMC",
    "authors": "S Gould, P Baumstarck, M Quigley, AY Ng, D Koller",
    "publication": ""
  }
]
If you don't want to deal with bypassing blocks from Google or maintaining your script, have a look at the Google Scholar Author Articles API.
There's also the scholarly package, which can also extract author articles.
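For reference, a rough sketch of what that could look like with scholarly (method names follow scholarly's documented API; double-check against the version you have installed):

# Sketch using the scholarly package; verify search_author_id / fill against your installed version.
from scholarly import scholarly

author = scholarly.search_author_id("mG4imMEAAAAJ")         # look the author up by their Scholar ID
author = scholarly.fill(author, sections=["publications"])  # populate the publications list

for publication in author["publications"]:
    print(publication["bib"]["title"])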
Code that shows how to extract all author articles with the Google Scholar Author Articles API:
from serpapi import GoogleScholarSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
import os
import json

def serpapi_scrape_articles():
    params = {
        # https://docs.python.org/3/library/os.html
        "api_key": os.getenv("API_KEY"),
        "engine": "google_scholar_author",
        "hl": "en",
        "author_id": "mG4imMEAAAAJ",
        "start": "0",
        "num": "100"
    }

    search = GoogleScholarSearch(params)

    all_articles = []
    articles_is_present = True

    while articles_is_present:
        results = search.get_dict()

        for index, article in enumerate(results["articles"], start=1):
            title = article["title"]
            link = article["link"]
            authors = article["authors"]
            publication = article.get("publication")
            citation_id = article["citation_id"]

            all_articles.append({
                "title": title,
                "link": link,
                "authors": authors,
                "publication": publication,
                "citation_id": citation_id
            })

        if "next" in results.get("serpapi_pagination", {}):
            # split the next-page URL into a dict and update the search params to point at the new page
            search.params_dict.update(dict(parse_qsl(urlsplit(results["serpapi_pagination"]["next"]).query)))
        else:
            articles_is_present = False

    print(json.dumps(all_articles, indent=2, ensure_ascii=False))
    # pd.DataFrame(data=all_articles).to_csv(f"serpapi_google_scholar_{params['author_id']}_articles.csv", encoding="utf-8", index=False)

serpapi_scrape_articles()

Here is one way of obtaining that data:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from tqdm import tqdm  ## if Jupyter notebook: from tqdm.notebook import tqdm

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

big_df = pd.DataFrame()
headers = {
    'accept-language': 'en-US,en;q=0.9',
    'x-requested-with': 'XHR',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}
s = requests.Session()
s.headers.update(headers)
payload = {'json': '1'}

for x in tqdm(range(0, 500, 100)):
    url = f'https://scholar.google.com/citations?hl=en&user=mG4imMEAAAAJ&cstart={x}&pagesize=100'
    r = s.post(url, data=payload)
    soup = bs(r.json()['B'], 'html.parser')
    works = [(x.get_text(), 'https://scholar.google.com' + x.get('href')) for x in soup.select('a') if 'javascript:void(0)' not in x.get('href') and len(x.get_text()) > 7]
    df = pd.DataFrame(works, columns=['Paper', 'Link'])
    big_df = pd.concat([big_df, df], axis=0, ignore_index=True)

print(big_df)
Result in terminal:
100%
5/5 [00:03<00:00, 1.76it/s]
Paper Link
0 Latent dirichlet allocation https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:IUKN3-7HHlwC
1 On spectral clustering: Analysis and an algorithm https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:2KloaMYe4IUC
2 ROS: an open-source Robot Operating System https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:u-x6o8ySG0sC
3 Rectifier nonlinearities improve neural network acoustic models https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:gsN89kCJA0AC
4 Recursive deep models for semantic compositionality over a sentiment treebank https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:_axFR9aDTf0C
... ... ...
473 A Sparse Sampling Algorithm for Near-Optimal Planning in Large Markov Decision Processes https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:hMod-77fHWUC
474 On Discrim inative vs. Generative https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:qxL8FJ1GzNcC
475 Game Theory with Restricted Strategies https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:8k81kl-MbHgC
476 Exponential family sparse coding with application to self-taught learning with text documents https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:LkGwnXOMwfcC
477 Visual and Range Data https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:eQOLeE2rZwMC
478 rows × 2 columns
Relevant documentation: pandas (https://pandas.pydata.org/docs/), Requests (https://requests.readthedocs.io/en/latest/), BeautifulSoup (https://beautiful-soup-4.readthedocs.io/en/latest/), and tqdm (https://pypi.org/project/tqdm/).

Related

Web scraping a news page with a "load more" button

I'm trying to scrape the news website "https://inshorts.com/en/read/national", and I'm only getting results for the articles currently displayed. I need all the articles on the website that contain a given word (e.g. "COVID-19"), without having to use the "load more" button.
Here's my code, which gives the current articles:
import requests
from bs4 import BeautifulSoup
import pandas as pd

dummy_url = "https://inshorts.com/en/read/badminton"
data_dummy = requests.get(dummy_url)
soup = BeautifulSoup(data_dummy.content, 'html.parser')

urls = ["https://inshorts.com/en/read/national"]
news_data_content, news_data_title, news_data_category, news_data_time = [], [], [], []

for url in urls:
    category = url.split('/')[-1]
    data = requests.get(url)
    soup = BeautifulSoup(data.content, 'html.parser')

    news_title = []
    news_content = []
    news_category = []
    news_time = []
    for headline, article, time in zip(soup.find_all('div', class_=["news-card-title news-right-box"]),
                                       soup.find_all('div', class_=["news-card-content news-right-box"]),
                                       soup.find_all('div', class_=["news-card-author-time news-card-author-time-in-title"])):
        news_title.append(headline.find('span', attrs={'itemprop': "headline"}).string)
        news_content.append(article.find('div', attrs={'itemprop': "articleBody"}).string)
        news_time.append(time.find('span', class_=["date"]))
        news_category.append(category)

    news_data_title.extend(news_title)
    news_data_content.extend(news_content)
    news_data_category.extend(news_category)
    news_data_time.extend(news_time)

df1 = pd.DataFrame(news_data_title, columns=["Title"])
df2 = pd.DataFrame(news_data_content, columns=["Content"])
df3 = pd.DataFrame(news_data_category, columns=["Category"])
df4 = pd.DataFrame(news_data_time, columns=["time"])
df = pd.concat([df1, df2, df3, df4], axis=1)

def name():
    a = input("File Name: ")
    return a

b = name()
df.to_csv(b + ".csv")
You can use this example of how to simulate clicking the Load More button:
import re
import requests
from bs4 import BeautifulSoup

url = "https://inshorts.com/en/read/national"
api_url = "https://inshorts.com/en/ajax/more_news"

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"
}

# load first page:
html_doc = requests.get(url, headers=headers).text
min_news_id = re.search(r'min_news_id = "([^"]+)"', html_doc).group(1)

pages = 10  # <-- here I limit number of pages to 10
while pages:
    soup = BeautifulSoup(html_doc, "html.parser")

    # search the soup for your articles here
    # ...

    # here I just print the headlines:
    for headline in soup.select('[itemprop="headline"]'):
        print(headline.text)

    # load next batch of articles:
    data = requests.post(api_url, data={"news_offset": min_news_id}).json()
    html_doc = data["html"]
    min_news_id = data["min_news_id"]
    pages -= 1
Prints the news headlines of the first 10 pages:
...
Moeen has done some wonderful things in Test cricket: Root
There should be an evolution in player-media relationship: Federer
Swiggy in talks to raise over $500 mn at $10 bn valuation: Reports
Tesla investors urged to reject Murdoch, Kimbal Musk's re-election
Doctor dies on Pune-Mumbai Expressway when rolls of paper fall on his car
2 mothers name newborn girls after Cyclone Gulab in Odisha
100 US citizens, permanent residents waiting to leave Afghanistan
Iran's nuclear programme has crossed all red lines: Israeli PM
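Since the original goal was to keep only articles mentioning a certain word (e.g. "COVID-19"), you can add a filter at the "search the soup for your articles here" step. A minimal sketch, reusing the itemprop selectors from the question's own code (filter_articles is just an illustrative helper name, not part of any API):

from bs4 import BeautifulSoup

def filter_articles(html_doc, keyword="COVID-19"):
    # Return (headline, body) pairs whose body mentions the keyword.
    soup = BeautifulSoup(html_doc, "html.parser")
    headlines = soup.select('[itemprop="headline"]')
    bodies = soup.select('[itemprop="articleBody"]')
    return [
        (h.get_text(strip=True), b.get_text(strip=True))
        for h, b in zip(headlines, bodies)
        if keyword.lower() in b.get_text().lower()
    ]

Call filter_articles(html_doc) inside the while loop above and extend a list with its results instead of printing every headline.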

I'm having difficulty using Beautiful Soup to scrape data from an NCBI website

I can't for the life of me figure out how to use beautiful soup to scrape the isolation source information from web pages such as this:
https://www.ncbi.nlm.nih.gov/nuccore/JOKX00000000.2/
I keep trying to check whether that tag exists, and it keeps returning that it doesn't, when I know for a fact it does. If I can't even verify it exists, I'm not sure how to scrape it.
Thanks!
You shouldn't scrape the NCBI when there is the NCBI E-utilities web service:
wget -q -O - "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=JOKX00000000.2&rettype=gb&retmode=xml" | xmllint --xpath '//GBQualifier[GBQualifier_name="isolation_source"]/GBQualifier_value/text()' - && echo
Type II sourdough
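If you prefer to stay in Python, the same efetch request can be made with requests and parsed with the standard library; a sketch mirroring the wget/xmllint command above:

# Sketch: the same E-utilities efetch call from Python, extracting isolation_source.
import requests
import xml.etree.ElementTree as ET

params = {
    "db": "nuccore",
    "id": "JOKX00000000.2",
    "rettype": "gb",
    "retmode": "xml",
}
r = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi", params=params, timeout=30)
root = ET.fromstring(r.content)

# Same idea as the //GBQualifier[...] XPath used with xmllint above.
for qualifier in root.iter("GBQualifier"):
    if qualifier.findtext("GBQualifier_name") == "isolation_source":
        print(qualifier.findtext("GBQualifier_value"))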
The data is loaded from an external URL. To get isolation_source, you can use this example:
import re
import requests
from bs4 import BeautifulSoup

url = "https://www.ncbi.nlm.nih.gov/nuccore/JOKX00000000.2/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

ncbi_uidlist = soup.select_one('[name="ncbi_uidlist"]')["content"]

api_url = "https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi"
params = {
    "id": ncbi_uidlist,
    "db": "nuccore",
    "report": "genbank",
    "extrafeat": "null",
    "conwithfeat": "on",
    "hide-cdd": "on",
    "retmode": "html",
    "withmarkup": "on",
    "tool": "portal",
    "log$": "seqview",
    "maxdownloadsize": "1000000",
}

soup = BeautifulSoup(
    requests.get(api_url, params=params).content, "html.parser"
)

features = soup.select_one(".feature").text
isolation_source = re.search(r'isolation_source="([^"]+)"', features).group(1)

print(features)
print("-" * 80)
print(isolation_source)
Prints:
source 1..12
/organism="Limosilactobacillus reuteri"
/mol_type="genomic DNA"
/strain="TMW1.112"
/isolation_source="Type II sourdough"
/db_xref="taxon:1598"
/country="Germany"
/collection_date="1998"
--------------------------------------------------------------------------------
Type II sourdough

Is there any way of getting an output of all the header links? I've got none, and no error either

Tried using Beautiful Soup for scraping header links out of Bing, but I get no errors and no output.
from bs4 import BeautifulSoup
import requests
search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
results = soup.find("ol", {"id": "b_results"})
links = soup.findAll("li", {"class": "b_algo"})
for item in links:
    item_text = item.find("a").text
    item_href = item.find("a").attrs["href"]
    if item_text and item_href:
        print(item_text)
        print(item_href)
Try specifying the User-Agent HTTP header to obtain the results:
import requests
from bs4 import BeautifulSoup
url = 'https://www.bing.com/search'
params = {'q': 'tree'}
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}
soup = BeautifulSoup(requests.get(url, headers=headers, params=params).content, 'html.parser')
for a in soup.select('.b_algo a'):
    print(a.text, a['href'])
Prints:
tree|好きな物語と出逢えるサイト https://tree-novel.com/
sustainably stylish home furniture Hong Kong | TREE https://tree.com.hk/
Chairs & Benches https://tree.com.hk/furniture/chairs-benches
Desks https://tree.com.hk/furniture/desks
Living Room https://tree.com.hk/rooms/living-room
Bedroom https://tree.com.hk/rooms/bedroom
Finishing Touches https://tree.com.hk/furniture/finishing-touches
Entryway https://tree.com.hk/rooms/entryway
Tree | Definition of Tree by Merriam-Webster https://www.merriam-webster.com/dictionary/tree
Tree | Definition of Tree at Dictionary.com https://www.dictionary.com/browse/tree
tree | Structure, Uses, Importance, & Facts | Britannica https://www.britannica.com/plant/tree
Tree Images · Nature Photography · Free Photos from Pexels ... https://www.pexels.com/search/tree/

web scraping using BeautifulSoup: reading tables

I'm trying to get data from a table on transfermarkt.com. I was able to get the first 25 entries with the following code. However, I need to get the rest of the entries, which are on the following pages. When I click on the second page, the URL does not change.
I tried to increase the range in the for loop, but it gives an error. Any suggestions would be appreciated.
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop'
heads = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}

r = requests.get(url, headers=heads)
source = r.text
soup = BeautifulSoup(source, "html.parser")

players = soup.find_all("a", {"class": "spielprofil_tooltip"})
values = soup.find_all("td", {"class": "rechts hauptlink"})

playerslist = []
valueslist = []
for i in range(0, 25):
    playerslist.append(players[i].text)
    valueslist.append(values[i].text)

df = pd.DataFrame({"Players": playerslist, "Values": valueslist})
Alter the URL in the loop and also change your selectors:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

players = []
values = []
headers = {'User-Agent': 'Mozilla/5.0'}

with requests.Session() as s:
    for page in range(1, 21):
        r = s.get(f'https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop?ajax=yw1&page={page}', headers=headers)
        soup = bs(r.content, 'lxml')
        players += [i.text for i in soup.select('.items .spielprofil_tooltip')]
        values += [i.text for i in soup.select('.items .rechts.hauptlink')]

df = pd.DataFrame({"Players": players, "Values": values})

Web Scraping Underlying Data from Online Interactive Map

I am trying to get the underlying data from the interactive map on this website: https://www.sabrahealth.com/properties
I tried using the Inspect feature on Google Chrome to find the XHR file that would hold the locations of all the points on the map but nothing appeared. Is there another way to extract the location data from this map?
Well, the location data is available to download on their site. But let's assume you want the actual latitude and longitude values to do some analysis.
The first thing I would do is exactly what you did (look for the XHR). If I can't find anything there, the second thing I always do is search the HTML for the <script> tags. Sometimes the data is "hiding" in there. It takes a little more detective work and doesn't always yield results, but it does in this case.
If you look within the <script> tags, you'll find the relevant JSON. Then you can just work with that. It's just a matter of finding it, manipulating the string into valid JSON, and feeding it into json.loads().
import requests
import bs4
import json

url = 'https://www.sabrahealth.com/properties'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
response = requests.get(url, headers=headers)

soup = bs4.BeautifulSoup(response.text, 'html.parser')
scripts = soup.find_all('script')
for script in scripts:
    if 'jQuery.extend(Drupal.settings,' in script.text:
        jsonStr = script.text.split('jQuery.extend(Drupal.settings,')[1]
        jsonStr = jsonStr.rsplit(');', 1)[0]
        jsonObj = json.loads(jsonStr)

        for each in jsonObj['gmap']['auto1map']['markers']:
            name = each['markername']
            lat = each['latitude']
            lon = each['longitude']

            soup = bs4.BeautifulSoup(each['text'], 'html.parser')
            prop_type = soup.find('i', {'class': 'property-type'}).text.strip()
            sub_cat = soup.find('span', {'class': 'subcat'}).text.strip()
            location = soup.find('span', {'class': 'subcat'}).find_next('p').text.split('\n')[0]

            print('Type: %s\nSubCat: %s\nLat: %s\nLon: %s\nLocation: %s\n' % (prop_type, sub_cat, lat, lon, location))
Output:
Type: Senior Housing - Leased
SubCat: Assisted Living
Lat: 38.3309
Lon: -85.862521
Location: Floyds Knobs, Indiana
Type: Skilled Nursing/Transitional Care
SubCat: SNF
Lat: 29.719507
Lon: -99.06649
Location: Bandera, Texas
Type: Skilled Nursing/Transitional Care
SubCat: SNF
Lat: 37.189079
Lon: -77.376015
Location: Petersburg, Virginia
Type: Skilled Nursing/Transitional Care
SubCat: SNF
Lat: 37.759998
Lon: -122.254616
Location: Alameda, California
...
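If you want the marker data in tabular form for further analysis, you could collect the same fields into a pandas DataFrame instead of printing them. A sketch built on the loop above (the column names are my own choice):

# Sketch: same Drupal.settings extraction as above, collected into a DataFrame.
import json
import bs4
import pandas as pd
import requests

url = 'https://www.sabrahealth.com/properties'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
soup = bs4.BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')

rows = []
for script in soup.find_all('script'):
    if 'jQuery.extend(Drupal.settings,' not in script.text:
        continue
    jsonStr = script.text.split('jQuery.extend(Drupal.settings,')[1].rsplit(');', 1)[0]
    jsonObj = json.loads(jsonStr)
    for each in jsonObj['gmap']['auto1map']['markers']:
        marker_soup = bs4.BeautifulSoup(each['text'], 'html.parser')
        rows.append({
            'name': each['markername'],
            'latitude': each['latitude'],
            'longitude': each['longitude'],
            'type': marker_soup.find('i', {'class': 'property-type'}).text.strip(),
            'subcat': marker_soup.find('span', {'class': 'subcat'}).text.strip(),
        })

df = pd.DataFrame(rows)
# df.to_csv('sabra_properties.csv', index=False)
print(df.head())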
