How to scrape url links when the website takes us to a splash screen?

import requests
from bs4 import BeautifulSoup
import re

R = []
url = "https://ascscotties.com/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; '
                         'Intel Mac OS X 10.6; rv:16.0) Gecko/20100101 Firefox/16.0'}
reqs = requests.get(url, headers=headers)
soup = BeautifulSoup(reqs.text, 'html.parser')
links = soup.find_all('a', href=re.compile("roster"))
s = [url + link.get("href") for link in links]
for i in s:
    r = requests.get(i, allow_redirects=True, headers=headers)
    if r.status_code < 400:
        R.append(r.url)
Output
['https://ascscotties.com/sports/womens-basketball/roster',
'https://ascscotties.com/sports/womens-cross-country/roster',
'https://ascscotties.com/sports/womens-soccer/roster',
'https://ascscotties.com/sports/softball/roster',
'https://ascscotties.com/sports/womens-tennis/roster',
'https://ascscotties.com/sports/womens-volleyball/roster']
The code looks for roster links on a URL and gives the output above, but for a site like "https://auyellowjackets.com/" it fails because the URL takes us to a splash screen first. What can be done?

The site uses a cookie to indicate it has shown a splash screen before. So set it to get to the main page:
import re
import requests
from bs4 import BeautifulSoup
R = []
url = "https://auyellowjackets.com"
cookies = {"splash_2": "splash_2"} # <--- set cookie
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; "
                  "Intel Mac OS X 10.6; rv:16.0) Gecko/20100101 Firefox/16.0"
}
reqs = requests.get(url, headers=headers, cookies=cookies)
soup = BeautifulSoup(reqs.text, "html.parser")
links = soup.find_all("a", href=re.compile("roster"))
s = [url + link.get("href") for link in links]
for i in s:
    r = requests.get(i, allow_redirects=True, headers=headers)
    if r.status_code < 400:
        R.append(r.url)
print(*R, sep="\n")
Prints:
https://auyellowjackets.com/sports/mens-basketball/roster
https://auyellowjackets.com/sports/mens-cross-country/roster
https://auyellowjackets.com/sports/football/roster
https://auyellowjackets.com/sports/mens-track-and-field/roster
https://auyellowjackets.com/sports/mwrest/roster
https://auyellowjackets.com/sports/womens-basketball/roster
https://auyellowjackets.com/sports/womens-cross-country/roster
https://auyellowjackets.com/sports/womens-soccer/roster
https://auyellowjackets.com/sports/softball/roster
https://auyellowjackets.com/sports/womens-track-and-field/roster
https://auyellowjackets.com/sports/volleyball/roster
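The two scripts above differ only in the start url and the cookie, so they can be folded into one helper. A minimal sketch, using urllib.parse.urljoin so base URLs with and without a trailing slash both work:
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; "
                         "Intel Mac OS X 10.6; rv:16.0) Gecko/20100101 Firefox/16.0"}

def roster_links(url, cookies=None):
    # Fetch the page, passing the splash cookie if the site needs one
    reqs = requests.get(url, headers=HEADERS, cookies=cookies)
    soup = BeautifulSoup(reqs.text, "html.parser")
    found = []
    for link in soup.find_all("a", href=re.compile("roster")):
        # urljoin avoids the doubled slash when url ends with "/"
        r = requests.get(urljoin(url, link["href"]),
                         allow_redirects=True, headers=HEADERS, cookies=cookies)
        if r.status_code < 400:
            found.append(r.url)
    return found

print(*roster_links("https://ascscotties.com/"), sep="\n")
print(*roster_links("https://auyellowjackets.com",
                    cookies={"splash_2": "splash_2"}), sep="\n")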

Related

Scraping an href

I was wondering if someone could help me scrape an href tag and clean it up. I am trying to scrape the url from the big "Visit Website" button on this page: https://www.goodfirms.co/software/inflow-inventory, and then clean it up a little bit.
Code:
import time
import requests
from bs4 import BeautifulSoup as bs

url = 'https://www.goodfirms.co/software/inflow-inventory'
page = requests.get(url)
time.sleep(2)
soup = bs(page.content, 'lxml')
try:
    url = soup.find("div", class_="entity-detail-header-visit-website")
except AttributeError:
    url = "Couldn't Find"
print(url)
Output:
<div class="entity-detail-header-visit-website">
<a class="visit-website-btn" href="https://www.inflowinventory.com/?utm_source=goodfirms&utm_medium=profile" rel="nofollow" target="_blank">Visit website</a>
</div>
Desired Output:
https://www.inflowinventory.com
This will get you what you need:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
r = requests.get('https://www.goodfirms.co/software/inflow-inventory', headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
link = soup.select_one('a.visit-website-btn')
print(link['href'].split('/?utm')[0])
Result:
https://www.inflowinventory.com
Documentation for BeautifulSoup can be found at:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
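If the tracking parameters ever change, splitting on the fixed '/?utm' substring breaks; splitting on '?' alone is a slightly more general sketch:
print(link['href'].split('?')[0].rstrip('/'))
# https://www.inflowinventory.com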
Try this code to get the href value:
url = soup.find("a", class_="visit-website-btn").get('href')
Having the complete URL, you can get the base with:
from urllib.parse import urlsplit
print(urlsplit(url).netloc)
# www.inflowinventory.com
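Note that urlsplit(url).netloc drops the scheme; if you want the https:// prefix from the desired output, rebuild it from the parts:
from urllib.parse import urlsplit

parts = urlsplit(url)
print(f"{parts.scheme}://{parts.netloc}")
# https://www.inflowinventory.com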
"div", class_="entity-detail-header-visit-website" detects the same url two times with html content. So .a.get('href') with find() method will pull the righ url
import requests
from bs4 import BeautifulSoup
url = 'https://www.goodfirms.co/software/inflow-inventory'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')
link = soup.find("div", class_="entity-detail-header-visit-website").a.get('href')
print(link)
Output:
https://www.inflowinventory.com/?utm_source=goodfirms&utm_medium=profile
If you are looking for a solution that keeps the structure of your original code, it looks like this:
import requests
from bs4 import BeautifulSoup

url = 'https://www.goodfirms.co/software/inflow-inventory'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')
try:
    url = soup.find("div", class_="entity-detail-header-visit-website")
    print(url.a.get('href'))
except AttributeError:
    url = "Couldn't Find"
    print(url)
Result:
https://www.inflowinventory.com/?utm_source=goodfirms&utm_medium=profile
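To match the desired output exactly, this can be combined with the query-stripping shown earlier:
print(url.a.get('href').split('?')[0].rstrip('/'))
# https://www.inflowinventory.com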

Is there any way of getting an output of all the header links? Because I've got none, and no error as well

Tried using Beautiful Soup for scraping header links out of Bing, but I get no output and no errors either.
from bs4 import BeautifulSoup
import requests

search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
results = soup.find("ol", {"id": "b_results"})
links = soup.findAll("li", {"class": "b_algo"})
for item in links:
    item_text = item.find("a").text
    item_href = item.find("a").attrs["href"]
    if item_text and item_href:
        print(item_text)
        print(item_href)
Try specifying a User-Agent HTTP header to obtain the results:
import requests
from bs4 import BeautifulSoup

url = 'https://www.bing.com/search'
params = {'q': 'tree'}
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}
soup = BeautifulSoup(requests.get(url, headers=headers, params=params).content, 'html.parser')
for a in soup.select('.b_algo a'):
    print(a.text, a['href'])
Prints:
tree|好きな物語と出逢えるサイト https://tree-novel.com/
sustainably stylish home furniture Hong Kong | TREE https://tree.com.hk/
Chairs & Benches https://tree.com.hk/furniture/chairs-benches
Desks https://tree.com.hk/furniture/desks
Living Room https://tree.com.hk/rooms/living-room
Bedroom https://tree.com.hk/rooms/bedroom
Finishing Touches https://tree.com.hk/furniture/finishing-touches
Entryway https://tree.com.hk/rooms/entryway
Tree | Definition of Tree by Merriam-Webster https://www.merriam-webster.com/dictionary/tree
Tree | Definition of Tree at Dictionary.com https://www.dictionary.com/browse/tree
tree | Structure, Uses, Importance, & Facts | Britannica https://www.britannica.com/plant/tree
Tree Images · Nature Photography · Free Photos from Pexels ... https://www.pexels.com/search/tree/
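Note that .b_algo a also picks up the sitelinks nested under each result (the tree.com.hk entries above). If you only want the main header links, you can narrow the selector to the result headings — a sketch that assumes Bing keeps each result title in an h2:
for a in soup.select('.b_algo h2 a'):
    print(a.text, a['href'])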

Accessing websites in a dropdown list

I'm trying to build a web scraper that visits school district websites and retrieves the names and websites of the schools. I'm using https://www.dallasisd.org/ to test the code below.
I'm currently stuck on how to 1) only access the dropdown list of 'Schools' and 2) retrieve the links in the <li> tags in the same dropdown.
Any help would be much appreciated! Thank you.
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib.request
import requests
import re
import xlwt
import pandas as pd
import xlrd
from xlutils.copy import copy
import os.path
hdr = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
browser = webdriver.Chrome()
url = 'https://www.dallasisd.org/'
browser.get(url)
html_source = browser.page_source
browser.quit()
soup = BeautifulSoup(html_source, "lxml")
for name_list in soup.find_all(class_='sw-dropdown-list'):
    print(name_list.text)
The dropdown lists of elementary schools are contained in the <div id="cs-elementary-schools-panel" [...]>, which you can access first and then call find_all on to obtain the links:
from bs4 import BeautifulSoup
import requests
headers = {
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
url = 'https://www.dallasisd.org/'
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
dropdown = soup.find('div', attrs={'id': "cs-elementary-schools-panel"})
for link in dropdown.find_all('li', attrs={'class': "cs-panel-item"}):
    print("Url: https://www.dallasisd.org" + link.find('a')['href'])
You can easily extend this code to the Middle and High schools, as sketched below.
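A hedged sketch of that extension — the middle and high school panel ids here are assumptions patterned on the elementary one, so verify them in the page source:
# The last two panel ids are guesses; check the page source for the real values
panel_ids = ["cs-elementary-schools-panel",
             "cs-middle-schools-panel",
             "cs-high-schools-panel"]
for panel_id in panel_ids:
    panel = soup.find('div', attrs={'id': panel_id})
    if panel is None:  # skip any id that does not exist on the page
        continue
    for link in panel.find_all('li', attrs={'class': "cs-panel-item"}):
        print("Url: https://www.dallasisd.org" + link.find('a')['href'])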

web scraping using BeautifulSoup: reading tables

I'm trying to get data from a table on transfermarkt.com. I was able to get the first 25 entries with the following code. However, I need to get the rest of the entries, which are on the following pages. When I click on the second page, the url does not change.
I tried to increase the range in the for loop, but it gives an error. Any suggestion would be appreciated.
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop'
heads = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
r = requests.get(url, headers=heads)
source = r.text
soup = BeautifulSoup(source, "html.parser")
players = soup.find_all("a", {"class": "spielprofil_tooltip"})
values = soup.find_all("td", {"class": "rechts hauptlink"})
playerslist = []
valueslist = []
for i in range(0, 25):
    playerslist.append(players[i].text)
    valueslist.append(values[i].text)
df = pd.DataFrame({"Players": playerslist, "Values": valueslist})
When you click on page 2, the table is loaded by an AJAX request (visible in the browser dev tools network tab), which is why the page url does not change. Request that endpoint directly: alter the url in the loop and also change your selectors:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

players = []
values = []
headers = {'User-Agent': 'Mozilla/5.0'}

with requests.Session() as s:
    for page in range(1, 21):
        r = s.get(f'https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop?ajax=yw1&page={page}', headers=headers)
        soup = bs(r.content, 'lxml')
        players += [i.text for i in soup.select('.items .spielprofil_tooltip')]
        values += [i.text for i in soup.select('.items .rechts.hauptlink')]
df = pd.DataFrame({"Players": players, "Values": values})
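A quick sanity check after the loop: with 20 pages of 25 rows each, the frame should hold about 500 entries (assuming every page is full):
print(df.shape)  # expect roughly (500, 2)
print(df.head())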

I have a scrapy script, but I can not scrape data, don't know why

I run the script, but I get None, although there is data at the url
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector

class GetSpider(scrapy.Spider):
    name = 'gets'
    start_urls = ['https://www.retailmenot.com/coupons/insurance?u=ZTF65B5PJZEU3JDF326WY2SXOQ']

    def parse(self, response):
        s = Selector(response)
        code = s.xpath("//button[contains(@class, 'CopyCode')][1]/text()").get()
        yield {'code': code}
I expect 52YR, but I got None.
The easiest way to go about this is probably to load the JSON embedded in the page's script tag as a Python dictionary and navigate through it to get to the codes.
The below code should get you started:
import scrapy
import json
import logging

class GetSpider(scrapy.Spider):
    name = 'gets'
    start_urls = ['https://www.retailmenot.com/coupons/insurance?u=ZTF65B5PJZEU3JDF326WY2SXOQ']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    custom_settings = {'ROBOTSTXT_OBEY': False}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url,
                                 callback=self.parse,
                                 headers=self.headers,
                                 dont_filter=True)

    def parse(self, response):
        script = response.xpath(
            '//script[contains(text(), "__NEXT_DATA__")]/text()'
        ).extract_first()
        dict_start_index = script.index('{')
        dict_end_index = script.index('};') + 1
        data = json.loads(script[dict_start_index:dict_end_index])
        coupon_data = data['props']['pageProps']['serverState']['apollo']['data']
        for key, value in coupon_data.items():
            try:
                code = value['code']
            except KeyError:
                logging.debug("no code found")
            else:
                yield {'code': code}
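To try the spider without setting up a full Scrapy project, one option is to run it from a plain script with CrawlerProcess (assuming the class above is defined in the same file):
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(GetSpider)
process.start()  # blocks until the crawl finishes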
