I have written code which scrapes all the links from the web; however, I would like to scrape only the 'Latest update' links from https://www.bbc.com/news/coronavirus.
This is what I have tried:
from bs4 import BeautifulSoup
import requests

def get_links(url):
    response = requests.get(url)
    data = response.text
    soup = BeautifulSoup(data, 'lxml')
    links = []
    for link in soup.find_all('a'):
        link_url = link.get('href')
        if link_url is not None and link_url.startswith('/news'):
            links.append(link_url + '\n')
    write_to_file(links)
    return links

def write_to_file(links):
    with open('data.txt', 'a') as f:
        f.writelines(links)

def get_all_links(url):
    for link in get_links(url):
        get_all_links(link)

r = 'https://www.bbc.com/news/coronavirus'
write_to_file([r])
get_all_links(r)
Instead of walking every anchor on the page, first narrow the soup down to the element that wraps the 'Latest update' section:

soup = BeautifulSoup(data, 'lxml')
latest = soup.find(class_="class_of_the_div_wrapping_latest_news")

Then you can go on with:

links = []
for link in latest.find_all('a'):
    link_url = link.get('href')
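Putting it together for the BBC page, a minimal sketch could look like the following. Note that "class_of_the_div_wrapping_latest_news" is only a placeholder: inspect the 'Latest update' block in the browser and substitute the real class (or id) of the element that wraps it.

from bs4 import BeautifulSoup
import requests

def get_latest_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    # placeholder class name -- replace with the actual class of the element
    # wrapping the 'Latest update' section on the BBC page
    latest = soup.find(class_="class_of_the_div_wrapping_latest_news")
    links = []
    if latest is not None:
        for link in latest.find_all('a'):
            link_url = link.get('href')
            if link_url is not None and link_url.startswith('/news'):
                # hrefs on the page are relative, so prepend the site root
                links.append('https://www.bbc.com' + link_url)
    return links

print(get_latest_links('https://www.bbc.com/news/coronavirus'))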
Hi there, I built a scraper using the Scrapy framework. It works on the first page perfectly but fails to get the same data from the next pages, even after I wrote code to crawl to the next page. What am I getting wrong in my code? My items.py file is working fine too.
Here's my code
import scrapy
from amazonscraper.items import AmazonscraperItem
from scrapy.loader import ItemLoader

class AmazonspiderSpider(scrapy.Spider):
    name = 'amazonspider'
    allowed_domains = ['amazon.com']
    start_urls = ['https://www.amazon.com/s?i=fashion-womens-intl-ship&bbn=16225018011&rh=n%3A16225018011%2Cn%3A1040660%2Cn%3A1045024&pd_rd_r=2da30763-bfe6-4a38-b17a-77236fa718c5&pd_rd_w=JtaUW&pd_rd_wg=BtgRm&pf_rd_p=6a92dcea-e071-4bb9-866a-369bc067390d&pf_rd_r=86NBFKV4TA7CCSEVNBM7&qid=1671522114&rnid=1040660&ref=sr_pg_1']

    def parse(self, response):
        products = response.css('div.sg-col-4-of-12')
        for product in products:
            l = ItemLoader(item=AmazonscraperItem(), selector=product)
            l.add_css('name', 'a.a-link-normal span.a-size-base-plus')
            l.add_css('price', 'span.a-price span.a-offscreen')
            l.add_css('review', 'i.a-icon span.a-icon-alt')
            yield l.load_item()

        next_page = response.xpath('//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div[52]/div/div/span/a/@href').get()
        if next_page is not None:
            next_page_url = 'https://www.amazon.com' + next_page
            yield response.follow(next_page_url, callback=self.parse)
Here's my AmazonScraperItem
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags

class AmazonscraperItem(scrapy.Item):
    name = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    price = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    review = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
I have fixed the issue; there were a couple of technical errors in the code. First, I updated the next-page selector so it picks up the correct URL. Secondly, we don't need to prepend the domain to the URL before sending the request, because you are using response.follow, which automatically converts a relative URL into an absolute one. The code below works across multiple pages (the full pagination).
class AmazonspiderSpider(scrapy.Spider):
    name = 'amazonspider'
    allowed_domains = ['amazon.com']
    start_urls = ['https://www.amazon.com/s?i=fashion-womens-intl-ship&bbn=16225018011&rh=n%3A16225018011%2Cn%3A1040660%2Cn%3A1045024&pd_rd_r=2da30763-bfe6-4a38-b17a-77236fa718c5&pd_rd_w=JtaUW&pd_rd_wg=BtgRm&pf_rd_p=6a92dcea-e071-4bb9-866a-369bc067390d&pf_rd_r=86NBFKV4TA7CCSEVNBM7&qid=1671522114&rnid=1040660&ref=sr_pg_1']

    def parse(self, response):
        products = response.css('div.sg-col-4-of-12')
        for product in products:
            l = ItemLoader(item=AmazonscraperItem(), selector=product)
            l.add_css('name', 'a.a-link-normal span.a-size-base-plus')
            l.add_css('price', 'span.a-price span.a-offscreen')
            l.add_css('review', 'i.a-icon span.a-icon-alt')
            yield l.load_item()

        next_page = response.css('.s-pagination-next ::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
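As a side note, on recent Scrapy versions (2.0+) the last three lines of parse can be collapsed with response.follow_all, which simply yields no request when the selector matches nothing; a small sketch of that variant:

        # inside parse(), replacing the next_page/if block above
        yield from response.follow_all(css='.s-pagination-next ::attr(href)', callback=self.parse)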
I am doing web scraping using BeautifulSoup in Python 3.7. The code below successfully scrapes the date, title, and tags, but not the content of the articles; it gives None instead.
import time
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag

url = 'https://www.thehindu.com/search/?q=cybersecurity&order=DESC&sort=publishdate&ct=text&page={}'

pages = 32

for page in range(4, pages+1):
    res = requests.get(url.format(page))
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.find_all("a", {"class": "story-card75x1-text"}, href=True):
        _href = item.get("href")
        try:
            resp = requests.get(_href)
        except Exception as e:
            try:
                resp = requests.get("https://www.thehindu.com" + _href)
            except Exception as e:
                continue

        dateTag = soup.find("span", {"class": "dateline"})
        sauce = BeautifulSoup(resp.text, "lxml")
        tag = sauce.find("a", {"class": "section-name"})
        titleTag = sauce.find("h1", {"class": "title"})
        contentTag = sauce.find("div", {"class": "_yeti_done"})

        date = None
        tagName = None
        title = None
        content = None

        if isinstance(dateTag, Tag):
            date = dateTag.get_text().strip()
        if isinstance(tag, Tag):
            tagName = tag.get_text().strip()
        if isinstance(titleTag, Tag):
            title = titleTag.get_text().strip()
        if isinstance(contentTag, Tag):
            content = contentTag.get_text().strip()

        print(f'{date}\n {tagName}\n {title}\n {content}\n')
        time.sleep(3)
I don't see where the problem is, as I am using the correct class in contentTag.
Thanks.
I guess the links you would like to follow from the first page to their inner pages end with .ece. I've applied that logic within the script to traverse those target pages and scrape data from them. I've also defined the selector for the content slightly differently, and it now appears to work correctly. The following script only scrapes data from page 1; feel free to change it as per your requirement.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://www.thehindu.com/search/?q=cybersecurity&order=DESC&sort=publishdate&ct=text&page=1'
base = "https://www.thehindu.com"

res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")
for item in soup.select(".story-card-news a[href$='.ece']"):
    resp = requests.get(urljoin(base, item.get("href")))
    sauce = BeautifulSoup(resp.text, "lxml")
    title = item.get_text(strip=True)
    content = ' '.join([item.get_text(strip=True) for item in sauce.select("[id^='content-body-'] p")])
    print(f'{title}\n {content}\n')
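If you later want to cover more result pages, one possible extension (an untested sketch reusing the page={} template and the 3-second pause from your original script) is to wrap the same logic in a loop:

import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

search_url = 'https://www.thehindu.com/search/?q=cybersecurity&order=DESC&sort=publishdate&ct=text&page={}'
base = "https://www.thehindu.com"

for page in range(1, 33):  # 32 result pages, as in the original script
    res = requests.get(search_url.format(page))
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".story-card-news a[href$='.ece']"):
        resp = requests.get(urljoin(base, item.get("href")))
        sauce = BeautifulSoup(resp.text, "lxml")
        title = item.get_text(strip=True)
        content = ' '.join(p.get_text(strip=True) for p in sauce.select("[id^='content-body-'] p"))
        print(f'{title}\n {content}\n')
    time.sleep(3)  # pause between result pages to be polite to the server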
I am collecting the date, headline, and content from USA Today articles. I am able to get the date, headline, and even the content, but along with the content I get some unwanted stuff. I don't know what I should change in my code to get only the content (the article body).
import time
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag

url = 'https://www.usatoday.com/search/?q=cybersecurity&page={}'

pages = 72

for page in range(1, pages+1):
    res = requests.get(url.format(page))
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.find_all("a", {"class": "gnt_se_a"}, href=True):
        _href = item.get("href")
        try:
            resp = requests.get(_href)
        except Exception as e:
            try:
                resp = requests.get("https://www.usatoday.com" + _href)
            except Exception as e:
                continue

        sauce = BeautifulSoup(resp.text, "lxml")
        dateTag = sauce.find("span", {"class": "asset-metabar-time asset-metabar-item nobyline"})
        titleTag = sauce.find("h1", {"class": "asset-headline speakable-headline"})
        contentTag = sauce.find("div", {"class": "asset-double-wide double-wide p402_premium"})

        date = None
        title = None
        content = None

        if isinstance(dateTag, Tag):
            date = dateTag.get_text().strip()
        if isinstance(titleTag, Tag):
            title = titleTag.get_text().strip()
        if isinstance(contentTag, Tag):
            content = contentTag.get_text().strip()

        print(f'{date}\n {title}\n {content}\n')
        time.sleep(3)
I am expecting date, headline, and content from each article.
I tried finding the content with

contentTag = sauce.find_all('p', {"class": "p-text"})

and the condition for the content is

if isinstance(contentTag, list):
    content = []
    for c in contentTag:
        content.append(c.get_text().strip())
    content = ' '.join(content)

It works.
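Folded back into the script, the per-article part could be factored into a small helper like the one below (scrape_article is just an illustrative name; the class names are the same ones used above):

import requests
from bs4 import BeautifulSoup
from bs4.element import Tag

def scrape_article(article_url):
    """Return (date, title, content) for a single USA Today article URL."""
    resp = requests.get(article_url)
    sauce = BeautifulSoup(resp.text, "lxml")
    dateTag = sauce.find("span", {"class": "asset-metabar-time asset-metabar-item nobyline"})
    titleTag = sauce.find("h1", {"class": "asset-headline speakable-headline"})
    # collect the individual article paragraphs instead of the wide wrapper div
    paragraphs = sauce.find_all('p', {"class": "p-text"})

    date = dateTag.get_text().strip() if isinstance(dateTag, Tag) else None
    title = titleTag.get_text().strip() if isinstance(titleTag, Tag) else None
    content = ' '.join(p.get_text().strip() for p in paragraphs) if paragraphs else None
    return date, title, content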
I am working on my first web scrape and, thanks to the help of Stack Overflow, I have managed to put the following code together. The code works well to click through each of the pages and then go into each of the links to pull the info I need. However, it gets stuck at one of these links, as there is no website info to pull.
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import time

binary = FirefoxBinary('geckodriver.exe')
driver = webdriver.Firefox()
driver.get('http://www.interzum.com/exhibitors-and-products/exhibitor-index/exhibitor-index-15.php')

url = 'http://www.interzum.com/exhibitors-and-products/exhibitor-index/exhibitor-index-15.php'
text = requests.get(url).text
page1 = BeautifulSoup(text, "html.parser")

def get_data(url, tries=0, max_tries=3):
    text_test2 = requests.get(url).text
    page2 = BeautifulSoup(text_test2, "html.parser")
    try:
        title = page2.find('h1', attrs={'class': 'hl_2'}).text
        content = page2.find('div', attrs={'class': 'cont'}).text
        phone = page2.find('div', attrs={'class': 'sico ico_phone'}).text
        email_div = page2.find('div', attrs={'class': 'sico ico_email'})
        email = email_div.find('a', attrs={'class': 'xsecondarylink'})['href']
        web_div = page2.find('div', attrs={'class': 'sico ico_link'})
        web = web_div.find('a', attrs={'class': 'xsecondarylink'})
        if web != None:
            web = web['href']
    except:
        if tries < max_tries:
            tries += 1
            print("try {}".format(tries))
            return get_data(url, tries)
    data = {'Name': [title],
            'Street address': [content],
            'Phone number': [phone],
            'Email': [email],
            'Web': [web]
            }
    return pd.DataFrame(data=data)

df = pd.DataFrame()
for i in range(0, 80):
    print(i)
    page1 = BeautifulSoup(driver.page_source, 'html.parser')
    for div in page1.findAll('div', attrs={'class': 'item'}):
        for a in div.findAll('a', attrs={'class': 'initial_noline'}):
            if 'kid=' not in a['href']: continue
            print('http://www.interzum.com' + a['href'])
            data = get_data('http://www.interzum.com' + a['href'])
            df = pd.concat([df, data])
    next_button = driver.find_element_by_class_name('slick-next')
    next_button.click()
    time.sleep(20)

df.to_csv('results.csv')
I have tried numerous different ways of saying: if the website doesn't exist then continue looping, and if it does then pull the href. But I keep getting this error message:
UnboundLocalError: local variable 'web' referenced before assignment
I can't seem to put this together correctly with the info out there on the web. Any insights into what I am doing incorrectly would be really appreciated!
Thank you all in advance.
I think you need to switch to Selenium for retrieving the info from the individual exhibitor pages, as the content is not always loaded for requests. You can use the following as a framework.
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()

baseLink = 'http://www.interzum.com'
varLink = 'http://www.interzum.com/exhibitors-and-products/exhibitor-index/exhibitor-index-15.php?fw_goto=aussteller/blaettern&fw_ajax=1&paginatevalues=%7B%22stichwort%22%3A%22%22%7D&start={}&dat=231518http://www.interzum.com/exhibitors-and-products/exhibitor-index/exhibitor-index-15.php'
startUrl = 'http://www.interzum.com/exhibitors-and-products/exhibitor-index/exhibitor-index-15.php'

resultsPerPage = 20
i = 0
headers = {'User-Agent': 'Mozilla/5.0'}
results = []
final = []

with requests.Session() as s:
    r = s.get(startUrl, headers=headers)
    soup = bs(r.content, 'lxml')
    numPages = int(soup.select('a[rel=next]')[-2].text)
    links = list((baseLink + link['href'] for link in soup.select('[href*="fw_goto=aussteller/details&&kid="]')))
    results.append(links)

    for j in range(1, numPages):
        i += 20
        url = varLink.format(i)
        r = s.get(url, headers=headers)
        soup = bs(r.content, 'lxml')
        links = list((baseLink + link['href'] for link in soup.select('[href*="fw_goto=aussteller/details&&kid="]')))
        results.append(links)

totalList = [item for sublist in results for item in sublist]

for link in totalList:
    driver.get(link)
    try:
        title = driver.find_element_by_css_selector('h1.hl_2').text
        content = driver.find_element_by_css_selector('div.cont').text
        phone = driver.find_element_by_css_selector('div.sico.ico_phone').text
        email = driver.find_element_by_css_selector('div.sico.ico_email a.xsecondarylink').get_attribute('href')
        web = driver.find_element_by_css_selector('div.sico.ico_link a.xsecondarylink').get_attribute('href')
        final.append([title, content, phone, email, web])
    except Exception as e:
        print(link)
        print(e)
        continue
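As an aside on the original error: the UnboundLocalError happens because web (and the other fields) are only assigned inside the try block, so when one of the lookups fails they are undefined by the time the DataFrame is built. A minimal sketch of one way to guard against that in your original get_data (same selectors, just with defaults set up front):

import requests
import pandas as pd
from bs4 import BeautifulSoup

def get_data(url, tries=0, max_tries=3):
    # default every field first so the DataFrame can always be built,
    # even when one of the lookups below fails
    title = content = phone = email = web = None
    page2 = BeautifulSoup(requests.get(url).text, "html.parser")
    try:
        title = page2.find('h1', attrs={'class': 'hl_2'}).text
        content = page2.find('div', attrs={'class': 'cont'}).text
        phone = page2.find('div', attrs={'class': 'sico ico_phone'}).text
        email = page2.find('div', attrs={'class': 'sico ico_email'}).find('a', attrs={'class': 'xsecondarylink'})['href']
        web_link = page2.find('div', attrs={'class': 'sico ico_link'}).find('a', attrs={'class': 'xsecondarylink'})
        if web_link is not None:
            web = web_link['href']
    except (AttributeError, TypeError):
        # an element was missing on this exhibitor page; retry a few times,
        # then fall through with whatever was collected (missing fields stay None)
        if tries < max_tries:
            return get_data(url, tries + 1, max_tries)
    return pd.DataFrame({'Name': [title], 'Street address': [content],
                         'Phone number': [phone], 'Email': [email], 'Web': [web]})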
What I am trying to do is grab the text from a particular tag within each link, and return the HTML only if the text contains certain words. For example: if the text contains "chemical" then return that link, otherwise pass.
Here is my code:
import requests
from bs4 import BeautifulSoup
import webbrowser

jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
base_url = 'https://ca.indeed.com/'
r = requests.get(url)
rcontent = r.content
prettify = BeautifulSoup(rcontent, "html.parser")
all_job_url = []

def get_all_joblinks():
    for tag in prettify.find_all('a', {'data-tn-element': "jobTitle"}):
        link = tag['href']
        all_job_url.append(link)

def filter_links():
    for eachurl in all_job_url:
        rurl = requests.get(base_url + eachurl)
        content = rurl.content
        soup = BeautifulSoup(content, "html.parser")
        summary = soup.find('td', {'class': 'snip'}).get_text()
        print(summary)

def search_job():
    while True:
        if prettify.select('div.no_results'):
            print("no job matches found")
            break
        else:
            # opens the web page of the job search if entries are found
            website = webbrowser.open_new(url)
            break

get_all_joblinks()
filter_links()
You seem to be getting all the links from a single indeed.ca page in your get_all_joblinks function. Here's how you could check whether a typical link mentions 'chemical' somewhere in the text of its body element.
>>> import requests
>>> import bs4
>>> page = requests.get('https://jobs.sanofi.us/job/-/-/507/4895612?utm_source=indeed.com&utm_campaign=sanofi%20sem%20campaign&utm_medium=job_aggregator&utm_content=paid_search&ss=paid').content
>>> soup = bs4.BeautifulSoup(page, 'lxml')
>>> body = soup.find('body').text
>>> chemical_present = body.lower().find('chemical')>-1
>>> chemical_present
True
I hope this is what you were looking for.
Edit, in response to comment.
>>> from urllib import parse
>>> import webbrowser
>>> job_type = 'engineer'
>>> location = 'Toronto'
>>> url = "https://ca.indeed.com/jobs?q=" + job_type + "&l=" + location
>>> base_url = '%s://%s' % parse.urlparse(url)[0:2]
>>> page = requests.get(url).content
>>> soup = bs4.BeautifulSoup(page, 'lxml')
>>> for link in soup.find_all('a', {'data-tn-element': "jobTitle"}):
...     job_page = requests.get(base_url + link['href']).content
...     job_soup = bs4.BeautifulSoup(job_page, 'lxml')
...     body = job_soup.find('body').text
...     if body.lower().find('chemical') > -1:
...         webbrowser.open(base_url + link['href'])