No data in scraping through scrapy - web-scraping

Below is the code, along with the URL of the site I'm trying to get data from, but the program runs and writes nothing to the CSV output file. Also, I got the XPath from the Chrome inspector console, so it should be correct.
Here is the code:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from insta.items import Insta

class instagram(CrawlSpider):
    name = "instagram"
    allowed_domains = ["zymanga.com"]
    start_urls = ['http://zymanga.com/millionplus/%sf' % page for page in range(1, 163)]

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('//*[@id="username"]/')
        items = []
        for title in titles:
            item = Insta()
            item["username"] = title.select("a/text()").extract()
            items.append(item)
        return(items)
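Two things stand out here, independent of the XPath itself: a CrawlSpider only calls a custom callback such as parse_items through its rules, and none are defined, so the method is never invoked; and the XPath ends in a bare slash, which is not valid. A minimal sketch of how this could be restructured as a plain Spider (assuming the Insta item only needs a username field and that the listing pages really contain elements with id="username"):

import scrapy
from insta.items import Insta

class InstagramSpider(scrapy.Spider):
    # A plain Spider calls parse() for every start URL, so no CrawlSpider
    # rules are needed for a fixed list of pages.
    name = "instagram"
    allowed_domains = ["zymanga.com"]
    start_urls = ['http://zymanga.com/millionplus/%sf' % page for page in range(1, 163)]

    def parse(self, response):
        # Grab the anchor text inside each element whose id is "username".
        for title in response.xpath('//*[@id="username"]'):
            item = Insta()
            item["username"] = title.xpath('a/text()').extract_first()
            yield item

Running it with scrapy crawl instagram -o output.csv should then write whatever the selector matches; if the CSV is still empty, the selector itself (or JavaScript-rendered content) is the next thing to check.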

Related

Why can't my scraper extract data from next page

Hi there, I built a scraper using the Scrapy framework. It works on the first page perfectly but fails to get the same data from the next pages, even after writing code to crawl the next page. What am I getting wrong in my code? My items.py file is working fine too.
Here's my code
import scrapy
from amazonscraper.items import AmazonscraperItem
from scrapy.loader import ItemLoader

class AmazonspiderSpider(scrapy.Spider):
    name = 'amazonspider'
    allowed_domains = ['amazon.com']
    start_urls = ['https://www.amazon.com/s?i=fashion-womens-intl-ship&bbn=16225018011&rh=n%3A16225018011%2Cn%3A1040660%2Cn%3A1045024&pd_rd_r=2da30763-bfe6-4a38-b17a-77236fa718c5&pd_rd_w=JtaUW&pd_rd_wg=BtgRm&pf_rd_p=6a92dcea-e071-4bb9-866a-369bc067390d&pf_rd_r=86NBFKV4TA7CCSEVNBM7&qid=1671522114&rnid=1040660&ref=sr_pg_1']

    def parse(self, response):
        products = response.css('div.sg-col-4-of-12')
        for product in products:
            l = ItemLoader(item=AmazonscraperItem(), selector=product)
            l.add_css('name', 'a.a-link-normal span.a-size-base-plus')
            l.add_css('price', 'span.a-price span.a-offscreen')
            l.add_css('review', 'i.a-icon span.a-icon-alt')
            yield l.load_item()
        next_page = response.xpath('//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div[52]/div/div/span/a/@href').get()
        if next_page is not None:
            next_page_url = 'https://www.amazon.com' + next_page
            yield response.follow(next_page_url, callback=self.parse)
Here's my AmazonScraperItem
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags

class AmazonscraperItem(scrapy.Item):
    name = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    price = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    review = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
I have fixed the issue; there was an error in the code, and I have updated a few things. First, I updated the next-page selector so that it gets the correct URL. Secondly, we don't need to prepend the base URL when sending the request, because response.follow automatically converts a relative URL into an absolute one. The code below works across multiple pages (all of the pagination).
class AmazonspiderSpider(scrapy.Spider):
    name = 'amazonspider'
    allowed_domains = ['amazon.com']
    start_urls = ['https://www.amazon.com/s?i=fashion-womens-intl-ship&bbn=16225018011&rh=n%3A16225018011%2Cn%3A1040660%2Cn%3A1045024&pd_rd_r=2da30763-bfe6-4a38-b17a-77236fa718c5&pd_rd_w=JtaUW&pd_rd_wg=BtgRm&pf_rd_p=6a92dcea-e071-4bb9-866a-369bc067390d&pf_rd_r=86NBFKV4TA7CCSEVNBM7&qid=1671522114&rnid=1040660&ref=sr_pg_1']

    def parse(self, response):
        products = response.css('div.sg-col-4-of-12')
        for product in products:
            l = ItemLoader(item=AmazonscraperItem(), selector=product)
            l.add_css('name', 'a.a-link-normal span.a-size-base-plus')
            l.add_css('price', 'span.a-price span.a-offscreen')
            l.add_css('review', 'i.a-icon span.a-icon-alt')
            yield l.load_item()
        next_page = response.css('.s-pagination-next ::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

Can't fetch the content of articles using beautifulsoup in python 3.7

I am doing web scraping using BeautifulSoup in Python 3.7. The code below successfully scrapes the date, title, and tags, but not the content of the articles; it gives None instead.
import time
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag

url = 'https://www.thehindu.com/search/?q=cybersecurity&order=DESC&sort=publishdate&ct=text&page={}'
pages = 32

for page in range(4, pages+1):
    res = requests.get(url.format(page))
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.find_all("a", {"class": "story-card75x1-text"}, href=True):
        _href = item.get("href")
        try:
            resp = requests.get(_href)
        except Exception as e:
            try:
                resp = requests.get("https://www.thehindu.com" + _href)
            except Exception as e:
                continue
        dateTag = soup.find("span", {"class": "dateline"})
        sauce = BeautifulSoup(resp.text, "lxml")
        tag = sauce.find("a", {"class": "section-name"})
        titleTag = sauce.find("h1", {"class": "title"})
        contentTag = sauce.find("div", {"class": "_yeti_done"})
        date = None
        tagName = None
        title = None
        content = None
        if isinstance(dateTag, Tag):
            date = dateTag.get_text().strip()
        if isinstance(tag, Tag):
            tagName = tag.get_text().strip()
        if isinstance(titleTag, Tag):
            title = titleTag.get_text().strip()
        if isinstance(contentTag, Tag):
            content = contentTag.get_text().strip()
        print(f'{date}\n {tagName}\n {title}\n {content}\n')
        time.sleep(3)
I don't see where the problem is, as I am using the correct class for contentTag.
Thanks.
I guess the links you would like to follow from the first page to their inner pages end with .ece. I've applied that logic within the script to traverse those target pages and scrape data from them. I've defined the selectors for the content slightly differently, and now it appears to be working correctly. The following script only scrapes data from page 1; feel free to change it as per your requirement.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://www.thehindu.com/search/?q=cybersecurity&order=DESC&sort=publishdate&ct=text&page=1'
base = "https://www.thehindu.com"

res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")
for item in soup.select(".story-card-news a[href$='.ece']"):
    resp = requests.get(urljoin(base, item.get("href")))
    sauce = BeautifulSoup(resp.text, "lxml")
    title = item.get_text(strip=True)
    content = ' '.join([item.get_text(strip=True) for item in sauce.select("[id^='content-body-'] p")])
    print(f'{title}\n {content}\n')
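If you want to cover all of the result pages the way the original range(4, pages+1) loop did, a hedged extension of the same idea (assuming the search URL keeps accepting a page= parameter) could look like this:

import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

search_url = 'https://www.thehindu.com/search/?q=cybersecurity&order=DESC&sort=publishdate&ct=text&page={}'
base = "https://www.thehindu.com"

for page in range(1, 33):  # 32 result pages, as in the question
    res = requests.get(search_url.format(page))
    soup = BeautifulSoup(res.text, "lxml")
    for link in soup.select(".story-card-news a[href$='.ece']"):
        resp = requests.get(urljoin(base, link.get("href")))
        sauce = BeautifulSoup(resp.text, "lxml")
        title = link.get_text(strip=True)
        content = ' '.join(p.get_text(strip=True) for p in sauce.select("[id^='content-body-'] p"))
        print(f'{title}\n {content}\n')
    time.sleep(3)  # pause between result pages, as in the original code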

Beautiful soup - error pulling a website link that isn't on every looped page

I am working on my first web scrape and, thanks to the help of Stack Overflow, I have managed to put the following code together. The code works well to click through each of the pages and then go into each of the links to pull the info I need. However, it gets stuck at one of these links because there is no website link to pull.
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import time

binary = FirefoxBinary('geckodriver.exe')
driver = webdriver.Firefox()
driver.get('http://www.interzum.com/exhibitors-and-products/exhibitor-index/exhibitor-index-15.php')
url = 'http://www.interzum.com/exhibitors-and-products/exhibitor-index/exhibitor-index-15.php'
text = requests.get(url).text
page1 = BeautifulSoup(text, "html.parser")

def get_data(url, tries=0, max_tries=3):
    text_test2 = requests.get(url).text
    page2 = BeautifulSoup(text_test2, "html.parser")
    try:
        title = page2.find('h1', attrs={'class': 'hl_2'}).text
        content = page2.find('div', attrs={'class': 'cont'}).text
        phone = page2.find('div', attrs={'class': 'sico ico_phone'}).text
        email_div = page2.find('div', attrs={'class': 'sico ico_email'})
        email = email_div.find('a', attrs={'class': 'xsecondarylink'})['href']
        web_div = page2.find('div', attrs={'class': 'sico ico_link'})
        web = web_div.find('a', attrs={'class': 'xsecondarylink'})
        if web != None:
            web = web['href']
    except:
        if tries < max_tries:
            tries += 1
            print("try {}".format(tries))
            return get_data(url, tries)
    data = {'Name': [title],
            'Street address': [content],
            'Phone number': [phone],
            'Email': [email],
            'Web': [web]
            }
    return pd.DataFrame(data=data)

df = pd.DataFrame()
for i in range(0, 80):
    print(i)
    page1 = BeautifulSoup(driver.page_source, 'html.parser')
    for div in page1.findAll('div', attrs={'class': 'item'}):
        for a in div.findAll('a', attrs={'class': 'initial_noline'}):
            if 'kid=' not in a['href']: continue
            print('http://www.interzum.com' + a['href'])
            data = get_data('http://www.interzum.com' + a['href'])
            df = pd.concat([df, data])
    next_button = driver.find_element_by_class_name('slick-next')
    next_button.click()
    time.sleep(20)
df.to_csv('results.csv')
I have tried numerous different ways to say: if the website link doesn't exist then continue looping, and if it does, pull the href for me. But I keep getting the error message:
UnboundLocalError: local variable 'web' referenced before assignment
I can't seem to put this together correctly from the info out there on the web. Any insights into what I am doing incorrectly would be really appreciated!
Thank you all in advance.
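For what it's worth, the UnboundLocalError happens because web (and the other variables) are only assigned inside the try block, so when the lookup fails they are still referenced by the data = {...} dictionary afterwards. A minimal sketch of one way to guard against missing elements while keeping the original requests/BeautifulSoup approach (assuming the same page structure and class names as in the question):

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_data(url):
    page = BeautifulSoup(requests.get(url).text, "html.parser")
    # Default every field to None so the DataFrame can always be built,
    # even when a particular block is missing on the exhibitor page.
    title = content = phone = email = web = None

    title_tag = page.find('h1', attrs={'class': 'hl_2'})
    if title_tag is not None:
        title = title_tag.text
    content_tag = page.find('div', attrs={'class': 'cont'})
    if content_tag is not None:
        content = content_tag.text
    phone_tag = page.find('div', attrs={'class': 'sico ico_phone'})
    if phone_tag is not None:
        phone = phone_tag.text
    email_div = page.find('div', attrs={'class': 'sico ico_email'})
    if email_div is not None:
        email_a = email_div.find('a', attrs={'class': 'xsecondarylink'})
        if email_a is not None:
            email = email_a['href']
    web_div = page.find('div', attrs={'class': 'sico ico_link'})
    if web_div is not None:
        web_a = web_div.find('a', attrs={'class': 'xsecondarylink'})
        if web_a is not None:
            web = web_a['href']

    return pd.DataFrame({'Name': [title], 'Street address': [content],
                         'Phone number': [phone], 'Email': [email], 'Web': [web]})

With every field defaulting to None, a missing website simply produces an empty cell in the CSV instead of crashing the loop.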
I think you need to switch to Selenium for retrieving the info from the specific pages, as the content is not always loaded for plain requests. You can use the following as a framework.
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
baseLink = 'http://www.interzum.com'
varLink = 'http://www.interzum.com/exhibitors-and-products/exhibitor-index/exhibitor-index-15.php?fw_goto=aussteller/blaettern&fw_ajax=1&paginatevalues=%7B%22stichwort%22%3A%22%22%7D&start={}&dat=231518'
startUrl = 'http://www.interzum.com/exhibitors-and-products/exhibitor-index/exhibitor-index-15.php'
resultsPerPage = 20
i = 0
headers = {'User-Agent': 'Mozilla/5.0'}
results = []
final = []

with requests.Session() as s:
    r = s.get(startUrl, headers=headers)
    soup = bs(r.content, 'lxml')
    numPages = int(soup.select('a[rel=next]')[-2].text)
    links = list((baseLink + link['href'] for link in soup.select('[href*="fw_goto=aussteller/details&&kid="]')))
    results.append(links)
    for j in range(1, numPages):
        i += 20
        url = varLink.format(i)
        r = s.get(url, headers=headers)
        soup = bs(r.content, 'lxml')
        links = list((baseLink + link['href'] for link in soup.select('[href*="fw_goto=aussteller/details&&kid="]')))
        results.append(links)

totalList = [item for sublist in results for item in sublist]

for link in totalList:
    driver.get(link)
    try:
        title = driver.find_element_by_css_selector('h1.hl_2').text
        content = driver.find_element_by_css_selector('div.cont').text
        phone = driver.find_element_by_css_selector('div.sico.ico_phone').text
        email = driver.find_element_by_css_selector('div.sico.ico_email a.xsecondarylink').get_attribute('href')
        web = driver.find_element_by_css_selector('div.sico.ico_link a.xsecondarylink').get_attribute('href')
        final.append([title, content, phone, email, web])
    except Exception as e:
        print(link)
        print(e)
        continue
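To end up with a results.csv like the original script, the rows collected in final can then be written out with pandas (a small follow-up sketch; the column names are taken from the question's data dictionary):

import pandas as pd

df = pd.DataFrame(final, columns=['Name', 'Street address', 'Phone number', 'Email', 'Web'])
df.to_csv('results.csv', index=False)
driver.quit()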

Trouble importing scrapy into json

I'm trying to pull some info off of Craigslist and store it in a JSON file, but the info is getting stored a bit wrong. Instead of having an array of [title, link, location, time] for each post, I'm getting one array with all the titles, one with all the links, and so on. Is my titles XPath wrong, or is the for loop itself wrong?
from scrapy.spiders import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from craigslist_sample.items import CraigslistSampleItem

class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["https://pittsburgh.craigslist.org/search/ccc"]

    def parse(self, response):
        titles = response.selector.xpath("//p[@class='row']")
        items = []
        for titles in titles:
            item = CraigslistSampleItem()
            item["title"] = titles.xpath("//span[@id='titletextonly']").extract()
            item["link"] = titles.xpath("a/@href").extract()
            item["location"] = titles.xpath("//small").extract()
            item["time"] = titles.xpath('//time').extract()
            items.append(item)
        return items
That's because your inner xpaths match the elements starting from the root of the tree. Instead, you need to force them to work in the context of each item by prepending a dot:
for title in titles:
    item = CraigslistSampleItem()
    item["title"] = title.xpath(".//span[@id='titletextonly']").extract()
    item["link"] = title.xpath("a/@href").extract()
    item["location"] = title.xpath(".//small").extract()
    item["time"] = title.xpath('.//time').extract()
    yield item
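Since the goal is a JSON file, note that with yield item in place the usual route is Scrapy's feed exports: run scrapy crawl craig -o items.json on the command line, or configure it on the spider itself. A hedged sketch of the latter, using the FEEDS setting available in newer Scrapy versions:

import scrapy
from craigslist_sample.items import CraigslistSampleItem

class MySpider(scrapy.Spider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["https://pittsburgh.craigslist.org/search/ccc"]
    # FEEDS is available in Scrapy 2.1+; on older versions pass
    # `-o items.json` on the command line instead.
    custom_settings = {
        'FEEDS': {'items.json': {'format': 'json'}},
    }

    def parse(self, response):
        # Same relative-XPath loop as above, yielding one item per post.
        for post in response.xpath("//p[@class='row']"):
            item = CraigslistSampleItem()
            item["title"] = post.xpath(".//span[@id='titletextonly']").extract()
            item["link"] = post.xpath("a/@href").extract()
            item["location"] = post.xpath(".//small").extract()
            item["time"] = post.xpath(".//time").extract()
            yield item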

Scrapy crawlSpider rules - prioritize "next pages"

I'm trying to scrape a list of all the hotels in San Francisco from:
http://www.tripadvisor.com/Hotels-g60713-San_Francisco_California-Hotels.html
The "Next Hotels" pages have unique URLs:
page 2 is: /Hotels-g60713-oa30-San_Francisco_California-Hotels.html
page 3 is: /Hotels-g60713-oa60-San_Francisco_California-Hotels.html
page 4 is: /Hotels-g60713-oa90-San_Francisco_California-Hotels.html
and so on.
How can I set up the CrawlSpider to reach these pages?
Is there a Rule that can help me in this case?
Is there a way to prioritize them, making it scrape and parse these pages before anything else?
My code so far:
import beatSoup_test
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class TriAdvSpider(CrawlSpider):
    name = "tripAdv"
    allowed_domains = ["tripadvisor.com"]
    start_urls = [
        "http://www.tripadvisor.com/Hotels-g60713-San_Francisco_California-Hotels.html"
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=r'-\w+.html$'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        beatSoup_test.getHotels(response.body_as_unicode())
where beatSoup_test is my parsing function that uses BeautifulSoup.
Thanks!
If you want to scrape data from any page, use XPath; that way you can scrape anything on the same page. And use items to store the scraped data, so that you can scrape as many things as you want. Here is an example of how you can use it:
sites = Selector(text=response.body).xpath('//div[contains(@id, "identity")]//section/div/div/h3/a/text()')
items = []
items = myspiderBotItem()
items['title'] = sites.xpath('/text()').extract()
Like this
class TriAdvSpider(CrawlSpider):
    name = "tripAdv"
    allowed_domains = ["tripadvisor.com"]
    start_urls = [
        "http://www.tripadvisor.com/Hotels-g60713-San_Francisco_California-Hotels.html"
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=r'-\w+.html$'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # beatSoup_test.getHotels(response.body_as_unicode())
        l = XPathItemLoader(item=TriAdvItem(), response=response)
        for i in range(1, 8):
            l.add_xpath('day', '//*[@id="super-container"]/div/div[1]/div[2]/div[2]/div[1]/table/tbody/tr[' + str(i) + ']/th[@scope="row"]/text()')
            l.add_xpath('timings1', '//*[@id="super-container"]/div/div[1]/div[2]/div[2]/div[1]/table/tbody/tr[' + str(i) + ']/td[1]/span[1]/text()')
            l.add_xpath('timings2', '//*[@id="super-container"]/div/div[1]/div[2]/div[2]/div[1]/table/tbody/tr[' + str(i) + ']/td[1]/span[2]/text()')
        return l.load_item()
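The pagination part of the original question (reaching the /Hotels-g60713-oaNN-... pages and fetching them before other links) can be handled at the Rule level. Here is a hedged sketch using a modern LinkExtractor; the exact process_request signature differs between Scrapy versions, so treat it as an outline rather than the definitive answer:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

def raise_priority(request, response=None):
    # Bump pagination requests so the "next" listing pages are fetched
    # ahead of other queued links. (Older Scrapy calls process_request
    # with the request only, newer versions also pass the response.)
    return request.replace(priority=10)

class TriAdvSpider(CrawlSpider):
    name = "tripAdv"
    allowed_domains = ["tripadvisor.com"]
    start_urls = ["http://www.tripadvisor.com/Hotels-g60713-San_Francisco_California-Hotels.html"]

    rules = (
        # Match the paginated listing pages (-oa30-, -oa60-, ...) explicitly,
        # keep following them, and boost their priority.
        Rule(LinkExtractor(allow=r'-g60713-oa\d+-San_Francisco_California-Hotels\.html'),
             callback='parse_item', follow=True, process_request=raise_priority),
    )

    def parse_item(self, response):
        # Parse the listing page here (e.g. hand response.text to BeautifulSoup).
        pass

Requests produced by that rule are queued with a higher priority, so the listing pages tend to be crawled before the other links the spider discovers.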
