How To Remove White Space in Scrapy Spider Data - web-scraping

I am writing my first spider in Scrapy and attempting to follow the documentation. I have implemented ItemLoaders. The spider extracts the data, but the data contains many line returns. I have tried many ways to remove them, but nothing seems to work. The replace_escape_chars utility is supposed to work, but I can't figure out how to use it with the ItemLoader. Some people use unicode.strip, but again, I can't get it to work. Some people apply these in items.py and others in the spider. How can I clean the data of these line returns (\r\n)? My items.py file only contains the item names and Field(). The spider code is below:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from scrapy.utils.markup import replace_escape_chars
from ccpstore.items import Greenhouse

class GreenhouseSpider(BaseSpider):
    name = "greenhouse"
    allowed_domains = ["domain.com"]
    start_urls = [
        "http://www.domain.com",
    ]

    def parse(self, response):
        items = []
        l = XPathItemLoader(item=Greenhouse(), response=response)
        l.add_xpath('name', '//div[@class="product_name"]')
        l.add_xpath('title', '//h1')
        l.add_xpath('usage', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl00_liItem"]')
        l.add_xpath('repeat', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl02_liItem"]')
        l.add_xpath('direction', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl03_liItem"]')
        items.append(l.load_item())
        return items

You can set a default_output_processor on the loader and also use other processors on individual fields; see the title field:
from scrapy.spider import BaseSpider
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Compose, MapCompose
from w3lib.html import replace_escape_chars, remove_tags
from ccpstore.items import Greenhouse

class GreenhouseSpider(BaseSpider):
    name = "greenhouse"
    allowed_domains = ["domain.com"]
    start_urls = ["http://www.domain.com"]

    def parse(self, response):
        l = XPathItemLoader(Greenhouse(), response=response)
        l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
        l.add_xpath('name', '//div[@class="product_name"]')
        l.add_xpath('title', '//h1', Compose(remove_tags))
        l.add_xpath('usage', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl00_liItem"]')
        l.add_xpath('repeat', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl02_liItem"]')
        l.add_xpath('direction', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl03_liItem"]')
        return l.load_item()
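Note that XPathItemLoader and the scrapy.contrib modules used above have since been removed from Scrapy. For readers on a current version, a rough equivalent might look like the sketch below; it is only an illustration and assumes the same Greenhouse item, Python 3 string values, and that MapCompose(remove_tags) is an acceptable stand-in for the Compose(remove_tags) used above:

import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose
from w3lib.html import replace_escape_chars, remove_tags
from ccpstore.items import Greenhouse

class GreenhouseSpider(scrapy.Spider):
    name = "greenhouse"
    allowed_domains = ["domain.com"]
    start_urls = ["http://www.domain.com"]

    def parse(self, response):
        # ItemLoader replaces the old XPathItemLoader; processors work the same way
        l = ItemLoader(item=Greenhouse(), response=response)
        l.default_output_processor = MapCompose(str.strip, replace_escape_chars)
        l.add_xpath('name', '//div[@class="product_name"]')
        # strip the <h1> markup from the title, value by value
        l.add_xpath('title', '//h1', MapCompose(remove_tags))
        yield l.load_item()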

It turns out that there were also many blank spaces in the data, so combining Steven's answer with some more research allowed all tags, line returns and duplicate spaces to be removed. The working code is below. Note the addition of text() on the loader lines, which removes the tags, and the split and join processors, which remove spaces and line returns.
def parse(self, response):
    items = []
    l = XPathItemLoader(item=Greenhouse(), response=response)
    l.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    l.default_output_processor = Join()
    l.add_xpath('title', '//h1/text()')
    l.add_xpath('usage', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl00_liItem"]/text()')
    l.add_xpath('repeat', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl02_liItem"]/text()')
    l.add_xpath('direction', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl03_liItem"]/text()')
    items.append(l.load_item())
    return items
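To see why this combination cleans the text, here is a minimal standalone illustration of the two processors. It uses the itemloaders package that recent Scrapy versions depend on (the old scrapy.contrib.loader.processor imports behave the same way), and the sample string is just made up for the demo:

from itemloaders.processors import Join, MapCompose
from w3lib.html import replace_escape_chars

# Input processor: split on any whitespace (dropping \r\n and runs of spaces),
# then strip any remaining escape characters from each token.
clean = MapCompose(lambda v: v.split(), replace_escape_chars)
# Output processor: join the tokens back together with single spaces.
join = Join()

raw = ['  Full   sun \r\n to partial shade  ']
print(join(clean(raw)))  # -> 'Full sun to partial shade'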

Related

Struggling with Scrapy pagination

At the moment I have a bit of Frankenstein code (consisting of BeautifulSoup and Scrapy parts) that seems to do the job of reading the info from the page 1 URLs. I shall try to redo everything in Scrapy as soon as the pagination issue is resolved.
So what the code is meant to do:
Read all subcategories (BeautifulSoup part)
The rest are Scrapy code parts
Using the above urls, read the sub-subcategories.
Extract the last page number and loop over the above urls.
Extract the necessary product info from the above urls.
Everything except part 3 seems to work.
I have tried to use the code below to extract the last page number, but I am not sure how to integrate it into the main code:
def parse_paging(self, response):
    try:
        for next_page in ('?pn=1' + response.xpath('//ul[@class="pagination pull-left"]/noscript/a/text()').extract()[-1]):
            print(next_page)
            # yield scrapy.Request(url=response.urljoin(next_page))
    except:
        pass
Below is the main code.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess
from urllib.parse import urljoin

category_list = []
sub_category_url = []

root_url = 'https://uk.rs-online.com/web'
page = requests.get(root_url)
soup = BeautifulSoup(page.content, 'html.parser')

cat_up = [a.find_all('a') for a in soup.find_all('div', class_='horizontalMenu sectionUp')]
category_up = [item for sublist in cat_up for item in sublist]
cat_down = [a.find_all('a') for a in soup.find_all('div', class_='horizontalMenu sectionDown')]
category_down = [item for sublist in cat_down for item in sublist]

for c_up in category_up:
    sub_category_url.append('https://uk.rs-online.com' + c_up['href'])
for c_down in category_down:
    sub_category_url.append('https://uk.rs-online.com' + c_down['href'])
# print(k)

class subcategories(scrapy.Spider):
    name = 'subcategories'

    def start_requests(self):
        urls = sub_category_url
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        products = response.css('div.card.js-title a::attr(href)').extract()  # xpath("//div[contains(@class, 'js-tile')]/a/@href")
        for p in products:
            url = urljoin(response.url, p)
            yield scrapy.Request(url, callback=self.parse_product)

    def parse_product(self, response):
        for quote in response.css('tr.resultRow'):
            yield {
                'product': quote.css('div.row.margin-bottom a::text').getall(),
                'stock_no': quote.css('div.stock-no-label a::text').getall(),
                'brand': quote.css('div.row a::text').getall(),
                'price': quote.css('div.col-xs-12.price.text-left span::text').getall(),
                'uom': quote.css('div.col-xs-12.pack.text-left span::text').getall(),
            }

process = CrawlerProcess()
process.crawl(subcategories)
process.start()
I would be exceptionally grateful if you could provide any hints on how to deal with the above issue.
Let me know if you have any questions.
I would suggest extracting the next page number using the selector below, and then constructing the next page URL from that number.
next_page_number = response.css('.nextPage::attr(ng-click)').re_first('\d+')
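A sketch of how that could be wired into the spider's parse method; the 'pn' query parameter and the stopping condition are assumptions taken from the question's own attempt, not verified against the site:

    def parse(self, response):
        # ... yield the product requests as before ...
        next_page_number = response.css('.nextPage::attr(ng-click)').re_first(r'\d+')
        if next_page_number:
            # Build the next page URL relative to the current one; the 'pn'
            # parameter is an assumption based on the question's parse_paging.
            next_url = response.urljoin('?pn=' + next_page_number)
            yield scrapy.Request(next_url, callback=self.parse)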

Can't select elements with scrapy-splash

I'm using this code to extract text from a specific link class. I can select one element of that class with .extract_first(), but I couldn't select all the elements of the same class; I want to select them all and store them in a list. Here is my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest

class MySpider(scrapy.Spider):
    name = "quotes4"
    start_urls = ["https://www.woolworths.com.au/shop/browse/drinks/cordials-juices-iced-teas/iced-teas"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse)

    def parse(self, response):
        # I can select the first element of the class
        '''yield{
            'name': response.css(".shelfProductTile-descriptionLink::text").extract_first()
        }'''
        # But not all the elements of the same class
        a = response.css(".shelfProductTile-descriptionLink::text").extract()
        print('list length is : ' + str(len(a)))  # OUTPUT : 0
Am I doing something wrong? Thanks.
Do you need to be using scrapy_splash for this? Your yield statement looks like regular Scrapy code, not scrapy_splash. If what you're scraping is just HTML (not JavaScript-rendered), then you don't need scrapy_splash.
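One quick way to check is to fetch the page with plain Scrapy and see whether the selector matches anything in the raw HTML. This is only a sketch built from the question's own selector; the wait value mentioned in the comment is an assumption, not something from the question:

import scrapy

class CheckSpider(scrapy.Spider):
    name = "quotes4_check"
    start_urls = ["https://www.woolworths.com.au/shop/browse/drinks/cordials-juices-iced-teas/iced-teas"]

    def parse(self, response):
        names = response.css(".shelfProductTile-descriptionLink::text").extract()
        self.logger.info("matched %d elements in the raw HTML", len(names))
        # If this logs 0, the content is rendered by JavaScript and Splash (or
        # another rendering approach) is needed; with SplashRequest it can also
        # help to give the page time to render, e.g. args={'wait': 2}.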

How can I generate a list from p tags?

Please check the site:
https://www.americanberkshire.com/california.html
The entries are all in p tags.
I want to separate each element, but I can't find an effective way.
# -*- coding: utf-8 -*-
import scrapy

class AmericanberkshireSpider(scrapy.Spider):
    name = 'americanberkshire'
    allowed_domains = ['americanberkshire.com']
    start_urls = ['https://www.americanberkshire.com/california.html']

    def parse(self, response):
        lists =
Maybe if you use XPath 2.0, you can use a regex in the selector, like //p[matches(text(),'[\w\s]+\([\w+]\)','i')].
Or try to iterate like this (not exact code, just an example):
import re

for sel in response.css('p'):
    txt = sel.css('::text').get()
    if not txt or not re.match('[\w\s]+\([\w+]\)', txt):
        continue
    # do what you need with selector sel
def parse(self, response):
    for red_paragraph in response.xpath('//p[re:test(text(), "\([A-Z]{3,}\)")]'):
        paragraphs = [red_paragraph]
        for paragraph in red_paragraph.xpath('./following-sibling::p'):
            if not paragraph.xpath('string(.)').extract_first().strip():
                break
            paragraphs.append(paragraph)
        # In each iteration reaching here, paragraphs will contain a list of
        # related paragraphs.
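If you then want to emit items from those groups, a rough continuation could look like the fragment below; the breeder/details field names are purely illustrative assumptions:

        # still inside the outer for-loop, after `paragraphs` is collected
        yield {
            'breeder': paragraphs[0].xpath('string(.)').get().strip(),
            'details': [p.xpath('string(.)').get().strip() for p in paragraphs[1:]],
        }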

How to scrape this data from the website?

Here's an example: http://www.cincinnati.com/story/money/2016/11/26/see-which-companies-16-deloitte-100/94441104/
Ideally I would like to see a neatly crawled and extracted output data array with the following fields:
Company Name
2016 Rank
2015 Rank
Years in Business
Business Description
Website
2015 Revenues
2014 Revenues
HQ City
Year Founded
Employees
Is family owned?
from each of the specific company data pages. I'm a complete beginner with Scrapy and I want to know how to extract the links automatically. In this code I'm feeding them in manually. Can anyone help me here?
import scrapy
from spy.items import SpyItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.linkextractors import LinkExtractor

class ProjectSpider(CrawlSpider):
    name = "project"
    allowed_domains = ["cincinnati.com/story/money/2016/11/26/see-which-companies-16-deloitte-100/94441104/"]
    start_urls = [100Links in here]

    def parse(self, response):
        item = SpyItem()
        item['title'] = response.xpath('//*[@id="overlay"]/div[2]/article/div[3]/p[1]/strong/text()').extract()
        item['Business'] = response.xpath('//*[@id="overlay"]/div[2]/article/div[3]/p[4]/text()').extract()
        item['website'] = response.xpath('//p[5]/a/text()').extract()
        item['Ranking'] = response.xpath('//*[@id="overlay"]/div[2]/article/div[3]/p[2]/text()[1]').extract()
        item['HQ'] = response.css('p:nth-child(12)::text').extract()
        item['Revenue2015'] = response.xpath('//*[@id="overlay"]/div[2]/article/div[3]/p[7]/text()').extract()
        item['Revenue2014'] = response.css('p:nth-child(10)::text').extract()
        item['YearFounded'] = response.xpath('//p[11]/text()').extract().encode('utf-8')
        item['Employees'] = response.xpath('//article/div[3]/p[12]/text()').extract()
        item['FamilyOwned'] = response.xpath('//*[@id="overlay"]/div[2]/article/div[3]/p[13]/text()').extract()
        yield item
There are at least two issues with your code.
allowed_domains has to be a domain, nothing more.
You use a CrawlSpider, which is meant to be used with Rules, but you don't define any rules.
In the following there is some tested code as a starting point:
import scrapy

class ProjectItem(scrapy.Item):
    title = scrapy.Field()
    owned = scrapy.Field()

class ProjectSpider(scrapy.Spider):
    name = "cin100"
    allowed_domains = ['cincinnati.com']
    start_urls = ['http://www.cincinnati.com/story/money/2016/11/26/see-which-companies-16-deloitte-100/94441104/']

    def parse(self, response):
        # get selector for all 100 companies
        sel_companies = response.xpath('//p[contains(.,"Here are the companies")]/following-sibling::p/a')
        # create request for every single company detail page from href
        for sel_companie in sel_companies:
            href = sel_companie.xpath('./@href').extract_first()
            url = response.urljoin(href)
            request = scrapy.Request(url, callback=self.parse_company_detail)
            yield request

    def parse_company_detail(self, response):
        # On detail page create item
        item = ProjectItem()
        # get detail information with specific XPath statements
        # e.g. title is the first paragraph
        item['title'] = response.xpath('//div[@role="main"]/p[1]//text()').extract_first()
        # e.g. family owned has a label we can select
        item['owned'] = response.xpath('//div[@role="main"]/p[contains(.,"Family owned")]/text()').extract_first()
        # find clever XPaths for other fields ...
        # ...
        # Finally: yield the item
        yield item
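Since the original question used a CrawlSpider, here is a hedged sketch of what a Rules-based variant could look like. The restrict_xpaths value is borrowed from the answer's XPath and the rest is an assumption that would need to be verified against the page:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ProjectCrawlSpider(CrawlSpider):
    name = "cin100_crawl"
    allowed_domains = ['cincinnati.com']
    start_urls = ['http://www.cincinnati.com/story/money/2016/11/26/see-which-companies-16-deloitte-100/94441104/']

    rules = (
        # Follow only the links inside the paragraphs that list the companies;
        # the link extraction restriction is an assumption.
        Rule(
            LinkExtractor(restrict_xpaths='//p[contains(.,"Here are the companies")]/following-sibling::p'),
            callback='parse_company_detail',
        ),
    )

    def parse_company_detail(self, response):
        yield {
            'title': response.xpath('//div[@role="main"]/p[1]//text()').extract_first(),
        }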

Scrapy Data Table extract

I am trying to scrape "https://www.expireddomains.net/deleted-com-domains/" for the expired domain data list.
I always get empty item fields with the following code:
class ExpiredSpider(BaseSpider):
    name = "expired"
    allowed_domains = ["example.com"]
    start_urls = ['https://www.expireddomains.net/deleted-com-domains/']

    def parse(self, response):
        log.msg('parse(%s)' % response.url, level=log.DEBUG)
        rows = response.xpath('//table[@class="base1"]/tbody/tr')
        for row in rows:
            item = DomainItem()
            item['domain'] = row.xpath('td[1]/text()').extract()
            item['bl'] = row.xpath('td[2]/text()').extract()
            yield item
Can somebody point out what is wrong? Thanks.
As a first note, you should use scrapy.Spider instead of BaseSpider, which is deprecated.
Secondly, .extract() method returns a list rather than a single element.
This is how the item extraction should look:
item['domain'] = row.xpath('td[1]/text()').extract_first()
item['bl'] = row.xpath('td[2]/text()').extract_first()
Also, you should use the built-in Python logging library:
import logging
logging.debug("parse("+response.url+")")
