How to scrapy these data's from the website? - web-scraping

Here's an example: [http://www.cincinnati.com/story/money/2016/11/26/see-which-companies-16-deloitte-100/94441104/][1]
Ideally like to see a neatly crawled and extracted output data array with the following fields:
Company Name
2016 Rank
2015 Rank
Years in Business
Business Description
Website
2015 Revenues
2014 Revenues
HQ City
Year Founded
Employees
Is family owned?
from each of the specific company data pages.I'm purely beginner to scrapy i want know how to extract links automatically. Here in this code i'm feeding it manual. Can anyone help me here.
import scrapy
from spy.items import SpyItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.linkextractors import LinkExtractor
class ProjectSpider(CrawlSpider):
name = "project"
allowed_domains = ["cincinnati.com/story/money/2016/11/26/see-which-companies-16-deloitte-100/94441104/"]
start_urls = [100Links in here]
def parse(self, response):
item = SpyItem()
item['title'] = response.xpath('//*[#id="overlay"]/div[2]/article/div[3]/p[1]/strong/text()').extract()
item['Business'] =response.xpath('//*[#id="overlay"]/div[2]/article/div[3]/p[4]/text()').extract()
item['website'] =response.xpath('//p[5]/a/text()').extract()
item['Ranking']=response.xpath('//*[#id="overlay"]/div[2]/article/div[3]/p[2]/text()[1]').extract()
item['HQ']=response.css('p:nth-child(12)::text').extract()
item['Revenue2015']=response.xpath('//*[#id="overlay"]/div[2]/article/div[3]/p[7]/text()').extract()
item['Revenue2014']=response.css('p:nth-child(10)::text').extract()
item['YearFounded']=response.xpath('//p[11]/text()').extract().encode('utf-8')
item['Employees']=response.xpath('//article/div[3]/p[12]/text()').extract()
item['FamilyOwned']=response.xpath('//*[#id="overlay"]/div[2]/article/div[3]/p[13]/text()').extract()
yield item

There are at least two issues with your code.
allowed_domain has to be a domain. Not more.
You use a CrawlSpider that is meant to be used with Rules. You don't have any rules.
In the following there is some tested code as starting point:
import scrapy
class ProjectItem(scrapy.Item):
title = scrapy.Field()
owned = scrapy.Field()
class ProjectSpider(scrapy.Spider):
name = "cin100"
allowed_domains = ['cincinnati.com']
start_urls = ['http://www.cincinnati.com/story/money/2016/11/26/see-which-companies-16-deloitte-100/94441104/']
def parse(self, response):
# get selector for all 100 companies
sel_companies = response.xpath('//p[contains(.,"Here are the companies")]/following-sibling::p/a')
# create request for every single company detail page from href
for sel_companie in sel_companies:
href = sel_companie.xpath('./#href').extract_first()
url = response.urljoin(href)
request = scrapy.Request(url, callback=self.parse_company_detail)
yield request
def parse_company_detail(self, response):
# On detail page create item
item = ProjectItem()
# get detail information with specific XPath statements
# e.g. title is the first paragraph
item['title'] = response.xpath('//div[#role="main"]/p[1]//text()').extract_first()
# e.g. family owned has a label we can select
item['owned'] = response.xpath('//div[#role="main"]/p[contains(.,"Family owned")]/text()').extract_first()
# find clever XPaths for other fields ...
# ...
# Finally: yield the item
yield item

Related

Get structured output with Scrapy

I'm just starting to use scrapy and this is one of my first few projects. I am trying to scrape some company metadata from https://www.baincapitalprivateequity.com/portfolio/ . I have figured out my selectors but I'm unable to structure the output. I'm currently getting everything in one cell but I want the output to be one row for each company. If someone could help with where I'm going wrong, it'll be really great.
import scrapy
from ..items import BainpeItem
class BainPeSpider(scrapy.Spider):
name = 'Bain-PE'
allowed_domains = ['baincapitalprivateequity.com']
start_urls = ['https://www.baincapitalprivateequity.com/portfolio/']
def parse(self, response):
items = BainpeItem()
all_cos = response.css('div.grid')
for i in all_cos:
company = i.css('ul li::text').extract()
about = i.css('div.companyDetail p').extract()
items['company'] = company
items['about'] = about
yield items
You can just yield each item in the for loop:
for i in all_cos:
item = BainpeItem()
company = i.css('ul li::text').extract()
about = i.css('div.companyDetail p').extract()
item['company'] = company
item['about'] = about
yield item
This way each item will arrive in the pipeline separately.

Beautiful Soup Pagination, find_all not finding text within next_page class. Need also to extract data from URLS

I've been working on this for a week and am determined to get this working!
My ultimate goal is to write a webscraper where you can insert the county name and the scraper will produce a csv file of information from mugshots - Name, Location, Eye Color, Weight, Hair Color and Height (it's a genetics project I am working on).
The site organization is primary site page --> state page --> county page -- 120 mugshots with name and url --> url with data I am ultimately after and next links to another set of 120.
I thought the best way to do this would be to write a scraper that will grab the URLs and Names from the table of 120 mugshots and then use pagination to grab all the URLs and names from the rest of the county (in some cases there are 10's of thousands). I can get the first 120, but my pagination doesn't work.. so Im ending up with a csv of 120 names and urls.
I closely followed this article which was very helpful
from bs4 import BeautifulSoup
import requests
import lxml
import pandas as pd
county_name = input('Please, enter a county name: /Arizona/Maricopa-County-AZ \n')
print(f'Searching {county_name}. Wait, please...')
base_url = 'https://www.mugshots.com'
search_url = f'https://mugshots.com/US-Counties/{county_name}/'
data = {'Name': [],'URL': []}
def export_table_and_print(data):
table = pd.DataFrame(data, columns=['Name', 'URL'])
table.index = table.index + 1
table.to_csv('mugshots.csv', index=False)
print('Scraping done. Here are the results:')
print(table)
def get_mugshot_attributes(mugshot):
name = mugshot.find('div', attrs={'class', 'label'})
url = mugshot.find('a', attrs={'class', 'image-preview'})
name=name.text
url=mugshot.get('href')
url = base_url + url
data['Name'].append(name)
data['URL'].append(url)
def parse_page(next_url):
page = requests.get(next_url)
if page.status_code == requests.codes.ok:
bs = BeautifulSoup(page.text, 'lxml')
list_all_mugshot = bs.find_all('a', attrs={'class', 'image-preview'})
for mugshot in list_all_mugshot:
get_mugshot_attributes(mugshot)
next_page_text = mugshot.find('a class' , attrs={'next page'})
if next_page_text == 'Next':
next_page_text=mugshot.get_text()
next_page_url=mugshot.get('href')
next_page_url=base_url+next_page_url
print(next_page_url)
parse_page(next_page_url)
else:
export_table_and_print(data)
parse_page(search_url)
Any ideas on how to get the pagination to work and also how to eventually get the data from the list of URLs I scrape?
I appreciate your help! I've been working in python for a few months now, but the BS4 and Scrapy stuff is so confusing for some reason.
Thank you so much community!
Anna
It seems you want to know the logic as to how you can get the content using populated urls derived from each of the page traversing next pages. This is how you can parse all the links from each page including next page and then use those links to get the content from their inner pages.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url = "https://mugshots.com/"
base = "https://mugshots.com"
def get_next_pages(link):
print("**"*20,"current page:",link)
res = requests.get(link)
soup = BeautifulSoup(res.text,"lxml")
for item in soup.select("[itemprop='name'] > a[href^='/Current-Events/']"):
yield from get_main_content(urljoin(base,item.get("href")))
next_page = soup.select_one(".pagination > a:contains('Next')")
if next_page:
next_page = urljoin(url,next_page.get("href"))
yield from get_next_pages(next_page)
def get_main_content(link):
res = requests.get(link)
soup = BeautifulSoup(res.text,"lxml")
item = soup.select_one("h1#item-title > span[itemprop='name']").text
yield item
if __name__ == '__main__':
for elem in get_next_pages(url):
print(elem)

When using scrapy shell, I get no data from response.xpath

I am trying to scrape a betting site. However, when I check for the retrieved data in scrapy shell, I receive nothing.
The xpath to what I need is: //*[#id="yui_3_5_0_1_1562259076537_31330"] and when I write in the shell this is what I get:
In [18]: response.xpath ( '//*[#id="yui_3_5_0_1_1562259076537_31330"]')
Out[18]: []
The output is [] but I expected to be something from which I could extract the href.
When I use the "inspect" tool from Chrome, while the site is still loading, this id is outlined in purple. Does this mean that the site is using JavaScipt? And if this is true, is this the reason why scrapy does not find the item and returns []?
i try scraping the site just using Scrapy and this is my result.
This the items.py file
import scrapy
class LifeMatchsItem(scrapy.Item):
Event = scrapy.Field() # Name of event
Match = scrapy.Field() # Teams1 vs Team2
Date = scrapy.Field() # Date of Match
This is my Spider code
import scrapy
from LifeMatchesProject.items import LifeMatchsItem
class LifeMatchesSpider(scrapy.Spider):
name = 'life_matches'
start_urls = ['http://www.betfair.com/sport/home#sscpl=ro/']
custom_settings = {'FEED_EXPORT_ENCODING': 'utf-8'}
def parse(self, response):
for event in response.xpath('//div[contains(#class,"events-title")]'):
for element in event.xpath('./following-sibling::ul[1]/li'):
item = LifeMatchsItem()
item['Event'] = event.xpath('./a/#title').get()
item['Match'] = element.xpath('.//div[contains(#class,"event-name-info")]/a/#data-event').get()
item['Date'] = element.xpath('normalize-space(.//div[contains(#class,"event-name-info")]/a//span[#class="date"]/text())').get()
yield item
And this is the result

How to scrape all of the data from the website?

My code only giving me 44 links data instead of 102. Can Someone say me why it is Extracting like that?I would appreciate your help.How can i extract it properly???
import scrapy
class ProjectItem(scrapy.Item):
title = scrapy.Field()
owned = scrapy.Field()
Revenue2014 = scrapy.Field()
Revenue2015 = scrapy.Field()
Website = scrapy.Field()
Rank = scrapy.Field()
Employees = scrapy.Field()
headquarters = scrapy.Field()
FoundedYear = scrapy.Field()
class ProjectSpider(scrapy.Spider):
name = "cin100"
allowed_domains = ['cincinnati.com']
start_urls = ['http://www.cincinnati.com/story/money/2016/11/26/see-which-companies-16-deloitte-100/94441104/']
def parse(self, response):
# get selector for all 100 companies
sel_companies = response.xpath('//p[contains(.,"click or tap here.")]/following-sibling::p/a')
# create request for every single company detail page from href
for sel_companie in sel_companies:
href = sel_companie.xpath('./#href').extract_first()
url = response.urljoin(href)
request = scrapy.Request(url, callback=self.parse_company_detail)
yield request
def parse_company_detail(self, response):
# On detail page create item
item = ProjectItem()
# get detail information with specific XPath statements
# e.g. title is the first paragraph
item['title'] = response.xpath('//div[#role="main"]/p[1]//text()').extract_first().rsplit('-')[1]
# e.g. family owned has a label we can select
item['owned'] = response.xpath('//div[#role="main"]/p[contains(.,"Family owned")]/text()').extract_first()
item['Revenue2014'] ='$'+response.xpath('//div[#role="main"]/p[contains(.,"2014")]/text()').extract_first().rsplit('$')[1]
item['Revenue2015'] ='$'+response.xpath('//div[#role="main"]/p[contains(.,"$")]/text()').extract_first().rsplit('$')[1]
item['Website'] = response.xpath('//div[#role="main"]/p/a[contains(.,"www.")]/#href').extract_first()
item['Rank'] = response.xpath('//div[#role="main"]/p[contains(.,"rank")]/text()').extract_first()
item['Employees'] = response.xpath('//div[#role="main"]/p[contains(.,"Employ")]/text()').extract_first()
item['headquarters'] = response.xpath('//div[#role="main"]/p[10]//text()').extract()
item['FoundedYear'] = response.xpath('//div[#role="main"]/p[contains(.,"founded")]/text()').extract()
# Finally: yield the item
yield item
Looking closer at the output of scrapy you'll find that starting after a few dozen of requests they get redirected like shown below:
DEBUG: Redirecting (302) to <GET http://www.cincinnati.com/get-access/?return=http%3A%2F%2Fwww.cincinnati.com%2Fstory%2Fmoney%2F2016%2F11%2F27%2Ffrischs-restaurants%2F94430718%2F> from <GET http://www.cincinnati.com/story/money/2016/11/27/frischs-restaurants/94430718/>
The page that gets requested says: We hope you have enjoyed your complimentary access.
So it looks like they offer only limited access to anonymous users. You probably need to register to their service to get full access to the data.
There are a few potential problems with your xpaths:
it's usually a bad idea to make xpaths look for text that's on a page. Text can change from one minute to the next. The layout and html structure is much more long lived.
using 'following-siblings' is also a last-resort xpath feature that is quite vulnerable to slight changes on the website.
What I would be doing instead:
# iterate all paragraphs within the article:
for para in response.xpath("//*[#itemprop='articleBody']/p"):
url = para.xpath("./a/#href").extract()
# ... etc
len( response.xpath("//*[#itemprop='articleBody']/p")) gives me the expected 102 by the way.
You might have to filter the URLs to remove non-company urls like the on labeled with "click or tap here"

How To Remove White Space in Scrapy Spider Data

I am writing my first spider in Scrapy and attempting to follow the documentation. I have implemented ItemLoaders. The spider extracts the data, but the data contains many line returns. I have tried many ways to remove them, but nothing seems to work. The replace_escape_chars utility is supposed to work, but I can't figure out how to use it with the ItemLoader. Also some people use (unicode.strip), but again, I can't seem to get it to work. Some people try to use these in items.py and others in the spider. How can I clean the data of these line returns (\r\n)? My items.py file only contains the item names and field(). The spider code is below:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from scrapy.utils.markup import replace_escape_chars
from ccpstore.items import Greenhouse
class GreenhouseSpider(BaseSpider):
name = "greenhouse"
allowed_domains = ["domain.com"]
start_urls = [
"http://www.domain.com",
]
def parse(self, response):
items = []
l = XPathItemLoader(item=Greenhouse(), response=response)
l.add_xpath('name', '//div[#class="product_name"]')
l.add_xpath('title', '//h1')
l.add_xpath('usage', '//li[#id="ctl18_ctl00_rptProductAttributes_ctl00_liItem"]')
l.add_xpath('repeat', '//li[#id="ctl18_ctl00_rptProductAttributes_ctl02_liItem"]')
l.add_xpath('direction', '//li[#id="ctl18_ctl00_rptProductAttributes_ctl03_liItem"]')
items.append(l.load_item())
return items
You can use the default_output_processor on the loader and also other processors on individual fields, see title:
from scrapy.spider import BaseSpider
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Compose, MapCompose
from w3lib.html import replace_escape_chars, remove_tags
from ccpstore.items import Greenhouse
class GreenhouseSpider(BaseSpider):
name = "greenhouse"
allowed_domains = ["domain.com"]
start_urls = ["http://www.domain.com"]
def parse(self, response):
l = XPathItemLoader(Greenhouse(), response=response)
l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
l.add_xpath('name', '//div[#class="product_name"]')
l.add_xpath('title', '//h1', Compose(remove_tags))
l.add_xpath('usage', '//li[#id="ctl18_ctl00_rptProductAttributes_ctl00_liItem"]')
l.add_xpath('repeat', '//li[#id="ctl18_ctl00_rptProductAttributes_ctl02_liItem"]')
l.add_xpath('direction', '//li[#id="ctl18_ctl00_rptProductAttributes_ctl03_liItem"]')
return l.load_item()
It turns out that there were also many blank spaces in the data, so combining the answer of Steven with some more research allowed the data to have all tags, line returns and duplicate spaces removed. The working code is below. Note the addition of text() on the loader lines which removes the tags and the split and join processors to remove spaces and line returns.
def parse(self, response):
items = []
l = XPathItemLoader(item=Greenhouse(), response=response)
l.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
l.default_output_processor = Join()
l.add_xpath('title', '//h1/text()')
l.add_xpath('usage', '//li[#id="ctl18_ctl00_rptProductAttributes_ctl00_liItem"]/text()')
l.add_xpath('repeat', '//li[#id="ctl18_ctl00_rptProductAttributes_ctl02_liItem"]/text()')
l.add_xpath('direction', '//li[#id="ctl18_ctl00_rptProductAttributes_ctl03_liItem"]/text()')
items.append(l.load_item())
return items

Resources