Scrapy fails to scrape all the items - CSS

There are more than 500 items, but the Scrapy spider only manages to scrape 5 of them.
from urllib import response
import scrapy


class Elo1Spider(scrapy.Spider):
    name = 'elo1'
    allowed_domains = ['exportleftovers.com']
    start_urls = ['http://exportleftovers.com/']

    def parse(self, response):
        for products in response.css('div.product-wrap'):
            yield {
                'name': products.css('a.product-thumbnail__title::text').get(),
                'price': products.css('span.money::text').get().strip(),
            }
        next_page = response.css('a.pagination-next').attrib['href']
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

Related

Python Playwright's async does not process all of the scraped pages

I am scraping and parsing JavaScript pages with Playwright.
There are about 100 URLs, but the process ends without completing all of them.
What could be the cause of this?
The code works so far.
Is the for loop in the wrong place?
I would appreciate it if you could tell me whether I am using async incorrectly.
I have changed it to the current code.
The spider is run with the following command:
scrapy runspider kuti_info.py
import scrapy
import requests
from bs4 import BeautifulSoup
from time import sleep
from scrapy.selector import Selector
from playwright.sync_api import sync_playwright
import asyncio


class KutiSpider(scrapy.Spider):
    name = 'kuti'
    allowed_domains = ['xxxxxxx.jp']
    start_urls = ['https://xxxxxxx.jp/']

    def parse(self, response):
        urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
        yield response.follow(url=urls, callback=self.parse_area)
        # urls = response.xpath('//ul[@class="areaList"]')
        # for url in urls:
        #     yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)

    def parse_area(self, response):
        urls = response.xpath('//div[@class="salonName"]')
        for url in urls:
            yield response.follow(url=url.xpath('.//h3/a/@href').get(), callback=self.parse_shop)
        # next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
        # if next_page:
        #     yield response.follow(url=next_page, callback=self.parse_area)

    async def parse_shop(self, response):
        try:
            r = requests.get(response.url)
            soup = BeautifulSoup(r.text, 'html.parser')
            repo = soup.find('div', {'class': 'abbr uTxt'})
        except:
            pass

        urls = response.xpath('//div[@class="viewMore"]/a/@href').get()
        for url in [urls]:
            newurls = response.urljoin(url)  # href="/therapistlist.php?id=!!!!"
            yield response.follow(url=newurls, callback=self.parse_therapist)
            # yield SeleniumRequest(url=str(newurls), screenshot=True, callback=self.parse_therapist, wait_time=2)

        try:
            yield {
                'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
                'shop_url': response.xpath('//dd/a/@href').get(),
                'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
                'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
                'report': repo.text
            }
        except:
            pass

    async def parse_therapist(self, response):
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            page.goto(response.url)
            sleep(2)
            html = page.content()
            selector = Selector(text=html)
            idurls = selector.xpath('//li[@therapist_id]/a/@href').get()
            # browser.close()
            yield response.follow(url=idurls, callback=self.parse_thera_page)

    async def parse_thera_page(self, response):
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            print(response.url)
            page.goto(response.url)
            sleep(2)
            html = page.content()
            selector = Selector(text=html)
            print(selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))
            # try:
            #     r = requests.get(response.url)
            #     soup = BeautifulSoup(r.text, 'html.parser')
            #     repo = soup.find('div', {'class': 'txt'})
            # except:
            #     pass
            yield {
                'therapist_name': selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
                # 'report': repo.text
            }
I see .get() in several places, and .get() returns only the first item from a list - e.g. it gets the first therapist out of a list of ~250 therapists. That may be why you get fewer results.
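For illustration, a minimal snippet showing the difference between .get() and .getall() (the HTML here is made up):

    from scrapy.selector import Selector

    # Two links in a tiny, made-up document.
    sel = Selector(text='<ul><li><a href="/t1">A</a></li><li><a href="/t2">B</a></li></ul>')

    print(sel.xpath('//li/a/@href').get())     # '/t1'           - first match only
    print(sel.xpath('//li/a/@href').getall())  # ['/t1', '/t2']  - every match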
I found that therapistlist.php?id=... uses JavaScript to read all of its data as JSON from therapistlist.php?id=...&more (with &more at the end) and renders the page from that. This way I read the therapist list as JSON data without Playwright, so I get the results much, much faster.
I get ~800 therapists in about 1 minute.
If you write the data to CSV you may have another problem.
In a CSV all items must have the same columns - and if Scrapy sees {'therapist_name': ...} with a column therapist_name that it doesn't have in the shop data, then it skips it, and you may get a file with only shops and no therapists. I added the field therapist_name to the shop data, and now the CSV also saves the therapists.
import scrapy
from time import sleep
from scrapy.selector import Selector


class KutiSpider(scrapy.Spider):
    name = 'kuti'
    allowed_domains = ['men-esthe.jp']
    start_urls = ['https://men-esthe.jp/']

    def parse(self, response):
        print('[parse] url:', response.url)
        urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
        print('[parse] len(urls):', len(urls), type(urls))
        yield response.follow(url=urls, callback=self.parse_area)
        # urls = response.xpath('//ul[@class="areaList"]')
        # for url in urls:
        #     yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)

    def parse_area(self, response):
        print('[parse_area] url:', response.url)
        urls = response.xpath('//div[@class="salonName"]')
        print('[parse_area] len(urls):', len(urls), type(urls))
        for url in urls:
            url = url.xpath('.//h3/a/@href').get()
            yield response.follow(url, callback=self.parse_shop)
        # next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
        # if next_page:
        #     yield response.follow(url=next_page, callback=self.parse_area)

    def parse_shop(self, response):
        print('[parse_shop] url:', response.url)
        urls = response.xpath('//div[@class="viewMore"]/a/@href')
        print('[parse_shop] len(urls):', len(urls), type(urls))
        for url in urls.getall():
            print('[parse_shop] url:', url)
            yield response.follow(url=url + '&more', callback=self.parse_therapist)

        yield {
            'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
            'shop_url': response.xpath('//dd/a/@href').get(),
            'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
            'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
            'report': response.css('div.abbr.uTxt::text').get(),
            'therapist_name': "",
        }

    def parse_therapist(self, response):
        print('[parse_therapist] url:', response.url)
        data = response.json()
        for item in data:
            url = '/therapist.php?id=' + item['id']
            yield response.follow(url=url, callback=self.parse_thera_page)

    def parse_thera_page(self, response):
        print('[parse_thera_page] url:', response.url)
        print('now:', response.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))
        yield {
            'shop_name': '',
            'shop_url': '',
            'area': '',
            'report-therapi-name': '',
            'report': '',
            'therapist_name': response.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
        }
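A related option for the CSV problem: instead of padding every item with empty fields, the exported columns can also be pinned explicitly with Scrapy's FEED_EXPORT_FIELDS setting. A minimal sketch; the field list simply mirrors the keys used above:

    # settings.py (or custom_settings on the spider)
    # Fixes the CSV column set, so items that only carry some of the keys still get exported.
    FEED_EXPORT_FIELDS = [
        'shop_name',
        'shop_url',
        'area',
        'report-therapi-name',
        'report',
        'therapist_name',
    ]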

I want to go to all the pages of the Yelp website and extract data from them

I want to go to all the pages of the Yelp site but can't. This is the code:
# packages
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
import urllib
import os
import json
import datetime
import csv


# property scraper class
class Yelp(scrapy.Spider):
    # scraper name
    name = 'home business'

    base_url = 'https://www.yelp.com/search?'
    params = {
        'find_desc': 'Home Cleaning',
        'find_loc': 'North Dallas, Dallas, TX',
        # 'start': ''
    }
    page = 0
    current_page = 1

    # headers
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
    }

    # params['start'] = page
    try:
        os.remove('abx.csv')
    except OSError:
        pass

    # custom settings
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
        'DOWNLOAD_DELAY': 1
    }

    # general crawler
    def start_requests(self):
        url = self.base_url + urllib.parse.urlencode(self.params)
        # initial HTTP request
        yield scrapy.Request(
            url=url,
            headers=self.headers,
            callback=self.parse_listing
        )

    def parse_listing(self, response):
        lists = response.css('h4[class="css-1l5lt1i"]')
        for link in lists:
            link = link.css('a::attr(href)').get()
            link = 'https://www.yelp.com/' + link
            # print('\n\nlink:', link, '\n\n')
            yield response.follow(link, headers=self.headers, callback=self.parse_cards)
            break

        try:
            # self.params['start'] = self.page
            try:
                total_pages = response.css('.text-align--center__09f24__1P1jK .css-e81eai::text').get()[5:7]
                print(total_pages)
                self.page += 10
                self.current_page += 1
            except Exception as e:
                total_pages = 1
                print('totl:', total_pages)

            print('PAGE %s | %s ' % (self.current_page, total_pages))

            if int(self.page / 10) <= int(total_pages):
                self.log('\n\n %s | %s\n\n ' % (self.page / 10, total_pages))
                next_page = response.url + '&start=' + str(self.page)
                yield response.follow(url=next_page, headers=self.headers, callback=self.parse_listing)
        except:
            print('only single page', self.current_page)

    def parse_cards(self, response):
        print('\nok\n')


# main driver
if __name__ == '__main__':
    # run scraper
    process = CrawlerProcess()
    process.crawl(Yelp)
    process.start()
    # Yelp.parse_cards(Yelp, '')
I applied the try/except approach as well but couldn't get it to work.
The main problem is the next-page URL with the '&start=' parameter: if I increment start by 10 each time, the URL keeps accumulating parameters, like this
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=10&start=20&start=30'
and so on. I only want the start parameter itself to increment, first start=10, then start=20, and so on, like this:
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=20'
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=30'
and so on.
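One way to avoid the accumulating parameter is to rebuild the query string from the params dict on every page instead of appending to response.url. A sketch, assuming the base_url/params attributes from the spider above (the helper name is made up):

    import urllib.parse

    def next_listing_url(base_url, params, start):
        # Copy so the shared params dict is not mutated, then overwrite 'start'
        # instead of appending another '&start=...' to the previous URL.
        q = dict(params)
        q['start'] = start
        return base_url + urllib.parse.urlencode(q)

    # next_listing_url('https://www.yelp.com/search?',
    #                  {'find_desc': 'Home Cleaning', 'find_loc': 'North Dallas, Dallas, TX'}, 20)
    # -> 'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=20'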
Just find the link to the next page and follow that:
next_page = response.css("a.next-link::attr(href)").get()
if next_page:
    yield response.follow(next_page, callback=self.parse)
This is pretty similar to what is done in the scrapy tutorial, have you followed that? Was there a reason you couldn't do it this way?
In the end your entire spider can become:
from scrapy import Spider


class Yelp(Spider):
    # scraper name
    name = "home business"

    start_urls = [
        "https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX"
    ]

    def parse(self, response):
        for link in response.css("h4 > span > a"):
            yield response.follow(link, callback=self.parse_cards)

        next_page = response.css("a.next-link::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_cards(self, response):
        print("parse_cards", response.url)
I removed the start_requests stuff to keep it simple for this example (something you should probably try to do when asking questions)
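If the spider lives in a standalone file, it can be run the same way as the earlier examples (the file name here is just illustrative):
scrapy runspider yelp_spider.py -o results.csv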

Python import error and can't find some files

I have been getting errors and I do not know how to go about fixing them. It's telling me I have an import error:
from ..items import QuotetutorialItem
ImportError: attempted relative import with no known parent package.
import scrapy
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser
from ..items import QuotetutorialItem


class Quotespider(scrapy.Spider):
    name = 'quotes'
    start_urls = [
        'http://quotes.toscrape.com/login'
    ]

    def parse(self, response):
        token = response.css('form input::attr(value)').extract_first()
        return FormRequest.from_response(response, formdata={
            'csrf_token': token,
            'username': 'abc',
            'password': '123',
        }, callback=self.start_scraping)

    def start_scraping(self, response):
        open_in_browser(response)
        items = QuotetutorialItem()
        all_div_quotes = response.css('div.quote')
        for quotes in all_div_quotes:
            title = quotes.css('span.text::text').extract()
            author = quotes.css('.author::text').extract()
            tag = quotes.css('.tag::text').extract()

            items['title'] = title
            items['author'] = author
            items['tag'] = tag

            yield items

Scrapy __init__ returns None

I have a Scrapy crawling script.
class QuotesSpider(scrapy.Spider):
    name = 'quotes'

    def __init__(self, *args, **kwargs):
        super(QuotesSpider, self).__init__(*args, **kwargs)
        self.cat = [kwargs.get('cat')]
        print(self.cat)

    def start_requests(self):
        # print(self.params)
        urls = ['https://google.com/html/?q=a%v%c']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)
On the command line:
scrapy crawl quotes -a cat="avc"
When I run the command, it prints "None".
How can I access the value "avc" passed through the command line in the program?
There are things missing in your code. Check out the sample code below:
class QuotesSpider(scrapy.Spider):
    name = 'quotes'

    def __init__(self, *args, **kwargs):
        super(QuotesSpider, self).__init__(*args, **kwargs)
        self.cat = kwargs.get('cat')
        print(self.cat)

    def start_requests(self):
        # print(self.params)
        urls = [f"https://www.google.com/search?q={self.cat}"]
        # urls = ['https://google.com/html/?q=a%v%c']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        print(response, "Response <---")
The requested Google URL was wrong, so I have replaced it with a new one.
<200 https://www.google.com/search?q=avc> Response <---
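For reference, arguments passed with scrapy crawl -a are also set as attributes on the spider by the base Spider.__init__, so the custom constructor is optional. A minimal sketch:

    import scrapy

    class QuotesSpider(scrapy.Spider):
        name = 'quotes'

        def start_requests(self):
            # 'cat' comes from: scrapy crawl quotes -a cat="avc"
            cat = getattr(self, 'cat', None)  # default used if -a cat=... was omitted
            yield scrapy.Request(url=f"https://www.google.com/search?q={cat}", callback=self.parse)

        def parse(self, response):
            print(response, "Response <---")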

530 error when trying to open FTP directory

I want to use Scrapy to download files and navigate folders at ftp://ftp.co.palm-beach.fl.us/Building%20Permits/.
Here's my spider:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request


class LatestPermitsSpider(scrapy.Spider):
    name = "latest_permits"
    allowed_domains = ["ftp.co.palm-beach.fl.us"]
    handle_httpstatus_list = [404]

    ftpUser = "the_username"
    ftpPW = "the_password"
    permitFilesDir = "ftp://ftp.co.palm-beach.fl.us/Building%20Permits/"

    def start_requests(self):
        yield Request(
            url=self.permitFilesDir,
            meta={
                "ftp_user": self.ftpUser,
                "ftp_password": self.ftpPW
            }
        )

    def parse(self, response):
        print response.body
When I run scrapy crawl latest_permits, I get this error:
ConnectionLost: ('FTP connection lost', <twisted.python.failure.Failure twisted.protocols.ftp.CommandFailed: ['530 Sorry, no ANONYMOUS access allowed.']>)
Why does this error come up even when I supply the correct username and password?
Look at the source code of Scrapy's FTP download handler:
https://github.com/scrapy/scrapy/blob/master/scrapy/core/downloader/handlers/ftp.py
The issue is not with your username or password. The issue is that Scrapy only supports downloading files over FTP; it does not support listing directories, and the URL you are using is a directory URL.
There is a possible workaround that uses the ftptree package.
Add a handlers.py with the code below:
import json
from twisted.protocols.ftp import FTPFileListProtocol

from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler


class FtpListingHandler(FTPDownloadHandler):
    def gotClient(self, client, request, filepath):
        self.client = client
        protocol = FTPFileListProtocol()
        return client.list(filepath, protocol).addCallbacks(
            callback=self._build_response, callbackArgs=(request, protocol),
            errback=self._failed, errbackArgs=(request,))

    def _build_response(self, result, request, protocol):
        self.result = result
        body = json.dumps(protocol.files)
        return Response(url=request.url, status=200, body=body)
And then in your settings.py use:
DOWNLOAD_HANDLERS = {'ftp': 'crawlername.handlers.FtpListingHandler'}
A sample spider:
import os
import json
from urlparse import urlparse

from scrapy import Spider
from scrapy.http.request import Request

from ftptree_crawler.items import FtpTreeLeaf


class AnonFtpRequest(Request):
    anon_meta = {'ftp_user': 'anonymous',
                 'ftp_password': 'laserson@cloudera.com'}

    def __init__(self, *args, **kwargs):
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        self.meta.update(self.anon_meta)


class FtpTreeSpider(Spider):
    name = 'ftptree'

    def __init__(self, config_file, *args, **kwargs):
        super(FtpTreeSpider, self).__init__(*args, **kwargs)
        with open(config_file, 'r') as ip:
            config = json.loads(ip.read())
        url = 'ftp://%s/%s' % (config['host'], config['root_path'])
        self.start_url = url
        self.site_id = config['id']

    def start_requests(self):
        yield AnonFtpRequest(self.start_url)

    def parse(self, response):
        url = urlparse(response.url)
        basepath = url.path
        files = json.loads(response.body)
        for f in files:
            if f['filetype'] == 'd':
                path = os.path.join(response.url, f['filename'])
                request = AnonFtpRequest(path)
                yield request
            if f['filetype'] == '-':
                path = os.path.join(basepath, f['filename'])
                result = FtpTreeLeaf(
                    filename=f['filename'], path=path, size=f['size'])
                yield result
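The sample spider takes the path to a small JSON config file as a spider argument, so a run would look roughly like this (the file name and values are illustrative; the keys match what __init__ reads above):
config.json:
{"id": 1, "host": "ftp.co.palm-beach.fl.us", "root_path": "Building%20Permits"}
scrapy crawl ftptree -a config_file=config.json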
Links to look at if you need further information
https://github.com/laserson/ftptree/blob/master/ftptree_crawler/
https://gearheart.io/blog/crawling-ftp-server-with-scrapy/
