I want to go to all the pages of the Yelp website and extract data - web-scraping

I want to go to all the pages of the Yelp site but can't. This is the code:
# packages
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
import urllib
import os
import json
import datetime
import csv
# property scraper class
class Yelp(scrapy.Spider):
    # scraper name
    name = 'home business'

    base_url = 'https://www.yelp.com/search?'
    params = {
        'find_desc': 'Home Cleaning',
        'find_loc': 'North Dallas, Dallas, TX',
        #'start' : ''
    }
    page = 0
    current_page = 1

    # headers
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
    }

    #params['start'] = page
    try:
        os.remove('abx.csv')
    except OSError:
        pass

    # custom settings
    custom_settings = {
        'CONCURRENT_REQUEST_PER_DOMAIN': 2,
        'DOWNLOAD_DELAY': 1
    }

    # general crawler
    def start_requests(self):
        url = self.base_url + urllib.parse.urlencode(self.params)
        # initial HTTP request
        yield scrapy.Request(
            url=url,
            headers=self.headers,
            callback=self.parse_listing
        )

    def parse_listing(self, response):
        lists = response.css('h4[class="css-1l5lt1i"]')
        for link in lists:
            link = link.css('a::attr(href)').get()
            link = 'https://www.yelp.com/' + link
            #print('\n\nlink:',link,'\n\n')
            yield response.follow(link, headers=self.headers, callback=self.parse_cards)
            break

        try:
            #self.params['start'] = self.page
            try:
                total_pages = response.css('.text-align--center__09f24__1P1jK .css-e81eai::text').get()[5:7]
                print(total_pages)
                self.page += 10
                self.current_page += 1
            except Exception as e:
                total_pages = 1
                print('totl:', total_pages)

            print('PAGE %s | %s ' % (self.current_page, total_pages))
            if int(self.page/10) <= int(total_pages):
                self.log('\n\n %s | %s\n\n ' % (self.page/10, total_pages))
                next_page = response.url + '&start=' + str(self.page)
                yield response.follow(url=next_page, headers=self.headers, callback=self.parse_listing)
        except:
            print('only single page', self.current_page)

    def parse_cards(self, response):
        print('\nok\n')

# main driver
if __name__ == '__main__':
    # run scraper
    process = CrawlerProcess()
    process.crawl(Yelp)
    process.start()
    #Yelp.parse_cards(Yelp, '')
I also tried a try/except approach, but it didn't do the job.
The main problem is with the next page and the '&start=' param: I append '&start=' to response.url and increment the value by 10 each time, so the URL keeps accumulating parameters, like this
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=10&start=20&start=30'
and so on. I want only the value of start to increment, so the URL becomes start=10, then start=20, and so on, like this
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=20'
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=30'
and so on.

Just find the link to the next page and follow that
next_page = response.css("a.next-link::attr(href)").get()
if next_page:
    yield response.follow(next_page, callback=self.parse)
This is pretty similar to what is done in the Scrapy tutorial; have you followed that? Was there a reason you couldn't do it this way?
In the end your entire spider can become
from scrapy import Spider


class Yelp(Spider):
    # scraper name
    name = "home business"

    start_urls = [
        "https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX"
    ]

    def parse(self, response):
        for link in response.css("h4 > span > a"):
            yield response.follow(link, callback=self.parse_cards)

        next_page = response.css("a.next-link::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_cards(self, response):
        print("parse_cards", response.url)
I removed the start_requests stuff to keep it simple for this example (something you should probably try to do when asking questions)
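If you do want to keep paginating by the start offset instead of following the next link, the duplicated parameters come from appending '&start=' to response.url, which already contains the previous start value. One option is to rebuild the query string from the original params each time. A minimal sketch, reusing the base_url and params from the question (listing_url is just a hypothetical helper):
from urllib.parse import urlencode

base_url = 'https://www.yelp.com/search?'
params = {
    'find_desc': 'Home Cleaning',
    'find_loc': 'North Dallas, Dallas, TX',
}

def listing_url(page):
    # rebuild the full query string so 'start' is replaced, not appended
    query = dict(params, start=page)  # page is the offset: 0, 10, 20, ...
    return base_url + urlencode(query)

# listing_url(20) ->
# 'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=20'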

Related

How to scrape url links when the website takes us to a splash screen?

import requests
from bs4 import BeautifulSoup
import re
R = []
url = "https://ascscotties.com/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; ' \
'Intel Mac OS X 10.6; rv:16.0) Gecko/20100101 Firefox/16.0'}
reqs = requests.get(url, headers=headers)
soup = BeautifulSoup(reqs.text, 'html.parser')
links= soup.find_all('a',href=re.compile("roster"))
s=[url + link.get("href") for link in links]
for i in s:
    r = requests.get(i, allow_redirects=True, headers=headers)
    if r.status_code < 400:
        R.append(r.url)
Output
['https://ascscotties.com/sports/womens-basketball/roster',
'https://ascscotties.com/sports/womens-cross-country/roster',
'https://ascscotties.com/sports/womens-soccer/roster',
'https://ascscotties.com/sports/softball/roster',
'https://ascscotties.com/sports/womens-tennis/roster',
'https://ascscotties.com/sports/womens-volleyball/roster']
The code looks for roster links on the URL and gives the output above, but on a site like "https://auyellowjackets.com/" it fails because the URL takes us to a splash screen first. What can be done?
The site uses a cookie to indicate it has shown a splash screen before. So set it to get to the main page:
import re
import requests
from bs4 import BeautifulSoup
R = []
url = "https://auyellowjackets.com"
cookies = {"splash_2": "splash_2"} # <--- set cookie
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; "
"Intel Mac OS X 10.6; rv:16.0) Gecko/20100101 Firefox/16.0"
}
reqs = requests.get(url, headers=headers, cookies=cookies)
soup = BeautifulSoup(reqs.text, "html.parser")
links = soup.find_all("a", href=re.compile("roster"))
s = [url + link.get("href") for link in links]
for i in s:
    r = requests.get(i, allow_redirects=True, headers=headers)
    if r.status_code < 400:
        R.append(r.url)
print(*R, sep="\n")
Prints:
https://auyellowjackets.com/sports/mens-basketball/roster
https://auyellowjackets.com/sports/mens-cross-country/roster
https://auyellowjackets.com/sports/football/roster
https://auyellowjackets.com/sports/mens-track-and-field/roster
https://auyellowjackets.com/sports/mwrest/roster
https://auyellowjackets.com/sports/womens-basketball/roster
https://auyellowjackets.com/sports/womens-cross-country/roster
https://auyellowjackets.com/sports/womens-soccer/roster
https://auyellowjackets.com/sports/softball/roster
https://auyellowjackets.com/sports/womens-track-and-field/roster
https://auyellowjackets.com/sports/volleyball/roster

Python Playwright's async does not process all of the scraped pages

I am scraping and parsing JavaScript pages with Playwright.
There are about 100 URLs, but the process ends without completing all of them.
What could be the cause of this?
The code works so far.
Is the for loop in the wrong place?
I would appreciate it if you could tell me whether I am using async incorrectly.
I have changed it to the current code.
The following command is executed with Scrapy:
scrapy runspider kuti_info.py
import scrapy
import requests
from bs4 import BeautifulSoup
from time import sleep
from scrapy.selector import Selector
from playwright.sync_api import sync_playwright
import asyncio


class KutiSpider(scrapy.Spider):
    name = 'kuti'
    allowed_domains = ['xxxxxxx.jp']
    start_urls = ['https://xxxxxxx.jp/']

    def parse(self, response):
        urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
        yield response.follow(url=urls, callback=self.parse_area)

        # urls = response.xpath('//ul[@class="areaList"]')
        # for url in urls:
        #     yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)

    def parse_area(self, response):
        urls = response.xpath('//div[@class="salonName"]')
        for url in urls:
            yield response.follow(url=url.xpath('.//h3/a/@href').get(), callback=self.parse_shop)

        # next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
        # if next_page:
        #     yield response.follow(url=next_page, callback=self.parse_area)

    async def parse_shop(self, response):
        try:
            r = requests.get(response.url)
            soup = BeautifulSoup(r.text, 'html.parser')
            repo = soup.find('div', {'class': 'abbr uTxt'})
        except:
            pass

        urls = response.xpath('//div[@class="viewMore"]/a/@href').get()
        for url in [urls]:
            newurls = response.urljoin(url)  # href="/therapistlist.php?id=!!!!"
            yield response.follow(url=newurls, callback=self.parse_therapist)
            # yield SeleniumRequest(url=str(newurls), screenshot=True, callback=self.parse_therapist, wait_time=2)

        try:
            yield {
                'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
                'shop_url': response.xpath('//dd/a/@href').get(),
                'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
                'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
                'report': repo.text
            }
        except:
            pass

    async def parse_therapist(self, response):
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            page.goto(response.url)
            sleep(2)
            html = page.content()
            selector = Selector(text=html)
            idurls = selector.xpath('//li[@therapist_id]/a/@href').get()
            # browser.close()
            yield response.follow(url=idurls, callback=self.parse_thera_page)

    async def parse_thera_page(self, response):
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            print(response.url)
            page.goto(response.url)
            sleep(2)
            html = page.content()
            selector = Selector(text=html)
            print(selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))
            # try:
            #     r = requests.get(response.url)
            #     soup = BeautifulSoup(r.text, 'html.parser')
            #     repo = soup.find('div', {'class': 'txt'})
            # except:
            #     pass
            yield {
                'therapist_name': selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
                # 'report': repo.text
            }
I see .get() in some places, so it gets only the first item from a list - i.e. it gets the first therapist from a list of ~250 therapists. Maybe this is why you get fewer results.
I found that therapistlist.php?id=... uses JavaScript to read all the data as JSON from therapistlist.php?id=...&more (with &more at the end) and renders the page from it. This way I read the therapist list as JSON data without Playwright, so I get results much, much faster.
I get ~800 therapists in ~1 minute.
If you write the data to CSV then you may have another problem.
In CSV all items must have the same columns, and if Scrapy sees {'therapist_name': ...} with a column therapist_name that it doesn't have in the shop data, then it skips it, so you may get a file with only shops and no therapists. I added the field therapist_name to the shop data and now the CSV saves the therapists as well.
import scrapy
from time import sleep
from scrapy.selector import Selector


class KutiSpider(scrapy.Spider):
    name = 'kuti'
    allowed_domains = ['men-esthe.jp']
    start_urls = ['https://men-esthe.jp/']

    def parse(self, response):
        print('[parse] url:', response.url)

        urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
        print('[parse] len(urls):', len(urls), type(urls))

        yield response.follow(url=urls, callback=self.parse_area)

        # urls = response.xpath('//ul[@class="areaList"]')
        # for url in urls:
        #     yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)

    def parse_area(self, response):
        print('[parse_area] url:', response.url)

        urls = response.xpath('//div[@class="salonName"]')
        print('[parse_area] len(urls):', len(urls), type(urls))

        for url in urls:
            url = url.xpath('.//h3/a/@href').get()
            yield response.follow(url, callback=self.parse_shop)

        # next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
        # if next_page:
        #     yield response.follow(url=next_page, callback=self.parse_area)

    def parse_shop(self, response):
        print('[parse_shop] url:', response.url)

        urls = response.xpath('//div[@class="viewMore"]/a/@href')
        print('[parse_shop] len(urls):', len(urls), type(urls))

        for url in urls.getall():
            print('[parse_shop] url:', url)
            yield response.follow(url=url + '&more', callback=self.parse_therapist)

        yield {
            'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
            'shop_url': response.xpath('//dd/a/@href').get(),
            'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
            'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
            'report': response.css('div.abbr.uTxt').xpath('string(.)').get(),
            'therapist_name': "",
        }

    def parse_therapist(self, response):
        print('[parse_therapist] url:', response.url)

        data = response.json()
        for item in data:
            url = '/therapist.php?id=' + item['id']
            yield response.follow(url=url, callback=self.parse_thera_page)

    def parse_thera_page(self, response):
        print('[parse_thera_page] url:', response.url)

        print('now:', response.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))

        yield {
            'shop_name': '',
            'shop_url': '',
            'area': '',
            'report-therapi-name': '',
            'report': '',
            'therapist_name': response.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
        }

I have a Scrapy script, but I cannot scrape data and don't know why

I run the script, but I get None, even though the data is there on the URL.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
class GetSpider(scrapy.Spider):
    name = 'gets'
    start_urls = ['https://www.retailmenot.com/coupons/insurance?u=ZTF65B5PJZEU3JDF326WY2SXOQ']

    def parse(self, response):
        s = Selector(response)
        code = s.xpath("//button[contains(@class,'CopyCode')][1]/text()").get()
        yield {'code': code}
I expect 52YR, but I got None.
The easiest way to go about this is probably to load the JSON in the script tag as a Python dictionary and navigate through it to get to the codes.
The code below should get you started:
import scrapy
import json
import logging


class GetSpider(scrapy.Spider):
    name = 'gets'
    start_urls = ['https://www.retailmenot.com/coupons/insurance?u=ZTF65B5PJZEU3JDF326WY2SXOQ']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    custom_settings = {'ROBOTSTXT_OBEY': False}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url,
                                 callback=self.parse,
                                 headers=self.headers,
                                 dont_filter=True)

    def parse(self, response):
        script = response.xpath(
            '//script[contains(text(), "__NEXT_DATA__")]/text()'
        ).extract_first()
        dict_start_index = script.index('{')
        dict_end_index = script.index('};') + 1
        data = json.loads(script[dict_start_index:dict_end_index])

        coupon_data = data['props']['pageProps']['serverState']['apollo']['data']
        for key, value in coupon_data.items():
            try:
                code = value['code']
            except KeyError:
                logging.debug("no code found")
            else:
                yield {'code': code}
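If you want to run this spider on its own and save the codes to a file, a minimal sketch using CrawlerProcess should work (the output filename codes.json is just an example, and the FEEDS setting requires Scrapy 2.1+):
# hypothetical standalone runner for the GetSpider above
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    'FEEDS': {'codes.json': {'format': 'json'}},  # write scraped items to codes.json
})
process.crawl(GetSpider)
process.start()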

530 error when trying to open FTP directory

I want to use Scrapy to download files and navigate folders at ftp://ftp.co.palm-beach.fl.us/Building%20Permits/.
Here's my spider:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
class LatestPermitsSpider(scrapy.Spider):
    name = "latest_permits"
    allowed_domains = ["ftp.co.palm-beach.fl.us"]
    handle_httpstatus_list = [404]

    ftpUser = "the_username"
    ftpPW = "the_password"
    permitFilesDir = "ftp://ftp.co.palm-beach.fl.us/Building%20Permits/"

    def start_requests(self):
        yield Request(
            url=self.permitFilesDir,
            meta={
                "ftp_user": self.ftpUser,
                "ftp_password": self.ftpPW
            }
        )

    def parse(self, response):
        print response.body
When I run scrapy crawl latest_permits, I get this error:
ConnectionLost: ('FTP connection lost', <twisted.python.failure.Failure twisted.protocols.ftp.CommandFailed: ['530 Sorry, no ANONYMOUS access allowed.']>)
Why does this error come up even when I supply the correct username and password?
Look at the Scrapy source code below:
https://github.com/scrapy/scrapy/blob/master/scrapy/core/downloader/handlers/ftp.py
The issue is not with your username or password. The issue is that Scrapy only supports downloading files over FTP; it doesn't support listing directories, and the URL you are using is a directory URL.
There is a possible workaround: use the approach of a package named ftptree.
Add a handlers.py with the code below:
import json
from twisted.protocols.ftp import FTPFileListProtocol
from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
class FtpListingHandler(FTPDownloadHandler):
    def gotClient(self, client, request, filepath):
        self.client = client
        protocol = FTPFileListProtocol()
        return client.list(filepath, protocol).addCallbacks(
            callback=self._build_response, callbackArgs=(request, protocol),
            errback=self._failed, errbackArgs=(request,))

    def _build_response(self, result, request, protocol):
        self.result = result
        body = json.dumps(protocol.files)
        return Response(url=request.url, status=200, body=body)
And then in your settings.py use
DOWNLOAD_HANDLERS = {'ftp': 'crawlername.handlers.FtpListingHandler'}
A sample spider
import os
import json
from urlparse import urlparse
from scrapy import Spider
from scrapy.http.request import Request
from ftptree_crawler.items import FtpTreeLeaf
class AnonFtpRequest(Request):
    anon_meta = {'ftp_user': 'anonymous',
                 'ftp_password': 'laserson@cloudera.com'}

    def __init__(self, *args, **kwargs):
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        self.meta.update(self.anon_meta)


class FtpTreeSpider(Spider):
    name = 'ftptree'

    def __init__(self, config_file, *args, **kwargs):
        super(FtpTreeSpider, self).__init__(*args, **kwargs)
        with open(config_file, 'r') as ip:
            config = json.loads(ip.read())
        url = 'ftp://%s/%s' % (config['host'], config['root_path'])
        self.start_url = url
        self.site_id = config['id']

    def start_requests(self):
        yield AnonFtpRequest(self.start_url)

    def parse(self, response):
        url = urlparse(response.url)
        basepath = url.path
        files = json.loads(response.body)
        for f in files:
            if f['filetype'] == 'd':
                path = os.path.join(response.url, f['filename'])
                request = AnonFtpRequest(path)
                yield request
            if f['filetype'] == '-':
                path = os.path.join(basepath, f['filename'])
                result = FtpTreeLeaf(
                    filename=f['filename'], path=path, size=f['size'])
                yield result
Links to look at if you need further information
https://github.com/laserson/ftptree/blob/master/ftptree_crawler/
https://gearheart.io/blog/crawling-ftp-server-with-scrapy/

Webcrawler not working with HTTPS

I am having an issue with my web crawler. It can run through any regular old website like a charm, but when it runs into an https URL it doesn't seem to work.
This is the error I am getting when I try to run an https URL through my crawler: name 'htmltext' is not defined.
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import re
re.IGNORECASE = True
from urllib.parse import urlparse
#SourceUrl
url = "https://en.wikipedia.org/wiki/Main_Page"
urls = [url]
z = urlparse(urls[0])
TopLevel = z.scheme+'://'+z.netloc
visited =[url]
robotsUrl = TopLevel +'/robots.txt'
while len(urls) < 100:
    try:
        htmltext = urllib.request.urlopen(urls[0]).read()
        robots = urllib.request.urlopen(robotsUrl).read()
        disallowList = re.findall(b'Disallow\:\s*([a-zA-Z0-9\*\-\/\_\?\.\%\:\&]+)', robots)
    except:
        print (urls[0])
    sourceCode = BeautifulSoup(htmltext, "html.parser")
    urls.pop(0)
    print(len(urls))
    for link in sourceCode.findAll('a', href=True):
        if "http://" not in link['href']:
            link['href'] = urllib.parse.urljoin(url, link['href'])
        in_disallow = False
        for i in range(len(disallowList)):
            if (disallowList[i]).upper().decode() in link['href'].upper():
                in_disallow = True
                break
        if not in_disallow:
            if link['href'] not in visited:
                urls.append(link['href'])
                visited.append(link['href'])
print (visited)
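A note on the NameError itself: the bare except in the loop only prints the URL and keeps going, so when urllib.request.urlopen() raises for an https URL (for example because of a certificate or redirect problem), htmltext is never assigned and the very next line fails with name 'htmltext' is not defined. A minimal sketch of one way to surface the real error and skip the URL that failed, leaving the rest of the script as it is:
while len(urls) < 100:
    try:
        htmltext = urllib.request.urlopen(urls[0]).read()
        robots = urllib.request.urlopen(robotsUrl).read()
        disallowList = re.findall(b'Disallow\:\s*([a-zA-Z0-9\*\-\/\_\?\.\%\:\&]+)', robots)
    except Exception as e:
        print(urls[0], e)  # show why the request actually failed
        urls.pop(0)        # drop the URL that could not be fetched
        continue           # skip parsing, since htmltext was never assigned
    sourceCode = BeautifulSoup(htmltext, "html.parser")
    # ... rest of the loop unchanged ...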
