I have a Scrapy crawling script.
class QuotesSpider(scrapy.Spider):
    name = 'quotes'

    def __init__(self, *args, **kwargs):
        super(QuotesSpider, self).__init__(*args, **kwargs)
        self.cat = [kwargs.get('cat')]
        print(self.cat)

    def start_requests(self):
        # print(self.params)
        urls = ['https://google.com/html/?q=a%v%c']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)
On the command line:
scrapy crawl quotes -a cat="avc"
When I run the command, it prints "None".
How can I access the value "avc" passed through the command line in the program?
There are a few things missing in your code. Check out the sample code below:
class QuotesSpider(scrapy.Spider):
    name = 'quotes'

    def __init__(self, *args, **kwargs):
        super(QuotesSpider, self).__init__(*args, **kwargs)
        self.cat = kwargs.get('cat')
        print(self.cat)

    def start_requests(self):
        # print(self.params)
        urls = [f"https://www.google.com/search?q={self.cat}"]
        # urls = ['https://google.com/html/?q=a%v%c']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        print(response, "Response <---")
The requested Google URL was also wrong; I replaced it with a working one. Running the spider now prints:
<200 https://www.google.com/search?q=avc> Response <---
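A side note (my addition, not part of the original answer): Scrapy's base Spider.__init__ already copies every -a argument onto the spider as an attribute, so a minimal sketch like the one below should also work without overriding __init__ at all. The getattr default is my own addition, in case the argument is omitted.

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'

    def start_requests(self):
        # scrapy crawl quotes -a cat="avc" sets self.cat = "avc" automatically
        cat = getattr(self, 'cat', '')  # fall back to '' if -a cat=... is not given
        yield scrapy.Request(
            url=f"https://www.google.com/search?q={cat}",
            callback=self.parse,
        )

    def parse(self, response):
        print(response, "Response <---")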
Related question:
There are more than 500 items, but my Scrapy spider only manages to scrape 5 items.
import scrapy

class Elo1Spider(scrapy.Spider):
    name = 'elo1'
    allowed_domains = ['exportleftovers.com']
    start_urls = ['http://exportleftovers.com/']

    def parse(self, response):
        for products in response.css('div.product-wrap'):
            yield {
                'name': products.css('a.product-thumbnail__title::text').get(),
                'price': products.css('span.money::text').get().strip(),
            }
        next_page = response.css('a.pagination-next').attrib['href']
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
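One observation on this related question (my note, there is no answer for it in this thread): response.css('a.pagination-next').attrib['href'] raises a KeyError when the next link is missing instead of returning None, so the is not None check never triggers. A safer pagination sketch using ::attr(href):

# ::attr(href) with .get() returns None when there is no next page,
# so the None check below actually works
next_page = response.css('a.pagination-next::attr(href)').get()
if next_page is not None:
    yield response.follow(next_page, callback=self.parse)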
Scraping and parsing JavaScript pages with Playwright.
There are about 100 URLs, but the process ends without completing all of them.
What could be the cause of this?
The code is working so far.
Is the for loop in the wrong place?
I would appreciate it if you could tell me whether I am using async incorrectly.
I have changed it to the current code below.
The spider is run with the following Scrapy command:
scrapy runspider kuti_info.py
import scrapy
import requests
from bs4 import BeautifulSoup
from time import sleep
from scrapy.selector import Selector
from playwright.sync_api import sync_playwright
import asyncio

class KutiSpider(scrapy.Spider):
    name = 'kuti'
    allowed_domains = ['xxxxxxx.jp']
    start_urls = ['https://xxxxxxx.jp/']

    def parse(self, response):
        urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
        yield response.follow(url=urls, callback=self.parse_area)
        # urls = response.xpath('//ul[@class="areaList"]')
        # for url in urls:
        #     yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)

    def parse_area(self, response):
        urls = response.xpath('//div[@class="salonName"]')
        for url in urls:
            yield response.follow(url=url.xpath('.//h3/a/@href').get(), callback=self.parse_shop)
        # next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
        # if next_page:
        #     yield response.follow(url=next_page, callback=self.parse_area)

    async def parse_shop(self, response):
        try:
            r = requests.get(response.url)
            soup = BeautifulSoup(r.text, 'html.parser')
            repo = soup.find('div', {'class': 'abbr uTxt'})
        except:
            pass
        urls = response.xpath('//div[@class="viewMore"]/a/@href').get()
        for url in [urls]:
            newurls = response.urljoin(url)  # href="/therapistlist.php?id=!!!!"
            yield response.follow(url=newurls, callback=self.parse_therapist)
            # yield SeleniumRequest(url=str(newurls), screenshot=True, callback=self.parse_therapist, wait_time=2)
        try:
            yield {
                'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
                'shop_url': response.xpath('//dd/a/@href').get(),
                'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
                'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
                'report': repo.text
            }
        except:
            pass

    async def parse_therapist(self, response):
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            page.goto(response.url)
            sleep(2)
            html = page.content()
            selector = Selector(text=html)
            idurls = selector.xpath('//li[@therapist_id]/a/@href').get()
            # browser.close()
            yield response.follow(url=idurls, callback=self.parse_thera_page)

    async def parse_thera_page(self, response):
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            print(response.url)
            page.goto(response.url)
            sleep(2)
            html = page.content()
            selector = Selector(text=html)
            print(selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))
            # try:
            #     r = requests.get(response.url)
            #     soup = BeautifulSoup(r.text, 'html.parser')
            #     repo = soup.find('div', {'class': 'txt'})
            # except:
            #     pass
            yield {
                'therapist_name': selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
                # 'report': repo.text
            }
I see .get() in some places, so it gets only the first item from a list - i.e. it gets the first therapist out of ~250 therapists. That may be why you get fewer results.
I found that therapistlist.php?id=... uses JavaScript to read all the data as JSON from therapistlist.php?id=...&more (with &more at the end) and then renders the page. This way I read the therapist list as JSON data without Playwright, so I get results much, much faster.
I get ~800 therapists in ~1 minute.
If you write the data to CSV, you may have another problem.
In CSV all items must have the same columns - and if Scrapy sees {'therapist_name': ...} with a therapist_name column that the shop data doesn't have, it skips it - so you may get a file with only shops and no therapists. I added the field therapist_name to the shop data, and now the CSV also saves the therapists.
import scrapy
from time import sleep
from scrapy.selector import Selector

class KutiSpider(scrapy.Spider):
    name = 'kuti'
    allowed_domains = ['men-esthe.jp']
    start_urls = ['https://men-esthe.jp/']

    def parse(self, response):
        print('[parse] url:', response.url)
        urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
        print('[parse] len(urls):', len(urls), type(urls))
        yield response.follow(url=urls, callback=self.parse_area)
        # urls = response.xpath('//ul[@class="areaList"]')
        # for url in urls:
        #     yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)

    def parse_area(self, response):
        print('[parse_area] url:', response.url)
        urls = response.xpath('//div[@class="salonName"]')
        print('[parse_area] len(urls):', len(urls), type(urls))
        for url in urls:
            url = url.xpath('.//h3/a/@href').get()
            yield response.follow(url, callback=self.parse_shop)
        # next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
        # if next_page:
        #     yield response.follow(url=next_page, callback=self.parse_area)

    def parse_shop(self, response):
        print('[parse_shop] url:', response.url)
        urls = response.xpath('//div[@class="viewMore"]/a/@href')
        print('[parse_shop] len(urls):', len(urls), type(urls))
        for url in urls.getall():
            print('[parse_shop] url:', url)
            yield response.follow(url=url + '&more', callback=self.parse_therapist)
        yield {
            'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
            'shop_url': response.xpath('//dd/a/@href').get(),
            'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
            'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
            'report': response.css('div.abbr.uTxt::text').get(),  # a SelectorList has no .text attribute, so use ::text with .get()
            'therapist_name': "",
        }

    def parse_therapist(self, response):
        print('[parse_therapist] url:', response.url)
        data = response.json()
        for item in data:
            url = '/therapist.php?id=' + item['id']
            yield response.follow(url=url, callback=self.parse_thera_page)

    def parse_thera_page(self, response):
        print('[parse_thera_page] url:', response.url)
        print('now:', response.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))
        yield {
            'shop_name': '',
            'shop_url': '',
            'area': '',
            'report-therapi-name': '',
            'report': '',
            'therapist_name': response.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
        }
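A note on running this (my addition, not part of the original answer): the same scrapy runspider command from the question works for the revised spider, and -o writes the items to a feed file. The file name output.csv is just an example.

# all items now share the same set of keys, so shops and therapists
# end up as rows of one CSV file
scrapy runspider kuti_info.py -o output.csv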
I have a simple, linear DAG (created using Airflow 2.0) with two tasks. I have a custom operator for each task, each extending BaseOperator. Following is the code for the DAG and the operators:
class Operator1(BaseOperator):
    @apply_defaults
    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)

    def execute(self, context):
        ...
        logging.info('First task')
        context['task_instance'].xcom_push(key="payload", value=data)
        return data

class Operator2(BaseOperator):
    @apply_defaults
    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)

    def execute(self, context):
        ...
        logging.info("context is ", context)
        parameters = context['task_instance'].xcom_pull(key="payload", value=data)

with DAG('dag_1', default_args=DEFAULT_ARGS, schedule_interval=None) as dag:
    TASK_1 = Operator1(
        task_id='task_1',
        do_xcom_push=True)
    TASK_2 = Operator2(
        task_id='task_2',
        do_xcom_push=True)

    TASK_1 >> TASK_2
When I run the DAG, I find that the context used for getting XCom values is empty. I have searched a lot of answers on Stack Overflow and tried the approaches mentioned in them, but they didn't work.
I would really appreciate a hint on the issue - how do I push and pull XCom values in custom operators?
I took your code and ran it. The first problem was that start_date wasn't defined, so it ended up in an exception:
Exception has occurred: AirflowException (note: full exception trace is shown but execution is paused at: _run_module_as_main)
Task is missing the start_date parameter
Also, in the Operator1 class, the data variable is not defined. I guess you may have missed these when you made the code example.
Other than that the code worked, but I think you should consider passing the task_ids parameter when doing the xcom_pull operation.
From TaskInstance xcom_pull method description:
:param task_ids: Only XComs from tasks with matching ids will be
pulled. Can pass None to remove the filter.
Here is the code of a working example; note that I use two equivalent methods to perform the XCom operations:
from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.utils.decorators import apply_defaults
from airflow.models import BaseOperator

class Operator1(BaseOperator):
    @apply_defaults
    def __init__(self, *args, **kwargs) -> None:
        super(Operator1, self).__init__(*args, **kwargs)

    def execute(self, context):
        print('First task')
        data = "valuable_data"
        more_data = "more_valueable_data"
        context['task_instance'].xcom_push(key="payload", value=data)
        self.xcom_push(context, "more_data", more_data)
        return data

class Operator2(BaseOperator):
    @apply_defaults
    def __init__(self, *args, **kwargs) -> None:
        super(Operator2, self).__init__(*args, **kwargs)

    def execute(self, context):
        # print(f"context is {context}")
        data = context['task_instance'].xcom_pull(
            "task_1",
            key="payload")
        more_data = self.xcom_pull(context, "task_1", key="more_data")
        print(f"Obtained data: {data}")
        print(f"Obtained more_data: {more_data}")

with DAG('dag_1',
         default_args={'owner': 'airflow'},
         start_date=days_ago(1),
         catchup=False,
         schedule_interval=None) as dag:

    TASK_1 = Operator1(
        task_id='task_1'
    )
    TASK_2 = Operator2(
        task_id='task_2'
    )

    TASK_1 >> TASK_2
Log from Task_2:
[2021-06-15 12:55:01,206] {taskinstance.py:1255} INFO - Exporting the following env vars:
AIRFLOW_CTX_DAG_OWNER=airflow
AIRFLOW_CTX_DAG_ID=dag_1
AIRFLOW_CTX_TASK_ID=task_2
AIRFLOW_CTX_EXECUTION_DATE=2021-06-14T00:00:00+00:00
AIRFLOW_CTX_DAG_RUN_ID=backfill__2021-06-14T00:00:00+00:00
Obtained data: valuable_data
Obtained more_data: more_valueable_data
[2021-06-15 12:55:01,227] {taskinstance.py:1159} INFO - Marking task as SUCCESS. dag_id=dag_1, task_id=task_2, execution_date=20210614T000000, start_date=20210615T120402, end_date=20210615T125501
Side notes: I changed the __init__ method so that it accepts *args as well. I'm using print, but it could be done with the Airflow logger as self.log.info('msg').
Let me know if that worked for you!
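One extra detail (my addition, not part of the original answer): because Operator1.execute returns data, Airflow also pushes that value automatically under the default key return_value, so it can be pulled in Operator2.execute without any explicit xcom_push:

# inside Operator2.execute, reusing the context from the example above:
# pulling without a key reads the 'return_value' XCom that Airflow pushes
# automatically when execute() returns a value
returned = context['task_instance'].xcom_pull(task_ids="task_1")
print(f"Returned value: {returned}")  # prints "valuable_data"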
I am using Airflow 1.9.2 with Python 2.7 on Ubuntu. I tried to inherit from a ParentOperator class, which works fine on its own, to create a class called ChildOperator. But when I create a ChildOperator instance, I think some keyword arguments are missing or messed up, and I am getting this error:
airflow.exceptions.AirflowException: Use keyword arguments when
initializing operators
Here is a simplified example:
class ParentOperator(BaseOperator, SkipMixin):
    @apply_defaults
    def __init__(self,
                 conn_id,
                 object,
                 args={},
                 s3_conn_id=None,
                 s3_key=None,
                 s3_bucket=None,
                 fields=None,
                 *args,
                 **kwargs
                 ):
        super(ParentOperator, self).__init__(*args, **kwargs)
        ...

class ChildOperator(ParentOperator):
    @apply_defaults
    def __init__(self,
                 conn_id,
                 object,
                 args={},
                 s3_conn_id=None,
                 s3_key=None,
                 s3_bucket=None,
                 fields=None,
                 *args,
                 **kwargs
                 ):
        args = ...
        super(ChildOperator, self).__init__(
            conn_id,
            object,
            args=args,
            s3_conn_id=s3_conn_id,
            s3_key=s3_key,
            s3_bucket=s3_bucket,
            fields=fields,
            *args,
            **kwargs
        )
        ...

myobjc = ChildOperator(
    conn_id="my_default",
    object=table,
    args={},
    s3_conn_id='s3_postgres_dump',
    s3_key=s3_key,
    s3_bucket=s3_bucket,
    dag=dag,
    task_id="task1"
)
Any idea what is causing this error? Is this more of a Python-specific issue?
The __init__ function of ChildOperator needs to pass all parameters as keyword arguments, like the following (note the first two parameters, conn_id and object):
super(ChildOperator, self).__init__(
    conn_id=conn_id,
    object=object,
    args=args,
    s3_conn_id=s3_conn_id,
    s3_key=s3_key,
    s3_bucket=s3_bucket,
    fields=fields,
    *args,
    **kwargs
)
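For context (my reading of the error, not spelled out in the original answer): the apply_defaults decorator on ParentOperator.__init__ raises AirflowException("Use keyword arguments when initializing operators") as soon as it receives positional arguments, which is exactly what the original super call passes for conn_id and object. Below is a minimal sketch of a corrected ChildOperator under that assumption; I drop the *args parameter because it collides with the args={} keyword (a duplicate argument name) and apply_defaults rejects positional arguments anyway.

class ChildOperator(ParentOperator):
    @apply_defaults
    def __init__(self,
                 conn_id,
                 object,
                 args={},
                 s3_conn_id=None,
                 s3_key=None,
                 s3_bucket=None,
                 fields=None,
                 **kwargs):
        # every value is forwarded by keyword, so ParentOperator's
        # apply_defaults wrapper never sees positional arguments
        super(ChildOperator, self).__init__(
            conn_id=conn_id,
            object=object,
            args=args,
            s3_conn_id=s3_conn_id,
            s3_key=s3_key,
            s3_bucket=s3_bucket,
            fields=fields,
            **kwargs)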
I want to use Scrapy to download files and navigate folders at ftp://ftp.co.palm-beach.fl.us/Building%20Permits/.
Here's my spider:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request

class LatestPermitsSpider(scrapy.Spider):
    name = "latest_permits"
    allowed_domains = ["ftp.co.palm-beach.fl.us"]
    handle_httpstatus_list = [404]

    ftpUser = "the_username"
    ftpPW = "the_password"
    permitFilesDir = "ftp://ftp.co.palm-beach.fl.us/Building%20Permits/"

    def start_requests(self):
        yield Request(
            url=self.permitFilesDir,
            meta={
                "ftp_user": self.ftpUser,
                "ftp_password": self.ftpPW
            }
        )

    def parse(self, response):
        print response.body
When I run scrapy crawl latest_permits, I get this error:
ConnectionLost: ('FTP connection lost', <twisted.python.failure.Failure twisted.protocols.ftp.CommandFailed: ['530 Sorry, no ANONYMOUS access allowed.']>)
Why does this error come up even when I supply the correct username and password?
Take a look at the Scrapy source code below:
https://github.com/scrapy/scrapy/blob/master/scrapy/core/downloader/handlers/ftp.py
The issue is not with your username or password. The issue is that Scrapy only supports downloading files over FTP; it doesn't add support for listing directories. The URL you are using is a directory URL.
There is a possible workaround that uses a package named ftptree.
Add handlers.py with the code below:
import json
from twisted.protocols.ftp import FTPFileListProtocol

from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler

class FtpListingHandler(FTPDownloadHandler):
    def gotClient(self, client, request, filepath):
        self.client = client
        protocol = FTPFileListProtocol()
        return client.list(filepath, protocol).addCallbacks(
            callback=self._build_response, callbackArgs=(request, protocol),
            errback=self._failed, errbackArgs=(request,))

    def _build_response(self, result, request, protocol):
        self.result = result
        body = json.dumps(protocol.files)
        return Response(url=request.url, status=200, body=body)
Then, in your settings.py, use
DOWNLOAD_HANDLERS = {'ftp': 'crawlername.handlers.FtpListingHandler'}
A sample spider:
import os
import json
from urlparse import urlparse

from scrapy import Spider
from scrapy.http.request import Request

from ftptree_crawler.items import FtpTreeLeaf

class AnonFtpRequest(Request):
    anon_meta = {'ftp_user': 'anonymous',
                 'ftp_password': 'laserson@cloudera.com'}

    def __init__(self, *args, **kwargs):
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        self.meta.update(self.anon_meta)

class FtpTreeSpider(Spider):
    name = 'ftptree'

    def __init__(self, config_file, *args, **kwargs):
        super(FtpTreeSpider, self).__init__(*args, **kwargs)
        with open(config_file, 'r') as ip:
            config = json.loads(ip.read())
        url = 'ftp://%s/%s' % (config['host'], config['root_path'])
        self.start_url = url
        self.site_id = config['id']

    def start_requests(self):
        yield AnonFtpRequest(self.start_url)

    def parse(self, response):
        url = urlparse(response.url)
        basepath = url.path
        files = json.loads(response.body)
        for f in files:
            if f['filetype'] == 'd':
                path = os.path.join(response.url, f['filename'])
                request = AnonFtpRequest(path)
                yield request
            if f['filetype'] == '-':
                path = os.path.join(basepath, f['filename'])
                result = FtpTreeLeaf(
                    filename=f['filename'], path=path, size=f['size'])
                yield result
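For completeness (my addition, not part of the original answer): the sample spider expects a JSON config file passed in with -a, so a run could look like the following. The file name and values are hypothetical; only the keys host, root_path and id are read by the spider above.

# config.json (hypothetical values):
#   {"id": 1, "host": "ftp.co.palm-beach.fl.us", "root_path": "Building%20Permits"}
scrapy crawl ftptree -a config_file=config.json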
Links to look at if you need further information
https://github.com/laserson/ftptree/blob/master/ftptree_crawler/
https://gearheart.io/blog/crawling-ftp-server-with-scrapy/