530 error when trying to open FTP directory - web-scraping

I want to use Scrapy to download files and navigate folders at ftp://ftp.co.palm-beach.fl.us/Building%20Permits/.
Here's my spider:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
class LatestPermitsSpider(scrapy.Spider):
    """Spider that requests the building-permits FTP directory.

    The FTP credentials are handed to Scrapy through the ``ftp_user`` and
    ``ftp_password`` keys of the request meta.
    """

    name = "latest_permits"
    allowed_domains = ["ftp.co.palm-beach.fl.us"]
    handle_httpstatus_list = [404]
    ftpUser = "the_username"
    ftpPW = "the_password"
    permitFilesDir = "ftp://ftp.co.palm-beach.fl.us/Building%20Permits/"

    def start_requests(self):
        """Yield a single request for the permits directory with credentials."""
        yield Request(
            url=self.permitFilesDir,
            meta={
                "ftp_user": self.ftpUser,
                "ftp_password": self.ftpPW,
            },
        )

    def parse(self, response):
        """Print the raw response body."""
        # Parenthesised form works as both the Python 2 statement and the
        # Python 3 function.
        print(response.body)
When I run scrapy crawl latest_permits, I get this error:
ConnectionLost: ('FTP connection lost', <twisted.python.failure.Failure twisted.protocols.ftp.CommandFailed: ['530 Sorry, no ANONYMOUS access allowed.']>)
Why does this error come up even when I supply the correct username and password?

Look at the below source code of scrapy
https://github.com/scrapy/scrapy/blob/master/scrapy/core/downloader/handlers/ftp.py
The issue is not with your username or password. The issue is that Scrapy's FTP handler only supports downloading files; it does not support listing directories. The URL you are using is a directory URL.
A possible workaround is to use the approach from a package named ftptree.
Add handlers.py with below code
import json
from twisted.protocols.ftp import FTPFileListProtocol
from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
class FtpListingHandler(FTPDownloadHandler):
    """FTP download handler that lists a directory instead of fetching a file.

    The response body is the JSON-encoded list of entries collected by
    ``FTPFileListProtocol``.
    """

    def gotClient(self, client, request, filepath):
        """Issue an FTP LIST for ``filepath`` and wire up the callbacks."""
        self.client = client
        protocol = FTPFileListProtocol()
        return client.list(filepath, protocol).addCallbacks(
            callback=self._build_response,
            callbackArgs=(request, protocol),
            errback=self._failed,
            errbackArgs=(request,),
        )

    def _build_response(self, result, request, protocol):
        """Wrap the collected listing in a 200 ``Response`` for ``request.url``."""
        self.result = result
        # Scrapy's Response requires a bytes body on Python 3; json.dumps
        # emits ASCII-only text by default, so encoding is also safe on
        # Python 2.
        body = json.dumps(protocol.files).encode('utf-8')
        return Response(url=request.url, status=200, body=body)
And then in your settings.py use
# Route ftp:// requests through the directory-listing handler.
# "crawlername" is a placeholder: replace it with your project's package name.
DOWNLOAD_HANDLERS = {'ftp': 'crawlername.handlers.FtpListingHandler'}
A sample spider
import os
import json
from urlparse import urlparse
from scrapy import Spider
from scrapy.http.request import Request
from ftptree_crawler.items import FtpTreeLeaf
class AnonFtpRequest(Request):
    """Request that always carries anonymous FTP credentials in its meta."""

    # Credentials merged into every request's meta by __init__.
    anon_meta = {
        'ftp_user': 'anonymous',
        'ftp_password': 'laserson@cloudera.com',
    }

    def __init__(self, *args, **kwargs):
        """Build the request as usual, then add the anonymous credentials."""
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        self.meta.update(self.anon_meta)
class FtpTreeSpider(Spider):
    """Walk an FTP tree, yielding one ``FtpTreeLeaf`` per regular file.

    Each response body is expected to be a JSON list of directory entries
    with ``filetype``, ``filename`` and ``size`` keys.
    """

    name = 'ftptree'

    def __init__(self, config_file, *args, **kwargs):
        """Read ``host``, ``root_path`` and ``id`` from the JSON config file."""
        super(FtpTreeSpider, self).__init__(*args, **kwargs)
        with open(config_file, 'r') as ip:
            config = json.load(ip)
        self.start_url = 'ftp://%s/%s' % (config['host'], config['root_path'])
        self.site_id = config['id']

    def start_requests(self):
        """Start the crawl at the configured root directory."""
        yield AnonFtpRequest(self.start_url)

    def parse(self, response):
        """Recurse into sub-directories and emit an item for every file."""
        # URLs always use forward slashes; os.path.join would insert a
        # backslash on Windows, so join with posixpath instead.
        import posixpath

        basepath = urlparse(response.url).path
        for f in json.loads(response.body):
            if f['filetype'] == 'd':
                # Directory: request its listing.
                yield AnonFtpRequest(posixpath.join(response.url, f['filename']))
            elif f['filetype'] == '-':
                # Regular file: report it with its path on the server.
                yield FtpTreeLeaf(
                    filename=f['filename'],
                    path=posixpath.join(basepath, f['filename']),
                    size=f['size'],
                )
Links to look at if you need further information
https://github.com/laserson/ftptree/blob/master/ftptree_crawler/
https://gearheart.io/blog/crawling-ftp-server-with-scrapy/

Related

Pact: Error when trying to setup mock provider

I'm trying to write my first pact-python test using pytest. Could someone please tell me what's wrong with my code?
import unittest
import requests
import json
import pytest
import atexit
from pact import Consumer, Provider
# Pact between the consumer and the provider, backed by a mock service.
pact = Consumer('Consumer').has_pact_with(
    Provider('Provider'),
    host_name='mockservice',
    port=8080,
)
pact.start_service()
# Stop the mock service when the interpreter exits.
atexit.register(pact.stop_service)
class InterviewDetails(unittest.TestCase):
    """Consumer-side Pact test around the users endpoint."""

    def test_candidate_report_api(self):
        """POST an empty JSON payload and return the decoded response body."""
        candidate_report_payload = {}
        resp = requests.post(
            "http://localhost:1234/users/",
            data=json.dumps(candidate_report_payload),
        )
        response = json.loads(resp.text)
        return response

    @pytest.mark.health1
    def test_candidate_report(self):
        """Register the expected interaction, call the API and verify it."""
        expected = {}
        (pact.given('Comment')
         .upon_receiving('comment')
         .with_request(method='POST', path="http://localhost:1234/users/", headers={})
         .will_respond_with(200, body=expected))
        # NOTE(review): the original snippet lost its indentation, so the
        # extent of this ``with`` body is reconstructed — confirm.
        with pact:
            pact.setup()
            result = self.test_candidate_report_api()
        self.assertEqual(result, expected)
        pact.verify()
The error from stacktrace:
AttributeError: module 'pact' has no attribute 'Like'
Can you please confirm you're using pact-python from https://github.com/pact-foundation/pact-python/ (and not pactman, a project that is not maintained by the Pact Foundation)?
It might be related to the way you have setup your test?
Here is an example project you can use for reference: https://github.com/pactflow/example-consumer-python/
Relevant test code:
"""pact test for product service client"""
import json
import logging
import os
import requests
from requests.auth import HTTPBasicAuth
import pytest
from pact import Consumer, Like, Provider, Term, Format
from src.consumer import ProductConsumer
# Module logger, with INFO-level output enabled for the test run.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

print(Format().__dict__)

# Address of the Pact mock service used by the fixtures below.
PACT_MOCK_HOST = 'localhost'
PACT_MOCK_PORT = 1234

# Directory that contains this test module.
PACT_DIR = os.path.dirname(os.path.realpath(__file__))
@pytest.fixture
def consumer():
    """Return a ``ProductConsumer`` pointed at the Pact mock service."""
    return ProductConsumer(
        'http://{host}:{port}'.format(host=PACT_MOCK_HOST, port=PACT_MOCK_PORT)
    )
@pytest.fixture(scope='session')
def pact(request):
    """Start the Pact mock service for the session and stop it afterwards."""
    pact = Consumer('pactflow-example-consumer-python').has_pact_with(
        Provider('pactflow-example-provider-python'),
        host_name=PACT_MOCK_HOST,
        port=PACT_MOCK_PORT,
        pact_dir="./pacts",
        log_dir="./logs",
    )
    try:
        print('start service')
        pact.start_service()
        yield pact
    finally:
        # Runs on teardown, and also if start_service() raised.
        print('stop service')
        pact.stop_service()
def test_get_product(pact, consumer):
    """Check that the consumer can fetch a product through the mock service."""
    expected = {
        'id': "27",
        'name': 'Margharita',
        'type': 'Pizza',
    }
    (pact
     .given('a product with ID 10 exists')
     .upon_receiving('a request to get a product')
     .with_request('GET', '/product/10')
     .will_respond_with(200, body=Like(expected)))
    with pact:
        user = consumer.get_product('10')
        assert user.name == 'Margharita'

How to change xcom in Airflow to accommodate large data?

I am using the following code in my Airflow operator:
import json
import pandas as pd
from airflow.exceptions import AirflowException
from airflow.hooks.http_hook import HttpHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
class HttpToGoogleCloudStorageOperator(BaseOperator):
    """Operator that performs an HTTP GET against ``endpoint``.

    :param endpoint: path passed to ``HttpHook.run`` (templated)
    :param project_id: stored on the instance; not used by ``execute`` here
    :param table_id: stored on the instance; not used by ``execute`` here
    :param data: request data, defaults to an empty dict (templated)
    :param headers: request headers, defaults to an empty dict (templated)
    :param auth: forwarded to ``HttpHook.run`` as ``auth``
    :param http_conn_id: Airflow connection id handed to ``HttpHook``
    """

    template_fields = ['endpoint', 'data', 'headers', ]
    template_ext = ()
    ui_color = '#f4a460'

    @apply_defaults
    def __init__(self,
                 endpoint,
                 project_id,
                 table_id,
                 data=None,
                 headers=None,
                 auth=None,
                 http_conn_id='http_default',
                 *args, **kwargs):
        super(HttpToGoogleCloudStorageOperator, self).__init__(*args, **kwargs)
        # project_id was accepted but silently dropped; keep it on the
        # instance like every other constructor argument.
        self.project_id = project_id
        self.table_id = table_id
        self.http_conn_id = http_conn_id
        self.method = "GET"
        self.endpoint = endpoint
        self.headers = headers or {}
        self.auth = auth
        self.data = data or {}

    def execute(self, context):
        """Call the endpoint through ``HttpHook`` and log progress."""
        http = HttpHook(self.method, http_conn_id=self.http_conn_id)
        self.log.info("Calling HTTP method %s", self.endpoint)
        response = http.run(self.endpoint, self.data, self.headers, auth=self.auth)
        self.log.info("Got response")
Unfortunately the data returned is too large (about 5k) to fit in the standard xcom and I get this error:
{taskinstance.py:1059} ERROR - (_mysql_exceptions.DataError) (1406, "Data too long for column 'value' at row 1")
Is there a way I can tell http_hook to use a different xcom, or (even better) not use xcom at all? I have looked around and I do not see a solution.
Thanks for any tips or pointers.
Edit: Here is how I call the operator. Note that nowhere do I specify xcom.
# Instantiate the operator; no XCom-related argument is passed here.
query_load_task = HttpToGoogleCloudStorageOperator(
    task_id="query_load_task",
    endpoint=endpoint,
    project_id="my_gcp_poroject_id",
    table_id="dataset.table",
    data=None,
    auth=(username, password),
)
It's preferable to store data to a system designed for such (e.g.: the file system, AWS S3, Azure, etc.) and instead return a unique identifier to reference the location of the data, for the file system this would likely be the full path (e.g.: /tmp/acme_response_20200709.csv) that way you leverage the best of both the storage system and your database.
If you add your code I'd be happy to take a crack at writing up some pseudo-code as an example.

Scrapy init returns a None

I have a Scrapy crawling script.
class QuotesSpider(scrapy.Spider):
    """Spider that stores the ``cat`` spider argument and requests one URL."""

    name = 'quotes'

    def __init__(self, *args, **kwargs):
        """Keep the ``cat`` keyword argument, wrapped in a list, and print it."""
        super(QuotesSpider, self).__init__(*args, **kwargs)
        self.cat = [kwargs.get('cat')]
        print(self.cat)

    def start_requests(self):
        """Yield a request for each hard-coded URL."""
        #print(self.params)
        urls = ['https://google.com/html/?q=a%v%c']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)
On command line:
scrapy crawl quotes -a cat="avc"
When I run the command:
It prints "None"
How can I access the value "avc" passed through the command line in the program
There are things missing in your code. Check out the below sample code
class QuotesSpider(scrapy.Spider):
    """Spider that searches for the value passed as ``-a cat=...``."""

    name = 'quotes'

    def __init__(self, *args, **kwargs):
        """Store the ``cat`` spider argument on the instance and print it."""
        super(QuotesSpider, self).__init__(*args, **kwargs)
        self.cat = kwargs.get('cat')
        print(self.cat)

    def start_requests(self):
        """Request the search page for ``self.cat``."""
        # print(self.params)
        urls = [f"https://www.google.com/search?q={self.cat}"]
        # urls = ['https://google.com/html/?q=a%v%c']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Print the response object."""
        print(response, "Response <---")
The requested google URL was wrong, I have changed it with the new one.
<200 https://www.google.com/search?q=avc> Response <---

How to create JOBDIR settings in Scrapy Spider dynamically?

I want to create the JOBDIR setting from the spider's __init__, or dynamically when I call that spider.
I want to create a different JOBDIR for each spider, like FEED_URI in the example below:
class QtsSpider(scrapy.Spider):
    """Spider whose feed file name depends on the ``site_name`` argument."""

    name = 'qts'
    custom_settings = {
        'FEED_URI': 'data_files/' + '%(site_name)s.csv',
        'FEED_FORMAT': "csv",
        # Desired but not working: a per-site job directory.
        #'JOBDIR': 'resume/' + '%(site_name2)s'
    }
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com']

    def __init__(self, **kw):
        """Store the ``site_name`` keyword argument on the instance."""
        super(QtsSpider, self).__init__(**kw)
        self.site_name = kw.get('site_name')

    def parse(self, response):
        #our rest part of code
        # A body consisting only of a comment is a syntax error, hence the
        # explicit placeholder statement.
        pass
and we call that script this way:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
def main_function():
    """Schedule one 'qts' crawl per site name, then run them all."""
    all_spiders = ['spider1', 'spider2', 'spider3']  # 3 different spiders
    process = CrawlerProcess(get_project_settings())
    for spider_name in all_spiders:
        process.crawl('qts', site_name=spider_name)
    # Called once, after every crawl has been scheduled.
    process.start()


main_function()
How can I achieve that dynamic creation of JOBDIR for different spiders, like FEED_URI? Help will be appreciated.
I found myself needing the same sort of functionality, mostly due to not wanting to repetitively add a custom JOBDIR to each spider's custom_settings property. So, I created a simple extension that subclasses the original SpiderState extension that Scrapy utilizes to save the state of crawls.
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.extensions.spiderstate import SpiderState
import os
class SpiderStateManager(SpiderState):
    """
    SpiderState Purpose: Store and load spider state during a scraping job
    Added Purpose: Create a unique subdirectory within JOBDIR for each spider
        based on spider.name property
    Reasoning: Reduces repetitive code
    Usage: Instead of needing to add subdirectory paths in each
        spider.custom_settings dict, simply specify the base JOBDIR in
        settings.py and the subdirectories are automatically managed
    """

    def __init__(self, jobdir=None):
        """Remember ``jobdir`` and pass it on to ``SpiderState``."""
        self.jobdir = jobdir
        super(SpiderStateManager, self).__init__(jobdir=self.jobdir)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension with a per-spider directory under JOBDIR.

        Raises ``NotConfigured`` when the JOBDIR setting is empty.
        """
        base_jobdir = crawler.settings['JOBDIR']
        if not base_jobdir:
            raise NotConfigured

        # One sub-directory per spider, named after the spider class's name.
        spider_jobdir = os.path.join(base_jobdir, crawler.spidercls.name)
        if not os.path.exists(spider_jobdir):
            os.makedirs(spider_jobdir)

        obj = cls(spider_jobdir)
        crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
        return obj
To enable it, remember to add the proper settings to your settings.py like so
# Swap Scrapy's stock SpiderState extension for the per-spider variant:
# mapping the stock one to None disables it.
EXTENSIONS = {
    "scrapy.extensions.spiderstate.SpiderState": None,
    "spins.extensions.SpiderStateManager": 0,
}

# Base directory; each spider gets its own sub-directory beneath it.
JOBDIR = "C:/Users/CaffeinatedMike/PycharmProjects/ScrapyDapyDoo/jobs"
Exactly as you have set site_name, you can pass another argument:
# Pass the job directory as an extra spider argument alongside site_name.
process.crawl('qts', site_name=spider_name, jobdir='dirname that you want to keep')
It will be available as a spider attribute, so you can write:
def __init__(self, **kw):
    """Copy the ``jobdir`` spider argument into ``custom_settings['JOBDIR']``."""
    # The original signature took no keyword arguments, so passing
    # jobd ir=... to process.crawl() would fail; forwarding them to the base
    # initialiser is what makes ``self.jobdir`` available below.
    super(QtsSpider, self).__init__(**kw)
    jobdir = getattr(self, 'jobdir', None)
    if jobdir:
        # NOTE(review): Scrapy documents custom_settings as a class attribute
        # that is applied before the spider is instantiated — confirm that
        # setting JOBDIR here actually takes effect.
        self.custom_settings['JOBDIR'] = jobdir

import start_urls from a csv file in Scrapy

I recently started web scraping with Scrapy. I generated a list of URLs that I want to scrape and saved it to a txt document, one URL per line. This is my crawler code:
import scrapy
import csv
import sys
from realtor.items import RealtorItem
from scrapy.spider import BaseSpider
#from scrapy.selector import HtmlXPathSelector
#from realtor.items import RealtorItem
class RealtorSpider(scrapy.Spider):
    """Spider that collects listing-detail links from the start URLs."""

    name = "realtor"
    allowed_domains = ["realtor.com"]
    # Start URLs are read from realtor2.txt, one per line, when the class
    # body is executed.
    with open('realtor2.txt') as f:
        start_urls = [url.strip() for url in f.readlines()]

    def parse(self, response):
        """Return one ``RealtorItem`` per detail-page anchor in the response."""
        #hxs = HtmlXPathSelector(response)
        #sites = hxs.select('//div/li/div/a/@href')
        sites = response.xpath('//a[contains(@href, "/realestateandhomes-detail/")]')
        items = []
        for site in sites:
            print(site.extract())
            item = RealtorItem()
            item['link'] = site.xpath('@href').extract()
            items.append(item)
        return items
Now my goal is to read the links from realtor2.txt and start parsing through them; however, I get a ValueError (missing scheme in request URL):
File "C:\Users\Ash\Anaconda2\lib\site-packages\scrapy\http\request\__init__.py", line 58, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url:
%FF%FEw%00w%00w%00.%00r%00e%00a%00l%00t%00o%00r%00.%00c%00o%00m%00/%00r%00e%00a%00l%00e%00s%00t%00a%00t%00e%00a%00n%00d%00h%00o%00m%00e%00s%00-%00d%00e%00t%00a%00i%00l%00/%005%000%00-%00M%00e%00n%00o%00r%00e%00s%00-%00A%00v%00e%00-%00A%00p%00t%00-%006%001%000%00_%00C%00o%00r%00a%00l%00-%00G%00a%00b%00l%00e%00s%00_%00F%00L%00_%003%003%001%003%004%00_%00M%005%003%008%000%006%00-%005%008%006%007%007%00%0D%00
2017-06-25 22:28:35 [scrapy.core.engine] INFO: Closing spider (finished)
I think there may be an issue while defining start_urls, but I don't know how to proceed.
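"ValueError: Missing scheme in request url" means that the URL does not start with a scheme such as http://. The %FF%FE prefix and the %00 after every character in the reported URL also suggest that realtor2.txt is saved as UTF-16, so it needs to be opened with that encoding (or re-saved as UTF-8) before the URLs can be used.
You can use urljoin to avoid the missing-scheme problem.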

Resources