import start_urls from a csv file in Scrapy - web-scraping

I recently start web-scraping using scrapy, I generated a list of urls that I want to scrape from into a txt document separate by a new line. This is my crawler code:
import scrapy
import csv
import sys
from realtor.items import RealtorItem
from scrapy.spider import BaseSpider
#from scrapy.selector import HtmlXPathSelector
#from realtor.items import RealtorItem
class RealtorSpider(scrapy.Spider):
name = "realtor"
allowed_domains = ["realtor.com"]
with open('realtor2.txt') as f:
start_urls = [url.strip() for url in f.readlines()]
def parse(self, response):
#hxs = HtmlXPathSelector(response)
#sites = hxs.select('//div/li/div/a/#href')
sites = response.xpath('//a[contains(#href, "/realestateandhomes-detail/")]')
items = []
for site in sites:
print(site.extract())
item = RealtorItem()
item['link'] = site.xpath('#href').extract()
items.append(item)
return items
now my goal is to read the links from realtor2.txt and start parsing through them, however I get a valueError missing scheme in request URL :
File "C:\Users\Ash\Anaconda2\lib\site-packages\scrapy\http\request\__init__.py", line 58, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url:
%FF%FEw%00w%00w%00.%00r%00e%00a%00l%00t%00o%00r%00.%00c%00o%00m%00/%00r%00e%00a%00l%00e%00s%00t%00a%00t%00e%00a%00n%00d%00h%00o%00m%00e%00s%00-%00d%00e%00t%00a%00i%00l%00/%005%000%00-%00M%00e%00n%00o%00r%00e%00s%00-%00A%00v%00e%00-%00A%00p%00t%00-%006%001%000%00_%00C%00o%00r%00a%00l%00-%00G%00a%00b%00l%00e%00s%00_%00F%00L%00_%003%003%001%003%004%00_%00M%005%003%008%000%006%00-%005%008%006%007%007%00%0D%00
2017-06-25 22:28:35 [scrapy.core.engine] INFO: Closing spider (finished)
I think there may be an issue while defining start_urls, but I dont know how to proceed,

"ValueError: Missing scheme in request url" means that you are missing http.
You can use urljoin to avoid this problem.

Related

Scape issue in beautiful soup, NoneType' object has no attribute 'find_all

Trying to execute this code for scraping the specific websites / rss feeds metioned here below
keep getting :
Traceback (most recent call last):
File "C:\Users\Jeanne\Desktop\PYPDIT\pyscape.py", line 28, in
transcripts = [url_to_transcript(u) for u in urls]
File "C:\Users\Jeanne\Desktop\PYPDIT\pyscape.py", line 28, in
transcripts = [url_to_transcript(u) for u in urls]
File "C:\Users\Jeanne\Desktop\PYPDIT\pyscape.py", line 17, in url_to_transcript
text = [p.text for p in soup.find(class_="itemcontent").find_all('p')]
AttributeError: 'NoneType' object has no attribute 'find_all'
Please advise.
import requests
from bs4 import BeautifulSoup
import pickle
def url_to_transcript(url):
page = requests.get(url).text
soup = BeautifulSoup(page, "lxml")
text = [p.text for p in soup.find(class_="itemcontent").find_all('p')]
print(url)
return text
URLs of transcripts in scope
urls = ['http://feeds.nos.nl/nosnieuwstech',
'http://feeds.nos.nl/nosnieuwsalgemeen']
transcripts = [url_to_transcript(u) for u in urls]
The html returned is not the same as you see on the page. You can use the following:
import requests
from bs4 import BeautifulSoup
# import pickle
urls = ['http://feeds.nos.nl/nosnieuwstech','http://feeds.nos.nl/nosnieuwsalgemeen']
with requests.Session() as s:
for url in urls:
page = s.get(url).text
soup = BeautifulSoup(page, "lxml")
print(url)
print([[i.text for i in desc.select('p')] for desc in soup.select('description')[1:]])
print('--'*100)

Scrapy: one jobdir per spider

I have a Scrapy project with multiple spiders. How can I run all spiders with its own jobdir? I show in the following code how am I executing all spiders with a single jobdir.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
settings = get_project_settings()
settings.set('JOBDIR', 'saved_crawl', priority='cmdline')
process = CrawlerProcess(settings)
process.crawl('spider1')
process.crawl('spider2')
process.start()
Ok, I found that the solution is very simple. I just need to define JOBDIR in the custom settings dict, for each spider:
class Spider1(scrapy.Spider):
name = 'spider1'
custom_settings = {'JOBDIR': 'crawl_spider1'}

How to create JOBDIR settings in Scrpay Spider dynamically?

I want to create JOBDIR setting from Spider __init__ or dynamically when I call that spider .
I want to create different JOBDIR for different spiders , like FEED_URI in the below example
class QtsSpider(scrapy.Spider):
name = 'qts'
custom_settings = {
'FEED_URI': 'data_files/' + '%(site_name)s.csv',
'FEED_FORMAT': "csv",
#'JOBDIR': 'resume/' + '%(site_name2)s'
}
allowed_domains = ['quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com']
def __init__(self, **kw):
super(QtsSpider, self).__init__(**kw)
self.site_name = kw.get('site_name')
def parse(self, response):
#our rest part of code
and we are calling that script from this way
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
def main_function():
all_spiders = ['spider1','spider2','spider3'] # 3 different spiders
process = CrawlerProcess(get_project_settings())
for spider_name in all_spiders:
process.crawl('qts', site_name = spider_name )
process.start()
main_function()
How to achieve that dynamic creation of JOBDIR for different Spider like FEED_URI ?? Help will be appreciated.
I found myself needing the same sort of functionality, mostly due to not wanting to repetitively add a custom JOBDIR to each spider's custom_settings property. So, I created a simple extension that subclasses the original SpiderState extension that Scrapy utilizes to save the state of crawls.
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.extensions.spiderstate import SpiderState
import os
class SpiderStateManager(SpiderState):
"""
SpiderState Purpose: Store and load spider state during a scraping job
Added Purpose: Create a unique subdirectory within JOBDIR for each spider based on spider.name property
Reasoning: Reduces repetitive code
Usage: Instead of needing to add subdirectory paths in each spider.custom_settings dict
Simply specify the base JOBDIR in settings.py and the subdirectories are automatically managed
"""
def __init__(self, jobdir=None):
self.jobdir = jobdir
super(SpiderStateManager, self).__init__(jobdir=self.jobdir)
#classmethod
def from_crawler(cls, crawler):
base_jobdir = crawler.settings['JOBDIR']
if not base_jobdir:
raise NotConfigured
spider_jobdir = os.path.join(base_jobdir, crawler.spidercls.name)
if not os.path.exists(spider_jobdir):
os.makedirs(spider_jobdir)
obj = cls(spider_jobdir)
crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
return obj
To enable it, remember to add the proper settings to your settings.py like so
EXTENSIONS = {
# We want to disable the original SpiderState extension and use our own
"scrapy.extensions.spiderstate.SpiderState": None,
"spins.extensions.SpiderStateManager": 0
}
JOBDIR = "C:/Users/CaffeinatedMike/PycharmProjects/ScrapyDapyDoo/jobs"
Exactly how you have set the site_name, you can pass another argument,
process.crawl('qts', site_name=spider_name, jobdir='dirname that you want to keep')
will be available as a spiders attribute so you can write
def __init__(self):
jobdir = getattr(self, 'jobdir', None)
if jobdir:
self.custom_settings['JOBDIR'] = jobdir

530 error when trying to open FTP directory

I want to use Scrapy to download files and navigate folders at ftp://ftp.co.palm-beach.fl.us/Building%20Permits/.
Here's my spider:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
class LatestPermitsSpider(scrapy.Spider):
name= "latest_permits"
allowed_domains=["ftp.co.palm-beach.fl.us"]
handle_httpstatus_list = [404]
ftpUser= "the_username"
ftpPW= "the_password"
permitFilesDir= "ftp://ftp.co.palm-beach.fl.us/Building%20Permits/"
def start_requests(self):
yield Request(
url=self.permitFilesDir,
meta={
"ftp_user": self.ftpUser,
"ftp_password": self.ftpPW
}
)
def parse(self,response):
print response.body
When I run scrapy crawl latest_permits, I get this error:
ConnectionLost: ('FTP connection lost', <twisted.python.failure.Failure twisted.protocols.ftp.CommandFailed: ['530 Sorry, no ANONYMOUS access allowed.']>)
Why does this error come up even when I supply the correct username and password?
Look at the below source code of scrapy
https://github.com/scrapy/scrapy/blob/master/scrapy/core/downloader/handlers/ftp.py
The issue is not with your username or password. The issue is the scrapy supports only files to be downloaded using ftp it doesn't add support for listing directories. The url you are using is a directory url
There is a possible workaround to actually use a package name ftptree
Add handlers.py with below code
import json
from twisted.protocols.ftp import FTPFileListProtocol
from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
class FtpListingHandler(FTPDownloadHandler):
def gotClient(self, client, request, filepath):
self.client = client
protocol = FTPFileListProtocol()
return client.list(filepath, protocol).addCallbacks(
callback=self._build_response, callbackArgs=(request, protocol),
errback=self._failed, errbackArgs=(request,))
def _build_response(self, result, request, protocol):
self.result = result
body = json.dumps(protocol.files)
return Response(url=request.url, status=200, body=body)
And then in your settings.py use
DOWNLOAD_HANDLERS = {'ftp': 'cralwername.handlers.FtpListingHandler'}
A sample spider
import os
import json
from urlparse import urlparse
from scrapy import Spider
from scrapy.http.request import Request
from ftptree_crawler.items import FtpTreeLeaf
class AnonFtpRequest(Request):
anon_meta = {'ftp_user': 'anonymous',
'ftp_password': 'laserson#cloudera.com'}
def __init__(self, *args, **kwargs):
super(AnonFtpRequest, self).__init__(*args, **kwargs)
self.meta.update(self.anon_meta)
class FtpTreeSpider(Spider):
name = 'ftptree'
def __init__(self, config_file, *args, **kwargs):
super(FtpTreeSpider, self).__init__(*args, **kwargs)
with open(config_file, 'r') as ip:
config = json.loads(ip.read())
url = 'ftp://%s/%s' % (config['host'], config['root_path'])
self.start_url = url
self.site_id = config['id']
def start_requests(self):
yield AnonFtpRequest(self.start_url)
def parse(self, response):
url = urlparse(response.url)
basepath = url.path
files = json.loads(response.body)
for f in files:
if f['filetype'] == 'd':
path = os.path.join(response.url, f['filename'])
request = AnonFtpRequest(path)
yield request
if f['filetype'] == '-':
path = os.path.join(basepath, f['filename'])
result = FtpTreeLeaf(
filename=f['filename'], path=path, size=f['size'])
yield result
Links to look at if you need further information
https://github.com/laserson/ftptree/blob/master/ftptree_crawler/
https://gearheart.io/blog/crawling-ftp-server-with-scrapy/

how to load multiple pages one by one in QWebPage

I am trying to crawl news article pages for comments. After some research I found that mostly websites use an iframe for it. I want to get the "src" of the iframe. I am using QtWebKit in Python using PySide. It is actually working but just once. It is not loading other webpages. I am using the following code:
import sys
import pymysql
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from pprint import pprint
from bs4 import BeautifulSoup
class Render(QWebPage):
def __init__(self, url):
try:
self.app = QApplication(sys.argv)
except RuntimeError:
self.app = QCoreApplication.instance()
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
def visit(url):
r = Render(url)
p = r.frame.toHtml()
f_url = str(r.frame.url().toString())
return p
def is_comment_url(url):
lower_url = url.lower()
n = lower_url.find("comment")
if n>0:
return True
else:
return False
with open("urls.txt") as f:
content = f.read().splitlines()
list_of_urls = []
for url in content:
page = visit(url)
soup = BeautifulSoup(page)
for tag in soup.findAll('iframe', src=True):
link = tag['src']
if is_comment_url(link):
print(link)
list_of_urls += link
pprint(list_of_urls)
But the issue is it works only for single iteration and gets stuck.
Also is there any way to save a web page as it is as displayed by the browser (after executing all the javascript etc.)

Resources