I have a Scrapy project with multiple spiders. How can I run all spiders, each with its own jobdir? The following code shows how I am currently executing all spiders with a single jobdir.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
settings = get_project_settings()
settings.set('JOBDIR', 'saved_crawl', priority='cmdline')
process = CrawlerProcess(settings)
process.crawl('spider1')
process.crawl('spider2')
process.start()
OK, I found that the solution is very simple. I just need to define JOBDIR in the custom_settings dict for each spider:
class Spider1(scrapy.Spider):
    name = 'spider1'
    custom_settings = {'JOBDIR': 'crawl_spider1'}
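With that in place, the runner no longer needs to set a global JOBDIR; each spider carries its own. A minimal sketch of the runner (reusing the spider names from the question):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Each spider persists its state to its own directory, e.g.
# spider1 -> 'crawl_spider1', spider2 -> 'crawl_spider2'.
process = CrawlerProcess(get_project_settings())
process.crawl('spider1')
process.crawl('spider2')
process.start()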
Sample URL:
site.com/category/%D9%81%D8%AA%D9%88%DA%AF%D8%B1%D8%A7%D9%81%DB%8C_%D8%B1%DB%8C%D9%86%D9%88%D9%BE%D9%84%D8%A7%D8%B3%D8%AA%DB%8C/
URL config:
url(r'^category/(?P<page_slug>.*)/$', views.category, name='category'),
Passenger config:
import imp
import os
import sys
sys.path.insert(0, os.path.dirname(__file__))
wsgi = imp.load_source('wsgi', 'photography/wsgi.py')
application = wsgi.application
WSGI config:
"""
WSGI config for photography project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/1.10/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "photography.settings")
#os.environ["DJANGO_SETTINGS_MODULE"] = "photography.settings"
application = get_wsgi_application()
But the response is 404 Not Found for that URL! The problem occurs for all URLs with a Persian slug.
When I change the WSGI config to unquote the path:
from urllib.parse import unquote
def application(environ, start_fn):
    environ['PATH_INFO'] = unquote(environ['PATH_INFO'])
    app = get_wsgi_application()
    print(environ)
    return app(environ, start_fn)
the URL changes into:
site.com/category/%C3%99%C2%81%C3%98%C2%AA%C3%99%C2%88%C3%9A%C2%AF%C3%98%C2%B1%C3%98%C2%A7%C3%99/tag/%D9%81%D8%AA%D9%88%DA%AF%D8%B1%D8%A7%D9%81%DB%8C-%D9%BE%D8%B2%D8%B4%DA%A9%DB%8C/
But the problem is still open! I applied all the changes I found while searching, but the problem remains. Here is the output from one of the changes to the WSGI file:
App 3585081 output: set_script_prefix(get_script_name(environ))
App 3585081 output: File "/home/sepandteb/virtualenv/sepandteb/3.5/lib/python3.5/site-packages/django/core/handlers/wsgi.py", line 210, in get_script_name
App 3585081 output: return script_name.decode(UTF_8)
App 3585081 output: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 27: unexpected end of data
Use encode('utf-8') for Persian characters and set # -*- coding: utf-8 -*- at the top of the file. For example, use this when you save data to the DB or read it from the DB:
slug.encode('utf-8')
You can use "uri_to_iri" in the view, for example:
from django.shortcuts import get_object_or_404
from django.utils.encoding import uri_to_iri

def blog_detail(request, slug):
    post = get_object_or_404(Post, slug=uri_to_iri(slug))
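uri_to_iri converts the percent-encoded UTF-8 octets from the request path back into the Unicode string stored in the database, so the slug lookup matches. A quick illustration with the first part of the sample slug from above:

>>> from django.utils.encoding import uri_to_iri
>>> uri_to_iri('%D9%81%D8%AA%D9%88%DA%AF%D8%B1%D8%A7%D9%81%DB%8C')
'فتوگرافی'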
I want to create the JOBDIR setting from the spider's __init__, or dynamically when I call that spider.
I want a different JOBDIR for each spider, like FEED_URI in the example below:
import scrapy

class QtsSpider(scrapy.Spider):
    name = 'qts'
    custom_settings = {
        'FEED_URI': 'data_files/' + '%(site_name)s.csv',
        'FEED_FORMAT': "csv",
        #'JOBDIR': 'resume/' + '%(site_name2)s'
    }
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com']

    def __init__(self, **kw):
        super(QtsSpider, self).__init__(**kw)
        self.site_name = kw.get('site_name')

    def parse(self, response):
        # ...rest of the parsing code...
        pass
and we are calling that script this way:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def main_function():
    all_spiders = ['spider1', 'spider2', 'spider3']  # 3 different spiders
    process = CrawlerProcess(get_project_settings())
    for spider_name in all_spiders:
        process.crawl('qts', site_name=spider_name)
    process.start()

main_function()
How can I achieve that dynamic creation of JOBDIR for different spiders, like FEED_URI? Help will be appreciated.
I found myself needing the same sort of functionality, mostly due to not wanting to repetitively add a custom JOBDIR to each spider's custom_settings property. So, I created a simple extension that subclasses the original SpiderState extension that Scrapy utilizes to save the state of crawls.
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.extensions.spiderstate import SpiderState
import os
class SpiderStateManager(SpiderState):
    """
    SpiderState Purpose: Store and load spider state during a scraping job
    Added Purpose: Create a unique subdirectory within JOBDIR for each spider, based on the spider.name property
    Reasoning: Reduces repetitive code
    Usage: Instead of needing to add subdirectory paths in each spider.custom_settings dict,
    simply specify the base JOBDIR in settings.py and the subdirectories are automatically managed
    """

    def __init__(self, jobdir=None):
        self.jobdir = jobdir
        super(SpiderStateManager, self).__init__(jobdir=self.jobdir)

    @classmethod
    def from_crawler(cls, crawler):
        base_jobdir = crawler.settings['JOBDIR']
        if not base_jobdir:
            raise NotConfigured
        spider_jobdir = os.path.join(base_jobdir, crawler.spidercls.name)
        if not os.path.exists(spider_jobdir):
            os.makedirs(spider_jobdir)
        obj = cls(spider_jobdir)
        crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
        return obj
To enable it, remember to add the proper settings to your settings.py, like so:
EXTENSIONS = {
    # We want to disable the original SpiderState extension and use our own
    "scrapy.extensions.spiderstate.SpiderState": None,
    "spins.extensions.SpiderStateManager": 0,
}
JOBDIR = "C:/Users/CaffeinatedMike/PycharmProjects/ScrapyDapyDoo/jobs"
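With the extension registered, a runner like this (a sketch, assuming spiders named 'spider1' and 'spider2' exist in the project) gives each spider its own subdirectory under the base JOBDIR:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# State is persisted to <JOBDIR>/spider1 and <JOBDIR>/spider2 respectively.
process = CrawlerProcess(get_project_settings())
process.crawl('spider1')
process.crawl('spider2')
process.start()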
Exactly as you have set site_name, you can pass another argument:
process.crawl('qts', site_name=spider_name, jobdir='dirname that you want to keep')
jobdir will then be available as a spider attribute, so you can write:
def __init__(self, *args, **kwargs):
    super(QtsSpider, self).__init__(*args, **kwargs)  # copies the crawl kwargs (including jobdir) onto the instance
    jobdir = getattr(self, 'jobdir', None)
    if jobdir:
        self.custom_settings['JOBDIR'] = jobdir
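Putting it together with the runner from the question (a sketch; the resume/ directory names are illustrative):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def main_function():
    all_spiders = ['spider1', 'spider2', 'spider3']
    process = CrawlerProcess(get_project_settings())
    for spider_name in all_spiders:
        # Each crawl gets its own resume directory via the jobdir argument.
        process.crawl('qts', site_name=spider_name, jobdir='resume/' + spider_name)
    process.start()

main_function()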
I am pretty new to Python and Beautiful Soup, and I don't have much knowledge of HTML or JS. I tried to use bs4 to download all the xls files on this page, but it seems that bs4 cannot find the links under the "attachment" section. Could someone help me out?
My current code is:
"""
Scrapping of all county-level raw data from
http://www.countyhealthrankings.org for all years. Data stored in RawData
folder.
Code modified from https://null-byte.wonderhowto.com/how-to/download-all-
pdfs-webpage-with-python-script-0163031/
"""
from bs4 import BeautifulSoup
import urlparse
import urllib2
import os
import sys

"""
Get all links
"""
def getAllLinks(url):
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read(), "html.parser")
    links = soup.find_all('a', href=True)
    return links

def download(links):
    for link in links:
        suffix = os.path.splitext(os.path.basename(link['href']))[1]
        if suffix == '.xls':
            print link  # cannot find anything
            currentLink = urllib2.urlopen(link)

links = getAllLinks("http://www.countyhealthrankings.org/app/iowa/2017/downloads")
download(links)
(By the way, my desired link looks like this.)
Thanks!
This seems to be one of the tasks for which BeautifulSoup (in itself, at least) is inadequate. You can, however, do it with selenium.
>>> from selenium import webdriver
>>> driver = webdriver.Chrome()
>>> driver.get('http://www.countyhealthrankings.org/app/iowa/2017/downloads')
>>> links = driver.find_elements_by_xpath('.//span[@class="file"]/a')
>>> len(links)
30
>>> for link in links:
... link.get_attribute('href')
...
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2017_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20County%20Health%20Rankings%20Iowa%20Data%20-%20v1.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2016_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20County%20Health%20Rankings%20Iowa%20Data%20-%20v3.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2015_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20County%20Health%20Rankings%20Iowa%20Data%20-%20v3.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2014_IA_v2.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20County%20Health%20Rankings%20Iowa%20Data%20-%20v6.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2013_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20County%20Health%20Ranking%20Iowa%20Data%20-%20v1_0.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2012_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2011_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2010_IA_0.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2010%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
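From there, you can keep only the .xls links and download them. A minimal follow-up sketch (Python 3 standard library assumed; the RawData folder name comes from the question's docstring):

import os
from urllib.parse import unquote, urlsplit
from urllib.request import urlretrieve

hrefs = [link.get_attribute('href') for link in links]
xls_urls = [href for href in hrefs if href.lower().endswith('.xls')]

os.makedirs('RawData', exist_ok=True)
for url in xls_urls:
    # Derive a readable filename from the percent-encoded URL path.
    filename = unquote(os.path.basename(urlsplit(url).path))
    urlretrieve(url, os.path.join('RawData', filename))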
I recently started web scraping using Scrapy. I generated a list of URLs that I want to scrape into a txt document, separated by newlines. This is my crawler code:
import scrapy
import csv
import sys
from realtor.items import RealtorItem
from scrapy.spider import BaseSpider
#from scrapy.selector import HtmlXPathSelector
#from realtor.items import RealtorItem

class RealtorSpider(scrapy.Spider):
    name = "realtor"
    allowed_domains = ["realtor.com"]
    with open('realtor2.txt') as f:
        start_urls = [url.strip() for url in f.readlines()]

    def parse(self, response):
        #hxs = HtmlXPathSelector(response)
        #sites = hxs.select('//div/li/div/a/@href')
        sites = response.xpath('//a[contains(@href, "/realestateandhomes-detail/")]')
        items = []
        for site in sites:
            print(site.extract())
            item = RealtorItem()
            item['link'] = site.xpath('@href').extract()
            items.append(item)
        return items
Now my goal is to read the links from realtor2.txt and start parsing them; however, I get a ValueError: Missing scheme in request URL:
File "C:\Users\Ash\Anaconda2\lib\site-packages\scrapy\http\request\__init__.py", line 58, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url:
%FF%FEw%00w%00w%00.%00r%00e%00a%00l%00t%00o%00r%00.%00c%00o%00m%00/%00r%00e%00a%00l%00e%00s%00t%00a%00t%00e%00a%00n%00d%00h%00o%00m%00e%00s%00-%00d%00e%00t%00a%00i%00l%00/%005%000%00-%00M%00e%00n%00o%00r%00e%00s%00-%00A%00v%00e%00-%00A%00p%00t%00-%006%001%000%00_%00C%00o%00r%00a%00l%00-%00G%00a%00b%00l%00e%00s%00_%00F%00L%00_%003%003%001%003%004%00_%00M%005%003%008%000%006%00-%005%008%006%007%007%00%0D%00
2017-06-25 22:28:35 [scrapy.core.engine] INFO: Closing spider (finished)
I think there may be an issue in how start_urls is defined, but I don't know how to proceed.
"ValueError: Missing scheme in request url" means that you are missing http.
You can use urljoin to avoid this problem.
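For instance, a minimal sketch of building start_urls with a scheme (an assumption here: the %FF%FE prefix and the %00 bytes in the error output look like a UTF-16 byte-order mark, so the sketch decodes the file as UTF-16; drop the encoding argument if your file is plain ASCII/UTF-8):

import io

start_urls = []
with io.open('realtor2.txt', encoding='utf-16') as f:
    for line in f:
        url = line.strip()
        if not url:
            continue
        # Prepend the missing scheme, e.g. "www.realtor.com/..." -> "http://www.realtor.com/..."
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        start_urls.append(url)

For relative links discovered while parsing, response.urljoin(href) is the usual way to get an absolute URL.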
I am trying to crawl news article pages for comments. After some research I found that most websites use an iframe for them. I want to get the "src" of the iframe. I am using QtWebKit in Python via PySide. It actually works, but only once; it does not load other webpages. I am using the following code:
import sys
import pymysql
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from pprint import pprint
from bs4 import BeautifulSoup

class Render(QWebPage):
    def __init__(self, url):
        try:
            self.app = QApplication(sys.argv)
        except RuntimeError:
            self.app = QCoreApplication.instance()
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()

def visit(url):
    r = Render(url)
    p = r.frame.toHtml()
    f_url = str(r.frame.url().toString())
    return p

def is_comment_url(url):
    lower_url = url.lower()
    n = lower_url.find("comment")
    if n > 0:
        return True
    else:
        return False

with open("urls.txt") as f:
    content = f.read().splitlines()

list_of_urls = []
for url in content:
    page = visit(url)
    soup = BeautifulSoup(page, "html.parser")  # specify the parser explicitly
    for tag in soup.findAll('iframe', src=True):
        link = tag['src']
        if is_comment_url(link):
            print(link)
            list_of_urls.append(link)  # append, not +=, which would add the URL character by character
pprint(list_of_urls)
But the issue is that it works only for a single iteration and then gets stuck.
Also, is there any way to save a web page exactly as the browser displays it (after executing all the JavaScript, etc.)?
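A common workaround (a sketch under the assumption that the hang comes from constructing a new QApplication and event loop per URL) is to create the event loop once and chain page loads from the loadFinished handler. Since mainFrame().toHtml() returns the DOM after JavaScript has run, the same class also addresses the second question about saving the page as the browser displays it:

import sys
from PySide.QtGui import QApplication
from PySide.QtCore import QUrl
from PySide.QtWebKit import QWebPage

class BatchRender(QWebPage):
    """Render several pages with a single QApplication and event loop."""
    def __init__(self, urls):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.urls = list(urls)
        self.results = {}  # url -> rendered HTML (after JavaScript ran)
        self.loadFinished.connect(self._load_finished)
        self._load_next()
        self.app.exec_()

    def _load_next(self):
        if self.urls:
            self.current_url = self.urls.pop(0)
            self.mainFrame().load(QUrl(self.current_url))
        else:
            self.app.quit()  # all pages done: leave the event loop

    def _load_finished(self, ok):
        self.results[self.current_url] = self.mainFrame().toHtml()
        self._load_next()

# Usage sketch:
# renderer = BatchRender(list_of_urls)
# for url, html in renderer.results.items():
#     soup = BeautifulSoup(html, "html.parser")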