How to replace or remove special characters from scrapy? - web-scraping

I just started learning Scrapy and am trying to make a spider that grabs some info from a website. I want to replace or remove the special characters in 'short_descr'.
import scrapy


class TravelspudSpider(scrapy.Spider):
    name = 'travelSpud'
    allowed_domains = ['www.tripadvisor.ca']
    start_urls = [
        'https://www.tripadvisor.ca/Attractions-g294265-Activities-c57-Singapore.html/'
    ]
    base_url = 'https://www.tripadvisor.ca'

    def parse(self, response, **kwargs):
        for items in response.xpath('//div[@class= "_19L437XW _1qhi5DVB CO7bjfl5"]'):
            yield {
                'name': items.xpath('.//span/div[@class= "_1gpq3zsA _1zP41Z7X"]/text()').extract()[1],
                'reviews': items.xpath('.//span[@class= "DrjyGw-P _26S7gyB4 _14_buatE _1dimhEoy"]/text()').extract(),
                'rating': items.xpath('.//a/div[@class= "zTTYS8QR"]/svg/@title').extract(),
                'short_descr': items.xpath('.//div[@class= "_3W_31Rvp _1nUIPWja _17LAEUXp _2b3s5IMB"]'
                                           '/div[@class="DrjyGw-P _26S7gyB4 _3SccQt-T"]/text()').extract(),
                'place': items.xpath('.//div[@class= "ZtPwio2G"]'
                                     '/div'
                                     '/div[@class= "DrjyGw-P _26S7gyB4 _3SccQt-T"]/text()').extract(),
                'cost': items.xpath('.//div[@class= "DrjyGw-P _26S7gyB4 _3SccQt-T"]'
                                    '/div[@class= "DrjyGw-P _1SRa-qNz _2AAjjcx8"]'
                                    '/text()').extract(),
            }

        next_page_partial_url = response.css("div._1I73Kb0a").css("div._3djM0GaD").xpath('.//a/@href').extract_first()
        if next_page_partial_url is not None:
            next_page_url = self.base_url + next_page_partial_url
            yield scrapy.Request(next_page_url, callback=self.parse)
The character I'm trying to replace is the bullet in Hiking Trails • Scenic Walking Areas; it comes out as a garbled sequence in the CSV file.
Everything else works like a charm.
I've tried to use .replace(), but I'm getting an error:
AttributeError: 'list' object has no attribute 'replace'
Any help would be appreciated.

If you're removing these special characters only because they appear garbled in a CSV file, then I suggest not removing them. Simply add the following line to your settings.py file:
FEED_EXPORT_ENCODING = 'utf-8-sig'
This writes the CSV with a UTF-8 byte-order mark, so the special character is displayed correctly.
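If you do still want to replace or strip the bullet, note that .extract() returns a list of strings, which is exactly why .replace() raised the AttributeError; call it on a single string instead, e.g. obtained via .get() (a.k.a. .extract_first()). A minimal sketch reusing the question's XPath:
short_descr = items.xpath(
    './/div[@class="_3W_31Rvp _1nUIPWja _17LAEUXp _2b3s5IMB"]'
    '/div[@class="DrjyGw-P _26S7gyB4 _3SccQt-T"]/text()').get(default='')

yield {
    # ... the other fields exactly as in the question ...
    'short_descr': short_descr.replace('•', '-'),  # or .replace('•', '') to drop the bullet
}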

Related

How to create JOBDIR settings in Scrapy Spider dynamically?

I want to set JOBDIR from the spider's __init__, or dynamically when I call the spider.
I want a different JOBDIR for each spider, just like FEED_URI in the example below:
class QtsSpider(scrapy.Spider):
    name = 'qts'
    custom_settings = {
        'FEED_URI': 'data_files/' + '%(site_name)s.csv',
        'FEED_FORMAT': "csv",
        #'JOBDIR': 'resume/' + '%(site_name2)s'
    }
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com']

    def __init__(self, **kw):
        super(QtsSpider, self).__init__(**kw)
        self.site_name = kw.get('site_name')

    def parse(self, response):
        # our rest part of code
and we are calling that spider this way:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main_function():
    all_spiders = ['spider1', 'spider2', 'spider3']  # 3 different spiders
    process = CrawlerProcess(get_project_settings())
    for spider_name in all_spiders:
        process.crawl('qts', site_name=spider_name)
    process.start()


main_function()
How can I achieve this dynamic creation of JOBDIR for different spiders, like FEED_URI? Help will be appreciated.
I found myself needing the same sort of functionality, mostly due to not wanting to repetitively add a custom JOBDIR to each spider's custom_settings property. So, I created a simple extension that subclasses the original SpiderState extension that Scrapy utilizes to save the state of crawls.
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.extensions.spiderstate import SpiderState
import os


class SpiderStateManager(SpiderState):
    """
    SpiderState Purpose: Store and load spider state during a scraping job
    Added Purpose: Create a unique subdirectory within JOBDIR for each spider based on spider.name property
    Reasoning: Reduces repetitive code
    Usage: Instead of needing to add subdirectory paths in each spider.custom_settings dict
    Simply specify the base JOBDIR in settings.py and the subdirectories are automatically managed
    """

    def __init__(self, jobdir=None):
        self.jobdir = jobdir
        super(SpiderStateManager, self).__init__(jobdir=self.jobdir)

    @classmethod
    def from_crawler(cls, crawler):
        base_jobdir = crawler.settings['JOBDIR']
        if not base_jobdir:
            raise NotConfigured
        spider_jobdir = os.path.join(base_jobdir, crawler.spidercls.name)
        if not os.path.exists(spider_jobdir):
            os.makedirs(spider_jobdir)

        obj = cls(spider_jobdir)
        crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
        return obj
To enable it, remember to add the proper settings to your settings.py like so:
EXTENSIONS = {
    # We want to disable the original SpiderState extension and use our own
    "scrapy.extensions.spiderstate.SpiderState": None,
    "spins.extensions.SpiderStateManager": 0
}
JOBDIR = "C:/Users/CaffeinatedMike/PycharmProjects/ScrapyDapyDoo/jobs"
Exactly as you have set site_name, you can pass another argument:
process.crawl('qts', site_name=spider_name, jobdir='dirname that you want to keep')
It will then be available as a spider attribute, so you can write:
def __init__(self, **kw):
    super(QtsSpider, self).__init__(**kw)
    jobdir = getattr(self, 'jobdir', None)
    if jobdir:
        self.custom_settings['JOBDIR'] = jobdir
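Putting it together with the main_function from the question, the per-spider directory could then be passed like this (a sketch under the answer's assumption that a JOBDIR set via custom_settings in __init__ is honored by your Scrapy version; the 'resume/' prefix is only an example):
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main_function():
    all_spiders = ['spider1', 'spider2', 'spider3']
    process = CrawlerProcess(get_project_settings())
    for spider_name in all_spiders:
        # one resume directory per spider, mirroring the FEED_URI pattern
        process.crawl('qts', site_name=spider_name, jobdir='resume/' + spider_name)
    process.start()


main_function()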

import start_urls from a csv file in Scrapy

I recently started web scraping with Scrapy. I generated a list of URLs that I want to scrape and saved them in a txt document, one URL per line. This is my crawler code:
import scrapy
import csv
import sys
from realtor.items import RealtorItem
from scrapy.spider import BaseSpider
#from scrapy.selector import HtmlXPathSelector
#from realtor.items import RealtorItem


class RealtorSpider(scrapy.Spider):
    name = "realtor"
    allowed_domains = ["realtor.com"]

    with open('realtor2.txt') as f:
        start_urls = [url.strip() for url in f.readlines()]

    def parse(self, response):
        #hxs = HtmlXPathSelector(response)
        #sites = hxs.select('//div/li/div/a/@href')
        sites = response.xpath('//a[contains(@href, "/realestateandhomes-detail/")]')
        items = []
        for site in sites:
            print(site.extract())
            item = RealtorItem()
            item['link'] = site.xpath('@href').extract()
            items.append(item)
        return items
Now my goal is to read the links from realtor2.txt and start parsing them; however, I get ValueError: Missing scheme in request URL:
  File "C:\Users\Ash\Anaconda2\lib\site-packages\scrapy\http\request\__init__.py", line 58, in _set_url
    raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url:
%FF%FEw%00w%00w%00.%00r%00e%00a%00l%00t%00o%00r%00.%00c%00o%00m%00/%00r%00e%00a%00l%00e%00s%00t%00a%00t%00e%00a%00n%00d%00h%00o%00m%00e%00s%00-%00d%00e%00t%00a%00i%00l%00/%005%000%00-%00M%00e%00n%00o%00r%00e%00s%00-%00A%00v%00e%00-%00A%00p%00t%00-%006%001%000%00_%00C%00o%00r%00a%00l%00-%00G%00a%00b%00l%00e%00s%00_%00F%00L%00_%003%003%001%003%004%00_%00M%005%003%008%000%006%00-%005%008%006%007%007%00%0D%00
2017-06-25 22:28:35 [scrapy.core.engine] INFO: Closing spider (finished)
I think there may be an issue with how start_urls is defined, but I don't know how to proceed.
"ValueError: Missing scheme in request url" means that you are missing http.
You can use urljoin, or simply prepend the scheme yourself, to avoid this problem.
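A minimal sketch of doing that while building start_urls (the file name and the http:// default are taken from the question; adjust as needed). Note also that the %FF%FE and %00 bytes in the error output suggest the txt file may have been saved as UTF-16, so re-saving it as UTF-8, or opening it with the matching encoding, may be needed as well:
import io

# use encoding='utf-16' instead if the file really is UTF-16
with io.open('realtor2.txt', encoding='utf-8-sig') as f:
    start_urls = [
        url.strip() if url.strip().startswith(('http://', 'https://'))
        else 'http://' + url.strip()
        for url in f if url.strip()
    ]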

OpenStack SDK - How to create image with Kernel id and Ramdisk parameters?

I've been trying to create an OpenStack image specifying the kernel id and ramdisk id, using the OpenStack Unified SDK (https://github.com/openstack/python-openstacksdk), but without success. I know this is possible, because the OpenStack CLI has these parameters, as shown on this page (http://docs.openstack.org/cli-reference/glance.html#glance-image-create), where the CLI has the "--kernel-id" and "--ramdisk-id" parameters. I've used these parameters in the terminal and confirmed they work, but I need to use them in Python.
I'm trying to use the upload_image method, as described here http://developer.openstack.org/sdks/python/openstacksdk/users/proxies/image.html#image-api-v2, but I can't get the attrs parameter right. The documentation only says it is supposed to be a dictionary. Here is the code I'm using:
...
atrib = {
    'properties': {
        'kernel_id': 'd84e1f2b-8d8c-4a4a-8858-77a8d5a93cb1',
        'ramdisk_id': 'cfef18e0-006e-477a-a098-593d43435a1e'
    }
}
with open(file) as fimage:
    image = image_service.upload_image(
        name=name,
        data=fimage,
        disk_format='qcow2',
        container_format='bare',
        **atrib)
....
And here is the error I'm getting:
  File "builder.py", line 121, in main
    **atrib
  File "/usr/lib/python2.7/site-packages/openstack/image/v2/_proxy.py", line 51, in upload_image
    **attrs)
  File "/usr/lib/python2.7/site-packages/openstack/proxy2.py", line 193, in _create
    return res.create(self.session)
  File "/usr/lib/python2.7/site-packages/openstack/resource2.py", line 570, in create
    json=request.body, headers=request.headers)
  File "/usr/lib/python2.7/site-packages/keystoneauth1/session.py", line 675, in post
    return self.request(url, 'POST', **kwargs)
  File "/usr/lib/python2.7/site-packages/openstack/session.py", line 52, in map_exceptions_wrapper
    http_status=e.http_status, cause=e)
openstack.exceptions.HttpException: HttpException: Bad Request, 400 Bad Request
Provided object does not match schema 'image':
    {u'kernel_id': u'd84e1f2b-8d8c-4a4a-8858-77a8d5a93cb1', u'ramdisk_id': u'cfef18e0-006e-477a-a098-593d43435a1e'} is not of type 'string'
Failed validating 'type' in schema['additionalProperties']:
    {'type': 'string'}
On instance[u'properties']:
    {u'kernel_id': u'd84e1f2b-8d8c-4a4a-8858-77a8d5a93cb1', u'ramdisk_id': u'cfef18e0-006e-477a-a098-593d43435a1e'}
I already tried to use the update_image method, but without success: passing kernel_id and ramdisk_id as strings creates the instance, but it does not boot.
Does anyone know how to solve this?
Which version of the Glance API are you using?
I have read the code in openstackclient/image/v1/images.py and openstackclient/v1/shell.py:
## in shell.py
def do_image_create(gc, args):
    ...
    fields = dict(filter(lambda x: x[1] is not None, vars(args).items()))
    raw_properties = fields.pop('property')
    fields['properties'] = {}
    for datum in raw_properties:
        key, value = datum.split('=', 1)
        fields['properties'][key] = value
    ...
    image = gc.images.create(**fields)

## in images.py
def create(self, **kwargs):
    ...
    for field in kwargs:
        if field in CREATE_PARAMS:
            fields[field] = kwargs[field]
        elif field == 'return_req_id':
            continue
        else:
            msg = 'create() got an unexpected keyword argument \'%s\''
            raise TypeError(msg % field)
    hdrs = self._image_meta_to_headers(fields)
    ...
    resp, body = self.client.post('/v1/images',
                                  headers=hdrs,
                                  data=image_data)
    ...
and openstackclient/v2/shell.py, openstackclient/image/v2.images.py (and I have debugged this too):
## in shell.py
def do_image_create(gc, args):
    ...
    raw_properties = fields.pop('property', [])
    for datum in raw_properties:
        key, value = datum.split('=', 1)
        fields[key] = value
    ...
    image = gc.images.create(**fields)

## in images.py
def create(self, **kwargs):
    """Create an image."""
    url = '/v2/images'
    image = self.model()
    for (key, value) in kwargs.items():
        try:
            setattr(image, key, value)
        except warlock.InvalidOperation as e:
            raise TypeError(utils.exception_to_str(e))
    resp, body = self.http_client.post(url, data=image)
    ...
It seems that you can create an image your way with API version 1.0, but with version 2.0 you should pass kernel_id and ramdisk_id at the top level, as below:
atrib = {
    'kernel_id': 'd84e1f2b-8d8c-4a4a-8858-77a8d5a93cb1',
    'ramdisk_id': 'cfef18e0-006e-477a-a098-593d43435a1e'
}
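Plugged into the call from the question, that would look roughly like this (a hedged sketch: name, file and image_service come from the question's code, and whether the SDK forwards these attributes is subject to the caveat below):
with open(file) as fimage:
    image = image_service.upload_image(
        name=name,
        data=fimage,
        disk_format='qcow2',
        container_format='bare',
        **atrib)  # kernel_id / ramdisk_id now passed as top-level attributes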
However, it seems the OpenStack SDK can't translate those two arguments into the request (because there is no Body definition in openstack/image/v2/image.py), so you would have to modify the OpenStack SDK to support this.
BTW, the OpenStack code differs a little between versions, but many things are the same.

scrapy InitSpider: set Rules in __init__?

I am building a recursive web spider with an optional login. I want to make most settings dynamic via a JSON config file.
In my __init__ function, I read this file and try to populate all variables; however, this does not work for Rules.
class CrawlpySpider(InitSpider):

    ...

    #----------------------------------------------------------------------
    def __init__(self, *args, **kwargs):
        """Constructor: overwrite parent __init__ function"""

        # Call parent init
        super(CrawlpySpider, self).__init__(*args, **kwargs)

        # Get command line arg provided configuration param
        config_file = kwargs.get('config')

        # Validate configuration file parameter
        if not config_file:
            logging.error('Missing argument "-a config"')
            logging.error('Usage: scrapy crawl crawlpy -a config=/path/to/config.json')
            self.abort = True

        # Check if it is actually a file
        elif not os.path.isfile(config_file):
            logging.error('Specified config file does not exist')
            logging.error('Not found in: "' + config_file + '"')
            self.abort = True

        # All good, read config
        else:
            # Load json config
            fpointer = open(config_file)
            data = fpointer.read()
            fpointer.close()

            # convert JSON to dict
            config = json.loads(data)

            # config['rules'] is simply a string array which looks like this:
            # config['rules'] = [
            #     'password',
            #     'reset',
            #     'delete',
            #     'disable',
            #     'drop',
            #     'logout',
            # ]

            CrawlpySpider.rules = (
                Rule(
                    LinkExtractor(
                        allow_domains=(self.allowed_domains),
                        unique=True,
                        deny=tuple(config['rules'])
                    ),
                    callback='parse',
                    follow=False
                ),
            )
Scrapy still crawls the pages that are present in config['rules'] and therefore also hits the logout page. So the specified pages are not being denied. What am I missing here?
Update:
I have already tried setting CrawlpySpider.rules = ... as well as self.rules = ... inside __init__. Both variants do not work.
Spider: InitSpider
Rules: LinkExtractor
Before crawl: doing login prior to crawling
I even tried to deny those URLs in my parse function:
# Dive deeper?
# The nesting depth is now handled via a custom middle-ware (middlewares.py)
#if curr_depth < self.max_depth or self.max_depth == 0:
links = LinkExtractor().extract_links(response)
for link in links:
    for ignore in self.ignores:
        if (ignore not in link.url) and (ignore.lower() not in link.url.lower()) and link.url.find(ignore) == -1:
            yield Request(link.url, meta={'depth': curr_depth+1, 'referer': response.url})
You are setting a class attribute where you want to set an instance attribute:
# this:
CrawlpySpider.rules = (
# should be this:
self.rules = (
<...>

Create a portal_user_catalog and have it used (Plone)

I'm creating a fork of my Plone site (which has not been forked for a long time). This site has a special catalog object for user profiles (a special Archetypes-based object type) which is called portal_user_catalog:
$ bin/instance debug
>>> portal = app.Plone
>>> print [d for d in portal.objectMap() if d['meta_type'] == 'Plone Catalog Tool']
[{'meta_type': 'Plone Catalog Tool', 'id': 'portal_catalog'},
{'meta_type': 'Plone Catalog Tool', 'id': 'portal_user_catalog'}]
This looks reasonable because the user profiles don't have most of the indexes of the "normal" objects, but have a small set of own indexes.
Since I found no way how to create this object from scratch, I exported it from the old site (as portal_user_catalog.zexp) and imported it in the new site. This seemed to work, but I can't add objects to the imported catalog, not even by explicitly calling the catalog_object method. Instead, the user profiles are added to the standard portal_catalog.
Now I found a module in my product which seems to serve the purpose (Products/myproduct/exportimport/catalog.py):
"""Catalog tool setup handlers.
$Id: catalog.py 77004 2007-06-24 08:57:54Z yuppie $
"""

from Products.GenericSetup.utils import exportObjects
from Products.GenericSetup.utils import importObjects
from Products.CMFCore.utils import getToolByName
from zope.component import queryMultiAdapter

from Products.GenericSetup.interfaces import IBody


def importCatalogTool(context):
    """Import catalog tool.
    """
    site = context.getSite()
    obj = getToolByName(site, 'portal_user_catalog')
    parent_path = ''

    if obj and not obj():
        importer = queryMultiAdapter((obj, context), IBody)
        path = '%s%s' % (parent_path, obj.getId().replace(' ', '_'))
        __traceback_info__ = path
        print [importer]
        if importer:
            print importer.name
            if importer.name:
                path = '%s%s' % (parent_path, 'usercatalog')
            print path
            filename = '%s%s' % (path, importer.suffix)
            print filename
            body = context.readDataFile(filename)
            if body is not None:
                importer.filename = filename  # for error reporting
                importer.body = body

        if getattr(obj, 'objectValues', False):
            for sub in obj.objectValues():
                importObjects(sub, path + '/', context)


def exportCatalogTool(context):
    """Export catalog tool.
    """
    site = context.getSite()
    obj = getToolByName(site, 'portal_user_catalog', None)
    if obj is None:
        logger = context.getLogger('catalog')
        logger.info('Nothing to export.')
        return

    parent_path = ''
    exporter = queryMultiAdapter((obj, context), IBody)
    path = '%s%s' % (parent_path, obj.getId().replace(' ', '_'))
    if exporter:
        if exporter.name:
            path = '%s%s' % (parent_path, 'usercatalog')
        filename = '%s%s' % (path, exporter.suffix)
        body = exporter.body
        if body is not None:
            context.writeDataFile(filename, body, exporter.mime_type)

    if getattr(obj, 'objectValues', False):
        for sub in obj.objectValues():
            exportObjects(sub, path + '/', context)
I tried to use it, but I have no idea how it is supposed to be done;
I can't call it TTW (should I try to publish the methods?!).
I tried it in a debug session:
$ bin/instance debug
>>> portal = app.Plone
>>> from Products.myproduct.exportimport.catalog import exportCatalogTool
>>> exportCatalogTool(portal)
Traceback (most recent call last):
  File "<console>", line 1, in <module>
  File ".../Products/myproduct/exportimport/catalog.py", line 58, in exportCatalogTool
    site = context.getSite()
AttributeError: getSite
So, if this is the way to go, it looks like I need a "real" context.
Update: To get this context, I tried an External Method:
# -*- coding: utf-8 -*-

from Products.myproduct.exportimport.catalog import exportCatalogTool
from pdb import set_trace


def p(dt, dd):
    print '%-16s%s' % (dt + ':', dd)


def main(self):
    """
    Export the portal_user_catalog
    """
    g = globals()
    print '#' * 79
    for a in ('__package__', '__module__'):
        if a in g:
            p(a, g[a])
    p('self', self)
    set_trace()
    exportCatalogTool(self)
However, when I called it, I got the same <PloneSite at /Plone> object as the argument to the main function, and it doesn't have the getSite attribute. Perhaps my site doesn't call such External Methods correctly?
Or would I need to mention this module somehow in my configure.zcml, and if so, how? I searched my directory tree (especially below Products/myproduct/profiles) for exportimport, the module name, and several other strings, but I couldn't find anything; perhaps there was an integration once, but it broke ...
So how do I make this portal_user_catalog work?
Thank you!
Update: Another debug session suggests the source of the problem to be some transaction matter:
>>> portal = app.Plone
>>> puc = portal.portal_user_catalog
>>> puc._catalog()
[]
>>> profiles_folder = portal.some_folder_with_profiles
>>> for o in profiles_folder.objectValues():
...     puc.catalog_object(o)
...
>>> puc._catalog()
[<Products.ZCatalog.Catalog.mybrains object at 0x69ff8d8>, ...]
This population of the portal_user_catalog doesn't persist; after termination of the debug session and starting fg, the brains are gone.
It looks like the problem was indeed related to transactions.
I had
import transaction
...

class Browser(BrowserView):
    ...

    def processNewUser(self):
        ....
        transaction.commit()
before, but apparently this was not good enough (and/or perhaps not done correctly).
Now I start the transaction explicitly with transaction.begin(), save intermediate results with transaction.savepoint(), abort the transaction explicitly with transaction.abort() in case of errors (try / except), and have exactly one transaction.commit() at the end, in the case of success. Everything seems to work.
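A minimal sketch of that pattern, assuming the same BrowserView shown above (the actual profile creation is elided; getToolByName and the profile variable are placeholders for whatever the real handler does):
import transaction
from Products.Five.browser import BrowserView
from Products.CMFCore.utils import getToolByName


class Browser(BrowserView):

    def processNewUser(self):
        transaction.begin()
        try:
            # placeholder: in reality, create/update the user profile object here
            profile = self.context
            puc = getToolByName(self.context, 'portal_user_catalog')
            puc.catalog_object(profile)
            transaction.savepoint()   # keep intermediate results
            # ... any further processing ...
        except Exception:
            transaction.abort()       # roll back everything on error
            raise
        transaction.commit()          # exactly one commit, on success only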
Of course, Plone still doesn't take this non-standard catalog into account; when I "clear and rebuild" it, it is empty afterwards. But for my application it works well enough.
