proxy-pool is already defined as a collector - web-scraping

I'm using both user_agents and proxy pool as bypass. I regrouped all of my spiders in a single one. When I crawl my main spider I face a proxy_pool error "proxy-pool is already defined as a collector"
Is there a way to use proxy_pool for all of my spiders?
Here is the code of my main spider if it can help :
configure_logging()
settings = get_project_settings()
runner = CrawlerRunner(settings)
for spider_name in runner.spiders.list():
runner.crawl(spider_name)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
setting.py
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 610,
'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 620
}
PROXY_POOL_ENABLED = True

Related

Python: Run only one function async

I have a large legacy application that has one function that is a prime candidate to be executed async. It's IO bound (network and disk) and doesn't return anything.
This is a very simple similar implementation:
import random
import time
import requests
def fetch_urls(site):
wait = random.randint(0, 5)
filename = site.split("/")[2].replace(".", "_")
print(f"Will fetch {site} in {wait} seconds")
time.sleep(wait)
r = requests.get(site)
with open(filename, "w") as fd:
fd.write(r.text)
def something(sites):
for site in sites:
fetch_urls(site)
return True
def main():
sites = ["https://www.google.com", "https://www.reddit.com", "https://www.msn.com"]
start = time.perf_counter()
something(sites)
total_time = time.perf_counter() - start
print(f"Finished in {total_time}")
if __name__ == "__main__":
main()
My end goal would be updating the something function to run fetch_urls async.
I cannot change fetch_urls.
All documentation and tutorials I can find assumes my entire application is async (starting from async def main()) but this is not the case.
It's a huge application spanning across multiple modules and re-factoring everything for a single function doesn't look right.
For what I understand I will need to create a loop, add tasks to it and dispatch it somehow, but I tried many different things and I still get everything running just one after another - as oppose to concurrently.
I would appreciate any assistance. Thanks!
Replying to myself, it seems there is no easy way to do that with async. Ended up using concurrent.futures
import time
import requests
import concurrent.futures
def fetch_urls(url, name):
wait = 5
filename = url.split("/")[2].replace(".", "_")
print(f"Will fetch {name} in {wait} seconds")
time.sleep(wait)
r = requests.get(url)
with open(filename, "w") as fd:
fd.write(r.text)
def something(sites):
with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
future_to_url = {
executor.submit(fetch_urls, url["url"], url["name"]): (url)
for url in sites["children"]
}
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
try:
data = future.result()
except Exception as exc:
print("%r generated an exception: %s" % (url, exc))
return True
def main():
sites = {
"parent": "https://stackoverflow.com",
"children": [
{"name": "google", "url": "https://google.com"},
{"name": "reddit", "url": "https://reddit.com"},
],
}
start = time.perf_counter()
something(sites)
total_time = time.perf_counter() - start
print(f"Finished in {total_time}")

Openstack Not able to connect to using rest api

I have a local install on openstack on my virtual box. I am trying to use the lib cloud api to connnect and get a list of images,flavours etc.
Below is the code that I am trying to execute
from libcloud.compute.types import Provider
from libcloud.compute.providers import get_driver
# Authentication information so you can authenticate to DreamCompute
# copy the details from the OpenStack RC file
# https://dashboard.dreamcompute.com/project/access_and_security/api_access/openrc/
auth_username = 'admin'
auth_password = 'f882e2f4eaad434c'
TENANT_NAME = 'admin'
project_name = 'admin'
auth_url = 'http://192.168.56.101:5000/v3/tokens'
region_name = 'RegionOne'
provider = get_driver(Provider.OPENSTACK)
conn = provider(auth_username,
auth_password,
ex_force_auth_url=auth_url,
ex_force_auth_version='2.0_password',
ex_tenant_name=project_name,
ex_force_service_type='compute',
ex_force_service_name='compute',
ex_force_base_url='http://192.168.56.101:8774/v2.1/29a8949bc3a04bfead0654be8e552017')
# Get the image that we want to use by its id
# NOTE: the image_id may change. See the documentation to find
# all the images available to your user
image_id = '4525d442-e9f4-4d19-889f-49ab03be93df'
image = conn.get_image(image_id)
# Get the flavor that we want to use by its id
flavor_id = '100'
flavor = conn.ex_get_size(flavor_id)
# Create the node with the name “PracticeInstance”
# and the size and image we chose above
instance = conn.create_node(name='PracticeInstance', image=image, size=flavor)
When I run the above code I am getting below error:
C:\Python dev\website\music\openstack>python openstack.py
Traceback (most recent call last):
File "openstack.py", line 30, in <module>
image = conn.get_image(image_id)
File "C:\Users\C5265680\AppData\Local\Programs\Python\Python36\lib\site-packages\libcloud\compute\drivers\openstack.py", line 2028, in get_image
'/images/%s' % (image_id,)).object['image'])
File "C:\Users\C5265680\AppData\Local\Programs\Python\Python36\lib\site-packages\libcloud\common\openstack.py", line 223, in request
raw=raw)
File "C:\Users\C5265680\AppData\Local\Programs\Python\Python36\lib\site-packages\libcloud\common\base.py", line 536, in request
action = self.morph_action_hook(action)
File "C:\Users\C5265680\AppData\Local\Programs\Python\Python36\lib\site-packages\libcloud\common\openstack.py", line 290, in morph_action_hook
self._populate_hosts_and_request_paths()
File "C:\Users\C5265680\AppData\Local\Programs\Python\Python36\lib\site-packages\libcloud\common\openstack.py", line 324, in _populate_hosts_and_request_paths
osa = osa.authenticate(**kwargs) # may throw InvalidCreds
File "C:\Users\C5265680\AppData\Local\Programs\Python\Python36\lib\site-packages\libcloud\common\openstack_identity.py", line 855, in authenticate
return self._authenticate_2_0_with_password()
File "C:\Users\C5265680\AppData\Local\Programs\Python\Python36\lib\site-packages\libcloud\common\openstack_identity.py", line 880, in _authenticate_2_0_with_password
return self._authenticate_2_0_with_body(reqbody)
File "C:\Users\C5265680\AppData\Local\Programs\Python\Python36\lib\site-packages\libcloud\common\openstack_identity.py", line 885, in _authenticate_2_0_with_body
method='POST')
File "C:\Users\C5265680\AppData\Local\Programs\Python\Python36\lib\site-packages\libcloud\common\base.py", line 637, in request
response = responseCls(**kwargs)
File "C:\Users\C5265680\AppData\Local\Programs\Python\Python36\lib\site-packages\libcloud\common\base.py", line 157, in __init__
message=self.parse_error())
libcloud.common.exceptions.BaseHTTPError: {"error": {"message": "The resource could not be found.", "code": 404, "title": "Not Found"}}
I have checked the logs in my server at /var/log/keystone and it does not give any error so I am guessing that I am able to login.
Also there are many examples which show that after the above steps I should be able to get list of images/flavors/servers.
Not sure why i am not able to connect. Can someone please help me with this.
I think you should modify the auth_url and provider arguments.
As following argument set is worked at my environment.
auth_username = 'admin'
auth_password = 'f882e2f4eaad434c'
TENANT_NAME = 'admin'
project_name = 'admin'
auth_url = 'http://192.168.56.101:5000'
region_name = 'RegionOne'
provider = get_driver(Provider.OPENSTACK)
conn = provider(auth_username,
auth_password,
ex_force_auth_url=auth_url,
ex_force_auth_version='2.0_password',
ex_tenant_name=project_name)
Updated 2017.11.16
#user8040338 Your error message is the first clue,
The resource could not be found. and status code 404, the most cause of its message and status code is wrong rest api url format.
Firstly, you need to check keystone v2.0 rest api format.
At the same time, you check again Libcloud reference.
For instance, the argument ex_force_auth_version had been specified the api version 2.0_password (v2), but auth_url variable formed the url including the version resource /v3, it was wrong version and usage of API with the libcloud API argument you specified. auth_url should be a base URL from API usage.
The similar processes about each argument of API should be conducted repeatedly until solving issues.

OpenStack SDK - How to create image with Kernel id and Ramdisk parameters?

I've been trying to create an OpenStack image informing the Kernel Id and Ramdisk Id, using the OpenStack Unified SDK (https://github.com/openstack/python-openstacksdk), but without success. I know this is possible, because the OpenStack CLI have this parameters, as shown on this page (http://docs.openstack.org/cli-reference/glance.html#glance-image-create), where the CLI have the "--kernel-id" and "--ramdisk-id" parameters. I've used this parameter in the terminal and confirmed they work, but I need to use them in python.
I'm trying to use the upload_method, as described here http://developer.openstack.org/sdks/python/openstacksdk/users/proxies/image.html#image-api-v2 but I can't get the attrs parameter right. Documentation only say it is suposed to be a dictionary. Here is the code I'm using
...
atrib = {
'properties': {
'kernel_id': 'd84e1f2b-8d8c-4a4a-8858-77a8d5a93cb1',
'ramdisk_id': 'cfef18e0-006e-477a-a098-593d43435a1e'
}
}
with open(file) as fimage:
image = image_service.upload_image(
name=name,
data=fimage,
disk_format='qcow2',
container_format='bare',
**atrib)
....
And here is the error I'm getting:
File "builder.py", line 121, in main
**atrib
File "/usr/lib/python2.7/site-packages/openstack/image/v2/_proxy.py", line 51, in upload_image
**attrs)
File "/usr/lib/python2.7/site-packages/openstack/proxy2.py", line 193, in _create
return res.create(self.session)
File "/usr/lib/python2.7/site-packages/openstack/resource2.py", line 570, in create
json=request.body, headers=request.headers)
File "/usr/lib/python2.7/site-packages/keystoneauth1/session.py", line 675, in post
return self.request(url, 'POST', **kwargs)
File "/usr/lib/python2.7/site-packages/openstack/session.py", line 52, in map_exceptions_wrapper
http_status=e.http_status, cause=e)
openstack.exceptions.HttpException: HttpException: Bad Request, 400 Bad Request
Provided object does not match schema 'image': {u'kernel_id': u'd84e1f2b-8d8c-4a4a-8858-77a8d5a93cb1', u'ramdisk_id': u'cfef18e0-006e-477a-a098-593d43435a1e'} is not of type 'string' Failed validating 'type' in schema['additionalProperties']: {'type': 'string'} On instance[u'properties']: {u'kernel_id': u'd84e1f2b-8d8c-4a4a-8858-77a8d5a93cb1', u'ramdisk_id': u'cfef18e0-006e-477a-a098-593d43435a1e'}
Already tried to use the update_image method, but without success, passing kernel id and ramdisk id as a strings creates the instance, but it does not boot.
Does anyone knows how to solve this?
what the version of the glance api you use?
I have read the code in openstackclient/image/v1/images.py, openstackclient/v1/shell.py
## in shell.py
def do_image_create(gc, args):
...
fields = dict(filter(lambda x: x[1] is not None, vars(args).items()))
raw_properties = fields.pop('property')
fields['properties'] = {}
for datum in raw_properties:
key, value = datum.split('=', 1)
fields['properties'][key] = value
...
image = gc.images.create(**fields)
## in images.py
def create(self, **kwargs):
...
for field in kwargs:
if field in CREATE_PARAMS:
fields[field] = kwargs[field]
elif field == 'return_req_id':
continue
else:
msg = 'create() got an unexpected keyword argument \'%s\''
raise TypeError(msg % field)
hdrs = self._image_meta_to_headers(fields)
...
resp, body = self.client.post('/v1/images',
headers=hdrs,
data=image_data)
...
and openstackclient/v2/shell.py,openstackclient/image/v2.images.py(and i have debuged this too)
## in shell.py
def do_image_create(gc, args):
...
raw_properties = fields.pop('property', [])
for datum in raw_properties:
key, value = datum.split('=', 1)
fields[key] = value
...
image = gc.images.create(**fields)
##in images.py
def create(self, **kwargs):
"""Create an image.""
url = '/v2/images'
image = self.model()
for (key, value) in kwargs.items():
try:
setattr(image, key, value)
except warlock.InvalidOperation as e:
raise TypeError(utils.exception_to_str(e))
resp, body = self.http_client.post(url, data=image)
...
it seems that, you can create a image use your way in version 1.0, but in version 2.0, you should use the kernel_id and ramdisk_id as below
atrib = {
'kernel_id': 'd84e1f2b-8d8c-4a4a-8858-77a8d5a93cb1',
'ramdisk_id': 'cfef18e0-006e-477a-a098-593d43435a1e'
}
but the OpenStack SDK seems it can't trans those two argments to the url (because there is no Body define in openstack/image/v2/image.py. so you should modify the OpenStack SDK to support this.
BTW, the code of OpenStack is a little different from it's version, but many things are same.

Create a portal_user_catalog and have it used (Plone)

I'm creating a fork of my Plone site (which has not been forked for a long time). This site has a special catalog object for user profiles (a special Archetypes-based object type) which is called portal_user_catalog:
$ bin/instance debug
>>> portal = app.Plone
>>> print [d for d in portal.objectMap() if d['meta_type'] == 'Plone Catalog Tool']
[{'meta_type': 'Plone Catalog Tool', 'id': 'portal_catalog'},
{'meta_type': 'Plone Catalog Tool', 'id': 'portal_user_catalog'}]
This looks reasonable because the user profiles don't have most of the indexes of the "normal" objects, but have a small set of own indexes.
Since I found no way how to create this object from scratch, I exported it from the old site (as portal_user_catalog.zexp) and imported it in the new site. This seemed to work, but I can't add objects to the imported catalog, not even by explicitly calling the catalog_object method. Instead, the user profiles are added to the standard portal_catalog.
Now I found a module in my product which seems to serve the purpose (Products/myproduct/exportimport/catalog.py):
"""Catalog tool setup handlers.
$Id: catalog.py 77004 2007-06-24 08:57:54Z yuppie $
"""
from Products.GenericSetup.utils import exportObjects
from Products.GenericSetup.utils import importObjects
from Products.CMFCore.utils import getToolByName
from zope.component import queryMultiAdapter
from Products.GenericSetup.interfaces import IBody
def importCatalogTool(context):
"""Import catalog tool.
"""
site = context.getSite()
obj = getToolByName(site, 'portal_user_catalog')
parent_path=''
if obj and not obj():
importer = queryMultiAdapter((obj, context), IBody)
path = '%s%s' % (parent_path, obj.getId().replace(' ', '_'))
__traceback_info__ = path
print [importer]
if importer:
print importer.name
if importer.name:
path = '%s%s' % (parent_path, 'usercatalog')
print path
filename = '%s%s' % (path, importer.suffix)
print filename
body = context.readDataFile(filename)
if body is not None:
importer.filename = filename # for error reporting
importer.body = body
if getattr(obj, 'objectValues', False):
for sub in obj.objectValues():
importObjects(sub, path+'/', context)
def exportCatalogTool(context):
"""Export catalog tool.
"""
site = context.getSite()
obj = getToolByName(site, 'portal_user_catalog', None)
if tool is None:
logger = context.getLogger('catalog')
logger.info('Nothing to export.')
return
parent_path=''
exporter = queryMultiAdapter((obj, context), IBody)
path = '%s%s' % (parent_path, obj.getId().replace(' ', '_'))
if exporter:
if exporter.name:
path = '%s%s' % (parent_path, 'usercatalog')
filename = '%s%s' % (path, exporter.suffix)
body = exporter.body
if body is not None:
context.writeDataFile(filename, body, exporter.mime_type)
if getattr(obj, 'objectValues', False):
for sub in obj.objectValues():
exportObjects(sub, path+'/', context)
I tried to use it, but I have no idea how it is supposed to be done;
I can't call it TTW (should I try to publish the methods?!).
I tried it in a debug session:
$ bin/instance debug
>>> portal = app.Plone
>>> from Products.myproduct.exportimport.catalog import exportCatalogTool
>>> exportCatalogTool(portal)
Traceback (most recent call last):
File "<console>", line 1, in <module>
File ".../Products/myproduct/exportimport/catalog.py", line 58, in exportCatalogTool
site = context.getSite()
AttributeError: getSite
So, if this is the way to go, it looks like I need a "real" context.
Update: To get this context, I tried an External Method:
# -*- coding: utf-8 -*-
from Products.myproduct.exportimport.catalog import exportCatalogTool
from pdb import set_trace
def p(dt, dd):
print '%-16s%s' % (dt+':', dd)
def main(self):
"""
Export the portal_user_catalog
"""
g = globals()
print '#' * 79
for a in ('__package__', '__module__'):
if a in g:
p(a, g[a])
p('self', self)
set_trace()
exportCatalogTool(self)
However, wenn I called it, I got the same <PloneSite at /Plone> object as the argument to the main function, which didn't have the getSite attribute. Perhaps my site doesn't call such External Methods correctly?
Or would I need to mention this module somehow in my configure.zcml, but how? I searched my directory tree (especially below Products/myproduct/profiles) for exportimport, the module name, and several other strings, but I couldn't find anything; perhaps there has been an integration once but was broken ...
So how do I make this portal_user_catalog work?
Thank you!
Update: Another debug session suggests the source of the problem to be some transaction matter:
>>> portal = app.Plone
>>> puc = portal.portal_user_catalog
>>> puc._catalog()
[]
>>> profiles_folder = portal.some_folder_with_profiles
>>> for o in profiles_folder.objectValues():
... puc.catalog_object(o)
...
>>> puc._catalog()
[<Products.ZCatalog.Catalog.mybrains object at 0x69ff8d8>, ...]
This population of the portal_user_catalog doesn't persist; after termination of the debug session and starting fg, the brains are gone.
It looks like the problem was indeed related with transactions.
I had
import transaction
...
class Browser(BrowserView):
...
def processNewUser(self):
....
transaction.commit()
before, but apparently this was not good enough (and/or perhaps not done correctly).
Now I start the transaction explicitly with transaction.begin(), save intermediate results with transaction.savepoint(), abort the transaction explicitly with transaction.abort() in case of errors (try / except), and have exactly one transaction.commit() at the end, in the case of success. Everything seems to work.
Of course, Plone still doesn't take this non-standard catalog into account; when I "clear and rebuild" it, it is empty afterwards. But for my application it works well enough.

Development Mode For uWSGI/Pylons (Reload new code)

I have a setup such that an nginx server passes control off to uWsgi, which launches a pylons app using the following in my xml configuration file:
<ini-paste>...</ini-paste>
Everything is working nicely, and I was able to set it to debug mode using the following in the associated ini file, like:
debug = true
Except debug mode only prints out errors, and doesn't reload the code everytime a file has been touched. If I was running directly through paste, I could use the --reload option, but going through uWsgi complicates things.
Does anybody know of a way to tell uWsgi to tell paste to set the --reload option, or to do this directly in the paste .ini file?
I used something like the following code to solve this, the monitorFiles(...) method is called on application initialization, and it monitors the files, sending the TERM signal when it sees a change.
I'd still much prefer a solution using paster's --reload argument, as I imagine this solution has bugs:
import os
import time
import signal
from deepthought.system import deployment
from multiprocessing.process import Process
def monitorFiles():
if deployment.getDeployment().dev and not FileMonitor.isRunning:
monitor = FileMonitor(os.getpid())
try: monitor.start()
except: print "Something went wrong..."
class FileMonitor(Process):
isRunning = False
def __init__(self, masterPid):
self.updates = {}
self.rootDir = deployment.rootDir() + "/src/python"
self.skip = len(self.rootDir)
self.masterPid = masterPid
FileMonitor.isRunning = True
Process.__init__(self)
def run(self):
while True:
self._loop()
time.sleep(5)
def _loop(self):
for root, _, files in os.walk(self.rootDir):
for file in files:
if file.endswith(".py"):
self._monitorFile(root, file)
def _monitorFile(self, root, file):
mtime = os.path.getmtime("%s/%s" % (root, file))
moduleName = "%s/%s" % (root[self.skip+1:], file[:-3])
moduleName = moduleName.replace("/",".")
if not moduleName in self.updates:
self.updates[moduleName] = mtime
elif self.updates[moduleName] < mtime:
print "Change detected in %s" % moduleName
self._restartWorker()
self.updates[moduleName] = mtime
def _restartWorker(self):
os.kill(self.masterPid, signal.SIGTERM)
Use the signal framework in 0.9.7 tree
http://projects.unbit.it/uwsgi/wiki/SignalFramework
An example of auto-reloading:
import uwsgi
uwsgi.register_signal(1, "", uwsgi.reload)
uwsgi.add_file_monitor(1, 'myfile.py')
def application(env, start_response):
...

Resources