Should a single Scrapy crawler process with no rate limit be as fast as multiple crawler processes? - web-scraping

I have set CONCURRENT_REQUESTS, CONCURRENT_REQUESTS_PER_DOMAIN and CONCURRENT_REQUESTS_PER_IP to 1,000,000, but a single crawler still cannot match the speed of running multiple crawler processes, each handling part of a list of URLs. Is that to be expected? In fact, running 8 crawler processes is about 8x faster.
I am not sure what I am configuring wrong. I would expect a single crawler process with no rate limit to run at the maximum possible speed, so it should be as fast as running 8 crawler processes.
import csv
import scrapy
import random
from urllib.parse import urlencode
from pprint import pprint
import requests
import re
import json

class XXXSpider(scrapy.Spider):
    name = 'xxx'

    def start_requests(self):
        base_url = 'xxx'
        base_query = 'yyy'
        for s in self.words:
            token = random.choice(self.tokens)
            headers['token'] = token
            user_agent = random.choice(user_agents)
            headers['User-Agent'] = user_agent
            params['q'] = base_query.format("${:s}".format(s))
            encoded_params = urlencode(params)
            xxx_url = "{:s}?{:s}".format(base_url, encoded_params)
            yield scrapy.Request(url=xxx_url, headers=headers, callback=self.parse)

    def parse(self, response):
        data = json.loads(response.body)
Multiple crawler processes:
from scrapy.crawler import CrawlerProcess
import re
import requests
from multiprocessing import Pool
import csv

if __name__ == "__main__":
    num_processes = 32
    pool = Pool(num_processes)
    tokens = pool.map(request_token, range(num_processes))

    concurrency = 8
    process = CrawlerProcess()
    split_size = len(words) // concurrency
    for i in range(0, len(words), split_size):
        split = words[i: i + split_size]
        process.crawl(XXXSpider, tokens=tokens, words=split)
    process.start()
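For reference, the concurrency settings quoted above can also be passed straight to CrawlerProcess so every spider it starts picks them up. This is only a minimal sketch using the values from the question, not a fix for the speed gap:

# Minimal sketch: hand the concurrency settings quoted in the question directly
# to CrawlerProcess (DOWNLOAD_DELAY defaults to 0, so no extra throttling applies).
process = CrawlerProcess(settings={
    'CONCURRENT_REQUESTS': 1000000,
    'CONCURRENT_REQUESTS_PER_DOMAIN': 1000000,
    'CONCURRENT_REQUESTS_PER_IP': 1000000,
})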

Related

How do I resolve an async Tornado fetch Future error?

I am trying to use AsyncHTTPClient to GET/POST from a local service that is already running on port 6000,
but I keep getting the error RuntimeError: Task got bad yield: <tornado.concurrent.Future object at 0x03C9B490>.
P.S. I'm using Tornado 4.4.2; this error is fixed in the latest version, but how do I do it in 4.4.2? Please help!
import tornado.ioloop
from tornado.httpclient import AsyncHTTPClient
import asyncio
import tornado
import urllib
from datetime import datetime
import time

async def client(url):
    http_client = AsyncHTTPClient()
    response = await http_client.fetch(url)
    return response.body

async def main():
    http_client = AsyncHTTPClient()
    url = "http://localhost:6000/listings"
    result = await client(url)
    print(result)

if __name__ == "__main__":
    result = asyncio.run(main())
    print(result)
    print(int(time.time() * 1e6))
You can't use asyncio with Tornado prior to version 5.0.
Use Tornado's own ioloop to run your program:
from tornado import ioloop

if __name__ == "__main__":
    result = ioloop.IOLoop.current().run_sync(main)
UPDATE: The above solution will work fine, but, if you want, you can use asyncio with Tornado 4.x. See: tornado.platform.asyncio.AsyncIOMainLoop.
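A minimal sketch of that asyncio route under Tornado 4.x follows; it reworks client() from the question rather than reproducing it exactly, and wraps the Tornado future with to_asyncio_future so asyncio can await it:

# Sketch only (Tornado 4.x): share the event loop with asyncio by installing
# AsyncIOMainLoop, and convert Tornado futures with to_asyncio_future before
# awaiting them from code driven by asyncio itself.
import asyncio
from tornado.httpclient import AsyncHTTPClient
from tornado.platform.asyncio import AsyncIOMainLoop, to_asyncio_future

async def client(url):
    http_client = AsyncHTTPClient()
    response = await to_asyncio_future(http_client.fetch(url))
    return response.body

if __name__ == "__main__":
    AsyncIOMainLoop().install()
    result = asyncio.get_event_loop().run_until_complete(
        client("http://localhost:6000/listings"))
    print(result)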

I keep getting an empty task list from APScheduler

I am writing a website with Flask and run it with uwsgi + nginx. I needed a timer to execute tasks periodically, so I used uwsgidecorators. The task should check the status of the scheduler's jobs. To get the list of jobs I used get_jobs(), but the list I get is always empty.
webapp/__init__.py:
# -*- coding: utf-8 -*-
from gevent import monkey
monkey.patch_all()
import grpc.experimental.gevent
grpc.experimental.gevent.init_gevent()
from flask import Flask, session, request
from config import DevelopConfig, MqttConfig, MailConfig, ProductionConfig
from flask_sqlalchemy import SQLAlchemy
from flask_migrate import Migrate
from flask_mail import Mail
from flask_script import Manager
from flask_socketio import SocketIO
from flask_mqtt import Mqtt
from flask_login import LoginManager
from flask_babel import Babel
from flask_babel_js import BabelJS
from flask_babel import lazy_gettext as _l
from apscheduler.schedulers.gevent import GeventScheduler

app = Flask(__name__)
app.config.from_object(ProductionConfig)
app.config.from_object(MqttConfig)
app.config.from_object(MailConfig)
db = SQLAlchemy(app)
migrate = Migrate(app, db, render_as_batch=True)
mail = Mail(app)
mqtt = Mqtt(app)
manager = Manager(app, db)
login_manager = LoginManager(app)
login_manager.login_view = 'auth'
login_manager.login_message = _l("Необходимо авторизоваться для доступа к закрытой странице")
login_manager.login_message_category = "error"

scheduler = GeventScheduler()
scheduler.start()
scheduler.add_job(publish_async, args=["Hello"], id="job", trigger='interval', seconds=2)

socketio = SocketIO(app, async_mode='gevent_uwsgi')  # Production Version
babel = Babel(app)
babeljs = BabelJS(app=app, view_path='/translations/')

import webapp.views

@babel.localeselector
def get_locale():
    # If the user has set the language manually it is stored in the session,
    # so we use the locale from the user settings.
    try:
        language = session['language']
    except KeyError:
        language = None
    if language is not None:
        print(language)
        return language
    return request.accept_languages.best_match(app.config['LANGUAGES'].keys())

from webapp import models

def publish_async(message):
    print(message)
webapp/tasks.py:
from uwsgidecorators import timer

@timer(10, target='spooler')
def check_run_tasks(args):
    _list_schedulers = _scheduler_method.get_jobs()
    print(_list_schedulers)
wsgi.ini:
[uwsgi]
env = PYTHONIOENCODING=UTF-8
module = wsgi:app
master = true
# processes = 5
enable-threads = true
gevent = 1024
gevent-monkey-patch = true
buffer-size=32768
# lazy-apps = true
socket = /home/sammy/projectnew/projectnew.sock
socket-timeout = 30
chmod-socket = 664
thunder-lock = true
spooler = /home/sammy/projectnew/webapp/mytasks
import = webapp/tasks.py
vacuum = true
die-on-term = true
wsgi.py:
# -*- coding: utf-8 -*-
from webapp import app, socketio
if __name__ == '__main__':
    socketio.run(app, use_reloader=False)
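As an aside, _scheduler_method in tasks.py is never defined in the snippets above. A minimal sketch of what the timer presumably intends, assuming it should query the GeventScheduler created in webapp/__init__.py (whether the uwsgi spooler process actually sees that same instance is exactly what is in question here):

# webapp/tasks.py -- sketch only, assuming the timer should query the scheduler
# object created in webapp/__init__.py. The spooler runs in its own process, so
# it may hold a separate copy of the application state.
from uwsgidecorators import timer
from webapp import scheduler

@timer(10, target='spooler')
def check_run_tasks(args):
    jobs = scheduler.get_jobs()  # list of apscheduler.job.Job instances
    print(jobs)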

How to change xcom in Airflow to accommodate large data?

I am using the following code in my Airflow operator:
import json
import pandas as pd
from airflow.exceptions import AirflowException
from airflow.hooks.http_hook import HttpHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook

class HttpToGoogleCloudStorageOperator(BaseOperator):
    template_fields = ['endpoint', 'data', 'headers', ]
    template_ext = ()
    ui_color = '#f4a460'

    @apply_defaults
    def __init__(self,
                 endpoint,
                 project_id,
                 table_id,
                 data=None,
                 headers=None,
                 auth=None,
                 http_conn_id='http_default',
                 *args, **kwargs):
        super(HttpToGoogleCloudStorageOperator, self).__init__(*args, **kwargs)
        self.table_id = table_id
        self.http_conn_id = http_conn_id
        self.method = "GET"
        self.endpoint = endpoint
        self.headers = headers or {}
        self.auth = auth
        self.data = data or {}

    def execute(self, context):
        http = HttpHook(self.method, http_conn_id=self.http_conn_id)
        self.log.info("Calling HTTP method " + self.endpoint)
        response = http.run(self.endpoint, self.data, self.headers, auth=self.auth)
        self.log.info("Got response")
Unfortunately the data returned is too large (about 5k) to fit in the standard xcom and I get this error:
{taskinstance.py:1059} ERROR - (_mysql_exceptions.DataError) (1406, "Data too long for column 'value' at row 1")
Is there a way I can tell http_hook to use a different xcom, or (even better) not use xcom at all? I have looked around and I do not see a solution.
Thanks for any tips or pointers.
Edit: Here is how I call the operator. Note that nowhere do I specify xcom.
query_load_task = HttpToGoogleCloudStorageOperator(
    task_id="query_load_task",
    endpoint=endpoint,
    project_id="my_gcp_poroject_id",
    table_id="dataset.table",
    data=None,
    auth=(username, password))
It's preferable to store the data in a system designed for that (e.g. the file system, AWS S3, Azure, etc.) and instead return a unique identifier that references the location of the data; for the file system this would likely be the full path (e.g. /tmp/acme_response_20200709.csv). That way you leverage the best of both the storage system and your database.
If you add your code I'd be happy to take a crack at writing up some pseudo-code as an example.
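A minimal sketch of that pattern applied to the operator above: persist the body somewhere durable and hand back only a reference. The directory and file-naming scheme are placeholders taken from the example path in the answer; an operator's return value is pushed to XCom by default, so only the short path string touches the metadata database.

# Sketch of the "store the payload elsewhere, XCom only a reference" pattern.
# The output directory and naming scheme are placeholders.
import os
from datetime import datetime

def save_response_and_get_reference(response_text, out_dir="/tmp"):
    """Write the payload to disk and return the path to use as the XCom value."""
    path = os.path.join(out_dir, "acme_response_{}.csv".format(
        datetime.utcnow().strftime("%Y%m%d")))
    with open(path, "w") as fh:
        fh.write(response_text)
    return path

# Inside execute(), the operator would then end with something like:
#     return save_response_and_get_reference(response.text)
# so downstream tasks pull just the path from XCom and read the file themselves.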

Can't get a fresh length of a list with the Telegram Bot API

I have a Flask Telegram bot built with pyTelegramBotAPI and deployed on Heroku. I need the start message to show the fresh length of a list; the list is refreshed every 5 minutes in gettinglist.py. I can't find my mistake, please help.
bot.py
import os

import config
import gettinglist
from gettinglist import getting_list
import telebot
from flask import Flask, request
from threading import Thread

app = Flask(__name__)  # the Flask app is referenced below but was not shown in the question

def app_run():
    app.run(host="0.0.0.0", port=os.environ.get('PORT', 80))

msg_start = """
Length of list now: %d
""" % config.LIST_LENGHT

application_thread = Thread(target=app_run)
getting_list_thread = Thread(target=getting_list)

bot = telebot.TeleBot("<MY_BOT_TOKEN>")

@bot.message_handler(commands=['start'])
def start(m):
    cid = m.chat.id
    bot.send_message(cid, msg_start, parse_mode='html')

@app.route("/bot", methods=['POST'])
def getMessage():
    bot.process_new_updates([telebot.types.Update.de_json(request.stream.read().decode("utf-8"))])
    return "ok", 200

@app.route("/")
def webhook():
    bot.remove_webhook()
    bot.set_webhook(url="<HEROKU_APP_URL>")
    return "ok", 200

if __name__ == '__main__':
    application_thread.start()
    getting_list_thread.start()
gettinglist.py
import config
from time import sleep

LIST_LENGHT = 0
LIST = []

def getting_list():
    while True:
        global LIST
        global LIST_LENGHT
        LIST = [num for num in range(0, 100)]
        config.LIST_LENGHT = len(LIST)
        return LIST
        sleep(300)
config.py
LIST_LENGHT = 0
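For illustration only: the %-formatting of msg_start above runs once, when bot.py is imported, so the value it captures never changes. A sketch of reading the value at message time instead, reusing the names from the question (not a tested fix):

# Sketch: build the reply when /start arrives, so the length reflects whatever
# getting_list() stored most recently rather than the value captured at import.
@bot.message_handler(commands=['start'])
def start(m):
    msg = "Length of list now: %d" % len(gettinglist.LIST)
    bot.send_message(m.chat.id, msg, parse_mode='html')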

How to load multiple pages one by one in QWebPage

I am trying to crawl news article pages for comments. After some research I found that most websites load comments in an iframe, so I want to get the "src" of that iframe. I am using QtWebKit from PySide. It actually works, but only once; it does not load the other web pages. I am using the following code:
import sys
import pymysql
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from pprint import pprint
from bs4 import BeautifulSoup

class Render(QWebPage):
    def __init__(self, url):
        try:
            self.app = QApplication(sys.argv)
        except RuntimeError:
            self.app = QCoreApplication.instance()
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()

def visit(url):
    r = Render(url)
    p = r.frame.toHtml()
    f_url = str(r.frame.url().toString())
    return p

def is_comment_url(url):
    lower_url = url.lower()
    return lower_url.find("comment") > 0

with open("urls.txt") as f:
    content = f.read().splitlines()

list_of_urls = []
for url in content:
    page = visit(url)
    soup = BeautifulSoup(page)
    for tag in soup.findAll('iframe', src=True):
        link = tag['src']
        if is_comment_url(link):
            print(link)
            list_of_urls.append(link)

pprint(list_of_urls)
But the issue is that it works only for a single iteration and then gets stuck.
Also, is there any way to save a web page exactly as the browser displays it (after executing all the JavaScript, etc.)?
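On the second point, frame.toHtml() already returns the DOM as it stands when loadFinished fires (i.e. after the page's JavaScript has run up to that moment), so a sketch of saving it would reuse visit() from above; the URL and output file name here are arbitrary:

# Sketch: persist the post-JavaScript DOM returned by QWebFrame.toHtml().
# io.open keeps this working on both Python 2 (PySide/Qt4 era) and Python 3.
import io

html = visit("http://example.com/some-article")
with io.open("rendered.html", "w", encoding="utf-8") as out:
    out.write(html)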
