Should a single Scrapy crawler process with no rate limit be as fast as multiple crawler processes? - web-scraping

I have set CONCURRENT_REQUESTS, CONCURRENT_REQUESTS_PER_DOMAIN and CONCURRENT_REQUESTS_PER_IP to 1,000,000, but a single crawler still cannot match the speed of running multiple crawler processes, each handling part of a list of URLs. Is that to be expected? In fact, running 8 crawler processes is about 8x faster.
I am not sure what I am configuring wrong. I would expect a single crawler process with no rate limit to run at the maximum possible speed, so it should be as fast as running 8 crawler processes.
import csv
import scrapy
import random
from urllib.parse import urlencode
from pprint import pprint
import requests
import re
import json

class XXXSpider(scrapy.Spider):
    name = 'xxx'

    def start_requests(self):
        base_url = 'xxx'
        base_query = 'yyy'
        for s in self.words:
            token = random.choice(self.tokens)
            headers['token'] = token
            user_agent = random.choice(user_agents)
            headers['User-Agent'] = user_agent
            params['q'] = base_query.format("${:s}".format(s))
            encoded_params = urlencode(params)
            xxx_url = "{:s}?{:s}".format(base_url, encoded_params)
            yield scrapy.Request(url=xxx_url, headers=headers, callback=self.parse)

    def parse(self, response):
        data = json.loads(response.body)
Multiple crawler processes:
from scrapy.crawler import CrawlerProcess
import re
import requests
from multiprocessing import Pool
import csv

if __name__ == "__main__":
    num_processes = 32
    pool = Pool(num_processes)
    tokens = pool.map(request_token, range(num_processes))

    concurrency = 8
    process = CrawlerProcess()
    split_size = len(words) // concurrency
    for i in range(0, len(words), split_size):
        split = words[i: i + split_size]
        process.crawl(XXXSpider, tokens=tokens, words=split)
    process.start()
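For reference, the concurrency settings quoted above can also be passed straight to CrawlerProcess so every spider it starts picks them up. This is only a minimal sketch using the values from the question, not a fix for the speed gap:

# Minimal sketch: hand the concurrency settings quoted in the question directly
# to CrawlerProcess (DOWNLOAD_DELAY defaults to 0, so no extra throttling applies).
process = CrawlerProcess(settings={
    'CONCURRENT_REQUESTS': 1000000,
    'CONCURRENT_REQUESTS_PER_DOMAIN': 1000000,
    'CONCURRENT_REQUESTS_PER_IP': 1000000,
})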

Related

How do I resolve an async Tornado fetch Future error?

I am trying to use AsyncHTTPClient to GET/POST from a local service that is already running on port 6000,
but I keep getting the error RuntimeError: Task got bad yield: <tornado.concurrent.Future object at 0x03C9B490>.
P.S. I'm using Tornado 4.4.2; this error is fixed in the latest version, but how do I do it in 4.4.2? Please help!
import tornado.ioloop
from tornado.httpclient import AsyncHTTPClient
import asyncio
import tornado
import urllib
from datetime import datetime
import time

async def client(url):
    http_client = AsyncHTTPClient()
    response = await http_client.fetch(url)
    return response.body

async def main():
    http_client = AsyncHTTPClient()
    url = "http://localhost:6000/listings"
    result = await client(url)
    print(result)

if __name__ == "__main__":
    result = asyncio.run(main())
    print(result)
    print(int(time.time() * 1e6))
You can't use asyncio with Tornado prior to version 5.0.
Use Tornado's own ioloop to run your program:
from tornado import ioloop

if __name__ == "__main__":
    result = ioloop.IOLoop.current().run_sync(main)
UPDATE: The above solution will work fine, but, if you want, you can use asyncio with Tornado 4.x. See: tornado.platform.asyncio.AsyncIOMainLoop.
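A minimal sketch of that asyncio route under Tornado 4.x follows; it reworks client() from the question rather than reproducing it exactly, and wraps the Tornado future with to_asyncio_future so asyncio can await it:

# Sketch only (Tornado 4.x): share the event loop with asyncio by installing
# AsyncIOMainLoop, and convert Tornado futures with to_asyncio_future before
# awaiting them from code driven by asyncio itself.
import asyncio
from tornado.httpclient import AsyncHTTPClient
from tornado.platform.asyncio import AsyncIOMainLoop, to_asyncio_future

async def client(url):
    http_client = AsyncHTTPClient()
    response = await to_asyncio_future(http_client.fetch(url))
    return response.body

if __name__ == "__main__":
    AsyncIOMainLoop().install()
    result = asyncio.get_event_loop().run_until_complete(
        client("http://localhost:6000/listings"))
    print(result)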

I keep getting an empty task list from APScheduler

I am writing a website with Flask and run it with uwsgi + nginx. I needed a timer to execute tasks periodically, so I used uwsgidecorators. The task should check the status of the scheduler's jobs. To get the list of jobs I used get_jobs(), but the list I get is always empty.
webapp/__init__.py:
# -*- coding: utf-8 -*-
from gevent import monkey
monkey.patch_all()
import grpc.experimental.gevent
grpc.experimental.gevent.init_gevent()
from flask import Flask, session, request
from config import DevelopConfig, MqttConfig, MailConfig, ProductionConfig
from flask_sqlalchemy import SQLAlchemy
from flask_migrate import Migrate
from flask_mail import Mail
from flask_script import Manager
from flask_socketio import SocketIO
from flask_mqtt import Mqtt
from flask_login import LoginManager
from flask_babel import Babel
from flask_babel_js import BabelJS
from flask_babel import lazy_gettext as _l
from apscheduler.schedulers.gevent import GeventScheduler

app = Flask(__name__)
app.config.from_object(ProductionConfig)
app.config.from_object(MqttConfig)
app.config.from_object(MailConfig)
db = SQLAlchemy(app)
migrate = Migrate(app, db, render_as_batch=True)
mail = Mail(app)
mqtt = Mqtt(app)
manager = Manager(app, db)
login_manager = LoginManager(app)
login_manager.login_view = 'auth'
login_manager.login_message = _l("Необходимо авторизоваться для доступа к закрытой странице")
login_manager.login_message_category = "error"

scheduler = GeventScheduler()
scheduler.start()
scheduler.add_job(publish_async, args=["Hello"], id="job", trigger='interval', seconds=2)

socketio = SocketIO(app, async_mode='gevent_uwsgi')  # Production Version
babel = Babel(app)
babeljs = BabelJS(app=app, view_path='/translations/')

import webapp.views

@babel.localeselector
def get_locale():
    # If the user has set the language manually it is stored in the session,
    # so we use the locale from the user settings.
    try:
        language = session['language']
    except KeyError:
        language = None
    if language is not None:
        print(language)
        return language
    return request.accept_languages.best_match(app.config['LANGUAGES'].keys())

from webapp import models

def publish_async(message):
    print(message)
webapp/tasks.py:
from uwsgidecorators import timer

@timer(10, target='spooler')
def check_run_tasks(args):
    _list_schedulers = _scheduler_method.get_jobs()
    print(_list_schedulers)
wsgi.ini:
[uwsgi]
env = PYTHONIOENCODING=UTF-8
module = wsgi:app
master = true
# processes = 5
enable-threads = true
gevent = 1024
gevent-monkey-patch = true
buffer-size=32768
# lazy-apps = true
socket = /home/sammy/projectnew/projectnew.sock
socket-timeout = 30
chmod-socket = 664
thunder-lock = true
spooler = /home/sammy/projectnew/webapp/mytasks
import = webapp/tasks.py
vacuum = true
die-on-term = true
wsgi.py:
# -*- coding: utf-8 -*-
from webapp import app, socketio
if __name__ == '__main__':
    socketio.run(app, use_reloader=False)
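As an aside, _scheduler_method in tasks.py is never defined in the snippets above. A minimal sketch of what the timer presumably intends, assuming it should query the GeventScheduler created in webapp/__init__.py (whether the uwsgi spooler process actually sees that same instance is exactly what is in question here):

# webapp/tasks.py -- sketch only, assuming the timer should query the scheduler
# object created in webapp/__init__.py. The spooler runs in its own process, so
# it may hold a separate copy of the application state.
from uwsgidecorators import timer
from webapp import scheduler

@timer(10, target='spooler')
def check_run_tasks(args):
    jobs = scheduler.get_jobs()  # list of apscheduler.job.Job instances
    print(jobs)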

How to change xcom in Airflow to accommodate large data?

I am using the following code in my Airflow operator:
import json
import pandas as pd
from airflow.exceptions import AirflowException
from airflow.hooks.http_hook import HttpHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook

class HttpToGoogleCloudStorageOperator(BaseOperator):
    template_fields = ['endpoint', 'data', 'headers', ]
    template_ext = ()
    ui_color = '#f4a460'

    @apply_defaults
    def __init__(self,
                 endpoint,
                 project_id,
                 table_id,
                 data=None,
                 headers=None,
                 auth=None,
                 http_conn_id='http_default',
                 *args, **kwargs):
        super(HttpToGoogleCloudStorageOperator, self).__init__(*args, **kwargs)
        self.table_id = table_id
        self.http_conn_id = http_conn_id
        self.method = "GET"
        self.endpoint = endpoint
        self.headers = headers or {}
        self.auth = auth
        self.data = data or {}

    def execute(self, context):
        http = HttpHook(self.method, http_conn_id=self.http_conn_id)
        self.log.info("Calling HTTP method " + self.endpoint)
        response = http.run(self.endpoint, self.data, self.headers, auth=self.auth)
        self.log.info("Got response")
Unfortunately the data returned is too large (about 5k) to fit in the standard xcom and I get this error:
{taskinstance.py:1059} ERROR - (_mysql_exceptions.DataError) (1406, "Data too long for column 'value' at row 1")
Is there a way I can tell http_hook to use a different xcom, or (even better) not use xcom at all? I have looked around and I do not see a solution.
Thanks for any tips or pointers.
Edit: Here is how I call the operator. Note that nowhere do I specify xcom.
query_load_task = HttpToGoogleCloudStorageOperator(
    task_id="query_load_task",
    endpoint=endpoint,
    project_id="my_gcp_poroject_id",
    table_id="dataset.table",
    data=None,
    auth=(username, password))
It's preferable to store the data in a system designed for that (e.g. the file system, AWS S3, Azure, etc.) and instead return a unique identifier that references the location of the data; for the file system this would likely be the full path (e.g. /tmp/acme_response_20200709.csv). That way you leverage the best of both the storage system and your database.
If you add your code I'd be happy to take a crack at writing up some pseudo-code as an example.
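A minimal sketch of that pattern applied to the operator above: persist the body somewhere durable and hand back only a reference. The directory and file-naming scheme are placeholders taken from the example path in the answer; an operator's return value is pushed to XCom by default, so only the short path string touches the metadata database.

# Sketch of the "store the payload elsewhere, XCom only a reference" pattern.
# The output directory and naming scheme are placeholders.
import os
from datetime import datetime

def save_response_and_get_reference(response_text, out_dir="/tmp"):
    """Write the payload to disk and return the path to use as the XCom value."""
    path = os.path.join(out_dir, "acme_response_{}.csv".format(
        datetime.utcnow().strftime("%Y%m%d")))
    with open(path, "w") as fh:
        fh.write(response_text)
    return path

# Inside execute(), the operator would then end with something like:
#     return save_response_and_get_reference(response.text)
# so downstream tasks pull just the path from XCom and read the file themselves.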

Can't get a fresh length of a list with the Telegram Bot API

I have a Flask Telegram bot built with pyTelegramBotAPI and deployed on Heroku. I need the start message to show the fresh length of a list; the list is refreshed every 5 minutes in gettinglist.py. I can't find my mistake, please help.
bot.py
import os

import config
import gettinglist
from gettinglist import getting_list
import telebot
from flask import Flask, request
from threading import Thread

app = Flask(__name__)  # the Flask app is referenced below but was not shown in the question

def app_run():
    app.run(host="0.0.0.0", port=os.environ.get('PORT', 80))

msg_start = """
Length of list now: %d
""" % config.LIST_LENGHT

application_thread = Thread(target=app_run)
getting_list_thread = Thread(target=getting_list)

bot = telebot.TeleBot("<MY_BOT_TOKEN>")

@bot.message_handler(commands=['start'])
def start(m):
    cid = m.chat.id
    bot.send_message(cid, msg_start, parse_mode='html')

@app.route("/bot", methods=['POST'])
def getMessage():
    bot.process_new_updates([telebot.types.Update.de_json(request.stream.read().decode("utf-8"))])
    return "ok", 200

@app.route("/")
def webhook():
    bot.remove_webhook()
    bot.set_webhook(url="<HEROKU_APP_URL>")
    return "ok", 200

if __name__ == '__main__':
    application_thread.start()
    getting_list_thread.start()
gettinglist.py
import config
from time import sleep

LIST_LENGHT = 0
LIST = []

def getting_list():
    while True:
        global LIST
        global LIST_LENGHT
        LIST = [num for num in range(0, 100)]
        config.LIST_LENGHT = len(LIST)
        return LIST
        sleep(300)
config.py
LIST_LENGHT = 0
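For illustration only: the %-formatting of msg_start above runs once, when bot.py is imported, so the value it captures never changes. A sketch of reading the value at message time instead, reusing the names from the question (not a tested fix):

# Sketch: build the reply when /start arrives, so the length reflects whatever
# getting_list() stored most recently rather than the value captured at import.
@bot.message_handler(commands=['start'])
def start(m):
    msg = "Length of list now: %d" % len(gettinglist.LIST)
    bot.send_message(m.chat.id, msg, parse_mode='html')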

How to load multiple pages one by one in QWebPage

I am trying to crawl news article pages for comments. After some research I found that most websites load comments in an iframe, so I want to get the "src" of that iframe. I am using QtWebKit from PySide. It actually works, but only once; it does not load the other web pages. I am using the following code:
import sys
import pymysql
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from pprint import pprint
from bs4 import BeautifulSoup

class Render(QWebPage):
    def __init__(self, url):
        try:
            self.app = QApplication(sys.argv)
        except RuntimeError:
            self.app = QCoreApplication.instance()
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()

def visit(url):
    r = Render(url)
    p = r.frame.toHtml()
    f_url = str(r.frame.url().toString())
    return p

def is_comment_url(url):
    lower_url = url.lower()
    return lower_url.find("comment") > 0

with open("urls.txt") as f:
    content = f.read().splitlines()

list_of_urls = []
for url in content:
    page = visit(url)
    soup = BeautifulSoup(page)
    for tag in soup.findAll('iframe', src=True):
        link = tag['src']
        if is_comment_url(link):
            print(link)
            list_of_urls.append(link)

pprint(list_of_urls)
But the issue is that it works only for a single iteration and then gets stuck.
Also, is there any way to save a web page exactly as the browser displays it (after executing all the JavaScript, etc.)?
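On the second point, frame.toHtml() already returns the DOM as it stands when loadFinished fires (i.e. after the page's JavaScript has run up to that moment), so a sketch of saving it would reuse visit() from above; the URL and output file name here are arbitrary:

# Sketch: persist the post-JavaScript DOM returned by QWebFrame.toHtml().
# io.open keeps this working on both Python 2 (PySide/Qt4 era) and Python 3.
import io

html = visit("http://example.com/some-article")
with io.open("rendered.html", "w", encoding="utf-8") as out:
    out.write(html)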
