I am downloading YouTube comments with a python script that uses API keys and the YouTube Data API V3, but sooner or later I run into the following error:
{'error': {'code': 400, 'message': "The API server failed to successfully process the request. While this can be a transient error, it usually indicates that the request's input is invalid. Check the structure of the commentThread resource in the request body to ensure that it is valid.", 'errors': [{'message': "The API server failed to successfully process the request. While this can be a transient error, it usually indicates that the request's input is invalid. Check the structure of the commentThread resource in the request body to ensure that it is valid.", 'domain': 'youtube.commentThread', 'reason': 'processingFailure', 'location': 'body', 'locationType': 'other'}]}}
I am using the following code:
import argparse
import requests
import json
import time
start_time = time.time()
class YouTubeApi():
YOUTUBE_COMMENTS_URL = 'https://www.googleapis.com/youtube/v3/commentThreads'
comment_counter = 0
def is_error_response(self, response):
error = response.get('error')
if error is None:
return False
print("API Error: "
f"code={error['code']} "
f"domain={error['errors'][0]['domain']} "
f"reason={error['errors'][0]['reason']} "
f"message={error['errors'][0]['message']!r}")
print(self.comment_counter)
return True
def format_comments(self, results, likes_required):
comments_list = []
try:
for item in results["items"]:
comment = item["snippet"]["topLevelComment"]
likes = comment["snippet"]["likeCount"]
if likes < likes_required:
continue
author = comment["snippet"]["authorDisplayName"]
text = comment["snippet"]["textDisplay"]
str = "Comment by {}:\n \"{}\"\n\n".format(author, text)
str = str.encode('ascii', 'replace').decode()
comments_list.append(str)
self.comment_counter += 1
print("Comments downloaded:", self.comment_counter, end="\r")
except(KeyError):
print(results)
return comments_list
def get_video_comments(self, video_id, likes_required):
with open("API_keys.txt", "r") as f:
key_list = f.readlines()
comments_list = []
key_list = [key.strip('/n') for key in key_list]
params = {
'part': 'snippet,replies',
'maxResults': 100,
'videoId': video_id,
'textFormat': 'plainText',
'key': key_list[0]
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
comments_data = requests.get(self.YOUTUBE_COMMENTS_URL, params=params, headers=headers)
results = comments_data.json()
if self.is_error_response(results):
return []
nextPageToken = results.get("nextPageToken")
comments_list = []
comments_list += self.format_comments(results, likes_required)
while nextPageToken:
params.update({'pageToken': nextPageToken})
if self.comment_counter <= 900000:
params.update({'key': key_list[0]})
elif self.comment_counter <= 1800000:
params.update({'key': key_list[1]})
elif self.comment_counter <= 2700000:
params.update({'key': key_list[2]})
elif self.comment_counter <= 3600000:
params.update({'key': key_list[3]})
elif self.comment_counter <= 4500000:
params.update({'key': key_list[4]})
else:
params.update({'key': key_list[5]})
if self.comment_counter % 900001 == 0:
print(params["key"])
comments_data = requests.get(self.YOUTUBE_COMMENTS_URL, params=params, headers=headers)
results = comments_data.json()
if self.is_error_response(results):
return comments_list
nextPageToken = results.get("nextPageToken")
comments_list += self.format_comments(results, likes_required)
return comments_list
def get_video_id_list(self, filename):
try:
with open(filename, 'r') as file:
URL_list = file.readlines()
except FileNotFoundError:
exit("File \"" + filename + "\" not found")
list = []
for url in URL_list:
if url == "\n": # ignore empty lines
continue
if url[-1] == '\n': # delete '\n' at the end of line
url = url[:-1]
if url.find('='): # get id
id = url[url.find('=') + 1:]
list.append(id)
else:
print("Wrong URL")
return list
def main():
yt = YouTubeApi()
parser = argparse.ArgumentParser(add_help=False, description=("Download youtube comments from many videos into txt file"))
required = parser.add_argument_group("required arguments")
optional = parser.add_argument_group("optional arguments")
optional.add_argument("--likes", '-l', help="The amount of likes a comment needs to be saved", type=int)
optional.add_argument("--input", '-i', help="URL list file name")
optional.add_argument("--output", '-o', help="Output file name")
optional.add_argument("--help", '-h', help="Help", action='help')
args = parser.parse_args()
# --------------------------------------------------------------------- #
likes = 0
if args.likes:
likes = args.likes
input_file = "URL_list.txt"
if args.input:
input_file = args.input
output_file = "Comments.txt"
if args.output:
output_file = args.output
list = yt.get_video_id_list(input_file)
if not list:
exit("No URLs in input file")
try:
vid_counter = 0
with open(output_file, "a") as f:
for video_id in list:
vid_counter += 1
print("Downloading comments for video ", vid_counter, ", id: ", video_id, sep='')
comments = yt.get_video_comments(video_id, likes)
if comments:
for comment in comments:
f.write(comment)
print('\nDone!')
except KeyboardInterrupt:
exit("User Aborted the Operation")
# --------------------------------------------------------------------- #
if __name__ == '__main__':
main()
In another thread, it was discovered that google does not currently permit downloading all the comments on a popular video, however you would expect it to cut off at the same point. Instead, I have found that it can range anywhere betweek 1.5 million to 200k comments downloaded before it returns a code 400. Is this to do with a bug in my code, or is the YouTube API rejecting my request as it is clear that is a script? Would adding a time.sleep clause help with this?
(I bring forward this answer -- that I prepared to the question above at the time of its initial post -- because my assertions below seems to be confirmed once again by recent SO posts of this very kind.)
Your observations are correct. But, unfortunately, nobody but Google itself is able to provide a sound and complete answer to your question. Us -- non-Googlers (as myself!), or even the Googlers themselves (since they all sign NDAs) -- can only guess about the things implied.
Here is my educated guess, based on the investigations I made recently when responding to a very much related question (which you quoted above, yourself!):
As you already know, the API uses pagination for to return to callers sets of items of which cardinality exceed the internal limit of 50, or, by case, 100 items to be returned by each and every API endpoint invocation that provides result sets.
If you'll log the nextPageToken property that you obtain from CommentThreads.list via your object results, you'll see that those page tokens get bigger and bigger. Each and every such page token has to be passed on to the next CommentThreads.list call as the parameter pageToken.
The problem is that internally (not specified publicly, not documented) the API has a limit on the sheer length of the HTTP requests it accepts from its callers. (This happens for various reasons; e.g. security.) Therefore, when a given page token is sufficiently long, the HTTP request that the API user issues will exceed that internal limit, producing an internal error. That error surfaces to the API caller as the processingFailure error that you've encountered.
Many questions remain to be answered (e.g. why is that the page tokens have unbounded length?), but, again, those questions belong very much to the internal realm of the back-end system that's behind the API we're using. And those questions cannot be answered publicly, since are very much Google's internal business.
I am running on a Raspberry Pi 3b+ with latest Raspbian. I have created a Python 2.7 virtual environment to test a few things. I installed the Google Assistant Api using the following instructions:
https://developers.google.com/assistant/sdk/guides/library/python/embed/install-sample
-except the audio, I am using the Respeaker 4-mic hat as I already had that working fine.
When I run the sample code for pushtotalk.py it works fine except CTRL-C no longer functions (have to close the terminal window to kill it).
I made a few minor (or I thought minor) changes and when I run the code I get a strange error.
My version of the code:
# Original pushtotalk.py file Copyright (C) 2017 Google Inc.
# modified by #captstephan for T3 project
#
# my imports for the mechanical functions, motor drivers, etc.
from Raspi_PWM_Servo_Driver import PWM
from voice_engine.source import Source
from voice_engine.channel_picker import ChannelPicker
from voice_engine.kws import KWS
from voice_engine.doa_respeaker_4mic_array import DOA
from pixels import pixels
# imports from the original Google pushtotalk.py file
import concurrent.futures
import json
import logging
import os
import os.path
import pathlib2 as pathlib
import sys
import time
import uuid
import click
import grpc
import google.auth.transport.grpc
import google.auth.transport.requests
import google.oauth2.credentials
from google.assistant.embedded.v1alpha2 import (
embedded_assistant_pb2,
embedded_assistant_pb2_grpc
)
from tenacity import retry, stop_after_attempt, retry_if_exception
import assistant_helpers
import audio_helpers
import browser_helpers
import device_helpers
# set up Google Assistant variables (from pushtotalk.py)
ASSISTANT_API_ENDPOINT = 'embeddedassistant.googleapis.com'
END_OF_UTTERANCE = embedded_assistant_pb2.AssistResponse.END_OF_UTTERANCE
DIALOG_FOLLOW_ON = embedded_assistant_pb2.DialogStateOut.DIALOG_FOLLOW_ON
CLOSE_MICROPHONE = embedded_assistant_pb2.DialogStateOut.CLOSE_MICROPHONE
PLAYING = embedded_assistant_pb2.ScreenOutConfig.PLAYING
DEFAULT_GRPC_DEADLINE = 60 * 3 + 5
# set up items for motor hats
# Initialise the PWM device using the default address
pwm = PWM(0x6F)
# set max and min, servo0=Horiz, servo1=vert
servoMin0 = 155 # Min pulse length out of 4096
servoMid0 = 370
servoMax0 = 585 # Max pulse length out of 4096
servoMin1 = 410 # Min pulse length out of 4096
servoMid1 = 530
servoMax1 = 650 # Max pulse length out of 4096
pwm.setPWMFreq(60) # Set frequency to 60 Hz
# class assignment from pushtotalk.py file:
class SampleAssistant(object):
"""Sample Assistant that supports conversations and device actions.
Args:
device_model_id: identifier of the device model.
device_id: identifier of the registered device instance.
conversation_stream(ConversationStream): audio stream
for recording query and playing back assistant answer.
channel: authorized gRPC channel for connection to the
Google Assistant API.
deadline_sec: gRPC deadline in seconds for Google Assistant API call.
device_handler: callback for device actions.
"""
def __init__(self, language_code, device_model_id, device_id,
conversation_stream, display,
channel, deadline_sec, device_handler):
self.language_code = language_code
self.device_model_id = device_model_id
self.device_id = device_id
self.conversation_stream = conversation_stream
self.display = display
# Opaque blob provided in AssistResponse that,
# when provided in a follow-up AssistRequest,
# gives the Assistant a context marker within the current state
# of the multi-Assist()-RPC "conversation".
# This value, along with MicrophoneMode, supports a more natural
# "conversation" with the Assistant.
self.conversation_state = None
# Force reset of first conversation.
self.is_new_conversation = True
# Create Google Assistant API gRPC client.
self.assistant = embedded_assistant_pb2_grpc.EmbeddedAssistantStub(
channel
)
self.deadline = deadline_sec
self.device_handler = device_handler
def __enter__(self):
return self
def __exit__(self, etype, e, traceback):
if e:
return False
self.conversation_stream.close()
def is_grpc_error_unavailable(e):
is_grpc_error = isinstance(e, grpc.RpcError)
if is_grpc_error and (e.code() == grpc.StatusCode.UNAVAILABLE):
logging.error('grpc unavailable error: %s', e)
return True
return False
#retry(reraise=True, stop=stop_after_attempt(3),
retry=retry_if_exception(is_grpc_error_unavailable))
def assist(self):
"""Send a voice request to the Assistant and playback the response.
Returns: True if conversation should continue.
"""
continue_conversation = False
device_actions_futures = []
self.conversation_stream.start_recording()
logging.info('Recording audio request.')
def iter_log_assist_requests():
for c in self.gen_assist_requests():
assistant_helpers.log_assist_request_without_audio(c)
yield c
logging.debug('Reached end of AssistRequest iteration.')
# This generator yields AssistResponse proto messages
# received from the gRPC Google Assistant API.
for resp in self.assistant.Assist(iter_log_assist_requests(),
self.deadline):
assistant_helpers.log_assist_response_without_audio(resp)
if resp.event_type == END_OF_UTTERANCE:
logging.info('End of audio request detected.')
logging.info('Stopping recording.')
self.conversation_stream.stop_recording()
if resp.speech_results:
logging.info('Transcript of user request: "%s".',
' '.join(r.transcript
for r in resp.speech_results))
if len(resp.audio_out.audio_data) > 0:
if not self.conversation_stream.playing:
self.conversation_stream.stop_recording()
self.conversation_stream.start_playback()
logging.info('Playing assistant response.')
self.conversation_stream.write(resp.audio_out.audio_data)
if resp.dialog_state_out.conversation_state:
conversation_state = resp.dialog_state_out.conversation_state
logging.debug('Updating conversation state.')
self.conversation_state = conversation_state
if resp.dialog_state_out.volume_percentage != 0:
volume_percentage = resp.dialog_state_out.volume_percentage
logging.info('Setting volume to %s%%', volume_percentage)
self.conversation_stream.volume_percentage = volume_percentage
if resp.dialog_state_out.microphone_mode == DIALOG_FOLLOW_ON:
continue_conversation = True
logging.info('Expecting follow-on query from user.')
elif resp.dialog_state_out.microphone_mode == CLOSE_MICROPHONE:
continue_conversation = False
if resp.device_action.device_request_json:
device_request = json.loads(
resp.device_action.device_request_json
)
fs = self.device_handler(device_request)
if fs:
device_actions_futures.extend(fs)
if self.display and resp.screen_out.data:
system_browser = browser_helpers.system_browser
system_browser.display(resp.screen_out.data)
if len(device_actions_futures):
logging.info('Waiting for device executions to complete.')
concurrent.futures.wait(device_actions_futures)
logging.info('Finished playing assistant response.')
self.conversation_stream.stop_playback()
return continue_conversation
def gen_assist_requests(self):
"""Yields: AssistRequest messages to send to the API."""
config = embedded_assistant_pb2.AssistConfig(
audio_in_config=embedded_assistant_pb2.AudioInConfig(
encoding='LINEAR16',
sample_rate_hertz=self.conversation_stream.sample_rate,
),
audio_out_config=embedded_assistant_pb2.AudioOutConfig(
encoding='LINEAR16',
sample_rate_hertz=self.conversation_stream.sample_rate,
volume_percentage=self.conversation_stream.volume_percentage,
),
dialog_state_in=embedded_assistant_pb2.DialogStateIn(
language_code=self.language_code,
conversation_state=self.conversation_state,
is_new_conversation=self.is_new_conversation,
),
device_config=embedded_assistant_pb2.DeviceConfig(
device_id=self.device_id,
device_model_id=self.device_model_id,
)
)
if self.display:
config.screen_out_config.screen_mode = PLAYING
# Continue current conversation with later requests.
self.is_new_conversation = False
# The first AssistRequest must contain the AssistConfig
# and no audio data.
yield embedded_assistant_pb2.AssistRequest(config=config)
for data in self.conversation_stream:
# Subsequent requests need audio data, but not config.
yield embedded_assistant_pb2.AssistRequest(audio_in=data)
#click.command()
#click.option('--api-endpoint', default=ASSISTANT_API_ENDPOINT,
metavar='<api endpoint>', show_default=True,
help='Address of Google Assistant API service.')
#click.option('--credentials',
metavar='<credentials>', show_default=True,
default=os.path.join(click.get_app_dir('google-oauthlib-tool'),
'credentials.json'),
help='Path to read OAuth2 credentials.')
#click.option('--project-id',
metavar='<project id>',
help=('Google Developer Project ID used for registration '
'if --device-id is not specified'))
#click.option('--device-model-id',
metavar='<device model id>',
help=(('Unique device model identifier, '
'if not specifed, it is read from --device-config')))
#click.option('--device-id',
metavar='<device id>',
help=(('Unique registered device instance identifier, '
'if not specified, it is read from --device-config, '
'if no device_config found: a new device is registered '
'using a unique id and a new device config is saved')))
#click.option('--device-config', show_default=True,
metavar='<device config>',
default=os.path.join(
click.get_app_dir('googlesamples-assistant'),
'device_config.json'),
help='Path to save and restore the device configuration')
#click.option('--lang', show_default=True,
metavar='<language code>',
default='en-US',
help='Language code of the Assistant')
#click.option('--display', is_flag=True, default=False,
help='Enable visual display of Assistant responses in HTML.')
#click.option('--verbose', '-v', is_flag=True, default=False,
help='Verbose logging.')
#click.option('--input-audio-file', '-i',
metavar='<input file>',
help='Path to input audio file. '
'If missing, uses audio capture')
#click.option('--output-audio-file', '-o',
metavar='<output file>',
help='Path to output audio file. '
'If missing, uses audio playback')
#click.option('--audio-sample-rate',
default=audio_helpers.DEFAULT_AUDIO_SAMPLE_RATE,
metavar='<audio sample rate>', show_default=True,
help='Audio sample rate in hertz.')
#click.option('--audio-sample-width',
default=audio_helpers.DEFAULT_AUDIO_SAMPLE_WIDTH,
metavar='<audio sample width>', show_default=True,
help='Audio sample width in bytes.')
#click.option('--audio-iter-size',
default=audio_helpers.DEFAULT_AUDIO_ITER_SIZE,
metavar='<audio iter size>', show_default=True,
help='Size of each read during audio stream iteration in bytes.')
#click.option('--audio-block-size',
default=audio_helpers.DEFAULT_AUDIO_DEVICE_BLOCK_SIZE,
metavar='<audio block size>', show_default=True,
help=('Block size in bytes for each audio device '
'read and write operation.'))
#click.option('--audio-flush-size',
default=audio_helpers.DEFAULT_AUDIO_DEVICE_FLUSH_SIZE,
metavar='<audio flush size>', show_default=True,
help=('Size of silence data in bytes written '
'during flush operation'))
#click.option('--grpc-deadline', default=DEFAULT_GRPC_DEADLINE,
metavar='<grpc deadline>', show_default=True,
help='gRPC deadline in seconds')
#click.option('--once', default=False, is_flag=True,
help='Force termination after a single conversation.')
def main(api_endpoint, credentials, project_id,
device_model_id, device_id, device_config,
lang, display, verbose,
input_audio_file, output_audio_file,
audio_sample_rate, audio_sample_width,
audio_iter_size, audio_block_size, audio_flush_size,
grpc_deadline, once, *args, **kwargs):
# Inserted the following code to set up the snowboy keyword activation using "Hey T3"
src = Source(rate=16000, channels=4, frames_size=320)
ch1 = ChannelPicker(channels=4, pick=1)
kws = KWS()
doa = DOA(rate=16000)
src.link(ch1)
ch1.link(kws)
src.link(doa)
pixels.listen()
pwm.setPWM(0, 0, 370)
pwm.setPWM(1, 0, 640)
# When snowboy detects the custom keyword, set the camera position to near direction of voice
def on_detected(keyword):
position = doa.get_direction()
pixels.wakeup(position)
print('detected {} at direction {}'.format(keyword, position))
if position >= 30 and position <= 180:
pwm.setPWM(0, 0, 175)
pwm.setPWM(1, 0, 500)
elif position > 180 and position <= 330:
pwm.setPWM(0, 0, 560)
pwm.setPWM(1, 0, 500)
elif position > 330 or position < 30:
pwm.setPWM(0, 0, 370)
pwm.setPWM(1, 0, 6200)
else:
pwm.setPWM(0, 0, 370)
pwm.setPWM(1, 0, 640)
# end of stuff I inserted
# Setup logging.
logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
# Load OAuth 2.0 credentials.
try:
with open(credentials, 'r') as f:
credentials = google.oauth2.credentials.Credentials(token=None,
**json.load(f))
http_request = google.auth.transport.requests.Request()
credentials.refresh(http_request)
except Exception as e:
logging.error('Error loading credentials: %s', e)
logging.error('Run google-oauthlib-tool to initialize '
'new OAuth 2.0 credentials.')
sys.exit(-1)
# Create an authorized gRPC channel.
grpc_channel = google.auth.transport.grpc.secure_authorized_channel(
credentials, http_request, api_endpoint)
logging.info('Connecting to %s', api_endpoint)
# Configure audio source and sink.
audio_device = None
if input_audio_file:
audio_source = audio_helpers.WaveSource(
open(input_audio_file, 'rb'),
sample_rate=audio_sample_rate,
sample_width=audio_sample_width
)
else:
audio_source = audio_device = (
audio_device or audio_helpers.SoundDeviceStream(
sample_rate=audio_sample_rate,
sample_width=audio_sample_width,
block_size=audio_block_size,
flush_size=audio_flush_size
)
)
if output_audio_file:
audio_sink = audio_helpers.WaveSink(
open(output_audio_file, 'wb'),
sample_rate=audio_sample_rate,
sample_width=audio_sample_width
)
else:
audio_sink = audio_device = (
audio_device or audio_helpers.SoundDeviceStream(
sample_rate=audio_sample_rate,
sample_width=audio_sample_width,
block_size=audio_block_size,
flush_size=audio_flush_size
)
)
# Create conversation stream with the given audio source and sink.
conversation_stream = audio_helpers.ConversationStream(
source=audio_source,
sink=audio_sink,
iter_size=audio_iter_size,
sample_width=audio_sample_width,
)
if not device_id or not device_model_id:
try:
with open(device_config) as f:
device = json.load(f)
device_id = device['id']
device_model_id = device['model_id']
logging.info("Using device model %s and device id %s",
device_model_id,
device_id)
except Exception as e:
logging.warning('Device config not found: %s' % e)
logging.info('Registering device')
if not device_model_id:
logging.error('Option --device-model-id required '
'when registering a device instance.')
sys.exit(-1)
if not project_id:
logging.error('Option --project-id required '
'when registering a device instance.')
sys.exit(-1)
device_base_url = (
'https://%s/v1alpha2/projects/%s/devices' % (api_endpoint,
project_id)
)
device_id = str(uuid.uuid1())
payload = {
'id': device_id,
'model_id': device_model_id,
'client_type': 'SDK_SERVICE'
}
session = google.auth.transport.requests.AuthorizedSession(
credentials
)
r = session.post(device_base_url, data=json.dumps(payload))
if r.status_code != 200:
logging.error('Failed to register device: %s', r.text)
sys.exit(-1)
logging.info('Device registered: %s', device_id)
pathlib.Path(os.path.dirname(device_config)).mkdir(exist_ok=True)
with open(device_config, 'w') as f:
json.dump(payload, f)
device_handler = device_helpers.DeviceRequestHandler(device_id)
#device_handler.command('action.devices.commands.OnOff')
def onoff(on):
if on:
logging.info('Turning device on')
else:
logging.info('Turning device off')
#device_handler.command('com.example.commands.BlinkLight')
def blink(speed, number):
logging.info('Blinking device %s times.' % number)
delay = 1
if speed == "SLOWLY":
delay = 2
elif speed == "QUICKLY":
delay = 0.5
for i in range(int(number)):
logging.info('Device is blinking.')
time.sleep(delay)
with SampleAssistant(lang, device_model_id, device_id,
conversation_stream, display,
grpc_channel, grpc_deadline,
device_handler) as assistant:
# If file arguments are supplied:
# exit after the first turn of the conversation.
if input_audio_file or output_audio_file:
assistant.assist()
return
# changed the wait for keypress to a wait for keyword using the snowboy module
# If no file arguments supplied:
# keep recording voice requests using the microphone
# and playing back assistant response using the speaker.
# When the once flag is set, don't wait for a trigger. Otherwise, wait.
wait_for_user_trigger = not once
while True:
if wait_for_user_trigger:
#click.pause(info='Press Enter to send a new request...
kws.set_callback(on_detected)
continue_conversation = assistant.assist()
# wait for user trigger if there is no follow-up turn in
# the conversation.
wait_for_user_trigger = not continue_conversation
# If we only want one conversation, break.
if once and (not continue_conversation):
break
if __name__ == '__main__':
main()
I get the following error:
Traceback (most recent call last):
File "/usr/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"main", fname, loader, pkg_name)
File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/home/pi/T3google.py", line 501, in
main()
File "/home/pi/env/local/lib/python2.7/site-packages/click/core.py", line 722, in call
return self.main(*args, **kwargs)
File "/home/pi/env/local/lib/python2.7/site-packages/click/core.py", line 697, in main
rv = self.invoke(ctx)
File "/home/pi/env/local/lib/python2.7/site-packages/click/core.py", line 895, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/home/pi/env/local/lib/python2.7/site-packages/click/core.py", line 535, in invoke
return callback(*args, **kwargs)
File "/home/pi/T3google.py", line 425, in main
device_handler = device_helpers.DeviceRequestHandler(device_id)
AttributeError: 'module' object has no attribute 'DeviceRequestHandler'
I am still new to python, but the sample code from Google works and the code snippet I inserted works stand-alone in another file.
I checked device_helpers.py and DeviceRequestHandler is a class that takes the device_id as input. It works fine if I call it in pushtotalk.py, but not in the modified code.
Any thoughts anyone?
Thanks in advance,
Stephan
I shut down for other reasons, came back and restarted the next day and problem disappeared. Likely something required a reboot to complete install and I missed it. That part is working fine now, haven't been able to completely get this to work as expected.
I am building a recursive webspider with an optional login. I want to make most settings dynamic via a json config file.
In my __init__ function, I am reading this file and try to populate all variables, however, this does not work with Rules.
class CrawlpySpider(InitSpider):
...
#----------------------------------------------------------------------
def __init__(self, *args, **kwargs):
"""Constructor: overwrite parent __init__ function"""
# Call parent init
super(CrawlpySpider, self).__init__(*args, **kwargs)
# Get command line arg provided configuration param
config_file = kwargs.get('config')
# Validate configuration file parameter
if not config_file:
logging.error('Missing argument "-a config"')
logging.error('Usage: scrapy crawl crawlpy -a config=/path/to/config.json')
self.abort = True
# Check if it is actually a file
elif not os.path.isfile(config_file):
logging.error('Specified config file does not exist')
logging.error('Not found in: "' + config_file + '"')
self.abort = True
# All good, read config
else:
# Load json config
fpointer = open(config_file)
data = fpointer.read()
fpointer.close()
# convert JSON to dict
config = json.loads(data)
# config['rules'] is simply a string array which looks like this:
# config['rules'] = [
# 'password',
# 'reset',
# 'delete',
# 'disable',
# 'drop',
# 'logout',
# ]
CrawlpySpider.rules = (
Rule(
LinkExtractor(
allow_domains=(self.allowed_domains),
unique=True,
deny=tuple(config['rules'])
),
callback='parse',
follow=False
),
)
Scrapy still crawls the pages that are present in config['rules'] and therefore also hits the logout page. So the specified pages are not being denied. What am I missing here?
Update:
I have already tried by setting CrawlpySpider.rules = ... as well as self.rules = ... inside __init__. Both variants do not work.
Spider: InitSpider
Rules: LinkExtractor
Before crawl: Doing login prior crawling
I even try to deny that in my parse function
# Dive deeper?
# The nesting depth is now handled via a custom middle-ware (middlewares.py)
#if curr_depth < self.max_depth or self.max_depth == 0:
links = LinkExtractor().extract_links(response)
for link in links:
for ignore in self.ignores:
if (ignore not in link.url) and (ignore.lower() not in link.url.lower()) and link.url.find(ignore) == -1:
yield Request(link.url, meta={'depth': curr_depth+1, 'referer': response.url})
You are setting a class attribute where you want to set an instance attribute:
# this:
CrawlpySpider.rules = (
# should be this:
self.rules = (
<...>