Scrapy: Limiting the number of next pages that are scraped. Unfortunately, the DEPTH_LIMIT custom setting doesn't work

I have built a simple Amazon scraper to download product listings. However, I am not sure how I can limit the number of next pages that are crawled. Ideally, I don't want the spider to crawl more than 10 pages for each main page it starts with. Some of the URLs in fact only have 2 pages.
Here is my code:
import scrapy
from scrapy.crawler import CrawlerProcess
from scraper_api import ScraperAPIClient
#Error Management Modules
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError
from .datatransformation import ErrorFileManagement
# Importing all defined attributes and items to be scraped!
from ..items import AmazonListingItems
from ..attributes import *
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.sql.expression import column
class AmazonListings(scrapy.Spider):
name = "amazonlistings"
def start_requests(self):
error = ErrorManager()
client = ScraperAPIClient('50903e1bf8db5418a25334f3e3ed7c74')
db = create_engine('postgresql://postgres:Maisha123@localhost:5432')
urls = db.execute('select category_url from scrapycategory')
df = pd.DataFrame(urls.fetchall())
urls = df.values.tolist()
for url in urls:
yield scrapy.Request(client.scrapyGet(url=url[0]), callback=self.parse, errback=error.error_handler, dont_filter=True)
custom_settings = {
'DEPTH_LIMIT' : 3,
'DOWNLOAD_DELAY': 5
}
def parse(self, response):
items = AmazonListingItems()
ap = AttributeParser()
error = ErrorManager()
client = ScraperAPIClient('50903e1bf8db5418a25334f3e3ed7c74')
itemlist = ap.itemlist(response)
if itemlist:
for item in itemlist:
items['mainurl'] = response.url
items['producturl'] = ap.producturl(item)
items['productname'] = ap.productname(item)
items['price'] = ap.price(item)
items['ratings'] = ap.ratings(item)
items['reviews'] = ap.reviews(item)
items['heroimg'] = ap.heroimg(item)
items['badge'] = ap.badge(item)
yield items
next_page = ap.next_page(response)
if next_page:
dom = 'www.amazon.com'
if dom in next_page:
request = scrapy.Request(client.scrapyGet(next_page), callback=self.parse,errback=error.error_handler)
yield request
else:
next_page_url = 'https://www.amazon.com' + next_page
request = scrapy.Request(client.scrapyGet(next_page_url), callback=self.parse,errback=error.error_handler)
yield request
else:
error.error_handler(response, itemlist=False)
#All Attribute Parser
class AttributeParser:
def itemlist(self, response):
itemlist = []
itemlist.append(response.css('.zg-item'))
itemlist.append(response.css('.s-asin .sg-col-inner'))
if itemlist:
for item in itemlist:
if item:
return item
def producturl(self, response):
for urls in AmazonListing_producturl:
value = response.css(urls).extract()
if value:
return value
def productname(self, response):
for productname in AmazonListing_productname:
value = response.css(productname).extract()
if value:
return value
def price(self, response):
for price in AmazonListing_price:
value = response.css(price).extract()
if value:
return value
def ratings(self, response):
for ratings in AmazonListing_ratings:
value = response.css(ratings).extract()
if value:
return value
def reviews(self, response):
for reviews in AmazonListing_reviews:
value = response.css(reviews).extract()
if value:
return value
def heroimg(self, response):
for heroimg in AmazonListing_heroimg:
value = response.css(heroimg).extract()
if value:
return value
def badge(self, response):
for badge in AmazonListing_badge:
value = response.css(badge).extract()
if value:
return value
def next_page(self,response):
for nxtpg in AmazonListing_nextpage:
value = response.css(nxtpg).get()
if value:
return value
else:
return None
class ErrorManager:
def error_handler(self, failure, itemlist=True):
er = ErrorFileManagement()
if itemlist == False:
response = failure
failure_record = {
'request_url': response.url,
'request_url': response.request.url,
'status': response.status,
'ip_address': response.ip_address,
'headers': response.headers,
'response.body': response.body,
}
er.addError(failure_record)
elif failure.check(HttpError):
response = failure.value.response
failure_record = {
'request_url': response.request.url,
'response_url': response.url,
'status': response.status,
'ip_address': response.ip_address,
'headers': response.headers,
'response_body': response.body,
}
er.addError(failure_record)
elif failure.check(DNSLookupError):
response = failure.request
failure_record = {
'request_url': response.request.url,
'response_url': response.url,
'status': response.status,
'ip_address': response.ip_address,
'headers': response.headers,
'response_body': response.body,
}
er.addError(failure)
elif failure.check(TimeoutError, TCPTimedOutError):
response = failure.request
failure_record = {
'request_url': response.request.url,
'response_url': response.url,
'status': response.status,
'ip_address': response.ip_address,
'headers': response.headers,
'response_body': response.body,
}
er.addError(failure_record)
elif failure.status == 200:
response = failure
failure_record = {
'request_url': response.request.url,
'response_url': response.url,
'status': response.status,
'ip_address': response.ip_address,
'headers': response.headers,
'response_body': response.body,
}
er.addError(failure_record)
else:
response = failure
failure_record = {
'request_url': response.request.url,
'response_url': response.url,
'status': response.status,
'ip_address': response.ip_address,
'headers': response.headers,
'response_body': response.body,
}
er.addError(failure_record)
process = CrawlerProcess(settings={
'FEEDS': {
'/mnt/d/dev/dsiqscraper/amzlistings.csv': {'format':'csv'},
},
})
process.crawl(AmazonListings)
process.start()

custom_settings is supposed to be a class attribute.
Like this:
class AmazonListings(scrapy.Spider):
    name = "amazonlistings"
    custom_settings = {
        'DEPTH_LIMIT': 3,
        'DOWNLOAD_DELAY': 5
    }

    def start_requests(self):
        error = ErrorManager()
        client = ScraperAPIClient('50903e1bf8db5418a25334f3e3ed7c74')
        db = create_engine('postgresql://postgres:Maisha123@localhost:5432')
        urls = db.execute('select category_url from scrapycategory')
        df = pd.DataFrame(urls.fetchall())
        urls = df.values.tolist()
        for url in urls:
            yield scrapy.Request(client.scrapyGet(url=url[0]), callback=self.parse, errback=error.error_handler, dont_filter=True)

    def parse...........
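For the asker's stated goal (no more than 10 result pages per starting category URL), note that DEPTH_LIMIT is a single global cap on request depth rather than a per-start-URL page counter. One alternative, shown below as a minimal sketch only, is to carry an explicit page counter in request.meta; the page_count key and the limit of 10 are illustrative, and the helpers are created exactly as in the original parse:

def parse(self, response):
    ap = AttributeParser()
    error = ErrorManager()
    client = ScraperAPIClient('50903e1bf8db5418a25334f3e3ed7c74')
    page_count = response.meta.get('page_count', 1)
    # ... parse and yield the listing items exactly as in the original parse ...
    next_page = ap.next_page(response)
    if next_page and page_count < 10:
        # Follow pagination only while fewer than 10 pages of this category have been crawled.
        if 'www.amazon.com' not in next_page:
            next_page = 'https://www.amazon.com' + next_page
        yield scrapy.Request(
            client.scrapyGet(next_page),
            callback=self.parse,
            errback=error.error_handler,
            meta={'page_count': page_count + 1},
        )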

Related

Request params encryption - HTTP API source Airbyte connector

I am developing an HTTP API source using the Airbyte CDK.
In the request_params method I am returning the following:
return {'startIndex': 0, 'resultsPerPage': self.max_results_per_page,
'pubStartDate': "2022-07-30T13:57:21:000 UTC%2B03:00", 'pubEndDate': "2022-07-31T13:57:21:000 UTC%2B03:00"}
But for some reason, those fields are being encrypted and the params that are being passed to the API are:
pubStartDate: "2022-07-30T13%3A57%3A21%3A000+UTC%252B03%3A00"
pubEndDate: "2022-07-31T13%3A57%3A21%3A000+UTC%252B03%3A00"
Is there anything I am doing wrong?
How can I send those date time strings correctly - without them being encrypted?
The API I am using: https://nvd.nist.gov/developers/vulnerabilities
Thanks in advance, any help will be much appreciated!!
The full connector code:
class NvdVulnerabilitiesStream(HttpStream, ABC):
url_base = "https://services.nvd.nist.gov/"
max_results_per_page = 2000
def __init__(self, days_to_fetch: int):
auth = NoAuth()
super().__init__(authenticator=auth)
self.days_to_fetch = days_to_fetch
def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
if response.json()['resultsPerPage'] < self.max_results_per_page:
return None
return {"startIndex": response.json()['startIndex'] + self.max_results_per_page, "resultsPerPage": self.max_results_per_page}
def request_params(
self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, any] = None, next_page_token: Mapping[str, Any] = None
) -> MutableMapping[str, Any]:
if not next_page_token and self.days_to_fetch != -1:
end_time = datetime.now()
start_time = end_time - timedelta(days=self.days_to_fetch)
if not next_page_token:
return {'startIndex': 0, 'resultsPerPage': self.max_results_per_page,
'pubStartDate': start_time.strftime(DATETIME_SCHEME), 'pubEndDate': end_time.strftime(DATETIME_SCHEME)}
if not next_page_token:
return {"startIndex": 0, "resultsPerPage": self.max_results_per_page}
return {"startIndex": next_page_token['startIndex'], "resultsPerPage": next_page_token['resultsPerPage']}
def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
for vulnerability in response.json()["result"]["CVE_Items"]:
yield vulnerability
print(response.json()["startIndex"])
time.sleep(6)
class Vulnerabilities(NvdVulnerabilitiesStream):
primary_key = None
def __init__(self, days_to_fetch: int):
super().__init__(days_to_fetch)
def path(
self, stream_state: Mapping[str, Any] = None, stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
) -> str:
return "rest/json/cves/1.0/"```

gRPC: From Node.js, how to send an array of floats using protobuf's repeated bytes to Python

I would like to send a list of float lists from Node.js and receive it in Python using protobuf's repeated bytes type.
I tried with this configuration, and what I get on the Python side is not really what I expect:
tensors=[b'-TWW', b'-TWW', b'-TWW', b'-TWW']
Here is my test in node.
Client :
const PROTO_PATH = __dirname + '/route_guide.proto';
const async = require('async');
const grpc = require('@grpc/grpc-js');
const protoLoader = require('@grpc/proto-loader');
const packageDefinition = protoLoader.loadSync(
PROTO_PATH,
{
keepCase: true,
longs: String,
enums: String,
defaults: true,
oneofs: true
});
const routeguide = grpc.loadPackageDefinition(packageDefinition).routeguide;
const client = new routeguide.RouteGuide('localhost:50051',
grpc.credentials.createInsecure());
function runJoin(callback) {
const call = client.join();
call.on('data', function(receivedMessage) {
console.log('Got message "' + JSON.stringify(receivedMessage));
});
call.on('end', callback);
messageToSend = {
msg: 'parameters_res',
parameters_res: {
parameters: {
tensors: [
new Buffer.from(new Float64Array([45.1]).buffer),
new Buffer.from(new Float64Array([45.1, 84.5, 87.9, 87.1]).buffer),
new Buffer.from(new Float64Array([45.1, 84.5, 87.9, 87.1]).buffer),
new Buffer.from(new Float64Array([45.1, 84.5, 87.9, 87.1]).buffer)
],
tensor_type: 'numpy.ndarray'
}
}
}
console.log(messageToSend);
console.log(messageToSend.parameters_res.parameters.tensors)
call.write(messageToSend);
call.end();
}
function main() {
async.series([
runJoin
]);
}
if (require.main === module) {
main();
}
exports.runJoin = runJoin;
route_guide.proto:
syntax = "proto3";
option java_multiple_files = true;
option java_package = "io.grpc.examples.routeguide";
option java_outer_classname = "RouteGuideProto";
option objc_class_prefix = "RTG";
package routeguide;
service RouteGuide {
rpc Join(stream ClientMessage) returns (stream ClientMessage) {}
}
message RouteNote {
repeated bytes model = 1;
}
message ClientMessage {
message Disconnect { Reason reason = 1; }
message ParametersRes { Parameters parameters = 1; }
oneof msg {
Disconnect disconnect = 1;
ParametersRes parameters_res = 2;
}
}
message Parameters {
repeated bytes tensors = 1;
string tensor_type = 2;
}
enum Reason {
UNKNOWN = 0;
RECONNECT = 1;
POWER_DISCONNECTED = 2;
WIFI_UNAVAILABLE = 3;
ACK = 4;
}
Server:
const PROTO_PATH = __dirname + '/route_guide.proto';
const grpc = require('@grpc/grpc-js');
const protoLoader = require('@grpc/proto-loader');
const packageDefinition = protoLoader.loadSync(
PROTO_PATH,
{keepCase: true,
longs: String,
enums: String,
defaults: true,
oneofs: true
});
const routeguide = grpc.loadPackageDefinition(packageDefinition).routeguide;
function join(call) {
call.on('data', function(receivedMessage) {
console.log("SERVER RECEIVE:");
console.log(receivedMessage);
console.log(receivedMessage.parameters_res.parameters.tensors)
for (const element of receivedMessage.parameters_res.parameters.tensors) {
console.log(element)
}
call.write(receivedMessage);
});
call.on('end', function() {
call.end();
});
}
function getServer() {
var server = new grpc.Server();
server.addService(routeguide.RouteGuide.service, {
join: join
});
return server;
}
if (require.main === module) {
var routeServer = getServer();
routeServer.bindAsync('0.0.0.0:50051', grpc.ServerCredentials.createInsecure(), () => {
routeServer.start()
});
}
exports.getServer = getServer;
MyStrategy.py:
from logging import WARNING
from typing import Callable, Dict, List, Optional, Tuple, cast
import numpy as np
import flwr as fl
from flwr.common import (
EvaluateIns,
EvaluateRes,
FitIns,
FitRes,
Parameters,
Scalar,
Weights,
)
from flwr.common.logger import log
from flwr.server.client_manager import ClientManager
from flwr.server.client_proxy import ClientProxy
from flwr.server.strategy.aggregate import aggregate, weighted_loss_avg
from flwr.server.strategy import Strategy
from tensorflow import Tensor
DEPRECATION_WARNING = """
DEPRECATION WARNING: deprecated `eval_fn` return format
loss, accuracy
move to
loss, {"accuracy": accuracy}
instead. Note that compatibility with the deprecated return format will be
removed in a future release.
"""
DEPRECATION_WARNING_INITIAL_PARAMETERS = """
DEPRECATION WARNING: deprecated initial parameter type
flwr.common.Weights (i.e., List[np.ndarray])
will be removed in a future update, move to
flwr.common.Parameters
instead. Use
parameters = flwr.common.weights_to_parameters(weights)
to easily transform `Weights` to `Parameters`.
"""
class MyStrategy(Strategy):
"""Configurable FedAvg strategy implementation."""
# pylint: disable=too-many-arguments,too-many-instance-attributes
def __init__(
self,
fraction_fit: float = 0.1,
fraction_eval: float = 0.1,
min_fit_clients: int = 2,
min_eval_clients: int = 2,
min_available_clients: int = 2,
eval_fn: Optional[
Callable[[Weights], Optional[Tuple[float, Dict[str, Scalar]]]]
] = None,
on_fit_config_fn: Optional[Callable[[int], Dict[str, Scalar]]] = None,
on_evaluate_config_fn: Optional[Callable[[int], Dict[str, Scalar]]] = None,
accept_failures: bool = True,
initial_parameters: Optional[Parameters] = None,
) -> None:
"""Federated Averaging strategy.
Implementation based on https://arxiv.org/abs/1602.05629
Args:
fraction_fit (float, optional): Fraction of clients used during
training. Defaults to 0.1.
fraction_eval (float, optional): Fraction of clients used during
validation. Defaults to 0.1.
min_fit_clients (int, optional): Minimum number of clients used
during training. Defaults to 2.
min_eval_clients (int, optional): Minimum number of clients used
during validation. Defaults to 2.
min_available_clients (int, optional): Minimum number of total
clients in the system. Defaults to 2.
eval_fn (Callable[[Weights], Optional[Tuple[float, float]]], optional):
Function used for validation. Defaults to None.
on_fit_config_fn (Callable[[int], Dict[str, Scalar]], optional):
Function used to configure training. Defaults to None.
on_evaluate_config_fn (Callable[[int], Dict[str, Scalar]], optional):
Function used to configure validation. Defaults to None.
accept_failures (bool, optional): Whether or not accept rounds
containing failures. Defaults to True.
initial_parameters (Parameters, optional): Initial global model parameters.
"""
super().__init__()
self.min_fit_clients = min_fit_clients
self.min_eval_clients = min_eval_clients
self.fraction_fit = fraction_fit
self.fraction_eval = fraction_eval
self.min_available_clients = min_available_clients
self.eval_fn = eval_fn
self.on_fit_config_fn = on_fit_config_fn
self.on_evaluate_config_fn = on_evaluate_config_fn
self.accept_failures = accept_failures
self.initial_parameters = initial_parameters
def __repr__(self) -> str:
rep = f"FedAvg(accept_failures={self.accept_failures})"
return rep
def num_fit_clients(self, num_available_clients: int) -> Tuple[int, int]:
"""Return the sample size and the required number of available
clients."""
num_clients = int(num_available_clients * self.fraction_fit)
return max(num_clients, self.min_fit_clients), self.min_available_clients
def num_evaluation_clients(self, num_available_clients: int) -> Tuple[int, int]:
"""Use a fraction of available clients for evaluation."""
num_clients = int(num_available_clients * self.fraction_eval)
return max(num_clients, self.min_eval_clients), self.min_available_clients
def initialize_parameters(
self, client_manager: ClientManager
) -> Optional[Parameters]:
"""Initialize global model parameters."""
initial_parameters = self.initial_parameters
self.initial_parameters = None # Don't keep initial parameters in memory
if isinstance(initial_parameters, list):
log(WARNING, DEPRECATION_WARNING_INITIAL_PARAMETERS)
initial_parameters = self.weights_to_parameters(weights=initial_parameters)
return initial_parameters
def evaluate(
self, parameters: Parameters
) -> Optional[Tuple[float, Dict[str, Scalar]]]:
"""Evaluate model parameters using an evaluation function."""
if self.eval_fn is None:
# No evaluation function provided
return None
weights = self.parameters_to_weights(parameters)
eval_res = self.eval_fn(weights)
if eval_res is None:
return None
loss, other = eval_res
if isinstance(other, float):
print(DEPRECATION_WARNING)
metrics = {"accuracy": other}
else:
metrics = other
return loss, metrics
def configure_fit(
self, rnd: int, parameters: Parameters, client_manager: ClientManager
) -> List[Tuple[ClientProxy, FitIns]]:
"""Configure the next round of training."""
config = {}
if self.on_fit_config_fn is not None:
# Custom fit config function provided
config = self.on_fit_config_fn(rnd)
fit_ins = FitIns(parameters, config)
# Sample clients
sample_size, min_num_clients = self.num_fit_clients(
client_manager.num_available()
)
clients = client_manager.sample(
num_clients=sample_size, min_num_clients=min_num_clients
)
# Return client/config pairs
return [(client, fit_ins) for client in clients]
def configure_evaluate(
self, rnd: int, parameters: Parameters, client_manager: ClientManager
) -> List[Tuple[ClientProxy, EvaluateIns]]:
"""Configure the next round of evaluation."""
# Do not configure federated evaluation if fraction_eval is 0
if self.fraction_eval == 0.0:
return []
# Parameters and config
config = {}
if self.on_evaluate_config_fn is not None:
# Custom evaluation config function provided
config = self.on_evaluate_config_fn(rnd)
evaluate_ins = EvaluateIns(parameters, config)
# Sample clients
if rnd >= 0:
sample_size, min_num_clients = self.num_evaluation_clients(
client_manager.num_available()
)
clients = client_manager.sample(
num_clients=sample_size, min_num_clients=min_num_clients
)
else:
clients = list(client_manager.all().values())
# Return client/config pairs
return [(client, evaluate_ins) for client in clients]
def aggregate_fit(
self,
rnd: int,
results: List[Tuple[ClientProxy, FitRes]],
failures: List[BaseException],
) -> Tuple[Optional[Parameters], Dict[str, Scalar]]:
"""Aggregate fit results using weighted average."""
if not results:
return None, {}
# Do not aggregate if there are failures and failures are not accepted
if not self.accept_failures and failures:
return None, {}
# Convert results
print("\n\n aggregate_fit")
print(results)
weights_results = [
(self.parameters_to_weights(fit_res.parameters), fit_res.num_examples)
for client, fit_res in results
]
print("weights_results")
print(weights_results)
return self.weights_to_parameters(aggregate(weights_results)), {}
def aggregate_evaluate(
self,
rnd: int,
results: List[Tuple[ClientProxy, EvaluateRes]],
failures: List[BaseException],
) -> Tuple[Optional[float], Dict[str, Scalar]]:
"""Aggregate evaluation losses using weighted average."""
if not results:
return None, {}
# Do not aggregate if there are failures and failures are not accepted
if not self.accept_failures and failures:
return None, {}
loss_aggregated = weighted_loss_avg(
[
(evaluate_res.num_examples, evaluate_res.loss)
for _, evaluate_res in results
]
)
return loss_aggregated, {}
def weights_to_parameters(self, weights: Weights) -> Parameters:
"""Convert NumPy weights to parameters object."""
print('weights_to_parameters')
print(weights)
tensors = [self.ndarray_to_bytes(ndarray) for ndarray in weights]
return Parameters(tensors=tensors, tensor_type="numpy.nda")
def parameters_to_weights(self, parameters: Parameters) -> Weights:
"""Convert parameters object to NumPy weights."""
print('parameters_to_weights')
print(parameters)
return [self.bytes_to_ndarray(tensor) for tensor in parameters.tensors]
# pylint: disable=R0201
def ndarray_to_bytes(self, ndarray: np.ndarray) -> bytes:
"""Serialize NumPy array to bytes."""
print('ndarray_to_bytes')
print(ndarray)
return None
# pylint: disable=R0201
def bytes_to_ndarray(self, tensor: bytes) -> np.ndarray:
"""Deserialize NumPy array from bytes."""
print('bytes_to_ndarray')
print(tensor)
return None
# Start Flower server for three rounds of federated learning
fl.server.start_server(
server_address='localhost:5006',
config={"num_rounds": 2},
strategy=MyStrategy()
)
Is Float64Array the right type?
What should I use on the Python side to deserialize the data?
Note that I cannot modify the proto.
Thank you in advance for your explanations.
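On the Python side, each entry of the repeated bytes tensors field arrives as a raw bytes object holding the float64 values that Float64Array produced (little-endian on typical platforms), so np.frombuffer can recover the numbers. Below is a minimal sketch of the two byte-conversion helpers under that assumption; serializing with tobytes() and the '<f8' dtype are assumptions about the client, not something the proto enforces:

import numpy as np

def ndarray_to_bytes(ndarray: np.ndarray) -> bytes:
    # Raw little-endian float64 bytes, matching what Float64Array(...).buffer holds.
    return ndarray.astype('<f8').tobytes()

def bytes_to_ndarray(tensor: bytes) -> np.ndarray:
    # Interpret the raw buffer as little-endian float64 values.
    return np.frombuffer(tensor, dtype='<f8')

# Example: a 4-element buffer round-trips back to the original values.
buf = np.array([45.1, 84.5, 87.9, 87.1]).astype('<f8').tobytes()
print(bytes_to_ndarray(buf))  # [45.1 84.5 87.9 87.1]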

Marshmallow 3.10.0 ValidationError: Missing data for required field?

I am new to Marshmallow (3.10.0) and I am trying to understand the following errors that I am getting:
Traceback (most recent call last):
File "/opt/venv/lib/python3.7/site-packages/marshmallow/schema.py", line 779, in _run_validator
validator_func(output, partial=partial, many=many)
File "/usr/src/wazo-confd/wazo_confd/helpers/destination.py", line 351, in _validate_skill_rule_variables
['skill_rule_id'],
marshmallow.exceptions.ValidationError: Missing data for required field. When `skill_rule_variables` is defined
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/src/wazo-confd/wazo_confd/helpers/common.py", line 28, in wrapper
return func(*args, **kwargs)
File "/opt/venv/lib/python3.7/site-packages/xivo/auth_verifier.py", line 155, in wrapper
return func(*args, **kwargs)
File "/opt/venv/lib/python3.7/site-packages/xivo/auth_verifier.py", line 125, in wrapper
return func(*args, **kwargs)
File "/usr/src/wazo-confd/wazo_confd/plugins/call_filter_fallback/resource.py", line 36, in put
fallbacks = self.schema().load(request.get_json(), partial=True)
File "/opt/venv/lib/python3.7/site-packages/marshmallow/schema.py", line 728, in load
data, many=many, partial=partial, unknown=unknown, postprocess=True
File "/opt/venv/lib/python3.7/site-packages/marshmallow/schema.py", line 866, in _do_load
unknown=unknown,
File "/opt/venv/lib/python3.7/site-packages/marshmallow/schema.py", line 674, in _deserialize
index=index,
File "/opt/venv/lib/python3.7/site-packages/marshmallow/schema.py", line 496, in _call_and_store
value = getter_func(data)
File "/opt/venv/lib/python3.7/site-packages/marshmallow/schema.py", line 667, in <lambda>
val, field_name, data, **d_kwargs
File "/opt/venv/lib/python3.7/site-packages/marshmallow/fields.py", line 356, in deserialize
output = self._deserialize(value, attr, data, **kwargs)
File "/usr/src/wazo-confd/wazo_confd/helpers/destination.py", line 544, in _deserialize
return fields.Nested(schema, **self.kwargs)._deserialize(value, attr, data)
File "/opt/venv/lib/python3.7/site-packages/marshmallow/fields.py", line 611, in _deserialize
return self._load(value, data, partial=partial)
File "/opt/venv/lib/python3.7/site-packages/marshmallow/fields.py", line 594, in _load
valid_data = self.schema.load(value, unknown=self.unknown, partial=partial)
File "/opt/venv/lib/python3.7/site-packages/marshmallow/schema.py", line 728, in load
data, many=many, partial=partial, unknown=unknown, postprocess=True
File "/opt/venv/lib/python3.7/site-packages/marshmallow/schema.py", line 891, in _do_load
field_errors=field_errors,
File "/opt/venv/lib/python3.7/site-packages/marshmallow/schema.py", line 1194, in _invoke_schema_validators
partial=partial,
File "/opt/venv/lib/python3.7/site-packages/marshmallow/schema.py", line 781, in _run_validator
error_store.store_error(err.messages, err.field_name, index=index)
File "/opt/venv/lib/python3.7/site-packages/marshmallow/error_store.py", line 22, in store_error
messages = {field_name: messages}
TypeError: unhashable type: 'list'
The first one seems to occur in this file (destination.py):
# Copyright 2016-2022 The Wazo Authors (see the AUTHORS file)
# SPDX-License-Identifier: GPL-3.0-or-later
import json
from marshmallow import (
EXCLUDE,
Schema,
fields,
pre_dump,
post_load,
post_dump,
validates_schema,
validates,
)
from marshmallow.exceptions import ValidationError
from marshmallow.validate import Length, OneOf, Regexp, Predicate, Range
from xivo_dao.helpers import errors
from xivo_dao.helpers.exception import NotFoundError
from xivo_dao.resources.application import dao as application_dao
from xivo_dao.resources.conference import dao as conference_dao
from xivo_dao.resources.group import dao as group_dao
from xivo_dao.resources.ivr import dao as ivr_dao
from xivo_dao.resources.moh import dao as moh_dao
from xivo_dao.resources.outcall import dao as outcall_dao
from xivo_dao.resources.queue import dao as queue_dao
from xivo_dao.resources.skill_rule import dao as skill_rule_dao
from xivo_dao.resources.switchboard import dao as switchboard_dao
from xivo_dao.resources.user import dao as user_dao
from xivo_dao.resources.voicemail import dao as voicemail_dao
from wazo_confd.helpers.mallow import StrictBoolean
from wazo_confd.helpers.validator import GetResource, Validator
import logging
logger = logging.getLogger(__name__)
COMMAND_REGEX = r'^(?!(try)?system\()[a-zA-Z]{3,}\((.*)\)$'
CONTEXT_REGEX = r'^[a-zA-Z0-9_-]{1,39}$'
EXTEN_REGEX = r'^[0-9*#]{1,255}$'
SKILL_RULE_VARIABLE_REGEX = r'^[^[;\|]+$'
class BaseDestinationSchema(Schema):
class Meta:
unknown = EXCLUDE
type = fields.String(
validate=OneOf(
[
'application',
'conference',
'custom',
'extension',
'group',
'hangup',
'ivr',
'none',
'outcall',
'queue',
'sound',
'switchboard',
'user',
'voicemail',
]
),
required=True,
)
@post_dump
def convert_type_to_user(self, data, **kwargs):
if data['type'] == 'endcall':
data['type'] = 'hangup'
return data
@post_load
def convert_type_to_database(self, data, **kwargs):
if data['type'] == 'hangup':
data['type'] = 'endcall'
return data
class ApplicationDestinationSchema(BaseDestinationSchema):
application = fields.String(
validate=OneOf(
['callback_disa', 'custom', 'directory', 'disa', 'fax_to_mail', 'voicemail']
),
attribute='subtype',
required=True,
)
@post_dump
def convert_application_to_user(self, data, **kwargs):
if data['application'] == 'callbackdisa':
data['application'] = 'callback_disa'
elif data['application'] == 'faxtomail':
data['application'] = 'fax_to_mail'
elif data['application'] == 'voicemailmain':
data['application'] = 'voicemail'
return data
@post_load
def convert_application_to_database(self, data, **kwargs):
if data['subtype'] == 'callback_disa':
data['subtype'] = 'callbackdisa'
elif data['subtype'] == 'fax_to_mail':
data['subtype'] = 'faxtomail'
elif data['subtype'] == 'voicemail':
data['subtype'] = 'voicemailmain'
return data
class CallBackDISADestinationSchema(ApplicationDestinationSchema):
pin = fields.String(
validate=(Predicate('isdigit'), Length(max=40)),
allow_none=True,
attribute='actionarg1',
)
context = fields.String(
validate=Regexp(CONTEXT_REGEX), attribute='actionarg2', required=True
)
class CustomApplicationDestinationSchema(ApplicationDestinationSchema):
application_uuid = fields.UUID(attribute='actionarg1', required=True)
_application = fields.Nested(
'ApplicationSchema', only=['name'], attribute='application', dump_only=True
)
@post_dump
def make_application_fields_flat(self, data, **kwargs):
if data.get('_application'):
data['application_name'] = data['_application']['name']
data.pop('_application', None)
return data
class DISADestinationSchema(ApplicationDestinationSchema):
pin = fields.String(
validate=(Predicate('isdigit'), Length(max=40)),
allow_none=True,
attribute='actionarg1',
)
context = fields.String(
validate=Regexp(CONTEXT_REGEX), attribute='actionarg2', required=True
)
class DirectoryDestinationSchema(ApplicationDestinationSchema):
context = fields.String(
validate=Regexp(CONTEXT_REGEX), attribute='actionarg1', required=True
)
class FaxToMailDestinationSchema(ApplicationDestinationSchema):
email = fields.Email(validate=Length(max=80), attribute='actionarg1', required=True)
class VoicemailMainDestinationSchema(ApplicationDestinationSchema):
context = fields.String(
validate=Regexp(CONTEXT_REGEX), attribute='actionarg1', required=True
)
class ConferenceDestinationSchema(BaseDestinationSchema):
conference_id = fields.Integer(attribute='actionarg1', required=True)
conference = fields.Nested('ConferenceSchema', only=['name'], dump_only=True)
@post_dump
def make_conference_fields_flat(self, data, **kwargs):
if data.get('conference'):
data['conference_name'] = data['conference']['name']
data.pop('conference', None)
return data
class CustomDestinationSchema(BaseDestinationSchema):
command = fields.String(
validate=(Regexp(COMMAND_REGEX), Length(max=255)),
attribute='actionarg1',
required=True,
)
class ExtensionDestinationSchema(BaseDestinationSchema):
exten = fields.String(
validate=Regexp(EXTEN_REGEX), attribute='actionarg1', required=True
)
context = fields.String(
validate=Regexp(CONTEXT_REGEX), attribute='actionarg2', required=True
)
class GroupDestinationSchema(BaseDestinationSchema):
group_id = fields.Integer(attribute='actionarg1', required=True)
ring_time = fields.Float(
validate=Range(min=0), attribute='actionarg2', allow_none=True
)
group = fields.Nested('GroupSchema', only=['label', 'name'], dump_only=True)
@post_dump
def make_group_fields_flat(self, data, **kwargs):
if data.get('group'):
# TODO(pc-m): Label was added in 21.04 group_name should be remove when we remove
# the compatibility logic in group schema
data['group_name'] = data['group']['name']
data['group_label'] = data['group']['label']
data.pop('group', None)
return data
class HangupDestinationSchema(BaseDestinationSchema):
cause = fields.String(
validate=OneOf(['busy', 'congestion', 'normal']),
attribute='subtype',
missing='normal',
required=False,
)
@post_dump
def convert_cause_to_user(self, data, **kwargs):
if data['cause'] == 'hangup':
data['cause'] = 'normal'
return data
@post_load
def convert_cause_to_database(self, data, **kwargs):
if data['subtype'] == 'normal':
data['subtype'] = 'hangup'
return data
class BusyDestinationSchema(HangupDestinationSchema):
timeout = fields.Float(
attribute='actionarg1', validate=Range(min=0), allow_none=True
)
class CongestionDestinationSchema(HangupDestinationSchema):
timeout = fields.Float(
attribute='actionarg1', validate=Range(min=0), allow_none=True
)
class IVRDestinationSchema(BaseDestinationSchema):
ivr_id = fields.Integer(attribute='actionarg1', required=True)
ivr = fields.Nested('IvrSchema', only=['name'], dump_only=True)
@post_dump
def make_ivr_fields_flat(self, data, **kwargs):
if data.get('ivr'):
data['ivr_name'] = data['ivr']['name']
data.pop('ivr', None)
return data
class NormalDestinationSchema(HangupDestinationSchema):
pass
class NoneDestinationSchema(BaseDestinationSchema):
pass
class OutcallDestinationSchema(BaseDestinationSchema):
outcall_id = fields.Integer(attribute='actionarg1', required=True)
exten = fields.String(
validate=(Predicate('isdigit'), Length(max=255)),
attribute='actionarg2',
required=True,
)
class QueueDestinationSchema(BaseDestinationSchema):
queue_id = fields.Integer(attribute='actionarg1', required=True)
ring_time = fields.Float(validate=Range(min=0), allow_none=True)
skill_rule_id = fields.Integer(allow_none=True)
skill_rule_variables = fields.Dict(allow_none=True)
queue = fields.Nested('QueueSchema', only=['label'], dump_only=True)
@pre_dump
def separate_action(self, data, **kwargs):
options = data.actionarg2.split(';') if data.actionarg2 else []
data.ring_time = None
data.skill_rule_id = None
data.skill_rule_variables = None
_skill_rule_variables = None
if len(options) == 1:
data.ring_time = options[0] or None
elif len(options) == 2: # id is always bound with variables
data.skill_rule_id = options[0]
_skill_rule_variables = options[1] or None
elif len(options) == 3:
data.ring_time = options[0]
data.skill_rule_id = options[1]
_skill_rule_variables = options[2] or None
if _skill_rule_variables:
_skill_rule_variables = _skill_rule_variables.replace(
'|', ','
) # dialplan interpret comma ...
data.skill_rule_variables = json.loads(_skill_rule_variables)
return data
@post_load
def merge_action(self, data, **kwargs):
ring_time = data.pop('ring_time', None)
skill_rule_id = data.pop('skill_rule_id', None)
skill_rule_variables = data.pop('skill_rule_variables', None)
skill_rule_variables_str = (
json.dumps(skill_rule_variables).replace(',', '|')
if skill_rule_variables
else ''
)
data[
'actionarg2'
] = '{ring_time}{sep1}{skill_rule_id}{sep2}{skill_rule_variables}'.format(
ring_time=ring_time or '',
sep1=';' if ring_time and skill_rule_id else '',
skill_rule_id=skill_rule_id or '',
sep2=';' if skill_rule_id else '',
skill_rule_variables=skill_rule_variables_str,
)
return data
@post_dump
def make_queue_fields_flat(self, data, **kwargs):
if data.get('queue'):
data['queue_label'] = data['queue']['label']
data.pop('queue', None)
return data
@validates_schema
def _validate_skill_rule_variables(self, data, **kwargs):
logger.critical('------------------------------------------')
logger.critical(data)
logger.critical(kwargs)
logger.critical('------------------------------------------')
if not data.get('skill_rule_variables'):
return
if not data.get('skill_rule_id'):
raise ValidationError(
'Missing data for required field. When `skill_rule_variables` is defined',
['skill_rule_id'],
)
@validates('skill_rule_variables')
def _validate_skill_rule_variables_value(self, variables):
# with marshmallow 3.0 we can set this validator on the field declaration
if not variables:
return
validator = Regexp(SKILL_RULE_VARIABLE_REGEX)
for key, value in variables.items():
validator(key)
validator(value)
class SoundDestinationSchema(BaseDestinationSchema):
filename = fields.String(
validate=Length(max=255), attribute='actionarg1', required=True
)
skip = StrictBoolean()
no_answer = StrictBoolean()
@pre_dump
def separate_action(self, data, **kwargs):
options = data.actionarg2 if data.actionarg2 else ''
data.skip = True if 'skip' in options else False
data.no_answer = True if 'noanswer' in options else False
return data
@post_load
def merge_action(self, data, **kwargs):
data['actionarg2'] = '{skip}{noanswer}'.format(
skip='skip' if data.pop('skip', False) else '',
noanswer='noanswer' if data.pop('no_answer', False) else '',
)
return data
class SwitchboardDestinationSchema(BaseDestinationSchema):
switchboard_uuid = fields.UUID(attribute='actionarg1', required=True)
ring_time = fields.Float(
validate=Range(min=0), attribute='actionarg2', allow_none=True
)
switchboard = fields.Nested('SwitchboardSchema', only=['name'], dump_only=True)
@post_dump
def make_switchboard_fields_flat(self, data, **kwargs):
if data.get('switchboard'):
data['switchboard_name'] = data['switchboard']['name']
data.pop('switchboard', None)
return data
class UserDestinationSchema(BaseDestinationSchema):
user_id = fields.Integer(attribute='actionarg1', required=True)
ring_time = fields.Float(validate=Range(min=0), allow_none=True)
moh_uuid = fields.UUID(allow_none=True)
user = fields.Nested('UserSchema', only=['firstname', 'lastname'], dump_only=True)
@post_dump
def make_user_fields_flat(self, data, **kwargs):
if data.get('user'):
data['user_firstname'] = data['user']['firstname']
data['user_lastname'] = data['user']['lastname']
data.pop('user', None)
return data
@pre_dump
def separate_action(self, data, **kwargs):
options = data.actionarg2.split(';') if data.actionarg2 else []
data.ring_time = None
data.moh_uuid = None
if len(options) > 0:
data.ring_time = options[0] or None
if len(options) > 1: # id is always bound with variables
data.moh_uuid = options[1]
return data
@post_load
def merge_action(self, data, **kwargs):
ring_time = data.pop('ring_time', None)
moh_uuid = data.pop('moh_uuid', None)
actionarg2 = ''
if ring_time is not None:
actionarg2 += str(ring_time)
if moh_uuid is not None:
actionarg2 += ';{}'.format(moh_uuid)
data['actionarg2'] = actionarg2
return data
class VoicemailDestinationSchema(BaseDestinationSchema):
voicemail_id = fields.Integer(attribute='actionarg1', required=True)
skip_instructions = StrictBoolean()
greeting = fields.String(validate=OneOf(['busy', 'unavailable']), allow_none=True)
voicemail = fields.Nested('VoicemailSchema', only=['name'], dump_only=True)
@pre_dump
def separate_action(self, data, **kwargs):
options = data.actionarg2 if data.actionarg2 else ''
data.skip_instructions = True if 's' in options else False
data.greeting = (
'busy' if 'b' in options else 'unavailable' if 'u' in options else None
)
return data
@post_load
def merge_action(self, data, **kwargs):
greeting = data.pop('greeting', None)
data['actionarg2'] = '{}{}'.format(
'b' if greeting == 'busy' else 'u' if greeting == 'unavailable' else '',
's' if data.pop('skip_instructions', False) else '',
)
return data
@post_dump
def make_voicemail_fields_flat(self, data, **kwargs):
if data.get('voicemail'):
data['voicemail_name'] = data['voicemail']['name']
data.pop('voicemail', None)
return data
class DestinationField(fields.Nested):
application_schemas = {
'callback_disa': CallBackDISADestinationSchema,
'callbackdisa': CallBackDISADestinationSchema,
'custom': CustomApplicationDestinationSchema,
'directory': DirectoryDestinationSchema,
'disa': DISADestinationSchema,
'fax_to_mail': FaxToMailDestinationSchema,
'faxtomail': FaxToMailDestinationSchema,
'voicemail': VoicemailMainDestinationSchema,
'voicemailmain': VoicemailMainDestinationSchema,
}
hangup_schemas = {
'busy': BusyDestinationSchema,
'congestion': CongestionDestinationSchema,
'normal': NormalDestinationSchema,
'hangup': NormalDestinationSchema,
}
destination_schemas = {
'application': ApplicationDestinationSchema,
'conference': ConferenceDestinationSchema,
'custom': CustomDestinationSchema,
'extension': ExtensionDestinationSchema,
'group': GroupDestinationSchema,
'hangup': HangupDestinationSchema,
'endcall': HangupDestinationSchema,
'ivr': IVRDestinationSchema,
'none': NoneDestinationSchema,
'outcall': OutcallDestinationSchema,
'queue': QueueDestinationSchema,
'sound': SoundDestinationSchema,
'switchboard': SwitchboardDestinationSchema,
'user': UserDestinationSchema,
'voicemail': VoicemailDestinationSchema,
}
def __init__(self, **kwargs):
# FIXME(sileht): I'm not sure validation works here...
# This of dynamic nesterd stuffs should not done like this.
self.kwargs = kwargs
self.kwargs["unknown"] = EXCLUDE
super().__init__(BaseDestinationSchema, **self.kwargs)
def _deserialize(self, value, attr, data, **kwargs):
self.schema.context = self.context
base = super()._deserialize(value, attr, data, **kwargs)
schema = self.destination_schemas[base['type']]
if base['type'] == 'application':
base = fields.Nested(schema, **self.kwargs)._deserialize(value, attr, data)
schema = self.application_schemas[base['subtype']]
if base['type'] == 'endcall':
base = fields.Nested(schema, **self.kwargs)._deserialize(value, attr, data)
schema = self.hangup_schemas[base['subtype']]
return fields.Nested(schema, **self.kwargs)._deserialize(value, attr, data)
def _serialize(self, nested_obj, attr, obj):
base = super()._serialize(nested_obj, attr, obj)
if not base:
return base
schema = self.destination_schemas[base['type']]
if base['type'] == 'application':
base = fields.Nested(schema, **self.kwargs)._serialize(
nested_obj, attr, obj
)
schema = self.application_schemas[base['application']]
if base['type'] == 'hangup':
base = fields.Nested(schema, **self.kwargs)._serialize(
nested_obj, attr, obj
)
schema = self.hangup_schemas[base['cause']]
return fields.Nested(schema, **self.kwargs)._serialize(nested_obj, attr, obj)
class OptionalGetSkillRuleFromActionArg2Resource(Validator):
def __init__(self, dao_get):
self.dao_get = dao_get
def validate(self, model):
destination = QueueDestinationSchema().dump(model)
skill_rule_id = destination.get('skill_rule_id', None)
if not skill_rule_id:
return
try:
self.dao_get(skill_rule_id)
except NotFoundError:
metadata = {'skill_rule_id': skill_rule_id}
raise errors.param_not_found('skill_rule_id', 'SkillRule', **metadata)
class GetMohFromActionArg2Resource(Validator):
def __init__(self, dao_get):
self._dao_get = dao_get
def validate(self, model):
destination = UserDestinationSchema().dump(model)
moh_uuid = destination.get('moh_uuid', None)
if not moh_uuid:
return
try:
self._dao_get(moh_uuid)
except NotFoundError:
metadata = {'moh_uuid': moh_uuid}
raise errors.param_not_found('moh_uuid', 'MOH', **metadata)
class DestinationValidator:
_VALIDATORS = {
'application:callbackdisa': [],
'application:custom': [
GetResource('actionarg1', application_dao.get, 'Application')
],
'application:directory': [],
'application:disa': [],
'application:faxtomail': [],
'application:voicemailmain': [],
'conference': [GetResource('actionarg1', conference_dao.get, 'Conference')],
'custom': [],
'extension': [],
'group': [GetResource('actionarg1', group_dao.get, 'Group')],
'endcall:busy': [],
'endcall:congestion': [],
'endcall:hangup': [],
'ivr': [GetResource('actionarg1', ivr_dao.get, 'IVR')],
'none': [],
'outcall': [GetResource('actionarg1', outcall_dao.get, 'Outcall')],
'queue': [
GetResource('actionarg1', queue_dao.get, 'Queue'),
OptionalGetSkillRuleFromActionArg2Resource(skill_rule_dao.get),
],
'sound': [],
'switchboard': [GetResource('actionarg1', switchboard_dao.get, 'Switchboard')],
'user': [
GetResource('actionarg1', user_dao.get, 'User'),
GetMohFromActionArg2Resource(moh_dao.get),
],
'voicemail': [GetResource('actionarg1', voicemail_dao.get, 'Voicemail')],
}
def validate(self, destination):
for validator in self._VALIDATORS[destination.action]:
validator.validate(destination)
As for the second one, it seems to be caused within this file (resource.py):
# Copyright 2018-2020 The Wazo Authors (see the AUTHORS file)
# SPDX-License-Identifier: GPL-3.0-or-later
from flask import request
from wazo_confd.auth import required_acl
from wazo_confd.helpers.restful import ConfdResource
from .schema import CallFilterFallbackSchema
class CallFilterFallbackList(ConfdResource):
schema = CallFilterFallbackSchema
has_tenant_uuid = True
def __init__(self, service, call_filter_dao):
super().__init__()
self.service = service
self.call_filter_dao = call_filter_dao
@required_acl('confd.callfilters.{call_filter_id}.fallbacks.read')
def get(self, call_filter_id):
tenant_uuids = self._build_tenant_list({'recurse': True})
call_filter = self.call_filter_dao.get(
call_filter_id, tenant_uuids=tenant_uuids
)
return self.schema().dump(call_filter.fallbacks)
@required_acl('confd.callfilters.{call_filter_id}.fallbacks.update')
def put(self, call_filter_id):
tenant_uuids = self._build_tenant_list({'recurse': True})
call_filter = self.call_filter_dao.get(
call_filter_id, tenant_uuids=tenant_uuids
)
fallbacks = self.schema().load(request.get_json())
self.service.edit(call_filter, fallbacks)
return '', 204
I am not sure what causes these errors; I tried to find solutions online but could not fix the issue.
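The second traceback hints at the mechanism: error_store.store_error builds messages = {field_name: messages}, so field_name must be hashable, while _validate_skill_rule_variables passes the list ['skill_rule_id'] as ValidationError's second argument (its field_name). Below is a hedged, self-contained sketch of how that validator could report the error against the field without triggering the TypeError; it rewrites the project's validator purely for illustration and is not a confirmed fix for the original partial-load error:

from marshmallow import Schema, fields, validates_schema, ValidationError

class QueueDestinationSketch(Schema):
    skill_rule_id = fields.Integer(allow_none=True)
    skill_rule_variables = fields.Dict(allow_none=True)

    @validates_schema
    def _validate_skill_rule_variables(self, data, **kwargs):
        if not data.get('skill_rule_variables'):
            return
        if not data.get('skill_rule_id'):
            # field_name must be a single hashable value (a string), not a list,
            # otherwise marshmallow's error store does {field_name: messages}
            # and fails with "unhashable type: 'list'".
            raise ValidationError(
                'Missing data for required field. When `skill_rule_variables` is defined',
                field_name='skill_rule_id',
            )

# Example: loading variables without an id now reports a normal field error.
print(QueueDestinationSketch().validate({'skill_rule_variables': {'a': 'b'}}))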

Is it possible to randomly sample YouTube comments with YouTube API V3?

I have been trying to download all the YouTube comments on popular videos using python requests, but it has been throwing up the following error after about a quarter of the total comments:
{'error': {'code': 400, 'message': "The API server failed to successfully process the request. While this can be a transient error, it usually indicates that the request's input is invalid. Check the structure of the commentThread resource in the request body to ensure that it is valid.", 'errors': [{'message': "The API server failed to successfully process the request. While this can be a transient error, it usually indicates that the request's input is invalid. Check the structure of the commentThread resource in the request body to ensure that it is valid.", 'domain': 'youtube.commentThread', 'reason': 'processingFailure', 'location': 'body', 'locationType': 'other'}]}}
I found this thread detailing the same issue, and it seems that it is not possible to download all the comments on popular videos.
This is my code:
import argparse
import urllib
import requests
import json
import time
start_time = time.time()
class YouTubeApi():
YOUTUBE_COMMENTS_URL = 'https://www.googleapis.com/youtube/v3/commentThreads'
comment_counter = 0
with open("API_keys.txt", "r") as f:
key_list = f.readlines()
key_list = [key.strip('/n') for key in key_list]
def format_comments(self, results, likes_required):
comments_list = []
try:
for item in results["items"]:
comment = item["snippet"]["topLevelComment"]
likes = comment["snippet"]["likeCount"]
if likes < likes_required:
continue
author = comment["snippet"]["authorDisplayName"]
text = comment["snippet"]["textDisplay"]
str = "Comment by {}:\n \"{}\"\n\n".format(author, text)
str = str.encode('ascii', 'replace').decode()
comments_list.append(str)
self.comment_counter += 1
print("Comments downloaded:", self.comment_counter, end="\r")
except(KeyError):
print(results)
return comments_list
def get_video_comments(self, video_id, likes_required):
with open("API_keys.txt", "r") as f:
key_list = f.readlines()
key_list = [key.strip('/n') for key in key_list]
if self.comment_counter <= 900000:
key = self.key_list[0]
elif self.comment_counter <= 1800000:
key = self.key_list[1]
elif self.comment_counter <= 2700000:
key = self.key_list[2]
elif self.comment_counter <= 3600000:
key = self.key_list[3]
elif self.comment_counter <= 4500000:
key = self.key_list[4]
params = {
'part': 'snippet,replies',
'maxResults': 100,
'videoId': video_id,
'textFormat': 'plainText',
'key': key
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
try:
#data = self.openURL(self.YOUTUBE_COMMENTS_URL, params)
comments_data = requests.get(self.YOUTUBE_COMMENTS_URL, params=params, headers=headers)
except ChunkedEncodingError:
tries = 5
print("Chunked Error. Retrying...")
for n in range(tries):
try:
x = 0
x += 1
print("Trying", x, "times")
response = session.post("https://www.youtube.com/comment_service_ajax", params=params, data=data, headers=headers)
comments_data = json.loads(response.text)
except ChunkedEncodingError as c:
print(c)
results = comments_data.json()
nextPageToken = results.get("nextPageToken")
commments_list = []
commments_list += self.format_comments(results, likes_required)
while nextPageToken:
params.update({'pageToken': nextPageToken})
try:
comments_data = requests.get(self.YOUTUBE_COMMENTS_URL, params=params, headers=headers)
except ChunkedEncodingError as c:
tries = 5
print("Chunked Error. Retrying...")
for n in range(tries):
try:
x = 0
x += 1
print("Trying", x, "times")
response = session.post("https://www.youtube.com/comment_service_ajax", params=params, data=data, headers=headers)
comments_data = json.loads(response.text)
except ChunkedEncodingError as c:
print(c)
results = comments_data.json()
nextPageToken = results.get("nextPageToken")
commments_list += self.format_comments(results, likes_required)
return commments_list
def get_video_id_list(self, filename):
try:
with open(filename, 'r') as file:
URL_list = file.readlines()
except FileNotFoundError:
exit("File \"" + filename + "\" not found")
list = []
for url in URL_list:
if url == "\n": # ignore empty lines
continue
if url[-1] == '\n': # delete '\n' at the end of line
url = url[:-1]
if url.find('='): # get id
id = url[url.find('=') + 1:]
list.append(id)
else:
print("Wrong URL")
return list
def main():
yt = YouTubeApi()
parser = argparse.ArgumentParser(add_help=False, description=("Download youtube comments from many videos into txt file"))
required = parser.add_argument_group("required arguments")
optional = parser.add_argument_group("optional arguments")
here: https://console.developers.google.com/apis/credentials")
optional.add_argument("--likes", '-l', help="The amount of likes a comment needs to be saved", type=int)
optional.add_argument("--input", '-i', help="URL list file name")
optional.add_argument("--output", '-o', help="Output file name")
optional.add_argument("--help", '-h', help="Help", action='help')
args = parser.parse_args()
# --------------------------------------------------------------------- #
likes = 0
if args.likes:
likes = args.likes
input_file = "URL_list.txt"
if args.input:
input_file = args.input
output_file = "Comments.txt"
if args.output:
output_file = args.output
list = yt.get_video_id_list(input_file)
if not list:
exit("No URLs in input file")
try:
vid_counter = 0
with open(output_file, "a") as f:
for video_id in list:
vid_counter += 1
print("Downloading comments for video ", vid_counter, ", id: ", video_id, sep='')
comments = yt.get_video_comments(video_id, likes)
if comments:
for comment in comments:
f.write(comment)
print('\nDone!')
except KeyboardInterrupt:
exit("User Aborted the Operation")
# --------------------------------------------------------------------- #
if __name__ == '__main__':
main()
The next best method would be to randomly sample them. Does anyone know if this is possible with the API V3?
Even if the API returns a processingFailure error, you could still catch that (or any other API error, for that matter) to terminate your pagination loop gracefully. That way your script will still provide the top-level comments it fetched from the API before the first API error occurred.
The error response provided by the YouTube Data API is (usually) of the following form:
{
  "error": {
    "errors": [
      {
        "domain": <string>,
        "reason": <string>,
        "message": <string>,
        "locationType": <string>,
        "location": <string>
      }
    ],
    "code": <integer>,
    "message": <string>
  }
}
Hence, you could have defined the following function:
def is_error_response(response):
    error = response.get('error')
    if error is None:
        return False
    print("API Error: "
          f"code={error['code']} "
          f"domain={error['errors'][0]['domain']} "
          f"reason={error['errors'][0]['reason']} "
          f"message={error['errors'][0]['message']!r}")
    return True
that you'll invoke after each statement of the form results = comments_data.json(). For the first occurrence of that statement, you'll have:
results = comments_data.json()
if is_error_response(results):
    return []
nextPageToken = results.get("nextPageToken")
For the second instance of that statement:
results = comments_data.json()
if is_error_response(results):
    return comments_list
nextPageToken = results.get("nextPageToken")
Notice that the function is_error_response above prints an error message to stdout when its argument is an API error response; this keeps the user of your script informed about the API call failure.
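To make the flow concrete, here is a minimal, hypothetical helper that pages through commentThreads and stops cleanly at the first API error by reusing is_error_response from above; fetch_top_level_comments and its parameters are illustrative names, not part of the original script:

import requests

YOUTUBE_COMMENTS_URL = 'https://www.googleapis.com/youtube/v3/commentThreads'

def fetch_top_level_comments(video_id, api_key):
    # Page through top-level comments, keeping whatever was collected before an error.
    params = {'part': 'snippet', 'maxResults': 100, 'videoId': video_id,
              'textFormat': 'plainText', 'key': api_key}
    comments = []
    while True:
        results = requests.get(YOUTUBE_COMMENTS_URL, params=params).json()
        if is_error_response(results):
            return comments  # stop paginating, but return what was fetched so far
        for item in results.get('items', []):
            comments.append(item['snippet']['topLevelComment']['snippet']['textDisplay'])
        token = results.get('nextPageToken')
        if not token:
            return comments
        params['pageToken'] = token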

Scrapy: How to crawl the next url in start_urls when a condition is met

Is there any way to stop crawling the current URL and jump to crawl the next URL in start_urls when a given condition is met?
Here I test the dates on the page against a pre-defined date.
I want to stop crawling the URL when that condition is met.
Edit
My code is as follows,
class MarketSpider(scrapy.Spider):
name = 'test'
allowed_domains = ['example.com']
start_urls = []
date_limit = datetime.strptime('07/01/2019', '%m/%d/%Y')
for url in open("urls.txt"):start_urls.append(url)
def start_requests(self):
for url in self.start_urls:
category = url.split('/')[4]
yield scrapy.Request(url=url, callback=self.parse, meta={'category': category})
def parse(self, response):
date_limit = self.date_limit
category = response.request.category
item_url = response.xpath("//div[@class='white-block-content']")
for i in item_url:
url_ = i.xpath("./a/@href").extract_first()
date = i.xpath("./p[@class='date']/text()").extract_first()
dt_obj = datetime.strptime(date, '%B %d, %Y')
if dt_obj >= date_limit:
yield scrapy.Request(url , callback = self.parse_number, meta={'category': category,'u_date':dt_obj })
next_page = response.css('a[rel="next"]::attr(href)').extract_first()
if next_page is not None:
yield scrapy.Request(next_page, callback = self.parse)
def parse_number(self, response):
category = response.request.category
url = response.request.url
url = url.strip()
u_date = response.request.u_date
phone_number = response.xpath("//div[@id='contact-top']//li/@data-value").extract()
for i in phone_number:
yield {
'category': category,
'u_date': u_date,
'url': url,
'phone_number': i}
Simply calling return after yield worked for me:
yield {
    'lorem': 'ipsum'
}
return
In the parse_number function, check whether the date matches the condition. If it does, yield the data and stop crawling that specific domain; otherwise continue crawling the rest.
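Taken together with the question's code, a minimal sketch of the return-after-yield idea applied to the parse callback might look like this. It is illustrative only: it reads category from response.meta (where the meta dict from start_requests ends up), assumes the listing is ordered by date so that once an item older than date_limit appears the rest can be skipped, and stops following pagination for that start URL while the other start URLs keep crawling:

def parse(self, response):
    category = response.meta['category']
    for block in response.xpath("//div[@class='white-block-content']"):
        url_ = block.xpath("./a/@href").extract_first()
        date = block.xpath("./p[@class='date']/text()").extract_first()
        dt_obj = datetime.strptime(date, '%B %d, %Y')
        if dt_obj < self.date_limit:
            # Condition met: stop processing and paginating this start URL.
            return
        yield scrapy.Request(url_, callback=self.parse_number,
                             meta={'category': category, 'u_date': dt_obj})
    next_page = response.css('a[rel="next"]::attr(href)').extract_first()
    if next_page is not None:
        yield scrapy.Request(next_page, callback=self.parse,
                             meta={'category': category})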
