I am downloading YouTube comments with a Python script that uses API keys and the YouTube Data API v3, but sooner or later I run into the following error:
{'error': {'code': 400, 'message': "The API server failed to successfully process the request. While this can be a transient error, it usually indicates that the request's input is invalid. Check the structure of the commentThread resource in the request body to ensure that it is valid.", 'errors': [{'message': "The API server failed to successfully process the request. While this can be a transient error, it usually indicates that the request's input is invalid. Check the structure of the commentThread resource in the request body to ensure that it is valid.", 'domain': 'youtube.commentThread', 'reason': 'processingFailure', 'location': 'body', 'locationType': 'other'}]}}
I am using the following code:
import argparse
import requests
import json
import time
start_time = time.time()
class YouTubeApi():
YOUTUBE_COMMENTS_URL = 'https://www.googleapis.com/youtube/v3/commentThreads'
comment_counter = 0
def is_error_response(self, response):
error = response.get('error')
if error is None:
return False
print("API Error: "
f"code={error['code']} "
f"domain={error['errors'][0]['domain']} "
f"reason={error['errors'][0]['reason']} "
f"message={error['errors'][0]['message']!r}")
print(self.comment_counter)
return True
def format_comments(self, results, likes_required):
comments_list = []
try:
for item in results["items"]:
comment = item["snippet"]["topLevelComment"]
likes = comment["snippet"]["likeCount"]
if likes < likes_required:
continue
author = comment["snippet"]["authorDisplayName"]
text = comment["snippet"]["textDisplay"]
str = "Comment by {}:\n \"{}\"\n\n".format(author, text)
str = str.encode('ascii', 'replace').decode()
comments_list.append(str)
self.comment_counter += 1
print("Comments downloaded:", self.comment_counter, end="\r")
        except KeyError:
            print(results)
return comments_list
def get_video_comments(self, video_id, likes_required):
with open("API_keys.txt", "r") as f:
key_list = f.readlines()
comments_list = []
        key_list = [key.strip('\n') for key in key_list]  # strip('\n'), not strip('/n'): remove the trailing newline from each key
params = {
'part': 'snippet,replies',
'maxResults': 100,
'videoId': video_id,
'textFormat': 'plainText',
'key': key_list[0]
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
comments_data = requests.get(self.YOUTUBE_COMMENTS_URL, params=params, headers=headers)
results = comments_data.json()
if self.is_error_response(results):
return []
nextPageToken = results.get("nextPageToken")
comments_list = []
comments_list += self.format_comments(results, likes_required)
while nextPageToken:
params.update({'pageToken': nextPageToken})
if self.comment_counter <= 900000:
params.update({'key': key_list[0]})
elif self.comment_counter <= 1800000:
params.update({'key': key_list[1]})
elif self.comment_counter <= 2700000:
params.update({'key': key_list[2]})
elif self.comment_counter <= 3600000:
params.update({'key': key_list[3]})
elif self.comment_counter <= 4500000:
params.update({'key': key_list[4]})
else:
params.update({'key': key_list[5]})
if self.comment_counter % 900001 == 0:
print(params["key"])
comments_data = requests.get(self.YOUTUBE_COMMENTS_URL, params=params, headers=headers)
results = comments_data.json()
if self.is_error_response(results):
return comments_list
nextPageToken = results.get("nextPageToken")
comments_list += self.format_comments(results, likes_required)
return comments_list
def get_video_id_list(self, filename):
try:
with open(filename, 'r') as file:
URL_list = file.readlines()
except FileNotFoundError:
exit("File \"" + filename + "\" not found")
        id_list = []
        for url in URL_list:
            if url == "\n":  # ignore empty lines
                continue
            if url.endswith('\n'):  # delete '\n' at the end of the line
                url = url[:-1]
            if '=' in url:  # get id (str.find() returns -1 when missing, which is truthy, so test membership instead)
                video_id = url[url.find('=') + 1:]
                id_list.append(video_id)
            else:
                print("Wrong URL")
        return id_list
def main():
yt = YouTubeApi()
parser = argparse.ArgumentParser(add_help=False, description=("Download youtube comments from many videos into txt file"))
required = parser.add_argument_group("required arguments")
optional = parser.add_argument_group("optional arguments")
optional.add_argument("--likes", '-l', help="The amount of likes a comment needs to be saved", type=int)
optional.add_argument("--input", '-i', help="URL list file name")
optional.add_argument("--output", '-o', help="Output file name")
optional.add_argument("--help", '-h', help="Help", action='help')
args = parser.parse_args()
# --------------------------------------------------------------------- #
likes = 0
if args.likes:
likes = args.likes
input_file = "URL_list.txt"
if args.input:
input_file = args.input
output_file = "Comments.txt"
if args.output:
output_file = args.output
    video_id_list = yt.get_video_id_list(input_file)
    if not video_id_list:
        exit("No URLs in input file")
try:
vid_counter = 0
with open(output_file, "a") as f:
            for video_id in video_id_list:
vid_counter += 1
print("Downloading comments for video ", vid_counter, ", id: ", video_id, sep='')
comments = yt.get_video_comments(video_id, likes)
if comments:
for comment in comments:
f.write(comment)
print('\nDone!')
except KeyboardInterrupt:
exit("User Aborted the Operation")
# --------------------------------------------------------------------- #
if __name__ == '__main__':
main()
In another thread, it was discovered that Google does not currently permit downloading all of the comments on a popular video; however, you would expect it to cut off at the same point every time. Instead, I have found that it can range anywhere between 200k and 1.5 million comments downloaded before it returns a code 400. Is this due to a bug in my code, or is the YouTube API rejecting my request because it is clearly coming from a script? Would adding a time.sleep call help with this?
(I am bringing forward this answer -- which I prepared for the question above at the time of its initial post -- because my assertions below seem to be confirmed once again by recent SO posts of this very kind.)
Your observations are correct. But, unfortunately, nobody but Google itself is able to provide a sound and complete answer to your question. We -- non-Googlers (like myself!), or even the Googlers themselves (since they all sign NDAs) -- can only guess about the things implied.
Here is my educated guess, based on the investigations I made recently when responding to a very much related question (which you quoted above, yourself!):
As you already know, the API uses pagination to return to its callers sets of items whose cardinality exceeds the internal limit of 50 or, as the case may be, 100 items per invocation of any API endpoint that provides result sets.
If you log the nextPageToken property that you obtain from CommentThreads.list via your results object, you'll see that those page tokens get bigger and bigger. Each such page token has to be passed on to the next CommentThreads.list call as the pageToken parameter.
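For illustration, here is a minimal sketch of that kind of logging against the same commentThreads endpoint (API_KEY and VIDEO_ID are placeholders, not values taken from the question):

import requests

API_KEY = 'YOUR_API_KEY'    # placeholder
VIDEO_ID = 'SOME_VIDEO_ID'  # placeholder
URL = 'https://www.googleapis.com/youtube/v3/commentThreads'

params = {
    'part': 'snippet',
    'maxResults': 100,
    'videoId': VIDEO_ID,
    'textFormat': 'plainText',
    'key': API_KEY,
}

page = 0
while True:
    results = requests.get(URL, params=params).json()
    token = results.get('nextPageToken')
    page += 1
    # Log how long each successive page token is; you will see it keep growing.
    print('page', page, 'nextPageToken length:', len(token) if token else 0)
    if not token:
        break
    params['pageToken'] = token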
The problem is that internally (not specified publicly, not documented) the API has a limit on the sheer length of the HTTP requests it accepts from its callers. (This happens for various reasons; e.g. security.) Therefore, when a given page token is sufficiently long, the HTTP request that the API user issues will exceed that internal limit, producing an internal error. That error surfaces to the API caller as the processingFailure error that you've encountered.
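Since that limit is internal and undocumented, the most you can do on the caller's side is observe how large your own requests become. A small sketch, reusing the params dict from the question's get_video_comments (including its growing pageToken entry):

import requests

# Build the request without sending it, just to inspect the final URL length.
# 'params' stands for the same dict used in get_video_comments, including
# the ever-growing 'pageToken' entry.
prepared = requests.Request(
    'GET',
    'https://www.googleapis.com/youtube/v3/commentThreads',
    params=params,
).prepare()

print('request URL length:', len(prepared.url))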
Many questions remain to be answered (e.g. why is it that the page tokens have unbounded length?), but, again, those questions belong very much to the internal realm of the back-end system behind the API we're using. And those questions cannot be answered publicly, since they are very much Google's internal business.
I have the following code on Airflow 1.9:
import_op = MySqlToGoogleCloudStorageOperator(
task_id='import',
mysql_conn_id='oproduction',
google_cloud_storage_conn_id='gcpm',
provide_context=True,
approx_max_file_size_bytes = 100000000, #100MB per file
sql = 'import.sql',
params={'next_to_import': NEXT_TO_IMPORT, 'table_name' : TABLE_NAME},
bucket=GCS_BUCKET_ID,
filename=file_name_orders,
dag=dag)
Why does it generate:
/usr/local/lib/python2.7/dist-packages/airflow/models.py:2160:
PendingDeprecationWarning: Invalid arguments were passed to
MySqlToGoogleCloudStorageOperator. Support for passing such arguments
will be dropped in Airflow 2.0. Invalid arguments were:
*args: ()
**kwargs: {'provide_context': True} category=PendingDeprecationWarning
What is the problem with provide_context? To the best of my knowledge, it is needed in order to use params.
provide_context is not needed for params.
The params parameter (dict type) can be passed to any Operator.
You would mostly use provide_context with PythonOperator and BranchPythonOperator. A good example is https://airflow.readthedocs.io/en/latest/howto/operator.html#pythonoperator.
MySqlToGoogleCloudStorageOperator has no provide_context parameter, so it is passed on in **kwargs and you get the deprecation warning.
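In other words, the fix is simply to drop provide_context from that call; params keeps working because it is rendered when the templated sql file is processed. A sketch based on the snippet from the question (NEXT_TO_IMPORT, TABLE_NAME, GCS_BUCKET_ID and file_name_orders are the question's own names; the import path is the usual Airflow 1.x contrib location, so check it against your install):

from airflow.contrib.operators.mysql_to_gcs import MySqlToGoogleCloudStorageOperator

import_op = MySqlToGoogleCloudStorageOperator(
    task_id='import',
    mysql_conn_id='oproduction',
    google_cloud_storage_conn_id='gcpm',
    approx_max_file_size_bytes=100000000,  # 100MB per file
    sql='import.sql',
    # referenced inside import.sql as {{ params.next_to_import }} / {{ params.table_name }}
    params={'next_to_import': NEXT_TO_IMPORT, 'table_name': TABLE_NAME},
    bucket=GCS_BUCKET_ID,
    filename=file_name_orders,
    dag=dag)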
If you check the docstring of PythonOperator for provide_context:
if set to true, Airflow will pass a set of keyword arguments that can
be used in your function. This set of kwargs correspond exactly to
what you can use in your jinja templates. For this to work, you need
to define **kwargs in your function header.
If you check the source code of PythonOperator, it has the following:
if self.provide_context:
context.update(self.op_kwargs)
context['templates_dict'] = self.templates_dict
self.op_kwargs = context
So, in simple terms, it passes the following dictionary (together with templates_dict) to the function you pass in python_callable:
{
'END_DATE': ds,
'conf': configuration,
'dag': task.dag,
'dag_run': dag_run,
'ds': ds,
'ds_nodash': ds_nodash,
'end_date': ds,
'execution_date': self.execution_date,
'latest_date': ds,
'macros': macros,
'params': params,
'run_id': run_id,
'tables': tables,
'task': task,
'task_instance': self,
'task_instance_key_str': ti_key_str,
'test_mode': self.test_mode,
'ti': self,
'tomorrow_ds': tomorrow_ds,
'tomorrow_ds_nodash': tomorrow_ds_nodash,
'ts': ts,
'ts_nodash': ts_nodash,
'yesterday_ds': yesterday_ds,
'yesterday_ds_nodash': yesterday_ds_nodash,
}
So this can be used in the function as follows:
from pprint import pprint

def print_context(ds, **kwargs):
    pprint(kwargs)
    ti = kwargs['task_instance']
    exec_date = kwargs['execution_date']
    print(ds)
    return 'Whatever you return gets printed in the logs'
run_this = PythonOperator(
task_id='print_the_context',
provide_context=True,
python_callable=print_context,
dag=dag,
)
I have an Ejabberd 17.01 installation where I need to push a notification in case a recipient is offline. This seems to be a common task, and solutions using a customized Ejabberd module can be found everywhere. However, I just don't get it running. First, here's my script:
-module(mod_offline_push).
-behaviour(gen_mod).
-export([start/2, stop/1]).
-export([push_message/3]).
-include("ejabberd.hrl").
-include("logger.hrl").
-include("jlib.hrl").
start(Host, _Opts) ->
?INFO_MSG("mod_offline_push loading", []),
ejabberd_hooks:add(offline_message_hook, Host, ?MODULE, push_message, 10),
ok.
stop(Host) ->
?INFO_MSG("mod_offline_push stopping", []),
    ejabberd_hooks:delete(offline_message_hook, Host, ?MODULE, push_message, 10),  % remove the hook when the module stops
ok.
push_message(From, To, Packet) ->
?INFO_MSG("mod_offline_push -> push_message", [To]),
Type = fxml:get_tag_attr_s(<<"type">>, Packet), % Supposedly since 16.04
%Type = xml:get_tag_attr_s(<<"type">>, Packet), % Supposedly since 13.XX
%Type = xml:get_tag_attr_s("type", Packet),
%Type = xml:get_tag_attr_s(list_to_binary("type"), Packet),
?INFO_MSG("mod_offline_push -> push_message", []),
ok.
The problem is the Type = ... line in the push_message function; without that line, the last info message is logged (so the hook definitely works). When browsing online, I can find all kinds of function calls to extract elements from Packet; as far as I understand, this changed over time with new releases. But no luck: all variants lead to some kind of error. The current way returns:
2017-01-25 20:38:08.701 [error] <0.21678.0>#ejabberd_hooks:run1:332 {function_clause,[{fxml,get_tag_attr_s,[<<"type">>,{message,<<>>,normal,<<>>,{jid,<<"homer">>,<<"xxx.xxx.xxx.xxx">>,<<"conference">>,<<"homer">>,<<"xxx.xxx.xxx.xxx">>,<<"conference">>},{jid,<<"carl">>,<<"xxx.xxx.xxx.xxx">>,<<>>,<<"carl">>,<<"xxx.xxx.xxx.xxx">>,<<>>},[],[{text,<<>>,<<"sfsdfsdf">>}],undefined,[],#{}}],[{file,"src/fxml.erl"},{line,169}]},{mod_offline_push,push_message,3,[{file,"mod_offline_push.erl"},{line,33}]},{ejabberd_hooks,safe_apply,3,[{file,"src/ejabberd_hooks.erl"},{line,382}]},{ejabberd_hooks,run1,3,[{file,"src/ejabberd_hooks.erl"},{line,329}]},{ejabberd_sm,route,3,[{file,"src/ejabberd_sm.erl"},{line,126}]},{ejabberd_local,route,3,[{file,"src/ejabberd_local.erl"},{line,110}]},{ejabberd_router,route,3,[{file,"src/ejabberd_router.erl"},{line,87}]},{ejabberd_c2s,check_privacy_route,5,[{file,"src/ejabberd_c2s.erl"},{line,1886}]}]}
running hook: {offline_message_hook,[{jid,<<"homer">>,<<"xxx.xxx.xxx.xxx">>,<<"conference">>,<<"homer">>,<<"xxx.xxx.xxx.xxx">>,<<"conference">>},{jid,<<"carl">>,<<"xxx.xxx.xxx.xxx">>,<<>>,<<"carl">>,<<"xxx.xxx.xxx.xxx">>,<<>>},{message,<<>>,normal,<<>>,{jid,<<"homer">>,<<"xxx.xxx.xxx.xxx">>,<<"conference">>,<<"homer">>,<<"xxx.xxx.xxx.xxx">>,<<"conference">>},{jid,<<"carl">>,<<"xxx.xxx.xxx.xxx">>,<<>>,<<"carl">>,<<"xxx.xxx.xxx.xxx">>,<<>>},[],[{text,<<>>,<<"sfsdfsdf">>}],undefined,[],#{}}]}
I'm new to Ejabberd and Erlang, so I cannot really interpret the error, but line 33, as mentioned in {mod_offline_push,push_message,3,[{file,"mod_offline_push.erl"},{line,33}]}, is definitely the line calling get_tag_attr_s.
UPDATE 2017/01/27: Since this cost me a lot of headache -- and I'm still not perfectly happy -- I'm posting my current working module here in the hope that it might help others. My setup is Ejabberd 17.01 running on Ubuntu 16.04. Most of the stuff I tried and failed with seems to be for older versions of Ejabberd:
-module(mod_fcm_fork).
-behaviour(gen_mod).
%% public methods for this module
-export([start/2, stop/1]).
-export([push_notification/3]).
%% included for writing to ejabberd log file
-include("ejabberd.hrl").
-include("logger.hrl").
-include("xmpp_codec.hrl").
%% Copied this record definition from jlib.hrl
%% Including "xmpp_codec.hrl" and "jlib.hrl" resulted in errors ("XYZ already defined")
-record(jid, {user = <<"">> :: binary(),
server = <<"">> :: binary(),
resource = <<"">> :: binary(),
luser = <<"">> :: binary(),
lserver = <<"">> :: binary(),
lresource = <<"">> :: binary()}).
start(Host, _Opts) ->
?INFO_MSG("mod_fcm_fork loading", []),
% Providing the most basic API to the clients and servers that are part of the Inets application
inets:start(),
% Add hook to handle message to user who are offline
ejabberd_hooks:add(offline_message_hook, Host, ?MODULE, push_notification, 10),
ok.
stop(Host) ->
?INFO_MSG("mod_fcm_fork stopping", []),
    ejabberd_hooks:delete(offline_message_hook, Host, ?MODULE, push_notification, 10),  % remove the hook when the module stops
ok.
push_notification(From, To, Packet) ->
% Generate JID of sender and receiver
FromJid = lists:concat([binary_to_list(From#jid.user), "#", binary_to_list(From#jid.server), "/", binary_to_list(From#jid.resource)]),
ToJid = lists:concat([binary_to_list(To#jid.user), "#", binary_to_list(To#jid.server), "/", binary_to_list(To#jid.resource)]),
% Get message body
MessageBody = Packet#message.body,
% Check of MessageBody is not empty
case MessageBody/=[] of
true ->
% Get first element (no idea when this list can have more elements)
[First | _ ] = MessageBody,
% Get message data and convert to string
MessageBodyText = binary_to_list(First#text.data),
send_post_request(FromJid, ToJid, MessageBodyText);
false ->
?INFO_MSG("mod_fcm_fork -> push_notification: MessageBody is empty",[])
end,
ok.
send_post_request(FromJid, ToJid, MessageBodyText) ->
%?INFO_MSG("mod_fcm_fork -> send_post_request -> MessageBodyText = ~p", [Demo]),
Method = post,
PostURL = gen_mod:get_module_opt(global, ?MODULE, post_url,fun(X) -> X end, all),
% Add data as query string. Not nice, query body would be preferable
% Problem: message body itself can be in a JSON string, and I couldn't figure out the correct encoding.
URL = lists:concat([binary_to_list(PostURL), "?", "fromjid=", FromJid,"&tojid=", ToJid,"&body=", edoc_lib:escape_uri(MessageBodyText)]),
Header = [],
ContentType = "application/json",
Body = [],
?INFO_MSG("mod_fcm_fork -> send_post_request -> URL = ~p", [URL]),
% ADD SSL CONFIG BELOW!
%HTTPOptions = [{ssl,[{versions, ['tlsv1.2']}]}],
HTTPOptions = [],
Options = [],
httpc:request(Method, {URL, Header, ContentType, Body}, HTTPOptions, Options),
ok.
Actually, it fails because of the second argument, Packet, that you pass to fxml:get_tag_attr_s in the push_message function:
{message,<<>>,normal,<<>>,
{jid,<<"homer">>,<<"xxx.xxx.xxx.xxx">>,<<"conference">>,
<<"homer">>,<<"xxx.xxx.xxx.xxx">>,<<"conference">>},
{jid,<<"carl">>,<<"xxx.xxx.xxx.xxx">>,<<>>,<<"carl">>,
<<"xxx.xxx.xxx.xxx">>,<<>>},
[],
[{text,<<>>,<<"sfsdfsdf">>}],
undefined,[],#{}}
because it is not an xmlel.
It looks like it is the message record defined in tools/xmpp_codec.hrl, with an empty (<<>>) id and type 'normal':
xmpp_codec.hrl
-record(message, {id :: binary(),
type = normal :: 'chat' | 'error' | 'groupchat' | 'headline' | 'normal',
lang :: binary(),
from :: any(),
to :: any(),
subject = [] :: [#text{}],
body = [] :: [#text{}],
thread :: binary(),
error :: #error{},
sub_els = [] :: [any()]}).
Include this file and use just
Type = Packet#message.type
or, if you expect binary value
Type = erlang:atom_to_binary(Packet#message.type, utf8)
The newest way to do that seems to be with xmpp:get_type/1:
Type = xmpp:get_type(Packet),
It returns an atom, in this case normal.