Is it possible to randomly sample YouTube comments with YouTube API V3? - python-requests

I have been trying to download all the YouTube comments on popular videos using python requests, but it throws the following error after downloading about a quarter of the total comments:
{'error': {'code': 400, 'message': "The API server failed to successfully process the request. While this can be a transient error, it usually indicates that the request's input is invalid. Check the structure of the commentThread resource in the request body to ensure that it is valid.", 'errors': [{'message': "The API server failed to successfully process the request. While this can be a transient error, it usually indicates that the request's input is invalid. Check the structure of the commentThread resource in the request body to ensure that it is valid.", 'domain': 'youtube.commentThread', 'reason': 'processingFailure', 'location': 'body', 'locationType': 'other'}]}}
I found this thread detailing the same issue, and it seems that it is not possible to download all the comments on popular videos.
This is my code:
import argparse
import urllib
import requests
import json
import time

start_time = time.time()


class YouTubeApi():

    YOUTUBE_COMMENTS_URL = 'https://www.googleapis.com/youtube/v3/commentThreads'
    comment_counter = 0

    with open("API_keys.txt", "r") as f:
        key_list = f.readlines()
    key_list = [key.strip('/n') for key in key_list]

    def format_comments(self, results, likes_required):
        comments_list = []
        try:
            for item in results["items"]:
                comment = item["snippet"]["topLevelComment"]
                likes = comment["snippet"]["likeCount"]
                if likes < likes_required:
                    continue
                author = comment["snippet"]["authorDisplayName"]
                text = comment["snippet"]["textDisplay"]
                str = "Comment by {}:\n \"{}\"\n\n".format(author, text)
                str = str.encode('ascii', 'replace').decode()
                comments_list.append(str)
                self.comment_counter += 1
                print("Comments downloaded:", self.comment_counter, end="\r")
        except(KeyError):
            print(results)
        return comments_list

    def get_video_comments(self, video_id, likes_required):
        with open("API_keys.txt", "r") as f:
            key_list = f.readlines()
        key_list = [key.strip('/n') for key in key_list]
        if self.comment_counter <= 900000:
            key = self.key_list[0]
        elif self.comment_counter <= 1800000:
            key = self.key_list[1]
        elif self.comment_counter <= 2700000:
            key = self.key_list[2]
        elif self.comment_counter <= 3600000:
            key = self.key_list[3]
        elif self.comment_counter <= 4500000:
            key = self.key_list[4]
        params = {
            'part': 'snippet,replies',
            'maxResults': 100,
            'videoId': video_id,
            'textFormat': 'plainText',
            'key': key
        }
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        }
        try:
            #data = self.openURL(self.YOUTUBE_COMMENTS_URL, params)
            comments_data = requests.get(self.YOUTUBE_COMMENTS_URL, params=params, headers=headers)
        except ChunkedEncodingError:
            tries = 5
            print("Chunked Error. Retrying...")
            for n in range(tries):
                try:
                    x = 0
                    x += 1
                    print("Trying", x, "times")
                    response = session.post("https://www.youtube.com/comment_service_ajax", params=params, data=data, headers=headers)
                    comments_data = json.loads(response.text)
                except ChunkedEncodingError as c:
                    print(c)
        results = comments_data.json()
        nextPageToken = results.get("nextPageToken")
        commments_list = []
        commments_list += self.format_comments(results, likes_required)
        while nextPageToken:
            params.update({'pageToken': nextPageToken})
            try:
                comments_data = requests.get(self.YOUTUBE_COMMENTS_URL, params=params, headers=headers)
            except ChunkedEncodingError as c:
                tries = 5
                print("Chunked Error. Retrying...")
                for n in range(tries):
                    try:
                        x = 0
                        x += 1
                        print("Trying", x, "times")
                        response = session.post("https://www.youtube.com/comment_service_ajax", params=params, data=data, headers=headers)
                        comments_data = json.loads(response.text)
                    except ChunkedEncodingError as c:
                        print(c)
            results = comments_data.json()
            nextPageToken = results.get("nextPageToken")
            commments_list += self.format_comments(results, likes_required)
        return commments_list

    def get_video_id_list(self, filename):
        try:
            with open(filename, 'r') as file:
                URL_list = file.readlines()
        except FileNotFoundError:
            exit("File \"" + filename + "\" not found")
        list = []
        for url in URL_list:
            if url == "\n":  # ignore empty lines
                continue
            if url[-1] == '\n':  # delete '\n' at the end of line
                url = url[:-1]
            if url.find('='):  # get id
                id = url[url.find('=') + 1:]
                list.append(id)
            else:
                print("Wrong URL")
        return list


def main():
    yt = YouTubeApi()
    parser = argparse.ArgumentParser(add_help=False, description=("Download youtube comments from many videos into txt file"))
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")
    required.add_argument("--key", '-k', help="Your API key. You can get one here: https://console.developers.google.com/apis/credentials")
optional.add_argument("--likes", '-l', help="The amount of likes a comment needs to be saved", type=int)
optional.add_argument("--input", '-i', help="URL list file name")
optional.add_argument("--output", '-o', help="Output file name")
optional.add_argument("--help", '-h', help="Help", action='help')
args = parser.parse_args()
# --------------------------------------------------------------------- #
likes = 0
if args.likes:
likes = args.likes
input_file = "URL_list.txt"
if args.input:
input_file = args.input
output_file = "Comments.txt"
if args.output:
output_file = args.output
list = yt.get_video_id_list(input_file)
if not list:
exit("No URLs in input file")
try:
vid_counter = 0
with open(output_file, "a") as f:
for video_id in list:
vid_counter += 1
print("Downloading comments for video ", vid_counter, ", id: ", video_id, sep='')
comments = yt.get_video_comments(video_id, likes)
if comments:
for comment in comments:
f.write(comment)
print('\nDone!')
except KeyboardInterrupt:
exit("User Aborted the Operation")
# --------------------------------------------------------------------- #
if __name__ == '__main__':
main()
The next best method would be to randomly sample them. Does anyone know if this is possible with the API V3?

Even if the API returns a processingFailure error, you could still catch it (or any other API error, for that matter) in order to terminate your pagination loop gracefully. This way your script will still provide the top-level comments it fetched from the API before the first API error occurred.
The error response provided by the YouTube Data API is (usually) of the following form:
{
  "error": {
    "errors": [
      {
        "domain": <string>,
        "reason": <string>,
        "message": <string>,
        "locationType": <string>,
        "location": <string>
      }
    ],
    "code": <integer>,
    "message": <string>
  }
}
Hence, you could define the following function:
def is_error_response(response):
    error = response.get('error')
    if error is None:
        return False
    print("API Error: "
          f"code={error['code']} "
          f"domain={error['errors'][0]['domain']} "
          f"reason={error['errors'][0]['reason']} "
          f"message={error['errors'][0]['message']!r}")
    return True
which you would invoke after each statement of the form results = comments_data.json(). For the first occurrence of that statement, you would have:
results = comments_data.json()
if is_error_response(results):
    return []
nextPageToken = results.get("nextPageToken")
For the second instance of that statement:
results = comments_data.json()
if is_error_response(results):
    return comments_list
nextPageToken = results.get("nextPageToken")
Notice that the function is_error_response above prints an error message to stdout when its argument is an API error response; this is to keep the user of your script informed about the API call failure.
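Putting the pieces together, a stripped-down version of the paginated fetch could look like the sketch below. It reuses the endpoint and params from your script and assumes is_error_response is defined as above; treat it as an illustration rather than a drop-in replacement:

import requests

YOUTUBE_COMMENTS_URL = 'https://www.googleapis.com/youtube/v3/commentThreads'

def fetch_top_level_comments(video_id, api_key):
    # Page through commentThreads.list until there are no more pages
    # or the API reports an error (e.g. processingFailure).
    params = {
        'part': 'snippet',
        'maxResults': 100,
        'videoId': video_id,
        'textFormat': 'plainText',
        'key': api_key,
    }
    comments = []
    while True:
        results = requests.get(YOUTUBE_COMMENTS_URL, params=params).json()
        if is_error_response(results):
            return comments  # keep whatever was fetched before the error
        for item in results.get('items', []):
            comments.append(item['snippet']['topLevelComment']['snippet']['textDisplay'])
        next_page = results.get('nextPageToken')
        if not next_page:
            return comments
        params['pageToken'] = next_page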

Related

request params encryption - http API source airbyte connector

I am developing an HTTP API source using the airbyte CDK.
In the request_params method I am returning the following params:
return {'startIndex': 0, 'resultsPerPage': self.max_results_per_page,
        'pubStartDate': "2022-07-30T13:57:21:000 UTC%2B03:00", 'pubEndDate': "2022-07-31T13:57:21:000 UTC%2B03:00"}
But for some reason those fields are being URL-encoded again, and the params that are actually passed to the API are:
pubStartDate: "2022-07-30T13%3A57%3A21%3A000+UTC%252B03%3A00"
pubEndDate: "2022-07-31T13%3A57%3A21%3A000+UTC%252B03%3A00"
Is there anything I am doing wrong?
How can I send those date-time strings correctly, without them being re-encoded?
The API I am using: https://nvd.nist.gov/developers/vulnerabilities
Thanks in advance, any help will be much appreciated!!
The full connector code:
class NvdVulnerabilitiesStream(HttpStream, ABC):
    url_base = "https://services.nvd.nist.gov/"
    max_results_per_page = 2000

    def __init__(self, days_to_fetch: int):
        auth = NoAuth()
        super().__init__(authenticator=auth)
        self.days_to_fetch = days_to_fetch

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        if response.json()['resultsPerPage'] < self.max_results_per_page:
            return None
        return {"startIndex": response.json()['startIndex'] + self.max_results_per_page, "resultsPerPage": self.max_results_per_page}

    def request_params(
        self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, any] = None, next_page_token: Mapping[str, Any] = None
    ) -> MutableMapping[str, Any]:
        if not next_page_token and self.days_to_fetch != -1:
            end_time = datetime.now()
            start_time = end_time - timedelta(days=self.days_to_fetch)
            return {'startIndex': 0, 'resultsPerPage': self.max_results_per_page,
                    'pubStartDate': start_time.strftime(DATETIME_SCHEME), 'pubEndDate': end_time.strftime(DATETIME_SCHEME)}
        if not next_page_token:
            return {"startIndex": 0, "resultsPerPage": self.max_results_per_page}
        return {"startIndex": next_page_token['startIndex'], "resultsPerPage": next_page_token['resultsPerPage']}

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
        for vulnerability in response.json()["result"]["CVE_Items"]:
            yield vulnerability
        print(response.json()["startIndex"])
        time.sleep(6)


class Vulnerabilities(NvdVulnerabilitiesStream):
    primary_key = None

    def __init__(self, days_to_fetch: int):
        super().__init__(days_to_fetch)

    def path(
        self, stream_state: Mapping[str, Any] = None, stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
    ) -> str:
        return "rest/json/cves/1.0/"

SQLAlchemy 1.4.0b1 AsyncSession issue

I'm using SQLAlchemy 1.4.0b1's AsyncSession to update a Postgres db with asyncpg 0.21.0. The code below aims to update objects and add new objects in response to various incoming Redis stream messages.
The save_revised coroutine (update) is working fine, and so is the session.add part of the td_move coroutine. However, the update part of td_move, at the bottom of the function (starting from if this_train_id and msg.get('from') in finals[crossing]), only works intermittently: I'm getting some db updates, but only about a third of the log messages indicating that an update is wanted.
Can anyone suggest what the problem(s) could be please ?
async def main():
    logger.info(f"db_updater starting {datetime.now().strftime('%H:%M:%S')}")
    engine = create_async_engine(os.getenv('ASYNC_DB_URL'), future=True)
    async with AsyncSession(engine) as session:
        crossings, headcodes, lean_params, finals, active_trains, train_ids, berthtimes, hc_types = await get_db_data(logger)  # noqa: E501
        pool = await aioredis.create_redis_pool(('redis', 6379), db=0, password=os.getenv('REDIS_PW'), encoding='utf-8')
        last_id = '$'
        while True:
            all_msgs = await pool.xread(['del_hc_s', 'xing_revised', 'all_td', 'add_hc_s'], latest_ids=[last_id, last_id, last_id, last_id])  # noqa: E501
            for stream_name, msg_id, msg in all_msgs:
                message = dict(msg)
                crossing = message.get('crossing')
                if stream_name == 'all_td':
                    await td_move(message, train_ids, active_trains, finals, lean_params, session)
                elif stream_name == 'xing_revised':
                    await save_revised(message, lean_params[crossing], session)


async def save_revised(msg, params, session):
    train_id = msg.get('train_id')
    # today_class is a SQLA model class from declarative_base()
    today_class = params['today_class']
    rev_time = datetime.fromtimestamp(
        int(msg.get('revised')))
    stmt = update(today_class).where(today_class.train_id == train_id).\
        values(xing_revised=rev_time).\
        execution_options(synchronize_session="fetch")
    await session.execute(stmt)
    if msg.get('revised_ten') != 'X':
        stmt2 = update(today_class).where(today_class.train_id == train_id).\
            values(xing_revised_ten=rev_time).\
            execution_options(synchronize_session="fetch")
        await session.execute(stmt2)
    await session.commit()


async def td_move(msg, train_ids, active_trains, finals, params, session):
    crossing = msg.get('crossing')
    descr = msg.get('descr')
    if crossing:
        this_train_id = [s for s in train_ids[crossing] if descr in s]
        if this_train_id:
            this_train_id = this_train_id[0]
        else:
            return
        if this_train_id and active_trains[crossing].get(this_train_id) and (
                is_within_minutes(30, active_trains[crossing].get(this_train_id))):
            # Td_Ca_Cc is a SQLA model class from declarative_base()
            td = Td_Ca_Cc(
                msg_type=msg.get('msg_type'),
                descr=msg.get('descr'),
                traintype=active_trains[crossing].get(
                    this_train_id).get('train_type'),
                from_berth=msg.get('from'),
                to_berth=msg.get('to'),
                tdtime=dt_from_timestamp(msg.get('time')),
                seconds=0,
                area_id=msg.get('area_id'),
                updated=datetime.now(),
                crossing=crossing
            )
            session.add(td)
        if this_train_id and msg.get('from') in finals[crossing]:
            today_class = params[crossing]['today_class']
            stmt = update(today_class).where(today_class.train_id == this_train_id).\
                values(xing_actual=datetime.now(), cancel_time='XXX').\
                execution_options(synchronize_session="fetch")
            await session.execute(stmt)
            logger.info(f"{crossing} {msg.get('descr')} passed {datetime.now().strftime('%H:%M:%S')}")
        await session.commit()


if __name__ == '__main__':
    asyncio.run(main())
For future reference: the problem was a synchronous logging call. I removed it (and will add an async logging call instead), and the modified code is now working fine.
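In case it helps anyone else, one common way to keep a blocking call like that off the event loop (a sketch assuming a standard logging logger, not the exact code above) is to push it onto the default thread pool:

import asyncio
import logging

logger = logging.getLogger("db_updater")

async def log_info(message: str) -> None:
    # Run the synchronous logging call in the default executor so a slow
    # handler (file, network, etc.) cannot block the event loop.
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, logger.info, message)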

Why is the variable modified when nginx Lua processes the request?

I just started studying Lua.
On every request I want to check the name parameter in the request arguments. However, I have found that self.name occasionally changes.
For example,
request A with params: request_id = 123 & name = ABC,
request B with params: request_id = 321 & name = EFG,
yet in the log I find request_id = 123 but name = EFG.
Why is that? Is my class incorrectly written?
Here is the sample code:
main.lua:
local checker = require "checker"

local ch = checker:new()
if ch:check_name() then
    ngx.header["Content-Type"] = "application/json"
    ngx.status = ngx.HTTP_FORBIDDEN
    ngx.exit(ngx.HTTP_FORBIDDEN)
end
checker.lua:
local utils = require "utils"

local _M = {}

function _M:new()
    local o = {}
    setmetatable(o, self)
    self.__index = self
    self.args = utils.get_req_args() or {}
    local name = self.args["name"] or ""
    local request_id = self.args["request_id"] or ""
    self.name = name
    return o
end

function _M:check_name()
    ngx.log(ngx.ERR, "request_id: ", self.request_id, " name: ", self.name)
    -- do some check ...
end
utils.lua:
local json = require "cjson"

local _M = {}

function _M.new(self)
    return self
end

function _M.get_req_args()
    -- GET
    local args = nil
    if ngx.var.request_method == "GET" then
        args = ngx.req.get_uri_args()
    -- POST
    elseif ngx.var.request_method == "POST" then
        ngx.req.read_body()
        local data = ngx.req.get_body_data()
        args = json.decode(data)
    end
    return args
end

Scrapy: How to crawl the next url in start_urls when a condition is met

Is there any way to stop crawling the current URL and jump to the next URL in start_urls when a given condition is met?
Here I compare the dates found on the page with a pre-defined date.
I want to stop crawling the URL when that condition is met.
Edit
My code is as follows,
class MarketSpider(scrapy.Spider):
    name = 'test'
    allowed_domains = ['example.com']
    start_urls = []
    date_limit = datetime.strptime('07/01/2019', '%m/%d/%Y')
    for url in open("urls.txt"):
        start_urls.append(url)

    def start_requests(self):
        for url in self.start_urls:
            category = url.split('/')[4]
            yield scrapy.Request(url=url, callback=self.parse, meta={'category': category})

    def parse(self, response):
        date_limit = self.date_limit
        category = response.request.category
        item_url = response.xpath("//div[@class='white-block-content']")
        for i in item_url:
            url_ = i.xpath("./a/@href").extract_first()
            date = i.xpath("./p[@class='date']/text()").extract_first()
            dt_obj = datetime.strptime(date, '%B %d, %Y')
            if dt_obj >= date_limit:
                yield scrapy.Request(url_, callback=self.parse_number, meta={'category': category, 'u_date': dt_obj})
        next_page = response.css('a[rel="next"]::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_number(self, response):
        category = response.request.category
        url = response.request.url
        url = url.strip()
        u_date = response.request.u_date
        phone_number = response.xpath("//div[@id='contact-top']//li/@data-value").extract()
        for i in phone_number:
            yield {
                'category': category,
                'u_date': u_date,
                'url': url,
                'phone_number': i}
Simply calling return after yield worked for me:
yield {
    'lorem': 'ipsum'
}
return
In the parse_number function, check whether the date meets your condition. If it does, yield the data and stop crawling that domain; otherwise continue crawling as before.
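One way to apply the return-after-yield idea in the parse callback (the selectors and date format follow the question's code; treat this as an illustrative sketch, not the exact spider):

def parse(self, response):
    for i in response.xpath("//div[@class='white-block-content']"):
        date = i.xpath("./p[@class='date']/text()").extract_first()
        dt_obj = datetime.strptime(date, '%B %d, %Y')
        if dt_obj < self.date_limit:
            # Date is older than the limit: stop yielding requests from this page,
            # so Scrapy moves on to the next start URL.
            return
        url_ = i.xpath("./a/@href").extract_first()
        yield scrapy.Request(url_, callback=self.parse_number)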

Groovy http request, parse headers values

I'm sending an HTTP request from application 1
http://localhost:8888/inputs/example-input?ProductId=49823&Orders_orderId=27759
to application 2.
In application 2 I'm receiving the plain string:
inputs/example-input?ProductId=49823&Orders_orderId=27759
I need to get the values into their own variables, as shown:
def productId = 49823
def orderId = 27759
Is there some Groovy way to parse the input string inputs/example-input?ProductId=49823&Orders_orderId=27759?
You need to parse the input manually, e.g.:
def input = "inputs/example-input?ProductId=49823&Orders_orderId=27759"
def parsed = input
        .split("\\?")[1]
        .split("&")
        .inject([:]) { m, e ->
            def arr = e.split("=")
            m[arr[0]] = arr[1]
            m
        }

def productId = parsed.ProductId
def orderId = parsed.Orders_orderId

assert productId == '49823'
assert orderId == '27759'
