How to change the user agent when the Tor IP changes in Scrapy

I use Tor and Privoxy with TorIpChanger to change the IP after a random number of items_scraped, and it is working fine. I would like to change the user agent as well when the IP changes.
I am a bit confused about how to achieve this. I have looked at scrapy_useragents and similar solutions for inspiration, without much success so far. This is what I'm trying to do, based on https://github.com/khpeek/scraper-compose/ and https://docs.scrapy.org/en/latest/topics/extensions.html
extensions.py
import logging
import random

from scrapy import signals
from scrapy.exceptions import NotConfigured

from toripchanger import TorIpChanger

logger = logging.getLogger(__name__)

# Configure for your own Tor/Privoxy setup (see the TorIpChanger README)
ip_changer = TorIpChanger(reuse_threshold=10)


class TorRenewIdentity(object):

    def __init__(self, crawler, item_count, user_agents):
        self.crawler = crawler
        self.item_count = self.randomize(item_count)  # Randomize the item count to confound traffic analysis
        self._item_count = item_count                 # Also remember the given item count for future randomizations
        self.items_scraped = 0
        self.user_agents = user_agents
        # Connect the extension object to signals
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)

    @staticmethod
    def randomize(item_count, min_factor=0.5, max_factor=1.5):
        '''Randomize the number of items scraped before changing identity. (A similar technique is applied to Scrapy's DOWNLOAD_DELAY setting).'''
        randomized_item_count = random.randint(int(min_factor * item_count), int(max_factor * item_count))
        logger.info("The crawler will scrape the following (randomized) number of items before changing identity (again): {}".format(randomized_item_count))
        return randomized_item_count

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('TOR_RENEW_IDENTITY_ENABLED'):
            raise NotConfigured
        item_count = crawler.settings.getint('TOR_ITEMS_TO_SCRAPE_PER_IDENTITY', 10)
        user_agents = crawler.settings['USER_AGENT']
        return cls(crawler=crawler, item_count=item_count, user_agents=user_agents)  # Instantiate the extension object

    def item_scraped(self, item, spider):
        '''When item_count items are scraped, pause the engine and change IP address.'''
        self.items_scraped += 1
        if self.items_scraped == self.item_count:
            logger.info("Scraped {item_count} items. Pausing engine while changing identity...".format(item_count=self.item_count))
            self.crawler.engine.pause()
            ip_changer.get_new_ip()  # Change IP address with toripchanger https://github.com/DusanMadar/TorIpChanger
            self.items_scraped = 0   # Reset the counter
            self.item_count = self.randomize(self._item_count)  # Generate a new random number of items to scrape before changing identity again
            # Get new user agent from list
            if self.user_agents:
                new_user_agent = random.choice(self.user_agents)
                logger.info('Load {} user_agents from settings. New user agent is {}.'.format(
                    len(self.user_agents) if self.user_agents else 0, new_user_agent))
                # Change user agent here ?
                # For next self.item_count items
                # headers.setdefault('User-Agent', new_user_agent)
                #
            self.crawler.engine.unpause()
settings.py
USER_AGENT = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0) Gecko/20100101 Firefox/56.0'
]

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapydevua.middlewares.ScrapydevuaSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapydevua.middlewares.ScrapydevuaDownloaderMiddleware': 543,
#}

EXTENSIONS = {
    'scrapydevua.extensions.TorRenewIdentity': 1,
}
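One possible way to answer the "Change user agent here?" comment (a sketch only, not tested against this exact setup): have the extension remember the chosen user agent on the spider, e.g. spider.current_user_agent = new_user_agent inside item_scraped, and apply it from a small downloader middleware. The middleware class name, the current_user_agent attribute and the priority value below are illustrative assumptions, not part of the original code.
middlewares.py

class PerIdentityUserAgentMiddleware(object):
    '''Apply whatever user agent the TorRenewIdentity extension last chose (hypothetical helper).'''

    def process_request(self, request, spider):
        user_agent = getattr(spider, 'current_user_agent', None)
        if user_agent:
            # Overwrite the header set earlier by the default UserAgentMiddleware
            request.headers['User-Agent'] = user_agent

and enable it in settings.py, for example:

DOWNLOADER_MIDDLEWARES = {
    'scrapydevua.middlewares.PerIdentityUserAgentMiddleware': 543,
}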

Related

Google Scholar profile scraping

I'm trying to retrieve the links of a Google Scholar user's work from their profile, but am having trouble accessing the HTML that is hidden behind the "show more" button. I would like to capture all the links for a user, but currently I can only get the first 20. I'm using the following script:
from bs4 import BeautifulSoup
import requests

author_url = 'https://scholar.google.com/citations?hl=en&user=mG4imMEAAAAJ'

html_content = requests.get(author_url)
soup = BeautifulSoup(html_content.text, 'lxml')

tables = soup.find_all('table')
table = tables[1]
rows = table.find_all('tr')

links = []
for row in rows:
    t = row.find('a')
    if t is not None:
        links.append(t.get('href'))
You need to use the cstart URL parameter, which is the article offset used for pagination: 0 is the first page, and increasing it by the page size moves to the next page. This parameter removes the need to click the "show more" button and does the same thing.
This parameter needs to be used in a while loop in order to paginate through all articles.
To exit the loop, one way is to check for a CSS selector such as .gsc_a_e, which is assigned to the text shown when no more results are present.
The great thing about this approach is that it paginates dynamically, unlike a hard-coded for i in range(), which breaks when one author has 20 articles and another has 2550.
To find such selectors you can use the SelectorGadget Chrome extension, which lets you pick CSS selectors by clicking on elements in the browser. It works great if the website is not heavily JS-driven.
Keep in mind that at some point you may also need a CAPTCHA solver or proxies, but only when you need to extract a lot of articles from multiple authors.
Code with the option to save to CSV using pandas:
import pandas as pd
from bs4 import BeautifulSoup
import requests, lxml, json


def bs4_scrape_articles():
    params = {
        "user": "mG4imMEAAAAJ",  # user-id
        "hl": "en",              # language
        "gl": "us",              # country to search from
        "cstart": 0,             # articles page. 0 is the first page
        "pagesize": "100"        # articles per page
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
    }

    all_articles = []
    articles_is_present = True

    while articles_is_present:
        html = requests.post("https://scholar.google.com/citations", params=params, headers=headers, timeout=30)
        soup = BeautifulSoup(html.text, "lxml")

        for article in soup.select("#gsc_a_b .gsc_a_t"):
            article_title = article.select_one(".gsc_a_at").text
            article_link = f'https://scholar.google.com{article.select_one(".gsc_a_at")["href"]}'
            article_authors = article.select_one(".gsc_a_at+ .gs_gray").text
            article_publication = article.select_one(".gs_gray+ .gs_gray").text

            all_articles.append({
                "title": article_title,
                "link": article_link,
                "authors": article_authors,
                "publication": article_publication
            })

        # this selector is checking for the .class that contains: "There are no articles in this profile."
        # example link: https://scholar.google.com/citations?hl=en&user=mG4imMEAAAAJ&cstart=600
        if soup.select_one(".gsc_a_e"):
            articles_is_present = False
        else:
            params["cstart"] += 100  # paginate to the next page

    print(json.dumps(all_articles, indent=2, ensure_ascii=False))
    # pd.DataFrame(data=all_articles).to_csv(f"google_scholar_{params['user']}_articles.csv", encoding="utf-8", index=False)


bs4_scrape_articles()
Outputs (shows only last results as output is 400+ articles):
[
  {
    "title": "Exponential family sparse coding with application to self-taught learning with text documents",
    "link": "https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:LkGwnXOMwfcC",
    "authors": "H Lee, R Raina, A Teichman, AY Ng",
    "publication": ""
  },
  {
    "title": "Visual and Range Data",
    "link": "https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:eQOLeE2rZwMC",
    "authors": "S Gould, P Baumstarck, M Quigley, AY Ng, D Koller",
    "publication": ""
  }
]
If you don't want to deal with bypassing blocks from Google or maintaining your script, have a look at the Google Scholar Author Articles API.
There's also the scholarly package, which can also extract author articles.
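A minimal sketch of how that might look with scholarly (the function names assume a 1.x release of the package; check its documentation for your version):

from scholarly import scholarly

# Look up the author by their Scholar profile id and fill in the publications section
author = scholarly.search_author_id("mG4imMEAAAAJ")
scholarly.fill(author, sections=["publications"])

for pub in author["publications"]:
    # Each publication is a dict; the title lives under the "bib" key
    print(pub["bib"]["title"])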
Code that shows how to extract all author articles with Google Scholar Author Articles API:
from serpapi import GoogleScholarSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
import json
import os


def serpapi_scrape_articles():
    params = {
        # https://docs.python.org/3/library/os.html
        "api_key": os.getenv("API_KEY"),
        "engine": "google_scholar_author",
        "hl": "en",
        "author_id": "mG4imMEAAAAJ",
        "start": "0",
        "num": "100"
    }

    search = GoogleScholarSearch(params)

    all_articles = []
    articles_is_present = True

    while articles_is_present:
        results = search.get_dict()

        for index, article in enumerate(results["articles"], start=1):
            title = article["title"]
            link = article["link"]
            authors = article["authors"]
            publication = article.get("publication")
            citation_id = article["citation_id"]

            all_articles.append({
                "title": title,
                "link": link,
                "authors": authors,
                "publication": publication,
                "citation_id": citation_id
            })

        if "next" in results.get("serpapi_pagination", {}):
            # split URL in parts as a dict() and update "search" variable to a new page
            search.params_dict.update(dict(parse_qsl(urlsplit(results["serpapi_pagination"]["next"]).query)))
        else:
            articles_is_present = False

    print(json.dumps(all_articles, indent=2, ensure_ascii=False))
    # pd.DataFrame(data=all_articles).to_csv(f"serpapi_google_scholar_{params['author_id']}_articles.csv", encoding="utf-8", index=False)


serpapi_scrape_articles()
Here is one way of obtaining that data:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from tqdm import tqdm  ## if Jupyter notebook: from tqdm.notebook import tqdm

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

big_df = pd.DataFrame()
headers = {
    'accept-language': 'en-US,en;q=0.9',
    'x-requested-with': 'XHR',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}
s = requests.Session()
s.headers.update(headers)
payload = {'json': '1'}

for x in tqdm(range(0, 500, 100)):
    url = f'https://scholar.google.com/citations?hl=en&user=mG4imMEAAAAJ&cstart={x}&pagesize=100'
    r = s.post(url, data=payload)
    soup = bs(r.json()['B'], 'html.parser')
    works = [(x.get_text(), 'https://scholar.google.com' + x.get('href')) for x in soup.select('a') if 'javascript:void(0)' not in x.get('href') and len(x.get_text()) > 7]
    df = pd.DataFrame(works, columns=['Paper', 'Link'])
    big_df = pd.concat([big_df, df], axis=0, ignore_index=True)

print(big_df)
Result in terminal:
100%
5/5 [00:03<00:00, 1.76it/s]
Paper Link
0 Latent dirichlet allocation https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:IUKN3-7HHlwC
1 On spectral clustering: Analysis and an algorithm https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:2KloaMYe4IUC
2 ROS: an open-source Robot Operating System https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:u-x6o8ySG0sC
3 Rectifier nonlinearities improve neural network acoustic models https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:gsN89kCJA0AC
4 Recursive deep models for semantic compositionality over a sentiment treebank https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:_axFR9aDTf0C
... ... ...
473 A Sparse Sampling Algorithm for Near-Optimal Planning in Large Markov Decision Processes https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:hMod-77fHWUC
474 On Discrim inative vs. Generative https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:qxL8FJ1GzNcC
475 Game Theory with Restricted Strategies https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:8k81kl-MbHgC
476 Exponential family sparse coding with application to self-taught learning with text documents https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:LkGwnXOMwfcC
477 Visual and Range Data https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:eQOLeE2rZwMC
478 rows × 2 columns
See pandas documentation at https://pandas.pydata.org/docs/
Also Requests docs: https://requests.readthedocs.io/en/latest/
For BeautifulSoup, go to https://beautiful-soup-4.readthedocs.io/en/latest/
And for TQDM visit https://pypi.org/project/tqdm/

I want to go to all the pages of the Yelp website and extract data

I want to go to all the pages of the Yelp site but can't.
This is the code:
# packages
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
import urllib.parse
import os
import json
import datetime
import csv


# property scraper class
class Yelp(scrapy.Spider):
    # scraper name
    name = 'home business'

    base_url = 'https://www.yelp.com/search?'

    params = {
        'find_desc': 'Home Cleaning',
        'find_loc': 'North Dallas, Dallas, TX',
        #'start' : ''
    }

    page = 0
    current_page = 1

    # headers
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
    }

    #params['start'] = page

    try:
        os.remove('abx.csv')
    except OSError:
        pass

    # custom settings
    custom_settings = {
        'CONCURRENT_REQUEST_PER_DOMAIN': 2,
        'DOWNLOAD_DELAY': 1
    }

    # general crawler
    def start_requests(self):
        url = self.base_url + urllib.parse.urlencode(self.params)

        # initial HTTP request
        yield scrapy.Request(
            url=url,
            headers=self.headers,
            callback=self.parse_listing
        )

    def parse_listing(self, response):
        lists = response.css('h4[class="css-1l5lt1i"]')
        for link in lists:
            link = link.css('a::attr(href)').get()
            link = 'https://www.yelp.com/' + link
            #print('\n\nlink:',link,'\n\n')
            yield response.follow(link, headers=self.headers, callback=self.parse_cards)
            break

        try:
            #self.params['start'] = self.page
            try:
                total_pages = response.css('.text-align--center__09f24__1P1jK .css-e81eai::text').get()[5:7]
                print(total_pages)
                self.page += 10
                self.current_page += 1
            except Exception as e:
                total_pages = 1
                print('totl:', total_pages)

            print('PAGE %s | %s ' % (self.current_page, total_pages))

            if int(self.page / 10) <= int(total_pages):
                self.log('\n\n %s | %s\n\n ' % (self.page / 10, total_pages))
                next_page = response.url + '&start=' + str(self.page)
                yield response.follow(url=next_page, headers=self.headers, callback=self.parse_listing)
        except:
            print('only single page', self.current_page)

    def parse_cards(self, response):
        print('\nok\n')


# main driver
if __name__ == '__main__':
    # run scraper
    process = CrawlerProcess()
    process.crawl(Yelp)
    process.start()

    #Yelp.parse_cards(Yelp, '')
I also tried a try/except approach, but that didn't do the job.
The main problem is building the next-page URL with the '&start=' parameter: if I increment start by 10 each time, the URL keeps accumulating start parameters, becoming something like
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=10&start=20&start=30'
and so on. I want only the start value in the URL to increment, first start=10, then start=20 and so on,
like this:
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=20'
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=30'
and so on.
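One way to build such a URL without accumulating duplicate parameters is to rebuild the query string with urllib.parse. This is only a sketch of that general technique (the set_start helper name is mine); the answers below take a different approach.

from urllib.parse import urlencode, urlparse, parse_qs, urlunparse

def set_start(url, start):
    # Parse the current URL, overwrite (or add) the 'start' parameter, and rebuild the URL
    parts = urlparse(url)
    query = parse_qs(parts.query)
    query['start'] = [str(start)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))

# set_start('https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX', 20)
# -> 'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=20'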
Just find the link to the next page and follow that
next_page = response.css("a.next-link::attr(href)").get()
if next_page:
    yield response.follow(next_page, callback=self.parse)
This is pretty similar to what is done in the Scrapy tutorial. Have you followed that? Was there a reason you couldn't do it this way?
In the end, your entire spider can become:
from scrapy import Spider


class Yelp(Spider):
    # scraper name
    name = "home business"

    start_urls = [
        "https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX"
    ]

    def parse(self, response):
        for link in response.css("h4 > span > a"):
            yield response.follow(link, callback=self.parse_cards)

        next_page = response.css("a.next-link::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_cards(self, response):
        print("parse_cards", response.url)
I removed the start_requests stuff to keep it simple for this example (something you should probably try to do when asking questions)

Why is b' ' included in the excel file after web scraping?

I'm learning web scraping and was able to scrape data from a website into a file I open in Excel. However, in the file you can see that it also includes b'...' prefixes instead of just the strings (names of YouTube channels, uploads, views). Any idea where this came from?
from bs4 import BeautifulSoup
import csv
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}  # Need to use this, otherwise it returns error 403.
url = requests.get('https://socialblade.com/youtube/top/50/mostviewed', headers=headers)
#print(url)

soup = BeautifulSoup(url.text, 'lxml')
rows = soup.find('div', attrs={'style': 'float: right; width: 900px;'}).find_all('div', recursive=False)[4:]  # If the site uses a class instead of an inline style, pass class_= instead. We don't need the first 4 rows, so [4:]

file = open('/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/My_Projects/Web_scraping/topyoutubers.csv', 'w')
writer = csv.writer(file)

# write header rows
writer.writerow(['Username', 'Uploads', 'Views'])

for row in rows:
    username = row.find('a').text.strip()
    numbers = row.find_all('span', attrs={'style': 'color:#555;'})
    uploads = numbers[0].text.strip()
    views = numbers[1].text.strip()

    print(username + ' ' + uploads + ' ' + views)

    writer.writerow([username.encode('utf-8'), uploads.encode('utf-8'), views.encode('utf-8')])

file.close()
It is caused by the way you do the encoding: calling .encode('utf-8') turns each string into a bytes object, and csv.writer then writes that object's repr, which is where the b'...' comes from. Better to define the encoding once when opening the file:
file = open('topyoutubers.csv', 'w', encoding='utf-8')
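A minimal standalone demonstration of what happens when bytes are passed to csv.writer (not part of the original answer):

import csv, io

buf = io.StringIO()
csv.writer(buf).writerow(['T-Series'.encode('utf-8')])  # a bytes object, not a str
print(buf.getvalue())  # b'T-Series' -- csv stringifies the bytes repr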
New code
from bs4 import BeautifulSoup
import csv
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}  # Need to use this, otherwise it returns error 403.
url = requests.get('https://socialblade.com/youtube/top/50/mostviewed', headers=headers)
#print(url)

soup = BeautifulSoup(url.text, 'lxml')
rows = soup.find('div', attrs={'style': 'float: right; width: 900px;'}).find_all('div', recursive=False)[4:]  # If the site uses a class instead of an inline style, pass class_= instead. We don't need the first 4 rows, so [4:]

file = open('/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/My_Projects/Web_scraping/topyoutubers.csv', 'w', encoding='utf-8')
writer = csv.writer(file)

# write header rows
writer.writerow(['Username', 'Uploads', 'Views'])

for row in rows:
    username = row.find('a').text.strip()
    numbers = row.find_all('span', attrs={'style': 'color:#555;'})
    uploads = numbers[0].text.strip()
    views = numbers[1].text.strip()

    print(username + ' ' + uploads + ' ' + views)

    writer.writerow([username, uploads, views])

file.close()
Output
Username Uploads Views
1 T-Series 15,029 143,032,749,708
2 Cocomelon - Nursery Rhymes 605 93,057,513,422
3 SET India 48,505 78,282,384,002
4 Zee TV 97,302 59,037,594,757

Why is the YouTube API v3 inconsistent with the amount of comments it lets you download before an error 400?

I am downloading YouTube comments with a python script that uses API keys and the YouTube Data API V3, but sooner or later I run into the following error:
{'error': {'code': 400, 'message': "The API server failed to successfully process the request. While this can be a transient error, it usually indicates that the request's input is invalid. Check the structure of the commentThread resource in the request body to ensure that it is valid.", 'errors': [{'message': "The API server failed to successfully process the request. While this can be a transient error, it usually indicates that the request's input is invalid. Check the structure of the commentThread resource in the request body to ensure that it is valid.", 'domain': 'youtube.commentThread', 'reason': 'processingFailure', 'location': 'body', 'locationType': 'other'}]}}
I am using the following code:
import argparse
import requests
import json
import time

start_time = time.time()


class YouTubeApi():

    YOUTUBE_COMMENTS_URL = 'https://www.googleapis.com/youtube/v3/commentThreads'
    comment_counter = 0

    def is_error_response(self, response):
        error = response.get('error')
        if error is None:
            return False
        print("API Error: "
              f"code={error['code']} "
              f"domain={error['errors'][0]['domain']} "
              f"reason={error['errors'][0]['reason']} "
              f"message={error['errors'][0]['message']!r}")
        print(self.comment_counter)
        return True

    def format_comments(self, results, likes_required):
        comments_list = []
        try:
            for item in results["items"]:
                comment = item["snippet"]["topLevelComment"]
                likes = comment["snippet"]["likeCount"]
                if likes < likes_required:
                    continue
                author = comment["snippet"]["authorDisplayName"]
                text = comment["snippet"]["textDisplay"]
                str = "Comment by {}:\n \"{}\"\n\n".format(author, text)
                str = str.encode('ascii', 'replace').decode()
                comments_list.append(str)
                self.comment_counter += 1
                print("Comments downloaded:", self.comment_counter, end="\r")
        except KeyError:
            print(results)
        return comments_list

    def get_video_comments(self, video_id, likes_required):
        with open("API_keys.txt", "r") as f:
            key_list = f.readlines()
        comments_list = []
        key_list = [key.strip('\n') for key in key_list]

        params = {
            'part': 'snippet,replies',
            'maxResults': 100,
            'videoId': video_id,
            'textFormat': 'plainText',
            'key': key_list[0]
        }

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        }

        comments_data = requests.get(self.YOUTUBE_COMMENTS_URL, params=params, headers=headers)
        results = comments_data.json()

        if self.is_error_response(results):
            return []

        nextPageToken = results.get("nextPageToken")
        comments_list = []
        comments_list += self.format_comments(results, likes_required)

        while nextPageToken:
            params.update({'pageToken': nextPageToken})
            if self.comment_counter <= 900000:
                params.update({'key': key_list[0]})
            elif self.comment_counter <= 1800000:
                params.update({'key': key_list[1]})
            elif self.comment_counter <= 2700000:
                params.update({'key': key_list[2]})
            elif self.comment_counter <= 3600000:
                params.update({'key': key_list[3]})
            elif self.comment_counter <= 4500000:
                params.update({'key': key_list[4]})
            else:
                params.update({'key': key_list[5]})
            if self.comment_counter % 900001 == 0:
                print(params["key"])

            comments_data = requests.get(self.YOUTUBE_COMMENTS_URL, params=params, headers=headers)
            results = comments_data.json()

            if self.is_error_response(results):
                return comments_list

            nextPageToken = results.get("nextPageToken")
            comments_list += self.format_comments(results, likes_required)

        return comments_list

    def get_video_id_list(self, filename):
        try:
            with open(filename, 'r') as file:
                URL_list = file.readlines()
        except FileNotFoundError:
            exit("File \"" + filename + "\" not found")

        list = []
        for url in URL_list:
            if url == "\n":  # ignore empty lines
                continue
            if url[-1] == '\n':  # delete '\n' at the end of line
                url = url[:-1]
            if url.find('='):  # get id
                id = url[url.find('=') + 1:]
                list.append(id)
            else:
                print("Wrong URL")
        return list


def main():
    yt = YouTubeApi()

    parser = argparse.ArgumentParser(add_help=False, description=("Download youtube comments from many videos into txt file"))
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")

    optional.add_argument("--likes", '-l', help="The amount of likes a comment needs to be saved", type=int)
    optional.add_argument("--input", '-i', help="URL list file name")
    optional.add_argument("--output", '-o', help="Output file name")
    optional.add_argument("--help", '-h', help="Help", action='help')

    args = parser.parse_args()
    # --------------------------------------------------------------------- #

    likes = 0
    if args.likes:
        likes = args.likes

    input_file = "URL_list.txt"
    if args.input:
        input_file = args.input

    output_file = "Comments.txt"
    if args.output:
        output_file = args.output

    list = yt.get_video_id_list(input_file)
    if not list:
        exit("No URLs in input file")

    try:
        vid_counter = 0
        with open(output_file, "a") as f:
            for video_id in list:
                vid_counter += 1
                print("Downloading comments for video ", vid_counter, ", id: ", video_id, sep='')
                comments = yt.get_video_comments(video_id, likes)
                if comments:
                    for comment in comments:
                        f.write(comment)
        print('\nDone!')
    except KeyboardInterrupt:
        exit("User Aborted the Operation")
    # --------------------------------------------------------------------- #


if __name__ == '__main__':
    main()
In another thread, it was discovered that Google does not currently permit downloading all the comments on a popular video; however, you would expect it to cut off at the same point each time. Instead, I have found that it can range anywhere between 200k and 1.5 million comments downloaded before it returns a code 400. Is this due to a bug in my code, or is the YouTube API rejecting my request because it is clearly a script? Would adding a time.sleep call help with this?
(I bring forward this answer -- which I prepared for the question above at the time of its initial post -- because my assertions below seem to be confirmed once again by recent SO posts of this very kind.)
Your observations are correct. But, unfortunately, nobody but Google itself is able to provide a sound and complete answer to your question. We -- non-Googlers (like myself!), and even the Googlers themselves (since they all sign NDAs) -- can only guess about the things implied.
Here is my educated guess, based on the investigations I made recently when responding to a very much related question (which you quoted above yourself!):
As you already know, the API uses pagination to return result sets whose cardinality exceeds the internal limit of 50 (or, in some cases, 100) items per API endpoint invocation.
If you log the nextPageToken property that you obtain from CommentThreads.list via your results object, you'll see that those page tokens get bigger and bigger. Each such page token has to be passed on to the next CommentThreads.list call as the pageToken parameter.
The problem is that internally (not specified publicly, not documented) the API has a limit on the sheer length of the HTTP requests it accepts from its callers. (This happens for various reasons, e.g. security.) Therefore, when a given page token is sufficiently long, the HTTP request that the API user issues will exceed that internal limit, producing an internal error. That error surfaces to the API caller as the processingFailure error that you've encountered.
Many questions remain to be answered (e.g. why do the page tokens have unbounded length?), but, again, those questions belong very much to the internal realm of the back-end system behind the API we're using, and they cannot be answered publicly, since they are very much Google's internal business.
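A small sketch of the diagnostic mentioned above: logging the length of each nextPageToken while paginating, so the growth is visible. It reuses the commentThreads endpoint from the question; the VIDEO_ID and API_KEY values are placeholders.

import requests

COMMENTS_URL = 'https://www.googleapis.com/youtube/v3/commentThreads'
params = {'part': 'snippet', 'maxResults': 100, 'videoId': 'VIDEO_ID', 'textFormat': 'plainText', 'key': 'API_KEY'}

page = 0
while True:
    results = requests.get(COMMENTS_URL, params=params).json()
    if 'error' in results:
        print('error after', page, 'pages:', results['error']['errors'][0]['reason'])
        break
    token = results.get('nextPageToken')
    if not token:
        break
    page += 1
    print('page', page, 'nextPageToken length:', len(token))  # watch this grow from page to page
    params['pageToken'] = token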

How to get the Tor ExitNode IP with Python and Stem

I'm trying to get the external IP that Tor uses, as mentioned here. When using something like myip.dnsomatic.com, this is very slow. I tried what was suggested in the aforementioned link (Python + stem to control Tor through the control port), but all you get is the circuits' IPs, with no assurance of which one is the exit node, and sometimes the real IP is not even among the results.
Any help would be appreciated.
Also, from here, at the bottom, Amine suggests a way to renew the identity in Tor. There is a call, controller.get_newnym_wait(), which he uses to wait until the new connection is ready (controller comes from Controller in stem.control). Isn't there anything like that in stem (sorry, I checked and double/triple-checked and couldn't find anything) that tells you that Tor is changing its identity?
You can get the exit node IP without calling a GeoIP site.
This is however on a different Stack Exchange site - https://tor.stackexchange.com/questions/3253/how-do-i-trap-circuit-id-none-errors-in-the-stem-script-exit-used-py
As posted by @mirimir, his code below essentially attaches a stream event listener function, which is then used to get the circuit id, the exit relay's fingerprint, and finally the exit IP address:
#!/usr/bin/python

import functools
import time

from stem import StreamStatus
from stem.control import EventType, Controller


def main():
    print "Tracking requests for tor exits. Press 'enter' to end."
    print

    with Controller.from_port() as controller:
        controller.authenticate()

        stream_listener = functools.partial(stream_event, controller)
        controller.add_event_listener(stream_listener, EventType.STREAM)

        raw_input()  # wait for user to press enter


def stream_event(controller, event):
    if event.status == StreamStatus.SUCCEEDED and event.circ_id:
        circ = controller.get_circuit(event.circ_id)
        exit_fingerprint = circ.path[-1][0]
        exit_relay = controller.get_network_status(exit_fingerprint)
        t = time.localtime()
        print "datetime|%d-%02d-%02d %02d:%02d:%02d" % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec)
        print "website|%s" % (event.target)
        print "exitip|%s" % (exit_relay.address)
        print "exitport|%i" % (exit_relay.or_port)
        print "fingerprint|%s" % exit_relay.fingerprint
        print "nickname|%s" % exit_relay.nickname
        print "locale|%s" % controller.get_info("ip-to-country/%s" % exit_relay.address, 'unknown')
        print


if __name__ == '__main__':
    main()
You can use this code to check the current IP (change the SOCKS_PORT value to yours):
import re

import stem.process
import requesocks

SOCKS_PORT = 9053

tor_process = stem.process.launch_tor()

proxy_address = 'socks5://127.0.0.1:{}'.format(SOCKS_PORT)
proxies = {
    'http': proxy_address,
    'https': proxy_address
}

response = requesocks.get("http://httpbin.org/ip", proxies=proxies)
print re.findall(r'[\d.-]+', response.text)[0]

tor_process.kill()
If you want to use SOCKS you should do:
pip install requests[socks]
Then you can do:
import requests
import json
import stem.process
import stem

SOCKS_PORT = "9999"

tor = stem.process.launch_tor_with_config(
    config={
        'SocksPort': SOCKS_PORT,
    },
    tor_cmd='absolute_path/to/tor.exe',
)

r = requests.Session()

proxies = {
    'http': 'socks5://localhost:' + SOCKS_PORT,
    'https': 'socks5://localhost:' + SOCKS_PORT
}

response = r.get("http://httpbin.org/ip", proxies=proxies)
current_ip = response.json()['origin']
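Regarding the side question about being told when Tor is changing its identity: stem's Controller does expose this. A minimal sketch using is_newnym_available() and get_newnym_wait() around a NEWNYM signal (the control port number and authentication details depend on your torrc):

import time

from stem import Signal
from stem.control import Controller

with Controller.from_port(port=9051) as controller:
    controller.authenticate()  # add password=... if your torrc uses HashedControlPassword

    if not controller.is_newnym_available():
        # Tor rate-limits NEWNYM; wait until a new identity may be requested
        time.sleep(controller.get_newnym_wait())

    controller.signal(Signal.NEWNYM)          # ask Tor to switch to new circuits
    time.sleep(controller.get_newnym_wait())  # wait until the new identity is ready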
