Why can't I handle 300 GET responses with async?

As part of my homework project, I'm working with imdb.com pages.
For one task I need to make 320 GET requests and turn them into BeautifulSoup objects later on.
I'm trying to do that asynchronously, and so far I have this:
import asyncio
import time

import aiohttp


def get_tasks(session, url_links):
    tasks = []
    num = 1  # debugging purposes
    for url in url_links:
        tasks.append(session.get(url, headers={'Accept-Language': 'en', 'X_FORWARDED_FOR': '2.21.184.0'}, ssl=False))
        time.sleep(1)  # avoid 503 status_code
        print(f"Number of responses get_tasks: {num}")  # debugging purposes
        num += 1  # debugging purposes
    return tasks


# Getting response.texts
results = []

async def get_response_texts(url_links):
    async with aiohttp.ClientSession() as session:
        tasks = get_tasks(session, url_links)
        responses = await asyncio.gather(*tasks)
        t1 = time.perf_counter()
        num = 1
        for response in responses:
            results.append(await response.text())
            print(f"{num} responses processed")  # debugging purposes
            num += 1
        t2 = time.perf_counter()
        print(f'Asynchronous execution: Finished in {t2 - t1} seconds\n')

if __name__ == '__main__':
    links = [a list of urls to films as strings]
    asyncio.run(get_response_texts(links))
    print(len(results))
Here is the problem: when I process 100 requests, everything seems fine, but when I make 300, I get asyncio.exceptions.TimeoutError.
Why does that happen, and how can I avoid it and make all 320 requests asynchronously?
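One likely cause (my reading, not something stated in the question) is aiohttp's default ClientTimeout of 300 seconds total per request combined with firing all 320 requests at once: with the default connection limit, many requests sit queued and can hit the five-minute total before they ever get a connection. Below is a minimal sketch of a workaround that caps concurrency with an asyncio.Semaphore and relaxes the session timeout; the names get_all_texts and max_concurrency=20 are illustrative, not from the original code.

import asyncio
import aiohttp

async def fetch(session, url, sem):
    # Limit how many requests are in flight at once so slow responses
    # do not pile up and the server is less likely to answer with 503.
    async with sem:
        async with session.get(url, headers={'Accept-Language': 'en'}, ssl=False) as response:
            return await response.text()

async def get_all_texts(url_links, max_concurrency=20):
    sem = asyncio.Semaphore(max_concurrency)
    timeout = aiohttp.ClientTimeout(total=None)  # lift the default 5-minute total timeout
    async with aiohttp.ClientSession(timeout=timeout) as session:
        return await asyncio.gather(*(fetch(session, url, sem) for url in url_links))

if __name__ == '__main__':
    links = []  # the 320 film urls go here
    texts = asyncio.run(get_all_texts(links))
    print(len(texts))

Throttling with the semaphore also tends to make the blocking time.sleep(1) workaround for 503 responses unnecessary.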

Related

How do I run this program asynchronously?

As a newbie to Python asyncio, I have written a sample cook-and-waiter problem.
import asyncio

async def waiter():
    t1 = asyncio.create_task(cook('indian', 10))
    t2 = asyncio.create_task(cook('chinese', 5))
    t3 = asyncio.create_task(cook('american', 15))
    await t1
    await t2
    await t3

async def cook(name, time):
    print('Preparing {}'.format(name))
    await asyncio.sleep(time)
    print('Prepared {}'.format(name))

asyncio.run(waiter())
ubuntu@ip-172-31-14-144:~$ python3 one.py
Preparing indian
Preparing chinese
Preparing american
Prepared chinese
Prepared indian
Prepared american
ubuntu@ip-172-31-14-144:~$
I understand from one.py above that the waiter takes all the orders and then hands them to the cook to process. To build on that understanding, I thought of making a menu-driven program so that the user can choose.
import asyncio
import aioconsole

menu = {
    'item1': 10,
    'item2': 5,
    'item3': 25,
    'item4': 5
}

queue = asyncio.Queue()
tasks = []

async def cook():
    print('In queue')
    user_option = await queue.get()
    user_option -= 1
    print(user_option)
    print('Preparing {}'.format(list(menu.keys())[user_option-1]))
    await asyncio.sleep(menu[list(menu.keys())[user_option-1]])
    print('Prepared {}'.format(list(menu.keys())[user_option-1]))

async def get_input():
    inp = await aioconsole.ainput('Please enter your desired option\n')
    return int(inp)

async def waiter():
    user_option = 0
    while True:
        count = 1
        print('*'*100)
        print('Hello User..\n')
        print('What would you like to have ??\n')
        for item in menu:
            print('{}. {}'.format(count, item))
            count = count + 1
        try:
            user_option = await asyncio.wait_for(get_input(), timeout=2.0)
        except asyncio.TimeoutError:
            print('TIMEOUT')
        if user_option:
            await queue.put(user_option)
            tasks.append(asyncio.create_task(coro=cook()))
            for i in tasks:
                await i
        else:
            print('In else')
            pass

asyncio.run(waiter())
****************************************************************************************************
Hello User..
What would you like to have ??
1. item1
2. item2
3. item3
4. item4
Please enter your desired option
TIMEOUT
In else
****************************************************************************************************
Hello User..
What would you like to have ??
1. item1
2. item2
3. item3
4. item4
Please enter your desired option
1 -> an option is entered here
In queue
0
Preparing item4 # Item is being prepared, but the intention is this should be happening
Prepared item4 # concurrently, so that other users can place their order
****************************************************************************************************
Hello User..
What would you like to have ??
1. item1
2. item2
3. item3
4. item4
Please enter your desired option
Expectation:
In the second program, when an option is entered, the cook should process it and print to the screen concurrently, so that a user can place another order even while the cook is preparing something.
Problem:
As soon as an option is entered, the waiter function waits for the cook to complete and only then displays the menu again.
Python 3.8.10 is used.
Thanks
import asyncio
import aioconsole

menu = {
    'item1': 10,
    'item2': 5,
    'item3': 25,
    'item4': 5
}

tasks = []

# async def cook(queue):
#     print('In queue')
#     user_option = await queue.get()
#     user_option -= 1
#     print(user_option)
#     print('Preparing {}'.format(list(menu.keys())[user_option-1]))
#     await asyncio.sleep(menu[list(menu.keys())[user_option-1]])
#     print('Prepared {}'.format(list(menu.keys())[user_option-1]))

async def cook(queue):
    while True:
        print('In queue')
        user_option = await queue.get()
        user_option -= 1
        print(user_option)
        print('Preparing {}'.format(list(menu.keys())[user_option-1]))
        await asyncio.sleep(menu[list(menu.keys())[user_option-1]])
        print('Prepared {}'.format(list(menu.keys())[user_option-1]))

async def get_input():
    inp = await aioconsole.ainput('Please enter your desired option\n')
    return int(inp)

async def waiter(queue):
    user_option = 0
    while True:
        count = 1
        print('*'*100)
        print('Hello User..\n')
        print('What would you like to have ??\n')
        for item in menu:
            print('{}. {}'.format(count, item))
            count = count + 1
        try:
            user_option = await asyncio.wait_for(get_input(), timeout=1.0)
            print('You entered {}'.format(user_option))
        except asyncio.TimeoutError:
            pass
        if user_option > 0:
            print('Inserting option into queue {}'.format(user_option))
            await queue.put(user_option)
            user_option = -1
        await asyncio.sleep(3)

async def main():
    queue = asyncio.Queue()
    task1 = asyncio.create_task(waiter(queue))
    task2 = asyncio.create_task(cook(queue))
    await asyncio.gather(task1, task2)

asyncio.run(main())
The waiter can now take orders concurrently, and the cook prints a message when each item is prepared.
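As a side note (my own sketch, not part of the answer above): if several dishes should cook at the same time, the usual extension is to start more than one cook worker reading from the same queue.

import asyncio

async def cook(queue, worker_id):
    # Each worker pulls orders from the shared queue independently,
    # so several dishes can be in preparation at once.
    while True:
        name, prep_time = await queue.get()
        print('Cook {} preparing {}'.format(worker_id, name))
        await asyncio.sleep(prep_time)
        print('Cook {} prepared {}'.format(worker_id, name))
        queue.task_done()

async def main():
    queue = asyncio.Queue()
    workers = [asyncio.create_task(cook(queue, i)) for i in range(3)]
    for name, prep_time in [('indian', 10), ('chinese', 5), ('american', 15)]:
        await queue.put((name, prep_time))
    await queue.join()   # wait until every queued order has been prepared
    for w in workers:
        w.cancel()       # stop the now-idle workers

asyncio.run(main())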

Return list of tasks from function that should be run in sequence in Airflow

I want to return two or more tasks from a function, and have them run in sequence at the spot where they're inserted in the dependency chain; see below.
t1 = PythonOperator()

def generate_tasks():
    t2 = PythonOperator()
    t3 = PythonOperator()
    return magic(t2, t3)  # magic needed here (preferably)

t1 >> generate_tasks()  # otherwise here

# desired result: t1 >> t2 >> t3
Is this doable? As I understand it, Airflow 2.0 achieves this with a TaskGroup, but we're on Google's Cloud Composer, and 2.0 won't be available for a while.
Best workaround I've found:
t1 = PythonOperator()

def generate_tasks():
    t2 = PythonOperator()
    t3 = PythonOperator()
    return [t2, t3]

tasks = generate_tasks()
t1 >> tasks[0] >> tasks[1]
But I'd really like that to be abstracted away, as it more or less defeats the purpose of having multiple operators returned from a single function. We want it to be a single unit as far as the end user knows, even though it can be composed of 2 or more tasks.
How to do it with TaskGroup in Airflow 2.0:
class Encryptor:
    def encrypt_and_archive(self):
        with TaskGroup("archive_and_encrypt") as section_1:
            encrypt = DummyOperator(task_id="encrypt")
            archive = BashOperator(task_id="archive", bash_command='echo 1')
            encrypt >> archive
        return section_1

with DAG(dag_id="example_return_task_group", start_date=days_ago(2), tags=["example"]) as dag:
    start = DummyOperator(task_id="start")
    encrypt_and_archive = Encryptor().encrypt_and_archive()
    end = DummyOperator(task_id='end')
    # 👇 single variable, containing two tasks
    start >> encrypt_and_archive >> end
Which creates the following graph:
Is something similar remotely doable before 2.0?
You didn't explain what magic(t2, t3) is.
TaskGroup is strictly a UI feature; it doesn't affect the DAG logic. From your description it seems you are looking for specific chaining logic (otherwise, what is magic?).
I believe this is what you are after:
default_args = {
    'owner': 'airflow',
    'start_date': datetime(2021, 1, 24),
}

def generate_tasks():
    operator_list = []
    for i in range(5):  # replace with the logic you use to dynamically create tasks
        op = DummyOperator(task_id=f"t{str(i)}_task", dag=dag)
        if i > 0:
            operator_list[i - 1] >> op
        operator_list.append(op)
    return operator_list

with DAG(
    dag_id='loop',
    default_args=default_args,
    schedule_interval=None,
) as dag:
    start_op = DummyOperator(task_id='start_task')
    end_op = DummyOperator(task_id='end_task')
    tasks = generate_tasks()
    start_op >> tasks[0]
    tasks[-1] >> end_op
You can replace the DummyOperator with any operator you'd like.
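If the main goal is to hide the chaining from the caller, another option (an assumption on my part; check your Airflow version) is the chain helper, which in the 1.10 line lives in airflow.utils.helpers and moved to airflow.models.baseoperator in 2.0. The callables below are placeholders.

from airflow.utils.helpers import chain  # airflow.models.baseoperator.chain in 2.x

def generate_tasks():
    t2 = PythonOperator(task_id='t2', python_callable=t2_callable, dag=dag)
    t3 = PythonOperator(task_id='t3', python_callable=t3_callable, dag=dag)
    return [t2, t3]

# chain() wires the sequential dependencies, so the caller never indexes the list:
chain(t1, *generate_tasks())  # equivalent to t1 >> t2 >> t3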

My Discord bot keeps getting disconnected for some reason

I haven't changed my bot in weeks, but for some reason I've been getting an error message like this every day for the past five or so days: https://imgur.com/VCLx2kv
I don't think the error is caused by my code, apart from the whole event-loop thing, which I don't know how to fix and which hasn't caused me any problems before. If you're curious, the part of the code that causes that issue is below.
I already tried regenerating my token.
@client.event
async def dead_check():
    i = 1
    d = datetime.now()
    date = str(d.strftime("%Y-%m-%d"))
    server = client.get_server(id='105388450575859712')
    while i == 1:
        async for message in client.logs_from(discord.Object(id='561667365927124992'), limit=9999999):
            if date in message.content:
                usid = message.content.split('=')
                usid1 = usid[1].split(' ')
                count = message.content.split('#')
                cd = message.content.split('?')
                ev = cd[1]
                if ev == '00':
                    number = 0
                elif ev == '01':
                    number = 1
                elif ev == '10':
                    number = 2
                elif ev == '11':
                    number = 3
                name = count[0]
                await client.send_message(discord.Object(id='339182193911922689'), '#here\n' + name + ' has reached the deadline for the **FRICKLING** program.\nThe user has attended ' + str(number) + ' events.')
        async for message in client.logs_from(discord.Object(id='567328773922619392'), limit=9999):
            if date in message.content and message.reactions:
                usid = message.content.split(' ')
                user = await client.get_user_info(usid[0])
                await client.send_message(discord.Object(id='567771853796540465'), user.mention + ' needs to be paid, if you have already paid him - react with :HYPERS:')
                await client.delete_message(message)
        await asyncio.sleep(60*60*24)

@client.event
async def on_ready():
    await client.change_presence(game=Game(name='with nuclear waste'))
    print('Ready, bitch')

asyncio.get_event_loop().run_until_complete(dead_check())
Have you tried reducing the limit of those logs_from calls? 9999999 is a pretty big number, and it may have slowed things down enough that the heartbeat isn't being sent at the proper times. You should also sanitize that image of the error message; it contains your bot token.
Credit to Patrick Haugh; I wanted to close this thread and he didn't post it as an answer.
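For reference, a minimal sketch of the two changes suggested above, assuming the legacy discord.py 0.16 API the question uses: cap the logs_from limit and schedule dead_check as a background task instead of blocking on run_until_complete, so the client's gateway heartbeat keeps running. The limit of 500, the channel ID and the 'TOKEN' placeholder are illustrative.

async def dead_check():
    await client.wait_until_ready()   # don't touch the API before login completes
    while True:
        # A bounded limit keeps each pass short, so the gateway heartbeat is not starved.
        async for message in client.logs_from(discord.Object(id='561667365927124992'), limit=500):
            ...  # same per-message processing as above
        await asyncio.sleep(60 * 60 * 24)

client.loop.create_task(dead_check())  # runs alongside the bot instead of before it
client.run('TOKEN')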

Will a nested loop help in parsing results?

I am trying to pull information from two different dictionaries. (Excuse me, I am literally hacking my way through this to understand it.)
I have one for loop that gives me the VM name, and another for loop that gives me the other information, like the replication ID.
I may be making a big assumption here, but I'll start with it: I want to interleave for loop 1 and for loop 2 so the results come out as shown below. Is that even possible?
Initial output of for loop 1, which I can get:
vma
vmb
vmc
Initial output of for loop 2, which I can get:
replication job 1
replication job 2
replication job 3
Desired result:
vma
replication job 1
vmb
replication job 2
vmc
replication job 3
def get_replication_job_status():
    sms = boto3.client('sms')
    resp = sms.get_replication_jobs()
    #print(resp)
    things = [(cl['replicationJobId'], cl['serverId']) for cl in
              resp['replicationJobList']]
    thangs = [cl['vmServer'] for cl in resp['replicationJobList']]
    for i in thangs:
        print()
        print("this is vm " + (i['vmName']))
        print("this is the vm location " + (i['vmPath']))
        print("this is the vm address, " + (str(i['vmServerAddress'])))
        for j in things:
            print("The Replication ID is : " + (str(j[0])))
Again, I want:
vma
replication job 1
vmb
replication job 2
vmc
replication job 3
I am getting:
vma
replication job 1
replication job 2
replication job 3
vmb
replication job 1
replication job 2
replication job 3
..
..
..
If you are sure that both of your lists have the same length, then what you need is Python's built-in zip function:
for thing, thang in zip(things, thangs):
    print()
    print(thing)
    print(thang)
But if one of the lists is longer than the other, zip will truncate both to the length of the shorter one, for example:
>>> for i, j in zip(range(3), range(5)):
...     print(i, j)
...
0 0
1 1
2 2
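If the lists can differ in length and dropping the extra items is not acceptable, itertools.zip_longest pads the shorter one instead (a side note, not something the answer above relies on):

from itertools import zip_longest

things = ['job-1', 'job-2', 'job-3']
thangs = ['vma', 'vmb']

# Missing entries are filled with fillvalue instead of being dropped.
for thing, thang in zip_longest(things, thangs, fillvalue='<missing>'):
    print(thing, thang)
# job-1 vma
# job-2 vmb
# job-3 <missing>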
UPD:
You can also unpack the tuples right in the for loop definition, so each item (they are 2-tuples) in the things list gets saved to two variables:
for (replicationJobId, serverId), thang in zip(things, thangs):
    print()
    print(replicationJobId)
    print(serverId)
    print(thang)
UPD 2:
Why do you split resp into two lists?
def get_replication_job_status():
    sms = boto3.client('sms')
    resp = sms.get_replication_jobs()
    #print(resp)
    for replication_job in resp['replicationJobList']:
        vm_server = replication_job['vmServer']
        print()
        print("this is vm:", vm_server['vmName'])
        print("this is the vm location:", vm_server['vmPath'])
        print("this is the vm address:", vm_server['vmServerAddress'])
        print("The Replication ID is :", replication_job['replicationJobId'])

Dask losing workers over time

Here is an MCVE to demonstrate losing workers over time. This is a follow-up to
Distributing graphs across cluster nodes
The example is not quite minimal but it does give an idea of our typical work patterns. The sleep is necessary to cause the problem. This occurs in the full application because of the need to generate a large graph from previous results.
When I run this on a cluster, I use dask-ssh to get 32 workers over 8 nodes:
dask-ssh --nprocs 4 --nthreads 1 --scheduler-port 8786 --log-directory `pwd` --hostfile hostfile.$JOBID &
sleep 10
It should run in less than about 10 minutes with the full set of workers. I follow the execution on the diagnostics screen. Under events, I see the workers being added, but then I sometimes (though not always) see a number of workers removed, usually leaving only those on the node hosting the scheduler.
""" Test to illustrate losing workers under dask/distributed.
This mimics the overall structure and workload of our processing.
Tim Cornwell 9 Sept 2017
realtimcornwell#gmail.com
"""
import numpy
from dask import delayed
from distributed import Client
# Make some randomly located points on 2D plane
def init_sparse(n, margin=0.1):
numpy.random.seed(8753193)
return numpy.array([numpy.random.uniform(margin, 1.0 - margin, n),
numpy.random.uniform(margin, 1.0 - margin, n)]).reshape([n, 2])
# Put the points onto a grid and FFT, skip to save time
def grid_data(sparse_data, shape, skip=100):
grid = numpy.zeros(shape, dtype='complex')
loc = numpy.round(shape * sparse_data).astype('int')
for i in range(0, sparse_data.shape[0], skip):
grid[loc[i,:]] = 1.0
return numpy.fft.fft(grid).real
# Accumulate all psfs into one psf
def accumulate(psf_list):
lpsf = 0.0 * psf_list[0]
for p in psf_list:
lpsf += p
return lpsf
if __name__ == '__main__':
import sys
import time
start=time.time()
# Process nchunks each of length len_chunk 2d points, making a psf of size shape
len_chunk = int(1e6)
nchunks = 16
shape=[512, 512]
skip = 100
# We pass in the scheduler from the invoking script
if len(sys.argv) > 1:
scheduler = sys.argv[1]
client = Client(scheduler)
else:
client = Client()
print("On initialisation", client)
sparse_graph = [delayed(init_sparse)(len_chunk) for i in range(nchunks)]
sparse_graph = client.compute(sparse_graph, sync=True)
print("After first sparse_graph", client)
xfr_graph = [delayed(grid_data)(s, shape=shape, skip=skip) for s in sparse_graph]
xfr = client.compute(xfr_graph, sync=True)
print("After xfr", client)
tsleep = 120.0
print("Sleeping now for %.1f seconds" % tsleep)
time.sleep(tsleep)
print("After sleep", client)
sparse_graph = [delayed(init_sparse)(len_chunk) for i in range(nchunks)]
# sparse_graph = client.compute(sparse_graph, sync=True)
xfr_graph = [delayed(grid_data)(s, shape=shape, skip=skip) for s in sparse_graph]
psf_graph = delayed(accumulate)(xfr_graph)
psf = client.compute(psf_graph, sync=True)
print("*** Successfully reached end in %.1f seconds ***" % (time.time() - start))
print(numpy.max(psf))
print("After psf", client)
client.shutdown()
exit()
Grep'ing a typical run for Client shows:
On initialisation <Client: scheduler='tcp://sand-8-17:8786' processes=16 cores=16>
After first sparse_graph <Client: scheduler='tcp://sand-8-17:8786' processes=16 cores=16>
After xfr <Client: scheduler='tcp://sand-8-17:8786' processes=16 cores=16>
After sleep <Client: scheduler='tcp://sand-8-17:8786' processes=4 cores=4>
After psf <Client: scheduler='tcp://sand-8-17:8786' processes=4 cores=4>
Thanks,
Tim
It's not quite clear why this works, but it did. We were using dask-ssh but needed more control over the creation of the workers. Eventually we settled on:
scheduler=$(head -1 hostfile.$JOBID)
hostIndex=0
for host in `cat hostfile.$JOBID`; do
    echo "Working on $host ...."
    if [ "$hostIndex" = "0" ]; then
        echo "run dask-scheduler"
        ssh $host dask-scheduler --port=8786 &
        sleep 5
    fi
    echo "run dask-worker"
    ssh $host dask-worker --host ${host} --nprocs NUMBER_PROCS_PER_NODE \
        --nthreads NUMBER_THREADS \
        --memory-limit 0.25 --local-directory /tmp $scheduler:8786 &
    sleep 1
    hostIndex="1"
done
echo "Scheduler and workers now running"
