Airflow TaskFlow - run tasks in parallel - airflow

I wanted to try the new TaskFlow API and came to the point where I need two parallel tasks.
With Airflow v1 I was used to doing something like
task_1 >> [task_2, task_3]
[task_2, task_3] >> task_4
The way we call tasks is different now than with PythonOperator.
How can I build dependency lists like this with TaskFlow?
Thanks

If each task depends on the value returned by the previous task, you can achieve it like this:
from airflow.utils.dates import days_ago
from airflow.decorators import task, dag

@task
def task_1():
    return 'first task'

@task
def task_2(value):
    return 'second task'

@task
def task_3(value):
    return 'third task'

@task
def task_4(value1, value2):
    return 'fourth task'

default_args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}

@dag(dag_id='taskflow_stackoverflow', schedule_interval='@once', default_args=default_args, catchup=False)
def my_dag():
    op_1 = task_1()
    op_2 = task_2(op_1)
    op_3 = task_3(op_1)
    op_4 = task_4(op_2, op_3)

dag = my_dag()
The syntax that you mentioned is also supported, but you won't get direct access to the XCom values from the previous tasks:
@task
def task_1():
    return 'first task'

@task
def task_2():
    return 'second task'

@task
def task_3():
    return 'third task'

@task
def task_4():
    return 'fourth task'

default_args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}

@dag(dag_id='taskflow_stackoverflow', schedule_interval='@once', default_args=default_args, catchup=False)
def my_dag():
    op_1 = task_1()
    op_2 = task_2()
    op_3 = task_3()
    op_4 = task_4()

    op_1 >> [op_2, op_3]
    [op_2, op_3] >> op_4

dag = my_dag()
You will probably need to mix the two syntax options, depending on what you want to achieve.
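A minimal sketch of mixing the two, reusing the tasks defined above plus a hypothetical task_5 that needs no upstream value. The values returned by calling @task-decorated functions support the same >> / << operators shown above, so XCom passing and explicit ordering can coexist:
@task
def task_5():
    # Hypothetical task that needs no value from upstream.
    return 'fifth task'

@dag(dag_id='taskflow_mixed', schedule_interval='@once', default_args=default_args, catchup=False)
def my_mixed_dag():
    op_1 = task_1()
    op_2 = task_2(op_1)   # implicit dependency through the passed XCom value
    op_3 = task_3(op_1)
    op_5 = task_5()       # no value is passed, so declare the ordering explicitly
    [op_2, op_3] >> op_5

mixed_dag = my_mixed_dag()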

Related

Handling TimeOut Exception in AsyncIO

import asyncio
import aiohttp
from time import perf_counter
import csv

path = "*******************"

domains = []
total_count = 0

with open(path, 'r') as file:
    csvreader = csv.reader(file)
    for row in csvreader:
        try:
            website = row[4].split("//")[-1].split("www.")[-1].split('/')[0]
            if website == "":
                continue
            domains.append(website)
        except:
            continue

sample = domains[0:50]

async def fetch(s, body):
    async with s.post('https://****************', json=body) as r:
        if r.status != 200:
            pass
        enrich_response = await r.json()
        # print(enrich_response)
        employees = enrich_response['employees']
        for employee in employees:
            if employee['job_title'] == "Owner":
                print(employee)
                print("************************************************")
                global total_count
                total_count += 1
                print("Total Count:", total_count)
                continue
            elif employee['job_title'] == "CEO":
                print(employee)
                print("***************************************************")
                total_count += 1
                print("Total Count:", total_count)
                continue
            else:
                continue

async def fetch_all(s, bodies):
    tasks = []
    for body in bodies:
        task = asyncio.create_task(fetch(s, body))
        tasks.append(task)
    res = await asyncio.gather(*tasks)
    return res

async def main():
    # apikeys = list(apikeysone.keys.values())
    bodies = []
    for domain in sample:
        body = {
            "api_key": "********************************",
            "domain": "{}".format(domain)
        }
        bodies.append(body)
    async with aiohttp.ClientSession() as session:
        data = await fetch_all(session, bodies)
        print(data[0])

if __name__ == '__main__':
    start = perf_counter()
    try:
        asyncio.run(main())
    except Exception as e:
        print(e)
        pass
    stop = perf_counter()
    print("Time taken:", stop - start)
Hi!
I'm trying to connect to a scraping service provider using asyncio instead of simple synchronous API calls, but I get a timeout error. How could I use exception handling to wait a few seconds before retrying once more, or just skip that task if it fails?
Thank you in advance, fellow coder!
I tried adding continue/pass in a few places.
Try exploring the asyncio.wait_for() function. It takes an awaitable and a timeout value. If the awaitable hasn't completed before the timeout, it raises asyncio.TimeoutError (a plain TimeoutError on Python 3.11+), which you can handle however you want in an except clause.
A typical example (from the Python docs) is as follows:
import asyncio

async def eternity():
    # Sleep for one hour
    await asyncio.sleep(3600)
    print('yay!')

async def main():
    # Wait for at most 1 second
    try:
        await asyncio.wait_for(eternity(), timeout=1.0)
    except TimeoutError:
        print('timeout!')

asyncio.run(main())

# Expected output:
#
#     timeout!
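To cover the retry-or-skip part of the question, here is a minimal sketch (not from the answer above) that wraps the question's fetch coroutine in asyncio.wait_for, waits a few seconds on a timeout, and retries once before skipping; the timeout, retry count, and backoff values are purely illustrative:
import asyncio

async def fetch_with_retry(s, body, timeout=10, retries=1, backoff=3):
    # Try the request; on a timeout, wait `backoff` seconds and retry.
    for attempt in range(retries + 1):
        try:
            return await asyncio.wait_for(fetch(s, body), timeout=timeout)
        except asyncio.TimeoutError:
            if attempt < retries:
                await asyncio.sleep(backoff)
            else:
                return None  # give up and skip this body after the last attempt

async def fetch_all(s, bodies):
    tasks = [asyncio.create_task(fetch_with_retry(s, body)) for body in bodies]
    # return_exceptions=True keeps one failed task from cancelling the rest.
    return await asyncio.gather(*tasks, return_exceptions=True)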

FastAPI, column computers.id does not exist

Here is the code of my main.py in FastAPI:
from typing import List, Union
import datetime

import databases
import sqlalchemy
from fastapi import FastAPI
from pydantic import BaseModel

DATABASE_URL = "postgresql://username:password@localhost/collector"

database = databases.Database(DATABASE_URL)
metadata = sqlalchemy.MetaData()

computers = sqlalchemy.Table(
    "computers",
    metadata,
    sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True, index=True),
    sqlalchemy.Column("computername", sqlalchemy.String),
    sqlalchemy.Column("computerip", sqlalchemy.String),
    sqlalchemy.Column("computerexternalip", sqlalchemy.String),
    sqlalchemy.Column("time", sqlalchemy.DateTime),
)

engine = sqlalchemy.create_engine(
    DATABASE_URL
)
metadata.create_all(engine)

class ComputerBase(BaseModel):
    computername: str
    computerip: str
    computerexternalip: str
    time: str = datetime.datetime

class ComputerIn(ComputerBase):
    pass

class Computer(ComputerBase):
    id: int

    class Config:
        orm_mode = True

app = FastAPI()

@app.on_event("startup")
async def startup():
    await database.connect()

@app.on_event("shutdown")
async def shutdown():
    await database.disconnect()

@app.get("/computers/", response_model=List[Computer])
async def read_computers():
    query = computers.select()
    print(query)
    return await database.fetch_all(query)

@app.post("/computers/", response_model=Computer)
async def create_computer(computer: ComputerIn):
    current_time = datetime.datetime.utcnow
    query = computers.insert().values(computername=computer.computername, computerip=computer.computerip, computerexternalip=computer.computerexternalip, time=current_time)
    last_record_id = await database.execute(query)
    return {**computer.dict(), "id": last_record_id}
When I go to https://localhost:8000/computers, I get this error:
asyncpg.exceptions.UndefinedColumnError: column computers.id does not exist
I don't understand this, since I declare a table named "computers" with an id column at the beginning of my code.
Any idea?
Thank you
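One likely cause to check (an assumption, not a confirmed diagnosis): SQLAlchemy's metadata.create_all() only creates tables that do not already exist, so if a "computers" table was created earlier without the id column, it will not be altered. A minimal sketch for inspecting what the live table actually contains:
import sqlalchemy

engine = sqlalchemy.create_engine("postgresql://username:password@localhost/collector")

# List the columns PostgreSQL actually has for the "computers" table.
inspector = sqlalchemy.inspect(engine)
for column in inspector.get_columns("computers"):
    print(column["name"], column["type"])

# If "id" is missing, add it with an ALTER TABLE / migration,
# or drop and recreate the table so create_all() rebuilds it.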

Why does FastAPI not resolve my dependency?

Handler:
@router.post("/profile/{user_id}/grade", response_model=UserGradeResponseSchema)
@access_control("create", user_grade_acl)
async def new_grade(
    user_id: int,
    grade: UserGradeCreateSchema,
    db: Session = Depends(get_db),
):
    try:
        created_grade = create_grade(db, grade, user_id, reviewer.id)
    except IntegrityError:
        raise ModelNotFoundError("User")
    return created_grade
access_control decorator:
def get_user_principals(user: User = Depends(verify_token)):
    return [
        f"user:{user.id}",
        f"user:{user.is_staff}",
        f"user:{user.is_superuser}",
        Authenticated,
        Everyone
    ]

def access_control(action: str, acl: Union[Callable, List[Tuple[str, str, str]]]):
    def intermediate_wrapper(func: Callable):
        @wraps(func)
        async def wrapper(access=Depends(get_user_principals), *args, **kwargs):
            return await func(*args, **kwargs)
        return wrapper
    return intermediate_wrapper
It doesn't call the get_user_principals dependency. (I tried printing the access value; it prints Depends(get_user_principals).)
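A hedged sketch of one likely cause, assuming standard FastAPI behaviour: FastAPI builds its dependency graph from the signature it gets via inspect.signature(), and functools.wraps sets __wrapped__, so the wrapper's own access parameter is invisible to FastAPI and its default value, the raw Depends(get_user_principals) object, is passed through unchanged. A common workaround is to expose the extra dependency by overriding the wrapper's __signature__; the permission-check body here is a placeholder:
import inspect
from functools import wraps
from fastapi import Depends

def access_control(action, acl):
    def intermediate_wrapper(func):
        @wraps(func)
        async def wrapper(*args, access=Depends(get_user_principals), **kwargs):
            # Permission check against `acl` using `access` would go here.
            return await func(*args, **kwargs)

        # Rebuild the signature FastAPI inspects: the original parameters
        # plus a keyword-only `access` parameter carrying the dependency.
        sig = inspect.signature(func)
        params = list(sig.parameters.values())
        params.append(
            inspect.Parameter(
                "access",
                kind=inspect.Parameter.KEYWORD_ONLY,
                default=Depends(get_user_principals),
            )
        )
        wrapper.__signature__ = sig.replace(parameters=params)
        return wrapper
    return intermediate_wrapper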

Airflow - Use TaskGroup and BranchPythonOperator in the same DAG

I am currently using the Airflow TaskFlow API in Airflow 2.0. I am having an issue combining the use of TaskGroup and BranchPythonOperator.
Below is my code:
import airflow
from airflow.models import DAG
from airflow.decorators import task, dag
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator, PythonOperator
from airflow.operators.python import task, get_current_context
from random import randint
from airflow.utils.task_group import TaskGroup

default_args = {
    'owner': 'Airflow',
    'start_date': airflow.utils.dates.days_ago(2),
}

@task
def dummy_task():
    return {}

@task
def task_b():
    return {}

@task
def task_c():
    return {}

def final_step():
    return {}

def get_tasks(**kwargs):
    task = 'task_a'
    return task

with DAG(dag_id='branch_dag',
         default_args=default_args,
         schedule_interval=None) as dag:

    with TaskGroup('task_a') as task_a:
        obj = dummy_task()

    tasks = BranchPythonOperator(
        task_id='check_api',
        python_callable=get_tasks,
        provide_context=True
    )

    final_step = PythonOperator(
        task_id='final_step',
        python_callable=final_step,
        trigger_rule='one_success'
    )

    b = task_b()
    c = task_c()

    tasks >> task_a >> final_step
    tasks >> b >> final_step
    tasks >> c >> final_step
When I trigger this DAG, I get the below error inside the check_api task:
airflow.exceptions.TaskNotFound: Task task_a not found
Is it possible to get this working, using TaskGroup in conjunction with BranchPythonOperator?
Thanks,
BranchPythonOperator is expected to return task_ids, and tasks created inside a TaskGroup get the group id prefixed to their task_id. You need to change the get_tasks function to:
def get_tasks(**kwargs):
    task = 'task_a.dummy_task'
    return task
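As a small follow-up sketch (an alternative, not from the answer above): you can derive the prefixed id from the TaskGroup itself rather than hard-coding it, so renaming the group does not silently break the branch:
def get_tasks(**kwargs):
    # Tasks inside a TaskGroup are addressed as "<group_id>.<task_id>",
    # and BranchPythonOperator must return those fully qualified ids.
    return f"{task_a.group_id}.dummy_task"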

Query graphite index.json for a specific sub-tree

I'm querying Graphite's index.json to get all the metrics. Is there an option to pass a root metric and get only a sub-tree? Something like:
http://<my.graphite>/metrics/index.json?query="my.metric.subtree"
That is not supported.
What you can do, however, is call /metrics/find recursively (calling it again for each branch encountered).
Something like this:
#!/usr/bin/python
from __future__ import print_function
import requests
import json
import argparse
try:
    from Queue import Queue
except:
    from queue import Queue
from threading import Thread, Lock
import sys
import unicodedata

outLock = Lock()

def output(msg):
    with outLock:
        print(msg)
        sys.stdout.flush()

class Walker(Thread):
    def __init__(self, queue, url, user=None, password=None, seriesFrom=None, depth=None):
        Thread.__init__(self)
        self.queue = queue
        self.url = url
        self.user = user
        self.password = password
        self.seriesFrom = seriesFrom
        self.depth = depth

    def run(self):
        while True:
            branch = self.queue.get()
            try:
                branch[0].encode('ascii')
            except Exception as e:
                with outLock:
                    sys.stderr.write('found branch with invalid characters: ')
                    sys.stderr.write(unicodedata.normalize('NFKD', branch[0]).encode('utf-8', 'xmlcharrefreplace'))
                    sys.stderr.write('\n')
            else:
                if self.depth is not None and branch[1] == self.depth:
                    output(branch[0])
                else:
                    self.walk(branch[0], branch[1])
            self.queue.task_done()

    def walk(self, prefix, depth):
        payload = {
            "query": (prefix + ".*") if prefix else '*',
            "format": "treejson"
        }
        if self.seriesFrom:
            payload['from'] = self.seriesFrom
        auth = None
        if self.user is not None:
            auth = (self.user, self.password)
        r = requests.get(
            self.url + '/metrics/find',
            params=payload,
            auth=auth,
        )
        if r.status_code != 200:
            sys.stderr.write(r.text + '\n')
            raise Exception(
                'Error walking finding series: branch={branch} reason={reason}'
                .format(branch=unicodedata.normalize('NFKD', prefix).encode('ascii', 'replace'), reason=r.reason)
            )
        metrics = r.json()
        for metric in metrics:
            try:
                if metric['leaf']:
                    output(metric['id'])
                else:
                    self.queue.put((metric['id'], depth + 1))
            except Exception as e:
                output(metric)
                raise e

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", help="Graphite URL", required=True)
    parser.add_argument("--prefix", help="Metrics prefix", required=False, default='')
    parser.add_argument("--user", help="Basic Auth username", required=False)
    parser.add_argument("--password", help="Basic Auth password", required=False)
    parser.add_argument("--concurrency", help="concurrency", default=8, required=False, type=int)
    parser.add_argument("--from", dest='seriesFrom', help="only get series that have been active since this time", required=False)
    parser.add_argument("--depth", type=int, help="maximum depth to traverse. If set, the branches at the depth will be printed", required=False)
    args = parser.parse_args()

    url = args.url
    prefix = args.prefix
    user = args.user
    password = args.password
    concurrency = args.concurrency
    seriesFrom = args.seriesFrom
    depth = args.depth

    queue = Queue()
    for x in range(concurrency):
        worker = Walker(queue, url, user, password, seriesFrom, depth)
        worker.daemon = True
        worker.start()

    queue.put((prefix, 0))
    queue.join()
Note: this code comes from: https://github.com/grafana/cloud-graphite-scripts/blob/master/query/walk_metrics.py
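A usage sketch for the original question, using the flags defined above (the host and metric prefix are the question's own placeholders):
python walk_metrics.py --url http://<my.graphite> --prefix my.metric.subtree
This prints every leaf metric under my.metric.subtree, which is effectively the index.json output restricted to that sub-tree.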
