I wanted to try the new TaskFlow API and I've reached the point where I need two parallel tasks.
With Airflow v1 I used to do something like:
task_1 >> [task_2, task_3]
[task_2, task_3] >> task_4
The way we call tasks is different now than with PythonOperator.
How can I express that list dependency with TaskFlow?
Thanks
If each task depends on the value from the previous task, you can achieve it like this:
from airflow.utils.dates import days_ago
from airflow.decorators import task, dag

@task
def task_1():
    return 'first task'

@task
def task_2(value):
    return 'second task'

@task
def task_3(value):
    return 'third task'

@task
def task_4(value1, value2):
    return 'fourth task'

default_args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}

@dag(dag_id='taskflow_stackoverflow', schedule_interval='@once', default_args=default_args, catchup=False)
def my_dag():
    op_1 = task_1()
    op_2 = task_2(op_1)
    op_3 = task_3(op_1)
    op_4 = task_4(op_2, op_3)

dag = my_dag()
The syntax you mentioned is also supported, but you won't get direct access to the XCom values from the previous tasks:
@task
def task_1():
    return 'first task'

@task
def task_2():
    return 'second task'

@task
def task_3():
    return 'third task'

@task
def task_4():
    return 'fourth task'

default_args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}

@dag(dag_id='taskflow_stackoverflow', schedule_interval='@once', default_args=default_args, catchup=False)
def my_dag():
    op_1 = task_1()
    op_2 = task_2()
    op_3 = task_3()
    op_4 = task_4()

    op_1 >> [op_2, op_3]
    [op_2, op_3] >> op_4

dag = my_dag()
You will probably need to mix the two syntax options, depending on what you want to achieve.
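For example, a minimal sketch that mixes both (reusing the tasks and default_args from above; the dag_id here is just a placeholder) could pass XCom values where a task needs them and add a purely structural dependency with the bitshift operator:

@dag(dag_id='taskflow_mixed_example', schedule_interval='@once', default_args=default_args, catchup=False)
def my_mixed_dag():
    op_1 = task_1()
    op_2 = task_2(op_1)       # receives op_1's return value via XCom
    op_3 = task_3(op_1)
    op_4 = task_4(op_2, op_3)

    # ordering-only dependency, no data passed
    op_1 >> op_4

dag = my_mixed_dag()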
import asyncio
import aiohttp
from time import perf_counter
import csv

path = "*******************"

domains = []
total_count = 0

with open(path, 'r') as file:
    csvreader = csv.reader(file)
    for row in csvreader:
        try:
            website = row[4].split("//")[-1].split("www.")[-1].split('/')[0]
            if website == "":
                continue
            domains.append(website)
        except:
            continue

sample = domains[0:50]

async def fetch(s, body):
    async with s.post('https://****************', json=body) as r:
        if r.status != 200:
            pass
        enrich_response = await r.json()
        #print(enrich_response)
        employees = enrich_response['employees']
        for employee in employees:
            if(employee['job_title'] == "Owner"):
                print(employee)
                print("************************************************")
                global total_count
                total_count += 1
                print("Total Count:", total_count)
                continue
            elif(employee['job_title'] == "CEO"):
                print(employee)
                print("***************************************************")
                total_count += 1
                print("Total Count:", total_count)
                continue
            else:
                continue

async def fetch_all(s, bodies):
    tasks = []
    for body in bodies:
        task = asyncio.create_task(fetch(s, body))
        tasks.append(task)
    res = await asyncio.gather(*tasks)
    return res

async def main():
    # apikeys = list(apikeysone.keys.values())
    bodies = []
    for domain in sample:
        body = {
            "api_key": "********************************",
            "domain": "{}".format(domain)
        }
        bodies.append(body)
    async with aiohttp.ClientSession() as session:
        data = await fetch_all(session, bodies)
        print(data[0])

if __name__ == '__main__':
    start = perf_counter()
    try:
        asyncio.run(main())
    except Exception as e:
        print(e)
        pass
    stop = perf_counter()
    print("Time taken:", stop - start)
Hi!
I'm trying to connect to a scraping service provider using asyncio instead of simple synchronous API calls, but I get a timeout error. How could I use exception handling to wait a few seconds before retrying once more, or just skip that task if it fails?
Thank you in advance, fellow coders!
I tried adding continue/pass in a few places.
Try exploring the asyncio.wait_for() function. It takes an awaitable and a timeout value; if the task isn't completed before the timeout, it raises asyncio.exceptions.TimeoutError, which you can handle any way you want in an except clause.
A typical example (from the Python docs) is as follows:
async def eternity():
    # Sleep for one hour
    await asyncio.sleep(3600)
    print('yay!')

async def main():
    # Wait for at most 1 second
    try:
        await asyncio.wait_for(eternity(), timeout=1.0)
    except TimeoutError:
        print('timeout!')

asyncio.run(main())

# Expected output:
#
# timeout!
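Applied to your fetch coroutine, a minimal sketch could look like the following (fetch_with_retry is a hypothetical helper name, and the 5-second timeout, 3-second backoff and single retry are arbitrary choices):

async def fetch_with_retry(s, body, timeout=5.0, backoff=3.0):
    # Try the request at most twice: wait `backoff` seconds after the first
    # timeout, then give up and return None so the other tasks keep going.
    for attempt in range(2):
        try:
            return await asyncio.wait_for(fetch(s, body), timeout=timeout)
        except asyncio.TimeoutError:
            if attempt == 0:
                await asyncio.sleep(backoff)  # wait a few seconds before retrying
    return None  # skip this task if it still times out

In fetch_all you would then schedule fetch_with_retry(s, body) instead of fetch(s, body).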
Here is the code of my main.py in FastAPI:
from typing import List, Union
import datetime

import databases
import sqlalchemy
from fastapi import FastAPI
from pydantic import BaseModel

DATABASE_URL = "postgresql://username:password@localhost/collector"

database = databases.Database(DATABASE_URL)

metadata = sqlalchemy.MetaData()

computers = sqlalchemy.Table(
    "computers",
    metadata,
    sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True, index=True),
    sqlalchemy.Column("computername", sqlalchemy.String),
    sqlalchemy.Column("computerip", sqlalchemy.String),
    sqlalchemy.Column("computerexternalip", sqlalchemy.String),
    sqlalchemy.Column("time", sqlalchemy.DateTime),
)

engine = sqlalchemy.create_engine(
    DATABASE_URL
)
metadata.create_all(engine)

class ComputerBase(BaseModel):
    computername: str
    computerip: str
    computerexternalip: str
    time: str = datetime.datetime

class ComputerIn(ComputerBase):
    pass

class Computer(ComputerBase):
    id: int

    class Config:
        orm_mode = True

app = FastAPI()

@app.on_event("startup")
async def startup():
    await database.connect()

@app.on_event("shutdown")
async def shutdown():
    await database.disconnect()

@app.get("/computers/", response_model=List[Computer])
async def read_computers():
    query = computers.select()
    print(query)
    return await database.fetch_all(query)

@app.post("/computers/", response_model=Computer)
async def create_computer(computer: ComputerIn):
    current_time = datetime.datetime.utcnow
    query = computers.insert().values(computername=computer.computername, computerip=computer.computerip, computerexternalip=computer.computerexternalip, time=current_time)
    last_record_id = await database.execute(query)
    return {**computer.dict(), "id": last_record_id}
When I go to https://localhost:8000/computers, I get this error:
asyncpg.exceptions.UndefinedColumnError: column computers.id does not exist
I don't understand this, since I declare a table named "computers" with an id column at the beginning of my code.
Any idea?
Thank you
handler:
@router.post("/profile/{user_id}/grade", response_model=UserGradeResponseSchema)
@access_control("create", user_grade_acl)
async def new_grade(
    user_id: int,
    grade: UserGradeCreateSchema,
    db: Session = Depends(get_db),
):
    try:
        created_grade = create_grade(db, grade, user_id, reviewer.id)
    except IntegrityError:
        raise ModelNotFoundError("User")
    return created_grade
access_control decorator:
def get_user_principals(user: User = Depends(verify_token)):
    return [
        f"user:{user.id}",
        f"user:{user.is_staff}",
        f"user:{user.is_superuser}",
        Authenticated,
        Everyone
    ]

def access_control(action: str, acl: Union[Callable, List[Tuple[str, str, str]]]):
    def intermediate_wrapper(func: Callable):
        @wraps(func)
        async def wrapper(access=Depends(get_user_principals), *args, **kwargs):
            return await func(*args, **kwargs)
        return wrapper
    return intermediate_wrapper
It doesn't call the get_user_principals dependency (I tried printing the access value, and it outputs Depends(get_user_principals)).
I am currently using the Airflow TaskFlow API on 2.0. I am having an issue combining the use of TaskGroup and BranchPythonOperator.
Below is my code:
import airflow
from airflow.models import DAG
from airflow.decorators import task, dag
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator, PythonOperator
from airflow.operators.python import task, get_current_context
from random import randint
from airflow.utils.task_group import TaskGroup

default_args = {
    'owner': 'Airflow',
    'start_date': airflow.utils.dates.days_ago(2),
}

@task
def dummy_task():
    return {}

@task
def task_b():
    return {}

@task
def task_c():
    return {}

def final_step():
    return {}

def get_tasks(**kwargs):
    task = 'task_a'
    return task

with DAG(dag_id='branch_dag',
         default_args=default_args,
         schedule_interval=None) as dag:

    with TaskGroup('task_a') as task_a:
        obj = dummy_task()

    tasks = BranchPythonOperator(
        task_id='check_api',
        python_callable=get_tasks,
        provide_context=True
    )

    final_step = PythonOperator(
        task_id='final_step',
        python_callable=final_step,
        trigger_rule='one_success'
    )

    b = task_b()
    c = task_c()

    tasks >> task_a >> final_step
    tasks >> b >> final_step
    tasks >> c >> final_step
When I trigger this DAG, I get the below error inside the check_api task:
airflow.exceptions.TaskNotFound: Task task_a not found
Is it possible to get this working, using TaskGroup in conjunction with BranchPythonOperator?
Thanks,
BranchPythonOperator is expected to return task_ids, and tasks defined inside a TaskGroup get the group id as a prefix on their task_id. You need to change the get_tasks function to:
def get_tasks(**kwargs):
    task = 'task_a.dummy_task'
    return task
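If the branch should sometimes skip the group and go straight to one of the standalone tasks instead, the callable can simply return whichever task_id applies. A small sketch, where the randint condition is just a placeholder:

def get_tasks(**kwargs):
    # placeholder condition: branch into the TaskGroup in one case,
    # otherwise run the standalone task_b (ids outside a group keep no prefix)
    if randint(0, 1) == 0:
        return 'task_a.dummy_task'
    return 'task_b'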
I'm querying Graphite's index.json to get all the metrics. Is there an option to pass a root metric and get only a sub-tree? Something like:
http://<my.graphite>/metrics/index.json?query="my.metric.subtree"
That is not supported.
What you can do, however, is call /metrics/find recursively (call it again for each branch encountered).
Something like this:
#!/usr/bin/python

from __future__ import print_function

import requests
import json
import argparse
try:
    from Queue import Queue
except:
    from queue import Queue
from threading import Thread, Lock
import sys
import unicodedata

outLock = Lock()

def output(msg):
    with outLock:
        print(msg)
        sys.stdout.flush()

class Walker(Thread):
    def __init__(self, queue, url, user=None, password=None, seriesFrom=None, depth=None):
        Thread.__init__(self)
        self.queue = queue
        self.url = url
        self.user = user
        self.password = password
        self.seriesFrom = seriesFrom
        self.depth = depth

    def run(self):
        while True:
            branch = self.queue.get()
            try:
                branch[0].encode('ascii')
            except Exception as e:
                with outLock:
                    sys.stderr.write('found branch with invalid characters: ')
                    sys.stderr.write(unicodedata.normalize('NFKD', branch[0]).encode('utf-8', 'xmlcharrefreplace'))
                    sys.stderr.write('\n')
            else:
                if self.depth is not None and branch[1] == self.depth:
                    output(branch[0])
                else:
                    self.walk(branch[0], branch[1])
            self.queue.task_done()

    def walk(self, prefix, depth):
        payload = {
            "query": (prefix + ".*") if prefix else '*',
            "format": "treejson"
        }
        if self.seriesFrom:
            payload['from'] = self.seriesFrom
        auth = None
        if self.user is not None:
            auth = (self.user, self.password)
        r = requests.get(
            self.url + '/metrics/find',
            params=payload,
            auth=auth,
        )
        if r.status_code != 200:
            sys.stderr.write(r.text + '\n')
            raise Exception(
                'Error walking finding series: branch={branch} reason={reason}'
                .format(branch=unicodedata.normalize('NFKD', prefix).encode('ascii', 'replace'), reason=r.reason)
            )
        metrics = r.json()
        for metric in metrics:
            try:
                if metric['leaf']:
                    output(metric['id'])
                else:
                    self.queue.put((metric['id'], depth + 1))
            except Exception as e:
                output(metric)
                raise e

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", help="Graphite URL", required=True)
    parser.add_argument("--prefix", help="Metrics prefix", required=False, default='')
    parser.add_argument("--user", help="Basic Auth username", required=False)
    parser.add_argument("--password", help="Basic Auth password", required=False)
    parser.add_argument("--concurrency", help="concurrency", default=8, required=False, type=int)
    parser.add_argument("--from", dest='seriesFrom', help="only get series that have been active since this time", required=False)
    parser.add_argument("--depth", type=int, help="maximum depth to traverse. If set, the branches at the depth will be printed", required=False)
    args = parser.parse_args()

    url = args.url
    prefix = args.prefix
    user = args.user
    password = args.password
    concurrency = args.concurrency
    seriesFrom = args.seriesFrom
    depth = args.depth

    queue = Queue()
    for x in range(concurrency):
        worker = Walker(queue, url, user, password, seriesFrom, depth)
        worker.daemon = True
        worker.start()

    queue.put((prefix, 0))
    queue.join()
Note: this code comes from: https://github.com/grafana/cloud-graphite-scripts/blob/master/query/walk_metrics.py
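For your case, you could point the script at your Graphite instance with --url and pass the root of the sub-tree you care about as --prefix (for example my.metric.subtree), so only that branch is walked and printed; --depth and --from can limit the traversal further. The flag names come from the argparse definitions above; adapt the invocation to your setup.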