Airflow - Use TaskGroup and BranchPythonOperator in the same DAG

I am currently using the Airflow TaskFlow API (Airflow 2.0) and am having trouble combining TaskGroup with BranchPythonOperator.
Below is my code:
import airflow
from airflow.models import DAG
from airflow.decorators import task, dag
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator, PythonOperator
from airflow.operators.python import task, get_current_context
from random import randint
from airflow.utils.task_group import TaskGroup

default_args = {
    'owner': 'Airflow',
    'start_date': airflow.utils.dates.days_ago(2),
}

@task
def dummy_task():
    return {}

@task
def task_b():
    return {}

@task
def task_c():
    return {}

def final_step():
    return {}

def get_tasks(**kwargs):
    task = 'task_a'
    return task

with DAG(dag_id='branch_dag',
         default_args=default_args,
         schedule_interval=None) as dag:

    with TaskGroup('task_a') as task_a:
        obj = dummy_task()

    tasks = BranchPythonOperator(
        task_id='check_api',
        python_callable=get_tasks,
        provide_context=True
    )

    final_step = PythonOperator(
        task_id='final_step',
        python_callable=final_step,
        trigger_rule='one_success'
    )

    b = task_b()
    c = task_c()

    tasks >> task_a >> final_step
    tasks >> b >> final_step
    tasks >> c >> final_step
When I trigger this DAG, I get the below error inside the check_api task:
airflow.exceptions.TaskNotFound: Task task_a not found
Is it possible to get this working and use TaskGroup in conjunction with BranchPythonOperator?
Thanks,

BranchPythonOperator is expected to return the task_ids of the tasks to follow. Tasks defined inside a TaskGroup get the group id as a prefix, so the task_id of dummy_task is task_a.dummy_task. You need to change the get_tasks function to:
def get_tasks(**kwargs):
    task = 'task_a.dummy_task'
    return task
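For reference, here is a minimal sketch of the whole pattern with that change applied (my own assumption, using Airflow 2.x import paths and the task names from the question); the branch simply returns the group-prefixed task_id:
from airflow.decorators import task
from airflow.models import DAG
from airflow.operators.python import BranchPythonOperator, PythonOperator
from airflow.utils.dates import days_ago
from airflow.utils.task_group import TaskGroup

@task
def dummy_task():
    return {}

@task
def task_b():
    return {}

def final_step():
    return {}

def get_tasks(**kwargs):
    # task_ids inside a TaskGroup are prefixed with '<group_id>.'
    return 'task_a.dummy_task'

with DAG(dag_id='branch_dag_example',
         start_date=days_ago(2),
         schedule_interval=None) as dag:

    with TaskGroup('task_a') as task_a:
        dummy_task()

    branch = BranchPythonOperator(task_id='check_api', python_callable=get_tasks)

    done = PythonOperator(task_id='final_step',
                          python_callable=final_step,
                          trigger_rule='one_success')

    b = task_b()

    branch >> task_a >> done
    branch >> b >> done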

Related

FastAPI, column computers.id does not exist

Here is the code of my main.py in FastAPI:
from typing import List, Union
import datetime

import databases
import sqlalchemy
from fastapi import FastAPI
from pydantic import BaseModel

DATABASE_URL = "postgresql://username:password@localhost/collector"

database = databases.Database(DATABASE_URL)
metadata = sqlalchemy.MetaData()

computers = sqlalchemy.Table(
    "computers",
    metadata,
    sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True, index=True),
    sqlalchemy.Column("computername", sqlalchemy.String),
    sqlalchemy.Column("computerip", sqlalchemy.String),
    sqlalchemy.Column("computerexternalip", sqlalchemy.String),
    sqlalchemy.Column("time", sqlalchemy.DateTime),
)

engine = sqlalchemy.create_engine(
    DATABASE_URL
)
metadata.create_all(engine)

class ComputerBase(BaseModel):
    computername: str
    computerip: str
    computerexternalip: str
    time: str = datetime.datetime

class ComputerIn(ComputerBase):
    pass

class Computer(ComputerBase):
    id: int

    class Config:
        orm_mode = True

app = FastAPI()

@app.on_event("startup")
async def startup():
    await database.connect()

@app.on_event("shutdown")
async def shutdown():
    await database.disconnect()

@app.get("/computers/", response_model=List[Computer])
async def read_computers():
    query = computers.select()
    print(query)
    return await database.fetch_all(query)

@app.post("/computers/", response_model=Computer)
async def create_computer(computer: ComputerIn):
    current_time = datetime.datetime.utcnow
    query = computers.insert().values(computername=computer.computername, computerip=computer.computerip, computerexternalip=computer.computerexternalip, time=current_time)
    last_record_id = await database.execute(query)
    return {**computer.dict(), "id": last_record_id}
When I go on https://localhost:8000/computers, I get this error:
asyncpg.exceptions.UndefinedColumnError: column computers.id does not
exist
This I don't understand, since I declare a table named "computers" with an id column at the beginning of my code.
Any idea ?
Thank you
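One thing worth checking (an assumption on my part, since the question does not show the database state): SQLAlchemy's metadata.create_all() only creates tables that do not yet exist; it never alters an existing table. If a "computers" table was created earlier without an id column, the model and the database will disagree exactly like this. A quick sketch to inspect what actually exists, reusing the DATABASE_URL from the question:
# Hypothetical check: compare the columns that really exist in the database
# with the ones declared on the SQLAlchemy Table.
import sqlalchemy

DATABASE_URL = "postgresql://username:password@localhost/collector"
engine = sqlalchemy.create_engine(DATABASE_URL)

inspector = sqlalchemy.inspect(engine)
print([col["name"] for col in inspector.get_columns("computers")])

# If "id" is missing here, drop (or migrate) the old table and let
# metadata.create_all() recreate it with the current definition.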

Airflow taskflow - run tasks in parallel

Wanting to try the new TaskFlow API, I came to the point where I need two parallel tasks.
With Airflow v1 I used to do something like
task_1 >> [task_2, task_3]
[task_2, task_3] >> task_4
The way we call tasks is different now than with PythonOperator.
How can I express the list with TaskFlow?
Thanks
If each task depends on the value from the previous task, you can achieve it with:
from airflow.utils.dates import days_ago
from airflow.decorators import task, dag

@task
def task_1():
    return 'first task'

@task
def task_2(value):
    return 'second task'

@task
def task_3(value):
    return 'third task'

@task
def task_4(value1, value2):
    return 'fourth task'

default_args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}

@dag(dag_id='taskflow_stackoverflow', schedule_interval='@once', default_args=default_args, catchup=False)
def my_dag():
    op_1 = task_1()
    op_2 = task_2(op_1)
    op_3 = task_3(op_1)
    op_4 = task_4(op_2, op_3)

dag = my_dag()
The syntax that you mentioned is also supported but you won't get direct access to the xcom values from previous tasks:
@task
def task_1():
    return 'first task'

@task
def task_2():
    return 'second task'

@task
def task_3():
    return 'third task'

@task
def task_4():
    return 'fourth task'

default_args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}

@dag(dag_id='taskflow_stackoverflow', schedule_interval='@once', default_args=default_args, catchup=False)
def my_dag():
    op_1 = task_1()
    op_2 = task_2()
    op_3 = task_3()
    op_4 = task_4()

    op_1 >> [op_2, op_3]
    [op_2, op_3] >> op_4

dag = my_dag()
You will probably need to mix the two styles of syntax depending on what you want to achieve, as in the sketch below.
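For example, a minimal sketch (my own addition, not part of the original answer) that mixes the two: task_2 and task_3 receive task_1's return value through XCom, while the fan-in to task_4 is declared with the bitshift operator only:
from airflow.decorators import dag, task
from airflow.utils.dates import days_ago

@task
def task_1():
    return 'first task'

@task
def task_2(value):
    return 'second task'

@task
def task_3(value):
    return 'third task'

@task
def task_4():
    return 'fourth task'

default_args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}

@dag(dag_id='taskflow_mixed_example', schedule_interval='@once',
     default_args=default_args, catchup=False)
def my_mixed_dag():
    op_1 = task_1()
    op_2 = task_2(op_1)        # data dependency via XCom
    op_3 = task_3(op_1)
    [op_2, op_3] >> task_4()   # ordering-only dependency

dag = my_mixed_dag()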

TypeError: register_tortoise() got an unexpected keyword argument 'add_exeption_handlers'

The code was copied from https://testdriven.io/courses/tdd-fastapi/postgres-setup/ but it throws an exception when running with uvicorn:
import os

from fastapi import FastAPI, Depends
from tortoise.contrib.fastapi import register_tortoise

from app.config import get_settings, Settings

app = FastAPI()

register_tortoise(
    app,
    db_url=os.environ.get("DATABASE_URL"),
    modules={"models": ["app.models.tortoise"]},
    generate_schemas=True,
    add_exeption_handlers=True,
)

@app.get("/ping")
async def pong(settings: Settings = Depends(get_settings)):
    return {
        "ping": "pong!",
        "environment": settings.environment,
        "testing": settings.testing
    }
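The error message points at the keyword itself: register_tortoise has no parameter spelled add_exeption_handlers; the keyword in tortoise-orm is add_exception_handlers (note the "c"). A sketch of the corrected call, keeping everything else from the question:
# Corrected call: only the misspelled keyword changes
# (add_exeption_handlers -> add_exception_handlers).
register_tortoise(
    app,
    db_url=os.environ.get("DATABASE_URL"),
    modules={"models": ["app.models.tortoise"]},
    generate_schemas=True,
    add_exception_handlers=True,
)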

TypeError: 'coroutine' object is not subscriptable in python Quart Framework

from quart import Quart, request, render_template, jsonify
import json
import os, sys
import pandas as pd
import requests
import asyncio
from pylon.model.db_models import RawFiles
from pylon.orm import db

app = Quart(__name__)

@app.route('/upload', methods=['POST'])
async def handle_form():
    f = await request.files['filename']
    f.save(f.filename)
    data = pd.read_csv(f.filename)
    data.to_json("json_data.json")
    data = pd.read_json("json_data.json")
    os.remove("json_data.json")
    os.remove(f.filename)
    print(type(data))
    print(data)
    return ""

@app.route("/")
async def index():
    return await render_template('upload.html')

if __name__ == "__main__":
    app.run(host="bheem11.arch.des.co", port=5043, debug=True)
I am getting the error described in the title. I am working with the Quart framework in Python and hoping for a proper solution. The coroutine error occurs when the @app.route('/upload', methods=['POST']) handler executes.
The line await request.files['filename'] should be (await request.files)['filename']. Without the parentheses, everything to the right of await is evaluated first, which results in an attempt to subscript (the ['filename'] operation) the files attribute. This doesn't work because the files attribute returns a coroutine, which is not subscriptable. There is more on this in the Quart documentation.
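A minimal sketch of the corrected handler (assuming the same 'filename' form field as in the question); the await is parenthesised so the files mapping is resolved before it is subscripted:
from quart import Quart, request

app = Quart(__name__)

@app.route('/upload', methods=['POST'])
async def handle_form():
    files = await request.files   # resolve the coroutine first
    f = files['filename']         # then subscript the mapping
    f.save(f.filename)
    return ""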

Tornado performance issue with MySQL and Redis

I have a Tornado server running with MySQL as the DB and Redis as the cache. I am using WebSockets to send/receive data. My code is like this:
Server
import logging
import os.path
import uuid
import sys
import json

import tornadis
import tormysql
import tornado.escape
import tornado.ioloop
import tornado.options
import tornado.web
import tornado.websocket
from tornado import gen
from tornado.concurrent import Future
from tornado.options import define, options

@gen.coroutine
def getFromDB(query):
    with (yield dbPool.Connection()) as conn:
        with conn.cursor() as cursor:
            yield cursor.execute(query)
            datas = cursor.fetchall()
            return datas
    return None

@gen.coroutine
def getFromCache(cmd):
    pipeline = tornadis.Pipeline()
    pipeline.stack_call(cmd)
    with (yield cachePool.connected_client()) as singleClient:
        redisResult = yield singleClient.call(pipeline)
        if isinstance(redisResult, tornadis.TornadisException):
            print("Redis exception: %s"%(redisResult))
        else:
            return redisResult

async def getData(dbQuery, cacheQuery):
    waitDict = {}
    if dbQuery:
        waitDict['db'] = getFromDB(dbQuery)
    if cacheQuery:
        waitDict['cache'] = getFromCache(cacheQuery)

    resultList = []
    if len(waitDict) > 0:
        await gen.multi(waitDict)
        if 'db' in waitDict:
            dbRes = waitDict['db'].result()
            if dbRes:
                for eachResult in dbRes:
                    changeRes = someFunct(eachResult)
                    resultList.append(changeRes)
        if 'cache' in waitDict:
            cacheRes = waitDict['cache'].result()
            if cacheRes:
                for eachResult in cacheRes:
                    changeRes = someFunct(eachResult)
                    resultList.append(changeRes)
    return resultList

class SocketHandler(tornado.websocket.WebSocketHandler):
    SUPPORTED_METHODS = ("GET")

    def open(self):
        print("Socket open:%s"%(self))

    def on_close(self):
        print("Socket closed:%s"%(self))

    async def on_message(self, inp):
        if requestForData:
            ret = await getData(dbQuery, cacheQuery)
            self.write_message(ret)

class Application(tornado.web.Application):
    def __init__(self):
        handlers = [
            (r"/sock", SocketHandler),
        ]

define("port", default=8000, help="run on the given port", type=int)
tornado.options.parse_command_line()
app = Application()
app.listen(options.port)
print("PORT:%s"%(options.port))
tornado.ioloop.IOLoop.current().start()
I am using tornadis for Redis and tormysql for MySQL.
I am running this setup on an Amazon Linux m5.large instance with 2 vCPUs and 8 GiB of memory.
Client
I am trying to simulate the traffic using WebSockets. The code is like this:
import sys
import json
import asyncio
import websockets

async def getData():
    for i in range(100):
        async with websockets.connect(SOCKET_URL, extra_headers=extraHeaders) as websocket:
            for i in range(100):
                await websocket.send("get data")
                reply = await websocket.recv()
                print(reply)

asyncio.get_event_loop().run_until_complete(getData())
I am running multiple instances of the client.
The server runs fine, but it is able to handle only about 25 connections. Beyond 25 connections the delay in the server's replies increases. I want the server to reply very quickly. How do I decrease the response delay? Is there a problem in the code?
