Airflow: Dynamically generate tasks with TaskFlow API

Previously I used the following snippet to dynamically generate tasks:
dummy_start_task = PythonOperator(
    task_id="dummy_start",
    default_args=default_args,
    python_callable=dummy_start,
    dag=dag
)

make_images_tasks = list()
for n in range(WORKERS):
    globals()[f"make_images_{n}_task"] = PythonOperator(
        task_id=f'make_images_{n}',
        default_args=default_args,
        python_callable=make_images,
        op_kwargs={"n": n},
        dag=dag
    )
    make_images_tasks.append(globals()[f"make_images_{n}_task"])

dummy_collector_task = PythonOperator(
    task_id="dummy_collector",
    default_args=default_args,
    python_callable=dummy_collector,
    dag=dag
)

dummy_start_task >> make_images_tasks >> dummy_collector_task

# in the collector task I would use:
# items = task_instance.xcom_pull(task_ids=[f"make_images_{n}" for n in range(int(WORKERS))])
# to get the XComs from these dynamically generated tasks
How can I achieve that using the TaskFlow API? (Spawn multiple tasks and then get their XComs in the following collector-task)

Here's an example:
from datetime import datetime
from airflow import DAG
from airflow.decorators import task
with DAG(dag_id="example_taskflow", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:

    @task
    def dummy_start_task():
        pass

    tasks = []
    for n in range(3):

        @task(task_id=f"make_images_{n}")
        def images_task(i):
            return i

        tasks.append(images_task(n))

    @task
    def dummy_collector_task(tasks):
        print(tasks)

    dummy_start_task_ = dummy_start_task()
    dummy_start_task_ >> tasks
    dummy_collector_task(tasks)
Which gives the following DAG:
The make_images_* tasks take 0, 1, and 2 as input (and also use the value in their task IDs) and return it. The dummy_collector_task takes all outputs from the make_images_* tasks and prints [0, 1, 2].
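If you are on Airflow 2.3 or newer, Dynamic Task Mapping gives you the same fan-out/fan-in pattern without building task IDs by hand. The following is only an illustrative sketch under that version assumption (the WORKERS constant, dag id, and function names are made up, not taken from the answer above):

from datetime import datetime

from airflow import DAG
from airflow.decorators import task

WORKERS = 3  # illustrative constant

with DAG(dag_id="example_mapped", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:

    @task
    def make_image(n):
        # one mapped task instance is created per element of the input list
        return n

    @task
    def collect(results):
        # receives the XCom values of all mapped instances as a lazy sequence
        print(list(results))

    # expand() maps make_image over the list; collect() receives all results at once
    collect(make_image.expand(n=list(range(WORKERS))))

The mapped instances show up in the UI as indexed instances of a single task rather than as separately named tasks.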

Related

How to enforce max active run = 1 for a group of DAGs in Airflow?

I have a group of DAGs and I only want one of them to run at any given time.
ExternalTaskSensor will not work if I trigger a backfill job for one of them for a very old date.
I am aware of pool and priority weights method.
Another approach could be to make a custom operator and check all the dag runs of all the dags in the group.
Is there any other method to achieve this?
Airflow doesn't support this feature out of the box. Even if you use pools, you would need to put all the tasks from all the dags in the group into the same pool and set the pool slots to 1, which breaks parallelism within each dag.
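For illustration only, this is roughly what that pool-based workaround would look like; the pool name is hypothetical, it has to be created separately with a single slot, and every task in every dag of the group has to reference it:

from datetime import datetime

from airflow import DAG
from airflow.operators.empty import EmptyOperator

GROUP_POOL = "dag_group_pool"  # hypothetical pool, created in the UI/CLI with 1 slot

with DAG(dag_id="dag_1_pooled", start_date=datetime(2023, 1, 1), schedule=None) as dag_1:
    # Only one task across the whole group can hold the single slot at a time,
    # which also serializes tasks that could otherwise run in parallel inside one dag.
    EmptyOperator(task_id="task1", pool=GROUP_POOL, pool_slots=1)
    EmptyOperator(task_id="task2", pool=GROUP_POOL, pool_slots=1)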
You can achieve this by merging the dags into one dag and adding a branch operator that reads a param from the dag_run conf to decide which dag it should run:
import pendulum
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import BranchPythonOperator
from airflow.models.param import Param
from airflow.models import DAG
from airflow.decorators import task
default_args = {}
def dag_1(main_dag: DAG):
    dag_id = "dag_1"
    start_task = EmptyOperator(
        task_id=dag_id,
        dag=main_dag
    )
    task_1 = EmptyOperator(
        task_id=f"{dag_id}.task1",
        dag=main_dag
    )
    task_2 = EmptyOperator(
        task_id=f"{dag_id}.task2",
        dag=main_dag
    )
    start_task >> task_1 >> task_2
    return start_task
def dag_2(main_dag: DAG):
    dag_id = "dag_2"
    start_task = EmptyOperator(
        task_id=dag_id,
        dag=main_dag
    )
    task_1 = EmptyOperator(
        task_id=f"{dag_id}.task1",
        dag=main_dag
    )
    task_2 = EmptyOperator(
        task_id=f"{dag_id}.task2",
        dag=main_dag
    )
    task_3 = EmptyOperator(
        task_id=f"{dag_id}.task3",
        dag=main_dag
    )
    start_task >> [task_1, task_2] >> task_3
    return start_task
with DAG(
    dag_id='multiple_dags',
    default_args=default_args,
    start_date=pendulum.datetime(2023, 1, 1),
    schedule=None,
    max_active_runs=1,
    params={
        "dag_id": Param(default="dag_1", enum=["dag_1", "dag_2"])
    }
) as dag:

    @task.branch(task_id="start_task")
    def branch(**context):
        return context["params"]["dag_id"]

    branch() >> [
        dag_1(dag),
        dag_2(dag)
    ]
For param dag_1 and for param dag_2, the resulting DAG graphs (screenshots omitted) show the corresponding branch being executed.
Then, if you want to run these dags on different schedules, you can create N new dags, each containing a single TriggerDagRunOperator task that triggers the main dag and passes the dag id as a param:
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
def create_trigger_dag(dag_id, schedule):
    with DAG(
        dag_id=dag_id,
        start_date=pendulum.datetime(2023, 1, 1),
        schedule=schedule,
        catchup=False
    ) as dag:
        TriggerDagRunOperator(
            task_id="trigger_dag",
            trigger_dag_id="multiple_dags",
            conf={
                "dag_id": dag_id
            }
        )
    return dag
trigger_dag_1 = create_trigger_dag(dag_id="dag_1", schedule="*/1 * * * *")
trigger_dag_2 = create_trigger_dag(dag_id="dag_2", schedule="*/2 * * * *")
And here is the result (screenshot omitted): two runs of dag_1 for each run of dag_2.

How to use Dynamic Task Mapping with TaskGroups

In my actual DAG, I need to first get a list of IDs and then for each ID run a set of tasks.
I have used Dynamic Task Mapping to pass a list to a single task or operator to have it process the list, but can we do this using a TaskGroup as well?
If I can figure out how to pass a variable value at the TaskGroup level, so it uses that value in all sub tasks, then I should be able to meet my requirement.
The below should give you an idea of what I am looking for; I just need help getting it working.
from airflow import DAG, XComArg
from datetime import datetime
from airflow.decorators import task
from airflow.utils.task_group import TaskGroup
from airflow.operators.python import PythonOperator
with DAG(
    'dtm_tg_test',
    schedule_interval=None,
    start_date=datetime(2022, 1, 1)
) as dag:

    def getList():
        return ["Hello", "World"]

    def printText(text):
        print(text)

    get_list = PythonOperator(
        task_id="get_list",
        python_callable=getList,
        dag=dag
    )

    with TaskGroup.partial(
        group_id="task_group"
    ).expand(
        list=XComArg(get_list)
    ) as task_group:

        print_text = PythonOperator(
            task_id="print_output",
            python_callable=printText,
            op_kwargs={"text": list},
            dag=dag
        )

        print_again = PythonOperator(
            task_id="print_again",
            python_callable=printText,
            op_kwargs={"text": list},
            dag=dag
        )

        print_text >> print_again

    get_list >> task_group
You can achieve it with the following example:
from typing import List

from airflow.decorators import task_group

# printText, PythonOperator, DummyOperator, args and TaskGroup come from the question's DAG file
list_ids = ['45', '48']


@task_group()
def parent_group(list_ids: List[str]) -> List[TaskGroup]:
    return list(map(build_group_for_id, list_ids))


def build_group_for_id(current_id: str) -> TaskGroup:
    with TaskGroup(group_id=f'group_for_id_{current_id}') as group:
        print_text = PythonOperator(
            task_id=f"print_output_{current_id}",
            python_callable=printText,
            op_kwargs={"text": current_id},
            dag=dag
        )
        print_again = PythonOperator(
            task_id=f"print_output_other_{current_id}",
            python_callable=printText,
            op_kwargs={"text": current_id},
            dag=dag
        )
        print_text >> print_again
    return group


with airflow.DAG(
    "my_dag", default_args=args, schedule_interval=None,
) as dag:
    DummyOperator(task_id='start_dag') >> parent_group(list_ids)
Some explanations:
I create a parent TaskGroup called parent_group.
This parent group takes the list of IDs.
I add a loop, and for each parent ID I create a TaskGroup containing your 2 Airflow tasks (print operators).
For the TaskGroup related to a parent ID, the TaskGroup ID is built from it so that it is unique within the DAG.
For the print operators inside the TaskGroup, I again generate the task IDs from the current parent ID.
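Separately from the answer above: Airflow 2.5 added dynamic task-group mapping, so a group declared with the @task_group decorator can itself be expanded over a list. A minimal sketch assuming Airflow 2.5+ (the dag id and task names here are illustrative):

from datetime import datetime

from airflow import DAG
from airflow.decorators import task, task_group

with DAG(dag_id="mapped_task_group_example", start_date=datetime(2022, 1, 1), schedule=None) as dag:

    @task
    def get_ids():
        return ["45", "48"]

    @task
    def print_text(text):
        print(text)

    @task
    def print_again(text):
        print(text)

    @task_group
    def per_id(current_id):
        # both tasks inside the group receive the mapped value
        print_text(current_id) >> print_again(current_id)

    # one group instance is created per element returned by get_ids()
    per_id.expand(current_id=get_ids())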

Run an airflow task after a task in a loop, not after all tasks in a loop

Let's say we have these tasks:
for endpoint in ENDPOINTS:
    latest_only = LatestOnlyOperator(
        task_id=f'{endpoint.name}_latest_only',
    )
    s3 = SnowflakeQOperator(
        task_id=f'{endpoint.name}_to_S3',
        boostr_conn_id='boostr_default',
        s3_conn_id='aws_default',
        partition=endpoint.partition,
        endpoint=endpoint
    )
    short_circuit = ShortCircuitOperator(
        task_id=f"short_circuit_missing_{endpoint.name}",
        op_kwargs={'endpoint_to_check': endpoint, 'aws_conn_id': 'aws_default'},
        python_callable=check_file_exists,
        provide_context=True
    )
    s3 >> short_circuit
and let's say I want to add one task to run after nbc_to_S3, which is the s3 task generated in the loop for the endpoint named 'nbc'.
We're importing ENDPOINTS, which contains several classes, each exposing a name property:
@property
def name(self) -> str:
    return 'nbc'
I've tried to add it outside of the loop like this:
nbc_to_s3 >> new_task
but that doesn't work because nbc_to_s3 is not defined.
You could apply some logic within the loop to set a new dependency for new_task like so (apologies for the quick mockup):
from airflow.decorators import dag
from airflow.operators.dummy import DummyOperator
from datetime import datetime
ENDPOINTS = ["nbc", "cbs", "bravo", "espn"]
DEFAULT_ARGS = dict(owner="airflow", start_date=datetime(2021, 6, 9))
DAG_ARGS = dict(schedule_interval=None, default_args=DEFAULT_ARGS, catchup=False)
@dag(**DAG_ARGS)
def run_task_after_loop():
    for endpoint in ENDPOINTS:
        s3 = DummyOperator(
            task_id=f"{endpoint}_to_S3",
        )
        short_circuit = DummyOperator(
            task_id=f"short_circuit_missing_{endpoint}",
        )
        s3 >> short_circuit
        if endpoint == "nbc":
            new_task = DummyOperator(task_id=f"new_task_{endpoint}")
            s3 >> new_task


dag = run_task_after_loop()
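If you would rather not special-case the endpoint inside the loop, another option (a sketch of an alternative, not part of the answer above) is to look the task up by its task_id after the DAG object has been built, using dag.get_task, and wire the new dependency there; the task and dag names below mirror the mockup above:

from datetime import datetime

from airflow.decorators import dag
from airflow.operators.dummy import DummyOperator

ENDPOINTS = ["nbc", "cbs", "bravo", "espn"]


@dag(schedule_interval=None, start_date=datetime(2021, 6, 9), catchup=False)
def run_task_after_loop_alt():
    for endpoint in ENDPOINTS:
        s3 = DummyOperator(task_id=f"{endpoint}_to_S3")
        short_circuit = DummyOperator(task_id=f"short_circuit_missing_{endpoint}")
        s3 >> short_circuit


dag_obj = run_task_after_loop_alt()

# Look up the already-registered task by id and attach a new downstream task to it.
new_task = DummyOperator(task_id="new_task_nbc", dag=dag_obj)
dag_obj.get_task("nbc_to_S3") >> new_task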

How to trigger a task in airflow if immediate parent task fails?

What I am mainly aiming for is that restore_denormalized_es_data should only get triggered when the load_denormalized_es_data task fails. If load_denormalized_es_data succeeds, the flow should go straight to END. As you can see, right now my restore runs when archive fails and load is skipped or retrying, so I am getting the wrong behaviour.
I have stated the code I am using below:
import sys
import os
from datetime import datetime
# import the files you want to import
# Airflow level imports
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator,BranchPythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.utils.trigger_rule import TriggerRule
#Imported all the functions and the code is able to call the functions with ease
# Name of the Dag
DAG_NAME = "DAG"
#Default arguments
default_args = {
    "owner": "Mehul",
    "start_date": datetime.today().strftime("%Y-%m-%d"),
    "provide_context": True
}

# Define the dag object
dag = DAG(
    DAG_NAME,
    default_args=default_args,
    schedule_interval=None
)

archive_denormalized_es_data = PythonOperator(
    task_id="archive_denormalized_es_data",
    python_callable=archive_current_ES_data,
    trigger_rule=TriggerRule.ALL_SUCCESS,
    provide_context=False,
    dag=dag
)

load_denormalized_es_data = PythonOperator(
    task_id="load_denormalized_es_data",
    python_callable=es_load,
    provide_context=False,
    trigger_rule=TriggerRule.ALL_SUCCESS,
    dag=dag
)

restore_denormalized_es_data = PythonOperator(
    task_id="restore_denormalized_es_data",
    python_callable=restore_current_ES_data,
    trigger_rule=TriggerRule.ALL_FAILED,
    provide_context=False,
    dag=dag
)

END = DummyOperator(
    task_id="END",
    trigger_rule=TriggerRule.ALL_SUCCESS,
    dag=dag)
denormalized_data_creation>>archive_denormalized_es_data>>load_denormalized_es_data
load_denormalized_es_data<<archive_denormalized_es_data<<denormalized_data_creation
load_denormalized_es_data>>restore_denormalized_es_data
restore_denormalized_es_data<<load_denormalized_es_data
load_denormalized_es_data>>END
END<<load_denormalized_es_data
restore_denormalized_es_data>>END
END<<restore_denormalized_es_data
Here is a picture of the pipeline referred to above:
If I understand correctly, you want to skip the rest of the pipeline if A fails.
ShortCircuitOperator will allow Airflow to short circuit (skip) the rest of the pipeline.
Here is an example that does what you outlined.
from datetime import datetime
from airflow.models import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator, ShortCircuitOperator
from airflow.utils.trigger_rule import TriggerRule
from airflow.utils.state import State
def proceed(**context):
    ti = context['dag_run'].get_task_instance(a.task_id)
    if ti.state == State.FAILED:
        return False
    else:
        return True

dag = DAG(
    dag_id="dag",
    start_date=datetime(2021, 4, 5),
    schedule_interval='@once',
)

with dag:
    a = PythonOperator(
        task_id='archive_denormalized_es_data',
        python_callable=lambda: 1
    )
    gate = ShortCircuitOperator(
        task_id='gate',
        python_callable=proceed,
        trigger_rule=TriggerRule.ALL_DONE
    )
    b = PythonOperator(
        task_id='load_denormalized_es_data',
        python_callable=lambda: 1
    )
    c = DummyOperator(
        task_id='restore_denormalized_es_data',
        trigger_rule=TriggerRule.ALL_FAILED
    )
    d = DummyOperator(
        task_id='END',
        trigger_rule=TriggerRule.ONE_SUCCESS
    )
    a >> gate >> b >> c
    [b, c] >> d
If archive_denormalized_es_data fails, the rest of the pipeline is skipped, meaning Airflow does not run restore_denormalized_es_data
If load_denormalized_es_data fails, restore_denormalized_es_data runs and continues to END.
If load_denormalized_es_data succeeds, restore_denormalized_es_data is skipped and continues to END.
Your code is essentially missing the logic to skip the rest of the pipeline when archive_denormalized_es_data fails, which the ShortCircuitOperator takes care of for you.
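If you prefer the TaskFlow API, the same gate can be written with the @task.short_circuit decorator (available from Airflow 2.3); this is a rough sketch under that assumption, reusing the task id from the code above:

from airflow.decorators import task
from airflow.utils.state import State
from airflow.utils.trigger_rule import TriggerRule


@task.short_circuit(trigger_rule=TriggerRule.ALL_DONE)
def gate(**context):
    # Skip everything downstream if the archive task failed.
    ti = context["dag_run"].get_task_instance("archive_denormalized_es_data")
    return ti.state != State.FAILED

It would be used inside the DAG in place of the ShortCircuitOperator above, e.g. a >> gate() >> b.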

create tasks in Airflow by looping over a list and pass arguments

Edit: this now works; I had defined ex_func_airflow(var_1=i), which was causing the issue.
I would like to create tasks in Airflow by looping over a list.
tabs = [1, 2, 3, 4, 5]

for i in tabs:
    task = PythonOperator(
        task_id=name,
        provide_context=False,
        op_args=[i],
        python_callable=ex_func_airflow,
        dag=dag)
    task_0 >> task >> task_1
When this is run in airflow the argument that is passed is always the last element in that list.
So i'm essentially running:
ex_func_airflow(5)
five times instead of running
ex_func_airflow(1)
ex_func_airflow(2)
ex_func_airflow(3)
..etc.
How can I pass the correct arguments for each task?
The following code works for me.
from datetime import datetime

from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator


def print_context(ds, **kwargs):
    print("hello")


def ex_func_airflow(i):
    print(i)


dag = DAG(
    dag_id="loop_dag",
    schedule_interval=None,
    start_date=datetime(2018, 12, 31),
)

task_0 = PythonOperator(
    task_id='task_0',
    provide_context=True,
    python_callable=print_context,
    dag=dag)

task_1 = PythonOperator(
    task_id='task_1',
    provide_context=True,
    python_callable=print_context,
    dag=dag)

tabs = [1, 2, 3, 4, 5]

for i in tabs:
    task_id = f'task_tab_{i}'
    task = PythonOperator(
        task_id=task_id,
        provide_context=False,
        op_args=[i],
        python_callable=ex_func_airflow,
        dag=dag)
    task_0 >> task >> task_1
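For newer Airflow versions the same fan-out can also be written with the TaskFlow API, where each loop iteration binds its own argument to its own task. A minimal sketch assuming Airflow 2.x (the dag id and task names are illustrative):

from datetime import datetime

from airflow import DAG
from airflow.decorators import task

with DAG(dag_id="loop_dag_taskflow", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:

    @task
    def start():
        print("hello")

    @task
    def end():
        print("hello")

    @task
    def process_tab(i):
        print(i)

    start_ = start()
    end_ = end()
    for i in [1, 2, 3, 4, 5]:
        # override() gives each call its own task_id; each call binds the current value of i
        start_ >> process_tab.override(task_id=f"task_tab_{i}")(i) >> end_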
