Airflow setting conditional dependency - airflow

Hello, I am trying to set up a conditional dependency in Airflow. In the flow below, my objective is to run print-conf-success only after both print-conf-1 and print-conf-2 have succeeded, and to run print-conf-failure if either of them fails. I set the upstream to the list [print-conf-2, print-conf-1], expecting both tasks to become upstream, but instead the task shows up as downstream of each of them. What is the correct way to set the dependencies so that print-conf-success requires both [print-conf-2, print-conf-1] to succeed, and print-conf-failure runs when either of them fails?
"""Example DAG demonstrating the usage of the PythonOperator."""
import time
from pprint import pprint
from datetime import datetime
from airflow.utils.trigger_rule import TriggerRule
from airflow import DAG
from airflow.operators.python import PythonOperator, PythonVirtualenvOperator
DEFAULT_ARGS = {
    'owner': 'admin',
    'depends_on_past': False,
    'start_date': datetime(2022, 5, 20, 0),
    'retries': 2
}


def print_log(**kwargs):
    print("--------------------")
    print("1, 2, 3")
    print("--------------------")


def print_log_failed(**kwargs):
    print("--------------------")
    print("1, 2, 3, failed")
    print("--------------------")


with DAG(dag_id="test_dag", schedule_interval=None, default_args=DEFAULT_ARGS, max_active_runs=10) as dag:
    log_conf = PythonOperator(
        task_id='print-conf-success',
        provide_context=True,
        python_callable=print_log,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        dag=dag)
    log_conf_failure = PythonOperator(
        task_id='print-conf-failure',
        provide_context=True,
        python_callable=print_log,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        dag=dag)
    log_conf_1 = PythonOperator(
        task_id='print-conf-1',
        provide_context=True,
        python_callable=print_log,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        dag=dag)
    log_conf_2 = PythonOperator(
        task_id='print-conf-2',
        provide_context=True,
        python_callable=print_log,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        dag=dag)
    log_conf_3 = PythonOperator(
        task_id='print-conf-3',
        provide_context=True,
        python_callable=print_log_failed,
        trigger_rule=TriggerRule.ONE_FAILED,
        dag=dag)

    log_conf.set_upstream([log_conf_1, log_conf_2])
    log_conf_failure.set_upstream([log_conf_1, log_conf_2])
    log_conf_3 >> ([log_conf_1, log_conf_2])

I think this is what you are after:
print-conf-1, print-conf-2 and print-conf-3 can each succeed or fail (for demonstration, print-conf-3 in the code below will always fail).
print-conf-failure will be executed only if at least one upstream task has failed.
print-conf-success will be executed only if all upstream tasks are successful.
code:
from datetime import datetime
from airflow.utils.trigger_rule import TriggerRule
from airflow import DAG, AirflowException
from airflow.operators.python import PythonOperator
DEFAULT_ARGS = {
    'owner': 'admin',
    'depends_on_past': False,
    'start_date': datetime(2022, 5, 20, 0),
    'retries': 2
}


def print_log(**kwargs):
    print("--------------------")
    print("1, 2, 3")
    print("--------------------")


def print_log_failed(**kwargs):
    print("--------------------")
    print("1, 2, 3, failed")
    print("--------------------")
    raise AirflowException("failing")


with DAG(dag_id="example_test_dag", schedule_interval=None, default_args=DEFAULT_ARGS, max_active_runs=10) as dag:
    log_conf = PythonOperator(
        task_id='print-conf-success',
        provide_context=True,  # Remove this line if you are on Airflow 2
        python_callable=print_log)
    log_conf_failure = PythonOperator(
        task_id='print-conf-failure',
        provide_context=True,  # Remove this line if you are on Airflow 2
        python_callable=print_log,
        trigger_rule=TriggerRule.ONE_FAILED)
    log_conf_1 = PythonOperator(
        task_id='print-conf-1',
        provide_context=True,  # Remove this line if you are on Airflow 2
        python_callable=print_log)
    log_conf_2 = PythonOperator(
        task_id='print-conf-2',
        provide_context=True,  # Remove this line if you are on Airflow 2
        python_callable=print_log)
    log_conf_3 = PythonOperator(
        task_id='print-conf-3',
        provide_context=True,  # Remove this line if you are on Airflow 2
        python_callable=print_log_failed)

    [log_conf_1, log_conf_2] >> log_conf
    [log_conf_1, log_conf_2, log_conf_3] >> log_conf_failure
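If you want to stay closer to the original layout (no third task, with print-conf-failure watching only print-conf-1 and print-conf-2), here is a minimal sketch of just the relevant pieces, assuming the same print_log callable, the log_conf_1/log_conf_2 tasks, and the DAG context from the code above:

    # Inside the same `with DAG(...) as dag:` block as print-conf-1 / print-conf-2.
    log_conf = PythonOperator(
        task_id='print-conf-success',
        python_callable=print_log)           # default trigger rule is ALL_SUCCESS

    log_conf_failure = PythonOperator(
        task_id='print-conf-failure',
        python_callable=print_log,
        trigger_rule=TriggerRule.ONE_FAILED)

    # Both lists make print-conf-1 and print-conf-2 *upstream* of each handler.
    [log_conf_1, log_conf_2] >> log_conf
    [log_conf_1, log_conf_2] >> log_conf_failure

Note that with ONE_FAILED, print-conf-failure ends up skipped (not failed) when both parents succeed.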

Related

apache airflow ExternalTaskMarker clear another dag's task recursively but task state is None

I'm testing ExternalTaskSensor and ExternalTaskMarker.
ExternalTaskSensor waits until an external DAG's task has finished, and ExternalTaskMarker clears another DAG's task recursively.
https://airflow.apache.org/docs/apache-airflow/stable/howto/operator/external_task_sensor.html
This is my parent DAG:
# parent_dag.py
from datetime import datetime, timedelta
from airflow.sensors.external_task import ExternalTaskMarker
from airflow.operators.bash import BashOperator
from airflow import DAG
default_args = {
    "owner": "admin",
    "retries": 0,
    "depends_on_past": False,
    "retry_delay": timedelta(minutes=2),
}

dag = DAG(
    dag_id='parent_dag',
    default_args=default_args,
    start_date=datetime(2022, 1, 1, 9, 00, 0),
    schedule_interval='@daily',
    catchup=True
)

task_1 = BashOperator(
    task_id='echo_hello',
    bash_command='echo HELLO!!!!',
    dag=dag,
)

task_2 = ExternalTaskMarker(
    task_id='parent_trigger',
    external_dag_id='child_dag',
    external_task_id='receive_call',
    dag=dag
)

task_1 >> task_2
and this is the child DAG:
# child_dag.py
from datetime import datetime, timedelta
from airflow.sensors.external_task import ExternalTaskMarker, ExternalTaskSensor
from airflow.operators.bash import BashOperator
from airflow import DAG
default_args = {
    "owner": "admin",
    "retries": 0,
    "depends_on_past": False,
    "retry_delay": timedelta(minutes=2)
}

dag = DAG(
    dag_id='child_dag',
    default_args=default_args,
    start_date=datetime(2022, 1, 1, 9, 00, 0),
    schedule_interval='@daily',
    catchup=True
)

receive_call = ExternalTaskSensor(
    task_id='receive_call',
    external_dag_id='parent_dag',
    external_task_id='parent_trigger',
    dag=dag
)

task_1 = BashOperator(
    task_id='echo_hello',
    bash_command='echo HELLO!!!!',
    dag=dag
)

receive_call >> task_1
The sensor works, but the marker doesn't work as I expected.
When I clear the parent DAG's task with the ExternalTaskMarker, the child DAG's task changes to the None state. I expected the child DAG to be cleared and rescheduled.
Am I misunderstanding how ExternalTaskSensor and ExternalTaskMarker work?

How to trigger a task in airflow if immediate parent task fails?

What I am mainly aiming for is that restore_denormalized_es_data should only be triggered when the load_denormalized_es_data task fails. If load_denormalized_es_data succeeds, the flow should go straight to END. As it stands, restore runs when archive fails and load is skipped or retrying, so I am getting the wrong behaviour.
Here is the code I am using:
import sys
import os
from datetime import datetime
#import files what u want to import
# Airflow level imports
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator,BranchPythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.utils.trigger_rule import TriggerRule
#Imported all the functions and the code is able to call the functions with ease
# Name of the Dag
DAG_NAME = "DAG"
#Default arguments
default_args = {
    "owner": "Mehul",
    "start_date": datetime.today().strftime("%Y-%m-%d"),
    "provide_context": True
}

# Define the dag object
dag = DAG(
    DAG_NAME,
    default_args=default_args,
    schedule_interval=None
)

archive_denormalized_es_data = PythonOperator(
    task_id="archive_denormalized_es_data",
    python_callable=archive_current_ES_data,
    trigger_rule=TriggerRule.ALL_SUCCESS,
    provide_context=False,
    dag=dag
)

load_denormalized_es_data = PythonOperator(
    task_id="load_denormalized_es_data",
    python_callable=es_load,
    provide_context=False,
    trigger_rule=TriggerRule.ALL_SUCCESS,
    dag=dag
)

restore_denormalized_es_data = PythonOperator(
    task_id="restore_denormalized_es_data",
    python_callable=restore_current_ES_data,
    trigger_rule=TriggerRule.ALL_FAILED,
    provide_context=False,
    dag=dag
)

END = DummyOperator(
    task_id="END",
    trigger_rule=TriggerRule.ALL_SUCCESS,
    dag=dag)

denormalized_data_creation >> archive_denormalized_es_data >> load_denormalized_es_data
load_denormalized_es_data << archive_denormalized_es_data << denormalized_data_creation
load_denormalized_es_data >> restore_denormalized_es_data
restore_denormalized_es_data << load_denormalized_es_data
load_denormalized_es_data >> END
END << load_denormalized_es_data
restore_denormalized_es_data >> END
END << restore_denormalized_es_data
Here is the picture of the pipelines referred above
If I understand correctly, you want to skip the rest of the pipeline if A fails.
ShortCircuitOperator will allow Airflow to short circuit (skip) the rest of the pipeline.
Here is an example that does what you outlined.
from datetime import datetime
from airflow.models import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator, ShortCircuitOperator
from airflow.utils.trigger_rule import TriggerRule
from airflow.utils.state import State
def proceed(**context):
    ti = context['dag_run'].get_task_instance(a.task_id)
    if ti.state == State.FAILED:
        return False
    else:
        return True


dag = DAG(
    dag_id="dag",
    start_date=datetime(2021, 4, 5),
    schedule_interval='@once',
)

with dag:
    a = PythonOperator(
        task_id='archive_denormalized_es_data',
        python_callable=lambda x: 1
    )
    gate = ShortCircuitOperator(
        task_id='gate',
        python_callable=proceed,
        trigger_rule=TriggerRule.ALL_DONE
    )
    b = PythonOperator(
        task_id='load_denormalized_es_data',
        python_callable=lambda: 1
    )
    c = DummyOperator(
        task_id='restore_denormalized_es_data',
        trigger_rule=TriggerRule.ALL_FAILED
    )
    d = DummyOperator(
        task_id='END',
        trigger_rule=TriggerRule.ONE_SUCCESS
    )

    a >> gate >> b >> c
    [b, c] >> d
If archive_denormalized_es_data fails, the rest of the pipeline is skipped, meaning Airflow does not run restore_denormalized_es_data.
If load_denormalized_es_data fails, restore_denormalized_es_data runs and the flow continues to END.
If load_denormalized_es_data succeeds, restore_denormalized_es_data is skipped and the flow continues to END.
Your code is essentially missing the logic to skip when archive_denormalized_es_data fails, which the ShortCircuitOperator takes care of for you.
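One caveat, hedged: in Airflow 2.3+ ShortCircuitOperator accepts an ignore_downstream_trigger_rules argument (default True). With the default, a False result skips every downstream task, even ones with trigger rules like ALL_FAILED or ONE_SUCCESS, which is the behaviour the example above relies on; with False, the skip stops at the first task whose trigger rule could still fire. A minimal sketch, reusing the proceed callable from the example:

    from airflow.operators.python import ShortCircuitOperator
    from airflow.utils.trigger_rule import TriggerRule

    gate = ShortCircuitOperator(
        task_id='gate',
        python_callable=proceed,                # same callable as defined above
        trigger_rule=TriggerRule.ALL_DONE,
        ignore_downstream_trigger_rules=True,   # Airflow 2.3+; set False to respect downstream trigger rules
    )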

How to retry an upstream task?

task a > task b > task c
If C fails I want to retry A. Is this possible? There are a few other tickets which involve subdags, but I would like to just be able to clear A.
I'm hoping to use on_retry_callback in task C but I don't know how to call task A.
There is another question which does this in a subdag, but I am not using subdags.
I'm trying to do this, but it doesn't seem to work:
def callback_for_failures(context):
    print("*** retrying ***")
    if context['task'].upstream_list:
        context['task'].upstream_list[0].clear()
As other comments mentioned, I would use caution to make sure you aren't getting into an endless loop of clearing/retries. But you can call a bash command as part of your on_failure_callback and then specify which tasks you want to clear, and whether you want downstream/upstream tasks cleared, etc.
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta
def clear_upstream_task(context):
    execution_date = context.get("execution_date")
    clear_tasks = BashOperator(
        task_id='clear_tasks',
        bash_command=f'airflow tasks clear -s {execution_date} -t t1 -d -y clear_upstream_task'
    )
    return clear_tasks.execute(context=context)


# Default settings applied to all tasks
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=5)
}

with DAG('clear_upstream_task',
         start_date=datetime(2021, 1, 1),
         max_active_runs=3,
         schedule_interval=timedelta(minutes=5),
         default_args=default_args,
         catchup=False
         ) as dag:
    t0 = DummyOperator(
        task_id='t0'
    )
    t1 = DummyOperator(
        task_id='t1'
    )
    t2 = DummyOperator(
        task_id='t2'
    )
    t3 = BashOperator(
        task_id='t3',
        bash_command='exit 123',
        on_failure_callback=clear_upstream_task
    )

    t0 >> t1 >> t2 >> t3
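If you prefer not to hardcode the DAG id and task id in the callback, here is a hedged variant that derives them from the callback context (context['dag'], context['task'].upstream_task_ids and execution_date are standard context fields). Treat it as a sketch rather than a drop-in, since it clears the failed task's direct upstream task(s) rather than a specific, named task:

    from airflow.operators.bash import BashOperator  # Airflow 2 import path


    def clear_direct_upstream(context):
        # Sketch: clear the failed task's direct upstream task(s) and their
        # downstream (-d), with ids taken from the context instead of hardcoded.
        dag_id = context['dag'].dag_id
        execution_date = context.get('execution_date')
        upstream_regex = '|'.join(context['task'].upstream_task_ids)
        clear_cmd = BashOperator(
            task_id='clear_tasks',
            bash_command=(
                f'airflow tasks clear -s {execution_date} '
                f'-t "{upstream_regex}" -d -y {dag_id}'
            ),
        )
        return clear_cmd.execute(context=context)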

Airflow: Dynamically derive dag_id to be called from another DAG

I am trying to dynamically derive the name of the DAG to be triggered from another DAG. In the following code, the task "trigger_transform_dag" fails to execute. Can you please help me derive the dag_id for the 'trigger_transform_dag' task dynamically?
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'start_date': airflow.utils.dates.days_ago(0),
}
def run_dag(**context):
    file_path = 'ABC'
    context['ti'].xcom_push(key='key1', value=file_path)
    return 1


def check_file_name(**context):
    pulled_value_1 = context['ti'].xcom_pull(task_ids='run_dataflow_template', key='key1')
    if pulled_value_1 == 'ABC':
        push_value = 'sample1'
        return push_value
    else:
        push_value = 'sample2'
        return push_value
    return pulled_value_1
with DAG('sample',
         default_args=default_args,
         schedule_interval='10 * * * *',
         start_date=datetime(2017, 3, 20),
         max_active_runs=1,
         catchup=False) as dag:
    t1 = PythonOperator(
        task_id='run_dataflow_template',
        provide_context=True,
        python_callable=run_dag
    )
    t2 = TriggerDagRunOperator(
        task_id="trigger_transform_dag",
        provide_context=True,
        trigger_dag_id=check_file_name()
    )
    end = DummyOperator(
        trigger_rule='one_success',
        task_id='end')

    t1 >> t2 >> end
I don't know if there is a simpler way, but you can create a custom operator that takes inspiration from the TriggerDagRunOperator (https://github.com/apache/airflow/blob/master/airflow/operators/dagrun_operator.py) and uses the passed callable to get the dag_id.
Something I hacked together really quickly (it can definitely be improved):
from airflow.models import DAG
from airflow.utils.dates import days_ago, timedelta
from airflow.operators.dagrun_operator import TriggerDagRunOperator
import random
import datetime
from typing import Dict, Optional, Union, Callable
from airflow.api.common.experimental.trigger_dag import trigger_dag
from airflow.models import BaseOperator
from airflow.utils import timezone
from airflow.utils.decorators import apply_defaults
class TriggerDagRunWithFuncOperator(BaseOperator):
    """
    Triggers a DAG run for a specified ``dag_id``.

    :param get_dag_name_f: callable that returns the dag_id to trigger
    :type get_dag_name_f: Callable
    :param conf: Configuration for the DAG run
    :type conf: dict
    :param execution_date: Execution date for the dag (templated)
    :type execution_date: str or datetime.datetime
    """

    template_fields = ("execution_date", "conf")
    ui_color = "#ffefeb"

    @apply_defaults
    def __init__(
        self,
        get_dag_name_f: Callable,
        conf: Optional[Dict] = None,
        execution_date: Optional[Union[str, datetime.datetime]] = None,
        *args,
        **kwargs
    ) -> None:
        super().__init__(*args, **kwargs)
        self.conf = conf
        self.get_dag_name_f = get_dag_name_f
        if not isinstance(execution_date, (str, datetime.datetime, type(None))):
            raise TypeError(
                "Expected str or datetime.datetime type for execution_date."
                "Got {}".format(type(execution_date))
            )
        self.execution_date: Optional[datetime.datetime] = execution_date  # type: ignore

    def execute(self, context: Dict):
        if isinstance(self.execution_date, datetime.datetime):
            run_id = "trig__{}".format(self.execution_date.isoformat())
        elif isinstance(self.execution_date, str):
            run_id = "trig__{}".format(self.execution_date)
            self.execution_date = timezone.parse(self.execution_date)  # trigger_dag() expects datetime
        else:
            run_id = "trig__{}".format(timezone.utcnow().isoformat())
        dag_id_to_call = self.get_dag_name_f()
        # Ignore MyPy type for self.execution_date because it doesn't pick up the timezone.parse() for strings
        trigger_dag(
            dag_id=dag_id_to_call,
            run_id=run_id,
            conf=self.conf,
            execution_date=self.execution_date,
            replace_microseconds=False,
        )


args = {
    'owner': 'arocketman',
    'start_date': days_ago(1)
}

dag = DAG(dag_id='dyna_dag', default_args=args, schedule_interval=None)


def your_function():
    return 'my_sample_dag'


with dag:
    run_this_task = TriggerDagRunWithFuncOperator(
        task_id='run_this',
        get_dag_name_f=your_function
    )
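As a hedged alternative: in Airflow 2, trigger_dag_id is listed among TriggerDagRunOperator's template_fields, so you can often skip the custom operator and template the dag_id from an XCom pushed by an upstream task. A minimal sketch (task and DAG names are illustrative):

    from datetime import datetime
    from airflow import DAG
    from airflow.operators.python import PythonOperator
    from airflow.operators.trigger_dagrun import TriggerDagRunOperator


    def pick_dag_id(**context):
        # The return value is pushed to XCom under the 'return_value' key.
        return 'my_sample_dag'


    with DAG(dag_id='dyna_dag_templated', start_date=datetime(2022, 1, 1),
             schedule_interval=None, catchup=False) as dag:
        pick = PythonOperator(task_id='pick_dag_id', python_callable=pick_dag_id)
        trigger = TriggerDagRunOperator(
            task_id='run_this',
            # Rendered at runtime from the upstream task's XCom.
            trigger_dag_id="{{ ti.xcom_pull(task_ids='pick_dag_id') }}",
        )
        pick >> trigger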

Execute single task AFTER dynamically-generated tasks via for-loop

Suppose I have the following DAG (basic placeholder functions), which uses a for-loop to dynamically generate tasks by iterating over a list:
from datetime import datetime, timedelta  # needed for start_date / retry_delay below

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'ETLUSER',
    'depends_on_past': False,
    'start_date': datetime(2019, 12, 16, 0, 0, 0),
    'email': ['xxx@xxx.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('xxx', catchup=False,
          default_args=default_args, schedule_interval='0 */4 * * *')


# Some dummy functions
def StepOne(x):
    print(x)


def StepTwo():
    print("Okay, we finished all of Step 1.")


some_list = [1, 2, 3, 4, 5, 6]

for t in some_list:
    task_id = f'FirstStep_{t}'
    task = PythonOperator(
        task_id=task_id,
        python_callable=StepOne,
        provide_context=False,
        op_kwargs={'x': str(t)},
        dag=dag
    )
    task
I want to introduce some additional task that's simply:
task2 = PythonOperator(
task_id="SecondStep",
python_callable=StepTwo,
provide_context=False,
dag=dag
)
That runs only after all of the first-step tasks have finished. Linearly, this would be task >> task2.
How do I go about doing this?
You can set task dependencies with a list.
Do taskC after both taskA and taskB have finished:
[taskA, taskB] >> taskC
or do taskB and taskC in parallel after taskA has finished:
taskA >> [taskB, taskC]
This works as long as one side of the dependency is not a list (you cannot set a list directly against another list).
Thus, for your example:
task1 = []
for t in some_list:
    task_id = f'FirstStep_{t}'
    task1.append(PythonOperator(
        task_id=task_id,
        python_callable=StepOne,
        provide_context=False,
        op_kwargs={'x': str(t)},
        dag=dag))

task2 = PythonOperator(
    task_id="SecondStep",
    python_callable=StepTwo,
    provide_context=False,
    dag=dag)

task1 >> task2
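If you are on Airflow 2, the same fan-in can also be expressed with a TaskGroup, which keeps the dynamically generated tasks grouped in the UI. A minimal sketch under that assumption (DAG and task names are illustrative):

    from datetime import datetime
    from airflow import DAG
    from airflow.operators.python import PythonOperator
    from airflow.utils.task_group import TaskGroup


    def step_one(x):
        print(x)


    def step_two():
        print("Okay, we finished all of Step 1.")


    with DAG("xxx_taskgroup_sketch", start_date=datetime(2019, 12, 16),
             schedule_interval=None, catchup=False) as dag:
        with TaskGroup(group_id="FirstSteps") as first_steps:
            for t in [1, 2, 3, 4, 5, 6]:
                PythonOperator(
                    task_id=f"FirstStep_{t}",
                    python_callable=step_one,
                    op_kwargs={"x": str(t)},
                )
        second_step = PythonOperator(task_id="SecondStep", python_callable=step_two)
        # Every task in the group must finish before SecondStep starts.
        first_steps >> second_step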
