Airflow skipping subdags' tasks during scheduled run

I have a DAG that runs well when triggered manually from the Web UI, but on the scheduled daily run all subdags are marked as success after 60 seconds and all the tasks inside those subdags are skipped.
Why are the tasks skipped on a scheduled run?
MainDag:
with models.DAG(
    "MainDag",
    schedule_interval='@daily',
    start_date=dates.days_ago(0),
    user_defined_macros=TEMPLATE_ENV,
) as dag:
    prepare_factory = SubDagOperator(
        task_id="prepare_factory",
        trigger_rule="one_success",
        subdag=subdag_prepare_factory.sub_dag(
            dag.dag_id, "prepare_factory", dag.start_date, dag.schedule_interval
        ),
    )
SubDag:
def sub_dag(parent_dag_name, child_dag_name, start_date, schedule_interval):
    with models.DAG(
        "{0}.{1}".format(parent_dag_name, child_dag_name),
        schedule_interval=schedule_interval,
        start_date=start_date,
        user_defined_macros=TEMPLATE_ENV,
    ) as dag:
        # HOOKS
        hook = _sshHook.getSshHook()
        # Tasks
        step_1 = ssh_operator.SSHOperator(
            task_id="step_1",
            ssh_hook=hook,
            command="script.sh",
        )
    return dag

The problem was with the start_date. Using dates.days_ago was creating a mismatch between the main dag and the subdags at midnight.
I'm now using a static date with datetime; since the subdag factory receives dag.start_date, the subdags pick up the same static date.
Example with a static date:
with models.DAG(
    "MainDag",
    schedule_interval='30 0 * * *',
    start_date=datetime(2021, 4, 7),
    user_defined_macros=TEMPLATE_ENV,
) as dag:

Related

Airflow v2.4.2 - New monthly DAG not running when scheduled

I have the following in the dag.py file. This is a newly pushed-to-prod DAG; it should have run at 14 UTC (9 EST) a few hours ago, but it still hasn't run, even though the UI still says it will run at 14 UTC.
DAG_NAME = "revenue_monthly"
START_DATE = datetime(2023, 1, 12)
SCHEDULE_INTERVAL = "0 14 3 * *"
default_args = {
'owner': 'airflow',
'start_date': START_DATE,
'depends_on_past': False
}
dag = DAG(DAG_NAME,
default_args=default_args,
schedule_interval=SCHEDULE_INTERVAL,
doc_md=doc_md,
max_active_runs=1,
catchup=False,
)
See picture below of the UI:
The date and time you are seeing as Next Run is the logical_date, which is the start of the data interval. With the current configuration, the first DAG run will cover data from 2023-02-03 to 2023-03-03, so the DAG will only actually run on 2023-03-03 (the Run After date; you can see it when viewing the DAG by hovering over the schedule in the upper right corner).
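To make the interval math concrete, here is a small sketch of the dates involved (values taken from the explanation above; pendulum is the date library Airflow uses internally):

from pendulum import datetime

# Cron "0 14 3 * *" fires at 14:00 UTC on the 3rd of every month.
# The first tick after start_date (2023-01-12) is 2023-02-03 14:00.
data_interval_start = datetime(2023, 2, 3, 14, tz="UTC")  # the logical_date shown as "Next Run"
data_interval_end = datetime(2023, 3, 3, 14, tz="UTC")    # the "Run After" date
# A run is only scheduled once its data interval has fully passed, i.e. at
# data_interval_end, which is why nothing ran at 14 UTC on 2023-02-03.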
Assuming you want the DAG to do the run it would have done on 2023-02-03 (today), you can achieve that by backfilling one run, either manually or by using catchup=True with a start_date before 2023-01-03:
from airflow import DAG
from pendulum import datetime
from airflow.operators.empty import EmptyOperator

DAG_NAME = "revenue_monthly_1"
START_DATE = datetime(2023, 1, 1)
SCHEDULE_INTERVAL = "0 14 3 * *"
doc_md = "documentation"

default_args = {
    'owner': 'airflow',
    'start_date': START_DATE,
    'depends_on_past': False
}

with DAG(
    DAG_NAME,
    default_args=default_args,
    schedule_interval=SCHEDULE_INTERVAL,
    doc_md=doc_md,
    max_active_runs=1,
    catchup=True,
) as dag:
    t1 = EmptyOperator(task_id="t1")
This gave me one run with the run id scheduled__2023-01-03T14:00:00+00:00, and the next run covers the data interval 2023-02-03 to 2023-03-03, which will run after 2023-03-03.
This guide might help with the terminology Airflow uses around schedules.
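If you would rather backfill the missed run manually instead of turning on catchup, a one-off CLI backfill should also work (a sketch, assuming Airflow 2.x; the date range brackets the missed logical date 2023-01-03T14:00):

airflow dags backfill --start-date 2023-01-03 --end-date 2023-01-04 revenue_monthly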

How to enforce max active run = 1 for a group of DAGs in Airflow?

I have a group of DAGs and I only want one of them to run at any given time.
ExternalTaskSensor will not work if I trigger a backfill job for one of them for a very old date.
I am aware of the pools and priority-weights methods.
Another approach could be to make a custom operator and check all the dag runs of all the dags in the group.
Is there any other method to achieve this?
Airflow doesn't support this feature. Even if you use pools, you need to use the same pool for all the tasks from all the dags in the group and set the pool slots to 1, which breaks parallelism.
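For reference, that pool workaround would look something like the sketch below; dag_group_pool is a hypothetical pool name you would create with a single slot via the UI or airflow pools set:

from datetime import datetime
from airflow import DAG
from airflow.operators.python import PythonOperator

# Every task in every dag of the group points at the same single-slot pool,
# so at most one task across the whole group runs at a time. This serializes
# individual tasks, not just dag runs, which is why it breaks parallelism.
with DAG(dag_id="group_member_dag", start_date=datetime(2023, 1, 1), schedule=None) as dag:
    some_task = PythonOperator(
        task_id="some_task",
        python_callable=lambda: None,
        pool="dag_group_pool",  # hypothetical single-slot pool
    )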
Instead, you can achieve this by merging the dags into one dag and adding a branch operator that reads a param from the dag_run conf to decide which dag it should run:
import pendulum
from airflow.operators.empty import EmptyOperator
from airflow.models.param import Param
from airflow.models import DAG
from airflow.decorators import task

default_args = {}

def dag_1(main_dag: DAG):
    dag_id = "dag_1"
    start_task = EmptyOperator(
        task_id=dag_id,
        dag=main_dag
    )
    task_1 = EmptyOperator(
        task_id=f"{dag_id}.task1",
        dag=main_dag
    )
    task_2 = EmptyOperator(
        task_id=f"{dag_id}.task2",
        dag=main_dag
    )
    start_task >> task_1 >> task_2
    return start_task

def dag_2(main_dag: DAG):
    dag_id = "dag_2"
    start_task = EmptyOperator(
        task_id=dag_id,
        dag=main_dag
    )
    task_1 = EmptyOperator(
        task_id=f"{dag_id}.task1",
        dag=main_dag
    )
    task_2 = EmptyOperator(
        task_id=f"{dag_id}.task2",
        dag=main_dag
    )
    task_3 = EmptyOperator(
        task_id=f"{dag_id}.task3",
        dag=main_dag
    )
    start_task >> [task_1, task_2] >> task_3
    return start_task

with DAG(
    dag_id='multiple_dags',
    default_args=default_args,
    start_date=pendulum.datetime(2023, 1, 1),
    schedule=None,
    max_active_runs=1,
    params={
        "dag_id": Param(default="dag_1", enum=["dag_1", "dag_2"])
    }
) as dag:
    @task.branch(task_id="start_task")
    def branch(**context):
        return context["params"]["dag_id"]

    branch() >> [
        dag_1(dag),
        dag_2(dag)
    ]
(Screenshots of the graph view for param dag_1 and for param dag_2 omitted.)
Then, if you want to run these dags on different schedules, you can create N new dags, each containing a single TriggerDagRunOperator task that triggers the main dag and passes the dag id as a param:
from airflow.operators.trigger_dagrun import TriggerDagRunOperator

def create_trigger_dag(dag_id, schedule):
    with DAG(
        dag_id=dag_id,
        start_date=pendulum.datetime(2023, 1, 1),
        schedule=schedule,
        catchup=False
    ) as dag:
        TriggerDagRunOperator(
            task_id="trigger_dag",
            trigger_dag_id="multiple_dags",
            conf={
                "dag_id": dag_id
            }
        )
    return dag

trigger_dag_1 = create_trigger_dag(dag_id="dag_1", schedule="*/1 * * * *")
trigger_dag_2 = create_trigger_dag(dag_id="dag_2", schedule="*/2 * * * *")
And here is the result: 2 runs from dag_1 for each run from dag_2.

Airflow: Dynamically generate tasks with TaskFlow API

Previously I used the following snippet to dynamically generate tasks:
dummy_start_task = PythonOperator(
    task_id="dummy_start",
    default_args=default_args,
    python_callable=dummy_start,
    dag=dag
)

make_images_tasks = list()
for n in range(WORKERS):
    globals()[f"make_images_{n}_task"] = PythonOperator(
        task_id=f'make_images_{n}',
        default_args=default_args,
        python_callable=make_images,
        op_kwargs={"n": n},
        dag=dag
    )
    make_images_tasks.append(globals()[f"make_images_{n}_task"])

dummy_collector_task = PythonOperator(
    task_id="dummy_collector",
    default_args=default_args,
    python_callable=dummy_collector,
    dag=dag
)

dummy_start_task >> make_images_tasks >> dummy_collector_task

# in collector_task I would use:
# items = task_instance.xcom_pull(task_ids=[f"make_images_{n}" for n in range(int(WORKERS))])
# to get the XComs from these dynamically generated tasks
How can I achieve that using the TaskFlow API? (Spawn multiple tasks and then get their XComs in the following collector-task)
Here's an example:
from datetime import datetime
from airflow import DAG
from airflow.decorators import task

with DAG(dag_id="example_taskflow", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:
    @task
    def dummy_start_task():
        pass

    tasks = []
    for n in range(3):
        @task(task_id=f"make_images_{n}")
        def images_task(i):
            return i
        tasks.append(images_task(n))

    @task
    def dummy_collector_task(tasks):
        print(tasks)

    dummy_start_task_ = dummy_start_task()
    dummy_start_task_ >> tasks
    dummy_collector_task(tasks)
Which gives the following DAG:
The make_images_* tasks take 0, 1, and 2 as input (and also use it in the task ids) and return the value. The dummy_collector_task takes all outputs from the make_images_* tasks and prints [0, 1, 2].
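As a side note (not part of the original answer): on Airflow 2.3+ the same pattern can be written with dynamic task mapping via .expand(), which creates the parallel tasks at runtime instead of in a parse-time loop. A minimal sketch:

from datetime import datetime
from airflow import DAG
from airflow.decorators import task

with DAG(dag_id="example_mapping", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:
    @task
    def images_task(i):
        return i

    @task
    def collector(results):
        # results is a lazy sequence of the mapped tasks' XComs
        print(list(results))  # [0, 1, 2]

    collector(images_task.expand(i=[0, 1, 2]))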

How to schedule the airflow DAG to run just after the end of the previous running DAG?

I have a simple DAG with 2 PythonOperators and a schedule interval of 2 minutes:
with DAG(dag_id='example_cron', schedule_interval='*/2 * * * *', start_date=days_ago(2)) as dag:
    def task1_func(ti):
        print("start task 1")
        time.sleep(random.randint(0, 70))
        print("end task 1")

    def task2_func(ti):
        print("start task 2")
        time.sleep(random.randint(0, 70))
        print("end task 2")

    task1 = PythonOperator(task_id='task1', python_callable=task1_func, provide_context=True)
    task2 = PythonOperator(task_id='task2', python_callable=task2_func, provide_context=True)
    task1 >> task2
The DAG can take more than 2 minutes to run, which means more than one DAG run may execute in parallel.
How can I configure the DAG so that a run starts only if the previous run has finished?
You simply need to add max_active_runs=1 to your DAG object.
with DAG(..., max_active_runs=1) as dag:
Not part of your question, but please note that days_ago(2) is deprecated, and in any case you should not use dynamic dates for start_date (see the docs).
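Applied to the DAG from the question, the fix would look something like this sketch (the static start_date replaces the deprecated days_ago(2), and catchup=False is an added assumption to avoid backfilling every 2-minute interval since that date):

from datetime import datetime
from airflow import DAG

with DAG(
    dag_id='example_cron',
    schedule_interval='*/2 * * * *',
    start_date=datetime(2021, 1, 1),  # static date instead of days_ago(2)
    max_active_runs=1,                # a new run starts only after the previous one finishes
    catchup=False,                    # assumption: skip backfilling old intervals
) as dag:
    ...  # task1 >> task2 as in the question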

Airflow Scheduling: how to run initial setup task only once?

If my DAG is this
[setup] -> [processing-task] -> [end].
How can I schedule this DAG to run periodically, while running the [setup] task only once (on the first scheduled run) and skipping it on all later runs?
Check out this post on Medium, which describes how to implement a "run once" operator. I have successfully used this several times.
Here is a way to do it without need to create a new class. I found this simpler than the accepted answer and it worked well for my use case.
Might be useful for others!
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import BranchPythonOperator

with DAG(
    dag_id='your_dag_id',
    default_args={
        'depends_on_past': False,
        'email': ['you@email.com'],
        'email_on_failure': True,
        'email_on_retry': False,
        'retries': 1,
        'retry_delay': timedelta(minutes=5),
    },
    description='Dag with initial setup task that only runs on start_date',
    start_date=datetime(2000, 1, 1),
    # Runs daily at 1 am
    schedule_interval='0 1 * * *',
    # catchup must be true if start_date is before datetime.now()
    catchup=True,
    max_active_runs=1,
) as dag:
    def branch_fn(**kwargs):
        # Have to make sure start_date will equal data_interval_start on first run.
        # This dag is daily, but since the schedule_interval is set to 1 am,
        # data_interval_start would be 2000-01-01 01:00:00
        # when it needs to be 2000-01-01 00:00:00
        date = kwargs['data_interval_start'].replace(hour=0, minute=0, second=0, microsecond=0)
        if date == dag.start_date:
            return 'initial_task'
        else:
            return 'skip_initial_task'

    branch_task = BranchPythonOperator(
        task_id='branch_task',
        python_callable=branch_fn,
        provide_context=True
    )

    initial_task = DummyOperator(
        task_id="initial_task"
    )

    skip_initial_task = DummyOperator(
        task_id="skip_initial_task"
    )

    next_task = DummyOperator(
        task_id="next_task",
        # This is important, otherwise next_task would be skipped
        trigger_rule="one_success"
    )

    branch_task >> [initial_task, skip_initial_task] >> next_task
