Airflow sla_miss_callback function not triggering - airflow

I have been trying to get a slack message callback to trigger on SLA misses. I've noticed that:
SLA misses get registered successfully in the Airflow web UI at
slamiss/list/
on_failure_callback works successfully
However, the sla_miss_callback function itself will never get triggered.
What I've tried:
Different combinations adding sla and sla_miss_callback at the
default_args level, the DAG level, and the task level
Checking logs on our scheduler and workers for SLA related messages (see also here), but we haven't seen anything
The slack message callback function works if called from any other
basic task or function
# Defaults applied to every task of the DAG unless a task overrides them.
default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": airflow.utils.dates.days_ago(n=0, minute=1),
    "on_failure_callback": send_task_failed_msg_to_slack,
    "sla": timedelta(minutes=1),
    "retries": 0,
    "pool": "canary",
    "priority_weight": 1,
}

# The SLA-miss callback is handed to the DAG itself, not to default_args.
dag = airflow.DAG(
    dag_id="sla_test",
    default_args=default_args,
    sla_miss_callback=send_sla_miss_message_to_slack,
    schedule_interval="*/5 * * * *",
    catchup=False,
    max_active_runs=1,
    dagrun_timeout=timedelta(minutes=5),
)
def sleep():
    """Sleep for 90 seconds, then log how long the pause lasted.

    The previous docstring and log line claimed two minutes while the code
    slept for 90 seconds; the duration is kept and the text now matches it.
    """
    seconds = 90
    time.sleep(seconds)
    LOGGER.info("Slept for %d seconds", seconds)
def simple_print(**context):
    """Print a fixed greeting.

    :param context: Keyword arguments supplied by the caller; accepted and
        ignored.
    """
    print("Hello World!")
# Wrap the two callables in operators. The name ``sleep`` is rebound from the
# function to the operator once the function has been handed over.
sleep = PythonOperator(task_id="sleep", python_callable=sleep, dag=dag)
simple_task = PythonOperator(
    task_id="simple_task",
    python_callable=simple_print,
    provide_context=True,
    dag=dag,
)

# ``sleep`` runs first, ``simple_task`` afterwards.
sleep.set_downstream(simple_task)

I was in similar situation once.
On investigating the scheduler log, I found the following error:
[2020-07-08 09:14:32,781] {scheduler_job.py:534} INFO - --------------> ABOUT TO CALL SLA MISS CALL BACK
[2020-07-08 09:14:32,781] {scheduler_job.py:541} ERROR - Could not call sla_miss_callback for DAG
sla_miss_alert() takes 1 positional arguments but 5 were given
The problem is that your sla_miss_callback function is expecting only 1 argument, but actually this should be like:
def sla_miss_alert(dag, task_list, blocking_task_list, slas, blocking_tis):
    """Function that alerts me that dag_id missed sla.

    The callback is invoked with five positional arguments, so all five
    must be present in the signature even when they are not used.
    """
    # <function code here>
For reference, checkout the Airflow source code.
Note: Don't put sla_miss_callback=sla_miss_alert in default_args. It should be defined in the DAG definition itself.

Example of using SLA missed and Execution Timeout alerts:
At first, you'll get SLA missed after 2 minutes task run,
and then, after 4 minutes task will fail with Execution Timeout alert.
"sla": timedelta(minutes=2), # Default Task SLA time
"execution_timeout": timedelta(minutes=4), # Default Task Execution Timeout
Also, you have log_url right in the message, so you can easily open task log in Airflow.
Example Slack Message
import time
from datetime import datetime, timedelta
from textwrap import dedent
from typing import Any, Dict, List, Optional, Tuple
from airflow import AirflowException
from airflow.contrib.operators.slack_webhook_operator import SlackWebhookOperator
from airflow.exceptions import AirflowTaskTimeout
from airflow.hooks.base_hook import BaseHook
from airflow.models import DAG, TaskInstance
from airflow.operators.python_operator import PythonOperator
SLACK_STATUS_TASK_FAILED = ":red_circle: Task Failed"
SLACK_STATUS_EXECUTION_TIMEOUT = ":alert: Task Failed by Execution Timeout."
def send_slack_alert_sla_miss(
    dag: DAG,
    task_list: str,
    blocking_task_list: str,
    slas: List[Tuple],
    blocking_tis: List[TaskInstance],
) -> None:
    """Send `SLA missed` alert to Slack.

    Details are read from the first blocking task instance. When
    ``blocking_tis`` is empty, a shorter message built from the other
    arguments is sent instead of failing with ``IndexError``.

    :param dag: DAG whose SLA was missed.
    :param task_list: Text listing the tasks that missed their SLA.
    :param blocking_task_list: Text listing the blocking task instances.
    :param slas: SLA-miss records (not used here).
    :param blocking_tis: Blocking task instances.
        NOTE(review): presumably empty when every task has already
        finished by the time the SLA check runs - confirm.
    """
    if blocking_tis:
        task_instance: TaskInstance = blocking_tis[0]
        template = """
            :warning: Task SLA missed.
            *DAG*: {dag_id}
            *Task*: {task_id}
            *Execution Time*: {execution_time} UTC
            *SLA Time*: {sla}
            _* Time by which the job is expected to succeed_
            *Task State*: `{state}`
            *Blocking Task List*: {blocking_task_list}
            *Log URL*: {log_url}
            """
        fields = {
            "dag_id": dag.dag_id,
            "task_id": task_instance.task_id,
            "execution_time": task_instance.execution_date.strftime(
                "%Y-%m-%d %H:%M:%S"
            ),
            "sla": task_instance.task.sla,
            "state": task_instance.state,
            "blocking_task_list": blocking_task_list,
            "log_url": task_instance.log_url,
        }
    else:
        template = """
            :warning: Task SLA missed.
            *DAG*: {dag_id}
            *Task List*: {task_list}
            *Blocking Task List*: {blocking_task_list}
            """
        fields = {
            "dag_id": dag.dag_id,
            "task_list": task_list,
            "blocking_task_list": blocking_task_list,
        }

    # Dedent the template *before* substituting values: a multi-line value
    # would otherwise leave ``dedent`` without a common indent to strip.
    send_slack_alert(message=dedent(template).format(**fields))
def send_slack_alert_task_failed(context: Dict[str, Any]) -> None:
    """Send `Task Failed` notification to Slack.

    The headline switches to the execution-timeout status when the failure
    was caused by ``AirflowTaskTimeout``.

    :param context: Callback context; ``task_instance``, ``exception`` and
        ``execution_date`` are read from it.
    """
    task_instance: TaskInstance = context.get("task_instance")
    exception: AirflowException = context.get("exception")

    status = (
        SLACK_STATUS_EXECUTION_TIMEOUT
        if isinstance(exception, AirflowTaskTimeout)
        else SLACK_STATUS_TASK_FAILED
    )

    # NOTE(review): duration is presumably unset when a task fails before it
    # starts; fall back to zero rather than crash inside the callback.
    duration = timedelta(seconds=round(task_instance.duration or 0))

    # Prepare formatted Slack message. The template is dedented before the
    # values are inserted so a multi-line exception text cannot defeat it.
    template = """
        {status}
        *DAG*: {dag_id}
        *Task*: {task_id}
        *Execution Time*: {execution_time} UTC
        *SLA Time*: {sla}
        _* Time by which the job is expected to succeed_
        *Execution Timeout*: {execution_timeout}
        _** Max time allowed for the execution of this task instance_
        *Task Duration*: {duration}
        *Task State*: `{state}`
        *Exception*: {exception}
        *Log URL*: {log_url}
        """
    message = dedent(template).format(
        status=status,
        dag_id=task_instance.dag_id,
        task_id=task_instance.task_id,
        execution_time=context.get("execution_date").to_datetime_string(),
        sla=task_instance.task.sla,
        execution_timeout=task_instance.task.execution_timeout,
        duration=duration,
        state=task_instance.state,
        exception=exception,
        log_url=task_instance.log_url,
    )
    send_slack_alert(
        message=message,
        context=context,
    )
def send_slack_alert(
    message: str,
    context: Optional[Dict[str, Any]] = None,
) -> None:
    """Send prepared message to Slack.

    The webhook token is read from the password field of the ``slack``
    connection, and the message is posted by executing a
    ``SlackWebhookOperator`` directly.

    :param message: Text to post.
    :param context: Context forwarded to the operator's ``execute``; may be
        ``None``.
    """
    slack_webhook_token = BaseHook.get_connection("slack").password
    notification = SlackWebhookOperator(
        task_id="slack_notification",
        http_conn_id="slack",
        webhook_token=slack_webhook_token,
        message=message,
        username="airflow",
    )
    notification.execute(context)
# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_args = {
    "owner": "airflow",
    # Was "test#test,com": the "@" had been garbled and a comma typed for
    # the dot, so the failure e-mail had no valid recipient.
    "email": ["test@test.com"],
    "email_on_failure": True,
    "depends_on_past": False,
    "retry_delay": timedelta(minutes=5),
    "sla": timedelta(minutes=2),  # Default Task SLA time
    "execution_timeout": timedelta(minutes=4),  # Default Task Execution Timeout
    "on_failure_callback": send_slack_alert_task_failed,
}

with DAG(
    dag_id="test_sla",
    schedule_interval="*/5 * * * *",
    start_date=datetime(2021, 1, 11),
    default_args=default_args,
    sla_miss_callback=send_slack_alert_sla_miss,  # Must be set here, not in default_args!
) as dag:
    delay_python_task = PythonOperator(
        task_id="delay_five_minutes_python_task",
        # MIKE MILLIGAN ADDED THIS: task-level SLA, same value as the default
        sla=timedelta(minutes=2),
        # Sleeps longer than both the SLA and the execution timeout.
        python_callable=lambda: time.sleep(300),
    )

It seems that the only way to make the sla_miss_callback work is by explicitly passing the arguments that it needs... nothing else has worked for me, and these arguments ('dag', 'task_list', 'blocking_task_list', 'slas', and 'blocking_tis') are not being sent to the callback at all.
TypeError: print_sla_miss() missing 5 required positional arguments: 'dag', 'task_list', 'blocking_task_list', 'slas', and 'blocking_tis'

A lot of these answers are 90% complete so I wanted to share my example using bash operators which combined what I found from all of the responses above and other resources
The most important things being how you define sla_miss_callback in the dag definition and not in the default_args, and not passing context to the sla function.
"""
A simple example showing the basics of using a custom SLA notification response.
"""
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import timedelta, datetime
from airflow.operators.slack_operator import SlackAPIPostOperator
from slack import slack_attachment
from airflow.hooks.base_hook import BaseHook
import urllib
# slack alert for sla_miss
def slack_sla_miss(dag, task_list, blocking_task_list, slas, blocking_tis):
    """Post a Slack message describing the first SLA miss in ``slas``.

    :param dag: Passed by the caller; not used.
    :param task_list: Passed by the caller; not used.
    :param blocking_task_list: Passed by the caller; not used.
    :param slas: SLA-miss records; ``dag_id``, ``task_id`` and
        ``execution_date`` are read from the first one.
    :param blocking_tis: Passed by the caller; not used.
    :return: Whatever the Slack operator's ``execute`` returns.
    """
    # A bare ``import urllib`` does not guarantee the ``parse`` submodule is
    # loaded, so import the function that is actually needed.
    from urllib.parse import quote_plus

    first_miss = slas[0]
    dag_id = first_miss.dag_id
    task_id = first_miss.task_id
    execution_date = first_miss.execution_date.isoformat()

    base_url = 'webserver_url_here'
    encoded_execution_date = quote_plus(execution_date)
    dag_url = (f'{base_url}/graph?dag_id={dag_id}'
               f'&execution_date={encoded_execution_date}')

    message = (f':alert: *Airflow SLA Miss*'
               f'\n\n'
               f'*DAG:* {dag_id}\n'
               f'*Task:* {task_id}\n'
               f'*Execution Date:* {execution_date}'
               f'\n\n'
               f'<{dag_url}|Click here to view DAG>')

    sla_miss_alert = SlackAPIPostOperator(
        task_id='slack_sla_miss',
        channel='airflow-alerts-test',
        token=str(BaseHook.get_connection("slack").password),
        text=message,
    )
    return sla_miss_alert.execute()
# slack alert for successful task completion
def slack_success_task(context):
    """Post a fixed "Test successful" message to Slack.

    :param context: Callback context, forwarded to the operator's
        ``execute``.
    :return: Whatever the Slack operator's ``execute`` returns.
    """
    success_alert = SlackAPIPostOperator(
        task_id='slack_success',
        channel='airflow-alerts-test',
        token=str(BaseHook.get_connection("slack").password),
        text="Test successful",
    )
    return success_alert.execute(context=context)
# Task-level defaults shared by every operator in this DAG.
default_args = {
    "depends_on_past": False,
    "start_date": datetime(2020, 11, 18),
    "retries": 0,
}

# Create a basic DAG with our args.
# Note: sla_miss_callback belongs to the DAG definition itself; it must not
# be placed in default_args.
dag = DAG(
    dag_id="sla_slack_v6",
    default_args=default_args,
    sla_miss_callback=slack_sla_miss,
    catchup=False,
    # A common interval to make the job fire when we run it
    schedule_interval=timedelta(minutes=3),
)
# Add a task that will always fail the SLA.
# ``provide_context`` was dropped from all three tasks: it is a
# PythonOperator option and is not a valid BashOperator argument.
t1 = BashOperator(
    task_id='timeout_test_sla_miss',
    # Sleep 60 seconds to guarantee we miss the SLA
    bash_command='sleep 60',
    # Do not retry so the SLA miss fires after the first execution
    retries=0,
    # on_success_callback=slack_success_task,
    # Set our task up with a 10 second SLA
    sla=timedelta(seconds=10),
    dag=dag,
)

t2 = BashOperator(
    task_id='timeout_test_sla_miss_task_2',
    # Sleep 30 seconds to guarantee we miss the SLA of 20 seconds set in this task
    bash_command='sleep 30',
    # Do not retry so the SLA miss fires after the first execution
    retries=0,
    # on_success_callback=slack_success_task,
    # Set our task up with a 20 second SLA
    sla=timedelta(seconds=20),
    dag=dag,
)

t3 = BashOperator(
    task_id='timeout_test_sla_miss_task_3',
    # Sleep 60 seconds to guarantee we miss the SLA
    bash_command='sleep 60',
    # Do not retry so the SLA miss fires after the first execution
    retries=0,
    # on_success_callback=slack_success_task,
    # Set our task up with a 30 second SLA
    sla=timedelta(seconds=30),
    dag=dag,
)

t1 >> t2 >> t3

I think the airflow documentation is a bit fuzzy on this.
Instead of the method signature as
def slack_sla_miss(dag, task_list, blocking_task_list, slas, blocking_tis)
Modify your signature like this
def slack_sla_miss(*args, **kwargs)
This way all the parameters get passed. You will not get the errors which you are seeing in the logs.
Learnt this on url - https://www.cloudwalker.io/2020/12/15/airflow-sla-management/

I had the same issue, but was able to get it working with this code:
import logging as log
import airflow
import time
from datetime import timedelta
# The DAG object; we'll need this to instantiate a DAG
from airflow import DAG
# Operators; we need this to operate!
from airflow.operators.python_operator import PythonOperator
from airflow import configuration
import urllib
from airflow.operators.slack_operator import SlackAPIPostOperator
def sleep():
    """Sleep for 2 minutes, then log that the pause is over."""
    time.sleep(60 * 2)
    log.info("Slept for 2 minutes")
def simple_print(**context):
    """Print a fixed greeting.

    :param context: Keyword arguments supplied by the caller; accepted and
        ignored.
    """
    print("Hello World!")
def slack_on_sla_miss(dag,
                      task_list,
                      blocking_task_list,
                      slas,
                      blocking_tis):
    """Post a Slack message describing the first SLA miss in ``slas``.

    The link in the message is built from the webserver ``BASE_URL``
    configuration value.

    :param dag: Passed by the caller; not used.
    :param task_list: Passed by the caller; not used.
    :param blocking_task_list: Passed by the caller; not used.
    :param slas: SLA-miss records; ``dag_id``, ``task_id`` and
        ``execution_date`` are read from the first one.
    :param blocking_tis: Passed by the caller; not used.
    """
    # A bare ``import urllib`` does not guarantee the ``parse`` submodule is
    # loaded, so import the function that is actually needed.
    from urllib.parse import quote_plus

    log.info('Running slack_on_sla_miss')
    slack_conn_id = 'slack_default'
    slack_channel = '#general'

    first_miss = slas[0]
    dag_id = first_miss.dag_id
    task_id = first_miss.task_id
    execution_date = first_miss.execution_date.isoformat()

    base_url = configuration.get('webserver', 'BASE_URL')
    encoded_execution_date = quote_plus(execution_date)
    dag_url = (f'{base_url}/graph?dag_id={dag_id}'
               f'&execution_date={encoded_execution_date}')

    message = (f':o: *Airflow SLA Miss*'
               f'\n\n'
               f'*DAG:* {dag_id}\n'
               f'*Task:* {task_id}\n'
               f'*Execution Date:* {execution_date}'
               f'\n\n'
               f'<{dag_url}|Click here to view>')

    slack_op = SlackAPIPostOperator(task_id='slack_failed',
                                    slack_conn_id=slack_conn_id,
                                    channel=slack_channel,
                                    text=message)
    slack_op.execute()
# Defaults applied to every task unless overridden on the task itself.
default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    'start_date': airflow.utils.dates.days_ago(n=0, minute=1),
    "retries": 0,
    'priority_weight': 1,
}

# The SLA-miss callback is registered on the DAG, not in default_args.
dag = DAG(
    dag_id='sla_test',
    default_args=default_args,
    sla_miss_callback=slack_on_sla_miss,
    schedule_interval='*/5 * * * *',
    catchup=False,
    max_active_runs=1,
)

with dag:
    # The name ``sleep`` is rebound from the function to the operator here.
    sleep = PythonOperator(
        task_id="sleep",
        python_callable=sleep,
    )
    # Only this task carries an SLA (one minute).
    simple_task = PythonOperator(
        task_id="simple_task",
        python_callable=simple_print,
        provide_context=True,
        sla=timedelta(minutes=1),
    )
    sleep >> simple_task

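I've run into this issue myself. Unlike the on_failure_callback that is looking for a python callable function, it appears that sla_miss_callback needs the full function call. (Note: writing the call inline, as in sla_miss_alert(dag_id), runs the function while the DAG file is being parsed and hands its return value to sla_miss_callback, so the alert fires at parse time rather than when an SLA is actually missed.)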
An example that is working for me:
def sla_miss_alert(dag_id):
    """
    Function that alerts me that dag_id missed sla
    """
    # <function code here>
def task_failure_alert(dag_id, context):
    """
    Function that alerts me that a task failed
    """
    # <function code here>
dag_id = 'sla_test'

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    'start_date': airflow.utils.dates.days_ago(n=0, minute=1),
    # Bind dag_id now; the context is supplied when the callback is invoked.
    'on_failure_callback': partial(task_failure_alert, dag_id),
    'sla': timedelta(minutes=1),
    "retries": 0,
    "pool": 'canary',
    'priority_weight': 1,
}

dag = airflow.DAG(
    dag_id='sla_test',
    default_args=default_args,
    # Pass a callable, not the result of a call. ``sla_miss_alert(dag_id)``
    # ran the alert while this file was being parsed and registered its
    # return value (None) as the callback. The wrapper defers the call and
    # absorbs the five positional arguments passed to the callback.
    sla_miss_callback=lambda *args, **kwargs: sla_miss_alert(dag_id),
    schedule_interval='*/5 * * * *',
    catchup=False,
    max_active_runs=1,
    dagrun_timeout=timedelta(minutes=5),
)
As far as I can tell, sla_miss_callback doesn't have access to context, which is unfortunate. Once I stopped looking for the context, I finally got my alerts.

Related

View on_failure_callback DAG logger

Let's take an example DAG.
Here is the code for it.
import logging
from airflow import DAG
from datetime import datetime, timedelta
from airflow.models import TaskInstance
from airflow.operators.python import PythonOperator
from airflow.operators.dummy import DummyOperator
def task_failure_notification_alert(context):
    """Log the context received by the task-level failure callback.

    :param context: Callback context; logged in its string form.
    """
    logging.info("Task context details: %s", str(context))
def dag_failure_notification_alert(context):
    """Log the context received by the DAG-level failure callback.

    :param context: Callback context; logged in its string form.
    """
    logging.info("DAG context details: %s", str(context))
def red_exception_task(ti: TaskInstance, **kwargs):
    """Always fail, so that the failure callbacks are exercised.

    :param ti: Task instance supplied by the caller; not used.
    :param kwargs: Remaining keyword arguments; not used.
    :raises Exception: Always, with the message ``red``.
    """
    raise Exception('red')
default_args = {
    "owner": "analytics",
    "start_date": datetime(2021, 12, 12),
    'retries': 0,
    'retry_delay': timedelta(),
}

dag = DAG(
    'logger_dag',
    default_args=default_args,
    # Moved out of default_args, where it only reaches operators and never
    # the DAG. "#daily" was a garbled "@daily" preset.
    schedule_interval="@daily",
    catchup=False,
    on_failure_callback=dag_failure_notification_alert,
)

start_task = DummyOperator(
    task_id="start_task",
    dag=dag,
    on_failure_callback=task_failure_notification_alert,
)

# The only task that can fail: its callable always raises.
red_task = PythonOperator(
    dag=dag,
    task_id='red_task',
    python_callable=red_exception_task,
    provide_context=True,
    on_failure_callback=task_failure_notification_alert,
)

end_task = DummyOperator(
    task_id="end_task",
    dag=dag,
    on_failure_callback=task_failure_notification_alert,
)

start_task >> red_task >> end_task
We can see two functions i.e. task_failure_notification_alert and dag_failure_notification_alert are being called in case of failures.
We can see logs in case of Task failure by the below steps.
We can see logs for the task as below.
but I am unable to find logs for the on_failure_callback of DAG anywhere in UI. Where can we see it?
Under airflow/logs find the "scheduler" folder, under it look for the specific date you ran the Dag for example 2022-12-03 and there you will see name of the dag_file.log.

Airflow triggering the "on_failure_callback" when the "dagrun_timeout" is exceeded

Currently working on setting up alerts for long running tasks in Airflow. To cancel/fail the airflow dag I've put "dagrun_timeout" in the default_args, and it does what I need: it fails/errors the dag when it's been running for too long (usually stuck). The only problem is that the function in "on_failure_callback" doesn't get called when the dagrun_timeout is exceeded, because the "on_failure_callback" is on the task level (I think) while the dagrun_timeout is on the dag level.
How can I execute the "on_failure_callback" when the dagrun_timeout is exceeded, or how can I specify a function to be called when a dag fails? Or should I re-think my approach?
Try setting on_failure_callback during DAG declaration:
with DAG(
dag_id="failure_callback_example",
on_failure_callback=_on_dag_run_fail,
...
) as dag:
...
The explanation is that on_failure_callback defined in default_args will get passed only to the Tasks being created and not to the DAG object.
Here is an example to try this behaviour:
from datetime import datetime, timedelta
from airflow import DAG
from airflow.models import TaskInstance
from airflow.operators.bash import BashOperator
def _on_dag_run_fail(context):
print("***DAG failed!! do something***")
print(f"The DAG failed because: {context['reason']}")
print(context)
def _alarm(context):
print("** Alarm Alarm!! **")
task_instance: TaskInstance = context.get("task_instance")
print(f"Task Instance: {task_instance} failed!")
# on_failure_callback given here is applied to the tasks only.
default_args = {
    "owner": "mi_empresa",
    "email_on_failure": False,
    "on_failure_callback": _alarm,
}

with DAG(
    dag_id="failure_callback_example",
    start_date=datetime(2021, 9, 7),
    schedule_interval=None,
    default_args=default_args,
    catchup=False,
    # DAG-level callback, set on the DAG object itself.
    on_failure_callback=_on_dag_run_fail,
    dagrun_timeout=timedelta(seconds=45),
) as dag:
    # Sleeps for 60 seconds, longer than the 45-second dagrun_timeout.
    delayed = BashOperator(
        task_id="delayed",
        bash_command='echo "waiting..";sleep 60; echo "Done!!"',
    )

    will_fail = BashOperator(
        task_id="will_fail",
        bash_command="exit 1",
        # on_failure_callback=_alarm,
    )

    delayed >> will_fail
You can find the logs of the callbacks execution in the Scheduler logs AIRFLOW_HOME/logs/scheduler/date/failure_callback_example :
[2021-09-24 13:12:34,285] {logging_mixin.py:104} INFO - [2021-09-24 13:12:34,285] {dag.py:862} INFO - Executing dag callback function: <function _on_dag_run_fail at 0x7f83102e8670>
[2021-09-24 13:12:34,336] {logging_mixin.py:104} INFO - ***DAG failed!! do something***
[2021-09-24 13:12:34,345] {logging_mixin.py:104} INFO - The DAG failed because: timed_out
Edit:
Within the context dict, the key reason is passed in order to specify the cause of the DAG run failure. Some values are: 'reason': 'timed_out' or 'reason': 'task_failure'. This could be used to perform specific behaviour in the callback based on the reason for the DAG run failure.

DAG marked as "success" if one task fails, because of trigger rule ALL_DONE

I have the following DAG with 3 tasks:
start --> special_task --> end
The task in the middle can succeed or fail, but end must always be executed (imagine this is a task for cleanly closing resources). For that, I used the trigger rule ALL_DONE:
end.trigger_rule = trigger_rule.TriggerRule.ALL_DONE
Using that, end is properly executed if special_task fails. However, since end is the last task and succeeds, the DAG is always marked as SUCCESS.
How can I configure my DAG so that if one of the tasks failed, the whole DAG is marked as FAILED?
Example to reproduce
import datetime
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils import trigger_rule
# Manually triggered DAG: start -> special_task -> end.
dag = DAG(
    dag_id='my_dag',
    start_date=datetime.datetime.today(),
    schedule_interval=None,
)

start = BashOperator(task_id='start', bash_command='echo start', dag=dag)

special_task = BashOperator(
    task_id='special_task',
    bash_command='exit 1',  # force failure
    dag=dag,
)

end = BashOperator(task_id='end', bash_command='echo end', dag=dag)
# ``end`` must run whatever the outcome of the tasks before it.
end.trigger_rule = trigger_rule.TriggerRule.ALL_DONE

start >> special_task >> end
This post seems to be related, but the answer does not suit my needs, since the downstream task end must be executed (hence the mandatory trigger_rule).
I thought it was an interesting question and spent some time figuring out how to achieve it without an extra dummy task. It became a bit of a superfluous task, but here's the end result:
This is the full DAG:
import airflow
from airflow import AirflowException
from airflow.models import DAG, TaskInstance, BaseOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.utils.db import provide_session
from airflow.utils.state import State
from airflow.utils.trigger_rule import TriggerRule
default_args = {
    "owner": "airflow",
    "start_date": airflow.utils.dates.days_ago(3),
}

dag = DAG(
    dag_id="finally_task_set_end_state",
    default_args=default_args,
    schedule_interval="0 0 * * *",
    description="Answer for question https://stackoverflow.com/questions/51728441",
)

# One task that succeeds and one that always exits non-zero.
start = BashOperator(
    task_id="start",
    bash_command="echo start",
    dag=dag,
)
failing_task = BashOperator(
    task_id="failing_task",
    bash_command="exit 1",
    dag=dag,
)
@provide_session
def _finally(task, execution_date, dag, session=None, **_):
    """Run cleanup logic, then fail if any upstream task instance failed.

    ``@provide_session`` supplies ``session``. It had been garbled into a
    comment, which left ``session`` as ``None`` and made the query fail.

    :param task: The current task; its ``upstream_task_ids`` are inspected.
    :param execution_date: Execution date of the run being checked.
    :param dag: The DAG the task belongs to.
    :param session: Database session used for the query.
    :raises AirflowException: If an upstream task instance is in the
        ``FAILED`` state.
    """
    upstream_task_instances = (
        session.query(TaskInstance)
        .filter(
            TaskInstance.dag_id == dag.dag_id,
            TaskInstance.execution_date == execution_date,
            TaskInstance.task_id.in_(task.upstream_task_ids),
        )
        .all()
    )
    upstream_states = [ti.state for ti in upstream_task_instances]
    fail_this_task = State.FAILED in upstream_states

    print("Do logic here...")

    # Raise only after the cleanup logic above has run.
    if fail_this_task:
        raise AirflowException("Failing task because one or more upstream tasks failed.")
# ALL_DONE makes this task run once its upstream tasks have finished,
# whether they succeeded or failed.
finally_ = PythonOperator(
    dag=dag,
    task_id="finally",
    python_callable=_finally,
    provide_context=True,
    trigger_rule=TriggerRule.ALL_DONE,
)

succesful_task = DummyOperator(dag=dag, task_id="succesful_task")

# Fan out from ``start`` to both tasks, then join again on ``finally_``.
start >> [failing_task, succesful_task] >> finally_
Look at the _finally function, which is called by the PythonOperator. There are a few key points here:
Annotate with @provide_session and add argument session=None, so you can query the Airflow DB with session.
Query all upstream task instances for the current task:
upstream_task_instances = (
session.query(TaskInstance)
.filter(
TaskInstance.dag_id == dag.dag_id,
TaskInstance.execution_date == execution_date,
TaskInstance.task_id.in_(task.upstream_task_ids),
)
.all()
)
From the returned task instances, get the states and check if State.FAILED is in there:
upstream_states = [ti.state for ti in upstream_task_instances]
fail_this_task = State.FAILED in upstream_states
Perform your own logic:
print("Do logic here...")
And finally, fail the task if fail_this_task=True:
if fail_this_task:
raise AirflowException("Failing task because one or more upstream tasks failed.")
The end result:
As @JustinasMarozas explained in a comment, a solution is to create a dummy task like:
dummy = DummyOperator(
task_id='test',
dag=dag
)
and bind it downstream to special_task :
failing_task.set_downstream(dummy)
Thus, the DAG is marked as failed, and the dummy task is marked as upstream_failed.
Hope there is an out-of-the-box solution, but waiting for that, this solution does the job.
To expand on Bas Harenslak answer, a simpler _finally function which will check the state of all tasks (not only the upstream ones) can be:
def _finally(**kwargs):
    """Fail if any other task instance of this DAG run is not successful.

    :param kwargs: Task context; ``dag_run`` and ``task_instance`` are read.
    :raises Exception: For the first task instance, other than the current
        one, whose current state is not ``SUCCESS``.
    """
    for task_instance in kwargs['dag_run'].get_task_instances():
        # Skip the task running this check.
        if task_instance.current_state() != State.SUCCESS and \
                task_instance.task_id != kwargs['task_instance'].task_id:
            raise Exception("Task {} failed. Failing this DAG run".format(task_instance.task_id))

apache airflow - Cannot load the dag bag to handle failure

I have created an on_failure_callback function (referring to Airflow's default on_failure_callback) to handle task failures.
It works well when there is only one task in a DAG. However, if there are two or more tasks, a task randomly fails because the operator is null; it can be resumed manually later. In airflow-scheduler.out the log is:
[2018-05-08 14:24:21,237] {models.py:1595} ERROR - Executor reports
task instance %s finished (%s) although the task says its %s. Was the
task killed externally? NoneType [2018-05-08 14:24:21,238]
{jobs.py:1435} ERROR - Cannot load the dag bag to handle failure for
. Setting task to FAILED without
callbacks or retries. Do you have enough resources?
The DAG code is:
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import timedelta
import airflow
from devops.util import WechatUtil
from devops.util import JiraUtil
def on_failure_callback(context):
    """Notify the team on WeChat and open a Jira incident for a failed task.

    :param context: Callback context; ``task_instance`` is read from it.
    """
    ti = context['task_instance']
    log_url = ti.log_url
    owner = ti.task.owner
    ti_str = str(context['task_instance'])

    wechat_msg = "%s - Owner:%s" % (ti_str, owner)
    # The imported class is ``WechatUtil``; the previous spelling
    # ``WeChatUtil`` raised NameError every time the callback ran.
    # NOTE(review): confirm that ``notify_team`` exists on ``WechatUtil``.
    WechatUtil.notify_team(wechat_msg)

    jira_desc = "Please check log from url %s" % (log_url)
    JiraUtil.create_incident("DW", ti_str, jira_desc, owner)
# Defaults shared by every task, including the failure callback.
args = {
    'queue': 'default',
    'start_date': airflow.utils.dates.days_ago(1),
    'retry_delay': timedelta(minutes=1),
    'on_failure_callback': on_failure_callback,
    'owner': 'user1',
}

dag = DAG(
    dag_id='test_dependence1',
    default_args=args,
    schedule_interval='10 16 * * *',
)

# Four independent load jobs.
load_crm_goods = BashOperator(task_id='crm_goods_job', bash_command='date', dag=dag)
load_crm_memeber = BashOperator(task_id='crm_member_job', bash_command='date', dag=dag)
load_crm_order = BashOperator(task_id='crm_order_job', bash_command='date', dag=dag)
load_crm_eur_invt = BashOperator(task_id='crm_eur_invt_job', bash_command='date', dag=dag)

# The cohort analysis waits for all four loads.
crm_member_cohort_analysis = BashOperator(
    task_id='crm_member_cohort_analysis_job',
    bash_command='date',
    dag=dag,
)
for load_job in (load_crm_goods, load_crm_memeber, load_crm_order, load_crm_eur_invt):
    crm_member_cohort_analysis.set_upstream(load_job)

# The daily KPI job runs after the cohort analysis.
crm_member_kpi_daily = BashOperator(
    task_id='crm_member_kpi_daily_job',
    bash_command='date',
    dag=dag,
)
crm_member_kpi_daily.set_upstream(crm_member_cohort_analysis)
I had tried to update the airflow.cfg by adding the default memory from 512 to even 4096, but no luck. Would anyone have any advice ?
I also tried updating my JiraUtil and WechatUtil as follows, but encountered the same error.
WechatUtil:
import requests
class WechatUtil:
    """Stubbed WeChat notification helpers; every method returns ``None``."""

    # ``@staticmethod`` had been garbled into a comment, so calling these on
    # an instance would have passed the instance as the first argument.
    @staticmethod
    def notify_trendy_user(user_ldap_id, message):
        """Stub: accept a user id and a message and do nothing."""
        return None

    @staticmethod
    def notify_bigdata_team(message):
        """Stub: accept a message and do nothing."""
        return None
JiraUtil:
import json
import requests
class JiraUtil:
    """Stubbed Jira helpers; every method returns ``None``."""

    # ``@staticmethod`` had been garbled into a comment, so calling these on
    # an instance would have passed the instance as the first argument.
    @staticmethod
    def execute_jql(jql):
        """Stub: accept a JQL string and do nothing."""
        return None

    @staticmethod
    def create_incident(projectKey, summary, desc, assignee=None):
        """Stub: accept the incident fields and do nothing."""
        return None
(I'm shooting tracer bullets a bit here, so bear with me if this answer doesn't get it right on the first try.)
The null operator issue with multiple task instances is weird... it would help approaching troubleshooting this if you could boil the current code down to a MCVE e.g., 1–2 operators and excluding the JiraUtil and WechatUtil parts if they're not related to the callback failure.
Here are 2 ideas:
1. Can you try changing the line that fetches the task instance out of the context to see if this makes a difference?
Before:
def on_failure_callback(context):
ti = context['task_instance']
...
After:
def on_failure_callback(context):
ti = context['ti']
...
I saw this usage in the Airflow repo (https://github.com/apache/incubator-airflow/blob/c1d583f91a0b4185f760a64acbeae86739479cdb/airflow/contrib/hooks/qubole_check_hook.py#L88). It's possible it can be accessed both ways.
2. Can you try adding provide_context=True on the operators either as a kwarg or in default_args?

Triggering A SubDag

EDITED
I have edited this question by considering the inputs from @tobi6
I copied the subdag operator from Airflow source code
Source code: https://github.com/apache/incubator-airflow/blob/master/airflow/operators/subdag_operator.py
I modified a few things in the execute method. The changes were made to trigger the SubDag and wait until the SubDag completes execution. The trigger is working great but the tasks are not being executed (DAG is in the running/Green state while the tasks are in the null/White state).
Please refer below for the changes I made:
from airflow.exceptions import AirflowException
from airflow.models import BaseOperator, Pool
from airflow.utils.decorators import apply_defaults
from airflow.utils.db import provide_session
from airflow.utils.state import State
from airflow.executors import GetDefaultExecutor
from time import sleep
import logging
from datetime import datetime
class SubDagOperator(BaseOperator):
    """Operator that triggers a sub-DAG run and waits for it to finish."""

    template_fields = tuple()
    ui_color = '#555'
    ui_fgcolor = '#fff'

    # Both decorators had been garbled into comments. Without
    # ``provide_session`` no ``session`` keyword is supplied and the
    # ``kwargs.pop('session')`` below raises KeyError.
    @provide_session
    @apply_defaults
    def __init__(
            self,
            subdag,
            executor=GetDefaultExecutor(),
            *args, **kwargs):
        """
        Yo dawg. This runs a sub dag. By convention, a sub dag's dag_id
        should be prefixed by its parent and a dot. As in `parent.child`.
        :param subdag: the DAG object to run as a subdag of the current DAG.
        :type subdag: airflow.DAG
        :param dag: the parent DAG
        :type dag: airflow.DAG
        :param executor: stored on the instance as ``self.executor``
        :raises AirflowException: if no parent DAG is available, if the
            subdag's dag_id does not match ``<parent>.<task_id>``, or if
            the operator and a subdag task share a pool with one slot.
        """
        import airflow.models
        dag = kwargs.get('dag') or airflow.models._CONTEXT_MANAGER_DAG
        if not dag:
            raise AirflowException('Please pass in the `dag` param or call '
                                   'within a DAG context manager')
        session = kwargs.pop('session')
        super(SubDagOperator, self).__init__(*args, **kwargs)

        # validate subdag name
        if dag.dag_id + '.' + kwargs['task_id'] != subdag.dag_id:
            raise AirflowException(
                "The subdag's dag_id should have the form "
                "'{{parent_dag_id}}.{{this_task_id}}'. Expected "
                "'{d}.{t}'; received '{rcvd}'.".format(
                    d=dag.dag_id, t=kwargs['task_id'], rcvd=subdag.dag_id))

        # validate that subdag operator and subdag tasks don't have a
        # pool conflict
        if self.pool:
            conflicts = [t for t in subdag.tasks if t.pool == self.pool]
            if conflicts:
                # only query for pool conflicts if one may exist
                pool = (
                    session
                    .query(Pool)
                    .filter(Pool.slots == 1)
                    .filter(Pool.pool == self.pool)
                    .first()
                )
                if pool and any(t.pool == self.pool for t in subdag.tasks):
                    raise AirflowException(
                        'SubDagOperator {sd} and subdag task{plural} {t} both '
                        'use pool {p}, but the pool only has 1 slot. The '
                        'subdag tasks will never run.'.format(
                            sd=self.task_id,
                            # Was a bare boolean, which rendered as
                            # "taskTrue" / "taskFalse" in the message.
                            plural='s' if len(conflicts) > 1 else '',
                            t=', '.join(t.task_id for t in conflicts),
                            p=self.pool
                        )
                    )

        self.subdag = subdag
        self.executor = executor

    def execute(self, context):
        """Create a run of the sub-DAG and block until it succeeds or fails.

        :param context: task context; ``dag_run`` (for its ``conf``) and
            ``execution_date`` are read from it.
        """
        dag_run = self.subdag.create_dagrun(
            conf=context['dag_run'].conf,
            state=State.RUNNING,
            execution_date=context['execution_date'],
            run_id='trig__' + str(datetime.utcnow()),
            external_trigger=True
        )
        while True:
            # Reload the run before each check; otherwise the state read
            # here never changes and the loop cannot end.
            dag_run.refresh_from_db()
            if dag_run.get_state() in (State.FAILED, State.SUCCESS):
                break
            sleep(10)
Below is the code that shows how I'm using the same
from airflow import DAG
from operators.sd_operator import SubDagOperator # My SubDag Operator
from airflow.operators.python_operator import PythonOperator
import logging
from datetime import datetime
# Defaults applied to every task in the DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2017, 7, 17),
    # Was 'airflow#example.com': the "@" had been garbled into "#".
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
}
def print_dag_details(**kwargs):
    """Log the ``conf`` of the DAG run found in the task context.

    :param kwargs: Task context; must contain ``dag_run``.
    :raises KeyError: If ``dag_run`` is missing.
    """
    logging.info(str(kwargs['dag_run'].conf))
with DAG('example_dag', schedule_interval=None, catchup=False, default_args=default_args) as dag:
    # Each task_id matches the child name passed to ``sub_dag_func``.
    task_1 = SubDagOperator(
        subdag=sub_dag_func('example_dag', 'sub_dag_1'),
        task_id='sub_dag_1'
    )
    task_2 = SubDagOperator(
        subdag=sub_dag_func('example_dag', 'sub_dag_2'),
        task_id='sub_dag_2',
    )
    print_kwargs = PythonOperator(
        task_id='print_kwargs',
        python_callable=print_dag_details,
        provide_context=True
    )
    # Log the run configuration first, then run the two sub-DAGs in order.
    print_kwargs >> task_1 >> task_2
Any information you provide would be helpful. Thanks in advance.
It is a bit hard to understand your question without context.
"I copied the subdag operator and modified a few things in the execute method."
From where was this copied?
"The trigger is working great ..."
How does this look like?
There are a few things I saw in the code:
It might be helpful to add assigned fields to the function call of sub_dag_func, e.g. sub_dag_func(subdag='parent_dag'...).
In the binary shift definition, used to set upstream / downstream there are tasks defined I cannot find in the DAG (df_job_1, df_job_2). This might be connected to SubDAGs (haven't looked into them yet).
The name of the sub dag seems inconsistent with the comment in the code saying By convention, a sub dag's dag_id should be prefixed by its parent and a dot but it is sub_dag_1, sub_dag_2

Resources