Test Dag run for Airflow 1.9 in unittest - airflow

I had implemented a test case for running an individual DAG, but it does not seem to work in 1.9; this may be due to the stricter pool handling introduced in Airflow 1.8.
I am trying to run the test case below:
import unittest
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator


class DAGTest(unittest.TestCase):

    def make_tasks(self):
        dag = DAG('test_dag', description='a test',
                  schedule_interval='@once',
                  start_date=datetime(2018, 6, 26),
                  catchup=False)
        du1 = DummyOperator(task_id='dummy1', dag=dag)
        du2 = DummyOperator(task_id='dummy2', dag=dag)
        du3 = DummyOperator(task_id='dummy3', dag=dag)
        du1 >> du2 >> du3
        dag.run()

    def test_execute(self):
        self.make_tasks()
The exception:
Dependencies not met for <TaskInstance: test_dag.dummy3 2018-06-26 00:00:00 [upstream_failed]>, dependency 'Trigger Rule' FAILED: Task's trigger rule 'all_success' requires all
upstream tasks to have succeeded, but found 1 non-success(es).
upstream_tasks_state={'skipped': 0L, 'successes': 0L, 'failed': 0L,'upstream_failed': 1L, 'done': 1L, 'total': 1}, upstream_task_ids=['dummy2']
What am I doing wrong?
I have tried both LocalExecutor and SequentialExecutor
Environment:
Python 2.7
Airflow 1.9
I believe it is trying to execute all the tasks simultaneously without respecting the dependencies.
Note: Similar code used to work in Airflow 1.7

I'm not familiar with Airflow 1.7, but I guess it didn't have the same "DagBag" concept that Airflow 1.8 and upwards have.
You can't run a DAG that you have created like this, because dag.run() starts a new Python process which has to find the DAG in a dag folder it parses on disk - but it can't. This was included as a message in the output (but you didn't include the full error message/output).
What are you trying to test by creating a dag in the test files? Is it a custom operator? Then you would be better off testing that directly. For instance, here is how I test a custom operator stand-alone:
class MyPluginTest(unittest.TestCase):

    def setUp(self):
        dag = DAG(TEST_DAG_ID, schedule_interval='* * * * Thu',
                  default_args={'start_date': DEFAULT_DATE})
        self.dag = dag
        self.op = myplugin.FindTriggerFileForExecutionPeriod(
            dag=dag,
            task_id='test',
            prefix='s3://bucket/some/prefix',
        )
        self.ti = TaskInstance(task=self.op, execution_date=DEFAULT_DATE)
        # Other S3 setup here, specific to my test

    def test_execute_no_trigger(self):
        with self.assertRaises(RuntimeError):
            self.ti.run(ignore_ti_state=True)

        # It shouldn't have anything in XCom
        self.assertEqual(
            self.ti.xcom_pull(task_ids=self.op.task_id),
            None
        )

Here's a function you can use in a pytest test case that will run the tasks of your DAG in order.
from datetime import timedelta

import pytest
from unittest import TestCase


@pytest.fixture
def test_dag(dag):
    dag._schedule_interval = timedelta(days=1)  # override because @once gets skipped
    done = set()

    def run(key):
        task = dag.task_dict[key]
        for k in task._upstream_task_ids:
            run(k)
        if key not in done:
            print(f'running task {key}...')
            date = dag.default_args['start_date']
            task.run(date, date, ignore_ti_state=True)
            done.add(key)

    for k, _ in dag.task_dict.items():
        run(k)
You can then use test_dag(dag) instead of dag.run() in your test.
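For example, a usage sketch along these lines (build_my_dag() is a hypothetical factory for whatever DAG you want to exercise; the helper reads dag.default_args['start_date'], so define it there, and with recent pytest versions you may need to drop the @pytest.fixture decorator to call the helper directly like this):
# Hypothetical usage: construct the DAG under test, then run its tasks in
# dependency order with the helper above instead of calling dag.run().
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator


def build_my_dag():
    # Stand-in for however you build the DAG you want to test.
    dag = DAG('my_test_dag',
              schedule_interval='@once',
              default_args={'start_date': datetime(2018, 6, 26)},
              catchup=False)
    du1 = DummyOperator(task_id='dummy1', dag=dag)
    du2 = DummyOperator(task_id='dummy2', dag=dag)
    du3 = DummyOperator(task_id='dummy3', dag=dag)
    du1 >> du2 >> du3
    return dag


def test_my_dag_runs_in_order():
    test_dag(build_my_dag())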
You'll need to make sure the logging in your custom operators uses self.log.info() rather than logging.info() or print(), or the messages won't show up.
You may also need to run your test using python -m pytest -s test_my_dag.py, as without the -s flag Airflow's stdout will not be captured.
I'm still trying to figure out how to handle inter-DAG dependencies.

Related

Apache Airflow unit and integration test

I am new to Apache Airflow and I am trying to figure out how to unit/integration test my dags/tasks
Here is my directory structure
/airflow
/dags
/tests/dags
I created a simple DAG which has a task that reads data from a Postgres table:
# imports assumed (Airflow 2.x-style paths)
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.postgres.hooks.postgres import PostgresHook


def read_files(ti):
    sql = "select id from files where status='NEW'"
    pg_hook = PostgresHook(postgres_conn_id="metadata")
    connection = pg_hook.get_conn()
    cursor = connection.cursor()
    cursor.execute(sql)
    files = cursor.fetchall()
    ti.xcom_push(key="files_to_process", value=files)


with DAG(dag_id="check_for_new_files", schedule_interval=timedelta(minutes=30),
         start_date=datetime(2022, 9, 1), catchup=False) as dag:
    check_files = PythonOperator(task_id="read_files",
                                 python_callable=read_files)
Is it possible to test this by mocking the Airflow/Postgres connection, etc.?
Yes, it is possible to test DAGs; here is an example of basic things you can do:
import unittest
from airflow.models import DagBag


class TestCheckForNewFilesDAG(unittest.TestCase):
    """Check DAG"""

    def setUp(self):
        self.dagbag = DagBag()

    def test_task_count(self):
        """Check the task count of the check_for_new_files dag"""
        dag_id = 'check_for_new_files'
        dag = self.dagbag.get_dag(dag_id)
        self.assertEqual(len(dag.tasks), 1)

    def test_contain_tasks(self):
        """Check the tasks contained in the check_for_new_files dag"""
        dag_id = 'check_for_new_files'
        dag = self.dagbag.get_dag(dag_id)
        tasks = dag.tasks
        task_ids = list(map(lambda task: task.task_id, tasks))
        self.assertListEqual(task_ids, ['read_files'])

    def test_dependencies_of_read_files_task(self):
        """Check the task dependencies of the read_files task"""
        dag_id = 'check_for_new_files'
        dag = self.dagbag.get_dag(dag_id)
        read_files_task = dag.get_task('read_files')

        # to be used in case you have upstream tasks
        upstream_task_ids = list(map(lambda task: task.task_id,
                                     read_files_task.upstream_list))
        self.assertListEqual(upstream_task_ids, [])

        downstream_task_ids = list(map(lambda task: task.task_id,
                                       read_files_task.downstream_list))
        self.assertListEqual(downstream_task_ids, [])


suite = unittest.TestLoader().loadTestsFromTestCase(TestCheckForNewFilesDAG)
unittest.TextTestRunner(verbosity=2).run(suite)
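Another basic check worth adding at this level is asserting that the DagBag loads without import errors, so broken DAG files fail the suite immediately. A small hedged sketch:
import unittest
from airflow.models import DagBag


class TestDagBagImports(unittest.TestCase):
    """Fail fast if any DAG file cannot be imported."""

    def test_no_import_errors(self):
        dagbag = DagBag()
        self.assertEqual(len(dagbag.import_errors), 0,
                         "DAG import failures: %s" % dagbag.import_errors)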
In case you want to verify that the manipulated file data is moved correctly, the documentation suggests self-checks:
https://airflow.apache.org/docs/apache-airflow/2.0.1/best-practices.html#self-checks
Self-Checks
You can also implement checks in a DAG to make sure the tasks are producing the results as expected. As an example, if you have a task that pushes data to S3, you can implement a check in the next task. For example, the check could make sure that the partition is created in S3 and perform some simple checks to determine if the data is correct.
I think this is an excellent and straightforward way to verify a specific task.
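To make the doc's S3 example concrete, here is a hedged sketch of such a check task; the bucket name, key layout, and aws_conn_id are illustrative assumptions, and it presumes an Airflow 2.x environment with the Amazon provider installed:
# Sketch of a self-check task that runs right after an export task and fails
# the DAG run if the expected S3 partition is missing. Bucket/key are hypothetical.
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook


def check_partition_exists(**context):
    hook = S3Hook(aws_conn_id="aws_default")
    key = f"exports/files/dt={context['ds']}/part-00000.parquet"
    if not hook.check_for_key(key, bucket_name="my-data-bucket"):
        raise ValueError(f"Expected partition missing in S3: {key}")


check_export = PythonOperator(
    task_id="check_export",
    python_callable=check_partition_exists,
    dag=dag,  # attach to the same DAG as the task that writes the data
)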
Here are some other useful links you can use:
https://www.youtube.com/watch?v=ANJnYbLwLjE
The next ones talk about mocking:
https://www.astronomer.io/guides/testing-airflow/
https://medium.com/@montadhar/apache-airflow-testing-guide-7956a3f4bbf5
https://godatadriven.com/blog/testing-and-debugging-apache-airflow/
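To answer the mocking part of the question directly, here is a hedged sketch that tests read_files without a real Postgres connection; the module path "dags.check_for_new_files" is an assumption based on the directory layout above, so point the patch at wherever read_files is actually defined:
# Hedged sketch: unit-test read_files() with unittest.mock instead of Postgres.
from unittest import mock

from dags.check_for_new_files import read_files  # assumed module path


def test_read_files_pushes_new_file_ids():
    # Patch PostgresHook where the DAG module looks it up.
    with mock.patch("dags.check_for_new_files.PostgresHook") as hook_cls:
        cursor = hook_cls.return_value.get_conn.return_value.cursor.return_value
        cursor.fetchall.return_value = [(1,), (2,)]

        ti = mock.MagicMock()  # stands in for the task instance
        read_files(ti)

        cursor.execute.assert_called_once_with(
            "select id from files where status='NEW'")
        ti.xcom_push.assert_called_once_with(
            key="files_to_process", value=[(1,), (2,)])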

Generating airflow DAGs dynamically

I am trying to generate Airflow DAGs from a template in Python code, using globals() as defined here to define the DAG object and save it. Below is my code:
import datetime as dt
import sys
import airflow
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
argumentList = sys.argv
owner = argumentList[1]
dag_name = argumentList[2]
taskID = argumentList[3]
bashCommand = argumentList[4]
default_args = {
    'owner': owner,
    'start_date': dt.datetime(2019, 6, 1),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
}


def dagCreate():
    with DAG(dag_name,
             default_args=default_args,
             schedule_interval=None,
             ) as dag:
        print_hello = BashOperator(task_id=taskID, bash_command=bashCommand)
    return dag


globals()[dag_name] = dagCreate()
I have kept this Python code outside the dag_folder and am executing it as follows:
python bash-dag-generator.py Airflow test_bash_generate auto_bash_task ls
But I don't see any DAG generated in the airflow webserver UI. I am not sure where I am going wrong.
As per the official documentation:
DAGs are defined in standard Python files that are placed in Airflow’s DAG_FOLDER. Airflow will execute the code in each file to dynamically build the DAG objects. You can have as many DAGs as you want, each describing an arbitrary number of tasks. In general, each one should correspond to a single logical workflow.
So unless your code is actually inside the DAG_FOLDER, it will not be registered as a DAG.
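One way to keep the template idea but satisfy that requirement is to put the generator script itself in the DAG folder and drive it from a config file rather than sys.argv (the scheduler imports the file; it never passes command-line arguments). A hedged sketch, where the dag_configs.json path and its fields are assumptions for illustration:
# dags/bash_dag_generator.py -- lives inside DAG_FOLDER so the scheduler parses it.
import datetime as dt
import json
import os

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator

CONFIG_PATH = os.path.join(os.path.dirname(__file__), 'dag_configs.json')  # hypothetical


def create_dag(owner, dag_name, task_id, bash_command):
    default_args = {
        'owner': owner,
        'start_date': dt.datetime(2019, 6, 1),
        'retries': 1,
        'retry_delay': dt.timedelta(minutes=5),
    }
    dag = DAG(dag_name, default_args=default_args, schedule_interval=None)
    BashOperator(task_id=task_id, bash_command=bash_command, dag=dag)
    return dag


with open(CONFIG_PATH) as f:
    # e.g. [{"owner": "Airflow", "dag_name": "test_bash_generate",
    #        "task_id": "auto_bash_task", "bash_command": "ls"}]
    for cfg in json.load(f):
        globals()[cfg['dag_name']] = create_dag(
            cfg['owner'], cfg['dag_name'], cfg['task_id'], cfg['bash_command'])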
The way I have been able to implement Dynamic DAGs is by using Airflow Variable.
In the example below I have a CSV file that has a list of Bash commands like ls, echo, etc. As part of the read_file_task I update the file location in an Airflow Variable. The part where we read the CSV file and loop through the commands is where the dynamic tasks get created.
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.models import Variable
from datetime import datetime, timedelta
import csv
'''
Orchestrate the Dynamic Tasks
'''
def read_file_task():
    print('I am reading a File and setting variables ')
    Variable.set('dynamic-dag-sample', '/home/bashoperator.csv')


with DAG('dynamic-dag-sample',
         start_date=datetime(2018, 11, 1)) as dag:

    read_file_task = PythonOperator(task_id='read_file_task',
                                    python_callable=read_file_task,
                                    provide_context=True,
                                    dag=dag)

    dynamic_dag_sample_file_path = Variable.get("dynamic-dag-sample")
    if dynamic_dag_sample_file_path is not None:
        with open(dynamic_dag_sample_file_path) as csv_file:
            reader = csv.DictReader(csv_file)
            for row in reader:
                bash_task = BashOperator(task_id=row['Taskname'],
                                         bash_command=row['Command'])
                read_file_task.set_downstream(bash_task)

Programmatically clear the state of airflow task instances

I want to clear the tasks in DAG B when DAG A completes execution. Both A and B are scheduled DAGs.
Is there any operator/way to clear the state of tasks and re-run DAG B programmatically?
I'm aware of the CLI option and Web UI option to clear the tasks.
I would recommend staying away from CLI here!
The Airflow functionality of dags/tasks is much better exposed when referencing the objects directly, as compared to going through a BashOperator and/or the CLI module.
Add a Python operator to DAG A named "clear_dag_b" that imports dag_b from the dags folder (module) and does this:
from dags.dag_b import dag as dag_b


def clear_dag_b(**context):
    exec_date = context['execution_date']  # the run's logical date
    dag_b.clear(start_date=exec_date, end_date=exec_date)
Important! If you for some reason do not match or overlap the dag_b schedule time with start_date/end_date, the clear() operation will miss the DAG executions. This example assumes DAG A and B are scheduled identically, and that you only want to clear day X from B when A executes day X.
It might make sense to include a check for whether the dag_b has already run or not, before clearing:
dab_b_run = dag_b.get_dagrun(exec_date) # returns None or a dag_run object
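Putting those pieces together, a hedged sketch of how the task might be wired into DAG A; dag_a is a stand-in for your actual DAG A object and the import path is an assumption about your project layout:
# Sketch: a final task in DAG A that clears (and thereby re-runs) DAG B for
# the same execution date, skipping the clear if B has no run for that date.
from airflow.operators.python_operator import PythonOperator

from dags.dag_b import dag as dag_b


def clear_dag_b(**context):
    exec_date = context['execution_date']
    if dag_b.get_dagrun(exec_date) is not None:
        dag_b.clear(start_date=exec_date, end_date=exec_date)


clear_dag_b_task = PythonOperator(
    task_id='clear_dag_b',
    python_callable=clear_dag_b,
    provide_context=True,  # needed on Airflow 1.x; implicit in 2.x
    dag=dag_a,  # hypothetical DAG A object defined elsewhere in this file
)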
cli.py is an incredibly useful place to peek into the SQLAlchemy magic of Airflow.
The clear command is implemented here:
@cli_utils.action_logging
def clear(args):
    logging.basicConfig(
        level=settings.LOGGING_LEVEL,
        format=settings.SIMPLE_LOG_FORMAT)
    dags = get_dags(args)

    if args.task_regex:
        for idx, dag in enumerate(dags):
            dags[idx] = dag.sub_dag(
                task_regex=args.task_regex,
                include_downstream=args.downstream,
                include_upstream=args.upstream)

    DAG.clear_dags(
        dags,
        start_date=args.start_date,
        end_date=args.end_date,
        only_failed=args.only_failed,
        only_running=args.only_running,
        confirm_prompt=not args.no_confirm,
        include_subdags=not args.exclude_subdags,
        include_parentdag=not args.exclude_parentdag,
    )
Looking at the source, you can either
replicate it (assuming you also want to modify the functionality a bit)
or maybe just do from airflow.bin import cli and invoke the required functions directly
Since my objective was to re-run DAG B whenever DAG A completes execution, I ended up clearing DAG B using a BashOperator:
# Clear the tasks in another dag
last_task = BashOperator(
    task_id='last_task',
    bash_command='airflow clear example_target_dag -c ',
    dag=dag)

first_task >> last_task
It is possible but I would be careful about getting into an endless loop of retries if the task never succeeds. You can call a bash command within the on_retry_callback where you can specify which tasks/dag runs you want to clear.
This works in 2.0 as the clear commands have changed
https://airflow.apache.org/docs/apache-airflow/stable/cli-and-env-variables-ref.html#clear
In this example, I am clearing from t2 & downstream tasks when t3 eventually fails:
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta


def clear_upstream_task(context):
    execution_date = context.get("execution_date")
    clear_tasks = BashOperator(
        task_id='clear_tasks',
        bash_command=f'airflow tasks clear -s {execution_date} -t t2 -d -y clear_upstream_task'
    )
    return clear_tasks.execute(context=context)


# Default settings applied to all tasks
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=5)
}

with DAG('clear_upstream_task',
         start_date=datetime(2021, 1, 1),
         max_active_runs=3,
         schedule_interval=timedelta(minutes=5),
         default_args=default_args,
         catchup=False
         ) as dag:

    t0 = DummyOperator(
        task_id='t0'
    )
    t1 = DummyOperator(
        task_id='t1'
    )
    t2 = DummyOperator(
        task_id='t2'
    )
    t3 = BashOperator(
        task_id='t3',
        bash_command='exit 123',
        # retries=1,
        on_failure_callback=clear_upstream_task
    )

    t0 >> t1 >> t2 >> t3

How to use the result of a BashOperator task as argument of another Airflow task?

I need to pass a job_id parameter to my DatabricksRunNowOperator() object. The job_id is the result of executing the databricks jobs create --json '{myjson}' command:
$ databricks jobs create --json '{myjson}'
{job_id: 12}
import os
import subprocess
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.databricks_operator import DatabricksRunNowOperator
def pull_function():
    returned_output = subprocess.check_output("echo ti.xcom_pull(key='jobid_CreateCreateRobot')")
    return returned_output


dag_CreateRobot = DAG(dag_id='CreateRobot',
                      default_args={'owner': 'eric',
                                    'email': [],
                                    'depends_on_past': False,
                                    'start_date': '2019-09-16 16:48:28.803023',
                                    'provide_context': True},
                      schedule_interval='@once')

CreateRobot = BashOperator(dag=dag_CreateRobot,
                           task_id='CreateRobot',
                           bash_command="databricks jobs create --json '{myjson}'")

RunRobot = DatabricksRunNowOperator(dag=dag_CreateRobot,
                                    task_id=ti.xcom_pull('RunCreateRobot'),
                                    job_id=pull_function(),
                                    databricks_conn_id='myconn',
                                    json={'token': 'mytoken'})

RunRobot.set_upstream(CreateRobot)
I wrote this code to explain my goal, but it does not work. How can I use the result of a BashOperator task in another task that depends on it?
The bash command in the BashOperator needs to be $ databricks jobs create --json '{myjson}'
i.e.
CreateRobot = BashOperator(dag=dag_CreateRobot,
                           task_id='CreateRobot',
                           bash_command="databricks jobs create --json '{myjson}'",
                           xcom_push=True)  # specify xcom_push in older Airflow versions
The above operator when executed pushes the last line of the output to xcom. (https://airflow.apache.org/_modules/airflow/operators/bash_operator.html)
The XCom value can be accessed using:
ti.xcom_pull(task_ids='CreateRobot')
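Building on that, one hedged way to feed the value into the downstream task is Jinja templating, assuming the operator's job_id parameter ends up in its templated json field (which I believe is the case in recent versions of the Databricks operator); if it doesn't render in your version, pull the XCom inside a PythonOperator instead:
# Sketch: resolve the job_id at run time via templating rather than at parse
# time. Note the BashOperator pushes the *last line* of stdout, e.g. the JSON
# blob {"job_id": 12}, so you may need an intermediate step to extract the id.
from airflow.contrib.operators.databricks_operator import DatabricksRunNowOperator

RunRobot = DatabricksRunNowOperator(
    dag=dag_CreateRobot,
    task_id='RunCreateRobot',
    job_id="{{ ti.xcom_pull(task_ids='CreateRobot') }}",
    databricks_conn_id='myconn',
    json={'token': 'mytoken'},
)
RunRobot.set_upstream(CreateRobot)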

apache airflow - Cannot load the dag bag to handle failure

I have created an on_failure_callback function (referring to Airflow default on_failure_callback) to handle task failures.
It works well when there is only one task in a DAG; however, if there are 2 or more tasks, a task randomly fails because its operator is null, and it can be resumed later manually. In airflow-scheduler.out the log is:
[2018-05-08 14:24:21,237] {models.py:1595} ERROR - Executor reports
task instance %s finished (%s) although the task says its %s. Was the
task killed externally? NoneType [2018-05-08 14:24:21,238]
{jobs.py:1435} ERROR - Cannot load the dag bag to handle failure for
. Setting task to FAILED without
callbacks or retries. Do you have enough resources?
The DAG code is:
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import timedelta
import airflow
from devops.util import WechatUtil
from devops.util import JiraUtil
def on_failure_callback(context):
    ti = context['task_instance']
    log_url = ti.log_url
    owner = ti.task.owner
    ti_str = str(context['task_instance'])
    wechat_msg = "%s - Owner:%s" % (ti_str, owner)
    WeChatUtil.notify_team(wechat_msg)
    jira_desc = "Please check log from url %s" % (log_url)
    JiraUtil.create_incident("DW", ti_str, jira_desc, owner)


args = {
    'queue': 'default',
    'start_date': airflow.utils.dates.days_ago(1),
    'retry_delay': timedelta(minutes=1),
    'on_failure_callback': on_failure_callback,
    'owner': 'user1',
}

dag = DAG(dag_id='test_dependence1', default_args=args, schedule_interval='10 16 * * *')

load_crm_goods = BashOperator(
    task_id='crm_goods_job',
    bash_command='date',
    dag=dag)

load_crm_memeber = BashOperator(
    task_id='crm_member_job',
    bash_command='date',
    dag=dag)

load_crm_order = BashOperator(
    task_id='crm_order_job',
    bash_command='date',
    dag=dag)

load_crm_eur_invt = BashOperator(
    task_id='crm_eur_invt_job',
    bash_command='date',
    dag=dag)

crm_member_cohort_analysis = BashOperator(
    task_id='crm_member_cohort_analysis_job',
    bash_command='date',
    dag=dag)

crm_member_cohort_analysis.set_upstream(load_crm_goods)
crm_member_cohort_analysis.set_upstream(load_crm_memeber)
crm_member_cohort_analysis.set_upstream(load_crm_order)
crm_member_cohort_analysis.set_upstream(load_crm_eur_invt)

crm_member_kpi_daily = BashOperator(
    task_id='crm_member_kpi_daily_job',
    bash_command='date',
    dag=dag)

crm_member_kpi_daily.set_upstream(crm_member_cohort_analysis)
I tried updating airflow.cfg by increasing the default memory from 512 to even 4096, but no luck. Would anyone have any advice?
I also tried updating my JiraUtil and WechatUtil as follows, encountering the same error:
WechatUtil:
import requests


class WechatUtil:
    @staticmethod
    def notify_trendy_user(user_ldap_id, message):
        return None

    @staticmethod
    def notify_bigdata_team(message):
        return None

JiraUtil:
import json
import requests


class JiraUtil:
    @staticmethod
    def execute_jql(jql):
        return None

    @staticmethod
    def create_incident(projectKey, summary, desc, assignee=None):
        return None
(I'm shooting tracer bullets a bit here, so bear with me if this answer doesn't get it right on the first try.)
The null operator issue with multiple task instances is weird... it would help troubleshooting if you could boil the current code down to an MCVE, e.g., 1–2 operators, excluding the JiraUtil and WechatUtil parts if they're not related to the callback failure.
Here are 2 ideas:
1. Can you try changing the line that fetches the task instance out of the context to see if this makes a difference?
Before:
def on_failure_callback(context):
    ti = context['task_instance']
    ...
After:
def on_failure_callback(context):
    ti = context['ti']
    ...
I saw this usage in the Airflow repo (https://github.com/apache/incubator-airflow/blob/c1d583f91a0b4185f760a64acbeae86739479cdb/airflow/contrib/hooks/qubole_check_hook.py#L88). It's possible it can be accessed both ways.
2. Can you try adding provide_context=True on the operators either as a kwarg or in default_args?
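If it helps, here is the kind of MCVE I had in mind, a hedged sketch with a callback that only logs, so you can see whether the "Cannot load the dag bag" error still appears without the Jira/WeChat code:
# Minimal repro sketch: two tasks, one forced failure, and a callback that
# only logs. If this still triggers "Cannot load the dag bag to handle
# failure", the problem is not in JiraUtil/WechatUtil.
import logging
from datetime import timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago


def on_failure_callback(context):
    ti = context['ti']
    logging.error("Task failed: %s (log: %s)", ti, ti.log_url)


args = {
    'start_date': days_ago(1),
    'retry_delay': timedelta(minutes=1),
    'on_failure_callback': on_failure_callback,
    'owner': 'user1',
}

dag = DAG(dag_id='test_failure_callback_mcve', default_args=args,
          schedule_interval='10 16 * * *')

ok_task = BashOperator(task_id='ok_task', bash_command='date', dag=dag)
failing_task = BashOperator(task_id='failing_task', bash_command='exit 1', dag=dag)
failing_task.set_upstream(ok_task)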
