Airflow Dependencies with XCom and BashOperators

I should admit that this is my first time working with Python, so any help would be very useful. I have put together a test DAG to do the following, but it does not work properly:
run task t1 and return a value
run task t2 with trigger rule ALL_SUCCESS if the value returned by t1 is 0
from datetime import datetime
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator

def set_trigger(taskid, **kwargs):
    xcomValue = {{ task_instance.xcom_pull(task_ids=taskid) }}
    print( xcomValue, " <------- LOOK HERE XCOM VAR")
    if(xcomValue == "0"):
        return TriggerRule.ALL_SUCCESS
    return TriggerRule.ALL_FAILED

dag = DAG(dag_id="example_bash_operator", schedule_interval=None, start_date=datetime(2018, 12, 31) ) as dag:

t1 = BashOperator(
    task_id="t1",
    bash_command='do something && echo 0 ',
    dag=dag
)

t2 = BashOperator(
    task_id="t2",
    bash_command='do something else here ',
    trigger_rule=set_trigger,
    dag=dag,
)

t1 >> t2

Why not use a BranchPythonOperator (see the docs)?
This way you only run t2 if the value returned by t1 is 0:
from datetime import datetime
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import BranchPythonOperator

dag = DAG(dag_id="example_bash_operator", schedule_interval=None, start_date=datetime(2018, 12, 31))

t1 = BashOperator(
    task_id="t1",
    bash_command='do something && echo 0 ',
    dag=dag,
)

def branch_func(**kwargs):
    ti = kwargs['ti']
    xcom_value = ti.xcom_pull(task_ids='t1')  # BashOperator pushes the last line of stdout to XCom
    if xcom_value == '0':
        return 't2'  # run t2
    return None      # otherwise skip t2

check_t1 = BranchPythonOperator(
    task_id='check_t1',
    provide_context=True,
    python_callable=branch_func,
    dag=dag,
)

t2 = BashOperator(
    task_id="t2",
    bash_command='do something else here ',
    dag=dag,
)

t1 >> check_t1 >> t2
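On newer Airflow versions (2.3 or later) the same branching can also be written with the @task.branch decorator. The following is only a minimal sketch under that assumption; the DAG id and bash commands are illustrative, not taken from the question:

from datetime import datetime

from airflow.decorators import dag, task
from airflow.operators.bash import BashOperator

@dag(schedule_interval=None, start_date=datetime(2018, 12, 31), catchup=False)
def branch_example():
    t1 = BashOperator(task_id="t1", bash_command="echo 0")  # last line of stdout goes to XCom

    @task.branch
    def check_t1(ti=None):
        # Run t2 only when t1 echoed "0"; returning None skips all downstream tasks.
        return "t2" if ti.xcom_pull(task_ids="t1") == "0" else None

    t2 = BashOperator(task_id="t2", bash_command="echo 'do something else here'")

    t1 >> check_t1() >> t2

branch_example()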

Related

Airflow trigger dag with config

I am trying to pass configs to my DAG using "Trigger DAG w/ config".
def execute(**kwargs):
    dag_run = kwargs['dag_run']
    start_date = dag_run.conf['start_dt'] if 'start_dt' in dag_run.conf.keys() else kwargs['start_dt']
    end_date = dag_run.conf['end_dt'] if 'end_dt' in dag_run.conf.keys() else kwargs['end_dt']
    print(f'start_date = {start_date}, end_date = {end_date}')

dag = DAG(
    "corp_dev_ods_test_dag",
    default_args=default_args,
    description='DAG',
    schedule_interval='10 1 * * *',
    start_date=days_ago(0),
    #params={'dt' : '{{ macros.ds_add(ds, -7) }}'},
    catchup=False,
    tags=['dev']
)

run_submit = PythonVirtualenvOperator(
    task_id='run_submit',
    requirements=dag_requirements,
    python_callable=execute,
    system_site_packages=False,
    dag=dag,
    op_kwargs={'start_dt' : '{{ macros.ds_add(ds, -7) }}', 'end_dt': '{{ macros.ds_add(ds, -7) }}'}
)

run_submit
I got "KeyError": kwargs["dag_run"]. But in case of PythonOperator (Instead of PythonVirtualenvOperator) it works.
So, how can I use such parameters in my dag?
You need to provide an empty params variable in your task, for example:
from airflow.decorators import dag, task
from datetime import datetime

default_params = {"start_date": "2022-01-01", "end_date": "2022-12-01"}

@dag(
    schedule=None,
    start_date=datetime(2021, 1, 1),
    catchup=False,
    tags=['using_params'],
    params=default_params
)
def mydag():
    @task
    def extract(params={}):
        import helper
        filenames = helper.extract(start=params.get("start_date"))
        return filenames

    extract()

_dag = mydag()
Now in the UI, when you use "Trigger DAG w/ config", you should be able to see and change the default params, and access them in your DAG task.
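For reference, the interplay between params and dag_run.conf can also be checked inside a task. This is only a minimal sketch, assuming a recent Airflow 2.x release with the default dag_run_conf_overrides_params setting; the DAG and task names are illustrative:

from datetime import datetime
from airflow.decorators import dag, task

@dag(
    schedule=None,
    start_date=datetime(2021, 1, 1),
    catchup=False,
    params={"start_date": "2022-01-01", "end_date": "2022-12-01"},
)
def params_demo():
    @task
    def extract(params=None, dag_run=None):
        # params already reflects any values changed via "Trigger DAG w/ config";
        # dag_run.conf contains only the keys actually supplied at trigger time.
        start = (dag_run.conf or {}).get("start_date", params["start_date"])
        print(f"extracting from {start}")

    extract()

params_demo()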

Airflow Subdag tasks are stuck in None state while subdag is showing as Running

I have a problem with my dag getting stuck at subdag. The subdag is in RUNNING state but on zooming in all the tasks of the subdag are in None status.
Using Airflow 2.1.1 with LocalExecutor.
Below is the main dag:
default_args = {
    'owner' : 'airflow',
    'retries' : 1,
    'depends_on_past' : False
}

dag = DAG('loop_example',
    start_date = datetime(2022,1,1),
    schedule_interval = None,
    catchup = False,
    tags=['loop']
)

## function to filter src_name based on a DB table/log file entry
def check_valid_src(src_name):
    hook = MySqlHook(mysql_conn_id='mysql_conn')
    sql = 'SELECT src_name FROM ingsted_src_log_table'
    myresult = hook.get_records(sql)
    valid_src_names = []
    for src in myresult:
        valid_src_names.append(src[0])
    if src_name in valid_src_names:
        return True
    else:
        return False

first = DummyOperator(task_id='first', dag=dag)
last = DummyOperator(task_id='last', dag=dag)

options = ['branch_a','branch_b','branch_c','branch_d']
for option in options:
    if check_valid_src(option):
        t = SubDagOperator(task_id=f'section_{option}',
            subdag=subdag('loop_example', f'section_{option}', default_args, option),
            dag=dag
        )
        first >> t >> last
subdag code:
def subdag(parent_dag_name, child_dag_name, args, option):
    dag_subdag = DAG(
        dag_id=f'{parent_dag_name}.{child_dag_name}',
        default_args=args,
        start_date=datetime(2022,1,1),
        schedule_interval=None,
    )
    t1 = BashOperator(
        task_id='Echo_source_name',
        bash_command=f'echo {option}',
        default_args=args,
        dag=dag_subdag
    )
    t2 = BashOperator(
        task_id='Echo_source_number',
        bash_command=f'echo "{option}" | cut -d "_" -f2',
        default_args=args,
        dag=dag_subdag,
    )
    t1 >> t2
    return dag_subdag
Earlier the start_date of the main DAG and the subdag were not the same, so I tried running again with the same start_date, but it still gets stuck.
Is there anything I am missing here?
You have to pass is_paused_upon_creation=False to the subdag's DAG:
dag_subdag = DAG(
    dag_id=f'{parent_dag_name}.{child_dag_name}',
    default_args=args,
    start_date=datetime(2022,1,1),
    schedule_interval=None,
    is_paused_upon_creation=False
)
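Since the question runs Airflow 2.1, the same grouping can also be expressed without a separate subdag DAG (and therefore without the pausing issue) by using a TaskGroup. The following is only a sketch of that alternative, not the answer's code; the DAG id and the subset of options are illustrative:

from datetime import datetime
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.utils.task_group import TaskGroup

with DAG("loop_example_tg", start_date=datetime(2022, 1, 1),
         schedule_interval=None, catchup=False) as dag:
    for option in ["branch_a", "branch_b"]:  # illustrative subset of the options list
        # Tasks created inside the TaskGroup get the "section_<option>." prefix,
        # so the same task_ids can be reused per group.
        with TaskGroup(group_id=f"section_{option}"):
            echo_name = BashOperator(task_id="Echo_source_name",
                                     bash_command=f"echo {option}")
            echo_number = BashOperator(task_id="Echo_source_number",
                                       bash_command=f'echo "{option}" | cut -d "_" -f2')
            echo_name >> echo_number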

Airflow dynamically generated tasks not run in order

I have created a DAG that generates tasks dynamically. The tasks are generated correctly, but they are not triggered in order and do not run consistently.
I have noticed that they are triggered in alphanumeric order.
Let's look at the run_modification_ tasks. I generated tasks 0 to 29 and noticed they trigger in the following order:
run_modification_0
run_modification_1
run_modification_10
run_modification_11
run_modification_12
run_modification_13
run_modification_14
run_modification_15
run_modification_16
run_modification_17
run_modification_18
run_modification_19
run_modification_2
run_modification_21
run_modification_23....
But I need them to run in task order, like:
run_modification_0
run_modification_1
run_modification_2
run_modification_3
run_modification_4
run_modification_5..
Please help me run these tasks in the order they were created.
from datetime import date, timedelta, datetime
from airflow.utils.dates import days_ago
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.postgres_operator import PostgresOperator
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import Variable
import os

args = {
    'owner': 'Airflow',
    'start_date': days_ago(2),
}

dag = DAG(
    dag_id='tastOrder',
    default_args=args,
    schedule_interval=None,
    tags=['task']
)

modification_processXcom = """ cd {{ ti.xcom_pull(task_ids=\'run_modification_\'+params.i, key=\'taskDateFolder\') }} """

def modificationProcess(ds, **kwargs):
    today = datetime.strptime('2021-01-01', '%Y-%m-%d').date()
    i = str(kwargs['i'])
    newDate = today - timedelta(days=int(i))
    print(str(newDate))
    kwargs["ti"].xcom_push("taskDateFolder", str(newDate))

def getDays():
    today = datetime.strptime('2021-01-01', '%Y-%m-%d').date()
    yesterday = today - timedelta(days=30)
    day_Diff = today - yesterday
    return day_Diff, today

day_Diff, today = getDays()

for i in reversed(range(0, day_Diff.days)):
    run_modification = PythonOperator(
        task_id='run_modification_'+str(i),
        provide_context=True,
        python_callable=modificationProcess,
        op_kwargs={'i': str(i)},
        dag=dag,
    )
    modification_processXcom = BashOperator(
        task_id='modification_processXcom_'+str(i),
        bash_command=modification_processXcom,
        params={'i': str(i)},
        dag=dag
    )
    run_modification >> modification_processXcom
To get the dependency chain:
run_modification_1 -> modification_processXcom_1 ->
run_modification_2 -> modification_processXcom_2 -> ... ->
run_modification_29 -> modification_processXcom_29
You can do:
from datetime import datetime
from airflow import DAG
from airflow.operators.bash import BashOperator

dag = DAG(
    dag_id='my_dag',
    schedule_interval=None,
    start_date=datetime(2021, 8, 10),
    catchup=False,
    is_paused_upon_creation=False,
)

mylist1 = []
mylist2 = []

for i in range(1, 30):
    mylist1.append(
        BashOperator(  # Replace with your requested operator
            task_id=f'run_modification_{i}',
            bash_command=f"""echo executing run_modification_{i}""",
            dag=dag,
        )
    )
    mylist2.append(
        BashOperator(  # Replace with your requested operator
            task_id=f'modification_processXcom_{i}',
            bash_command=f"""echo executing modification_processXcom_{i}""",
            dag=dag,
        )
    )
    if len(mylist1) > 0:
        mylist1[-1] >> mylist2[-1]  # This sets the dependency run_modification -> modification_processXcom
    if len(mylist1) > 1:
        mylist2[-2] >> mylist1[-1]  # This sets the dependency previous modification_processXcom -> run_modification
This code creates two lists of operators and sets them to run one after another, as shown in the tree view.
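An equivalent way to express the same alternating dependency, assuming Airflow 2.x where chain is available in airflow.models.baseoperator, is to interleave the two lists after the loop instead of wiring them inside it (a sketch, not part of the original answer):

from airflow.models.baseoperator import chain

# run_modification_1 >> modification_processXcom_1 >> run_modification_2 >> ...
interleaved = [task for pair in zip(mylist1, mylist2) for task in pair]
chain(*interleaved)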

How to trigger a task in airflow if immediate parent task fails?

What I am mainly aiming for is that restore_denormalized_es_data should only be triggered when the load_denormalized_es_data task fails. If load_denormalized_es_data succeeds, the flow should go straight to END. As you can see, my restore currently runs whenever archive fails and load is skipped or retrying, so I am getting the wrong results.
Here is the code I am using:
import sys
import os
from datetime import datetime
#import files what u want to import
# Airflow level imports
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator,BranchPythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.utils.trigger_rule import TriggerRule
#Imported all the functions and the code is able to call the functions with ease
# Name of the Dag
DAG_NAME = "DAG"
#Default arguments
default_args = {
"owner": "Mehul",
"start_date": datetime.today().strftime("%Y-%m-%d"),
"provide_context": True
}
# Define the dag object
dag = DAG(
DAG_NAME,
default_args=default_args,
schedule_interval=None
)
archive_denormalized_es_data = PythonOperator(
task_id = "archive_denormalized_es_data",
python_callable = archive_current_ES_data,
trigger_rule=TriggerRule.ALL_SUCCESS,
provide_context = False,
dag=dag
)
load_denormalized_es_data = PythonOperator(
task_id = "load_denormalized_es_data",
python_callable = es_load,
provide_context = False,
trigger_rule = TriggerRule.ALL_SUCCESS,
dag=dag
)
restore_denormalized_es_data = PythonOperator(
task_id = "restore_denormalized_es_data",
python_callable = restore_current_ES_data,
trigger_rule=TriggerRule.ALL_FAILED,
provide_context=False,
dag=dag
)
END = DummyOperator(
task_id="END",
trigger_rule=TriggerRule.ALL_SUCCESS,
dag=dag)
denormalized_data_creation>>archive_denormalized_es_data>>load_denormalized_es_data
load_denormalized_es_data<<archive_denormalized_es_data<<denormalized_data_creation
load_denormalized_es_data>>restore_denormalized_es_data
restore_denormalized_es_data<<load_denormalized_es_data
load_denormalized_es_data>>END
END<<load_denormalized_es_data
restore_denormalized_es_data>>END
END<<restore_denormalized_es_data
Here is a picture of the pipeline referred to above (image not included here).
If I understand correctly, you want to skip the rest of the pipeline if A fails.
ShortCircuitOperator will allow Airflow to short circuit (skip) the rest of the pipeline.
Here is an example that does what you outlined.
from datetime import datetime
from airflow.models import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator, ShortCircuitOperator
from airflow.utils.trigger_rule import TriggerRule
from airflow.utils.state import State

def proceed(**context):
    ti = context['dag_run'].get_task_instance(a.task_id)
    if ti.state == State.FAILED:
        return False
    else:
        return True

dag = DAG(
    dag_id="dag",
    start_date=datetime(2021, 4, 5),
    schedule_interval='@once',
)

with dag:
    a = PythonOperator(
        task_id='archive_denormalized_es_data',
        python_callable=lambda: 1
    )
    gate = ShortCircuitOperator(
        task_id='gate',
        python_callable=proceed,
        trigger_rule=TriggerRule.ALL_DONE
    )
    b = PythonOperator(
        task_id='load_denormalized_es_data',
        python_callable=lambda: 1
    )
    c = DummyOperator(
        task_id='restore_denormalized_es_data',
        trigger_rule=TriggerRule.ALL_FAILED
    )
    d = DummyOperator(
        task_id='END',
        trigger_rule=TriggerRule.ONE_SUCCESS
    )
    a >> gate >> b >> c
    [b, c] >> d
If archive_denormalized_es_data fails, the rest of the pipeline is skipped, meaning Airflow does not run restore_denormalized_es_data.
If load_denormalized_es_data fails, restore_denormalized_es_data runs and continues to END.
If load_denormalized_es_data succeeds, restore_denormalized_es_data is skipped and continues to END.
Your code is essentially missing the logic to skip when archive_denormalized_es_data fails, which the ShortCircuitOperator takes care of for you.
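If the same gate is needed after several tasks, the callable can be built from the upstream task id. This is a small illustrative helper only; the make_gate name is hypothetical, not from the answer, and the operator must be created inside the with dag: block:

from airflow.operators.python import ShortCircuitOperator
from airflow.utils.state import State
from airflow.utils.trigger_rule import TriggerRule

def make_gate(upstream_task_id):
    # Returns a ShortCircuitOperator that skips everything downstream of it
    # when the named upstream task ended in the FAILED state.
    def proceed(**context):
        ti = context["dag_run"].get_task_instance(upstream_task_id)
        return ti.state != State.FAILED

    return ShortCircuitOperator(
        task_id=f"gate_after_{upstream_task_id}",
        python_callable=proceed,
        trigger_rule=TriggerRule.ALL_DONE,
    )

# usage inside the DAG context: a >> make_gate("archive_denormalized_es_data") >> b >> c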

Airflow BranchPythonOperator

I'm trying to run tasks in parallel, but I know BranchPythonOperator returns only one branch. My problem is: how can I return more than one task if necessary?
Here is my DAG:
If I have only one file it works fine for this case. But if I have two or more files, it executes only one task and skips all the others. I'd like to run the relevant tasks in parallel: if I have 4 files, I need to run the 4 corresponding tasks in parallel and skip the others.
How can I do something like this?
My code:
import datetime as dt
from airflow import DAG
import shutil
import os
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.operators.dagrun_operator import TriggerDagRunOperator
scriptAirflow = '/home/alexw/scriptAirflow/testFile/'
uploadPath='/apps/lv-manuf2020-data/80_DATA/00_Loading/'
receiptPath= '/apps/lv-manuf2020-data/80_DATA/01_Receipt/'
allReceiptFiles=os.listdir(receiptPath)
branchTask=['kpi_opj_data', 'material_mvke','material_mara','material_mbew','material_marm','material_mdma','material_marc','material_mard']
def parseFileName(file):
    splitFile = file.split('_')
    baseName = splitFile[2:]
    newBaseName = '_'.join(baseName)
    formatDate = newBaseName.split('-')
    baseFileName = formatDate[0].lower()
    return baseFileName

def onlyCsvFiles():
    if(os.listdir(uploadPath)):
        for files in os.listdir(uploadPath):
            if(files.startswith('MEM') and files.endswith('.csv') or files.startswith('FMS') and files.endswith('.csv')):
                shutil.move(uploadPath+files, receiptPath)
                print(files+' moved in ' + receiptPath+files)
        for files in os.listdir(receiptPath):
            if(files.startswith('MEM') and files.endswith('.csv') or files.startswith('FMS') and files.endswith('.csv')):
                return "result_mv"
            else:
                return "no_file_timeout"
    else:
        print('No file in upload_00')

def result():
    if allReceiptFiles:
        mem_flag = False
        fms_flag = False
        for files in allReceiptFiles:
            if (files.startswith('MEM') and files.endswith('.csv')):
                mem_flag = True
            if (files.startswith('FMS') and files.endswith('.csv')):
                fms_flag = True
        if mem_flag and fms_flag:
            return "run_both_scripts"
        if mem_flag:
            return "run_for_mem"
        if fms_flag:
            return "run_for_fms"
    else:
        print('No script to launch')
        pass

def returnGoodBranch():
    checkScript = []
    for files in os.listdir(receiptPath):
        newFiles = parseFileName(files)
        checkScript.append(newFiles)
    for scriptFiles in checkScript:
        if scriptFiles.startswith(scriptFiles):
            return scriptFiles

default_args = {
    'owner': 'testParallel',
    'start_date': dt.datetime(2020, 2, 17),
    'retries': 1,
}

dag = DAG('testParallel', default_args=default_args, description='airflow_manuf2020_v4',
          schedule_interval=None, catchup=False)
file_sensor = FileSensor(
task_id="file_sensor",
filepath=uploadPath,
fs_conn_id='airflow_db',
poke_interval=10,
dag=dag,
)
move_csv = BranchPythonOperator(
task_id='move_csv',
python_callable=onlyCsvFiles,
trigger_rule='none_failed',
dag=dag,
)
result_mv = BranchPythonOperator(
task_id='result_mv',
python_callable=result,
trigger_rule='none_failed',
dag=dag,
)
run_Mem_Script = DummyOperator(
task_id="run_for_mem",
dag=dag,
)
kpi_obj_data = BashOperator(
task_id='kpi_obj_data',
bash_command='python3 '+scriptAirflow+'kpi_obj_data.py "{{ execution_date }}"',
trigger_rule='one_success',
dag=dag,
)
run_Fms_Script = BranchPythonOperator(
task_id="run_for_fms",
python_callable=returnGoodBranch,
trigger_rule='all_success',
dag=dag,
)
material_makt = BashOperator(
task_id="material_makt",
bash_command='python3 '+scriptAirflow+'material_makt.py "{{ execution_date }}"',
trigger_rule='one_success',
dag=dag,
)
material_mara = BashOperator(
task_id="material_mara",
bash_command='python3 '+scriptAirflow+'material_mara.py "{{ execution_date }}"',
trigger_rule='one_success',
dag=dag,
)
material_marc = BashOperator(
task_id="material_marc",
bash_command='python3 '+scriptAirflow+'material_marc.py "{{ execution_date }}"',
trigger_rule='one_success',
dag=dag,
)
material_mard = BashOperator(
task_id="material_mard",
bash_command='python3 '+scriptAirflow+'material_mard.py "{{ execution_date }}"',
trigger_rule='one_success',
dag=dag,
)
material_marm = BashOperator(
task_id="material_marm",
bash_command='python3 '+scriptAirflow+'material_marm.py "{{ execution_date }}"',
trigger_rule='one_success',
dag=dag,
)
material_mbew = BashOperator(
task_id="material_mbew",
bash_command='python3 '+scriptAirflow+'material_mbew.py "{{ execution_date }}"',
trigger_rule='one_success',
dag=dag,
)
material_mdma = BashOperator(
task_id="material_mdma",
bash_command='python3 '+scriptAirflow+'material_mdma.py "{{ execution_date }}"',
trigger_rule='one_success',
dag=dag,
)
material_mvke = BashOperator(
task_id="material_mvke",
bash_command='python3 '+scriptAirflow+'material_mvke.py "{{ execution_date }}"',
trigger_rule='one_success',
dag=dag,
)
run_both_scripts = DummyOperator(
task_id="run_both_scripts",
dag=dag,
)
no_file_timeout= BashOperator(
task_id="no_file_timeout",
bash_command='sleep 300',
trigger_rule='all_done',
dag=dag,
)
rerun_dag_no_file = TriggerDagRunOperator(
task_id='rerun_dag_no_file',
trigger_dag_id='testParallel',
trigger_rule='all_success',
dag=dag,
)
checking_file= DummyOperator(
task_id='file_ok',
trigger_rule='all_done',
dag=dag,
)
rerun_dag=TriggerDagRunOperator(
task_id='rerun_dag',
trigger_dag_id='testParallel',
trigger_rule='all_done',
dag=dag,
)
move_csv.set_upstream(file_sensor)
result_mv.set_upstream(move_csv)
no_file_timeout.set_upstream(move_csv)
run_both_scripts.set_upstream(result_mv)
run_Fms_Script.set_upstream(result_mv)
run_Mem_Script.set_upstream(result_mv)
kpi_obj_data.set_upstream(run_Mem_Script)
kpi_obj_data.set_upstream(run_both_scripts)
material_makt.set_upstream(run_both_scripts)
material_mara.set_upstream(run_both_scripts)
material_marc.set_upstream(run_both_scripts)
material_mard.set_upstream(run_both_scripts)
material_marm.set_upstream(run_both_scripts)
material_mbew.set_upstream(run_both_scripts)
material_mdma.set_upstream(run_both_scripts)
material_mvke.set_upstream(run_both_scripts)
material_makt.set_upstream(run_Fms_Script)
material_mara.set_upstream(run_Fms_Script)
material_marc.set_upstream(run_Fms_Script)
material_mard.set_upstream(run_Fms_Script)
material_marm.set_upstream(run_Fms_Script)
material_mbew.set_upstream(run_Fms_Script)
material_mdma.set_upstream(run_Fms_Script)
material_mvke.set_upstream(run_Fms_Script)
checking_file.set_upstream(material_mvke)
checking_file.set_upstream(material_makt)
checking_file.set_upstream(material_mara)
checking_file.set_upstream(material_marc)
checking_file.set_upstream(material_mard)
checking_file.set_upstream(material_marm)
checking_file.set_upstream(material_mbew)
checking_file.set_upstream(material_mdma)
checking_file.set_upstream(material_mvke)
checking_file.set_upstream(kpi_obj_data)
rerun_dag.set_upstream(checking_file)
rerun_dag_no_file.set_upstream(no_file_timeout)
The tasks are BashOperators that call a Python script.
I'm stuck on this, so if someone has a solution, thanks a lot!
The BranchPythonOperator can return a list of task ids.
For example, if you want to execute material_marm, material_mbew and material_mdma,
you just need to return those task ids from your Python callable function:
return ["material_marm", "material_mbew", "material_mdma"]
If you want to learn more about the BranchPythonOperator, check my post, I believe it will help you :)
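Applied to the DAG in the question, the result callable used by result_mv could return a list instead of a single id. This is only a sketch, assuming a recent Airflow version where returning None from a branch callable skips all downstream branches; the result_multi name is illustrative, not from the answer:

def result_multi():
    # Collect every branch that should run; BranchPythonOperator skips the rest.
    branches = set()
    for f in allReceiptFiles:
        if f.startswith("MEM") and f.endswith(".csv"):
            branches.add("run_for_mem")
        if f.startswith("FMS") and f.endswith(".csv"):
            branches.add("run_for_fms")
    return list(branches) or None  # e.g. ["run_for_mem", "run_for_fms"]; None skips all branches

With a list return like this, the separate run_both_scripts branch is no longer needed, since both branches can be returned together.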

Resources