I'm trying to run tasks in parallel, but I know the BranchPythonOperator returns only one branch. My problem is: how can I return more than one task if necessary?
Here is my DAG:
If I have only one file it works fine for this case. But if I have two or more files, it executes only one task and skips all the others. I'd like to run the relevant tasks in parallel: if I have 4 files, I need to run those 4 tasks in parallel and skip the others.
How can I do something like this?
My code:
import datetime as dt
from airflow import DAG
import shutil
import os
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.operators.dagrun_operator import TriggerDagRunOperator
scriptAirflow = '/home/alexw/scriptAirflow/testFile/'
uploadPath='/apps/lv-manuf2020-data/80_DATA/00_Loading/'
receiptPath= '/apps/lv-manuf2020-data/80_DATA/01_Receipt/'
allReceiptFiles=os.listdir(receiptPath)
branchTask=['kpi_opj_data', 'material_mvke','material_mara','material_mbew','material_marm','material_mdma','material_marc','material_mard']
def parseFileName(file):
    splitFile = file.split('_')
    baseName = splitFile[2:]
    newBaseName = '_'.join(baseName)
    formatDate = newBaseName.split('-')
    baseFileName = formatDate[0].lower()
    return baseFileName
def onlyCsvFiles():
    if(os.listdir(uploadPath)):
        for files in os.listdir(uploadPath):
            if(files.startswith('MEM') and files.endswith('.csv') or files.startswith('FMS') and files.endswith('.csv')):
                shutil.move(uploadPath+files, receiptPath)
                print(files+' moved in ' + receiptPath+files)
        for files in os.listdir(receiptPath):
            if(files.startswith('MEM') and files.endswith('.csv') or files.startswith('FMS') and files.endswith('.csv')):
                return "result_mv"
            else:
                return "no_file_timeout"
    else:
        print('No file in upload_00')
def result():
    if allReceiptFiles:
        mem_flag = False
        fms_flag = False
        for files in allReceiptFiles:
            if (files.startswith('MEM') and files.endswith('.csv')):
                mem_flag = True
            if (files.startswith('FMS') and files.endswith('.csv')):
                fms_flag = True
        if mem_flag and fms_flag:
            return "run_both_scripts"
        if mem_flag:
            return "run_for_mem"
        if fms_flag:
            return "run_for_fms"
    else:
        print('No script to launch')
        pass
def returnGoodBranch():
    checkScript = []
    for files in os.listdir(receiptPath):
        newFiles = parseFileName(files)
        checkScript.append(newFiles)
    for scriptFiles in checkScript:
        if scriptFiles.startswith(scriptFiles):
            return scriptFiles
default_args = {
    'owner': 'testParallel',
    'start_date': dt.datetime(2020, 2, 17),
    'retries': 1,
}

dag = DAG('testParallel', default_args=default_args, description='airflow_manuf2020_v4',
          schedule_interval=None, catchup=False)
file_sensor = FileSensor(
    task_id="file_sensor",
    filepath=uploadPath,
    fs_conn_id='airflow_db',
    poke_interval=10,
    dag=dag,
)

move_csv = BranchPythonOperator(
    task_id='move_csv',
    python_callable=onlyCsvFiles,
    trigger_rule='none_failed',
    dag=dag,
)

result_mv = BranchPythonOperator(
    task_id='result_mv',
    python_callable=result,
    trigger_rule='none_failed',
    dag=dag,
)

run_Mem_Script = DummyOperator(
    task_id="run_for_mem",
    dag=dag,
)

kpi_obj_data = BashOperator(
    task_id='kpi_obj_data',
    bash_command='python3 '+scriptAirflow+'kpi_obj_data.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

run_Fms_Script = BranchPythonOperator(
    task_id="run_for_fms",
    python_callable=returnGoodBranch,
    trigger_rule='all_success',
    dag=dag,
)
material_makt = BashOperator(
    task_id="material_makt",
    bash_command='python3 '+scriptAirflow+'material_makt.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

material_mara = BashOperator(
    task_id="material_mara",
    bash_command='python3 '+scriptAirflow+'material_mara.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

material_marc = BashOperator(
    task_id="material_marc",
    bash_command='python3 '+scriptAirflow+'material_marc.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

material_mard = BashOperator(
    task_id="material_mard",
    bash_command='python3 '+scriptAirflow+'material_mard.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

material_marm = BashOperator(
    task_id="material_marm",
    bash_command='python3 '+scriptAirflow+'material_marm.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

material_mbew = BashOperator(
    task_id="material_mbew",
    bash_command='python3 '+scriptAirflow+'material_mbew.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

material_mdma = BashOperator(
    task_id="material_mdma",
    bash_command='python3 '+scriptAirflow+'material_mdma.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

material_mvke = BashOperator(
    task_id="material_mvke",
    bash_command='python3 '+scriptAirflow+'material_mvke.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)
run_both_scripts = DummyOperator(
    task_id="run_both_scripts",
    dag=dag,
)

no_file_timeout = BashOperator(
    task_id="no_file_timeout",
    bash_command='sleep 300',
    trigger_rule='all_done',
    dag=dag,
)

rerun_dag_no_file = TriggerDagRunOperator(
    task_id='rerun_dag_no_file',
    trigger_dag_id='testParallel',
    trigger_rule='all_success',
    dag=dag,
)

checking_file = DummyOperator(
    task_id='file_ok',
    trigger_rule='all_done',
    dag=dag,
)

rerun_dag = TriggerDagRunOperator(
    task_id='rerun_dag',
    trigger_dag_id='testParallel',
    trigger_rule='all_done',
    dag=dag,
)
move_csv.set_upstream(file_sensor)
result_mv.set_upstream(move_csv)
no_file_timeout.set_upstream(move_csv)
run_both_scripts.set_upstream(result_mv)
run_Fms_Script.set_upstream(result_mv)
run_Mem_Script.set_upstream(result_mv)
kpi_obj_data.set_upstream(run_Mem_Script)
kpi_obj_data.set_upstream(run_both_scripts)
material_makt.set_upstream(run_both_scripts)
material_mara.set_upstream(run_both_scripts)
material_marc.set_upstream(run_both_scripts)
material_mard.set_upstream(run_both_scripts)
material_marm.set_upstream(run_both_scripts)
material_mbew.set_upstream(run_both_scripts)
material_mdma.set_upstream(run_both_scripts)
material_mvke.set_upstream(run_both_scripts)
material_makt.set_upstream(run_Fms_Script)
material_mara.set_upstream(run_Fms_Script)
material_marc.set_upstream(run_Fms_Script)
material_mard.set_upstream(run_Fms_Script)
material_marm.set_upstream(run_Fms_Script)
material_mbew.set_upstream(run_Fms_Script)
material_mdma.set_upstream(run_Fms_Script)
material_mvke.set_upstream(run_Fms_Script)
checking_file.set_upstream(material_mvke)
checking_file.set_upstream(material_makt)
checking_file.set_upstream(material_mara)
checking_file.set_upstream(material_marc)
checking_file.set_upstream(material_mard)
checking_file.set_upstream(material_marm)
checking_file.set_upstream(material_mbew)
checking_file.set_upstream(material_mdma)
checking_file.set_upstream(material_mvke)
checking_file.set_upstream(kpi_obj_data)
rerun_dag.set_upstream(checking_file)
rerun_dag_no_file.set_upstream(no_file_timeout)
The tasks are BashOperators that call Python scripts.
I'm stuck on this; if someone has a solution, thanks a lot!
The BranchPythonOperator can return a list of task ids.
For example, if you want to execute material_marm, material_mbew and material_mdma,
you just need to return those task ids from your Python callable function:
return ["material_marm", "material_mbew", "material_mdma"]
If you want to learn more about the BranchPythonOperator, check my post, I believe it will help you :)
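A sketch of how the branch callable could collect every matching task id instead of stopping at the first one, assuming the parseFileName helper, receiptPath and the branchTask list from the question, and that each parsed file name corresponds to a downstream task id:

def returnGoodBranch():
    selected = []
    for files in os.listdir(receiptPath):
        task_id = parseFileName(files)  # e.g. 'material_marm'
        # Keep only names that correspond to a downstream task id, once each.
        if task_id in branchTask and task_id not in selected:
            selected.append(task_id)
    # Returning the whole list makes Airflow schedule all of these tasks
    # and skip the other direct downstream tasks of the branch operator.
    return selected

Every task id in the returned list is scheduled, so four matching files give you four tasks running in parallel, while the remaining downstream tasks of the branch operator are skipped.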
I wrote a custom operator called HadoopPutHdfs in Airflow.
I need to pass an xxx parameter to HadoopPutHdfs and fill xxx with the return value of the generate_file_path task.
with DAG(dag_id='my_custom_operator_dag', schedule_interval='1 * * * *', default_args=default_args, catchup=False) as dag:
    generate_file_path = PythonOperator(
        task_id='generate_file_path',
        python_callable=generate_file_path_func,
        dag=dag,
    )
    put_to_hdfs = HadoopPutHdfs(
        task_id='put_to_hdfs',
        headers={'Content-Type': 'text/plain'},
        hdfs_path='webhdfs/v1/user/hive/13.zip',
        hadoop_host='10.10.10.146',
        hadoop_port=9870,
        source_path='/opt/airflow/dags/1.zip',
        dag=dag,
        xxx="{{ ti.xcom_pull(task_ids=['generate_file_path']) }}",
    )
This line does not work:
xxx= "{{ ti.xcom_pull(task_ids=['generate_file_path']) }}"
How can I pass the return value of the generate_file_path function to the xxx parameter?
Sounds like you are missing the definition of xxx as a template_field in your custom operator. For example:
from airflow.models import BaseOperator


class CustomDummyOperator(BaseOperator):
    template_fields = ('msg_from_previous_task',)

    def __init__(self,
                 msg_from_previous_task,
                 *args, **kwargs) -> None:
        super(CustomDummyOperator, self).__init__(*args, **kwargs)
        self.msg_from_previous_task = msg_from_previous_task

    def execute(self, context):
        print(f"Message: {self.msg_from_previous_task}")
DAG:
def return_a_str():
    return "string_value_from_op1"


task_1 = PythonOperator(
    task_id='task_1',
    dag=dag,
    python_callable=return_a_str,
)

task_2 = CustomDummyOperator(
    task_id='task_2',
    dag=dag,
    msg_from_previous_task="{{ ti.xcom_pull(task_ids='task_1') }}"
)
The output from task_2 is: Message: string_value_from_op1
You could use XcomArg for a cleaner syntax:
task_2 = CustomDummyOperator(
    task_id='task_2',
    dag=dag,
    msg_from_previous_task=task_1.output
    # msg_from_previous_task="{{ ti.xcom_pull(task_ids='task_1') }}"
)
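Applied to the operator from the question, the fix would look roughly like this. This is only a sketch: the constructor arguments mirror the ones shown in the question, and the body of execute is left as a placeholder for your upload logic.

from airflow.models import BaseOperator


class HadoopPutHdfs(BaseOperator):
    # Listing xxx here makes Airflow render the Jinja template before execute() runs.
    template_fields = ('xxx',)

    def __init__(self, xxx, source_path, hdfs_path, hadoop_host, hadoop_port,
                 headers=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.xxx = xxx
        self.source_path = source_path
        self.hdfs_path = hdfs_path
        self.hadoop_host = hadoop_host
        self.hadoop_port = hadoop_port
        self.headers = headers

    def execute(self, context):
        # self.xxx now holds the rendered value pulled from XCom.
        self.log.info("xxx resolved to: %s", self.xxx)
        # ... your existing upload logic goes here ...

Note that xcom_pull(task_ids=['generate_file_path']) with a list returns a list of values; if you want the bare return value of the task, pass the task id as a plain string: "{{ ti.xcom_pull(task_ids='generate_file_path') }}".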
Can we see or get the output of a SQL statement executed with the JdbcOperator?
with DAG(dag_id='Exasol_DB_Checks', schedule_interval='@hourly', default_args=default_args, catchup=False, template_searchpath=tmpl_search_path) as dag:

    start_task = DummyOperator(task_id='start_task', dag=dag)

    sql_task_1 = JdbcOperator(task_id='sql_cmd',
                              jdbc_conn_id='Exasol_db',
                              sql=['select current_timestamp;', 'select current_user from DUAL;', "test.sql"],
                              autocommit=True,
                              params={"my_param": "{{ var.value.source_path }}"}
                              )

    start_task >> sql_task_1
Maybe you can use a JdbcHook inside a PythonOperator for your needs:
from airflow.hooks.jdbc_hook import JdbcHook
from airflow.operators.python_operator import PythonOperator


def do_work():
    jdbc_hook = JdbcHook(jdbc_conn_id="some_db")
    jdbc_conn = jdbc_hook.get_conn()
    jdbc_cursor = jdbc_conn.cursor()
    jdbc_cursor.execute('SELECT ......')
    row = jdbc_cursor.fetchone()[0]


task = PythonOperator(
    task_id='task1',
    python_callable=do_work,
    dag=dag
)
https://airflow.apache.org/docs/stable/concepts.html#hooks
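If you also want to see the query result in the UI or use it downstream, you can return it from the callable: the PythonOperator pushes its return value to XCom, where it appears under the task's XCom tab and can be pulled by other tasks. A minimal sketch, with a placeholder connection id and query (the result must be small and serializable):

from airflow.hooks.jdbc_hook import JdbcHook
from airflow.operators.python_operator import PythonOperator


def fetch_rows():
    jdbc_hook = JdbcHook(jdbc_conn_id="some_db")  # placeholder connection id
    rows = jdbc_hook.get_records("select current_timestamp")  # placeholder query
    print(rows)
    return rows  # the return value is pushed to XCom automatically


fetch_task = PythonOperator(
    task_id='fetch_rows',
    python_callable=fetch_rows,
    dag=dag,
)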
Here is an Airflow operator example:
t3 = BashOperator(
    task_id='templated',
    params={'my_param': 'Parameter I passed in'},
    dag=dag,
)
Is it possible to use params inside params, like this?
t3 = BashOperator(
    task_id='templated',
    params={'my_param': 'Parameter I passed in',
            'my_param2': '{{ params.myparam }} again'},
    dag=dag,
)
Some of my params depend on others. Not sure the best way to do it.
Is it OK if I use a macro in params?
t3 = BashOperator(
    task_id='templated',
    params={"epoch": "{{ next_execution_date.int_timestamp }}"},
    dag=dag,
)
This, however, is not working properly, and I should admit that it is my first time working with Python. Any help would be very useful. I have put together a test DAG to do the following, but it does not work:
run task t1 and return a value
run task t2 only if the value returned by t1 is 0
from datetime import datetime
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator


def set_trigger(taskid, **kwargs):
    xcomValue = {{ task_instance.xcom_pull(task_ids=taskid) }}
    print(xcomValue, " <------- LOOK HERE XCOM VAR")
    if (xcomValue == "0"):
        return TriggerRule.ALL_SUCCESS
    return TriggerRule.ALL_FAILED


dag = DAG(dag_id="example_bash_operator", schedule_interval=None, start_date=datetime(2018, 12, 31))

t1 = BashOperator(
    task_id="t1",
    bash_command='do something && echo 0 ',
    dag=dag
)

t2 = BashOperator(
    task_id="t2",
    bash_command='do something else here ',
    trigger_rule=set_trigger,
    dag=dag,
)

t1 >> t2
Why not use a BranchPythonOperator (docs)?
This way you only run t2 if the value returned by t1 is 0:
from datetime import datetime
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import BranchPythonOperator

dag = DAG(dag_id="example_bash_operator", schedule_interval=None, start_date=datetime(2018, 12, 31))

t1 = BashOperator(
    task_id="t1",
    bash_command='do something && echo 0 ',
    dag=dag
)


def branch_func(**kwargs):
    ti = kwargs['ti']
    xcom_value = int(ti.xcom_pull(task_ids='t1'))
    if xcom_value == 0:
        return 't2'


check_t1 = BranchPythonOperator(
    task_id='check_t1',
    provide_context=True,
    python_callable=branch_func,
    dag=dag)

t2 = BashOperator(
    task_id="t2",
    bash_command='do something else here ',
    dag=dag,
)

t1 >> check_t1 >> t2
I'm trying to run tasks independently and in parallel.
My DAG looks like this:
                                     ---> patternA ---> file1a
                                                   ---> file2a
                                                   ---> file3a
sensor ---> move_csv ---> result_mv  ---> rerun_dag
                                     ---> patternB ---> file1b
                                                   ---> file2b
                                                   ---> file3b
My dag.py:
sensor = FileSensor(
    task_id="sensor",
    filepath=filePath,
    fs_conn_id='airflow_db',
    poke_interval=10,
    dag=dag,
)

move_csv = BranchPythonOperator(
    task_id='move_csv',
    python_callable=moveCsvFile,
    trigger_rule='none_failed',
    dag=dag,
)

result_mv = BranchPythonOperator(
    task_id='result_mv',
    python_callable=result,
    trigger_rule='none_failed',
    dag=dag,
)

patternA = DummyOperator(
    task_id="pattern_A",
    dag=dag,
)

patternB = DummyOperator(
    task_id="pattern_B",
    dag=dag,
)

file1a = BashOperator(
    task_id="file1a",
    bash_command='python3 '+scriptPath+'file1.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

file2a = BashOperator(
    task_id="file2a",
    bash_command='python3 '+scriptPath+'file2.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

file3a = BashOperator(
    task_id="file3a",
    bash_command='python3 '+scriptPath+'file3.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

file1b = BashOperator(
    task_id="file1b",
    bash_command='python3 '+scriptPath+'file1b.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

file2b = BashOperator(
    task_id="file2b",
    bash_command='python3 '+scriptPath+'file2b.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

file3b = BashOperator(
    task_id="file3b",
    bash_command='python3 '+scriptPath+'file3b.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)
move_csv.set_upstream(sensor)
result_mv.set_upstream(move_csv)
patternA.set_upstream(result_mv)
patternB.set_upstream(result_mv)
file1a.set_upstream(patternA)
file2a.set_upstream(patternA)
file3a.set_upstream(patternA)
file1b.set_upstream(patternB)
file2b.set_upstream(patternB)
file3b.set_upstream(patternB)
rerun.set_upstream(...)  # from all the file tasks
What is the best way, in patternA, to skip file2a and file3a if only file1a has a matching file?
And if file1a and file2a both have matching files, I'd like to run them in parallel and skip file3a.
My file tasks run Python scripts called with a BashOperator.
Thanks for the help! :)
You can use the BranchPythonOperator to skip tasks.
More detail here.
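A minimal sketch of what the branch callable for result_mv could look like, assuming the six file tasks are wired directly downstream of result_mv and that a file can be mapped to its task id by name (the mapping rule and the receiptPath variable below are hypothetical; adapt them to your real directory and file names):

import os

def choose_file_tasks():
    candidate_tasks = ['file1a', 'file2a', 'file3a', 'file1b', 'file2b', 'file3b']
    present = os.listdir(receiptPath)  # receiptPath: wherever move_csv drops the files
    selected = []
    for task_id in candidate_tasks:
        # Hypothetical rule: run the task if some file name contains its id.
        if any(task_id in f.lower() for f in present):
            selected.append(task_id)
    return selected  # all returned task ids run in parallel; the rest are skipped

result_mv = BranchPythonOperator(
    task_id='result_mv',
    python_callable=choose_file_tasks,
    trigger_rule='none_failed',
    dag=dag,
)

With this, if only file1a's file is present, file2a and file3a (and the b tasks) are skipped; if file1a and file2a both match, those two tasks run in parallel and file3a is skipped.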