I'm trying to run tasks in parallel, but I know the BranchPythonOperator returns only one branch. My problem is: how can I return more than one task if necessary?
Here is my DAG:
If I have only one file it works fine for this case. But if I have two or more files, it executes only one task and skips all the others. I'd like to run the relevant tasks in parallel: if I have 4 files, I need to run those 4 tasks in parallel and skip the others.
How can I do something like this?
My code:
import datetime as dt
from airflow import DAG
import shutil
import os
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.operators.dagrun_operator import TriggerDagRunOperator
scriptAirflow = '/home/alexw/scriptAirflow/testFile/'
uploadPath='/apps/lv-manuf2020-data/80_DATA/00_Loading/'
receiptPath= '/apps/lv-manuf2020-data/80_DATA/01_Receipt/'
allReceiptFiles=os.listdir(receiptPath)
branchTask=['kpi_opj_data', 'material_mvke','material_mara','material_mbew','material_marm','material_mdma','material_marc','material_mard']
def parseFileName(file):
    splitFile = file.split('_')
    baseName = splitFile[2:]
    newBaseName = '_'.join(baseName)
    formatDate = newBaseName.split('-')
    baseFileName = formatDate[0].lower()
    return baseFileName
def onlyCsvFiles():
    if(os.listdir(uploadPath)):
        for files in os.listdir(uploadPath):
            if(files.startswith('MEM') and files.endswith('.csv') or files.startswith('FMS') and files.endswith('.csv')):
                shutil.move(uploadPath+files, receiptPath)
                print(files+' moved in ' + receiptPath+files)
        for files in os.listdir(receiptPath):
            if(files.startswith('MEM') and files.endswith('.csv') or files.startswith('FMS') and files.endswith('.csv')):
                return "result_mv"
            else:
                return "no_file_timeout"
    else:
        print('No file in upload_00')
def result():
    if allReceiptFiles:
        mem_flag = False
        fms_flag = False
        for files in allReceiptFiles:
            if (files.startswith('MEM') and files.endswith('.csv')):
                mem_flag = True
            if (files.startswith('FMS') and files.endswith('.csv')):
                fms_flag = True
        if mem_flag and fms_flag:
            return "run_both_scripts"
        if mem_flag:
            return "run_for_mem"
        if fms_flag:
            return "run_for_fms"
    else:
        print('No script to launch')
        pass
def returnGoodBranch():
    checkScript = []
    for files in os.listdir(receiptPath):
        newFiles = parseFileName(files)
        checkScript.append(newFiles)
    for scriptFiles in checkScript:
        if scriptFiles.startswith(scriptFiles):
            return scriptFiles
default_args = {
    'owner': 'testParallel',
    'start_date': dt.datetime(2020, 2, 17),
    'retries': 1,
}

dag = DAG('testParallel', default_args=default_args, description='airflow_manuf2020_v4',
          schedule_interval=None, catchup=False)
file_sensor = FileSensor(
    task_id="file_sensor",
    filepath=uploadPath,
    fs_conn_id='airflow_db',
    poke_interval=10,
    dag=dag,
)

move_csv = BranchPythonOperator(
    task_id='move_csv',
    python_callable=onlyCsvFiles,
    trigger_rule='none_failed',
    dag=dag,
)

result_mv = BranchPythonOperator(
    task_id='result_mv',
    python_callable=result,
    trigger_rule='none_failed',
    dag=dag,
)

run_Mem_Script = DummyOperator(
    task_id="run_for_mem",
    dag=dag,
)

kpi_obj_data = BashOperator(
    task_id='kpi_obj_data',
    bash_command='python3 '+scriptAirflow+'kpi_obj_data.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

run_Fms_Script = BranchPythonOperator(
    task_id="run_for_fms",
    python_callable=returnGoodBranch,
    trigger_rule='all_success',
    dag=dag,
)
material_makt = BashOperator(
    task_id="material_makt",
    bash_command='python3 '+scriptAirflow+'material_makt.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

material_mara = BashOperator(
    task_id="material_mara",
    bash_command='python3 '+scriptAirflow+'material_mara.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

material_marc = BashOperator(
    task_id="material_marc",
    bash_command='python3 '+scriptAirflow+'material_marc.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

material_mard = BashOperator(
    task_id="material_mard",
    bash_command='python3 '+scriptAirflow+'material_mard.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

material_marm = BashOperator(
    task_id="material_marm",
    bash_command='python3 '+scriptAirflow+'material_marm.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

material_mbew = BashOperator(
    task_id="material_mbew",
    bash_command='python3 '+scriptAirflow+'material_mbew.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

material_mdma = BashOperator(
    task_id="material_mdma",
    bash_command='python3 '+scriptAirflow+'material_mdma.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

material_mvke = BashOperator(
    task_id="material_mvke",
    bash_command='python3 '+scriptAirflow+'material_mvke.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)
run_both_scripts = DummyOperator(
    task_id="run_both_scripts",
    dag=dag,
)

no_file_timeout = BashOperator(
    task_id="no_file_timeout",
    bash_command='sleep 300',
    trigger_rule='all_done',
    dag=dag,
)

rerun_dag_no_file = TriggerDagRunOperator(
    task_id='rerun_dag_no_file',
    trigger_dag_id='testParallel',
    trigger_rule='all_success',
    dag=dag,
)

checking_file = DummyOperator(
    task_id='file_ok',
    trigger_rule='all_done',
    dag=dag,
)

rerun_dag = TriggerDagRunOperator(
    task_id='rerun_dag',
    trigger_dag_id='testParallel',
    trigger_rule='all_done',
    dag=dag,
)
move_csv.set_upstream(file_sensor)
result_mv.set_upstream(move_csv)
no_file_timeout.set_upstream(move_csv)
run_both_scripts.set_upstream(result_mv)
run_Fms_Script.set_upstream(result_mv)
run_Mem_Script.set_upstream(result_mv)
kpi_obj_data.set_upstream(run_Mem_Script)
kpi_obj_data.set_upstream(run_both_scripts)
material_makt.set_upstream(run_both_scripts)
material_mara.set_upstream(run_both_scripts)
material_marc.set_upstream(run_both_scripts)
material_mard.set_upstream(run_both_scripts)
material_marm.set_upstream(run_both_scripts)
material_mbew.set_upstream(run_both_scripts)
material_mdma.set_upstream(run_both_scripts)
material_mvke.set_upstream(run_both_scripts)
material_makt.set_upstream(run_Fms_Script)
material_mara.set_upstream(run_Fms_Script)
material_marc.set_upstream(run_Fms_Script)
material_mard.set_upstream(run_Fms_Script)
material_marm.set_upstream(run_Fms_Script)
material_mbew.set_upstream(run_Fms_Script)
material_mdma.set_upstream(run_Fms_Script)
material_mvke.set_upstream(run_Fms_Script)
checking_file.set_upstream(material_mvke)
checking_file.set_upstream(material_makt)
checking_file.set_upstream(material_mara)
checking_file.set_upstream(material_marc)
checking_file.set_upstream(material_mard)
checking_file.set_upstream(material_marm)
checking_file.set_upstream(material_mbew)
checking_file.set_upstream(material_mdma)
checking_file.set_upstream(material_mvke)
checking_file.set_upstream(kpi_obj_data)
rerun_dag.set_upstream(checking_file)
rerun_dag_no_file.set_upstream(no_file_timeout)
The tasks are BashOperators that call Python scripts.
I'm stuck on this; if someone has a solution, thanks a lot!
The BranchPythonOperator can return a list of task ids.
For example, if you want to execute material_marm, material_mbew and material_mdma,
you just need to return those task ids from your Python callable function:
return ["material_marm", "material_mbew", "material_mdma"]
If you want to learn more about the BranchPythonOperator, check my post, I believe it will help you :)
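A sketch of how the branch callable could collect every matching task id instead of stopping at the first one, assuming the parseFileName helper, receiptPath and the branchTask list from the question, and that each parsed file name corresponds to a downstream task id:

def returnGoodBranch():
    selected = []
    for files in os.listdir(receiptPath):
        task_id = parseFileName(files)  # e.g. 'material_marm'
        # Keep only names that correspond to a downstream task id, once each.
        if task_id in branchTask and task_id not in selected:
            selected.append(task_id)
    # Returning the whole list makes Airflow schedule all of these tasks
    # and skip the other direct downstream tasks of the branch operator.
    return selected

Every task id in the returned list is scheduled, so four matching files give you four tasks running in parallel, while the remaining downstream tasks of the branch operator are skipped.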
I wrote a custom operator called HadoopPutHdfs in Airflow.
I need to pass an xxx parameter to HadoopPutHdfs and fill xxx with the return value of the generate_file_path task.
with DAG(dag_id='my_custom_operator_dag', schedule_interval='1 * * * *', default_args=default_args, catchup=False) as dag:
    generate_file_path = PythonOperator(
        task_id='generate_file_path',
        python_callable=generate_file_path_func,
        dag=dag,
    )
    put_to_hdfs = HadoopPutHdfs(
        task_id='put_to_hdfs',
        headers={'Content-Type': 'text/plain'},
        hdfs_path='webhdfs/v1/user/hive/13.zip',
        hadoop_host='10.10.10.146',
        hadoop_port=9870,
        source_path='/opt/airflow/dags/1.zip',
        dag=dag,
        xxx="{{ ti.xcom_pull(task_ids=['generate_file_path']) }}",
    )
This line does not work:
xxx= "{{ ti.xcom_pull(task_ids=['generate_file_path']) }}"
How can I pass the return value of the generate_file_path function to the xxx parameter?
Sounds like you are missing the definition of xxx as a template_field in your custom operator. For example:
from airflow.models import BaseOperator


class CustomDummyOperator(BaseOperator):
    template_fields = ('msg_from_previous_task',)

    def __init__(self,
                 msg_from_previous_task,
                 *args, **kwargs) -> None:
        super(CustomDummyOperator, self).__init__(*args, **kwargs)
        self.msg_from_previous_task = msg_from_previous_task

    def execute(self, context):
        print(f"Message: {self.msg_from_previous_task}")
DAG:
def return_a_str():
    return "string_value_from_op1"


task_1 = PythonOperator(
    task_id='task_1',
    dag=dag,
    python_callable=return_a_str,
)

task_2 = CustomDummyOperator(
    task_id='task_2',
    dag=dag,
    msg_from_previous_task="{{ ti.xcom_pull(task_ids='task_1') }}"
)
The output from task_2 is: Message: string_value_from_op1
You could use XcomArg for a cleaner syntax:
task_2 = CustomDummyOperator(
    task_id='task_2',
    dag=dag,
    msg_from_previous_task=task_1.output
    # msg_from_previous_task="{{ ti.xcom_pull(task_ids='task_1') }}"
)
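Applied to the operator from the question, the fix would look roughly like this. This is only a sketch: the constructor arguments mirror the ones shown in the question, and the body of execute is left as a placeholder for your upload logic.

from airflow.models import BaseOperator


class HadoopPutHdfs(BaseOperator):
    # Listing xxx here makes Airflow render the Jinja template before execute() runs.
    template_fields = ('xxx',)

    def __init__(self, xxx, source_path, hdfs_path, hadoop_host, hadoop_port,
                 headers=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.xxx = xxx
        self.source_path = source_path
        self.hdfs_path = hdfs_path
        self.hadoop_host = hadoop_host
        self.hadoop_port = hadoop_port
        self.headers = headers

    def execute(self, context):
        # self.xxx now holds the rendered value pulled from XCom.
        self.log.info("xxx resolved to: %s", self.xxx)
        # ... your existing upload logic goes here ...

Note that xcom_pull(task_ids=['generate_file_path']) with a list returns a list of values; if you want the bare return value of the task, pass the task id as a plain string: "{{ ti.xcom_pull(task_ids='generate_file_path') }}".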
Can we see or get the output of a SQL statement executed with the JdbcOperator?
with DAG(dag_id='Exasol_DB_Checks', schedule_interval='@hourly', default_args=default_args, catchup=False, template_searchpath=tmpl_search_path) as dag:

    start_task = DummyOperator(task_id='start_task', dag=dag)

    sql_task_1 = JdbcOperator(task_id='sql_cmd',
                              jdbc_conn_id='Exasol_db',
                              sql=['select current_timestamp;', 'select current_user from DUAL;', "test.sql"],
                              autocommit=True,
                              params={"my_param": "{{ var.value.source_path }}"}
                              )

    start_task >> sql_task_1
Maybe you can use a JdbcHook inside a PythonOperator for your needs:
from airflow.hooks.jdbc_hook import JdbcHook
from airflow.operators.python_operator import PythonOperator


def do_work():
    jdbc_hook = JdbcHook(jdbc_conn_id="some_db")
    jdbc_conn = jdbc_hook.get_conn()
    jdbc_cursor = jdbc_conn.cursor()
    jdbc_cursor.execute('SELECT ......')
    row = jdbc_cursor.fetchone()[0]


task = PythonOperator(
    task_id='task1',
    python_callable=do_work,
    dag=dag
)
https://airflow.apache.org/docs/stable/concepts.html#hooks
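If you also want to see the query result in the UI or use it downstream, you can return it from the callable: the PythonOperator pushes its return value to XCom, where it appears under the task's XCom tab and can be pulled by other tasks. A minimal sketch, with a placeholder connection id and query (the result must be small and serializable):

from airflow.hooks.jdbc_hook import JdbcHook
from airflow.operators.python_operator import PythonOperator


def fetch_rows():
    jdbc_hook = JdbcHook(jdbc_conn_id="some_db")  # placeholder connection id
    rows = jdbc_hook.get_records("select current_timestamp")  # placeholder query
    print(rows)
    return rows  # the return value is pushed to XCom automatically


fetch_task = PythonOperator(
    task_id='fetch_rows',
    python_callable=fetch_rows,
    dag=dag,
)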
Here is an Airflow operator example:
t3 = BashOperator(
    task_id='templated',
    params={'my_param': 'Parameter I passed in'},
    dag=dag,
)
Is it possible to use params inside params, like this?
t3 = BashOperator(
    task_id='templated',
    params={'my_param': 'Parameter I passed in',
            'my_param2': '{{ params.myparam }} again'},
    dag=dag,
)
Some of my params depend on others. Not sure the best way to do it.
Is it OK if I use a macro in params?
t3 = BashOperator(
    task_id='templated',
    params={"epoch": "{{ next_execution_date.int_timestamp }}"},
    dag=dag,
)
This, however, is not working properly, and I should admit that it is my first time working with Python. Any help would be very useful. I have put together a test DAG to do the following, but it does not work:
run task t1 and return a value
run task t2 only if the value returned by t1 is 0
from datetime import datetime
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator


def set_trigger(taskid, **kwargs):
    xcomValue = {{ task_instance.xcom_pull(task_ids=taskid) }}
    print(xcomValue, " <------- LOOK HERE XCOM VAR")
    if (xcomValue == "0"):
        return TriggerRule.ALL_SUCCESS
    return TriggerRule.ALL_FAILED


dag = DAG(dag_id="example_bash_operator", schedule_interval=None, start_date=datetime(2018, 12, 31))

t1 = BashOperator(
    task_id="t1",
    bash_command='do something && echo 0 ',
    dag=dag
)

t2 = BashOperator(
    task_id="t2",
    bash_command='do something else here ',
    trigger_rule=set_trigger,
    dag=dag,
)

t1 >> t2
Why not use a BranchPythonOperator (docs)?
This way you only run t2 if the value returned by t1 is 0:
from datetime import datetime
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import BranchPythonOperator

dag = DAG(dag_id="example_bash_operator", schedule_interval=None, start_date=datetime(2018, 12, 31))

t1 = BashOperator(
    task_id="t1",
    bash_command='do something && echo 0 ',
    dag=dag
)


def branch_func(**kwargs):
    ti = kwargs['ti']
    xcom_value = int(ti.xcom_pull(task_ids='t1'))
    if xcom_value == 0:
        return 't2'


check_t1 = BranchPythonOperator(
    task_id='check_t1',
    provide_context=True,
    python_callable=branch_func,
    dag=dag)

t2 = BashOperator(
    task_id="t2",
    bash_command='do something else here ',
    dag=dag,
)

t1 >> check_t1 >> t2
I'm trying to run tasks independently and in parallel.
My DAG looks like this:
                                     ---> patternA ---> file1a
                                                   ---> file2a
                                                   ---> file3a
sensor ---> move_csv ---> result_mv  ---> rerun_dag
                                     ---> patternB ---> file1b
                                                   ---> file2b
                                                   ---> file3b
My dag.py:
sensor = FileSensor(
    task_id="sensor",
    filepath=filePath,
    fs_conn_id='airflow_db',
    poke_interval=10,
    dag=dag,
)

move_csv = BranchPythonOperator(
    task_id='move_csv',
    python_callable=moveCsvFile,
    trigger_rule='none_failed',
    dag=dag,
)

result_mv = BranchPythonOperator(
    task_id='result_mv',
    python_callable=result,
    trigger_rule='none_failed',
    dag=dag,
)

patternA = DummyOperator(
    task_id="pattern_A",
    dag=dag,
)

patternB = DummyOperator(
    task_id="pattern_B",
    dag=dag,
)

file1a = BashOperator(
    task_id="file1a",
    bash_command='python3 '+scriptPath+'file1.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

file2a = BashOperator(
    task_id="file2a",
    bash_command='python3 '+scriptPath+'file2.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

file3a = BashOperator(
    task_id="file3a",
    bash_command='python3 '+scriptPath+'file3.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

file1b = BashOperator(
    task_id="file1b",
    bash_command='python3 '+scriptPath+'file1b.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

file2b = BashOperator(
    task_id="file2b",
    bash_command='python3 '+scriptPath+'file2b.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

file3b = BashOperator(
    task_id="file3b",
    bash_command='python3 '+scriptPath+'file3b.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)
move_csv.set_upstream(sensor)
result_mv.set_upstream(move_csv)
patternA.set_upstream(result_mv)
patternB.set_upstream(result_mv)
file1a.set_upstream(patternA)
file2a.set_upstream(patternA)
file3a.set_upstream(patternA)
file1b.set_upstream(patternB)
file2b.set_upstream(patternB)
file3b.set_upstream(patternB)
rerun.set_upstream(...)  # from all the file tasks
What is the best way, in patternA, to skip file2a and file3a if only file1a has a matching file?
And if file1a and file2a both have matching files, I'd like to run them in parallel and skip file3a.
My file tasks run Python scripts called with a BashOperator.
Thanks for the help! :)
You can use the BranchPythonOperator to skip tasks.
More detail here.
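A minimal sketch of what the branch callable for result_mv could look like, assuming the six file tasks are wired directly downstream of result_mv and that a file can be mapped to its task id by name (the mapping rule and the receiptPath variable below are hypothetical; adapt them to your real directory and file names):

import os

def choose_file_tasks():
    candidate_tasks = ['file1a', 'file2a', 'file3a', 'file1b', 'file2b', 'file3b']
    present = os.listdir(receiptPath)  # receiptPath: wherever move_csv drops the files
    selected = []
    for task_id in candidate_tasks:
        # Hypothetical rule: run the task if some file name contains its id.
        if any(task_id in f.lower() for f in present):
            selected.append(task_id)
    return selected  # all returned task ids run in parallel; the rest are skipped

result_mv = BranchPythonOperator(
    task_id='result_mv',
    python_callable=choose_file_tasks,
    trigger_rule='none_failed',
    dag=dag,
)

With this, if only file1a's file is present, file2a and file3a (and the b tasks) are skipped; if file1a and file2a both match, those two tasks run in parallel and file3a is skipped.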