The code below shows the situation: var1 and var2 are passed via the conf parameter when triggering a second DAG from the first DAG.
trigger = TriggerDagRunOperator(
    trigger_dag_id='dag2',
    task_id="trigger",
    wait_for_completion=True,
    reset_dag_run=False,
    poke_interval=30,
    do_xcom_push=True,
    execution_date="{{ execution_date }}",
    conf={
        "var1": "{{ task_instance.xcom_pull(task_ids='task1', key='var1') }}",
        "var2": "{{ task_instance.xcom_pull(task_ids='task1', key='var2') }}",
    },
    dag=dag
)
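These pulls only return values if task1 explicitly pushed them under those exact keys. A minimal sketch of such an upstream task, with hypothetical values:

def task1_callable(**kwargs):
    # push values under explicit keys so that
    # xcom_pull(task_ids='task1', key='var1') can find them
    ti = kwargs['task_instance']
    ti.xcom_push(key='var1', value='value1')
    ti.xcom_push(key='var2', value='value2')

task1 = PythonOperator(
    task_id='task1',
    python_callable=task1_callable,
    provide_context=True,  # Airflow 1.x style, as in the print_op below
    dag=dag,
)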
In the second DAG, I tried to print the var1 and var2 values that were expected to arrive via conf:
def print_conf(**kwargs):
    conf = kwargs['dag_run'].conf
    print(conf)
    print(conf['var1'])
    print(conf['var2'])

print_op = PythonOperator(
    task_id='print',
    provide_context=True,
    python_callable=print_conf,
    dag=dag
)
But the output showed the values of var1 and var2 as "None":
{"var1": "None", "var2": "None"}
Even when I check the conf passed to the run in the Airflow UI, the values are "None".
How can I pass arguments between DAGs through conf?
What could I have done wrong?
There are cases where xcom_pull fails to find data that was saved with xcom_push under a custom key. I solved it by pulling the return_value key that a PythonOperator pushes automatically:
alter_table = EmrAddStepsOperator(
    task_id="alter_table",
    job_flow_name="rpda-emr",
    cluster_states=["WAITING", "RUNNING"],
    aws_conn_id="aws_default",
    steps="{{ task_instance.xcom_pull(task_ids='init_variables', key='return_value', dag_id=task_instance.dag_id)['alter_table_step'] }}",
    do_xcom_push=True,
    dag=dag
)
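Applied back to the conf example from the question, the same pattern would look like this; a sketch assuming task1 simply returns a dict instead of calling xcom_push with custom keys:

def task1_callable(**kwargs):
    # a returned value is pushed to XCom under the default key 'return_value'
    return {"var1": "value1", "var2": "value2"}

trigger = TriggerDagRunOperator(
    task_id="trigger",
    trigger_dag_id='dag2',
    conf={
        # xcom_pull without a key defaults to key='return_value'
        "var1": "{{ task_instance.xcom_pull(task_ids='task1')['var1'] }}",
        "var2": "{{ task_instance.xcom_pull(task_ids='task1')['var2'] }}",
    },
    dag=dag,
)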
I'm trying to print the content of a variable computed in an Airflow DAG, so I used an echo in a BashOperator, but it doesn't work.
I tried with a predefined variable, but I got the same output.
Here is an example:
with DAG(
    dag_id="tmp",
) as dag:
    test = "test"
    test_dag = BashOperator(
        task_id="test_dag",
        bash_command='echo $test',
    )
The output is always empty:
Output:
INFO -
You can pass the variable in an f-string, which is rendered when the DAG file is parsed. You can even set the task_id based on your variable. See:
with DAG(
    dag_id="tmp",
) as dag:
    test = "test"
    test_dag = BashOperator(
        task_id=f"t1_{test}",
        bash_command=f"echo {test}",
    )
You can also use the env parameter like so:
with DAG(dag_id="tmp", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:
test_dag = BashOperator(
task_id="t1",
bash_command="echo $test",
env={"test": "something"},
)
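If the value is known when the DAG file is parsed, a third option is to pass it through params and reference it in the templated bash_command; a sketch along the lines of the examples above:

with DAG(dag_id="tmp", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:
    test = "test"
    test_dag = BashOperator(
        task_id="t1",
        # bash_command is a templated field, so Jinja fills in params.test at run time
        bash_command="echo {{ params.test }}",
        params={"test": test},
    )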
I have a problem with my DAG getting stuck at a subdag. The subdag is in RUNNING state, but on zooming in, all of its tasks are in None status.
Using Airflow 2.1.1 with LocalExecutor.
Below is the main dag:
default_args = {
    'owner': 'airflow',
    'retries': 1,
    'depends_on_past': False
}

dag = DAG('loop_example',
          start_date=datetime(2022, 1, 1),
          schedule_interval=None,
          catchup=False,
          tags=['loop']
          )

## function to filter src_name based on a DB table/log file entry
def check_valid_src(src_name):
    hook = MySqlHook(mysql_conn_id='mysql_conn')
    sql = 'SELECT src_name FROM ingsted_src_log_table'
    myresult = hook.get_records(sql)
    valid_src_names = []
    for src in myresult:
        valid_src_names.append(src[0])
    if src_name in valid_src_names:
        return True
    else:
        return False

first = DummyOperator(task_id='first', dag=dag)
last = DummyOperator(task_id='last', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']
for option in options:
    if check_valid_src(option):
        t = SubDagOperator(task_id=f'section_{option}',
                           subdag=subdag('loop_example', f'section_{option}', default_args, option),
                           dag=dag
                           )
        first >> t >> last
subdag code:
def subdag(parent_dag_name, child_dag_name, args, option):
    dag_subdag = DAG(
        dag_id=f'{parent_dag_name}.{child_dag_name}',
        default_args=args,
        start_date=datetime(2022, 1, 1),
        schedule_interval=None,
    )
    t1 = BashOperator(
        task_id='Echo_source_name',
        bash_command=f'echo {option}',
        default_args=args,
        dag=dag_subdag
    )
    t2 = BashOperator(
        task_id='Echo_source_number',
        bash_command=f'echo "{option}" | cut -d "_" -f2',
        default_args=args,
        dag=dag_subdag,
    )
    t1 >> t2
    return dag_subdag
Earlier the start_date of the main DAG and the subdag were not the same, so I tried running again with the same start_date, but it still gets stuck.
Is there anything I am missing here?
You have to pass is_paused_upon_creation=False when creating the subdag's DAG. With the default dags_are_paused_at_creation=True setting, the dynamically created subdag starts out paused, so its tasks never get scheduled and stay in None state:
dag_subdag = DAG(
    dag_id=f'{parent_dag_name}.{child_dag_name}',
    default_args=args,
    start_date=datetime(2022, 1, 1),
    schedule_interval=None,
    is_paused_upon_creation=False,
)
I wrote a custom operator called HadoopPutHdfs in Airflow. I need to pass an xxx parameter to HadoopPutHdfs, filled with the return value of the generate_file_path task:
with DAG(dag_id='my_custom_operator_dag', schedule_interval='1 * * * *', default_args=default_args, catchup=False) as dag:
    generate_file_path = PythonOperator(
        task_id='generate_file_path',
        python_callable=generate_file_path_func,
        dag=dag,
    )
    put_to_hdfs = HadoopPutHdfs(
        task_id='put_to_hdfs',
        headers={'Content-Type': 'text/plain'},
        hdfs_path='webhdfs/v1/user/hive/13.zip',
        hadoop_host='10.10.10.146',
        hadoop_port=9870,
        source_path='/opt/airflow/dags/1.zip',
        dag=dag,
        xxx="{{ ti.xcom_pull(task_ids=['generate_file_path']) }}",
    )
This line does not work:
xxx="{{ ti.xcom_pull(task_ids=['generate_file_path']) }}"
How can I pass the output of the generate_file_path function to the xxx parameter?
Sounds like you are missing the definition of xxx as a template field in your custom operator. Only the attributes listed in an operator's template_fields are rendered by Jinja before execute() runs; anything else keeps the literal template string. For example:
class CustomDummyOperator(BaseOperator):
    template_fields = ('msg_from_previous_task',)

    def __init__(self,
                 msg_from_previous_task,
                 *args, **kwargs) -> None:
        super(CustomDummyOperator, self).__init__(*args, **kwargs)
        self.msg_from_previous_task = msg_from_previous_task

    def execute(self, context):
        print(f"Message: {self.msg_from_previous_task}")
DAG:
def return_a_str():
    return "string_value_from_op1"

task_1 = PythonOperator(
    task_id='task_1',
    dag=dag,
    python_callable=return_a_str,
)

task_2 = CustomDummyOperator(
    task_id='task_2',
    dag=dag,
    msg_from_previous_task="{{ ti.xcom_pull(task_ids='task_1') }}"
)
The output from task_2 is: Message: string_value_from_op1
You could use XcomArg for a cleaner syntax:
task_2 = CustomDummyOperator(
    task_id='task_2',
    dag=dag,
    msg_from_previous_task=task_1.output
    # msg_from_previous_task="{{ ti.xcom_pull(task_ids='task_1') }}"
)
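Applied to the HadoopPutHdfs operator from the question, a minimal sketch (only the relevant parts of the class are shown; the rest of the constructor is assumed):

class HadoopPutHdfs(BaseOperator):
    # declaring xxx here makes Airflow render its Jinja template before execute()
    template_fields = ('xxx',)

    def __init__(self, xxx, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.xxx = xxx

Note also that passing task_ids as a list makes xcom_pull return a list; use task_ids='generate_file_path' to get the single value.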
Can we see or get the output of a SQL statement executed with JdbcOperator?
with DAG(dag_id='Exasol_DB_Checks', schedule_interval='@hourly', default_args=default_args, catchup=False, template_searchpath=tmpl_search_path) as dag:

    start_task = DummyOperator(task_id='start_task', dag=dag)

    sql_task_1 = JdbcOperator(task_id='sql_cmd',
                              jdbc_conn_id='Exasol_db',
                              sql=['select current_timestamp;', 'select current_user from DUAL;', "test.sql"],
                              autocommit=True,
                              params={"my_param": "{{ var.value.source_path }}"}
                              )

    start_task >> sql_task_1
Maybe you can use a JdbcHook inside a PythonOperator for your needs:
def do_work():
    jdbc_hook = JdbcHook(jdbc_conn_id="some_db")
    jdbc_conn = jdbc_hook.get_conn()
    jdbc_cursor = jdbc_conn.cursor()
    jdbc_cursor.execute('SELECT ......')
    row = jdbc_cursor.fetchone()[0]
    return row  # the returned value is pushed to XCom and shows up in the UI

task = PythonOperator(
    task_id='task1',
    python_callable=do_work,
    dag=dag
)
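If a downstream task needs the value, it can be pulled from XCom in any templated field; a minimal sketch with a hypothetical follow-up task:

show_result = BashOperator(
    task_id='show_result',
    bash_command="echo {{ ti.xcom_pull(task_ids='task1') }}",
    dag=dag,
)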
https://airflow.apache.org/docs/stable/concepts.html#hooks
Here is an Airflow operator example:
t3 = BashOperator(
    task_id='templated',
    bash_command='echo {{ params.my_param }}',  # hypothetical command for illustration
    params={'my_param': 'Parameter I passed in'},
    dag=dag,
)
Is it possible to use params in params, like this?
t3 = BashOperator(
    task_id='templated',
    bash_command='echo {{ params.my_param2 }}',  # hypothetical command for illustration
    params={'my_param': 'Parameter I passed in',
            'my_param2': '{{ params.my_param }} again'},
    dag=dag,
)
Some of my params depend on others, and I am not sure of the best way to handle that.
Is it OK if I use a macro in params, like this?
t3 = BashOperator(
    task_id='templated',
    bash_command='echo {{ params.epoch }}',  # hypothetical command for illustration
    params={"epoch": "{{ next_execution_date.int_timestamp }}"},
    dag=dag,
)