Airflow: Get previous task id in the next task

I have two tasks. In the first, a PythonOperator computes something, and in the second I want to use the output of that PythonOperator in a SimpleHttpOperator. Here is my code:
source_list = ['account', 'sales']
for source_type in source_list:
    t2 = PythonOperator(
        task_id='compute_next_gather_time_for_' + source_type,
        python_callable=compute_next_gather_time,
        provide_context=True,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        op_args=[source_type],
        retries=3
    )
    t3 = SimpleHttpOperator(
        task_id='request_' + source_type + '_report',
        method='POST',
        http_conn_id='abc',
        endpoint=endpoint,
        data=json.dumps({
            "query": {
                "start": "{{ task_instance.xcom_pull(task_ids='prev_task_id') }}",
                "stop": str(yesterday),
                "fields": [
                    1
                ]
            }
        }),
        headers={"Content-Type": "application/json", "Authorization": 'abc'},
        response_check=lambda response: True if len(response.json()) == 0 else False,
        log_response=True,
        retries=3
    )
Question: I want to pass the previous task's id into t3's data field. I am not sure how to do that, since the t2 task_id is not constant; it changes with source_type. Evidently, when I tried it (with the 'prev_task_id' placeholder above), it did not render.

I was able to get it by doing this:
next(iter(context['task'].upstream_task_ids))

I haven't used Jinja templating in any of my DAGs before, but I have faced similar problems where I needed to retrieve XCom values from a task with a dynamically generated task_id.
You could define the task_ids in t3 the same way you defined the task_id in t2. For example:
source_list = ['account', 'sales']
for source_type in source_list:
    task_id = 'compute_next_gather_time_for_' + source_type
    t2 = PythonOperator(
        task_id=task_id,
        python_callable=compute_next_gather_time,
        provide_context=True,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        op_args=[source_type],
        retries=3
    )
    t3 = SimpleHttpOperator(
        task_id='request_' + source_type + '_report',
        method='POST',
        http_conn_id='abc',
        endpoint=endpoint,
        data=json.dumps({
            "query": {
                # interpolate the Python variable into the template before Jinja renders it
                "start": f"{{{{ task_instance.xcom_pull(task_ids='{task_id}') }}}}",
                "stop": str(yesterday),
                "fields": [
                    1
                ]
            }
        }),
        headers={"Content-Type": "application/json", "Authorization": 'abc'},
        response_check=lambda response: True if len(response.json()) == 0 else False,
        log_response=True,
        retries=3
    )
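Note that task_id here is a plain Python variable; Jinja only knows about Airflow's template context, not your local names, so the variable has to be interpolated into the string before Airflow renders it (hence the f-string above). For the account source, the templated field then ends up as:

    {{ task_instance.xcom_pull(task_ids='compute_next_gather_time_for_account') }}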

To elaborate a bit on cosbor11's answer.
The approach uses the Airflow task object taken from the keyword arguments that Airflow supplies during a DAG run. These keyword arguments were once referred to as the context, and PythonOperator used to take a provide_context argument, but that is deprecated now, I believe; the context is always provided, making the task, task instance and other objects and attributes available.
So we can pull things like upstream_task_ids from the task object, and use an iterator on it or just access it as a list.
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator

def my_python_callable(**context):
    upstream_id = next(iter(context['task'].upstream_task_ids))
    upstream_ids = context['task'].upstream_task_ids
    print(f"got upstream task_id from the task object in the Airflow-provided context: {upstream_id} from a list: {upstream_ids}")

with DAG('silly_hats') as dag:
    task0 = DummyOperator(task_id='my_spoons_too_big')
    task1 = PythonOperator(task_id='i_am_a_banana', python_callable=my_python_callable, dag=dag)
    task0 >> task1  # so task0 shows up in task1's upstream_task_ids
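From there, pulling the actual XCom value that the upstream task pushed is one more call; a minimal sketch, assuming the upstream task pushed something (same names as above):

def my_python_callable(**context):
    # single upstream task_id, then pull whatever that task pushed to XCom
    upstream_id = next(iter(context['task'].upstream_task_ids))
    upstream_value = context['ti'].xcom_pull(task_ids=upstream_id)
    print(f"{upstream_id} pushed: {upstream_value}")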

Related

How to use Dynamic Task Mapping with TaskGroups

In my actual DAG, I need to first get a list of IDs and then for each ID run a set of tasks.
I have used Dynamic Task Mapping to pass a list to a single task or operator to have it process the list, but can we do this using a TaskGroup as well?
If I can figure out how to pass a variable value at the TaskGroup level, so it uses that value in all sub tasks, then I should be able to meet my requirement.
The below should give you an idea of what I am looking for; I just need help getting it working.
from airflow import DAG, XComArg
from datetime import datetime
from airflow.decorators import task
from airflow.utils.task_group import TaskGroup
from airflow.operators.python import PythonOperator

with DAG(
    'dtm_tg_test',
    schedule_interval = None,
    start_date = datetime(2022, 1, 1)
) as dag:

    def getList():
        return [ "Hello", "World" ]

    def printText(text):
        print(text)

    get_list = PythonOperator(
        task_id = "get_list",
        python_callable = getList,
        dag = dag
    )

    with TaskGroup.partial(
        group_id = "task_group"
    ).expand(
        list = XComArg(get_list)
    ) as task_group:
        print_text = PythonOperator(
            task_id = "print_output",
            python_callable = printText,
            op_kwargs = { "text": list },
            dag = dag
        )
        print_again = PythonOperator(
            task_id = "print_output",
            python_callable = printText,
            op_kwargs = { "text": list },
            dag = dag
        )
        print_text >> print_again

    get_list >> task_group
You can achieve it with the following example:
list_ids = ['45', '48']

@task_group()
def parent_group(list_ids: List[str]) -> List[TaskGroup]:
    return list(map(build_group_for_id, list_ids))

def build_group_for_id(current_id: str) -> TaskGroup:
    with TaskGroup(group_id=f'group_for_id_{current_id}') as group:
        print_text = PythonOperator(
            task_id = f"print_output_{current_id}",
            python_callable = printText,
            op_kwargs = { "text": current_id },
            dag = dag
        )
        print_again = PythonOperator(
            task_id = f"print_output_other_{current_id}",
            python_callable = printText,
            op_kwargs = { "text": current_id },
            dag = dag
        )
        print_text >> print_again
    return group

with airflow.DAG(
    "my_dag", default_args=args, schedule_interval=None,
) as dag:
    DummyOperator(task_id='start_dag') >> parent_group(list_ids)
Some explanations:
I create a parent TaskGroup called parent_group.
This parent group takes the list of IDs.
I loop over the list and, for each parent ID, create a TaskGroup containing your two Airflow tasks (the print operators).
For the TaskGroup tied to a parent ID, the TaskGroup ID is built from that ID so it is unique within the DAG.
For the print operators inside the TaskGroup, the task IDs are likewise generated from the current parent ID.
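As a side note, if you are on Airflow 2.5 or newer, task groups created with the @task_group decorator can themselves be dynamically mapped with expand(); a minimal sketch of that variant (the DAG and task names here are illustrative, not from the question):

from datetime import datetime
from airflow import DAG
from airflow.decorators import task, task_group

with DAG('dtm_tg_native', schedule_interval=None, start_date=datetime(2022, 1, 1)) as dag:

    @task
    def get_list():
        return ["Hello", "World"]

    @task
    def print_text(text):
        print(text)

    @task
    def print_again(text):
        print(text)

    @task_group
    def process(text):
        # both tasks receive the same mapped value
        print_text(text) >> print_again(text)

    # one group instance is created per element returned by get_list()
    process.expand(text=get_list())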

Airflow: Passing an XCom value to the next task, for each of multiple tasks

Requirement: pass the XCom value from the previous task to the next task, for each of multiple task pairs.
Error: Error rendering template: can only concatenate str (not "set") to str
I tried as below
params={"s3_key": list(date.keys())[0]}
The params are read in the SQL file as below:
SELECT
    $1 AS json_array
FROM
    '#STAGE/'
    (file_format => 'public.s3_json',
     pattern => '{{ti.xcom_pull(task_ids='api_into_s3_' + params.s3_key)[0]}}'
    )
DAG code:
dats = [{f"date_{x}": (datetime.date.today() - datetime.timedelta(days=x)).strftime("%Y-%m-%d")} for x in range(int(end_offset_days), int(start_offset_days))]

default_args = {
    "start_date": datetime.datetime(2022, 1, 1),
    "provide_context": True,
    "execution_timeout": timedelta(minutes=180),
}

with DAG(
    dag_id=dag_id,
    default_args=default_args,
    schedule_interval=schedule_interval,
    dagrun_timeout=timedelta(minutes=180),
    max_active_runs=1,
    params={},
) as dag:
    t0 = EmptyOperator(task_id="start_task")
    dates = dats
    api_tsk = []
    snflk_tsk = []
    for date in dates:
        api_tsk.append(
            APIOper(
                task_id=f"api_into_s3_{list(date.keys())[0]}",
                date_run=list(date.values())[0],
            )
        )
        snflk_tsk.append(
            SnowflakeOperator(
                task_id=f"s3_into_snflk_{list(date.keys())[0]}",
                snowflake_conn_id="snflk_conn_id",
                sql="queries.sql",
                warehouse="main_wh",
                schema="stg_sch",
                params={"s3_key": list(date.keys())[0]}
            )
        )
    t3 = EmptyOperator(task_id="end_task")

    t0 >> api_tsk
    for i, x in enumerate(zip(api_tsk, snflk_tsk)):
        if i == 0:
            continue
        else:
            api_tsk[i - 1] >> snflk_tsk[i - 1]
            api_tsk[i] >> snflk_tsk[i]
    snflk_tsk >> t3
return dag
As shown below, the XCom value from api_into_s3_date_0 needs to be passed into the s3_into_snflk_date_0 task, and so on.
UPDATE: the XCom value pushed by the previous task is shown in a screenshot (omitted here).
But in the log, the query is rendered as:
SELECT $1 AS json_array
FROM '#STAGE/'
(file_format => 'public.S3_JSON',
pattern => a
)
The SQL file has since been updated as below:
pattern => {{ti.xcom_pull(task_ids='api_into_s3_' + params.s3_key)[0]}}

I want to pass arguments from dag to trigger another dag

The code below shows the situation: var1 and var2 are passed via the conf parameter when triggering another DAG (dag2) from the first DAG.
trigger = TriggerDagRunOperator(
    trigger_dag_id='dag2',
    task_id="trigger",
    wait_for_completion=True,
    reset_dag_run=False,
    poke_interval=30,
    do_xcom_push=True,
    execution_date="{{ execution_date }}",
    conf={
        "var1": "{{ task_instance.xcom_pull(task_ids='task1', key='var1') }}",
        "var2": "{{ task_instance.xcom_pull(task_ids='task1', key='var2') }}",
    },
    dag=dag
)
In the second dag, I tried to print the var1 and var2 that are expected to be passed to conf.
def print_conf(**kwargs):
    conf = kwargs['dag_run'].conf
    print(conf)
    print(conf['var1'])
    print(conf['var2'])

print_op = PythonOperator(
    task_id='print',
    provide_context=True,
    python_callable=print_conf,
    dag=dag
)
But the output shows that the values of var1 and var2 are None:
{"var1": "None", "var2": "None"}
Even when I check the conf passed to the run in the Airflow UI, the values are None.
How to pass arguments between dags through conf?
What could I have done wrong?
There are cases where a value saved with xcom_push cannot be retrieved with xcom_pull as expected. I solved a similar problem by using the return_value key that a PythonOperator pushes automatically.
alter_table = EmrAddStepsOperator(
    task_id="alter_table",
    job_flow_name="rpda-emr",
    cluster_states=["WAITING", "RUNNING"],
    aws_conn_id="aws_default",
    steps="{{ task_instance.xcom_pull(task_ids='init_variables', key='return_value', dag_id=task_instance.dag_id)['alter_table_step'] }}",
    do_xcom_push=True,
    dag=dag
)
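For context on the key='return_value' pull above: whatever a PythonOperator's callable returns is pushed to XCom under that key automatically, so the init_variables task only has to return the structure. A minimal sketch of such a task, assuming a dag object as in the snippet above (the step definition is a made-up placeholder, not from the original DAG):

from airflow.operators.python import PythonOperator

def init_variables():
    # the returned dict is pushed to XCom under key='return_value'
    return {
        "alter_table_step": [
            {
                "Name": "alter_table",  # hypothetical EMR step, for illustration only
                "ActionOnFailure": "CONTINUE",
                "HadoopJarStep": {"Jar": "command-runner.jar", "Args": ["echo", "alter table"]},
            }
        ]
    }

init_variables_task = PythonOperator(
    task_id="init_variables",
    python_callable=init_variables,
    dag=dag,
)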

How can I re-run the same DAG periodically?

I have a DAG copying data to S3 using PySpark like below:
...
bucket = 'my.bucket'
schema = 'my_schema'
table = 'my_table'
ymd = pendulum.parse('{{ execution_date }}').strftime('%Y%m%d')
spark_script = 'my_spark_script'

DEFAULT_ARGS = {
    'owner': 'burgerphilia',
    'start_date': '2020-09-01',
    'on_failure_callback': alert.slack_fail_alert,
    'depends_on_past': False
}

SPARK_STEPS = [
    {
        'Name': f'{schema}_{table}_step',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                'sudo',
                'spark-submit',
                ...
                f's3://{bucket}/spark-script/{spark_script}.py',
                '--ymd',
                f'{ymd}'
            ]
        }
    }
]
def delete_s3_object(bucket, schema, table, ymd):
    """
    :param bucket: bucket name
    :type bucket: str
    :param schema: schema name (the same as the Hive schema)
    :type schema: str
    :param table: table name (the same as the Hive table)
    :type table: str
    :param ymd: date to delete, '%Y%m%d' format
    :type ymd: str
    """
    aws_hook = AwsHook(aws_conn_id='aws_conn')
    session = aws_hook.get_session(region_name='ap-northeast-2')
    s3 = session.resource('s3')
    bucket = s3.Bucket(bucket)
    bucket.objects.filter(Prefix=f'{schema}/{table}/ymd={ymd}/').delete()

with DAG(
    dag_id=f'{schema}_{table}',
    default_args=DEFAULT_ARGS,
    catchup=False,
    schedule_interval="40 06 * * *"
) as dag:
    object_cleaner = PythonOperator(
        task_id='delete_object',
        python_callable=delete_s3_object,
        op_kwargs={'bucket': bucket, 'schema': schema, 'table': table, 'ymd': ymd}
    )
    step_adder = EmrAddStepsOperator(
        task_id='add_step',
        job_flow_id=job_flow_id,
        aws_conn_id='aws_conn',
        steps=SPARK_STEPS,
    )
    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id=job_flow_id,
        step_id="{{ task_instance.xcom_pull(task_ids='add_step', key='return_value')[0] }}",
        aws_conn_id='aws_conn',
    )
    object_cleaner >> step_adder >> step_checker
This DAG runs daily, but the data source (an Oracle DB) is sometimes updated retroactively. So I need to re-run the same DAG every Monday and on the first day of each month to refresh the previous period (e.g. on 2020/11/02, re-run 2020/10/26 ~ 2020/11/01). What is the best way to handle this?
There is no direct way to do it. You can try two things:
Use dynamic DAG generation (https://www.astronomer.io/guides/dynamically-generating-dags/) to create two DAGs with different schedule_interval values.
Create another DAG that triggers this DAG on a different schedule_interval (see the sketch below).
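A minimal sketch of the second option, assuming the daily DAG above is named my_schema_my_table (the DAG id, cron expression and date range here are illustrative, not tested):

from datetime import datetime
from airflow import DAG
from airflow.operators.trigger_dagrun import TriggerDagRunOperator

# Runs every Monday and re-triggers the daily DAG once for each of the previous 7 days.
with DAG(
    dag_id='my_schema_my_table_weekly_rerun',
    start_date=datetime(2020, 9, 1),
    schedule_interval='0 7 * * 1',
    catchup=False,
) as dag:
    for days_back in range(1, 8):
        TriggerDagRunOperator(
            task_id=f'rerun_minus_{days_back}d',
            trigger_dag_id='my_schema_my_table',  # the existing daily DAG
            # templated: hand the target date to the triggered run
            execution_date=f"{{{{ macros.ds_add(ds, -{days_back}) }}}}",
            reset_dag_run=True,  # allow re-running a date that already has a run
        )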

Airflow: how to deliver data from a parent DAG to a SubDAG (via XCom or other means)

Initially, I wanted to pass a value from the parent DAG to a SubDAG with xcom_push/xcom_pull (the Airflow-provided functions), but after many attempts I found it had no effect.
Can someone help me implement value passing between a parent DAG and a SubDAG?
Here is my business code; I just want to pass daily_check from the parent DAG to the SubDAG.
# function used by the parent DAG
def templated_daily_check(ds, **kwargs):
    hour_2_check_daily = kwargs['hour_2_check_daily']
    zk_hosts = kwargs['zk_hosts']
    zk_path_daily = kwargs['zk_path_daily']
    check_time = kwargs['check_time']
    # invoke the custom python function dailyCheck to read the flag from ZooKeeper
    dailyOrNot = dailyTask.dailyCheck(zk_hosts, zk_path_daily, hour_2_check_daily, check_time)
    print("=====================================================================================================>")
    print(dailyOrNot)
    # daily_check: the value that needs to be passed to the subdag
    kwargs['ti'].xcom_push(key='daily_check', value=dailyOrNot)
    if dailyOrNot:
        dailyTask.dailyMark(zk_hosts, zk_path_daily, check_time)

# function to create the sub DAG
def sub_dag_iaoc_or_iaocPlusaauoo(parent_dag_name, child_dag_name, all_task_name):
    subDag = DAG(
        dag_id='%s.%s' % (parent_dag_name, child_dag_name),
        default_args=default_args, schedule_interval=timedelta(hours=dag_duration_hours)
    )
    # daily_check: the value to receive from the parent DAG; this is my attempt to read it, but it has no effect
    daily_check = os.popen('''echo {{ task_instance.xcom_pull(task_ids='daily_check_bf_all', key='daily_check', dag_id=%s) }}''' % (parent_dag_name)).read()
    # Based on the value received above, decide which operators to generate
    if daily_check == str(False):
        t_r_fm_all_iaoc = BashOperator(
            task_id='r_fm_all_iaoc',
            bash_command="""{}/fm/sh/airflow_preparerun_task.sh {} {} r_fm_all_iaoc""".format(USER_HOME_FOLDER, trade_jar_rev, USER_HOME_FOLDER) + ";" +
                         """{} {} {} {} {} {} -r {} -iaoc """.format(spark_submit_fixed_conf, yarn_res_fixed_args, trade_class_fm, '{}/fm/jar/{}/fmtrade-{}.jar'.format(USER_HOME_FOLDER, 'r_fm_all_iaoc', trade_jar_rev), trade_fixed_args, trade_task_fixed_args_fm, all_task_name),
            dag=subDag
        )
    else:
        t_r_fm_all_iaoc = BashOperator(
            task_id='r_fm_all_iaoc',
            bash_command="""{}/fm/sh/airflow_preparerun_task.sh {} {} r_fm_all_iaoc""".format(USER_HOME_FOLDER, trade_jar_rev, USER_HOME_FOLDER) + ";" +
                         """{} {} {} {} {} {} -r {} -iaoc """.format(spark_submit_fixed_conf, yarn_res_fixed_args, trade_class_fm, '{}/fm/jar/{}/fmtrade-{}.jar'.format(USER_HOME_FOLDER, 'r_fm_all_iaoc', trade_jar_rev), trade_fixed_args, trade_task_fixed_args_fm, all_task_name),
            dag=subDag
        )
        t_r_fm_all_aauoo = BashOperator(
            task_id='r_fm_all_aauoo',
            bash_command="""{}/fm/sh/airflow_preparerun_task.sh {} {} r_fm_all_aauoo""".format(USER_HOME_FOLDER, trade_jar_rev, USER_HOME_FOLDER) + ";" +
                         """{} {} {} {} {} {} -r {} -aauoo """.format(spark_submit_fixed_conf, yarn_res_fixed_args, trade_class_fm, '{}/fm/jar/{}/fmtrade-{}.jar'.format(USER_HOME_FOLDER, 'r_fm_all_aauoo', trade_jar_rev), trade_fixed_args, trade_task_fixed_args_fm_all, all_task_name),
            dag=subDag
        )
    return subDag
t_daily_check_bf_all = PythonOperator(
    task_id='daily_check_bf_all',
    provide_context=True,
    python_callable=templated_daily_check,
    op_kwargs={
        'hour_2_check_daily': hour_2_check_daily,
        'zk_hosts': zk_hosts,
        'zk_path_daily': zk_path_daily,
        "check_time": time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    },
    dag=dag
)

t_r_fm_all_subdag = SubDagOperator(
    subdag=sub_dag_iaoc_or_iaocPlusaauoo('afFMTradeDag',
                                         'iaoc_or_iaocPlusaauoo',
                                         'all'),
    task_id='iaoc_or_iaocPlusaauoo',
    dag=dag
)
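One note on the attempt above, offered only as a sketch rather than a verified fix: the os.popen('''echo {{ ... }}''') call runs at DAG-parse time, so the Jinja expression is never rendered there; XCom values only exist at run time, and the parse-time if/else cannot depend on them. What does work at run time is a templated field on a task inside the SubDAG that pulls from the parent, for example (task and variable names taken from the question):

    # inside sub_dag_iaoc_or_iaocPlusaauoo, after subDag is created
    check_daily = BashOperator(
        task_id='echo_daily_check',
        # rendered when the task runs; dag_id points the pull at the parent DAG
        bash_command=("echo {{ task_instance.xcom_pull(task_ids='daily_check_bf_all', "
                      "key='daily_check', dag_id='" + parent_dag_name + "') }}"),
        dag=subDag,
    )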
