How to pass multiple arguments separately to the next task in Airflow?

I am using a templated Python file that I import into my DAG. I want to pass multiple outputs (lists) generated by the first task to the next task. I tried this:
def funct1():
    a = ['apple', 'orange']
    b = ['mango', 'lemon']
    return a, b

def funct2(a, b):
    for k in a:
        quote1 = f"i love {k}!"
        print(quote1)
    for x in b:
        quote2 = f"i love {x}!"
        print(quote2)
dag = DAG("fruits", default_args=default_args)

t1 = PythonOperator(
    task_id='descr',
    python_callable=filenameoftemplate.funct1,
    dag=dag)

t2 = PythonOperator(
    task_id='display',
    python_callable=filenameoftemplate.funct2,
    op_kwargs={'a': t1.output,
               'b': t1.output},
    provide_context=True,
    dag=dag)
result:
i love [apple, orange, mango, lemon]!
i love [apple, orange, mango, lemon]!
i love [apple, orange, mango, lemon]!
i love [apple, orange, mango, lemon]!
The thing is, all the returned values are being read as one by the next task. How can I pass and pull them as separate arguments? I want the result to be:
i love apple!
i love orange!
i love mango!
i love lemon!

Try using a counter variable to determine which value to access:
def funct1():
    a = ['apple', 'orange']
    b = ['mango', 'lemon']
    return a, b

# Use a counter to determine which value to access
counter = 0

def funct2(a, b):
    global counter  # needed so the increment below doesn't raise UnboundLocalError
    quote1 = f"i love {a[counter]}!"
    print(quote1)
    quote2 = f"i love {b[counter]}!"
    print(quote2)
    counter += 1

dag = DAG("fruits", default_args=default_args)

t1 = PythonOperator(
    task_id='descr',
    python_callable=filenameoftemplate.funct1,
    dag=dag)

t2 = PythonOperator(
    task_id='display',
    python_callable=filenameoftemplate.funct2,
    op_kwargs={'a': t1.output,
               'b': t1.output},
    provide_context=True,
    dag=dag)
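As an aside, if you are on Airflow 2.x, the TaskFlow API can push each list as its own XCom via multiple_outputs=True, which avoids the counter bookkeeping entirely. A minimal sketch (the DAG id and schedule here are illustrative, not from the question):

from datetime import datetime
from airflow.decorators import dag, task

@dag(dag_id="fruits_taskflow", start_date=datetime(2022, 1, 1), schedule_interval=None)
def fruits():
    @task(multiple_outputs=True)  # each dict key becomes a separate XCom
    def funct1():
        return {"a": ["apple", "orange"], "b": ["mango", "lemon"]}

    @task
    def funct2(a, b):
        for k in a + b:  # a and b arrive as two separate lists
            print(f"i love {k}!")

    lists = funct1()
    funct2(a=lists["a"], b=lists["b"])

fruits()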

Related

How to use Dynamic Task Mapping with TaskGroups

In my actual DAG, I need to first get a list of IDs and then for each ID run a set of tasks.
I have used Dynamic Task Mapping to pass a list to a single task or operator to have it process the list, but can we do this using a TaskGroup as well?
If I can figure out how to pass a variable value at the TaskGroup level, so it uses that value in all sub-tasks, then I should be able to meet my requirement.
The below should give you an idea of what I am looking for; I just need help getting it working.
from airflow import DAG, XComArg
from datetime import datetime
from airflow.decorators import task
from airflow.utils.task_group import TaskGroup
from airflow.operators.python import PythonOperator

with DAG(
    'dtm_tg_test',
    schedule_interval=None,
    start_date=datetime(2022, 1, 1)
) as dag:

    def getList():
        return ["Hello", "World"]

    def printText(text):
        print(text)

    get_list = PythonOperator(
        task_id="get_list",
        python_callable=getList,
        dag=dag
    )

    with TaskGroup.partial(
        group_id="task_group"
    ).expand(
        list=XComArg(get_list)
    ) as task_group:
        print_text = PythonOperator(
            task_id="print_output",
            python_callable=printText,
            op_kwargs={"text": list},
            dag=dag
        )
        print_again = PythonOperator(
            task_id="print_again",
            python_callable=printText,
            op_kwargs={"text": list},
            dag=dag
        )
        print_text >> print_again

    get_list >> task_group
You can achieve it with the following example:
from typing import List
import airflow
from airflow.decorators import task_group
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator
from airflow.utils.task_group import TaskGroup

list_ids = ['45', '48']

@task_group()
def parent_group(list_ids: List[str]) -> List[TaskGroup]:
    return list(map(build_group_for_id, list_ids))

def build_group_for_id(current_id: str) -> TaskGroup:
    with TaskGroup(group_id=f'group_for_id_{current_id}') as group:
        print_text = PythonOperator(
            task_id=f"print_output_{current_id}",
            python_callable=printText,
            op_kwargs={"text": current_id},
            dag=dag
        )
        print_again = PythonOperator(
            task_id=f"print_output_other_{current_id}",
            python_callable=printText,
            op_kwargs={"text": current_id},
            dag=dag
        )
        print_text >> print_again
    return group

with airflow.DAG(
    "my_dag", default_args=args, schedule_interval=None,
) as dag:
    DummyOperator(task_id='start_dag') >> parent_group(list_ids)
Some explanations:
I create a parent TaskGroup called parent_group.
This parent group takes the list of IDs.
I loop over the list, and for each parent ID I create a TaskGroup containing your two Airflow tasks (the print operators).
For the TaskGroup related to a parent ID, the TaskGroup ID is built from that ID so it is unique in the DAG.
For the print operators inside the TaskGroup, I again generate the task IDs from the current parent ID.
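Note that if you can upgrade, Airflow 2.5 added native mapping over task groups, so the original .partial()/.expand() idea works with the @task_group decorator. A rough sketch, assuming Airflow 2.5+ (DAG id and task names are illustrative):

from datetime import datetime
from airflow import DAG
from airflow.decorators import task, task_group

with DAG("dtm_tg_native", start_date=datetime(2023, 1, 1), schedule=None) as dag:

    @task
    def get_list():
        return ["Hello", "World"]

    @task
    def print_text(text):
        print(text)

    @task
    def print_again(text):
        print(text)

    @task_group
    def process(text):
        # each mapped group instance receives one element of the list
        print_text(text) >> print_again(text)

    process.expand(text=get_list())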

Airflow: Dynamically generate tasks with TaskFlow API

Previously I used the following snippet to dynamically generate tasks:
dummy_start_task = PythonOperator(
    task_id="dummy_start",
    default_args=default_args,
    python_callable=dummy_start,
    dag=dag
)

make_images_tasks = list()
for n in range(WORKERS):
    globals()[f"make_images_{n}_task"] = PythonOperator(
        task_id=f'make_images_{n}',
        default_args=default_args,
        python_callable=make_images,
        op_kwargs={"n": n},
        dag=dag
    )
    make_images_tasks.append(globals()[f"make_images_{n}_task"])

dummy_collector_task = PythonOperator(
    task_id="dummy_collector",
    default_args=default_args,
    python_callable=dummy_collector,
    dag=dag
)

dummy_start_task >> make_images_tasks >> dummy_collector_task

# in collector_task I would use:
# items = task_instance.xcom_pull(task_ids=[f"make_images_{n}" for n in range(int(WORKERS))])
# to get the XComs from these dynamically generated tasks
How can I achieve that using the TaskFlow API? (Spawn multiple tasks and then get their XComs in the following collector-task)
Here's an example:
from datetime import datetime
from airflow import DAG
from airflow.decorators import task

with DAG(dag_id="example_taskflow", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:

    @task
    def dummy_start_task():
        pass

    tasks = []
    for n in range(3):
        @task(task_id=f"make_images_{n}")
        def images_task(i):
            return i
        tasks.append(images_task(n))

    @task
    def dummy_collector_task(tasks):
        print(tasks)

    dummy_start_task_ = dummy_start_task()
    dummy_start_task_ >> tasks
    dummy_collector_task(tasks)
Which gives the following DAG:
The make_images_* tasks take 0, 1, and 2 as input (and also use it in the tasks' id) and return the value. The dummy_collector_task takes all outputs from the make_images_* tasks and prints [0, 1, 2].
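If the number of workers is only known at runtime, dynamic task mapping (Airflow 2.3+) can replace the Python loop entirely; a minimal sketch, assuming Airflow 2.3+ (the DAG id and task names are illustrative):

from datetime import datetime
from airflow import DAG
from airflow.decorators import task

with DAG(dag_id="example_mapped", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:

    @task
    def make_image(n):
        return n

    @task
    def collect(results):
        # `results` is a lazy proxy over every mapped task's XCom
        print(list(results))

    collect(make_image.expand(n=[0, 1, 2]))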

Return list of tasks from function that should be run in sequence in Airflow

I want to return two or more tasks from a function, and they should run in sequence at the spot where they're inserted in the dependencies; see below.
t1 = PythonOperator()

def generate_tasks():
    t2 = PythonOperator()
    t3 = PythonOperator()
    return magic(t2, t3)  # magic needed here (preferably)

t1 >> generate_tasks()  # otherwise here
# desired result: t1 >> t2 >> t3
Is this doable? As I understand it, Airflow 2.0 seems to achieve this with a TaskGroup, but we're on Google's Composer, and 2.0 won't be available for a while.
Best workaround I've found:
t1 = PythonOperator()

def generate_tasks():
    t2 = PythonOperator()
    t3 = PythonOperator()
    return [t2, t3]

tasks = generate_tasks()
t1 >> tasks[0] >> tasks[1]
But I'd really like that to be abstracted away, as it more or less defeats the purpose of having multiple operators returned from a single function. We want it to be a single unit as far as the end user knows, even though it can be composed of 2 or more tasks.
How to do it with TaskGroup in Airflow 2.0:
class Encryptor:
    def encrypt_and_archive(self):
        with TaskGroup("archive_and_encrypt") as section_1:
            encrypt = DummyOperator(task_id="encrypt")
            archive = BashOperator(task_id="archive", bash_command='echo 1')
            encrypt >> archive
        return section_1

with DAG(dag_id="example_return_task_group", start_date=days_ago(2), tags=["example"]) as dag:
    start = DummyOperator(task_id="start")
    encrypt_and_archive = Encryptor().encrypt_and_archive()
    end = DummyOperator(task_id='end')
    # 👇 single variable, containing two tasks
    start >> encrypt_and_archive >> end
Which creates a graph where the group appears as a single node between start and end.
Is something similar remotely doable before 2.0?
You didn't explain what magic(t2, t3) is.
TaskGroup is strictly a UI feature; it doesn't affect the DAG logic. According to your description, it seems that you are looking for specific logic (otherwise, what is magic?).
I believe this is what you are after:
default_args = {
    'owner': 'airflow',
    'start_date': datetime(2021, 1, 24),
}

def generate_tasks():
    operator_list = []
    for i in range(5):  # Replace with whatever logic you need to dynamically create tasks
        op = DummyOperator(task_id=f"t{str(i)}_task", dag=dag)
        if i > 0:
            operator_list[i - 1] >> op
        operator_list.append(op)
    return operator_list

with DAG(
    dag_id='loop',
    default_args=default_args,
    schedule_interval=None,
) as dag:
    start_op = DummyOperator(task_id='start_task')
    end_op = DummyOperator(task_id='end_task')
    tasks = generate_tasks()
    start_op >> tasks[0]
    tasks[-1] >> end_op
You can replace the DummyOperator with any operator you'd like.
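For the "abstract it away" part, Airflow's built-in chain() helper (airflow.utils.helpers.chain in 1.10, airflow.models.baseoperator.chain in 2.x) can do the in-sequence wiring inside the function, so the caller only touches the ends of the list. A minimal sketch along those lines (DAG id and task ids are illustrative):

from datetime import datetime
from airflow import DAG
from airflow.models.baseoperator import chain
from airflow.operators.dummy import DummyOperator

with DAG(dag_id='chain_example', start_date=datetime(2021, 1, 24), schedule_interval=None) as dag:

    def generate_tasks():
        t2 = DummyOperator(task_id='t2')
        t3 = DummyOperator(task_id='t3')
        chain(t2, t3)  # wires t2 >> t3 inside the function
        return [t2, t3]

    t1 = DummyOperator(task_id='t1')
    tasks = generate_tasks()
    t1 >> tasks[0]  # caller only attaches to the first task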

Airflow DAG Task Dependency in a Loop

I have a DAG that needs to recompile customer lists for various brands. The script is called with two arguments, brand and listtype.
I need the brands to run concurrently, but the list types to be dependent on the preceding list type, and I can't figure out how to do that in a loop. Can y'all help me out?
BrandsToRun = ['A', 'B', 'C']
ListTypes = ['1', '2', '3']

# Defining the DAG
################################################################################
with DAG(
    'MusterMaster',
    default_args=default_args,
    description='x',
    # schedule_interval = None
    schedule_interval='30 4 * * *',
    catchup=False
) as MusterMaster:

    for Brand in BrandsToRun:
        for ListType in ListTypes:
            ListLoad = BashOperator(
                task_id='Load_' + str(Brand) + '_' + str(ListType),
                bash_command="""python3 '/usr/local/bin/MusterMaster.py' {0} {1}""".format(Brand[0], ListType[0]),
                pool='logs'
            )
            ListLoad
I want the tasks to have a dependency structure like this, but I can't figure it out. Brands should run concurrently, but each ListType should depend on the preceding ListType.
Muster A 1 >> Muster A 2 >> Muster A 3
Muster B 1 >> Muster B 2 >> Muster B 3
Muster C 1 >> Muster C 2 >> Muster C 3
How can I best accomplish this?
You can do:
for Brand in BrandsToRun:
    brand_tasks = []
    for ListType in ListTypes:
        brand_tasks.append(BashOperator(
            task_id='Load_' + str(Brand) + '_' + str(ListType),
            bash_command="""python3 '/usr/local/bin/MusterMaster.py' {0} {1}""".format(Brand[0], ListType[0]),
            pool='logs'))
        if len(brand_tasks) > 1:
            brand_tasks[-2] >> brand_tasks[-1]
Which will give you the dependency structure shown above: one chain per brand.
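The same wiring can be written with the chain() helper, which may read more clearly for longer lists; a sketch under the same assumptions as the question, to be placed inside the with DAG block:

from airflow.models.baseoperator import chain

for Brand in BrandsToRun:
    brand_tasks = [
        BashOperator(
            task_id='Load_' + str(Brand) + '_' + str(ListType),
            bash_command="""python3 '/usr/local/bin/MusterMaster.py' {0} {1}""".format(Brand[0], ListType[0]),
            pool='logs')
        for ListType in ListTypes
    ]
    chain(*brand_tasks)  # Load_A_1 >> Load_A_2 >> Load_A_3, and so on per brand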

create tasks in Airflow by looping over a list and pass arguments

Edit: this now works. I had defined ex_func_airflow(var_1=i), which was causing the issue.
I would like to create tasks in Airflow by looping over a list.
tabs = [1, 2, 3, 4, 5]

for i in tabs:
    task = PythonOperator(
        task_id=name,
        provide_context=False,
        op_args=[i],
        python_callable=ex_func_airflow,
        dag=dag)
    task_0 >> task >> task_1
When this is run in Airflow, the argument that is passed is always the last element in that list.
So I'm essentially running:
ex_func_airflow(6)
five times, instead of running:
ex_func_airflow(1)
ex_func_airflow(2)
ex_func_airflow(3)
...etc.
How can I pass the correct arguments for each task?
The following code works for me.
def print_context(ds, **kwargs):
    print("hello")

def ex_func_airflow(i):
    print(i)

dag = DAG(
    dag_id="loop_dag",
    schedule_interval=None,
    start_date=datetime(2018, 12, 31),
)

task_0 = PythonOperator(
    task_id='task_0',
    provide_context=True,
    python_callable=print_context,
    dag=dag)

task_1 = PythonOperator(
    task_id='task_1',
    provide_context=True,
    python_callable=print_context,
    dag=dag)

tabs = [1, 2, 3, 4, 5]
for i in tabs:
    task_id = f'task_tab_{i}'
    task = PythonOperator(
        task_id=task_id,
        provide_context=False,
        op_args=[i],
        python_callable=ex_func_airflow,
        dag=dag)
    task_0 >> task >> task_1
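The key detail is that op_args=[i] is evaluated when the operator is constructed, so each task freezes its own value of i; a closure such as a lambda would late-bind and read the final loop value at run time. A minimal sketch of the contrast, with illustrative task ids:

for i in tabs:
    # Late binding: the lambda looks up `i` only when the task runs,
    # so every task would see the last value of the loop variable.
    bad = PythonOperator(
        task_id=f'bad_tab_{i}',
        python_callable=lambda: ex_func_airflow(i),
        dag=dag)

    # Early binding: op_args is evaluated now, capturing the current `i`.
    good = PythonOperator(
        task_id=f'good_tab_{i}',
        op_args=[i],
        python_callable=ex_func_airflow,
        dag=dag)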
