I understand for PythonOperator/BashOperator we can use Xcom to communicate.
e.g.
def func(**context):
    """Call ``xcom_pull()`` on the task instance found in the task context.

    ``context`` must contain a ``'task_instance'`` entry exposing
    ``xcom_pull()``. The pulled value is discarded; the function only
    illustrates where the call is made.
    """
    context['task_instance'].xcom_pull()
However, I am wondering how to access xcom for a custom operator during run time.
My operator looks like this:
class ECHOXOperator(BaseOperator):
    """Operator that prints the value ``x`` it was constructed with."""

    # NOTE(review): the source had "#apply_defaults" here, which looks like a
    # mangled "@apply_defaults" decorator — confirm and import before restoring.
    def __init__(self, x, *args, **kwargs):
        """Store ``x`` and forward all remaining arguments to BaseOperator."""
        self.x = x
        super(ECHOXOperator, self).__init__(*args, **kwargs)

    def execute(self, context):
        """Print the stored value; ``context`` is not used."""
        print(self.x)
So in my DAG:
I can do
# Pass a literal value for x when the DAG is defined.
task2 = ECHOXOperator(x='Hello')
And it works well. But how can I access x from an upstream task?
Something like:
def task1(**context):
    """Push the string "Hello" to XCom under the key "x"."""
    task_instance = context['task_instance']
    task_instance.xcom_push(key="x", value="Hello")


generate_data = PythonOperator(
    task_id="task1",
    python_callable=task1,
    dag=dag,
)
# Does not work: ``task_instance`` is only a local inside ``task1``, so the
# name is undefined when this module-level line is evaluated.
task2 = ECHOXOperator(x = task_instance.xcom_pull('task1', 'x'), provide_context=True)
# Chain the operator object (not the ``task1`` callable) to the downstream task.
generate_data >> task2
This is not working because task_instance in ECHOXOperator is not defined.
Thanks
You should declare x in template_fields in your custom operator.
class ECHOXOperator(BaseOperator):
    """Operator that prints ``x``, where ``x`` is declared as a template field."""

    # Listing 'x' here lets a Jinja expression be passed as its value.
    template_fields = ['x']

    # NOTE(review): the source had "#apply_defaults" here, which looks like a
    # mangled "@apply_defaults" decorator — confirm and import before restoring.
    def __init__(self, x, *args, **kwargs):
        """Store ``x`` and forward all remaining arguments to BaseOperator."""
        self.x = x
        super(ECHOXOperator, self).__init__(*args, **kwargs)

    def execute(self, context):
        """Print the stored value; ``context`` is not used."""
        print(self.x)
And now you can do the following to get the value of Xcom passed in previous task:
def task1(**context):
    """Push the string "Hello" to XCom under the key "x"."""
    task_instance = context['task_instance']
    task_instance.xcom_push(key="x", value="Hello")


generate_data = PythonOperator(
    task_id="task1",
    python_callable=task1,
    dag=dag,
)
# Use keyword arguments: the second positional parameter of xcom_pull is
# dag_id, not key, so ``xcom_pull('task1', 'x')`` would look in a DAG named 'x'.
task2 = ECHOXOperator(
    task_id="task2",
    x="{{ ti.xcom_pull(task_ids='task1', key='x') }}",
    dag=dag,
)
# Chain the operator object (not the ``task1`` callable) to the downstream task.
generate_data >> task2
More information regarding template_fields and Jinja templating: https://airflow.readthedocs.io/en/latest/concepts.html#id1
Related
In my actual DAG, I need to first get a list of IDs and then for each ID run a set of tasks.
I have used Dynamic Task Mapping to pass a list to a single task or operator to have it process the list, but can we do this using a TaskGroup as well?
If I can figure out how to pass a variable value at the TaskGroup level, so it uses that value in all sub tasks, then I should be able to meet my requirement.
The below should give you an idea of what I am looking for, just need help getting it working.
from airflow import DAG, XComArg
from datetime import datetime
from airflow.decorators import task
from airflow.utils.task_group import TaskGroup
from airflow.operators.python import PythonOperator
with DAG(
    'dtm_tg_test',
    schedule_interval=None,
    start_date=datetime(2022, 1, 1),
) as dag:

    def getList():
        """Return the list of values to fan out over."""
        return ["Hello", "World"]

    def printText(text):
        """Print the given text."""
        print(text)

    get_list = PythonOperator(
        task_id="get_list",
        python_callable=getList,
        dag=dag,
    )

    # NOTE(review): TaskGroup.partial(...).expand(...) is the API this question
    # is asking for, not one known to exist; mapped task groups are presumably
    # declared via the @task_group decorator instead — confirm against the
    # Airflow dynamic task mapping documentation.
    with TaskGroup.partial(
        group_id="task_group"
    ).expand(
        list=XComArg(get_list)
    ) as task_group:
        print_text = PythonOperator(
            task_id="print_output",
            python_callable=printText,
            op_kwargs={"text": list},
            dag=dag,
        )
        # Task ids must be unique, so the second task gets its own id.
        print_again = PythonOperator(
            task_id="print_output_again",
            python_callable=printText,
            op_kwargs={"text": list},
            dag=dag,
        )
        print_text >> print_again

    get_list >> task_group
You can achieve it with the following example :
list_ids = ['45', '48']


# NOTE(review): the source had "#task_group()" here, which looks like a mangled
# "@task_group()" decorator — confirm and import before restoring.
def parent_group(list_ids: List[str]) -> List[TaskGroup]:
    """Build one TaskGroup per ID and return them in input order."""
    return [build_group_for_id(current_id) for current_id in list_ids]


def build_group_for_id(current_id: str) -> TaskGroup:
    """Create a TaskGroup with two chained print tasks for ``current_id``.

    The group id and both task ids embed ``current_id`` so they stay unique
    within the DAG.
    """
    with TaskGroup(group_id=f'group_for_id_{current_id}') as group:
        print_text = PythonOperator(
            task_id=f"print_output_{current_id}",
            python_callable=printText,
            op_kwargs={"text": current_id},
            dag=dag,
        )
        print_again = PythonOperator(
            task_id=f"print_output_other_{current_id}",
            python_callable=printText,
            op_kwargs={"text": current_id},
            dag=dag,
        )
        print_text >> print_again
    return group


with airflow.DAG(
    "my_dag", default_args=args, schedule_interval=None,
) as dag:
    # list_ids is a list, not a callable, so it is passed directly.
    DummyOperator(task_id='start_dag') >> parent_group(list_ids)
Some explanations :
I create a parent taskGroup called parent_group
This parent group takes the list of IDs
I add a loop and, for each parent ID, I create a TaskGroup containing your 2 Airflow tasks (print operators)
For the TaskGroup related to a parent ID, the TaskGroup ID is built from it in order to be unique in the DAG
For the print operators inside the TaskGroup, I generated again the task IDs by the current parent ID
Previously I used the following snippet to dynamically generate tasks:
dummy_start_task = PythonOperator(
    task_id="dummy_start",
    default_args=default_args,
    python_callable=dummy_start,
    dag=dag,
)

# Create one make_images task per worker, each receiving its own index n.
make_images_tasks = []
for n in range(WORKERS):
    make_images_task = PythonOperator(
        task_id=f'make_images_{n}',
        default_args=default_args,
        python_callable=make_images,
        op_kwargs={"n": n},
        dag=dag,
    )
    # Keep publishing each task under a module-level name, as the original
    # snippet did, in case other code refers to make_images_<n>_task.
    globals()[f"make_images_{n}_task"] = make_images_task
    make_images_tasks.append(make_images_task)

dummy_collector_task = PythonOperator(
    task_id="dummy_collector",
    default_args=default_args,
    python_callable=dummy_collector,
    dag=dag,
)

# Fan out from the start task to every worker task, then into the collector.
dummy_start_task >> make_images_tasks >> dummy_collector_task
# in collector_task I would use:
# items = task_instance.xcom_pull(task_ids=[f"make_images_{n}" for n in range(int(WORKERS))])
# to get the XComs from these dynamically generated tasks
How can I achieve that using the TaskFlow API? (Spawn multiple tasks and then get their XComs in the following collector-task)
Here's an example:
from datetime import datetime
from airflow import DAG
from airflow.decorators import task
with DAG(dag_id="example_taskflow", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:

    @task
    def dummy_start_task():
        """Placeholder start task; does nothing."""
        pass

    # Spawn three make_images_<n> tasks; each returns the number it was given.
    tasks = []
    for n in range(3):

        @task(task_id=f"make_images_{n}")
        def images_task(i):
            """Return the input unchanged so it is stored as this task's XCom."""
            return i

        tasks.append(images_task(n))

    @task
    def dummy_collector_task(tasks):
        """Print the collected outputs of all make_images_* tasks."""
        print(tasks)

    dummy_start_task_ = dummy_start_task()
    dummy_start_task_ >> tasks
    # Passing the task outputs as an argument makes the collector depend on them.
    dummy_collector_task(tasks)
Which gives the following DAG:
The make_images_* tasks take 0, 1, and 2 as input (and also use it in the tasks' id) and return the value. The dummy_collector_task takes all outputs from the make_images_* tasks and prints [0, 1, 2].
I wrote a custom operator called HadoopPutHdfs in Airflow.
I need to pass the xxx parameter to HadoopPutHdfs and fill it with the return value of the generate_file_path task.
with DAG(dag_id='my_custom_operator_dag', schedule_interval='1 * * * *', default_args=default_args, catchup=False) as dag:
    # Upstream task whose return value should feed the xxx parameter below.
    generate_file_path = PythonOperator(
        task_id='generate_file_path',
        python_callable=generate_file_path_func,
        dag=dag,
    )

    put_to_hdfs = HadoopPutHdfs(
        task_id='put_to_hdfs',
        headers={'Content-Type': 'text/plain'},
        hdfs_path='webhdfs/v1/user/hive/13.zip',
        hadoop_host='10.10.10.146',
        hadoop_port=9870,
        source_path='/opt/airflow/dags/1.zip',
        dag=dag,
        # NOTE(review): this Jinja string is presumably only rendered if
        # HadoopPutHdfs lists 'xxx' in its template_fields — confirm. Passing
        # task_ids as a list presumably also yields a list, not a single value.
        xxx= "{{ ti.xcom_pull(task_ids=['generate_file_path']) }}",
    )
This line does not work:
xxx= "{{ ti.xcom_pull(task_ids=['generate_file_path']) }}"
How can I pass the return value of the generate_file_path function to the xxx parameter?
Sounds like you are missing the definition of xxx as a template_field in your custom operator. For example:
class CustomDummyOperator(BaseOperator):
    """Operator that prints a message passed in through a template field."""

    # Listing the attribute here lets a Jinja expression be passed as its value.
    template_fields = ('msg_from_previous_task',)

    def __init__(self,
                 msg_from_previous_task,
                 *args, **kwargs) -> None:
        """Initialise BaseOperator, then store ``msg_from_previous_task``."""
        super(CustomDummyOperator, self).__init__(*args, **kwargs)
        self.msg_from_previous_task = msg_from_previous_task

    def execute(self, context):
        """Print the stored message; ``context`` is not used."""
        print(f"Message: {self.msg_from_previous_task}")
DAG:
def return_a_str():
    """Return the fixed string used as task_1's result."""
    return "string_value_from_op1"
# Producer: runs return_a_str.
task_1 = PythonOperator(
    python_callable=return_a_str,
    task_id='task_1',
    dag=dag,
)

# Consumer: receives a Jinja expression that pulls from task_1.
task_2 = CustomDummyOperator(
    msg_from_previous_task="{{ ti.xcom_pull(task_ids='task_1') }}",
    task_id='task_2',
    dag=dag,
)
The output from task_2 is: Message: string_value_from_op1
You could use XComArg for a cleaner syntax:
# Same consumer task, wired through task_1.output instead of a Jinja string.
# Equivalent template form: "{{ ti.xcom_pull(task_ids='task_1') }}"
task_2 = CustomDummyOperator(
    msg_from_previous_task=task_1.output,
    task_id='task_2',
    dag=dag,
)
I am trying to dynamically derive the name of the DAG to be triggered from another DAG. In the following code, the task "trigger_transform_dag" fails to execute. Can you please help me derive the DAG id for the task 'trigger_transform_dag' dynamically?
# Default arguments passed to the DAG below via default_args.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'start_date': airflow.utils.dates.days_ago(0),
}
def run_dag(**context):
    """Push the file path 'ABC' to XCom under 'key1' and return 1.

    ``context`` must contain a ``'ti'`` entry exposing ``xcom_push``.
    """
    file_path = 'ABC'
    context['ti'].xcom_push(key='key1', value=file_path)
    return 1
def check_file_name(**context):
    """Choose the DAG id to trigger from the value pushed by the upstream task.

    Pulls 'key1' from the 'run_dataflow_template' task and returns 'sample1'
    when that value is 'ABC', otherwise 'sample2'. ``context`` must contain a
    ``'ti'`` entry exposing ``xcom_pull``.
    """
    pulled_value_1 = context['ti'].xcom_pull(task_ids='run_dataflow_template', key='key1')
    # Comparison, not assignment: "=" inside an if condition is a syntax error.
    if pulled_value_1 == 'ABC':
        return 'sample1'
    return 'sample2'
with DAG('sample',
         default_args=default_args,
         schedule_interval='10 * * * *',
         start_date=datetime(2017, 3, 20),
         max_active_runs=1,
         catchup=False) as dag:

    t1 = PythonOperator(
        task_id='run_dataflow_template',
        provide_context=True,
        python_callable=run_dag,
    )

    # check_file_name reads an XCom, so it needs a task context: run it as its
    # own task instead of calling it while the DAG file is being parsed.
    pick_dag_id = PythonOperator(
        task_id='check_file_name',
        provide_context=True,
        python_callable=check_file_name,
    )

    # NOTE(review): relies on trigger_dag_id being a templated field of
    # TriggerDagRunOperator — confirm for the Airflow version in use.
    t2 = TriggerDagRunOperator(
        task_id="trigger_transform_dag",
        trigger_dag_id="{{ ti.xcom_pull(task_ids='check_file_name') }}",
    )

    end = DummyOperator(
        trigger_rule='one_success',
        task_id='end',
    )

    t1 >> pick_dag_id >> t2 >> end
I don't know if there is a simpler way, but you can create a custom operator that takes inspiration from the TriggerDagRunOperator (https://github.com/apache/airflow/blob/master/airflow/operators/dagrun_operator.py) and uses the passed Callable to get the function.
Something I hacked together really quickly (it can definitely be improved):
from airflow.models import DAG
from airflow.utils.dates import days_ago, timedelta
from airflow.operators.dagrun_operator import TriggerDagRunOperator
import random
import datetime
from typing import Dict, Optional, Union, Callable
from airflow.api.common.experimental.trigger_dag import trigger_dag
from airflow.models import BaseOperator
from airflow.utils import timezone
from airflow.utils.decorators import apply_defaults
class TriggerDagRunWithFuncOperator(BaseOperator):
    """
    Triggers a DAG run for the ``dag_id`` returned by a callable.

    :param get_dag_name_f: zero-argument function returning the dag_id to trigger;
        it is called when the task executes
    :type get_dag_name_f: Callable
    :param conf: Configuration for the DAG run
    :type conf: dict
    :param execution_date: Execution date for the dag (templated)
    :type execution_date: str or datetime.datetime
    """

    template_fields = ("execution_date", "conf")
    ui_color = "#ffefeb"

    @apply_defaults
    def __init__(
        self,
        get_dag_name_f: Callable,
        conf: Optional[Dict] = None,
        execution_date: Optional[Union[str, datetime.datetime]] = None,
        *args,
        **kwargs
    ) -> None:
        """Store the arguments after validating the type of ``execution_date``.

        :raises TypeError: if ``execution_date`` is not a str, a datetime or None
        """
        super().__init__(*args, **kwargs)
        self.conf = conf
        self.get_dag_name_f = get_dag_name_f

        if not isinstance(execution_date, (str, datetime.datetime, type(None))):
            raise TypeError(
                "Expected str or datetime.datetime type for execution_date. "
                "Got {}".format(type(execution_date))
            )

        self.execution_date: Optional[datetime.datetime] = execution_date  # type: ignore

    def execute(self, context: Dict):
        """Resolve the target dag_id and trigger a run of that DAG.

        The run id is "trig__" followed by the execution date, or by the
        current UTC time when no execution date was given.
        """
        if isinstance(self.execution_date, datetime.datetime):
            run_id = "trig__{}".format(self.execution_date.isoformat())
        elif isinstance(self.execution_date, str):
            run_id = "trig__{}".format(self.execution_date)
            self.execution_date = timezone.parse(self.execution_date)  # trigger_dag() expects datetime
        else:
            run_id = "trig__{}".format(timezone.utcnow().isoformat())

        # The dag_id is resolved here, at run time, not when the DAG is parsed.
        dag_id_to_call = self.get_dag_name_f()

        # Ignore MyPy type for self.execution_date because it doesn't pick up the timezone.parse() for strings
        trigger_dag(
            dag_id=dag_id_to_call,
            run_id=run_id,
            conf=self.conf,
            execution_date=self.execution_date,
            replace_microseconds=False,
        )
args = {
    'owner': 'arocketman',
    'start_date': days_ago(1),
}

dag = DAG(dag_id='dyna_dag', default_args=args, schedule_interval=None)


def your_function():
    """Return the id of the DAG that should be triggered."""
    return 'my_sample_dag'


with dag:
    # The function itself is passed (not its result), so the operator can
    # call it when the task runs.
    run_this_task = TriggerDagRunWithFuncOperator(
        task_id='run_this',
        get_dag_name_f=your_function,
    )
edit:
This will work. I had defined ex_func_airflow(var_1 = i), which was causing the issue.
I would like to create tasks in airflow by looping on a list.
tabs = [1, 2, 3, 4, 5]
# Create one task per element, passing the element as the single positional
# argument of ex_func_airflow, and place it between task_0 and task_1.
for i in tabs:
    task = PythonOperator(
        # NOTE(review): `name` is not defined in this snippet; it has to be
        # different for every iteration — confirm where it is set.
        task_id=name,
        provide_context=False,
        op_args=[i],
        python_callable=ex_func_airflow,
        dag=dag,
    )
    task_0 >> task >> task_1
When this is run in airflow the argument that is passed is always the last element in that list.
So i'm essentially running:
ex_func_airflow(6)
five times instead of running
ex_func_airflow(1)
ex_func_airflow(2)
ex_func_airflow(3)
..etc.
How can I pass the correct arguments for each task?
The following code works for me.
def print_context(ds, **kwargs):
    """Print "hello"; ``ds`` and the remaining keyword arguments are ignored."""
    print("hello")
def ex_func_airflow(i):
    """Print the value passed in by the task."""
    print(i)
dag = DAG(
    dag_id="loop_dag",
    schedule_interval=None,
    start_date=datetime(2018, 12, 31),
)

# Fixed entry and exit tasks that every generated task sits between.
task_0 = PythonOperator(
    task_id='task_0',
    provide_context=True,
    python_callable=print_context,
    dag=dag,
)
task_1 = PythonOperator(
    task_id='task_1',
    provide_context=True,
    python_callable=print_context,
    dag=dag,
)

tabs = [1, 2, 3, 4, 5]
for i in tabs:
    # The task id embeds the loop value, so each generated task is distinct.
    task_id = f'task_tab_{i}'
    task = PythonOperator(
        task_id=task_id,
        provide_context=False,
        op_args=[i],
        python_callable=ex_func_airflow,
        dag=dag,
    )
    task_0 >> task >> task_1