Is there a way to create a dynamic executor_config dictionary that holds its values throughout the DAG execution, but it's different for each DAG execution? Either a UUID that holds for the dag execution or using some of the xcom data works for me.
I have tried both things with the following results:
I created a random variable (using UUID) at the beginning of the DAG's definition, assuming it would hold for the whole execution, but it turns out it does not: each task sees a different value.
with DAG("myDag", start_date=datetime(2021, 1, 1), schedule_interval=None) as myDag:
    # Generated while the DAG file is being evaluated. Per the question, the
    # value is NOT stable across the tasks of one run (see the inline notes).
    # NOTE(review): presumably because the file is re-evaluated for every
    # task execution -- confirm.
    UUID = uuid4().hex

    task_create_disk = PythonOperator(
        task_id='create_disk',
        python_callable=createPvc,
        op_args=[UUID],  # UUID = 1111
    )
    task_main = PythonOperator(
        task_id='main',
        python_callable=main,
        executor_config=getPvcConfig(UUID),  # UUID = 2222 does not hold for each task execution
    )
    task_delete_disk = PythonOperator(
        task_id='delete_disk',
        python_callable=deletePvc,
        op_args=[UUID],  # UUID = 3333 does not hold for each task execution
    )
I tried pulling from the xcom using a templated string, but it turns out only op_args and op_kwargs are templated fields, and using a template in the executor_config field will just render the string without templating.
with DAG("myDag", start_date=datetime(2021, 1, 1), schedule_interval=None) as myDag:
    # Each task references the value pushed by 'previous_setup_task' through a
    # Jinja expression. Per the question, the expression is rendered in
    # op_args but is passed through as a plain string in executor_config.
    task_create_disk = PythonOperator(
        task_id='create_disk',
        python_callable=createPvc,
        op_args=["{{ti.xcom_pull(key='myKey', task_ids='previous_setup_task')}}"],  # works OK
    )
    task_main = PythonOperator(
        task_id='main',
        python_callable=main,
        executor_config=getPvcConfig(
            "{{ti.xcom_pull(key='myKey', task_ids='previous_setup_task')}}"
        ),  # this field is not templatable
    )
    task_delete_disk = PythonOperator(
        task_id='delete_disk',
        python_callable=deletePvc,
        op_args=["{{ti.xcom_pull(key='myKey', task_ids='previous_setup_task')}}"],  # works OK
    )
Any ideas?
Related
I am trying to pull an xcom from within a PythonOperator, but I think I might be missing an element of understanding.
This is my dag :
# dag arguments
# NOTE(review): default_args are presumably forwarded to every operator rather
# than to the DAG object itself, so "schedule_interval" and "catchup" may need
# to be passed to DAG(...) directly -- confirm against the Airflow DAG docs.
default_args = {
    "start_date": datetime(2022, 11, 23),
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
    # "max_active_runs": 3,
    # Schedule preset; the leading "@" is part of the preset name.
    "schedule_interval": "@daily",
    "catchup": False,  # enable if you don't want historical dag runs to run
    # "dagrun_timeout": timedelta(minutes=20),
}
with DAG(
    dag_id="migrate_dtm",
    default_args=default_args,
) as dag:
    # Step 1: runs load_data_migration_map.
    load_data_migration_mapping = PythonOperator(
        task_id="load_data_migration_mapping",
        python_callable=load_data_migration_map,
    )

    # Step 2: lists the bucket objects under the execution date's prefix
    # ({{ds_nodash}} is rendered by Airflow at run time).
    list_objects = GCSListObjectsOperator(
        task_id="list_objects",
        bucket=LEGACY_DTM_HISTORY_BUCKET,
        prefix="raw_data/datamart/{{ds_nodash}}",
        impersonation_chain=IMPERSONATED_SERVICE_ACCOUNT,  # Only use in dev env
    )

    # Step 3: receives the *names* under which the two upstream results are
    # expected, and pulls the XComs itself inside the callable.
    match_mapping_with_data = PythonOperator(
        task_id="match_mapping_with_data",
        python_callable=match_data_with_migration_map,
        provide_context=True,
        op_kwargs={
            "initial_migration_map": "initial_migration_map",
            # "files_and_prefixes": "{{task_instance.xcom_pull(task_ids='list_objects')}}",
            "files_and_prefixes": "list_objects",
        },
    )

    (load_data_migration_mapping >> list_objects >> match_mapping_with_data)
In my third PythonOperator : match_mapping_with_data, when I pass the following macro
"files_and_prefixes": "{{task_instance.xcom_pull(task_ids='list_objects')}}"
I can see in the logs that it is correctly interpreted. But when I pass only the list_objects task name, in order to perform the xcom_pull inside the called function match_data_with_migration_map, I get None.
A partial content of my function until the point where it breaks :
def _pull_required_xcom(ti, key, label):
    """Pull the XCom stored under *key* and fail loudly when it is empty.

    Args:
        ti: object exposing ``xcom_pull(key=...)`` (the task instance).
        key: XCom key to look up.
        label: human-readable name used in the error and log messages.

    Returns:
        The pulled value.

    Raises:
        XcomNULL: when the pulled value is falsy (``None``, empty list, ...).
    """
    value = ti.xcom_pull(key=key)
    if not value:
        raise XcomNULL(
            f"The xcom {label} has not "
            f"been correctly set : {value}"
        )
    print(f"xcom : {label} - Key: {key}, Value = {value}")
    return value


def match_data_with_migration_map(
    ti,
    **kwargs,
):
    """Pull and validate the two XComs this task depends on.

    Args:
        ti: task instance used to pull the XComs.
        **kwargs: may carry ``initial_migration_map`` and
            ``files_and_prefixes``, the XCom keys to pull; they default to
            ``"initial_migration_map"`` and ``"list_objects"``.

    Raises:
        XcomNULL: when either XCom is missing or empty.
    """
    # Set arguments' (default) values
    xcom_initial_migration_map = kwargs.setdefault(
        "initial_migration_map",
        "initial_migration_map",
    )
    xcom_files_and_prefixes = kwargs.setdefault(
        "files_and_prefixes",
        "list_objects",
    )

    # Parse Xcoms:
    # NOTE(review): both lookups filter on key= only. A value *returned* by an
    # upstream operator is presumably stored under that task's id with the
    # default key, in which case it would need task_ids= instead -- confirm.
    initial_migration_map = _pull_required_xcom(
        ti, xcom_initial_migration_map, "initial_migration_map"
    )
    files_and_prefixes = _pull_required_xcom(
        ti, xcom_files_and_prefixes, "files_and_prefixes"
    )
The custom class..
class XcomNULL(Exception):
    """Raised when an expected XCom value is missing or empty."""
N.B : The initial_migration_map XCom is correctly pulled because I set it in the first PythonOperator: load_data_migration_mapping via ti.xcom_push(key="initial_migration_map")
EDIT (Kind of a workaround) :
It works when I set "files_and_prefixes": "return_value". How do I rename this XCom so that I can have several of them within my DAG run?
In my actual DAG, I need to first get a list of IDs and then for each ID run a set of tasks.
I have used Dynamic Task Mapping to pass a list to a single task or operator to have it process the list, but can we do this using a TaskGroup as well?
If I can figure out how to pass a variable value at the TaskGroup level, so it uses that value in all sub tasks, then I should be able to meet my requirement.
The below should give you an idea of what I am looking for, just need help getting it working.
from airflow import DAG, XComArg
from datetime import datetime
from airflow.decorators import task
from airflow.utils.task_group import TaskGroup
from airflow.operators.python import PythonOperator
with DAG(
    'dtm_tg_test',
    schedule_interval=None,
    start_date=datetime(2022, 1, 1),
) as dag:

    def getList():
        """Return the values the task group should be run for."""
        return ["Hello", "World"]

    def printText(text):
        """Print the value handed to the task."""
        print(text)

    get_list = PythonOperator(
        task_id="get_list",
        python_callable=getList,
        dag=dag,
    )

    # Sketch of the behaviour the question is asking for: run the whole group
    # once per element returned by get_list.
    # NOTE(review): whether TaskGroup offers partial()/expand() is not shown
    # here -- confirm against the Airflow dynamic task mapping documentation.
    with TaskGroup.partial(
        group_id="task_group"
    ).expand(
        list=XComArg(get_list)
    ) as task_group:
        # `list` below resolves to the built-in type; it stands in for the
        # per-element value the asker wants injected into each sub-task.
        print_text = PythonOperator(
            task_id="print_output",
            python_callable=printText,
            op_kwargs={"text": list},
            dag=dag,
        )
        # Task ids must be unique, so the second task gets its own id.
        print_again = PythonOperator(
            task_id="print_output_again",
            python_callable=printText,
            op_kwargs={"text": list},
            dag=dag,
        )
        print_text >> print_again

    get_list >> task_group
You can achieve it with the following example :
list_ids = ['45', '48']


@task_group()
def parent_group(list_ids: List[str]) -> List[TaskGroup]:
    """Build one child TaskGroup per ID inside the parent group.

    Args:
        list_ids: IDs to create a child group for.

    Returns:
        The child groups, in the same order as ``list_ids``.
    """
    return [build_group_for_id(current_id) for current_id in list_ids]
def build_group_for_id(current_id: str) -> TaskGroup:
    """Create the TaskGroup holding the two chained print tasks for one ID.

    The group id and both task ids embed ``current_id`` so that they stay
    unique when the function is called once per ID.

    Args:
        current_id: the ID the group is built for; also the text printed.

    Returns:
        The populated TaskGroup.
    """
    with TaskGroup(group_id=f'group_for_id_{current_id}') as group:
        print_text = PythonOperator(
            task_id=f"print_output_{current_id}",
            python_callable=printText,
            op_kwargs={"text": current_id},
            dag=dag,
        )
        print_again = PythonOperator(
            task_id=f"print_output_other_{current_id}",
            python_callable=printText,
            op_kwargs={"text": current_id},
            dag=dag,
        )
        print_text >> print_again
    return group
with airflow.DAG(
    "my_dag",
    default_args=args,
    schedule_interval=None,
) as dag:
    # list_ids is a plain list, so it is passed as-is (not called).
    DummyOperator(task_id='start_dag') >> parent_group(list_ids)
Some explanations :
I create a parent taskGroup called parent_group
This parent group takes the list of IDs
I add a loop and, for each parent ID, I create a TaskGroup containing your 2 Airflow tasks (print operators)
For the TaskGroup related to a parent ID, the TaskGroup ID is built from it in order to be unique in the DAG
For the print operators inside the TaskGroup, I generated again the task IDs by the current parent ID
Previously I used the following snippet to dynamically generate tasks:
# Entry task that every worker task waits for.
dummy_start_task = PythonOperator(
    task_id="dummy_start",
    default_args=default_args,
    python_callable=dummy_start,
    dag=dag,
)

# One make_images task per worker; each receives its index as "n".
make_images_tasks = []
for n in range(WORKERS):
    make_images_task = PythonOperator(
        task_id=f'make_images_{n}',
        default_args=default_args,
        python_callable=make_images,
        op_kwargs={"n": n},
        dag=dag,
    )
    # Also exposed under a per-worker module-level name, as before.
    globals()[f"make_images_{n}_task"] = make_images_task
    make_images_tasks.append(make_images_task)

# Fan-in task that runs after all worker tasks.
dummy_collector_task = PythonOperator(
    task_id="dummy_collector",
    default_args=default_args,
    python_callable=dummy_collector,
    dag=dag,
)

dummy_start_task >> make_images_tasks >> dummy_collector_task
# in collector_task I would use:
# items = task_instance.xcom_pull(task_ids=[f"make_images_{n}" for n in range(int(WORKERS))])
# to get the XCOMs from the these dynamically generated tasks
How can I achieve that using the TaskFlow API? (Spawn multiple tasks and then get their XComs in the following collector-task)
Here's an example:
from datetime import datetime
from airflow import DAG
from airflow.decorators import task
with DAG(dag_id="example_taskflow", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:

    @task
    def dummy_start_task():
        """Entry task; does nothing."""
        pass

    # One make_images_<n> task per value of n; each returns its input.
    tasks = []
    for n in range(3):

        @task(task_id=f"make_images_{n}")
        def images_task(i):
            """Return the value it was given."""
            return i

        tasks.append(images_task(n))

    @task
    def dummy_collector_task(tasks):
        """Print the collected outputs of the make_images_* tasks."""
        print(tasks)

    dummy_start_task_ = dummy_start_task()
    dummy_start_task_ >> tasks

    # Passing the task outputs as the argument makes the collector depend on
    # every make_images_* task.
    dummy_collector_task(tasks)
Which gives the following DAG:
The make_images_* tasks take 0, 1, and 2 as input (and also use it in the tasks' id) and return the value. The dummy_collector_task takes all outputs from the make_images_* tasks and prints [0, 1, 2].
I have a problem with my dag getting stuck at subdag. The subdag is in RUNNING state but on zooming in all the tasks of the subdag are in None status.
Using Airflow 2.1.1 with LocalExecutor.
Below is the main dag:
# Arguments handed to the sub-DAGs and their tasks.
default_args = dict(
    owner='airflow',
    retries=1,
    depends_on_past=False,
)
# Parent DAG: triggered manually only (no schedule), no catch-up runs.
dag = DAG(
    'loop_example',
    tags=['loop'],
    catchup=False,
    schedule_interval=None,
    start_date=datetime(2022, 1, 1),
)
## function to filter src_name based on a DB table/log file entry
def check_valid_src(src_name):
    """Return True when *src_name* appears in ``ingsted_src_log_table``.

    Reads every ``src_name`` row through the ``mysql_conn`` connection and
    compares it with the given name.

    Args:
        src_name: source name to look for.

    Returns:
        ``True`` if a row with that name exists, ``False`` otherwise.
    """
    hook = MySqlHook(mysql_conn_id='mysql_conn')
    sql = 'SELECT src_name FROM ingsted_src_log_table'
    records = hook.get_records(sql)
    # Each record is a row; the selected column is its first element.
    return any(row[0] == src_name for row in records)
first = DummyOperator(task_id='first', dag=dag)
last = DummyOperator(task_id='last', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

# One SubDagOperator per option that passes the validity check, each wired
# between the shared `first` and `last` tasks.
# NOTE(review): check_valid_src() is called while the DAG file is being
# evaluated, i.e. presumably on every parse -- confirm this is intended.
for option in options:
    if check_valid_src(option):
        t = SubDagOperator(
            task_id=f'section_{option}',
            subdag=subdag('loop_example', f'section_{option}', default_args, option),
            dag=dag,
        )
        first >> t >> last
subdag code:
def subdag(parent_dag_name, child_dag_name, args, option):
    """Build the sub-DAG that echoes a source name and its numeric suffix.

    Args:
        parent_dag_name: id of the parent DAG.
        child_dag_name: task id of the SubDagOperator in the parent; the
            sub-DAG id is ``<parent_dag_name>.<child_dag_name>``.
        args: default arguments applied to the sub-DAG and its tasks.
        option: source name to echo (e.g. ``branch_a``).

    Returns:
        The sub-DAG containing the two chained Bash tasks.
    """
    dag_subdag = DAG(
        dag_id=f'{parent_dag_name}.{child_dag_name}',
        default_args=args,
        start_date=datetime(2022, 1, 1),
        schedule_interval=None,
    )
    t1 = BashOperator(
        task_id='Echo_source_name',
        bash_command=f'echo {option}',
        default_args=args,
        dag=dag_subdag,
    )
    t2 = BashOperator(
        task_id='Echo_source_number',
        # `cut` selects fields with -f; take the part after the first "_".
        bash_command=f'echo "{option}" | cut -d "_" -f2',
        default_args=args,
        dag=dag_subdag,
    )
    t1 >> t2
    return dag_subdag
Earlier, the start_date of the main DAG and of the subdag were not the same, so I tried running again with the same start_date for both, but it still gets stuck.
Is there anything that I am missing here?
You have to pass is_paused_upon_creation=False when creating the subdag's DAG object:
# Same sub-DAG construction, with is_paused_upon_creation=False added so the
# sub-DAG is not created in a paused state.
dag_subdag = DAG(
    dag_id=f'{parent_dag_name}.{child_dag_name}',
    default_args=args,
    start_date=datetime(2022, 1, 1),
    schedule_interval=None,
    is_paused_upon_creation=False,
)
I am trying to create Airflow DAGs dynamically by looping through the keys of a dictionary and using each key as the DAG name.
The DAGs are created fine, but I am getting: "This DAG isn't available in the webserver DagBag object. It shows up in this list because the scheduler marked it as active in the metadata database", and the DAG is not clickable.
def create_dag(dag_id):
    """Build a DAG named *dag_id* that contains a single echo task.

    The default arguments come from ``build_default_args(config_file)`` and
    the DAG uses the cron schedule ``30 11 * * *``.

    Args:
        dag_id: id to give the new DAG.

    Returns:
        The DAG object.
    """
    args = build_default_args(config_file)
    # Operators created inside the `with` block are attached to the DAG.
    with DAG(dag_id, schedule_interval='30 11 * * *', default_args=args) as dag:
        BashOperator(
            task_id='test_init_task',
            bash_command='echo "task"',
        )
    return dag
def get_data(*, path="/home/airflow/gcs/data/test.json", **kwargs):
    """Load and return the parsed content of the JSON definition file.

    Args:
        path: location of the JSON file; defaults to the original hard-coded
            location.
        **kwargs: accepted for compatibility and ignored.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    import json  # local import keeps this block self-contained

    # The context manager closes the file even when parsing fails.
    with open(path, encoding="utf-8") as file:
        return json.load(file)
# NOTE(review): default_args and schedule are defined here but not passed to
# create_dag(), which builds its own arguments and schedule -- confirm whether
# they are still needed.
default_args = {
    'owner': 'airflow',
    'start_date': datetime(2019, 6, 18),
}
schedule = '@daily'

# The JSON content is expected to be a list of dictionaries; every key becomes
# the id of one generated DAG.
data1 = get_data()
for entry in data1:
    for key, value in entry.items():
        print("key {} value {}".format(key, value))
        dag_id = '{}'.format(key)
        # Registered as a module-level name so that it can be discovered.
        globals()[dag_id] = create_dag(dag_id)