I am trying to pull an XCom from within a PythonOperator, but I think I am missing something about how XComs work.
This is my DAG:
# Task-level defaults: Airflow forwards default_args to every operator in the DAG.
default_args = {
    "start_date": datetime(2022, 11, 23),
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}

# schedule_interval / catchup (and max_active_runs / dagrun_timeout) are
# arguments of DAG() itself; they are not read from default_args.
with DAG(
    dag_id="migrate_dtm",
    default_args=default_args,
    schedule_interval="@daily",
    catchup=False,  # do not backfill historical DAG runs
    # max_active_runs=3,
    # dagrun_timeout=timedelta(minutes=20),
) as dag:
    # Pushes the "initial_migration_map" XCom consumed by the last task.
    load_data_migration_mapping = PythonOperator(
        task_id="load_data_migration_mapping",
        python_callable=load_data_migration_map,
    )

    # Lists the objects under the daily prefix of the legacy bucket.
    list_objects = GCSListObjectsOperator(
        task_id="list_objects",
        bucket=LEGACY_DTM_HISTORY_BUCKET,
        prefix="raw_data/datamart/{{ds_nodash}}",
        impersonation_chain=IMPERSONATED_SERVICE_ACCOUNT,  # Only use in dev env
    )

    # op_kwargs carries the XCom *names* the callable should pull itself.
    match_mapping_with_data = PythonOperator(
        task_id="match_mapping_with_data",
        python_callable=match_data_with_migration_map,
        provide_context=True,
        op_kwargs={
            "initial_migration_map": "initial_migration_map",
            # "files_and_prefixes": "{{task_instance.xcom_pull(task_ids='list_objects')}}",
            "files_and_prefixes": "list_objects",
        },
    )

    load_data_migration_mapping >> list_objects >> match_mapping_with_data
In my third task, the PythonOperator match_mapping_with_data, when I pass the following macro:
"files_and_prefixes": "{{task_instance.xcom_pull(task_ids='list_objects')}}"
I can see in the logs that it is rendered correctly. But when I pass only the task name list_objects, so that the callable match_data_with_migration_map can perform the xcom_pull itself, I get None.
Here is the part of my function up to the point where it breaks:
def _pull_required_xcom(ti, key, label):
    """Pull the XCom stored under *key*; raise XcomNULL if it is missing or empty.

    *label* is the human-readable name used in the error and log messages.
    """
    value = ti.xcom_pull(
        key=key,
    )
    if not value:
        err_msg = (
            f"The xcom {label} has not "
            f"been correctly set : {value}"
        )
        raise XcomNULL(err_msg)
    print(
        f"xcom : {label} - Key: {key}, "
        f"Value = {value}"
    )
    return value


def match_data_with_migration_map(
    ti,
    **kwargs,
):
    """Pull the migration map and the listed files from XCom.

    Args:
        ti: Task instance (or any object exposing ``xcom_pull(key=...)``).
        **kwargs: May contain ``initial_migration_map`` and
            ``files_and_prefixes``, each naming the XCom key to pull.

    Raises:
        XcomNULL: If either XCom is missing or empty.
    """
    # Set arguments' (default) values
    xcom_initial_migration_map = kwargs.setdefault(
        "initial_migration_map",
        "initial_migration_map",
    )
    xcom_files_and_prefixes = kwargs.setdefault(
        "files_and_prefixes",
        "list_objects",
    )

    # Parse Xcoms:
    initial_migration_map = _pull_required_xcom(
        ti, xcom_initial_migration_map, "initial_migration_map"
    )
    # NOTE: xcom_pull(key=...) looks for an XCom whose *key* is that string.
    # A value returned by an operator is stored under the key "return_value",
    # so passing a task id such as "list_objects" as the key finds nothing.
    files_and_prefixes = _pull_required_xcom(
        ti, xcom_files_and_prefixes, "files_and_prefixes"
    )
The custom exception class:
class XcomNULL(Exception):
    """Raised when a required XCom is missing or empty."""
N.B.: The initial_migration_map XCom is pulled correctly because I push it explicitly in the first PythonOperator, load_data_migration_mapping, via ti.xcom_push(key="initial_migration_map").
EDIT (kind of a workaround):
It works when I set "files_and_prefixes": "return_value". How do I rename this XCom so that I can have several of them within my DAG run?
Related
In my actual DAG, I need to first get a list of IDs and then for each ID run a set of tasks.
I have used Dynamic Task Mapping to pass a list to a single task or operator to have it process the list, but can we do this using a TaskGroup as well?
If I can figure out how to pass a variable value at the TaskGroup level, so it uses that value in all sub tasks, then I should be able to meet my requirement.
The below should give you an idea of what I am looking for, just need help getting it working.
from airflow import DAG, XComArg
from datetime import datetime
from airflow.decorators import task
from airflow.utils.task_group import TaskGroup
from airflow.operators.python import PythonOperator
with DAG(
    'dtm_tg_test',
    schedule_interval=None,
    start_date=datetime(2022, 1, 1),
) as dag:

    def getList():
        """Return the values the task group should be run for."""
        return ["Hello", "World"]

    def printText(text):
        """Print *text* to the task log."""
        print(text)

    get_list = PythonOperator(
        task_id="get_list",
        python_callable=getList,
        dag=dag,
    )

    # NOTE(review): pseudo-code for the desired behaviour. It sketches mapping
    # a whole TaskGroup over the list returned by get_list; `list` below is
    # meant to be the mapped value, not the builtin.
    with TaskGroup.partial(
        group_id="task_group",
    ).expand(
        list=XComArg(get_list),
    ) as task_group:
        print_text = PythonOperator(
            task_id="print_output",
            python_callable=printText,
            op_kwargs={"text": list},
            dag=dag,
        )
        # Task ids must be unique, so the second task gets its own id.
        print_again = PythonOperator(
            task_id="print_output_again",
            python_callable=printText,
            op_kwargs={"text": list},
            dag=dag,
        )
        print_text >> print_again

    get_list >> task_group
You can achieve this with the following example:
list_ids = ['45', '48']


@task_group()
def parent_group(list_ids: List[str]) -> List[TaskGroup]:
    """Create one child TaskGroup per ID and return them all."""
    return [build_group_for_id(current_id) for current_id in list_ids]


def build_group_for_id(current_id: str) -> TaskGroup:
    """Build the TaskGroup holding the two print tasks for *current_id*.

    The group id and both task ids embed *current_id* so that they stay
    unique within the DAG.
    """
    with TaskGroup(group_id=f'group_for_id_{current_id}') as group:
        # `dag` is the module-level name bound by the `with airflow.DAG(...)`
        # block below; it exists by the time this function is called.
        print_text = PythonOperator(
            task_id=f"print_output_{current_id}",
            python_callable=printText,
            op_kwargs={"text": current_id},
            dag=dag,
        )
        print_again = PythonOperator(
            task_id=f"print_output_other_{current_id}",
            python_callable=printText,
            op_kwargs={"text": current_id},
            dag=dag,
        )
        print_text >> print_again
    return group


with airflow.DAG(
    "my_dag", default_args=args, schedule_interval=None,
) as dag:
    # list_ids is a plain list, so it is passed as-is (not called).
    DummyOperator(task_id='start_dag') >> parent_group(list_ids)
Some explanations:
I create a parent TaskGroup called parent_group.
This parent group takes the list of IDs.
I loop over the list and, for each ID, create a TaskGroup containing your two Airflow tasks (the print operators).
For the TaskGroup related to an ID, the group ID is built from that ID so that it is unique in the DAG.
For the print operators inside the TaskGroup, the task IDs are likewise generated from the current ID.
Is there a way to create a dynamic executor_config dictionary that holds its values throughout the DAG execution, but it's different for each DAG execution? Either a UUID that holds for the dag execution or using some of the xcom data works for me.
I have tried both things with the following results:
I created a random variable (using UUID) at the beginning of the DAG's definition, assuming it will hold for all the execution, but it turns out it does not hold.
with DAG("myDag", start_date=datetime(2021, 1, 1), schedule_interval=None) as myDag:
    # Evaluated each time this file is parsed, not once per DAG run, which is
    # why every task below ends up seeing a different value.
    UUID = uuid4().hex

    task_create_disk = PythonOperator(
        task_id='create_disk',
        python_callable=createPvc,
        op_args=[UUID],  # UUID = 1111
    )
    task_main = PythonOperator(
        task_id='main',
        python_callable=main,
        executor_config=getPvcConfig(UUID),  # UUID = 2222 does not hold for each task execution
    )
    task_delete_disk = PythonOperator(
        task_id='delete_disk',
        python_callable=deletePvc,
        op_args=[UUID],  # UUID = 3333 does not hold for each task execution
    )
I tried pulling from the xcom using a templated string, but it turns out only op_args and op_kwargs are templated fields, and using a template in the executor_config field will just render the string without templating.
with DAG("myDag", start_date=datetime(2021, 1, 1), schedule_interval=None) as myDag:
    task_create_disk = PythonOperator(
        task_id='create_disk',
        python_callable=createPvc,
        op_args=["{{ti.xcom_pull(key='myKey', task_ids='previous_setup_task')}}"],  # works OK
    )
    # The template string is handed to getPvcConfig() while the file is being
    # parsed, so the function receives the raw "{{...}}" text.
    task_main = PythonOperator(
        task_id='main',
        python_callable=main,
        executor_config=getPvcConfig("{{ti.xcom_pull(key='myKey', task_ids='previous_setup_task')}}"),  # this field is not templatable
    )
    task_delete_disk = PythonOperator(
        task_id='delete_disk',
        python_callable=deletePvc,
        op_args=["{{ti.xcom_pull(key='myKey', task_ids='previous_setup_task')}}"],  # works OK
    )
Any ideas?
I'm trying to print the content of a variable computed in an Airflow DAG, so I used echo in a BashOperator, but it doesn't work.
I tried with a predefined variable, but I got the same output.
Here is an example:
with DAG(
    dag_id="tmp",
) as dag:
    test = "test"
    # `$test` is expanded by bash as a shell variable; the Python variable
    # `test` above is never passed to the command.
    test_dag = BashOperator(
        task_id="test_dag",
        bash_command='echo $test',
    )
The output is always empty:
Output:
INFO -
You can interpolate the variable with an f-string. You can even build the task_id from your variable:
with DAG(
    dag_id="tmp",
) as dag:
    test = "test"
    # The f-strings are evaluated in Python, so the command text already
    # contains the value of `test` when bash receives it.
    test_dag = BashOperator(
        task_id=f"t1_{test}",
        bash_command=f"echo {test}",
    )
You can also use the env parameter like so:
with DAG(dag_id="tmp", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:
    # `env` defines the environment variables visible to the bash command,
    # so `$test` is expanded by the shell.
    test_dag = BashOperator(
        task_id="t1",
        bash_command="echo $test",
        env={"test": "something"},
    )
Requirement: pass the XCom value from each upstream task to its corresponding downstream task, for several parallel task pairs.
Error: Error rendering template: can only concatenate str (not "set") to str
I tried the following:
params={"s3_key": list(date.keys())[0]}
The SQL file reads the params as follows:
-- Rendered by Airflow (Jinja) before execution: the pattern is the first
-- element of the XCom pushed by the matching api_into_s3_<s3_key> task.
SELECT
    $1 AS json_array
FROM
    '@STAGE/'
    (file_format => 'public.s3_json',
     pattern => '{{ti.xcom_pull(task_ids='api_into_s3_' + params.s3_key)[0]}}'
    )
Dag code
dats = [{f"date_{x}": (datetime.date.today() - datetime.timedelta(days=x)).strftime("%Y-%m-%d")} for x in range(int(end_offset_days), int(start_offset_days))]
default_args = {
"start_date": datetime.datetime(2022, 1, 1),
"provide_context": True,
"execution_timeout": timedelta(minutes=180),
}
with DAG(
dag_id=dag_id,
default_args=default_args,
schedule_interval=schedule_interval,
dagrun_timeout=timedelta(minutes=180),
max_active_runs=1,
params={},
) as dag:
t0 = EmptyOperator(task_id="start_task")
dates = dats
api_tsk = []
snflk_tsk = []
for date in dates:
api_tsk.append(
APIOper(
task_id=f"api_into_s3_{list(date.keys())[0]}",
date_run=list(date.values())[0],
)
)
snflk_tsk.append(
SnowflakeOperator(
task_id=f"s3_into_snflk_{list(date.keys())[0]}",
snowflake_conn_id="snflk_conn_id",
sql="queries.sql",
warehouse="main_wh",
schema="stg_sch",
params={"s3_key": list(date.keys())[0]}
)
)
t3 = EmptyOperator(task_id="end_task")
t0 >> api_tasks
for i, x in enumerate(zip(api_tasks, snflk_tasks)):
if i == 0:
continue
else:
api_tsk[i - 1] >> snflk_tsk[i - 1]
api_tsk[i] >> snflk_tsk[i]
snflk_tsk >> t3
return dag
As shown below, the XCom value from api_into_s3_date_0 needs to be passed to the s3_into_snflk_date_0 task, and so on for the other dates.
> UPDATES: below is the XCom value from the previous task
But in the log, the query is rendered as:
SELECT $1 AS json_array
FROM '#STAGE/'
(file_format => 'public.S3_JSON',
pattern => a
)
The SQL file has been updated as follows:
pattern =>{{ti.xcom_pull(task_ids='api_into_s3_'+params.s3_key)[0]}}
I am trying to create Airflow DAGs dynamically by looping through dictionary keys and using each key as the DAG name.
The DAGs are created fine, but I am getting: "This DAG isn't available in the webserver DagBag object. It shows up in this list because the scheduler marked it as active in the metdata database", and the DAG is not clickable.
def create_dag(dag_id):
    """Build and return a DAG named *dag_id* that contains a single echo task.

    Default arguments come from ``build_default_args(config_file)``; the DAG
    is scheduled with the cron expression ``30 11 * * *``.
    """
    args = build_default_args(config_file)
    dag = DAG(dag_id, schedule_interval='30 11 * * *', default_args=args)
    with dag:
        # Operators created inside `with dag:` are attached to it automatically.
        BashOperator(
            task_id='test_init_task',
            bash_command='echo "task"',
        )
    return dag
def get_data(*, path="/home/airflow/gcs/data/test.json", **kwargs):
    """Load and return the parsed JSON content of *path*.

    Args:
        path: JSON file to read; defaults to the original hard-coded location.
        **kwargs: Accepted and ignored, as before.

    Returns:
        Whatever ``json.load`` produces for the file.
    """
    # The context manager closes the file even if parsing fails.
    with open(path) as file:
        return json.load(file)
# Loop-invariant settings, defined once instead of on every iteration.
default_args = {
    'owner': 'airflow',
    'start_date': datetime(2019, 6, 18),
}
schedule = '@daily'

# Register one DAG per key found in the JSON data. Assigning into globals()
# puts each DAG at module level, where the file's DAGs are looked up.
data1 = get_data()
for entry in data1:
    for key, value in entry.items():
        print("key", key, "value", value)
        dag_id = '{}'.format(key)
        globals()[dag_id] = create_dag(dag_id)