How to pass previous task state as parameter to another task within the Airflow Taskflow API? - airflow

I want to get the status of a SparkSubmitOperator, transform it to some value that my API understands, and pass it within the payload of a SimpleHttpOperator so that I can update the job status in my DB. I want to do this using the TaskFlow API.
I wrote the code below, but I get this error when I try to load it:
Broken DAG: [/opt/airflow/dags/export/inapp_clicks/export.py] Traceback (most recent call last):
  File "/home/airflow/.local/lib/python3.6/site-packages/airflow/models/baseoperator.py", line 1378, in set_downstream
    self._set_relatives(task_or_task_list, upstream=False, edge_modifier=edge_modifier)
  File "/home/airflow/.local/lib/python3.6/site-packages/airflow/models/baseoperator.py", line 1316, in _set_relatives
    task_object.update_relative(self, not upstream)
AttributeError: 'function' object has no attribute 'update_relative'
Code:
from datetime import datetime
from airflow.decorators import dag, task
from airflow.models import Variable
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator


@dag(schedule_interval=None, start_date=datetime.now(), tags=["export", "inapp"])
def export_inapp_clicks():
    DEFAULT_NUM_EXECUTORS = 2
    DEFAULT_EXECUTOR_CORES = 3
    DEFAULT_EXECUTOR_MEMORY = "2g"
    DEFAULT_DRIVER_MEMORY = "1g"

    @task()
    def update_job_status(dag, ti, execution_date):
        jst = dag.get_task("export_inapp_clicks_job_submission")
        jsti = TaskInstance(jst, execution_date)
        xcom_value = ti.xcom_pull(task_ids="export_inapp_clicks_job_submission")
        print("Task:", jst)
        print("Task Instance:", jsti)
        print("Task State:", jsti.current_state())
        print("XCOM Value:", xcom_value)
        # TODO: call API via SimpleHttpOperator

    job_submission = SparkSubmitOperator(
        task_id="export_inapp_clicks_job_submission",
        conn_id="yarn",
        name="{{ dag_run.conf['name'] }}",
        conf=Variable.get("export_inapp_clicks_conf", deserialize_json=True),
        jars=Variable.get("export_inapp_clicks_jars"),
        application=Variable.get("pyspark_executor_path"),
        application_args=[
            "--module",
            "export_inapp_clicks.export",
            "--org-id",
            "{{ dag_run.conf['orgId'] }}",
            "--app-id",
            "{{ dag_run.conf['appId'] }}",
            "--inapp-id",
            "{{ dag_run.conf['inappId'] }}",
            "--start-date",
            "{{ dag_run.conf['startDate'] }}",
            "--end-date",
            "{{ dag_run.conf['endDate'] }}",
            "--data-path",
            Variable.get("event_data_path"),
            "--es-nodes",
            Variable.get("es_nodes"),
            "--destination",
            Variable.get("export_inapp_clicks_output"),
            "--explain",
            "--debug",
            "--encode-columns",
            "--log-level",
            "WARN"
        ],
        py_files=Variable.get("export_inapp_clicks_py_files"),
        num_executors=Variable.get("export_inapp_clicks_num_executors", DEFAULT_NUM_EXECUTORS),
        executor_cores=Variable.get("export_inapp_clicks_executor_cores", DEFAULT_EXECUTOR_CORES),
        executor_memory=Variable.get("export_inapp_clicks_executor_memory", DEFAULT_EXECUTOR_MEMORY),
        driver_memory=Variable.get("export_inapp_clicks_driver_memory", DEFAULT_DRIVER_MEMORY),
        status_poll_interval=10
    )

    job_submission >> update_job_status


export_dag = export_inapp_clicks()

The DAG breaks because job_submission >> update_job_status references the decorated function itself; a TaskFlow task only becomes an operator once the decorated function is called.
Consider the following example; the first task corresponds to your SparkSubmitOperator task.
_get_upstream_task takes care of getting the state of the first task from the second one by querying the metadata database.
DAG definition, first two tasks:
import json

from airflow.decorators import dag, task
from airflow.utils.dates import days_ago
from airflow.utils.db import provide_session
from airflow.models.taskinstance import TaskInstance
from airflow.providers.http.operators.http import SimpleHttpOperator


@dag(
    default_args={"owner": "airflow"},
    schedule_interval=None,
    start_date=days_ago(0),
    catchup=False,
    tags=["custom_example", "TaskFlow"],
)
def taskflow_previous_task():

    @provide_session
    def _get_upstream_task(upstream_task_id, dag, execution_date, session=None, **_):
        upstream_ti = (
            session.query(TaskInstance)
            .filter(
                TaskInstance.dag_id == dag.dag_id,
                TaskInstance.execution_date == execution_date,
                TaskInstance.task_id == upstream_task_id,
            )
            .first()
        )
        return upstream_ti

    @task
    def job_submission_task(**context):
        print(f"Task Id: {context['ti'].task_id}")
        return {"job_data": "something"}

    @task(trigger_rule='all_done')
    def update_job_status(job_data, **context):
        print(f"Data from previous Task: {job_data['job_data']}")
        upstream_ti = _get_upstream_task("job_submission_task", **context)
        print(f"Upstream_ti state: {upstream_ti.state}")
        return upstream_ti.state

    job_results = job_submission_task()
    job_status = update_job_status(job_results)
job_submission_task returns a dict that is passed to update_job_status via XComs using XComArg, which is a main feature of the TaskFlow API. By doing so you avoid explicitly performing xcom_pull() and xcom_push() operations.
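For comparison, here is a minimal sketch (not part of the DAG above, and the task name is hypothetical) of what the consuming task would look like with an explicit xcom_pull instead of an XComArg argument:

@task(trigger_rule='all_done')
def update_job_status_explicit(**context):
    # Hypothetical variant: pull the upstream return value by hand
    # instead of receiving it as a function argument via XComArg.
    job_data = context["ti"].xcom_pull(task_ids="job_submission_task")
    print(f"Data from previous Task: {job_data['job_data']}")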
Once you get the TaskInstance object from the _get_upstream_task method, you can return its state and retrieve it again from the last task, which will perform the HTTP request:
Final task, end of DAG definition:
    task_post_op = SimpleHttpOperator(
        task_id="post_op",
        endpoint="post",
        data=json.dumps({"job_status": f"{job_status}"}),
        headers={"Content-Type": "application/json"},
        log_response=True,
    )

    job_status >> task_post_op


example_dag = taskflow_previous_task()
Since the data param of SimpleHttpOperator is templated, you can retrieve the XCom value from the second task using Jinja:
data=json.dumps({"job_status": f"{job_status}"}),
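If you prefer to write the Jinja expression explicitly, an equivalent sketch (assuming the task id update_job_status defined above; the XComArg f-string effectively renders to the same kind of pull template) would be:

task_post_op = SimpleHttpOperator(
    task_id="post_op",
    endpoint="post",
    # Pull the state returned by update_job_status when the field is rendered
    data=json.dumps({"job_status": "{{ ti.xcom_pull(task_ids='update_job_status') }}"}),
    headers={"Content-Type": "application/json"},
    log_response=True,
)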
Execution logs:
Task_1:
AIRFLOW_CTX_DAG_RUN_ID=manual__2021-08-20T23:15:15.226853+00:00
[2021-08-20 23:15:17,148] {logging_mixin.py:104} INFO - Task Id: job_submission_task
[2021-08-20 23:15:17,148] {python.py:151} INFO - Done. Returned value was: {'job_data': 'something'}
[2021-08-20 23:15:17,202] {taskinstance.py:1211} INFO - Marking task as SUCCESS.
Task_2:
AIRFLOW_CTX_DAG_ID=taskflow_previous_task
AIRFLOW_CTX_TASK_ID=update_job_status
AIRFLOW_CTX_EXECUTION_DATE=2021-08-20T23:15:15.226853+00:00
AIRFLOW_CTX_DAG_RUN_ID=manual__2021-08-20T23:15:15.226853+00:00
[2021-08-20 23:15:18,768] {logging_mixin.py:104} INFO - Data from previous Task: something
[2021-08-20 23:15:18,792] {logging_mixin.py:104} INFO - Upstream_ti state: success
[2021-08-20 23:15:18,793] {python.py:151} INFO - Done. Returned value was: success
[2021-08-20 23:15:18,874] {taskinstance.py:1211} INFO - Marking task as SUCCESS.
Task_3:
AIRFLOW_CTX_DAG_ID=taskflow_previous_task
AIRFLOW_CTX_TASK_ID=post_op
AIRFLOW_CTX_EXECUTION_DATE=2021-08-20T23:15:15.226853+00:00
AIRFLOW_CTX_DAG_RUN_ID=manual__2021-08-20T23:15:15.226853+00:00
[2021-08-20 23:15:21,201] {http.py:111} INFO - Calling HTTP method
[2021-08-20 23:15:21,228] {base.py:78} INFO - Using connection to: id: http_default. Host: https://www.httpbin.org, Port: None, Schema: , Login: , Password: None, extra: {}
[2021-08-20 23:15:21,245] {http.py:140} INFO - Sending 'POST' to url: https://www.httpbin.org/post
[2021-08-20 23:15:21,973] {http.py:115} INFO - {
"args": {},
"data": "{\"job_status\": \"success\"}",
"files": {},
"form": {},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Content-Length": "25",
"Content-Type": "application/json",
"Host": "www.httpbin.org",
"User-Agent": "python-requests/2.25.1",
"X-Amzn-Trace-Id": "Root=1-61203789-0136b7557ba4e0116bb5e16d"
},
"json": {
"job_status": "success"
},
"origin": "200.73.153.254",
"url": "https://www.httpbin.org/post"
}
[2021-08-20 23:15:22,027] {taskinstance.py:1211} INFO - Marking task as SUCCESS.
Let me know if that works for you; I tried to use as many TaskFlow features as I could.
Source: Docs1 Docs2
Edit:
Added trigger_rule='all_done' to update_job_status task.

Related

Parse context from Task to DAG in case of success or failure

There are multiple tasks running inside a DAG, according to the code below.
import logging
from airflow import DAG
from datetime import datetime, timedelta
from util.email_util import Email
from util.slack_alert_util import task_failure_alert
from airflow.operators.dummy import DummyOperator
from airflow.operators.postgres_operator import PostgresOperator


def dag_failure_notification_alert(context):
    # Slack notification
    logging.info("Sending DAG Slack notification")
    task_failure_alert(context)

    # Email notification
    subject = 'DAG Failure Alert'
    from_email = 'abcd@xyz.com'
    to_email = ['abcd@xyz.com']
    dag_name = str(context['dag'])[6:-1]
    dag_run = str(context['dag_run'])[8:-1]
    message_body = """
    <html>
        <body>
        <strong>Airflow DAG Failure Report</strong><br /><br />
        Dag Name: {}<br />
        Dag run details: {}<br />
        Execution date and time: {}<br />
        Run ID: {}<br />
        Task Instance Key: {}<br />
        Exception: {}<br />
        </body>
    </html>
    """.format(dag_name, dag_run, str(context['execution_date']), str(context['run_id']),
               str(context['task_instance_key_str']), str(context.get('exception')))
    logging.info("Message body created for DAG as: %s", message_body)
    email_obj = Email(
        {'Subject': subject, 'From': from_email, 'To': to_email, 'body': message_body, 'file': None, 'filename': '',
         'body_type': 'html'})
    email_obj.send()


def task_failure_notification_alert(context):
    # Slack notification
    logging.info("Sending Task Slack notification")
    task_failure_alert(context)


default_args = {
    "owner": "analytics",
    "start_date": datetime(2021, 12, 12),
    'retries': 0,
    'retry_delay': timedelta(),
    "schedule_interval": "@daily"
}

dag = DAG('test_alert_notification',
          default_args=default_args,
          catchup=False,
          on_failure_callback=dag_failure_notification_alert
          )

start_task = DummyOperator(task_id="start_task", dag=dag, on_failure_callback=task_failure_notification_alert)
end_task = DummyOperator(task_id="end_task", dag=dag, on_failure_callback=task_failure_notification_alert)

create_table_sql_query = '''
CREATE TABLE dummy_table (id INT NOT NULL, name VARCHAR(250) NOT NULL);
'''

for i in range(5):
    create_table_task = PostgresOperator(
        sql=create_table_sql_query,
        task_id=str(i),
        postgres_conn_id="postgres_dummy_test",
        dag=dag,
        on_failure_callback=task_failure_notification_alert
    )
    start_task >> create_table_task >> end_task
DAG graph according to the above code.
As we can see in the DAG graph image above, if any of the parallel Postgres tasks (0, 1, 2, 3, 4) fails, on_failure_callback calls the Python function (task_failure_notification_alert) with the context to send a Slack notification.
On DAG failure, the DAG-level on_failure_callback with the dag_failure_notification_alert function sends both Slack and email notifications using the context.
In case of task failure, the output looks like this:
DAG FAIL ALERT
dag: <DAG: test_alert_notification>,
dag_run: <DagRun test_alert_notification @ 2022-11-29 12:03:13.245324+00:00: manual__2022-11-29T12:03:13.245324+00:00, externally triggered: True>,
execution_date: 2022-11-29T12:03:13.245324+00:00,
run_id: manual__2022-11-29T12:03:13.245324+00:00,
task_instance_key_str: test_alert_notification__4__20221129
exception: The conn_id postgres_dummy_test isn't defined
or
DAG FAIL ALERT
dag: <DAG: test_alert_notification>,
dag_run: <DagRun test_alert_notification @ 2022-11-29 12:03:13.245324+00:00: manual__2022-11-29T12:03:13.245324+00:00, externally triggered: True>,
execution_date: 2022-11-29T12:03:13.245324+00:00,
run_id: manual__2022-11-29T12:03:13.245324+00:00,
task_instance_key_str: test_alert_notification__5__20221129
exception: The conn_id postgres_dummy_test isn't defined
for each different task.
On DAG failure, the context contains the exception as None and only a single task instance key, which is that of the last successful task.
DAG failure Output format:
DAG FAIL ALERT
dag: <DAG: test_alert_notification>,
dag_run: <DagRun test_alert_notification @ 2022-11-30 09:33:02.032456+00:00: manual__2022-11-30T09:33:02.032456+00:00, externally triggered: True>,
execution_date: 2022-11-30T09:33:02.032456+00:00,
run_id: manual__2022-11-30T09:33:02.032456+00:00,
task_instance_key_str: test_alert_notification__start_task__20221130
exception: None
I want to pass the task failure information, i.e. exceptions and task instances, to dag_failure_notification_alert so that it can send an email with the accumulated information of all failed tasks.
I tried using common global variables, i.e. exceptions and task_instances as lists, appending all task exceptions and task instances to them inside the task_failure_notification_alert function, and later reading the same variables inside the dag_failure_notification_alert function, but it didn't work.
I tried using a Python callback as mentioned here, but it works with PythonOperator only.
I read about the XCom push and pull mechanism, but it focuses on sharing data between tasks (if I understand it correctly), and I am unsure how to use it here.
As I am new to Airflow, kindly suggest the best way to do this, or any other method that suits this kind of requirement.
Here is the solution I found for it, based on a Stack Overflow answer.
We can get the list of failed tasks using only the passed context, e.g.:
ti = context['task_instance']
for t in ti.get_dagrun().get_task_instances(state=TaskInstanceState.FAILED):  # type: TaskInstance
    logging.info(f'failed dag: {t.dag_id}, task: {t.task_id}, url: {t.log_url}')
Update dag_failure_notification_alert as follows:
from airflow.utils.state import TaskInstanceState  # import needed for the failed-state filter


def dag_failure_notification_alert(context):
    # Slack notification
    logging.info("Sending DAG Slack notification")
    task_failure_alert(context)

    failed_tasks = []
    dag_name = None
    ti = context['task_instance']
    for t in ti.get_dagrun().get_task_instances(state=TaskInstanceState.FAILED):  # type: TaskInstance
        logging.info(f'failed dag: {t.dag_id}, task: {t.task_id}, url: {t.log_url}')
        dag_name = t.dag_id
        failed_tasks.append({'id': t.task_id, 'url': t.log_url})

    if failed_tasks:
        # Email notification
        subject = 'DAG Failure Alert'
        from_email = 'abcd@xyz.com'
        to_email = ['abcd@xyz.com']
        task_url_link = ""
        for failed_task in failed_tasks:
            task_url_link += """<a href="{}">{}</a>, """.format(failed_task['url'], failed_task['id'])
        task_url_link = task_url_link[:-2]
        message_body = """
        <html>
            <body>
            <strong>Airflow DAG Failure Report</strong><br /><br />
            <b>Dag Name:</b> {}<br />
            <b>Task Details:</b> [{}]<br />
            <br />
            Thanks,<br />
            </body>
        </html>
        """.format(dag_name, task_url_link)
        logging.info("Message body created for DAG as: %s", message_body)
        email_obj = Email(
            {'Subject': subject, 'From': from_email, 'To': to_email, 'body': message_body, 'file': None, 'filename': '',
             'body_type': 'html'})
        email_obj.send()
    else:
        logging.info("No failure Tasks fetched.")
I hope this helps anyone who faces the same issue; that is why I posted the answer.

DataflowTemplatedJobStartOperator in Apache Airflow not setting Job Region as expected

I'm running Apache Airflow 2.2.5 on Google Cloud Composer 2.0.31. The region for my composer instance is europe-west1. I'm trying to use Composer to trigger a dataflow job on a different project. I'm using the below DAG to run DataflowTemplatedJobStartOperator. The issue I'm running into is that when the DAG executes the job region is us-central1 with the worker_location in europe-west1. I've tried lots of different combinations of parameters and can't seem to get the job_region to be europe-west1 as well. Any ideas on what I might be doing wrong?
import datetime

from airflow import models
from airflow.providers.google.cloud.operators.dataflow import (
    DataflowTemplatedJobStartOperator,
)
from airflow.utils.dates import days_ago

default_args = {
    "start_date": days_ago(1),
    "retries": 0,
    "dataflow_default_options": {
        "project": "my-project",
        "location": "europe-west1",
        "zone": "europe-west1-b",
        "stagingLocation": "gs://my-bucket-temp/temp/",
        "tempLocation": "gs://my-bucket-temp/temp/",
        "workerMachineType": "n1-standard-1",
    },
}

with models.DAG(
    "dataflow-batch-redis-revision-1",
    default_args=default_args,
    schedule_interval=datetime.timedelta(days=1),
) as dag:
    start_template_job = DataflowTemplatedJobStartOperator(
        task_id="dataflow_operator_batch_bq_to_redis",
        template="gs://my-bucket-temp/template/BatchRedisUpdatePipelineTemplate",
        parameters={
            "inputTopic": "inputtopic",
            "bigQueryInputProject": "inputproject",
            "bigQueryInputDataset": "dataset",
            "bigQueryInputTable": "table",
            "bigQueryInputSQLBranchMetadata": "DUMMY",
            "bigQueryInputSQLBranchSkuWeek": "DUMMY",
            "redisHost": "host",
            "redisPort": "6379",
        },
    )
For this operator you have to pass the location, because the default value is us-central1:
DataflowTemplatedJobStartOperator(
    task_id="dataflow_operator_batch_bq_to_redis",
    template="gs://my-bucket-temp/template/BatchRedisUpdatePipelineTemplate",
    location="europe-west1",
    parameters={
        "inputTopic": "inputtopic",
        "bigQueryInputProject": "inputproject",
        "bigQueryInputDataset": "dataset",
        "bigQueryInputTable": "table",
        "bigQueryInputSQLBranchMetadata": "DUMMY",
        "bigQueryInputSQLBranchSkuWeek": "DUMMY",
        "redisHost": "host",
        "redisPort": "6379",
    }
)
In the constructor of this operator, we can see that the location field has a default value of us-central1:
DEFAULT_DATAFLOW_LOCATION = "us-central1"

def __init__(
    self,
    *,
    template: str,
    job_name: str = "{{task.task_id}}",
    options: Optional[Dict[str, Any]] = None,
    dataflow_default_options: Optional[Dict[str, Any]] = None,
    parameters: Optional[Dict[str, str]] = None,
    project_id: Optional[str] = None,
    location: str = DEFAULT_DATAFLOW_LOCATION,
    gcp_conn_id: str = "google_cloud_default",
    delegate_to: Optional[str] = None,
    poll_sleep: int = 10,
    impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
    environment: Optional[Dict] = None,
    cancel_timeout: Optional[int] = 10 * 60,
    wait_until_finished: Optional[bool] = None,
    **kwargs,
)
If you pass "region": "europe-west1" in the Dataflow options, this Airflow operator overwrites that value with the location field value.
That's why you have to pass the location field in the operator.

Airflow Xcom not getting resolved return task_instance string

I am facing an odd issue with xcom_pull where it always returns the raw template string
"{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}"
My requirement is simple: I have pushed an XCom using a PythonOperator, and with xcom_pull I am trying to retrieve the value and pass it as the http_conn_id for SimpleHttpOperator, but the variable comes through as the literal string instead of the resolved XCom value.
The PythonOperator is successfully able to push the XCom.
Code:
from datetime import datetime

import simplejson as json
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.providers.http.operators.http import SimpleHttpOperator
from google.auth.transport.requests import Request

default_airflow_args = {
    "owner": "divyaansh",
    "depends_on_past": False,
    "start_date": datetime(2022, 5, 18),
    "retries": 0,
    "schedule_interval": "@hourly",
}

project_configs = {
    "project_id": "test",
    "conn_id": "google_cloud_storage_default",
    "bucket_name": "test-transfer",
    "folder_name": "processed-test-rdf",
}


def get_config_vals(**kwargs) -> dict:
    """
    Get config vals from airlfow variable and store it as xcoms
    """
    task_instance = kwargs["task_instance"]
    task_instance.xcom_push(key="http_con_id", value="gcp_cloud_function")


def generate_api_token(cf_name: str):
    """
    generate token for api request
    """
    import google.oauth2.id_token

    request = Request()
    target_audience = f"https://us-central1-test-a2h.cloudfunctions.net/{cf_name}"
    return google.oauth2.id_token.fetch_id_token(
        request=request, audience=target_audience
    )


with DAG(
    dag_id="cf_test",
    default_args=default_airflow_args,
    catchup=False,
    render_template_as_native_obj=True,
) as dag:
    start = DummyOperator(task_id="start")

    config_vals = PythonOperator(
        task_id="get_config_val", python_callable=get_config_vals, provide_context=True
    )

    ip_data = json.dumps(
        {
            "bucket_name": project_configs["bucket_name"],
            "file_name": "dummy",
            "target_location": "/valid",
        }
    )

    conn_id = "{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}"

    api_token = generate_api_token("new-cp")
    cf_task = SimpleHttpOperator(
        task_id="file_decrypt_and_validate_cf",
        http_conn_id=conn_id,
        method="POST",
        endpoint="new-cp",
        data=json.dumps(
            json.dumps(
                {
                    "bucket_name": "test-transfer",
                    "file_name": [
                        "processed-test-rdf/dummy_20220501.txt",
                        "processed-test-rdf/dummy_20220502.txt",
                    ],
                    "target_location": "/valid",
                }
            )
        ),
        headers={
            "Authorization": f"bearer {api_token}",
            "Content-Type": "application/json",
        },
        do_xcom_push=True,
        log_response=True,
    )
    print("task new-cp", cf_task)

    check_flow = DummyOperator(task_id="check_flow")
    end = DummyOperator(task_id="end")

    start >> config_vals >> cf_task >> check_flow >> end
Error Message:
raise AirflowNotFoundException(f"The conn_id `{conn_id}` isn't defined") airflow.exceptions.AirflowNotFoundException: The conn_id `"{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}"` isn't defined
I have tried several different ways but nothing seems to be working.
Can someone point me in the right direction here?
Airflow-version : 2.2.3
Composer-version : 2.0.11
In SimpleHttpOperator the http_conn_id parameter is not a templated field, so you cannot use the Jinja engine with it; the parameter is never rendered. When you pass "{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}" to the operator, you expect it to be replaced at runtime with the value stored in XCom by the previous task, but Airflow treats it as a plain string. That is also what the exception tells you: Airflow tries to find a connection whose name is your very long string, cannot find it, and reports that the connection is not defined.
To solve it you can create a custom operator:
class MySimpleHttpOperator(SimpleHttpOperator):
    template_fields = SimpleHttpOperator.template_fields + ("http_conn_id",)
Then you should replace SimpleHttpOperator with MySimpleHttpOperator in your DAG.
This change makes the string that you set in http_conn_id pass through the Jinja engine, so in your case it will be replaced with the XCom value as you expect.
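With that subclass in place, the task definition only needs the new class; a minimal sketch reusing the ids from your DAG:

cf_task = MySimpleHttpOperator(
    task_id="file_decrypt_and_validate_cf",
    # Rendered at runtime now that http_conn_id is listed in template_fields
    http_conn_id="{{ task_instance.xcom_pull(task_ids='get_config_val', key='http_con_id') }}",
    method="POST",
    endpoint="new-cp",
    data=ip_data,
    headers={"Content-Type": "application/json"},
    log_response=True,
)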

Airflow how to set default values for dag_run.conf

I'm trying to set up an Airflow DAG that provides default values available from dag_run.conf. This works great when running the DAG from the web UI, using the "Run w/ Config" option. However, when running on the schedule, the dag_run.conf dict is not present and the task will fail, e.g.
jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'key1'
Below is an example job.
Is it possible to make it so that dag_run.conf always contains the dict defined by params here?
from airflow import DAG
from airflow.utils.dates import hours_ago
from airflow.operators.bash import BashOperator
from datetime import timedelta


def do_something(val1: str, val2: str) -> str:
    return f'echo vars are: "{val1}, {val2}"'


params = {
    'key1': 'def1',
    'key2': 'def2',
}

default_args = {
    'retries': 0,
}

with DAG(
    'template_test',
    default_args=default_args,
    schedule_interval=timedelta(minutes=1),
    start_date=hours_ago(1),
    params=params,
) as dag:
    hello_t = BashOperator(
        task_id='example-command',
        bash_command=do_something('{{dag_run.conf["key1"]}}', '{{dag_run.conf["key2"]}}'),
        config=params,
    )
The closest I've seen is For Apache Airflow, How can I pass the parameters when manually trigger DAG via CLI?; however, there they leverage Jinja and if/else, which would require defining these default parameters twice. I'd like to define them only once.
You could use DAG params to achieve what you are looking for:
params (dict) – a dictionary of DAG level parameters that are made accessible in templates, namespaced under params. These params can be overridden at the task level.
You can define params at DAG or Task levels and also add or modify them from the UI in the Trigger DAG w/ config section.
Example DAG:
default_args = {
    "owner": "airflow",
}

dag = DAG(
    dag_id="example_dag_params",
    default_args=default_args,
    schedule_interval="*/5 * * * *",
    start_date=days_ago(1),
    params={"param1": "first_param"},
    catchup=False,
)

with dag:
    bash_task = BashOperator(
        task_id="bash_task", bash_command="echo bash_task: {{ params.param1 }}"
    )
Output log:
[2021-10-02 20:23:25,808] {logging_mixin.py:104} INFO - Running <TaskInstance: example_dag_params.bash_task 2021-10-02T23:15:00+00:00 [running]> on host worker_01
[2021-10-02 20:23:25,867] {taskinstance.py:1302} INFO - Exporting the following env vars:
AIRFLOW_CTX_DAG_OWNER=***
AIRFLOW_CTX_DAG_ID=example_dag_params
AIRFLOW_CTX_TASK_ID=bash_task
AIRFLOW_CTX_EXECUTION_DATE=2021-10-02T23:15:00+00:00
AIRFLOW_CTX_DAG_RUN_ID=scheduled__2021-10-02T23:15:00+00:00
[2021-10-02 20:23:25,870] {subprocess.py:52} INFO - Tmp dir root location:
/tmp
[2021-10-02 20:23:25,871] {subprocess.py:63} INFO - Running command: ['bash', '-c', 'echo bash_task: first_param']
[2021-10-02 20:23:25,884] {subprocess.py:74} INFO - Output:
[2021-10-02 20:23:25,886] {subprocess.py:78} INFO - bash_task: first_param
[2021-10-02 20:23:25,887] {subprocess.py:82} INFO - Command exited with return code 0
From the logs, notice that the dag_run is scheduled and the params are still there.
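If you specifically want to keep referencing dag_run.conf in your templates, a possible variant is to fall back to params inside the template itself. This is an untested sketch reusing the do_something helper and params dict from your DAG, and it assumes dag_run.conf is an empty dict on scheduled runs (which the UndefinedError above suggests; if your version leaves it as None, this won't help):

hello_t = BashOperator(
    task_id='example-command',
    bash_command=do_something(
        # Use the manual-trigger conf when present, otherwise the DAG-level params
        '{{ dag_run.conf.get("key1", params.key1) }}',
        '{{ dag_run.conf.get("key2", params.key2) }}',
    ),
)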
You can find a more extensive example on using parameters in this answer.
Hope that works for you!

Airflow: How to pass data from a decorated task to SimpleHttpOperator?

I recently started using Apache Airflow. I am using the TaskFlow API with one decorated task with id Get_payload and a SimpleHttpOperator. The Get_payload task gets data from a database, does some data manipulation, and returns a dict as the payload.
Problem
I am unable to pass data from the previous task into the next task. Yes, I am aware of XComs, but the whole purpose of using the TaskFlow API is to avoid direct interactions with XComs. I get the error below when get_data is passed directly to the data property of SimpleHttpOperator.
airflow.exceptions.AirflowException: 400:BAD REQUEST
What have I tried so far?
As mentioned in this SO answer, I used template_fields in my custom sensor to define the field in which to expect the data from the previous task. In the case of SimpleHttpOperator I cannot edit the operator to do the same, so how can I solve this similarly for SimpleHttpOperator?
I have checked this SO answer and this one as well.
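(For reference, the template_fields approach I mean looks roughly like the sketch below; the class and field names are illustrative, not my actual code.)

from airflow.sensors.base import BaseSensorOperator


class MyCustomSensor(BaseSensorOperator):
    # Listing the attribute here makes Airflow render it with Jinja,
    # so an upstream XCom expression passed in is resolved at runtime.
    template_fields = ("expected_payload",)

    def __init__(self, expected_payload=None, **kwargs):
        super().__init__(**kwargs)
        self.expected_payload = expected_payload

    def poke(self, context):
        # expected_payload has already been rendered by the time poke runs
        return self.expected_payload is not None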
DAG:
from airflow.decorators import dag, task
from airflow.providers.http.operators.http import SimpleHttpOperator
from datetime import datetime

default_args = {
    "owner": "airflow",
    "start_date": datetime(2021, 1, 1),
}


@dag(default_args=default_args, schedule_interval=None, tags=["Http Operators"])
def http_operator():
    @task(multiple_outputs=True)
    def Get_payload(**kwargs):
        # STEP 1: Get data from database.
        # STEP 2: Manipulate data.
        # STEP 3: Return payload.
        data = {
            "key_1": "Value 1",
            "key_2": "Value 2",
            "key_3": "Value 3",
            "key_4": "Value 4",
        }
        return data

    get_data = Get_payload()

    ml_api = SimpleHttpOperator(
        task_id="some_api",
        http_conn_id="http_conn_id",
        method="POST",
        endpoint="/some-path",
        data=get_data,
        headers={"Content-Type": "application/json"},
    )

    get_data >> ml_api


http_operator_dag = http_operator()
Full log:
[2021-08-28 20:28:12,947] {taskinstance.py:903} INFO - Dependencies all met for <TaskInstance: http_operator.clf_api 2021-08-28T20:28:10.265689+00:00 [queued]>
[2021-08-28 20:28:12,970] {taskinstance.py:903} INFO - Dependencies all met for <TaskInstance: http_operator.clf_api 2021-08-28T20:28:10.265689+00:00 [queued]>
[2021-08-28 20:28:12,970] {taskinstance.py:1094} INFO -
--------------------------------------------------------------------------------
[2021-08-28 20:28:12,971] {taskinstance.py:1095} INFO - Starting attempt 1 of 1
[2021-08-28 20:28:12,971] {taskinstance.py:1096} INFO -
--------------------------------------------------------------------------------
[2021-08-28 20:28:12,982] {taskinstance.py:1114} INFO - Executing <Task(SimpleHttpOperator): clf_api> on 2021-08-28T20:28:10.265689+00:00
[2021-08-28 20:28:12,987] {standard_task_runner.py:52} INFO - Started process 19229 to run task
[2021-08-28 20:28:12,991] {standard_task_runner.py:76} INFO - Running: ['***', 'tasks', 'run', 'http_operator', 'clf_api', '2021-08-28T20:28:10.265689+00:00', '--job-id', '71', '--pool', 'default_pool', '--raw', '--subdir', 'DAGS_FOLDER/Http_Operator.py', '--cfg-path', '/tmp/tmp4l9hwi4q', '--error-file', '/tmp/tmpk1yrhtki']
[2021-08-28 20:28:12,993] {standard_task_runner.py:77} INFO - Job 71: Subtask clf_api
[2021-08-28 20:28:13,048] {logging_mixin.py:109} INFO - Running <TaskInstance: http_operator.clf_api 2021-08-28T20:28:10.265689+00:00 [running]> on host d332abee08c8
[2021-08-28 20:28:13,126] {taskinstance.py:1251} INFO - Exporting the following env vars:
AIRFLOW_CTX_DAG_OWNER=***
AIRFLOW_CTX_DAG_ID=http_operator
AIRFLOW_CTX_TASK_ID=clf_api
AIRFLOW_CTX_EXECUTION_DATE=2021-08-28T20:28:10.265689+00:00
AIRFLOW_CTX_DAG_RUN_ID=manual__2021-08-28T20:28:10.265689+00:00
[2021-08-28 20:28:13,128] {http.py:111} INFO - Calling HTTP method
[2021-08-28 20:28:13,141] {base.py:70} INFO - Using connection to: id: ML_API. Host: <IP-REMOVED>, Port: None, Schema: , Login: dexter, Password: ***, extra: {}
[2021-08-28 20:28:13,144] {http.py:140} INFO - Sending 'POST' to url: http://<IP-REMOVED>/classify
[2021-08-28 20:28:13,841] {http.py:154} ERROR - HTTP error: BAD REQUEST
[2021-08-28 20:28:13,842] {http.py:155} ERROR - <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>400 Bad Request</title>
<h1>Bad Request</h1>
<p>Failed to decode JSON object: Expecting value: line 1 column 1 (char 0)</p>
[2021-08-28 20:28:13,874] {taskinstance.py:1462} ERROR - Task failed with exception
Traceback (most recent call last):
File "/home/airflow/.local/lib/python3.8/site-packages/airflow/providers/http/hooks/http.py", line 152, in check_response
response.raise_for_status()
File "/home/airflow/.local/lib/python3.8/site-packages/requests/models.py", line 953, in raise_for_status
raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 400 Client Error: BAD REQUEST for url: http://<IP-REMOVED>/classify
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/airflow/.local/lib/python3.8/site-packages/airflow/models/taskinstance.py", line 1164, in _run_raw_task
self._prepare_and_execute_task_with_callbacks(context, task)
File "/home/airflow/.local/lib/python3.8/site-packages/airflow/models/taskinstance.py", line 1282, in _prepare_and_execute_task_with_callbacks
result = self._execute_task(context, task_copy)
File "/home/airflow/.local/lib/python3.8/site-packages/airflow/models/taskinstance.py", line 1312, in _execute_task
result = task_copy.execute(context=context)
File "/home/airflow/.local/lib/python3.8/site-packages/airflow/providers/http/operators/http.py", line 113, in execute
response = http.run(self.endpoint, self.data, self.headers, self.extra_options)
File "/home/airflow/.local/lib/python3.8/site-packages/airflow/providers/http/hooks/http.py", line 141, in run
return self.run_and_check(session, prepped_request, extra_options)
File "/home/airflow/.local/lib/python3.8/site-packages/airflow/providers/http/hooks/http.py", line 198, in run_and_check
self.check_response(response)
File "/home/airflow/.local/lib/python3.8/site-packages/airflow/providers/http/hooks/http.py", line 156, in check_response
raise AirflowException(str(response.status_code) + ":" + response.reason)
airflow.exceptions.AirflowException: 400:BAD REQUEST
[2021-08-28 20:28:13,882] {taskinstance.py:1505} INFO - Marking task as FAILED. dag_id=http_operator, task_id=clf_api, execution_date=20210828T202810, start_date=20210828T202812, end_date=20210828T202813
[2021-08-28 20:28:13,969] {local_task_job.py:151} INFO - Task exited with return code 1
[2021-08-28 20:28:14,043] {local_task_job.py:261} INFO - 0 downstream tasks scheduled from follow-on schedule check
As suggested by @Josh Fell in the comments, I had two mistakes in my DAG.
Wrap the data in json.dumps(data) before returning it from Get_payload.
Remove multiple_outputs=True from the task decorator of Get_payload.
Final code:
import json

from airflow.decorators import dag, task
from airflow.providers.http.operators.http import SimpleHttpOperator
from datetime import datetime

default_args = {
    "owner": "airflow",
    "start_date": datetime(2021, 1, 1),
}


@dag(default_args=default_args, schedule_interval=None, tags=["Http Operators"])
def http_operator():
    @task()
    def Get_payload(**kwargs):
        # STEP 1: Get data from database.
        # STEP 2: Manipulate data.
        # STEP 3: Return payload.
        data = {
            "key_1": "Value 1",
            "key_2": "Value 2",
            "key_3": "Value 3",
            "key_4": "Value 4",
        }
        return json.dumps(data)

    get_data = Get_payload()

    ml_api = SimpleHttpOperator(
        task_id="some_api",
        http_conn_id="http_conn_id",
        method="POST",
        endpoint="/some-path",
        data=get_data,
        headers={"Content-Type": "application/json"},
    )

    get_data >> ml_api


http_operator_dag = http_operator()
