DataflowTemplatedJobStartOperator in Apache Airflow not setting Job Region as expected - airflow

I'm running Apache Airflow 2.2.5 on Google Cloud Composer 2.0.31. The region for my Composer instance is europe-west1. I'm trying to use Composer to trigger a Dataflow job in a different project, using the DAG below with DataflowTemplatedJobStartOperator. The issue I'm running into is that when the DAG executes, the job region is us-central1 while the worker location is europe-west1. I've tried lots of different combinations of parameters and can't seem to get the job region to be europe-west1 as well. Any ideas on what I might be doing wrong?
import datetime

from airflow import models
from airflow.providers.google.cloud.operators.dataflow import (
    DataflowTemplatedJobStartOperator,
)
from airflow.utils.dates import days_ago

default_args = {
    "start_date": days_ago(1),
    "retries": 0,
    "dataflow_default_options": {
        "project": "my-project",
        "location": "europe-west1",
        "zone": "europe-west1-b",
        "stagingLocation": "gs://my-bucket-temp/temp/",
        "tempLocation": "gs://my-bucket-temp/temp/",
        "workerMachineType": "n1-standard-1",
    },
}

with models.DAG(
    "dataflow-batch-redis-revision-1",
    default_args=default_args,
    schedule_interval=datetime.timedelta(days=1),
) as dag:
    start_template_job = DataflowTemplatedJobStartOperator(
        task_id="dataflow_operator_batch_bq_to_redis",
        template="gs://my-bucket-temp/template/BatchRedisUpdatePipelineTemplate",
        parameters={
            "inputTopic": "inputtopic",
            "bigQueryInputProject": "inputproject",
            "bigQueryInputDataset": "dataset",
            "bigQueryInputTable": "table",
            "bigQueryInputSQLBranchMetadata": "DUMMY",
            "bigQueryInputSQLBranchSkuWeek": "DUMMY",
            "redisHost": "host",
            "redisPort": "6379",
        },
    )

For this operator you have to pass location explicitly, because its default value is us-central1:
DataflowTemplatedJobStartOperator(
    task_id="dataflow_operator_batch_bq_to_redis",
    template="gs://my-bucket-temp/template/BatchRedisUpdatePipelineTemplate",
    location="europe-west1",
    parameters={
        "inputTopic": "inputtopic",
        "bigQueryInputProject": "inputproject",
        "bigQueryInputDataset": "dataset",
        "bigQueryInputTable": "table",
        "bigQueryInputSQLBranchMetadata": "DUMMY",
        "bigQueryInputSQLBranchSkuWeek": "DUMMY",
        "redisHost": "host",
        "redisPort": "6379",
    }
)
In the constructor of this operator, we can see that the location field defaults to us-central1:
DEFAULT_DATAFLOW_LOCATION = "us-central1"

def __init__(
    self,
    *,
    template: str,
    job_name: str = "{{task.task_id}}",
    options: Optional[Dict[str, Any]] = None,
    dataflow_default_options: Optional[Dict[str, Any]] = None,
    parameters: Optional[Dict[str, str]] = None,
    project_id: Optional[str] = None,
    location: str = DEFAULT_DATAFLOW_LOCATION,
    gcp_conn_id: str = "google_cloud_default",
    delegate_to: Optional[str] = None,
    poll_sleep: int = 10,
    impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
    environment: Optional[Dict] = None,
    cancel_timeout: Optional[int] = 10 * 60,
    wait_until_finished: Optional[bool] = None,
    **kwargs,
)
If you pass "region": "europe-west1" in the Dataflow default options, this Airflow operator overwrites that value with its location argument.
That's why you have to pass the location field on the operator itself.
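To make it concrete, here is a minimal sketch (values taken from your DAG; it belongs inside your with models.DAG(...) block) that passes both project_id and location directly on the operator, so the job is created in europe-west1 of the target project:

start_template_job = DataflowTemplatedJobStartOperator(
    task_id="dataflow_operator_batch_bq_to_redis",
    template="gs://my-bucket-temp/template/BatchRedisUpdatePipelineTemplate",
    project_id="my-project",   # project that should own the Dataflow job
    location="europe-west1",   # job region; overrides the us-central1 default
    dataflow_default_options={
        "zone": "europe-west1-b",
        "tempLocation": "gs://my-bucket-temp/temp/",
    },
    parameters={
        "redisHost": "host",
        "redisPort": "6379",
    },
)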

Related

How to dynamically generate tasks via an XCom dictionary for a non-TaskInstance operator?

I have a dag that outputs a dictionary (map), passed through an XCom.
I want to generate a number of tasks according to the keys of said XCom dictionary.
This is how the dictionary looks in the XCom:
{
    "F1": {
        "source": {
            "project": "legacy_project",
            "bucket": "legacy_bucket",
            "prefix": "prefix/{{ds_nodash}}/F1",
            "files": [
                "file_1.csv"
            ]
        },
        "destination": {
            "project_id": "new_project_1",
            "bucket": "new_bucket_1",
            "prefix": "DTM/F1/{{ds_nodash}}"
        }
    },
    "F2": {
        "source": {
            "project": "legacy_project",
            "bucket": "legacy_bucket",
            "prefix": "prefix/{{ds_nodash}}/F2",
            "files": [
                "file_1.csv"
            ]
        },
        "destination": {
            "project_id": "new_project_2",
            "bucket": "new_bucket_2",
            "prefix": "DTM/F2/{{ds_nodash}}"
        }
    }
    // ...
}
Notice that I used {{ds_nodash}} in the generated XCom so that it gets replaced by the execution date of the DAG.
What I want is to create a task for each of F1, F2, ..., Fn and feed F1["source"]["bucket"] and F1["source"]["prefix"] into a GCSObjectsWithPrefixExistenceSensor, with a custom task_id for each task.
Something like this:
for f_key in <xcom_output_dict>:
    GCSObjectsWithPrefixExistenceSensor(
        task_id=f"{f_key}_sensor",
        bucket=f_key["source"]["bucket"],
        prefix=f_key["source"]["prefix"],
    )
I tried reading up on expand and partial, but it's really unclear how one can pass the dictionary contents such as bucket and prefix to the task operator...
EDIT:
One of my attempts:
from airflow import DAG, XComArg
from airflow.contrib.sensors.gcs_sensor import (
    GCSObjectsWithPrefixExistenceSensor,
)

...

generate_prefix_existance_sensor_kwargs = PythonOperator(
    task_id="generate_prefix_existance_sensor_kwargs",
    python_callable=gen_prefix_existance_sensor_kwargs,
    provide_context=True,
    op_kwargs={"effective_migration_map": "effective_migration_map"},
)

sensor_files = GCSObjectsWithPrefixExistenceSensor.expand_kwargs(
    input=XComArg(generate_prefix_existance_sensor_kwargs),
)

(
    ...  # Some other tasks
    >> generate_prefix_existance_sensor_kwargs
    >> sensor_files
)
Here is the code of the kwargs generation function:
def gen_prefix_existance_sensor_kwargs(ti, **kwargs) -> List[Dict[str, str]]:
    effective_migration_map = load_xcom_via_kwarg(
        ti=ti,
        key="effective_migration_map",
        default_value="effective_migration_map",
        kwargs=kwargs,
    )
    args_dicts = []
    for f_key in effective_migration_map:
        args_dicts.append(
            {
                "task_id": f_key,
                "bucket": effective_migration_map[f_key]["source"]["bucket"],
                "prefix": effective_migration_map[f_key]["source"]["prefix"],
            }
        )
    return args_dicts
But I get:
AttributeError: type object 'GCSObjectsWithPrefixExistenceSensor' has no attribute 'expand_kwargs'
Which is odd, because I saw it used in an Astronomer.io video. I guess it isn't implemented yet in Airflow 2.3.4?
EDIT 1:
I created a wrapper for the operator:
class GCSObjectsWithPrefixExistenceSensorWrapper(
    GCSObjectsWithPrefixExistenceSensor
):
    """This class is a temporary work around to using expand_kwargs(),
    as expand() can only take one argument,
    we must create a wrapper around every operator we use
    """

    def __init__(
        self,
        src_or_dest: str,
        inp_parameters: Tuple[str, Dict[str, Dict[str, str]]],
        **kwargs
    ):
        if src_or_dest not in ["source", "destination"]:
            raise TypeError(
                "Bad argument for src_or_dest, must be either 'source' or 'destination'"
            )
        else:
            bucket = inp_parameters[1][src_or_dest]["bucket"]
            prefix = inp_parameters[1][src_or_dest]["prefix"]
            super().__init__(
                bucket=bucket,
                prefix=prefix,
                **kwargs,
            )
I add it to my DAG:
updated_map_2 = match_data_with_migration_map(
    src_mig_map=updated_map_1,
    files_and_prefixes="{{ti.xcom_pull('list_files_and_prefixes')}}",
)

GCSObjectsWithPrefixExistenceSensorWrapper.partial(
    task_id="sensor_files",
    src_or_dest="source",
    impersonation_chain=IMPERSONATED_SERVICE_ACCOUNT,
).expand(inp_parameters=updated_map_2.output)
I get this error:
AttributeError: 'dict' object has no attribute 'output'
EDIT 2:
The previous task to this operator was not decorated correctly, which is what caused the problems in EDIT 1.
I've tried to solve your issue with dynamic task mapping. However, this approach has the downside that if you pass more than one parameter into expand(), Airflow will build a cross-product of them.
So my solution is to create a custom class that inherits from GCSObjectsWithPrefixExistenceSensor, whose __init__ assigns the correct values:
class CustomExistenceSensor(GCSObjectsWithPrefixExistenceSensor):
    def __init__(self, inp_params, **kwargs):
        bucket, prefix = inp_params
        super().__init__(bucket=bucket, prefix=prefix, **kwargs)
By using this class we can now expand the sensor and pass an iterable of parameters (bucket, prefix):
from airflow.decorators import dag, task
from datetime import datetime
from airflow.providers.google.cloud.sensors.gcs import GCSObjectsWithPrefixExistenceSensor


class CustomExistenceSensor(GCSObjectsWithPrefixExistenceSensor):
    def __init__(self, inp_params, **kwargs):
        bucket, prefix = inp_params
        super().__init__(bucket=bucket, prefix=prefix, **kwargs)


@dag(
    schedule=None,
    start_date=datetime(2022, 10, 21, hour=8),
    catchup=False,
    tags=['demo'],
)
def template_dag():
    """
    ### Template dag"""

    @task()
    def example_func():
        pass

    t1 = example_func()

    # Below just to test that the sensor works
    # t3 = GCSObjectsWithPrefixExistenceSensor(
    #     task_id="test_check",
    #     bucket="text_stack_bucket",
    #     prefix="test_prefix",
    #     timeout=3)

    # t4 works, but the issue here is that we can only expand one parameter;
    # if we pass more, Airflow will make a cross-product.
    # t4 = GCSObjectsWithPrefixExistenceSensor.partial(
    #     task_id="test_check_dynamic", bucket="text_stack_bucket", timeout=3
    # ).expand(prefix=["test_prefix", "prefix"])

    t5 = CustomExistenceSensor.partial(
        task_id="test_custom_class", timeout=2
    ).expand(inp_params=[("text_stack_bucket", "test_prefix"), ("next_existing", "prefix")])

    t1 >> t5


dag = template_dag()
So if you amend your code a bit and make it return a list in XCom, you can invoke it like so:
task = CustomExistenceSensor.partial(task_id='dynamic_tasks').expand(inp_params=previous_task.output)
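To make that concrete, here is a hedged sketch (the helper task name is illustrative, and updated_map_2 stands for the decorated upstream task from your EDIT 1) of a TaskFlow task that flattens the migration map into (bucket, prefix) pairs for expand():

from airflow.decorators import task


@task
def gen_sensor_params(effective_migration_map: dict) -> list:
    # one (bucket, prefix) pair per F1, F2, ... entry in the map
    return [
        (entry["source"]["bucket"], entry["source"]["prefix"])
        for entry in effective_migration_map.values()
    ]


sensor_files = CustomExistenceSensor.partial(
    task_id="sensor_files", timeout=60
).expand(inp_params=gen_sensor_params(updated_map_2))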

Airflow Xcom not getting resolved return task_instance string

I am facing an odd issue with xcom_pull where it always returns the literal xcom_pull string back:
"{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}"
My requirement is simple: I push an XCom using a PythonOperator and, with xcom_pull, I try to retrieve the value and pass it as the http_conn_id for SimpleHttpOperator, but the variable comes through as the raw string instead of the resolved XCom value.
The PythonOperator is able to push the XCom successfully.
Code:
from datetime import datetime

import simplejson as json
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.providers.http.operators.http import SimpleHttpOperator
from google.auth.transport.requests import Request

default_airflow_args = {
    "owner": "divyaansh",
    "depends_on_past": False,
    "start_date": datetime(2022, 5, 18),
    "retries": 0,
    "schedule_interval": "@hourly",
}

project_configs = {
    "project_id": "test",
    "conn_id": "google_cloud_storage_default",
    "bucket_name": "test-transfer",
    "folder_name": "processed-test-rdf",
}


def get_config_vals(**kwargs) -> dict:
    """
    Get config vals from an Airflow Variable and store them as XComs.
    """
    task_instance = kwargs["task_instance"]
    task_instance.xcom_push(key="http_con_id", value="gcp_cloud_function")


def generate_api_token(cf_name: str):
    """
    Generate a token for the API request.
    """
    import google.oauth2.id_token

    request = Request()
    target_audience = f"https://us-central1-test-a2h.cloudfunctions.net/{cf_name}"
    return google.oauth2.id_token.fetch_id_token(
        request=request, audience=target_audience
    )


with DAG(
    dag_id="cf_test",
    default_args=default_airflow_args,
    catchup=False,
    render_template_as_native_obj=True,
) as dag:
    start = DummyOperator(task_id="start")

    config_vals = PythonOperator(
        task_id="get_config_val", python_callable=get_config_vals, provide_context=True
    )

    ip_data = json.dumps(
        {
            "bucket_name": project_configs["bucket_name"],
            "file_name": "dummy",
            "target_location": "/valid",
        }
    )

    conn_id = "{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}"

    api_token = generate_api_token("new-cp")

    cf_task = SimpleHttpOperator(
        task_id="file_decrypt_and_validate_cf",
        http_conn_id=conn_id,
        method="POST",
        endpoint="new-cp",
        data=json.dumps(
            json.dumps(
                {
                    "bucket_name": "test-transfer",
                    "file_name": [
                        "processed-test-rdf/dummy_20220501.txt",
                        "processed-test-rdf/dummy_20220502.txt",
                    ],
                    "target_location": "/valid",
                }
            )
        ),
        headers={
            "Authorization": f"bearer {api_token}",
            "Content-Type": "application/json",
        },
        do_xcom_push=True,
        log_response=True,
    )

    print("task new-cp", cf_task)

    check_flow = DummyOperator(task_id="check_flow")
    end = DummyOperator(task_id="end")

    start >> config_vals >> cf_task >> check_flow >> end
Error Message:
raise AirflowNotFoundException(f"The conn_id `{conn_id}` isn't defined") airflow.exceptions.AirflowNotFoundException: The conn_id `"{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}"` isn't defined
I have tried several different ways but nothing seems to work.
Can someone point me in the right direction here?
Airflow-version : 2.2.3
Composer-version : 2.0.11
In SimpleHttpOperator the http_conn_id parameter is not a templated field, so you cannot use the Jinja engine with it; the parameter is never rendered. When you pass "{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}" to the operator, you expect it to be replaced at runtime with the value stored in XCom by the previous task, but Airflow treats it as a plain string. That is exactly what the exception tells you: Airflow looks for a connection whose id is that very long string, can't find one, and reports that the connection is not defined.
To solve it you can create a custom operator:
class MySimpleHttpOperator(SimpleHttpOperator):
    template_fields = SimpleHttpOperator.template_fields + ("http_conn_id",)
Then you should replace SimpleHttpOperator with MySimpleHttpOperator in your DAG.
This change makes the string you set in http_conn_id go through the Jinja engine, so in your case it will be replaced with the XCom value as you expect.
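For example, the task from the question would then look roughly like this (a sketch; only the operator class and the rendered connection id matter here, the remaining arguments are unchanged or abbreviated):

cf_task = MySimpleHttpOperator(
    task_id="file_decrypt_and_validate_cf",
    # rendered at runtime because http_conn_id is now a templated field
    http_conn_id="{{ task_instance.xcom_pull(dag_id='cf_test', task_ids='get_config_val', key='http_con_id') }}",
    method="POST",
    endpoint="new-cp",
    data=ip_data,
    headers={"Content-Type": "application/json"},
    log_response=True,
)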

MWAA ECSOperator "No task found" but succeeds

I have an ECSOperator task in MWAA.
When I trigger the task, it succeeds immediately. However, the task should take time to complete, so I do not believe it is actually starting.
When I go to inspect the task run, I get the error "No tasks found".
The task definition looks like this:
from datetime import datetime

from airflow import DAG
from airflow.providers.amazon.aws.operators.ecs import ECSOperator

dag = DAG(
    "my_dag",
    description="",
    start_date=datetime.fromisoformat("2022-03-28"),
    catchup=False,
)

my_task = ECSOperator(
    task_id="my_task",
    cluster="my-cluster",
    task_definition="my-task",
    launch_type="FARGATE",
    aws_conn_id="aws_ecs",
    overrides={},
    network_configuration={
        "awsvpcConfiguration": {
            "securityGroups": ["sg-aaaa"],
            "subnets": ["subnet-bbbb"],
        },
    },
    awslogs_group="/ecs/my-task",
)

my_task
What am I missing here?
If the task had actually executed, it would have a log.
I think your issue is that the task you defined is not assigned to any DAG object, which is why you see the "No tasks found" error (the DAG is empty).
You should add dag=dag:
my_task = ECSOperator(
    task_id="my_task",
    ...,
    dag=dag,
)
or use a context manager to avoid such issues:
with DAG(
    dag_id="my_dag",
    ...
) as dag:
    my_task = ECSOperator(
        task_id="my_task",
        ...,
    )
If you are using Airflow 2 you can also use the @dag decorator.
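For instance, a minimal sketch with the @dag decorator (the ECSOperator arguments are abbreviated here and would mirror the ones in your question; operators created inside the decorated function are registered with the DAG automatically):

from datetime import datetime

from airflow.decorators import dag
from airflow.providers.amazon.aws.operators.ecs import ECSOperator


@dag(start_date=datetime(2022, 3, 28), schedule_interval=None, catchup=False)
def my_dag():
    # no dag=dag argument needed; the decorated function provides the DAG context
    ECSOperator(
        task_id="my_task",
        cluster="my-cluster",
        task_definition="my-task",
        launch_type="FARGATE",
        aws_conn_id="aws_ecs",
        overrides={},
    )


dag = my_dag()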

How to pass previous task state as parameter to another task within the Airflow Taskflow API?

I want to get the status of a SparkSubmitOperator, transform it into some value that my API understands, and pass it in the payload of a SimpleHttpOperator so that I can update the job status in my DB. I want to do this using the TaskFlow API.
I wrote the code below but I get this error when I try to load it:
Broken DAG: [/opt/airflow/dags/export/inapp_clicks/export.py] Traceback (most recent call last):
  File "/home/airflow/.local/lib/python3.6/site-packages/airflow/models/baseoperator.py", line 1378, in set_downstream
    self._set_relatives(task_or_task_list, upstream=False, edge_modifier=edge_modifier)
  File "/home/airflow/.local/lib/python3.6/site-packages/airflow/models/baseoperator.py", line 1316, in _set_relatives
    task_object.update_relative(self, not upstream)
AttributeError: 'function' object has no attribute 'update_relative'
Code:
from datetime import datetime

from airflow.decorators import dag, task
from airflow.models import Variable
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator


@dag(schedule_interval=None, start_date=datetime.now(), tags=["export", "inapp"])
def export_inapp_clicks():
    DEFAULT_NUM_EXECUTORS = 2
    DEFAULT_EXECUTOR_CORES = 3
    DEFAULT_EXECUTOR_MEMORY = "2g"
    DEFAULT_DRIVER_MEMORY = "1g"

    @task()
    def update_job_status(dag, ti, execution_date):
        jst = dag.get_task("export_inapp_clicks_job_submission")
        jsti = TaskInstance(jst, execution_date)
        xcom_value = ti.xcom_pull(task_ids="export_inapp_clicks_job_submission")
        print("Task:", jst)
        print("Task Instance:", jsti)
        print("Task State:", jsti.current_state())
        print("XCOM Value:", xcom_value)
        # TODO: call API via SimpleHttpOperator

    job_submission = SparkSubmitOperator(
        task_id="export_inapp_clicks_job_submission",
        conn_id="yarn",
        name="{{ dag_run.conf['name'] }}",
        conf=Variable.get("export_inapp_clicks_conf", deserialize_json=True),
        jars=Variable.get("export_inapp_clicks_jars"),
        application=Variable.get("pyspark_executor_path"),
        application_args=[
            "--module",
            "export_inapp_clicks.export",
            "--org-id",
            "{{ dag_run.conf['orgId'] }}",
            "--app-id",
            "{{ dag_run.conf['appId'] }}",
            "--inapp-id",
            "{{ dag_run.conf['inappId'] }}",
            "--start-date",
            "{{ dag_run.conf['startDate'] }}",
            "--end-date",
            "{{ dag_run.conf['endDate'] }}",
            "--data-path",
            Variable.get("event_data_path"),
            "--es-nodes",
            Variable.get("es_nodes"),
            "--destination",
            Variable.get("export_inapp_clicks_output"),
            "--explain",
            "--debug",
            "--encode-columns",
            "--log-level",
            "WARN"
        ],
        py_files=Variable.get("export_inapp_clicks_py_files"),
        num_executors=Variable.get("export_inapp_clicks_num_executors", DEFAULT_NUM_EXECUTORS),
        executor_cores=Variable.get("export_inapp_clicks_executor_cores", DEFAULT_EXECUTOR_CORES),
        executor_memory=Variable.get("export_inapp_clicks_executor_memory", DEFAULT_EXECUTOR_MEMORY),
        driver_memory=Variable.get("export_inapp_clicks_driver_memory", DEFAULT_DRIVER_MEMORY),
        status_poll_interval=10
    )

    job_submission >> update_job_status

export_dag = export_inapp_clicks()
Consider the following example; the first task corresponds to your SparkSubmitOperator task.
_get_upstream_task takes care of getting the state of the first task from the second one by querying the metadata database.
DAG definition, first two tasks:
import json

from airflow.decorators import dag, task
from airflow.utils.dates import days_ago
from airflow.utils.db import provide_session
from airflow.models.taskinstance import TaskInstance
from airflow.providers.http.operators.http import SimpleHttpOperator


@dag(
    default_args={"owner": "airflow"},
    schedule_interval=None,
    start_date=days_ago(0),
    catchup=False,
    tags=["custom_example", "TaskFlow"],
)
def taskflow_previous_task():

    @provide_session
    def _get_upstream_task(upstream_task_id, dag, execution_date, session=None, **_):
        upstream_ti = (
            session.query(TaskInstance)
            .filter(
                TaskInstance.dag_id == dag.dag_id,
                TaskInstance.execution_date == execution_date,
                TaskInstance.task_id == upstream_task_id,
            )
            .first()
        )
        return upstream_ti

    @task
    def job_submission_task(**context):
        print(f"Task Id: {context['ti'].task_id}")
        return {"job_data": "something"}

    @task(trigger_rule='all_done')
    def update_job_status(job_data, **context):
        print(f"Data from previous Task: {job_data['job_data']}")
        upstream_ti = _get_upstream_task("job_submission_task", **context)
        print(f"Upstream_ti state: {upstream_ti.state}")
        return upstream_ti.state

    job_results = job_submission_task()
    job_status = update_job_status(job_results)
job_submission_task returns a dict that is passed to update_job_status via XComs using XComArg, which is a main feature of the TaskFlow API. By doing so you avoid explicitly performing xcom_pull() and xcom_push() operations.
Once you get the TaskInstance object from the _get_upstream_task method, you can return its state and retrieve it again from the last task, which performs the HTTP request:
Final task, end of DAG definition:
    task_post_op = SimpleHttpOperator(
        task_id="post_op",
        endpoint="post",
        data=json.dumps({"job_status": f"{job_status}"}),
        headers={"Content-Type": "application/json"},
        log_response=True,
    )

    job_status >> task_post_op


example_dag = taskflow_previous_task()
Since the data param of SimpleHttpOperator is templated, you can retrieve the XCom value from the second task using Jinja:
data=json.dumps({"job_status": f"{job_status}"}),
Execution logs:
Task_1:
AIRFLOW_CTX_DAG_RUN_ID=manual__2021-08-20T23:15:15.226853+00:00
[2021-08-20 23:15:17,148] {logging_mixin.py:104} INFO - Task Id: job_submission_task
[2021-08-20 23:15:17,148] {python.py:151} INFO - Done. Returned value was: {'job_data': 'something'}
[2021-08-20 23:15:17,202] {taskinstance.py:1211} INFO - Marking task as SUCCESS.
Task_2:
AIRFLOW_CTX_DAG_ID=taskflow_previous_task
AIRFLOW_CTX_TASK_ID=update_job_status
AIRFLOW_CTX_EXECUTION_DATE=2021-08-20T23:15:15.226853+00:00
AIRFLOW_CTX_DAG_RUN_ID=manual__2021-08-20T23:15:15.226853+00:00
[2021-08-20 23:15:18,768] {logging_mixin.py:104} INFO - Data from previous Task: something
[2021-08-20 23:15:18,792] {logging_mixin.py:104} INFO - Upstream_ti state: success
[2021-08-20 23:15:18,793] {python.py:151} INFO - Done. Returned value was: success
[2021-08-20 23:15:18,874] {taskinstance.py:1211} INFO - Marking task as SUCCESS.
Task_3:
AIRFLOW_CTX_DAG_ID=taskflow_previous_task
AIRFLOW_CTX_TASK_ID=post_op
AIRFLOW_CTX_EXECUTION_DATE=2021-08-20T23:15:15.226853+00:00
AIRFLOW_CTX_DAG_RUN_ID=manual__2021-08-20T23:15:15.226853+00:00
[2021-08-20 23:15:21,201] {http.py:111} INFO - Calling HTTP method
[2021-08-20 23:15:21,228] {base.py:78} INFO - Using connection to: id: http_default. Host: https://www.httpbin.org, Port: None, Schema: , Login: , Password: None, extra: {}
[2021-08-20 23:15:21,245] {http.py:140} INFO - Sending 'POST' to url: https://www.httpbin.org/post
[2021-08-20 23:15:21,973] {http.py:115} INFO - {
  "args": {},
  "data": "{\"job_status\": \"success\"}",
  "files": {},
  "form": {},
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Content-Length": "25",
    "Content-Type": "application/json",
    "Host": "www.httpbin.org",
    "User-Agent": "python-requests/2.25.1",
    "X-Amzn-Trace-Id": "Root=1-61203789-0136b7557ba4e0116bb5e16d"
  },
  "json": {
    "job_status": "success"
  },
  "origin": "200.73.153.254",
  "url": "https://www.httpbin.org/post"
}
[2021-08-20 23:15:22,027] {taskinstance.py:1211} INFO - Marking task as SUCCESS.
Let me know if that works for you; I tried to use as many TaskFlow features as I could.
Source: Docs1 Docs2
Edit:
Added trigger_rule='all_done' to the update_job_status task.
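For reference, the same setting can also be written with the TriggerRule enum (a small sketch; behavior is identical to the string form used above), so update_job_status still runs and reports the state even when the submission task fails:

from airflow.decorators import task
from airflow.utils.trigger_rule import TriggerRule


@task(trigger_rule=TriggerRule.ALL_DONE)
def update_job_status(job_data, **context):
    # runs regardless of the upstream task's final state
    ...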

Airflow Debugging: How to skip backfill job execution when running DAG in vscode

I have set up Airflow and am running a DAG using the following VSCode debug configuration:
{
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: Current File",
            "type": "python",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": false,
            "env": {
                "AIRFLOW__CORE__EXECUTOR": "DebugExecutor",
                "AIRFLOW__DEBUG__FAIL_FAST": "True",
                "LC_ALL": "en_US.UTF-8",
                "LANG": "en_US.UTF-8"
            }
        }
    ]
}
It runs the file, my breakpoints in the DAG definition break as expected, and then at the end of the file it executes dag.run(). After that I wait forever for the DAG to backfill, and my breakpoints inside the python_callable functions of tasks never break.
What Airflow secret am I not seeing?
Here is my DAG:
# scheduled to run every minute, poke for a new file every ten seconds
dag = DAG(
    dag_id='download-from-s3',
    start_date=days_ago(2),
    catchup=False,
    schedule_interval='*/1 * * * *',
    is_paused_upon_creation=False
)


def new_file_detection(**context):
    print("File found...")  # a breakpoint here never lands
    pprint(context)


init = BashOperator(
    task_id='init',
    bash_command='echo "My DAG initiated at $(date)"',
    dag=dag,
)

file_sensor = S3KeySensor(
    task_id='file_sensor',
    poke_interval=10,  # every 10 seconds
    timeout=60,
    bucket_key="s3://inbox/new/*",
    bucket_name=None,
    wildcard_match=True,
    soft_fail=True,
    dag=dag
)

file_found_message = PythonOperator(
    task_id='file_found_message',
    provide_context=True,
    python_callable=new_file_detection,
    dag=dag
)

init >> file_sensor >> file_found_message

if __name__ == '__main__':
    dag.clear(reset_dag_runs=True)
    dag.run()  # this triggers a backfill job
This works for me as expected. I can set breakpoints at the DAG level or inside the Python callables and step through them using the VSCode debugger.
I'm using the same debug settings you provided, but I changed the parameter reset_dag_runs=True to dag_run_state=State.NONE in the dag.clear() call, as specified on the DebugExecutor docs page. I believe this changed in one of the latest releases.
Regarding backfills, I'm setting catchup=False in the DAG arguments (it works both ways). Important note: I'm running Airflow 2.0.0.
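In other words, the relevant change is only at the bottom of the DAG file (shown here in isolation; the full example follows below):

if __name__ == '__main__':
    from airflow.utils.state import State

    dag.clear(dag_run_state=State.NONE)  # instead of reset_dag_runs=True
    dag.run()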
Here is an example using the same code from example_xcomp.py that comes with the default installation:
Debug settings:
{
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: Current File",
            "type": "python",
            "request": "launch",
            "program": "${file}",
            "console": "internalConsole",
            "justMyCode": false,
            "env": {
                "AIRFLOW__CORE__EXECUTOR": "DebugExecutor",
                "AIRFLOW__DEBUG__FAIL_FAST": "True",
            }
        }
    ]
}
Example DAG:
import logging

from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago

dag = DAG(
    'excom_xample',
    schedule_interval="@once",
    start_date=days_ago(2),
    default_args={'owner': 'airflow'},
    tags=['example'],
    catchup=False
)

value_1 = [1, 2, 3]
value_2 = {'a': 'b'}


def push(**kwargs):
    """Pushes an XCom without a specific target"""
    logging.info("log before PUSH")  # <<<<<<<<<<< Before landing on breakpoint
    kwargs['ti'].xcom_push(key='value from pusher 1', value=value_1)


def push_by_returning(**kwargs):
    """Pushes an XCom without a specific target, just by returning it"""
    return value_2


def puller(**kwargs):
    """Pull all previously pushed XComs and
    check if the pushed values match the pulled values."""
    ti = kwargs['ti']

    # get value_1
    pulled_value_1 = ti.xcom_pull(key=None, task_ids='push')
    print("PRINT Line after breakpoint ")  # <<<< After landing on breakpoint
    if pulled_value_1 != value_1:
        raise ValueError("The two values differ"
                         f"{pulled_value_1} and {value_1}")

    # get value_2
    pulled_value_2 = ti.xcom_pull(task_ids='push_by_returning')
    if pulled_value_2 != value_2:
        raise ValueError(
            f'The two values differ {pulled_value_2} and {value_2}')

    # get both value_1 and value_2
    pulled_value_1, pulled_value_2 = ti.xcom_pull(
        key=None, task_ids=['push', 'push_by_returning'])
    if pulled_value_1 != value_1:
        raise ValueError(
            f'The two values differ {pulled_value_1} and {value_1}')
    if pulled_value_2 != value_2:
        raise ValueError(
            f'The two values differ {pulled_value_2} and {value_2}')


push1 = PythonOperator(
    task_id='push',
    dag=dag,
    python_callable=push,
)

push2 = PythonOperator(
    task_id='push_by_returning',
    dag=dag,
    python_callable=push_by_returning,
)

pull = PythonOperator(
    task_id='puller',
    dag=dag,
    python_callable=puller,
)

pull << [push1, push2]

if __name__ == '__main__':
    from airflow.utils.state import State

    dag.clear(dag_run_state=State.NONE)
    dag.run()
