Using Airflow dag_run.conf inside a custom operator

We created a custom Airflow operator based on EMRContainerOperator, and we need to make a decision based on a config passed through the Airflow UI.
My custom operator:
from airflow.providers.amazon.aws.operators.emr_containers import EMRContainerOperator
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence
from uuid import uuid4
from airflow.utils.decorators import apply_defaults


class EmrBatchProcessorOperator(EMRContainerOperator):
    template_fields: Sequence[str] = (
        "name",
        "virtual_cluster_id",
        "execution_role_arn",
        "release_label",
        "job_driver",
        "operation_type",
    )

    @apply_defaults
    def __init__(
            self,
            operation_type,
            *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.operation_type = operation_type

        if self.operation_type == 'full':
            number_of_pods = 10
        else:
            number_of_pods = 5

        BASE_CONSUMER_DRIVER_ARG = {
            "sparkSubmitJobDriver": {
                "entryPoint": "s3://bucket/batch_processor_engine/batch-processor-engine_2.12-3.0.1_0.28.jar",
                "entryPointArguments": ["group_name=courier_api_group01"],
                "sparkSubmitParameters": (
                    f"--conf spark.executor.instances={number_of_pods} "
                    "--conf spark.executor.memory=32G --conf spark.executor.cores=5 "
                    "--conf spark.driver.cores=1 --conf spark.driver.memory=12G "
                    "--conf spark.sql.broadcastTimeout=2000 --class TableProcessorWrapper"
                ),
            }
        }
        self.job_driver = BASE_CONSUMER_DRIVER_ARG
This is how I call my operator:
with DAG(
    dag_id="batch_processor_model_dag",
    schedule_interval="@daily",
    default_args=default_args,
    catchup=False
) as dag:
    start = DummyOperator(task_id='start', dag=dag)
    end = DummyOperator(task_id='end', dag=dag, trigger_rule='none_failed')

    base_consumer = EmrBatchProcessorOperator(
        task_id="base_consumer",
        virtual_cluster_id=VIRTUAL_CLUSTER_ID,
        execution_role_arn=JOB_ROLE_ARN,
        configuration_overrides=CONFIGURATION_OVERRIDES_ARG,
        release_label="emr-6.5.0-latest",
        job_driver={},
        name="pi.py",
        operation_type='{{ dag_run.conf["operation_type"] }}'
    )

    start >> base_consumer >> end
But this code doesn't work; I can't use the dag_run.conf value.
Could you help me?
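For reference, a likely cause: templated fields are rendered only right before execute() runs, so inside __init__ the value of self.operation_type is still the literal Jinja string, never 'full'. Below is a minimal sketch (not a confirmed fix, reusing the question's own field names) that moves the decision into execute(), where the rendered value is available:

class EmrBatchProcessorOperator(EMRContainerOperator):
    template_fields: Sequence[str] = (
        "name",
        "virtual_cluster_id",
        "execution_role_arn",
        "release_label",
        "job_driver",
        "operation_type",
    )

    def __init__(self, operation_type, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Still the raw '{{ dag_run.conf["operation_type"] }}' string at this point.
        self.operation_type = operation_type

    def execute(self, context):
        # Templates have been rendered by now, so the comparison sees the real value.
        number_of_pods = 10 if self.operation_type == "full" else 5
        self.job_driver = {
            "sparkSubmitJobDriver": {
                "entryPoint": "s3://bucket/batch_processor_engine/batch-processor-engine_2.12-3.0.1_0.28.jar",
                "entryPointArguments": ["group_name=courier_api_group01"],
                "sparkSubmitParameters": (
                    f"--conf spark.executor.instances={number_of_pods} "
                    "--conf spark.executor.memory=32G --conf spark.executor.cores=5 "
                    "--conf spark.driver.cores=1 --conf spark.driver.memory=12G "
                    "--conf spark.sql.broadcastTimeout=2000 --class TableProcessorWrapper"
                ),
            }
        }
        return super().execute(context)

With this layout the operator still declares operation_type as a template field, so the value passed from the DAG is rendered before execute() is called.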

Related

Airflow custom operator variables

I need to pass Airflow connection settings (AWS, Postgres) to a Docker container as environment variables.
I'm trying to do this using a custom operator and BaseHook.
class S3ToPostgresDockerOperator(DockerOperator):
    @apply_defaults
    def __init__(self, aws_conn_id='aws_default', postgres_conn_id='postgres_default', **kwargs):
        super(S3ToPostgresDockerOperator, self).__init__(**kwargs)
        self.aws_conn = BaseHook.get_connection(aws_conn_id)
        self.pg_conn = BaseHook.get_connection(postgres_conn_id)
Is it possible to do something like that, or if not how should I do it?
java_unpack_csv = S3ToPostgresDockerOperator(
    ...
    environment={
        'AWS_ACCESS_KEY': '{{ ??? }}',
        'AWS_SECRET_KEY': '{{ ??? }}'
    }
)
You can build up the environment kwarg passed in the DockerOperator constructor.
For example,
class S3ToPostgresDockerOperator(DockerOperator):
    @apply_defaults
    def __init__(self, aws_conn_id='aws_default', postgres_conn_id='postgres_default', **kwargs):
        self.aws_conn = BaseHook.get_connection(aws_conn_id)
        self.pg_conn = BaseHook.get_connection(postgres_conn_id)
        credentials = self.aws_conn.get_credentials()
        kwargs['environment'] = dict(
            kwargs.pop('environment', {}),
            AWS_ACCESS_KEY=credentials.access_key,
            AWS_SECRET_KEY=credentials.secret_key,
            PG_DATABASE_URI=self.pg_conn.get_uri()
        )
        super(S3ToPostgresDockerOperator, self).__init__(**kwargs)
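With this approach there is nothing left to template at the call site; the operator injects the variables itself. A hedged usage sketch (the image name and task_id below are illustrative, not from the question):

java_unpack_csv = S3ToPostgresDockerOperator(
    task_id='java_unpack_csv',        # illustrative task id
    image='my-etl-image:latest',      # illustrative image name
    aws_conn_id='aws_default',
    postgres_conn_id='postgres_default',
    # Anything passed here is merged with the injected AWS_* / PG_DATABASE_URI vars.
    environment={'LOG_LEVEL': 'INFO'},
)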

Airflow - getting the execution_date in task when calling an Operator

I have this operator; it's pretty much the same as S3CopyObjectOperator, except it looks for all objects in a folder and copies them to a destination folder.
import os
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.decorators import apply_defaults
from common.s3.partition import Partition, PartitionType
from airflow.models import BaseOperator
import logging


class S3CopyObjectsOperator(BaseOperator):
    @apply_defaults
    def __init__(self,
                 aws_conn_id: str,
                 partition: Partition,
                 s3_bucket: str,
                 dest_prefix: str,
                 *args,
                 **kwargs):
        super(S3CopyObjectsOperator, self).__init__(*args, **kwargs)
        self.aws_conn_id = aws_conn_id
        self.partition = partition
        self.s3_bucket = s3_bucket
        self.dest_prefix = dest_prefix

    def execute(self, context):
        self.partition.partition_value = context.get("execution_date")
        logging.info(f'self.dest_prefix: {self.dest_prefix}')
        exec_date = context.get("execution_date")
        logging.info(f'self.partition.partition_value: {self.partition.partition_value}')
        s3 = S3Hook(self.aws_conn_id)
        s3_conn = s3.get_conn()
        logging.info(f'source bucket -- self.partition.bucket: {self.partition.bucket}')
        logging.info(f'source key -- self.partition.key_prefix: {self.partition.key_prefix}')
        source_keys = s3.list_keys(bucket_name=self.partition.bucket,
                                   prefix=self.partition.key_prefix,
                                   delimiter="/")
        logging.info(f'keys: {source_keys}')
        for file in source_keys:
            prefix, filename = os.path.split(file)
            dest_key = f'{self.dest_prefix}/{filename}'
            logging.info(f'Copying file {filename} to {self.dest_prefix}')
            key = self.partition.key_prefix + filename
            logging.info(f'key: {key}')
            s3_conn.copy_object(Bucket=self.s3_bucket,
                                Key=f'{dest_key}',
                                CopySource={
                                    'Bucket': self.partition.bucket,
                                    'Key': key
                                }, ContentEncoding='csv')
However, when I use this operator in my task, I need my dest_prefix to include the execution date.
Things I've tried:
I've tried adding ds = '{{ ds_nodash }}' in the DAG file, but when I print self.dest_prefix in the operator, the value it returns is the string value and not the execution date.
I've also tried creating a function, but when I print self.dest_prefix in the operator, the value it returns is: self.dest_prefix: <function exec_value at 0x7fd008fcb940>. See below for my task.
The execution date should come after snapshot_date=.
for data_group in data_group_names:
    copy_felix_to_s3 = S3CopyObjectsOperator(
        task_id=f'copy_felix_{data_group}_data_to_s3',
        aws_conn_id='aws_default',
        s3_bucket='bucket_name',
        partition=felixS3Partition(
            bucket='source_bucket',
            location_base=f'our_bucket/{data_group}',
            partition_type=None
        ),
        dest_prefix=f"felix/{data_group}/snapshot_date= ds",
        dag=dag
    )
    copy_felix_to_s3
You are missing the declaration of the parameter as a templated field.
class S3CopyObjectsOperator(BaseOperator):
    ...
    template_fields = ("dest_prefix",)
    ...
Macros (such as ds_nodash) are available only in templated fields, so if you don't list the parameter in template_fields, Airflow treats the value you pass as a plain string and never renders it.
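Put together, the task from the question could then pass the macro directly in dest_prefix; a sketch reusing the question's own values, assuming template_fields = ("dest_prefix",) is declared on the operator:

copy_felix_to_s3 = S3CopyObjectsOperator(
    task_id=f'copy_felix_{data_group}_data_to_s3',
    aws_conn_id='aws_default',
    s3_bucket='bucket_name',
    partition=felixS3Partition(
        bucket='source_bucket',
        location_base=f'our_bucket/{data_group}',
        partition_type=None
    ),
    # The Jinja macro is left for Airflow to render at run time; the doubled
    # braces are only there to escape the f-string.
    dest_prefix=f"felix/{data_group}/snapshot_date={{{{ ds_nodash }}}}",
    dag=dag
)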

Using dag_run variables in airflow Dag

I am trying to use Airflow variables to determine whether to execute a task or not. I have tried this, and it's not working:
if '{{ params.year }}' == '{{ params.message }}':
    run_this = DummyOperator(
        task_id='dummy_dag'
    )
I was hoping to get some help making it work. Also, is there a better way of doing something like this in Airflow?
I think a good way to solve this is with BranchPythonOperator, branching dynamically based on the provided DAG parameters. Consider this example:
Use params to provide the parameters to the DAG (this could also be done from the UI), in this example: {"enabled": True}
from airflow.decorators import dag, task
from airflow.utils.dates import days_ago
from airflow.operators.python import get_current_context, BranchPythonOperator


@dag(
    default_args=default_args,
    schedule_interval=None,
    start_date=days_ago(1),
    catchup=False,
    tags=["example"],
    params={"enabled": True},
)
def branch_from_dag_params():
    def _print_enabled():
        context = get_current_context()
        enabled = context["params"].get("enabled", False)
        print(f"Task id: {context['ti'].task_id}")
        print(f"Enabled is: {enabled}")

    @task
    def task_a():
        _print_enabled()

    @task
    def task_b():
        _print_enabled()
Define a callable for the BranchPythonOperator in which you perform your conditionals and return the next task to be executed. You can access the execution context variables from **kwargs. Also keep in mind that this operator should return a single task_id or a list of task_ids to follow downstream; those resulting tasks should always be directly downstream from it.
    def _get_task_run(ti, **kwargs):
        custom_param = kwargs["params"].get("enabled", False)
        if custom_param:
            return "task_a"
        else:
            return "task_b"

    branch_task = BranchPythonOperator(
        task_id="branch_task",
        python_callable=_get_task_run,
    )

    task_a_exec = task_a()
    task_b_exec = task_b()

    branch_task >> [task_a_exec, task_b_exec]


dag = branch_from_dag_params()
The result is that task_a gets executed and task_b is skipped:
AIRFLOW_CTX_DAG_OWNER=airflow
AIRFLOW_CTX_DAG_ID=branch_from_dag_params
AIRFLOW_CTX_TASK_ID=task_a
Task id: task_a
Enabled is: True
Let me know if that worked for you.
Docs
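A small variant, in case the flag is supplied as run configuration (Trigger DAG w/ config) rather than as a DAG-level param: the branch callable can read it from dag_run.conf. A minimal sketch, assuming the rest of the DAG stays the same:

def _get_task_run(**kwargs):
    # dag_run.conf holds the JSON passed when the run was triggered.
    conf = kwargs["dag_run"].conf or {}
    return "task_a" if conf.get("enabled", False) else "task_b"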

Unable to pass xcom in Custom Operators in Airflow

I have a simple, linear DAG (created using Airflow 2.0) with two tasks. I have a custom operator for each task, both extending BaseOperator. Following is the code for the DAG and operators:
class Operator1(BaseOperator):
    @apply_defaults
    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)

    def execute(self, context):
        ...
        logging.info('First task')
        context['task_instance'].xcom_push(key="payload", value=data)
        return data


class Operator2(BaseOperator):
    @apply_defaults
    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)

    def execute(self, context):
        ...
        logging.info("context is ", context)
        parameters = context['task_instance'].xcom_pull(key="payload", value=data)


with DAG('dag_1', default_args=DEFAULT_ARGS, schedule_interval=None) as dag:
    TASK_1 = Operator1(
        task_id='task_1',
        do_xcom_push=True)
    TASK_2 = Operator2(
        task_id='task_2',
        do_xcom_push=True)

    TASK_1 >> TASK_2
When I run the DAG, I find that the context used for getting the XCom values is empty. I have searched a lot of answers on Stack Overflow and tried the approaches mentioned in them, but they didn't work.
Would really appreciate some hint on the issue: how do I push and pull XCom values in custom operators?
I took your code and ran it; the first problem was that start_date wasn't defined, so it ended up in an exception:
Exception has occurred: AirflowException (note: full exception trace is shown but execution is paused at: _run_module_as_main)
Task is missing the start_date parameter
Also, in the Operator1 class, the data variable is not defined. I guess you missed these when you made the code example.
Other than that the code worked, but I think you should consider defining the task_ids parameter when doing the xcom_pull operation.
From TaskInstance xcom_pull method description:
:param task_ids: Only XComs from tasks with matching ids will be
pulled. Can pass None to remove the filter.
Here is the code of a working example, note that I use two equivalent methods to perform the XComs operations:
from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.utils.decorators import apply_defaults
from airflow.models import BaseOperator


class Operator1(BaseOperator):
    @apply_defaults
    def __init__(self, *args, **kwargs) -> None:
        super(Operator1, self).__init__(*args, **kwargs)

    def execute(self, context):
        print('First task')
        data = "valuable_data"
        more_data = "more_valueable_data"
        context['task_instance'].xcom_push(key="payload", value=data)
        self.xcom_push(context, "more_data", more_data)
        return data


class Operator2(BaseOperator):
    @apply_defaults
    def __init__(self, *args, **kwargs) -> None:
        super(Operator2, self).__init__(*args, **kwargs)

    def execute(self, context):
        # print(f"context is {context}")
        data = context['task_instance'].xcom_pull(
            "task_1",
            key="payload")
        more_data = self.xcom_pull(context, "task_1", key="more_data")
        print(f"Obtained data: {data}")
        print(f"Obtained more_data: {more_data}")


with DAG('dag_1',
         default_args={'owner': 'airflow'},
         start_date=days_ago(1),
         catchup=False,
         schedule_interval=None) as dag:
    TASK_1 = Operator1(
        task_id='task_1'
    )
    TASK_2 = Operator2(
        task_id='task_2'
    )
    TASK_1 >> TASK_2
Log from Task_2:
[2021-06-15 12:55:01,206] {taskinstance.py:1255} INFO - Exporting the following env vars:
AIRFLOW_CTX_DAG_OWNER=airflow
AIRFLOW_CTX_DAG_ID=dag_1
AIRFLOW_CTX_TASK_ID=task_2
AIRFLOW_CTX_EXECUTION_DATE=2021-06-14T00:00:00+00:00
AIRFLOW_CTX_DAG_RUN_ID=backfill__2021-06-14T00:00:00+00:00
Obtained data: valuable_data
Obtained more_data: more_valueable_data
[2021-06-15 12:55:01,227] {taskinstance.py:1159} INFO - Marking task as SUCCESS. dag_id=dag_1, task_id=task_2, execution_date=20210614T000000, start_date=20210615T120402, end_date=20210615T125501
Side notes: I changed the __init__ method in order to accept *args as well. I'm using print, but it could be done using the Airflow logger, as in self.log.info('msg').
Let me know if that worked for you!
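One more detail worth noting (my addition, not part of the answer above): because execute() in Operator1 returns data, Airflow also stores that value under the default return_value XCom key, so it can be pulled without any explicit xcom_push. For example, inside Operator2.execute():

# Pull whatever task_1's execute() returned; the key defaults to "return_value".
returned = context['task_instance'].xcom_pull(task_ids="task_1")
print(f"Returned value: {returned}")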

Airflow File Sensor for sensing files on my local drive

Does anybody have any idea about FileSensor? I came across it while I was researching how to sense files in my local directory. The code is as follows:
task = FileSensor(
    task_id="senseFile",
    filepath="etc/hosts",
    fs_conn_id='fs_local',
    _hook=self.hook,
    dag=self.dag,
)
I have also set my conn_id and conn type as File (path) and gave the extra {'path': 'mypath'}, but even though I set a non-existing path, or the file isn't there in the specified path, the task completes and the DAG is successful. The FileSensor doesn't seem to sense files at all.
I found the community-contributed FileSensor a little bit underwhelming, so I wrote my own.
I got it working for files local to where the server/scheduler was running, but ran into problems when using network paths.
The trick for network paths, I found, was to mount the network drive to my Linux box.
This is my DAG, used to run sensor_task >> proccess_task >> archive_task >> trigger_rerun.
Note: we use variables (sourcePath, filePattern and archivePath) entered via the web UI.
from airflow import DAG
from airflow.operators import PythonOperator, OmegaFileSensor, ArchiveFileOperator, TriggerDagRunOperator
from datetime import datetime, timedelta
from airflow.models import Variable

default_args = {
    'owner': 'glsam',
    'depends_on_past': False,
    'start_date': datetime(2017, 6, 26),
    'provide_context': True,
    'retries': 100,
    'retry_delay': timedelta(seconds=30)
}

task_name = 'my_first_file_sensor_task'
filepath = Variable.get("sourcePath")
filepattern = Variable.get("filePattern")
archivepath = Variable.get("archivePath")

# Use task_name as the DAG id so the TriggerDagRunOperator below can re-trigger it.
dag = DAG(
    task_name,
    default_args=default_args,
    schedule_interval=None,
    catchup=False,
    max_active_runs=1,
    concurrency=1)

sensor_task = OmegaFileSensor(
    task_id=task_name,
    filepath=filepath,
    filepattern=filepattern,
    poke_interval=3,
    dag=dag)


def process_file(**context):
    file_to_process = context['task_instance'].xcom_pull(
        key='file_name', task_ids=task_name)
    file = open(filepath + file_to_process, 'w')
    file.write('This is a test\n')
    file.write('of processing the file')
    file.close()


proccess_task = PythonOperator(
    task_id='process_the_file', python_callable=process_file, dag=dag)

archive_task = ArchiveFileOperator(
    task_id='archive_file',
    filepath=filepath,
    task_name=task_name,
    archivepath=archivepath,
    dag=dag)

trigger = TriggerDagRunOperator(
    task_id='trigger_dag_rerun', trigger_dag_id=task_name, dag=dag)

sensor_task >> proccess_task >> archive_task >> trigger
And then this is my FileSensor:
import os
import re

from datetime import datetime
from airflow.models import BaseOperator
from airflow.plugins_manager import AirflowPlugin
from airflow.utils.decorators import apply_defaults
from airflow.operators.sensors import BaseSensorOperator


class ArchiveFileOperator(BaseOperator):
    @apply_defaults
    def __init__(self, filepath, archivepath, task_name, *args, **kwargs):
        super(ArchiveFileOperator, self).__init__(*args, **kwargs)
        self.filepath = filepath
        self.archivepath = archivepath
        self.task_name = task_name

    def execute(self, context):
        file_name = context['task_instance'].xcom_pull(self.task_name, key='file_name')
        os.rename(self.filepath + file_name, self.archivepath + file_name)


class OmegaFileSensor(BaseSensorOperator):
    @apply_defaults
    def __init__(self, filepath, filepattern, *args, **kwargs):
        super(OmegaFileSensor, self).__init__(*args, **kwargs)
        self.filepath = filepath
        self.filepattern = filepattern

    def poke(self, context):
        full_path = self.filepath
        file_pattern = re.compile(self.filepattern)
        directory = os.listdir(full_path)
        for files in directory:
            if re.match(file_pattern, files):
                # Push the matched file name so downstream tasks can pick it up.
                context['task_instance'].xcom_push('file_name', files)
                return True
        return False


class OmegaPlugin(AirflowPlugin):
    name = "omega_plugin"
    operators = [OmegaFileSensor, ArchiveFileOperator]
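For completeness on the original question: with the built-in FileSensor, the connection's extra supplies the base path, the sensor's filepath is joined onto it, and the task only succeeds once a matching file exists. A minimal sketch, assuming an Airflow 2.x install and a connection named fs_local of type File (path) with extra {"path": "/"}:

from airflow.sensors.filesystem import FileSensor

wait_for_file = FileSensor(
    task_id="senseFile",
    fs_conn_id="fs_local",
    filepath="etc/hosts",   # resolved against the connection's base path -> /etc/hosts
    poke_interval=30,       # seconds between checks
    timeout=600,            # fail the task if nothing shows up within 10 minutes
    dag=dag,
)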

Resources