Airflow custom operator variables

I need to pass Airflow connection settings (AWS, Postgres) to a Docker container as environment variables.
I'm trying to do this using a custom operator and BaseHook.
class S3ToPostgresDockerOperator(DockerOperator):
    @apply_defaults
    def __init__(self, aws_conn_id='aws_default', postgres_conn_id='postgres_default', **kwargs):
        super(S3ToPostgresDockerOperator, self).__init__(**kwargs)
        self.aws_conn = BaseHook.get_connection(aws_conn_id)
        self.pg_conn = BaseHook.get_connection(postgres_conn_id)
Is it possible to do something like that, or if not, how should I do it?
java_unpack_csv = S3ToPostgresDockerOperator(
    ...
    environment={
        'AWS_ACCESS_KEY': '{{ ??? }}',
        'AWS_SECRET_KEY': '{{ ??? }}'
    }
)

You can build up the environment kwarg passed to the DockerOperator constructor.
For example:
class S3ToPostgresDockerOperator(DockerOperator):
    @apply_defaults
    def __init__(self, aws_conn_id='aws_default', postgres_conn_id='postgres_default', **kwargs):
        self.aws_conn = BaseHook.get_connection(aws_conn_id)
        self.pg_conn = BaseHook.get_connection(postgres_conn_id)
        credentials = self.aws_conn.get_credentials()
        kwargs['environment'] = dict(
            kwargs.pop('environment', {}),
            AWS_ACCESS_KEY=credentials.access_key,
            AWS_SECRET_KEY=credentials.secret_key,
            PG_DATABASE_URI=self.pg_conn.get_uri()
        )
        super(S3ToPostgresDockerOperator, self).__init__(**kwargs)
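With this approach the credentials never need to be templated in the DAG file, since the constructor injects them. A minimal usage sketch, assuming a hypothetical task_id and Docker image name and the default connection IDs:

java_unpack_csv = S3ToPostgresDockerOperator(
    task_id='java_unpack_csv',               # hypothetical task id
    image='my-etl-image:latest',             # hypothetical Docker image
    aws_conn_id='aws_default',
    postgres_conn_id='postgres_default',
    # anything passed here is merged with the injected AWS/Postgres variables
    environment={'SOME_OTHER_VAR': 'value'},
    dag=dag
)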

Related

Using airflow dag_run.conf inside custom operator

We created a custom Airflow operator based on EMRContainerOperator, and we need to make a decision based on a config passed through the Airflow UI.
My custom operator:
from airflow.providers.amazon.aws.operators.emr_containers import EMRContainerOperator
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence
from uuid import uuid4
from airflow.utils.decorators import apply_defaults


class EmrBatchProcessorOperator(EMRContainerOperator):
    template_fields: Sequence[str] = (
        "name",
        "virtual_cluster_id",
        "execution_role_arn",
        "release_label",
        "job_driver",
        "operation_type",
    )

    @apply_defaults
    def __init__(self, operation_type, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.operation_type = operation_type

        if self.operation_type == 'full':
            number_of_pods = 10
        else:
            number_of_pods = 5

        BASE_CONSUMER_DRIVER_ARG = {
            "sparkSubmitJobDriver": {
                "entryPoint": "s3://bucket/batch_processor_engine/batch-processor-engine_2.12-3.0.1_0.28.jar",
                "entryPointArguments": ["group_name=courier_api_group01"],
                "sparkSubmitParameters": f"--conf spark.executor.instances={number_of_pods} "
                                         f"--conf spark.executor.memory=32G --conf spark.executor.cores=5 "
                                         f"--conf spark.driver.cores=1 --conf spark.driver.memory=12G "
                                         f"--conf spark.sql.broadcastTimeout=2000 --class TableProcessorWrapper"
            }
        }
        self.job_driver = BASE_CONSUMER_DRIVER_ARG
This is the way that I call my operator:
with DAG(
    dag_id="batch_processor_model_dag",
    schedule_interval="@daily",
    default_args=default_args,
    catchup=False
) as dag:
    start = DummyOperator(task_id='start', dag=dag)
    end = DummyOperator(task_id='end', dag=dag, trigger_rule='none_failed')

    base_consumer = EmrBatchProcessorOperator(
        task_id="base_consumer",
        virtual_cluster_id=VIRTUAL_CLUSTER_ID,
        execution_role_arn=JOB_ROLE_ARN,
        configuration_overrides=CONFIGURATION_OVERRIDES_ARG,
        release_label="emr-6.5.0-latest",
        job_driver={},
        name="pi.py",
        operation_type='{{ dag_run.conf["operation_type"] }}'
    )

    start >> base_consumer >> end
But this code doesn't work; I can't use the dag_run.conf value.
Could you help me?
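A general note that may help (a sketch, not a confirmed fix for this exact setup): template fields such as operation_type are only rendered by Jinja when the task instance runs, so inside __init__ the attribute still holds the literal string '{{ dag_run.conf["operation_type"] }}' and the if/else never matches 'full'. One way around that is to move the decision into execute(), where the rendered value is available (same class and field names as above; other --conf flags omitted for brevity):

class EmrBatchProcessorOperator(EMRContainerOperator):
    template_fields: Sequence[str] = (*EMRContainerOperator.template_fields, "operation_type")

    def __init__(self, operation_type, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.operation_type = operation_type   # still the raw template string at this point

    def execute(self, context):
        # By the time execute() runs, self.operation_type has been rendered from dag_run.conf.
        number_of_pods = 10 if self.operation_type == 'full' else 5
        self.job_driver = {
            "sparkSubmitJobDriver": {
                "entryPoint": "s3://bucket/batch_processor_engine/batch-processor-engine_2.12-3.0.1_0.28.jar",
                "entryPointArguments": ["group_name=courier_api_group01"],
                "sparkSubmitParameters": f"--conf spark.executor.instances={number_of_pods} "
                                         f"--class TableProcessorWrapper",
            }
        }
        return super().execute(context)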

Airflow - getting the execution_date in task when calling an Operator

I have this operator; it's pretty much the same as S3CopyObjectOperator, except it looks for all objects in a folder and copies them to a destination folder.
import os
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.decorators import apply_defaults
from common.s3.partition import Partition, PartitionType
from airflow.models import BaseOperator
import logging


class S3CopyObjectsOperator(BaseOperator):
    @apply_defaults
    def __init__(self,
                 aws_conn_id: str,
                 partition: Partition,
                 s3_bucket: str,
                 dest_prefix: str,
                 *args,
                 **kwargs):
        super(S3CopyObjectsOperator, self).__init__(*args, **kwargs)
        self.aws_conn_id = aws_conn_id
        self.partition = partition
        self.s3_bucket = s3_bucket
        self.dest_prefix = dest_prefix

    def execute(self, context):
        self.partition.partition_value = context.get("execution_date")
        logging.info(f'self.dest_prefix: {self.dest_prefix}')
        exec_date = context.get("execution_date")
        logging.info(f'self.partition.partition_value: {self.partition.partition_value}')
        s3 = S3Hook(self.aws_conn_id)
        s3_conn = s3.get_conn()
        logging.info(f'source bucket -- self.partition.bucket: {self.partition.bucket}')
        logging.info(f'source key -- self.partition.key_prefix: {self.partition.key_prefix}')
        source_keys = s3.list_keys(bucket_name=self.partition.bucket, prefix=self.partition.key_prefix, delimiter="/")
        logging.info(f'keys: {source_keys}')

        for file in source_keys:
            prefix, filename = os.path.split(file)
            dest_key = f'{self.dest_prefix}/{filename}'
            logging.info(f'Copying file {filename} to {self.dest_prefix}')
            key = self.partition.key_prefix + filename
            logging.info(f'key: {key}')
            s3_conn.copy_object(Bucket=self.s3_bucket,
                                Key=f'{dest_key}',
                                CopySource={
                                    'Bucket': self.partition.bucket,
                                    'Key': key
                                }, ContentEncoding='csv')
However, when I use this operator in my task, I need my dest_prefix to include the execution date.
Things I've tried:
I've tried adding ds = '{{ ds_nodash }}' in the DAG file, but when I print self.dest_prefix in the operator it returns the literal string value and not the execution date.
I've also tried creating a function, but then printing self.dest_prefix in the operator returns: self.dest_prefix: <function exec_value at 0x7fd008fcb940>
See below for my task; the execution date should go after snapshot_date=.
for data_group in data_group_names:
    copy_felix_to_s3 = S3CopyObjectsOperator(
        task_id=f'copy_felix_{data_group}_data_to_s3',
        aws_conn_id='aws_default',
        s3_bucket='bucket_name',
        partition=felixS3Partition(
            bucket='source_bucket',
            location_base=f'our_bucket/{data_group}',
            partition_type=None
        ),
        dest_prefix=f"felix/{data_group}/snapshot_date= ds",
        dag=dag
    )
    copy_felix_to_s3
You are missing the declaration of the parameter as a templated field.
class S3CopyObjectsOperator(BaseOperator):
    ...
    template_fields = ("dest_prefix",)
    ...
Macros (such as ds_nodash) are available only for templated fields; if you don't list the parameter in template_fields, the value you pass is treated as a plain string and is not rendered.
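With dest_prefix templated, the macro can then be embedded directly in the string passed from the DAG file. A sketch based on the task from the question (only dest_prefix changes):

copy_felix_to_s3 = S3CopyObjectsOperator(
    task_id=f'copy_felix_{data_group}_data_to_s3',
    aws_conn_id='aws_default',
    s3_bucket='bucket_name',
    partition=felixS3Partition(
        bucket='source_bucket',
        location_base=f'our_bucket/{data_group}',
        partition_type=None
    ),
    # rendered at run time because dest_prefix is now a template field;
    # the doubled braces in the f-string produce {{ ds_nodash }} in the final string
    dest_prefix=f"felix/{data_group}/snapshot_date={{{{ ds_nodash }}}}",
    dag=dag
)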

Airflow: Zoom in Sub Dag button in customized SubDagOperator

I've designed a customized SubDagOperator. Everything works fine except that the "Zoom in Sub Dag" button doesn't appear. It seems that if the Airflow UI doesn't recognise the task as a SubDagOperator itself, the button is not shown. I've tried to override the task_type property, as was mentioned in an old issue, but it doesn't work for me. Do you know if it's possible to see the button with customized SubDagOperators?
Airflow version: 1.10.12
Here is my try:
class EmrSubdagOperator(SubDagOperator):
    template_fields = ()
    template_ext = ()

    @apply_defaults
    def __init__(self, *args, **kwargs):
        dag = kwargs.get('dag')
        task_id = kwargs.get('task_id')
        spark_steps = kwargs.get('spark_steps')
        job_flow_overrides = kwargs.get('job_flow_overrides')

        subdag = DAG(
            '{}.{}'.format(dag.dag_id, task_id),
            schedule_interval=dag.schedule_interval,
            start_date=dag.start_date
        )
        cluster_creator = EmrCreateJobFlowOperator(
            dag=subdag,
            task_id='create_job_flow',
            job_flow_overrides=job_flow_overrides
        )
        step_adder = EmrAddStepsOperator(
            dag=subdag,
            task_id='add_steps',
            job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
            aws_conn_id='aws_default',
            steps=spark_steps,
        )
        # Check if the last step is completed, skipped or terminated
        last_step = len(spark_steps) - 1
        step_checker = EmrStepSensor(
            dag=subdag,
            task_id='watch_step',
            job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
            step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[" + str(last_step) + "]}}",
            aws_conn_id='aws_default',
        )
        cluster_checker = EmrJobFlowSensor(
            dag=subdag,
            task_id='watch_cluster',
            job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
            aws_conn_id='aws_default',
        )
        super(EmrSubdagOperator, self).__init__(subdag=subdag, *args, **kwargs)
        self.spark_steps = spark_steps
        self.job_flow_overrides = job_flow_overrides

    @property
    def task_type(self):
        return 'SubDagOperator'
I had the same problem.
I resolved it by doing:
class DynamicTasksOperator(SubDagOperator):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._task_type = 'SubDagOperator'

    @property
    def task_type(self):
        return self._task_type
I made the following changes in airflow.cfg and it's working for me now:
store_serialized_dags = False
store_dag_code = False

Airflow Template_fields added but variable like {{ ds }} is not working

I want to pass Airflow variables to a SQL query template file like this (in the sql/test.sql file):
select 'test', '{{ params.test_ds }}', '{{ test_dt }}' from test_table;
I created an operator inherited from PostgresOperator:
class EtlOperator(PostgresOperator):
    template_fields = ('sql', 'test_dt', 'params')
    template_ext = PostgresOperator.template_ext

    @apply_defaults
    def __init__(self, test_dt, params, *args, **kwargs):
        super(EtlOperator, self).__init__(*args, **kwargs)
        self.test_dt = test_dt
        self.params = params

    def execute(self, context):
        super(EtlOperator, self).execute(context)
I created this task:
test_task00 = EtlOperator(
    task_id=f'test_task00',
    postgres_conn_id='redshift',
    sql='sql/test.sql',
    params={
        'test_ds': '{{ ds }}'
    },
    database='default',
    test_dt='{{ execution_date }}',
    provide_context=True,  # tried without it too
    dag=dag
)
However, even though params and test_dt are part of template_fields, the SQL is still not rendering the variables. The result looks like this:
INFO - Executing: select 'test', '{{ ds }}', '' from test_table;
Is there anything wrong in my configuration?

Airflow: Date not rendering in SQL file

Issue: the date is not rendering in the SQL file.
I am not able to get the yesterday_ds date rendered in the SQL file.
In the bi_utils/airflow.py module I have defined YESTERDAY_DS = '{{yesterday_ds}}'
In the DAG:
from bi_utils.airflow import YESTERDAY_DS

snflk_to_s3 = SnowflakeMultiSqlStatmentOperator(
    task_id='snflk_to_s3',
    snowflake_conn_id=SNOWFLAKE_CONN_ID,
    sql=load_sql,
    params={
        'proc_run_task_id': [proc_start.task_id],
        'yesterday_ds': YESTERDAY_DS,
    },
    autocommit=True,
)
In the SQL file:
COPY INTO @public.stage/path/{{params.yesterday_ds}}/
It looks like you are using a custom operator. In that case, you will have to add the argument to template_fields so the Jinja template is resolved.
Inside your custom operator it would look like the code below, which is just to illustrate the structure.
class SnowflakeMultiSqlStatmentOperator(BaseOperator):
    """
    Executes sql code in a Snowflake database
    """
    template_fields = ('sql', 'params')
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self, sql, snowflake_conn_id='snowflake_default', parameters=None,
            autocommit=True, warehouse=None, database=None, role=None,
            schema=None, params=None, *args, **kwargs):
        super(SnowflakeMultiSqlStatmentOperator, self).__init__(*args, **kwargs)
        self.snowflake_conn_id = snowflake_conn_id
        self.sql = sql
        self.autocommit = autocommit
        self.parameters = parameters
        self.warehouse = warehouse
        self.database = database
        self.role = role
        self.schema = schema
        self.params = params

    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        hook = self.get_hook()
        hook.run(
            self.sql,
            autocommit=self.autocommit,
            parameters=self.parameters)
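As a usage note: because template_ext includes '.sql', passing a path ending in .sql makes the operator read that file and render it with Jinja before execute() runs. A sketch of the task from the question under that assumption (the .sql path is hypothetical; other arguments unchanged):

snflk_to_s3 = SnowflakeMultiSqlStatmentOperator(
    task_id='snflk_to_s3',
    snowflake_conn_id=SNOWFLAKE_CONN_ID,
    sql='sql/unload_to_s3.sql',   # hypothetical path, rendered because of template_ext
    params={
        'proc_run_task_id': [proc_start.task_id],
        'yesterday_ds': YESTERDAY_DS,
    },
    autocommit=True,
)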
