Airflow:Date not rendering in SQL file - airflow

Issue: Date not rendering in SQL file
I am not able to get the date yesterday_ds rendered in the SQL file
In bi_utils/airflow.py module I have given as YESTERDAY_DS = '{{yesterday_ds}}'
In the Dag
from bi_utils.airflow import YESTERDAY_DS
snflk_to_s3 = SnowflakeMultiSqlStatmentOperator(
task_id='snflk_to_s3',
snowflake_conn_id=SNOWFLAKE_CONN_ID,
sql=load_sql,
params={
'proc_run_task_id': [proc_start.task_id],
'yesterday_ds': YESTERDAY_DS,
},
autocommit=True,
)
In the SQL file
COPY INTO #public.stage/path/{{params.yesterday_ds}}/

Looks like you are using a custom operator. In that case, you will have to add the argument as a template_fields to resolve the jinja template.
In this case, inside your custom operator, the Code below is just to understand the structure.
class SnowflakeMultiSqlStatmentOperator(BaseOperator):
"""
Executes sql code in a Snowflake database
"""
template_fields = ('sql','params')
template_ext = ('.sql',)
ui_color = '#ededed'
#apply_defaults
def __init__(
self, sql, snowflake_conn_id='snowflake_default', parameters=None,
autocommit=True, warehouse=None, database=None, role=None,
schema=None,params=None, *args, **kwargs):
super(SnowflakeOperator, self).__init__(*args, **kwargs)
self.snowflake_conn_id = snowflake_conn_id
self.sql = sql
self.autocommit = autocommit
self.parameters = parameters
self.warehouse = warehouse
self.database = database
self.role = role
self.schema = schema
self.params=params
def execute(self, context):
self.log.info('Executing: %s', self.sql)
hook = self.get_hook()
hook.run(
self.sql,
autocommit=self.autocommit,
parameters=self.parameters)

Related

Airflow custom operator variables

I need to pass Airflow connection settings(AWS, Postgres) to docker container environment variables
I'm trying to do this using custom Operator and BaseHook. \
class S3ToPostgresDockerOperator(DockerOperator):
#apply_defaults
def __init__(self, aws_conn_id='aws_default', postgres_conn_id='postgres_default', **kwargs):
super(S3ToPostgresDockerOperator, self).__init__(**kwargs)
self.aws_conn = BaseHook.get_connection(aws_conn_id)
self.pg_conn = BaseHook.get_connection(postgres_conn_id)
Is it possible to do something like that, or if not how should I do it?
java_unpack_csv = S3ToPostgresDockerOperator(
...
environment={
'AWS_ACCESS_KEY': '{{ ??? }}',
'AWS_SECRET_KEY': '{{ ??? }}'
}
)
You can build up the environment kwarg passed in the DockerOperator constructor.
For example,
class S3ToPostgresDockerOperator(DockerOperator):
#apply_defaults
def __init__(self, aws_conn_id='aws_default', postgres_conn_id='postgres_default', **kwargs):
self.aws_conn = BaseHook.get_connection(aws_conn_id)
self.pg_conn = BaseHook.get_connection(postgres_conn_id)
credentials = self.aws_conn.get_credentials()
kwargs['environment'] = dict(
kwargs.pop('environment', {}),
AWS_ACCESS_KEY=credentials.access_key,
AWS_SECRET_KEY=credentials.secret_key,
PG_DATABASE_URI=self.pg_conn.get_uri()
)
super(S3ToPostgresDockerOperator, self).__init__(**kwargs)

Using airflow dag_run.conf inside custom operator

We created a custom airflow based on EMRContainerOperator and we need to take a decision based on a config passed using the airflow UI.
My custom operator:
from airflow.providers.amazon.aws.operators.emr_containers import EMRContainerOperator
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence
from uuid import uuid4
from airflow.utils.decorators import apply_defaults
class EmrBatchProcessorOperator(EMRContainerOperator):
template_fields: Sequence[str] = (
"name",
"virtual_cluster_id",
"execution_role_arn",
"release_label",
"job_driver",
"operation_type"
)
#apply_defaults
def __init__(
self,
operation_type,
*args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.operation_type = operation_type
if self.operation_type == 'full':
number_of_pods=10
else:
number_of_pods=5
BASE_CONSUMER_DRIVER_ARG = {
"sparkSubmitJobDriver": {"entryPoint": "s3://bucket/batch_processor_engine/batch-processor-engine_2.12-3.0.1_0.28.jar","entryPointArguments": ["group_name=courier_api_group01"], "sparkSubmitParameters": f"--conf spark.executor.instances={ number_of_pods } --conf spark.executor.memory=32G --conf spark.executor.cores=5 --conf spark.driver.cores=1 --conf spark.driver.memory=12G --conf spark.sql.broadcastTimeout=2000 --class TableProcessorWrapper"}
}
self.job_driver = BASE_CONSUMER_DRIVER_ARG
This is the way that I call my operator:
with DAG(
dag_id="batch_processor_model_dag",
schedule_interval="#daily",
default_args=default_args,
catchup=False
) as dag:
start = DummyOperator(task_id='start', dag=dag)
end = DummyOperator(task_id='end', dag=dag, trigger_rule='none_failed')
base_consumer = EmrBatchProcessorOperator(
task_id="base_consumer",
virtual_cluster_id=VIRTUAL_CLUSTER_ID,
execution_role_arn=JOB_ROLE_ARN,
configuration_overrides=CONFIGURATION_OVERRIDES_ARG,
release_label="emr-6.5.0-latest",
job_driver={},
name="pi.py",
operation_type= '{{dag_run.conf["operation_type"]}}'
)
start >> base_consumer >> end
But this code didn't work, I can't use the dag_run.conf value.
could you help me?

Access dag_run.conf in a custom pythonoperator on Airflow

I extended the existing PythonOperator on Airflow as follow:
class myPythonOperator(PythonOperator):
def __init__(self,**kwargs) -> None:
self.name = kwargs.get("name", "name is not provided")
def execute(self, context,**kwargs):
print(self.name)
super(myPythonOperator, self).execute(context)
And my task was defined as:
def task1(**kwargs):
name = kwargs.get("name", "name is not provided")
print(name)
And with the following DAG:
myTask = myPythonOperator(
task_id='myTask',
python_callable = task1,
op_kwargs={"name": "{{ dag_run.conf['name'] }}"},
provide_context=True
)
When triggering the DAG, I provided a configuration JSON from Airflow web UI, which is {"name":"foo"}
But the problem is that the name specified in JSON can only be access from task1, in ececute() it will always print name is not provided
Does anyone know the trick to access this dag_run.conf from the __init__() function of the operator?
Any help will be appreciated. Thanks
The way to access dag.run_config from a inherited class is by using template_field in Airflow, which can be found here

Airflow - getting the execution_date in task when calling an Operator

I have this Operator, its pretty much the same as S3CopyObjectOperator except it looks for all objects in a folder and copies to a destination folder.
import os
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.decorators import apply_defaults
from common.s3.partition import Partition, PartitionType
from airflow.models import BaseOperator
import logging
class S3CopyObjectsOperator(BaseOperator):
#apply_defaults
def __init__(self,
aws_conn_id: str,
partition: Partition,
s3_bucket: str,
dest_prefix: str,
*args,
**kwargs):
super(S3CopyObjectsOperator, self).__init__(*args, **kwargs)
self.aws_conn_id = aws_conn_id
self.partition = partition
self.s3_bucket = s3_bucket
self.dest_prefix = dest_prefix
def execute(self, context):
self.partition.partition_value = context.get("execution_date")
logging.info(f'self.dest_prefix: {self.dest_prefix}')
exec_date = context.get("execution_date")
logging.info(f'self.partition.partition_value: {self.partition.partition_value}')
s3 = S3Hook(self.aws_conn_id)
s3_conn = s3.get_conn()
logging.info(f'source bucket -- self.partition.bucket: {self.partition.bucket}')
logging.info(f'source key -- self.partition.key_prefix: {self.partition.key_prefix}')
source_keys = s3.list_keys(bucket_name=self.partition.bucket, prefix=self.partition.key_prefix, delimiter="/")
logging.info(f'keys: {source_keys}')
for file in source_keys:
prefix, filename = os.path.split(file)
dest_key = f'{self.dest_prefix}/{filename}'
logging.info(f'Copying file {filename} to {self.dest_prefix}')
key = self.partition.key_prefix + filename
logging.info(f'key: {key}')
s3_conn.copy_object(Bucket=self.s3_bucket,
Key=f'{dest_key}',
CopySource={
'Bucket': self.partition.bucket,
'Key': key
}, ContentEncoding='csv')
However when I use this operator in my task I need my dest_prefix to include the execution date.
Things I've tried:
I've tried adding ds = '{{ ds_nodash }}' in the dag file but when I print self.dest_prefix in the Operator the value it returns he string value and not the execution date.
I've also tried creating a function but when I print self.dest_prefix in the Operator the value it returns is: self.dest_prefix: <function exec_value at 0x7fd008fcb940> See below for my task:
the execution date should be after snapshot_date=
for data_group in data_group_names:
copy_felix_to_s3 = S3CopyObjectsOperator(
task_id=f'copy_felix_{data_group}_data_to_s3',
aws_conn_id='aws_default',
s3_bucket='bucket_name',
partition=felixS3Partition(
bucket='source_bucket',
location_base=f'our_bucket/{data_group}',
partition_type=None
),
dest_prefix=f"felix/{data_group}/snapshot_date= ds",
dag=dag
)
copy_felix_to_s3
You are missing declaration of the parameter as templated field.
class S3CopyObjectsOperator(BaseOperator):
...
template_fields = ("dest_prefix",)
...
Macros (such as ds_nodash) are available only for templated fields thus if you don't specify template_fields it will handle the value you pass as string and it will not be rendered.

How to change xcom in Airflow to accomodate large data?

I am using the following code in my Airflow operator:
import json
import pandas as pd
from airflow.exceptions import AirflowException
from airflow.hooks.http_hook import HttpHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
class HttpToGoogleCloudStorageOperator(BaseOperator):
template_fields = ['endpoint', 'data', 'headers', ]
template_ext = ()
ui_color = '#f4a460'
#apply_defaults
def __init__(self,
endpoint,
project_id,
table_id,
data=None,
headers=None,
auth=None,
http_conn_id='http_default',
*args, **kwargs):
super(HttpToGoogleCloudStorageOperator, self).__init__(*args, **kwargs)
self.table_id = table_id
self.http_conn_id = http_conn_id
self.method = "GET"
self.endpoint = endpoint
self.headers = headers or {}
self.auth = auth
self.data = data or {}
def execute(self, context):
http = HttpHook(self.method, http_conn_id=self.http_conn_id)
self.log.info("Calling HTTP method " + self.endpoint)
response = http.run(self.endpoint, self.data, self.headers,auth=self.auth)
self.log.info("Got response")
Unfortunately the data returned is too large (about 5k) to fit in the standard xcom and I get this error:
{taskinstance.py:1059} ERROR - (_mysql_exceptions.DataError) (1406, "Data too long for column 'value' at row 1")
Is there a way I can tell http_hook to use a different xcom, or (even better) not use xcom at all? I have looked around and I do not see a solution.
Thanks for any tips or pointers.
Edit: Here is how I call the operator. Note that nowhere do I specify xcom.
query_load_task = HttpToGoogleCloudStorageOperator(
task_id="query_load_task",
endpoint=endpoint,
project_id="my_gcp_poroject_id",
table_id="dataset.table",
data=None,
auth=(username, password))
It's preferable to store data to a system designed for such (e.g.: the file system, AWS S3, Azure, etc.) and instead return a unique identifier to reference the location of the data, for the file system this would likely be the full path (e.g.: /tmp/acme_response_20200709.csv) that way you leverage the best of both the storage system and your database.
If you add your code I'd be happy to take a crack at writing up some psuedo-code as an example.

Resources