Airflow - getting the execution_date in task when calling an Operator - airflow

I have this Operator; it's pretty much the same as S3CopyObjectOperator, except that it looks for all objects in a folder and copies them to a destination folder.
import os
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.decorators import apply_defaults
from common.s3.partition import Partition, PartitionType
from airflow.models import BaseOperator
import logging
class S3CopyObjectsOperator(BaseOperator):
    """Copy every object listed under a source partition prefix to ``dest_prefix``.

    Modeled on S3CopyObjectOperator, but operates on all keys returned for the
    partition's key prefix instead of a single key.

    NOTE(review): this class declares no ``template_fields``, so a Jinja
    expression passed as ``dest_prefix`` (e.g. ``{{ ds_nodash }}``) reaches
    ``execute`` as the raw, unrendered string.
    """

    # apply_defaults -- presumably the @apply_defaults decorator; the "@" looks lost in this paste (TODO confirm)
    def __init__(self,
                 aws_conn_id: str,
                 partition: Partition,
                 s3_bucket: str,
                 dest_prefix: str,
                 *args,
                 **kwargs):
        super(S3CopyObjectsOperator, self).__init__(*args, **kwargs)
        self.aws_conn_id = aws_conn_id
        self.partition = partition
        self.s3_bucket = s3_bucket
        self.dest_prefix = dest_prefix

    def execute(self, context):
        # Stamp the partition with this run's execution date; presumably this
        # feeds into partition.key_prefix -- confirm in Partition.
        self.partition.partition_value = context.get("execution_date")
        logging.info(f'self.dest_prefix: {self.dest_prefix}')
        # NOTE(review): exec_date is assigned here but never used below.
        exec_date = context.get("execution_date")
        logging.info(f'self.partition.partition_value: {self.partition.partition_value}')
        s3 = S3Hook(self.aws_conn_id)
        s3_conn = s3.get_conn()
        logging.info(f'source bucket -- self.partition.bucket: {self.partition.bucket}')
        logging.info(f'source key -- self.partition.key_prefix: {self.partition.key_prefix}')
        # delimiter="/" is passed through to list_keys; presumably this limits
        # the listing to the top level of the prefix -- verify against S3Hook.
        source_keys = s3.list_keys(bucket_name=self.partition.bucket, prefix=self.partition.key_prefix, delimiter="/")
        logging.info(f'keys: {source_keys}')
        for file in source_keys:
            # Only the file name is kept; the directory part of the listed key is discarded.
            prefix, filename = os.path.split(file)
            dest_key = f'{self.dest_prefix}/{filename}'
            logging.info(f'Copying file {filename} to {self.dest_prefix}')
            # The source key is rebuilt from the partition prefix rather than reusing ``file``.
            key = self.partition.key_prefix + filename
            logging.info(f'key: {key}')
            s3_conn.copy_object(Bucket=self.s3_bucket,
                                Key=f'{dest_key}',
                                CopySource={
                                    'Bucket': self.partition.bucket,
                                    'Key': key
                                }, ContentEncoding='csv')
However when I use this operator in my task I need my dest_prefix to include the execution date.
Things I've tried:
I've tried adding ds = '{{ ds_nodash }}' in the DAG file, but when I print self.dest_prefix in the Operator, the value it returns is the literal string and not the execution date.
I've also tried creating a function but when I print self.dest_prefix in the Operator the value it returns is: self.dest_prefix: <function exec_value at 0x7fd008fcb940> See below for my task:
the execution date should be after snapshot_date=
# One copy task is created per entry in data_group_names.
for data_group in data_group_names:
    copy_felix_to_s3 = S3CopyObjectsOperator(
        task_id=f'copy_felix_{data_group}_data_to_s3',
        aws_conn_id='aws_default',
        s3_bucket='bucket_name',
        partition=felixS3Partition(
            bucket='source_bucket',
            location_base=f'our_bucket/{data_group}',
            partition_type=None
        ),
        # NOTE(review): " ds" is literal text inside this f-string, not the
        # execution date; only {data_group} is interpolated here.
        dest_prefix=f"felix/{data_group}/snapshot_date= ds",
        dag=dag
    )
copy_felix_to_s3

You are missing the declaration of the parameter as a templated field.
class S3CopyObjectsOperator(BaseOperator):
    ...
    # Attributes named here are passed through the Jinja engine, so macros
    # such as ds_nodash in dest_prefix get rendered.
    template_fields = ("dest_prefix",)
    ...
Macros (such as ds_nodash) are available only in templated fields. If you don't list the parameter in template_fields, Airflow treats the value you pass as a plain string and does not render it.

Related

Get dag_run context in Airflow TaskFlow task

My dag is started with configuration JSON:
{"foo" : "bar"}
I have a Python operator which uses this value:
my_task = PythonOperator(
task_id="my_task",
op_kwargs={"foo": "{{ dag_run.conf['foo'] }}"},
python_callable=lambda foo: print(foo))
I’d like to replace it with a TaskFlow task…
@task
def my_task():
    # how to get foo??
    ...
How can I get a reference to context, dag_run, or otherwise get to the configuration JSON from here?
There are several ways to do this using the TaskFlow API:
import datetime
from airflow.decorators import dag, task
from airflow.operators.python import get_current_context
@dag(start_date=datetime.datetime(2023, 1, 1), schedule=None)
def so_75303816():
    """Show four ways for a TaskFlow task to read ``foo`` from the run configuration."""

    @task
    def example_1(**context):
        # Receive all task-instance context variables as keyword arguments.
        foo = context["dag_run"].conf["foo"]
        print(foo)

    @task
    def example_2(dag_run=None):
        # Request only ``dag_run`` from the context by naming it as a
        # parameter; the default must be None.
        foo = dag_run.conf["foo"]
        print(foo)

    @task
    def example_3():
        # Fetch the context from inside the running task.
        context = get_current_context()
        foo = context["dag_run"].conf["foo"]
        print(foo)

    @task
    def example_4(params=None):
        # Read the value through the ``params`` context variable.
        foo = params["foo"]
        print(foo)

    example_1()
    example_2()
    example_3()
    example_4()


so_75303816()
Depending on your needs/preference, you can use one of the following examples:
example_1: You get all task instance context variables and have to extract "foo".
example_2: You explicitly state via arguments you want only dag_run from the task instance context variables. Note that you have to default arguments to None.
example_3: You can also fetch the task instance context variables from inside a task using airflow.operators.python.get_current_context().
example_4: DAG run context is also available via a variable named "params".
For more information, see https://airflow.apache.org/docs/apache-airflow/stable/tutorial/taskflow.html#accessing-context-variables-in-decorated-tasks and https://airflow.apache.org/docs/apache-airflow/stable/templates-ref.html#variables.

Airflow Xcom not getting resolved return task_instance string

I am facing an odd issue with xcom_pull where it is always returning back a xcom_pull string
"{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}"
My requirement is simple: I push an XCom using a PythonOperator, and with xcom_pull I try to retrieve the value and pass it as the http_conn_id for SimpleHttpOperator. However, the variable comes back as the template string instead of the resolved xcom_pull value.
Python Operator is successfully able to push XCom.
Code:
from datetime import datetime
import simplejson as json
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.providers.http.operators.http import SimpleHttpOperator
from google.auth.transport.requests import Request
# Default arguments handed to the DAG through ``default_args``.
default_airflow_args = {
    "owner": "divyaansh",
    "depends_on_past": False,
    "start_date": datetime(2022, 5, 18),
    "retries": 0,
    # Cron preset. NOTE(review): schedule_interval is normally a DAG-level
    # argument -- confirm it takes effect when supplied via default_args.
    "schedule_interval": "@hourly",
}
project_configs = {
"project_id": "test",
"conn_id": "google_cloud_storage_default",
"bucket_name": "test-transfer",
"folder_name": "processed-test-rdf",
}
def get_config_vals(**kwargs) -> None:
    """Push the HTTP connection id to XCom for downstream tasks.

    Args:
        **kwargs: Airflow task context; must contain ``task_instance``.

    Raises:
        KeyError: If ``task_instance`` is not present in the context.
    """
    task_instance = kwargs["task_instance"]
    # The value is hard-coded; it is stored under the key "http_con_id".
    task_instance.xcom_push(key="http_con_id", value="gcp_cloud_function")
def generate_api_token(cf_name: str):
    """
    Fetch a Google ID token whose audience is the named Cloud Function URL.
    """
    import google.oauth2.id_token

    audience_url = f"https://us-central1-test-a2h.cloudfunctions.net/{cf_name}"
    auth_request = Request()
    token = google.oauth2.id_token.fetch_id_token(
        request=auth_request, audience=audience_url
    )
    return token
# DAG wiring: start -> get_config_val -> file_decrypt_and_validate_cf -> check_flow -> end.
with DAG(
    dag_id="cf_test",
    default_args=default_airflow_args,
    catchup=False,
    render_template_as_native_obj=True,
) as dag:
    start = DummyOperator(task_id="start")
    # Pushes the connection id to XCom under the key "http_con_id".
    config_vals = PythonOperator(
        task_id="get_config_val", python_callable=get_config_vals, provide_context=True
    )
    # NOTE(review): ip_data is built here but not referenced again in this snippet.
    ip_data = json.dumps(
        {
            "bucket_name": project_configs["bucket_name"],
            "file_name": "dummy",
            "target_location": "/valid",
        }
    )
    # Jinja expression intended to resolve, at run time, to the XCom pushed by get_config_val.
    conn_id = "{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}"
    # NOTE(review): this call runs whenever the file is executed (DAG parsing), not inside a task.
    api_token = generate_api_token("new-cp")
    cf_task = SimpleHttpOperator(
        task_id="file_decrypt_and_validate_cf",
        # NOTE(review): http_conn_id is not a templated field of SimpleHttpOperator,
        # so the string above is used verbatim as the connection name.
        http_conn_id=conn_id,
        method="POST",
        endpoint="new-cp",
        # NOTE(review): json.dumps is applied twice, so the payload is a JSON
        # string containing JSON -- confirm the endpoint expects that.
        data=json.dumps(
            json.dumps(
                {
                    "bucket_name": "test-transfer",
                    "file_name": [
                        "processed-test-rdf/dummy_20220501.txt",
                        "processed-test-rdf/dummy_20220502.txt",
                    ],
                    "target_location": "/valid",
                }
            )
        ),
        headers={
            "Authorization": f"bearer {api_token}",
            "Content-Type": "application/json",
        },
        do_xcom_push=True,
        log_response=True,
    )
    print("task new-cp", cf_task)
    check_flow = DummyOperator(task_id="check_flow")
    end = DummyOperator(task_id="end")
    start >> config_vals >> cf_task >> check_flow >> end
Error Message:
raise AirflowNotFoundException(f"The conn_id `{conn_id}` isn't defined") airflow.exceptions.AirflowNotFoundException: The conn_id `"{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}"` isn't defined
I have tried several different ways, but nothing seems to be working.
Can someone point me in the right direction here?
Airflow-version : 2.2.3
Composer-version : 2.0.11
In SimpleHttpOperator, the http_conn_id parameter is not a templated field, so you cannot use the Jinja engine with it; the parameter is never rendered. When you pass "{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}" to the operator, you expect it to be replaced at runtime with the value stored in XCom by the previous task, but Airflow actually treats it as a regular string, which is also what the exception tells you: it tries to look up a connection whose name is that very long string, cannot find it, and reports that the connection is not defined.
To solve it you can create a custom operator:
class MySimpleHttpOperator(SimpleHttpOperator):
    """SimpleHttpOperator variant whose ``http_conn_id`` is rendered by Jinja."""

    # Unpack rather than use "+" so this works whether the parent declares
    # template_fields as a tuple or as a list.
    template_fields = (*SimpleHttpOperator.template_fields, "http_conn_id")
Then you should replace SimpleHttpOperator with MySimpleHttpOperator in your DAG.
This change makes the string that you set in http_conn_id to be passed via the Jinja engine. So in your case the string will be replaced with the Xcom value as you expect.

Using airflow dag_run.conf inside custom operator

We created a custom Airflow operator based on EMRContainerOperator, and we need to make a decision based on a config passed using the Airflow UI.
My custom operator:
from airflow.providers.amazon.aws.operators.emr_containers import EMRContainerOperator
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence
from uuid import uuid4
from airflow.utils.decorators import apply_defaults
class EmrBatchProcessorOperator(EMRContainerOperator):
    """EMRContainerOperator that builds its Spark job driver from ``operation_type``."""

    # operation_type is listed so that a Jinja expression passed for it gets rendered.
    template_fields: Sequence[str] = (
        "name",
        "virtual_cluster_id",
        "execution_role_arn",
        "release_label",
        "job_driver",
        "operation_type"
    )

    # apply_defaults -- presumably the @apply_defaults decorator; the "@" looks lost in this paste (TODO confirm)
    def __init__(
            self,
            operation_type,
            *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.operation_type = operation_type
        # NOTE(review): this comparison runs at construction time. When
        # operation_type is passed as a Jinja template it is still the raw
        # "{{ ... }}" string at this point (templated fields are rendered
        # shortly before execute()), so the 'full' branch would not be taken.
        # Confirm, and consider making this decision inside execute().
        if self.operation_type == 'full':
            number_of_pods=10
        else:
            number_of_pods=5
        BASE_CONSUMER_DRIVER_ARG = {
            "sparkSubmitJobDriver": {"entryPoint": "s3://bucket/batch_processor_engine/batch-processor-engine_2.12-3.0.1_0.28.jar","entryPointArguments": ["group_name=courier_api_group01"], "sparkSubmitParameters": f"--conf spark.executor.instances={ number_of_pods } --conf spark.executor.memory=32G --conf spark.executor.cores=5 --conf spark.driver.cores=1 --conf spark.driver.memory=12G --conf spark.sql.broadcastTimeout=2000 --class TableProcessorWrapper"}
        }
        # Overwrites whatever job_driver value was handed to the parent constructor.
        self.job_driver = BASE_CONSUMER_DRIVER_ARG
This is the way that I call my operator:
# Daily DAG: start -> base_consumer -> end.
with DAG(
    dag_id="batch_processor_model_dag",
    schedule_interval="@daily",
    default_args=default_args,
    catchup=False
) as dag:
    start = DummyOperator(task_id='start', dag=dag)
    end = DummyOperator(task_id='end', dag=dag, trigger_rule='none_failed')
    base_consumer = EmrBatchProcessorOperator(
        task_id="base_consumer",
        virtual_cluster_id=VIRTUAL_CLUSTER_ID,
        execution_role_arn=JOB_ROLE_ARN,
        configuration_overrides=CONFIGURATION_OVERRIDES_ARG,
        release_label="emr-6.5.0-latest",
        job_driver={},
        name="pi.py",
        # Taken from the configuration supplied when the run is triggered.
        operation_type= '{{dag_run.conf["operation_type"]}}'
    )
    start >> base_consumer >> end
But this code didn't work; I can't use the dag_run.conf value.
Could you help me?

Airflow:Date not rendering in SQL file

Issue: Date not rendering in SQL file
I am not able to get the date yesterday_ds rendered in the SQL file
In the bi_utils/airflow.py module I have defined YESTERDAY_DS = '{{yesterday_ds}}'
In the Dag
from bi_utils.airflow import YESTERDAY_DS
snflk_to_s3 = SnowflakeMultiSqlStatmentOperator(
    task_id='snflk_to_s3',
    snowflake_conn_id=SNOWFLAKE_CONN_ID,
    sql=load_sql,
    params={
        'proc_run_task_id': [proc_start.task_id],
        # YESTERDAY_DS holds the template text '{{yesterday_ds}}'; it is only
        # substituted if the operator renders ``params`` as a templated field.
        'yesterday_ds': YESTERDAY_DS,
    },
    autocommit=True,
)
In the SQL file
COPY INTO @public.stage/path/{{params.yesterday_ds}}/
It looks like you are using a custom operator. In that case, you will have to add the argument to template_fields so that the Jinja template is resolved.
Inside your custom operator it would look like this; the code below is only meant to illustrate the structure.
class SnowflakeMultiSqlStatmentOperator(BaseOperator):
    """
    Executes sql code in a Snowflake database

    ``sql`` and ``params`` are declared as templated fields, and ``sql`` values
    ending in ``.sql`` are treated as template files.

    NOTE(review): this is an illustrative skeleton -- ``get_hook`` is called in
    ``execute`` but is not defined in this snippet.
    """
    template_fields = ('sql','params')
    template_ext = ('.sql',)
    ui_color = '#ededed'

    # apply_defaults -- presumably @apply_defaults in the original post; it is
    # left as a comment because the name is not imported in this snippet.
    def __init__(
            self, sql, snowflake_conn_id='snowflake_default', parameters=None,
            autocommit=True, warehouse=None, database=None, role=None,
            schema=None, params=None, *args, **kwargs):
        # Zero-argument super(): the previous call named SnowflakeOperator,
        # which is not this class.
        super().__init__(*args, **kwargs)
        self.snowflake_conn_id = snowflake_conn_id
        self.sql = sql
        self.autocommit = autocommit
        self.parameters = parameters
        self.warehouse = warehouse
        self.database = database
        self.role = role
        self.schema = schema
        self.params = params

    def execute(self, context):
        """Run ``self.sql`` through the hook with the configured autocommit and parameters."""
        self.log.info('Executing: %s', self.sql)
        hook = self.get_hook()
        hook.run(
            self.sql,
            autocommit=self.autocommit,
            parameters=self.parameters)

How to change xcom in Airflow to accomodate large data?

I am using the following code in my Airflow operator:
import json
import pandas as pd
from airflow.exceptions import AirflowException
from airflow.hooks.http_hook import HttpHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
class HttpToGoogleCloudStorageOperator(BaseOperator):
    """Issue a GET request against an HTTP connection's endpoint.

    Args:
        endpoint: Endpoint passed to ``HttpHook.run`` (templated).
        project_id: Project identifier supplied by the caller; stored on the
            instance.
        table_id: Table identifier supplied by the caller; stored on the
            instance.
        data: Request data (templated); defaults to an empty dict.
        headers: Request headers (templated); defaults to an empty dict.
        auth: Forwarded to ``HttpHook.run`` as the ``auth`` keyword.
        http_conn_id: Airflow connection id used to build the ``HttpHook``.
    """
    template_fields = ['endpoint', 'data', 'headers', ]
    template_ext = ()
    ui_color = '#f4a460'

    @apply_defaults
    def __init__(self,
                 endpoint,
                 project_id,
                 table_id,
                 data=None,
                 headers=None,
                 auth=None,
                 http_conn_id='http_default',
                 *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Keep project_id: it is a required argument and was previously discarded.
        self.project_id = project_id
        self.table_id = table_id
        self.http_conn_id = http_conn_id
        self.method = "GET"
        self.endpoint = endpoint
        self.headers = headers or {}
        self.auth = auth
        self.data = data or {}

    def execute(self, context):
        """Call the endpoint and log progress; nothing is returned from this snippet."""
        http = HttpHook(self.method, http_conn_id=self.http_conn_id)
        # Lazy %-formatting yields the same message without building the string eagerly.
        self.log.info("Calling HTTP method %s", self.endpoint)
        # NOTE(review): ``response`` is not used further in this excerpt.
        response = http.run(self.endpoint, self.data, self.headers, auth=self.auth)
        self.log.info("Got response")
Unfortunately the data returned is too large (about 5k) to fit in the standard xcom and I get this error:
{taskinstance.py:1059} ERROR - (_mysql_exceptions.DataError) (1406, "Data too long for column 'value' at row 1")
Is there a way I can tell http_hook to use a different xcom, or (even better) not use xcom at all? I have looked around and I do not see a solution.
Thanks for any tips or pointers.
Edit: Here is how I call the operator. Note that nowhere do I specify xcom.
query_load_task = HttpToGoogleCloudStorageOperator(
task_id="query_load_task",
endpoint=endpoint,
project_id="my_gcp_poroject_id",
table_id="dataset.table",
data=None,
auth=(username, password))
It's preferable to store data to a system designed for such (e.g.: the file system, AWS S3, Azure, etc.) and instead return a unique identifier to reference the location of the data, for the file system this would likely be the full path (e.g.: /tmp/acme_response_20200709.csv) that way you leverage the best of both the storage system and your database.
If you add your code, I'd be happy to take a crack at writing up some pseudo-code as an example.

Resources