Airflow deprecation warning Invalid arguments were passed - airflow

I have the following code on Airflow 1.9:
import_op = MySqlToGoogleCloudStorageOperator(
task_id='import',
mysql_conn_id='oproduction',
google_cloud_storage_conn_id='gcpm',
provide_context=True,
approx_max_file_size_bytes = 100000000, #100MB per file
sql = 'import.sql',
params={'next_to_import': NEXT_TO_IMPORT, 'table_name' : TABLE_NAME},
bucket=GCS_BUCKET_ID,
filename=file_name_orders,
dag=dag)
Why does it genereates:
/usr/local/lib/python2.7/dist-packages/airflow/models.py:2160:
PendingDeprecationWarning: Invalid arguments were passed to
MySqlToGoogleCloudStorageOperator. Support for passing such arguments
will be dropped in Airflow 2.0. Invalid arguments were:
*args: ()
**kwargs: {'provide_context': True} category=PendingDeprecationWarning
What is the problem with the provide_context? To the best of my knowledge it is needed for the usage of params.

provide_context is not needed for params.
params parameter (dict type) can be passed to any Operator.
You would mostly use provide_context with PythonOperator, BranchPythonOperator. A good example is https://airflow.readthedocs.io/en/latest/howto/operator.html#pythonoperator.
MySqlToGoogleCloudStorageOperator has no parameter provide_context, hence it is passed in **kwargs and you get Deprecation warning.
If you check docstring of PythonOperator for provide_context :
if set to true, Airflow will pass a set of keyword arguments that can
be used in your function. This set of kwargs correspond exactly to
what you can use in your jinja templates. For this to work, you need
to define **kwargs in your function header.
It has the following code if you check the source code:
if self.provide_context:
context.update(self.op_kwargs)
context['templates_dict'] = self.templates_dict
self.op_kwargs = context
So in simple terms, it passes the following dictionary with templates_dict to your function pass in python_callable:
{
'END_DATE': ds,
'conf': configuration,
'dag': task.dag,
'dag_run': dag_run,
'ds': ds,
'ds_nodash': ds_nodash,
'end_date': ds,
'execution_date': self.execution_date,
'latest_date': ds,
'macros': macros,
'params': params,
'run_id': run_id,
'tables': tables,
'task': task,
'task_instance': self,
'task_instance_key_str': ti_key_str,
'test_mode': self.test_mode,
'ti': self,
'tomorrow_ds': tomorrow_ds,
'tomorrow_ds_nodash': tomorrow_ds_nodash,
'ts': ts,
'ts_nodash': ts_nodash,
'yesterday_ds': yesterday_ds,
'yesterday_ds_nodash': yesterday_ds_nodash,
}
So this can be used in the function as follows:
def print_context(ds, **kwargs):
pprint(kwargs)
ti = context['task_instance']
exec_date = context['execution_date']
print(ds)
return 'Whatever you return gets printed in the logs'
run_this = PythonOperator(
task_id='print_the_context',
provide_context=True,
python_callable=print_context,
dag=dag,
)

Related

Airflow Xcom not getting resolved return task_instance string

I am facing an odd issue with xcom_pull where it is always returning back a xcom_pull string
"{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}"
My requirement is simple I have pushed an xcom using python operator and with xcom_pull I am trying to retrieve the value and pass it as an http_conn_id for SimpleHttpOperator but the variable is returning a string instead of resolving xcom_pull value.
Python Operator is successfully able to push XCom.
Code:
from datetime import datetime
import simplejson as json
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.providers.http.operators.http import SimpleHttpOperator
from google.auth.transport.requests import Request
default_airflow_args = {
"owner": "divyaansh",
"depends_on_past": False,
"start_date": datetime(2022, 5, 18),
"retries": 0,
"schedule_interval": "#hourly",
}
project_configs = {
"project_id": "test",
"conn_id": "google_cloud_storage_default",
"bucket_name": "test-transfer",
"folder_name": "processed-test-rdf",
}
def get_config_vals(**kwargs) -> dict:
"""
Get config vals from airlfow variable and store it as xcoms
"""
task_instance = kwargs["task_instance"]
task_instance.xcom_push(key="http_con_id", value="gcp_cloud_function")
def generate_api_token(cf_name: str):
"""
generate token for api request
"""
import google.oauth2.id_token
request = Request()
target_audience = f"https://us-central1-test-a2h.cloudfunctions.net/{cf_name}"
return google.oauth2.id_token.fetch_id_token(
request=request, audience=target_audience
)
with DAG(
dag_id="cf_test",
default_args=default_airflow_args,
catchup=False,
render_template_as_native_obj=True,
) as dag:
start = DummyOperator(task_id="start")
config_vals = PythonOperator(
task_id="get_config_val", python_callable=get_config_vals, provide_context=True
)
ip_data = json.dumps(
{
"bucket_name": project_configs["bucket_name"],
"file_name": "dummy",
"target_location": "/valid",
}
)
conn_id = "{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}"
api_token = generate_api_token("new-cp")
cf_task = SimpleHttpOperator(
task_id="file_decrypt_and_validate_cf",
http_conn_id=conn_id,
method="POST",
endpoint="new-cp",
data=json.dumps(
json.dumps(
{
"bucket_name": "test-transfer",
"file_name": [
"processed-test-rdf/dummy_20220501.txt",
"processed-test-rdf/dummy_20220502.txt",
],
"target_location": "/valid",
}
)
),
headers={
"Authorization": f"bearer {api_token}",
"Content-Type": "application/json",
},
do_xcom_push=True,
log_response=True,
)
print("task new-cp", cf_task)
check_flow = DummyOperator(task_id="check_flow")
end = DummyOperator(task_id="end")
start >> config_vals >> cf_task >> check_flow >> end
Error Message:
raise AirflowNotFoundException(f"The conn_id `{conn_id}` isn't defined") airflow.exceptions.AirflowNotFoundException: The conn_id `"{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}"` isn't defined
I have tried several different days but nothing seems to be working.
Can someone point me to the right direction here.
Airflow-version : 2.2.3
Composer-version : 2.0.11
In SimpleHttpOperator the http_conn_id parameter is not templated field thus you can not use Jinja engine with it. This means that this parameter can not be rendered. So when you pass "{{ task_instance.xcom_pull(dag_id = 'cf_test',task_ids='get_config_val',key='http_con_id') }}" to the operator you expect it to be replaced during runtime with the value stored in Xcom by previous task but in fact Airflow consider it just as a regular string this is also what the exception tells you. It actually try to search a connection with the name of your very long string but couldn't find it so it tells you that the connection is not defined.
To solve it you can create a custom operator:
class MySimpleHttpOperator(SimpleHttpOperator):
template_fields = SimpleHttpOperator.template_fields + ("http_conn_id",)
Then you should replace SimpleHttpOperator with MySimpleHttpOperator in your DAG.
This change makes the string that you set in http_conn_id to be passed via the Jinja engine. So in your case the string will be replaced with the Xcom value as you expect.

Why does a pythonoperator callable not need to accept parameters in airflow?

I do not understand how callables (function called as specified by PythonOperator) n Airflow should have their parameter list set. I have seen the with no parameters or with named params or **kwargs. I can always add "ti" or **allargs as parameters it seems, and ti seems to be used for task instance info, or ds for execution date. But my callables do not NEED params apparently. They can be simply be "def function():". If I wrote a regular python function func() instead of func(**kwargs), it would fail at runtime when called unless no params were passed. Airflow always seems to pass t1 all the time, so how can the callable function signature not require it?? Example below from a training site where _process_data func gets the ti, but _extract_bitcoin_price() does not. I was thinking that is because of the xcom push, but ti is ALWAYS available it seems, so how can "def somefunc()" ever work? I tried looking at pythonoperator source code, but I am unclear how this works or best practices for including parameters in a callable. Thanks!!
from airflow import DAG
from airflow.operators.python_operator
import PythonOperator
from datetime import datetime
import json
from typing import Dict
import requests
import logging
API = "https://api.coingecko.com/api/v3/simple/price?ids=bitcoin&vs_currencies=usd&include_market_cap=true&include_24hr_vol=true&include_24hr_change=true&include_last_updated_at=true"
def \_extract_bitcoin_price():
return requests.get(API).json()\['bitcoin'\]
def \_process_data(ti):
response = ti.xcom_pull(task_ids='extract_bitcoin_price')
logging.info(response)
processed_data = {'usd': response\['usd'\], 'change': response\['usd_24h_change'\]}
ti.xcom_push(key='processed_data', value=processed_data)
def \_store_data(ti):
data = ti.xcom_pull(task_ids='process_data', key='processed_data')
logging.info(f"Store: {data\['usd'\]} with change {data\['change'\]}")
with DAG('classic_dag', schedule_interval='#daily', start_date=datetime(2021, 12, 1), catchup=False) as dag:
extract_bitcoin_price = PythonOperator(
task_id='extract_bitcoin_price',
python_callable=_extract_bitcoin_price
)
process_data = PythonOperator(
task_id='process_data',
python_callable=_process_data
)
store_data = PythonOperator(
task_id='store_data',
python_callable=_store_data
)
extract_bitcoin_price >> process_data >> store_data
Tried callables with no params somefunc() expecting to get error saying too many params passed, but it succeeded. Adding somefunc(ti) also works! How can both work?
I think what you are missing is that Airflow allows to pass the context of the task to the python callable (as you can see one of them is the ti). These are additional useful parameters that Airflow provides and you can use them in your task.
In older Airflow versions user had to set provide_context=True which for that to work:
process_data = PythonOperator(
...,
provide_context=True
)
Since Airflow>=2.0 there is no need to use provide_context. Airflow handles it under the hood.
When you see in the Python Callable signatures like:
def func(ti, **kwargs):
...
This means that the ti is "unpacked" from the kwargs. You can also do:
def func(**kwargs):
ti = kwargs['ti']
EDIT:
I think what you are missing is that while you write:
def func()
...
store_data = PythonOperator(
task_id='task',
python_callable=func
)
Airflow does more than just calling func. The code being executed is the execute() function of PythonOperator and this function calls the python callable you provided with args and kwargs.

Airflow - getting the execution_date in task when calling an Operator

I have this Operator, its pretty much the same as S3CopyObjectOperator except it looks for all objects in a folder and copies to a destination folder.
import os
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.decorators import apply_defaults
from common.s3.partition import Partition, PartitionType
from airflow.models import BaseOperator
import logging
class S3CopyObjectsOperator(BaseOperator):
#apply_defaults
def __init__(self,
aws_conn_id: str,
partition: Partition,
s3_bucket: str,
dest_prefix: str,
*args,
**kwargs):
super(S3CopyObjectsOperator, self).__init__(*args, **kwargs)
self.aws_conn_id = aws_conn_id
self.partition = partition
self.s3_bucket = s3_bucket
self.dest_prefix = dest_prefix
def execute(self, context):
self.partition.partition_value = context.get("execution_date")
logging.info(f'self.dest_prefix: {self.dest_prefix}')
exec_date = context.get("execution_date")
logging.info(f'self.partition.partition_value: {self.partition.partition_value}')
s3 = S3Hook(self.aws_conn_id)
s3_conn = s3.get_conn()
logging.info(f'source bucket -- self.partition.bucket: {self.partition.bucket}')
logging.info(f'source key -- self.partition.key_prefix: {self.partition.key_prefix}')
source_keys = s3.list_keys(bucket_name=self.partition.bucket, prefix=self.partition.key_prefix, delimiter="/")
logging.info(f'keys: {source_keys}')
for file in source_keys:
prefix, filename = os.path.split(file)
dest_key = f'{self.dest_prefix}/{filename}'
logging.info(f'Copying file {filename} to {self.dest_prefix}')
key = self.partition.key_prefix + filename
logging.info(f'key: {key}')
s3_conn.copy_object(Bucket=self.s3_bucket,
Key=f'{dest_key}',
CopySource={
'Bucket': self.partition.bucket,
'Key': key
}, ContentEncoding='csv')
However when I use this operator in my task I need my dest_prefix to include the execution date.
Things I've tried:
I've tried adding ds = '{{ ds_nodash }}' in the dag file but when I print self.dest_prefix in the Operator the value it returns he string value and not the execution date.
I've also tried creating a function but when I print self.dest_prefix in the Operator the value it returns is: self.dest_prefix: <function exec_value at 0x7fd008fcb940> See below for my task:
the execution date should be after snapshot_date=
for data_group in data_group_names:
copy_felix_to_s3 = S3CopyObjectsOperator(
task_id=f'copy_felix_{data_group}_data_to_s3',
aws_conn_id='aws_default',
s3_bucket='bucket_name',
partition=felixS3Partition(
bucket='source_bucket',
location_base=f'our_bucket/{data_group}',
partition_type=None
),
dest_prefix=f"felix/{data_group}/snapshot_date= ds",
dag=dag
)
copy_felix_to_s3
You are missing declaration of the parameter as templated field.
class S3CopyObjectsOperator(BaseOperator):
...
template_fields = ("dest_prefix",)
...
Macros (such as ds_nodash) are available only for templated fields thus if you don't specify template_fields it will handle the value you pass as string and it will not be rendered.

How to force Jinja templating on Airflow variable?

The Airflow docs say: You can use Jinja templating with every parameter that is marked as “templated” in the documentation. It makes sense that specific parameters in the Airflow world (such as certain parameters to PythonOperator) get templated by Airflow automatically. I'm wondering what the best/correct way is to get a non-Airflow variable to get templated. My specific use case is something similar to:
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from somewhere import export_votes_data, export_queries_data
from elsewhere import ApiCaucus, ApiQueries
dag = DAG('export_training_data',
description='Export training data for all active orgs to GCS',
schedule_interval=None,
start_date=datetime(2018, 3, 26), catchup=False)
HOST = "http://api-00a.dev0.solvvy.co"
BUCKET = "gcs://my-bucket-name/{{ ds }}/" # I'd like this to get templated
votes_api = ApiCaucus.get_votes_api(HOST)
queries_api = ApiQueries.get_queries_api(HOST)
export_votes = PythonOperator(task_id="export_votes", python_callable=export_votes_data,
op_args=[BUCKET, votes_api], dag=dag)
export_queries = PythonOperator(task_id="export_queries", python_callable=export_query_data,
op_args=[BUCKET, queries_api, export_solutions.task_id], dag=dag,
provide_context=True)
The provide_context argument for the PythonOperator will pass along the arguments that are used for templating. From the documentation:
provide_context (bool) – if set to true, Airflow will pass a set of
keyword arguments that can be used in your function. This set of
kwargs correspond exactly to what you can use in your jinja templates.
For this to work, you need to define **kwargs in your function header.
With the context provided to your callable, you can then do the interpolation in your function:
def your_callable(bucket, api, **kwargs):
bucket = bucket.format(**kwargs)
[...]
Inside methods(execute/pre_execute/post_execute, and anywhere you can get the Airflow context) of an Operator:
BUCKET = "gcs://my-bucket-name/{{ ds }}/" # I'd like this to get templated
jinja_context = context['ti'].get_template_context()
rendered_content = self.render_template('', BUCKET, jinja_context)

how do I use the --conf option in airflow

I am trying to run a airflow DAG and need to pass some parameters for the tasks.
How do I read the JSON string passed as the --conf parameter in the command line trigger_dag command, in the python DAG file.
ex: airflow trigger_dag 'dag_name' -r 'run_id' --conf '{"key":"value"}'
Two ways. From inside a template field or file:
{{ dag_run.conf['key'] }}
Or when context is available, e.g. within a python callable of the PythonOperator:
context['dag_run'].conf['key']
In the example provided here https://github.com/apache/airflow/blob/master/airflow/example_dags/example_trigger_target_dag.py#L62 while trying to parse 'conf' passed in an airflow REST API call, use provide_context=True in pythonOperator.
Also, the key-value pair passed in json format in the REST API call, can be accessed in bashOperator and sparkOperator as '\'{{ dag_run.conf["key"] if dag_run else "" }}\''
dag = DAG(
dag_id="example_dag",
default_args={"start_date": days_ago(2), "owner": "airflow"},
schedule_interval=None
)
def run_this_func(**context):
"""
Print the payload "message" passed to the DagRun conf attribute.
:param context: The execution context
:type context: dict
"""
print("context", context)
print("Remotely received value of {} for key=message".format(context["dag_run"].conf["key"]))
#PythonOperator usage
run_this = PythonOperator(task_id="run_this", python_callable=run_this_func, dag=dag, provide_context=True)
#BashOperator usage
bash_task = BashOperator(
task_id="bash_task",
bash_command='echo "Here is the message: \'{{ dag_run.conf["key"] if dag_run else "" }}\'"',
dag=dag
)
#SparkSubmitOperator usage
spark_task = SparkSubmitOperator(
task_id="task_id",
conn_id=spark_conn_id,
name="task_name",
application="example.py",
application_args=[
'--key', '\'{{ dag_run.conf["key"] if dag_run else "" }}\''
],
num_executors=10,
executor_cores=5,
executor_memory='30G',
#driver_memory='2G',
conf={'spark.yarn.maxAppAttempts': 1},
dag=dag)
You can use the param variable in DAG initialization to send data in DAG tasks.

Resources