airflow.exceptions.AirflowException: Use keyword arguments when initializing operators - airflow

I am using Airflow 1.9.2 with Python 2.7 on Ubuntu. I tried to inherit from a ParentOperator class, which works fine by itself, to create a class called ChildOperator. But when I create a ChildOperator instance, I think some keyword arguments are missing or mixed up, because I am getting this error:
airflow.exceptions.AirflowException: Use keyword arguments when
initializing operators
Here is a simplified example:
class ParentOperator(BaseOperator, SkipMixin):
    @apply_defaults
    def __init__(self,
                 conn_id,
                 object,
                 args={},
                 s3_conn_id=None,
                 s3_key=None,
                 s3_bucket=None,
                 fields=None,
                 *args,
                 **kwargs
                 ):
        super(ParentOperator, self).__init__(*args, **kwargs)
        ...
class ChildOperator(ParentOperator):
    @apply_defaults
    def __init__(self,
                 conn_id,
                 object,
                 args={},
                 s3_conn_id=None,
                 s3_key=None,
                 s3_bucket=None,
                 fields=None,
                 *args,
                 **kwargs
                 ):
        args = ...
        super(ChildOperator, self).__init__(
            conn_id,
            object,
            args=args,
            s3_conn_id=s3_conn_id,
            s3_key=s3_key,
            s3_bucket=s3_bucket,
            fields=fields,
            *args,
            **kwargs
        )
        ...
myobjc = ChildOperator(
    conn_id="my_default",
    object=table,
    args={},
    s3_conn_id='s3_postgres_dump',
    s3_key=s3_key,
    s3_bucket=s3_bucket,
    dag=dag,
    task_id="task1"
)
Any idea what is causing this error? Is this more of a Python specific issue?

The __init__ function of ChildOperator needs to pass all parameters to super() as keyword arguments, like the following (note the first two parameters, conn_id and object):
super(ChildOperator, self).__init__(
    conn_id=conn_id,
    object=object,
    args=args,
    s3_conn_id=s3_conn_id,
    s3_key=s3_key,
    s3_bucket=s3_bucket,
    fields=fields,
    *args,
    **kwargs
)
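For context, BaseOperator's constructor in Airflow 1.x is wrapped by the apply_defaults decorator, which raises AirflowException("Use keyword arguments when initializing operators") as soon as it receives any positional arguments, so nothing positional may travel up the super() chain. A minimal sketch of a ChildOperator that forwards everything by keyword; dropping the *args passthrough (and the name clash between the args parameter and *args) is an assumption that no extra positional arguments are needed:
class ChildOperator(ParentOperator):
    @apply_defaults
    def __init__(self,
                 conn_id,
                 object,
                 args=None,
                 s3_conn_id=None,
                 s3_key=None,
                 s3_bucket=None,
                 fields=None,
                 **kwargs):
        # Everything is forwarded by keyword, so no positional argument
        # ever reaches BaseOperator through apply_defaults.
        super(ChildOperator, self).__init__(
            conn_id=conn_id,
            object=object,
            args=args or {},
            s3_conn_id=s3_conn_id,
            s3_key=s3_key,
            s3_bucket=s3_bucket,
            fields=fields,
            **kwargs
        )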

Related

Airflow custom operator variables

I need to pass Airflow connection settings (AWS, Postgres) to the docker container environment variables.
I'm trying to do this using a custom Operator and BaseHook.
class S3ToPostgresDockerOperator(DockerOperator):
    @apply_defaults
    def __init__(self, aws_conn_id='aws_default', postgres_conn_id='postgres_default', **kwargs):
        super(S3ToPostgresDockerOperator, self).__init__(**kwargs)
        self.aws_conn = BaseHook.get_connection(aws_conn_id)
        self.pg_conn = BaseHook.get_connection(postgres_conn_id)
Is it possible to do something like that, or if not how should I do it?
java_unpack_csv = S3ToPostgresDockerOperator(
    ...
    environment={
        'AWS_ACCESS_KEY': '{{ ??? }}',
        'AWS_SECRET_KEY': '{{ ??? }}'
    }
)
You can build up the environment kwarg passed in the DockerOperator constructor.
For example,
class S3ToPostgresDockerOperator(DockerOperator):
    @apply_defaults
    def __init__(self, aws_conn_id='aws_default', postgres_conn_id='postgres_default', **kwargs):
        self.aws_conn = BaseHook.get_connection(aws_conn_id)
        self.pg_conn = BaseHook.get_connection(postgres_conn_id)
        credentials = self.aws_conn.get_credentials()
        kwargs['environment'] = dict(
            kwargs.pop('environment', {}),
            AWS_ACCESS_KEY=credentials.access_key,
            AWS_SECRET_KEY=credentials.secret_key,
            PG_DATABASE_URI=self.pg_conn.get_uri()
        )
        super(S3ToPostgresDockerOperator, self).__init__(**kwargs)
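With that in place, the operator can be instantiated without any Jinja placeholders in environment; the image name and task_id below are made-up values for illustration:
java_unpack_csv = S3ToPostgresDockerOperator(
    task_id='java_unpack_csv',
    image='my-batch-image:latest',  # hypothetical image name
    aws_conn_id='aws_default',
    postgres_conn_id='postgres_default',
    dag=dag,
)
The credentials are looked up when the operator is constructed and reach the container as AWS_ACCESS_KEY, AWS_SECRET_KEY and PG_DATABASE_URI, merged with any environment dict the caller also passes.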

Using airflow dag_run.conf inside custom operator

We created a custom Airflow operator based on EMRContainerOperator, and we need to take a decision based on a config passed via the Airflow UI.
My custom operator:
from airflow.providers.amazon.aws.operators.emr_containers import EMRContainerOperator
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence
from uuid import uuid4
from airflow.utils.decorators import apply_defaults


class EmrBatchProcessorOperator(EMRContainerOperator):
    template_fields: Sequence[str] = (
        "name",
        "virtual_cluster_id",
        "execution_role_arn",
        "release_label",
        "job_driver",
        "operation_type"
    )

    @apply_defaults
    def __init__(
            self,
            operation_type,
            *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.operation_type = operation_type

        if self.operation_type == 'full':
            number_of_pods = 10
        else:
            number_of_pods = 5

        BASE_CONSUMER_DRIVER_ARG = {
            "sparkSubmitJobDriver": {"entryPoint": "s3://bucket/batch_processor_engine/batch-processor-engine_2.12-3.0.1_0.28.jar", "entryPointArguments": ["group_name=courier_api_group01"], "sparkSubmitParameters": f"--conf spark.executor.instances={ number_of_pods } --conf spark.executor.memory=32G --conf spark.executor.cores=5 --conf spark.driver.cores=1 --conf spark.driver.memory=12G --conf spark.sql.broadcastTimeout=2000 --class TableProcessorWrapper"}
        }
        self.job_driver = BASE_CONSUMER_DRIVER_ARG
This is the way that I call my operator:
with DAG(
    dag_id="batch_processor_model_dag",
    schedule_interval="@daily",
    default_args=default_args,
    catchup=False
) as dag:
    start = DummyOperator(task_id='start', dag=dag)
    end = DummyOperator(task_id='end', dag=dag, trigger_rule='none_failed')

    base_consumer = EmrBatchProcessorOperator(
        task_id="base_consumer",
        virtual_cluster_id=VIRTUAL_CLUSTER_ID,
        execution_role_arn=JOB_ROLE_ARN,
        configuration_overrides=CONFIGURATION_OVERRIDES_ARG,
        release_label="emr-6.5.0-latest",
        job_driver={},
        name="pi.py",
        operation_type='{{dag_run.conf["operation_type"]}}'
    )

    start >> base_consumer >> end
But this code doesn't work; I can't use the dag_run.conf value.
Could you help me?
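Jinja expressions such as {{ dag_run.conf["operation_type"] }} are rendered only for fields listed in template_fields, and rendering happens when the task instance runs, not when the operator is constructed, so the branching in __init__ only ever sees the literal template string. A minimal sketch of moving the decision into execute(), reusing the names from the question (the shortened sparkSubmitParameters string stands in for the full one above):
class EmrBatchProcessorOperator(EMRContainerOperator):
    template_fields: Sequence[str] = (
        "name",
        "virtual_cluster_id",
        "execution_role_arn",
        "release_label",
        "job_driver",
        "operation_type",
    )

    def __init__(self, operation_type, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Still the raw '{{ dag_run.conf["operation_type"] }}' string here.
        self.operation_type = operation_type

    def execute(self, context):
        # By the time execute() runs, operation_type has been rendered
        # from dag_run.conf, so the decision can be taken safely.
        number_of_pods = 10 if self.operation_type == 'full' else 5
        self.job_driver = {
            "sparkSubmitJobDriver": {
                "entryPoint": "s3://bucket/batch_processor_engine/batch-processor-engine_2.12-3.0.1_0.28.jar",
                "entryPointArguments": ["group_name=courier_api_group01"],
                "sparkSubmitParameters": f"--conf spark.executor.instances={number_of_pods} --class TableProcessorWrapper",
            }
        }
        return super().execute(context)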

Airflow - getting the execution_date in task when calling an Operator

I have this Operator; it's pretty much the same as S3CopyObjectOperator except that it looks for all objects in a folder and copies them to a destination folder.
import os
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.decorators import apply_defaults
from common.s3.partition import Partition, PartitionType
from airflow.models import BaseOperator
import logging


class S3CopyObjectsOperator(BaseOperator):
    @apply_defaults
    def __init__(self,
                 aws_conn_id: str,
                 partition: Partition,
                 s3_bucket: str,
                 dest_prefix: str,
                 *args,
                 **kwargs):
        super(S3CopyObjectsOperator, self).__init__(*args, **kwargs)
        self.aws_conn_id = aws_conn_id
        self.partition = partition
        self.s3_bucket = s3_bucket
        self.dest_prefix = dest_prefix

    def execute(self, context):
        self.partition.partition_value = context.get("execution_date")
        logging.info(f'self.dest_prefix: {self.dest_prefix}')
        exec_date = context.get("execution_date")
        logging.info(f'self.partition.partition_value: {self.partition.partition_value}')
        s3 = S3Hook(self.aws_conn_id)
        s3_conn = s3.get_conn()
        logging.info(f'source bucket -- self.partition.bucket: {self.partition.bucket}')
        logging.info(f'source key -- self.partition.key_prefix: {self.partition.key_prefix}')
        source_keys = s3.list_keys(bucket_name=self.partition.bucket, prefix=self.partition.key_prefix, delimiter="/")
        logging.info(f'keys: {source_keys}')

        for file in source_keys:
            prefix, filename = os.path.split(file)
            dest_key = f'{self.dest_prefix}/{filename}'
            logging.info(f'Copying file {filename} to {self.dest_prefix}')
            key = self.partition.key_prefix + filename
            logging.info(f'key: {key}')
            s3_conn.copy_object(Bucket=self.s3_bucket,
                                Key=f'{dest_key}',
                                CopySource={
                                    'Bucket': self.partition.bucket,
                                    'Key': key
                                }, ContentEncoding='csv')
However when I use this operator in my task I need my dest_prefix to include the execution date.
Things I've tried:
I've tried adding ds = '{{ ds_nodash }}' in the dag file, but when I print self.dest_prefix in the Operator it returns the string value and not the execution date.
I've also tried creating a function, but when I print self.dest_prefix in the Operator the value it returns is: self.dest_prefix: <function exec_value at 0x7fd008fcb940>. See below for my task:
The execution date should go after snapshot_date=.
for data_group in data_group_names:
    copy_felix_to_s3 = S3CopyObjectsOperator(
        task_id=f'copy_felix_{data_group}_data_to_s3',
        aws_conn_id='aws_default',
        s3_bucket='bucket_name',
        partition=felixS3Partition(
            bucket='source_bucket',
            location_base=f'our_bucket/{data_group}',
            partition_type=None
        ),
        dest_prefix=f"felix/{data_group}/snapshot_date= ds",
        dag=dag
    )
    copy_felix_to_s3
You are missing the declaration of the parameter as a templated field.
class S3CopyObjectsOperator(BaseOperator):
    ...
    template_fields = ("dest_prefix",)
    ...
Macros (such as ds_nodash) are available only for templated fields, so if you don't specify template_fields the value you pass is handled as a plain string and is never rendered.
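With that declaration in place, the Jinja macro can be embedded directly in the f-string in the DAG file; note the doubled braces needed to escape Jinja's delimiters inside an f-string (bucket and prefix names are the placeholders from the question):
copy_felix_to_s3 = S3CopyObjectsOperator(
    task_id=f'copy_felix_{data_group}_data_to_s3',
    aws_conn_id='aws_default',
    s3_bucket='bucket_name',
    partition=felixS3Partition(
        bucket='source_bucket',
        location_base=f'our_bucket/{data_group}',
        partition_type=None
    ),
    # Rendered at run time, e.g. felix/<data_group>/snapshot_date=20210614
    dest_prefix=f"felix/{data_group}/snapshot_date={{{{ ds_nodash }}}}",
    dag=dag
)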

Unable to pass xcom in Custom Operators in Airflow

I have a simple, linear DAG (created using Airflow 2.0) with two tasks. I have custom operators for each task, which extend BaseOperator. Following is the code for the DAG and the operators:
class Operator1(BaseOperator):
    @apply_defaults
    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)

    def execute(self, context):
        ...
        logging.info('First task')
        context['task_instance'].xcom_push(key="payload", value=data)
        return data


class Operator2(BaseOperator):
    @apply_defaults
    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)

    def execute(self, context):
        ...
        logging.info("context is ", context)
        parameters = context['task_instance'].xcom_pull(key="payload", value=data)


with DAG('dag_1', default_args=DEFAULT_ARGS, schedule_interval=None) as dag:
    TASK_1 = Operator1(
        task_id='task_1',
        do_xcom_push=True)
    TASK_2 = Operator2(
        task_id='task_2',
        do_xcom_push=True)

    TASK_1 >> TASK_2
When I run the DAG, I find that the context used for getting the xcom values is empty. I have searched a lot of answers on Stack Overflow and tried the approaches mentioned in them, but they didn't work.
Would really appreciate some hint on the issue - how do I push and pull xcom values in custom operators?
I took your code and ran it; the first problem was that start_date wasn't defined, so it ended up in an exception:
Exception has occurred: AirflowException (note: full exception trace is shown but execution is paused at: _run_module_as_main)
Task is missing the start_date parameter
Also, in the Operator1 class, the data variable is not defined. I guess you missed that when you made the code example.
Other than that the code worked, but I think you should consider passing the task_ids parameter when doing the xcom_pull operation.
From TaskInstance xcom_pull method description:
:param task_ids: Only XComs from tasks with matching ids will be
pulled. Can pass None to remove the filter.
Here is the code of a working example; note that I use two equivalent methods to perform the XCom operations:
from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.utils.decorators import apply_defaults
from airflow.models import BaseOperator


class Operator1(BaseOperator):
    @apply_defaults
    def __init__(self, *args, **kwargs) -> None:
        super(Operator1, self).__init__(*args, **kwargs)

    def execute(self, context):
        print('First task')
        data = "valuable_data"
        more_data = "more_valueable_data"
        context['task_instance'].xcom_push(key="payload", value=data)
        self.xcom_push(context, "more_data", more_data)
        return data


class Operator2(BaseOperator):
    @apply_defaults
    def __init__(self, *args, **kwargs) -> None:
        super(Operator2, self).__init__(*args, **kwargs)

    def execute(self, context):
        # print(f"context is {context}")
        data = context['task_instance'].xcom_pull(
            "task_1",
            key="payload")
        more_data = self.xcom_pull(context, "task_1", key="more_data")
        print(f"Obtained data: {data}")
        print(f"Obtained more_data: {more_data}")


with DAG('dag_1',
         default_args={'owner': 'airflow'},
         start_date=days_ago(1),
         catchup=False,
         schedule_interval=None) as dag:
    TASK_1 = Operator1(
        task_id='task_1'
    )
    TASK_2 = Operator2(
        task_id='task_2'
    )

    TASK_1 >> TASK_2
Log from Task_2:
[2021-06-15 12:55:01,206] {taskinstance.py:1255} INFO - Exporting the following env vars:
AIRFLOW_CTX_DAG_OWNER=airflow
AIRFLOW_CTX_DAG_ID=dag_1
AIRFLOW_CTX_TASK_ID=task_2
AIRFLOW_CTX_EXECUTION_DATE=2021-06-14T00:00:00+00:00
AIRFLOW_CTX_DAG_RUN_ID=backfill__2021-06-14T00:00:00+00:00
Obtained data: valuable_data
Obtained more_data: more_valueable_data
[2021-06-15 12:55:01,227] {taskinstance.py:1159} INFO - Marking task as SUCCESS. dag_id=dag_1, task_id=task_2, execution_date=20210614T000000, start_date=20210615T120402, end_date=20210615T125501
Side notes: I changed the __init__ method in order to accept *args as well. I'm using print, but it could be done using the Airflow logger as self.log.info('msg').
Let me know if that worked for you!

How can I type a function argument as native function

I have a helper function to use in the Python REPL to move variables to globals for easy debugging. But there is a mypy error:
from typing import Callable


class stepin(object):  # pylint: disable=R0903
    def __init__(self, func: Callable) -> None:
        self.func = func
        self.args = func.__code__.co_varnames
        if hasattr(func, "__defaults__") and func.__defaults__:
            self.defaults = dict(zip(reversed(self.args), reversed(func.__defaults__)))
        else:
            self.defaults = None

    def __call__(self, *args, **kwargs):
        result_dict = {x: None for x in self.args}
        if self.defaults:
            result_dict.update(self.defaults)
        result_dict.update(dict(zip(self.args, args)))
        result_dict.update(kwargs)
        for x in result_dict.keys():
            if result_dict[x] is None:
                raise ValueError('Missing args: ', self.func.__qualname__, x)
        globals().update(result_dict)
Now, the line
if hasattr(func, "__defaults__") and func.__defaults__:
    self.defaults = dict(zip(reversed(self.args), reversed(func.__defaults__)))
raises a mypy error that says func has no __defaults__
Now I understand that the BDFL has said he despises the "hasattr" check so it's probably not gonna be solved inside mypy; then my question is, is there a way to change the __init__ typing signature to get rid of the error?
What have I tried: Callable doesn't work, understandable: not all Callables have __defaults__.
But where is the type "function"? If I type() a function it says "function" but "function" is not in preamble or "typing". I see that some people mention "FunctionType" but it's not in "typing" either.
The type of a function is types.FunctionType (in the types module).
If you modify the annotation for func from Callable to types.FunctionType, mypy no longer complains about __defaults__.
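A minimal sketch of the adjusted signature, keeping the rest of the __init__ body from the question unchanged:
import types


class stepin(object):  # pylint: disable=R0903
    def __init__(self, func: types.FunctionType) -> None:
        self.func = func
        self.args = func.__code__.co_varnames
        # typeshed declares __defaults__ on FunctionType, so mypy accepts this.
        if hasattr(func, "__defaults__") and func.__defaults__:
            self.defaults = dict(zip(reversed(self.args), reversed(func.__defaults__)))
        else:
            self.defaults = None
The trade-off is that the annotation no longer accepts arbitrary callables; built-ins and functools.partial objects, for example, don't have __defaults__ at all.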
