Airflow: Zoom in Sub Dag button in customized SubDagOperator

I've designed a customized SubDagOperator. Everything works fine except that the "Zoom in Sub Dag" button doesn't appear. It seems that if the Airflow UI doesn't recognise the task as a SubDagOperator itself, the button is not shown. I've tried to override the task_type property, as was mentioned in an old issue, but it doesn't work for me. Do you know if it's possible to see the button with customized SubDagOperators?
Airflow version: 1.10.12
Here is my attempt:
class EmrSubdagOperator(SubDagOperator):
    template_fields = ()
    template_ext = ()

    @apply_defaults
    def __init__(self, *args, **kwargs):
        dag = kwargs.get('dag')
        task_id = kwargs.get('task_id')
        spark_steps = kwargs.get('spark_steps')
        job_flow_overrides = kwargs.get('job_flow_overrides')
        subdag = DAG(
            '{}.{}'.format(dag.dag_id, task_id),
            schedule_interval=dag.schedule_interval,
            start_date=dag.start_date
        )
        cluster_creator = EmrCreateJobFlowOperator(
            dag=subdag,
            task_id='create_job_flow',
            job_flow_overrides=job_flow_overrides
        )
        step_adder = EmrAddStepsOperator(
            dag=subdag,
            task_id='add_steps',
            job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
            aws_conn_id='aws_default',
            steps=spark_steps,
        )
        # Check if the last step is completed, skipped or terminated
        last_step = len(spark_steps) - 1
        step_checker = EmrStepSensor(
            dag=subdag,
            task_id='watch_step',
            job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
            step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[" + str(last_step) + "] }}",
            aws_conn_id='aws_default',
        )
        cluster_checker = EmrJobFlowSensor(
            dag=subdag,
            task_id='watch_cluster',
            job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
            aws_conn_id='aws_default',
        )
        super(EmrSubdagOperator, self).__init__(subdag=subdag, *args, **kwargs)
        self.spark_steps = spark_steps
        self.job_flow_overrides = job_flow_overrides

    @property
    def task_type(self):
        return 'SubDagOperator'

I had the same problem.
I resolved it by doing the following:
class DynamicTasksOperator(SubDagOperator):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._task_type = 'SubDagOperator'

    @property
    def task_type(self):
        return self._task_type

I made the following changes in airflow.cfg and it's working for me now.
store_serialized_dags = False
store_dag_code = False
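For reference, in Airflow 1.10.x both of these keys live in the [core] section of airflow.cfg; a minimal sketch of the relevant block (all other keys omitted):

[core]
# Make the webserver import the actual DAG files instead of serialized copies,
# which presumably lets the UI see the real SubDagOperator task_type again
store_serialized_dags = False
store_dag_code = False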

Related

Airflow custom operator variables

I need to pass Airflow connection settings (AWS, Postgres) to a Docker container as environment variables.
I'm trying to do this using a custom operator and BaseHook.
class S3ToPostgresDockerOperator(DockerOperator):
    @apply_defaults
    def __init__(self, aws_conn_id='aws_default', postgres_conn_id='postgres_default', **kwargs):
        super(S3ToPostgresDockerOperator, self).__init__(**kwargs)
        self.aws_conn = BaseHook.get_connection(aws_conn_id)
        self.pg_conn = BaseHook.get_connection(postgres_conn_id)
Is it possible to do something like that, or if not, how should I do it?
java_unpack_csv = S3ToPostgresDockerOperator(
    ...
    environment={
        'AWS_ACCESS_KEY': '{{ ??? }}',
        'AWS_SECRET_KEY': '{{ ??? }}'
    }
)
You can build up the environment kwarg passed in the DockerOperator constructor.
For example,
class S3ToPostgresDockerOperator(DockerOperator):
    @apply_defaults
    def __init__(self, aws_conn_id='aws_default', postgres_conn_id='postgres_default', **kwargs):
        self.aws_conn = BaseHook.get_connection(aws_conn_id)
        self.pg_conn = BaseHook.get_connection(postgres_conn_id)
        credentials = self.aws_conn.get_credentials()
        kwargs['environment'] = dict(
            kwargs.pop('environment', {}),
            AWS_ACCESS_KEY=credentials.access_key,
            AWS_SECRET_KEY=credentials.secret_key,
            PG_DATABASE_URI=self.pg_conn.get_uri()
        )
        super(S3ToPostgresDockerOperator, self).__init__(**kwargs)
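With the credentials merged into the environment inside the constructor, the operator can be instantiated without any Jinja placeholders. A brief usage sketch (the task_id, image and EXTRA_VAR entry are made up for illustration):

java_unpack_csv = S3ToPostgresDockerOperator(
    task_id='java_unpack_csv',              # hypothetical task id
    image='my-registry/unpack-csv:latest',  # hypothetical Docker image
    aws_conn_id='aws_default',
    postgres_conn_id='postgres_default',
    environment={'EXTRA_VAR': 'value'},     # merged with the injected AWS_*/PG_DATABASE_URI variables
    dag=dag,
)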

Using airflow dag_run.conf inside custom operator

We created a custom Airflow operator based on EMRContainerOperator, and we need to make a decision based on a config passed via the Airflow UI.
My custom operator:
from airflow.providers.amazon.aws.operators.emr_containers import EMRContainerOperator
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence
from uuid import uuid4
from airflow.utils.decorators import apply_defaults

class EmrBatchProcessorOperator(EMRContainerOperator):
    template_fields: Sequence[str] = (
        "name",
        "virtual_cluster_id",
        "execution_role_arn",
        "release_label",
        "job_driver",
        "operation_type"
    )

    @apply_defaults
    def __init__(self, operation_type, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.operation_type = operation_type
        if self.operation_type == 'full':
            number_of_pods = 10
        else:
            number_of_pods = 5
        BASE_CONSUMER_DRIVER_ARG = {
            "sparkSubmitJobDriver": {
                "entryPoint": "s3://bucket/batch_processor_engine/batch-processor-engine_2.12-3.0.1_0.28.jar",
                "entryPointArguments": ["group_name=courier_api_group01"],
                "sparkSubmitParameters": f"--conf spark.executor.instances={number_of_pods} --conf spark.executor.memory=32G --conf spark.executor.cores=5 --conf spark.driver.cores=1 --conf spark.driver.memory=12G --conf spark.sql.broadcastTimeout=2000 --class TableProcessorWrapper"
            }
        }
        self.job_driver = BASE_CONSUMER_DRIVER_ARG
This is how I call my operator:
with DAG(
    dag_id="batch_processor_model_dag",
    schedule_interval="@daily",
    default_args=default_args,
    catchup=False
) as dag:
    start = DummyOperator(task_id='start', dag=dag)
    end = DummyOperator(task_id='end', dag=dag, trigger_rule='none_failed')
    base_consumer = EmrBatchProcessorOperator(
        task_id="base_consumer",
        virtual_cluster_id=VIRTUAL_CLUSTER_ID,
        execution_role_arn=JOB_ROLE_ARN,
        configuration_overrides=CONFIGURATION_OVERRIDES_ARG,
        release_label="emr-6.5.0-latest",
        job_driver={},
        name="pi.py",
        operation_type='{{ dag_run.conf["operation_type"] }}'
    )
    start >> base_consumer >> end
But this code doesn't work; I can't use the dag_run.conf value.
Could you help me?
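For context: template fields like operation_type are only rendered just before execute() runs, so inside __init__ the operator still sees the literal Jinja string rather than the dag_run.conf value. A minimal sketch of one possible workaround under that assumption, moving the pod-count decision into execute() (Spark parameters abbreviated from the question):

class EmrBatchProcessorOperator(EMRContainerOperator):
    # same template_fields as above so that "operation_type" gets rendered
    template_fields: Sequence[str] = (
        "name", "virtual_cluster_id", "execution_role_arn",
        "release_label", "job_driver", "operation_type"
    )

    def __init__(self, operation_type, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # still the raw '{{ dag_run.conf["operation_type"] }}' string at this point
        self.operation_type = operation_type

    def execute(self, context):
        # templates have been rendered by now, so this holds the value from the run config
        number_of_pods = 10 if self.operation_type == 'full' else 5
        self.job_driver = {
            "sparkSubmitJobDriver": {
                "entryPoint": "s3://bucket/batch_processor_engine/batch-processor-engine_2.12-3.0.1_0.28.jar",
                "entryPointArguments": ["group_name=courier_api_group01"],
                "sparkSubmitParameters": f"--conf spark.executor.instances={number_of_pods} --class TableProcessorWrapper"
            }
        }
        return super().execute(context)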

Airflow run tasks in parallel

I'm confused about how Airflow runs 2 tasks in parallel.
This is my DAG:
import datetime as dt
from airflow import DAG
import os
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.operators.dagrun_operator import TriggerDagRunOperator

scriptAirflow = '/home/alexw/scriptAirflow/'
uploadPath = '/apps/man-data/data/to_load/'
receiptPath = '/apps/man-data/data/to_receipt/'

def result():
    if(os.listdir(receiptPath)):
        for files in os.listdir(receiptPath):
            if files.startswith('MEM') and files.endswith('.csv'):
                return 'mem_script'
                pass
                print('Launching script for: '+files)
            elif files.startswith('FMS') and files.endswith('.csv'):
                return 'fms_script'
                pass
            else:
                pass
    else:
        print('No script to launch')
        return "no_script"
        pass

def onlyCsvFiles():
    if(os.listdir(uploadPath)):
        for files in os.listdir(uploadPath):
            if files.startswith('MEM') or files.startswith('FMS') and files.endswith('.csv'):
                return 'move_good_file'
            else:
                return 'move_bad_file'
    else:
        pass

default_args = {
    'owner': 'testingA',
    'start_date': dt.datetime(2020, 2, 17),
    'retries': 1,
}

dag = DAG('tryingAirflow', default_args=default_args, description='airflow20',
          schedule_interval=None, catchup=False)

file_sensor = FileSensor(
    task_id="file_sensor",
    filepath=uploadPath,
    fs_conn_id='airflow_db',
    poke_interval=10,
    dag=dag,
)
onlyCsvFiles = BranchPythonOperator(
    task_id='only_csv_files',
    python_callable=onlyCsvFiles,
    trigger_rule='none_failed',
    dag=dag,
)
move_good_file = BashOperator(
    task_id="move_good_file",
    bash_command='python3 '+scriptAirflow+'movingGoodFiles.py "{{ execution_date }}"',
    dag=dag,
)
move_bad_file = BashOperator(
    task_id="move_bad_file",
    bash_command='python3 '+scriptAirflow+'movingBadFiles.py "{{ execution_date }}"',
    dag=dag,
)
result_mv = BranchPythonOperator(
    task_id='result_mv',
    python_callable=result,
    trigger_rule='none_failed',
    dag=dag,
)
run_Mem_Script = BashOperator(
    task_id="mem_script",
    bash_command='python3 '+scriptAirflow+'memShScript.py "{{ execution_date }}"',
    dag=dag,
)
run_Fms_Script = BashOperator(
    task_id="fms_script",
    bash_command='python3 '+scriptAirflow+'fmsScript.py "{{ execution_date }}"',
    dag=dag,
)
skip_script = BashOperator(
    task_id="no_script",
    bash_command="echo No script to launch",
    dag=dag,
)
rerun_dag = TriggerDagRunOperator(
    task_id='rerun_dag',
    trigger_dag_id='tryingAirflow',
    trigger_rule='none_failed',
    dag=dag,
)

onlyCsvFiles.set_upstream(file_sensor)
move_good_file.set_upstream(onlyCsvFiles)
move_bad_file.set_upstream(onlyCsvFiles)
result_mv.set_upstream(move_good_file)
result_mv.set_upstream(move_bad_file)
run_Fms_Script.set_upstream(result_mv)
run_Mem_Script.set_upstream(result_mv)
skip_script.set_upstream(result_mv)
rerun_dag.set_upstream(run_Fms_Script)
rerun_dag.set_upstream(run_Mem_Script)
rerun_dag.set_upstream(skip_script)
When it comes to choosing the task in result(), if I need to run both scripts, it only executes one task and skips the other. I'd like to execute both tasks at the same time when necessary. The question is: how do I run tasks in parallel (or not, when it isn't needed) while using the BranchPythonOperator?
Thanks for the help!
If you want to be sure to run either both scripts or none, I would add a dummy task before the two tasks that need to run in parallel. Airflow always chooses exactly one branch to execute when you use the BranchPythonOperator.
I would make these changes:
# import the DummyOperator
from airflow.operators.dummy_operator import DummyOperator

# modify the returns of the function result()
def result():
    if(os.listdir(receiptPath)):
        for files in os.listdir(receiptPath):
            if (files.startswith('MEM') and files.endswith('.csv') or
                    files.startswith('FMS') and files.endswith('.csv')):
                return 'run_scripts'
    else:
        print('No script to launch')
        return "no_script"

# add the dummy task
run_scripts = DummyOperator(
    task_id="run_scripts",
    dag=dag
)

# add dependency
run_scripts.set_upstream(result_mv)

# CHANGE two of the dependencies to
run_Fms_Script.set_upstream(run_scripts)
run_Mem_Script.set_upstream(run_scripts)
I have to admit I have never worked with the LocalExecutor on parallel tasks, but this should make sure both tasks run whenever you want the scripts to be executed.
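As a side note on the parallelism itself: for the two branches to actually run at the same time, the executor and concurrency settings in airflow.cfg have to allow it. A minimal sketch of the relevant keys (the values are only examples, not recommendations):

[core]
executor = LocalExecutor    # the SequentialExecutor can only run one task at a time
parallelism = 32            # max task instances running across the whole installation
dag_concurrency = 16        # max task instances allowed to run per DAG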
EDIT:
If you want to run either none, one of the two, or both, I think the easiest way is to create another task that runs both scripts in parallel in bash (or at least starts them together with &). I would do something like this:
# import the DummyOperator
from airflow.operators.dummy_operator import DummyOperator

# modify the returns of the function result() so that it chooses between 4 different outcomes
def result():
    if(os.listdir(receiptPath)):
        mem_flag = False
        fms_flag = False
        for files in os.listdir(receiptPath):
            if (files.startswith('MEM') and files.endswith('.csv')):
                mem_flag = True
            if (files.startswith('FMS') and files.endswith('.csv')):
                fms_flag = True
        if mem_flag and fms_flag:
            return "both_scripts"
        elif mem_flag:
            return "mem_script"
        elif fms_flag:
            return "fms_script"
        else:
            return "no_script"
    else:
        print('No script to launch')
        return "no_script"

# add the 'run both scripts' task (task_id must match the "both_scripts" branch value)
run_both_scripts = BashOperator(
    task_id="both_scripts",
    bash_command='python3 '+scriptAirflow+'memShScript.py "{{ execution_date }}" & python3 '+scriptAirflow+'fmsScript.py "{{ execution_date }}" &',
    dag=dag,
)

# add dependency
run_both_scripts.set_upstream(result_mv)
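One caveat with the & approach: the bash command backgrounds both scripts and returns without checking their exit codes, so a failure in either script will not fail the Airflow task (and the task may even finish before the scripts do). A possible variation, changing only the bash_command, that waits for both and fails if either one fails:

run_both_scripts = BashOperator(
    task_id="both_scripts",
    bash_command=(
        'python3 ' + scriptAirflow + 'memShScript.py "{{ execution_date }}" & MEM_PID=$!; '
        'python3 ' + scriptAirflow + 'fmsScript.py "{{ execution_date }}" & FMS_PID=$!; '
        # wait for each background job, then combine the exit codes
        'wait $MEM_PID; MEM_RC=$?; wait $FMS_PID; FMS_RC=$?; '
        'exit $((MEM_RC != 0 || FMS_RC != 0))'
    ),
    dag=dag,
)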

Apache Airflow How to process multiple files in loop

Hi, I am trying to process multiple files using Apache Airflow. I tried different options, but ended up using TriggerDagRunOperator. So basically I have 2 DAGs: one is a scheduled DAG that checks for the file and kicks off the triggered DAG if the file is found. But I would like to repeat this for many files: check one file at a time and, if the file exists, add parameters and call the triggered DAG with them.
def conditionally_trigger(context, dag_run_obj):
    task_id = context['params']['task_id']
    task_instance = context['task_instance']
    file_type = task_instance.xcom_pull(task_id, key='file_type')
    if file_type is not None and file_type != "":
        dag_run_obj.payload = {'file_type': file_type, 'file_name': file_name, 'file_path': full_path}
        return dag_run_obj
    return None

trigger_dag_run_task = TriggerDagRunOperator(
    task_id='trigger_dag_run_task',
    trigger_dag_id="trigger_dag",
    python_callable=conditionally_trigger,
    params={'task_id': check_if_file_exists_task_id},
    dag=dag,
)

def execute_check_if_file_exists_task(*args, **kwargs):
    input_file_list = ["a", "b"]
    for item in input_file_list:
        full_path = json_data[item]['input_folder_path']
        directory = os.listdir(full_path)
        for files in directory:
            if not re.match(file_name, files):
                continue
            else:
                # true
                kwargs['ti'].xcom_push(key='file_type', value=item)
                return "trigger_dag_run_task"
    # false
    return "file_not_found_task"

def execute_file_not_found_task(*args, **kwargs):
    logging.info("File Not found path.")

file_not_found_task = PythonOperator(
    task_id='file_not_found_task',
    retries=3,
    provide_context=True,
    dag=dag,
    python_callable=execute_file_not_found_task,
    op_args=[])

check_if_file_exists_task = BranchPythonOperator(
    task_id='check_if_file_exists_task',
    retries=3,
    provide_context=True,
    dag=dag,
    python_callable=execute_check_if_file_exists_task,
    op_args=[])

check_if_file_exists_task.set_downstream(trigger_dag_run_task)
check_if_file_exists_task.set_downstream(file_not_found_task)
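For the "many files" part, one pattern in Airflow 1.x is to generate one trigger task per file type when the controller DAG is built, relying on the behaviour that a python_callable returning None means no DAG run is created. A rough sketch under that assumption (file_configs and conditionally_trigger_for are made up; file_name is the pattern variable used in the code above):

# hypothetical mapping of file types to the folders they arrive in
file_configs = {"a": "/data/input/a/", "b": "/data/input/b/"}

def conditionally_trigger_for(file_type):
    # build a trigger callable bound to one file type
    def _trigger(context, dag_run_obj):
        folder = file_configs[file_type]
        matches = [f for f in os.listdir(folder) if re.match(file_name, f)]
        if matches:
            dag_run_obj.payload = {'file_type': file_type,
                                   'file_name': matches[0],
                                   'file_path': folder}
            return dag_run_obj
        return None  # returning None skips triggering for this file type
    return _trigger

for item in file_configs:
    TriggerDagRunOperator(
        task_id='trigger_dag_run_{}'.format(item),
        trigger_dag_id='trigger_dag',
        python_callable=conditionally_trigger_for(item),
        dag=dag,
    )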

How to use MySqlOperator with xcom in Airflow?

I read How to use airflow xcoms with MySqlOperator, and while it has a similar title it doesn't really address my issue.
I have the following code:
def branch_func_is_new_records(**kwargs):
    ti = kwargs['ti']
    xcom = ti.xcom_pull(task_ids='query_get_max_order_id')
    string_to_print = 'Value in xcom is: {}'.format(xcom)
    logging.info(string_to_print)
    if int(xcom) > int(LAST_IMPORTED_ORDER_ID):
        return 'import_orders'
    else:
        return 'skip_operation'

query_get_max_order_id = 'SELECT COALESCE(max(orders_id),0) FROM warehouse.orders where orders_id>1 limit 10'

get_max_order_id = MySqlOperator(
    task_id='query_get_max_order_id',
    sql=query_get_max_order_id,
    mysql_conn_id=MyCon,
    xcom_push=True,
    dag=dag)

branch_op_is_new_records = BranchPythonOperator(
    task_id='branch_operation_is_new_records',
    provide_context=True,
    python_callable=branch_func_is_new_records,
    dag=dag)

get_max_order_id >> branch_op_is_new_records >> import_orders
branch_op_is_new_records >> skip_operation
The MySqlOperator returns a number, and based on that number the BranchPythonOperator chooses the next task. It's guaranteed that the MySqlOperator returns a value greater than 0.
My problem is that nothing is pushed to XCom by the MySqlOperator.
On the UI when I go to XCom I see nothing. The BranchPythonOperator obviously reads nothing, so my code fails.
Why doesn't XCom work here?
The MySQL operator currently (Airflow 1.10.0 at the time of writing) doesn't support returning anything in XCom, so the fix for now is to write a small operator yourself. You can do this directly in your DAG file (untested, so there may be silly errors):
from airflow.operators.mysql_operator import MySqlOperator as BaseMySqlOperator
from airflow.hooks.mysql_hook import MySqlHook

class ReturningMySqlOperator(BaseMySqlOperator):
    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        hook = MySqlHook(mysql_conn_id=self.mysql_conn_id,
                         schema=self.database)
        return hook.get_first(
            self.sql,
            parameters=self.parameters)

def branch_func_is_new_records(**kwargs):
    ti = kwargs['ti']
    xcom = ti.xcom_pull(task_ids='query_get_max_order_id')
    string_to_print = 'Value in xcom is: {}'.format(xcom)
    logging.info(string_to_print)
    if str(xcom) == 'NewRecords':
        return 'import_orders'
    else:
        return 'skip_operation'

query_get_max_order_id = 'SELECT COALESCE(max(orders_id),0) FROM warehouse.orders where orders_id>1 limit 10'

get_max_order_id = ReturningMySqlOperator(
    task_id='query_get_max_order_id',
    sql=query_get_max_order_id,
    mysql_conn_id=MyCon,
    # xcom_push=True,
    dag=dag)

branch_op_is_new_records = BranchPythonOperator(
    task_id='branch_operation_is_new_records',
    provide_context=True,
    python_callable=branch_func_is_new_records,
    dag=dag)

get_max_order_id >> branch_op_is_new_records >> import_orders
branch_op_is_new_records >> skip_operation
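One detail worth checking: MySqlHook.get_first returns the whole first row (for this query, a one-element tuple), so the XCom value is a row rather than a bare number. If you want to keep the original numeric comparison, the branch function could index the first column, roughly like this:

def branch_func_is_new_records(**kwargs):
    ti = kwargs['ti']
    row = ti.xcom_pull(task_ids='query_get_max_order_id')  # e.g. (12345,)
    max_order_id = int(row[0])                             # first (and only) column of the row
    logging.info('Value in xcom is: %s', max_order_id)
    if max_order_id > int(LAST_IMPORTED_ORDER_ID):
        return 'import_orders'
    return 'skip_operation'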
