I want to skip an ECSOperator task in Airflow. Basically I have two tasks:
CUSTOMER_CONFIGS = [
    {
        'customer_name': 'test',
        'start_date': 17  # day of the month on which you want to trigger task
    },
    {
        'customer_name': 'test',
        'start_date': 18  # day of the month on which you want to trigger task
    }
]

default_args = {
    'depends_on_past': False,
    'retries': 0
}

with DAG(
    dag_id='run-ecs-task',
    default_args=default_args,
    start_date=days_ago(1),
    schedule_interval='0 0 * * *',
    max_active_runs=1,
) as dag:

    current_day = datetime.now()
    current_day = current_day.strftime("%d")

    tasks = []
    for config in CUSTOMER_CONFIGS:
        task = ECSOperator(
            task_id=f'{config.get("customer_name")}',
            dag=dag,
            retries=AIRFLOW_ECS_OPERATOR_RETRIES,
            retry_delay=timedelta(seconds=10),
            **ecs_operator_args
        )

        if config.get('start_date') != current_day:
            task.state = State.SKIPPED

        tasks.append(task)
How can I skip the first ECS task on the basis of some condition?
Later I would like to run these tasks in sequence.
You didn't specify what the condition is, but in general you can use the ShortCircuitOperator. The ShortCircuitOperator is derived from the PythonOperator: it evaluates a condition and short-circuits the workflow if the condition is False.
from airflow.operators.python import ShortCircuitOperator


def condition():
    if 1 > 2:  # Replace with your condition
        return True
    return False


conditional_task = ShortCircuitOperator(
    task_id='condition',
    python_callable=condition
)

task = ECSOperator(...)
task2 = ECSOperator(...)

conditional_task >> task
task2
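To tie this back to your day-of-month condition, here is a minimal sketch (assuming CUSTOMER_CONFIGS, ecs_operator_args, and AIRFLOW_ECS_OPERATOR_RETRIES exist as in your snippet, and that customer_name values are distinct) with one ShortCircuitOperator gating each ECSOperator:

from airflow.operators.python import ShortCircuitOperator


def is_run_day(start_day, **context):
    # Run only when the DAG run's logical date falls on the configured day of the month
    return context['execution_date'].day == start_day


for config in CUSTOMER_CONFIGS:
    name = config['customer_name']

    gate = ShortCircuitOperator(
        task_id=f'check_{name}',
        python_callable=is_run_day,
        op_kwargs={'start_day': config['start_date']},
    )

    run_customer = ECSOperator(
        task_id=f'run_{name}',
        retries=AIRFLOW_ECS_OPERATOR_RETRIES,
        retry_delay=timedelta(seconds=10),
        **ecs_operator_args,  # assumed to exist, as in your snippet
    )

    gate >> run_customer

Note that when a ShortCircuitOperator returns False it skips everything downstream of it, so if you later chain the customers' ECS tasks in sequence, the later tasks will need an appropriate trigger_rule (or each gate kept on its own branch) to avoid being skipped as well.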
I have a problem with my DAG getting stuck at a subdag. The subdag is in the RUNNING state, but on zooming in, all of its tasks are in the None state.
I'm using Airflow 2.1.1 with the LocalExecutor.
Below is the main dag:
default_args = {
    'owner': 'airflow',
    'retries': 1,
    'depends_on_past': False
}

dag = DAG('loop_example',
          start_date=datetime(2022, 1, 1),
          schedule_interval=None,
          catchup=False,
          tags=['loop']
          )


## function to filter src_name based on a DB table/log file entry
def check_valid_src(src_name):
    hook = MySqlHook(mysql_conn_id='mysql_conn')
    sql = 'SELECT src_name FROM ingsted_src_log_table'
    myresult = hook.get_records(sql)
    valid_src_names = []
    for src in myresult:
        valid_src_names.append(src[0])
    if src_name in valid_src_names:
        return True
    else:
        return False


first = DummyOperator(task_id='first', dag=dag)
last = DummyOperator(task_id='last', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

for option in options:
    if check_valid_src(option):
        t = SubDagOperator(task_id=f'section_{option}',
                           subdag=subdag('loop_example', f'section_{option}', default_args, option),
                           dag=dag
                           )
        first >> t >> last
subdag code:
def subdag(parent_dag_name, child_dag_name, args, option):
    dag_subdag = DAG(
        dag_id=f'{parent_dag_name}.{child_dag_name}',
        default_args=args,
        start_date=datetime(2022, 1, 1),
        schedule_interval=None,
    )

    t1 = BashOperator(
        task_id='Echo_source_name',
        bash_command=f'echo {option}',
        default_args=args,
        dag=dag_subdag
    )

    t2 = BashOperator(
        task_id='Echo_source_number',
        bash_command=f'echo "{option}" | cut -d "_" -f2',
        default_args=args,
        dag=dag_subdag,
    )

    t1 >> t2
    return dag_subdag
Earlier the start_date of the main DAG and the subdag were not the same, so I tried running again with the same start_date, but it still gets stuck.
Is there anything I am missing here?
You have to pass is_paused_upon_creation=False to the subdag:
dag_subdag = DAG(
    dag_id=f'{parent_dag_name}.{child_dag_name}',
    default_args=args,
    start_date=datetime(2022, 1, 1),
    schedule_interval=None,
    is_paused_upon_creation=False
)
I have a DAG that copies data to S3 using PySpark, like the one below:
...
bucket = 'my.bucket'
schema = 'my_schema'
table = 'my_table'
ymd = pendulum.parse('{{ execution_date }}').strftime('%Y%m%d')
spark_script = 'my_spark_script'

DEFAULT_ARGS = {
    'owner': 'burgerphilia',
    'start_date': '2020-09-01',
    'on_failure_callback': alert.slack_fail_alert,
    'depends_on_past': False
}

SPARK_STEPS = [
    {
        'Name': f'{schema}_{table}_step',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                'sudo',
                'spark-submit',
                ...
                f's3://{bucket}/spark-script/{spark_script}.py',
                '--ymd',
                f'{ymd}'
            ]
        }
    }
]


def delete_s3_object(bucket, schema, table, ymd):
    """
    :param bucket: bucket name
    :type bucket: str
    :param schema: schema name (the same as the Hive schema)
    :type schema: str
    :param table: table name (the same as the Hive table)
    :type table: str
    :param ymd: date to delete, '%Y%m%d' format
    :type ymd: str
    """
    aws_hook = AwsHook(aws_conn_id='aws_conn')
    session = aws_hook.get_session(region_name='ap-northeast-2')
    s3 = session.resource('s3')
    bucket = s3.Bucket(bucket)
    bucket.objects.filter(Prefix=f'{schema}/{table}/ymd={ymd}/').delete()


with DAG(
    dag_id=f'{schema}_{table}',
    default_args=DEFAULT_ARGS,
    catchup=False,
    schedule_interval="40 06 * * *"
) as dag:

    object_cleaner = PythonOperator(
        task_id='delete_object',
        python_callable=delete_s3_object,
        op_kwargs={'bucket': bucket, 'schema': schema, 'table': table, 'ymd': ymd}
    )

    step_adder = EmrAddStepsOperator(
        task_id='add_step',
        job_flow_id=job_flow_id,
        aws_conn_id='aws_conn',
        steps=SPARK_STEPS,
    )

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id=job_flow_id,
        step_id="{{ task_instance.xcom_pull(task_ids='add_step', key='return_value')[0] }}",
        aws_conn_id='aws_conn',
    )

    object_cleaner >> step_adder >> step_checker
This DAG runs on a daily basis, but the data source (an Oracle DB) is sometimes updated after the fact. So I need to re-run the same DAG every Monday and on the first day of each month to refresh the previous period (e.g. on 2020/11/02, re-run 2020/10/26 ~ 2020/11/01). What is the best way to handle this?
There is no direct way to do it. You can try two things:
Use dynamic DAG generation (https://www.astronomer.io/guides/dynamically-generating-dags/) to create two DAGs with different schedule_interval values.
Create another DAG which triggers this DAG on a different schedule_interval (see the sketch below).
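For option 2, a minimal sketch against the Airflow 2 TriggerDagRunOperator, assuming the daily DAG's dag_id (f'{schema}_{table}') resolves to 'my_schema_my_table' and that a Monday run should re-process the previous seven days; the DAG and task names here are hypothetical:

from datetime import datetime

from airflow import DAG
from airflow.operators.trigger_dagrun import TriggerDagRunOperator

with DAG(
    dag_id='weekly_retrigger',              # hypothetical name
    start_date=datetime(2020, 11, 2),
    schedule_interval='0 7 * * 1',          # every Monday, after the daily 06:40 run
    catchup=False,
) as dag:

    previous = None
    for days_back in range(1, 8):
        retrigger = TriggerDagRunOperator(
            task_id=f'retrigger_minus_{days_back}d',
            trigger_dag_id='my_schema_my_table',   # the existing daily DAG
            # Renders to e.g. {{ macros.ds_add(ds, -1) }}, i.e. one of the previous week's dates
            execution_date=f"{{{{ macros.ds_add(ds, -{days_back}) }}}}",
            # Clear and re-run the existing run for that date instead of creating a duplicate
            reset_dag_run=True,
        )
        # Chain the triggers so the daily DAG is re-run one date at a time
        if previous:
            previous >> retrigger
        previous = retrigger

The first-of-month re-run could be a second, similar DAG scheduled with '0 7 1 * *'.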
task a > task b > task c
If C fails I want to retry A. Is this possible? There are a few other tickets which involve subdags, but I would like to just be able to clear A.
I'm hoping to use on_retry_callback in task C but I don't know how to call task A.
There is another question which does this in a subdag, but I am not using subdags.
I'm trying to do this, but it doesn't seem to work:
def callback_for_failures(context):
    print("*** retrying ***")
    if context['task'].upstream_list:
        context['task'].upstream_list[0].clear()
As other comments mentioned, be careful that you don't end up in an endless loop of clearing and retries. That said, you can call a bash command as part of your on_failure_callback and specify which tasks you want to clear, and whether you want downstream/upstream tasks cleared as well.
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta


def clear_upstream_task(context):
    execution_date = context.get("execution_date")
    clear_tasks = BashOperator(
        task_id='clear_tasks',
        bash_command=f'airflow tasks clear -s {execution_date} -t t1 -d -y clear_upstream_task'
    )
    return clear_tasks.execute(context=context)


# Default settings applied to all tasks
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=5)
}

with DAG('clear_upstream_task',
         start_date=datetime(2021, 1, 1),
         max_active_runs=3,
         schedule_interval=timedelta(minutes=5),
         default_args=default_args,
         catchup=False
         ) as dag:

    t0 = DummyOperator(
        task_id='t0'
    )

    t1 = DummyOperator(
        task_id='t1'
    )

    t2 = DummyOperator(
        task_id='t2'
    )

    t3 = BashOperator(
        task_id='t3',
        bash_command='exit 123',
        on_failure_callback=clear_upstream_task
    )

    t0 >> t1 >> t2 >> t3
Suppose I have the following DAG (with basic placeholder functions) that uses a for-loop to dynamically generate tasks by iterating over a list:
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'ETLUSER',
    'depends_on_past': False,
    'start_date': datetime(2019, 12, 16, 0, 0, 0),
    'email': ['xxx#xxx.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('xxx', catchup=False,
          default_args=default_args, schedule_interval='0 */4 * * *')


# Some dummy functions
def StepOne(x):
    print(x)


def StepTwo():
    print("Okay, we finished all of Step 1.")


some_list = [1, 2, 3, 4, 5, 6]

for t in some_list:
    task_id = f'FirstStep_{t}'
    task = PythonOperator(
        task_id=task_id,
        python_callable=StepOne,
        provide_context=False,
        op_kwargs={'x': str(t)},
        dag=dag
    )
    task
I want to introduce some additional task that's simply:
task2 = PythonOperator(
    task_id="SecondStep",
    python_callable=StepTwo,
    provide_context=False,
    dag=dag
)
It should run only after all of the first-step tasks have finished. Linearly, this would be task >> task2.
How do I go about doing this?
You can define task dependencies with a list.
Run taskC after both taskA and taskB have finished:
[taskA, taskB] >> taskC
or run taskB and taskC in parallel after taskA has finished:
taskA >> [taskB, taskC]
This works as long as one side of the dependency (upstream or downstream) is not a list.
Thus, for your example:
task1 = []
for t in some_list:
    task_id = f'FirstStep_{t}'
    task1.append(PythonOperator(
        task_id=task_id,
        python_callable=StepOne,
        provide_context=False,
        op_kwargs={'x': str(t)},
        dag=dag))

task2 = PythonOperator(
    task_id="SecondStep",
    python_callable=StepTwo,
    provide_context=False,
    dag=dag)

task1 >> task2
If my DAG is this
[setup] -> [processing-task] -> [end].
How can I schedule this DAG to run periodically, while running [setup] task only once (on first scheduled run) and skipping it for all later runs?
Check out this post on Medium, which describes how to implement a "run once" operator. I have successfully used this several times.
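For reference, a minimal sketch of the same general idea (not necessarily the linked post's exact implementation) is to make the setup task guard itself with an Airflow Variable flag; the Variable key and task names below are hypothetical:

from airflow.models import Variable
from airflow.operators.python import PythonOperator

SETUP_FLAG = 'my_dag_setup_done'   # hypothetical Variable key


def run_setup_once(**_):
    # If the flag already exists, a previous run did the setup; do nothing.
    if Variable.get(SETUP_FLAG, default_var=None) is not None:
        print('Setup already done, skipping.')
        return
    # ... one-time setup work goes here ...
    Variable.set(SETUP_FLAG, 'done')


setup = PythonOperator(
    task_id='setup',
    python_callable=run_setup_once,
)
# setup >> processing_task >> end, as in the question

With this approach the setup task is a no-op (but still marked success) on later runs; if you need it marked as skipped instead, a branching approach like the one in the next answer does that.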
Here is a way to do it without the need to create a new class. I found this simpler than the accepted answer, and it worked well for my use case.
It might be useful for others!
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import BranchPythonOperator

with DAG(
    dag_id='your_dag_id',
    default_args={
        'depends_on_past': False,
        'email': ['you#email.com'],
        'email_on_failure': True,
        'email_on_retry': False,
        'retries': 1,
        'retry_delay': timedelta(minutes=5),
    },
    description='Dag with initial setup task that only runs on start_date',
    start_date=datetime(2000, 1, 1),
    # Runs daily at 1 am
    schedule_interval='0 1 * * *',
    # catchup must be true if start_date is before datetime.now()
    catchup=True,
    max_active_runs=1,
) as dag:

    def branch_fn(**kwargs):
        # Have to make sure start_date will equal data_interval_start on first run
        # This dag is daily, but since the schedule_interval is set to 1 am,
        # data_interval_start would be 2000-01-01 01:00:00 when it needs to be
        # 2000-01-01 00:00:00
        date = kwargs['data_interval_start'].replace(hour=0, minute=0, second=0, microsecond=0)
        if date == dag.start_date:
            return 'initial_task'
        else:
            return 'skip_initial_task'

    branch_task = BranchPythonOperator(
        task_id='branch_task',
        python_callable=branch_fn,
        provide_context=True
    )

    initial_task = DummyOperator(
        task_id="initial_task"
    )

    skip_initial_task = DummyOperator(
        task_id="skip_initial_task"
    )

    next_task = DummyOperator(
        task_id="next_task",
        # This is important, otherwise next_task would be skipped
        trigger_rule="one_success"
    )

    branch_task >> [initial_task, skip_initial_task] >> next_task