Apache Airflow get execution time difference between two DAG runs - airflow

I'm trying to figure out how to use the execution times of two DAG runs to get their time difference, to prevent the DAG from processing the same file multiple times. Any idea how I could check this?

You could use croniter, a library packaged with Airflow, to do this:
>>> from airflow import DAG
>>> from datetime import timedelta, datetime
>>> from croniter import croniter
>>>
>>> dag_a = DAG(
...     dag_id="A",
...     schedule_interval="00 10 * * *",
... )
>>>
>>> dag_b = DAG(
...     dag_id="B",
...     schedule_interval="00 14 * * *",
... )
>>>
>>> now = datetime.now()
>>> cron_a = croniter(dag_a.schedule_interval, now)
>>> cron_b = croniter(dag_b.schedule_interval, now)
>>> time_difference = cron_b.get_prev(datetime) - cron_a.get_prev(datetime)
>>> time_difference
datetime.timedelta(0, 14400)
>>> time_difference.seconds
14400 # 4 hours
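The same approach also works for the gap between consecutive runs of a single schedule: call get_prev twice to get the two most recent scheduled run times and subtract them. A minimal sketch (the cron expression is a placeholder for your DAG's schedule_interval):
>>> from datetime import datetime
>>> from croniter import croniter
>>>
>>> cron = croniter("0 10 * * *", datetime.now())  # placeholder daily schedule
>>> last_run = cron.get_prev(datetime)       # most recent scheduled run before now
>>> previous_run = cron.get_prev(datetime)   # the run before that
>>> gap = last_run - previous_run
>>> gap.total_seconds()
86400.0  # 24 hours for a daily schedule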

Related

Airflow dynamically generated tasks not running in order

I have created a DAG that generates tasks dynamically. The tasks are generated correctly, but they are not triggered in order and do not run consistently. I have noticed that they are triggered in alphanumeric order.
Looking at the run_modification_ tasks: I generated tasks 0 to 29, and they are triggered in the following order.
run_modification_0
run_modification_1
run_modification_10
run_modification_11
run_modification_12
run_modification_13
run_modification_14
run_modification_15
run_modification_16
run_modification_17
run_modification_18
run_modification_19
run_modification_2
run_modification_21
run_modification_23....
But I need them to run in task order, like:
run_modification_0
run_modification_1
run_modification_2
run_modification_3
run_modification_4
run_modification_5..
Please help me run these tasks in the order they were created.
from datetime import date, timedelta, datetime
from airflow.utils.dates import days_ago
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.postgres_operator import PostgresOperator
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import Variable
import os

args = {
    'owner': 'Airflow',
    'start_date': days_ago(2),
}

dag = DAG(
    dag_id='tastOrder',
    default_args=args,
    schedule_interval=None,
    tags=['task']
)

modification_processXcom = """ cd {{ ti.xcom_pull(task_ids='run_modification_'+params.i, key='taskDateFolder') }} """

def modificationProcess(ds, **kwargs):
    today = datetime.strptime('2021-01-01', '%Y-%m-%d').date()
    i = str(kwargs['i'])
    newDate = today - timedelta(days=int(i))
    print(str(newDate))
    kwargs["ti"].xcom_push("taskDateFolder", str(newDate))

def getDays():
    today = datetime.strptime('2021-01-01', '%Y-%m-%d').date()
    yesterday = today - timedelta(days=30)
    day_Diff = today - yesterday
    return day_Diff, today

day_Diff, today = getDays()

for i in reversed(range(0, day_Diff.days)):
    run_modification = PythonOperator(
        task_id='run_modification_' + str(i),
        provide_context=True,
        python_callable=modificationProcess,
        op_kwargs={'i': str(i)},
        dag=dag,
    )
    # use a separate variable name so the bash_command template string above is not overwritten
    modification_processXcom_task = BashOperator(
        task_id='modification_processXcom_' + str(i),
        bash_command=modification_processXcom,
        params={'i': str(i)},
        dag=dag,
    )
    run_modification >> modification_processXcom_task
To get the dependency as:
run_modification_1 -> modification_processXcom_1 ->
run_modification_2 -> modification_processXcom_2 -> ... ->
run_modification_29 -> modification_processXcom_29
You can do:
from datetime import datetime
from airflow import DAG
from airflow.operators.bash import BashOperator

dag = DAG(
    dag_id='my_dag',
    schedule_interval=None,
    start_date=datetime(2021, 8, 10),
    catchup=False,
    is_paused_upon_creation=False,
)

mylist1 = []
mylist2 = []

for i in range(1, 30):
    mylist1.append(
        BashOperator(  # Replace with your requested operator
            task_id=f'run_modification_{i}',
            bash_command=f"""echo executing run_modification_{i}""",
            dag=dag,
        )
    )
    mylist2.append(
        BashOperator(  # Replace with your requested operator
            task_id=f'modification_processXcom_{i}',
            bash_command=f"""echo executing modification_processXcom_{i}""",
            dag=dag,
        )
    )
    if len(mylist1) > 0:
        mylist1[-1] >> mylist2[-1]  # sets the dependency run_modification_i -> modification_processXcom_i
    if len(mylist1) > 1:
        mylist2[-2] >> mylist1[-1]  # sets the dependency modification_processXcom_(i-1) -> run_modification_i
This code creates the two lists of operators and sets them to run one after another:
Tree view:
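As a side note, the same interleaved chain can also be expressed with Airflow's chain helper (airflow.models.baseoperator.chain in Airflow 2.x); this sketch assumes the mylist1 and mylist2 lists built in the snippet above:
from airflow.models.baseoperator import chain

# Interleave the two lists and chain them:
# run_modification_1 >> modification_processXcom_1 >> run_modification_2 >> ...
interleaved = []
for run_task, xcom_task in zip(mylist1, mylist2):
    interleaved.extend([run_task, xcom_task])
chain(*interleaved)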

How to trigger a task in Airflow if the immediate parent task fails?

What I am mainly aiming for is that restore_denormalized_es_data should only be triggered when the load_denormalized_es_data task fails. If load_denormalized_es_data succeeds, the flow should go straight to END. As you can see, my restore currently also runs when archive fails and load is skipped or retrying, so I am getting the wrong behaviour.
Here is the code I am using:
import sys
import os
from datetime import datetime
# import the files you want to import
# Airflow level imports
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.utils.trigger_rule import TriggerRule
# Imported all the functions and the code is able to call the functions with ease

# Name of the DAG
DAG_NAME = "DAG"

# Default arguments
default_args = {
    "owner": "Mehul",
    "start_date": datetime.today().strftime("%Y-%m-%d"),
    "provide_context": True
}

# Define the dag object
dag = DAG(
    DAG_NAME,
    default_args=default_args,
    schedule_interval=None
)

archive_denormalized_es_data = PythonOperator(
    task_id="archive_denormalized_es_data",
    python_callable=archive_current_ES_data,
    trigger_rule=TriggerRule.ALL_SUCCESS,
    provide_context=False,
    dag=dag
)

load_denormalized_es_data = PythonOperator(
    task_id="load_denormalized_es_data",
    python_callable=es_load,
    provide_context=False,
    trigger_rule=TriggerRule.ALL_SUCCESS,
    dag=dag
)

restore_denormalized_es_data = PythonOperator(
    task_id="restore_denormalized_es_data",
    python_callable=restore_current_ES_data,
    trigger_rule=TriggerRule.ALL_FAILED,
    provide_context=False,
    dag=dag
)

END = DummyOperator(
    task_id="END",
    trigger_rule=TriggerRule.ALL_SUCCESS,
    dag=dag
)

denormalized_data_creation >> archive_denormalized_es_data >> load_denormalized_es_data
load_denormalized_es_data << archive_denormalized_es_data << denormalized_data_creation
load_denormalized_es_data >> restore_denormalized_es_data
restore_denormalized_es_data << load_denormalized_es_data
load_denormalized_es_data >> END
END << load_denormalized_es_data
restore_denormalized_es_data >> END
END << restore_denormalized_es_data
Here is a picture of the pipeline referred to above.
If I understand correctly, you want to skip the rest of the pipeline if A fails.
ShortCircuitOperator will allow Airflow to short circuit (skip) the rest of the pipeline.
Here is an example that does what you outlined.
from datetime import datetime
from airflow.models import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator, ShortCircuitOperator
from airflow.utils.trigger_rule import TriggerRule
from airflow.utils.state import State

def proceed(**context):
    # Skip the rest of the pipeline if the archive task failed
    ti = context['dag_run'].get_task_instance(a.task_id)
    if ti.state == State.FAILED:
        return False
    else:
        return True

dag = DAG(
    dag_id="dag",
    start_date=datetime(2021, 4, 5),
    schedule_interval='@once',
)

with dag:
    a = PythonOperator(
        task_id='archive_denormalized_es_data',
        python_callable=lambda: 1
    )
    gate = ShortCircuitOperator(
        task_id='gate',
        python_callable=proceed,
        trigger_rule=TriggerRule.ALL_DONE
    )
    b = PythonOperator(
        task_id='load_denormalized_es_data',
        python_callable=lambda: 1
    )
    c = DummyOperator(
        task_id='restore_denormalized_es_data',
        trigger_rule=TriggerRule.ALL_FAILED
    )
    d = DummyOperator(
        task_id='END',
        trigger_rule=TriggerRule.ONE_SUCCESS
    )
    a >> gate >> b >> c
    [b, c] >> d
If archive_denormalized_es_data fails, the rest of the pipeline is skipped, meaning Airflow does not run restore_denormalized_es_data.
If load_denormalized_es_data fails, restore_denormalized_es_data runs and the flow continues to END.
If load_denormalized_es_data succeeds, restore_denormalized_es_data is skipped and the flow continues to END.
Your code is essentially missing the logic to skip the rest of the pipeline when archive_denormalized_es_data fails, which the ShortCircuitOperator takes care of for you.
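To see the ShortCircuitOperator behaviour in isolation, here is a minimal standalone sketch (the DAG id and tasks are placeholders, not part of the answer above): when the callable returns a falsy value, all downstream tasks are marked skipped instead of run.
from datetime import datetime
from airflow.models import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import ShortCircuitOperator

with DAG(dag_id="short_circuit_demo", start_date=datetime(2021, 4, 5), schedule_interval=None) as demo_dag:
    check = ShortCircuitOperator(
        task_id="check",
        python_callable=lambda: False,  # falsy return value -> downstream tasks are skipped
    )
    downstream = DummyOperator(task_id="downstream")
    check >> downstream  # "downstream" ends up in the skipped state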

How to set an airflow cron schedule to run once every 72 hours?

0 */3 * * * runs several times every day, and 0 0 */3 * * resets when the month changes.
I want it to run once every 72 hours.
Use a timedelta instead of a cron expression:
'schedule_interval': timedelta(hours=72)
For testing, I executed a sample DAG with the code below; the outcome is shown in the screenshot.
import datetime as dt
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.dates import days_ago

dag = DAG(
    dag_id='test_latest_only',
    schedule_interval=dt.timedelta(minutes=4),
    start_date=dt.datetime(2020, 12, 25),
    tags=['example'],
    catchup=False
)

latest_only = DummyOperator(task_id='latest_only', dag=dag)
task1 = DummyOperator(task_id='task1', dag=dag)

latest_only >> task1
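Applied to the original 72-hour requirement, a minimal sketch (the DAG id, start date, and task are placeholders):
from datetime import datetime, timedelta
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator

# Run once every 72 hours by passing a timedelta instead of a cron expression.
dag = DAG(
    dag_id='every_72_hours',                # placeholder DAG id
    schedule_interval=timedelta(hours=72),
    start_date=datetime(2021, 1, 1),        # placeholder start date
    catchup=False,
)

task = DummyOperator(task_id='task', dag=dag)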

How do I pass the task_id name dynamically in dynamic DAG creation?

Below is the dynamic DAG creation for each table. I need to pass the table name to the load_table task, so that the task shows up as load_table_A in DAG edw_table_A and load_table_B in DAG edw_table_B.
import datetime
import os
from functools import partial
from datetime import timedelta
from airflow.models import DAG, Variable
from airflow.contrib.operators.snowflake_operator import SnowflakeOperator
from alerts.email_operator import dag_failure_email

def get_db_dag(
    *,
    dag_id,
    start_date,
    schedule_interval,
    max_taskrun,
    max_dagrun,
    proc_nm,
    load_sql
):
    default_args = {
        'owner': 'airflow',
        'start_date': start_date,
        'provide_context': True,
        'execution_timeout': timedelta(minutes=max_taskrun),
        'email_on_retry': False,
    }

    dag = DAG(
        dag_id=dag_id,
        schedule_interval=schedule_interval,
        dagrun_timeout=timedelta(hours=max_dagrun),
        template_searchpath=tmpl_search_path,
        default_args=default_args,
        max_active_runs=1,
        catchup='{{var.value.dag_catchup}}',
        on_failure_callback='email',
    )

    load_table = SnowflakeOperator(
        task_id='load_table',
        sql=load_sql,
        snowflake_conn_id=CONN_ID,
        autocommit=True,
        dag=dag,
    )

    load_table

    return dag

# ======== DAG DEFINITIONS ========
edw_table_A = get_db_dag(
    dag_id='edw_table_A',
    start_date=datetime.datetime(2020, 5, 21),
    schedule_interval='0 5 * * *',
    max_taskrun=3,  # Minutes
    max_dagrun=1,   # Hours
    load_sql='extract_A.sql',
)

edw_table_B = get_db_dag(
    dag_id='edw_table_B',
    start_date=datetime.datetime(2020, 5, 21),
    schedule_interval='0 5 * * *',
    max_taskrun=3,  # Minutes
    max_dagrun=1,   # Hours
    load_sql='extract_B.sql',
)
For one, since you are already generating a separate DAG per table, adding the table name to the task_id is not strictly required.
But of course, if you want to do it, you can do it with simple Python string interpolation by adding a table_name param to your get_db_dag(..) function:
def get_db_dag(
    *,                 # everything after '*' must be passed as a keyword argument
    table_name,        # replace the dag_id param with just a table_name param
    start_date,
    schedule_interval,
    max_taskrun,
    max_dagrun,
    proc_nm            # remove the load_sql param too (it is also redundant)
):
    ..
    dag = DAG(
        dag_id=f"edw_table_{table_name}",  # python 3+ string-interpolation
        schedule_interval=schedule_interval,
        ..
    )
    load_table = SnowflakeOperator(
        task_id=f"load_table_{table_name}",  # python 3+ string-interpolation
        sql=f"extract_{table_name}.sql",
        snowflake_conn_id=CONN_ID,
        ..
    )
    load_table  # this bare reference does nothing and can be removed
    return dag
Then you can call the above function as
edw_table_A = get_db_dag(
    table_name='A',
    start_date=datetime.datetime(2020, 5, 21),
    schedule_interval='0 5 * * *',
    max_taskrun=3,  # Minutes
    max_dagrun=1,   # Hours
)
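If you later want to generate the same DAG for a whole list of tables, a common pattern is to register each generated DAG at module level via globals() so the scheduler can discover it. A minimal sketch, assuming the get_db_dag signature above (the proc_nm value here is a placeholder):
for table_name in ['A', 'B']:
    generated_dag = get_db_dag(
        table_name=table_name,
        start_date=datetime.datetime(2020, 5, 21),
        schedule_interval='0 5 * * *',
        max_taskrun=3,   # Minutes
        max_dagrun=1,    # Hours
        proc_nm='load',  # placeholder value for the remaining required param
    )
    # Airflow only discovers DAG objects reachable at module level,
    # so expose each generated DAG under its dag_id.
    globals()[generated_dag.dag_id] = generated_dag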

Airflow running but nothing happening

My Airflow DAG shows that it is running, but nothing is happening. I don't understand what's wrong; I've spent a lot of time on this.
The first DAG in the code should set an Airflow Variable, and the second DAG should read this Variable and run tasks based on its values. The second DAG is scheduled 15 minutes after the first DAG, but it always just shows as running.
Can anybody help? Thank you.
Airflow version 1.9.0
import datetime as dt
from airflow import models
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.operators import python_operator
from utils.etl_fail_slack_alert import task_fail_slack_alert
from airflow.contrib.hooks.bigquery_hook import BigQueryHook
from airflow.operators.dummy_operator import DummyOperator
from airflow.models import Variable

project = 'asd'
source_dataset = 'dfghdfh'
destination_dataset = 'dfhdfhdf'
table_prefix = ''

default_args = {
    'start_date': '2019-09-10',
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=2),
    'on_failure_callback': task_fail_slack_alert,
}

def set_views_av():
    bq_hook = BigQueryHook(bigquery_conn_id='bigquery_default',
                           delegate_to=None, use_legacy_sql=False)
    query = ("SELECT table_id FROM `{project}.{dataset}.{table}`;".format(
        project=project, dataset=source_dataset, table='__TABLES__'))
    df = bq_hook.get_pandas_df(sql=query, dialect='standard')
    view_names = df['table_id'].tolist()
    Variable.set('view_list', '|'.join(view_names))

def bq_operator(vname, dag):
    sql = ("SELECT * FROM `{project}.{dataset}.{table}`".format(
        project=project, dataset=source_dataset, table=vname))
    materialize_view_bq = BigQueryOperator(
        bql=sql,
        destination_dataset_table=project + "." + destination_dataset + "." + table_prefix + vname,
        task_id="materialize_" + vname,
        bigquery_conn_id="bigquery_default",
        google_cloud_storage_conn_id="google_cloud_default",
        use_legacy_sql=False,
        write_disposition="WRITE_TRUNCATE",
        create_disposition="CREATE_IF_NEEDED",
        query_params={},
        allow_large_results=True,
        dag=dag
    )
    return materialize_view_bq

with models.DAG(dag_id="materialize_init_views", default_args=default_args,
                schedule_interval="15 09 * * *", catchup=True) as dag_init:
    bridge = DummyOperator(
        task_id='bridge',
        dag=dag_init
    )
    set_views = python_operator.PythonOperator(
        task_id="set_views",
        python_callable=set_views_av
    )
    bridge >> set_views

with models.DAG(dag_id="materialize_views_dynamic", default_args=default_args,
                schedule_interval="30 09 * * *", catchup=True) as dag:
    views = Variable.get("view_list").split("|")
    bridge = DummyOperator(
        task_id='bridge',
        dag=dag
    )
    for vname in views:
        materialize_view_bq = bq_operator(vname, dag)
        bridge >> materialize_view_bq
