I have a DAG that copies data to S3 with PySpark, like below:
...
bucket = 'my.bucket'
schema = 'my_schema'
table = 'my_table'
ymd = pendulum.parse('{{ execution_date }}').strftime('%Y%m%d')
spark_script = 'my_spark_script'
DEFAULT_ARGS = {
'owner': 'burgerphilia',
'start_date': '2020-09-01',
'on_failure_callback': alert.slack_fail_alert,
'depends_on_past': False
}
SPARK_STEPS = [
{
'Name': f'{schema}_{table}_step',
'ActionOnFailure': 'CONTINUE',
'HadoopJarStep': {
'Jar': 'command-runner.jar',
'Args': [
'sudo',
'spark-submit',
...
f's3://{bucket}/spark-script/{spark_script}.py',
'--ymd',
f'{ymd}'
]
}
}
]
def delete_s3_object(bucket, schema, table, ymd):
"""
:param bucket: bucket name
:type bucket: str
:param schema: schema name (same as the Hive schema)
:type schema: str
:param table: table name (same as the Hive table)
:type table: str
:param ymd: date to delete, '%Y%m%d' format
:type ymd: str
"""
aws_hook = AwsHook(aws_conn_id='aws_conn')
session = aws_hook.get_session(region_name='ap-northeast-2')
s3 = session.resource('s3')
bucket = s3.Bucket(bucket)
bucket.objects.filter(Prefix=f'{schema}/{table}/ymd={ymd}/').delete()
with DAG(
dag_id=f'{schema}_{table}',
default_args=DEFAULT_ARGS,
catchup=False,
schedule_interval="40 06 * * *"
) as dag:
object_cleaner = PythonOperator(
task_id = 'delete_object',
python_callable=delete_s3_object,
op_kwargs={'bucket': bucket, 'schema': schema, 'table': table, 'ymd': ymd}
)
step_adder = EmrAddStepsOperator(
task_id='add_step',
job_flow_id=job_flow_id,
aws_conn_id='aws_conn',
steps=SPARK_STEPS,
)
step_checker = EmrStepSensor(
task_id='watch_step',
job_flow_id=job_flow_id,
step_id="{{ task_instance.xcom_pull(task_ids='add_step', key='return_value')[0] }}",
aws_conn_id='aws_conn',
)
object_cleaner >> step_adder >> step_checker
This DAG runs daily, but the data source (an Oracle DB) is sometimes updated retroactively, so every Monday and on the first day of each month I need to re-run the same DAG for the preceding dates (e.g. on 2020/11/02, re-run 2020/10/26 ~ 2020/11/01). What is the best way to handle this?
There is no direct way to do it. You can try two things:
Use dynamic DAG generation (https://www.astronomer.io/guides/dynamically-generating-dags/) to create two DAGs with different schedule_intervals.
Create another DAG that triggers this DAG on a different schedule_interval; a sketch of this option is below.
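A minimal sketch of the second option, assuming the daily DAG above is registered as my_schema_my_table and that re-triggering a past execution date should replace its run (this covers only the weekly case; the first-of-month case would need a second, similar DAG with a monthly schedule). The dag_id, offsets and schedule below are illustrative, not your exact setup:
import pendulum
from airflow import DAG
from airflow.operators.trigger_dagrun import TriggerDagRunOperator

# Hypothetical weekly "re-run" DAG: it fires every Monday and re-triggers the
# daily DAG once per execution date of the previous week. With classic
# execution_date semantics, ds of this weekly run is the Monday that starts the
# interval being re-processed, so offsets 0..6 cover Mon..Sun; adjust the
# offsets (and the time component) to your own scheduling semantics.
with DAG(
    dag_id='my_schema_my_table_weekly_rerun',   # illustrative name
    start_date=pendulum.datetime(2020, 9, 1, tz='UTC'),
    schedule_interval='40 06 * * 1',            # Mondays, same time as the daily DAG
    catchup=False,
) as rerun_dag:
    for offset in range(7):
        TriggerDagRunOperator(
            task_id=f'rerun_day_{offset}',
            trigger_dag_id='my_schema_my_table',                      # the daily DAG above
            execution_date=f"{{{{ macros.ds_add(ds, {offset}) }}}}",  # templated
            reset_dag_run=True,   # replace an existing run for that date if present
        )
If you go with the first option instead, the same DAG factory can simply be called twice with different dag_ids and schedule_intervals.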
In my actual DAG, I need to first get a list of IDs and then for each ID run a set of tasks.
I have used Dynamic Task Mapping to pass a list to a single task or operator to have it process the list, but can we do this using a TaskGroup as well?
If I can figure out how to pass a variable value at the TaskGroup level, so that it is used in all sub-tasks, then I should be able to meet my requirement.
The below should give you an idea of what I am looking for; I just need help getting it working.
from airflow import DAG, XComArg
from datetime import datetime
from airflow.decorators import task
from airflow.utils.task_group import TaskGroup
from airflow.operators.python import PythonOperator
with DAG(
'dtm_tg_test',
schedule_interval = None,
start_date = datetime(2022, 1, 1)
) as dag:
def getList():
return [ "Hello", "World" ]
def printText(text):
print(text)
get_list = PythonOperator(
task_id = "get_list",
python_callable = getList,
dag = dag
)
with TaskGroup.partial(
group_id = "task_group"
).expand(
list = XComArg(get_list)
) as task_group:
print_text = PythonOperator(
task_id = "print_output",
python_callable = printText,
op_kwargs = { "text": list }
dag = dag
)
print_again = PythonOperator(
task_id = "print_output",
python_callable = printText,
op_kwargs = { "text": list }
dag = dag
)
print_text >> print_again
get_list >> task_group
You can achieve it with the following example:
import airflow
from typing import List
from airflow.decorators import task_group
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator
from airflow.utils.task_group import TaskGroup
list_ids = ['45', '48']
@task_group()
def parent_group(list_ids: List[str]) -> List[TaskGroup]:
return list(map(build_group_for_id, list_ids))
def build_group_for_id(current_id: str) -> TaskGroup:
with TaskGroup(group_id=f'group_for_id_{current_id}') as group:
print_text = PythonOperator(
task_id = f"print_output_{current_id}",
python_callable = printText,
op_kwargs = { "text": current_id }
dag = dag
)
print_again = PythonOperator(
task_id = f"print_output_other_{current_id}",
python_callable = printText,
op_kwargs = { "text": current_id}
dag = dag
print_text >> print_again
return group
with airflow.DAG(
"my_dag", default_args=args, schedule_interval=None,
) as dag:
DummyOperator(task_id='start_dag') >> parent_group(list_ids)
Some explanations:
I create a parent task group called parent_group.
This parent group takes the list of IDs.
I loop over the list and, for each parent ID, create a TaskGroup containing your two Airflow tasks (the print operators).
For each parent ID, the TaskGroup ID is built from that ID so that it is unique within the DAG.
The task IDs of the print operators inside each TaskGroup are likewise suffixed with the current parent ID.
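Separately from the answer above: if you can run Airflow 2.5 or newer, decorated task groups support Dynamic Task Mapping directly, which is close to the TaskGroup.partial/expand syntax you sketched. A minimal sketch, assuming Airflow 2.5+; all names are illustrative:
from datetime import datetime
from airflow.decorators import dag, task, task_group

@dag(schedule=None, start_date=datetime(2022, 1, 1), catchup=False)
def dtm_tg_mapped():

    @task
    def get_list():
        return ["Hello", "World"]

    @task_group
    def process(text: str):
        @task
        def print_text(value: str):
            print(value)

        @task
        def print_again(value: str):
            print(value)

        print_text(text) >> print_again(text)

    # one group instance is mapped per element returned by get_list
    process.expand(text=get_list())

dtm_tg_mapped()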
I want to pass a date value (from a variable) as the partition column in a SQL file, but the variable is not getting evaluated. The value is stored in XCom. Is there a better way to do this?
Rendered template:
TO 's3://opsake/prc/deldb/ldp_pipeline_hist/partiton_key=''{{ ti.xcom_pull(task_ids='genExecParam', key='app_run_dt') }}''/'
iam_role 'arn:aws20xxx:role/Re11dccc3role'
allowoverwrite
format as parquet
maxfilesize 100 mb;, parameters: None
Code :
Here I am using ds, or a date passed in through the DAG run conf:
def genExecParam(**kwargs):
if 'ds_date' in kwargs['dag_run'].conf:
app_run_dt = (kwargs['dag_run'].conf['ds_date'])
else:
app_run_dt = kwargs['ds']
var_dt = datetime.fromisoformat(kwargs['ds'])
app_prev_run_dt = (var_dt + timedelta(days=-1)).strftime('%Y-%m-%d')
# (datetime.now(timezone('UTC'))+timedelta(days=-1)).strftime(date_format)
app_run_id = datetime.now(timezone('UTC')).strftime('%Y%m%d%H%M%S')
kwargs['ti'].xcom_push(key='app_run_id', value=app_run_id)
kwargs['ti'].xcom_push(key='app_run_dt', value=app_run_dt)
kwargs['ti'].xcom_push(key='app_prev_run_dt', value=app_prev_run_dt)
Dag:
from datetime import datetime, timedelta
from airflow import DAG
from airflow.providers.amazon.aws.operators.redshift import RedshiftSQLOperator
from airflow.utils.trigger_rule import TriggerRule
from pytz import timezone
import pendulum
from utils.dagUtils import *
localtz = pendulum.timezone('America/Los_Angeles')
# default arguments to DAG
default_args = {
"owner": "delta",
"depends_on_past": False,
"start_date": datetime(2022, 7, 2, tzinfo=localtz),
"email": [emalgrp],
"email_on_failure": True,
"email_on_retry": True,
"retries": 1,
"retry_delay": timedelta(minutes=5),
"max_active_runs": 1,
"catchup": False,
}
var_dt = '{{ ds }}'
with DAG(
'extrse',
default_args=default_args,
description='extse',
max_active_runs=1,
schedule_interval= '0 10 * * *',
catchup= False,
tags=['env', 'prod'],
on_success_callback=dag_success_callback,
on_failure_callback=dag_failure_callback,
template_searchpath ='/usr/local/airflow/dags/modules/'
) as dag:
AppStart = DummyOperator(task_id='start')
var_ds_date = "{{ ti.xcom_pull(task_ids='genExecParam', key='app_run_dt') }}"
var_ds_prev_date = "{{ ti.xcom_pull(task_ids='genExecParam', key='app_prev_run_dt') }}"
app_run_id = "{{ ti.xcom_pull(task_ids='genExecParam', key='app_prev_run_dt') }}"
genExecParam = PythonOperator(task_id='genExecParam', python_callable=genExecParam, provide_context=True)
sourouse = RedshiftSQLOperator(task_id='sourouse',
redshift_conn_id='deltn_id',
sql=['prc_layer/sql/lighthract.sql'],
params={"data_bucket_name": data_bucket_name,"prc_db_dir": prc_db_dir,"var_ds_date": var_ds_date})
AppStart >> genExecParam >> sourouse
Sql File:
UNLOAD ('select * from test.sales_report')
TO 's3://{{ params.data_bucket_name }}/{{ params.prc_db_dir }}/ldp_pipeline_hist/partiton_key={{ params.var_ds_date }}/'
iam_role 'arn:awrole'
allowoverwrite
format as parquet
maxfilesize 100 mb;
RedshiftSQLOperator's params is not part of template_fields, so Jinja syntax is not rendered inside params.
sql, however, is part of template_fields, so it is possible to do it like this (see the note after the SQL for the matching operator call):
UNLOAD ('select * from test.sales_report')
TO 's3://{{ params.data_bucket_name }}/{{ params.prc_db_dir }}/ldp_pipeline_hist/partiton_key={{ ti.xcom_pull(task_ids="genExecParam", key="app_run_dt") }}/'
iam_role 'arn:awrole'
allowoverwrite
format as parquet
maxfilesize 100 mb;
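With the SQL pulling the date from XCom itself, the operator no longer needs var_ds_date in params. Roughly, based on the task from your DAG (other arguments unchanged):
sourouse = RedshiftSQLOperator(
    task_id='sourouse',
    redshift_conn_id='deltn_id',
    sql=['prc_layer/sql/lighthract.sql'],
    params={"data_bucket_name": data_bucket_name, "prc_db_dir": prc_db_dir},
)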
A small note: it is probably not a good idea to give the Python function and the task variable the same name; you may get some weird behavior. You are overwriting your Python function with:
genExecParam = PythonOperator(task_id='genExecParam', python_callable=genExecParam, provide_context=True)
# genExecParam is no longer your python function.
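A minimal sketch of the rename (the new variable name is illustrative):
gen_exec_param_task = PythonOperator(
    task_id='genExecParam',
    python_callable=genExecParam,   # still the original function
)
AppStart >> gen_exec_param_task >> sourouse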
I have a problem with my DAG getting stuck at a SubDAG. The SubDAG is in the RUNNING state, but on zooming in, all of its tasks have no status.
Using Airflow 2.1.1 with LocalExecutor.
Below is the main dag:
default_args = {
'owner' : 'airflow',
'retries' : 1,
'depends_on_past' : False
}
dag = DAG('loop_example',
default_args = default_args,
start_date = datetime(2022,1,1),
schedule_interval = None,
catchup = False,
tags=['loop']
)
## function to filter src_name based on a DB table/log file entry
def check_valid_src(src_name):
hook = MySqlHook(mysql_conn_id='mysql_conn')
sql='SELECT src_name FROM ingsted_src_log_table'
myresult=hook.get_records(sql)
valid_src_names = []
for src in myresult:
valid_src_names.append(src[0])
if src_name in valid_src_names:
return True
else:
return False
first = DummyOperator(task_id = 'first',dag=dag)
last = DummyOperator(task_id = 'last',dag=dag)
options = ['branch_a','branch_b','branch_c','branch_d']
for option in options:
if check_valid_src(option):
t = SubDagOperator(task_id = f'section_{option}',
subdag=subdag('loop_example',f'section_{option}',default_args,option),
dag=dag
)
first >> t >> last
subdag code:
def subdag(parent_dag_name, child_dag_name, args,option):
dag_subdag = DAG(
dag_id=f'{parent_dag_name}.{child_dag_name}',
default_args=args,
start_date = datetime(2022,1,1),
schedule_interval=None,
)
t1= BashOperator(
task_id=f'Echo_source_name',
bash_command = f'echo {option}',
default_args=args,
dag=dag_subdag
)
t2= BashOperator(
task_id=f'Echo_source_number',
bash_command = f'echo "{option}" | cut -d "_" f2',
default_args=args,
dag=dag_subdag,
)
t1 >> t2
return dag_subdag
Earlier the start_date of the main DAG and the SubDAG were not the same, so I tried again after making them the same, but it still gets stuck.
Is there anything I am missing here?
You have to pass is_paused_upon_creation=False to the SubDAG:
dag_subdag = DAG(
dag_id=f'{parent_dag_name}.{child_dag_name}',
default_args=args,
start_date = datetime(2022,1,1),
schedule_interval=None, is_paused_upon_creation=False
)
I want to skip an ECSOperator task in Airflow. Basically I have two tasks:
CUSTOMER_CONFIGS = [
{
'customer_name': 'test',
'start_date': 17 # day of the month on which you want to trigger task
},
{
'customer_name': 'test',
'start_date': 18 # day of the month on which you want to trigger task
}
]
default_args = {
'depends_on_past': False,
'retries': 0
}
with DAG(
dag_id='run-ecs-task',
default_args=default_args,
start_date=days_ago(1),
schedule_interval='0 0 * * *',
max_active_runs=1,
) as dag:
current_day = datetime.now()
current_day = current_day.strftime("%d")
tasks = []
for config in CUSTOMER_CONFIGS:
task = ECSOperator(
task_id=f'{config.get("customer_name")}',
dag=dag,
retries=AIRFLOW_ECS_OPERATOR_RETRIES,
retry_delay=timedelta(seconds=10),
**ecs_operator_args
)
if config.get('start_date') != current_day:
task.state = State.SKIPPED
tasks.append(task)
How can I skip the first ECS task on the basis of some condition? Later I would also like to run these tasks in sequence.
You didn't specify what the condition is, but in general you can use the ShortCircuitOperator. It is derived from the PythonOperator; it evaluates a condition and short-circuits the workflow if the condition is False.
from airflow.operators.python import ShortCircuitOperator
def condition():
if 1 > 2: # Replace with your condition
return True
return False
conditional_task = ShortCircuitOperator(
task_id='condition',
python_callable=condition
)
task = ECSOperator(...)
task2 = ECSOperator(...)
conditional_task >> task
task2
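Applied to your CUSTOMER_CONFIGS loop, that could look roughly like this. It is a sketch: the task names are illustrative and it assumes customer_name values are unique, since task_ids must be unique:
from datetime import datetime, timedelta
from airflow.operators.python import ShortCircuitOperator

for config in CUSTOMER_CONFIGS:
    customer = config['customer_name']

    check = ShortCircuitOperator(
        task_id=f'check_{customer}',
        # run the ECS task only on the configured day of the month
        python_callable=lambda start_day=config['start_date']: datetime.now().day == start_day,
    )
    ecs_task = ECSOperator(
        task_id=f'run_{customer}',
        retries=AIRFLOW_ECS_OPERATOR_RETRIES,
        retry_delay=timedelta(seconds=10),
        **ecs_operator_args,
    )
    check >> ecs_task
Keep in mind that a ShortCircuitOperator skips everything downstream of it by default, so if you later chain the per-customer pairs in sequence you will need trigger rules (or ignore_downstream_trigger_rules=False, available from Airflow 2.3) so that one skipped customer does not skip the rest.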
I am trying to add Airflow DAGs dynamically, looping through dictionary keys and assigning each key as the DAG name.
The DAGs are created fine, but in the UI I get: "This DAG isn't available in the webserver DagBag object. It shows up in this list because the scheduler marked it as active in the metadata database", and they are not clickable.
def create_dag(dag_id):
args = build_default_args(config_file)
dag = DAG(dag_id,schedule_interval='30 11 * * *', default_args=args)
with dag:
init_task = BashOperator(
task_id='test_init_task',
bash_command='echo "task"',
dag=dag
)
init_task
return dag
def get_data(**kwargs):
my_list=[]
file = open("/home/airflow/gcs/data/test.json")
data=json.load(file)
return data
data1 = get_data()
for d in data1:
    for key, value in d.items():
        print("key", key, "value", value)
        dag_id = '{}'.format(key)
        default_args = {'owner': 'airflow',
                        'start_date': datetime(2019, 6, 18)}
        schedule = '@daily'
        globals()[dag_id] = create_dag(dag_id)