I want to pass a date value (from a variable) as the partition column in a SQL file. The variable is stored in XCom, but it is not getting evaluated. Is there a better way to do this?
Rendered template:
TO 's3://opsake/prc/deldb/ldp_pipeline_hist/partiton_key=''{{ ti.xcom_pull(task_ids='genExecParam', key='app_run_dt') }}''/'
iam_role 'arn:aws20xxx:role/Re11dccc3role'
allowoverwrite
format as parquet
maxfilesize 100 mb;, parameters: None
Code:
Here I am using ds or the input date:
def genExecParam(**kwargs):
    if 'ds_date' in kwargs['dag_run'].conf:
        app_run_dt = kwargs['dag_run'].conf['ds_date']
    else:
        app_run_dt = kwargs['ds']
    var_dt = datetime.fromisoformat(kwargs['ds'])
    app_prev_run_dt = (var_dt + timedelta(days=-1)).strftime('%Y-%m-%d')
    # (datetime.now(timezone('UTC'))+timedelta(days=-1)).strftime(date_format)
    app_run_id = datetime.now(timezone('UTC')).strftime('%Y%m%d%H%M%S')
    kwargs['ti'].xcom_push(key='app_run_id', value=app_run_id)
    kwargs['ti'].xcom_push(key='app_run_dt', value=app_run_dt)
DAG:
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.operators.redshift import RedshiftSQLOperator
from airflow.utils.trigger_rule import TriggerRule
from pytz import timezone
import pendulum
from utils.dagUtils import *

localtz = pendulum.timezone('America/Los_Angeles')

# default arguments to DAG
default_args = {
    "owner": "delta",
    "depends_on_past": False,
    "start_date": datetime(2022, 7, 2, tzinfo=localtz),
    "email": [emalgrp],
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
    "max_active_runs": 1,
    "catchup": False,
}

var_dt = '{{ ds }}'

with DAG(
    'extrse',
    default_args=default_args,
    description='extse',
    max_active_runs=1,
    schedule_interval='0 10 * * *',
    catchup=False,
    tags=['env', 'prod'],
    on_success_callback=dag_success_callback,
    on_failure_callback=dag_failure_callback,
    template_searchpath='/usr/local/airflow/dags/modules/'
) as dag:

    AppStart = DummyOperator(task_id='start')

    var_ds_date = "{{ ti.xcom_pull(task_ids='genExecParam', key='app_run_dt') }}"
    var_ds_prev_date = "{{ ti.xcom_pull(task_ids='genExecParam', key='app_prev_run_dt') }}"
    app_run_id = "{{ ti.xcom_pull(task_ids='genExecParam', key='app_prev_run_dt') }}"

    genExecParam = PythonOperator(task_id='genExecParam', python_callable=genExecParam, provide_context=True)

    sourouse = RedshiftSQLOperator(
        task_id='sourouse',
        redshift_conn_id='deltn_id',
        sql=['prc_layer/sql/lighthract.sql'],
        params={"data_bucket_name": data_bucket_name, "prc_db_dir": prc_db_dir, "var_ds_date": var_ds_date})

    AppStart >> genExecParam >> sourouse
SQL file:
UNLOAD ('select * from test.sales_report')
TO 's3://{{ params.data_bucket_name }}/{{ params.prc_db_dir }}/ldp_pipeline_hist/partiton_key={{ params.var_ds_date }}/'
iam_role 'arn:awrole'
allowoverwrite
format as parquet
maxfilesize 100 mb;
RedshiftSQLOperator's params is not part of template_fields, so Jinja syntax is not rendered inside params.
However, sql is part of template_fields, so you can do it like this:
UNLOAD ('select * from test.sales_report')
TO 's3://{{ params.data_bucket_name }}/{{ params.prc_db_dir }}/ldp_pipeline_hist/partiton_key={{ ti.xcom_pull(task_ids="genExecParam", key="app_run_dt") }}/'
iam_role 'arn:awrole'
allowoverwrite
format as parquet
maxfilesize 100 mb;
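With the XCom pull moved into the SQL template, the operator no longer needs var_ds_date at all; a minimal sketch, reusing the connection id and file path from the question:

sourouse = RedshiftSQLOperator(
    task_id='sourouse',
    redshift_conn_id='deltn_id',
    sql=['prc_layer/sql/lighthract.sql'],
    # plain strings are fine in params; the templated run date lives in the SQL file itself
    params={"data_bucket_name": data_bucket_name, "prc_db_dir": prc_db_dir},
)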
Small note: it might not be a good idea to give the Python function and the task variable the same name; you can get some weird behavior. You are overwriting your Python function with
genExecParam = PythonOperator(task_id='genExecParam', python_callable=genExecParam, provide_context=True)
# genExecParam is no longer your python function.
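For example, keeping the task object under a different (hypothetical) variable name leaves the callable untouched:

gen_exec_param_task = PythonOperator(
    task_id='genExecParam',
    python_callable=genExecParam,   # the function itself is still reachable
    provide_context=True,
)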
Related
I am trying to use configs in my DAG via "Trigger DAG w/ config".
def execute(**kwargs):
    dag_run = kwargs['dag_run']
    start_date = dag_run.conf['start_dt'] if 'start_dt' in dag_run.conf.keys() else kwargs['start_dt']
    end_date = dag_run.conf['end_dt'] if 'end_dt' in dag_run.conf.keys() else kwargs['end_dt']
    print(f'start_date = {start_date}, end_date = {end_date}')
dag = DAG(
    "corp_dev_ods_test_dag",
    default_args=default_args,
    description='DAG',
    schedule_interval='10 1 * * *',
    start_date=days_ago(0),
    # params={'dt': '{{ macros.ds_add(ds, -7) }}'},
    catchup=False,
    tags=['dev']
)

run_submit = PythonVirtualenvOperator(
    task_id='run_submit',
    requirements=dag_requirements,
    python_callable=execute,
    system_site_packages=False,
    dag=dag,
    op_kwargs={'start_dt': '{{ macros.ds_add(ds, -7) }}', 'end_dt': '{{ macros.ds_add(ds, -7) }}'}
)
run_submit
I got "KeyError": kwargs["dag_run"]. But in case of PythonOperator (Instead of PythonVirtualenvOperator) it works.
So, how can I use such parameters in my dag?
You need to provide an empty params variable in your task, for example:
from airflow.decorators import dag, task
from datetime import datetime

default_params = {"start_date": "2022-01-01", "end_date": "2022-12-01"}

@dag(
    schedule=None,
    start_date=datetime(2021, 1, 1),
    catchup=False,
    tags=['using_params'],
    params=default_params
)
def mydag():

    @task
    def extract(params={}):
        import helper
        filenames = helper.extract(start=params.get("start_date"))
        return filenames

    extract()

_dag = mydag()
Now, when you Trigger DAG w/ config in the UI, you should be able to see and change the default params, and access them in your DAG task.
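As an aside, op_kwargs is itself a templated field on PythonVirtualenvOperator, so another option (a sketch reusing the names from the question, not tested against the original setup) is to inject the trigger config directly into the kwargs, which means the callable never needs kwargs['dag_run']:

run_submit = PythonVirtualenvOperator(
    task_id='run_submit',
    requirements=dag_requirements,
    python_callable=execute,
    system_site_packages=False,
    dag=dag,
    op_kwargs={
        # fall back to the scheduled dates when no config is passed
        'start_dt': '{{ dag_run.conf.get("start_dt", macros.ds_add(ds, -7)) }}',
        'end_dt': '{{ dag_run.conf.get("end_dt", macros.ds_add(ds, -7)) }}',
    },
)

The callable then reads kwargs['start_dt'] and kwargs['end_dt'] directly.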
I have a DAG copying data to S3 using PySpark like below:
...
bucket = 'my.bucket'
schema = 'my_schema'
table = 'my_table'
ymd = pendulum.parse('{{ execution_date }}').strftime('%Y%m%d')
spark_script = 'my_spark_script'

DEFAULT_ARGS = {
    'owner': 'burgerphilia',
    'start_date': '2020-09-01',
    'on_failure_callback': alert.slack_fail_alert,
    'depends_on_past': False
}

SPARK_STEPS = [
    {
        'Name': f'{schema}_{table}_step',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                'sudo',
                'spark-submit',
                ...
                f's3://{bucket}/spark-script/{spark_script}.py',
                '--ymd',
                f'{ymd}'
            ]
        }
    }
]
def delete_s3_object(bucket, schema, table, ymd):
    """
    :param bucket: bucket name
    :type bucket: str
    :param schema: schema name (the same as the hive schema)
    :type schema: str
    :param table: table name (the same as the hive table)
    :type table: str
    :param ymd: date to delete, '%Y%m%d' format
    :type ymd: str
    """
    aws_hook = AwsHook(aws_conn_id='aws_conn')
    session = aws_hook.get_session(region_name='ap-northeast-2')
    s3 = session.resource('s3')
    bucket = s3.Bucket(bucket)
    bucket.objects.filter(Prefix=f'{schema}/{table}/ymd={ymd}/').delete()
with DAG(
    dag_id=f'{schema}_{table}',
    default_args=DEFAULT_ARGS,
    catchup=False,
    schedule_interval="40 06 * * *"
) as dag:

    object_cleaner = PythonOperator(
        task_id='delete_object',
        python_callable=delete_s3_object,
        op_kwargs={'bucket': bucket, 'schema': schema, 'table': table, 'ymd': ymd}
    )

    step_adder = EmrAddStepsOperator(
        task_id='add_step',
        job_flow_id=job_flow_id,
        aws_conn_id='aws_conn',
        steps=SPARK_STEPS,
    )

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id=job_flow_id,
        step_id="{{ task_instance.xcom_pull(task_ids='add_step', key='return_value')[0] }}",
        aws_conn_id='aws_conn',
    )

    object_cleaner >> step_adder >> step_checker
This DAG runs on a daily basis, but the data source (an Oracle DB) is sometimes updated after the fact. So I need to re-run the same DAG every Monday and on the first day of the month to refresh the previous period (e.g. on 2020/11/02, re-run 2020/10/26 ~ 2020/11/01). What is the best way to handle this?
There is no direct way to do it. You can try two things:
Use dynamic DAG generation (https://www.astronomer.io/guides/dynamically-generating-dags/) to create two DAGs with different schedule_interval values.
Create another DAG that triggers this DAG on a different schedule_interval, as in the sketch below.
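A minimal sketch of the second approach, assuming Airflow 2-style imports and a hypothetical trigger_dag_id matching the daily DAG: it runs every Monday and re-triggers the daily DAG once for each day of the previous week.

from datetime import datetime
from airflow import DAG
from airflow.operators.trigger_dagrun import TriggerDagRunOperator

with DAG(
    dag_id='my_schema_my_table_weekly_rerun',   # hypothetical name
    start_date=datetime(2020, 11, 2),
    schedule_interval='0 7 * * 1',              # every Monday
    catchup=False,
) as rerun_dag:
    for offset in range(1, 8):
        TriggerDagRunOperator(
            task_id=f'rerun_minus_{offset}d',
            trigger_dag_id='my_schema_my_table',                       # the daily DAG's dag_id
            execution_date=f'{{{{ macros.ds_add(ds, -{offset}) }}}}',  # templated target date
            reset_dag_run=True,   # replace the existing run for that date
        )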
I want to create tasks that update column rows and send a mail for every line in a data table. At the moment I have a task that downloads the data from the main table, but I cannot create tasks for every line in the temp data table. Could you tell me what I am doing wrong and how I can generate and run tasks in a loop?
from datetime import datetime, timedelta
import airflow
from airflow import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_get_data import BigQueryGetDataOperator
from airflow.contrib.operators.bigquery_check_operator import BigQueryValueCheckOperator
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'cmap',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

with DAG('dq_bigquery_test',
         max_active_runs=1,
         schedule_interval='@once',
         catchup=False,
         default_args=default_args) as dag:
query = "SELECT * from `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` where MailRequired = false"
insert = "INSERT into dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc (DataTimeStamp, Robot, Status) Values (CURRENT_TIMESTAMP(), 'TestRobot', 'Test')"
my_bq_task = BigQueryOperator(
task_id='query_exc_on_teste',
sql=query,
write_disposition='WRITE_TRUNCATE',
create_disposition='CREATE_IF_NEEDED',
bigquery_conn_id='google_cloud_dbce_bi_prod',
use_legacy_sql=False,
destination_dataset_table='dev_dataquality.testTable')
get_data = BigQueryGetDataOperator(
task_id='get_data_from_query',
project_id='dbce-bi-prod-e6fd',
dataset_id='dev_dataquality',
table_id='testTable',
max_results='100',
selected_fields='Robot,Status,MailRequired',
bigquery_conn_id='google_cloud_dbce_bi_prod'
)
    def process_data_from_bq(**kwargs):
        ti = kwargs['ti']
        update_column = []
        bq_data = ti.xcom_pull(task_ids='get_data_from_query')
        print(bq_data)
        # Now bq_data here would have your data in a Python list
        for index, i in enumerate(bq_data):
            update_query = "UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` SET MailSent = True WHERE Robot = '{}'".format(i[0])
            print(update_query)
            update_column.append(BigQueryOperator(
                task_id='update_column_{}'.format(index),
                sql=update_query,
                write_disposition='WRITE_EMPTY',
                create_disposition='CREATE_IF_NEEDED',
                bigquery_conn_id='google_cloud_dbce_bi_prod',
                use_legacy_sql=False,
                dag=dag
            ))
            if index not in [0]:
                update_column[index-1] >> update_column[index]
    process_data = PythonOperator(
        task_id='process_data_from_bq',
        python_callable=process_data_from_bq,
        provide_context=True
    )

    my_bq_task >> get_data >> process_data
Thank you for your help!
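For reference, tasks have to exist when the DAG file is parsed; operators instantiated inside a running PythonOperator are never registered with the scheduler. A minimal sketch of a loop that does work, assuming a hypothetical list of robots available at parse time (placed inside the same with DAG(...) as dag: block):

    robots = ['TestRobot', 'OtherRobot']   # hypothetical parse-time list (config file, Variable, etc.)

    previous = get_data
    for index, robot in enumerate(robots):
        update_task = BigQueryOperator(
            task_id='update_column_{}'.format(index),
            sql=("UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` "
                 "SET MailSent = True WHERE Robot = '{}'".format(robot)),
            bigquery_conn_id='google_cloud_dbce_bi_prod',
            use_legacy_sql=False,
        )
        previous >> update_task   # chain the updates one after another
        previous = update_task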
I am trying to add Airflow DAGs dynamically by looping through dictionary keys and assigning each key as a DAG name.
The DAGs are created fine, but I am getting: "This DAG isn't available in the webserver DagBag object. It shows up in this list because the scheduler marked it as active in the metadata database", and they are not clickable.
def create_dag(dag_id):
    args = build_default_args(config_file)
    dag = DAG(dag_id, schedule_interval='30 11 * * *', default_args=args)

    with dag:
        init_task = BashOperator(
            task_id='test_init_task',
            bash_command='echo "task"',
            dag=dag
        )
        init_task

    return dag

def get_data(**kwargs):
    my_list = []
    file = open("/home/airflow/gcs/data/test.json")
    data = json.load(file)
    return data

data1 = get_data()

for dict in data1:
    for pair in dict.items():
        key, value = pair
        print("key", key, "value", value)
        dag_id = '{}'.format(key)

        default_args = {'owner': 'airflow',
                        'start_date': datetime(2019, 6, 18)
                        }

        schedule = '@daily'

        globals()[dag_id] = create_dag(dag_id)
I have a simple Airflow workflow composed of two tasks. One downloads a CSV file containing stock data. The other extracts the maximum stock price and writes the data to another file.
If I run the first task and then the second, everything works fine; however, if I execute airflow run stocks_d get_max_share, it fails to meet the dependency.
import csv
from datetime import datetime
from datetime import timedelta
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
import requests

def get_stock_data():
    url = "https://app.quotemedia.com/quotetools/getHistoryDownload.csv?&webmasterId=501&startDay=02&startMonth=02&startYear=2002&endDay=02&endMonth=07&endYear=2009&isRanged=false&symbol=APL"
    try:
        r = requests.get(url)
    except requests.RequestException as re:
        raise
    else:
        with open('/tmp/stocks/airflow_stock_data.txt', 'w') as f:
            f.write(r.text)
def get_max_share():
    stock_data = []
    stock_max = {}
    with open('/tmp/stocks/airflow_stock_data.txt', 'r') as f:
        stock_reader = csv.reader(f)
        next(stock_reader, None)
        for row in stock_reader:
            stock_data.append(row)

    for stock in stock_data:
        stock_max[stock[2]] = stock[0]

    with open('/tmp/stocks/max_stock', 'w') as f:
        stock_price = max(stock_max.keys())
        stock_max_price_date = stock_max[stock_price]
        stock_entry = stock_max_price_date + ' -> ' + stock_price
        f.write(stock_entry)
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2017, 5, 30),
    'email': ['mainl@domain.io'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'catchup': False,
}

dag = DAG('stocks_d', default_args=default_args, schedule_interval=timedelta(minutes=5))

task_get_stocks = PythonOperator(task_id='get_stocks', python_callable=get_stock_data, dag=dag)
task_get_max_share = PythonOperator(task_id='get_max_share', python_callable=get_max_share, dag=dag)

task_get_max_share.set_upstream(task_get_stocks)
Any ideas why that happens?
$ airflow run stocks_d get_max_share
The above command only runs the get_max_share task, not the task before it.
If you need to run the whole DAG, try the command below:
$ airflow trigger_dag stocks_d
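Note: on Airflow 2.x the CLI was reorganized, so the equivalent of the last command would be:
$ airflow dags trigger stocks_d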