Create multiple tasks in Airflow using a loop

I want to create a task that updates column rows and sends a mail for every line in a data table. At the moment I create a task which downloads the data from the main table, but I cannot create tasks for every line in the temp data table. Could you tell me what I am doing wrong and how I can generate and run tasks in a loop?
from datetime import datetime, timedelta

import airflow
from airflow import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_get_data import BigQueryGetDataOperator
from airflow.contrib.operators.bigquery_check_operator import BigQueryValueCheckOperator
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'cmap',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

with DAG('dq_bigquery_test',
         max_active_runs=1,
         schedule_interval='@once',
         catchup=False,
         default_args=default_args) as dag:

    query = "SELECT * from `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` where MailRequired = false"
    insert = "INSERT into dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc (DataTimeStamp, Robot, Status) Values (CURRENT_TIMESTAMP(), 'TestRobot', 'Test')"

    my_bq_task = BigQueryOperator(
        task_id='query_exc_on_teste',
        sql=query,
        write_disposition='WRITE_TRUNCATE',
        create_disposition='CREATE_IF_NEEDED',
        bigquery_conn_id='google_cloud_dbce_bi_prod',
        use_legacy_sql=False,
        destination_dataset_table='dev_dataquality.testTable')

    get_data = BigQueryGetDataOperator(
        task_id='get_data_from_query',
        project_id='dbce-bi-prod-e6fd',
        dataset_id='dev_dataquality',
        table_id='testTable',
        max_results='100',
        selected_fields='Robot,Status,MailRequired',
        bigquery_conn_id='google_cloud_dbce_bi_prod'
    )

    def process_data_from_bq(**kwargs):
        ti = kwargs['ti']
        update_column = []
        bq_data = ti.xcom_pull(task_ids='get_data_from_query')
        print(bq_data)
        # Now bq_data here would have your data in a Python list
        for index, i in enumerate(bq_data):
            update_query = "UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` SET MailSent = True WHERE Robot = '{}'".format(i[0])
            print(update_query)
            update_column.append(BigQueryOperator(
                task_id='update_column_{}'.format(index),
                sql=update_query,
                write_disposition='WRITE_EMPTY',
                create_disposition='CREATE_IF_NEEDED',
                bigquery_conn_id='google_cloud_dbce_bi_prod',
                use_legacy_sql=False,
                dag=dag
            ))
            if index not in [0]:
                update_column[index - 1] >> update_column[index]

    process_data = PythonOperator(
        task_id='process_data_from_bq',
        python_callable=process_data_from_bq,
        provide_context=True
    )

    my_bq_task >> get_data >> process_data
Thank you for your help!
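
A minimal sketch of the parse-time loop pattern, the same idea used in the answer to the related question below: operators instantiated inside a running PythonOperator are not registered with the DAG, so per-row tasks have to be created while the DAG file is parsed. The sketch reuses the imports and the get_data task from the DAG above; the robots list is a hypothetical placeholder for whatever is known at parse time.

    # Hypothetical parse-time input; in practice this would come from something
    # available when the DAG file is parsed, not from an XCom at run time.
    robots = ['TestRobot', 'AnotherRobot']

    previous_task = get_data
    for index, robot in enumerate(robots):
        update_task = BigQueryOperator(
            task_id='update_column_{}'.format(index),
            sql=("UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` "
                 "SET MailSent = True WHERE Robot = '{}'".format(robot)),
            write_disposition='WRITE_EMPTY',
            create_disposition='CREATE_IF_NEEDED',
            bigquery_conn_id='google_cloud_dbce_bi_prod',
            use_legacy_sql=False,
        )
        previous_task >> update_task
        previous_task = update_task

If the rows are only known at run time, newer Airflow releases (2.3+) offer dynamic task mapping (.expand()) for this; on older versions the per-row work generally has to happen inside a single task.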

Related

Airflow dynamically generated tasks not run in order

I have created a DAG that generates tasks dynamically. The tasks are generated correctly, but they are not triggered in order and do not run consistently.
I have noticed they are triggered in alphanumeric order.
Let's look at the run_modification_ tasks. I generated tasks 0 to 29, and they trigger in the order below.
run_modification_0
run_modification_1
run_modification_10
run_modification_11
run_modification_12
run_modification_13
run_modification_14
run_modification_15
run_modification_16
run_modification_17
run_modification_18
run_modification_19
run_modification_2
run_modification_21
run_modification_23....
But I need them to run in task order, like:
run_modification_0
run_modification_1
run_modification_2
run_modification_3
run_modification_4
run_modification_5..
Please help me run these tasks in the order they were created.
from datetime import date, timedelta, datetime

from airflow.utils.dates import days_ago
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.postgres_operator import PostgresOperator
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import Variable
import os

args = {
    'owner': 'Airflow',
    'start_date': days_ago(2),
}

dag = DAG(
    dag_id='tastOrder',
    default_args=args,
    schedule_interval=None,
    tags=['task']
)

modification_processXcom = """ cd {{ ti.xcom_pull(task_ids='run_modification_' + params.i, key='taskDateFolder') }} """

def modificationProcess(ds, **kwargs):
    today = datetime.strptime('2021-01-01', '%Y-%m-%d').date()
    i = str(kwargs['i'])
    newDate = today - timedelta(days=int(i))
    print(str(newDate))
    kwargs["ti"].xcom_push("taskDateFolder", str(newDate))

def getDays():
    today = datetime.strptime('2021-01-01', '%Y-%m-%d').date()
    yesterday = today - timedelta(days=30)
    day_Diff = today - yesterday
    return day_Diff, today

day_Diff, today = getDays()

for i in reversed(range(0, day_Diff.days)):
    run_modification = PythonOperator(
        task_id='run_modification_' + str(i),
        provide_context=True,
        python_callable=modificationProcess,
        op_kwargs={'i': str(i)},
        dag=dag,
    )
    modification_processXcom_task = BashOperator(
        task_id='modification_processXcom_' + str(i),
        bash_command=modification_processXcom,
        params={'i': str(i)},
        dag=dag
    )
    run_modification >> modification_processXcom_task
To get the dependency as:
run_modification_1 -> modification_processXcom_1 ->
run_modification_2 -> modification_processXcom_2 -> ... ->
run_modification_29 -> modification_processXcom_29
You can do:
from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator

dag = DAG(
    dag_id='my_dag',
    schedule_interval=None,
    start_date=datetime(2021, 8, 10),
    catchup=False,
    is_paused_upon_creation=False,
)

mylist1 = []
mylist2 = []

for i in range(1, 30):
    mylist1.append(
        BashOperator(  # Replace with your requested operator
            task_id=f'run_modification_{i}',
            bash_command=f"""echo executing run_modification_{i}""",
            dag=dag,
        )
    )
    mylist2.append(
        BashOperator(  # Replace with your requested operator
            task_id=f'modification_processXcom_{i}',
            bash_command=f"""echo executing modification_processXcom_{i}""",
            dag=dag,
        )
    )
    if len(mylist1) > 0:
        mylist1[-1] >> mylist2[-1]  # Set dependency from run_modification to modification_processXcom
    if len(mylist1) > 1:
        mylist2[-2] >> mylist1[-1]  # Set dependency from the previous modification_processXcom to run_modification
This code creates two lists of operators and sets them to run one after another. (Tree view screenshot from the original answer not reproduced here.)
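
If you prefer not to index into the lists, the same ordering can be set with chain instead of the two if blocks inside the loop. A sketch, assuming Airflow 2.x (in 1.10 the helper lives in airflow.utils.helpers):

from airflow.models.baseoperator import chain

# Interleave the two lists so the sequence becomes
# run_modification_1 >> modification_processXcom_1 >> run_modification_2 >> ...
ordered_tasks = [task for pair in zip(mylist1, mylist2) for task in pair]
chain(*ordered_tasks)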

How to retry an upstream task?

task a > task b > task c
If C fails I want to retry A. Is this possible? There are a few other tickets which involve subdags, but I would like to just be able to clear A.
I'm hoping to use on_retry_callback in task C but I don't know how to call task A.
There is another question which does this in a subdag, but I am not using subdags.
I'm trying to do this, but it doesn't seem to work:
def callback_for_failures(context):
    print("*** retrying ***")
    if context['task'].upstream_list:
        context['task'].upstream_list[0].clear()
As other comments mentioned, I would use caution to make sure you aren't getting into an endless loop of clearing/retries. But you can call a bash command as part of your on_failure_callback and then specify which tasks you want to clear, and whether you want downstream/upstream tasks cleared, etc.
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

def clear_upstream_task(context):
    execution_date = context.get("execution_date")
    clear_tasks = BashOperator(
        task_id='clear_tasks',
        bash_command=f'airflow tasks clear -s {execution_date} -t t1 -d -y clear_upstream_task'
    )
    return clear_tasks.execute(context=context)

# Default settings applied to all tasks
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=5)
}

with DAG('clear_upstream_task',
         start_date=datetime(2021, 1, 1),
         max_active_runs=3,
         schedule_interval=timedelta(minutes=5),
         default_args=default_args,
         catchup=False
         ) as dag:

    t0 = DummyOperator(
        task_id='t0'
    )
    t1 = DummyOperator(
        task_id='t1'
    )
    t2 = DummyOperator(
        task_id='t2'
    )
    t3 = BashOperator(
        task_id='t3',
        bash_command='exit 123',
        on_failure_callback=clear_upstream_task
    )

    t0 >> t1 >> t2 >> t3
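
A possible variation on the same callback (a sketch, not part of the original answer): the DAG id and execution date can be taken from the callback context instead of being hardcoded, with functools.partial used to pass in the task regex, so the callback can be reused across DAGs.

from functools import partial

from airflow.operators.bash_operator import BashOperator

def clear_tasks_callback(context, task_regex):
    # Derive the DAG id and execution date from the context Airflow passes to the callback.
    dag_id = context['dag'].dag_id
    execution_date = context.get('execution_date')
    clear_tasks = BashOperator(
        task_id='clear_tasks',
        bash_command=f'airflow tasks clear -s {execution_date} -t {task_regex} -d -y {dag_id}'
    )
    return clear_tasks.execute(context=context)

# Usage on the failing task:
# on_failure_callback=partial(clear_tasks_callback, task_regex='t1')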

Airflow: Dynamically derive dag_id to be called from another DAG

I am trying to dynamically derive the name of the DAG to be triggered from another DAG. In the following, the task "trigger_transform_dag" fails to execute. Can you please help me derive the dag_id for the 'trigger_transform_dag' task dynamically?
from datetime import datetime, timedelta

import airflow
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.dagrun_operator import TriggerDagRunOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'start_date': airflow.utils.dates.days_ago(0),
}

def run_dag(**context):
    file_path = 'ABC'
    context['ti'].xcom_push(key='key1', value=file_path)
    return 1

def check_file_name(**context):
    pulled_value_1 = context['ti'].xcom_pull(task_ids='run_dataflow_template', key='key1')
    if pulled_value_1 == 'ABC':
        push_value = 'sample1'
        return push_value
    else:
        push_value = 'sample2'
        return push_value

with DAG('sample',
         default_args=default_args,
         schedule_interval='10 * * * *',
         start_date=datetime(2017, 3, 20),
         max_active_runs=1,
         catchup=False) as dag:

    t1 = PythonOperator(
        task_id='run_dataflow_template',
        provide_context=True,
        python_callable=run_dag
    )

    t2 = TriggerDagRunOperator(
        task_id="trigger_transform_dag",
        provide_context=True,
        trigger_dag_id=check_file_name()
    )

    end = DummyOperator(
        trigger_rule='one_success',
        task_id='end')

    t1 >> t2 >> end
I don't know if there is a simpler way, but you can create a custom operator that takes inspiration from the TriggerDagRunOperator (https://github.com/apache/airflow/blob/master/airflow/operators/dagrun_operator.py) and uses a passed Callable to get the dag_id to trigger.
Something I hacked together really quickly (it can definitely be improved):
from airflow.models import DAG
from airflow.utils.dates import days_ago, timedelta
from airflow.operators.dagrun_operator import TriggerDagRunOperator

import random
import datetime
from typing import Dict, Optional, Union, Callable

from airflow.api.common.experimental.trigger_dag import trigger_dag
from airflow.models import BaseOperator
from airflow.utils import timezone
from airflow.utils.decorators import apply_defaults

class TriggerDagRunWithFuncOperator(BaseOperator):
    """
    Triggers a DAG run for a specified ``dag_id``

    :param get_dag_name_f: callable that returns the dag_id to trigger
    :type get_dag_name_f: Callable
    :param conf: Configuration for the DAG run
    :type conf: dict
    :param execution_date: Execution date for the dag (templated)
    :type execution_date: str or datetime.datetime
    """

    template_fields = ("execution_date", "conf")
    ui_color = "#ffefeb"

    @apply_defaults
    def __init__(
        self,
        get_dag_name_f: Callable,
        conf: Optional[Dict] = None,
        execution_date: Optional[Union[str, datetime.datetime]] = None,
        *args,
        **kwargs
    ) -> None:
        super().__init__(*args, **kwargs)
        self.conf = conf
        self.get_dag_name_f = get_dag_name_f
        if not isinstance(execution_date, (str, datetime.datetime, type(None))):
            raise TypeError(
                "Expected str or datetime.datetime type for execution_date."
                "Got {}".format(type(execution_date))
            )
        self.execution_date: Optional[datetime.datetime] = execution_date  # type: ignore

    def execute(self, context: Dict):
        if isinstance(self.execution_date, datetime.datetime):
            run_id = "trig__{}".format(self.execution_date.isoformat())
        elif isinstance(self.execution_date, str):
            run_id = "trig__{}".format(self.execution_date)
            self.execution_date = timezone.parse(self.execution_date)  # trigger_dag() expects datetime
        else:
            run_id = "trig__{}".format(timezone.utcnow().isoformat())
        dag_id_to_call = self.get_dag_name_f()
        # Ignore MyPy type for self.execution_date because it doesn't pick up the timezone.parse() for strings
        trigger_dag(
            dag_id=dag_id_to_call,
            run_id=run_id,
            conf=self.conf,
            execution_date=self.execution_date,
            replace_microseconds=False,
        )

args = {
    'owner': 'arocketman',
    'start_date': days_ago(1)
}

dag = DAG(dag_id='dyna_dag', default_args=args, schedule_interval=None)

def your_function():
    return 'my_sample_dag'

with dag:
    run_this_task = TriggerDagRunWithFuncOperator(
        task_id='run_this',
        get_dag_name_f=your_function
    )

Airflow running but nothing happening

My Airflow DAG shows that it is running, but nothing is happening. I don't understand what's wrong; I have spent a lot of time on this.
The first DAG in the code should set an Airflow Variable, and the second DAG should read this Variable and run tasks based on its values. The second DAG runs 15 minutes after the first DAG, but it always just shows as running.
Can anybody please help? Thank you.
Airflow version 1.9.0
import datetime as dt

from airflow import models
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.operators import python_operator
from utils.etl_fail_slack_alert import task_fail_slack_alert
from airflow.contrib.hooks.bigquery_hook import BigQueryHook
from airflow.operators.dummy_operator import DummyOperator
from airflow.models import Variable

project = 'asd'
source_dataset = 'dfghdfh'
destination_dataset = 'dfhdfhdf'
table_prefix = ''

default_args = {
    'start_date': '2019-09-10',
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=2),
    'on_failure_callback': task_fail_slack_alert,
}

def set_views_av():
    bq_hook = BigQueryHook(bigquery_conn_id='bigquery_default',
                           delegate_to=None, use_legacy_sql=False)
    query = ("SELECT table_id FROM `{project}.{dataset}.{table}`;".format(
        project=project, dataset=source_dataset, table='__TABLES__'))
    df = bq_hook.get_pandas_df(sql=query, dialect='standard')
    view_names = df['table_id'].tolist()
    Variable.set('view_list', '|'.join(view_names))

def bq_operator(vname, dag):
    sql = ("SELECT * FROM `{project}.{dataset}.{table}`".format(
        project=project, dataset=source_dataset, table=vname))
    materialize_view_bq = BigQueryOperator(
        bql=sql,
        destination_dataset_table=project + "." + destination_dataset + "." + table_prefix + vname,
        task_id="materialize_" + vname,
        bigquery_conn_id="bigquery_default",
        google_cloud_storage_conn_id="google_cloud_default",
        use_legacy_sql=False,
        write_disposition="WRITE_TRUNCATE",
        create_disposition="CREATE_IF_NEEDED",
        query_params={},
        allow_large_results=True,
        dag=dag
    )
    return materialize_view_bq

with models.DAG(dag_id="materialize_init_views", default_args=default_args,
                schedule_interval="15 09 * * *", catchup=True) as dag_init:
    bridge = DummyOperator(
        task_id='bridge',
        dag=dag_init
    )
    set_views = python_operator.PythonOperator(
        task_id="set_views",
        python_callable=set_views_av
    )
    bridge >> set_views

with models.DAG(dag_id="materialize_views_dynamic", default_args=default_args,
                schedule_interval="30 09 * * *", catchup=True) as dag:
    views = Variable.get("view_list").split("|")
    bridge = DummyOperator(
        task_id='bridge',
        dag=dag
    )
    for vname in views:
        materialize_view_bq = bq_operator(vname, dag)
        bridge >> materialize_view_bq
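
Not a diagnosis of the hang, but one parse-time detail worth checking in the second DAG: Variable.get raises an error if the Variable has never been set, so the dynamic DAG cannot even be parsed until the first DAG has run once. A small sketch of the hand-off with a default (reusing the Variable import from the snippet above; the guard is hypothetical):

# Fall back to an empty list until the first DAG has populated 'view_list',
# so the second DAG file still parses on a fresh environment.
view_list_raw = Variable.get("view_list", default_var="")
views = view_list_raw.split("|") if view_list_raw else []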

airflow does not satisfy task dependencies

I have a simple Airflow workflow composed of two tasks. One downloads a CSV file containing stock data. The other extracts the maximum stock price and writes the data to another file.
If I run the first task and then the second, everything works fine; but if I execute "airflow run stocks_d get_max_share", it fails to meet the dependency.
import csv
from datetime import datetime
from datetime import timedelta

import requests

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

def get_stock_data():
    url = "https://app.quotemedia.com/quotetools/getHistoryDownload.csv?&webmasterId=501&startDay=02&startMonth=02&startYear=2002&endDay=02&endMonth=07&endYear=2009&isRanged=false&symbol=APL"
    try:
        r = requests.get(url)
    except requests.RequestException as re:
        raise
    else:
        with open('/tmp/stocks/airflow_stock_data.txt', 'w') as f:
            f.write(r.text)

def get_max_share():
    stock_data = []
    stock_max = {}
    with open('/tmp/stocks/airflow_stock_data.txt', 'r') as f:
        stock_reader = csv.reader(f)
        next(stock_reader, None)
        for row in stock_reader:
            stock_data.append(row)
    for stock in stock_data:
        stock_max[stock[2]] = stock[0]
    with open('/tmp/stocks/max_stock', 'w') as f:
        stock_price = max(stock_max.keys())
        stock_max_price_date = stock_max[stock_price]
        stock_entry = stock_max_price_date + ' -> ' + stock_price
        f.write(stock_entry)

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2017, 5, 30),
    'email': ['mainl@domain.io'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'catchup': False,
}

dag = DAG('stocks_d', default_args=default_args, schedule_interval=timedelta(minutes=5))

task_get_stocks = PythonOperator(task_id='get_stocks', python_callable=get_stock_data, dag=dag)
task_get_max_share = PythonOperator(task_id='get_max_share', python_callable=get_max_share, dag=dag)

task_get_max_share.set_upstream(task_get_stocks)
Any ideas why that happens?
$ airflow run stocks_d get_max_share
The command above only runs the get_max_share task; it does not run the upstream task first.
If you want to run the whole DAG, try the command below:
$ airflow trigger_dag stocks_d
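For reference, on Airflow 2.x the equivalent commands are renamed (hedged, depending on the version installed):
$ airflow tasks run stocks_d get_max_share <execution_date>
$ airflow dags trigger stocks_d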
