I have a simple Airflow workflow composed of two tasks. One downloads a CSV file containing stock data. The other extracts the maximum stock price and writes the result to another file.
If I run the first task and then the second, everything works fine; however, if I execute `airflow run stocks_d get_max_share`, it fails to meet the dependency.
import csv
from datetime import datetime
from datetime import timedelta
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
import requests
def get_stock_data():
    """Download historical stock data as CSV and save it to a local file.

    Fetches APL price history from the quotemedia endpoint and writes the raw
    CSV text to /tmp/stocks/airflow_stock_data.txt, which the downstream
    get_max_share task reads.

    Raises:
        requests.RequestException: if the HTTP request fails (propagated so
            Airflow marks the task failed and applies the retry policy; the
            original try/except that only re-raised was a no-op).
    """
    import os  # local import so the snippet stays self-contained

    url = "https://app.quotemedia.com/quotetools/getHistoryDownload.csv?&webmasterId=501&startDay=02&startMonth=02&startYear=2002&endDay=02&endMonth=07&endYear=2009&isRanged=false&symbol=APL"
    r = requests.get(url)
    # Treat HTTP error codes (4xx/5xx) as failures instead of silently
    # writing an error page to the data file.
    r.raise_for_status()
    # Make sure the target directory exists before writing.
    os.makedirs('/tmp/stocks', exist_ok=True)
    with open('/tmp/stocks/airflow_stock_data.txt', 'w') as f:
        f.write(r.text)
def get_max_share():
    """Find the highest stock price in the downloaded CSV and persist it.

    Reads /tmp/stocks/airflow_stock_data.txt (written by get_stock_data),
    maps each price (column 2) to its date (column 0), and writes
    "<date> -> <price>" for the maximum price to /tmp/stocks/max_stock.
    """
    stock_max = {}
    with open('/tmp/stocks/airflow_stock_data.txt', 'r') as f:
        stock_reader = csv.reader(f)
        next(stock_reader, None)  # skip the header row
        for row in stock_reader:
            # price (row[2]) -> date (row[0]); a later duplicate of the same
            # price overwrites the earlier date, as in the original code.
            stock_max[row[2]] = row[0]
    # BUG FIX: prices are strings, so a plain max() compared them
    # lexicographically ("9.5" > "10.0"). Compare numerically instead.
    stock_price = max(stock_max, key=float)
    stock_max_price_date = stock_max[stock_price]
    with open('/tmp/stocks/max_stock', 'w') as f:
        f.write(stock_max_price_date + ' -> ' + stock_price)
# Default arguments applied to every task in the DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2017, 5, 30),
    'email': ['mainl#domain.io'],  # NOTE(review): '#' looks like a mangled '@' - confirm
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# BUG FIX: 'catchup' is a DAG-level parameter, not a task default; inside
# default_args it is silently ignored. Pass it to the DAG constructor.
dag = DAG(
    'stocks_d',
    default_args=default_args,
    schedule_interval=timedelta(minutes=5),
    catchup=False,
)

task_get_stocks = PythonOperator(
    task_id='get_stocks',
    python_callable=get_stock_data,
    dag=dag,
)
task_get_max_share = PythonOperator(
    task_id='get_max_share',
    python_callable=get_max_share,
    dag=dag,
)
# get_max_share must run only after get_stocks has produced the input file.
task_get_max_share.set_upstream(task_get_stocks)
Any ideas why that happens?
$ airflow run stocks_d get_max_share
The above command runs only the get_max_share task, not the task it depends on.
If you want the whole DAG to run, try the command below:
$ airflow trigger_dag stocks_d
Related
I have a requirement to copy S3 files to Redshift using the COPY command. I am a bit new to Airflow and am having issues. Can someone correct the code below? Can I call rs.execute() like that?
Error:
op.execute()
TypeError: execute() missing 1 required positional argument: 'context'
code:
import os
from airflow import DAG
from airflow.hooks.S3_hook import S3Hook
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer
# Default arguments applied to every task in the DAG.
default_args = {
    'owner': 'gra',
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 13),
    'email': ['ss.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    # BUG FIX: the cron preset is '@daily' ('#daily' is invalid).
    # NOTE(review): schedule_interval is a DAG-level parameter, not a task
    # default - it is ignored here; pass it to DAG() instead.
    'schedule_interval': '@daily',
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
}
def job1():
    """Placeholder first task: just log that the pipeline has started."""
    print('First Job to start')
def s3_redshift(**kwargs):
    """Copy a CSV file from S3 into the Redshift table test.dept.

    Instantiates an S3ToRedshiftTransfer operator and runs it manually from
    inside a PythonOperator callable.
    """
    rs = S3ToRedshiftTransfer(redshift_conn_id='12as',
                              aws_conn_id='gt_read',
                              schema='test',
                              table='dept',
                              s3_bucket="gng-test",
                              s3_key="copt.csv",
                              task_id="copy_redshift"
                              # copy_options=copy_options_,
                              )
    # BUG FIX: execute() requires the task context; calling rs.execute() with
    # no argument raises "TypeError: execute() missing 1 required positional
    # argument: 'context'". The wrapping PythonOperator is created with
    # provide_context=True, so the context arrives in **kwargs.
    rs.execute(context=kwargs)
# Wrap s3_redshift in a PythonOperator; provide_context=True passes the task
# context into the callable's **kwargs.
# NOTE(review): neither `dag` nor `app_start` is defined in this snippet -
# both must be created elsewhere for this to run.
copy_redshift=PythonOperator(task_id='copy_redshift', python_callable=s3_redshift,provide_context=True, dag=dag)
app_start >> copy_redshift
I was able to use boto3 to execute the copy from S3 to Redshift. S3ToRedshiftTransfer can be used to do the same.
# airflow related
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
# other packages
from datetime import datetime
from datetime import timedelta
# from airflow.hooks import PostgresHook
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer
#from airflow.providers.amazon.aws.transfers.s3_to_redshift import S3ToRedshiftOperator
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from airflow.operators import SimpleHttpOperator, HttpSensor, BashOperator, EmailOperator, S3KeySensor
import boto3
# Default arguments applied to every task in the DAG.
default_args = {
    'owner': 'grit_delta',
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 13),
    'email': ['sa.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    # BUG FIX: the cron preset is '@daily' ('#daily' is invalid).
    # NOTE(review): schedule_interval is ignored inside default_args; the DAG
    # below already sets its own schedule_interval.
    'schedule_interval': '@daily',
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
}
# NOTE(review): schedule_interval=timedelta(1) means every 1 day.
dag=DAG(dag_id='veritas_test',default_args=default_args,schedule_interval=timedelta(1))

def job1():
    # Placeholder start task: just log that the pipeline has started.
    print('First Job to start')

# Wait for a matching key to appear in S3 before starting the pipeline.
file_sensor = S3KeySensor(task_id = 's3_key_sensor_task',
                          s3_conn_id='_read',  # NOTE(review): connection id looks truncated - verify
                          poke_interval=120,  # re-check every 2 minutes
                          timeout=18*60*60,  # give up after 18 hours
                          bucket_key = "data/test.*",
                          bucket_name = "g-test",
                          wildcard_match = True,  # bucket_key is a Unix wildcard pattern
                          dag = dag
                          )

app_start=PythonOperator(task_id='app_start', python_callable=job1, dag=dag)
def s3_redshift(**kwargs):
    """Run a Redshift COPY from S3 via the boto3 redshift-data API.

    Issues the COPY statement asynchronously through execute_statement and
    logs the raw response; returns "OK" on submission.
    """
    client = boto3.client('redshift-data')
    dept_key = 's3://airflow-dev/code/gta/dag/dept.csv'
    copy_sql = "copy test.dept from 's3://airflow-grole' CSV ;"
    # copy_sql = "insert into test.dept values('d1221',100)"
    print(copy_sql)
    response = client.execute_statement(
        ClusterIdentifier="opes",
        Database="ee",
        DbUser="aa",
        Sql=copy_sql,
        # Sql="CREATE TABLE IF NOT EXISTS test.dept (title varchar(10), rating int);"
    )
    print(response)
    print(" completed")
    return "OK"
# Wrap s3_redshift in a PythonOperator; provide_context=True passes the task
# context into the callable's **kwargs.
copy_redshift=PythonOperator(task_id='copy_redshift', python_callable=s3_redshift,provide_context=True, dag=dag)
# Pipeline order: wait for the S3 key, log the start, then run the COPY.
file_sensor >>app_start >> copy_redshift
You have not defined any DAG and you don't use operators like that. I would recommend you to read a little bit more about how to use Airflow. Anyway, the code should be:
import os
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer
# Default arguments applied to every task in the DAG.
default_args = dict(
    owner='gra',
    depends_on_past=False,
    start_date=datetime(2020, 12, 13),
    email=['ss.com'],
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(seconds=5),
)
# BUG FIX: the cron preset is '@daily', not '#daily'.
with DAG('dag_name', schedule_interval='@daily', default_args=default_args) as dag:
    # Inside the DAG context manager the operator is attached to `dag`
    # automatically and is run by the scheduler - execute() is never called
    # by hand.
    rs = S3ToRedshiftTransfer(redshift_conn_id='12as',
                              aws_conn_id='gt_read',
                              schema='test',
                              table='dept',
                              s3_bucket="gng-test",
                              s3_key="copt.csv",
                              task_id="copy_redshift"
                              )
    # NOTE(review): `app_start` is not defined in this snippet - it must be
    # created (e.g. a PythonOperator) before this dependency is set.
    app_start >> rs
I have an airflow task which scheduled to run every 3 minutes.
Sometimes the task takes longer than 3 minutes, and the next scheduled run starts (or is queued) even though the previous one is still running.
Is there a way to define the dag to NOT even queue the task if it is already in run?
# airflow related
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators import MsSqlOperator
# other packages
from datetime import datetime
from datetime import timedelta
# Default arguments applied to every task in the DAG.
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    start_date=datetime(2020, 7, 22, 15, 0, 0),
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(seconds=5),
)
# FIX for the question asked: max_active_runs=1 caps concurrent DagRuns at
# one, so a run that overshoots its 3-minute interval prevents the next run
# from starting (it stays queued) until the current one finishes.
dag = DAG(
    dag_id='sales',
    description='Run sales',
    # every 3 minutes, 04:00-17:57, Sunday through Friday
    schedule_interval = '*/3 4,5,6,7,8,9,10,11,12,13,14,15,16,17 * * 0-5',
    default_args=default_args,
    max_active_runs=1,
    catchup = False)

# ETL step: run the legacy Python 2 sales script.
job1 = BashOperator(
    task_id='sales',
    bash_command='python2 /home/manager/ETL/sales.py',
    dag = dag)

# Kick off the SQL Server Agent job that refreshes the tabular model.
job2 = MsSqlOperator(
    task_id='refresh_tabular',
    mssql_conn_id='mssql_globrands',
    sql="USE msdb ; EXEC dbo.sp_start_job N'refresh Management-sales' ; ",
    dag = dag)

# Refresh the model only after the ETL succeeds.
job1>>job2
I want to create a task that will update column rows and send an email for every line in a data table. At the moment I have created a task that downloads the data from the main table, but I cannot create tasks for every line in the temp data table. Could you tell me what I am doing wrong and how I can generate and run tasks in a loop?
from datetime import datetime, timedelta
import airflow
from airflow import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_get_data import BigQueryGetDataOperator
from airflow.contrib.operators.bigquery_check_operator import BigQueryValueCheckOperator
from airflow.operators import PythonOperator
from airflow.operators.python_operator import PythonOperator
# Default arguments applied to every task in the DAG.
default_args = {
    'owner': 'cmap',
    'depends_on_past': False,
    # days_ago(0) = midnight today; acceptable for a manually-run '@once' DAG.
    'start_date': airflow.utils.dates.days_ago(0),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}
# BUG FIX: the schedule preset is '@once' ('#once' is invalid).
with DAG('dq_bigquery_test',
         max_active_runs=1,
         schedule_interval='@once',
         catchup=False,
         default_args=default_args) as dag:

    query = "SELECT * from `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` where MailRequired = false"
    insert = "INSERT into dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc (DataTimeStamp, Robot, Status) Values (CURRENT_TIMESTAMP(), 'TestRobot', 'Test')"

    # Run the query and materialise its result into a staging table.
    my_bq_task = BigQueryOperator(
        task_id='query_exc_on_teste',
        sql=query,
        write_disposition='WRITE_TRUNCATE',
        create_disposition='CREATE_IF_NEEDED',
        bigquery_conn_id='google_cloud_dbce_bi_prod',
        use_legacy_sql=False,
        destination_dataset_table='dev_dataquality.testTable')

    # Pull up to 100 rows of the staging table into XCom as a Python list.
    get_data = BigQueryGetDataOperator(
        task_id='get_data_from_query',
        project_id='dbce-bi-prod-e6fd',
        dataset_id='dev_dataquality',
        table_id='testTable',
        max_results='100',
        selected_fields='Robot,Status,MailRequired',
        bigquery_conn_id='google_cloud_dbce_bi_prod'
        )

    def process_data_from_bq(**kwargs):
        # NOTE(review): this is why the per-row tasks never run - operators
        # instantiated here, while the DAG is already executing, are never
        # registered with the scheduler. Airflow task graphs must be built at
        # DAG parse time; to act per row at runtime, run the UPDATE directly
        # (e.g. via a BigQuery hook/client) instead of creating operators.
        ti = kwargs['ti']
        update_column = []
        bq_data = ti.xcom_pull(task_ids='get_data_from_query')
        print(bq_data)
        # bq_data is the XCom'd query result as a Python list of rows.
        for index, i in enumerate(bq_data):
            update_query = "UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` SET MailSent = True WHERE Robot = '{}'".format(i[0])
            print(update_query)
            update_column.append(BigQueryOperator(
                task_id='update_column_{}'.format(index),
                sql=update_query,
                write_disposition='WRITE_EMPTY',
                create_disposition='CREATE_IF_NEEDED',
                bigquery_conn_id='google_cloud_dbce_bi_prod',
                use_legacy_sql=False,
                dag=dag
                ))
            # chain each UPDATE operator after the previous one
            if index not in [0]:
                update_column[index-1] >> update_column[index]

    process_data = PythonOperator(
        task_id='process_data_from_bq',
        python_callable=process_data_from_bq,
        provide_context=True
        )

    my_bq_task >> get_data >> process_data
Thank you for your help!
task a > task b > task c
If C fails I want to retry A. Is this possible? There are a few other tickets which involve subdags, but I would like to just be able to clear A.
I'm hoping to use on_retry_callback in task C but I don't know how to call task A.
There is another question which does this in a subdag, but I am not using subdags.
I'm trying to do this, but it doesn't seem to work:
def callback_for_failures(context):
    """Retry callback: try to clear the first upstream task of the failing task."""
    print("*** retrying ***")
    upstream = context['task'].upstream_list
    if upstream:
        # NOTE(review): presumably this expects an instance-level clear();
        # confirm your Airflow version exposes it on BaseOperator - this may
        # be why it "doesn't seem to work".
        upstream[0].clear()
As other comments mentioned, I would use caution to make sure you aren't getting into an endless loop of clearing/retries. But you can call a bash command as part of your on_failure_callback and then specify which tasks you want to clear, and if you want downstream/upstream tasks cleared etc.
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta
def clear_upstream_task(context):
    """Failure callback: shell out to `airflow tasks clear` to re-run t1.

    Builds a one-off BashOperator for the CLI call and executes it
    immediately with the failing task's context; -d/-y clear downstream
    tasks without prompting.
    """
    exec_date = context.get("execution_date")
    cmd = f'airflow tasks clear -s {exec_date} -t t1 -d -y clear_upstream_task'
    clearer = BashOperator(task_id='clear_tasks', bash_command=cmd)
    return clearer.execute(context=context)
# Default settings applied to all tasks
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(seconds=5),
)
# Demo DAG: t3 always fails (exit 123) and its failure callback clears t1,
# which re-runs t1 and everything downstream of it.
with DAG('clear_upstream_task',
         start_date=datetime(2021, 1, 1),
         max_active_runs=3,
         schedule_interval=timedelta(minutes=5),
         default_args=default_args,
         catchup=False
         ) as dag:

    t0 = DummyOperator(
        task_id='t0'
    )
    t1 = DummyOperator(
        task_id='t1'
    )
    t2 = DummyOperator(
        task_id='t2'
    )
    # exit 123 forces a failure so on_failure_callback fires.
    t3 = BashOperator(
        task_id='t3',
        bash_command='exit 123',
        on_failure_callback=clear_upstream_task
    )
    t0 >> t1 >> t2 >> t3
I have define the external_sensor like that:
# Wait for task 'Dataframe_Windows_test' in the external DAG 'book_data' to
# finish for the same logical date. execution_delta=0 means both DAGs must
# share identical schedule_interval and start_date for the sensor to find a
# matching run - otherwise it pokes forever.
external_sensor = ExternalTaskSensor(task_id='ext_sensor_task',
                                     execution_delta=timedelta(minutes=0),
                                     external_dag_id='book_data',
                                     external_task_id='Dataframe_Windows_test',
                                     dag = dag)
The another task is defined like this:
dl_processing_windows = DL_Processing(task_id='dl_processing_windows',
df_dataset_location=dl_config.WINDOWS_DATASET,
....
While in the airflow UI:
I got the error:
Argument ['task_id'] is required
I have two problems:
1. Why does such error exist?
2. Why does it not work?
The attachment:
# Default arguments applied to every task in the DAG.
default_args = dict(
    owner='Newt',
    retries=2,
    retry_delay=timedelta(seconds=30),
    depends_on_past=False,
)
# Hourly DAG. NOTE(review): `dag_id` here is a variable defined elsewhere,
# not a string literal - make sure it is assigned before this point.
dag = DAG(
    dag_id,
    start_date = datetime(2019, 11, 20),
    description= 'xxxx',
    default_args = default_args,
    schedule_interval = timedelta(hours=1),
)
The parameters for dag are the same for both dags!
I fixed it.
Normally, the start_date is different from the schedule_interval. I set the start_date of both dags to the same time, using the current date.
After the first upstream dag finished, the new dag began to work!