I have a simple Airflow workflow composed of two tasks. One downloads a CSV file containing stock data. The other extracts the maximum stock price and writes the result to another file.
If I run the first task and then the second, everything works fine; however, if I execute `airflow run stocks_d get_max_share`, it fails to meet the dependency.
import csv
from datetime import datetime
from datetime import timedelta
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
import requests
def get_stock_data():
    """Download historical stock data as CSV and save it to a local file.

    Fetches APL price history from the quotemedia endpoint and writes the raw
    CSV text to /tmp/stocks/airflow_stock_data.txt, which the downstream
    get_max_share task reads.

    Raises:
        requests.RequestException: if the HTTP request fails (propagated so
            Airflow marks the task failed and applies the retry policy; the
            original try/except that only re-raised was a no-op).
    """
    import os  # local import so the snippet stays self-contained

    url = "https://app.quotemedia.com/quotetools/getHistoryDownload.csv?&webmasterId=501&startDay=02&startMonth=02&startYear=2002&endDay=02&endMonth=07&endYear=2009&isRanged=false&symbol=APL"
    r = requests.get(url)
    # Treat HTTP error codes (4xx/5xx) as failures instead of silently
    # writing an error page to the data file.
    r.raise_for_status()
    # Make sure the target directory exists before writing.
    os.makedirs('/tmp/stocks', exist_ok=True)
    with open('/tmp/stocks/airflow_stock_data.txt', 'w') as f:
        f.write(r.text)
def get_max_share():
    """Find the highest stock price in the downloaded CSV and persist it.

    Reads /tmp/stocks/airflow_stock_data.txt (written by get_stock_data),
    maps each price (column 2) to its date (column 0), and writes
    "<date> -> <price>" for the maximum price to /tmp/stocks/max_stock.
    """
    stock_max = {}
    with open('/tmp/stocks/airflow_stock_data.txt', 'r') as f:
        stock_reader = csv.reader(f)
        next(stock_reader, None)  # skip the header row
        for row in stock_reader:
            # price (row[2]) -> date (row[0]); a later duplicate of the same
            # price overwrites the earlier date, as in the original code.
            stock_max[row[2]] = row[0]
    # BUG FIX: prices are strings, so a plain max() compared them
    # lexicographically ("9.5" > "10.0"). Compare numerically instead.
    stock_price = max(stock_max, key=float)
    stock_max_price_date = stock_max[stock_price]
    with open('/tmp/stocks/max_stock', 'w') as f:
        f.write(stock_max_price_date + ' -> ' + stock_price)
# Default arguments applied to every task in the DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2017, 5, 30),
    'email': ['mainl#domain.io'],  # NOTE(review): '#' looks like a mangled '@' - confirm
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# BUG FIX: 'catchup' is a DAG-level parameter, not a task default; inside
# default_args it is silently ignored. Pass it to the DAG constructor.
dag = DAG(
    'stocks_d',
    default_args=default_args,
    schedule_interval=timedelta(minutes=5),
    catchup=False,
)

task_get_stocks = PythonOperator(
    task_id='get_stocks',
    python_callable=get_stock_data,
    dag=dag,
)
task_get_max_share = PythonOperator(
    task_id='get_max_share',
    python_callable=get_max_share,
    dag=dag,
)
# get_max_share must run only after get_stocks has produced the input file.
task_get_max_share.set_upstream(task_get_stocks)
Any ideas why that happens?
$ airflow run stocks_d get_max_share
The above command runs only the get_max_share task, not the task it depends on.
If you want the whole DAG to run, try the command below:
$ airflow trigger_dag stocks_d
Related
I have a requirement to copy S3 files to Redshift using the COPY command. I am a bit new to Airflow and am having issues. Can someone correct the code below? Can I call rs.execute() like that?
Error:
op.execute()
TypeError: execute() missing 1 required positional argument: 'context'
code:
import os
from airflow import DAG
from airflow.hooks.S3_hook import S3Hook
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer
# Default arguments applied to every task in the DAG.
default_args = {
    'owner': 'gra',
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 13),
    'email': ['ss.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    # BUG FIX: the cron preset is '@daily' ('#daily' is invalid).
    # NOTE(review): schedule_interval is a DAG-level parameter, not a task
    # default - it is ignored here; pass it to DAG() instead.
    'schedule_interval': '@daily',
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
}
def job1():
    """Placeholder first task: just log that the pipeline has started."""
    print('First Job to start')
def s3_redshift(**kwargs):
    """Copy a CSV file from S3 into the Redshift table test.dept.

    Instantiates an S3ToRedshiftTransfer operator and runs it manually from
    inside a PythonOperator callable.
    """
    rs = S3ToRedshiftTransfer(redshift_conn_id='12as',
                              aws_conn_id='gt_read',
                              schema='test',
                              table='dept',
                              s3_bucket="gng-test",
                              s3_key="copt.csv",
                              task_id="copy_redshift"
                              # copy_options=copy_options_,
                              )
    # BUG FIX: execute() requires the task context; calling rs.execute() with
    # no argument raises "TypeError: execute() missing 1 required positional
    # argument: 'context'". The wrapping PythonOperator is created with
    # provide_context=True, so the context arrives in **kwargs.
    rs.execute(context=kwargs)
# Wrap s3_redshift in a PythonOperator; provide_context=True passes the task
# context into the callable's **kwargs.
# NOTE(review): neither `dag` nor `app_start` is defined in this snippet -
# both must be created elsewhere for this to run.
copy_redshift=PythonOperator(task_id='copy_redshift', python_callable=s3_redshift,provide_context=True, dag=dag)
app_start >> copy_redshift
I was able to use boto3 to execute the copy from S3 to Redshift. S3ToRedshiftTransfer can be used to do the same.
# airflow related
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
# other packages
from datetime import datetime
from datetime import timedelta
# from airflow.hooks import PostgresHook
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer
#from airflow.providers.amazon.aws.transfers.s3_to_redshift import S3ToRedshiftOperator
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from airflow.operators import SimpleHttpOperator, HttpSensor, BashOperator, EmailOperator, S3KeySensor
import boto3
# Default arguments applied to every task in the DAG.
default_args = {
    'owner': 'grit_delta',
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 13),
    'email': ['sa.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    # BUG FIX: the cron preset is '@daily' ('#daily' is invalid).
    # NOTE(review): schedule_interval is ignored inside default_args; the DAG
    # below already sets its own schedule_interval.
    'schedule_interval': '@daily',
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
}
# NOTE(review): schedule_interval=timedelta(1) means every 1 day.
dag=DAG(dag_id='veritas_test',default_args=default_args,schedule_interval=timedelta(1))

def job1():
    # Placeholder start task: just log that the pipeline has started.
    print('First Job to start')

# Wait for a matching key to appear in S3 before starting the pipeline.
file_sensor = S3KeySensor(task_id = 's3_key_sensor_task',
                          s3_conn_id='_read',  # NOTE(review): connection id looks truncated - verify
                          poke_interval=120,  # re-check every 2 minutes
                          timeout=18*60*60,  # give up after 18 hours
                          bucket_key = "data/test.*",
                          bucket_name = "g-test",
                          wildcard_match = True,  # bucket_key is a Unix wildcard pattern
                          dag = dag
                          )

app_start=PythonOperator(task_id='app_start', python_callable=job1, dag=dag)
def s3_redshift(**kwargs):
    """Run a Redshift COPY from S3 via the boto3 redshift-data API.

    Issues the COPY statement asynchronously through execute_statement and
    logs the raw response; returns "OK" on submission.
    """
    client = boto3.client('redshift-data')
    dept_key = 's3://airflow-dev/code/gta/dag/dept.csv'
    copy_sql = "copy test.dept from 's3://airflow-grole' CSV ;"
    # copy_sql = "insert into test.dept values('d1221',100)"
    print(copy_sql)
    response = client.execute_statement(
        ClusterIdentifier="opes",
        Database="ee",
        DbUser="aa",
        Sql=copy_sql,
        # Sql="CREATE TABLE IF NOT EXISTS test.dept (title varchar(10), rating int);"
    )
    print(response)
    print(" completed")
    return "OK"
# Wrap s3_redshift in a PythonOperator; provide_context=True passes the task
# context into the callable's **kwargs.
copy_redshift=PythonOperator(task_id='copy_redshift', python_callable=s3_redshift,provide_context=True, dag=dag)
# Pipeline order: wait for the S3 key, log the start, then run the COPY.
file_sensor >>app_start >> copy_redshift
You have not defined any DAG and you don't use operators like that. I would recommend you to read a little bit more about how to use Airflow. Anyway, the code should be:
import os
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer
# Default arguments applied to every task in the DAG.
default_args = dict(
    owner='gra',
    depends_on_past=False,
    start_date=datetime(2020, 12, 13),
    email=['ss.com'],
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(seconds=5),
)
# BUG FIX: the cron preset is '@daily', not '#daily'.
with DAG('dag_name', schedule_interval='@daily', default_args=default_args) as dag:
    # Inside the DAG context manager the operator is attached to `dag`
    # automatically and is run by the scheduler - execute() is never called
    # by hand.
    rs = S3ToRedshiftTransfer(redshift_conn_id='12as',
                              aws_conn_id='gt_read',
                              schema='test',
                              table='dept',
                              s3_bucket="gng-test",
                              s3_key="copt.csv",
                              task_id="copy_redshift"
                              )
    # NOTE(review): `app_start` is not defined in this snippet - it must be
    # created (e.g. a PythonOperator) before this dependency is set.
    app_start >> rs
I have an airflow task which scheduled to run every 3 minutes.
Sometimes the task takes longer than 3 minutes, and the next scheduled run starts (or is queued) even though the previous one is still running.
Is there a way to define the dag to NOT even queue the task if it is already in run?
# airflow related
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators import MsSqlOperator
# other packages
from datetime import datetime
from datetime import timedelta
# Default arguments applied to every task in the DAG.
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    start_date=datetime(2020, 7, 22, 15, 0, 0),
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(seconds=5),
)
# FIX for the question asked: max_active_runs=1 caps concurrent DagRuns at
# one, so a run that overshoots its 3-minute interval prevents the next run
# from starting (it stays queued) until the current one finishes.
dag = DAG(
    dag_id='sales',
    description='Run sales',
    # every 3 minutes, 04:00-17:57, Sunday through Friday
    schedule_interval = '*/3 4,5,6,7,8,9,10,11,12,13,14,15,16,17 * * 0-5',
    default_args=default_args,
    max_active_runs=1,
    catchup = False)

# ETL step: run the legacy Python 2 sales script.
job1 = BashOperator(
    task_id='sales',
    bash_command='python2 /home/manager/ETL/sales.py',
    dag = dag)

# Kick off the SQL Server Agent job that refreshes the tabular model.
job2 = MsSqlOperator(
    task_id='refresh_tabular',
    mssql_conn_id='mssql_globrands',
    sql="USE msdb ; EXEC dbo.sp_start_job N'refresh Management-sales' ; ",
    dag = dag)

# Refresh the model only after the ETL succeeds.
job1>>job2
I want to create a task that will update column rows and send an email for every line in a data table. At the moment I have created a task that downloads the data from the main table, but I cannot create tasks for every line in the temp data table. Could you tell me what I am doing wrong and how I can generate and run tasks in a loop?
from datetime import datetime, timedelta
import airflow
from airflow import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_get_data import BigQueryGetDataOperator
from airflow.contrib.operators.bigquery_check_operator import BigQueryValueCheckOperator
from airflow.operators import PythonOperator
from airflow.operators.python_operator import PythonOperator
# Default arguments applied to every task in the DAG.
default_args = {
    'owner': 'cmap',
    'depends_on_past': False,
    # days_ago(0) = midnight today; acceptable for a manually-run '@once' DAG.
    'start_date': airflow.utils.dates.days_ago(0),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}
# BUG FIX: the schedule preset is '@once' ('#once' is invalid).
with DAG('dq_bigquery_test',
         max_active_runs=1,
         schedule_interval='@once',
         catchup=False,
         default_args=default_args) as dag:

    query = "SELECT * from `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` where MailRequired = false"
    insert = "INSERT into dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc (DataTimeStamp, Robot, Status) Values (CURRENT_TIMESTAMP(), 'TestRobot', 'Test')"

    # Run the query and materialise its result into a staging table.
    my_bq_task = BigQueryOperator(
        task_id='query_exc_on_teste',
        sql=query,
        write_disposition='WRITE_TRUNCATE',
        create_disposition='CREATE_IF_NEEDED',
        bigquery_conn_id='google_cloud_dbce_bi_prod',
        use_legacy_sql=False,
        destination_dataset_table='dev_dataquality.testTable')

    # Pull up to 100 rows of the staging table into XCom as a Python list.
    get_data = BigQueryGetDataOperator(
        task_id='get_data_from_query',
        project_id='dbce-bi-prod-e6fd',
        dataset_id='dev_dataquality',
        table_id='testTable',
        max_results='100',
        selected_fields='Robot,Status,MailRequired',
        bigquery_conn_id='google_cloud_dbce_bi_prod'
        )

    def process_data_from_bq(**kwargs):
        # NOTE(review): this is why the per-row tasks never run - operators
        # instantiated here, while the DAG is already executing, are never
        # registered with the scheduler. Airflow task graphs must be built at
        # DAG parse time; to act per row at runtime, run the UPDATE directly
        # (e.g. via a BigQuery hook/client) instead of creating operators.
        ti = kwargs['ti']
        update_column = []
        bq_data = ti.xcom_pull(task_ids='get_data_from_query')
        print(bq_data)
        # bq_data is the XCom'd query result as a Python list of rows.
        for index, i in enumerate(bq_data):
            update_query = "UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` SET MailSent = True WHERE Robot = '{}'".format(i[0])
            print(update_query)
            update_column.append(BigQueryOperator(
                task_id='update_column_{}'.format(index),
                sql=update_query,
                write_disposition='WRITE_EMPTY',
                create_disposition='CREATE_IF_NEEDED',
                bigquery_conn_id='google_cloud_dbce_bi_prod',
                use_legacy_sql=False,
                dag=dag
                ))
            # chain each UPDATE operator after the previous one
            if index not in [0]:
                update_column[index-1] >> update_column[index]

    process_data = PythonOperator(
        task_id='process_data_from_bq',
        python_callable=process_data_from_bq,
        provide_context=True
        )

    my_bq_task >> get_data >> process_data
Thank you for your help!
task a > task b > task c
If C fails I want to retry A. Is this possible? There are a few other tickets which involve subdags, but I would like to just be able to clear A.
I'm hoping to use on_retry_callback in task C but I don't know how to call task A.
There is another question which does this in a subdag, but I am not using subdags.
I'm trying to do this, but it doesn't seem to work:
def callback_for_failures(context):
    """Retry callback: try to clear the first upstream task of the failing task."""
    print("*** retrying ***")
    upstream = context['task'].upstream_list
    if upstream:
        # NOTE(review): presumably this expects an instance-level clear();
        # confirm your Airflow version exposes it on BaseOperator - this may
        # be why it "doesn't seem to work".
        upstream[0].clear()
As other comments mentioned, I would use caution to make sure you aren't getting into an endless loop of clearing/retries. But you can call a bash command as part of your on_failure_callback and then specify which tasks you want to clear, and if you want downstream/upstream tasks cleared etc.
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta
def clear_upstream_task(context):
    """Failure callback: shell out to `airflow tasks clear` to re-run t1.

    Builds a one-off BashOperator for the CLI call and executes it
    immediately with the failing task's context; -d/-y clear downstream
    tasks without prompting.
    """
    exec_date = context.get("execution_date")
    cmd = f'airflow tasks clear -s {exec_date} -t t1 -d -y clear_upstream_task'
    clearer = BashOperator(task_id='clear_tasks', bash_command=cmd)
    return clearer.execute(context=context)
# Default settings applied to all tasks
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(seconds=5),
)
# Demo DAG: t3 always fails (exit 123) and its failure callback clears t1,
# which re-runs t1 and everything downstream of it.
with DAG('clear_upstream_task',
         start_date=datetime(2021, 1, 1),
         max_active_runs=3,
         schedule_interval=timedelta(minutes=5),
         default_args=default_args,
         catchup=False
         ) as dag:

    t0 = DummyOperator(
        task_id='t0'
    )
    t1 = DummyOperator(
        task_id='t1'
    )
    t2 = DummyOperator(
        task_id='t2'
    )
    # exit 123 forces a failure so on_failure_callback fires.
    t3 = BashOperator(
        task_id='t3',
        bash_command='exit 123',
        on_failure_callback=clear_upstream_task
    )
    t0 >> t1 >> t2 >> t3
I have define the external_sensor like that:
# Wait for task 'Dataframe_Windows_test' in the external DAG 'book_data' to
# finish for the same logical date. execution_delta=0 means both DAGs must
# share identical schedule_interval and start_date for the sensor to find a
# matching run - otherwise it pokes forever.
external_sensor = ExternalTaskSensor(task_id='ext_sensor_task',
                                     execution_delta=timedelta(minutes=0),
                                     external_dag_id='book_data',
                                     external_task_id='Dataframe_Windows_test',
                                     dag = dag)
The another task is defined like this:
dl_processing_windows = DL_Processing(task_id='dl_processing_windows',
df_dataset_location=dl_config.WINDOWS_DATASET,
....
While in the airflow UI:
I got the error:
Argument ['task_id'] is required
I have two problems:
1. Why does such error exist?
2. Why does it not work?
The attachment:
# Default arguments applied to every task in the DAG.
default_args = dict(
    owner='Newt',
    retries=2,
    retry_delay=timedelta(seconds=30),
    depends_on_past=False,
)
# Hourly DAG. NOTE(review): `dag_id` here is a variable defined elsewhere,
# not a string literal - make sure it is assigned before this point.
dag = DAG(
    dag_id,
    start_date = datetime(2019, 11, 20),
    description= 'xxxx',
    default_args = default_args,
    schedule_interval = timedelta(hours=1),
)
The parameters for dag are the same for both dags!
I fixed it.
Normally, the start_date is different from the schedule_interval. I set the start_date of both dags to the same time, using the current date.
After the first upstream dag finished, the new dag began to work!