I am trying to create a DAG for running a query in Hive using the HiveOperator. The code is below:
import datetime as dt
from airflow.models import DAG
from airflow.operators.hive_operator import HiveOperator
default_args = {
    'owner': 'airflow',
    'start_date': dt.datetime(2020, 3, 24),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
}
hql_query = """USE testdb;
CREATE TABLE airflow-test-table LIKE test_table;"""
dag = DAG(
    dag_id='load-hive',
    default_args=default_args,
    schedule_interval='0 * * * *'
)
hive-copy = HiveOperator(
    task_id="hive-copy",
    hql=hql_query,
    hive_cli_conn_id="dime_hive_cli_default",
    dag=load-hive,
)
hive-copy
I am getting a syntax error:
SyntaxError: can't assign to operator
at the hive-copy = HiveOperator( line. I am not sure what's going wrong.
Do not use the dash '-' in hive-copy; rename it to hive_copy. A dash is not allowed in a Python identifier, so Python parses hive-copy as the subtraction hive - copy, which cannot be assigned to. The same applies to dag=load-hive: there is no load-hive variable, your DAG object is bound to the name dag, so pass dag=dag.
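A corrected version of the task definition, assuming the rest of your file stays as posted (note that Hive also rejects bare dashes in table names, so the HQL below uses the hypothetical name airflow_test_table with underscores):

hql_query = """USE testdb;
CREATE TABLE airflow_test_table LIKE test_table;"""

hive_copy = HiveOperator(              # underscores make a valid Python identifier
    task_id="hive-copy",               # a dash is fine inside the task_id string
    hql=hql_query,
    hive_cli_conn_id="dime_hive_cli_default",
    dag=dag,                           # the DAG object is bound to the name `dag`
)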
I have an Airflow task which is scheduled to run every 3 minutes.
Sometimes the task takes longer than 3 minutes, and the next scheduled run starts (or is queued) even though the previous one is still running.
Is there a way to define the DAG so that it does NOT even queue the task if it is already running?
# airflow related
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.mssql_operator import MsSqlOperator
# other packages
from datetime import datetime
from datetime import timedelta
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 7, 22, 15, 0, 0),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=5)
}
dag = DAG(
    dag_id='sales',
    description='Run sales',
    schedule_interval='*/3 4,5,6,7,8,9,10,11,12,13,14,15,16,17 * * 0-5',
    default_args=default_args,
    catchup=False)

job1 = BashOperator(
    task_id='sales',
    bash_command='python2 /home/manager/ETL/sales.py',
    dag=dag)

job2 = MsSqlOperator(
    task_id='refresh_tabular',
    mssql_conn_id='mssql_globrands',
    sql="USE msdb ; EXEC dbo.sp_start_job N'refresh Management-sales' ; ",
    dag=dag)

job1 >> job2
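One way to limit this, as a minimal sketch: max_active_runs=1 on the DAG allows only one DAG run to execute at a time, and task_concurrency=1 on a task caps concurrent instances of that task (note that later runs are still created and wait rather than being skipped outright):

dag = DAG(
    dag_id='sales',
    description='Run sales',
    schedule_interval='*/3 4,5,6,7,8,9,10,11,12,13,14,15,16,17 * * 0-5',
    default_args=default_args,
    max_active_runs=1,       # at most one DAG run executing at any time
    catchup=False)

job1 = BashOperator(
    task_id='sales',
    bash_command='python2 /home/manager/ETL/sales.py',
    task_concurrency=1,      # at most one running instance of this task
    dag=dag)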
I want to create a task that updates rows in a data table and sends a mail for every line in it. At the moment I have created a task that downloads the data from the main table, but I cannot create tasks for every line in the temp data table. Could you tell me what I am doing wrong, and how I can generate and run tasks in a loop?
from datetime import datetime, timedelta
import airflow
from airflow import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_get_data import BigQueryGetDataOperator
from airflow.contrib.operators.bigquery_check_operator import BigQueryValueCheckOperator
from airflow.operators.python_operator import PythonOperator
default_args = {
    'owner': 'cmap',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}
with DAG('dq_bigquery_test',
         max_active_runs=1,
         schedule_interval='@once',
         catchup=False,
         default_args=default_args) as dag:

    query = "SELECT * from `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` where MailRequired = false"
    insert = "INSERT into dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc (DataTimeStamp, Robot, Status) Values (CURRENT_TIMESTAMP(), 'TestRobot', 'Test')"
    my_bq_task = BigQueryOperator(
        task_id='query_exc_on_teste',
        sql=query,
        write_disposition='WRITE_TRUNCATE',
        create_disposition='CREATE_IF_NEEDED',
        bigquery_conn_id='google_cloud_dbce_bi_prod',
        use_legacy_sql=False,
        destination_dataset_table='dev_dataquality.testTable')

    get_data = BigQueryGetDataOperator(
        task_id='get_data_from_query',
        project_id='dbce-bi-prod-e6fd',
        dataset_id='dev_dataquality',
        table_id='testTable',
        max_results='100',
        selected_fields='Robot,Status,MailRequired',
        bigquery_conn_id='google_cloud_dbce_bi_prod'
    )

    def process_data_from_bq(**kwargs):
        ti = kwargs['ti']
        update_column = []
        bq_data = ti.xcom_pull(task_ids='get_data_from_query')
        print(bq_data)
        # Now bq_data here would have your data in a Python list
        for index, i in enumerate(bq_data):
            update_query = "UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` SET MailSent = True WHERE Robot = '{}'".format(i[0])
            print(update_query)
            update_column.append(BigQueryOperator(
                task_id='update_column_{}'.format(index),
                sql=update_query,
                write_disposition='WRITE_EMPTY',
                create_disposition='CREATE_IF_NEEDED',
                bigquery_conn_id='google_cloud_dbce_bi_prod',
                use_legacy_sql=False,
                dag=dag
            ))
            if index not in [0]:
                update_column[index - 1] >> update_column[index]

    process_data = PythonOperator(
        task_id='process_data_from_bq',
        python_callable=process_data_from_bq,
        provide_context=True
    )

    my_bq_task >> get_data >> process_data
Thank you for your help!
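For what it's worth, the usual reason this pattern fails: the DAG's shape is fixed when the scheduler parses the file, so BigQueryOperator objects instantiated at runtime inside process_data_from_bq are never registered with the DAG or scheduled. One option is to run the update statements directly through a hook inside the callable; a minimal sketch, assuming Airflow 1.10's contrib BigQueryHook:

from airflow.contrib.hooks.bigquery_hook import BigQueryHook

def process_data_from_bq(**kwargs):
    ti = kwargs['ti']
    bq_data = ti.xcom_pull(task_ids='get_data_from_query')
    # Execute the updates here instead of creating operators at runtime.
    hook = BigQueryHook(bigquery_conn_id='google_cloud_dbce_bi_prod',
                        use_legacy_sql=False)
    cursor = hook.get_conn().cursor()
    for row in bq_data:
        update_query = ("UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` "
                        "SET MailSent = True WHERE Robot = '{}'".format(row[0]))
        cursor.execute(update_query)

If you really need one task per row, the loop has to run at parse time (module level), which only works when the row list is already available when the file is parsed.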
import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
#from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from airflow.models import Variable
from datetime import datetime, timedelta
from epsilon_spark_operator import EpsilonSparkOperator
#from merlin_spark_submit_operator import MerlinSparkSubmitOperator
#start=timedelta(hours=3)
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(2),
    # 'end_date': datetime(2019, 12, 12),
    # 'schedule_interval': '37 12 * * *'
    'email': ['ankit.maloo@inmobi.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
    'queue': 'default'
}

dag = DAG('epsilon_spark_test', default_args=default_args, schedule_interval='@once')
spark_submit_task = EpsilonSparkOperator(
    task_id='spark_submit_job',
    conn_id='epsilon_spark',
    application='abfs://***********.jar',
    java_class='com.inmobi.EpsilonTest',
    application_args=['10'],
    verbose=True,
    cluster_name="*****",
    azure_storage_conn_id="*****",
    keyvault_name='*****',
    keyvault_client_id_key='*****',
    keyvault_client_secret_key='*****',
    conf={'spark.executors': '30', 'spark.eventLog.enabled': 'false', 'spark.eventLog.dir': '/tmp', 'spark.shuffle.service.enabled': 'true'},
    dag=dag)
dag >> spark_submit_task
I am trying to test an Airflow cluster with a sample Spark job.
Above is my Python code.
When trying to deploy the DAG via curl:
curl -X POST -H 'Content-Type: multipart/form-data' -F 'dag_file=@/Users/ankit.maloo/dag_test.py' -F 'force=on' 'http://*.*.*.*:8080/admin/rest_api/api?api=deploy_dag'
it gives this error:
Broken DAG: [/root/airflow/dags/dag_test1.py] Argument ['execution_time'] is required.
Any idea?
I have defined the external_sensor like this:
external_sensor = ExternalTaskSensor(task_id='ext_sensor_task',
                                     execution_delta=timedelta(minutes=0),
                                     external_dag_id='book_data',
                                     external_task_id='Dataframe_Windows_test',
                                     dag=dag)
The other task is defined like this:
dl_processing_windows = DL_Processing(task_id='dl_processing_windows',
                                      df_dataset_location=dl_config.WINDOWS_DATASET,
                                      ....
Meanwhile, in the Airflow UI I got the error:
Argument ['task_id'] is required
I have two questions:
1. Why does this error occur?
2. Why does it not work?
For reference, the DAG arguments:
default_args = {
    'owner': 'Newt',
    'retries': 2,
    'retry_delay': timedelta(seconds=30),
    'depends_on_past': False,
}

dag = DAG(
    dag_id,
    start_date=datetime(2019, 11, 20),
    description='xxxx',
    default_args=default_args,
    schedule_interval=timedelta(hours=1),
)
The parameters for dag are the same for both DAGs!
I fixed it. With execution_delta=timedelta(minutes=0), the sensor looks for a run of the external DAG at exactly the same execution date, so the start_date and schedule_interval must match across both DAGs. I set the start_date for both DAGs to the same time, using the current date. After the first run of the upstream DAG finished, the new DAG began to work!
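A minimal sketch of that alignment (the DAG id book_data, the sensor arguments, and the hourly schedule are from the question; the downstream DAG id is hypothetical):

from datetime import datetime, timedelta
from airflow import DAG
from airflow.sensors.external_task_sensor import ExternalTaskSensor

# Both DAGs share start_date and schedule_interval, so their runs line up.
common = dict(start_date=datetime(2019, 11, 20), schedule_interval=timedelta(hours=1))

upstream = DAG('book_data', **common)
downstream = DAG('windows_processing', **common)  # hypothetical dag_id

# execution_delta=timedelta(minutes=0) means the sensor waits for the
# upstream run whose execution date is identical to its own.
external_sensor = ExternalTaskSensor(task_id='ext_sensor_task',
                                     execution_delta=timedelta(minutes=0),
                                     external_dag_id='book_data',
                                     external_task_id='Dataframe_Windows_test',
                                     dag=downstream)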
I have a simple Airflow workflow composed of two tasks. One downloads a CSV file containing stock data. The other extracts the maximum stock price and writes the data to another file.
If I run the first task and then the second, everything works fine, but if I execute airflow run stocks_d get_max_share it fails to meet the dependency.
import csv
from datetime import datetime
from datetime import timedelta
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
import requests
def get_stock_data():
    url = "https://app.quotemedia.com/quotetools/getHistoryDownload.csv?&webmasterId=501&startDay=02&startMonth=02&startYear=2002&endDay=02&endMonth=07&endYear=2009&isRanged=false&symbol=APL"
    try:
        r = requests.get(url)
    except requests.RequestException as re:
        raise
    else:
        with open('/tmp/stocks/airflow_stock_data.txt', 'w') as f:
            f.write(r.text)
def get_max_share():
    stock_data = []
    stock_max = {}
    with open('/tmp/stocks/airflow_stock_data.txt', 'r') as f:
        stock_reader = csv.reader(f)
        next(stock_reader, None)
        for row in stock_reader:
            stock_data.append(row)
    for stock in stock_data:
        stock_max[stock[2]] = stock[0]
    with open('/tmp/stocks/max_stock', 'w') as f:
        stock_price = max(stock_max.keys())
        stock_max_price_date = stock_max[stock_price]
        stock_entry = stock_max_price_date + ' -> ' + stock_price
        f.write(stock_entry)
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2017, 5, 30),
    'email': ['mainl@domain.io'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'catchup': False,
}
dag = DAG('stocks_d', default_args=default_args, schedule_interval=timedelta(minutes=5))
task_get_stocks = PythonOperator(task_id='get_stocks', python_callable=get_stock_data, dag=dag)
task_get_max_share = PythonOperator(task_id='get_max_share', python_callable=get_max_share, dag=dag)
task_get_max_share.set_upstream(task_get_stocks)
Any ideas why that happens?
$ airflow run stocks_d get_max_share
The above command only runs the get_max_share task, not the task upstream of it. If you need to run the whole DAG, try the command below:
$ airflow trigger_dag stocks_d
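If you just want to exercise a single task in isolation while developing, airflow test (in Airflow 1.x) runs it without checking dependencies or recording state; it takes an execution date as the last argument:

$ airflow test stocks_d get_max_share 2017-05-30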