I have a requirement to copy S3 files to Redshift using the COPY command. I am a bit new to Airflow and having issues. Can someone correct the code below? Can I call rs.execute() like this?
Error:
op.execute()
TypeError: execute() missing 1 required positional argument: 'context'
code:
import os
from datetime import datetime, timedelta
from airflow import DAG
from airflow.hooks.S3_hook import S3Hook
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer
default_args = {
    'owner': 'gra',
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 13),
    'email': ['ss.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'schedule_interval': '@daily',
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
}
def job1():
    print('First Job to start')

def s3_redshift(**kwargs):
    rs = S3ToRedshiftTransfer(redshift_conn_id='12as',
                              aws_conn_id='gt_read',
                              schema='test',
                              table='dept',
                              s3_bucket="gng-test",
                              s3_key="copt.csv",
                              task_id="copy_redshift"
                              # copy_options=copy_options_,
                              )
    rs.execute()

copy_redshift = PythonOperator(task_id='copy_redshift', python_callable=s3_redshift, provide_context=True, dag=dag)
app_start >> copy_redshift
I was able to use boto3 to execute the copy from S3 to Redshift. S3ToRedshiftTransfer can be used to do the same.
# airflow related
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
# other packages
from datetime import datetime
from datetime import timedelta
# from airflow.hooks import PostgresHook
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer
#from airflow.providers.amazon.aws.transfers.s3_to_redshift import S3ToRedshiftOperator
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from airflow.operators import SimpleHttpOperator, HttpSensor, BashOperator, EmailOperator, S3KeySensor
import boto3
default_args = {
    'owner': 'grit_delta',
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 13),
    'email': ['sa.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'schedule_interval': '@daily',
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
}
dag = DAG(dag_id='veritas_test', default_args=default_args, schedule_interval=timedelta(1))

def job1():
    print('First Job to start')

file_sensor = S3KeySensor(task_id='s3_key_sensor_task',
                          s3_conn_id='_read',
                          poke_interval=120,
                          timeout=18*60*60,
                          bucket_key="data/test.*",
                          bucket_name="g-test",
                          wildcard_match=True,
                          dag=dag
                          )

app_start = PythonOperator(task_id='app_start', python_callable=job1, dag=dag)

def s3_redshift(**kwargs):
    rsd = boto3.client('redshift-data')
    deptKey = 's3://airflow-dev/code/gta/dag/dept.csv'
    sqlQuery = "copy test.dept from 's3://airflow-grole' CSV ;"
    # sqlQuery = "insert into test.dept values('d1221',100)"
    print(sqlQuery)
    resp = rsd.execute_statement(
        ClusterIdentifier="opes",
        Database="ee",
        DbUser="aa",
        Sql=sqlQuery
        # Sql="CREATE TABLE IF NOT EXISTS test.dept (title varchar(10), rating int);"
    )
    print(resp)
    print(" completed")
    return "OK"

copy_redshift = PythonOperator(task_id='copy_redshift', python_callable=s3_redshift, provide_context=True, dag=dag)
file_sensor >> app_start >> copy_redshift
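Note that the Redshift Data API's execute_statement call is asynchronous, so the task above can return "OK" before the COPY has actually finished. Here is a rough sketch of polling the statement until it completes, using the Id returned by execute_statement; the poll interval and error handling are assumptions, not part of the original post:

import time
import boto3

def wait_for_statement(statement_id, poll_seconds=10):
    # Poll the Redshift Data API until the submitted statement finishes.
    rsd = boto3.client('redshift-data')
    while True:
        desc = rsd.describe_statement(Id=statement_id)
        status = desc['Status']
        if status == 'FINISHED':
            return desc
        if status in ('FAILED', 'ABORTED'):
            raise RuntimeError("COPY failed: {}".format(desc.get('Error')))
        time.sleep(poll_seconds)

# inside s3_redshift, after resp = rsd.execute_statement(...):
# wait_for_statement(resp['Id'])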
You have not defined any DAG, and you don't use operators like that. I would recommend reading a little more about how to use Airflow. Anyway, the code should be:
import os
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer
default_args = {
    'owner': 'gra',
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 13),
    'email': ['ss.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
}
with DAG('dag_name', schedule_interval='@daily', default_args=default_args) as dag:
    rs = S3ToRedshiftTransfer(redshift_conn_id='12as',
                              aws_conn_id='gt_read',
                              schema='test',
                              table='dept',
                              s3_bucket="gng-test",
                              s3_key="copt.csv",
                              task_id="copy_redshift"
                              )
    # app_start is the PythonOperator from the question, defined inside this DAG
    app_start >> rs
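As for the original TypeError: operator.execute() expects the task context as an argument, and Airflow supplies it automatically when the operator runs as its own task, which is why the approach above is preferred. If you really do want to call it from inside a PythonOperator, you would have to pass a context yourself; a rough (and not recommended) sketch, assuming the same imports as the question:

def s3_redshift(**kwargs):
    rs = S3ToRedshiftTransfer(redshift_conn_id='12as',
                              aws_conn_id='gt_read',
                              schema='test',
                              table='dept',
                              s3_bucket="gng-test",
                              s3_key="copt.csv",
                              task_id="copy_redshift")
    # with provide_context=True the PythonOperator passes the task context
    # in as **kwargs, so it can be forwarded to execute()
    rs.execute(context=kwargs)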
I have been trying to create a trace using OpenTelemetry, but the endpoint doesn't give out any results.
endpoint="http://localhost:55680"
Any help appreciated.
Also, did anyone try to install this package in Airflow?
from opentelemetry.exporter.cloud_trace
(This package helps to create a trace in Google Cloud; I am trying to run that through Airflow.)
It throws a version error.
import airflow
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python_operator import PythonOperator
from datetime import timedelta
from airflow.utils.dates import days_ago
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(2),
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
    # 'wait_for_downstream': False,
    # 'dag': dag,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'sla_miss_callback': yet_another_function,
    # 'trigger_rule': 'all_success'
}
dag = DAG(
    'create_a_trace',
    default_args=default_args,
    description='open telemetry trace creation',
    schedule_interval=timedelta(days=1),
)

def execute():
    resource = Resource(attributes={
        "service.name": "test_airflow_worker"
    })
    trace.set_tracer_provider(TracerProvider(resource=resource))
    tracer = trace.get_tracer(__name__)
    otlp_exporter = OTLPSpanExporter(endpoint="http://localhost:55680", insecure=True)
    span_processor = BatchSpanProcessor(otlp_exporter)
    trace.get_tracer_provider().add_span_processor(span_processor)
    with tracer.start_as_current_span("test_task_span"):
        print("Hello Airflow!")

run_this = PythonOperator(
    task_id='print_the_context',
    provide_context=True,
    python_callable=execute,
    dag=dag,
)

run_this
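One thing that can explain getting no results at the endpoint, assuming the collector at localhost:55680 is otherwise reachable from the worker: BatchSpanProcessor exports spans in the background, and a short-lived task process can exit before the batch is flushed. A sketch of the same callable with an explicit flush at the end, using the same imports as above:

def execute(**kwargs):
    resource = Resource(attributes={"service.name": "test_airflow_worker"})
    provider = TracerProvider(resource=resource)
    provider.add_span_processor(
        BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:55680", insecure=True)))
    trace.set_tracer_provider(provider)
    tracer = trace.get_tracer(__name__)
    with tracer.start_as_current_span("test_task_span"):
        print("Hello Airflow!")
    # flush queued spans before the task process exits, otherwise
    # the batch processor may be killed before exporting anything
    provider.force_flush()
    provider.shutdown()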
I have an Airflow task which is scheduled to run every 3 minutes.
Sometimes the task takes longer than 3 minutes, and the next scheduled run starts (or is queued) even though the previous one is still running.
Is there a way to define the DAG so that it does NOT even queue the task if it is already running?
# airflow related
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators import MsSqlOperator
# other packages
from datetime import datetime
from datetime import timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 7, 22, 15, 00, 00),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=5)
}

dag = DAG(
    dag_id='sales',
    description='Run sales',
    schedule_interval='*/3 4,5,6,7,8,9,10,11,12,13,14,15,16,17 * * 0-5',
    default_args=default_args,
    catchup=False)

job1 = BashOperator(
    task_id='sales',
    bash_command='python2 /home/manager/ETL/sales.py',
    dag=dag)

job2 = MsSqlOperator(
    task_id='refresh_tabular',
    mssql_conn_id='mssql_globrands',
    sql="USE msdb ; EXEC dbo.sp_start_job N'refresh Management-sales' ; ",
    dag=dag)

job1 >> job2
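One way to get that behaviour, as a sketch rather than a verified fix for this exact setup, is to cap concurrent runs of the DAG with max_active_runs=1, so a new interval does not start executing while the previous run is still going; keeping catchup=False avoids piling up missed intervals as well:

dag = DAG(
    dag_id='sales',
    description='Run sales',
    schedule_interval='*/3 4,5,6,7,8,9,10,11,12,13,14,15,16,17 * * 0-5',
    default_args=default_args,
    catchup=False,
    # at most one active DAG run at a time; the next scheduled run
    # will not start while the previous one is still executing
    max_active_runs=1)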
I want to create a task that will update column rows and send mail for every line in a data table. At the moment I create a task which downloads the data from the main table, but I cannot create tasks for every line in the temp data table. Could you tell me what I am doing wrong and how I can generate and run tasks in a loop?
from datetime import datetime, timedelta
import airflow
from airflow import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_get_data import BigQueryGetDataOperator
from airflow.contrib.operators.bigquery_check_operator import BigQueryValueCheckOperator
from airflow.operators import PythonOperator
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'cmap',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

with DAG('dq_bigquery_test',
         max_active_runs=1,
         schedule_interval='@once',
         catchup=False,
         default_args=default_args) as dag:

    query = "SELECT * from `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` where MailRequired = false"
    insert = "INSERT into dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc (DataTimeStamp, Robot, Status) Values (CURRENT_TIMESTAMP(), 'TestRobot', 'Test')"

    my_bq_task = BigQueryOperator(
        task_id='query_exc_on_teste',
        sql=query,
        write_disposition='WRITE_TRUNCATE',
        create_disposition='CREATE_IF_NEEDED',
        bigquery_conn_id='google_cloud_dbce_bi_prod',
        use_legacy_sql=False,
        destination_dataset_table='dev_dataquality.testTable')

    get_data = BigQueryGetDataOperator(
        task_id='get_data_from_query',
        project_id='dbce-bi-prod-e6fd',
        dataset_id='dev_dataquality',
        table_id='testTable',
        max_results='100',
        selected_fields='Robot,Status,MailRequired',
        bigquery_conn_id='google_cloud_dbce_bi_prod'
        )

    def process_data_from_bq(**kwargs):
        ti = kwargs['ti']
        update_column = []
        bq_data = ti.xcom_pull(task_ids='get_data_from_query')
        print(bq_data)
        # Now bq_data here would have your data in Python list
        for index, i in enumerate(bq_data):
            update_query = "UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` SET MailSent = True WHERE Robot = '{}'".format(i[0])
            print(update_query)
            update_column.append(BigQueryOperator(
                task_id='update_column_{}'.format(index),
                sql=update_query,
                write_disposition='WRITE_EMPTY',
                create_disposition='CREATE_IF_NEEDED',
                bigquery_conn_id='google_cloud_dbce_bi_prod',
                use_legacy_sql=False,
                dag=dag
                ))
            if index not in [0]:
                update_column[index-1] >> update_column[index]

    process_data = PythonOperator(
        task_id='process_data_from_bq',
        python_callable=process_data_from_bq,
        provide_context=True
        )

    my_bq_task >> get_data >> process_data
Thank you for your help!
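The core problem is that operators instantiated inside a running PythonOperator callable are never registered with the DAG: the DAG's structure is fixed when the file is parsed, not at execution time. One workaround is to keep a single task and run the per-row UPDATE statements through a hook inside the callable; a rough sketch, assuming the contrib BigQueryHook and the connection id from the post:

from airflow.contrib.hooks.bigquery_hook import BigQueryHook

def process_data_from_bq(**kwargs):
    ti = kwargs['ti']
    bq_data = ti.xcom_pull(task_ids='get_data_from_query')
    hook = BigQueryHook(bigquery_conn_id='google_cloud_dbce_bi_prod',
                        use_legacy_sql=False)
    cursor = hook.get_conn().cursor()
    for row in bq_data:
        # row layout follows selected_fields='Robot,Status,MailRequired'
        update_query = ("UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` "
                        "SET MailSent = True WHERE Robot = '{}'".format(row[0]))
        cursor.execute(update_query)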
import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
# from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from airflow.models import Variable
from datetime import datetime, timedelta
from epsilon_spark_operator import EpsilonSparkOperator
# from merlin_spark_submit_operator import MerlinSparkSubmitOperator

# start=timedelta(hours=3)
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(2),
    # 'end_date': datetime(2019, 12, 12),
    # 'schedule_interval': '37 12 * * *'
    'email': ['ankit.maloo@inmobi.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
    'queue': 'default'
}

dag = DAG('epsilon_spark_test', default_args=default_args, schedule_interval='@once')

spark_submit_task = EpsilonSparkOperator(
    task_id='spark_submit_job',
    conn_id='epsilon_spark',
    application='abfs://***********.jar',
    java_class='com.inmobi.EpsilonTest',
    application_args=['10'],
    verbose=True,
    cluster_name="*****",
    azure_storage_conn_id="*****",
    keyvault_name='*****',
    keyvault_client_id_key='*****',
    keyvault_client_secret_key='*****',
    conf={'spark.executors': '30', 'spark.eventLog.enabled': 'false', 'spark.eventLog.dir': '/tmp', 'spark.shuffle.service.enabled': 'true'},
    dag=dag)

dag >> spark_submit_task
I am trying to test an Airflow cluster with a sample Spark job.
Above is my Python code.
When trying to deploy the DAG via curl:
curl -X POST -H 'Content-Type: multipart/form-data' -F 'dag_file=@/Users/ankit.maloo/dag_test.py' -F 'force=on' 'http://*.*.*.*:8080/admin/rest_api/api?api=deploy_dag'
it gives the error:
Broken DAG: [/root/airflow/dags/dag_test1.py] Argument ['execution_time'] is required.
Any idea?
I have a simple Airflow workflow composed of two tasks. One downloads a CSV file containing stock data; the other extracts the maximum stock price and writes the data to another file.
If I run the first task and then the second, everything works fine; instead, if I execute airflow run stocks_d get_max_share, it fails to meet the dependency.
import csv
from datetime import datetime
from datetime import timedelta
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
import requests

def get_stock_data():
    url = "https://app.quotemedia.com/quotetools/getHistoryDownload.csv?&webmasterId=501&startDay=02&startMonth=02&startYear=2002&endDay=02&endMonth=07&endYear=2009&isRanged=false&symbol=APL"
    try:
        r = requests.get(url)
    except requests.RequestException as re:
        raise
    else:
        with open('/tmp/stocks/airflow_stock_data.txt', 'w') as f:
            f.write(r.text)

def get_max_share():
    stock_data = []
    stock_max = {}
    with open('/tmp/stocks/airflow_stock_data.txt', 'r') as f:
        stock_reader = csv.reader(f)
        next(stock_reader, None)
        for row in stock_reader:
            stock_data.append(row)
    for stock in stock_data:
        stock_max[stock[2]] = stock[0]
    with open('/tmp/stocks/max_stock', 'w') as f:
        stock_price = max(stock_max.keys())
        stock_max_price_date = stock_max[stock_price]
        stock_entry = stock_max_price_date + ' -> ' + stock_price
        f.write(stock_entry)

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2017, 5, 30),
    'email': ['mainl@domain.io'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'catchup': False,
}

dag = DAG('stocks_d', default_args=default_args, schedule_interval=timedelta(minutes=5))

task_get_stocks = PythonOperator(task_id='get_stocks', python_callable=get_stock_data, dag=dag)
task_get_max_share = PythonOperator(task_id='get_max_share', python_callable=get_max_share, dag=dag)

task_get_max_share.set_upstream(task_get_stocks)
Any ideas why that happens?
$ airflow run stocks_d get_max_share
The above command only runs the get_max_share task; it does not run the upstream task first.
If you need to run the whole DAG, try the command below:
$ airflow trigger_dag stocks_d
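If the goal is to run both tasks in order for a specific past interval from the 1.x CLI used here, airflow backfill runs the whole DAG, dependencies included, for a date range; the dates below are just an example:
$ airflow backfill -s 2017-05-30 -e 2017-05-30 stocks_d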