Trying to create an OpenTelemetry trace using Airflow

I have been trying to create a trace using OpenTelemetry, but the endpoint doesn't return any results.
endpoint="http://localhost:55680"
Any help appreciated.
Also, has anyone tried to install this package in Airflow?
from opentelemetry.exporter.cloud_trace
(This package helps create a trace in Google Cloud; I am trying to run that through Airflow.)
It throws a version error.
import airflow
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python_operator import PythonOperator
from datetime import timedelta
from airflow.utils.dates import days_ago
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(2),
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
    # 'wait_for_downstream': False,
    # 'dag': dag,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'sla_miss_callback': yet_another_function,
    # 'trigger_rule': 'all_success'
}

dag = DAG(
    'create_a_trace',
    default_args=default_args,
    description='open telemetry trace creation',
    schedule_interval=timedelta(days=1),
)

def execute():
    resource = Resource(attributes={
        "service.name": "test_airflow_worker"
    })
    trace.set_tracer_provider(TracerProvider(resource=resource))
    tracer = trace.get_tracer(__name__)
    otlp_exporter = OTLPSpanExporter(endpoint="http://localhost:55680", insecure=True)
    span_processor = BatchSpanProcessor(otlp_exporter)
    trace.get_tracer_provider().add_span_processor(span_processor)
    with tracer.start_as_current_span("test_task_span"):
        print("Hello Airflow!")

run_this = PythonOperator(
    task_id='print_the_context',
    provide_context=True,
    python_callable=execute,
    dag=dag,
)

run_this
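
One thing worth checking here (my own note, not from the original post): port 55680 was the legacy default OTLP/gRPC port of the OpenTelemetry Collector, while current Collector releases listen on 4317 by default, and with BatchSpanProcessor a short-lived Airflow task process can exit before the batch is exported. A minimal sketch, assuming a local Collector with the default OTLP receiver (the endpoint and service name are placeholders):

from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

def execute():
    # Assumption: a local Collector with the default OTLP/gRPC receiver on 4317
    # (older Collector builds used 55680).
    provider = TracerProvider(resource=Resource(attributes={"service.name": "test_airflow_worker"}))
    provider.add_span_processor(
        BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317", insecure=True))
    )
    trace.set_tracer_provider(provider)
    tracer = trace.get_tracer(__name__)
    with tracer.start_as_current_span("test_task_span"):
        print("Hello Airflow!")
    # Flush before the task process exits, otherwise buffered spans are lost.
    provider.force_flush()

As for the Google Cloud Trace exporter: the opentelemetry.exporter.cloud_trace module ships, as far as I know, in the opentelemetry-exporter-gcp-trace package, and version errors there usually mean its opentelemetry-api/sdk pins clash with what is already installed in the Airflow environment.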

Related

Different way to set Airflow DAG fields

I would like to create an Airflow DAG and want to learn which parameters should be set in field_1 vs default_args vs args?
my_dag = DAG(
    "my_dag",
    field_1="xxx",
    default_args=default_args,
    **args
)
I checked the documentation, and I understand that some parameters, such as "owner", have to be set through default_args and can't be in field_1. But it looks like there's no difference for most of the parameters. I tested some fields such as "catchup" and "on_failure_callback", and they all work in all three places.
So I wonder: what is the best practice for setting parameters when creating a DAG?
The best practice is something like the Airflow tutorial:
from datetime import datetime, timedelta

from airflow import DAG

with DAG(
    'tutorial',
    # These args will get passed on to each operator
    # You can override them on a per-task basis during operator initialization
    default_args={
        'depends_on_past': False,
        'email': ['airflow@example.com'],
        'email_on_failure': False,
        'email_on_retry': False,
        'retries': 1,
        'retry_delay': timedelta(minutes=5),
        # 'queue': 'bash_queue',
        # 'pool': 'backfill',
        # 'priority_weight': 10,
        # 'end_date': datetime(2016, 1, 1),
        # 'wait_for_downstream': False,
        # 'sla': timedelta(hours=2),
        # 'execution_timeout': timedelta(seconds=300),
        # 'on_failure_callback': some_function,
        # 'on_success_callback': some_other_function,
        # 'on_retry_callback': another_function,
        # 'sla_miss_callback': yet_another_function,
        # 'trigger_rule': 'all_success'
    },
    description='A simple tutorial DAG',
    schedule_interval=timedelta(days=1),
    start_date=datetime(2021, 1, 1),
    catchup=False,
    tags=['example'],
) as dag:
    ...
Reference: https://airflow.apache.org/docs/apache-airflow/stable/tutorial.html#example-pipeline-definition
But I use something like this, and it is enough:
from datetime import timedelta

import pendulum

from airflow import DAG

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG(
    default_args=default_args,
    dag_id='dag_etl',
    catchup=False,
    start_date=pendulum.datetime(year=2022, month=1, day=1, tz='America/Chicago'),
    schedule_interval='0 8 * * *',  # https://crontab.guru/#0_8_*_*_*
    description='DAG Extract Transform Load'
) as dag:
    ...
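
To make the split concrete, here is my own small illustration (not from the answers above): entries in default_args become per-task defaults that any operator can override, while arguments passed to the DAG constructor configure the DAG itself.

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

with DAG(
    'defaults_demo',                      # DAG-level args configure the DAG itself
    start_date=datetime(2021, 1, 1),
    schedule_interval='@daily',
    catchup=False,
    default_args={'retries': 1, 'retry_delay': timedelta(minutes=5)},  # per-task defaults
) as dag:
    uses_defaults = BashOperator(task_id='uses_defaults', bash_command='echo hi')
    overrides = BashOperator(task_id='overrides', bash_command='echo hi', retries=3)  # overrides the default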

Airflow S3ToRedshiftTransfer

I have a requirement to copy S3 files to Redshift using the COPY command. I am a bit new to Airflow and am having issues. Can someone correct the code below? Can I call rs.execute() like that?
Error:
op.execute()
TypeError: execute() missing 1 required positional argument: 'context'
code:
import os
from airflow import DAG
from airflow.hooks.S3_hook import S3Hook
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer

default_args = {
    'owner': 'gra',
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 13),
    'email': ['ss.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'schedule_interval': '@daily',
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
}

def job1():
    print('First Job to start')

def s3_redshift(**kwargs):
    rs = S3ToRedshiftTransfer(redshift_conn_id='12as',
                              aws_conn_id='gt_read',
                              schema='test',
                              table='dept',
                              s3_bucket="gng-test",
                              s3_key="copt.csv",
                              task_id="copy_redshift"
                              #copy_options=copy_options_,
                              )
    rs.execute()

copy_redshift = PythonOperator(task_id='copy_redshift', python_callable=s3_redshift, provide_context=True, dag=dag)

app_start >> copy_redshift
I was able to use boto3 to execute the copy from S3 to Redshift. S3ToRedshiftTransfer can be used to do the same.
# airflow related
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
# other packages
from datetime import datetime
from datetime import timedelta
# from airflow.hooks import PostgresHook
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer
#from airflow.providers.amazon.aws.transfers.s3_to_redshift import S3ToRedshiftOperator
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from airflow.operators import SimpleHttpOperator, HttpSensor, BashOperator, EmailOperator, S3KeySensor
import boto3

default_args = {
    'owner': 'grit_delta',
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 13),
    'email': ['sa.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'schedule_interval': '@daily',
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
}

dag = DAG(dag_id='veritas_test', default_args=default_args, schedule_interval=timedelta(1))

def job1():
    print('First Job to start')

file_sensor = S3KeySensor(task_id='s3_key_sensor_task',
                          s3_conn_id='_read',
                          poke_interval=120,
                          timeout=18 * 60 * 60,
                          bucket_key="data/test.*",
                          bucket_name="g-test",
                          wildcard_match=True,
                          dag=dag
                          )

app_start = PythonOperator(task_id='app_start', python_callable=job1, dag=dag)

def s3_redshift(**kwargs):
    rsd = boto3.client('redshift-data')
    deptKey = 's3://airflow-dev/code/gta/dag/dept.csv'
    sqlQuery = "copy test.dept from 's3://airflow-grole' CSV ;"
    #sqlQuery="insert into test.dept values('d1221',100)"
    print(sqlQuery)
    resp = rsd.execute_statement(
        ClusterIdentifier="opes",
        Database="ee",
        DbUser="aa",
        Sql=sqlQuery
        #Sql="CREATE TABLE IF NOT EXISTS test.dept (title varchar(10), rating int);"
    )
    print(resp)
    print(" completed")
    return "OK"

copy_redshift = PythonOperator(task_id='copy_redshift', python_callable=s3_redshift, provide_context=True, dag=dag)

file_sensor >> app_start >> copy_redshift
You have not defined any DAG, and you don't use operators like that. I would recommend reading a bit more about how to use Airflow. Anyway, the code should be:
import os
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer

default_args = {
    'owner': 'gra',
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 13),
    'email': ['ss.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
}

with DAG('dag_name', schedule_interval='@daily', default_args=default_args) as dag:
    rs = S3ToRedshiftTransfer(redshift_conn_id='12as',
                              aws_conn_id='gt_read',
                              schema='test',
                              table='dept',
                              s3_bucket="gng-test",
                              s3_key="copt.csv",
                              task_id="copy_redshift"
                              )
    # app_start is the upstream PythonOperator from the question
    app_start >> rs
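
A side note on the original TypeError (my reading, not stated in the answers): BaseOperator.execute(self, context) expects the task-instance context that Airflow builds at runtime, which is why calling rs.execute() with no arguments fails; letting the operator run as its own task, as above, avoids that entirely. On newer Airflow versions the same transfer lives in the Amazon provider as S3ToRedshiftOperator, roughly like this (a sketch; the connection ids, bucket, and table are the placeholders from the question):

# Requires the apache-airflow-providers-amazon package.
from airflow.providers.amazon.aws.transfers.s3_to_redshift import S3ToRedshiftOperator

copy_redshift = S3ToRedshiftOperator(
    task_id='copy_redshift',
    redshift_conn_id='12as',
    aws_conn_id='gt_read',
    schema='test',
    table='dept',
    s3_bucket='gng-test',
    s3_key='copt.csv',
    copy_options=['CSV'],
    dag=dag,
)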

How to avoid a task run when it is already running

I have an Airflow task which is scheduled to run every 3 minutes.
Sometimes the task takes longer than 3 minutes, and the next scheduled run starts (or is queued) even though the previous one is still running.
Is there a way to define the DAG so that the task is NOT even queued if it is already running?
# airflow related
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators import MsSqlOperator
# other packages
from datetime import datetime
from datetime import timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 7, 22, 15, 0, 0),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=5)
}

dag = DAG(
    dag_id='sales',
    description='Run sales',
    schedule_interval='*/3 4,5,6,7,8,9,10,11,12,13,14,15,16,17 * * 0-5',
    default_args=default_args,
    catchup=False)

job1 = BashOperator(
    task_id='sales',
    bash_command='python2 /home/manager/ETL/sales.py',
    dag=dag)

job2 = MsSqlOperator(
    task_id='refresh_tabular',
    mssql_conn_id='mssql_globrands',
    sql="USE msdb ; EXEC dbo.sp_start_job N'refresh Management-sales' ; ",
    dag=dag)

job1 >> job2
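
A common way to handle this (my suggestion, not part of the original post) is to cap concurrency at the DAG level with max_active_runs=1, which serializes runs so a new run waits instead of executing alongside a still-running one; combined with catchup=False it also avoids piling up backfill runs. Sketch against the DAG above:

dag = DAG(
    dag_id='sales',
    description='Run sales',
    schedule_interval='*/3 4,5,6,7,8,9,10,11,12,13,14,15,16,17 * * 0-5',
    default_args=default_args,
    catchup=False,
    max_active_runs=1)  # at most one active DAG run at a time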

Create multiple tasks in Airflow using a loop

I want to create a task which will update column rows and send mail for every line in a data table. At the moment I create a task which downloads the data from the main table. I cannot create tasks for every line in the temp data table. Could you tell me what I am doing wrong and how I can generate and run tasks in a loop?
from datetime import datetime, timedelta
import airflow
from airflow import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_get_data import BigQueryGetDataOperator
from airflow.contrib.operators.bigquery_check_operator import BigQueryValueCheckOperator
from airflow.operators import PythonOperator
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'cmap',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

with DAG('dq_bigquery_test',
         max_active_runs=1,
         schedule_interval='@once',
         catchup=False,
         default_args=default_args) as dag:

    query = "SELECT * from `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` where MailRequired = false"
    insert = "INSERT into dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc (DataTimeStamp, Robot, Status) Values (CURRENT_TIMESTAMP(), 'TestRobot', 'Test')"

    my_bq_task = BigQueryOperator(
        task_id='query_exc_on_teste',
        sql=query,
        write_disposition='WRITE_TRUNCATE',
        create_disposition='CREATE_IF_NEEDED',
        bigquery_conn_id='google_cloud_dbce_bi_prod',
        use_legacy_sql=False,
        destination_dataset_table='dev_dataquality.testTable')

    get_data = BigQueryGetDataOperator(
        task_id='get_data_from_query',
        project_id='dbce-bi-prod-e6fd',
        dataset_id='dev_dataquality',
        table_id='testTable',
        max_results='100',
        selected_fields='Robot,Status,MailRequired',
        bigquery_conn_id='google_cloud_dbce_bi_prod'
    )

    def process_data_from_bq(**kwargs):
        ti = kwargs['ti']
        update_column = []
        bq_data = ti.xcom_pull(task_ids='get_data_from_query')
        print(bq_data)
        # Now bq_data here would have your data in Python list
        for index, i in enumerate(bq_data):
            update_query = "UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` SET MailSent = True WHERE Robot = '{}'".format(i[0])
            print(update_query)
            update_column.append(BigQueryOperator(
                task_id='update_column_{}'.format(index),
                sql=update_query,
                write_disposition='WRITE_EMPTY',
                create_disposition='CREATE_IF_NEEDED',
                bigquery_conn_id='google_cloud_dbce_bi_prod',
                use_legacy_sql=False,
                dag=dag
            ))
            if index not in [0]:
                update_column[index-1] >> update_column[index]

    process_data = PythonOperator(
        task_id='process_data_from_bq',
        python_callable=process_data_from_bq,
        provide_context=True
    )

    my_bq_task >> get_data >> process_data
Thank you for your help!
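
The core issue (my reading of the code above): operators built inside process_data_from_bq only exist while that task runs, so the scheduler never registers them as tasks; tasks have to be created when the DAG file is parsed. A sketch reusing the question's names, assuming the list of rows is known (or fetchable) at parse time:

# Placeholder list; in the real DAG this would have to be available at parse time,
# e.g. loaded from a config file or a lightweight query.
robots = ['TestRobot', 'OtherRobot']

previous = get_data
for index, robot in enumerate(robots):
    update = BigQueryOperator(
        task_id='update_column_{}'.format(index),
        sql="UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` "
            "SET MailSent = True WHERE Robot = '{}'".format(robot),
        write_disposition='WRITE_EMPTY',
        create_disposition='CREATE_IF_NEEDED',
        bigquery_conn_id='google_cloud_dbce_bi_prod',
        use_legacy_sql=False,
        dag=dag,
    )
    previous >> update
    previous = update

If the rows are only known at runtime, this pattern does not apply directly; something like dynamic task mapping (Airflow 2.3+) or a single task that loops over the rows itself would be needed.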

Airflow: Broken DAG: [/root/airflow/dags/dag_test1.py] Argument ['execution_time'] is required

import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
#from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from airflow.models import Variable
from datetime import datetime, timedelta
from epsilon_spark_operator import EpsilonSparkOperator
#from merlin_spark_submit_operator import MerlinSparkSubmitOperator

#start=timedelta(hours=3)
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(2),
    # 'end_date': datetime(2019, 12, 12),
    # 'schedule_interval': '37 12 * * *'
    'email': ['ankit.maloo@inmobi.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
    'queue': 'default'
}

dag = DAG('epsilon_spark_test', default_args=default_args, schedule_interval='@once')

spark_submit_task = EpsilonSparkOperator(
    task_id='spark_submit_job',
    conn_id='epsilon_spark',
    application='abfs://***********.jar',
    java_class='com.inmobi.EpsilonTest',
    application_args=['10'],
    verbose=True,
    cluster_name="*****",
    azure_storage_conn_id="*****",
    keyvault_name='*****',
    keyvault_client_id_key='*****',
    keyvault_client_secret_key='*****',
    conf={'spark.executors': '30', 'spark.eventLog.enabled': 'false', 'spark.eventLog.dir': '/tmp', 'spark.shuffle.service.enabled': 'true'},
    dag=dag)

dag >> spark_submit_task
I am trying to test the Airflow cluster with a sample Spark job.
Above is my Python code.
When trying to deploy the DAG via curl
curl -X POST -H 'Content-Type: multipart/form-data' -F 'dag_file=@/Users/ankit.maloo/dag_test.py' -F 'force=on' 'http://*.*.*.*:8080/admin/rest_api/api?api=deploy_dag'
it gives the error
Broken DAG: [/root/airflow/dags/dag_test1.py] Argument ['execution_time'] is required.
Any idea?
