The conn_id environment variable isn't defined - airflow

I have a DAG that tries to connect to an HTTP endpoint, but somehow it doesn't work. I defined the connection using an environment variable, yet it is not seen by the HttpSensor and I get the error below, even though the variable was created in the system. What's wrong here? The DAG and the full error are below.
The conn_id `AIRFLOW_VAR_FOO` isn't defined
DAG:
import os
import json
import pprint
import datetime
import requests
from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.sftp.operators.sftp import SFTPOperator
from airflow.providers.sftp.sensors.sftp import SFTPSensor
from airflow.utils.dates import days_ago
from airflow.models import Variable
from airflow.sensors.http_sensor import HttpSensor
from airflow.hooks.base_hook import BaseHook


def init_vars():
    os.environ['AIRFLOW_VAR_FOO'] = "https://mywebxxx.net/"
    print(os.environ['AIRFLOW_VAR_FOO'])


with DAG(
        dag_id='request_test',
        schedule_interval=None,
        start_date=days_ago(2)) as dag:

    init_vars = PythonOperator(task_id="init_vars",
                               python_callable=init_vars)

    task_is_api_active = HttpSensor(
        task_id='is_api_active',
        http_conn_id='AIRFLOW_VAR_FOO',
        endpoint='post'
    )

    get_data = PythonOperator(task_id="get_data",
                              python_callable=get_data)

    init_vars >> task_is_api_active
Full Error:
File "/home/airflow/.local/lib/python3.7/site-packages/airflow/models/connection.py", line 379, in get_connection_from_secrets
raise AirflowNotFoundException(f"The conn_id `{conn_id}` isn't defined")
airflow.exceptions.AirflowNotFoundException: The conn_id `AIRFLOW_VAR_FOO` isn't defined
[2022-11-04 10:32:41,720] {taskinstance.py:1551} INFO - Marking task as FAILED. dag_id=request_test, task_id=is_api_active, execution_date=20221104T103235, start_date=20221104T103240, end_date=20221104T103241
[2022-11-04 10:32:42,628] {local_task_job.py:149} INFO - Task exited with return code 1
EDIT:
import os
import json
import pprint
import datetime
import requests
from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.sftp.operators.sftp import SFTPOperator
from airflow.providers.sftp.sensors.sftp import SFTPSensor
from airflow.utils.dates import days_ago
from airflow.models import Variable
from airflow.sensors.http_sensor import HttpSensor
from airflow.hooks.base_hook import BaseHook


def init_vars():
    os.environ['AIRFLOW_VAR_FOO'] = "https://mywebxxx.net/"
    print(os.environ['AIRFLOW_VAR_FOO'])


with DAG(
        dag_id='request_test',
        schedule_interval=None,
        start_date=days_ago(2)) as dag:

    init_vars = PythonOperator(task_id="init_vars",
                               python_callable=init_vars)

    init_vars()

    task_is_api_active = HttpSensor(
        task_id='is_api_active',
        http_conn_id='AIRFLOW_VAR_FOO',
        endpoint='post'
    )

    get_data = PythonOperator(task_id="get_data",
                              python_callable=get_data)

    task_is_api_active

You need to define the environment variable as AIRFLOW_CONN_VAR_FOO and then use http_conn_id="var_foo".
For more details see this link.
def init_vars():
    os.environ['AIRFLOW_CONN_VAR_FOO'] = "https://mywebxxx.net/"
    print(os.environ['AIRFLOW_CONN_VAR_FOO'])


with DAG(
        dag_id='request_test',
        schedule_interval=None,
        start_date=days_ago(2)) as dag:

    init_vars()

    task_is_api_active = HttpSensor(
        task_id='is_api_active',
        http_conn_id='var_foo',
        endpoint='post'
    )

    get_data = PythonOperator(task_id="get_data",
                              python_callable=get_data)

    task_is_api_active
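As a quick sanity check, a connection defined this way can be read back with BaseHook; a minimal sketch, assuming AIRFLOW_CONN_VAR_FOO is visible to the process doing the lookup (i.e. the worker that runs the sensor):

from airflow.hooks.base_hook import BaseHook

# Resolves via the environment-variables secrets backend: conn_id "var_foo"
# maps to the AIRFLOW_CONN_VAR_FOO variable set above.
conn = BaseHook.get_connection("var_foo")
print(conn.conn_type, conn.host)  # e.g. https mywebxxx.net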

Related

Multiple jobs getting triggered instead of only one job in MWAA

We are facing an issue in the Amazon Managed Workflows for Apache Airflow (MWAA) service. We created 10 different DAGs with almost the same operators (an AWS Glue job) but with different parameters passed to the same job.
The issue is that operators in multiple DAGs are getting triggered automatically even though they are not scheduled. We tried changing the alias name for each operator in the 10 DAGs, but random operators still get triggered randomly across the 10 DAGs.
Please advise on how to troubleshoot this issue.
We set the dependency to the next job upon completion, but it is still not working as expected.
DAG code for reference:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 21 11:31:03 2022
@author:
"""
# psycopg2-binary
import os
import boto3
import logging
import json
import pandas as pd
from datetime import datetime, timedelta
from airflow import DAG
from airflow.models.baseoperator import chain
from airflow.operators.bash import BashOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.providers.amazon.aws.operators.glue import AwsGlueJobOperator
from airflow.utils.dates import days_ago
from airflow.providers.postgres.hooks.postgres import PostgresHook
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.models import TaskInstance
from airflow.models import XCom
from airflow.models import Variable
from airflow.utils.task_group import TaskGroup

DEFAULT_ARGS = {
    "owner": "<<owner_name>>",
    "depends_on_past": False,
    "retries": 0,
    "email_on_failure": False,
    "email_on_retry": False,
}


def work_with_postgress_lims(ti, **kwargs):
    try:
        hook = PostgresHook(postgres_conn_id="<<db_name>>")
        print(kwargs)
        print("Hello from method")
        print(hook)
        id_list = f"SELECT id::varchar,dataobjectname FROM <<table_name>> where systeminfoid ='lims' and process_flag ='Y' and airflow_flag ='Y' order by id;"
        print(hook.schema)
        id_values = hook.get_records(id_list)
        print(id_values)
        stg_list = f"SELECT id::varchar,dataobjectname FROM <<table_name>> where systeminfoid ='lims' and process_flag ='Y' and staging_flag ='Y' and airflow_flag ='Y' order by id;"
        print(hook.schema)
        stg_values = hook.get_records(stg_list)
        print(stg_list)
        Variable.set(key='lims_table_and_dataset_list',
                     value=id_values, serialize_json=True)
        options = Variable.get('lims_table_and_dataset_list',
                               default_var=['default_table'],
                               deserialize_json=True)
        Variable.set(key='lims_stg_table_and_dataset_list',
                     value=stg_values, serialize_json=True)
        options2 = Variable.get('lims_stg_table_and_dataset_list',
                                default_var=['default_table'],
                                deserialize_json=True)
        print(options)
        print(options2)
        return id_values, stg_values
    except Exception as e:
        print(e)


glue_client = boto3.client('glue', region_name='<<region_name>>')
logger = logging.getLogger('airflow.task')

with DAG(
        dag_id='kdh_source_to_curated_lims',
        description="source to curated",
        default_args=DEFAULT_ARGS,
        dagrun_timeout=timedelta(hours=48),
        start_date=datetime(2022, 9, 21, 6, 15, 00),
        concurrency=5,
        max_active_runs=2,
        schedule_interval=None) as dag:

    work_with_postgress_lims = PythonOperator(
        task_id='python_callable_operator_lims',
        python_callable=work_with_postgress_lims,
        do_xcom_push=False,
        provide_context=True
    )

    options_dataset_id = Variable.get('lims_table_and_dataset_list',
                                      default_var=['default_table'],
                                      deserialize_json=True)
    options_stg_dataset_id = Variable.get('lims_stg_table_and_dataset_list',
                                          default_var=['default_table'],
                                          deserialize_json=True)

    kdh_jr_invoke_lims = AwsGlueJobOperator(task_id='kdh_jr_invoke_lims', job_name='kdh_jr_invoke', script_args={'--source_system': 'lims'})

    with TaskGroup('dynamic_raw_tasks_group_lims', prefix_group_id=False,) as dynamic_raw_tasks_group_lims:
        if options_dataset_id:
            for option_dataset_id in options_dataset_id:
                t = AwsGlueJobOperator(task_id=option_dataset_id[1]+'_raw', job_name='kdh-rd_jr', script_args={'--dataset_id': option_dataset_id[0], '--airflow_flag': 'Y'})
                last = DummyOperator(task_id=option_dataset_id[1]+'_raw_end')
                t >> last

    with TaskGroup('dynamic_stg_tasks_group_lims', prefix_group_id=False,) as dynamic_stg_tasks_group_lims:
        if options_stg_dataset_id:
            for option_stg_dataset_id in options_stg_dataset_id:
                t = AwsGlueJobOperator(task_id=option_stg_dataset_id[1]+'_stg', job_name='kdh_dq_staging', script_args={'--source_system': 'lims', '--table': option_stg_dataset_id[1]})
                last = DummyOperator(task_id=option_stg_dataset_id[1]+'_stg_end')
                t >> last

    kdh_jr_curated_lims = AwsGlueJobOperator(task_id='kdh_jr_curated_lims', job_name='kdh_stg_curated', script_args={'--source_system': 'lims'})

    kdh_jr_invoke_lims >> work_with_postgress_lims >> dynamic_raw_tasks_group_lims >> dynamic_stg_tasks_group_lims >> kdh_jr_curated_lims
    # work_with_postgress_lims >> dynamic_raw_tasks_group_lims >> dynamic_stg_tasks_group_lims

View on_failure_callback DAG logger

Let's take an example DAG.
Here is the code for it.
import logging
from airflow import DAG
from datetime import datetime, timedelta
from airflow.models import TaskInstance
from airflow.operators.python import PythonOperator
from airflow.operators.dummy import DummyOperator


def task_failure_notification_alert(context):
    logging.info("Task context details: %s", str(context))


def dag_failure_notification_alert(context):
    logging.info("DAG context details: %s", str(context))


def red_exception_task(ti: TaskInstance, **kwargs):
    raise Exception('red')


default_args = {
    "owner": "analytics",
    "start_date": datetime(2021, 12, 12),
    'retries': 0,
    'retry_delay': timedelta(),
    "schedule_interval": "@daily"
}

dag = DAG('logger_dag',
          default_args=default_args,
          catchup=False,
          on_failure_callback=dag_failure_notification_alert
          )

start_task = DummyOperator(task_id="start_task", dag=dag, on_failure_callback=task_failure_notification_alert)

red_task = PythonOperator(
    dag=dag,
    task_id='red_task',
    python_callable=red_exception_task,
    provide_context=True,
    on_failure_callback=task_failure_notification_alert
)

end_task = DummyOperator(task_id="end_task", dag=dag, on_failure_callback=task_failure_notification_alert)

start_task >> red_task >> end_task
Two functions, task_failure_notification_alert and dag_failure_notification_alert, are used as callbacks in case of failures.
In case of a task failure, we can see the logs for the task in the UI as usual.
But I am unable to find the logs for the DAG-level on_failure_callback anywhere in the UI. Where can we see them?
Under airflow/logs, find the "scheduler" folder. Under it, look for the specific date you ran the DAG (for example 2022-12-03), and there you will see a log file named after the DAG file (dag_file.log).
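A rough sketch of locating that folder programmatically, assuming the default file-based logging layout and a local install (the date is the one from the example above):

import os

# Assumption: stock logging config, logs stored under $AIRFLOW_HOME/logs.
airflow_home = os.environ.get("AIRFLOW_HOME", os.path.expanduser("~/airflow"))
scheduler_logs = os.path.join(airflow_home, "logs", "scheduler", "2022-12-03")
print(scheduler_logs)  # look here for the file named after the DAG file, e.g. logger_dag.py.log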

HttpSensor if endpoint not accessible or conn is wrong/not defined then do not go into next task

I have the following DAG below. My concern is about the HttpSensor. My goal is that if the endpoint is not accessible for whatever reason, the next task, get_data, should not be reached. Moreover, I would like the sensor to check the endpoint continuously and only move on to the get_data task once the endpoint is reachable. What I observe at the moment is that get_data is reached anyway, even if the sensor does not reach the endpoint or the connection is not defined. How can I solve that?
import os
import json
import datetime
import requests
from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.sftp.operators.sftp import SFTPOperator
from airflow.providers.sftp.sensors.sftp import SFTPSensor
from airflow.utils.dates import days_ago
from airflow.models import Variable
from airflow.sensors.http_sensor import HttpSensor
from airflow.hooks.base_hook import BaseHook


def get_data():
    response = requests.get("https://exampleweb/dis")
    if response.status_code == 200:
        print("The request was a success!")
        print(response.json())  # Return a string representation of the data payload
    elif response.status_code == 404:
        print("Result not found!")


with DAG(
        dag_id='request_test',
        schedule_interval=None,
        start_date=days_ago(2)) as dag:

    task_is_api_active = HttpSensor(
        task_id='is_api_active',
        http_conn_id='someconn',
        endpoint='exact'
    )

    get_data = PythonOperator(task_id="get_data",
                              python_callable=get_data)

    task_is_api_active >> get_data
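For reference, this is a sketch of the same sensor with the poke behaviour spelled out; the values are illustrative, 'someconn' is assumed to be a defined HTTP connection, and it is meant to replace the sensor inside the with DAG(...) block above.

from airflow.sensors.http_sensor import HttpSensor

task_is_api_active = HttpSensor(
    task_id='is_api_active',
    http_conn_id='someconn',
    endpoint='exact',
    poke_interval=60,   # re-check the endpoint every 60 seconds
    timeout=30 * 60,    # after 30 minutes of failed pokes, fail the sensor
    mode='poke',
)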

Overwrite Airflow DAG parameter using CLI

Given the following DAG:
import logging
from datetime import datetime
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator

dag = DAG(
    dag_id="dag_foo",
    start_date=datetime(2022, 2, 28),
    default_args={"owner": "Airflow", "params": {"param_a": "foo"}},
    schedule_interval="@once",
    catchup=False
)


def log_dag_param(param):
    logging.info(param)


with dag:
    DummyOperator(task_id="dummy") >> PythonOperator(
        task_id="log_dag_param",
        python_callable=log_dag_param, op_args=[dag.params["param_a"]]
    )
I'm wondering if there is any way to overwrite an existing DAG parameter using the CLI. I'm aware of airflow.models.dagrun.DagRun.conf, the --conf parameter, and this approach, but I'm looking for a way to overwrite a DAG parameter instead of a conf value.
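For comparison, a hedged sketch of how a --conf value would surface at runtime, as opposed to the parse-time dag.params used above; the key name comes from the DAG, while the callable and its fallback logic are illustrative:

import logging

def log_conf_or_param(**context):
    # Values passed with `airflow dags trigger dag_foo --conf '{"param_a": "bar"}'`
    # arrive on the DagRun; dag.params itself is not overwritten.
    dag_run = context["dag_run"]
    value = (dag_run.conf or {}).get("param_a", context["params"].get("param_a"))
    logging.info(value)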

Airflow : Publish a dynamically created dag

I want to be able to publish and trigger a DAG object from my code, which is not under the scheduler's control (viz. the $AIRFLOW_HOME/dags folder).
My last resort would be to programmatically create a .py file containing the DAG definition that I want to publish and save this file to the $AIRFLOW_HOME/dags folder.
I'm sure it should be easier than that.
Below is what I've tried.
import airflow
from airflow import DAG
from datetime import timedelta
from airflow.models import DagPickle
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.db import provide_session


@provide_session
def submit_dag(session=None):
    args = {
        'owner': 'airflow',
        'start_date': airflow.utils.dates.days_ago(2)
    }
    dag = DAG(
        dag_id='sample', default_args=args,
        schedule_interval=None, start_date=airflow.utils.dates.days_ago(2),
        dagrun_timeout=timedelta(minutes=60))
    task = DummyOperator(task_id='one', dag=dag)
    dag_pickle = DagPickle(task)
    session.add(dag_pickle)
    session.commit()


submit_dag()
The above code does create entries in the dag_pickle table, but how do I publish and later trigger this DAG?
I could do pickle.dump(dag, open(DAGS_FOLDER/pickled_dags, 'wb')) and have a file in the DAGS_FOLDER that would pickle.load(DAGS_FOLDER/pickled_dags).
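A minimal sketch of the "last resort" mentioned above, i.e. programmatically writing a generated DAG file into the dags folder so the scheduler picks it up; the folder path and the embedded DAG are assumptions, not a recommended pattern:

import os

DAGS_FOLDER = os.path.expanduser("~/airflow/dags")  # assumed default location

dag_source = """
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.dates import days_ago

with DAG(dag_id="sample", start_date=days_ago(2), schedule_interval=None) as dag:
    DummyOperator(task_id="one")
"""

# Once the scheduler has parsed the new file, the DAG can be triggered like any
# other, e.g. with `airflow dags trigger sample` or via the REST API.
with open(os.path.join(DAGS_FOLDER, "sample_generated.py"), "w") as f:
    f.write(dag_source)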
