Multiple jobs getting triggered instead of only one job in MWAA - airflow

We are facing an issue with the Amazon Managed Workflows for Apache Airflow (MWAA) service. We created 10 different DAGs with almost identical operators (an AWS Glue job operator), differing only in the parameters passed to the same Glue job.
Operators in multiple DAGs are being triggered automatically even though they are not scheduled. We tried changing the alias name for each operator in the 10 DAGs, but random operators still get triggered across the 10 DAGs.
Please advise on how to troubleshoot this issue.
We set the dependency to the next job upon completion, but it is still not working as expected.
DAG code for reference:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 21 11:31:03 2022
@author:
"""
# psycopg2-binary
import os
import boto3
import logging
import json
import pandas as pd
from datetime import datetime, timedelta
from airflow import DAG
from airflow.models.baseoperator import chain
from airflow.operators.bash import BashOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.providers.amazon.aws.operators.glue import AwsGlueJobOperator
from airflow.providers.postgres.hooks.postgres import PostgresHook
from airflow.utils.dates import days_ago
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.models import TaskInstance
from airflow.models import XCom
from airflow.models import Variable
from airflow.utils.task_group import TaskGroup

DEFAULT_ARGS = {
    "owner": "<<owner_name>>",
    "depends_on_past": False,
    "retries": 0,
    "email_on_failure": False,
    "email_on_retry": False,
}


def work_with_postgress_lims(ti, **kwargs):
    try:
        hook = PostgresHook(postgres_conn_id="<<db_name>>")
        print(kwargs)
        print("Hello from method")
        print(hook)
        id_list = f"SELECT id::varchar,dataobjectname FROM <<table_name>> where systeminfoid ='lims' and process_flag ='Y' and airflow_flag ='Y' order by id;"
        print(hook.schema)
        id_values = hook.get_records(id_list)
        print(id_values)
        stg_list = f"SELECT id::varchar,dataobjectname FROM <<table_name>> where systeminfoid ='lims' and process_flag ='Y' and staging_flag ='Y' and airflow_flag ='Y' order by id;"
        print(hook.schema)
        stg_values = hook.get_records(stg_list)
        print(stg_list)
        Variable.set(key='lims_table_and_dataset_list',
                     value=id_values, serialize_json=True)
        options = Variable.get('lims_table_and_dataset_list',
                               default_var=['default_table'],
                               deserialize_json=True)
        Variable.set(key='lims_stg_table_and_dataset_list',
                     value=stg_values, serialize_json=True)
        options2 = Variable.get('lims_stg_table_and_dataset_list',
                                default_var=['default_table'],
                                deserialize_json=True)
        print(options)
        print(options2)
        return id_values, stg_values
    except Exception as e:
        print(e)


glue_client = boto3.client('glue', region_name='<<region_name>>')
logger = logging.getLogger('airflow.task')

with DAG(
        dag_id='kdh_source_to_curated_lims',
        description="source to curated",
        default_args=DEFAULT_ARGS,
        dagrun_timeout=timedelta(hours=48),
        start_date=datetime(2022, 9, 21, 6, 15, 00),
        concurrency=5,
        max_active_runs=2,
        schedule_interval=None) as dag:

    work_with_postgress_lims = PythonOperator(
        task_id='python_callable_operator_lims',
        python_callable=work_with_postgress_lims,
        do_xcom_push=False,
        provide_context=True
    )

    options_dataset_id = Variable.get('lims_table_and_dataset_list',
                                      default_var=['default_table'],
                                      deserialize_json=True)
    options_stg_dataset_id = Variable.get('lims_stg_table_and_dataset_list',
                                          default_var=['default_table'],
                                          deserialize_json=True)

    kdh_jr_invoke_lims = AwsGlueJobOperator(task_id='kdh_jr_invoke_lims', job_name='kdh_jr_invoke', script_args={'--source_system': 'lims'})

    with TaskGroup('dynamic_raw_tasks_group_lims', prefix_group_id=False,) as dynamic_raw_tasks_group_lims:
        if options_dataset_id:
            for option_dataset_id in options_dataset_id:
                t = AwsGlueJobOperator(task_id=option_dataset_id[1] + '_raw', job_name='kdh-rd_jr', script_args={'--dataset_id': option_dataset_id[0], '--airflow_flag': 'Y'})
                last = DummyOperator(task_id=option_dataset_id[1] + '_raw_end')
                t >> last

    with TaskGroup('dynamic_stg_tasks_group_lims', prefix_group_id=False,) as dynamic_stg_tasks_group_lims:
        if options_stg_dataset_id:
            for option_stg_dataset_id in options_stg_dataset_id:
                t = AwsGlueJobOperator(task_id=option_stg_dataset_id[1] + '_stg', job_name='kdh_dq_staging', script_args={'--source_system': 'lims', '--table': option_stg_dataset_id[1]})
                last = DummyOperator(task_id=option_stg_dataset_id[1] + '_stg_end')
                t >> last

    kdh_jr_curated_lims = AwsGlueJobOperator(task_id='kdh_jr_curated_lims', job_name='kdh_stg_curated', script_args={'--source_system': 'lims'})

    kdh_jr_invoke_lims >> work_with_postgress_lims >> dynamic_raw_tasks_group_lims >> dynamic_stg_tasks_group_lims >> kdh_jr_curated_lims
    # work_with_postgress_lims >> dynamic_raw_tasks_group_lims >> dynamic_stg_tasks_group_lims >> kdh_jr_curated_lims
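Not an answer as such, but since the 10 DAGs differ only in their parameters, here is a hedged sketch of generating them from a single definition so the copies cannot collide on dag_ids, task_ids or Variable keys. The source list, the dag_id prefix and the job names are illustrative assumptions, not taken from the original post:

# Sketch only: one file in the DAG folder that builds one DAG per source system,
# with the source name baked into the dag_id and task_ids so the 10 near-identical
# definitions stay fully independent of each other.
from datetime import datetime, timedelta
from airflow import DAG
from airflow.providers.amazon.aws.operators.glue import AwsGlueJobOperator

SOURCE_SYSTEMS = ["lims", "mes", "erp"]  # illustrative list standing in for the 10 sources


def build_dag(source: str) -> DAG:
    with DAG(
            dag_id=f"kdh_source_to_curated_{source}",
            start_date=datetime(2022, 9, 21, 6, 15),
            dagrun_timeout=timedelta(hours=48),
            max_active_runs=2,
            schedule_interval=None,   # only runs when triggered explicitly
            catchup=False) as dag:
        AwsGlueJobOperator(
            task_id=f"kdh_jr_invoke_{source}",
            job_name="kdh_jr_invoke",
            script_args={"--source_system": source},
        )
    return dag


for source in SOURCE_SYSTEMS:
    # Registering each DAG object in globals() lets the scheduler discover it.
    globals()[f"kdh_source_to_curated_{source}"] = build_dag(source)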

Related

HttpSensor if endpoint not accessible or conn is wrong/not defined then do not go into next task

I have the following DAG below. My concern is about HttpSensor. My goal is that if the endpoint is not accessible for whatever reason, the next task, get_data, should not be reached. Moreover, I would like the sensor to check the endpoint continuously and only proceed to the get_data task once the endpoint is reachable. What I observe at the moment is that get_data is reached even if the sensor cannot reach the endpoint or the connection is not defined. How can I solve that?
import os
import json
import datetime
import requests
from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.sftp.operators.sftp import SFTPOperator
from airflow.providers.sftp.sensors.sftp import SFTPSensor
from airflow.utils.dates import days_ago
from airflow.models import Variable
from airflow.sensors.http_sensor import HttpSensor
from airflow.hooks.base_hook import BaseHook


def get_data():
    response = requests.get("https://exampleweb/dis")
    if response.status_code == 200:
        print("The request was a success!")
        print(response.json())  # Return a string representation of the data payload
    elif response.status_code == 404:
        print("Result not found!")


with DAG(
        dag_id='request_test',
        schedule_interval=None,
        start_date=days_ago(2)) as dag:

    task_is_api_active = HttpSensor(
        task_id='is_api_active',
        http_conn_id='someconn',
        endpoint='exact'
    )

    get_data = PythonOperator(task_id="get_data",
                              python_callable=get_data)

    task_is_api_active >> get_data
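For reference, a hedged sketch of how the sensor is commonly configured so that the downstream task only runs once the endpoint actually responds. The connection id someconn and the poke_interval/timeout values are assumptions for illustration; response_check, mode, poke_interval and timeout are standard sensor parameters:

# Sketch only: assumes an Airflow 2.x HTTP provider and a connection named "someconn".
from airflow.providers.http.sensors.http import HttpSensor

task_is_api_active = HttpSensor(
    task_id='is_api_active',
    http_conn_id='someconn',          # must be a defined Airflow connection
    endpoint='exact',
    poke_interval=30,                 # re-check the endpoint every 30 seconds
    timeout=60 * 10,                  # fail the sensor after 10 minutes of trying
    mode='poke',                      # keep polling until success or timeout
    response_check=lambda response: response.status_code == 200,
)

# Because get_data is downstream of the sensor, it only runs after the sensor
# succeeds; if the sensor fails or times out, get_data stays upstream_failed.
task_is_api_active >> get_data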

The conn_id environment variable isn't defined

I have a DAG which tries to connect to an HTTP endpoint, but somehow it doesn't work. I defined the connection using an environment variable, yet it is not seen by HttpSensor and I get the error below, even though the variable was created in the system. What's wrong here? The DAG and full error are below.
The conn_id `AIRFLOW_VAR_FOO` isn't defined
DAG:
import os
import json
import pprint
import datetime
import requests
from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.sftp.operators.sftp import SFTPOperator
from airflow.providers.sftp.sensors.sftp import SFTPSensor
from airflow.utils.dates import days_ago
from airflow.models import Variable
from airflow.sensors.http_sensor import HttpSensor
from airflow.hooks.base_hook import BaseHook


def init_vars():
    os.environ['AIRFLOW_VAR_FOO'] = "https://mywebxxx.net/"
    print(os.environ['AIRFLOW_VAR_FOO'])


with DAG(
        dag_id='request_test',
        schedule_interval=None,
        start_date=days_ago(2)) as dag:

    init_vars = PythonOperator(task_id="init_vars",
                               python_callable=init_vars)

    task_is_api_active = HttpSensor(
        task_id='is_api_active',
        http_conn_id='AIRFLOW_VAR_FOO',
        endpoint='post'
    )

    get_data = PythonOperator(task_id="get_data",
                              python_callable=get_data)

    init_vars >> task_is_api_active
Full Error:
File "/home/airflow/.local/lib/python3.7/site-packages/airflow/models/connection.py", line 379, in get_connection_from_secrets
raise AirflowNotFoundException(f"The conn_id `{conn_id}` isn't defined")
airflow.exceptions.AirflowNotFoundException: The conn_id `AIRFLOW_VAR_FOO` isn't defined
[2022-11-04 10:32:41,720] {taskinstance.py:1551} INFO - Marking task as FAILED. dag_id=request_test, task_id=is_api_active, execution_date=20221104T103235, start_date=20221104T103240, end_date=20221104T103241
[2022-11-04 10:32:42,628] {local_task_job.py:149} INFO - Task exited with return code 1
EDIT:
import os
import json
import pprint
import datetime
import requests
from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.sftp.operators.sftp import SFTPOperator
from airflow.providers.sftp.sensors.sftp import SFTPSensor
from airflow.utils.dates import days_ago
from airflow.models import Variable
from airflow.sensors.http_sensor import HttpSensor
from airflow.hooks.base_hook import BaseHook


def init_vars():
    os.environ['AIRFLOW_VAR_FOO'] = "https://mywebxxx.net/"
    print(os.environ['AIRFLOW_VAR_FOO'])


with DAG(
        dag_id='request_test',
        schedule_interval=None,
        start_date=days_ago(2)) as dag:

    init_vars = PythonOperator(task_id="init_vars",
                               python_callable=init_vars)
    call init_vars()

    task_is_api_active = HttpSensor(
        task_id='is_api_active',
        http_conn_id='AIRFLOW_VAR_FOO',
        endpoint='post'
    )

    get_data = PythonOperator(task_id="get_data",
                              python_callable=get_data)

    task_is_api_active
You need to define the environment variable as AIRFLOW_CONN_VAR_FOO and then use http_conn_id="var_foo".
For more details, see this link.
def init_vars():
    # Connections defined via environment variables must use the AIRFLOW_CONN_ prefix;
    # the part after the prefix (lower-cased) becomes the conn_id.
    os.environ['AIRFLOW_CONN_VAR_FOO'] = "https://mywebxxx.net/"
    print(os.environ['AIRFLOW_CONN_VAR_FOO'])


with DAG(
        dag_id='request_test',
        schedule_interval=None,
        start_date=days_ago(2)) as dag:

    init_vars()

    task_is_api_active = HttpSensor(
        task_id='is_api_active',
        http_conn_id='var_foo',
        endpoint='post'
    )

    get_data = PythonOperator(task_id="get_data",
                              python_callable=get_data)

    task_is_api_active
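As a variation on the same idea, a minimal sketch that avoids setting os.environ from DAG code, assuming you can export environment variables wherever the scheduler and workers run (the URL is illustrative):

# Outside the DAG file, in the environment of the scheduler/workers (shell,
# docker-compose, etc.), define the connection once:
#
#   export AIRFLOW_CONN_VAR_FOO='https://mywebxxx.net/'
#
# Airflow then exposes it as a connection with conn_id "var_foo", so the DAG
# code only has to reference the conn_id:
task_is_api_active = HttpSensor(
    task_id='is_api_active',
    http_conn_id='var_foo',
    endpoint='post',
)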

Overwrite Airflow DAG parameter using CLI

Given the following DAG:
import logging
from datetime import datetime
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator

dag = DAG(
    dag_id="dag_foo",
    start_date=datetime(2022, 2, 28),
    default_args={"owner": "Airflow", "params": {"param_a": "foo"}},
    schedule_interval="@once",
    catchup=False
)


def log_dag_param(param):
    logging.info(param)


with dag:
    DummyOperator(task_id="dummy") >> PythonOperator(
        task_id="log_dag_param",
        python_callable=log_dag_param, op_args=[dag.params["param_a"]]
    )
I'm wondering if there is any way to overwrite an existing DAG parameter using the CLI. I'm aware of airflow.models.dagrun.DagRun.conf, the --conf parameter, and this approach, but I'm looking for how to overwrite a DAG parameter rather than a conf value.
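A hedged sketch of the closest mechanism I'm aware of: in Airflow 2.2+ you can declare params on the DAG itself and override them per run through the trigger conf (subject to the version and the dag_run_conf_overrides_params setting). The dag_id, values and task names below are illustrative:

# Sketch, assuming Airflow 2.2+ "Params": declare the param on the DAG, read it
# from the task context, and override it at trigger time with --conf.
import logging
from datetime import datetime
from airflow import DAG
from airflow.operators.python import PythonOperator


def log_dag_param(**context):
    # context["params"] holds DAG-level params; matching keys supplied via
    # --conf take precedence for that run (version/config dependent).
    logging.info(context["params"]["param_a"])


with DAG(
    dag_id="dag_foo",
    start_date=datetime(2022, 2, 28),
    schedule_interval="@once",
    catchup=False,
    params={"param_a": "foo"},
) as dag:
    PythonOperator(task_id="log_param", python_callable=log_dag_param)

# CLI (illustrative): trigger a run that overrides param_a for that run only:
#   airflow dags trigger dag_foo --conf '{"param_a": "bar"}'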

Generating airflow DAGs dynamically

I am trying to generate Airflow DAGs from a template in Python code, using globals() as described here to define the DAG object and save it. Below is my code:
import datetime as dt
import sys

import airflow
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator

argumentList = sys.argv
owner = argumentList[1]
dag_name = argumentList[2]
taskID = argumentList[3]
bashCommand = argumentList[4]

default_args = {
    'owner': owner,
    'start_date': dt.datetime(2019, 6, 1),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
}


def dagCreate():
    with DAG(dag_name,
             default_args=default_args,
             schedule_interval=None,
             ) as dag:
        print_hello = BashOperator(task_id=taskID, bash_command=bashCommand)
    return dag


globals()[dag_name] = dagCreate()
I have kept this Python code outside the DAG folder and am executing it as follows:
python bash-dag-generator.py Airflow test_bash_generate auto_bash_task ls
But I don't see any DAG generated in the Airflow webserver UI. I am not sure where I am going wrong.
As per the official documentation:
DAGs are defined in standard Python files that are placed in Airflow’s DAG_FOLDER. Airflow will execute the code in each file to dynamically build the DAG objects. You can have as many DAGs as you want, each describing an arbitrary number of tasks. In general, each one should correspond to a single logical workflow.
So unless your code is actually inside the DAG_FOLDER, it will not be registered as a DAG.
The way I have been able to implement dynamic DAGs is by using an Airflow Variable.
In the example below I have a CSV file that contains a list of Bash commands such as ls, echo, etc. As part of the read_file task I store the file location in an Airflow Variable. The part where we read the CSV file and loop through the commands is where the dynamic tasks get created.
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.models import Variable
from datetime import datetime, timedelta
import csv

'''
Orchestrate the Dynamic Tasks
'''


def read_file_task():
    print('I am reading a File and setting variables ')
    Variable.set('dynamic-dag-sample', '/home/bashoperator.csv')


with DAG('dynamic-dag-sample',
         start_date=datetime(2018, 11, 1)) as dag:

    read_file_task = PythonOperator(task_id='read_file_task',
                                    python_callable=read_file_task,
                                    provide_context=True,
                                    dag=dag)

    dynamic_dag_sample_file_path = Variable.get("dynamic-dag-sample")
    if dynamic_dag_sample_file_path != None:
        with open(dynamic_dag_sample_file_path) as csv_file:
            reader = csv.DictReader(csv_file)
            line_count = 0
            for row in reader:
                bash_task = BashOperator(task_id=row['Taskname'], bash_command=row['Command'])
                read_file_task.set_downstream(bash_task)
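For completeness, a minimal sketch of the globals() pattern the original question was aiming for, placed inside the DAG folder and driven by a static config list instead of sys.argv. The file name, config entries and dag_id prefix are illustrative assumptions:

# dags/generated_bash_dags.py (file name illustrative) -- lives in DAG_FOLDER,
# so the scheduler parses it and registers one DAG per config entry.
from datetime import datetime
from airflow import DAG
from airflow.operators.bash import BashOperator

# Illustrative configs; in practice these could come from a Variable, a JSON
# file, or any source available at parse time (but not from sys.argv).
DAG_CONFIGS = [
    {"dag_name": "generated_ls_dag", "task_id": "run_ls", "bash_command": "ls"},
    {"dag_name": "generated_echo_dag", "task_id": "run_echo", "bash_command": "echo hello"},
]


def create_dag(cfg):
    with DAG(dag_id=cfg["dag_name"],
             start_date=datetime(2019, 6, 1),
             schedule_interval=None,
             catchup=False) as dag:
        BashOperator(task_id=cfg["task_id"], bash_command=cfg["bash_command"])
    return dag


for cfg in DAG_CONFIGS:
    # Registering the DAG object in globals() lets Airflow discover it.
    globals()[cfg["dag_name"]] = create_dag(cfg)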

Airflow : Publish a dynamically created dag

I want to be able to publish and trigger a DAG object from code that is not under the scheduler's control (i.e. not in the $AIRFLOW_HOME/dags folder).
My last resort would be to programmatically create a .py file containing the DAG definition I want to publish and save it to the $AIRFLOW_HOME/dags folder.
I'm sure it should be easier than that.
Below is what I've tried.
import airflow
from airflow import DAG
from datetime import timedelta
from airflow.models import DagPickle
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.db import provide_session


@provide_session
def submit_dag(session=None):
    args = {
        'owner': 'airflow',
        'start_date': airflow.utils.dates.days_ago(2)
    }
    dag = DAG(
        dag_id='sample', default_args=args,
        schedule_interval=None, start_date=airflow.utils.dates.days_ago(2),
        dagrun_timeout=timedelta(minutes=60))
    task = DummyOperator(task_id='one', dag=dag)

    dag_pickle = DagPickle(task)
    session.add(dag_pickle)
    session.commit()


submit_dag()
The above code does create entries in the dag_pickle table, but how do I publish and later trigger this DAG?
I could do pickle.dump(dag, open(DAGS_FOLDER/pickled_dags, 'wb')) and have a file in the DAGs folder that does pickle.load(DAGS_FOLDER/pickled_dags).
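No answer is quoted above, but here is a hedged sketch of the "last resort" the question itself describes: render a small DAG file into the dags folder and trigger it once the scheduler has parsed it. The template, paths and dag_id are illustrative assumptions:

# Sketch only: generate a DAG definition file into $AIRFLOW_HOME/dags so the
# scheduler picks it up, then trigger it via the CLI (or REST API).
import os
import textwrap

DAG_TEMPLATE = textwrap.dedent("""
    from datetime import timedelta
    import airflow
    from airflow import DAG
    from airflow.operators.dummy_operator import DummyOperator

    dag = DAG(
        dag_id='{dag_id}',
        default_args={{'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(2)}},
        schedule_interval=None,
        dagrun_timeout=timedelta(minutes=60),
    )

    DummyOperator(task_id='one', dag=dag)
""")


def publish_dag(dag_id: str) -> str:
    # Write the rendered template into the dags folder; path is illustrative.
    dags_folder = os.path.join(os.environ.get("AIRFLOW_HOME", "~/airflow"), "dags")
    path = os.path.join(os.path.expanduser(dags_folder), f"{dag_id}.py")
    with open(path, "w") as f:
        f.write(DAG_TEMPLATE.format(dag_id=dag_id))
    return path


publish_dag("sample")
# Once the scheduler has parsed the new file (this can take up to the
# dag_dir_list_interval), trigger it, e.g.:
#   airflow dags trigger sample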

Resources