PythonOperator with python_callable set gets executed constantly - airflow

import airflow
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta
from workflow.task import some_task
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email': ['jimin.park1#aig.com'],
'email_on_failure': False,
'email_on_retry': False,
'retries': 0,
'retry_delay': timedelta(minutes=1),
'start_date': airflow.utils.dates.days_ago(0)
# 'queue': 'bash_queue',
# 'pool': 'backfill',
# 'priority_weight': 10,
# 'end_date': datetime(2016, 1, 1),
}
dag = DAG('JiminTest', default_args=default_args, schedule_interval='*/1 * * * *', catchup=False)
t1 = PythonOperator(
task_id='Task1',
provide_context=True,
python_callable=some_task,
dag=dag
)
The actual some_task itself simply appends timestamp to some file. As you can see in the dag config file, the task itself is configured to run every 1 min.
def some_task(ds, **kwargs):
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
with open("test.txt", "a") as myfile:
myfile.write(current_time + '\n')
I simply tail -f the output file and started up the webserver without the scheduler running. This function was being called and things were being appended to the file when webserver starts up. When I start up the scheduler, on each execution loop, the file gets appended.
What I want is for the function to be executed on every minute as intended, not every execution loop.

The scheduler will run each DAG file every scheduler loop, including all import statements.
Is there anything running code in the file from where you are importing the function?

Try to check the scheduler_heartbeat_sec config parameter in your config file. For your case it should be smaller than 60 seconds.
If you want the scheduler not to cahtchup previous runs set catchup_by_defaultto False (I am not sure if this relevant to your question though).
Please indicate which Apache Airflow version are you using

Related

Airflow tasks not gettin running

I am trying to run a simple BASHOperator task in Airflow. The DAG when trigerred manually lists the tasks in Tree and Graph view but the tasks are always in not started state.
I have restarted my Airflow scheduler. I am running Airflow on local host using a Kubectl image on Docker Compose.
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email': ['vijayraghunath21#gmail.com'],
'email_on_success': True,
'email_on_failure': True,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=2),
}
with DAG(
dag_id='bash_demo',
default_args=default_args,
description='Bash Demo',
start_date=datetime(2021, 1, 1),
# schedule_interval='0 2 * * *',
schedule_interval=None,
max_active_runs=1,
catchup=False,
tags=['bash_demo'],
) as dag:
dag.doc_md = __doc__
# Task 1
dummy_task = DummyOperator(task_id='dummy_task')
# Task 2
bash_task = BashOperator(
task_id='bash_task', bash_command="echo 'command executed from BashOperator'")
dummy_task >> bash_task
DAG Image
As shown on the image you added the DAG is set to off thus it's not running. You should click on the toggle button to set it to on.
This issue can be avoided in two ways:
Global solution- if you wills set dags_are_paused_at_creation = False in airflow.cfg - This will effect all DAGs in the system.
Local solution - if you will use is_paused_upon_creation in the DAG contractor:
with DAG(
dag_id='bash_demo',
...
is_paused_upon_creation=False,
) as dag:
This parameter specifies if the dag is paused when created for the first time. If the dag exists already, the parameter is being ignored.

Apache-Airflow - Task is in the none state when running DAG

Just started with airflow and wanted to run simple dag with BashOperator that outputs 'Hello' to console
I noticed that my status is indefinitely stuck in 'Running'
When I go on task details, I get this:
Task is in the 'None' state which is not a valid state for execution. The task must be cleared in order to be run.
Any suggestions or hints are much appreciated.
Dag:
from datetime import timedelta
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago
default_args = {
'owner': 'dude_whose_doors_open_like_this_-W-',
'depends_on_past': False,
'start_date': days_ago(2),
'email': ['yessure#gmail.com'],
'email_on_failure': True,
'email_on_retry': True,
'retries': 1,
'retry_delay': timedelta(minutes=5),
}
dag = DAG(
'Test',
default_args=default_args,
description='Test',
schedule_interval=timedelta(days=1)
)
t1 = BashOperator(
task_id='ECHO',
bash_command='echo "Hello"',
dag=dag
)
t1
I've managed to solve it by adding 'start_date': dt(1970, 1, 1),
to default args object
and adding schedule_interval=None to my dag object
Could you remove the last line of t1- this isn't necessary. Also start_dateshouldn't be set dynamically - this can lead to problems with the scheduling.

Airflow sql_path not able to read the sql files when passed as Jinja Template Variable

I am trying to use Jinja template variable as against using Variable.get('sql_path'), So as to avoid hitting DB for every scan of the dag file
Original code
import datetime
import os
from functools import partial
from datetime import timedelta
from airflow.models import DAG,Variable
from airflow.contrib.operators.snowflake_operator import SnowflakeOperator
from alerts.email_operator import dag_failure_email
SNOWFLAKE_CONN_ID = 'etl_conn'
tmpl_search_path = []
for subdir in ['business/', 'audit/', 'business/transform/']:
tmpl_search_path.append(os.path.join(Variable.get('sql_path'), subdir))
def get_db_dag(
*,
dag_id,
start_date,
schedule_interval,
max_taskrun,
max_dagrun,
proc_nm,
load_sql
):
default_args = {
'owner': 'airflow',
'start_date': start_date,
'provide_context': True,
'execution_timeout': timedelta(minutes=max_taskrun),
'retries': 0,
'retry_delay': timedelta(minutes=3),
'retry_exponential_backoff': True,
'email_on_retry': False,
}
dag = DAG(
dag_id=dag_id,
schedule_interval=schedule_interval,
dagrun_timeout=timedelta(hours=max_dagrun),
template_searchpath=tmpl_search_path,
default_args=default_args,
max_active_runs=1,
catchup='{{var.value.dag_catchup}}',
on_failure_callback=alert_email_callback,
)
load_table = SnowflakeOperator(
task_id='load_table',
sql=load_sql,
snowflake_conn_id=SNOWFLAKE_CONN_ID,
autocommit=True,
dag=dag,
)
load_vcc_svc_recon
return dag
# ======== DAG DEFINITIONS #
edw_table_A = get_db_dag(
dag_id='edw_table_A',
start_date=datetime.datetime(2020, 5, 21),
schedule_interval='0 5 * * *',
max_taskrun=3, # Minutes
max_dagrun=1, # Hours
load_sql='recon/extract.sql',
)
When I have replaced Variable.get('sql_path') with Jinja Template '{{var.value.sql_path}}' as below and ran the Dag, it threw an error as below, it was not able to get the file from the subdirectory of the SQL folder
tmpl_search_path = []
for subdir in ['bus/', 'audit/', 'business/snflk/']:
tmpl_search_path.append(os.path.join('{{var.value.sql_path}}', subdir))
Got below error as
inja2.exceptions.TemplateNotFound: extract.sql
Templates are not rendered everywhere in a DAG script. Usually they are rendered in the templated parameters of Operators. So, unless you pass the elements of tmpl_search_path to some templated parameter {{var.value.sql_path}} will not be rendered.
The template_searchpath of DAG is not templated. That is why you cannot pass Jinja templates to it.
The options of which I can think are
Hardcode the value of Variable.get('sql_path') in the pipeline script.
Save the value of Variable.get('sql_path') in a configuration file and read it from there in the pipeline script.
Move the Variable.get() call out of the for-loop. This will result in three times fewer requests to the database.
More info about templating in Airflow.

Airflow does not run dags

Context: I successfully installed Airflow on EC2, changed things like executor to LocalExecutor; sql_alchemy_conn to postgresql+psycopg2://postgres#localhost:5432/airflow; max_threads to 10.
My problem is when I create a dag which I indicate to be run everyday everything is fine, but when I create a dag to be run like at 10am on Monday and Wednesday Airflow doesn't does not run it. Does anybody know what could I do wrong and should I do in order to fix this issue?
Dag for script which runs fine and properly:
import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import timedelta
args = {
'owner': 'arseniyy123',
'start_date': airflow.utils.dates.days_ago(1),
'depends_on_past': False,
'email': ['exam#exam.com'],
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=1),
}
dag = DAG(
'daily_script',
default_args=args,
description = 'daily_script',
schedule_interval = "0 10 * * *",
)
t1 = BashOperator(
task_id='daily',
bash_command='cd /root/ && python3 DAILY_WORK.py',
dag=dag)
t1
Dag for script which should run on Monday and Wednesday, but it does not run at all:
import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import timedelta
args = {
'owner': 'arseniyy123',
'start_date': airflow.utils.dates.days_ago(1),
'depends_on_past': False,
'email': ['exam#exam.com'],
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=1),
}
dag = DAG(
'monday_wednesday',
default_args=args,
description = 'monday_wednesday',
schedule_interval = "0 10 * * 1,3",
)
t1 = BashOperator(
task_id='monday_wednesday',
bash_command='cd /root/ && python3 not_daily_work.py',
dag=dag)
t1
I also have some problems with scheduler, it uses to die after being working more than 10 hours, anybody know why does it happen?
Thank you in advance!
Can you try changing the start_date to a static datetime e.g. datetime.date(2020, 3, 20) instead of using airflow.utils.dates.days_ago(1)
Maybe read through the scheduling examples here, to understand why your code didn't work. From that documentation:
Let’s Repeat That The scheduler runs your job one schedule_interval AFTER the start date, at the END of the period

Airflow does not run task

I've written a python task for Airflow. When I load my DAG into Airflow, it shows everything just fine. Once I trigger a run for my DAG, it will create a run and switch to succeeded without ever running my task. The webserver and scheduler are both running and there's nothing in the log.
The task doesn't even get a status while running (not even skipped).
If I run my task directly using airflow test update_dags work 2019-01-01 it runs just fine.
This is my DAG:
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta
import os
default_args = {
'owner': 'Airflow',
'depends_on_past': False,
'start_date': datetime.today(),
'email': ['***redacted***'],
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=5),
'params': {
'git_repository': '***redacted***',
'git_ref': 'origin/master',
'git_folder': '/opt/dag-repository'
}
}
def command(cmd: str, *args):
templated_command = cmd.format(*args)
print('Running command: {}'.format(templated_command))
os.system(templated_command)
def do_work(**kwargs):
params = kwargs['params']
dag_directory = kwargs['conf'].get('core', 'dags_folder')
git_repository = params['git_repository']
git_ref = params['git_ref']
git_folder = params['git_folder']
command('if [ ! -d {0} ]; then git clone {1} {0}; fi', git_folder, git_repository)
command('cd {0}; git fetch -apt', git_folder)
command('cd {0}; git reset --hard {1}', git_folder, git_ref)
command('ln -sf {0} {1}', '{}/src'.format(git_folder), dag_directory)
with DAG('update_dags', default_args=default_args, schedule_interval=timedelta(minutes=10), max_active_runs=1) as dag:
work_stage = PythonOperator(
task_id="work",
python_callable=do_work,
provide_context=True
)
I also had the same problem. I solved the problem by turning os.system(cmd) into subprocess.run(cmd, shell=True, check=True), but I'm not quite sure why. hope this helps

Resources