I have several cron expressions that I need to apply to a single DAG. There is no way to express them with one single cron expression.
Airflow 2.2 introduced Timetable. Is there an implementation that takes a list of cron expressions?
I was looking for the same thing, but didn't find anything. It would be nice if a standard one came with Airflow.
Here's a 0.1 version that I wrote for Airflow 2.2.5.
# This file is <airflow plugins directory>/timetable.py
from typing import Any, Dict, List, Optional
import pendulum
from croniter import croniter
from pendulum import DateTime, Duration, timezone, instance as pendulum_instance
from airflow.plugins_manager import AirflowPlugin
from airflow.timetables.base import DagRunInfo, DataInterval, TimeRestriction, Timetable
from airflow.exceptions import AirflowTimetableInvalid
class MultiCronTimetable(Timetable):
    """Timetable that triggers a DAG on the union of several cron expressions.

    A run is scheduled whenever any one of ``cron_defs`` matches, evaluated in
    ``timezone``.  Each run's data interval ends at the matched time and
    starts ``period_length`` ``period_unit`` earlier.
    """

    # Units accepted for ``period_unit``; each is used as a keyword argument
    # to ``Duration`` in ``data_period_start``.
    valid_units = ['minutes', 'hours', 'days']

    def __init__(self,
                 cron_defs: List[str],
                 timezone: str = 'Europe/Berlin',
                 period_length: int = 0,
                 period_unit: str = 'hours'):
        """Store the schedule definition.

        :param cron_defs: cron expressions; a run fires when any of them matches.
        :param timezone: name of the timezone the expressions are evaluated in.
        :param period_length: length of the data interval, in ``period_unit``.
        :param period_unit: one of ``valid_units``.
        """
        self.cron_defs = cron_defs
        # NOTE: the parameter shadows the imported ``timezone`` helper, but
        # only inside this method, where the helper is not needed.
        self.timezone = timezone
        self.period_length = period_length
        self.period_unit = period_unit

    def infer_manual_data_interval(self, run_after: DateTime) -> DataInterval:
        """
        Determines date interval for manually triggered runs.
        This is simply (now - period) to now.
        """
        end = run_after
        if self.period_length == 0:
            start = end
        else:
            start = self.data_period_start(end)
        return DataInterval(start=start, end=end)

    def next_dagrun_info(
            self,
            *,
            last_automated_data_interval: Optional[DataInterval],
            restriction: TimeRestriction) -> Optional[DagRunInfo]:
        """
        Determines when the DAG should be scheduled.

        Returns ``None`` when there is no start date, when no cron time can be
        determined, or when the next cron time is past ``restriction.latest``.
        """
        if restriction.earliest is None:
            # No start_date. Don't schedule.
            return None

        if last_automated_data_interval is None:
            # First automated run of this DAG.
            if restriction.catchup:
                scheduled_time = self.next_scheduled_run_time(restriction.earliest)
            else:
                scheduled_time = self.previous_scheduled_run_time()
                if scheduled_time is None:
                    # No previous cron time matched. Find one in the future.
                    scheduled_time = self.next_scheduled_run_time()
                elif scheduled_time < restriction.earliest:
                    # The most recent match lies before the DAG's start_date;
                    # never schedule earlier than that, so take the first
                    # match after the start_date instead.
                    scheduled_time = self.next_scheduled_run_time(restriction.earliest)
        else:
            last_scheduled_time = last_automated_data_interval.end
            if restriction.catchup:
                scheduled_time = self.next_scheduled_run_time(last_scheduled_time)
            else:
                scheduled_time = self.previous_scheduled_run_time()
                if scheduled_time is None or scheduled_time <= last_scheduled_time:
                    # Nothing matched since the last run (or that match was
                    # the last run itself): the next run is in the future.
                    scheduled_time = self.next_scheduled_run_time()
                # Otherwise a cron time matched after the last run but before
                # now; use that cron time.

        if scheduled_time is None:
            return None
        if restriction.latest is not None and scheduled_time > restriction.latest:
            # Over the DAG's scheduled end; don't schedule.
            return None

        start = self.data_period_start(scheduled_time)
        return DagRunInfo(run_after=scheduled_time,
                          data_interval=DataInterval(start=start, end=scheduled_time))

    def data_period_start(self, period_end: DateTime):
        """Return ``period_end`` minus the configured period."""
        return period_end - Duration(**{self.period_unit: self.period_length})

    def croniter_values(self, base_datetime=None):
        """Build one ``croniter`` per cron expression, based at ``base_datetime``.

        When no base is given, the current time in ``self.timezone`` is used.
        """
        if not base_datetime:
            tz = timezone(self.timezone)
            base_datetime = pendulum.now(tz)
        return [croniter(expr, base_datetime) for expr in self.cron_defs]

    def _localized_base(self, base_datetime: Optional[DateTime]) -> DateTime:
        """Return ``base_datetime`` in ``self.timezone``, or now if it is not given."""
        tz = timezone(self.timezone)
        if base_datetime:
            return base_datetime.in_timezone(tz)
        return pendulum.now(tz)

    def next_scheduled_run_time(self, base_datetime: Optional[DateTime] = None):
        """
        Get the earliest upcoming time that matches one of the cron schedules.

        Returns ``None`` when there are no cron expressions.
        """
        base = self._localized_base(base_datetime)
        candidates = [cron.get_next(DateTime) for cron in self.croniter_values(base)]
        if not candidates:
            return None
        return pendulum_instance(min(candidates))

    def previous_scheduled_run_time(self, base_datetime: Optional[DateTime] = None):
        """
        Get the most recent time in the past that matches one of the cron schedules.

        Returns ``None`` when there are no cron expressions.
        """
        base = self._localized_base(base_datetime)
        candidates = [cron.get_prev(DateTime) for cron in self.croniter_values(base)]
        if not candidates:
            return None
        return pendulum_instance(max(candidates))

    def validate(self) -> None:
        """Check the configuration.

        :raises AirflowTimetableInvalid: if there are no cron expressions, the
            period unit or length is invalid, or building the croniter
            objects fails.
        """
        if not self.cron_defs:
            raise AirflowTimetableInvalid("At least one cron definition must be present")
        if self.period_unit not in self.valid_units:
            raise AirflowTimetableInvalid(f'period_unit must be one of {self.valid_units}')
        if self.period_length < 0:
            raise AirflowTimetableInvalid('period_length must not be less than zero')
        try:
            # Building the iterators exercises both the timezone name and
            # every cron expression.
            self.croniter_values()
        except Exception as e:
            # Keep the original error attached as the cause for debugging.
            raise AirflowTimetableInvalid(str(e)) from e

    @property
    def summary(self) -> str:
        """A short summary for the timetable.
        This is used to display the timetable in the web UI. A cron expression
        timetable, for example, can use this to display the expression.
        """
        return ' || '.join(self.cron_defs) + f' [TZ: {self.timezone}]'

    def serialize(self) -> Dict[str, Any]:
        """Serialize the timetable for JSON encoding.
        This is called during DAG serialization to store timetable information
        in the database. This should return a JSON-serializable dict that will
        be fed into ``deserialize`` when the DAG is deserialized.
        """
        return dict(cron_defs=self.cron_defs,
                    timezone=self.timezone,
                    period_length=self.period_length,
                    period_unit=self.period_unit)

    @classmethod
    def deserialize(cls, data: Dict[str, Any]) -> "MultiCronTimetable":
        """Deserialize a timetable from data.
        This is called when a serialized DAG is deserialized. ``data`` will be
        whatever was returned by ``serialize`` during DAG serialization.
        """
        return cls(**data)
class CustomTimetablePlugin(AirflowPlugin):
    """Airflow plugin that makes ``MultiCronTimetable`` available to DAGs."""

    name = "custom_timetable_plugin"
    # Timetable classes this plugin registers with Airflow.
    timetables = [MultiCronTimetable]
To use it, you provide a list of cron expressions, optionally a timezone string, optionally a period length and period unit.
For my use case I don't actually need the period length + unit, which are used to determine the DAG's data_interval. You can just leave them at their defaults (a period length of 0, with the unit defaulting to hours), if your DAG doesn't care about the data_interval.
I tried to imitate standard schedule_interval behaviour. For example if catchup = False and the DAG could have potentially been triggered several times since the last run (for whatever reason, for example the DAG ran longer than expected, or the scheduler wasn't running, or it's the DAG's very first time being scheduled), then the DAG will be scheduled to run for the latest previous matching time.
I haven't really tested it with catchup = True, but in theory it would run for every matching cron time since the DAG's start_date (but only once per distinct time, for example with */30 * * * * and 0 * * * * the DAG would run twice per hour, not three times).
Example DAG file:
from time import sleep
import airflow
from airflow.operators.python import PythonOperator
import pendulum
from timetable import MultiCronTimetable
def sleepy_op():
    """Block for 660 seconds, which is longer than the 5-minute cron step of the example DAG."""
    sleep(660)
# Example DAG: three cron expressions evaluated in New York time, each run
# covering the 10 minutes before its scheduled time.
with airflow.DAG(
        dag_id='timetable_test',
        start_date=pendulum.datetime(2022, 6, 2, tz=pendulum.timezone('America/New_York')),
        timetable=MultiCronTimetable(
            ['*/5 * * * *', '*/3 * * * fri,sat', '1 12 3 * *'],
            timezone='America/New_York',
            period_length=10,
            period_unit='minutes'),
        catchup=False,
        max_active_runs=1) as dag:
    sleepy = PythonOperator(
        task_id='sleepy',
        python_callable=sleepy_op
    )
What's the difference between the US/Mountain and AZ timezones? Why is it adding an extra 28 min?
>>> strtime = datetime.datetime.strptime('10:00pm', '%I:%M%p')
>>> tz = timezone('US/Mountain').localize(strtime)
>>> print tz
1900-01-01 22:00:00-07:00
>>> tz = timezone(us.states.lookup('AZ').capital_tz).localize(strtime)
>>> print tz
1900-01-01 22:00:00-07:28 <<-----
This is most likely due to the fact that your year is 1900 (see also this question); presumably pytz falls back to a historical local-mean-time offset for dates that early. It works fine if you add a current year:
import datetime
from pytz import timezone
import us
# Parse the wall-clock time together with an explicit year.
naive_time = datetime.datetime.strptime('2020 10:00pm', '%Y %I:%M%p')
# Localize the same naive value in both zones and print each result.
for zone_name in ('US/Mountain', us.states.lookup('AZ').capital_tz):
    print(timezone(zone_name).localize(naive_time))
# 2020-01-01 22:00:00-07:00
# 2020-01-01 22:00:00-07:00
(I'm using Python3 but that shouldn't make a difference, I get the same 28 min offset for year 1900)
When converting time to UTC it's showing one hour less than expected
I am updating a .NET variable via moment to convert the time and show the local system time to the user. But after conversion I am getting one hour less. I tried utcOffset but got the error "utcOffset is not a function". Any suggestions?
Where formData.SubmittedDate = "6/7/2019 5:44:59 AM"
// For every element carrying a data-utcdate attribute, parse the attribute
// with moment, switch the moment to UTC mode and render it into the element.
$('[data-utcdate]').each(function () {
    var $target = $(this);
    //var isDST = d.utc().local().isDST();
    //var d = moment(d).utcOffset(d);
    var utcMoment = moment($target.attr('data-utcdate')).utc();
    $target.html(utcMoment.format('MMM D, YYYY h:mm A'));
})
Getting :Jun 7, 2019 12:14 AM
Expected : Jun 7, 2019 11:44 AM
From the docs:
Get the UTC offset in minutes.
So you could use a manipulation method like add with it:
// For every element carrying a data-utcdate attribute, shift the parsed
// moment by its own UTC offset and render the result into the element.
$('[data-utcdate]').each(function () {
    var $target = $(this);
    var shifted = moment($target.attr('data-utcdate'));
    // utcOffset() with no argument returns the offset in minutes
    shifted.add(shifted.utcOffset(), "m");
    $target.html(shifted.format('MMM D, YYYY h:mm A'));
})
I am confused about the output of the following code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import datetime
import pytz
# Current time as an aware datetime in the Europe/Berlin zone.
local_time = datetime.datetime.now(pytz.timezone('Europe/Berlin'))
# Current time again, taken a moment later, as an aware datetime in UTC.
utc_time = datetime.datetime.now(datetime.timezone.utc)
# local_time was captured first, so this difference is a tiny negative
# timedelta (printed in the "-1 day, 23:59:59.9..." form).
print(local_time - utc_time)
I thought a timezone is simply an offset (depending on factors such as the date and the location with DST and changing definitions over time... so not that simple, but still an offset). So a timezone-aware datetime I thought would simply be:
utc_time == '12:34'
+ timezone is Europe/Berlin in UTC 2018-01-01 at 12:34
=> local time = (utc + local timezone) = 2018-01-01 at 13:34
Then I thought, the difference between two datetime objects should be equal to the difference to the UTC at the same point in time (not considering the thought-construct of timezones).
Hence, if I execute "datetime.now" at (about) the same point in time and "assign" it to different time zones, the difference should be on the order of milliseconds that passed between the two commands.
But it actually is -1 day, 23:59:59.999982.
Found it; it's just a representation issue:
>>> print(local_time)
2018-03-13 14:01:14.973876+01:00
>>> print(utc_time)
2018-03-13 13:01:14.973899+00:00
>>> print(utc_time - local_time)
0:00:00.000023
>>> print(local_time - utc_time)
-1 day, 23:59:59.999977
>>> print((local_time - utc_time).total_seconds())
-2.3e-05
so it is -1 day + 23:59:59.999977
I need to standardise and compare date/time fields that are in different timezones, e.g. how do you find the time difference between the following two times?
"18-05-2012 09:29:41 +0800"
"18-05-2012 09:29:21 +0900"
What's the best way to initialise standard variables with the date/time?
The output needs to display the difference and normalised data in a timezone (eg +0100) that is different to the incoming values and different to the local environment.
Expected Output:
18-05-2012 02:29:41 +0100
18-05-2012 01:29:21 +0100
Difference: 01:00:20
import java.text.SimpleDateFormat
// Parse both timestamps with one shared pattern; the trailing Z reads each
// string's own UTC offset.
def inputFormat = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss Z")
def dates = ["18-05-2012 09:29:41 +0800",
             "18-05-2012 09:29:21 +0900"].collect { stamp -> inputFormat.parse(stamp) }

// Render the millisecond gap as a time of day in UTC so that the formatter
// adds no zone offset of its own.
def dayDiffFormatter = new SimpleDateFormat("HH:mm:ss")
dayDiffFormatter.timeZone = TimeZone.getTimeZone("UTC")

dates.each { println it }
long gapMillis = dates[0].time - dates[1].time
println "Difference " + dayDiffFormatter.format(new Date(gapMillis))
wow. doesn't look readable, does it?
Or, use the JodaTime package
// Fetch Joda-Time via Grape; the annotation attaches to the import below.
@Grab( 'joda-time:joda-time:2.1' )
import org.joda.time.*
import org.joda.time.format.*

String a = "18-05-2012 09:29:41 +0800"
String b = "18-05-2012 09:29:21 +0900"

// The trailing Z in the pattern reads each string's own UTC offset.
DateTimeFormatter dtf = DateTimeFormat.forPattern( "dd-MM-yyyy HH:mm:ss Z" );
def start = dtf.parseDateTime( a )
def end = dtf.parseDateTime( b )

// b is the earlier instant, so it is passed first; only whole hours are compared.
assert 1 == Hours.hoursBetween( end, start ).hours
Solution:
Groovy/Java Date objects are stored as the number of milliseconds after
1970 and so do not contain any timezone information directly
Use Date.parse method to initialise the new date to the specified format
Use SimpleDateFormat class to specify the required output format
Use SimpleDateFormat.setTimeZone to specify the timezone of the output
data
By using the Europe/London timezone rather than GMT it will
automatically adjust for daylight saving time
See here for a full list of the options for date time patterns
-
import java.text.SimpleDateFormat
import java.text.DateFormat
// Parse the inputs; the Z in the pattern reads each string's own UTC offset
String inputPattern = "dd-MM-yyyy HH:mm:ss Z"
Date timeDate1 = Date.parse(inputPattern, "18-05-2012 09:29:41 +0800")
Date timeDate2 = Date.parse(inputPattern, "18-05-2012 09:29:21 +0900")

// Output formatter for the normalised timestamps: UK time (including daylight saving if necessary)
DateFormat yearTimeformatter = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss Z")
yearTimeformatter.timeZone = TimeZone.getTimeZone("Europe/London")

// Output formatter for the difference: kept in UTC so no further zone adjustment is applied.
// Assumes every difference is shorter than a day.
DateFormat dayDifferenceFormatter = new SimpleDateFormat("HH:mm:ss")
dayDifferenceFormatter.timeZone = TimeZone.getTimeZone("UTC")

// The difference is computed on the underlying millisecond values
long msDiff = timeDate1.time - timeDate2.time
Date differenceDate = new Date(msDiff)

[timeDate1, timeDate2].each { stamp -> println yearTimeformatter.format(stamp) }
println "Difference " + dayDifferenceFormatter.format(differenceDate)