Below code creates the dag (the graph is also attached) which contains 2 PythonSensors and a PythonOperator.
First Sensors creates a random integer list as data and a random boolean with 50% chance of success. It logs generated values and returns PokeReturnValue
Second sensor and Python operator both tries to get data from xcom and log them.
Graph of DAG
# region IMPORTS
import random
import logging
from datetime import datetime, timedelta
from airflow import DAG
from heliocampus.configuration.constants import Constants
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import PythonOperator
from airflow.sensors.python import PythonSensor
from airflow.sensors.base import PokeReturnValue
from airflow.utils.trigger_rule import TriggerRule
from box import Box
# endregion
# region configuration
constants = Constants()
dagconfig = Box({ "Code":"Test" })
# endregion
def main() -> DAG:
# region default_args
args = dict()
args['start_date'] = datetime(2021, 1, 1)
# endregion
with DAG(dag_id=dagconfig.Code, schedule_interval="#once", default_args=args, tags=['test', 'V0.1.4']) as dag:
start = EmptyOperator(task_id="start")
# region Sensors
check_all_expired_tables = PythonSensor(
task_id="CHECK_ALL_EXPIRED_TABLES",
poke_interval=timedelta(seconds=20).total_seconds(),
timeout=timedelta(minutes=1).total_seconds(),
mode="reschedule",
python_callable=check_expired_tables,
trigger_rule=TriggerRule.ALL_SUCCESS
)
check_all_expired_tables_notification = PythonOperator(
task_id="CHECK_ALL_EXPIRED_TABLES_NOTIFICATION",
python_callable=sensor_result_nofitication,
op_kwargs={"notification_source":"CHECK_ALL_EXPIRED_TABLES"},
trigger_rule=TriggerRule.ALL_FAILED
)
verify_ods_operator = PythonSensor(
task_id="VERIFY_ODS",
poke_interval=timedelta(seconds=30).total_seconds(),
timeout=timedelta(hours=2).total_seconds(),
mode="reschedule",
python_callable=verify_ods,
op_kwargs={"notification_source":"CHECK_ALL_EXPIRED_TABLES"},
trigger_rule=TriggerRule.ALL_SUCCESS
)
# endregion
end = EmptyOperator(task_id="end")
start >> check_all_expired_tables >> verify_ods_operator >> end
check_all_expired_tables >> check_all_expired_tables_notification
return dag
# region Notifications
def sensor_result_nofitication(ti, notification_source):
actual_xcom_value = ti.xcom_pull(task_ids=[notification_source])
logging.info(f"sensor_result_nofitication : Sensor without key from {notification_source} is {actual_xcom_value}")
actual_xcom_value = ti.xcom_pull(key='return_value', task_ids=[notification_source])
logging.info(f"sensor_result_nofitication : Sensor return_value from {notification_source} is {actual_xcom_value}")
# endregion
def check_expired_tables():
randomlist = random.sample(range(10, 30), 5)
randomResult = random.randint(0, 100) > 50
logging.info(f"check_expired_tables : returning PokeReturnValue(is_done={randomResult}, xcom_value={randomlist})")
return PokeReturnValue(is_done=randomResult, xcom_value=randomlist)
def verify_ods(ti, notification_source):
actual_xcom_value = ti.xcom_pull(task_ids=[notification_source])
logging.info(f"verify_ods : Sensor without key from {notification_source} is {actual_xcom_value}")
actual_xcom_value = ti.xcom_pull(key='return_value', task_ids=[notification_source])
logging.info(f"verify_ods : Sensor return_value from {notification_source} is {actual_xcom_value}")
rnd = random.randint(0, 100)
logging.info("Random Number : {num}".format(num=rnd))
return (rnd > 20)
main()
Regardless of whether the first sensor is successfull or not the data from xcom can not be logged in the second sensor or python operator.
I don't know if the problem is on the pushing side or pulling side.
I can not see any rows inserted in airflow database (xcom table).
The problem lives in the PythonSensor which is coercing the return of the python callable to boolean without checking its type first:
return_value = self.python_callable(*self.op_args, **self.op_kwargs)
return PokeReturnValue(bool(return_value))
To get the expected behavior something like this needs to be added to the PythonSensor:
return return_value if isinstance(return_value, PokeReturnValue) else PokeReturnValue(bool(return_value)
Requirement: Run tasks in parallel dynamically based on the number of offset values which is basically dates
As below it starts from the current date 0 to 4 days back(end_offset_days), so that each task can run in parallel with each date in parallel
start_offset_dayts/ end_offset_days can be dynamic, tomorrow it can be changed to 6 to run past days
I tried as the below date_list gives me a list of dates to be run in parallel, How do I pass it to the next tasks for for looping
with DAG(
dag_id=dag_id,
default_args=default_args,
schedule_interval="0 * * * *",
catchup=False,
dagrun_timeout=timedelta(minutes=180),
max_active_runs=1,
params={},
) as dag:
#task(task_id='datelist')
def datelist(**kwargs):
ti = kwargs['ti']
import datetime
date_list = [(datetime.date.today() - datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in range(0, 4)]
return date_list
for tss in date_list:
jb = PythonOperator(
task_id=jb,
provide_context=True,
python_callable=main_run,
op_kwargs={
"start_offset_days": 0,
"end_offset_days": 4
}
)
jb
return dag
Belwo is xcom values from date_list
Create a job_list and inside the for loop do job_list.append(jb)
Then the line before return dag should simply be: job_list.
Then Airflow will run all those jobs in parallel.
So the last part of your code should look like this:
job_list = []
for tss in date_list:
jb = PythonOperator(
task_id=jb,
provide_context=True,
python_callable=main_run,
op_kwargs={
"start_offset_days": 0,
"end_offset_days": 4
}
)
job_list.append(jb)
job_list
return dag
Instead of running each jb in the loop, appending it to the collection and running the entire collection, will make them all run in parallel.
I would also replace the first part of the DAG. I don't think it has to run as a task. So instead of:
#task(task_id='datelist')
def datelist(**kwargs):
ti = kwargs['ti']
import datetime
date_list = [(datetime.date.today() - datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in range(0, 4)]
return date_list
I would simply do it like this:
import datetime
date_list = [(datetime.date.today() - datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in range(0, 4)]
I want to return 2 or more tasks from a function that should be run in sequence in the spot they're inserted in the dependencies, see below.
t1 = PythonOperator()
def generate_tasks():
t2 = PythonOperator()
t3 = PythonOperator()
return magic(t2, t3) # magic needed here (preferably)
t1 >> generate_tasks() # otherwise here
# desired result: t1 >> t2 >> t3
Is this doable? As I understand it Airflow 2.0 seems to achieve this with a TaskGroup, but we're on Google's Composer, and 2.0 won't be available for a while.
Best workaround I've found:
t1 = PythonOperator()
def generate_tasks():
t2 = PythonOperator()
t3 = PythonOperator()
return [t2, t3]
tasks = generate_tasks()
t1 >> tasks[0] >> tasks[1]
But I'd really like that to be abstracted away, as it more or less defeats the purpose of having multiple operators returned from a single function. We want it to be a single unit as far as the end user knows, even though it can be composed of 2 or more tasks.
How to do it with TaskGroup in Airflow 2.0:
class Encryptor:
def encrypt_and_archive(self):
with TaskGroup("archive_and_encrypt") as section_1:
encrypt = DummyOperator(task_id="encrypt")
archive = BashOperator(task_id="archive", bash_command='echo 1')
encrypt >> archive
return section_1
with DAG(dag_id="example_return_task_group", start_date=days_ago(2), tags=["example"]) as dag:
start = DummyOperator(task_id="start")
encrypt_and_archive = Encryptor().encrypt_and_archive()
end = DummyOperator(task_id='end')
# 👇 single variable, containing two tasks
start >> encrypt_and_archive >> end
Which creates the following graph:
Is something similar remotely doable before 2.0?
You didn't explain what magic(t2, t3) is.
TaskGroup is strictly UI feature it doesn't effect on the DAG logic. According to your description it seems that you are looking for a specific logic (otherwise what is magic?).
I believe this is what you are after:
default_args = {
'owner': 'airflow',
'start_date': datetime(2021, 1, 24),
}
def generate_tasks():
operator_list =[]
for i in range(5): # Replace to generate the logic you wish to dynamically create tasks
op = DummyOperator(task_id=f"t{str(i)}_task", dag=dag)
if i>0:
operator_list[i - 1] >> op
operator_list.append(op)
return operator_list
with DAG(
dag_id='loop',
default_args=default_args,
schedule_interval=None,
) as dag:
start_op = DummyOperator(task_id='start_task')
end_op = DummyOperator(task_id='end_task')
tasks = generate_tasks()
start_op >> tasks[0]
tasks[-1] >> end_op
You can replace the DummyOperator with any operator you'd like.
I need to execute the airflow same task on 12th day of the month and 2 days before the last day of the month.
I was trying with macros and execution_date as well. Not sure how to proceed further. Could you please help on this?
def check_trigger(execution_date, day_offset, **kwargs):
target_date = execution_date - timedelta(days = day_offset)
return target_date
I would approach it like below. And twelfth_or_two_before is a Python function that simply checks the date & returns the task_id of the appropriate downstream task. (That way if the business needs ever change & you need to run the actual tasks on a different/additional day(s), you just modify that function.)
with DAG( ... ) as dag:
right_days = BranchPythonOperator(
task_id="start",
python_callable="twelfth_or_two_before,
)
do_nothing = DummyOperator(task_id="do_nothing")
actual_task = ____Operator( ... ) # This is the Operator that does actual work
start >> [do_nothing, actual_task]
I have 2 datastore models:
class KindA(ndb.Model):
field_a1 = ndb.StringProperty()
field_a2 = ndb.StringProperty()
class KindB(ndb.Model):
field_b1 = ndb.StringProperty()
field_b2 = ndb.StringProperty()
key_to_kind_a = ndb.KeyProperty(KindA)
I want to query KindB and output it to a csv file, but if an entity of KindB points to an entity in KindA I want those fields to be present in the csv as well.
If I was able to use ndb inside of a transform I would setup my pipeline like this
def format(element): # element is an `entity_pb2` object of KindB
try:
obj_a_key_id = element.properties.get('key_to_kind_a', None).key_value.path[0]
except:
obj_a_key_id = None
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<< HOW DO I DO THIS
obj_a = ndb.Key(KindA, obj_a_key_id).get() if obj_a_key_id else None
return ",".join([
element.properties.get('field_b1', None).string_value,
element.properties.get('field_b2', None).string_value,
obj_a.properties.get('field_a1', None).string_value if obj_a else '',
obj_a.properties.get('field_a2', None).string_value if obj_a else '',
]
def build_pipeline(project, start_date, end_date, export_path):
query = query_pb2.Query()
query.kind.add().name = 'KindB'
filter_1 = datastore_helper.set_property_filter(query_pb2.Filter(), 'field_b1', PropertyFilter.GREATER_THAN, start_date)
filter_2 = datastore_helper.set_property_filter(query_pb2.Filter(), 'field_b1', PropertyFilter.LESS_THAN, end_date)
datastore_helper.set_composite_filter(query.filter, CompositeFilter.AND, filter_1, filter_2)
p = beam.Pipeline(options=pipeline_options)
_ = (p
| 'read from datastore' >> ReadFromDatastore(project, query, None)
| 'format' >> beam.Map(format)
| 'write' >> apache_beam.io.WriteToText(
file_path_prefix=export_path,
file_name_suffix='.csv',
header='field_b1,field_b2,field_a1,field_a2',
num_shards=1)
)
return p
I suppose I could use ReadFromDatastore to query all entities of KindA and then use CoGroupByKey to merge them, but KindA has millions of records and that would be very inefficient.
Per the reccommendations in this answer: https://stackoverflow.com/a/49130224/4458510
I created the following utils, which were inspired by the source code of
DatastoreWriteFn in apache_beam.io.gcp.datastore.v1.datastoreio
write_mutations and fetch_entities in apache_beam.io.gcp.datastore.v1.helper
import logging
import time
from socket import error as _socket_error
from apache_beam.metrics import Metrics
from apache_beam.transforms import DoFn, window
from apache_beam.utils import retry
from apache_beam.io.gcp.datastore.v1.adaptive_throttler import AdaptiveThrottler
from apache_beam.io.gcp.datastore.v1.helper import make_partition, retry_on_rpc_error, get_datastore
from apache_beam.io.gcp.datastore.v1.util import MovingSum
from apache_beam.utils.windowed_value import WindowedValue
from google.cloud.proto.datastore.v1 import datastore_pb2, query_pb2
from googledatastore.connection import Datastore, RPCError
_WRITE_BATCH_INITIAL_SIZE = 200
_WRITE_BATCH_MAX_SIZE = 500
_WRITE_BATCH_MIN_SIZE = 10
_WRITE_BATCH_TARGET_LATENCY_MS = 5000
def _fetch_keys(project_id, keys, datastore, throttler, rpc_stats_callback=None, throttle_delay=1):
req = datastore_pb2.LookupRequest()
req.project_id = project_id
for key in keys:
req.keys.add().CopyFrom(key)
#retry.with_exponential_backoff(num_retries=5, retry_filter=retry_on_rpc_error)
def run(request):
# Client-side throttling.
while throttler.throttle_request(time.time() * 1000):
logging.info("Delaying request for %ds due to previous failures", throttle_delay)
time.sleep(throttle_delay)
if rpc_stats_callback:
rpc_stats_callback(throttled_secs=throttle_delay)
try:
start_time = time.time()
response = datastore.lookup(request)
end_time = time.time()
if rpc_stats_callback:
rpc_stats_callback(successes=1)
throttler.successful_request(start_time * 1000)
commit_time_ms = int((end_time - start_time) * 1000)
return response, commit_time_ms
except (RPCError, _socket_error):
if rpc_stats_callback:
rpc_stats_callback(errors=1)
raise
return run(req)
# Copied from _DynamicBatchSizer in apache_beam.io.gcp.datastore.v1.datastoreio
class _DynamicBatchSizer(object):
"""Determines request sizes for future Datastore RPCS."""
def __init__(self):
self._commit_time_per_entity_ms = MovingSum(window_ms=120000, bucket_ms=10000)
def get_batch_size(self, now):
"""Returns the recommended size for datastore RPCs at this time."""
if not self._commit_time_per_entity_ms.has_data(now):
return _WRITE_BATCH_INITIAL_SIZE
recent_mean_latency_ms = (self._commit_time_per_entity_ms.sum(now) / self._commit_time_per_entity_ms.count(now))
return max(_WRITE_BATCH_MIN_SIZE,
min(_WRITE_BATCH_MAX_SIZE,
_WRITE_BATCH_TARGET_LATENCY_MS / max(recent_mean_latency_ms, 1)))
def report_latency(self, now, latency_ms, num_mutations):
"""Reports the latency of an RPC to Datastore.
Args:
now: double, completion time of the RPC as seconds since the epoch.
latency_ms: double, the observed latency in milliseconds for this RPC.
num_mutations: int, number of mutations contained in the RPC.
"""
self._commit_time_per_entity_ms.add(now, latency_ms / num_mutations)
class LookupKeysFn(DoFn):
"""A `DoFn` that looks up keys in the Datastore."""
def __init__(self, project_id, fixed_batch_size=None):
self._project_id = project_id
self._datastore = None
self._fixed_batch_size = fixed_batch_size
self._rpc_successes = Metrics.counter(self.__class__, "datastoreRpcSuccesses")
self._rpc_errors = Metrics.counter(self.__class__, "datastoreRpcErrors")
self._throttled_secs = Metrics.counter(self.__class__, "cumulativeThrottlingSeconds")
self._throttler = AdaptiveThrottler(window_ms=120000, bucket_ms=1000, overload_ratio=1.25)
self._elements = []
self._batch_sizer = None
self._target_batch_size = None
def _update_rpc_stats(self, successes=0, errors=0, throttled_secs=0):
"""Callback function, called by _fetch_keys()"""
self._rpc_successes.inc(successes)
self._rpc_errors.inc(errors)
self._throttled_secs.inc(throttled_secs)
def start_bundle(self):
"""(re)initialize: connection with datastore, _DynamicBatchSizer obj"""
self._elements = []
self._datastore = get_datastore(self._project_id)
if self._fixed_batch_size:
self._target_batch_size = self._fixed_batch_size
else:
self._batch_sizer = _DynamicBatchSizer()
self._target_batch_size = self._batch_sizer.get_batch_size(time.time()*1000)
def process(self, element):
"""Collect elements and process them as a batch"""
self._elements.append(element)
if len(self._elements) >= self._target_batch_size:
return self._flush_batch()
def finish_bundle(self):
"""Flush any remaining elements"""
if self._elements:
objs = self._flush_batch()
for obj in objs:
yield WindowedValue(obj, window.MAX_TIMESTAMP, [window.GlobalWindow()])
def _flush_batch(self):
"""Fetch all of the collected keys from datastore"""
response, latency_ms = _fetch_keys(
project_id=self._project_id,
keys=self._elements,
datastore=self._datastore,
throttler=self._throttler,
rpc_stats_callback=self._update_rpc_stats)
logging.info("Successfully read %d keys in %dms.", len(self._elements), latency_ms)
if not self._fixed_batch_size:
now = time.time()*1000
self._batch_sizer.report_latency(now, latency_ms, len(self._elements))
self._target_batch_size = self._batch_sizer.get_batch_size(now)
self._elements = []
return [entity_result.entity for entity_result in response.found]
class LookupEntityFieldFn(LookupKeysFn):
"""
Looks-up a field on an EntityPb2 object
Expects a EntityPb2 object as input
Outputs a tuple, where the first element is the input object and the second element is the object found during the
lookup
"""
def __init__(self, project_id, field_name, fixed_batch_size=None):
super(LookupEntityFieldFn, self).__init__(project_id=project_id, fixed_batch_size=fixed_batch_size)
self._field_name = field_name
#staticmethod
def _pb2_key_value_to_tuple(kv):
"""Converts a key_value object into a tuple, so that it can be a dictionary key"""
path = []
for p in kv.path:
path.append(p.name)
path.append(p.id)
return tuple(path)
def _flush_batch(self):
_elements = self._elements
keys_to_fetch = []
for element in self._elements:
kv = element.properties.get(self._field_name, None)
if kv and kv.key_value and kv.key_value.path:
keys_to_fetch.append(kv.key_value)
self._elements = keys_to_fetch
read_keys = super(LookupEntityFieldFn, self)._flush_batch()
_by_key = {self._pb2_key_value_to_tuple(entity.key): entity for entity in read_keys}
output_pairs = []
for input_obj in _elements:
kv = input_obj.properties.get(self._field_name, None)
output_obj = None
if kv and kv.key_value and kv.key_value.path:
output_obj = _by_key.get(self._pb2_key_value_to_tuple(kv.key_value), None)
output_pairs.append((input_obj, output_obj))
return output_pairs
The Key to this is the line response = datastore.lookup(request), where:
datastore = get_datastore(project_id) (from apache_beam.io.gcp.datastore.v1.helper.get_datastore)
request is a LookupRequest from google.cloud.proto.datastore.v1.datastore_pb2
response is LookupResponse from google.cloud.proto.datastore.v1.datastore_pb2
The rest of the above code does things like:
using a single connection to the datastore for a dofn bundle
batches keys together before performing a lookup request
throttles interactions with the datastore if requests start to fail
(honestly I don't know how critical these bits are, I just came across them when browsing the apache_beam source code)
The resulting util function LookupEntityFieldFn(project_id, field_name) is a DoFn that takes in an entity_pb2 object as input, extracts and fetches/gets the key_property that resides on the field field_name, and outputs the result as a tuple (the fetch-result is paired with the input object)
My Pipeline code then became
def format(element): # element is a tuple `entity_pb2` objects
kind_b_element, kind_a_element = element
return ",".join([
kind_b_element.properties.get('field_b1', None).string_value,
kind_b_element.properties.get('field_b2', None).string_value,
kind_a_element.properties.get('field_a1', None).string_value if kind_a_element else '',
kind_a_element.properties.get('field_a2', None).string_value if kind_a_element else '',
]
def build_pipeline(project, start_date, end_date, export_path):
query = query_pb2.Query()
query.kind.add().name = 'KindB'
filter_1 = datastore_helper.set_property_filter(query_pb2.Filter(), 'field_b1', PropertyFilter.GREATER_THAN, start_date)
filter_2 = datastore_helper.set_property_filter(query_pb2.Filter(), 'field_b1', PropertyFilter.LESS_THAN, end_date)
datastore_helper.set_composite_filter(query.filter, CompositeFilter.AND, filter_1, filter_2)
p = beam.Pipeline(options=pipeline_options)
_ = (p
| 'read from datastore' >> ReadFromDatastore(project, query, None)
| 'extract field' >> apache_beam.ParDo(LookupEntityFieldFn(project_id=project, field_name='key_to_kind_a'))
| 'format' >> beam.Map(format)
| 'write' >> apache_beam.io.WriteToText(
file_path_prefix=export_path,
file_name_suffix='.csv',
header='field_b1,field_b2,field_a1,field_a2',
num_shards=1)
)
return p