Fetch datastore entity by id inside of a Dataflow transform - google-cloud-datastore

I have 2 datastore models:
class KindA(ndb.Model):
    field_a1 = ndb.StringProperty()
    field_a2 = ndb.StringProperty()

class KindB(ndb.Model):
    field_b1 = ndb.StringProperty()
    field_b2 = ndb.StringProperty()
    key_to_kind_a = ndb.KeyProperty(KindA)
I want to query KindB and write it out to a CSV file, and if an entity of KindB points to an entity of KindA, I want those KindA fields to be present in the CSV as well.
If I were able to use ndb inside a transform, I would set up my pipeline like this:
def format(element):  # element is an `entity_pb2` object of KindB
    try:
        obj_a_key_id = element.properties.get('key_to_kind_a', None).key_value.path[0]
    except:
        obj_a_key_id = None
    # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<< HOW DO I DO THIS
    obj_a = ndb.Key(KindA, obj_a_key_id).get() if obj_a_key_id else None
    return ",".join([
        element.properties.get('field_b1', None).string_value,
        element.properties.get('field_b2', None).string_value,
        obj_a.properties.get('field_a1', None).string_value if obj_a else '',
        obj_a.properties.get('field_a2', None).string_value if obj_a else '',
    ])
def build_pipeline(project, start_date, end_date, export_path):
    query = query_pb2.Query()
    query.kind.add().name = 'KindB'
    filter_1 = datastore_helper.set_property_filter(query_pb2.Filter(), 'field_b1', PropertyFilter.GREATER_THAN, start_date)
    filter_2 = datastore_helper.set_property_filter(query_pb2.Filter(), 'field_b1', PropertyFilter.LESS_THAN, end_date)
    datastore_helper.set_composite_filter(query.filter, CompositeFilter.AND, filter_1, filter_2)

    p = beam.Pipeline(options=pipeline_options)
    _ = (p
         | 'read from datastore' >> ReadFromDatastore(project, query, None)
         | 'format' >> beam.Map(format)
         | 'write' >> apache_beam.io.WriteToText(
             file_path_prefix=export_path,
             file_name_suffix='.csv',
             header='field_b1,field_b2,field_a1,field_a2',
             num_shards=1)
         )
    return p
I suppose I could use ReadFromDatastore to query all entities of KindA and then use CoGroupByKey to merge them, but KindA has millions of records and that would be very inefficient.
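For reference, a sketch of roughly what that rejected join would look like (this is illustration only: kind_a_pcoll and kind_b_pcoll stand in for two ReadFromDatastore outputs, and KindA keys are assumed to use numeric ids):
def key_b_by_a_id(entity_b):
    # key KindB entities by the id of the KindA entity they point to
    kv = entity_b.properties.get('key_to_kind_a', None)
    a_id = kv.key_value.path[0].id if kv else None
    return a_id, entity_b

def key_a_by_id(entity_a):
    return entity_a.key.path[0].id, entity_a

joined = (
    {'b': kind_b_pcoll | 'key b' >> beam.Map(key_b_by_a_id),
     'a': kind_a_pcoll | 'key a' >> beam.Map(key_a_by_id)}
    | 'join' >> beam.CoGroupByKey()
)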

Per the recommendations in this answer: https://stackoverflow.com/a/49130224/4458510
I created the following utils, inspired by the source code of
DatastoreWriteFn in apache_beam.io.gcp.datastore.v1.datastoreio, and of
write_mutations and fetch_entities in apache_beam.io.gcp.datastore.v1.helper:
import logging
import time
from socket import error as _socket_error
from apache_beam.metrics import Metrics
from apache_beam.transforms import DoFn, window
from apache_beam.utils import retry
from apache_beam.io.gcp.datastore.v1.adaptive_throttler import AdaptiveThrottler
from apache_beam.io.gcp.datastore.v1.helper import make_partition, retry_on_rpc_error, get_datastore
from apache_beam.io.gcp.datastore.v1.util import MovingSum
from apache_beam.utils.windowed_value import WindowedValue
from google.cloud.proto.datastore.v1 import datastore_pb2, query_pb2
from googledatastore.connection import Datastore, RPCError
_WRITE_BATCH_INITIAL_SIZE = 200
_WRITE_BATCH_MAX_SIZE = 500
_WRITE_BATCH_MIN_SIZE = 10
_WRITE_BATCH_TARGET_LATENCY_MS = 5000
def _fetch_keys(project_id, keys, datastore, throttler, rpc_stats_callback=None, throttle_delay=1):
    req = datastore_pb2.LookupRequest()
    req.project_id = project_id
    for key in keys:
        req.keys.add().CopyFrom(key)

    @retry.with_exponential_backoff(num_retries=5, retry_filter=retry_on_rpc_error)
    def run(request):
        # Client-side throttling.
        while throttler.throttle_request(time.time() * 1000):
            logging.info("Delaying request for %ds due to previous failures", throttle_delay)
            time.sleep(throttle_delay)
            if rpc_stats_callback:
                rpc_stats_callback(throttled_secs=throttle_delay)
        try:
            start_time = time.time()
            response = datastore.lookup(request)
            end_time = time.time()
            if rpc_stats_callback:
                rpc_stats_callback(successes=1)
            throttler.successful_request(start_time * 1000)
            commit_time_ms = int((end_time - start_time) * 1000)
            return response, commit_time_ms
        except (RPCError, _socket_error):
            if rpc_stats_callback:
                rpc_stats_callback(errors=1)
            raise

    return run(req)
# Copied from _DynamicBatchSizer in apache_beam.io.gcp.datastore.v1.datastoreio
class _DynamicBatchSizer(object):
    """Determines request sizes for future Datastore RPCs."""

    def __init__(self):
        self._commit_time_per_entity_ms = MovingSum(window_ms=120000, bucket_ms=10000)

    def get_batch_size(self, now):
        """Returns the recommended size for datastore RPCs at this time."""
        if not self._commit_time_per_entity_ms.has_data(now):
            return _WRITE_BATCH_INITIAL_SIZE
        recent_mean_latency_ms = (self._commit_time_per_entity_ms.sum(now) /
                                  self._commit_time_per_entity_ms.count(now))
        return max(_WRITE_BATCH_MIN_SIZE,
                   min(_WRITE_BATCH_MAX_SIZE,
                       _WRITE_BATCH_TARGET_LATENCY_MS / max(recent_mean_latency_ms, 1)))

    def report_latency(self, now, latency_ms, num_mutations):
        """Reports the latency of an RPC to Datastore.

        Args:
          now: double, completion time of the RPC as seconds since the epoch.
          latency_ms: double, the observed latency in milliseconds for this RPC.
          num_mutations: int, number of mutations contained in the RPC.
        """
        self._commit_time_per_entity_ms.add(now, latency_ms / num_mutations)
class LookupKeysFn(DoFn):
    """A `DoFn` that looks up keys in the Datastore."""

    def __init__(self, project_id, fixed_batch_size=None):
        self._project_id = project_id
        self._datastore = None
        self._fixed_batch_size = fixed_batch_size
        self._rpc_successes = Metrics.counter(self.__class__, "datastoreRpcSuccesses")
        self._rpc_errors = Metrics.counter(self.__class__, "datastoreRpcErrors")
        self._throttled_secs = Metrics.counter(self.__class__, "cumulativeThrottlingSeconds")
        self._throttler = AdaptiveThrottler(window_ms=120000, bucket_ms=1000, overload_ratio=1.25)
        self._elements = []
        self._batch_sizer = None
        self._target_batch_size = None

    def _update_rpc_stats(self, successes=0, errors=0, throttled_secs=0):
        """Callback function, called by _fetch_keys()"""
        self._rpc_successes.inc(successes)
        self._rpc_errors.inc(errors)
        self._throttled_secs.inc(throttled_secs)

    def start_bundle(self):
        """(Re)initialize: connection with datastore, _DynamicBatchSizer obj"""
        self._elements = []
        self._datastore = get_datastore(self._project_id)
        if self._fixed_batch_size:
            self._target_batch_size = self._fixed_batch_size
        else:
            self._batch_sizer = _DynamicBatchSizer()
            self._target_batch_size = self._batch_sizer.get_batch_size(time.time() * 1000)

    def process(self, element):
        """Collect elements and process them as a batch"""
        self._elements.append(element)
        if len(self._elements) >= self._target_batch_size:
            return self._flush_batch()

    def finish_bundle(self):
        """Flush any remaining elements"""
        if self._elements:
            objs = self._flush_batch()
            for obj in objs:
                yield WindowedValue(obj, window.MAX_TIMESTAMP, [window.GlobalWindow()])

    def _flush_batch(self):
        """Fetch all of the collected keys from Datastore"""
        response, latency_ms = _fetch_keys(
            project_id=self._project_id,
            keys=self._elements,
            datastore=self._datastore,
            throttler=self._throttler,
            rpc_stats_callback=self._update_rpc_stats)
        logging.info("Successfully read %d keys in %dms.", len(self._elements), latency_ms)
        if not self._fixed_batch_size:
            now = time.time() * 1000
            self._batch_sizer.report_latency(now, latency_ms, len(self._elements))
            self._target_batch_size = self._batch_sizer.get_batch_size(now)
        self._elements = []
        return [entity_result.entity for entity_result in response.found]
class LookupEntityFieldFn(LookupKeysFn):
    """Looks up a key-valued field on an entity_pb2 object.

    Expects an entity_pb2 object as input.
    Outputs a tuple, where the first element is the input object and the
    second element is the object found during the lookup.
    """

    def __init__(self, project_id, field_name, fixed_batch_size=None):
        super(LookupEntityFieldFn, self).__init__(project_id=project_id, fixed_batch_size=fixed_batch_size)
        self._field_name = field_name

    @staticmethod
    def _pb2_key_value_to_tuple(kv):
        """Converts a key_value object into a tuple, so that it can be a dictionary key"""
        path = []
        for p in kv.path:
            path.append(p.name)
            path.append(p.id)
        return tuple(path)

    def _flush_batch(self):
        _elements = self._elements
        keys_to_fetch = []
        for element in self._elements:
            kv = element.properties.get(self._field_name, None)
            if kv and kv.key_value and kv.key_value.path:
                keys_to_fetch.append(kv.key_value)
        self._elements = keys_to_fetch
        read_keys = super(LookupEntityFieldFn, self)._flush_batch()

        _by_key = {self._pb2_key_value_to_tuple(entity.key): entity for entity in read_keys}
        output_pairs = []
        for input_obj in _elements:
            kv = input_obj.properties.get(self._field_name, None)
            output_obj = None
            if kv and kv.key_value and kv.key_value.path:
                output_obj = _by_key.get(self._pb2_key_value_to_tuple(kv.key_value), None)
            output_pairs.append((input_obj, output_obj))
        return output_pairs
The key to this is the line response = datastore.lookup(request), where:
datastore = get_datastore(project_id) (from apache_beam.io.gcp.datastore.v1.helper.get_datastore)
request is a LookupRequest from google.cloud.proto.datastore.v1.datastore_pb2
response is a LookupResponse from google.cloud.proto.datastore.v1.datastore_pb2
The rest of the code above does things like:
use a single connection to the Datastore for a DoFn bundle
batch keys together before performing a lookup request
throttle interactions with the Datastore if requests start to fail
(Honestly I don't know how critical these bits are; I just came across them when browsing the apache_beam source code.) A minimal sketch of the bare lookup call, stripped of the batching and throttling, follows below.
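Treat this as illustration only; it reuses names that already appear in the utils above (get_datastore, LookupRequest, datastore.lookup) and performs a single unbatched, unthrottled lookup:
from apache_beam.io.gcp.datastore.v1.helper import get_datastore
from google.cloud.proto.datastore.v1 import datastore_pb2

def lookup_keys_once(project_id, key_pbs):
    """One-shot lookup: no batching, retries or throttling."""
    datastore = get_datastore(project_id)
    request = datastore_pb2.LookupRequest()
    request.project_id = project_id
    for key_pb in key_pbs:
        request.keys.add().CopyFrom(key_pb)
    response = datastore.lookup(request)  # a LookupResponse
    return [result.entity for result in response.found]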
The resulting utility, LookupEntityFieldFn(project_id, field_name), is a DoFn that takes an entity_pb2 object as input, extracts and fetches the key property stored in the field field_name, and outputs the result as a tuple (the fetched entity is paired with the input object).
My pipeline code then became:
def format(element):  # element is a tuple of `entity_pb2` objects
    kind_b_element, kind_a_element = element
    return ",".join([
        kind_b_element.properties.get('field_b1', None).string_value,
        kind_b_element.properties.get('field_b2', None).string_value,
        kind_a_element.properties.get('field_a1', None).string_value if kind_a_element else '',
        kind_a_element.properties.get('field_a2', None).string_value if kind_a_element else '',
    ])
def build_pipeline(project, start_date, end_date, export_path):
    query = query_pb2.Query()
    query.kind.add().name = 'KindB'
    filter_1 = datastore_helper.set_property_filter(query_pb2.Filter(), 'field_b1', PropertyFilter.GREATER_THAN, start_date)
    filter_2 = datastore_helper.set_property_filter(query_pb2.Filter(), 'field_b1', PropertyFilter.LESS_THAN, end_date)
    datastore_helper.set_composite_filter(query.filter, CompositeFilter.AND, filter_1, filter_2)

    p = beam.Pipeline(options=pipeline_options)
    _ = (p
         | 'read from datastore' >> ReadFromDatastore(project, query, None)
         | 'extract field' >> apache_beam.ParDo(LookupEntityFieldFn(project_id=project, field_name='key_to_kind_a'))
         | 'format' >> beam.Map(format)
         | 'write' >> apache_beam.io.WriteToText(
             file_path_prefix=export_path,
             file_name_suffix='.csv',
             header='field_b1,field_b2,field_a1,field_a2',
             num_shards=1)
         )
    return p
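As a usage sketch (the project id and GCS path are placeholders, and pipeline_options is assumed to be defined at module level as in the snippets above), the export can then be kicked off with:
# build the pipeline and block until the Dataflow job finishes
pipeline = build_pipeline('my-gcp-project', start_date, end_date, 'gs://my-bucket/kind_b_export')
result = pipeline.run()
result.wait_until_finish()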

Related

How can I use Google Cloud Functions to run a web scraper?

I'm currently running a web scraper (this is the first time I've ever done something like this). It pulls addresses from the URL and then matches an address against the user's input. This will be going into a chat bot, and I'm wondering how I can make this run on Google Cloud Functions. What's the process to do this, and is there a tutorial anywhere? Thanks in advance for your help.
This is my code so far. There is a small items file too:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import DataItem
from fuzzywuzzy import fuzz
from urllib.parse import urljoin
import scrapy
class AddressesSpider(scrapy.Spider):
name = 'Addresses'
allowed_domains = ['find-energy-certificate.service.gov.uk']
postcode = "bh10+4ah"
start_urls = ['https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode=' + postcode]
## def start_requests(self):
## self.first = input("Please enter the address you would like to match: ")
## yield scrapy.Request(url=self.start_urls[0], callback=self.parse)
def parse(self, response):
first = input("Please enter the address you would like to match: ")
highest_ratios = []
highest_item = None
for row in response.xpath('//table[@class="govuk-table"]//tr'):
address = row.xpath("normalize-space(.//a[@class='govuk-link']/text())").extract()[0].lower()
address = address.rsplit(',', 2)[0]
link = row.xpath('.//a[@class="govuk-link"]/@href').extract()
details = row.xpath("normalize-space(.//td/following-sibling::td)").extract()
ratio = fuzz.token_set_ratio(address, first)
item = DataItem()
item['link'] = link
item['details'] = details
item['address'] = address
item['ratioresult'] = ratio
if len(highest_ratios) < 3:
highest_ratios.append(item)
elif ratio > min(highest_ratios, key=lambda x: x['ratioresult'])['ratioresult']:
highest_ratios.remove(min(highest_ratios, key=lambda x: x['ratioresult']))
highest_ratios.append(item)
highest_ratios_100 = [item for item in highest_ratios if item['ratioresult'] == 100]
if highest_ratios_100:
for item in highest_ratios_100:
yield item
else:
yield max(highest_ratios, key=lambda x: x['ratioresult'])
if len(highest_ratios_100) > 1:
for i, item in enumerate(highest_ratios_100):
print(f"{i+1}: {item['address']}")
selected = int(input("Please select the correct address by entering the number corresponding to the address: ")) - 1
selected_item = highest_ratios_100[selected]
else:
selected_item = highest_ratios_100[0] if highest_ratios_100 else max(highest_ratios, key=lambda x: x['ratioresult'])
new_url = selected_item['link'][0]
new_url = str(new_url)
if new_url:
base_url = 'https://find-energy-certificate.service.gov.uk'
print(f'Base URL: {base_url}')
print(f'New URL: {new_url}')
new_url = urljoin(base_url, new_url)
print(f'Combined URL: {new_url}')
yield scrapy.Request(new_url, callback=self.parse_new_page)
def parse_new_page(self, response):
Postcode = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()])').extract()
Town = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()-1])').extract()
First = response.xpath(".//p[@class='epc-address govuk-body']").extract()
Type = response.xpath('normalize-space(//dd[1]/text())').extract_first()
Walls = response.xpath("//th[contains(text(), 'Wall')]/following-sibling::td[1]/text()").extract()
Roof = response.xpath("//th[contains(text(), 'Roof')]/following-sibling::td[1]/text()").extract()
Heating = response.xpath("//th[text()='Main heating']/following-sibling::td[1]/text()").extract_first()
CurrentScore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[1]/text[1]/text()').re_first("[0-9+]{1,2}")
Maxscore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[2]/text[1]/text()').re_first("[0-9+]{2}")
Expiry = response.xpath('normalize-space(//b)').extract_first()
FloorArea = response.xpath('//dt[contains(text(), "floor area")]/following-sibling::dd/text()').re_first("[0-9+]{2,3}")
Steps = response.xpath("//h3[contains(text(),'Step')]/text()").extract()
yield {
'Postcode': Postcode,
'Town': Town,
'First': First,
'Type': Type,
'Walls': Walls,
'Roof': Roof,
'Heating': Heating,
'CurrentScore': CurrentScore,
'Maxscore': Maxscore,
'Expiry': Expiry,
'FloorArea': FloorArea,
'Steps': Steps
}
I've tried googling and having a look around, but I can't work out how to deploy this as a project that runs on Google Cloud Functions. Or can I just copy the code into the console somewhere?
You can try running your spider from a script. However, a better solution would be to wrap scrapy in its own child process.
For example:
from multiprocessing import Process, Queue
from ... import MySpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def my_cloud_function(event, context):
    def script(queue):
        try:
            settings = get_project_settings()
            settings.setdict({
                'LOG_LEVEL': 'ERROR',
                'LOG_ENABLED': True,
            })
            process = CrawlerProcess(settings)
            process.crawl(MySpider)
            process.start()
            queue.put(None)
        except Exception as e:
            queue.put(e)

    queue = Queue()
    # wrap the spider in a child process
    main_process = Process(target=script, args=(queue,))
    main_process.start()  # start the process
    main_process.join()   # block until the spider finishes
    result = queue.get()  # check the process did not return an error
    if result is not None:
        raise result
    return 'ok'
You can refer to this tutorial for more info.

Sensor return value cannot be stored/retrieved using PokeReturnValue

The code below creates the DAG (the graph is also attached), which contains two PythonSensors and a PythonOperator.
The first sensor creates a random integer list as data and a random boolean with a 50% chance of success. It logs the generated values and returns a PokeReturnValue.
The second sensor and the Python operator both try to get the data from XCom and log it.
Graph of DAG
# region IMPORTS
import random
import logging
from datetime import datetime, timedelta
from airflow import DAG
from heliocampus.configuration.constants import Constants
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import PythonOperator
from airflow.sensors.python import PythonSensor
from airflow.sensors.base import PokeReturnValue
from airflow.utils.trigger_rule import TriggerRule
from box import Box
# endregion
# region configuration
constants = Constants()
dagconfig = Box({ "Code":"Test" })
# endregion
def main() -> DAG:
# region default_args
args = dict()
args['start_date'] = datetime(2021, 1, 1)
# endregion
with DAG(dag_id=dagconfig.Code, schedule_interval="@once", default_args=args, tags=['test', 'V0.1.4']) as dag:
start = EmptyOperator(task_id="start")
# region Sensors
check_all_expired_tables = PythonSensor(
task_id="CHECK_ALL_EXPIRED_TABLES",
poke_interval=timedelta(seconds=20).total_seconds(),
timeout=timedelta(minutes=1).total_seconds(),
mode="reschedule",
python_callable=check_expired_tables,
trigger_rule=TriggerRule.ALL_SUCCESS
)
check_all_expired_tables_notification = PythonOperator(
task_id="CHECK_ALL_EXPIRED_TABLES_NOTIFICATION",
python_callable=sensor_result_nofitication,
op_kwargs={"notification_source":"CHECK_ALL_EXPIRED_TABLES"},
trigger_rule=TriggerRule.ALL_FAILED
)
verify_ods_operator = PythonSensor(
task_id="VERIFY_ODS",
poke_interval=timedelta(seconds=30).total_seconds(),
timeout=timedelta(hours=2).total_seconds(),
mode="reschedule",
python_callable=verify_ods,
op_kwargs={"notification_source":"CHECK_ALL_EXPIRED_TABLES"},
trigger_rule=TriggerRule.ALL_SUCCESS
)
# endregion
end = EmptyOperator(task_id="end")
start >> check_all_expired_tables >> verify_ods_operator >> end
check_all_expired_tables >> check_all_expired_tables_notification
return dag
# region Notifications
def sensor_result_nofitication(ti, notification_source):
actual_xcom_value = ti.xcom_pull(task_ids=[notification_source])
logging.info(f"sensor_result_nofitication : Sensor without key from {notification_source} is {actual_xcom_value}")
actual_xcom_value = ti.xcom_pull(key='return_value', task_ids=[notification_source])
logging.info(f"sensor_result_nofitication : Sensor return_value from {notification_source} is {actual_xcom_value}")
# endregion
def check_expired_tables():
randomlist = random.sample(range(10, 30), 5)
randomResult = random.randint(0, 100) > 50
logging.info(f"check_expired_tables : returning PokeReturnValue(is_done={randomResult}, xcom_value={randomlist})")
return PokeReturnValue(is_done=randomResult, xcom_value=randomlist)
def verify_ods(ti, notification_source):
actual_xcom_value = ti.xcom_pull(task_ids=[notification_source])
logging.info(f"verify_ods : Sensor without key from {notification_source} is {actual_xcom_value}")
actual_xcom_value = ti.xcom_pull(key='return_value', task_ids=[notification_source])
logging.info(f"verify_ods : Sensor return_value from {notification_source} is {actual_xcom_value}")
rnd = random.randint(0, 100)
logging.info("Random Number : {num}".format(num=rnd))
return (rnd > 20)
main()
Regardless of whether the first sensor is successful or not, the data from XCom cannot be logged in the second sensor or the Python operator.
I don't know if the problem is on the pushing side or the pulling side.
I cannot see any rows inserted in the Airflow database (xcom table).
The problem lives in the PythonSensor, which coerces the return value of the python callable to a boolean without checking its type first:
return_value = self.python_callable(*self.op_args, **self.op_kwargs)
return PokeReturnValue(bool(return_value))
To get the expected behavior, something like this needs to be added to the PythonSensor:
return return_value if isinstance(return_value, PokeReturnValue) else PokeReturnValue(bool(return_value))
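Until such a change lands in the Airflow version you run, one possible workaround (a sketch only, not part of the original answer) is to subclass PythonSensor and pass a PokeReturnValue through unchanged. Note the stock poke() also merges the Airflow context into op_kwargs, which this sketch deliberately omits:
from airflow.sensors.python import PythonSensor
from airflow.sensors.base import PokeReturnValue

class PassthroughPythonSensor(PythonSensor):
    """PythonSensor that does not coerce a PokeReturnValue to bool."""

    def poke(self, context):
        return_value = self.python_callable(*self.op_args, **self.op_kwargs)
        if isinstance(return_value, PokeReturnValue):
            return return_value  # keep is_done and xcom_value intact
        return PokeReturnValue(bool(return_value))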

Airflow timetable that combines multiple cron expressions?

I have several cron expressions that I need to apply to a single DAG. There is no way to express them with one single cron expression.
Airflow 2.2 introduced Timetable. Is there an implementation that takes a list of cron expressions?
I was looking for the same thing, but didn't find anything. It would be nice if a standard one came with Airflow.
Here's a 0.1 version that I wrote for Airflow 2.2.5.
# This file is <airflow plugins directory>/timetable.py
from typing import Any, Dict, List, Optional
import pendulum
from croniter import croniter
from pendulum import DateTime, Duration, timezone, instance as pendulum_instance
from airflow.plugins_manager import AirflowPlugin
from airflow.timetables.base import DagRunInfo, DataInterval, TimeRestriction, Timetable
from airflow.exceptions import AirflowTimetableInvalid
class MultiCronTimetable(Timetable):
valid_units = ['minutes', 'hours', 'days']
def __init__(self,
cron_defs: List[str],
timezone: str = 'Europe/Berlin',
period_length: int = 0,
period_unit: str = 'hours'):
self.cron_defs = cron_defs
self.timezone = timezone
self.period_length = period_length
self.period_unit = period_unit
def infer_manual_data_interval(self, run_after: DateTime) -> DataInterval:
"""
Determines date interval for manually triggered runs.
This is simply (now - period) to now.
"""
end = run_after
if self.period_length == 0:
start = end
else:
start = self.data_period_start(end)
return DataInterval(start=start, end=end)
def next_dagrun_info(
self,
*,
last_automated_data_interval: Optional[DataInterval],
restriction: TimeRestriction) -> Optional[DagRunInfo]:
"""
Determines when the DAG should be scheduled.
"""
if restriction.earliest is None:
# No start_date. Don't schedule.
return None
is_first_run = last_automated_data_interval is None
if is_first_run:
if restriction.catchup:
scheduled_time = self.next_scheduled_run_time(restriction.earliest)
else:
scheduled_time = self.previous_scheduled_run_time()
if scheduled_time is None:
# No previous cron time matched. Find one in the future.
scheduled_time = self.next_scheduled_run_time()
else:
last_scheduled_time = last_automated_data_interval.end
if restriction.catchup:
scheduled_time = self.next_scheduled_run_time(last_scheduled_time)
else:
scheduled_time = self.previous_scheduled_run_time()
if scheduled_time is None or scheduled_time == last_scheduled_time:
# No previous cron time matched,
# or the matched cron time was the last execution time,
scheduled_time = self.next_scheduled_run_time()
elif scheduled_time > last_scheduled_time:
# Matched cron time was after last execution time, but before now.
# Use this cron time
pass
else:
# The last execution time is after the most recent matching cron time.
# Next scheduled run will be in the future
scheduled_time = self.next_scheduled_run_time()
if scheduled_time is None:
return None
if restriction.latest is not None and scheduled_time > restriction.latest:
# Over the DAG's scheduled end; don't schedule.
return None
start = self.data_period_start(scheduled_time)
return DagRunInfo(run_after=scheduled_time, data_interval=DataInterval(start=start, end=scheduled_time))
def data_period_start(self, period_end: DateTime):
return period_end - Duration(**{self.period_unit: self.period_length})
def croniter_values(self, base_datetime=None):
if not base_datetime:
tz = timezone(self.timezone)
base_datetime = pendulum.now(tz)
return [croniter(expr, base_datetime) for expr in self.cron_defs]
def next_scheduled_run_time(self, base_datetime: DateTime = None):
min_date = None
tz = timezone(self.timezone)
if base_datetime:
base_datetime_localized = base_datetime.in_timezone(tz)
else:
base_datetime_localized = pendulum.now(tz)
for cron in self.croniter_values(base_datetime_localized):
next_date = cron.get_next(DateTime)
if not min_date:
min_date = next_date
else:
min_date = min(min_date, next_date)
if min_date is None:
return None
return pendulum_instance(min_date)
def previous_scheduled_run_time(self, base_datetime: DateTime = None):
"""
Get the most recent time in the past that matches one of the cron schedules
"""
max_date = None
tz = timezone(self.timezone)
if base_datetime:
base_datetime_localized = base_datetime.in_timezone(tz)
else:
base_datetime_localized = pendulum.now(tz)
for cron in self.croniter_values(base_datetime_localized):
prev_date = cron.get_prev(DateTime)
if not max_date:
max_date = prev_date
else:
max_date = max(max_date, prev_date)
if max_date is None:
return None
return pendulum_instance(max_date)
def validate(self) -> None:
if not self.cron_defs:
raise AirflowTimetableInvalid("At least one cron definition must be present")
if self.period_unit not in self.valid_units:
raise AirflowTimetableInvalid(f'period_unit must be one of {self.valid_units}')
if self.period_length < 0:
raise AirflowTimetableInvalid(f'period_length must not be less than zero')
try:
self.croniter_values()
except Exception as e:
raise AirflowTimetableInvalid(str(e))
@property
def summary(self) -> str:
"""A short summary for the timetable.
This is used to display the timetable in the web UI. A cron expression
timetable, for example, can use this to display the expression.
"""
return ' || '.join(self.cron_defs) + f' [TZ: {self.timezone}]'
def serialize(self) -> Dict[str, Any]:
"""Serialize the timetable for JSON encoding.
This is called during DAG serialization to store timetable information
in the database. This should return a JSON-serializable dict that will
be fed into ``deserialize`` when the DAG is deserialized.
"""
return dict(cron_defs=self.cron_defs,
timezone=self.timezone,
period_length=self.period_length,
period_unit=self.period_unit)
@classmethod
def deserialize(cls, data: Dict[str, Any]) -> "MultiCronTimetable":
"""Deserialize a timetable from data.
This is called when a serialized DAG is deserialized. ``data`` will be
whatever was returned by ``serialize`` during DAG serialization.
"""
return cls(**data)
class CustomTimetablePlugin(AirflowPlugin):
name = "custom_timetable_plugin"
timetables = [MultiCronTimetable]
To use it, you provide a list of cron expressions, optionally a timezone string, optionally a period length and period unit.
For my use case I don't actually need the period length + unit, which are used to determine the DAG's data_interval. You can just leave them at the default value of 0 minutes, if your DAG doesn't care about the data_interval.
I tried to imitate standard schedule_interval behaviour. For example if catchup = False and the DAG could have potentially been triggered several times since the last run (for whatever reason, for example the DAG ran longer than expected, or the scheduler wasn't running, or it's the DAG's very first time being scheduled), then the DAG will be scheduled to run for the latest previous matching time.
I haven't really tested it with catchup = True, but in theory it would run for every matching cron time since the DAG's start_date (but only once per distinct time, for example with */30 * * * * and 0 * * * * the DAG would run twice per hour, not three times).
Example DAG file:
from time import sleep

import airflow
from airflow.operators.python import PythonOperator
import pendulum

from timetable import MultiCronTimetable

def sleepy_op():
    sleep(660)

with airflow.DAG(
        dag_id='timetable_test',
        start_date=pendulum.datetime(2022, 6, 2, tz=pendulum.timezone('America/New_York')),
        timetable=MultiCronTimetable(['*/5 * * * *', '*/3 * * * fri,sat', '1 12 3 * *'],
                                     timezone='America/New_York',
                                     period_length=10, period_unit='minutes'),
        catchup=False,
        max_active_runs=1) as dag:

    sleepy = PythonOperator(
        task_id='sleepy',
        python_callable=sleepy_op
    )

tkinter callback throws an error: "Index 0 out of range"

I have searched for hours for the reason behind this error message.
I have a search entry, which updates a listbox depending on my search, via a callback function:
Listbox:
self.name_search=tk.StringVar()
self.name_search.trace_add('write', self.my_callback)
self.e_name_search_text = tk.Label(search_f, text="Name: ").grid(row=0, column=0, padx=10, pady=5, sticky='E')
self.e_name_search = ttk.Entry(search_f, width = 35, textvariable=self.name_search)
self.e_name_search.grid(row=0, column=1, padx=5, pady=5, sticky='W')
self.lbox = tk.Listbox(search_f, width=35, height=8)
self.lbox.bind("<Double-Button-1>", self.show_name_search)
self.lbox.bind('<Return>', self.show_name_search)
self.scrollbar = tk.Scrollbar(search_f)
self.lbox.grid(row=1, column=1, rowspan=3, padx=10, pady=1)
self.lbox.config(yscrollcommand = self.scrollbar.set)
self.scrollbar.grid(row=1, column=2, rowspan=3, padx=1, pady=1, sticky='ns')
self.scrollbar.config(command=self.lbox.yview)
So if I type my search, the listbox shows me a reduced list of values out of my sqlite database that I am interested in. If I select one with a double click, another sqlite query updates my comboboxes.
If I select one I get this error:
Exception in Tkinter callback
Traceback (most recent call last):
File "C:\Python38-32\lib\tkinter\__init__.py", line 1883, in __call__
return self.func(*args)
File "D:\... name.py", line 337, in show_name_search
self.e_fax.current(0)
File "C:\Python38-32\lib\tkinter\ttk.py", line 717, in current
return self.tk.call(self._w, "current", newindex)
_tkinter.TclError: Index 0 out of range
Line 337 comes from another function:
def show_name_search(self, event):
self.clear_field()
widget = event.widget
selection = widget.curselection()
indName = widget.get(selection[0])
print(indName)
print("selktierter Wert: {}".format(indName))
self.realName.set(indName)
connection = sqlite3.connect(select_connect_db)
print('Database connected.')
with connection:
cursor = connection.cursor()
cursor.execute("SELECT number, type, prio, id, uniqueid FROM numbers WHERE realName=?;",(indName,))
data = cursor.fetchall()
print(data)
for row in data:
if row[1] == 'home':
self.phone_home.append(row[0])
print('HOME:',self.phone_home)
if row[1] == 'mobile':
self.mobile.append(row[0])
print('Mobile:',self.mobile)
if row[1] == 'work':
self.business.append(row[0])
print(row[0])
print('WORK:',self.business)
if row[1] == 'fax_work':
self.fax.append(row[0])
print(row[0])
print('FAX_WORK:',self.fax)
self.uid_name.set(row[4])
if len(self.phone_home) != 0:
self.e_phone['values'] = self.phone_home
self.e_phone.current(0)
if len(self.mobile) != 0:
self.e_mobile['values'] = self.mobile
self.e_mobile.current(0)
if len(self.business) != 0:
self.e_business['values'] = self.business # Set the value to the new list
self.e_business.current(0) # Set the first item of the list as current item
if len(self.business) != 0:
self.e_fax['values'] = self.fax
self.e_fax.current(0) ### Line 337 - No entry for this value in my sqlite database
Any idea what I can search for?
So self.e_fax seems like a ttk.Combobox to me. Consider this code here:
import tkinter as tk
from tkinter import ttk
root = tk.Tk()
values = []
lb = ttk.Combobox(root,values=values)
lb.current(0)
lb.pack()
root.mainloop()
It throws the same error:
_tkinter.TclError: Index 0 out of range
The reason is that the list values is empty; insert any regular string into it and it works.
Make sure, if you want to set a default value, that there actually is a value:
import tkinter as tk
from tkinter import ttk
root = tk.Tk()
values = ['see']
lb = ttk.Combobox(root,values=values)
lb.current(0)
lb.pack()
root.mainloop()
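Applied to the code in the question, a hedged sketch of the same guard (and note that the original checks len(self.business) before configuring self.e_fax, which looks like a copy-paste slip and would leave the fax combobox with an empty values list):
import tkinter as tk
from tkinter import ttk

root = tk.Tk()
fax = []  # stand-in for self.fax; empty when the query returned no fax_work rows
e_fax = ttk.Combobox(root, values=fax)
if fax:  # guard: only select a default when there is something to select
    e_fax.current(0)
e_fax.pack()
root.mainloop()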

How to use the unittest module for a python script which calls multiple functions and uses global variables

This is one of the functions in my python script for which I am trying to write a unit test case. Since it uses global variables, and audit and BigQuery functions that are written in separate utility scripts, I do not understand how to write @patch decorators and execute unit test cases for it.
How will I patch the global variables?
How do I patch functions which don't have any return value, e.g. audit_event_source_table? Can we ignore such functions during unit testing, and if so, how?
How do I do assertions when I do not have any return value but do have logger.info messages?
import logging
from datetime import datetime
from pathlib import Path
import sys
import __main__
from intient_research_rdm_common.utils.audit_utils import audit_event_source_table, audit_event_job_table, \
get_job_id, get_source_object_id
from intient_research_rdm_kg_core.common_utils.utils.bigquery_utils import bigquery_data_read
from intient_research_rdm_kg_core.common_utils.utils.conf_read import read_args, read_source_config, read_env_config
global project_id, service_account, conn_ip, debug, node_table_list, edge_table_list, source_name
def edge_validation():
global edge_table_list
global source_name
edge_table_na = []
edge_table_list_rowcount_zero = []
dataset_e = "prep_e_" + source_name
row_count = 0
edge_table = ""
source_object_start_timestamp = datetime.now()
source_object_id = get_source_object_id(source_name, source_object_start_timestamp)
source_object_type = AUDIT_SOURCE_OBJECT_TYPE_BIGQUERY
job_id = get_job_id(source_object_start_timestamp)
source_object_name = dataset_e
try:
for edge_table in edge_table_list:
sql_query = " SELECT * FROM " + "`" + project_id + "." + dataset_e + ".__TABLES__` WHERE table_id =" + "'" + edge_table + "'"
data_read, col_names = bigquery_data_read(service_account, sql_query, project_id)
for ind in data_read.index:
row_count = (data_read['row_count'][ind])
if len(data_read.index) == 0:
edge_table_na.append(edge_table)
elif row_count == 0:
edge_table_list_rowcount_zero.append(edge_table)
if len(edge_table_na) > 0:
logging.info("Missing Edge tables in preprocessing layer {} ".format(edge_table_na))
if len(edge_table_list_rowcount_zero) > 0:
logging.info("Edge tables with row count as zero in Pre-processing layer {} ".format(edge_table_list_rowcount_zero))
if len(edge_table_na) == 0 and len(edge_table_list_rowcount_zero) == 0:
logging.info(
"Edge list validation for the source {} has been successfully completed with no discrepancies".format(
source_name))
audit_event_source_table(source_object_id, source_object_name, source_object_type, source_name,
source_object_name,
job_id, AUDIT_JOB_STATUS_PASS, source_object_start_timestamp,
datetime.now(), 'NA', 'NA', project_id)
if len(edge_table_na) > 0 or len(edge_table_list_rowcount_zero) > 0:
audit_event_source_table(source_object_id, source_object_name, source_object_type, source_name,
source_object_name,
job_id, AUDIT_JOB_STATUS_PASS, source_object_start_timestamp,
datetime.now(), 'NA', 'NA', project_id)
sys.exit(1)
except Exception as e:
msg = '{} : Issue with the edge validation for {} is: \n{}\n'.format(datetime.now(), edge_table, e)
logging.error(msg)
audit_event_source_table(source_object_id, source_object_name, source_object_type, source_name,
source_object_name,
job_id, AUDIT_JOB_STATUS_FAIL, source_object_start_timestamp,
datetime.now(), AUDIT_ERROR_TYPE_PREPROCESSING_KG_LAYER_VALIDATION, msg,
project_id)
raise Exception(msg)
Patch global variables in the same way that you patch a method of a class. It's not clear from your code snippet where the global variables are defined (i.e. do you import these variables from another module, or do you assign to them at the top of your Python script?). Either way, you patch in the namespace where the function is being used. If you can confirm further details I will be able to assist.
Personally, the way I patch and test functions with no return value is the same. For example, if I wanted to patch the source_object_start_timestamp variable, I would use: source_object_start_timestamp = patch('pandas.datetime.utcnow', return_value="2020-08-16 20:36:06.578174").start(). For BigQuery functions, I would still patch them, and in the unit test use the mock's call_count attribute (unittest.mock) to test whether that function has been called.
Point 2 also addresses your third query: use call_count to check how many times the mock has been called.
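To make this concrete, here is a rough sketch of such a test (not from the original answer). It assumes the code under test lives in a module called my_script (a hypothetical name), that the AUDIT_* constants resolve inside that module, and it patches the utility functions in that namespace:
import unittest
from unittest.mock import patch

import pandas as pd

import my_script  # hypothetical module name for the script that defines edge_validation


class TestEdgeValidation(unittest.TestCase):

    @patch("my_script.get_job_id", return_value="job-1")
    @patch("my_script.get_source_object_id", return_value="obj-1")
    @patch("my_script.audit_event_source_table")  # no return value needed, we only record calls
    @patch("my_script.bigquery_data_read")
    def test_edge_validation_logs_success(self, mock_read, mock_audit, mock_obj_id, mock_job_id):
        # one row per edge table with a non-zero row_count, so no discrepancies are reported
        mock_read.return_value = (pd.DataFrame({"row_count": [5]}), ["row_count"])

        # patch the module-level globals in the namespace where they are used
        with patch.object(my_script, "edge_table_list", ["edge_t1"]), \
             patch.object(my_script, "source_name", "src"), \
             patch.object(my_script, "project_id", "test-project"), \
             patch.object(my_script, "service_account", "sa@test"):
            with self.assertLogs(level="INFO") as captured:
                my_script.edge_validation()

        # assertions: the BigQuery read ran once, the audit call fired,
        # and the success message was logged
        self.assertEqual(mock_read.call_count, 1)
        mock_audit.assert_called_once()
        self.assertTrue(any("successfully completed" in message for message in captured.output))


if __name__ == "__main__":
    unittest.main()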
