How to change xcom in Airflow to accomodate large data? - airflow

I am using the following code in my Airflow operator:
import json
import pandas as pd
from airflow.exceptions import AirflowException
from airflow.hooks.http_hook import HttpHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
class HttpToGoogleCloudStorageOperator(BaseOperator):
template_fields = ['endpoint', 'data', 'headers', ]
template_ext = ()
ui_color = '#f4a460'
#apply_defaults
def __init__(self,
endpoint,
project_id,
table_id,
data=None,
headers=None,
auth=None,
http_conn_id='http_default',
*args, **kwargs):
super(HttpToGoogleCloudStorageOperator, self).__init__(*args, **kwargs)
self.table_id = table_id
self.http_conn_id = http_conn_id
self.method = "GET"
self.endpoint = endpoint
self.headers = headers or {}
self.auth = auth
self.data = data or {}
def execute(self, context):
http = HttpHook(self.method, http_conn_id=self.http_conn_id)
self.log.info("Calling HTTP method " + self.endpoint)
response = http.run(self.endpoint, self.data, self.headers,auth=self.auth)
self.log.info("Got response")
Unfortunately the data returned is too large (about 5k) to fit in the standard xcom and I get this error:
{taskinstance.py:1059} ERROR - (_mysql_exceptions.DataError) (1406, "Data too long for column 'value' at row 1")
Is there a way I can tell http_hook to use a different xcom, or (even better) not use xcom at all? I have looked around and I do not see a solution.
Thanks for any tips or pointers.
Edit: Here is how I call the operator. Note that nowhere do I specify xcom.
query_load_task = HttpToGoogleCloudStorageOperator(
task_id="query_load_task",
endpoint=endpoint,
project_id="my_gcp_poroject_id",
table_id="dataset.table",
data=None,
auth=(username, password))

It's preferable to store data to a system designed for such (e.g.: the file system, AWS S3, Azure, etc.) and instead return a unique identifier to reference the location of the data, for the file system this would likely be the full path (e.g.: /tmp/acme_response_20200709.csv) that way you leverage the best of both the storage system and your database.
If you add your code I'd be happy to take a crack at writing up some psuedo-code as an example.

Related

Mocking SQLAlchemy test within Strawberry/FastAPI

I'm working on creating unit tests for a FastAPI, Strawberry, and SQLAlchemy setup. The current API is working and returning data correctly, but I cannot figure out how to mock the underlying database for unit tests. Would love any help/guidance to figure out this issue.
Below is the test code I"m currently working with, which I'm hoping will be enough to solve this issues but happy to post more if it helps. Running this currently will produce and output of ExecutionResult(data=None, errors=[GraphQLError("'NoneType' object is not subscriptable", locations=[SourceLocation(line=3, column=13)], path=['biomarkers'])], extensions={}), which seems to indicate that it is almost working but not quite reaching the mocked data within UnifiedAlchemyMagicMock.
import uuid
import unittest
from unittest import mock
import strawberry
from strawberry.extensions import Extension
from alchemy_mock.mocking import UnifiedAlchemyMagicMock
from app.api.api_v1 import api
from app.models import biomarker as biomarker_models
class MockSession:
'''Create Mock Session for Db'''
session = UnifiedAlchemyMagicMock(data=[
(
[mock.call.query(biomarker_models.Biomarker)],
[biomarker_models.Biomarker(
name="hello",
id=uuid.UUID('1a8d8791-946c-4fc4-8f5d-1b0c4f5ee2f5'),
quest_biomarker_code="quest"),
biomarker_models.Biomarker(
name="test",
id=uuid.uuid4(),
quest_biomarker_code="palazo")]
)
])
class MockRequest(Extension):
'''Mock Request state for context'''
def on_request_start(self):
self.execution_context.context["db"] = MockSession()
def on_request_end(self):
self.execution_context.context["db"].close()
class BioMarkerTestCase(unittest.TestCase):
'''Test Biomarker'''
def setUp(self) -> None:
self.strawberry_schema = strawberry.Schema(
query=api.Query,
mutation=api.Mutation,
extensions=[MockRequest],
types=api.QUERY_TYPE_LIST)
def test_query_get_all(self) -> None:
'''test biomarker query'''
query = """
query {
biomarkers {
id
name
whyItMatters
questBiomarkerCode
modeOfAcquisition
questRefRangeLow
questRefRangeHigh
optimalRangeLow
optimalRangeHigh
withinRangeRecommendations
belowRangeRecommendations
aboveRangeRecommendations
crossReferenceBiomarkers
notes
resourcesCited
measurementUnits
isCritical
resultDataType
critical{
id
biomarkerId
isPriority1
priority1Range
isPriority2
priority2Range
}
}
}
"""
query_result = query_result = self.strawberry_schema.execute_sync(query)
self.assertIsNotNone(query_result.data)
When using
query_result = query_result = self.strawberry_schema.execute_sync(query)
the context_value is defaulted to None, which I think is the cause of your errors.
try with:
query_result = query_result = self.strawberry_schema.execute_sync(query, context_value={})

Airflow 2.0: Encapsulating DAG in class using Taskflow API

I have pipelines where the mechanics are always the same, a sequence of two tasks.
So I try to abstract the construction of it through a parent abstract class (using TaskFlow API):
from abc import ABC, abstractmethod
from airflow.decorators import dag, task
from datetime import datetime
def AbstractDag(ABC):
#abstractmethod
def task_1(self):
"""task 1"""
#abstractmethod
def task_2(self, data):
"""task 2"""
def dag_wrapper(self):
#dag(schedule_interval=None, start_date=datetime(2022, 1, 1))
def dag():
#task(task_id='task_1')
def task_1():
return self.task_1()
#task(task_id='task_2')
def task_2(data):
return self.task_2(data)
task_2(task_1())
return dag
But when I try to inherit this class, I can't see my dag in the interface:
class MyCustomDag(AbstractDag):
def task_1(self):
return 2
#abstractmethod
def task_2(self, data):
print(data)
custom_dag = MyCustomDag()
dag_object = custom_dag.dag_wrapper()
Do you have any idea how to do this? or better ideas to abstract this?
Thanks a lot!
Nicolas
I was able to get your example DAG to render in the UI with just a couple small tweaks:
The MyCustomDag.task_2 method doesn't need to be decorated as an abstractmethod.
Using dag() as the wrapped DAG object function name has its issues since it's also a decorator name.
In the AbstractDag.dag_wrapper method you do need to call the #dag-decorated function.
Here is the code I used:
from abc import ABC, abstractmethod
from airflow.decorators import dag, task
from datetime import datetime
class AbstractDag(ABC):
#abstractmethod
def task_1(self):
"""task 1"""
#abstractmethod
def task_2(self, data):
"""task 2"""
def dag_wrapper(self):
#dag(schedule_interval=None, start_date=datetime(2022, 1, 1))
def _dag():
#task(task_id='task_1')
def task_1():
return self.task_1()
#task(task_id='task_2')
def task_2(data):
return self.task_2(data)
task_2(task_1())
return _dag()
class MyCustomDag(AbstractDag):
def task_1(self):
return 2
def task_2(self, data):
print(data)
custom_dag = MyCustomDag()
dag_object = custom_dag.dag_wrapper()
It's worth noting the following from the Airflow docs :
When searching for DAGs inside the DAG_FOLDER, Airflow only considers Python files that contain the strings airflow and dag (case-insensitively) as an optimization.
To consider all Python files instead, disable the
DAG_DISCOVERY_SAFE_MODE configuration flag.
If you're inheriting from AbstractDag in a different file, make sure airflow and dag are in that file. You can simply add a comment with those words.

Airflow - getting the execution_date in task when calling an Operator

I have this Operator, its pretty much the same as S3CopyObjectOperator except it looks for all objects in a folder and copies to a destination folder.
import os
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.decorators import apply_defaults
from common.s3.partition import Partition, PartitionType
from airflow.models import BaseOperator
import logging
class S3CopyObjectsOperator(BaseOperator):
#apply_defaults
def __init__(self,
aws_conn_id: str,
partition: Partition,
s3_bucket: str,
dest_prefix: str,
*args,
**kwargs):
super(S3CopyObjectsOperator, self).__init__(*args, **kwargs)
self.aws_conn_id = aws_conn_id
self.partition = partition
self.s3_bucket = s3_bucket
self.dest_prefix = dest_prefix
def execute(self, context):
self.partition.partition_value = context.get("execution_date")
logging.info(f'self.dest_prefix: {self.dest_prefix}')
exec_date = context.get("execution_date")
logging.info(f'self.partition.partition_value: {self.partition.partition_value}')
s3 = S3Hook(self.aws_conn_id)
s3_conn = s3.get_conn()
logging.info(f'source bucket -- self.partition.bucket: {self.partition.bucket}')
logging.info(f'source key -- self.partition.key_prefix: {self.partition.key_prefix}')
source_keys = s3.list_keys(bucket_name=self.partition.bucket, prefix=self.partition.key_prefix, delimiter="/")
logging.info(f'keys: {source_keys}')
for file in source_keys:
prefix, filename = os.path.split(file)
dest_key = f'{self.dest_prefix}/{filename}'
logging.info(f'Copying file {filename} to {self.dest_prefix}')
key = self.partition.key_prefix + filename
logging.info(f'key: {key}')
s3_conn.copy_object(Bucket=self.s3_bucket,
Key=f'{dest_key}',
CopySource={
'Bucket': self.partition.bucket,
'Key': key
}, ContentEncoding='csv')
However when I use this operator in my task I need my dest_prefix to include the execution date.
Things I've tried:
I've tried adding ds = '{{ ds_nodash }}' in the dag file but when I print self.dest_prefix in the Operator the value it returns he string value and not the execution date.
I've also tried creating a function but when I print self.dest_prefix in the Operator the value it returns is: self.dest_prefix: <function exec_value at 0x7fd008fcb940> See below for my task:
the execution date should be after snapshot_date=
for data_group in data_group_names:
copy_felix_to_s3 = S3CopyObjectsOperator(
task_id=f'copy_felix_{data_group}_data_to_s3',
aws_conn_id='aws_default',
s3_bucket='bucket_name',
partition=felixS3Partition(
bucket='source_bucket',
location_base=f'our_bucket/{data_group}',
partition_type=None
),
dest_prefix=f"felix/{data_group}/snapshot_date= ds",
dag=dag
)
copy_felix_to_s3
You are missing declaration of the parameter as templated field.
class S3CopyObjectsOperator(BaseOperator):
...
template_fields = ("dest_prefix",)
...
Macros (such as ds_nodash) are available only for templated fields thus if you don't specify template_fields it will handle the value you pass as string and it will not be rendered.

Pact: Error when trying to setup mock provider

I'm trying to write my first Pact-python test using pytest, Could someone please tell me what's wrong with my code?
import unittest
import requests
import json
import pytest
import atexit
from pact import Consumer, Provider
pact = Consumer('Consumer').has_pact_with(Provider('Provider'), host_name='mockservice', port=8080)
pact.start_service()
atexit.register(pact.stop_service)
class InterviewDetails(unittest.TestCase):
def test_candidate_report_api(self):
candidate_report_payload = {}
resp = requests.post("http://localhost:1234/users/",data=json.dumps(candidate_report_payload))
response = json.loads(resp.text)
return response
#pytest.mark.health1
def test_candidate_report(self):
expected = {}
(pact.given('Comment')
.upon_receiving('comment')
.with_request(method='POST', path="http://localhost:1234/users/", headers={})
.will_respond_with(200, body=expected))
with pact:
pact.setup()
result = self.test_candidate_report_api()
self.assertEqual(result, expected)
pact.verify()
The error from stacktrace:
AttributeError: module 'pact' has no attribute 'Like'
Can you please confirm you're using pact-python from https://github.com/pact-foundation/pact-python/ (and not pactman, a project that is not maintained by the Pact Foundation)?
It might be related to the way you have setup your test?
Here is an example project you can use for reference: https://github.com/pactflow/example-consumer-python/
Relevant test code:
"""pact test for product service client"""
import json
import logging
import os
import requests
from requests.auth import HTTPBasicAuth
import pytest
from pact import Consumer, Like, Provider, Term, Format
from src.consumer import ProductConsumer
log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
print(Format().__dict__)
PACT_MOCK_HOST = 'localhost'
PACT_MOCK_PORT = 1234
PACT_DIR = os.path.dirname(os.path.realpath(__file__))
#pytest.fixture
def consumer():
return ProductConsumer(
'http://{host}:{port}'
.format(host=PACT_MOCK_HOST, port=PACT_MOCK_PORT)
)
#pytest.fixture(scope='session')
def pact(request):
pact = Consumer('pactflow-example-consumer-python').has_pact_with(
Provider('pactflow-example-provider-python'), host_name=PACT_MOCK_HOST, port=PACT_MOCK_PORT,
pact_dir="./pacts", log_dir="./logs")
try:
print('start service')
pact.start_service()
yield pact
finally:
print('stop service')
pact.stop_service()
def test_get_product(pact, consumer):
expected = {
'id': "27",
'name': 'Margharita',
'type': 'Pizza'
}
(pact
.given('a product with ID 10 exists')
.upon_receiving('a request to get a product')
.with_request('GET', '/product/10')
.will_respond_with(200, body=Like(expected)))
with pact:
user = consumer.get_product('10')
assert user.name == 'Margharita'

How to access Xcom value in a non airflow operator python function

I have a stored XCom value that I wanted to pass to another python function which is not called using PythonOperator.
def sql_file_template():
<some code which uses xcom variable>
def call_stored_proc(**kwargs):
#project = kwargs['row_id']
print("INSIDE CALL STORE PROC ------------")
query = """CALL `{0}.dataset_name.store_proc`(
'{1}' # source table
, ['{2}'] # row_ids
, '{3}' # pivot_col_name
, '{4}' # pivot_col_value
, 100 # max_columns
, 'MAX' # aggregation
);"""
query = query.format(kwargs['project'],kwargs['source_tbl'] ,kwargs['row_id'],kwargs['pivot_col'],kwargs['pivot_val'])
job = client.query(query, location="US")
for result in job.result():
task_instance = kwargs['task_instance']
task_instance.xcom_push(key='query_string', value=result)
print result
return result
bq_cmd = PythonOperator (
task_id= 'task1'
provide_context= True,
python_callable= call_stored_proc,
op_kwargs= {'project' : project,
'source_tbl' : source_tbl,
'row_id' : row_id,
'pivot_col' : pivot_col,
'pivot_val' : pivot_val
},
dag= dag
)
dummy_operator >> bq_cmd
sql_file_template()
The output of stored proc is a string which is captured using xcom.
Now I would like to pass this value to some python function sql_file_template without using PythonOperator.
As per Airflow documentation xcom can be accessed only between tasks.
Can anyone help on this?
If you have access to the Airflow installation you'd like to query (configuration, database access, and code) you can use Airflow's airflow.models.XCom:get_one class method:
from datetime import datetime
from airflow.models import XCom
execution_date = datetime(2020, 8, 28)
xcom_value = XCom.get_one(execution_date=execution_date,
task_id="the_task_id",
dag_id="the_dag_id")
So you want to access XCOM outside Airflow (probably a different project / module, without creating any Airflow DAGs / tasks)?
Airflow uses SQLAlchemy for mapping all it's models (including XCOM) to corresponding SQLAlchemy backend (meta-db) tables
Therefore this can be done in two ways
Leverage Airflow's SQLAlchemy model
(without having to create a task or DAG). Here's an untested code snippet for reference
from typing import List
from airflow.models import XCom
from airflow.settings import Session
from airflow.utils.db import provide_session
from pendulum import Pendulum
#provide_session
def read_xcom_values(dag_id: str,
task_id: str,
execution_date: Pendulum,
session: Optional[Session]) -> List[str]:
"""
Function that reads and returns 'values' of XCOMs with given filters
:param dag_id:
:param task_id:
:param execution_date: datetime object
:param session: Airflow's SQLAlchemy Session (this param must not be passed, it will be automatically supplied by
'#provide_session' decorator)
:return:
"""
# read XCOMs
xcoms: List[XCom] = session.query(XCom).filter(
XCom.dag_id == dag_id, XCom.task_id == task_id,
XCom.execution_date == execution_date).all()
# retrive 'value' fields from XCOMs
xcom_values: List[str] = list(map(lambda xcom: xcom.value, xcoms))
return xcom_values
Do note that since it is importing airflow packages, it still requires working airflow installation on python classpath (as well as connection to backend-db), but here we are not creating any tasks or dags (this snippet can be run in a standalone python file)
For this snippet, I have referred to views.py which is my favorite place to peek into Airflow's SQLAlchemy magic
Directly query Airflow's SQLAlchemy backend meta-db
Connect to meta db and run this query
SELECT value FROM xcom WHERE dag_id='' AND task_id='' AND ..

Resources