How to run a SQL command using a Prefect task (Teradata)

I am trying to fetch data from Teradata and then write it to and read it from a Parquet file using Prefect tasks.
My code works when I execute the SQL fetch as a plain function (not a task), like below:
def fetch_data(host, db_name, user, password, query):
    'logic'

@task(name="Write dask dataframe into GPFS parquet file")
def write_data(dask_dataframe, file_name):
    'logic'

@task(name="Read data from GPFS parquet file into dask dataframe")
def read_data(file_name):
    'logic'

with Flow("Teradata Example") as flow:
    result = fetch_data(host, db_name, user, password, query)
    write_data(dask_dataframe=result, file_name=file_name)
    read_data(file_name=file_name)

flow.run()
But the code fails when the same fetcher code runs as a task:
#task(name="Fetch sql query data from teradta data source into dask dataframe")
def fetch_data(host,db_name,user,password,query):
'logic'
#task(name="Write dask dataframe into GPFS parquet file")
def write_data(dask_dataframe,file_name):
'logic'
#task(name="Read data from GPFS parquet file into dask dataframe")
def read_data(file_name):
'logic'
with Flow("Teradata Example") as flow:
result = fetch_data(host,db_name,user,password,query)
write_data(dask_dataframe=result,file_name=file_name)
read_data(file_name=file_name)
flow.run()
For reference, here is the Teradata fetch code:
def get_partitions(num_partitions):
    list_range = []
    initial_start = 0
    for i in range(num_partitions):
        amp_range = 3240 // num_partitions
        start = (i * amp_range + 1) * initial_start
        end = (i + 1) * amp_range
        list_range.append((start, end))
        initial_start = 1
    return list_range

@delayed
def load(query, start, end, connString):
    return pd.read_sql(query.format(start, end), connString)

class TeradataFetch(Task):
    def __init__(
        args)

    @defaults_from_attrs("fetch", "fetch_count", "query", "commit", "charset")
    def run(
        self,
        query: str,
    ) -> Any:
        try:
            results = from_delayed([load(query, start, end, connString)
                                    for start, end in get_partitions(self.num_partitions)])
            logging.debug("Fetch Results: %s", results)
            return results
        except Exception as e:
            raise e
Can someone please suggest/help here?

Related

Apache Airflow unit and integration test

I am new to Apache Airflow and I am trying to figure out how to unit/integration test my DAGs and tasks.
Here is my directory structure:
/airflow
/dags
/tests/dags
I created a simple DAG with a task that reads data from a Postgres table:
def read_files(ti):
    sql = "select id from files where status='NEW'"
    pg_hook = PostgresHook(postgres_conn_id="metadata")
    connection = pg_hook.get_conn()
    cursor = connection.cursor()
    cursor.execute(sql)
    files = cursor.fetchall()
    ti.xcom_push(key="files_to_process", value=files)

with DAG(dag_id="check_for_new_files", schedule_interval=timedelta(minutes=30),
         start_date=datetime(2022, 9, 1), catchup=False) as dag:
    check_files = PythonOperator(task_id="read_files",
                                 python_callable=read_files)
Is it possible to test this by mocking the Airflow/Postgres connection, etc.?
Yes, it is possible to test DAGs. Here is an example of basic things you can do:
import unittest
from airflow.models import DagBag

class TestCheckForNewFilesDAG(unittest.TestCase):
    """Check DAG"""

    def setUp(self):
        self.dagbag = DagBag()

    def test_task_count(self):
        """Check task count for a dag"""
        dag_id = 'check_for_new_files'
        dag = self.dagbag.get_dag(dag_id)
        self.assertEqual(len(dag.tasks), 1)

    def test_contain_tasks(self):
        """Check the tasks contained in the check_for_new_files dag"""
        dag_id = 'check_for_new_files'
        dag = self.dagbag.get_dag(dag_id)
        tasks = dag.tasks
        task_ids = list(map(lambda task: task.task_id, tasks))
        self.assertListEqual(task_ids, ['read_files'])

    def test_dependencies_of_read_files_task(self):
        """Check the task dependencies of the read_files task in the check_for_new_files dag"""
        dag_id = 'check_for_new_files'
        dag = self.dagbag.get_dag(dag_id)
        read_files_task = dag.get_task('read_files')

        # to be used in case you have upstream tasks
        upstream_task_ids = list(map(lambda task: task.task_id,
                                     read_files_task.upstream_list))
        self.assertListEqual(upstream_task_ids, [])

        downstream_task_ids = list(map(lambda task: task.task_id,
                                       read_files_task.downstream_list))
        self.assertListEqual(downstream_task_ids, [])

suite = unittest.TestLoader().loadTestsFromTestCase(TestCheckForNewFilesDAG)
unittest.TextTestRunner(verbosity=2).run(suite)
For verifying that the manipulated file data is moved correctly, the documentation suggests self-checks:
https://airflow.apache.org/docs/apache-airflow/2.0.1/best-practices.html#self-checks
Self-Checks
You can also implement checks in a DAG to make sure the tasks are producing the results as expected. As an example, if you have a task that pushes data to S3, you can implement a check in the next task. For example, the check could make sure that the partition is created in S3 and perform some simple checks to determine if the data is correct.
I think this is an excellent and straightforward way to verify a specific task.
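A minimal sketch of such a self-check for the check_for_new_files DAG above (the task name check_files_pushed and the verification logic are hypothetical; it only asserts that read_files pushed a non-empty XCom):

from airflow.operators.python import PythonOperator

def check_files_pushed(ti):
    # Pull what read_files pushed and fail this task if nothing arrived.
    files = ti.xcom_pull(task_ids="read_files", key="files_to_process")
    if not files:
        raise ValueError("read_files did not push any files to process")

# inside the same `with DAG(...)` block as read_files:
verify_files = PythonOperator(task_id="check_files_pushed",
                              python_callable=check_files_pushed)
check_files >> verify_files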
Here are other useful links:
https://www.youtube.com/watch?v=ANJnYbLwLjE
The next ones talk about mocking (a small sketch follows these links):
https://www.astronomer.io/guides/testing-airflow/
https://medium.com/@montadhar/apache-airflow-testing-guide-7956a3f4bbf5
https://godatadriven.com/blog/testing-and-debugging-apache-airflow/
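For the mocking part of the question specifically, here is a minimal sketch using unittest.mock to patch PostgresHook where read_files uses it (the dags.check_for_new_files import path is an assumption based on the directory structure above; adjust it to wherever read_files actually lives):

import unittest
from unittest import mock

# Import path is an assumption; point it at the module that defines read_files.
from dags.check_for_new_files import read_files

class TestReadFiles(unittest.TestCase):
    # Patch PostgresHook in the module where it is used, not where it is defined.
    @mock.patch("dags.check_for_new_files.PostgresHook")
    def test_read_files_pushes_xcom(self, mock_hook):
        mock_cursor = mock_hook.return_value.get_conn.return_value.cursor.return_value
        mock_cursor.fetchall.return_value = [(1,), (2,)]

        mock_ti = mock.Mock()  # stand-in for the task instance
        read_files(mock_ti)

        mock_cursor.execute.assert_called_once()
        mock_ti.xcom_push.assert_called_once_with(
            key="files_to_process", value=[(1,), (2,)])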

Airflow - getting the execution_date in a task when calling an Operator

I have this Operator; it's pretty much the same as S3CopyObjectOperator except that it looks for all objects in a folder and copies them to a destination folder.
import os
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.decorators import apply_defaults
from common.s3.partition import Partition, PartitionType
from airflow.models import BaseOperator
import logging

class S3CopyObjectsOperator(BaseOperator):
    @apply_defaults
    def __init__(self,
                 aws_conn_id: str,
                 partition: Partition,
                 s3_bucket: str,
                 dest_prefix: str,
                 *args,
                 **kwargs):
        super(S3CopyObjectsOperator, self).__init__(*args, **kwargs)
        self.aws_conn_id = aws_conn_id
        self.partition = partition
        self.s3_bucket = s3_bucket
        self.dest_prefix = dest_prefix

    def execute(self, context):
        self.partition.partition_value = context.get("execution_date")
        logging.info(f'self.dest_prefix: {self.dest_prefix}')
        exec_date = context.get("execution_date")
        logging.info(f'self.partition.partition_value: {self.partition.partition_value}')
        s3 = S3Hook(self.aws_conn_id)
        s3_conn = s3.get_conn()
        logging.info(f'source bucket -- self.partition.bucket: {self.partition.bucket}')
        logging.info(f'source key -- self.partition.key_prefix: {self.partition.key_prefix}')
        source_keys = s3.list_keys(bucket_name=self.partition.bucket,
                                   prefix=self.partition.key_prefix, delimiter="/")
        logging.info(f'keys: {source_keys}')

        for file in source_keys:
            prefix, filename = os.path.split(file)
            dest_key = f'{self.dest_prefix}/{filename}'
            logging.info(f'Copying file {filename} to {self.dest_prefix}')
            key = self.partition.key_prefix + filename
            logging.info(f'key: {key}')
            s3_conn.copy_object(Bucket=self.s3_bucket,
                                Key=f'{dest_key}',
                                CopySource={
                                    'Bucket': self.partition.bucket,
                                    'Key': key
                                }, ContentEncoding='csv')
However, when I use this operator in my task I need dest_prefix to include the execution date.
Things I've tried:
I've tried adding ds = '{{ ds_nodash }}' in the DAG file, but when I print self.dest_prefix in the operator it returns the literal string value, not the execution date.
I've also tried creating a function, but then printing self.dest_prefix in the operator returns: self.dest_prefix: <function exec_value at 0x7fd008fcb940>. See below for my task:
The execution date should come after snapshot_date=
for data_group in data_group_names:
    copy_felix_to_s3 = S3CopyObjectsOperator(
        task_id=f'copy_felix_{data_group}_data_to_s3',
        aws_conn_id='aws_default',
        s3_bucket='bucket_name',
        partition=felixS3Partition(
            bucket='source_bucket',
            location_base=f'our_bucket/{data_group}',
            partition_type=None
        ),
        dest_prefix=f"felix/{data_group}/snapshot_date= ds",
        dag=dag
    )
    copy_felix_to_s3
You are missing the declaration of the parameter as a templated field.
class S3CopyObjectsOperator(BaseOperator):
    ...
    template_fields = ("dest_prefix",)
    ...
Macros (such as ds_nodash) are available only for templated fields; if you don't declare template_fields, the value you pass is treated as a plain string and is not rendered.
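With dest_prefix declared in template_fields, a sketch of how the task from the question could pass the macro (the doubled braces inside the f-string are needed so the literal {{ ds_nodash }} reaches Airflow for rendering; all other names come from the question):

for data_group in data_group_names:
    copy_felix_to_s3 = S3CopyObjectsOperator(
        task_id=f'copy_felix_{data_group}_data_to_s3',
        aws_conn_id='aws_default',
        s3_bucket='bucket_name',
        partition=felixS3Partition(
            bucket='source_bucket',
            location_base=f'our_bucket/{data_group}',
            partition_type=None
        ),
        # {{ ds_nodash }} is rendered at runtime because dest_prefix is templated.
        dest_prefix=f"felix/{data_group}/snapshot_date={{{{ ds_nodash }}}}",
        dag=dag
    )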

Airflow: Date not rendering in SQL file

Issue: the date is not rendering in the SQL file.
I am not able to get the yesterday_ds date rendered in the SQL file.
In the bi_utils/airflow.py module I have defined YESTERDAY_DS = '{{yesterday_ds}}'
In the DAG:
from bi_utils.airflow import YESTERDAY_DS

snflk_to_s3 = SnowflakeMultiSqlStatmentOperator(
    task_id='snflk_to_s3',
    snowflake_conn_id=SNOWFLAKE_CONN_ID,
    sql=load_sql,
    params={
        'proc_run_task_id': [proc_start.task_id],
        'yesterday_ds': YESTERDAY_DS,
    },
    autocommit=True,
)
In the SQL file:
COPY INTO @public.stage/path/{{params.yesterday_ds}}/
It looks like you are using a custom operator. In that case, you will have to add the argument to template_fields so that the Jinja template is resolved.
Inside your custom operator it would look like this; the code below is just to illustrate the structure:
class SnowflakeMultiSqlStatmentOperator(BaseOperator):
    """
    Executes sql code in a Snowflake database
    """

    template_fields = ('sql', 'params')
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self, sql, snowflake_conn_id='snowflake_default', parameters=None,
            autocommit=True, warehouse=None, database=None, role=None,
            schema=None, params=None, *args, **kwargs):
        super(SnowflakeMultiSqlStatmentOperator, self).__init__(*args, **kwargs)
        self.snowflake_conn_id = snowflake_conn_id
        self.sql = sql
        self.autocommit = autocommit
        self.parameters = parameters
        self.warehouse = warehouse
        self.database = database
        self.role = role
        self.schema = schema
        self.params = params

    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        hook = self.get_hook()
        hook.run(
            self.sql,
            autocommit=self.autocommit,
            parameters=self.parameters)

How to change XCom in Airflow to accommodate large data?

I am using the following code in my Airflow operator:
import json
import pandas as pd
from airflow.exceptions import AirflowException
from airflow.hooks.http_hook import HttpHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook

class HttpToGoogleCloudStorageOperator(BaseOperator):
    template_fields = ['endpoint', 'data', 'headers', ]
    template_ext = ()
    ui_color = '#f4a460'

    @apply_defaults
    def __init__(self,
                 endpoint,
                 project_id,
                 table_id,
                 data=None,
                 headers=None,
                 auth=None,
                 http_conn_id='http_default',
                 *args, **kwargs):
        super(HttpToGoogleCloudStorageOperator, self).__init__(*args, **kwargs)
        self.table_id = table_id
        self.http_conn_id = http_conn_id
        self.method = "GET"
        self.endpoint = endpoint
        self.headers = headers or {}
        self.auth = auth
        self.data = data or {}

    def execute(self, context):
        http = HttpHook(self.method, http_conn_id=self.http_conn_id)
        self.log.info("Calling HTTP method " + self.endpoint)
        response = http.run(self.endpoint, self.data, self.headers, auth=self.auth)
        self.log.info("Got response")
Unfortunately the data returned is too large (about 5k) to fit in the standard xcom and I get this error:
{taskinstance.py:1059} ERROR - (_mysql_exceptions.DataError) (1406, "Data too long for column 'value' at row 1")
Is there a way I can tell http_hook to use a different xcom, or (even better) not use xcom at all? I have looked around and I do not see a solution.
Thanks for any tips or pointers.
Edit: Here is how I call the operator. Note that nowhere do I specify xcom.
query_load_task = HttpToGoogleCloudStorageOperator(
    task_id="query_load_task",
    endpoint=endpoint,
    project_id="my_gcp_poroject_id",
    table_id="dataset.table",
    data=None,
    auth=(username, password))
It's preferable to store the data in a system designed for that (e.g. the file system, AWS S3, Azure Blob Storage, GCS, etc.) and instead return a unique identifier referencing the location of the data; for the file system this would likely be the full path (e.g. /tmp/acme_response_20200709.csv). That way you leverage the best of both the storage system and your database.
If you add your code I'd be happy to take a crack at writing up some pseudo-code as an example.
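Along those lines, a minimal sketch of an execute method for the operator above that writes the response body to GCS and pushes only the object path through XCom (the bucket name, object naming scheme, and the contrib GoogleCloudStorageHook.upload signature are assumptions; adjust them to your environment and provider version):

import tempfile

# inside HttpToGoogleCloudStorageOperator:
def execute(self, context):
    http = HttpHook(self.method, http_conn_id=self.http_conn_id)
    self.log.info("Calling HTTP method " + self.endpoint)
    response = http.run(self.endpoint, self.data, self.headers, auth=self.auth)

    # Write the payload to a local temp file instead of returning it.
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as tmp:
        tmp.write(response.text)
        local_path = tmp.name

    # Upload to GCS; bucket name and object path are hypothetical placeholders.
    gcs = GoogleCloudStorageHook()
    object_name = f"responses/{self.table_id}/{context['ds_nodash']}.json"
    gcs.upload(bucket="my-staging-bucket", object=object_name, filename=local_path)

    # Only the small GCS URI goes into XCom, not the payload itself.
    return f"gs://my-staging-bucket/{object_name}"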

Python: Run only one function async

I have a large legacy application that has one function that is a prime candidate to be executed async. It's IO bound (network and disk) and doesn't return anything.
Here is a very simple, similar implementation:
import random
import time
import requests

def fetch_urls(site):
    wait = random.randint(0, 5)
    filename = site.split("/")[2].replace(".", "_")
    print(f"Will fetch {site} in {wait} seconds")
    time.sleep(wait)
    r = requests.get(site)
    with open(filename, "w") as fd:
        fd.write(r.text)

def something(sites):
    for site in sites:
        fetch_urls(site)
    return True

def main():
    sites = ["https://www.google.com", "https://www.reddit.com", "https://www.msn.com"]
    start = time.perf_counter()
    something(sites)
    total_time = time.perf_counter() - start
    print(f"Finished in {total_time}")

if __name__ == "__main__":
    main()
My end goal is to update the something function so that it runs fetch_urls asynchronously.
I cannot change fetch_urls.
All documentation and tutorials I can find assume my entire application is async (starting from async def main()), but this is not the case.
It's a huge application spanning multiple modules, and refactoring everything for a single function doesn't look right.
From what I understand I need to create an event loop, add tasks to it, and dispatch it somehow, but I have tried many different things and everything still runs one after another, as opposed to concurrently.
I would appreciate any assistance. Thanks!
Replying to myself: it seems there is no easy way to do that with asyncio, so I ended up using concurrent.futures:
import time
import requests
import concurrent.futures

def fetch_urls(url, name):
    wait = 5
    filename = url.split("/")[2].replace(".", "_")
    print(f"Will fetch {name} in {wait} seconds")
    time.sleep(wait)
    r = requests.get(url)
    with open(filename, "w") as fd:
        fd.write(r.text)

def something(sites):
    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        future_to_url = {
            executor.submit(fetch_urls, url["url"], url["name"]): (url)
            for url in sites["children"]
        }
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as exc:
                print("%r generated an exception: %s" % (url, exc))
    return True

def main():
    sites = {
        "parent": "https://stackoverflow.com",
        "children": [
            {"name": "google", "url": "https://google.com"},
            {"name": "reddit", "url": "https://reddit.com"},
        ],
    }
    start = time.perf_counter()
    something(sites)
    total_time = time.perf_counter() - start
    print(f"Finished in {total_time}")
