I'm trying to pass a parameter from Google Composer into a Dataflow template in the following way, but it does not work.
# composer code
trigger_dataflow = DataflowTemplateOperator(
    task_id="trigger_dataflow",
    template="gs://mybucket/my_template",
    dag=dag,
    job_name='appsflyer_events_daily',
    parameters={
        "input": f'gs://my_bucket/' + "{{ ds }}" + "/*.gz"
    }
)
# template code
import apache_beam as beam
from apache_beam.io.fileio import MatchFiles
from apache_beam.options.pipeline_options import PipelineOptions


class UserOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument(
            '--input',
            default='gs://my_bucket/*.gz',
            help='path of input file')


def main():
    pipeline_options = PipelineOptions()
    user_options = pipeline_options.view_as(UserOptions)
    p = beam.Pipeline(options=pipeline_options)
    lines = (
        p
        | MatchFiles(user_options.input)
    )
You can pass them like the following.
DataflowTemplateOperator(
    task_id="task1",
    template=get_variable_value("template"),
    on_failure_callback=update_job_message,
    parameters={
        "fileBucket": get_variable_value("file_bucket"),
        "basePath": get_variable_value("path_input"),
        "Day": "{{ json.loads(ti.xcom_pull(key=run_id))['day'] }}",
    },
)
We are using Java, and in our Dataflow jobs we have an options class with getters and setters like the following:
public interface MyOptions extends CommonOptions {
    @Description("The output bucket")
    @Validation.Required
    ValueProvider<String> getFileBucket();

    void setFileBucket(ValueProvider<String> value);
}
We need to create a template for these Dataflow jobs, and that template will be triggered by a Composer DAG.
Moving from a Dataflow classic template to a flex template fixed the issue.
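For reference, a minimal sketch of triggering a flex template from Composer, assuming a hypothetical container spec path, project, and job name (the body layout follows the Flex Templates launch request format):

from airflow.providers.google.cloud.operators.dataflow import (
    DataflowStartFlexTemplateOperator,
)

# A sketch only: bucket, spec path, project, and parameter names are placeholders.
trigger_flex_template = DataflowStartFlexTemplateOperator(
    task_id="trigger_flex_template",
    location="us-central1",
    project_id="my-project",
    body={
        "launchParameter": {
            "jobName": "appsflyer-events-daily-{{ ds_nodash }}",
            "containerSpecGcsPath": "gs://mybucket/templates/my_flex_template.json",
            # With flex templates, parameters are plain pipeline options,
            # so Jinja-rendered values like {{ ds }} pass through directly.
            "parameters": {"input": "gs://my_bucket/{{ ds }}/*.gz"},
        }
    },
    dag=dag,
)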
I have a dag that outputs a dictionary (map), passed through an XCom.
I want to generate a number of tasks according to the keys of said XCom dictionary.
This is how the dictionary looks in the XCom:
{
    "F1": {
        "source": {
            "project": "legacy_project",
            "bucket": "legacy_bucket",
            "prefix": "prefix/{{ds_nodash}}/F1",
            "files": [
                "file_1.csv"
            ]
        },
        "destination": {
            "project_id": "new_project_1",
            "bucket": "new_bucket_1",
            "prefix": "DTM/F1/{{ds_nodash}}"
        }
    },
    "F2": {
        "source": {
            "project": "legacy_project",
            "bucket": "legacy_bucket",
            "prefix": "prefix/{{ds_nodash}}/F2",
            "files": [
                "file_1.csv"
            ]
        },
        "destination": {
            "project_id": "new_project_2",
            "bucket": "new_bucket_2",
            "prefix": "DTM/F2/{{ds_nodash}}"
        }
    }
    // ...
}
Notice that I used {{ds_nodash}} in the generated XCom so that it gets replaced by the execution date of the DAG.
What I want is to create a task for each of F1, F2, ..., Fn and feed F1["source"]["bucket"] and F1["source"]["prefix"] into a GCSObjectsWithPrefixExistenceSensor, with a custom task_id for each task.
Something like this:
for f_key in <xcom_output_dict>:
    GCSObjectsWithPrefixExistenceSensor(
        task_id=f"{f_key}_sensor",
        bucket=<xcom_output_dict>[f_key]["source"]["bucket"],
        prefix=<xcom_output_dict>[f_key]["source"]["prefix"],
    )
I tried reading up on expand and partial, but it's really unclear how one can pass the dictionary contents such as bucket and prefix to the task operator...
EDIT:
One of my trials:
from airflow import DAG, XComArg
from airflow.providers.google.cloud.sensors.gcs import (
    GCSObjectsWithPrefixExistenceSensor,
)
...
generate_prefix_existance_sensor_kwargs = PythonOperator(
    task_id="generate_prefix_existance_sensor_kwargs",
    python_callable=gen_prefix_existance_sensor_kwargs,
    provide_context=True,
    op_kwargs={"effective_migration_map": "effective_migration_map"},
)

sensor_files = GCSObjectsWithPrefixExistenceSensor.expand_kwargs(
    input=XComArg(generate_prefix_existance_sensor_kwargs),
)
(
    ...  # Some other tasks
    >> generate_prefix_existance_sensor_kwargs
    >> sensor_files
)
Here is the code for the kwargs generation function:
def gen_prefix_existance_sensor_kwargs(ti, **kwargs) -> List[Dict[str, str]]:
    effective_migration_map = load_xcom_via_kwarg(
        ti=ti,
        key="effective_migration_map",
        default_value="effective_migration_map",
        kwargs=kwargs,
    )
    args_dicts = []
    for f_key in effective_migration_map:
        args_dicts.append(
            {
                "task_id": f_key,
                "bucket": effective_migration_map[f_key]["source"]["bucket"],
                "prefix": effective_migration_map[f_key]["source"]["prefix"],
            }
        )
    return args_dicts
But I get:
AttributeError: type object 'GCSObjectsWithPrefixExistenceSensor' has no attribute 'expand_kwargs'
Which is weird because I saw an Astronomer.io video where it is used. I guess it isn't implemented yet in Airflow 2.3.4?
EDIT 1:
I create the wrapper for the operator:
class GCSObjectsWithPrefixExistenceSensorWrapper(
    GCSObjectsWithPrefixExistenceSensor
):
    """This class is a temporary work around to using expand_kwargs():
    as expand() can only take one argument, we must create a wrapper
    around every operator we use.
    """

    def __init__(
        self,
        src_or_dest: str,
        inp_parameters: Tuple[str, Dict[str, Dict[str, str]]],
        **kwargs,
    ):
        if src_or_dest not in ["source", "destination"]:
            raise TypeError(
                "Bad argument for src_or_dest, must be either 'source' or 'destination'"
            )
        else:
            bucket = inp_parameters[1][src_or_dest]["bucket"]
            prefix = inp_parameters[1][src_or_dest]["prefix"]
            super().__init__(
                bucket=bucket,
                prefix=prefix,
                **kwargs,
            )
I add it to my DAG:
updated_map_2 = match_data_with_migration_map(
    src_mig_map=updated_map_1,
    files_and_prefixes="{{ti.xcom_pull('list_files_and_prefixes')}}",
)

GCSObjectsWithPrefixExistenceSensorWrapper.partial(
    task_id="sensor_files",
    src_or_dest="source",
    impersonation_chain=IMPERSONATED_SERVICE_ACCOUNT,
).expand(inp_parameters=updated_map_2.output)
I get this error:
AttributeError: 'dict' object has no attribute 'output'
EDIT 2:
The task upstream of this operator was not decorated correctly, which caused the problems described in EDIT 1.
I've tried to solve your issue with dynamic task mapping. However, this approach has the downside that if you pass more than one parameter into expand(), Airflow will build a cross product of them.
So my solution is to create a custom class that inherits from GCSObjectsWithPrefixExistenceSensor, whose __init__ assigns the correct values:
class CustomExistenceSensor(GCSObjectsWithPrefixExistenceSensor):
    def __init__(self, inp_params, **kwargs):
        bucket, prefix = inp_params
        super().__init__(bucket=bucket, prefix=prefix, **kwargs)
By using this class we can now expand the sensor and pass an iterable of parameters (bucket, prefix):
from datetime import datetime

from airflow.decorators import dag, task
from airflow.providers.google.cloud.sensors.gcs import GCSObjectsWithPrefixExistenceSensor


class CustomExistenceSensor(GCSObjectsWithPrefixExistenceSensor):
    def __init__(self, inp_params, **kwargs):
        bucket, prefix = inp_params
        super().__init__(bucket=bucket, prefix=prefix, **kwargs)


@dag(
    schedule=None,
    start_date=datetime(2022, 10, 21, hour=8),
    catchup=False,
    tags=['demo'],
)
def template_dag():
    """### Template dag"""

    @task()
    def example_func():
        pass

    t1 = example_func()

    # Below just to test that the sensor works
    # t3 = GCSObjectsWithPrefixExistenceSensor(
    #     task_id="test_check",
    #     bucket="text_stack_bucket",
    #     prefix="test_prefix",
    #     timeout=3)

    # t4 works, but the issue here is that we can only expand one parameter;
    # if we pass more, Airflow will make a cross product.
    # t4 = GCSObjectsWithPrefixExistenceSensor.partial(
    #     task_id="test_check_dynamic", bucket="text_stack_bucket", timeout=3
    # ).expand(prefix=["test_prefix", "prefix"])

    t5 = CustomExistenceSensor.partial(task_id="test_custom_class", timeout=2).expand(
        inp_params=[("text_stack_bucket", "test_prefix"), ("next_existing", "prefix")]
    )

    t1 >> t5


dag = template_dag()
So if you can amend your code a bit to make it return a list via XCom, you can invoke it like so:
task = CustomExistenceSensor.partial(task_id='dynamic_tasks').expand(inp_params=previous_task.output)
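For example, a minimal sketch of the upstream side as a TaskFlow task (the shape of effective_migration_map comes from the question; the function name and the wiring are illustrative assumptions):

from typing import Dict, List, Tuple

from airflow.decorators import task


@task
def gen_sensor_params(effective_migration_map: Dict[str, dict]) -> List[Tuple[str, str]]:
    # Reshape the migration map into (bucket, prefix) tuples so each mapped
    # sensor instance receives exactly one expanded argument.
    return [
        (cfg["source"]["bucket"], cfg["source"]["prefix"])
        for cfg in effective_migration_map.values()
    ]


# Wiring it up (illustrative):
# sensor_params = gen_sensor_params(updated_map_2)
# CustomExistenceSensor.partial(task_id="sensor_files", timeout=2).expand(
#     inp_params=sensor_params,
# )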
I want to run a mongo shell command with Airflow's MongoHook. How can I do it?
sh.shardCollection(db_name +, { _id: "hashed" }, false, { numInitialChunks: 128 });
db.collection.createIndex({ "field": 1 }, { field: true });
The pymongo client that Airflow's MongoHook uses doesn't support the sh.shardCollection command in your script.
The createIndex collection method, however, is supported by the pymongo client (via Collection.create_index).
I recommend installing the mongosh CLI binary anyway and baking it into the container image for your workers.
You can write your shell commands to a script such as /dags/templates/mongo-admin-create-index.js, or some other location where it can be found.
Then you can implement a custom operator that uses the SubprocessHook to run a mongosh CLI command such as:
mongosh -f {mongosh_script} {db_address}
This custom operator would be along these lines:
from typing import Sequence

from airflow.compat.functools import cached_property
from airflow.hooks.subprocess import SubprocessHook
from airflow.models import BaseOperator
from airflow.providers.mongo.hooks.mongo import MongoHook


class MongoshScriptOperator(BaseOperator):
    template_fields: Sequence[str] = ('mongosh_script',)

    def __init__(
        self,
        *,
        mongosh_script: str,
        mongo_conn_id: str = 'mongo_default',
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.mongosh_script = mongosh_script
        self.mongo_conn_id = mongo_conn_id

    @cached_property
    def subprocess_hook(self):
        """Returns hook for running the shell command"""
        return SubprocessHook()

    def execute(self, context):
        """Executes a mongosh script"""
        mh = MongoHook(self.mongo_conn_id)
        self.subprocess_hook.run_command(
            command=['mongosh', '-f', self.mongosh_script, mh.uri],
        )
When creating the DAG node, you can pass the location of the script to your custom operator.
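For instance, a minimal usage sketch of the operator defined above (the script path and connection id are illustrative assumptions):

# Assumes an existing "mongo_default" connection; the script path is a placeholder.
create_index = MongoshScriptOperator(
    task_id="create_index",
    mongosh_script="/opt/airflow/dags/templates/mongo-admin-create-index.js",
    mongo_conn_id="mongo_default",
    dag=dag,
)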
I'm implementing a Python script to create a bunch of Airflow DAGs based on JSON config files. One JSON config file contains all the fields to be used in DAG(), and the last three fields are optional (they fall back to global defaults if not set).
{
    "owner": "Mike",
    "start_date": "2022-04-10",
    "schedule_interval": "0 0 * * *",
    "on_failure_callback": "slack",
    "is_paused_upon_creation": false,
    "catchup": true
}
Now, my question is how to create the DAG conditionally. Since the last three options, on_failure_callback, is_paused_upon_creation, and catchup, are optional, what is the best way to use them in DAG()?
Solution_1 I tried is to use default_args=optional_fields and add the optional fields into it with if statements. However, it doesn't work: the DAG does not pick up the values of these three optional fields.
def create_dag(name, config):
    # config is a dict generated from the json config file
    optional_fields = {
        'owner': config['owner']
    }

    if 'on_failure_callback' in config:
        optional_fields['on_failure_callback'] = partial(xxx(config['on_failure_callback']))
    if 'is_paused_upon_creation' in config:
        optional_fields['is_paused_upon_creation'] = config['is_paused_upon_creation']

    dag = DAG(
        dag_id=name,
        start_date=datetime.strptime(config['start_date'], '%Y-%m-%d'),
        schedule_interval=config['schedule_interval'],
        default_args=optional_fields
    )
Then, I tried solution_2 with **optional_fields, but got an error: TypeError: __init__() got an unexpected keyword argument 'owner'.
dag = DAG(
    dag_id=name,
    start_date=datetime.strptime(config['start_date'], '%Y-%m-%d'),
    schedule_interval=config['schedule_interval'],
    **optional_fields
)
Then solution_3 works, as follows.
def create_dag(name, config):
    # config is a dict generated from the json config file
    default_args = {
        'owner': config['owner']
    }
    optional_fields = {}

    if 'on_failure_callback' in config:
        optional_fields['on_failure_callback'] = partial(xxx(config['on_failure_callback']))
    if 'is_paused_upon_creation' in config:
        optional_fields['is_paused_upon_creation'] = config['is_paused_upon_creation']

    dag = DAG(
        dag_id=name,
        start_date=datetime.strptime(config['start_date'], '%Y-%m-%d'),
        schedule_interval=config['schedule_interval'],
        default_args=default_args,
        **optional_fields
    )
However, I'm confused: 1) which fields should be set in optional_fields vs. default_args? 2) Is there any other way to achieve this?
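As a rough rule of thumb (a sketch under the assumptions of the config dict above, with illustrative names): default_args holds per-task defaults such as owner and on_failure_callback, while is_paused_upon_creation and catchup are DAG()-level constructor arguments, so they belong among the extra keyword arguments rather than in default_args. A hypothetical helper sketching that split:

# A sketch only, not the original script: split the config into task-level
# defaults and DAG-level keyword arguments before calling DAG().
def build_dag_kwargs(config):
    default_args = {'owner': config['owner']}
    if 'on_failure_callback' in config:
        # Task-level callback: goes into default_args
        # (mapped to a callable in the real script).
        default_args['on_failure_callback'] = config['on_failure_callback']

    dag_kwargs = {}
    for key in ('is_paused_upon_creation', 'catchup'):
        if key in config:
            # DAG-level settings: passed straight to the DAG() constructor.
            dag_kwargs[key] = config[key]

    return default_args, dag_kwargs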
In Airflow I am creating branches with different operators in a for loop; my code looks like this:
for table in ['messages', 'conversations']:
    # Operator1 has operator1.task_id = 'operator1_{}'.format(table)
    # Operator1 does kwargs['ti'].xcom_push(key='file_name', value='y')
    # Operator2 is a BashOperator that needs to run:
    bash_command = "echo {{ ti.xcom_pull(task_ids='operator1_{}', key='file_name') }}".format(table)

    Operator1 >> Operator2
But in the UI the commands are rendered like this:
echo { ti.xcom_pull(task_ids='operator1_messages', key='file_name') }
echo { ti.xcom_pull(task_ids='operator1_conversations', key='file_name') }
How should I write the bash_command so that Airflow interprets the template correctly?
If I write directly
bash_command = "echo {{ ti.xcom_pull(task_ids='operator1_messages', key='file_name') }}"
it works but I want to create this command from a for loop.
Thanks!
It's doing this because the .format(table) call on your bash command treats the doubled braces as escaped literals, so {{ and }} are reduced to { and }. You may be able to fix this with the following instead:
bash_command = "echo {{ ti.xcom_pull(task_ids='operator1_" + table + "', key='file_name') }}"
Whether this is the best way to do it is probably another question.
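A minimal sketch of how that fix slots into the loop from the question (the dag object, operator names, and the xcom_push side are assumed from the question):

from airflow.operators.bash import BashOperator

for table in ['messages', 'conversations']:
    # Plain string concatenation leaves the {{ ... }} intact, so Jinja renders
    # the xcom_pull at runtime instead of .format() stripping the braces.
    echo_file_name = BashOperator(
        task_id=f'operator2_{table}',
        bash_command=(
            "echo {{ ti.xcom_pull(task_ids='operator1_" + table + "', key='file_name') }}"
        ),
        dag=dag,
    )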
Is it possible to parse JSON string inside an airflow template?
I have an HttpSensor that monitors a job via a REST API, but the job id is in the response of the upstream task, which has xcom_push set to True.
I would like to do something like the following; however, this code gives the error jinja2.exceptions.UndefinedError: 'json' is undefined
t1 = SimpleHttpOperator(
    http_conn_id="s1",
    task_id="job",
    endpoint="some_url",
    method='POST',
    data=json.dumps({"foo": "bar"}),
    xcom_push=True,
    dag=dag,
)

t2 = HttpSensor(
    http_conn_id="s1",
    task_id="finish_job",
    endpoint="job/{{ json.loads(ti.xcom_pull(\"job\")).jobId }}",
    response_check=lambda response: True if response.json()['state'] == "complete" else False,
    poke_interval=5,
    dag=dag,
)

t2.set_upstream(t1)
You can add a custom Jinja filter to your DAG with the parameter user_defined_filters to parse the json.
a dictionary of filters that will be exposed in your jinja templates. For example, passing dict(hello=lambda name: 'Hello %s' % name) to this argument allows you to {{ 'world' | hello }} in all jinja templates related to this DAG.
dag = DAG(
    ...
    user_defined_filters={'fromjson': lambda s: json.loads(s)},
)

t1 = SimpleHttpOperator(
    task_id='job',
    xcom_push=True,
    ...
)

t2 = HttpSensor(
    endpoint='job/{{ (ti.xcom_pull("job") | fromjson)["jobId"] }}',
    ...
)
However, it may be cleaner to just write your own custom JsonHttpOperator plugin (or add a flag to SimpleHttpOperator) that parses the JSON before returning, so that you can directly reference {{ ti.xcom_pull("job")["jobId"] }} in the template.
class JsonHttpOperator(SimpleHttpOperator):

    def execute(self, context):
        text = super(JsonHttpOperator, self).execute(context)
        return json.loads(text)
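For example, a brief sketch of how the sensor from the question would look once t1 is a JsonHttpOperator (connection id, endpoint, and imports carried over from the question's snippet):

# Hypothetical sketch: with t1 switched to JsonHttpOperator (defined above),
# the XCom already holds a parsed dict, so the sensor template can index it
# directly instead of calling json.loads inside Jinja.
t2 = HttpSensor(
    http_conn_id="s1",
    task_id="finish_job",
    endpoint='job/{{ ti.xcom_pull("job")["jobId"] }}',
    response_check=lambda response: response.json()["state"] == "complete",
    poke_interval=5,
    dag=dag,
)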
Alternatively, it is also possible to add the json module to the template context via user_defined_macros, as shown below, and json will then be available for use inside the template. However, it is probably a better idea to create a plugin like Daniel said.
dag = DAG(
    'dagname',
    default_args=default_args,
    schedule_interval="@once",
    user_defined_macros={
        'json': json,
    },
)
then
finish_job = HttpSensor(
    task_id="finish_job",
    endpoint="kue/job/{{ json.loads(ti.xcom_pull('job'))['jobId'] }}",
    response_check=lambda response: True if response.json()['state'] == "complete" else False,
    poke_interval=5,
    dag=dag,
)