How to create a DAG from a task in Airflow
I have a requirement where there is a parent DAG with only one task, which creates certain parameters (not fixed). Let's call them params1, params2 and params3. Now I want to create three DAGs from the task in the parent DAG, with the params available in the context of each task within those DAGs. I went through the following link on creating DAGs dynamically and tried it -
https://airflow.incubator.apache.org/faq.html#how-can-i-create-dags-dynamically
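For context, the pattern from that FAQ builds the DAG objects at module (top) level and registers each one in globals() so the scheduler can pick them up. A minimal sketch of that pattern, with placeholder dag ids and a dummy task (not my actual job), looks like this:

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

def create_dag(dag_id):
    # Build one DAG object per generated dag id.
    default_args = {
        'owner': 'airflow',
        'start_date': datetime(2017, 3, 23),
        'retry_delay': timedelta(minutes=2),
    }
    dag = DAG(dag_id, schedule_interval='@once', default_args=default_args)
    DummyOperator(task_id='noop', dag=dag)
    return dag

# This loop runs every time the scheduler parses the file, so the generated
# DAGs always exist in the DagBag before anything tries to trigger them.
for i in range(1, 4):
    dag_id = 'child_dag_id' + str(i)
    globals()[dag_id] = create_dag(dag_id)

The difference in my case is that I am trying to do the same thing from inside a running task, because the parameters are only known at run time. My actual DAG file is below: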
class ParentBigquerySql(object):
    def __init__(self):
        pass

    def run(self, **context):
        logging.info('Running job')
        batch_id = 100
        #parent_sql = '''SELECT max(run_start_date) AS run_start_date,
        #                       max(run_end_date) AS run_end_date
        #                FROM `vintel_rel_2_0_staging_westfield.in_venue_batch_dates_daily`'''
        parent_sql = '''SELECT run_start_date, run_end_date
                        from vintel_rel_2_0_staging_westfield.in_venue_batch_dates_daily
                        order by 1, 2'''
        params = self.get_params(batch_id, parent_sql)
        XcomManager.push_query_params(context, params)
        return params

    def get_params(self, batch_id, parent_sql):
        batch_id = str(batch_id)
        result = BigQueryManager.read_query_to_table(parent_sql)
        t_list = []
        if result and type(result) is not list and result.error_result:
            #LogManager.info("Error in running the parent jobs - %s." % (result.error_result))
            #LogManager.info("Not populating cache... ")
            pass
        elif len(result) > 0:
            for row in result:
                if len(row) > 0:
                    run_start_date = row[0]
                    run_end_date = row[1]
                    if run_start_date and run_end_date:
                        t_list.append({'min_date': run_start_date, 'max_date': run_end_date})
        params = {}
        params['date_range'] = t_list
        return params
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2017, 3, 23),
    'retries': 1,
    'provide_context': True,
    'retry_delay': timedelta(minutes=2),
}

dag = DAG('parent_dynamic_job_dag',  # give the dag a name
          schedule_interval='@once',
          default_args=default_args
          )
def pull_child11(**context):
    logging.info(" Date range " + str(context['date_range']))

def conditionally_trigger(context, dag_run_obj):
    return dag_run_obj

def create_dag_from_task(**context):
    job = ParentBigquerySql()
    job.run(**context)
    logging.info("Context data")
    logging.info(context)
    params = XcomManager.pull_query_params(context)
    logging.info("Xcom parameters: " + str(params))
    tl = []
    counter = 1
    for d1 in params['date_range']:
        dyn_dag_id = 'child_dag_id' + str(counter)
        dag_args = {
            'owner': 'airflow',
            'depends_on_past': False,
            'start_date': context['execution_date'],
            'execution_date': context['execution_date'],
            'retries': 1,
            'provide_context': True,
            'retry_delay': timedelta(minutes=2),
        }
        dyn_dag = DAG(dyn_dag_id,  # give the dag a name
                      schedule_interval='@once',
                      default_args=dag_args
                      )
        t1 = PythonOperator(
            task_id='child' + str(counter),
            dag=dyn_dag,
            provide_context=True,
            python_callable=pull_child11,
            op_kwargs={'dag_id': 10, 'date_range': d1}
        )
        t2 = TriggerDagRunOperator(task_id='test_trigger_dag',
                                   trigger_dag_id='child_dag_id' + str(counter + 1),
                                   python_callable=conditionally_trigger,
                                   dag=dyn_dag)
        t1.set_downstream(t2)
        logging.info("Updating globals for the dag " + dyn_dag_id)
        #trigger_op.execute(context)
        globals()[dyn_dag_id] = dyn_dag  # Assign DAG objects to the global namespace
        if counter > 2:
            break
        counter = counter + 1

push1 = PythonOperator(
    task_id='100-Parent',
    dag=dag,
    provide_context=True,
    python_callable=create_dag_from_task,
    op_kwargs={'dag_id': 100})

push11 = PythonOperator(
    task_id='101-Child',
    dag=dag,
    provide_context=True,
    python_callable=pull_child11,
    op_kwargs={'dag_id': 100, 'date_range': {'start_date': 'temp_start_date', 'end_date': 'temp_end_date'}})

t2 = TriggerDagRunOperator(task_id='test_trigger_dag',
                           trigger_dag_id='child_dag_id1',
                           python_callable=conditionally_trigger,
                           dag=dag)

push1.set_downstream(push11)
push11.set_downstream(t2)
I am getting the following error -
[2018-05-01 09:24:27,764] {__init__.py:45} INFO - Using executor SequentialExecutor
[2018-05-01 09:24:27,875] {models.py:189} INFO - Filling up the DagBag from /mnt/test_project /airflow/dags
[2018-05-01 09:25:02,074] {models.py:1197} INFO - Dependencies all met for <TaskInstance: parent_dynamic_job_dag.test_trigger_dag 2018-04-23 00:00:00 [up_for_retry]>
[2018-05-01 09:25:02,081] {base_executor.py:49} INFO - Adding to queue: airflow run parent_dynamic_job_dag test_trigger_dag 2018-04-23T00:00:00 --local -sd DAGS_FOLDER/test_dynamic_parent_child.py
[2018-05-01 09:25:07,003] {sequential_executor.py:40} INFO - Executing command: airflow run parent_dynamic_job_dag test_trigger_dag 2018-04-23T00:00:00 --local -sd DAGS_FOLDER/test_dynamic_parent_child.py
[2018-05-01 09:25:08,235] {__init__.py:45} INFO - Using executor SequentialExecutor
[2018-05-01 09:25:08,431] {models.py:189} INFO - Filling up the DagBag from /mnt/test_project /airflow/dags/test_dynamic_parent_child.py
[2018-05-01 09:26:44,207] {base_task_runner.py:115} INFO - Running: ['bash', '-c', u'airflow run parent_dynamic_job_dag test_trigger_dag 2018-04-23T00:00:00 --job_id 178 --raw -sd DAGS_FOLDER/test_dynamic_parent_child.py']
[2018-05-01 09:26:45,243] {base_task_runner.py:98} INFO - Subtask: [2018-05-01 09:26:45,242] {__init__.py:45} INFO - Using executor SequentialExecutor
[2018-05-01 09:26:45,416] {base_task_runner.py:98} INFO - Subtask: [2018-05-01 09:26:45,415] {models.py:189} INFO - Filling up the DagBag from /mnt/test_project /airflow/dags/test_dynamic_parent_child.py
[2018-05-01 09:27:49,798] {base_task_runner.py:98} INFO - Subtask: [2018-05-01 09:27:49,797] {models.py:189} INFO - Filling up the DagBag from /mnt/test_project /airflow/dags
[2018-05-01 09:27:50,108] {base_task_runner.py:98} INFO - Subtask: Traceback (most recent call last):
[2018-05-01 09:27:50,108] {base_task_runner.py:98} INFO - Subtask: File "/Users/manishz/anaconda2/bin/airflow", line 27, in <module>
[2018-05-01 09:27:50,109] {base_task_runner.py:98} INFO - Subtask: args.func(args)
[2018-05-01 09:27:50,109] {base_task_runner.py:98} INFO - Subtask: File "/Users/manishz/anaconda2/lib/python2.7/site-packages/airflow/bin/cli.py", line 392, in run
[2018-05-01 09:27:50,110] {base_task_runner.py:98} INFO - Subtask: pool=args.pool,
[2018-05-01 09:27:50,110] {base_task_runner.py:98} INFO - Subtask: File "/Users/manishz/anaconda2/lib/python2.7/site-packages/airflow/utils/db.py", line 50, in wrapper
[2018-05-01 09:27:50,110] {base_task_runner.py:98} INFO - Subtask: result = func(*args, **kwargs)
[2018-05-01 09:27:50,111] {base_task_runner.py:98} INFO - Subtask: File "/Users/manishz/anaconda2/lib/python2.7/site-packages/airflow/models.py", line 1493, in _run_raw_task
[2018-05-01 09:27:50,111] {base_task_runner.py:98} INFO - Subtask: result = task_copy.execute(context=context)
[2018-05-01 09:27:50,112] {base_task_runner.py:98} INFO - Subtask: File "/Users/manishz/anaconda2/lib/python2.7/site-packages/airflow/operators/dagrun_operator.py", line 67, in execute
[2018-05-01 09:27:50,112] {base_task_runner.py:98} INFO - Subtask: dr = trigger_dag.create_dagrun(
[2018-05-01 09:27:50,112] {base_task_runner.py:98} INFO - Subtask: AttributeError: 'NoneType' object has no attribute 'create_dagrun'
[2018-05-01 09:28:14,407] {jobs.py:2521} INFO - Task exited with return code 1
[2018-05-01 09:28:14,569] {jobs.py:1959} ERROR - Task instance <TaskInstance: parent_dynamic_job_dag.test_trigger_dag 2018-04-23 00:00:00 [failed]> failed
[2018-05-01 09:28:14,573] {models.py:4584} INFO - Updating state for <DagRun parent_dynamic_job_dag @ 2018-04-23 00:00:00: backfill_2018-04-23T00:00:00, externally triggered: False> considering 3 task(s)
[2018-05-01 09:28:14,576] {models.py:4631} INFO - Marking run <DagRun parent_dynamic_job_dag @ 2018-04-23 00:00:00: backfill_2018-04-23T00:00:00, externally triggered: False> failed
[2018-05-01 09:28:14,600] {jobs.py:2125} INFO - [backfill progress] | finished run 1 of 1 | tasks waiting: 0 | succeeded: 2 | kicked_off: 0 | failed: 1 | skipped: 0 | deadlocked: 0 | not ready: 0
Traceback (most recent call last):
File "/Users/manishz/anaconda2/bin/airflow", line 27, in <module>
args.func(args)
File "/Users/manishz/anaconda2/lib/python2.7/site-packages/airflow/bin/cli.py", line 185, in backfill
delay_on_limit_secs=args.delay_on_limit)
File "/Users/manishz/anaconda2/lib/python2.7/site-packages/airflow/models.py", line 3724, in run
job.run()
File "/Users/manishz/anaconda2/lib/python2.7/site-packages/airflow/jobs.py", line 198, in run
self._execute()
File "/Users/manishz/anaconda2/lib/python2.7/site-packages/airflow/jobs.py", line 2441, in _execute
raise AirflowException(err)
airflow.exceptions.AirflowException: ---------------------------------------------------
Some task instances failed:
%s
But the above code is not running the dynamically created child DAGs. Any idea what is happening here?
Thanks in advance,
Manish
Related
Why does Airflow ExternalTaskSensor not work on the dag having PythonOperator?
Airflow version: v2.3.0, OS: Ubuntu 22.04

1. DAG structure that works (does not fail):

from datetime import datetime
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.dummy import DummyOperator
from airflow.sensors.external_task import ExternalTaskSensor

def temp_task():
    print(1)

a_dag = DAG(
    dag_id='a_dag1',
    default_args={'owner': 'brownbear'},
    start_date=datetime(2021, 11, 6, 0, 0, 0),
    schedule_interval="*/1 * * * *",
    tags=['external'],
    catchup=False
)
with a_dag:
    start = DummyOperator(task_id='wow1')
    end = DummyOperator(task_id='wow2')
    start >> end

b_dag = DAG(
    dag_id='a_dag2',
    default_args={'owner': 'brownbear'},
    start_date=datetime(2021, 11, 6, 0, 0, 0),
    schedule_interval='*/1 * * * *',
    tags=['external'],
    catchup=False
)
with b_dag:
    downstream_task1 = ExternalTaskSensor(
        task_id="downstream_task1",
        mode='reschedule',
        external_dag_id='a_dag1',
        external_task_id="wow2",
        timeout=600,
    )
    start2 = DummyOperator(task_id='start2')
    start2 >> downstream_task1

result:

2. DAG structure that fails - only the a_dag1 part changed (DummyOperator -> PythonOperator):

def temp_task():
    print(1)

a_dag = DAG(
    dag_id='a_dag1',
    default_args={'owner': 'brownbear'},
    start_date=datetime(2021, 11, 6, 0, 0, 0),
    schedule_interval="*/1 * * * *",
    tags=['external'],
    catchup=False
)
with a_dag:
    # Doesn't work...
    task1 = PythonOperator(task_id='wow1', python_callable=temp_task)
    task2 = PythonOperator(task_id='wow2', python_callable=temp_task)
    task1 >> task2

result:

log (Web UI): (Nothing appears...)

log (process):

airflow-scheduler_1 | [2022-08-19 01:12:02,238] {dag.py:2915} INFO - Setting next_dagrun for a_dag2 to 2022-08-19T01:12:00+00:00, run_after=2022-08-19T01:13:00+00:00
airflow-scheduler_1 | [2022-08-19 01:12:02,243] {dagrun.py:562} INFO - Marking run <DagRun a_dag1 @ 2022-08-19 01:11:00+00:00: scheduled__2022-08-19T01:11:00+00:00, externally triggered: False> successful
airflow-scheduler_1 | [2022-08-19 01:12:02,244] {dagrun.py:607} INFO - DagRun Finished: dag_id=a_dag1, execution_date=2022-08-19 01:11:00+00:00, run_id=scheduled__2022-08-19T01:11:00+00:00, run_start_date=2022-08-19 01:12:00.111676+00:00, run_end_date=2022-08-19 01:12:02.244042+00:00, run_duration=2.132366, state=success, external_trigger=False, run_type=scheduled, data_interval_start=2022-08-19 01:11:00+00:00, data_interval_end=2022-08-19 01:12:00+00:00, dag_hash=c05eae379e808492a6614dfda6985c68
airflow-scheduler_1 | [2022-08-19 01:12:02,248] {dag.py:2915} INFO - Setting next_dagrun for a_dag1 to 2022-08-19T01:12:00+00:00, run_after=2022-08-19T01:13:00+00:00
airflow-scheduler_1 | [2022-08-19 01:12:02,250] {dagrun.py:547} ERROR - Marking run <DagRun after_dag2 @ 2022-08-19 01:11:00+00:00: scheduled__2022-08-19T01:11:00+00:00, externally triggered: False> failed
airflow-scheduler_1 | [2022-08-19 01:12:02,251] {dagrun.py:607} INFO - DagRun Finished: dag_id=after_dag2, execution_date=2022-08-19 01:11:00+00:00, run_id=scheduled__2022-08-19T01:11:00+00:00, run_start_date=2022-08-19 01:12:00.112784+00:00, run_end_date=2022-08-19 01:12:02.251044+00:00, run_duration=2.13826, state=failed, external_trigger=False, run_type=scheduled, data_interval_start=2022-08-19 01:11:00+00:00, data_interval_end=2022-08-19 01:12:00+00:00, dag_hash=a87e590bae62e97d0798e39e15be9f55
airflow-scheduler_1 | [2022-08-19 01:12:02,255] {dag.py:2915} INFO - Setting next_dagrun for after_dag2 to 2022-08-19T01:12:00+00:00, run_after=2022-08-19T01:13:00+00:00
airflow-scheduler_1 | [2022-08-19 01:12:02,300] {scheduler_job.py:596} INFO - Executor reports execution of a_dag2.downstream_task1 run_id=scheduled__2022-08-19T01:11:00+00:00 exited with status queued for try_number 1
airflow-scheduler_1 | [2022-08-19 01:12:02,300] {scheduler_job.py:596} INFO - Executor reports execution of a_dag1.wow2 run_id=scheduled__2022-08-19T01:11:00+00:00 exited with status success for try_number 1
airflow-scheduler_1 | [2022-08-19 01:12:02,300] {scheduler_job.py:596} INFO - Executor reports execution of after_dag2.wait_for_task_2 run_id=scheduled__2022-08-19T01:11:00+00:00 exited with status success for try_number 1
airflow-scheduler_1 | [2022-08-19 01:12:02,303] {scheduler_job.py:630} INFO - Setting external_id for <TaskInstance: a_dag2.downstream_task1 scheduled__2022-08-19T01:11:00+00:00 [failed]> to d43c2bef-0f56-4556-b34e-af64158e0545
airflow-scheduler_1 | [2022-08-19 01:12:02,303] {scheduler_job.py:640} INFO - TaskInstance Finished: dag_id=after_dag2, task_id=wait_for_task_2, run_id=scheduled__2022-08-19T01:11:00+00:00, map_index=-1, run_start_date=2022-08-19 01:12:00.769613+00:00, run_end_date=2022-08-19 01:12:01.117768+00:00, run_duration=0.348155, state=failed, executor_state=success, try_number=1, max_tries=0, job_id=708751, pool=default_pool, queue=default, priority_weight=2, operator=ExternalTaskSensor, queued_dttm=2022-08-19 01:12:00.990796+00:00, queued_by_job_id=650667, pid=1357729
airflow-scheduler_1 | [2022-08-19 01:12:02,303] {scheduler_job.py:640} INFO - TaskInstance Finished: dag_id=a_dag1, task_id=wow2, run_id=scheduled__2022-08-19T01:11:00+00:00, map_index=-1, run_start_date=2022-08-19 01:12:01.506658+00:00, run_end_date=2022-08-19 01:12:01.658264+00:00, run_duration=0.151606, state=success, executor_state=success, try_number=1, max_tries=0, job_id=708754, pool=default_pool, queue=default, priority_weight=1, operator=PythonOperator, queued_dttm=2022-08-19 01:12:00.990796+00:00, queued_by_job_id=650667, pid=1357738
airflow-scheduler_1 | [2022-08-19 01:12:03,357] {scheduler_job.py:596} INFO - Executor reports execution of a_dag2.downstream_task1 run_id=scheduled__2022-08-19T01:11:00+00:00 exited with status success for try_number 1
airflow-scheduler_1 | [2022-08-19 01:12:03,359] {scheduler_job.py:640} INFO - TaskInstance Finished: dag_id=a_dag2, task_id=downstream_task1, run_id=scheduled__2022-08-19T01:11:00+00:00, map_index=-1, run_start_date=2022-08-19 01:12:00.871492+00:00, run_end_date=2022-08-19 01:12:01.205089+00:00, run_duration=0.333597, state=failed, executor_state=success, try_number=1, max_tries=0, job_id=708753, pool=default_pool, queue=default, priority_weight=1, operator=ExternalTaskSensor, queued_dttm=2022-08-19 01:12:01.082837+00:00, queued_by_job_id=650667, pid=1357731
airflow-scheduler_1 | [2022-08-19 01:12:05,177] {dagrun.py:562} INFO - Marking run <DagRun after_dag1 @ 2022-08-19 01:11:00+00:00: scheduled__2022-08-19T01:11:00+00:00, externally triggered: False> successful
airflow-scheduler_1 | [2022-08-19 01:12:05,177] {dagrun.py:607} INFO - DagRun Finished: dag_id=after_dag1, execution_date=2022-08-19 01:11:00+00:00, run_id=scheduled__2022-08-19T01:11:00+00:00, run_start_date=2022-08-19 01:12:00.112426+00:00, run_end_date=2022-08-19 01:12:05.177649+00:00, run_duration=5.065223, state=success, external_trigger=False, run_type=scheduled, data_interval_start=2022-08-19 01:11:00+00:00, data_interval_end=2022-08-19 01:12:00+00:00, dag_hash=edf550ca4ca8e90dfdeb6b1d2a06c789
airflow-scheduler_1 | [2022-08-19 01:12:05,182] {dag.py:2915} INFO - Setting next_dagrun for after_dag1 to 2022-08-19T01:12:00+00:00, run_after=2022-08-19T01:13:00+00:00
airflow-scheduler_1 | [2022-08-19 01:12:05,208] {scheduler_job.py:596} INFO - Executor reports execution of after_dag1.b_task run_id=scheduled__2022-08-19T01:11:00+00:00 exited with status success for try_number 1
airflow-scheduler_1 | [2022-08-19 01:12:05,211] {scheduler_job.py:640} INFO - TaskInstance Finished: dag_id=after_dag1, task_id=b_task, run_id=scheduled__2022-08-19T01:11:00+00:00, map_index=-1, run_start_date=2022-08-19 01:12:01.641734+00:00, run_end_date=2022-08-19 01:12:04.801520+00:00, run_duration=3.159786, state=success, executor_state=success, try_number=1, max_tries=0, job_id=708755, pool=default_pool, queue=default, priority_weight=1, operator=PythonOperator, queued_dttm=2022-08-19 01:12:01.082837+00:00, queued_by_job_id=650667, pid=1357739
I just tested your code with Airflow versions from 2.3.0 to 2.3.3; it didn't work with 2.3.0 but works normally with the other versions, so it seems there was a bug that was fixed in 2.3.1.
In Apache Airflow 1.10.12 - No module named 'httplib2'
I am getting the below error for a sample DAG I am trying to write. My Airflow is of the below configuration:

pip install apache-airflow[crypto,celery,postgres,hive,jdbc,mysql,ssh,docker,hdfs,redis,slack,webhdfs,httplib2]==1.10.12 --constraint /requirements-python3.7.txt

Error:

[2020-12-19 22:41:19,342] {dagbag.py:259} ERROR - Failed to import: /usr/local/airflow/dags/alert_dag.py
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/airflow/models/dagbag.py", line 256, in process_file
    m = imp.load_source(mod_name, filepath)
  File "/usr/lib/python3.7/imp.py", line 171, in load_source
    module = _load(spec)
  File "<frozen importlib._bootstrap>", line 696, in _load
  File "<frozen importlib._bootstrap>", line 677, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 728, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/usr/local/airflow/dags/alert_dag.py", line 6, in <module>
    from httplib2 import Http
ModuleNotFoundError: No module named 'httplib2'

Code:

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta
from json import dumps
from httplib2 import Http

default_args = {
    'start_date': datetime(2020, 12, 19, 17, 0, 0),
    'owner': 'Airflow'
}

def on_success(dict):
    print('on_success_call_back function')
    print(dict)

def on_failure(dict):
    print('on_failure_call_back function')
    # """Hangouts Chat incoming webhook quickstart."""
    # url = 'https://chat.googleapis.com/v1/spaces/XXXX'
    # bot_message = {'text': 'alert_dag Failed'}
    # message_headers = {'Content-Type': 'application/json; charset=UTF-8'}
    # http_obj = Http()
    # response = http_obj.request(
    #     uri=url,
    #     method='POST',
    #     headers=message_headers,
    #     body=dumps(bot_message),
    # )

#on_success_call_back=on_success
with DAG(dag_id='alert_dag', schedule_interval="*/5 * * * *", default_args=default_args, catchup=True,
         dagrun_timeout=timedelta(seconds=25), on_failure_callback=on_failure) as dag:
    # Task 1
    t1 = BashOperator(task_id='t1', bash_command="exit 0")
    # Task 2
    t2 = BashOperator(task_id='t2', bash_command="echo 'second task'")
    t1 >> t2
How to eliminate an error when generating an Airflow DAG
Creating a DAG, I am getting this error:

root/.venv/lib/python3.6/site-packages/airflow/models/dag.py:1342: PendingDeprecationWarning: The requested task could not be added to the DAG because a task with task_id create_tag_template_field_result is already in the DAG. Starting in Airflow 2.0, trying to overwrite a task will raise an exception.

default_args = {
    'owner': 'airflow',
    'depend_on_past': False,
    'start_date': datetime(2018, 11, 5, 10, 00, 00),
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

def get_activated_sources():
    request = "SELECT * FROM users"
    pg_hook = PostgresHook(postgre_conn_id="postgres", schema="postgres")
    connection = pg_hook.get_conn()
    cursor = connection.cursor()
    cursor.execute(request)
    sources = cursor.fetchall
    for source in sources:
        print("Source: {0}} activated {1}".format(source[0], source[1]))
    return sources

with DAG('hook_dag', default_args=default_args, schedule_interval='@once', catchup=False) as dag:
    start_task = DummyOperator(task_id='start_task')
    hook_task = PythonOperator(task_id='hook_task', python_callable=get_activated_sources)
    start_task >> hook_task

How do I solve what is wrong? Please help me.
Apache Airflow - How to set execution_date with TriggerDagRunOperator so the target DAG uses the current execution_date
I want to set the execution_date in a triggered DAG. I'm using the TriggerDagRunOperator; this operator has the parameter execution_date and I want to set it to the current execution_date.

def conditionally_trigger(context, dag_run_obj):
    """This function decides whether or not to trigger the remote DAG"""
    pp = pprint.PrettyPrinter(indent=4)
    c_p = Variable.get("VAR2") == Variable.get("VAR1") and Variable.get("VAR3") == "1"
    print("Controller DAG : conditionally_trigger = {}".format(c_p))
    if Variable.get("VAR2") == Variable.get("VAR1") and Variable.get("VAR3") == "1":
        pp.pprint(dag_run_obj.payload)
        return dag_run_obj

default_args = {
    'owner': 'pepito',
    'depends_on_past': False,
    'retries': 2,
    'start_date': datetime(2018, 12, 1, 0, 0),
    'email': ['xxxx@yyyyy.net'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG(
    'DAG_1',
    default_args=default_args,
    schedule_interval="0 12 * * 1",
    dagrun_timeout=timedelta(hours=22),
    max_active_runs=1,
    catchup=False
)

trigger_dag_2 = TriggerDagRunOperator(
    task_id='trigger_dag_2',
    trigger_dag_id="DAG_2",
    python_callable=conditionally_trigger,
    execution_date={{ execution_date }},
    dag=dag,
    pool='a_roz'
)

But I obtain the next error:

name 'execution_date' is not defined

If I set execution_date={{ 'execution_date' }} or execution_date='{{ execution_date }}', I obtain:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/airflow/models.py", line 1659, in _run_raw_task
    result = task_copy.execute(context=context)
  File "/usr/local/lib/python3.6/site-packages/airflow/operators/dagrun_operator.py", line 78, in execute
    replace_microseconds=False)
  File "/usr/local/lib/python3.6/site-packages/airflow/api/common/experimental/trigger_dag.py", line 98, in trigger_dag
    replace_microseconds=replace_microseconds,
  File "/usr/local/lib/python3.6/site-packages/airflow/api/common/experimental/trigger_dag.py", line 45, in _trigger_dag
    assert timezone.is_localized(execution_date)
  File "/usr/local/lib/python3.6/site-packages/airflow/utils/timezone.py", line 38, in is_localized
    return value.utcoffset() is not None
AttributeError: 'str' object has no attribute 'utcoffset'

Does anyone know how I can set the execution date for DAG_2 so that it is equal to DAG_1's? This question is different from "airflow TriggerDagRunOperator how to change the execution date" because that post did not explain how to send the execution_date through the TriggerDagRunOperator; it only said that the possibility exists. https://stackoverflow.com/a/49442868/10269204
It was not templated previously, but it is templated now with this commit, so you can try your code with a newer version of Airflow. Additionally, for a hardcoded execution_date, you need to set tzinfo:

from datetime import datetime, timezone

execution_date=datetime(2019, 3, 27, tzinfo=timezone.utc)
# or:
execution_date=datetime.now().replace(tzinfo=timezone.utc)
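For completeness, with a version of Airflow that already includes that commit, the templated form of the operator from the question would look roughly like this (a sketch only; it reuses the dag ids from the question and leaves out the conditional callable):

from datetime import datetime

from airflow import DAG
from airflow.operators.dagrun_operator import TriggerDagRunOperator

dag = DAG(
    'DAG_1',
    start_date=datetime(2018, 12, 1),
    schedule_interval='0 12 * * 1',
)

# execution_date is a templated field here, so the Jinja string is rendered
# to DAG_1's own execution_date when the task runs, and DAG_2 is triggered
# with that same date.
trigger_dag_2 = TriggerDagRunOperator(
    task_id='trigger_dag_2',
    trigger_dag_id='DAG_2',
    execution_date='{{ execution_date }}',
    dag=dag,
)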
PySpark map datetime to DoW
I'm trying to map a column 'eventtimestamp' to its day of week with the following function:

from datetime import datetime
import calendar
from pyspark.sql.functions import UserDefinedFunction as udf

def toWeekDay(x):
    v = int(datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S').strftime('%w'))
    if v == 0:
        v = 6
    else:
        v = v - 1
    return calendar.day_name[v]

and for my df I am trying to create a new column dow with a UDF:

udf_toWeekDay = udf(lambda x: toWeekDay(x), StringType())
df = df.withColumn("dow", udf_toWeekDay('eventtimestamp'))

Yet I'm getting an error I do not understand at all. Firstly, it was complaining about passing datetime.datetime into strptime instead of a string, so I parsed to str, and now I don't have a clue what's wrong.

Traceback (most recent call last):
  File "/tmp/zeppelin_pyspark-9040214714346906648.py", line 267, in <module>
    raise Exception(traceback.format_exc())
Exception: Traceback (most recent call last):
  File "/tmp/zeppelin_pyspark-9040214714346906648.py", line 260, in <module>
    exec(code)
  File "<stdin>", line 10, in <module>
  File "/usr/lib/spark/python/pyspark/sql/dataframe.py", line 429, in take
    return self.limit(num).collect()
  File "/usr/lib/spark/python/pyspark/sql/dataframe.py", line 391, in collect
    port = self._jdf.collectToPython()
  File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/lib/spark/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
    format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o6250.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1107.0 failed 4 times, most recent failure: Lost task 0.3 in stage 1107.0 (TID 63757, ip-172-31-27-113.eu-west-1.compute.internal, executor 819): org.apache.spark.api.python.PythonException: Traceback (most recent call last):

Thanks a lot for clues!
We can use date_format to get the day of week:

df = df.withColumn("dow", date_format(df['eventtimestamp'], 'EEEE'))
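A self-contained version of the same idea, assuming eventtimestamp holds strings (or timestamps) in the default 'yyyy-MM-dd HH:mm:ss' format (the sample row here is made up):

from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2018-05-01 09:24:27",)], ["eventtimestamp"])

# 'EEEE' is the full day-of-week pattern, e.g. 'Tuesday'; Spark casts the
# string to a timestamp internally because it is in the default format.
df = df.withColumn("dow", date_format(df["eventtimestamp"], "EEEE"))
df.show()

This avoids the Python UDF entirely, so the worker-side traceback from the UDF goes away.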