There are parts of my DAG that generate lists which I cannot break up into individual tasks to be processed individually downstream.
Here's a pseudo example:
def push(**kwargs):
    """Pushes an XCom without a specific target"""
    for n in range(10):
        kwargs['ti'].xcom_push(key='vals', value=n)

def puller(**kwargs):
    ti = kwargs['ti']
    v1 = ti.xcom_pull(key='vals', task_ids='push')
    print(v1)

push = python_operator.PythonOperator(
    task_id='push',
    python_callable=push,
    provide_context=True
)

puller = python_operator.PythonOperator(
    task_id='puller',
    python_callable=puller,
    provide_context=True
)
It appears that xcom_push only keeps the last value rather than building up a list. Therefore, I'd have to collect the values in push into a list and then use a for loop in the puller to process each item individually.
I'm completely fine doing that, but it seems counter-intuitive for batch jobs.
How would I have the puller pull one of the 10 values generated by push?
Between DAG runs, you shouldn't be changing the structure of the DAG, so your puller is either one task, meant to pull all the values, or 10 tasks each meant to pull one of the values.
Here's how you'd push all 10 values with xcom:
def push(**kwargs):
    """Pushes an XCom without a specific target"""
    final_output = []
    for n in range(10):
        # doing work
        final_output.append(n)
    kwargs['ti'].xcom_push(key='vals', value=final_output)

push = python_operator.PythonOperator(
    task_id='push',
    python_callable=push,
    provide_context=True
)
And then you can either pull all 10 of them like this:

def puller(**kwargs):
    ti = kwargs['ti']
    v1 = ti.xcom_pull(key='vals', task_ids='push')
    print(v1)  # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

puller = python_operator.PythonOperator(
    task_id='puller',
    python_callable=puller,
    provide_context=True
)
Or one value for each of ten tasks:
def puller(index=0, **kwargs):
    ti = kwargs['ti']
    v1 = ti.xcom_pull(key='vals', task_ids='push')[index]
    print(v1)

ten_ops = [python_operator.PythonOperator(
    task_id=f'puller_{n}',
    python_callable=puller,
    provide_context=True,
    op_kwargs={'index': n},
) for n in range(10)]
I hope that helps, unless I misunderstood the question.
I tried to run the code for DiffusionInst, which is based on Detectron2 (source code: https://github.com/chenhaoxing/DiffusionInst). During training, my Python process keeps getting killed (at 10000-20000 iterations, which is insufficient for DiffusionInst training).
I only rewrote the dataloader code in order to adapt it to my own dataset.
My new dataloader code:
class DiffusionInstDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by DiffusionInst.

    The callable currently does the following:
    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors
    """

    def __init__(self, cfg, is_train=True):
        if cfg.INPUT.CROP.ENABLED and is_train:
            self.crop_gen = [
                # T.ResizeShortestEdge([400, 500, 600], sample_style="choice"),
                T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE),
            ]
        else:
            self.crop_gen = None

        self.tfm_gens = build_transform_gen(cfg, is_train)
        logging.getLogger(__name__).info(
            "Full TransformGens used in training: {}, crop: {}".format(str(self.tfm_gens), str(self.crop_gen))
        )

        self.img_format = cfg.INPUT.FORMAT
        self.is_train = is_train

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        # image = utils.read_image(dataset_dict["file_name"], format=self.img_format)

        ## crop roi
        '''lst = dataset_dict['file_name'].split('-')
        image = sitk.ReadImage('-'.join(lst[:-2]))
        image = sitk.GetArrayFromImage(image)
        above, below = int(lst[-2]), int(lst[-1])
        image = image[:, above:below, :]'''

        ## no crop roi
        image = sitk.ReadImage(dataset_dict["file_name"], sitk.sitkFloat32)
        image = sitk.GetArrayFromImage(image)
        # print('**********************', image.shape, '************************')
        image = (image - image.min()) / (image.max() - image.min()) * 255
        # print(image.dtype)
        image = image.transpose(1, 2, 0).astype(np.uint8)
        image = np.repeat(image, 3, axis=2)
        # print(image.dtype)
        utils.check_image_size(dataset_dict, image)

        # origshape = image.shape
        if self.crop_gen is None:
            image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        else:
            image, transforms = T.apply_transform_gens(
                self.tfm_gens + self.crop_gen, image
            )
        # print('orig', origshape, '\t\tresized', image.shape)
        image_shape = image.shape[:2]  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        del image
        gc.collect()

        if not self.is_train:
            # USER: Modify this if you want to keep them for some reason.
            dataset_dict.pop("annotations", None)
            return dataset_dict

        if "annotations" in dataset_dict:
            # USER: Modify this if you want to keep them for some reason.
            # import pdb; pdb.set_trace()
            for anno in dataset_dict["annotations"]:
                # anno.pop("segmentation", None)
                anno.pop("keypoints", None)

            # USER: Implement additional transformations if you have other types of data
            annos = [
                utils.transform_instance_annotations(obj, transforms, image_shape)
                for obj in dataset_dict.pop("annotations")
                if obj.get("iscrowd", 0) == 0
            ]
            instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask")
            dataset_dict["instances"] = utils.filter_empty_instances(instances)
            del instances
            gc.collect()

        return dataset_dict
And the information about the oom-killer:
[2599547.303018] python invoked oom-killer: gfp_mask=0x24000c0, order=0, oom_score_adj=995
[2599547.303084] [<ffffffff8119bfae>] oom_kill_process+0x1fe/0x3c0
[2599547.303133] Task in /kubepods/burstable/podd09a5032-8b07-11ed-bb60-ac1f6b9ec91e/8b4a8d5c2c1a082f93b1610173beb70bbc19fb1a1c2e28150d2d912ed9b95b10 killed as a result of limit of /kubepods/burstable/podd09a5032-8b07-11ed-bb60-ac1f6b9ec91e
[2599547.305957] Memory cgroup out of memory: Kill process 1041771 (python) score 1198 or sacrifice child
[2599547.307810] Killed process 1041771 (python) total-vm:36436532kB, anon-rss:10288264kB, file-rss:104888kB
[2599718.702250] python invoked oom-killer: gfp_mask=0x24000c0, order=0, oom_score_adj=995
[2599718.702299] [<ffffffff8119bfae>] oom_kill_process+0x1fe/0x3c0
[2599718.702333] Task in /kubepods/burstable/podd09a5032-8b07-11ed-bb60-ac1f6b9ec91e/8b4a8d5c2c1a082f93b1610173beb70bbc19fb1a1c2e28150d2d912ed9b95b10 killed as a result of limit of /kubepods/burstable/podd09a5032-8b07-11ed-bb60-ac1f6b9ec91e
I set IMS_PER_BATCH to 1 and used a dataset containing only 1 image, but the OOM problem still occurred.
What should I do to prevent the OOM problem?
I'm trying to write two tasks that have no knowledge of the other. One task returns a dict (via XComArg) and I want to pass a single property of that object to the next task. If I pass the entire XComArg object, its value is populated as expected. But selecting a single property results in a None.
@dag(...)
def _dag():
    @task
    def A(**ctx):
        # ...
        return {'a': 42, 'b': 'B', 'c': 'C'}

    @task
    def B(a, _res, **ctx):
        print('A', a)       # >>> A None
        print('RES', _res)  # >>> RES {'a': 42, ...}

    res = A()
    B(res['a'], res)

dag = _dag()
Ideally, B doesn't know where the value for a comes from, nor how to get it. Yes, passing all of res and having B extract what it needs with res['a'] works, but my goal is loose coupling.
See example in https://airflow.apache.org/docs/apache-airflow/stable/tutorial_taskflow_api.html
You need to specify multiple_outputs=True in task A.
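For example, here is a minimal sketch of that pattern (assuming a recent Airflow 2.x TaskFlow API; decorator arguments such as schedule and start_date are illustrative and may differ in your version):

import pendulum
from airflow.decorators import dag, task

@dag(schedule=None, start_date=pendulum.datetime(2023, 1, 1), catchup=False)
def _dag():
    @task(multiple_outputs=True)  # each top-level dict key is pushed as its own XCom
    def A(**ctx):
        return {'a': 42, 'b': 'B', 'c': 'C'}

    @task
    def B(a, **ctx):
        print('A', a)  # >>> A 42

    res = A()
    B(res['a'])  # res['a'] now resolves to the XCom pushed under key 'a'

dag = _dag()

With multiple_outputs=True, subscripting the XComArg (res['a']) pulls the individual key instead of returning None, so B stays decoupled from where the value comes from.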
With reconfigurable model execution it is possible to resize the inputs and outputs of components. How are the connections updated when reconfigured outputs and inputs are connected?
In the example below the output c2.y and the input c3.y are resized at each model run. This input and output are supposed to be connected, as shown in the N2 chart. However, after the reconfiguration the connection size does not seem to be updated automatically, and it throws the following error:
ValueError: The source and target shapes do not match or are ambiguous for the connection 'c2.y' to 'c3.y'. Expected (1,) but got (2,).
I included 3 tests below: one with a promoted connection, one with an absolute connection, and a last one with reconfiguration but without the connection (which works).
A last resort would be to declare the connection in the parent group of the components, which I have not tried yet.
The tests:
Promoted connection
Absolute connection
No connection
Reconfigurable component classes and tests:
from __future__ import division
import logging
import numpy as np
import unittest

from openmdao.api import Problem, Group, IndepVarComp, ExplicitComponent
from openmdao.utils.assert_utils import assert_rel_error


class ReconfComp(ExplicitComponent):

    def initialize(self):
        self.size = 1
        self.counter = 0

    def reconfigure(self):
        logging.info('reconf started {}'.format(self.pathname))
        self.counter += 1
        logging.info('reconf ended {}'.format(self.pathname))
        if self.counter % 2 == 0:
            self.size += 1
            return True
        else:
            return False

    def setup(self):
        logging.info('setup started {}'.format(self.pathname))
        self.add_input('x', val=1.0)
        self.add_output('y', val=np.zeros(self.size))
        # All derivatives are defined.
        self.declare_partials(of='*', wrt='*')
        logging.info('setup ended {}'.format(self.pathname))

    def compute(self, inputs, outputs):
        logging.info('compute started {}'.format(self.pathname))
        outputs['y'] = 2 * inputs['x']
        logging.info('compute ended {}'.format(self.pathname))

    def compute_partials(self, inputs, jacobian):
        jacobian['y', 'x'] = 2 * np.ones((self.size, 1))


class ReconfComp2(ReconfComp):
    """The size of the y input changes in the same way as in ReconfComp."""

    def setup(self):
        logging.info('setup started {}'.format(self.pathname))
        self.add_input('y', val=np.zeros(self.size))
        self.add_output('f', val=np.zeros(self.size))
        # All derivatives are defined.
        self.declare_partials(of='*', wrt='*')
        logging.info('setup ended {}'.format(self.pathname))

    def compute(self, inputs, outputs):
        logging.info('compute started {}'.format(self.pathname))
        outputs['f'] = 2 * inputs['y']
        logging.info('compute ended {}'.format(self.pathname))

    def compute_partials(self, inputs, jacobian):
        jacobian['f', 'y'] = 2 * np.ones((self.size, 1))


class TestReconfConnections(unittest.TestCase):

    def test_reconf_comp_promoted_connections(self):
        p = Problem()
        p.model = Group()
        p.model.add_subsystem('c1', IndepVarComp('x', 1.0), promotes_outputs=['x'])
        p.model.add_subsystem('c2', ReconfComp(), promotes_inputs=['x'], promotes_outputs=['y'])
        p.model.add_subsystem('c3', ReconfComp2(), promotes_inputs=['y'],
                              promotes_outputs=['f'])
        p.setup()
        p['x'] = 3.

        # First run the model once; counter = 1, size of y = 1
        p.run_model()

        totals = p.compute_totals(wrt=['x'], of=['y'])
        assert_rel_error(self, p['x'], 3.0)
        assert_rel_error(self, p['y'], 6.0)
        assert_rel_error(self, totals['y', 'x'], [[2.0]])
        print(p['x'], p['y'], totals['y', 'x'].flatten())

        # Run the model again, which will trigger reconfiguration; counter = 2, size of y = 2
        p.run_model()  # FIXME Fails with ValueError

    def test_reconf_comp_connections(self):
        p = Problem()
        p.model = Group()
        p.model.add_subsystem('c1', IndepVarComp('x', 1.0), promotes_outputs=['x'])
        p.model.add_subsystem('c2', ReconfComp(), promotes_inputs=['x'])
        p.model.add_subsystem('c3', ReconfComp2(), promotes_outputs=['f'])
        p.model.connect('c2.y', 'c3.y')
        p.setup()
        p['x'] = 3.

        # First run the model once; counter = 1, size of y = 1
        p.run_model()

        # Run the model again, which will trigger reconfiguration; counter = 2, size of y = 2
        p.run_model()  # FIXME Fails with ValueError

    def test_reconf_comp_not_connected(self):
        p = Problem()
        p.model = Group()
        p.model.add_subsystem('c1', IndepVarComp('x', 1.0), promotes_outputs=['x'])
        p.model.add_subsystem('c2', ReconfComp(), promotes_inputs=['x'])
        p.model.add_subsystem('c3', ReconfComp2(), promotes_outputs=['f'])
        # c2.y not connected to c3.y
        p.setup()
        p['x'] = 3.

        # First run the model once; counter = 1, size of y = 1
        p.run_model()

        # Run the model again, which will trigger reconfiguration; counter = 2, size of y = 2
        fail, _, _ = p.run_model()
        self.assertFalse(fail)


if __name__ == '__main__':
    unittest.main()
UPDATE:
It seems that in Group._var_abs2meta only the source size is updated, but not the target's. The setup of the connections starts before the setup of the parent group or the setup of the other component is called.
UPDATE 2:
This happens with the default NonlinearRunOnce solver; with a NewtonSolver or NonlinearBlockGS there is no error, but the variable sizes also don't change.
As of OpenMDAO V2.5, reconfigurable model variables are not an officially supported feature in the framework. The bare bones of the capability have been in the code since that research was done, but it wasn't high-priority enough for us to finalize. A recent major refactor in V2.4 re-worked some underlying data structures and must have broken this functionality.
It is on our development priority list to get this working again, but it's not super high on that list. We focus development mainly on features that have a direct in-house application, and we don't have one of those yet.
If you could provide a decently complete set of tests for it, we could take a look at getting the functionality working.
I'm just learning how to do parallel computing in Julia. I'm using @sync @distributed at the start of a 3x nested for loop to parallelize things (see code at bottom). From the line println(errCmp[row, col]) I can watch all the elements of the array errCmp be printed out. E.g.
From worker 3: 2.351134946074191e9
From worker 4: 2.3500830193505473e9
From worker 5: 2.3502416529551845e9
From worker 2: 2.3509105625656652e9
From worker 3: 2.3508352842971106e9
From worker 4: 2.3497049296121807e9
From worker 5: 2.35048428351797e9
From worker 2: 2.350742582031195e9
From worker 3: 2.350616273660934e9
From worker 4: 2.349709546599313e9
However, when the function returns, errCmp is the array of zeros I pre-allocated at the beginning.
Am I missing some closing term to collect everything?
function optimizeDragCalc(df::DataFrame)
    paramGrid = [cd*AoM for cd = range(1e-3, stop = 0.01, length = 50), AoM = range(2e-4, stop = 0.0015, length = 50)]
    errCmp = zeros(size(paramGrid))
    # totalSize = size(paramGrid, 1) * size(paramGrid, 2) * size(df.time, 1)
    @sync @distributed for row = 1:size(paramGrid, 1)
        for col = 1:size(paramGrid, 2)
            # Run the propagation here
            BC = 1/paramGrid[row, col]
            slns, _ = propWholeTraj(df, BC)
            for time = 1:size(df.time, 1)
                errDF = propError(slns[time], df, time)
                errCmp[row, col] += sum(errDF.totalErr)
            end # time
            # println("row: ", row, " of ", size(paramGrid, 1), " col: ", col, " of ", size(paramGrid, 2))
            println(errCmp[row, col])
        end # col
    end # row
    # plot(heatmap(z = errCmp))
    return errCmp, paramGrid
end

errCmp, paramGrid = @time optimizeDragCalc(df)
You did not provide a minimal working example, but I guess that might be hard to do. So here is my MWE. Let us assume that we want to use Distributed to calculate the means of an Array's columns:
using Distributed
addprocs(2)
@everywhere using StatsBase

data = rand(1000, 2000)
res = zeros(2000)
@sync @distributed for col = 1:size(data)[2]
    res[col] = StatsBase.mean(data[:, col])
    # does not work!
    # ... because each worker writes to its own local copy of res,
    # which is never sent back to the master process!
end
In order to correct the above code you need to provide an aggregator function (I keep the example intentionally simplified - a further optimization is possible).
using Distributed
addprocs(2)
@everywhere using Distributed, StatsBase

data = rand(1000, 2000)

@everywhere function t2(d1, d2)
    append!(d1, d2)
    d1
end

res = @sync @distributed (t2) for col = 1:size(data)[2]
    [(myid(), col, StatsBase.mean(data[:, col]))]
end
Now let us see the output. We can see that some of the values have been calculated on worker 2 while others on worker 3:
julia> res
2000-element Array{Tuple{Int64,Int64,Float64},1}:
(2, 1, 0.49703681326230276)
(2, 2, 0.5035341367791002)
(2, 3, 0.5050607022354537)
⋮
(3, 1998, 0.4975699181976122)
(3, 1999, 0.5009498778934444)
(3, 2000, 0.499671315490524)
Further possible improvements/modifications:
use @spawnat to generate values at the remote processes (instead of generating them on the master process and sending them)
use a SharedArray - this lets data be shared automatically among the workers; in my experience it requires very careful programming
use ParallelDataTransfer.jl to send data among workers - very easy to use, but not efficient for a huge number of messages
always consider Julia's threading mechanism (in some scenarios it makes life easier - again, it depends on the problem)
I am using three-dimensional convolution links (with ConvolutionND) in my chain.
The forward computation runs smoothly (I checked the intermediate result shapes to be sure I understood the meaning of the parameters of convolution_nd correctly), but during the backward pass a CuDNNError is raised with the message CUDNN_STATUS_NOT_SUPPORTED.
The cover_all parameter of ConvolutionND has its default value of False, so from the docs I don't see what could be the cause of the error.
Here is how I defined one of the convolution layers:
self.conv1 = chainer.links.ConvolutionND(3, 1, 4, (3, 3, 3)).to_gpu(self.GPU_1_ID)
And the call stack is:

  File "chainer/function_node.py", line 548, in backward_accumulate
    gxs = self.backward(target_input_indexes, grad_outputs)
  File "chainer/functions/connection/convolution_nd.py", line 118, in backward
    gy, W, stride=self.stride, pad=self.pad, outsize=x_shape)
  File "chainer/functions/connection/deconvolution_nd.py", line 310, in deconvolution_nd
    y, = func.apply(args)
  File "chainer/function_node.py", line 258, in apply
    outputs = self.forward(in_data)
  File "chainer/functions/connection/deconvolution_nd.py", line 128, in forward
    return self._forward_cudnn(x, W, b)
  File "chainer/functions/connection/deconvolution_nd.py", line 105, in _forward_cudnn
    tensor_core=tensor_core)
  File "cupy/cudnn.pyx", line 881, in cupy.cudnn.convolution_backward_data
  File "cupy/cuda/cudnn.pyx", line 975, in cupy.cuda.cudnn.convolutionBackwardData_v3
  File "cupy/cuda/cudnn.pyx", line 461, in cupy.cuda.cudnn.check_status
cupy.cuda.cudnn.CuDNNError: CUDNN_STATUS_NOT_SUPPORTED
So are there special points to take care of when using ConvolutionND?
A failing example is, for instance:
import chainer
from chainer import functions as F
from chainer import links as L
from chainer.backends import cuda
import numpy as np
import cupy as cp

chainer.global_config.cudnn_deterministic = False

NB_MASKS = 60
NB_FCN = 3
NB_CLASS = 17


class MFEChain(chainer.Chain):
    """docstring for Wavelphasenet."""

    def __init__(self,
                 FCN_Dim,
                 gpu_ids=None):
        super(MFEChain, self).__init__()
        self.GPU_0_ID, self.GPU_1_ID = (0, 1) if gpu_ids is None else gpu_ids
        with self.init_scope():
            self.conv1 = chainer.links.ConvolutionND(3, 1, 4, (3, 3, 3)).to_gpu(
                self.GPU_1_ID
            )

    def __call__(self, inputs):
        ### Pad input ###
        processed_sequences = []
        for convolved in inputs:
            ## Transform to sequences)
            copy = convolved if self.GPU_0_ID == self.GPU_1_ID else F.copy(convolved, self.GPU_1_ID)
            processed_sequences.append(copy)

        reprocessed_sequences = []
        with cuda.get_device(self.GPU_1_ID):
            for convolved in processed_sequences:
                convolved = F.expand_dims(convolved, 0)
                convolved = F.expand_dims(convolved, 0)
                convolved = self.conv1(convolved)
                reprocessed_sequences.append(convolved)
            states = F.vstack(reprocessed_sequences)

        logits = states
        ret_logits = logits if self.GPU_0_ID == self.GPU_1_ID else F.copy(logits, self.GPU_0_ID)
        return ret_logits


def mfe_test():
    mfe = MFEChain(150)
    inputs = list(
        chainer.Variable(
            cp.random.randn(
                NB_MASKS,
                11,
                in_len,
                dtype=cp.float32
            )
        ) for in_len in [53248]
    )
    val = mfe(inputs)
    grad = cp.ones(val.shape, dtype=cp.float32)
    val.grad = grad
    val.backward()
    for i in inputs:
        print(i.grad)


if __name__ == "__main__":
    mfe_test()
cupy.cuda.cudnn.convolutionBackwardData_v3 is incompatible with some specific parameters, as described in an issue on the official GitHub.
Unfortunately, the issue only dealt with deconvolution_2d.py (not deconvolution_nd.py), so I guess the decision about whether cuDNN is used or not failed in your case.
You can check your parameters by confirming the following (see the sketch after this list):
check whether a dilation parameter (!= 1) or a group parameter (!= 1) is passed to the convolution
print chainer.config.cudnn_deterministic, configuration.config.autotune, and configuration.config.use_cudnn_tensor_core
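For instance, a minimal sketch that prints those settings (it only dumps the configuration flags named above and assumes a working Chainer installation):

import chainer
from chainer import configuration

# Dump the cuDNN-related configuration flags mentioned above
print(chainer.config.cudnn_deterministic)
print(configuration.config.autotune)
print(configuration.config.use_cudnn_tensor_core)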
Further support may be obtained by raising an issue on the official GitHub.
The code you showed is quite complicated.
To clarify the problem, the simpler code below would help.
from chainer import Variable, Chain
from chainer import links as L
from chainer import functions as F
import numpy as np
from six import print_

batch_size = 1
in_channel = 1
out_channel = 1


class MyLink(Chain):
    def __init__(self):
        super(MyLink, self).__init__()
        with self.init_scope():
            self.conv = L.ConvolutionND(3, 1, 1, (3, 3, 3), nobias=True,
                                        initialW=np.ones((in_channel, out_channel, 3, 3, 3)))

    def __call__(self, x):
        return F.sum(self.conv(x))


if __name__ == "__main__":
    my_link = MyLink()
    my_link.to_gpu(0)
    batch = Variable(np.ones((batch_size, in_channel, 3, 3, 3)))
    batch.to_gpu(0)
    loss = my_link(batch)
    loss.backward()
    print_(batch.grad)