parallel DoE with distributed components in OpenMDAO - openmdao

I'm trying to run a DoE in parallel on a distributed code, which doesn't seem to work. Below is a simplified example that raises the same error as for the real code.
import numpy as np
from openmdao.api import IndepVarComp, Group, Problem, Component
from openmdao.core.mpi_wrap import MPI
from openmdao.drivers.latinhypercube_driver import LatinHypercubeDriver
if MPI:
from openmdao.core.petsc_impl import PetscImpl as impl
rank = MPI.COMM_WORLD.rank
else:
from openmdao.api import BasicImpl as impl
rank = 0
class DistribCompSimple(Component):
    """Component that uses 2 procs but takes the full input variable.

    The scalar param ``invar`` is scaled by a rank-dependent factor and the
    result is written to every entry of the output array ``outvec``.
    """

    def __init__(self, arr_size=2):
        """Declare the param ``invar`` and the output ``outvec``.

        Args:
            arr_size: Number of entries in ``outvec``.
        """
        super(DistribCompSimple, self).__init__()
        self._arr_size = arr_size
        self.add_param('invar', 0.)
        self.add_output('outvec', np.ones(arr_size, float))

    def solve_nonlinear(self, params, unknowns, resids):
        """Set ``outvec`` to ``invar`` times 0.25 (rank 0) or 0.5 (rank 1)."""
        # NOTE(review): `rank` is the module-level rank (MPI.COMM_WORLD.rank
        # when MPI is active), so ranks other than 0 and 1 leave `outvec`
        # untouched -- confirm this is intended when running on more than
        # 2 processes.
        if rank == 0:
            unknowns['outvec'] = params['invar'] * np.ones(self._arr_size) * 0.25
        elif rank == 1:
            unknowns['outvec'] = params['invar'] * np.ones(self._arr_size) * 0.5
        # A single pre-formatted string prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print('hello from rank %s %s' % (rank, unknowns['outvec']))

    def get_req_procs(self):
        """Return the processor requirement tuple ``(2, 2)``."""
        return (2, 2)
if __name__ == '__main__':
    # Number of MPI processes the script is launched with (mpirun -np 4).
    N_PROCS = 4

    prob = Problem(impl=impl)
    root = prob.root = Group()
    root.add('p1', IndepVarComp('invar', 0.), promotes=['*'])
    root.add('comp', DistribCompSimple(2), promotes=['*'])

    # DistribCompSimple asks for 2 procs, so N_PROCS // 2 DOE cases run at
    # once.  Floor division keeps num_par_doe an int on Python 3, where
    # `N_PROCS / 2` would be the float 2.0.
    prob.driver = LatinHypercubeDriver(4, num_par_doe=N_PROCS // 2)
    prob.driver.add_desvar('invar', lower=-5.0, upper=5.0)
    prob.driver.add_objective('outvec')

    prob.setup(check=False)
    prob.run()
I run this with
mpirun -np 4 python lhc_driver.py
and get this error:
Traceback (most recent call last):
File "lhc_driver.py", line 60, in <module>
prob.run()
File "/Users/frza/git/OpenMDAO/openmdao/core/problem.py", line 1064, in run
self.driver.run(self)
File "/Users/frza/git/OpenMDAO/openmdao/drivers/predeterminedruns_driver.py", line 157, in run
self._run_par_doe(problem.root)
File "/Users/frza/git/OpenMDAO/openmdao/drivers/predeterminedruns_driver.py", line 221, in _run_par_doe
for case in self._get_case_w_nones(self._distrib_build_runlist()):
File "/Users/frza/git/OpenMDAO/openmdao/drivers/predeterminedruns_driver.py", line 283, in _get_case_w_nones
case = next(it)
File "/Users/frza/git/OpenMDAO/openmdao/drivers/latinhypercube_driver.py", line 119, in _distrib_build_runlist
run_list = comm.scatter(job_list, root=0)
File "MPI/Comm.pyx", line 1286, in mpi4py.MPI.Comm.scatter (src/mpi4py.MPI.c:109079)
File "MPI/msgpickle.pxi", line 707, in mpi4py.MPI.PyMPI_scatter (src/mpi4py.MPI.c:48114)
File "MPI/msgpickle.pxi", line 161, in mpi4py.MPI.Pickle.dumpv (src/mpi4py.MPI.c:41605)
ValueError: expecting 4 items, got 2
I don't see a test for this use case in the latest master, so does that mean you don't yet support it or is it a bug?

Thanks for submitting a simple test case for this. I added the parallel DOE stuff fairly recently and forgot to test it with distributed components. I'll add a story to our bug tracker for this and hopefully get it fixed soon.

Related

How to update connection sizes in a reconfigurable model in OpenMDAO 2.5.0?

With reconfigurable model execution it is possible to resize inputs and outputs of components. How are the connections updated, when reconfigured outputs and inputs are connected?
In the example below, the output c2.y and the input c3.y are resized at each model run. This output and input are supposed to be connected, as shown in the N2 chart. However, after the reconfiguration the connection size does not seem to be updated automatically, and the following error is thrown:
ValueError: The source and target shapes do not match or are ambiguous for the connection 'c2.y' to 'c3.y'. Expected (1,) but got (2,).
I included below 3 tests, with promoted connection, absolute connection, and the last one with reconfiguration but without the connection (which works).
The last chance would be to declare the connection in the parent group of the comps, which I did not try yet.
The tests:
Promoted connection
Absolute connection
No connection
Reconfigurable component classes and tests:
from __future__ import division
import logging
import numpy as np
import unittest
from openmdao.api import Problem, Group, IndepVarComp, ExplicitComponent
from openmdao.utils.assert_utils import assert_rel_error
class ReconfComp(ExplicitComponent):
    """Explicit component ``y = 2 * x`` whose output length can grow.

    ``x`` is a scalar input; ``y`` is an output of length ``self.size``.
    Every second call to ``reconfigure`` increases ``self.size`` by one.
    """

    def initialize(self):
        """Start with a reconfigure counter of zero and an output length of 1."""
        self.counter = 0
        self.size = 1

    def reconfigure(self):
        """Count the call and grow ``size`` on every even-numbered call.

        Returns:
            True if ``size`` was increased by this call, False otherwise.
        """
        path = self.pathname
        logging.info('reconf started {}'.format(path))
        self.counter += 1
        logging.info('reconf ended {}'.format(path))

        grew = self.counter % 2 == 0
        if grew:
            self.size += 1
        return grew

    def setup(self):
        """Declare input ``x``, output ``y`` (length ``size``) and all partials."""
        path = self.pathname
        logging.info('setup started {}'.format(path))

        self.add_input('x', val=1.0)
        self.add_output('y', val=np.zeros(self.size))

        # Every output/input pair has a derivative.
        self.declare_partials(of='*', wrt='*')

        logging.info('setup ended {}'.format(path))

    def compute(self, inputs, outputs):
        """Write ``2 * x`` into ``y``."""
        path = self.pathname
        logging.info('compute started {}'.format(path))
        doubled = 2 * inputs['x']
        outputs['y'] = doubled
        logging.info('compute ended {}'.format(path))

    def compute_partials(self, inputs, jacobian):
        """Set dy/dx to a ``(size, 1)`` column filled with 2."""
        jacobian['y', 'x'] = np.full((self.size, 1), 2.0)
class ReconfComp2(ReconfComp):
    """Explicit component ``f = 2 * y`` with a resizable input and output.

    The size of the ``y`` input changes in the same way as the ``y`` output
    of ``ReconfComp`` (the ``size`` bookkeeping is inherited).
    """

    def setup(self):
        """Declare input ``y`` and output ``f`` (both length ``size``) and partials."""
        logging.info('setup started {}'.format(self.pathname))
        self.add_input('y', val=np.zeros(self.size))
        self.add_output('f', val=np.zeros(self.size))
        # All derivatives are defined.
        self.declare_partials(of='*', wrt='*')
        logging.info('setup ended {}'.format(self.pathname))

    def compute(self, inputs, outputs):
        """Write ``2 * y`` into ``f``."""
        logging.info('compute started {}'.format(self.pathname))
        outputs['f'] = 2 * inputs['y']
        logging.info('compute ended {}'.format(self.pathname))

    def compute_partials(self, inputs, jacobian):
        """Set df/dy to ``2 * I``.

        ``f`` and ``y`` both have length ``size`` and ``f[i]`` depends only on
        ``y[i]``, so the partial is a ``(size, size)`` diagonal matrix rather
        than a ``(size, 1)`` column.
        """
        jacobian['f', 'y'] = 2 * np.eye(self.size)
class TestReconfConnections(unittest.TestCase):
    """Run a model twice so that the second run triggers a reconfiguration.

    The three tests differ only in how ``c2.y`` and ``c3.y`` are wired:
    by promotion, by an explicit connection, or not at all.
    """

    def test_reconf_comp_promoted_connections(self):
        """c2.y and c3.y are connected through the promoted name ``y``."""
        prob = Problem()
        prob.model = Group()
        model = prob.model
        model.add_subsystem('c1', IndepVarComp('x', 1.0), promotes_outputs=['x'])
        model.add_subsystem('c2', ReconfComp(),
                            promotes_inputs=['x'], promotes_outputs=['y'])
        model.add_subsystem('c3', ReconfComp2(),
                            promotes_inputs=['y'], promotes_outputs=['f'])
        prob.setup()
        prob['x'] = 3.

        # First run: counter = 1, size of y = 1.
        prob.run_model()
        totals = prob.compute_totals(wrt=['x'], of=['y'])
        assert_rel_error(self, prob['x'], 3.0)
        assert_rel_error(self, prob['y'], 6.0)
        assert_rel_error(self, totals['y', 'x'], [[2.0]])
        print(prob['x'], prob['y'], totals['y', 'x'].flatten())

        # Second run triggers reconfiguration: counter = 2, size of y = 2.
        prob.run_model()  # FIXME Fails with ValueError

    def test_reconf_comp_connections(self):
        """c2.y and c3.y are connected with an explicit ``connect`` call."""
        prob = Problem()
        prob.model = Group()
        model = prob.model
        model.add_subsystem('c1', IndepVarComp('x', 1.0), promotes_outputs=['x'])
        model.add_subsystem('c2', ReconfComp(), promotes_inputs=['x'])
        model.add_subsystem('c3', ReconfComp2(), promotes_outputs=['f'])
        model.connect('c2.y', 'c3.y')
        prob.setup()
        prob['x'] = 3.

        # First run: counter = 1, size of y = 1.
        prob.run_model()

        # Second run triggers reconfiguration: counter = 2, size of y = 2.
        prob.run_model()  # FIXME Fails with ValueError

    def test_reconf_comp_not_connected(self):
        """c2.y and c3.y are left unconnected; the second run must not fail."""
        prob = Problem()
        prob.model = Group()
        model = prob.model
        model.add_subsystem('c1', IndepVarComp('x', 1.0), promotes_outputs=['x'])
        model.add_subsystem('c2', ReconfComp(), promotes_inputs=['x'])
        model.add_subsystem('c3', ReconfComp2(), promotes_outputs=['f'])
        # c2.y is intentionally not connected to c3.y.
        prob.setup()
        prob['x'] = 3.

        # First run: counter = 1, size of y = 1.
        prob.run_model()

        # Second run triggers reconfiguration: counter = 2, size of y = 2.
        failed, _, _ = prob.run_model()
        self.assertFalse(failed)
# Run the test cases when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
UPDATE:
It seems that in Group._var_abs2meta only the source size is updated, but not the target size. The setup of the connections starts before the setup of the parent group or of the other component is called.
UPDATE 2:
This happens with the default NonlinearRunOnce solver. With a NewtonSolver or NonlinearBlockGS there is no error, but the variable sizes also don't change.
As of OpenMDAO V2.5, reconfigurable model variables are not an officially supported feature in the framework. The bare bones of the capability have been in the code since that research was done, but it wasn't a high enough priority for us to finalize. A recent major refactor in V2.4 reworked how some underlying data structures work and must have broken this functionality.
It is on our development priority list to get this working again, but it's not very high on that list. We focus development mainly on features that have direct in-house applications, and we don't have one of those for this feature yet.
If you could provide a decently complete set of tests for it, we could take a look at getting the functionality working.

What problems can lead to a CuDNNError with ConvolutionND

I am using three-dimensional convolution links (with ConvolutionND) in my chain.
The forward computation runs smoothly (I checked the intermediate result shapes to be sure I correctly understood the meaning of the parameters of convolution_nd), but during the backward pass a CuDNNError is raised with the message CUDNN_STATUS_NOT_SUPPORTED.
The cover_all parameter of ConvolutionND has its default value of False, so from the documentation I don't see what could be the cause of the error.
Here is how I define one of the convolution layers:
self.conv1 = chainer.links.ConvolutionND(3, 1, 4, (3, 3, 3)).to_gpu(self.GPU_1_ID)
And the call stack is
File "chainer/function_node.py", line 548, in backward_accumulate
gxs = self.backward(target_input_indexes, grad_outputs)
File "chainer/functions/connection/convolution_nd.py", line 118, in backward
gy, W, stride=self.stride, pad=self.pad, outsize=x_shape)
File "chainer/functions/connection/deconvolution_nd.py", line 310, in deconvolution_nd
y, = func.apply(args)
File "chainer/function_node.py", line 258, in apply
outputs = self.forward(in_data)
File "chainer/functions/connection/deconvolution_nd.py", line 128, in forward
return self._forward_cudnn(x, W, b)
File "chainer/functions/connection/deconvolution_nd.py", line 105, in _forward_cudnn
tensor_core=tensor_core)
File "cupy/cudnn.pyx", line 881, in cupy.cudnn.convolution_backward_data
File "cupy/cuda/cudnn.pyx", line 975, in cupy.cuda.cudnn.convolutionBackwardData_v3
File "cupy/cuda/cudnn.pyx", line 461, in cupy.cuda.cudnn.check_status
cupy.cuda.cudnn.CuDNNError: CUDNN_STATUS_NOT_SUPPORTED
So are there special points to take care of when using ConvolutionND ?
A failing code is for instance :
import chainer
from chainer import functions as F
from chainer import links as L
from chainer.backends import cuda
import numpy as np
import cupy as cp
chainer.global_config.cudnn_deterministic = False
NB_MASKS = 60
NB_FCN = 3
NB_CLASS = 17
class MFEChain(chainer.Chain):
    """Chain that applies one 3-D convolution to each input on a second GPU.

    Inputs are copied from ``GPU_0_ID`` to ``GPU_1_ID`` (if they differ),
    given two leading singleton axes, convolved by ``conv1``, stacked, and
    copied back to ``GPU_0_ID``.
    """

    def __init__(self, FCN_Dim, gpu_ids=None):
        """Create the convolution link on the second GPU.

        Args:
            FCN_Dim: Accepted but not used by this chain.
            gpu_ids: Optional pair ``(gpu_0_id, gpu_1_id)``; defaults to
                ``(0, 1)``.
        """
        super(MFEChain, self).__init__()
        self.GPU_0_ID, self.GPU_1_ID = (0, 1) if gpu_ids is None else gpu_ids
        with self.init_scope():
            self.conv1 = chainer.links.ConvolutionND(3, 1, 4, (3, 3, 3)).to_gpu(
                self.GPU_1_ID
            )

    def __call__(self, inputs):
        """Convolve every variable in ``inputs`` and return the stacked result.

        Args:
            inputs: Iterable of variables located on ``GPU_0_ID``.

        Returns:
            The vertically stacked convolution outputs, on ``GPU_0_ID``.
        """
        same_device = self.GPU_0_ID == self.GPU_1_ID

        # Move every input to the GPU that holds conv1.
        processed_sequences = []
        for convolved in inputs:
            copy = convolved if same_device else F.copy(convolved, self.GPU_1_ID)
            processed_sequences.append(copy)

        reprocessed_sequences = []
        # get_device_from_id replaces the deprecated cuda.get_device for
        # selecting a device by integer id.
        with cuda.get_device_from_id(self.GPU_1_ID):
            for convolved in processed_sequences:
                # Two leading singleton axes are added before the 3-D convolution.
                convolved = F.expand_dims(convolved, 0)
                convolved = F.expand_dims(convolved, 0)
                convolved = self.conv1(convolved)
                reprocessed_sequences.append(convolved)
            states = F.vstack(reprocessed_sequences)

        logits = states
        ret_logits = logits if same_device else F.copy(logits, self.GPU_0_ID)
        return ret_logits
def mfe_test():
    """Run one forward and backward pass through MFEChain and print input grads."""
    mfe = MFEChain(150)

    # One random float32 variable of shape (NB_MASKS, 11, in_len) per length.
    lengths = [53248]
    inputs = [
        chainer.Variable(
            cp.random.randn(NB_MASKS, 11, in_len, dtype=cp.float32)
        )
        for in_len in lengths
    ]

    val = mfe(inputs)

    # Seed the backward pass with a gradient of ones.
    val.grad = cp.ones(val.shape, dtype=cp.float32)
    val.backward()

    for variable in inputs:
        print(variable.grad)
# Run the reproduction when this file is executed as a script.
if __name__ == "__main__":
    mfe_test()
cupy.cuda.cudnn.convolutionBackwardData_v3 is incompatible with some specific parameters, as described in an issue in official github.
Unfortunately, the issue only dealt with deconvolution_2d.py (not deconvolution_nd.py), therefore the decision-making about whether cudnn is used or not failed in your case, I guess.
you can check your parameter by confirming
check whether dilation parameter (!=1) or group parameter (!=1) is passed to the convolution.
print chainer.config.cudnn_deterministic, configuration.config.autotune, and configuration.config.use_cudnn_tensor_core.
Further support may be obtained by raising an issue in the official github.
The code you showed is quite complicated.
To clarify the problem, the code below would help.
from chainer import Variable, Chain
from chainer import links as L
from chainer import functions as F
import numpy as np
from six import print_
batch_size = 1
in_channel = 1
out_channel = 1
class MyLink(Chain):
    """Minimal chain: one bias-free 3-D convolution with all-ones weights."""

    def __init__(self):
        """Create the 3x3x3 convolution link with weights initialised to one."""
        super(MyLink, self).__init__()
        with self.init_scope():
            # ConvolutionND weights are laid out as
            # (out_channels, in_channels, k_1, k_2, k_3).
            initial_w = np.ones((out_channel, in_channel, 3, 3, 3))
            self.conv = L.ConvolutionND(
                3, in_channel, out_channel, (3, 3, 3),
                nobias=True, initialW=initial_w,
            )

    def __call__(self, x):
        """Return the sum of all elements of the convolution output."""
        return F.sum(self.conv(x))
# Minimal reproduction: one forward/backward pass through MyLink on GPU 0.
if __name__ == "__main__":
    my_link = MyLink()
    my_link.to_gpu(0)
    # NOTE(review): np.ones defaults to float64 -- confirm this matches the
    # dtype of the link's parameters.
    batch = Variable(np.ones((batch_size, in_channel, 3, 3, 3)))
    batch.to_gpu(0)
    loss = my_link(batch)
    loss.backward()
    # Gradient of the summed convolution output with respect to the input.
    print_(batch.grad)

OpenMDAO Singular Entry

I'm trying to understand the OpenMDAO error messages
RuntimeError: Singular entry found in '' for column associated with state/residual 'x'.
and
RuntimeError: Singular entry found in '' for row associated with state/residual 'y'.
Can someone explain these? E.g. When running the code
from openmdao.api import Problem, Group, IndepVarComp, ImplicitComponent, ScipyOptimizeDriver, NewtonSolver, DirectSolver, view_model, view_connections
class Test1Comp(ImplicitComponent):
    """Implicit component with inputs ``x``, ``design_x`` and outputs ``z``, ``obj``.

    Residuals:
        z:   ``x*z + z - 4``
        obj: ``(z/5.833333 - design_x)**2``

    Note that the ``obj`` residual does not reference ``outputs['obj']``.
    """

    def setup(self):
        """Declare the variables and finite-difference partials for everything."""
        self.add_input('x', 0.5)
        self.add_input('design_x', 1.0)
        self.add_output('z', val=0.0)
        self.add_output('obj')
        # Central differences with a 1e-4 step for every output/input pair.
        self.declare_partials(
            of='*', wrt='*', method='fd', form='central', step=1.0e-4,
        )

    def apply_nonlinear(self, inputs, outputs, resids):
        """Evaluate both residuals at the current inputs and outputs."""
        x_val = inputs['x']
        target = inputs['design_x']
        z_val = outputs['z']

        resids['z'] = x_val * z_val + z_val - 4
        resids['obj'] = (z_val / 5.833333 - target) ** 2
# Build the model, attach Newton + direct solvers, and run it once.
if __name__ == "__main__":
    prob = Problem()
    model = prob.model = Group()
    # Independent sources for the two inputs of Test1Comp.
    model.add_subsystem('p1', IndepVarComp('x', 0.5))
    model.add_subsystem('d1', IndepVarComp('design_x', 1.0))
    model.add_subsystem('comp', Test1Comp())
    model.connect('p1.x', 'comp.x')
    model.connect('d1.design_x', 'comp.design_x')
    # The optimizer is configured, but only run_model() is called below.
    prob.driver = ScipyOptimizeDriver()
    prob.driver.options["optimizer"] = 'SLSQP'
    model.add_design_var("d1.design_x", lower=0.5, upper=1.5)
    model.add_objective('comp.obj')
    # Newton on the top-level group, with a direct linear solver.
    model.nonlinear_solver = NewtonSolver()
    model.nonlinear_solver.options['iprint'] = 2
    model.nonlinear_solver.options['maxiter'] = 20
    model.linear_solver = DirectSolver()
    prob.setup()
    prob.run_model()
    print(prob['comp.z'])
I get the error message:
File "C:\Scripts/mockup_component3.py", line 46, in <module>
prob.run_model()
File "C:\Python_32\lib\site-packages\openmdao\core\problem.py", line 315, in run_model
return self.model.run_solve_nonlinear()
File "C:\Python_32\lib\site-packages\openmdao\core\system.py", line 2960, in run_solve_nonlinear
result = self._solve_nonlinear()
File "C:\Python_32\lib\site-packages\openmdao\core\group.py", line 1420, in _solve_nonlinear
result = self._nonlinear_solver.solve()
File "C:\Python_32\lib\site-packages\openmdao\solvers\solver.py", line 602, in solve
fail, abs_err, rel_err = self._run_iterator()
File "C:\Python_32\lib\site-packages\openmdao\solvers\solver.py", line 349, in _run_iterator
self._iter_execute()
File "C:\Python_32\lib\site-packages\openmdao\solvers\nonlinear\newton.py", line 234, in _iter_execute
system._linearize()
File "C:\Python_32\lib\site-packages\openmdao\core\group.py", line 1562, in _linearize
self._linear_solver._linearize()
File "C:\Python_32\lib\site-packages\openmdao\solvers\linear\direct.py", line 199, in _linearize
raise RuntimeError(format_singluar_error(err, system, mtx))
RuntimeError: Singular entry found in '' for column associated with state/residual 'comp.obj'.
This error I was able to solve, by adding - outputs['obj'] in the equation for resids['obj']. But I still have little understanding as to what the two error messages mean. What matrix is it that is singular? And what does it mean to have
1) a singular entry for a column?
2) a singular entry for a row?
I realized that the cause for the singular row was that I had not defined the partial derivatives for the component. I fixed this problem by adding the command declare_partials to the top level system. The traceback gave me the clue that the matrix was related to linearization.
The case with the singular column seems to be related to the fact that I had two equations in apply_nonlinear, but only one unknown (z).

openmdao 2.2.0: TypeError at setup

When running the following example code:
from openmdao.api import Problem, Group, IndepVarComp, ImplicitComponent, ScipyOptimizeDriver
class Test1Comp(ImplicitComponent):
    """Implicit component with inputs ``x``, ``design_x`` and outputs ``z``, ``obj``.

    Residuals:
        z:   ``x*z + z - 4``
        obj: ``(z/5.833333 - design_x)**2``
    """

    def setup(self):
        """Declare the variables and finite-difference partials for everything."""
        self.add_input('x', 0.5)
        self.add_input('design_x', 1.0)
        self.add_output('z', val=0.0)
        self.add_output('obj')
        # Central differences with a 1e-4 step for every output/input pair.
        self.declare_partials(
            of='*', wrt='*', method='fd', form='central', step=1.0e-4,
        )

    def apply_nonlinear(self, inputs, outputs, resids):
        """Evaluate both residuals at the current inputs and outputs."""
        x_val = inputs['x']
        target = inputs['design_x']
        z_val = outputs['z']

        resids['z'] = x_val * z_val + z_val - 4
        resids['obj'] = (z_val / 5.833333 - target) ** 2
# Build the model with an SLSQP driver, then run the model once.
if __name__ == "__main__":
    prob = Problem()
    model = prob.model = Group()
    # Independent sources for the two inputs of Test1Comp.
    model.add_subsystem('p1', IndepVarComp('x', 0.5))
    model.add_subsystem('d1', IndepVarComp('design_x', 1.0))
    model.add_subsystem('comp', Test1Comp())
    model.connect('p1.x', 'comp.x')
    model.connect('d1.design_x', 'comp.design_x')
    # The optimizer is configured, but only run_model() is called below.
    prob.driver = ScipyOptimizeDriver()
    prob.driver.options["optimizer"] = 'SLSQP'
    model.add_design_var("d1.design_x", lower=0.5, upper=1.5)
    model.add_objective('comp.obj')
    prob.setup()
    prob.run_model()
    print(prob['comp.z'])
I get:
Traceback (most recent call last):
File "C:/Users/jonat/Desktop/mockup_component3.py", line 40, in <module>
prob.setup()
File "C:\Python\openmdao\core\problem.py", line 409, in setup
model._setup(comm, 'full', mode)
File "C:\Python\openmdao\core\system.py", line 710, in _setup
self._setup_relevance(mode, self._relevant)
File "C:\Python\openmdao\core\system.py", line 1067, in _setup_relevance
self._relevant = relevant = self._init_relevance(mode)
File "C:\Python\openmdao\core\group.py", line 693, in _init_relevance
return get_relevant_vars(self._conn_global_abs_in2out, desvars, responses, mode)
File "C:\Python\openmdao\core\group.py", line 1823, in get_relevant_vars
if 'type_' in nodes[node]:
TypeError: 'instancemethod' object has no attribute '__getitem__'
Can someone explain why? I've successfully run a similar component, but without optimization, so I suspect the error comes from the optimization constructs. For example, do I have to define the objective in an ExplicitComponent?
I get a more descriptive message when I run:
KeyError: 'Variable name "comp.y" not found.'
Which just means that component "comp" doesn't have a variable named "y" (or "z").
The issue seems to have been caused by incorrect installation of OpenMDAO. I had previously tried to install by downloading a zip-file containing OpenMDAO. Now I instead installed using pip and the error disappeared.

openmdao v1.4 optimization with metamodel

I wish to perform an optimization with OpenMDAO 1.4 on a metamodel. Using the tutorials, I have built up a problem that I cannot manage to solve. I think the problem comes from a misuse of setup() and run(): I do not manage to train my metamodel and optimize on it at the same time (perhaps I should use two different "groups" to do this).
Here is my code :
from __future__ import print_function
from openmdao.api import Component, Group, MetaModel ,IndepVarComp, ExecComp, NLGaussSeidel, KrigingSurrogate, FloatKrigingSurrogate
import numpy as np
class KrigMM(Group):
    """Group wrapping a MetaModel whose response is a float Kriging surrogate."""

    def __init__(self):
        """Add the meta-model ``pmm``, its source ``p1`` and connect them."""
        super(KrigMM, self).__init__()

        # Meta-model with x as the parameter and f_x as the response.
        surrogate_model = self.add("pmm", MetaModel())
        surrogate_model.add_param('x', val=0.)
        surrogate_model.add_output(
            'f_x:float', val=0., surrogate=FloatKrigingSurrogate()
        )

        # Independent variable that drives the meta-model input.
        self.add('p1', IndepVarComp('x', 0.0))
        self.connect('p1.x', 'pmm.x')
        # mm.add_output('f_xy:norm_dist', val=(0.,0.), surrogate=KrigingSurrogate())
if __name__ == '__main__':
    # Setup and run the model.
    from openmdao.core.problem import Problem
    from openmdao.drivers.scipy_optimizer import ScipyOptimizer
    from openmdao.core.driver import Driver
    import numpy as np
    import doe_lhs
    #prob = Problem(root=ParaboloidProblem())
    ###########################################################
    # KrigMM is added with promotes=['*'], so its variables are addressed
    # below as 'p1.x' and 'pmm.*'.
    prob = Problem(root=Group())
    prob.root.add('meta',KrigMM(), promotes=['*'])
    # SLSQP over p1.x in [0, 10], with the surrogate output as the objective.
    prob.driver = ScipyOptimizer()
    prob.driver.options['optimizer'] = 'SLSQP'
    prob.driver.add_desvar('p1.x', lower=0, upper=10)
    prob.driver.add_objective('pmm.f_x:float')
    prob.setup()
    # Training data: 20 evenly spaced samples of sin(x) on [0, 10], set after
    # setup() and before run().
    prob['pmm.train:x'] = np.linspace(0,10,20)
    prob['pmm.train:f_x:float'] = np.sin(prob['pmm.train:x'])
    prob.run()
    print('\n')
    print('Minimum of %f found for meta at %f' % (prob['pmm.f_x:float'],prob['pmm.x'])) #predicted value
I believe your problem is actually working fine. It's just that the sinusoid you've picked has a local optimum at 0.0, which happens to be your initial condition.
If I change the initial condition as follows:
prob.setup()
prob['p1.x'] = 5
prob['pmm.train:x'] = np.linspace(0,10,20)
prob['pmm.train:f_x:float'] = np.sin(prob['pmm.train:x'])
prob.run()
I get:
Optimization terminated successfully. (Exit mode 0)
Current function value: [-1.00004544]
Iterations: 3
Function evaluations: 3
Gradient evaluations: 3
Optimization Complete
-----------------------------------
Minimum of -1.000045 found for meta at 4.710483

Resources