What problems can lead to a CuDNNError with ConvolutionND - chainer

I am using three-dimensional convolution links (with ConvolutionND) in my chain.
The forward computation run smoothly (I checked intermediate result shapes to be sure I understood correctly the meaning of the parameters of convolution_nd), but during the backward a CuDNNError is raised with the message CUDNN_STATUS_NOT_SUPPORTED.
The cover_all parameter of ConvolutionND as its default value of False, so from the doc I don't see what can be the cause of the error.
Here is how I defind one of the convolution layers :
self.conv1 = chainer.links.ConvolutionND(3, 1, 4, (3, 3, 3)).to_gpu(self.GPU_1_ID)
And the call stack is
File "chainer/function_node.py", line 548, in backward_accumulate
gxs = self.backward(target_input_indexes, grad_outputs)
File "chainer/functions/connection/convolution_nd.py", line 118, in backward
gy, W, stride=self.stride, pad=self.pad, outsize=x_shape)
File "chainer/functions/connection/deconvolution_nd.py", line 310, in deconvolution_nd
y, = func.apply(args)
File chainer/function_node.py", line 258, in apply
outputs = self.forward(in_data)
File "chainer/functions/connection/deconvolution_nd.py", line 128, in forward
return self._forward_cudnn(x, W, b)
File "chainer/functions/connection/deconvolution_nd.py", line 105, in _forward_cudnn
tensor_core=tensor_core)
File "cupy/cudnn.pyx", line 881, in cupy.cudnn.convolution_backward_data
File "cupy/cuda/cudnn.pyx", line 975, in cupy.cuda.cudnn.convolutionBackwardData_v3
File "cupy/cuda/cudnn.pyx", line 461, in cupy.cuda.cudnn.check_status
cupy.cuda.cudnn.CuDNNError: CUDNN_STATUS_NOT_SUPPORTED
So are there special points to take care of when using ConvolutionND ?
A failing code is for instance :
import chainer
from chainer import functions as F
from chainer import links as L
from chainer.backends import cuda
import numpy as np
import cupy as cp
chainer.global_config.cudnn_deterministic = False
NB_MASKS = 60
NB_FCN = 3
NB_CLASS = 17
class MFEChain(chainer.Chain):
"""docstring for Wavelphasenet."""
def __init__(self,
FCN_Dim,
gpu_ids=None):
super(MFEChain, self).__init__()
self.GPU_0_ID, self.GPU_1_ID = (0, 1) if gpu_ids is None else gpu_ids
with self.init_scope():
self.conv1 = chainer.links.ConvolutionND(3, 1, 4, (3, 3, 3)).to_gpu(
self.GPU_1_ID
)
def __call__(self, inputs):
### Pad input ###
processed_sequences = []
for convolved in inputs:
## Transform to sequences)
copy = convolved if self.GPU_0_ID == self.GPU_1_ID else F.copy(convolved, self.GPU_1_ID)
processed_sequences.append(copy)
reprocessed_sequences = []
with cuda.get_device(self.GPU_1_ID):
for convolved in processed_sequences:
convolved = F.expand_dims(convolved, 0)
convolved = F.expand_dims(convolved, 0)
convolved = self.conv1(convolved)
reprocessed_sequences.append(convolved)
states = F.vstack(reprocessed_sequences)
logits = states
ret_logits = logits if self.GPU_0_ID == self.GPU_1_ID else F.copy(logits, self.GPU_0_ID)
return ret_logits
def mfe_test():
mfe = MFEChain(150)
inputs = list(
chainer.Variable(
cp.random.randn(
NB_MASKS,
11,
in_len,
dtype=cp.float32
)
) for in_len in [53248]
)
val = mfe(inputs)
grad = cp.ones(val.shape, dtype=cp.float32)
val.grad = grad
val.backward()
for i in inputs:
print(i.grad)
if __name__ == "__main__":
mfe_test()

cupy.cuda.cudnn.convolutionBackwardData_v3 is incompatible with some specific parameters, as described in an issue in official github.
Unfortunately, the issue only dealt with deconvolution_2d.py (not deconvolution_nd.py), therefore the decision-making about whether cudnn is used or not failed in your case, I guess.
you can check your parameter by confirming
check whether dilation parameter (!=1) or group parameter (!=1) is passed to the convolution.
print chainer.config.cudnn_deterministic, configuration.config.autotune, and configuration.config.use_cudnn_tensor_core.
Further support may be obtained by raising an issue in the official github.
The code you showed is much complicated.
To clarify the problem, the code below would help.
from chainer import Variable, Chain
from chainer import links as L
from chainer import functions as F
import numpy as np
from six import print_
batch_size = 1
in_channel = 1
out_channel = 1
class MyLink(Chain):
def __init__(self):
super(MyLink, self).__init__()
with self.init_scope():
self.conv = L.ConvolutionND(3, 1, 1, (3, 3, 3), nobias=True, initialW=np.ones((in_channel, out_channel, 3, 3, 3)))
def __call__(self, x):
return F.sum(self.conv(x))
if __name__ == "__main__":
my_link = MyLink()
my_link.to_gpu(0)
batch = Variable(np.ones((batch_size, in_channel, 3, 3, 3)))
batch.to_gpu(0)
loss = my_link(batch)
loss.backward()
print_(batch.grad)

Related

What is the function of FrozenBatchNorm2d in “maskrcnn_benchmark”?

"maskrcnn_benchmark"s github
Here is the source code for "FrozenBatchNorm2d"
import torch
from torch import nn
class FrozenBatchNorm2d(nn.Module):
def __init__(self, n):
super(FrozenBatchNorm2d, self).__init__()
self.register_buffer("weight", torch.ones(n))
self.register_buffer("bias", torch.zeros(n))
self.register_buffer("running_mean", torch.zeros(n))
self.register_buffer("running_var", torch.ones(n))
def forward(self, x):
scale = self.weight * self.running_var.rsqrt()
bias = self.bias - self.running_mean * scale
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
return x * scale + bias
When I put this function in my script, I found that this function had almost no effect.
Here is my usage
import torch.nn as nn
import torch
class FrozenBatchNorm2d(nn.Module):
"""
BatchNorm2d where the batch statistics and the affine parameters
are fixed
"""
def __init__(self, n):
super(FrozenBatchNorm2d, self).__init__()
self.register_buffer("weight", torch.ones(n))
self.register_buffer("bias", torch.zeros(n))
self.register_buffer("running_mean", torch.zeros(n))
self.register_buffer("running_var", torch.ones(n))
def forward(self, x):
scale = self.weight * self.running_var.rsqrt()
bias = self.bias - self.running_mean * scale
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
print(scale.shape,bias.shape)
return x * scale + bias
a=FrozenBatchNorm2d((1,2))
a(torch.tensor([1,2,3]))
The running result is different from what I thought.
So can someone tell me what this function exactly does?
I will appreciate it if someone could help me.
"register_buffer" means open an RAM for some parameters which couldn't be optimized or changed during the tranning process, in another word, the "weight","bias","running_mean","running_var" are consistent values. Hence, that is the reason why this rebuild batchnorm method could be called FrozenBatchnorm2d. It's my explan, hope it can help you.

OpenMDAO Singular Entry

I'm trying to understand the OpenMDAO error messages
RuntimeError: Singular entry found in '' for column associated with state/residual 'x'.
and
RuntimeError: Singular entry found in '' for row associated with state/residual 'y'.
Can someone explain these? E.g. When running the code
from openmdao.api import Problem, Group, IndepVarComp, ImplicitComponent, ScipyOptimizeDriver, NewtonSolver, DirectSolver, view_model, view_connections
class Test1Comp(ImplicitComponent):
def setup(self):
self.add_input('x', 0.5)
self.add_input('design_x', 1.0)
self.add_output('z', val=0.0)
self.add_output('obj')
self.declare_partials(of='*', wrt='*', method='fd', form='central', step=1.0e-4)
def apply_nonlinear(self, inputs, outputs, resids):
x = inputs['x']
design_x = inputs['design_x']
z = outputs['z']
resids['z'] = x*z + z - 4
resids['obj'] = (z/5.833333 - design_x)**2
if __name__ == "__main__":
prob = Problem()
model = prob.model = Group()
model.add_subsystem('p1', IndepVarComp('x', 0.5))
model.add_subsystem('d1', IndepVarComp('design_x', 1.0))
model.add_subsystem('comp', Test1Comp())
model.connect('p1.x', 'comp.x')
model.connect('d1.design_x', 'comp.design_x')
prob.driver = ScipyOptimizeDriver()
prob.driver.options["optimizer"] = 'SLSQP'
model.add_design_var("d1.design_x", lower=0.5, upper=1.5)
model.add_objective('comp.obj')
model.nonlinear_solver = NewtonSolver()
model.nonlinear_solver.options['iprint'] = 2
model.nonlinear_solver.options['maxiter'] = 20
model.linear_solver = DirectSolver()
prob.setup()
prob.run_model()
print(prob['comp.z'])
I get the error message:
File "C:\Scripts/mockup_component3.py", line 46, in <module>
prob.run_model()
File "C:\Python_32\lib\site-packages\openmdao\core\problem.py", line 315, in run_model
return self.model.run_solve_nonlinear()
File "C:\Python_32\lib\site-packages\openmdao\core\system.py", line 2960, in run_solve_nonlinear
result = self._solve_nonlinear()
File "C:\Python_32\lib\site-packages\openmdao\core\group.py", line 1420, in _solve_nonlinear
result = self._nonlinear_solver.solve()
File "C:\Python_32\lib\site-packages\openmdao\solvers\solver.py", line 602, in solve
fail, abs_err, rel_err = self._run_iterator()
File "C:\Python_32\lib\site-packages\openmdao\solvers\solver.py", line 349, in _run_iterator
self._iter_execute()
File "C:\Python_32\lib\site-packages\openmdao\solvers\nonlinear\newton.py", line 234, in _iter_execute
system._linearize()
File "C:\Python_32\lib\site-packages\openmdao\core\group.py", line 1562, in _linearize
self._linear_solver._linearize()
File "C:\Python_32\lib\site-packages\openmdao\solvers\linear\direct.py", line 199, in _linearize
raise RuntimeError(format_singluar_error(err, system, mtx))
RuntimeError: Singular entry found in '' for column associated with state/residual 'comp.obj'.
This error I was able to solve, by adding - outputs['obj'] in the equation for resids['obj']. But I still have little understanding as to what the two error messages mean. What matrix is it that is singular? And what does it mean to have
1) a singular entry for a column?
2) a singular entry for a row?
I realized that the cause for the singular row was that I had not defined the partial derivatives for the component. I fixed this problem by adding the command declare_partials to the top level system. The traceback gave me the clue that the matrix was related to linearization.
The case with the singular column seems related to that I had two equations in apply_nonlinear, but only one unknown (z).

PyTorch RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes' failed

I’m trying to create a basic binary classifier in Pytorch that classifies whether my player plays on the right or the left side in the game Pong. The input is an 1x42x42 image and the label is my player's side (right = 1 or left = 2). The code:
class Net(nn.Module):
def __init__(self, input_size, hidden_size, num_classes):
super(Net, self).__init__()
self.fc1 = nn.Linear(input_size, hidden_size)
self.relu = nn.ReLU()
self.fc2 = nn.Linear(hidden_size, num_classes)
def forward(self, x):
out = self.fc1(x)
out = self.relu(out)
out = self.fc2(out)
return out
net = Net(42 * 42, 100, 2)
# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer_net = torch.optim.Adam(net.parameters(), 0.001)
net.train()
while True:
state = get_game_img()
state = torch.from_numpy(state)
# right = 1, left = 2
current_side = get_player_side()
target = torch.LongTensor(current_side)
x = Variable(state.view(-1, 42 * 42))
y = Variable(target)
optimizer_net.zero_grad()
y_pred = net(x)
loss = criterion(y_pred, y)
loss.backward()
optimizer.step()
The error I get:
File "train.py", line 109, in train
loss = criterion(y_pred, y)
File "/home/shani/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 206, in __call__
result = self.forward(*input, **kwargs)
File "/home/shani/anaconda2/lib/python2.7/site-packages/torch/nn/modules/loss.py", line 321, in forward
self.weight, self.size_average)
File "/home/shani/anaconda2/lib/python2.7/site-packages/torch/nn/functional.py", line 533, in cross_entropy
return nll_loss(log_softmax(input), target, weight, size_average)
File "/home/shani/anaconda2/lib/python2.7/site-packages/torch/nn/functional.py", line 501, in nll_loss
return f(input, target)
File "/home/shani/anaconda2/lib/python2.7/site-packages/torch/nn/_functions/thnn/auto.py", line 41, in forward
output, *self.additional_args)
RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes' failed. at /py/conda-bld/pytorch_1493676237139/work/torch/lib/THNN/generic/ClassNLLCriterion.c:57
For most of deeplearning library, target(or label) should start from 0.
It means that your target should be in the range of [0,n) with n-classes.
It looks like PyTorch expect to get zero-based labels (0/1 in your case) and you probably feed it with one-based labels (1/2)
I had the same error in my program and i just realized that the problem was in the number of output nodes in my neural network
In my program the number of output nodes of my model was not equal to the number of labels in dataset
the number of output was 1 and the number of target labels was 10. then i changed the number of output to 10, there was no error

Categorical image classification always predicts one class, though calculated accuracy reaches 100%

I followed the Keras cat/dog image classification tutorial
Keras Image Classification tutorial
and found similar results to the reported values. I then took the code from the first example in that tutorial Tutorial Example 1 code, slightly altered a few lines, and trained the model for a dataset of grayscale images (~150 thousand images across 7 classes).
This gave me great initial results ( ~84% accuracy), which I am happy with.
Next I tried implementing the image batch generator myself, which is where I am having trouble. Briefly, the code seems to run well, except the reported accuracy of the model quickly shoots to >= 99% within two epochs. Due to noise in the dataset, this amount of accuracy is not believable. After using the trained model to predict a new batch of data ( images outside of the training or validation dataset ), I find the model always predicts the first class ( i.e. [1.,0.,0.,0.,0.,0.,0.]. The loss function is forcing the model to predict a single class 100% of the time, even though the labels I pass in are distributed across all the classes.
After 28 epochs of training, I see the following output:
320/320 [==============================] - 1114s - loss: 1.5820e-07 - categorical_accuracy: 1.0000 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 16.1181 - val_categorical_accuracy: 0.0000e+00 - val_sparse_categorical_accuracy: 0.0000e+00
When I examine the batch generator output from the tutorial code, and compare my batch generator output, the shape, datatype, and range of values are identical between both generators. I would like to emphasize that the generator passes y labels from each category, not just array([ 1.., 0., 0., 0., 0., 0., 0.], dtype=float32). Therefore, I am lost as to what I am doing incorrectly.
Since I posted this code several days ago, I have used the default Keras image generator, and successfully trained the network on the same dataset and same network architecture. Therefore, something about how I load and pass the data in the generator must be incorrect.
Here is the code I implemented:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import imgaug as ia
from imgaug import augmenters as iaa
import numpy as np
import numpy.random as nprand
import imageio
import os, re, random, sys, csv
import scipy
img_width, img_height = 112, 112
input_shape = (img_width,img_height,1)
batch_size = 200
epochs = 2
train_image_directory = '/PATH/To/Directory/train/'
valid_image_directory = '/PATH/To/Directory/validate/'
video_info_file = '/PATH/To/Directory/train_labels.csv'
train_image_paths = [train_image_directory + m.group(1) for m in [re.match(r"(\d+_\d+\.png)", fname) for fname in os.listdir(train_image_directory)] if m is not None]
valid_image_paths = [valid_image_directory + m.group(1) for m in [re.match(r"(\d+_\d+\.png)", fname) for fname in os.listdir(valid_image_directory)] if m is not None]
num_train_images = len(train_image_paths)
num_val_images = len(valid_image_paths)
label_map = {}
label_decode = {
'0': [1.,0.,0.,0.,0.,0.,0.],
'1': [0.,1.,0.,0.,0.,0.,0.],
'2': [0.,0.,1.,0.,0.,0.,0.],
'3': [0.,0.,0.,1.,0.,0.,0.],
'4': [0.,0.,0.,0.,1.,0.,0.],
'5': [0.,0.,0.,0.,0.,1.,0.],
'6': [0.,0.,0.,0.,0.,0.,1.]
}
with open(video_info_file) as f:
reader = csv.reader(f)
for row in reader:
key = row[0]
if key in label_map:
pass
label_map[key] = label_decode[row[1]]
sometimes = lambda aug: iaa.Sometimes(0.5,aug)
seq = iaa.Sequential(
[
iaa.Fliplr(0.5),
iaa.Flipud(0.2),
sometimes(iaa.Crop(percent=(0, 0.1))),
sometimes(iaa.Affine(
scale={"x": (0.8, 1.2), "y": (0.8, 1.2)},
translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)},
rotate=(-5, 5),
shear=(-16, 16),
order=[0, 1],
cval=(0, 1),
mode=ia.ALL
)),
iaa.SomeOf((0, 3),
[
sometimes(iaa.Superpixels(p_replace=(0, 0.40), n_segments=(20, 100))),
iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)),
iaa.Emboss(alpha=(0, 1.0), strength=(0, 1.0)),
iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255)),
iaa.OneOf([
iaa.Dropout((0.01, 0.1)),
iaa.CoarseDropout((0.03, 0.15), size_percent=(0.02, 0.05)),
]),
iaa.Invert(0.05),
iaa.Add((-10, 10)),
iaa.Multiply((0.5, 1.5), per_channel=0.5),
iaa.ContrastNormalization((0.5, 2.0)),
sometimes(iaa.ElasticTransformation(alpha=(0.5, 1.5), sigma=0.2)),
sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.03))) # sometimes move parts of the image around
],
random_order=True
)
],
random_order=True)
def image_data_generator(image_paths, labels, batch_size, training):
while(1):
image_paths = nprand.choice(image_paths, batch_size)
X0 = np.asarray([imageio.imread(x) for x in image_paths])
Y = np.asarray([labels[x] for x in image_paths],dtype=np.float32)
if(training):
X = np.divide(np.expand_dims(seq.augment_images(X0)[:,:,:,0],axis=3),255.)
else:
X = np.expand_dims(np.divide(X0[:,:,:,0],255.),axis=3)
X = np.asarray(X,dtype=np.float32)
yield X,Y
def predict_videos(model,video_paths):
i=0
predictions=[]
while(i < len(video_paths)):
video_reader = imageio.get_reader(video_paths[i])
X0 = np.expand_dims([ im[:,:,0] for x,im in enumerate(video_reader) ],axis=3)
prediction = model.predict(X0)
i=i+1
predictions.append(prediction)
return predictions
train_gen = image_data_generator(train_image_paths,label_map,batch_size,True)
val_gen = image_data_generator(valid_image_paths,label_map,batch_size,False)
model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape=input_shape))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.4))
model.add(Dense(7))
model.add(Activation('softmax'))
model.load_weights('/PATH/To_pretrained_weights/pretrained_model.h5')
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
optimizer='sgd',
metrics=['categorical_accuracy','sparse_categorical_accuracy'])
checkpointer = ModelCheckpoint('/PATH/To_pretrained_weights/pretrained_model.h5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
reduceLR = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=20, verbose=0, mode='auto', cooldown=0, min_lr=0)
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
callbacks_list = [checkpointer, early_stop, reduceLR]
model.fit_generator(
train_gen,
steps_per_epoch = -(-num_train_images // batch_size),
epochs=epochs,
validation_data=val_gen,
validation_steps = -(-num_val_images // batch_size),
callbacks=callbacks_list)
For some reason that I cannot fully determine, if you do not give the fit_generator function accurate numbers for steps per epoch or steps for validation, the result is inaccurate reporting of the accuracy metric and strange gradient descent steps.
You can fix this problem by using the Train_on_batch function in Keras instead of the fit generator, or by accurately reporting these step numbers.

Patch glyph not updated when using multiple ColumnDataSources in bokeh app

I am trying to use the bokeh server to plot a time series together with a shaded percentile band around, and this, since bokeh does not support the fill_between function from matplotlib, requires the construction of a patch object of double dimension. Hence, I need two ColumnDataSources to hold the data. However, only the first curve is rendered correctly when the data changes. Although the data_source of the GlyphRenderer is updated, the figure does not change. I use bokeh 0.12.3, and have tried with several servers and browsers. A complete, and reasonably minimal example:
import numpy as np
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.layouts import column
from bokeh.io import curdoc
from bokeh.models.widgets import Select
class AppData:
def __init__(self, n):
self.p_source = None
self.c_source = None
self.x = np.linspace(0, 10, 20)
self.n = n
self.ys = [np.sin(self.x) - i for i in range(self.n)]
self.line = None
self.patch = None
def update_module(self, a, b):
assert b - a == 5
p_data = dict() if self.p_source is None else self.p_source.data
c_data = dict() if self.c_source is None else self.c_source.data
ys = [self.ys[j] for j in range(a, b)]
if "x" not in c_data:
c_data["x"] = self.x
p_data["x"] = c_data["x"].tolist() + c_data["x"][::-1].tolist()
n_r = len(ys[0])
n_p = 2*n_r
if "ys" not in p_data:
p_data["ys"] = np.empty((n_p))
p_data["ys"][:n_r] = ys[0]
p_data["ys"][n_r:] = np.flipud(ys[-1])
c_data["y"] = ys[2]
if self.p_source is None:
self.p_source = ColumnDataSource(data=p_data)
else:
self.p_source.data.update(p_data)
if self.c_source is None:
self.c_source = ColumnDataSource(data=c_data)
else:
self.c_source.data.update(c_data)
if self.line is not None:
print(max(self.line.data_source.data["y"]))
print(max(self.patch.data_source.data["ys"])) # The value changes, but the figure does not!
# initialize
app_data = AppData(10)
app_data.update_module(4, 4 + 5)
s1 = figure(width=500, plot_height=125, title=None, toolbar_location="above")
app_data.line = s1.line("x", "y", source=app_data.c_source)
app_data.patch = s1.patch("x", "ys", source=app_data.p_source, alpha=0.3, line_width=0)
select = Select(title="Case", options=[str(i) for i in range(5)], value="4")
def select_case(attrname, old, new):
a = int(select.value)
app_data.update_module(a, a + 5)
select.on_change('value', select_case)
layout = column(select, s1)
curdoc().add_root(layout)
curdoc().title = "Example of patches not being updated"
I am certainly not very experienced in using bokeh, so I could very well be using the system wrong. However, any help on this matter would be of great help!

Resources