Tensorflow: 6 layer CNN: OOM (use 10Gb GPU memory) - out-of-memory

I am using the following code for running a 6 layer CNN with 2 FC layers on top (on Tesla K-80 GPU).
Somehow, it consumes entire memory 10GB and died out of memory.I know that i can reduce the batch_size and then run , but i also want to run with 15 or 20 CNN layers.Whats wrong with the following code and why it takes all the memory? How should i run the code for 15 layers CNN.
Code:
import model
with tf.Graph().as_default() as g_train:
filenames = tf.train.match_filenames_once(FLAGS.train_dir+'*.tfrecords')
filename_queue = tf.train.string_input_producer(filenames, shuffle=True, num_epochs=FLAGS.num_epochs)
feats,labels = get_batch_input(filename_queue, batch_size=FLAGS.batch_size)
### feats size=(batch_size, 100, 50)
logits = model.inference(feats, FLAGS.batch_size)
loss = model.loss(logits, labels, feats)
tvars = tf.trainable_variables()
global_step = tf.Variable(0, name='global_step', trainable=False)
# Add to the Graph operations that train the model.
train_op = model.training(loss, tvars, global_step, FLAGS.learning_rate, FLAGS.clip_gradients)
# Add the Op to compare the logits to the labels during evaluation.
eval_correct = model.evaluation(logits, labels, feats)
summary_op = tf.merge_all_summaries()
saver = tf.train.Saver(tf.all_variables(), max_to_keep=15)
# The op for initializing the variables.
init_op = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init_op)
summary_writer = tf.train.SummaryWriter(FLAGS.model_dir,
graph=sess.graph)
# Start input enqueue threads.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
step = 0
while not coord.should_stop():
_, loss_value = sess.run([train_op, loss])
if step % 100 == 0:
print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value))
# Update the events file.
summary_str = sess.run(summary_op)
summary_writer.add_summary(summary_str, step)
if (step == 0) or (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
ckpt_model = os.path.join(FLAGS.model_dir, 'model.ckpt')
saver.save(sess, ckpt_model, global_step=step)
#saver.save(sess, FLAGS.model_dir, global_step=step)
step += 1
except tf.errors.OutOfRangeError:
print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
finally:
coord.join(threads)
sess.close()
###################### File model.py ####################
def conv2d(x, W, b, strides=1):
# Conv2D wrapper, with bias and relu activation
x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1],
padding='SAME')
x = tf.nn.bias_add(x, b)
return tf.nn.relu(x)
def maxpool2d(x, k=2,s=2):
# MaxPool2D wrapper
return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, s,
s,1],padding='SAME')
def inference(feats,batch_size):
#feats size (batch_size,100,50,1) #batch_size=256
conv1_w=tf.get_variable("conv1_w", [filter_size,filter_size,1,256],initializer=tf.uniform_unit_scaling_initializer())
conv1_b=tf.get_variable("conv1_b",[256])
conv1 = conv2d(feats, conv1_w, conv1_b,2)
conv1 = maxpool2d(conv1, k=2,s=2)
### This was replicated for 6 layers and the 2 FC connected layers are added
return logits
def training(loss, train_vars, global_step, learning_rate, clip_gradients):
# Add a scalar summary for the snapshot loss.
tf.scalar_summary(loss.op.name, loss)
grads, _ = tf.clip_by_global_norm(tf.gradients(loss, train_vars,aggregation_method=1), clip_gradients)
optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = optimizer.apply_gradients(zip(grads, train_vars), global_step=global_step)
return train_op

I am not too sure what the model python library is. If it is something you wrote and can change the setting in the optimizer I would suggest the following which I use in my own code
train_step = tf.train.AdamOptimizer(learning_rate).minimize(cost, aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
By default the aggeragetion_method is ADD_N but if you change it to EXPERIMENTAL_ACCUMULATE_N or EXPERIMENTAL_TREE this will greatly save memory. The main memory hog in these programs is that tensorflow must save the output values at every neuron so that it can compute the gradients. Changing the aggregation_method helps a lot from my experience.
Also BTW I don't think there is anything wrong with your code. I can run out of memory on small cov-nets as well.

Related

TypeError: Caught TypeError in DataLoader worker process 0. TypeError: 'KeyError' object is not iterable

from torchvision_starter.engine import train_one_epoch, evaluate
from torchvision_starter import utils
import multiprocessing
import time
n_cpu = multiprocessing.cpu_count()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
_ = model.to(device)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
step_size=3,
gamma=0.2,
verbose=True
)
# Let's train for 10 epochs
num_epochs = 1
start = time.time()
for epoch in range(10, 10 + num_epochs):
# train for one epoch, printing every 10 iterations
train_one_epoch(model, optimizer, data_loaders['train'], device, epoch, print_freq=10)
# update the learning rate
lr_scheduler.step()
# evaluate on the validation dataset
evaluate(model, data_loaders['valid'], device=device)
stop = time.time()
print(f"\n\n{num_epochs} epochs in {stop - start} s ({(stop-start) / 3600:.2f} hrs)")
Before I move on to this part, everything is OK. But after I run the part, the error is like below:
I have tried to add drop_last to the helper.py's function like:
data_loaders["train"] = torch.utils.data.DataLoader(
train_data,
batch_size=batch_size,
sampler=train_sampler,
num_workers=num_workers,
collate_fn=utils.collate_fn,
drop_last=True
)
But it doesn't work. By the way, the torch and torchvision are compatible and Cuda is available.
I wonder how to fix it.
The get_data_loaders function:
def get_data_loaders(
folder, batch_size: int = 2, valid_size: float = 0.2, num_workers: int = -1, limit: int = -1, thinning: int = None
):
"""
Create and returns the train_one_epoch, validation and test data loaders.
:param foder: folder containing the dataset
:param batch_size: size of the mini-batches
:param valid_size: fraction of the dataset to use for validation. For example 0.2
means that 20% of the dataset will be used for validation
:param num_workers: number of workers to use in the data loaders. Use -1 to mean
"use all my cores"
:param limit: maximum number of data points to consider
:param thinning: take every n-th frame, instead of all frames
:return a dictionary with 3 keys: 'train_one_epoch', 'valid' and 'test' containing respectively the
train_one_epoch, validation and test data loaders
"""
if num_workers == -1:
# Use all cores
num_workers = multiprocessing.cpu_count()
# We will fill this up later
data_loaders = {"train": None, "valid": None, "test": None}
# create 3 sets of data transforms: one for the training dataset,
# containing data augmentation, one for the validation dataset
# (without data augmentation) and one for the test set (again
# without augmentation)
data_transforms = {
"train": get_transform(UdacitySelfDrivingDataset.mean, UdacitySelfDrivingDataset.std, train=True),
"valid": get_transform(UdacitySelfDrivingDataset.mean, UdacitySelfDrivingDataset.std, train=False),
"test": get_transform(UdacitySelfDrivingDataset.mean, UdacitySelfDrivingDataset.std, train=False),
}
# Create train and validation datasets
train_data = UdacitySelfDrivingDataset(
folder,
transform=data_transforms["train"],
train=True,
thinning=thinning
)
# The validation dataset is a split from the train_one_epoch dataset, so we read
# from the same folder, but we apply the transforms for validation
valid_data = UdacitySelfDrivingDataset(
folder,
transform=data_transforms["valid"],
train=True,
thinning=thinning
)
# obtain training indices that will be used for validation
n_tot = len(train_data)
indices = torch.randperm(n_tot)
# If requested, limit the number of data points to consider
if limit > 0:
indices = indices[:limit]
n_tot = limit
split = int(math.ceil(valid_size * n_tot))
train_idx, valid_idx = indices[split:], indices[:split]
# define samplers for obtaining training and validation batches
train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
valid_sampler = torch.utils.data.SubsetRandomSampler(valid_idx) # =
# prepare data loaders
data_loaders["train"] = torch.utils.data.DataLoader(
train_data,
batch_size=batch_size,
sampler=train_sampler,
num_workers=num_workers,
collate_fn=utils.collate_fn,
drop_last=True
)
data_loaders["valid"] = torch.utils.data.DataLoader(
valid_data, # -
batch_size=batch_size, # -
sampler=valid_sampler, # -
num_workers=num_workers, # -
collate_fn=utils.collate_fn,
drop_last=True
)
# Now create the test data loader
test_data = UdacitySelfDrivingDataset(
folder,
transform=data_transforms["test"],
train=False,
thinning=thinning
)
if limit > 0:
indices = torch.arange(limit)
test_sampler = torch.utils.data.SubsetRandomSampler(indices)
else:
test_sampler = None
data_loaders["test"] = torch.utils.data.DataLoader(
test_data,
batch_size=batch_size,
shuffle=False,
num_workers=num_workers,
sampler=test_sampler,
collate_fn=utils.collate_fn,
drop_last=True
# -
)
return data_loaders
class UdacitySelfDrivingDataset(torch.utils.data.Dataset):
# Mean and std of the dataset to be used in nn.Normalize
mean = torch.tensor([0.3680, 0.3788, 0.3892])
std = torch.tensor([0.2902, 0.3069, 0.3242])
def __init__(self, root, transform, train=True, thinning=None):
super().__init__()
self.root = os.path.abspath(os.path.expandvars(os.path.expanduser(root)))
self.transform = transform
# load datasets
if train:
self.df = pd.read_csv(os.path.join(self.root, "labels_train.csv"))
else:
self.df = pd.read_csv(os.path.join(self.root, "labels_test.csv"))
# Index by file id (i.e., a sequence of the same length as the number of images)
codes, uniques = pd.factorize(self.df['frame'])
if thinning:
# Take every n-th rows. This makes sense because the images are
# frames of videos from the car, so we are essentially reducing
# the frame rate
thinned = uniques[::thinning]
idx = self.df['frame'].isin(thinned)
print(f"Keeping {thinned.shape[0]} of {uniques.shape[0]} images")
print(f"Keeping {idx.sum()} objects out of {self.df.shape[0]}")
self.df = self.df[idx].reset_index(drop=True)
# Recompute codes
codes, uniques = pd.factorize(self.df['frame'])
self.n_images = len(uniques)
self.df['image_id'] = codes
self.df.set_index("image_id", inplace=True)
self.classes = ['car', 'truck', 'pedestrian', 'bicyclist', 'light']
self.colors = ['cyan', 'blue', 'red', 'purple', 'orange']
#property
def n_classes(self):
return len(self.classes)
def __getitem__(self, idx):
if idx in self.df.index:
row = self.df.loc[[idx]]
else:
return KeyError(f"Element {idx} not in dataframe")
# load images fromm file
img_path = os.path.join(self.root, "images", row['frame'].iloc[0])
img = Image.open(img_path).convert("RGB")
# Exclude bogus boxes with 0 height or width
h = row['ymax'] - row['ymin']
w = row['xmax'] - row['xmin']
filter_idx = (h > 0) & (w > 0)
row = row[filter_idx]
# get bounding box coordinates for each mask
boxes = row[['xmin', 'ymin', 'xmax', 'ymax']].values
# convert everything into a torch.Tensor
boxes = torch.as_tensor(boxes, dtype=torch.float32)
# get the labels
labels = torch.as_tensor(row['class_id'].values, dtype=int)
image_id = torch.tensor([idx])
area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
# assume no crowd for everything
iscrowd = torch.zeros((row.shape[0],), dtype=torch.int64)
target = {}
target["boxes"] = boxes
target["labels"] = labels
target["image_id"] = image_id
target["area"] = area
target["iscrowd"] = iscrowd
if self.transform is not None:
img, target = self.transform(img, target)
return img, target
def __len__(self):
return self.n_images
def plot(self, idx, renormalize=True, predictions=None, threshold=0.5, ax=None):
image, label_js = self[idx]
if renormalize:
# Invert the T.Normalize transform
unnormalize = T.Compose(
[
T.Normalize(mean = [ 0., 0., 0. ], std = 1 / type(self).std),
T.Normalize(mean = -type(self).mean, std = [ 1., 1., 1. ])
]
)
image, label_js = unnormalize(image, label_js)
if ax is None:
fig, ax = plt.subplots(figsize=(8, 8))
_ = ax.imshow(torch.permute(image, [1, 2, 0]))
for i, box in enumerate(label_js['boxes']):
xy = (box[0], box[1])
h, w = (box[2] - box[0]), (box[3] - box[1])
r = patches.Rectangle(xy, h, w, fill=False, color=self.colors[label_js['labels'][i]-1], lw=2, alpha=0.5)
ax.add_patch(r)
if predictions is not None:
# Make sure the predictions are on the CPU
for k in predictions:
predictions[k] = predictions[k].detach().cpu().numpy()
for i, box in enumerate(predictions['boxes']):
if predictions['scores'][i] > threshold:
xy = (box[0], box[1])
h, w = (box[2] - box[0]), (box[3] - box[1])
r = patches.Rectangle(xy, h, w, fill=False, color=self.colors[predictions['labels'][i]-1], lw=2, linestyle=':')
ax.add_patch(r)
_ = ax.axis("off")
return ax

Optimizing Distributed I/O with serial output

I am having trouble understanding how to optimize a distributed component with a serial output. This is my attempt with an example problem given in the openmdao docs.
import numpy as np
import openmdao.api as om
from openmdao.utils.array_utils import evenly_distrib_idxs
from openmdao.utils.mpi import MPI
class MixedDistrib2(om.ExplicitComponent):
def setup(self):
# Distributed Input
self.add_input('in_dist', shape_by_conn=True, distributed=True)
# Serial Input
self.add_input('in_serial', val=1)
# Distributed Output
self.add_output('out_dist', copy_shape='in_dist', distributed=True)
# Serial Output
self.add_output('out_serial', copy_shape='in_serial')
#self.declare_partials('*','*', method='cs')
def compute(self, inputs, outputs):
x = inputs['in_dist']
y = inputs['in_serial']
# "Computationally Intensive" operation that we wish to parallelize.
f_x = x**2 - 2.0*x + 4.0
# These operations are repeated on all procs.
f_y = y ** 0.5
g_y = y**2 + 3.0*y - 5.0
# Compute square root of our portion of the distributed input.
g_x = x ** 0.5
# Distributed output
outputs['out_dist'] = f_x + f_y
# Serial output
if MPI and comm.size > 1:
# We need to gather the summed values to compute the total sum over all procs.
local_sum = np.array(np.sum(g_x))
total_sum = local_sum.copy()
self.comm.Allreduce(local_sum, total_sum, op=MPI.SUM)
outputs['out_serial'] = g_y * total_sum
else:
# Recommended to make sure your code can run in serial too, for testing.
outputs['out_serial'] = g_y * np.sum(g_x)
size = 7
if MPI:
comm = MPI.COMM_WORLD
rank = comm.rank
sizes, offsets = evenly_distrib_idxs(comm.size, size)
else:
# When running in serial, the entire variable is on rank 0.
rank = 0
sizes = {rank : size}
offsets = {rank : 0}
prob = om.Problem()
model = prob.model
# Create a distributed source for the distributed input.
ivc = om.IndepVarComp()
ivc.add_output('x_dist', np.zeros(sizes[rank]), distributed=True)
ivc.add_output('x_serial', val=1)
model.add_subsystem("indep", ivc)
model.add_subsystem("D1", MixedDistrib2())
model.add_subsystem('con_cmp1', om.ExecComp('con1 = y**2'), promotes=['con1', 'y'])
model.connect('indep.x_dist', 'D1.in_dist')
model.connect('indep.x_serial', ['D1.in_serial','y'])
prob.driver = om.ScipyOptimizeDriver()
prob.driver.options['optimizer'] = 'SLSQP'
model.add_design_var('indep.x_serial', lower=5, upper=10)
model.add_constraint('con1', upper=90)
model.add_objective('D1.out_serial')
prob.setup(force_alloc_complex=True)
#prob.setup()
# Set initial values of distributed variable.
x_dist_init = [1,1,1,1,1,1,1]
prob.set_val('indep.x_dist', x_dist_init)
# Set initial values of serial variable.
prob.set_val('indep.x_serial', 10)
#prob.run_model()
prob.run_driver()
print('x_dist', prob.get_val('indep.x_dist', get_remote=True))
print('x_serial', prob.get_val('indep.x_serial'))
print('Obj', prob.get_val('D1.out_serial'))
The problem is with defining partials with 'fd' or 'cs'. I cannot define partials of serial output w.r.t distributed input. So I used prob.setup(force_alloc_complex=True) to use complex step. But gives me this warning DerivativesWarning:Constraints or objectives [('D1.out_serial', inds=[0])] cannot be impacted by the design variables of the problem. I understand this is because the total derivative is 0 which causes the warning but I dont understand the reason. Clearly the total derivative should not be 0 here. But I guess this is because I didn't explicitly declare_partials in the component. I tried removing the distributed components and ran it again with declare_partials and this works correctly(code below).
import numpy as np
import openmdao.api as om
class MixedDistrib2(om.ExplicitComponent):
def setup(self):
self.add_input('in_dist', np.zeros(7))
self.add_input('in_serial', val=1)
self.add_output('out_serial', val=0)
self.declare_partials('*','*', method='cs')
def compute(self, inputs, outputs):
x = inputs['in_dist']
y = inputs['in_serial']
g_y = y**2 + 3.0*y - 5.0
g_x = x ** 0.5
outputs['out_serial'] = g_y * np.sum(g_x)
prob = om.Problem()
model = prob.model
model.add_subsystem("D1", MixedDistrib2(), promotes_inputs=['in_dist', 'in_serial'], promotes_outputs=['out_serial'])
model.add_subsystem('con_cmp1', om.ExecComp('con1 = in_serial**2'), promotes=['con1', 'in_serial'])
prob.driver = om.ScipyOptimizeDriver()
prob.driver.options['optimizer'] = 'SLSQP'
model.add_design_var('in_serial', lower=5, upper=10)
model.add_constraint('con1', upper=90)
model.add_objective('out_serial')
prob.setup(force_alloc_complex=True)
prob.set_val('in_dist', [1,1,1,1,1,1,1])
prob.set_val('in_serial', 10)
prob.run_model()
prob.check_totals()
prob.run_driver()
print('x_dist', prob.get_val('in_dist', get_remote=True))
print('x_serial', prob.get_val('in_serial'))
print('Obj', prob.get_val('out_serial'))
What I am trying to understand is
How to use 'fd' or 'cs' in Distributed component with a serial output?
What is the meaning of prob.setup(force_alloc_complex=True) ? Is not forcing to use cs in all the components in the problem ? If so why does the total derivative becomes 0?
When I run your code in OpenMDAO V 3.11.0 (after uncommenting the declare_partials call) I get the following error:
RuntimeError: 'D1' <class MixedDistrib2>: component has defined partial ('out_serial', 'in_dist') which is a serial output wrt a distributed input. This is only supported using the matrix free API.
As the error indicates, you can't use the matrix-based api for derivatives in this situations. The reasons why are a bit subtle, and probably outside the scope of what needs to be delt with to answer your question here. It boils down to OpenMDAO not knowing why kind of distributed operations are being done in the compute and having no way to manage those details when you propagate things in reverse.
So you need to use the matrix-free derivative APIs in this situation. When you use the matrix-free APIs you DO NOT declare any partials, because you don't want OpenMDAO to allocate any memory for you to store partials in (and you wouldn't use that memory even if it did).
I've coded them for your example here, but I need to note a few important details:
Your example has a distributed IVC, but as of OpenMDAO V3.11.0 you can't get total derivatives with respect to distributed design variables. I assume you just made it that way to make your simple test case, but in case your real problem was set up this way, you need to note this and not do it this way. Instead, make the IVC serial, and use src indices to distribute the correct parts to each proc.
In the example below, the derivatives are correct. However, there seems to be a bug in the check_partials output when running in paralle. So the reverse mode partials look like they are off by a factor of the comm size... this will have to get fixed in later releases.
I only did the derivatives for out_serial. out_dist will work similarly and is left as an excersize for the reader :)
You'll notice that I duplicates some code in the compute and compute_jacvec_product methods. You can abstract this duplicate code out into its own method (or call compute from within compute_jacvec_product by providing your own output dictionary). However, you might be asking why the duplicate call is needed at all? Why can't u store the values from the compute call. The answer is, in large part, that OpenMDAO does not guarantee that compute is always called before compute_jacvec_product. However, I'll also point out that this kind of code duplication is very AD-like. Any AD code will have the same kind of duplication built in, even though you don't see it.
import numpy as np
import openmdao.api as om
from openmdao.utils.array_utils import evenly_distrib_idxs
from openmdao.utils.mpi import MPI
class MixedDistrib2(om.ExplicitComponent):
def setup(self):
# Distributed Input
self.add_input('in_dist', shape_by_conn=True, distributed=True)
# Serial Input
self.add_input('in_serial', val=1)
# Distributed Output
self.add_output('out_dist', copy_shape='in_dist', distributed=True)
# Serial Output
self.add_output('out_serial', copy_shape='in_serial')
# self.declare_partials('*','*', method='fd')
def compute(self, inputs, outputs):
x = inputs['in_dist']
y = inputs['in_serial']
# "Computationally Intensive" operation that we wish to parallelize.
f_x = x**2 - 2.0*x + 4.0
# These operations are repeated on all procs.
f_y = y ** 0.5
g_y = y**2 + 3.0*y - 5.0
# Compute square root of our portion of the distributed input.
g_x = x ** 0.5
# Distributed output
outputs['out_dist'] = f_x + f_y
# Serial output
if MPI and comm.size > 1:
# We need to gather the summed values to compute the total sum over all procs.
local_sum = np.array(np.sum(g_x))
total_sum = local_sum.copy()
self.comm.Allreduce(local_sum, total_sum, op=MPI.SUM)
outputs['out_serial'] = g_y * total_sum
else:
# Recommended to make sure your code can run in serial too, for testing.
outputs['out_serial'] = g_y * np.sum(g_x)
def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode):
x = inputs['in_dist']
y = inputs['in_serial']
g_y = y**2 + 3.0*y - 5.0
# "Computationally Intensive" operation that we wish to parallelize.
f_x = x**2 - 2.0*x + 4.0
# These operations are repeated on all procs.
f_y = y ** 0.5
g_y = y**2 + 3.0*y - 5.0
# Compute square root of our portion of the distributed input.
g_x = x ** 0.5
# Distributed output
out_dist = f_x + f_y
# Serial output
if MPI and comm.size > 1:
# We need to gather the summed values to compute the total sum over all procs.
local_sum = np.array(np.sum(g_x))
total_sum = local_sum.copy()
self.comm.Allreduce(local_sum, total_sum, op=MPI.SUM)
# total_sum
else:
# Recommended to make sure your code can run in serial too, for testing.
total_sum = np.sum(g_x)
num_x = len(x)
d_f_x__d_x = np.diag(2*x - 2.)
d_f_y__d_y = np.ones(num_x)*0.5*y**-0.5
d_g_y__d_y = 2*y + 3.
d_g_x__d_x = 0.5*x**-0.5
d_out_dist__d_x = d_f_x__d_x # square matrix
d_out_dist__d_y = d_f_y__d_y # num_x,1
d_out_serial__d_y = d_g_y__d_y # scalar
d_out_serial__d_x = g_y*d_g_x__d_x.reshape((1,num_x))
if mode == 'fwd':
if 'out_serial' in d_outputs:
if 'in_dist' in d_inputs:
d_outputs['out_serial'] += d_out_serial__d_x.dot(d_inputs['in_dist'])
if 'in_serial' in d_inputs:
d_outputs['out_serial'] += d_out_serial__d_y.dot(d_inputs['in_serial'])
elif mode == 'rev':
if 'out_serial' in d_outputs:
if 'in_dist' in d_inputs:
d_inputs['in_dist'] += d_out_serial__d_x.T.dot(d_outputs['out_serial'])
if 'in_serial' in d_inputs:
d_inputs['in_serial'] += total_sum*d_out_serial__d_y.T.dot(d_outputs['out_serial'])
size = 7
if MPI:
comm = MPI.COMM_WORLD
rank = comm.rank
sizes, offsets = evenly_distrib_idxs(comm.size, size)
else:
# When running in serial, the entire variable is on rank 0.
rank = 0
sizes = {rank : size}
offsets = {rank : 0}
prob = om.Problem()
model = prob.model
# Create a distributed source for the distributed input.
ivc = om.IndepVarComp()
ivc.add_output('x_dist', np.zeros(sizes[rank]), distributed=True)
ivc.add_output('x_serial', val=1)
model.add_subsystem("indep", ivc)
model.add_subsystem("D1", MixedDistrib2())
model.add_subsystem('con_cmp1', om.ExecComp('con1 = y**2'), promotes=['con1', 'y'])
model.connect('indep.x_dist', 'D1.in_dist')
model.connect('indep.x_serial', ['D1.in_serial','y'])
prob.driver = om.ScipyOptimizeDriver()
prob.driver.options['optimizer'] = 'SLSQP'
model.add_design_var('indep.x_serial', lower=5, upper=10)
model.add_constraint('con1', upper=90)
model.add_objective('D1.out_serial')
prob.setup(force_alloc_complex=True)
#prob.setup()
# Set initial values of distributed variable.
x_dist_init = np.ones(sizes[rank])
prob.set_val('indep.x_dist', x_dist_init)
# Set initial values of serial variable.
prob.set_val('indep.x_serial', 10)
prob.run_model()
prob.check_partials()
# prob.run_driver()
print('x_dist', prob.get_val('indep.x_dist', get_remote=True))
print('x_serial', prob.get_val('indep.x_serial'))
print('Obj', prob.get_val('D1.out_serial'))

How to save GPU memory usage in PyTorch

In PyTorch I wrote a very simple CNN discriminator and trained it. Now I need to deploy it to make predictions. But the target machine has a small GPU memory and got out of memory error. So I think that I can set requires_grad = False to prevent PyTorch from storing the gradient values. However I didn't find it making any difference.
There are about 5 millions of parameters in my model. But when predicting a single batch of input, it consumes about 1.2GB of memory. I think there should be no need for such large memory.
The question is how to save GPU memory usage when I just want to use my model to make predictions?
Here is a demo, I use discriminator.requires_grad_ to disable/enable autograd of all parameters. But it seems to be no use.
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as functional
from pynvml.smi import nvidia_smi
nvsmi = nvidia_smi.getInstance()
def getMemoryUsage():
usage = nvsmi.DeviceQuery("memory.used")["gpu"][0]["fb_memory_usage"]
return "%d %s" % (usage["used"], usage["unit"])
print("Before GPU Memory: %s" % getMemoryUsage())
class Discriminator(nn.Module):
def __init__(self):
super().__init__()
# trainable layers
# input: 2x256x256
self.conv1 = nn.Conv2d(2, 8, 5, padding=2) # 8x256x256
self.pool1 = nn.MaxPool2d(2) # 8x128x128
self.conv2 = nn.Conv2d(8, 32, 5, padding=2) # 32x128x128
self.pool2 = nn.MaxPool2d(2) # 32x64x64
self.conv3 = nn.Conv2d(32, 96, 5, padding=2) # 96x64x64
self.pool3 = nn.MaxPool2d(4) # 96x16x16
self.conv4 = nn.Conv2d(96, 256, 5, padding=2) # 256x16x16
self.pool4 = nn.MaxPool2d(4) # 256x4x4
self.num_flat_features = 4096
self.fc1 = nn.Linear(4096, 1024)
self.fc2 = nn.Linear(1024, 256)
self.fc3 = nn.Linear(256, 1)
# loss function
self.loss = nn.MSELoss()
# other properties
self.requires_grad = True
def forward(self, x):
y = x
y = self.conv1(y)
y = self.pool1(y)
y = functional.relu(y)
y = self.conv2(y)
y = self.pool2(y)
y = functional.relu(y)
y = self.conv3(y)
y = self.pool3(y)
y = functional.relu(y)
y = self.conv4(y)
y = self.pool4(y)
y = functional.relu(y)
y = y.view((-1,self.num_flat_features))
y = self.fc1(y)
y = functional.relu(y)
y = self.fc2(y)
y = functional.relu(y)
y = self.fc3(y)
y = torch.sigmoid(y)
return y
def predict(self, x, score_th=0.5):
if len(x.shape) == 3:
singlebatch = True
x = x.view([1]+list(x.shape))
else:
singlebatch = False
y = self.forward(x)
label = (y > float(score_th))
if singlebatch:
y = y.view(list(y.shape)[1:])
return label, y
def requires_grad_(self, requires_grad=True):
for parameter in self.parameters():
parameter.requires_grad_(requires_grad)
self.requires_grad = requires_grad
x = torch.cuda.FloatTensor(np.zeros([2, 256, 256]))
discriminator = Discriminator()
discriminator.to("cuda:0")
# comment/uncomment this line to make difference
discriminator.requires_grad_(False)
discriminator.predict(x)
print("Requires grad", discriminator.requires_grad)
print("After GPU Memory: %s" % getMemoryUsage())
By comment out the line discriminator.requires_grad_(False), I got output:
Before GPU Memory: 6350MiB
Requires grad True
After GPU Memory: 7547MiB
While by uncomment the line, I got:
Before GPU Memory: 6350MiB
Requires grad False
After GPU Memory: 7543MiB
You can use pynvml.
This python tool made Nvidia so you can Python query like this:
from pynvml.smi import nvidia_smi
nvsmi = nvidia_smi.getInstance()
nvsmi.DeviceQuery('memory.free, memory.total')
You can always also execute:
torch.cuda.empty_cache()
To empty the cache and you will find even more free memory that way.
Before calling torch.cuda.empty_cache() if you have objects you don't use anymore you can call this:
obj = None
And after that you call
gc.collect()
Try to use model.eval() with torch.no_grad() on your target machine when making predictions. model.eval() will switch model layers to eval mode. torch.no_grad() will deactivate autograd engine and as a result memory usage will be reduced.
x = torch.cuda.FloatTensor(np.zeros([2, 256, 256]))
discriminator = Discriminator()
discriminator.to("cuda:0")
discriminator.eval()
with torch.no_grad():
discriminator.predict(x)
I guess it's not relevant anymore for your specific problem, but you could take a look at Torchscript
It's a good way to decrease the size and complexity of your model. It also speeds up the prediction. Unfortunately it cant help with the training itself. It is just in general a good idea for deployment of pytorch models used in other hardware or embedded in c++ code for efficiency.
Cheers. :-)

Pitch Calculation Error via Autocorrelation Method

Aim : Pitch Calculation
Issue : The calculated pitch does not match the expected one. For instance, the output is approx. 'D3', however the expected output is 'C5'.
Source Sound : https://freewavesamples.com/1980s-casio-celesta-c5
Source Code
library("tuneR")
library("seewave")
#0: Acquisition of sample sound
snd_smpl = readWave(paste("~/Music/sample/1980s-Casio-Celesta-C5.wav"),
from = 0, to = 1, units = "seconds")
dur_smpl = duration(snd_smpl)
len_smpl = length(snd_smpl)
#1 : Pre-Processing Stage
#1.1 : Application of Hanning Window
n = 1:len_smpl
han_win = 0.5-0.5*cos(2*pi*n/(len_smpl-1))
wind_sig = han_win*snd_smpl#left
#2.1 : Auto-Correlation Calculation
rev_wind_sig = rev(wind_sig) #Reversing the windowed signal
acorr_1 = convolve(wind_sig, rev_wind_sig, type = "open")
# Obtaining the 2nd half of the correlation, to simplify calculation
n = 2*len_smpl-1
acorr_2 = (1/len_smpl)*acorr_1[len_smpl:n]
#2.2 : Note Calculation
min_index = which.min(acorr_2)
print(min_index)
fs = 44100
fo = fs/min_index #To obtain fundamental frequency
print(fo)
print(notenames(noteFromFF(fo)))
Output
> print(min_index)
[1] 37
> fs = 44100
> fo = fs/min_index
> print(fo)
[1] 1191.892
> print(notenames(noteFromFF(fo)))
[1] "d'''"
The entire calculation is performed in the Time Domain.
I'm currently using autocorrelation as a base to understand more about Pitch Detection & Analysis. I've tried to analyse the sample with 'Audacity' and the result is 'C5'. Hence, I'm wondering where actually the issue is.
Can you all help me find it?
Also, there are a few but important doubts:
How small should actually my analysis window be (20ms, 1s,..)?
Will reinforcement of the Autocorrelation Algorithm with AMDF and other similar algorithms make this Pitch Detection module more robust?
This whole analysis seems not correct. You should not use windowing in time domain analysis.
Attached a short solution in the python language; you can use it as pseudocode
from soundfile import read
from glob import glob
from scipy.signal import correlate, find_peaks
from matplotlib.pyplot import plot, show, xlim, title, xlabel
import numpy as np
%matplotlib inline
name = glob('*wav')[0]
samples, fs = read(name)
corr = correlate(samples, samples)
corr = corr[corr.size / 2:]
time = np.arange(corr.size) / float(fs)
ind = find_peaks(corr[time < 0.002])[0]
plot(time, corr)
plot(time[ind], corr[ind], '*')
xlim([0, 0.005])
title('Frequency = {} Hz'.format(1 / time[ind][0]))
xlabel('Time [Sec]')
show()

Error when implementing RBF kernel bandwidth differentiation in Pytorch

I'm implementing an RBF network by using some beginer examples from Pytorch Website. I have a problem when implementing the kernel bandwidth differentiation for the network. Also, Iwould like to know whether my attempt ti implement the idea is fine. This is a code sample to reproduce the issue. Thanks
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable
def kernel_product(x,y, mode = "gaussian", s = 1.):
x_i = x.unsqueeze(1)
y_j = y.unsqueeze(0)
xmy = ((x_i-y_j)**2).sum(2)
if mode == "gaussian" : K = torch.exp( - xmy/s**2) )
elif mode == "laplace" : K = torch.exp( - torch.sqrt(xmy + (s**2)))
elif mode == "energy" : K = torch.pow( xmy + (s**2), -.25 )
return torch.t(K)
class MyReLU(torch.autograd.Function):
"""
We can implement our own custom autograd Functions by subclassing
torch.autograd.Function and implementing the forward and backward passes
which operate on Tensors.
"""
#staticmethod
def forward(ctx, input):
"""
In the forward pass we receive a Tensor containing the input and return
a Tensor containing the output. ctx is a context object that can be used
to stash information for backward computation. You can cache arbitrary
objects for use in the backward pass using the ctx.save_for_backward method.
"""
ctx.save_for_backward(input)
return input.clamp(min=0)
#staticmethod
def backward(ctx, grad_output):
"""
In the backward pass we receive a Tensor containing the gradient of the loss
with respect to the output, and we need to compute the gradient of the loss
with respect to the input.
"""
input, = ctx.saved_tensors
grad_input = grad_output.clone()
grad_input[input < 0] = 0
return grad_input
dtype = torch.cuda.FloatTensor
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)
# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(H, D_in).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)
# I've created this scalar variable (the kernel bandwidth)
s = Variable(torch.randn(1).type(dtype), requires_grad=True)
learning_rate = 1e-6
for t in range(500):
# To apply our Function, we use Function.apply method. We alias this as 'relu'.
relu = MyReLU.apply
# Forward pass: compute predicted y using operations on Variables; we compute
# ReLU using our custom autograd operation.
# y_pred = relu(x.mm(w1)).mm(w2)
y_pred = relu(kernel_product(w1, x, s)).mm(w2)
# Compute and print loss
loss = (y_pred - y).pow(2).sum()
print(t, loss.data[0])
# Use autograd to compute the backward pass.
loss.backward()
# Update weights using gradient descent
w1.data -= learning_rate * w1.grad.data
w2.data -= learning_rate * w2.grad.data
# Manually zero the gradients after updating weights
w1.grad.data.zero_()
w2.grad.data.zero_()
However I get this error, which dissapears when I simply use a fixed scalar in the default input parameter of kernel_product():
RuntimeError: eq() received an invalid combination of arguments - got (str), but expected one of:
* (float other)
didn't match because some of the arguments have invalid types: (str)
* (Variable other)
didn't match because some of the arguments have invalid types: (str)
Well, you are calling kernel_product(w1, x, s) where w1, x and s are torch Variable while the definition of the function is: kernel_product(x,y, mode = "gaussian", s = 1.). Seems like s should be a string specifying the mode.

Resources