IndexError: Target 3 is out of bounds - bert-language-model

I keep getting a "Target 3 is out of bounds" error when I'm trying to train a BERT classifier. I found this code online, and when I run the original code it works with no error. When I tweak it for my dataset, I get the error. Can someone help me out?
Here is the code:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

for _ in trange(epochs, desc='Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        # Backward pass
        torch.cuda.empty_cache()
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) != 0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) != 0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) != 0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)
Here is the error:
IndexError                                Traceback (most recent call last)
<ipython-input-23-153b80916244> in <module>
     20         optimizer.zero_grad()
     21         # Forward pass
---> 22         train_output = model(b_input_ids,
     23                              token_type_ids = None,
     24                              attention_mask = b_input_mask,

4 frames
/usr/local/lib/python3.8/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
   3024     if size_average is not None or reduce is not None:
   3025         reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 3026     return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
   3027
   3028

IndexError: Target 3 is out of bounds.
I've checked the size of the input variables being given to the function and they seem to be ok.
input_ids = torch.tensor(b_input_ids)          # your input_ids tensor
print(input_ids.size())
torch.Size([16, 150])

attention_mask = torch.tensor(b_input_mask)    # your attention_mask tensor
print(attention_mask.shape)
torch.Size([16, 150])

labels = torch.tensor(b_labels)                # your labels tensor
print(labels.shape)
torch.Size([16])
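For what it's worth, this error usually means that the largest label id in b_labels is greater than or equal to the number of output classes of the classification head. A minimal check along those lines, assuming the model is a Hugging Face BertForSequenceClassification (the model setup isn't shown above, so the checkpoint name here is only a placeholder):

import torch
from transformers import BertForSequenceClassification

# Collect every label the dataloader will feed to the model; with label ids
# 0..3 the head needs at least 4 outputs, otherwise cross_entropy raises
# "Target 3 is out of bounds".
all_labels = torch.cat([batch[2] for batch in train_dataloader])
print("label ids span", int(all_labels.min().item()), "to", int(all_labels.max().item()))

num_classes = int(all_labels.max().item()) + 1
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",       # placeholder checkpoint
    num_labels=num_classes,    # must be >= largest label id + 1
)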

Related

TypeError: Caught TypeError in DataLoader worker process 0. TypeError: 'KeyError' object is not iterable

from torchvision_starter.engine import train_one_epoch, evaluate
from torchvision_starter import utils
import multiprocessing
import time

n_cpu = multiprocessing.cpu_count()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
_ = model.to(device)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.2,
                                               verbose=True)

# Let's train for 10 epochs
num_epochs = 1

start = time.time()
for epoch in range(10, 10 + num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loaders['train'], device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the validation dataset
    evaluate(model, data_loaders['valid'], device=device)

stop = time.time()
print(f"\n\n{num_epochs} epochs in {stop - start} s ({(stop-start) / 3600:.2f} hrs)")
Everything is OK before this part, but after I run it I get the error shown in the title.
I have tried adding drop_last to the function in helper.py, like this:
data_loaders["train"] = torch.utils.data.DataLoader(
train_data,
batch_size=batch_size,
sampler=train_sampler,
num_workers=num_workers,
collate_fn=utils.collate_fn,
drop_last=True
)
But it doesn't work. By the way, torch and torchvision are compatible, and CUDA is available.
I wonder how to fix it.
The get_data_loaders function:
def get_data_loaders(
    folder, batch_size: int = 2, valid_size: float = 0.2, num_workers: int = -1, limit: int = -1, thinning: int = None
):
    """
    Create and return the train_one_epoch, validation and test data loaders.

    :param folder: folder containing the dataset
    :param batch_size: size of the mini-batches
    :param valid_size: fraction of the dataset to use for validation. For example 0.2
                       means that 20% of the dataset will be used for validation
    :param num_workers: number of workers to use in the data loaders. Use -1 to mean
                        "use all my cores"
    :param limit: maximum number of data points to consider
    :param thinning: take every n-th frame, instead of all frames
    :return a dictionary with 3 keys: 'train_one_epoch', 'valid' and 'test' containing respectively the
            train_one_epoch, validation and test data loaders
    """
    if num_workers == -1:
        # Use all cores
        num_workers = multiprocessing.cpu_count()

    # We will fill this up later
    data_loaders = {"train": None, "valid": None, "test": None}

    # create 3 sets of data transforms: one for the training dataset,
    # containing data augmentation, one for the validation dataset
    # (without data augmentation) and one for the test set (again
    # without augmentation)
    data_transforms = {
        "train": get_transform(UdacitySelfDrivingDataset.mean, UdacitySelfDrivingDataset.std, train=True),
        "valid": get_transform(UdacitySelfDrivingDataset.mean, UdacitySelfDrivingDataset.std, train=False),
        "test": get_transform(UdacitySelfDrivingDataset.mean, UdacitySelfDrivingDataset.std, train=False),
    }

    # Create train and validation datasets
    train_data = UdacitySelfDrivingDataset(
        folder,
        transform=data_transforms["train"],
        train=True,
        thinning=thinning
    )

    # The validation dataset is a split from the train_one_epoch dataset, so we read
    # from the same folder, but we apply the transforms for validation
    valid_data = UdacitySelfDrivingDataset(
        folder,
        transform=data_transforms["valid"],
        train=True,
        thinning=thinning
    )

    # obtain training indices that will be used for validation
    n_tot = len(train_data)
    indices = torch.randperm(n_tot)

    # If requested, limit the number of data points to consider
    if limit > 0:
        indices = indices[:limit]
        n_tot = limit

    split = int(math.ceil(valid_size * n_tot))
    train_idx, valid_idx = indices[split:], indices[:split]

    # define samplers for obtaining training and validation batches
    train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
    valid_sampler = torch.utils.data.SubsetRandomSampler(valid_idx)

    # prepare data loaders
    data_loaders["train"] = torch.utils.data.DataLoader(
        train_data,
        batch_size=batch_size,
        sampler=train_sampler,
        num_workers=num_workers,
        collate_fn=utils.collate_fn,
        drop_last=True
    )

    data_loaders["valid"] = torch.utils.data.DataLoader(
        valid_data,
        batch_size=batch_size,
        sampler=valid_sampler,
        num_workers=num_workers,
        collate_fn=utils.collate_fn,
        drop_last=True
    )

    # Now create the test data loader
    test_data = UdacitySelfDrivingDataset(
        folder,
        transform=data_transforms["test"],
        train=False,
        thinning=thinning
    )

    if limit > 0:
        indices = torch.arange(limit)
        test_sampler = torch.utils.data.SubsetRandomSampler(indices)
    else:
        test_sampler = None

    data_loaders["test"] = torch.utils.data.DataLoader(
        test_data,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        sampler=test_sampler,
        collate_fn=utils.collate_fn,
        drop_last=True
    )

    return data_loaders
class UdacitySelfDrivingDataset(torch.utils.data.Dataset):

    # Mean and std of the dataset to be used in nn.Normalize
    mean = torch.tensor([0.3680, 0.3788, 0.3892])
    std = torch.tensor([0.2902, 0.3069, 0.3242])

    def __init__(self, root, transform, train=True, thinning=None):
        super().__init__()

        self.root = os.path.abspath(os.path.expandvars(os.path.expanduser(root)))
        self.transform = transform

        # load datasets
        if train:
            self.df = pd.read_csv(os.path.join(self.root, "labels_train.csv"))
        else:
            self.df = pd.read_csv(os.path.join(self.root, "labels_test.csv"))

        # Index by file id (i.e., a sequence of the same length as the number of images)
        codes, uniques = pd.factorize(self.df['frame'])

        if thinning:
            # Take every n-th row. This makes sense because the images are
            # frames of videos from the car, so we are essentially reducing
            # the frame rate
            thinned = uniques[::thinning]
            idx = self.df['frame'].isin(thinned)
            print(f"Keeping {thinned.shape[0]} of {uniques.shape[0]} images")
            print(f"Keeping {idx.sum()} objects out of {self.df.shape[0]}")
            self.df = self.df[idx].reset_index(drop=True)
            # Recompute codes
            codes, uniques = pd.factorize(self.df['frame'])

        self.n_images = len(uniques)
        self.df['image_id'] = codes
        self.df.set_index("image_id", inplace=True)

        self.classes = ['car', 'truck', 'pedestrian', 'bicyclist', 'light']
        self.colors = ['cyan', 'blue', 'red', 'purple', 'orange']

    @property
    def n_classes(self):
        return len(self.classes)

    def __getitem__(self, idx):
        if idx in self.df.index:
            row = self.df.loc[[idx]]
        else:
            return KeyError(f"Element {idx} not in dataframe")

        # load images from file
        img_path = os.path.join(self.root, "images", row['frame'].iloc[0])
        img = Image.open(img_path).convert("RGB")

        # Exclude bogus boxes with 0 height or width
        h = row['ymax'] - row['ymin']
        w = row['xmax'] - row['xmin']
        filter_idx = (h > 0) & (w > 0)
        row = row[filter_idx]

        # get bounding box coordinates for each mask
        boxes = row[['xmin', 'ymin', 'xmax', 'ymax']].values

        # convert everything into a torch.Tensor
        boxes = torch.as_tensor(boxes, dtype=torch.float32)

        # get the labels
        labels = torch.as_tensor(row['class_id'].values, dtype=int)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        # assume no crowd for everything
        iscrowd = torch.zeros((row.shape[0],), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transform is not None:
            img, target = self.transform(img, target)

        return img, target

    def __len__(self):
        return self.n_images

    def plot(self, idx, renormalize=True, predictions=None, threshold=0.5, ax=None):
        image, label_js = self[idx]

        if renormalize:
            # Invert the T.Normalize transform
            unnormalize = T.Compose(
                [
                    T.Normalize(mean=[0., 0., 0.], std=1 / type(self).std),
                    T.Normalize(mean=-type(self).mean, std=[1., 1., 1.])
                ]
            )
            image, label_js = unnormalize(image, label_js)

        if ax is None:
            fig, ax = plt.subplots(figsize=(8, 8))

        _ = ax.imshow(torch.permute(image, [1, 2, 0]))

        for i, box in enumerate(label_js['boxes']):
            xy = (box[0], box[1])
            h, w = (box[2] - box[0]), (box[3] - box[1])
            r = patches.Rectangle(xy, h, w, fill=False, color=self.colors[label_js['labels'][i]-1], lw=2, alpha=0.5)
            ax.add_patch(r)

        if predictions is not None:
            # Make sure the predictions are on the CPU
            for k in predictions:
                predictions[k] = predictions[k].detach().cpu().numpy()

            for i, box in enumerate(predictions['boxes']):
                if predictions['scores'][i] > threshold:
                    xy = (box[0], box[1])
                    h, w = (box[2] - box[0]), (box[3] - box[1])
                    r = patches.Rectangle(xy, h, w, fill=False, color=self.colors[predictions['labels'][i]-1], lw=2, linestyle=':')
                    ax.add_patch(r)

        _ = ax.axis("off")

        return ax
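This is not a full answer, but one detail in __getitem__ above may be directly related to the error in the title: the else branch returns a KeyError instance instead of raising it, so when an index is missing the exception object itself is handed to collate_fn, and tuple(zip(*batch)) then fails with "'KeyError' object is not iterable". A minimal, self-contained sketch of that mechanism (the Demo dataset and collate_fn here are illustrative stand-ins, not the project's actual code):

import torch
from torch.utils.data import Dataset, DataLoader

def collate_fn(batch):
    # same idea as the torchvision detection references' utils.collate_fn
    return tuple(zip(*batch))

class Demo(Dataset):
    def __len__(self):
        return 3

    def __getitem__(self, idx):
        if idx == 1:  # pretend this image_id is missing from the dataframe
            return KeyError(f"Element {idx} not in dataframe")    # what the code above does
            # raise KeyError(f"Element {idx} not in dataframe")   # what it probably intends
        return torch.zeros(2), {"labels": torch.tensor([0])}

# The returned KeyError reaches collate_fn, and zip() tries to iterate it:
# TypeError: 'KeyError' object is not iterable
for images, targets in DataLoader(Demo(), batch_size=1, collate_fn=collate_fn):
    pass

Raising instead of returning would surface the real problem (which indices are missing after the thinning/limit/split logic) rather than the confusing TypeError from the worker.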

ValueError: shapes (2,1000) and (2,2,1000) not aligned: 1000 (dim 1) != 2 (dim 1)

I'm implementing an MLP to test a simple NN architecture, hoping to later scale up to a bigger network with a larger dataset. My end goal is making a working phone recognizer for TIMIT data, as part of my internship.
To build the MLP, I followed the suggestions of this video: https://www.youtube.com/watch?v=Z97XGNUUx9o
and my teacher's proposal to use the following inputs:
X = np.random.rand(5,1000)
y = X[4:5,:]
The error message is the following:
ValueError Traceback (most recent call last)
Cell In [63], line 7
5 build_model()
6 mlp = MLP(1000, [1000], 1000)
----> 7 mlp.train(inputs,targets, 50, 0.1)
8 output = mlp.forward_propagate(input)
Cell In [62], line 117, in MLP.train(self, inputs, targets, epochs, learning_rate)
115 output = self.forward_propagate(input)
116 error = target - output
--> 117 self.back_propagate(error)
118 self.gradient_descent(learning_rate=1)
119 sum_error += self._mse(target,output)
Cell In [62], line 96, in MLP.back_propagate(self, error)
94 current_activations = self.activations[i]
95 current_activations_reshaped = current_activations.reshape(current_activations.shape[0], -1)
---> 96 self.derivatives[i] = np.dot(current_activations, delta)
97 error = np.dot(delta, self.weights[i].T)
98 return error
File <__array_function__ internals>:180, in dot(*args, **kwargs)
ValueError: shapes (2,1000) and (2,2,1000) not aligned: 1000 (dim 1) != 2 (dim 1)
This is the relevant code:
class MLP(object):

    def __init__(self, num_inputs=3, hidden_layers=[3, 3], num_outputs=2):
        self.num_inputs = num_inputs
        self.hidden_layers = hidden_layers
        self.num_outputs = num_outputs

        layers = [num_inputs] + hidden_layers + [num_outputs]

        weights = []
        for i in range(len(layers) - 1):
            w = np.random.rand(layers[i], layers[i + 1])
            weights.append(w)
        self.weights = weights

        activations = []
        for i in range(len(layers)):
            a = np.zeros(layers[i])
            activations.append(a)
        self.activations = activations

        derivatives = []
        for i in range(len(layers) - 1):
            d = np.zeros((layers[i], layers[i + 1]))
            derivatives.append(d)
        self.derivatives = derivatives

    def forward_propagate(self, inputs):
        activations = inputs
        self.activations[0] = inputs
        for i in range(len(self.weights)):
            net_inputs = np.dot(activations, self.weights)
            activations = self._sigmoid(net_inputs)
            self.activations[i + 1] = activations
        return activations

    def back_propagate(self, error):
        for i in reversed(range(len(self.derivatives))):
            activations = self.activations[i + 1]
            delta = error * self._sigmoid_derivative(activations)
            delta_reshaped = delta.reshape(delta.shape[0], -1).T
            current_activations = self.activations[i]
            current_activations_reshaped = current_activations.reshape(current_activations.shape[0], -1)
            self.derivatives[i] = np.dot(current_activations, delta)
            error = np.dot(delta, self.weights[i].T)
        return error

    def _sigmoid_derivative(self, x):
        return x * (1.0 - x)

    def _sigmoid(self, x):
        y = 1.0 / (1 + np.exp(-x))
        return y

    def gradient_descent(self, learning_rate):
        for i in range(len(self.weights)):
            weights = self.weights[i]
            derivatives = self.derivatives[i]
            weights += derivatives + learning_rate

    def _mse(self, target, output):
        return np.average((target - output) ** 2)

    def train(self, inputs, targets, epochs, learning_rate):
        for i in range(epochs):
            sum_error = 0
            for input, target in zip(inputs, targets):
                output = self.forward_propagate(input)
                error = target - output
                self.back_propagate(error)
                self.gradient_descent(learning_rate=1)
                sum_error += self._mse(target, output)
            print("Error: {} at epoch {}".format(sum_error / len(inputs), i))
And this is how I ran it:
if __name__ == "__main__":
X, y = load_dataset()
inputs = X
targets = y
build_model()
mlp = MLP(1000, [1000], 1000)
mlp.train(inputs,targets, 50, 0.1)
output = mlp.forward_propagate(input)
Thanks in advance!
I tried to do what the video said to set up an MLP, as my teacher suggested, but I don't know how to solve the shape error.
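For what it's worth, the shape mismatch seems to come from back_propagate computing delta_reshaped and current_activations_reshaped but then not using them, and from forward_propagate multiplying by the whole list self.weights instead of self.weights[i] (which is what produces a (2, 1000) array in the first place). A small, self-contained shape check of the outer-product step, under the assumption that the intended line is np.dot(current_activations_reshaped, delta_reshaped) as in the tutorial-style implementation this follows:

import numpy as np

# Shapes for one layer transition of the MLP above: n_prev inputs -> n_next outputs
n_prev, n_next = 1000, 1000
current_activations = np.random.rand(n_prev)   # activations entering the layer
delta = np.random.rand(n_next)                 # error term at the layer's outputs

current_activations_reshaped = current_activations.reshape(current_activations.shape[0], -1)  # (1000, 1)
delta_reshaped = delta.reshape(delta.shape[0], -1).T                                           # (1, 1000)

# The outer product has the same shape as the weight matrix, which is what
# self.derivatives[i] needs; np.dot(current_activations, delta) does not.
derivatives = np.dot(current_activations_reshaped, delta_reshaped)
print(derivatives.shape)   # (1000, 1000)

A related detail worth checking: gradient_descent adds the learning rate (derivatives + learning_rate), where a scaled update (derivatives * learning_rate) is presumably intended.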

How to create a DataSet of 1000 graphs in python

I need to create a dataset of 1000 graphs. I used the following code:
data_list = []
ngraphs = 1000

for i in range(ngraphs):
    num_nodes = randint(10, 500)
    num_edges = randint(10, num_nodes * (num_nodes - 1))

    f1 = np.random.randint(10, size=(num_nodes))
    f2 = np.random.randint(10, 20, size=(num_nodes))
    f3 = np.random.randint(20, 30, size=(num_nodes))
    f_final = np.stack((f1, f2, f3), axis=1)

    capital = 2 * f1 + f2 - f3

    f1_t = torch.from_numpy(f1)
    f2_t = torch.from_numpy(f2)
    f3_t = torch.from_numpy(f3)

    capital_t = torch.from_numpy(capital)
    capital_t = capital_t.type(torch.LongTensor)

    x = torch.from_numpy(f_final)
    x = x.type(torch.LongTensor)

    edge_index = torch.randint(low=0, high=num_nodes, size=(num_edges, 2), dtype=torch.long)
    edge_attr = torch.randint(low=0, high=50, size=(num_edges, 1), dtype=torch.long)

    data = Data(x=x, edge_index=edge_index.t().contiguous(), y=capital_t, edge_attr=edge_attr)
    data_list.append(data)
This works. But when I run my training function as follows:
for epoch in range(1, 500):
    loss = train()
    print(f'Loss: {loss:.4f}')
I keep getting the following error:
RuntimeError                              Traceback (most recent call last)
in ()
      1 for epoch in range(1, 500):
----> 2     loss = train()
      3     print(f'Loss: {loss:.4f}')

5 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
   1845     if has_torch_function_variadic(input, weight):
   1846         return handle_torch_function(linear, (input, weight), input, weight, bias=bias)
-> 1847     return torch._C._nn.linear(input, weight, bias)
   1848
   1849

RuntimeError: expected scalar type Float but found Long
Can someone help me troubleshoot this, or show how to make a 1000-graph dataset that doesn't throw this error?
Change your x and y tensors into FloatTensor, since the Linear layer in PyTorch only accepts FloatTensor inputs.
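A minimal sketch of that change, assuming Data is torch_geometric.data.Data as in the question. Note that y only needs the cast if the loss treats it as a regression target; for a classification loss such as cross-entropy, y should stay Long and only x needs to be float:

import numpy as np
import torch
from torch_geometric.data import Data

num_nodes, num_edges = 20, 40
f_final = np.random.randint(10, size=(num_nodes, 3))
capital = 2 * f_final[:, 0] + f_final[:, 1] - f_final[:, 2]

# Cast node features (and, for a regression-style loss, the targets) to float32
x = torch.from_numpy(f_final).float()
y = torch.from_numpy(capital).float()

edge_index = torch.randint(low=0, high=num_nodes, size=(num_edges, 2), dtype=torch.long)
edge_attr = torch.randint(low=0, high=50, size=(num_edges, 1), dtype=torch.long)

data = Data(x=x, edge_index=edge_index.t().contiguous(), y=y, edge_attr=edge_attr)
print(data.x.dtype, data.y.dtype)  # torch.float32 torch.float32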

Custom Evaluation Function based on F1 for use in xgboost - Python API

I have written the following custom evaluation function to use with xgboost, in order to optimize F1. Unfortunately, it raises an exception when run with xgboost.
The evaluation function is the following:
def F1_eval(preds, labels):
    t = np.arange(0, 1, 0.005)
    f = np.repeat(0, 200)
    Results = np.vstack([t, f]).T

    P = sum(labels == 1)

    for i in range(200):
        m = (preds >= Results[i, 0])
        TP = sum(labels[m] == 1)
        FP = sum(labels[m] == 0)
        if (FP + TP) > 0:
            Precision = TP / (FP + TP)
            Recall = TP / P
            if (Precision + Recall > 0):
                F1 = 2 * Precision * Recall / (Precision + Recall)
            else:
                F1 = 0
            Results[i, 1] = F1
    return (max(Results[:, 1]))
Below I provide a reproducible example along with the error message:
from sklearn import datasets

Wine = datasets.load_wine()

X_wine = Wine.data
y_wine = Wine.target
y_wine[y_wine == 2] = 1

X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(X_wine, y_wine, test_size=0.2)

clf_wine = xgb.XGBClassifier(max_depth=6, learning_rate=0.1, silent=False, objective='binary:logistic', \
                             booster='gbtree', n_jobs=8, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, \
                             subsample=0.8, colsample_bytree=0.8, colsample_bylevel=1, reg_alpha=0, reg_lambda=1)

clf_wine.fit(X_wine_train, y_wine_train, \
             eval_set=[(X_wine_train, y_wine_train), (X_wine_test, y_wine_test)], eval_metric=F1_eval, early_stopping_rounds=10, verbose=True)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-453-452852658dd8> in <module>()
12 clf_wine = xgb.XGBClassifier(max_depth=6, learning_rate=0.1,silent=False, objective='binary:logistic', booster='gbtree', n_jobs=8, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=0.8, colsample_bytree=0.8, colsample_bylevel=1, reg_alpha=0, reg_lambda=1)
13
---> 14 clf_wine.fit(X_wine_train, y_wine_train,eval_set=[(X_wine_train, y_wine_train), (X_wine_test, y_wine_test)], eval_metric=F1_eval, early_stopping_rounds=10, verbose=True)
15
C:\ProgramData\Anaconda3\lib\site-packages\xgboost\sklearn.py in fit(self, X, y, sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set)
519 early_stopping_rounds=early_stopping_rounds,
520 evals_result=evals_result, obj=obj, feval=feval,
--> 521 verbose_eval=verbose, xgb_model=None)
522
523 self.objective = xgb_options["objective"]
C:\ProgramData\Anaconda3\lib\site-packages\xgboost\training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, learning_rates)
202 evals=evals,
203 obj=obj, feval=feval,
--> 204 xgb_model=xgb_model, callbacks=callbacks)
205
206
C:\ProgramData\Anaconda3\lib\site-packages\xgboost\training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
82 # check evaluation result.
83 if len(evals) != 0:
---> 84 bst_eval_set = bst.eval_set(evals, i, feval)
85 if isinstance(bst_eval_set, STRING_TYPES):
86 msg = bst_eval_set
C:\ProgramData\Anaconda3\lib\site-packages\xgboost\core.py in eval_set(self, evals, iteration, feval)
957 if feval is not None:
958 for dmat, evname in evals:
--> 959 feval_ret = feval(self.predict(dmat), dmat)
960 if isinstance(feval_ret, list):
961 for name, val in feval_ret:
<ipython-input-383-dfb8d5181b18> in F1_eval(preds, labels)
11
12
---> 13 P = sum(labels == 1)
14
15
TypeError: 'bool' object is not iterable
I do not understand why the function is not working. I have followed the examples here: https://github.com/dmlc/xgboost/blob/master/demo/guide-python/custom_objective.py
I would like to understand where I err.
When doing sum(labels == 1), Python evaluates labels == 1 as a single Boolean object rather than an array (the labels argument that xgboost passes to your feval is a DMatrix, not a NumPy array), so you get TypeError: 'bool' object is not iterable.
The built-in sum expects an iterable, like a list. Here's an example of your error:
In[32]: sum(True)
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2963, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-32-6eb8f80b7f2e>", line 1, in <module>
sum(True)
TypeError: 'bool' object is not iterable
If you want to use scikit-learn's f1_score, you can implement the following wrapper:
from sklearn.metrics import f1_score
import numpy as np

def f1_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    err = 1 - f1_score(y_true, np.round(y_pred))
    return 'f1_err', err
The parameters of the wrapper are a list of predictions and a DMatrix, and it returns a (string, float) pair.
# Setting your classifier
clf_wine = xgb.XGBClassifier(max_depth=6, learning_rate=0.1, silent=False, objective='binary:logistic', \
                             booster='gbtree', n_jobs=8, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, \
                             subsample=0.8, colsample_bytree=0.8, colsample_bylevel=1, reg_alpha=0, reg_lambda=1)

# When you fit, add eval_metric=f1_eval
# Please don't forget to insert all the .fit arguments required
clf_wine.fit(eval_metric=f1_eval)
Here you can see an example of how to implement a custom objective function and a custom evaluation metric.
The example contains the following code:
# user defined evaluation function, return a pair metric_name, result
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make builtin evaluation metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the builtin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    # return a pair metric_name, result
    # since preds are margin(before logistic transformation, cutoff at 0)
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
which specifies that an evaluation function gets (predictions, dtrain) as arguments, where dtrain is of type DMatrix, and returns a (string, float) pair: the name of the metric and the error.
Here is a working Python code example:
import numpy as np

def _F1_eval(preds, labels):
    t = np.arange(0, 1, 0.005)
    f = np.repeat(0, 200)
    results = np.vstack([t, f]).T

    # assuming labels only contain 0's and 1's
    n_pos_examples = sum(labels)
    if n_pos_examples == 0:
        raise ValueError("labels not containing positive examples")

    for i in range(200):
        pred_indexes = (preds >= results[i, 0])
        TP = sum(labels[pred_indexes])
        FP = len(labels[pred_indexes]) - TP
        precision = 0
        recall = TP / n_pos_examples

        if (FP + TP) > 0:
            precision = TP / (FP + TP)

        if (precision + recall > 0):
            F1 = 2 * precision * recall / (precision + recall)
        else:
            F1 = 0
        results[i, 1] = F1
    return (max(results[:, 1]))

if __name__ == '__main__':
    labels = np.random.binomial(1, 0.75, 100)
    preds = np.random.random_sample(100)
    print(_F1_eval(preds, labels))
And if you want _F1_eval to work specifically with xgboost's evaluation methods, add this:
def F1_eval(preds, dtrain):
    res = _F1_eval(preds, dtrain.get_label())
    return 'f1_err', 1 - res

Tensorflow: 6 layer CNN: OOM (use 10Gb GPU memory)

I am using the following code to run a 6-layer CNN with 2 FC layers on top (on a Tesla K80 GPU).
Somehow it consumes the entire 10 GB of GPU memory and dies with an out-of-memory error. I know that I can reduce the batch_size and then run it, but I also want to run with 15 or 20 CNN layers. What is wrong with the following code, and why does it take all the memory? How should I run the code for a 15-layer CNN?
Code:
import model

with tf.Graph().as_default() as g_train:

    filenames = tf.train.match_filenames_once(FLAGS.train_dir + '*.tfrecords')
    filename_queue = tf.train.string_input_producer(filenames, shuffle=True, num_epochs=FLAGS.num_epochs)
    feats, labels = get_batch_input(filename_queue, batch_size=FLAGS.batch_size)
    ### feats size=(batch_size, 100, 50)
    logits = model.inference(feats, FLAGS.batch_size)
    loss = model.loss(logits, labels, feats)
    tvars = tf.trainable_variables()
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Add to the Graph operations that train the model.
    train_op = model.training(loss, tvars, global_step, FLAGS.learning_rate, FLAGS.clip_gradients)

    # Add the Op to compare the logits to the labels during evaluation.
    eval_correct = model.evaluation(logits, labels, feats)

    summary_op = tf.merge_all_summaries()

    saver = tf.train.Saver(tf.all_variables(), max_to_keep=15)

    # The op for initializing the variables.
    init_op = tf.initialize_all_variables()

    sess = tf.Session()
    sess.run(init_op)

    summary_writer = tf.train.SummaryWriter(FLAGS.model_dir, graph=sess.graph)

    # Start input enqueue threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        step = 0
        while not coord.should_stop():
            _, loss_value = sess.run([train_op, loss])
            if step % 100 == 0:
                print('Step %d: loss = %.2f' % (step, loss_value))
                # Update the events file.
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            if (step == 0) or (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                ckpt_model = os.path.join(FLAGS.model_dir, 'model.ckpt')
                saver.save(sess, ckpt_model, global_step=step)
                # saver.save(sess, FLAGS.model_dir, global_step=step)
            step += 1
    except tf.errors.OutOfRangeError:
        print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
    finally:
        coord.join(threads)

    sess.close()


###################### File model.py ####################

def conv2d(x, W, b, strides=1):
    # Conv2D wrapper, with bias and relu activation
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
    x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)


def maxpool2d(x, k=2, s=2):
    # MaxPool2D wrapper
    return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, s, s, 1], padding='SAME')


def inference(feats, batch_size):
    # feats size (batch_size, 100, 50, 1); batch_size=256
    conv1_w = tf.get_variable("conv1_w", [filter_size, filter_size, 1, 256], initializer=tf.uniform_unit_scaling_initializer())
    conv1_b = tf.get_variable("conv1_b", [256])
    conv1 = conv2d(feats, conv1_w, conv1_b, 2)
    conv1 = maxpool2d(conv1, k=2, s=2)
    ### This was replicated for 6 layers and the 2 FC connected layers are added
    return logits


def training(loss, train_vars, global_step, learning_rate, clip_gradients):
    # Add a scalar summary for the snapshot loss.
    tf.scalar_summary(loss.op.name, loss)
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, train_vars, aggregation_method=1), clip_gradients)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, train_vars), global_step=global_step)
    return train_op
I am not too sure what the model Python library is. If it is something you wrote and you can change the settings in the optimizer, I would suggest the following, which I use in my own code:
train_step = tf.train.AdamOptimizer(learning_rate).minimize(cost, aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
By default the aggregation_method is ADD_N, but if you change it to EXPERIMENTAL_ACCUMULATE_N or EXPERIMENTAL_TREE it can save a great deal of memory. The main memory hog in these programs is that TensorFlow must save the output values of every neuron so that it can compute the gradients. Changing the aggregation_method helps a lot in my experience.
Also, by the way, I don't think there is anything wrong with your code. I can run out of memory on small conv-nets as well.
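If the training() function in model.py is yours to edit, the same setting can also be threaded through its existing tf.gradients call rather than through minimize(); a sketch against the TF 1.x-style API used in the question (the tf.scalar_summary call is omitted here for brevity):

import tensorflow as tf

def training(loss, train_vars, global_step, learning_rate, clip_gradients):
    # Same structure as the question's training(), but with the gradient
    # aggregation method given as a named constant instead of a bare integer.
    grads = tf.gradients(
        loss,
        train_vars,
        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N,
    )
    grads, _ = tf.clip_by_global_norm(grads, clip_gradients)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    return optimizer.apply_gradients(zip(grads, train_vars), global_step=global_step)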
