grads is showing none after loss.backward() in fgsm attack - bert-language-model

import torch
from torch import nn
from transformers import BertTokenizer, BertForSequenceClassification
# Load pre-trained model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.cuda()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Set up device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define PGD/FGSM attack functions
def fgsm_attack(model, loss_fn, input_ids, attention_mask, labels, epsilon=0.1):
input_ids = input_ids.float()
attention_mask = attention_mask.float()
input_ids.requires_grad = True
attention_mask.requires_grad = True
logits = model(input_ids=input_ids.long(), attention_mask=attention_mask.long())[0]
loss = loss_fn(logits, labels)
loss.backward()
# Create perturbation tensor based on sign of gradients
perturbation = epsilon * input_ids.grad.sign()
# Add perturbation to input tensor and clamp values
perturbed_input = input_ids + perturbation
perturbed_input = torch.clamp(perturbed_input, 0, 1)
# Detach input from computation graph and return perturbed input
return perturbed_input.detach()
from tqdm import tqdm
for batch in tqdm(validation_dataloader):
input_ids = batch[0].to(device)
token_type_ids = batch[1].to(device)
attention_mask = batch[2].to(device)
labels = batch[3].to(device)
peterbed_inputs = fgsm_attack(model, nn.CrossEntropyLoss(), input_ids, attention_mask, labels, epsilon=0.1)
#outputs = model(input_ids, attention_mask=attention_mask, token_type_ids = token_type_ids)
I imported the BERT model and tried to perform fgsm attack on it. But it throws the error while getting sign of input_ids.grad.sign() showing that the value is None
Can some please help me why the input_ids.grad showing None
I expected input_ids.grads not to be None

Related

Optimization failing after very few iterations for nonlinear constraints calculated in a blackbox wrapped in an explicit component

I have a blackbox solver which is wrapped as explicit component and the objective function and constraints are calculated in the blackbox solver and output. These are taken to a constraint components that has an equality constraint defined such that at any iteration, these constraints are satisifed. I am using finite difference to approximate the partial derivatives. However, I get this SLSQP error "Positive directional derivative for linesearch". From S.O., I understand that this error translates - optimizer could not find a direction to move to and also couldn't verify if the results are minimum. I found that for some iterations derivative is 'None' and it was 'None' at least a few times before it threw this error. Is it because the constraints are calculated in the black box solver? or is it because 'fd' for approximation is not working for non linear constraints? or both? A problem summary is attached for reference.
from PowerHarvest import *
from HydroDynamics import *
from SatelliteComms import *
from Propulsion import *
from Constraints import *
from SystemCost import *
class MDA(Group):
"""Multidisciplinary Analysis Group"""
def __init__(self, derivative_method='fd', **kwargs):
super(MDA, self).__init__(**kwargs)
self.derivative_method = 'fd'
def setup(self):
cycle = self.add_subsystem('cycle',Group(), promotes = ["*"])
cycle.nonlinear_solver = om.NewtonSolver(solve_subsystems = True)
cycle.nonlinear_solver.options['atol'] = 1e-6
cycle.add_subsystem('Hydro', Hydro(),promotes = ["*"]) #This is a blackbox explicit component!
cycle.add_subsystem('Propulsion_system', Propulsion(),promotes = ["*"])
cycle.add_subsystem('PowerHarvest_system',PowerHarvest(),promotes = ["*"])
cycle.add_subsystem("SatelitteComs_system", SatelitteComs(),promotes = ["*"])
cycle.nonlinear_solver.options['atol'] = 1.0e-5
cycle.nonlinear_solver.options['maxiter'] = 500
cycle.nonlinear_solver.options['iprint'] = 2
#Add constraint on the each subsytem if possible
#cycle.add_constraint('',om.ex)
self.add_subsystem('PowerConstraints_system', PowerConstraints(), promotes=["*"])
self.add_subsystem('BodyConstraints_system', BodyConstraint(),promotes = ["*"])
self.add_subsystem('SystemCost_system',SystemCost(), promotes = ['*'])
self.add_constraint('A_PV', upper = 100, units = 'm**2')
#these constraints are output of the blackbox solver!
self.add_constraint('AreaCon', upper = 0)
self.add_constraint('massCon',equals = 0)
self.add_constraint('P_Load', upper = 0) # Solar generates just enough for everything no storing!
self.add_constraint('DraughtCon', lower = 0.5 )
self.add_constraint('GMCon', lower = 0.01) #should be positive
#self.add_constraint('theta', upper = 0.14, lower = 0.1)
self.add_constraint('Amplitude_Con',upper = -0.1) #amplitude differenc
Added. Run script
import openmdao.api as om
from geom_utils import *
from openmdao.api import Problem, Group, ExplicitComponent,ImplicitComponent, IndepVarComp, ExecComp,\
NonlinearBlockGS, ScipyOptimizeDriver,NewtonSolver,DirectSolver,ScipyKrylov
import os
import numpy as np
from types import FunctionType
from geom_utils import *
from capytaine.meshes.meshes import Mesh
from pprint import pprint
from PowerHarvest import *
from HydroDynamics import *
from SatelliteComms import *
from Propulsion import *
from Constraints import *
from SystemCost import *
from PEARLMDA import *
if __name__ == '__main__':
prob = Problem()
model = prob.model = MDA()
prob.driver = ScipyOptimizeDriver(optimizer = 'SLSQP')
# prob.model.nonlinear_solver = om.NonlinearBlockGS()
#prob.driver.options['optimizer'] = 'COBYLA'
prob.driver.options['tol'] = 1e-5
prob.model.add_design_var('Df', lower= 6.0, upper=20.0, units = "m")
prob.model.add_design_var('tf', lower=1.0, upper=4.0, units = "m")
#prob.model.add_design_var('submergence', upper = -0.9)
prob.model.add_design_var('Vs', lower=1, upper=2, units = "m/s") #make sure the lower, upper are according to their units.
prob.model.add_design_var('ld', lower = 3, upper = 7, units = 'm' )
prob.model.add_objective('cost_per_byte' )
newton = om.NewtonSolver(solve_subsystems=True)
newton.linesearch = om.BoundsEnforceLS()
prob.model.nonlinear_solver = newton
prob.model.linear_solver = om.DirectSolver()
# sqlite file to record the intermediate calculations and derivatives
r = om.SqliteRecorder("pearl_computations.sql")
prob.add_recorder(r)
prob.driver.add_recorder(r)
prob.driver.recording_options["record_derivatives"] = True
# Attach recorder to a subsystem
model.nonlinear_solver.add_recorder(r)
model.add_recorder(r)
prob.driver.recording_options["includes"] = ["*"]
# Attach recorder to a solver
model.nonlinear_solver.add_recorder(r)
prob.setup()
prob.set_solver_print(level=2)
# For gradients across the model this will do the finite difference method
prob.model.approx_totals(method="fd", step=0.1, form="forward", step_calc="abs")
prob.run_model()
prob.run_driver()
prob.record("final_state")
print('minimum objective found at')
print(prob['cost_per_byte'][0])
print(prob['A_PV'])
print(f"tf: {prob['tf'][0]}")
results = dict()
results['tf'] = prob['tf'][0]
results['Df'] = prob['Df'][0]
results['ld'] = prob['ld'][0]
results['mass'] = prob['Payloadmass'][0]
results['DraughCon'] = prob['DraughtCon'][0]
results['AmplitudeCon'] = prob['AmplitudeCon'][0]
print(results)
Scaling report

RuntimeError: quantile() q tensor must be same dtype as the input tensor in pytorch-forecasting

PyTorch-Forecasting version: 0.10.2
PyTorch version:1.12.1
Python version:3.10.4
Operating System: windows
Expected behavior
No Error
Actual behavior
The Error is
File c:\Users\josepeeterson.er\Miniconda3\envs\pytorch\lib\site-packages\pytorch_forecasting\metrics\base_metrics.py:979, in DistributionLoss.to_quantiles(self, y_pred, quantiles, n_samples)
977 except NotImplementedError: # resort to derive quantiles empirically
978 samples = torch.sort(self.sample(y_pred, n_samples), -1).values
--> 979 quantiles = torch.quantile(samples, torch.tensor(quantiles, device=samples.device), dim=2).permute(1, 2, 0)
980 return quantiles
RuntimeError: quantile() q tensor must be same dtype as the input tensor
How do I set them to be of same datatype? This is happening internally. I do not have control over this. I am not using any GPUs.
The link to the .csv file with input data is https://github.com/JosePeeterson/Demand_forecasting
The data is just sampled from a negative binomila distribution wiht parameters (9,0.5) every 4 hours. the time inbetween is all zero.
I just want to see if DeepAR can learn this pattern.
Code to reproduce the problem
from pytorch_forecasting.data.examples import generate_ar_data
import matplotlib.pyplot as plt
import pandas as pd
from pytorch_forecasting.data import TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
import pytorch_lightning as pl
from pytorch_forecasting import NegativeBinomialDistributionLoss, DeepAR
import torch
from pytorch_forecasting.data.encoders import TorchNormalizer
data = [pd.read_csv('1_f_nbinom_train.csv')]
data["date"] = pd.Timestamp("2021-08-24") + pd.to_timedelta(data.time_idx, "H")
data['_hour_of_day'] = str(data["date"].dt.hour)
data['_day_of_week'] = str(data["date"].dt.dayofweek)
data['_day_of_month'] = str(data["date"].dt.day)
data['_day_of_year'] = str(data["date"].dt.dayofyear)
data['_week_of_year'] = str(data["date"].dt.weekofyear)
data['_month_of_year'] = str(data["date"].dt.month)
data['_year'] = str(data["date"].dt.year)
max_encoder_length = 60
max_prediction_length = 20
training_cutoff = data["time_idx"].max() - max_prediction_length
training = TimeSeriesDataSet(
data.iloc[0:-620],
time_idx="time_idx",
target="value",
categorical_encoders={"series": NaNLabelEncoder(add_nan=True).fit(data.series), "_hour_of_day": NaNLabelEncoder(add_nan=True).fit(data._hour_of_day), \
"_day_of_week": NaNLabelEncoder(add_nan=True).fit(data._day_of_week), "_day_of_month" : NaNLabelEncoder(add_nan=True).fit(data._day_of_month), "_day_of_year" : NaNLabelEncoder(add_nan=True).fit(data._day_of_year), \
"_week_of_year": NaNLabelEncoder(add_nan=True).fit(data._week_of_year), "_year": NaNLabelEncoder(add_nan=True).fit(data._year)},
group_ids=["series"],
min_encoder_length=max_encoder_length,
max_encoder_length=max_encoder_length,
min_prediction_length=max_prediction_length,
max_prediction_length=max_prediction_length,
time_varying_unknown_reals=["value"],
time_varying_known_categoricals=["_hour_of_day","_day_of_week","_day_of_month","_day_of_year","_week_of_year","_year" ],
time_varying_known_reals=["time_idx"],
add_relative_time_idx=False,
randomize_length=None,
scalers=[],
target_normalizer=TorchNormalizer(method="identity",center=False,transformation=None )
)
validation = TimeSeriesDataSet.from_dataset(
training,
data.iloc[-620:-420],
# predict=True,
stop_randomization=True,
)
batch_size = 64
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=8)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=8)
# save datasets
training.save("training.pkl")
validation.save("validation.pkl")
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=5, verbose=False, mode="min")
lr_logger = LearningRateMonitor()
trainer = pl.Trainer(
max_epochs=10,
gpus=0,
gradient_clip_val=0.1,
limit_train_batches=30,
limit_val_batches=3,
# fast_dev_run=True,
# logger=logger,
# profiler=True,
callbacks=[lr_logger, early_stop_callback],
)
deepar = DeepAR.from_dataset(
training,
learning_rate=0.1,
hidden_size=32,
dropout=0.1,
loss=NegativeBinomialDistributionLoss(),
log_interval=10,
log_val_interval=3,
# reduce_on_plateau_patience=3,
)
print(f"Number of parameters in network: {deepar.size()/1e3:.1f}k")
torch.set_num_threads(10)
trainer.fit(
deepar,
train_dataloaders=train_dataloader,
val_dataloaders=val_dataloader,
)
Need to cast samples to torch.tensor as shown below. Then save this base_metrics.py and rerun above code.
except NotImplementedError: # resort to derive quantiles empirically
samples = torch.sort(self.sample(y_pred, n_samples), -1).values
quantiles = torch.quantile(torch.tensor(samples), torch.tensor(quantiles, device=samples.device), dim=2).permute(1, 2, 0)
return quantiles

Error message when running the codes in Jupyter notebook

I am trying to test out the accuracy of the images without using image augmentation. When I run both of the codes, I got an error shown below:
TypeError: 'NoneType' object is not callable
I found that the error occurs in the second code. I would like to know the reason on the cause of this error message, and how to resolve it. Attached below are my codes, which have to be run simultaneously. I am using Jupyter notebook for that. Thanks!
Code 1:
import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
import math
class CrossEntropyLabelSmooth(nn.Module):
"""Cross entropy loss with label smoothing regularizer.
Reference:
Szegedy et al. Rethinking the Inception Architecture for Computer Vision. CVPR 2016.
Equation: y = (1 - epsilon) * y + epsilon / K.
Args:
num_classes (int): number of classes.
epsilon (float): weight.
"""
def __init__(self, num_classes, epsilon=0.1, device='cpu'):
super(CrossEntropyLabelSmooth, self).__init__()
self.num_classes = num_classes
self.epsilon = epsilon
self.device = device
self.logsoftmax = nn.LogSoftmax(dim=1)
def forward(self, inputs, targets):
"""
Args:
inputs: prediction matrix (before softmax) with shape (batch_size, num_classes)
targets: ground truth labels with shape (num_classes)
"""
log_probs = self.logsoftmax(inputs)
# targets = torch.zeros(log_probs.size()).scatter_(1, targets.unsqueeze(1).data, 1)# for mldg da
targets = torch.zeros(log_probs.size()).scatter_(1, targets.unsqueeze(1).data.cpu(), 1)#for zzd
targets = targets.to(self.device)
targets = (1 - self.epsilon) * targets + self.epsilon / self.num_classes
loss = (-Variable(targets) * log_probs).mean(0).sum()
return loss
class TripletLoss(nn.Module):
"""Triplet loss with hard positive/negative mining.
Reference:
Hermans et al. In Defense of the Triplet Loss for Person Re-Identification. arXiv:1703.07737.
Code imported from https://github.com/Cysu/open-reid/blob/master/reid/loss/triplet.py.
Args:
margin (float): margin for triplet.
"""
def __init__(self, margin=0.3):
super(TripletLoss, self).__init__()
self.margin = margin
self.ranking_loss = nn.MarginRankingLoss(margin=margin)
def forward(self, inputs, targets):
"""
Args:
inputs: feature matrix with shape (batch_size, feat_dim)
targets: ground truth labels with shape (num_classes)
"""
n = inputs.size(0)
# Compute pairwise distance, replace by the official when merged
dist = torch.pow(inputs, 2).sum(dim=1, keepdim=True).expand(n, n)
dist = dist + dist.t()
dist.addmm_(1, -2, inputs, inputs.t())
dist = dist.clamp(min=1e-12).sqrt() # for numerical stability
# For each anchor, find the hardest positive and negative
mask = targets.expand(n, n).eq(targets.expand(n, n).t())
dist_ap, dist_an = [], []
for i in range(n):
dist_ap.append(dist[i][mask[i]].max().unsqueeze(0))
dist_an.append(dist[i][mask[i] == 0].min().unsqueeze(0))
dist_ap = torch.cat(dist_ap)
dist_an = torch.cat(dist_an)
# Compute ranking hinge loss
y = torch.ones_like(dist_an)
loss = self.ranking_loss(dist_an, dist_ap, y)
return loss
class CenterLoss(nn.Module):
"""Center loss.
Reference:
Wen et al. A Discriminative Feature Learning Approach for Deep Face Recognition. ECCV 2016.
Args:
num_classes (int): number of classes.
feat_dim (int): feature dimension.
"""
def __init__(self, num_classes=10, feat_dim=2048, device='cpu'):
super(CenterLoss, self).__init__()
self.num_classes = num_classes
self.feat_dim = feat_dim
self.device = device
self.centers = nn.Parameter(torch.randn(self.num_classes, self.feat_dim)).to(self.device)
def forward(self, x, labels):
"""
Args:
x: feature matrix with shape (batch_size, feat_dim).
labels: ground truth labels with shape (num_classes).
"""
batch_size = x.size(0)
distmat = torch.pow(x, 2).sum(dim=1, keepdim=True).expand(batch_size, self.num_classes) + \
torch.pow(self.centers, 2).sum(dim=1, keepdim=True).expand(self.num_classes, batch_size).t()
distmat.addmm_(1, -2, x, self.centers.t())
classes = torch.arange(self.num_classes).long()
classes = classes.to(self.device)
labels = labels.unsqueeze(1).expand(batch_size, self.num_classes)
mask = labels.data.eq(classes.expand(batch_size, self.num_classes))
dist = []
for i in range(batch_size):
value = distmat[i][mask[i]]
value = value.clamp(min=1e-12, max=1e+12) # for numerical stability
dist.append(value)
dist = torch.cat(dist)
loss = dist.mean()
return loss
Code 2:
# Code without data augmentation
import torch
import torch.nn as nn
from torchvision.datasets import ImageFolder
from torchvision import transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
import os
import numpy as np
from tqdm import tqdm
from PIL import Image
class FoodDataset(Dataset):
def __init__(self, file, transform=None, mode='train'):
self.transforms = transform
self.mode = mode
with open(file, 'r') as f:
self.image_list = f.readlines()
def __len__(self):
return len(self.image_list)
def __getitem__(self, index):
label = None
if self.mode == 'train':
image, label = self.image_list[index].split('\n')[0].split('\t')
label = int(label)
else:
image = self.image_list[index].split('\n')[0]
image = Image.open(image).convert('RGB')
image = self.transforms(image)
if self.mode == 'train':
return image, label
else:
return image
#transforms_train = transforms.Compose([
# transforms.Resize((224, 224)),
# transforms.RandomHorizontalFlip(p=0.5),
# transforms.RandomVerticalFlip(p=0.5),
# transforms.Pad(10, 10),
# transforms.RandomRotation(45),
# transforms.RandomCrop((224, 224)),
# transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5),
# transforms.ToTensor(),
# transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# ])
#transforms_test = transforms.Compose([
# transforms.Resize((224, 224)),
# transforms.ToTensor(),
# transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# ])
def evaluate(prediction, ground_truth):
num_correct = (np.array(prediction) == np.array(ground_truth)).sum()
return num_correct / len(prediction)
train_ds = FoodDataset('data/train.txt')
val_ds = FoodDataset('data/val.txt')
test_ds = FoodDataset('data/test.txt')
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=32, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=32, shuffle=True)
num_classes = 5
train_model = models.resnet50(pretrained=True)
train_model.fc = nn.Linear(2048, num_classes)
output_dir = 'checkpoint'
if output_dir and not os.path.exists(output_dir):
os.makedirs(output_dir)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
ce_loss = CrossEntropyLabelSmooth(num_classes = num_classes, device = device)
optimizer = torch.optim.Adam(train_model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)
for param in train_model.parameters():
param.requires_grad = False
for param in train_model.fc.parameters():
param.requires_grad = True
for i in range(5):
train_model.train()
train_model.to(device)
for img, label in tqdm(train_dl):
img = img.to(device)
label = label.to(device)
optimizer.zero_grad()
output= train_model(img)
loss = ce_loss(output, label)
loss.backward()
optimizer.step()
for param in train_model.parameters():
param.requires_grad = True
epoch = 100
highest_acc = {'epoch': 0, 'accuracy': 0}
for ep in range(epoch):
train_model.train()
train_model.to(device)
count = 0
running_loss = 0.0
validation_loss = 0.0
output_list = []
ground_truth_list = []
for img, label in tqdm(train_dl):
img = img.to(device)
label = label.to(device)
optimizer.zero_grad()
output= train_model(img)
loss = ce_loss(output, label)
count += 1
prediction = torch.argmax(output, dim=1)
output_list.extend(prediction.detach().cpu())
ground_truth_list.extend(label.cpu())
running_loss += loss.item()
loss.backward()
optimizer.step()
scheduler.step()
if ep % 10 == 0:
torch.save(train_model.state_dict(), output_dir + '/resnet50_' + str(ep) + '.pth')
accuracy = evaluate(output_list, ground_truth_list)
print(f'Epoch[{ep}] training accuracy: {accuracy} '
f'training loss: {running_loss / count:.3e} Base Lr: {optimizer.param_groups[0]["lr"]:.5e}')
if ep % 10 == 0:
train_model.eval()
count = 0
output_list = []
ground_truth_list = []
for img, label in tqdm(val_dl):
with torch.no_grad():
img = img.to(device)
lbl = label.to(device)
output= train_model(img)
val_loss = ce_loss(output, lbl)
validation_loss += val_loss.item()
count += 1
prediction = torch.argmax(output, dim=1)
output_list.extend(prediction.detach().cpu())
ground_truth_list.extend(label)
accuracy = evaluate(output_list, ground_truth_list)
if accuracy > highest_acc['accuracy']:
highest_acc['accuracy'] = accuracy
highest_acc['epoch'] = ep
print(f'Accuracy: {accuracy} Epoch:{ep}')
torch.save(train_model.state_dict(), output_dir + '/resnet50_' + 'final' + '.pth')
print('highest_acc: {} epoch: {}'.format(highest_acc['accuracy'], highest_acc['epoch']))

Error when implementing RBF kernel bandwidth differentiation in Pytorch

I'm implementing an RBF network by using some beginer examples from Pytorch Website. I have a problem when implementing the kernel bandwidth differentiation for the network. Also, Iwould like to know whether my attempt ti implement the idea is fine. This is a code sample to reproduce the issue. Thanks
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable
def kernel_product(x,y, mode = "gaussian", s = 1.):
x_i = x.unsqueeze(1)
y_j = y.unsqueeze(0)
xmy = ((x_i-y_j)**2).sum(2)
if mode == "gaussian" : K = torch.exp( - xmy/s**2) )
elif mode == "laplace" : K = torch.exp( - torch.sqrt(xmy + (s**2)))
elif mode == "energy" : K = torch.pow( xmy + (s**2), -.25 )
return torch.t(K)
class MyReLU(torch.autograd.Function):
"""
We can implement our own custom autograd Functions by subclassing
torch.autograd.Function and implementing the forward and backward passes
which operate on Tensors.
"""
#staticmethod
def forward(ctx, input):
"""
In the forward pass we receive a Tensor containing the input and return
a Tensor containing the output. ctx is a context object that can be used
to stash information for backward computation. You can cache arbitrary
objects for use in the backward pass using the ctx.save_for_backward method.
"""
ctx.save_for_backward(input)
return input.clamp(min=0)
#staticmethod
def backward(ctx, grad_output):
"""
In the backward pass we receive a Tensor containing the gradient of the loss
with respect to the output, and we need to compute the gradient of the loss
with respect to the input.
"""
input, = ctx.saved_tensors
grad_input = grad_output.clone()
grad_input[input < 0] = 0
return grad_input
dtype = torch.cuda.FloatTensor
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)
# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(H, D_in).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)
# I've created this scalar variable (the kernel bandwidth)
s = Variable(torch.randn(1).type(dtype), requires_grad=True)
learning_rate = 1e-6
for t in range(500):
# To apply our Function, we use Function.apply method. We alias this as 'relu'.
relu = MyReLU.apply
# Forward pass: compute predicted y using operations on Variables; we compute
# ReLU using our custom autograd operation.
# y_pred = relu(x.mm(w1)).mm(w2)
y_pred = relu(kernel_product(w1, x, s)).mm(w2)
# Compute and print loss
loss = (y_pred - y).pow(2).sum()
print(t, loss.data[0])
# Use autograd to compute the backward pass.
loss.backward()
# Update weights using gradient descent
w1.data -= learning_rate * w1.grad.data
w2.data -= learning_rate * w2.grad.data
# Manually zero the gradients after updating weights
w1.grad.data.zero_()
w2.grad.data.zero_()
However I get this error, which dissapears when I simply use a fixed scalar in the default input parameter of kernel_product():
RuntimeError: eq() received an invalid combination of arguments - got (str), but expected one of:
* (float other)
didn't match because some of the arguments have invalid types: (str)
* (Variable other)
didn't match because some of the arguments have invalid types: (str)
Well, you are calling kernel_product(w1, x, s) where w1, x and s are torch Variable while the definition of the function is: kernel_product(x,y, mode = "gaussian", s = 1.). Seems like s should be a string specifying the mode.

Categorical image classification always predicts one class, though calculated accuracy reaches 100%

I followed the Keras cat/dog image classification tutorial
Keras Image Classification tutorial
and found similar results to the reported values. I then took the code from the first example in that tutorial Tutorial Example 1 code, slightly altered a few lines, and trained the model for a dataset of grayscale images (~150 thousand images across 7 classes).
This gave me great initial results ( ~84% accuracy), which I am happy with.
Next I tried implementing the image batch generator myself, which is where I am having trouble. Briefly, the code seems to run well, except the reported accuracy of the model quickly shoots to >= 99% within two epochs. Due to noise in the dataset, this amount of accuracy is not believable. After using the trained model to predict a new batch of data ( images outside of the training or validation dataset ), I find the model always predicts the first class ( i.e. [1.,0.,0.,0.,0.,0.,0.]. The loss function is forcing the model to predict a single class 100% of the time, even though the labels I pass in are distributed across all the classes.
After 28 epochs of training, I see the following output:
320/320 [==============================] - 1114s - loss: 1.5820e-07 - categorical_accuracy: 1.0000 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 16.1181 - val_categorical_accuracy: 0.0000e+00 - val_sparse_categorical_accuracy: 0.0000e+00
When I examine the batch generator output from the tutorial code, and compare my batch generator output, the shape, datatype, and range of values are identical between both generators. I would like to emphasize that the generator passes y labels from each category, not just array([ 1.., 0., 0., 0., 0., 0., 0.], dtype=float32). Therefore, I am lost as to what I am doing incorrectly.
Since I posted this code several days ago, I have used the default Keras image generator, and successfully trained the network on the same dataset and same network architecture. Therefore, something about how I load and pass the data in the generator must be incorrect.
Here is the code I implemented:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import imgaug as ia
from imgaug import augmenters as iaa
import numpy as np
import numpy.random as nprand
import imageio
import os, re, random, sys, csv
import scipy
img_width, img_height = 112, 112
input_shape = (img_width,img_height,1)
batch_size = 200
epochs = 2
train_image_directory = '/PATH/To/Directory/train/'
valid_image_directory = '/PATH/To/Directory/validate/'
video_info_file = '/PATH/To/Directory/train_labels.csv'
train_image_paths = [train_image_directory + m.group(1) for m in [re.match(r"(\d+_\d+\.png)", fname) for fname in os.listdir(train_image_directory)] if m is not None]
valid_image_paths = [valid_image_directory + m.group(1) for m in [re.match(r"(\d+_\d+\.png)", fname) for fname in os.listdir(valid_image_directory)] if m is not None]
num_train_images = len(train_image_paths)
num_val_images = len(valid_image_paths)
label_map = {}
label_decode = {
'0': [1.,0.,0.,0.,0.,0.,0.],
'1': [0.,1.,0.,0.,0.,0.,0.],
'2': [0.,0.,1.,0.,0.,0.,0.],
'3': [0.,0.,0.,1.,0.,0.,0.],
'4': [0.,0.,0.,0.,1.,0.,0.],
'5': [0.,0.,0.,0.,0.,1.,0.],
'6': [0.,0.,0.,0.,0.,0.,1.]
}
with open(video_info_file) as f:
reader = csv.reader(f)
for row in reader:
key = row[0]
if key in label_map:
pass
label_map[key] = label_decode[row[1]]
sometimes = lambda aug: iaa.Sometimes(0.5,aug)
seq = iaa.Sequential(
[
iaa.Fliplr(0.5),
iaa.Flipud(0.2),
sometimes(iaa.Crop(percent=(0, 0.1))),
sometimes(iaa.Affine(
scale={"x": (0.8, 1.2), "y": (0.8, 1.2)},
translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)},
rotate=(-5, 5),
shear=(-16, 16),
order=[0, 1],
cval=(0, 1),
mode=ia.ALL
)),
iaa.SomeOf((0, 3),
[
sometimes(iaa.Superpixels(p_replace=(0, 0.40), n_segments=(20, 100))),
iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)),
iaa.Emboss(alpha=(0, 1.0), strength=(0, 1.0)),
iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255)),
iaa.OneOf([
iaa.Dropout((0.01, 0.1)),
iaa.CoarseDropout((0.03, 0.15), size_percent=(0.02, 0.05)),
]),
iaa.Invert(0.05),
iaa.Add((-10, 10)),
iaa.Multiply((0.5, 1.5), per_channel=0.5),
iaa.ContrastNormalization((0.5, 2.0)),
sometimes(iaa.ElasticTransformation(alpha=(0.5, 1.5), sigma=0.2)),
sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.03))) # sometimes move parts of the image around
],
random_order=True
)
],
random_order=True)
def image_data_generator(image_paths, labels, batch_size, training):
while(1):
image_paths = nprand.choice(image_paths, batch_size)
X0 = np.asarray([imageio.imread(x) for x in image_paths])
Y = np.asarray([labels[x] for x in image_paths],dtype=np.float32)
if(training):
X = np.divide(np.expand_dims(seq.augment_images(X0)[:,:,:,0],axis=3),255.)
else:
X = np.expand_dims(np.divide(X0[:,:,:,0],255.),axis=3)
X = np.asarray(X,dtype=np.float32)
yield X,Y
def predict_videos(model,video_paths):
i=0
predictions=[]
while(i < len(video_paths)):
video_reader = imageio.get_reader(video_paths[i])
X0 = np.expand_dims([ im[:,:,0] for x,im in enumerate(video_reader) ],axis=3)
prediction = model.predict(X0)
i=i+1
predictions.append(prediction)
return predictions
train_gen = image_data_generator(train_image_paths,label_map,batch_size,True)
val_gen = image_data_generator(valid_image_paths,label_map,batch_size,False)
model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape=input_shape))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.4))
model.add(Dense(7))
model.add(Activation('softmax'))
model.load_weights('/PATH/To_pretrained_weights/pretrained_model.h5')
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
optimizer='sgd',
metrics=['categorical_accuracy','sparse_categorical_accuracy'])
checkpointer = ModelCheckpoint('/PATH/To_pretrained_weights/pretrained_model.h5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
reduceLR = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=20, verbose=0, mode='auto', cooldown=0, min_lr=0)
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
callbacks_list = [checkpointer, early_stop, reduceLR]
model.fit_generator(
train_gen,
steps_per_epoch = -(-num_train_images // batch_size),
epochs=epochs,
validation_data=val_gen,
validation_steps = -(-num_val_images // batch_size),
callbacks=callbacks_list)
For some reason that I cannot fully determine, if you do not give the fit_generator function accurate numbers for steps per epoch or steps for validation, the result is inaccurate reporting of the accuracy metric and strange gradient descent steps.
You can fix this problem by using the Train_on_batch function in Keras instead of the fit generator, or by accurately reporting these step numbers.

Resources