How to get the correct embedding from RoBERTa transformers?

I'm confused about which hidden state I should use as the output of a fine-tuned RoBERTa transformer model.
from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer
config = AutoConfig.from_pretrained("roberta-base")
config.output_hidden_states = True
tok = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForMaskedLM.from_pretrained("roberta-base", config=config)
inp = "alright let s do this "
sentence = tok.encode(inp, padding='max_length', max_length=512, truncation=True, return_tensors='pt')
output = model(sentence)
According to the Hugging Face documentation, RobertaForMaskedLM returns a tuple of:
masked_lm_loss (optional)
prediction_scores
hidden_states (optional)
attentions (optional)
By passing a config with output_hidden_states enabled, the output becomes a tuple of (prediction_scores, hidden_states).
My question is:
Should I use output[-1][0] or output[-1][-1] as the final output embedding from the fine-tuned RoBERTa model? My understanding is that output[-1][0] is the initial embedding fed into the RoBERTa model, and output[-1][-1] is the final embedding output.

output[-1][-1] is correct if you are looking for the output of the last encoding layer. You can figure this out by looking at the source code, and validate it by comparing the outputs:
import torch

# 13 hidden states: the embedding output plus one per encoder layer
print(len(output[-1]))

outputEmbeddings = model.roberta.embeddings(sentence)
# the first tensor is the output of the embedding layer
print(torch.equal(output[1][0], outputEmbeddings))
# the second tensor is the output of the first encoding layer
print(torch.equal(output[1][1], model.roberta.encoder.layer[0](outputEmbeddings)[0]))

previousLayer = outputEmbeddings
for x in range(12):
    # after this line, previousLayer holds the output of encoding layer x
    previousLayer = model.roberta.encoder.layer[x](previousLayer)[0]
    print(torch.equal(output[1][1 + x], previousLayer))
Output:
13
True
True
True
True
True
True
True
True
True
True
True
True
True
True
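If what you are after is a fixed-size sentence embedding rather than per-token states, a common approach is to pool the last hidden state while masking out padding tokens. A minimal sketch (not part of the original answer; it reuses tok, model and inp from the question, and assumes the tokenizer call returns an attention mask):

enc = tok(inp, padding='max_length', max_length=512, truncation=True, return_tensors='pt')
hidden = model(enc['input_ids'])[-1][-1]                   # (batch, seq_len, hidden_size)
mask = enc['attention_mask'].unsqueeze(-1)                 # (batch, seq_len, 1)
embedding = (hidden * mask).sum(dim=1) / mask.sum(dim=1)   # (batch, hidden_size)
print(embedding.shape)                                     # torch.Size([1, 768])

Mean pooling is only one heuristic; taking the first (CLS-position) token's hidden state is another common choice.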

Related

detectron2 diffusioninst: oom-kill during training

I tried to run the code for DiffusionInst, which is based on Detectron2 (source code: https://github.com/chenhaoxing/DiffusionInst). During training, my Python process keeps getting killed (at 10,000-20,000 iterations, which is not enough for DiffusionInst training).
I only rewrote the dataloader code, to adapt it to my own dataset.
My new code for dataloader:
import copy
import gc
import logging

import numpy as np
import SimpleITK as sitk
import torch
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T

class DiffusionInstDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and maps it into a format used by DiffusionInst.

    The callable currently does the following:
    1. Reads the image from "file_name"
    2. Applies geometric transforms to the image and annotations
    3. Finds and applies suitable cropping to the image and annotations
    4. Prepares the image and annotations as Tensors
    """
    def __init__(self, cfg, is_train=True):
        if cfg.INPUT.CROP.ENABLED and is_train:
            self.crop_gen = [
                # T.ResizeShortestEdge([400, 500, 600], sample_style="choice"),
                T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE),
            ]
        else:
            self.crop_gen = None
        # build_transform_gen comes from the DiffusionInst codebase
        self.tfm_gens = build_transform_gen(cfg, is_train)
        logging.getLogger(__name__).info(
            "Full TransformGens used in training: {}, crop: {}".format(str(self.tfm_gens), str(self.crop_gen))
        )
        self.img_format = cfg.INPUT.FORMAT
        self.is_train = is_train

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        # image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        ## crop roi
        '''lst = dataset_dict['file_name'].split('-')
        image = sitk.ReadImage('-'.join(lst[:-2]))
        image = sitk.GetArrayFromImage(image)
        above, below = int(lst[-2]), int(lst[-1])
        image = image[:, above:below, :]'''
        ## no crop roi
        image = sitk.ReadImage(dataset_dict["file_name"], sitk.sitkFloat32)
        image = sitk.GetArrayFromImage(image)
        # rescale to [0, 255] and make a 3-channel uint8 image
        image = (image - image.min()) / (image.max() - image.min()) * 255
        image = image.transpose(1, 2, 0).astype(np.uint8)
        image = np.repeat(image, 3, axis=2)
        utils.check_image_size(dataset_dict, image)
        if self.crop_gen is None:
            image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        else:
            image, transforms = T.apply_transform_gens(
                self.tfm_gens + self.crop_gen, image
            )
        image_shape = image.shape[:2]  # h, w
        # Pytorch's dataloader is efficient on torch.Tensor due to shared memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        del image
        gc.collect()
        if not self.is_train:
            # USER: Modify this if you want to keep them for some reason.
            dataset_dict.pop("annotations", None)
            return dataset_dict
        if "annotations" in dataset_dict:
            # USER: Modify this if you want to keep them for some reason.
            for anno in dataset_dict["annotations"]:
                # anno.pop("segmentation", None)
                anno.pop("keypoints", None)
            # USER: Implement additional transformations if you have other types of data
            annos = [
                utils.transform_instance_annotations(obj, transforms, image_shape)
                for obj in dataset_dict.pop("annotations")
                if obj.get("iscrowd", 0) == 0
            ]
            instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask")
            dataset_dict["instances"] = utils.filter_empty_instances(instances)
            del instances
            gc.collect()
        return dataset_dict
And here is the information from the oom-killer log:
[2599547.303018] python invoked oom-killer: gfp_mask=0x24000c0, order=0, oom_score_adj=995
[2599547.303084] [<ffffffff8119bfae>] oom_kill_process+0x1fe/0x3c0
[2599547.303133] Task in /kubepods/burstable/podd09a5032-8b07-11ed-bb60-ac1f6b9ec91e/8b4a8d5c2c1a082f93b1610173beb70bbc19fb1a1c2e28150d2d912ed9b95b10 killed as a result of limit of /kubepods/burstable/podd09a5032-8b07-11ed-bb60-ac1f6b9ec91e
[2599547.305957] Memory cgroup out of memory: Kill process 1041771 (python) score 1198 or sacrifice child
[2599547.307810] Killed process 1041771 (python) total-vm:36436532kB, anon-rss:10288264kB, file-rss:104888kB
[2599718.702250] python invoked oom-killer: gfp_mask=0x24000c0, order=0, oom_score_adj=995
[2599718.702299] [<ffffffff8119bfae>] oom_kill_process+0x1fe/0x3c0
[2599718.702333] Task in /kubepods/burstable/podd09a5032-8b07-11ed-bb60-ac1f6b9ec91e/8b4a8d5c2c1a082f93b1610173beb70bbc19fb1a1c2e28150d2d912ed9b95b10 killed as a result of limit of /kubepods/burstable/podd09a5032-8b07-11ed-bb60-ac1f6b9ec91e
I set IMS_PER_BATCH to 1 and used a dataset containing only one image, but the OOM problem still occurred.
What should I do to prevent the OOM problem?
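One way to narrow the problem down is to log the process's resident memory as training runs and check whether it grows without bound in the mapper or in the training loop. A debugging sketch, assuming psutil is installed:

import os
import psutil

process = psutil.Process(os.getpid())

def log_rss(tag):
    # Print the resident set size in MiB; call this every N iterations,
    # e.g. inside the mapper's __call__ or the training loop.
    print(f"{tag}: {process.memory_info().rss / 1024 ** 2:.0f} MiB")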

Yolov5 Convert to ONNX

I am trying to convert YOLOv5 to ONNX so that it accepts dynamic image shapes as input.
import torch

# This is an example of usage of the onnx converter.
yolo5_layout = '/home/eirini/Downloads/best.pt'
model = torch.hub.load("ultralytics/yolov5", 'custom', path=yolo5_layout, source='local')
model.eval()

# Example input
dummy_input = torch.rand((1, 3, 224, 224))

# A dictionary declaring that the batch size, width and height dimensions are dynamic
dynamic_axes_dict = {
    "actual_input": {0: "bs", 2: "img_x", 3: "img_y"},
    "output": {0: "bs"},
}

# Here we tell PyTorch to set the axes at indices 0, 2 and 3 of "actual_input"
# and the axis at index 0 of "output" to be dynamic - a dynamic shape is
# represented as an arbitrary string rather than a numerical value
# (e.g. `img_x` and `img_y` instead of 224 and 224).
torch.onnx.export(model=model,
                  args=dummy_input,
                  f="mytest.onnx",
                  export_params=True,
                  verbose=False,
                  input_names=["actual_input"],
                  output_names=["output"],
                  opset_version=14,
                  dynamic_axes=dynamic_axes_dict)
The above code produces an ONNX model. Then I try to load this model and run it on a random example.
import numpy as np
import onnxruntime as ort

ort_session = ort.InferenceSession("mytest.onnx")
outputs = ort_session.run(
    None,
    {"actual_input": np.random.randn(10, 3, 960, 1200).astype(np.float32)},
)
print(outputs[0])
But I get the following error:
onnxruntime.capi.onnxruntime_pybind11_state.Fail: [ONNXRuntimeError] : 1 : FAIL : Non-zero status code returned while running Concat node. Name:'/model/model/model.12/Concat' Status Message: concat.cc:159 PrepareForCompute Non concat axis dimensions must match: Axis 3 has mismatched dimensions of 75 and 76
It seems like the model still only accepts 224x224 inputs, but the whole point of the dynamic axes was to handle varying shapes.
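A possible explanation (not confirmed in the original post): the 75 vs 76 mismatch is consistent with an input dimension that is not divisible by YOLOv5's maximum stride of 32, rather than with the dynamic axes being ignored; with a width of 1200, the stride-16 feature map is 75 wide while the upsampled stride-32 map comes out 76 wide, and the FPN Concat fails. Two things worth trying are the repo's own exporter, python export.py --weights best.pt --include onnx --dynamic, which sets up dynamic axes itself, and padding inputs up to a multiple of 32, as in this sketch (pad_to_stride is a hypothetical helper):

import numpy as np

def pad_to_stride(img, stride=32):
    # Zero-pad H and W up to the next multiple of `stride` so the FPN
    # feature maps line up (YOLOv5's maximum stride is 32).
    n, c, h, w = img.shape
    H = int(np.ceil(h / stride) * stride)
    W = int(np.ceil(w / stride) * stride)
    out = np.zeros((n, c, H, W), dtype=img.dtype)
    out[:, :, :h, :w] = img
    return out

x = np.random.randn(10, 3, 960, 1200).astype(np.float32)
outputs = ort_session.run(None, {"actual_input": pad_to_stride(x)})  # pads 1200 -> 1216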

GridSearchCV for Multiple Models

I would like to run different models using GridSearchCV.
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

models = {
    "RandomForestRegressor": RandomForestRegressor(),
    "AdaBoostRegressor": AdaBoostRegressor(),
}
params = {
    "RandomForestRegressor": {"n_estimators": [10, 50, 75],
                              "max_depth": [10, 20, 50],
                              "max_features": ["auto", "sqrt", "log2"]},
    "AdaBoostRegressor": {"n_estimators": [50, 100],
                          "learning_rate": [0.01, 0.1, 0.5],
                          "loss": ["linear", "square"]},
}
I hope this is helpful, but perhaps just add a parameter to your create_model function. For example, here is a very basic create_model function that takes the activation function as the argument that GridSearchCV is helping you tune.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

def create_model(activation_fn):
    # create model (feats = number of input features, defined elsewhere)
    model = Sequential()
    model.add(Dense(30, input_dim=feats, activation=activation_fn,
                    kernel_initializer='normal'))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation=activation_fn))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))
    # compile model
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['mean_squared_error', 'mae'])
    return model
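To make GridSearchCV actually tune activation_fn, the function needs to be wrapped in a scikit-learn-compatible estimator. A sketch using the legacy Keras wrapper (newer setups would use scikeras.wrappers.KerasRegressor instead; X_train and y_train are assumed to exist):

from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV

# GridSearchCV forwards `activation_fn` to create_model via the wrapper
estimator = KerasRegressor(build_fn=create_model, epochs=50, batch_size=32, verbose=0)
grid = GridSearchCV(estimator, param_grid={"activation_fn": ["relu", "tanh"]}, cv=3)
grid.fit(X_train, y_train)
print(grid.best_params_)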
Now what you can do is modify this to take a second argument called model_type (or whatever you want to call it).
def create_model(model_type='rfr'):
    if model_type == 'rfr':
        ......
    elif model_type == 'xgb':
        .......
    elif model_type == 'neural_network':
        .......
Then, in the params dictionary that is fed into the GridSearchCV that you call, just give the model_type key a list of the models that you want to tune (optimize over). Make sure that the block of code under each if statement creates the model you want; a per-estimator alternative is sketched below.
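Alternatively, sticking with the question's models and params dictionaries, you can simply run one grid search per estimator. A minimal sketch (X_train and y_train are assumed to exist):

from sklearn.model_selection import GridSearchCV

best = {}
for name, estimator in models.items():
    search = GridSearchCV(estimator, params[name], cv=5,
                          scoring="neg_mean_squared_error")
    search.fit(X_train, y_train)  # X_train / y_train assumed to be defined
    best[name] = (search.best_score_, search.best_params_)
print(best)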

R: Parsing a boolean command line argument using argparse

I am using "argparse" library in R for command line arguments.
library(argparse)

# Create parser
parser = ArgumentParser(description='command line args')

# Add command line arguments
parser$add_argument("is_local", nargs='?', type="logical",
                    help="whether to use local or server path", default=FALSE)
parser$add_argument("alert", nargs='?', type="double",
                    help="alert threshold", default=0.99)
I am trying to call it on the command line like this:
Rscript my_func.R TRUE 0.99
However, the boolean argument does not change. Any idea how to parse a boolean argument in R?
Thanks!
I don't know R, but the description of this package says it's a wrapper for the Python argparse.
I would recommend changing these:
parser$add_argument("is_local", nargs='?', type="logical",
help="whether to use local or server path", default=FALSE)
parser$add_argument("alert", nargs='?', type="double",
help="alert threshold", default=0.99)
to
parser$add_argument("--local", action='store_true'),
help="whether to use local or server path")
parser$add_argument("--alert", type="double",
help="alert threshold", default=0.99)
which would be called with
Rscript my_func.R --local --alert 0.99
store_true is illustrated on the basic docs page, https://github.com/trevorld/r-argparse
If I read the R correctly, your is_local should be giving you a warning:
"You almost certainly want to use action='store_true' or action='store_false' instead"
A store_true argument sets the attribute to TRUE if the flag is present, and to the default FALSE if absent. It should be an optional (--) argument and should not set nargs.
(It is possible to have an argument that takes the strings 'true' and 'false' (or any other pair in your native language) and converts them to logical values, but that requires more coding.)
I made --alert a flagged argument as well, without the nargs. Its value will be the default if absent, and the string will be converted to a double if provided. It could be a '?' positional, but while learning I think it's best to stick with optionals unless you want the argument to be required.
The R-argparse docs aren't very complete. You may need to refer to the Python docs, and experiment to get the translation right.
https://docs.python.org/3/library/argparse.html
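Since the R package wraps the Python argparse, the Python equivalent of the suggested flags may help as a reference (a sketch, not from the original answer):

import argparse

parser = argparse.ArgumentParser(description='command line args')
parser.add_argument("--local", action='store_true',
                    help="whether to use local or server path")
parser.add_argument("--alert", type=float, default=0.99,
                    help="alert threshold")
args = parser.parse_args(["--local", "--alert", "0.5"])
print(args.local, args.alert)  # True 0.5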
Thanks for your time and help!
I would like to share my workaround for the problem, as I find the original argparse action usage a bit complicated.
# Convert a string input to a boolean
str2bool = function(input_str) {
  if (input_str == "0") {
    input_str = FALSE
  } else if (input_str == "1") {
    input_str = TRUE
  }
  return(input_str)
}

# Create parser
parser = ArgumentParser(description='command line args')

# Add command line arguments
parser$add_argument("is_local", nargs='?', type="character",
                    help="whether to use local or server path", default="1")
parser$add_argument("alert", nargs='?', type="double",
                    help="alert threshold", default=0.99)

# Parse arguments
args = parser$parse_args()

# Convert string arguments
args$is_local = str2bool(args$is_local)

# Call on the command line:
#   Rscript my_func.R 1 0.99   # equivalent to Rscript my_func.R TRUE 0.99
#   Rscript my_func.R 0 0.99   # equivalent to Rscript my_func.R FALSE 0.99
Below is the sample example given in the official docs of the argparse package:
parser <- ArgumentParser(description='Process some integers')
parser$add_argument('integers', metavar='N', type="integer", nargs='+',
help='an integer for the accumulator')
parser$add_argument('--sum', dest='accumulate', action='store_const',
const='sum', default='max',
help='sum the integers (default: find the max)')
parser$print_help()
# default args for ArgumentParser()$parse_args are commandArgs(TRUE)
# which is what you'd want for an Rscript but not for interactive use
args <- parser$parse_args(c("--sum", "1", "2", "3"))
accumulate_fn <- get(args$accumulate)
print(accumulate_fn(args$integers))
Here is the link for the argparse pdf https://cran.r-project.org/web/packages/argparse/argparse.pdf
I hope it might help.

rpy2 to subset RS4 object (expressionSet)

I'm building an ExpressionSet class using rpy2, following the relevant tutorial as a guide. One of the most common things I do with an eset object is subsetting, which in native R is as straightforward as
eset2 <- eset1[1:10, 1:5]  # first ten features, first five samples
which returns a new ExpressionSet object with subsets of both the expression and phenotype data, using the given indices. rpy2's RS4 object doesn't seem to allow direct subsetting, or to have rx/rx2 attributes, unlike e.g. RS3 vectors. I tried, with ~50% success, adding a _subset function (below) that creates subsets of these two datasets separately and assigns them back to the eset, but is there a more straightforward way that I'm missing?
from rpy2 import robjects, rinterface
from rpy2.robjects import r, pandas2ri, Formula
from rpy2.robjects.packages import importr
from rpy2.robjects.methods import RS4

class ExpressionSet(RS4):
    # funcs to get the attributes
    def _assay_get(self):  # returns an environment, use ['exprs'] key to access
        return self.slots["assayData"]

    def _pdata_get(self):  # returns an RS4 object, use .slots("data") to access
        return self.slots["phenoData"]

    def _feats_get(self):  # returns an RS4 object, use .slots("data") to access
        return self.slots["featureData"]

    def _annot_get(self):  # slots returns a tuple, just pick 1st (only) element
        return self.slots["annotation"][0]

    def _class_get(self):  # slots returns a tuple, just pick 1st (only) element
        return self.slots["class"][0]

    # funcs to set the attributes
    def _assay_set(self, value):
        self.slots["assayData"] = value

    def _pdata_set(self, value):
        self.slots["phenoData"] = value

    def _feats_set(self, value):
        self.slots["featureData"] = value

    def _annot_set(self, value):
        self.slots["annotation"] = value

    def _class_set(self, value):
        self.slots["class"] = value

    # funcs to work with the above to get/set the data
    def _exprs_get(self):
        return self.assay["exprs"]

    def _pheno_get(self):
        pdata = self.pData
        return pdata.slots["data"]

    def _exprs_set(self, value):
        assay = self.assay
        assay["exprs"] = value

    def _pheno_set(self, value):
        pdata = self.pData
        pdata.slots["data"] = value

    assay = property(_assay_get, _assay_set, None, "R attribute 'assayData'")
    pData = property(_pdata_get, _pdata_set, None, "R attribute 'phenoData'")
    fData = property(_feats_get, _feats_set, None, "R attribute 'featureData'")
    annot = property(_annot_get, _annot_set, None, "R attribute 'annotation'")
    exprs = property(_exprs_get, _exprs_set, None, "R attribute 'exprs'")
    pheno = property(_pheno_get, _pheno_set, None, "R attribute 'pheno'")

    def _subset(self, features=None, samples=None):
        features = features if features else self.exprs.rownames
        samples = samples if samples else self.exprs.colnames
        fx = robjects.BoolVector([f in features for f in self.exprs.rownames])
        sx = robjects.BoolVector([s in samples for s in self.exprs.colnames])
        self.pheno = self.pheno.rx(sx, self.pheno.colnames)
        self.exprs = self.exprs.rx(fx, sx)  # can't assign back to exprs this way
When doing
eset2 <- eset1[1:10, 1:5]
in R, the S4 method "[" with the signature ("ExpressionSet") is fetched and run using the parameter values you provided.
The documentation suggests using getmethod (see http://rpy2.readthedocs.org/en/version_2.7.x/generated_rst/s4class.html#methods) to facilitate fetching the relevant S4 method, but its behaviour seems to have changed after the documentation was written (resolution of the dispatch through inheritance is no longer done).
The following should do it though:
from rpy2.robjects.packages import importr
methods = importr('methods')
r_subset_expressionset = methods.selectMethod("[", "ExpressionSet")
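A minimal usage sketch (Biobase's bundled sample.ExpressionSet dataset is assumed to be available; the variable names are illustrative):

from rpy2.robjects import r, IntVector

r('suppressMessages(library(Biobase)); data(sample.ExpressionSet)')
eset1 = r['sample.ExpressionSet']
# equivalent of eset1[1:10, 1:5] in R: first ten features, first five samples
eset2 = r_subset_expressionset(eset1, IntVector(range(1, 11)), IntVector(range(1, 6)))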
With thanks to @lgautier's answer, here's a snippet of my code from above, modified to allow subsetting of the RS4 object:
from multipledispatch import dispatch

@dispatch(RS4)
def eset_subset(eset, features=None, samples=None):
    """
    Subset an RS4 eset object.
    """
    features = features if features else eset.exprs.rownames
    samples = samples if samples else eset.exprs.colnames
    fx = robjects.BoolVector([f in features for f in eset.exprs.rownames])
    sx = robjects.BoolVector([s in samples for s in eset.exprs.colnames])
    esub = methods.selectMethod("[", signature="ExpressionSet")(eset, fx, sx)
    return esub
