rpy2 to subset RS4 object (expressionSet) - r

I'm building an ExpressionSet class using rpy2, following the relevant tutorial as a guide. One of the most common things I do with the Eset object is subsetting, which in native R is as straightforward as
eset2<-eset1[1:10,1:5] # first ten features, first five samples
which returns a new ExpressionSet object with subsets of both the expression and phenotype data, using the given indices. Rpy2's RS4 object doesn't seem to allow direct subsetting, or have rx/rx2 attributes unlike e.g. RS3 vectors. I tried, with ~50% success, adding a '_subset' function (below) that creates subsets of these two datasets separately and assigns them back to Eset, but is there a more straightforward way that I'm missing?
from rpy2 import (robjects, rinterface)
from rpy2.robjects import (r, pandas2ri, Formula)
from rpy2.robjects.packages import (importr,)
from rpy2.robjects.methods import (RS4,)
class ExpressionSet(RS4):
# funcs to get the attributes
def _assay_get(self): # returns an environment, use ['exprs'] key to access
return self.slots["assayData"]
def _pdata_get(self): # returns an RS4 object, use .slots("data") to access
return self.slots["phenoData"]
def _feats_get(self): # returns an RS4 object, use .slots("data") to access
return self.slots["featureData"]
def _annot_get(self): # slots returns a tuple, just pick 1st (only) element
return self.slots["annotation"][0]
def _class_get(self): # slots returns a tuple, just pick 1st (only) element
return self.slots["class"][0]
# funcs to set the attributes
def _assay_set(self, value):
self.slots["assayData"] = value
def _pdata_set(self, value):
self.slots["phenoData"] = value
def _feats_set(self,value):
self.slots["featureData"] = value
def _annot_set(self, value):
self.slots["annotation"] = value
def _class_set(self, value):
self.slots["class"] = value
# funcs to work with the above to get/set the data
def _exprs_get(self):
return self.assay["exprs"]
def _pheno_get(self):
pdata = self.pData
return pdata.slots["data"]
def _exprs_set(self, value):
assay = self.assay
assay["exprs"] = value
def _pheno_set(self, value):
pdata = self.pData
pdata.slots["data"] = value
assay = property(_assay_get, _assay_set, None, "R attribute 'assayData'")
pData = property(_pdata_get, _pdata_set, None, "R attribute 'phenoData'")
fData = property(_feats_get, _feats_set, None, "R attribute 'featureData'")
annot = property(_annot_get, _annot_set, None, "R attribute 'annotation'")
exprs = property(_exprs_get, _exprs_set, None, "R attribute 'exprs'")
pheno = property(_pheno_get, _pheno_set, None, "R attribute 'pheno")
def _subset(self, features=None, samples=None):
features = features if features else self.exprs.rownames
samples = samples if samples else self.exprs.colnames
fx = robjects.BoolVector([f in features for f in self.exprs.rownames])
sx = robjects.BoolVector([s in samples for s in self.exprs.colnames])
self.pheno = self.pheno.rx(sx, self.pheno.colnames)
self.exprs = self.exprs.rx(fx,sx) # can't assign back to exprs this way

When doing
eset2<-eset1[1:10,1:5]
in R, the R S4 method "[" with the signature ("ExpressionSet") is fetched and run using the parameter values you provided.
The documentation is suggesting the use of getmethod (see http://rpy2.readthedocs.org/en/version_2.7.x/generated_rst/s4class.html#methods ) to facilitate the task of fetching the relevant S4 method, but its behaviour seems to have changed after the documentation was written (resolution of the dispatch through inheritance is no longer done).
The following should do it though:
from rpy2.robjects.packages import importr
methods = importr('methods')
r_subset_expressionset = methods.selectMethod("[", "ExpressionSet")

with thanks to #lgautier's answer, here's a snippet of my above code, modified to allow subsetting of the RS4 object:
from multipledispatch import dispatch
#dispatch(RS4)
def eset_subset(eset, features=None, samples=None):
"""
subset an RS4 eset object
"""
features = features if features else eset.exprs.rownames
samples = samples if samples else eset.exprs.colnames
fx = robjects.BoolVector([f in features for f in eset.exprs.rownames])
sx = robjects.BoolVector([s in samples for s in eset.exprs.colnames])
esub=methods.selectMethod("[", signature="ExpressionSet")(eset, fx,sx)
return esub

Related

Type error when fine-tuning a bert-large-uncased-whole-word-masking model by Huggingface

I am trying to fine-tune a Huggingface bert-large-uncased-whole-word-masking model and i get a type error like this when training:
"TypeError: only integer tensors of a single element can be converted to an index"
Here is the code:
train_inputs = tokenizer(text_list[0:457], return_tensors='pt', max_length=512, truncation=True, padding='max_length')
train_inputs['labels']= train_inputs.input_ids.detach().clone()
Then i mask randomly about 15% of the words in the input-ids,
and define a class for the dataset, and then the mistake happens in the training loop:
class MeditationsDataset(torch.utils.data.Dataset):
def __init__(self, encodings):
self.encodings= encodings
def __getitem__(self, idx):
return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
def __len__(self):
return self.encodings.input_ids
train_dataset = MeditationsDataset(train_inputs)
train_dataloader = torch.utils.data.DataLoader(dataset= train_dataset, batch_size=8, shuffle=False)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
from transformers import BertModel, AdamW
model = BertModel.from_pretrained("bert-large-uncased-whole-word-masking")
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=1e-5)
num_epochs = 2
from tqdm.auto import tqdm
for epoch in range(num_epochs):
loop = tqdm(train_dataloader, leave=True)
for batch in loop:
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)
loss = outputs.loss
loss.backward()
optimizer.step()
optimizer.zero_grad()
The mistake happens in "for batch in loop"
Does anybody understand it and know how to solve this? Thanks in advance for your help
In the class MeditationsDataset in function __getitem__ torch.tensor(val[idx]) is deprecated by PyTorch you should use instead val[idx].clone().detach()

Error when implementing RBF kernel bandwidth differentiation in Pytorch

I'm implementing an RBF network by using some beginer examples from Pytorch Website. I have a problem when implementing the kernel bandwidth differentiation for the network. Also, Iwould like to know whether my attempt ti implement the idea is fine. This is a code sample to reproduce the issue. Thanks
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable
def kernel_product(x,y, mode = "gaussian", s = 1.):
x_i = x.unsqueeze(1)
y_j = y.unsqueeze(0)
xmy = ((x_i-y_j)**2).sum(2)
if mode == "gaussian" : K = torch.exp( - xmy/s**2) )
elif mode == "laplace" : K = torch.exp( - torch.sqrt(xmy + (s**2)))
elif mode == "energy" : K = torch.pow( xmy + (s**2), -.25 )
return torch.t(K)
class MyReLU(torch.autograd.Function):
"""
We can implement our own custom autograd Functions by subclassing
torch.autograd.Function and implementing the forward and backward passes
which operate on Tensors.
"""
#staticmethod
def forward(ctx, input):
"""
In the forward pass we receive a Tensor containing the input and return
a Tensor containing the output. ctx is a context object that can be used
to stash information for backward computation. You can cache arbitrary
objects for use in the backward pass using the ctx.save_for_backward method.
"""
ctx.save_for_backward(input)
return input.clamp(min=0)
#staticmethod
def backward(ctx, grad_output):
"""
In the backward pass we receive a Tensor containing the gradient of the loss
with respect to the output, and we need to compute the gradient of the loss
with respect to the input.
"""
input, = ctx.saved_tensors
grad_input = grad_output.clone()
grad_input[input < 0] = 0
return grad_input
dtype = torch.cuda.FloatTensor
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)
# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(H, D_in).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)
# I've created this scalar variable (the kernel bandwidth)
s = Variable(torch.randn(1).type(dtype), requires_grad=True)
learning_rate = 1e-6
for t in range(500):
# To apply our Function, we use Function.apply method. We alias this as 'relu'.
relu = MyReLU.apply
# Forward pass: compute predicted y using operations on Variables; we compute
# ReLU using our custom autograd operation.
# y_pred = relu(x.mm(w1)).mm(w2)
y_pred = relu(kernel_product(w1, x, s)).mm(w2)
# Compute and print loss
loss = (y_pred - y).pow(2).sum()
print(t, loss.data[0])
# Use autograd to compute the backward pass.
loss.backward()
# Update weights using gradient descent
w1.data -= learning_rate * w1.grad.data
w2.data -= learning_rate * w2.grad.data
# Manually zero the gradients after updating weights
w1.grad.data.zero_()
w2.grad.data.zero_()
However I get this error, which dissapears when I simply use a fixed scalar in the default input parameter of kernel_product():
RuntimeError: eq() received an invalid combination of arguments - got (str), but expected one of:
* (float other)
didn't match because some of the arguments have invalid types: (str)
* (Variable other)
didn't match because some of the arguments have invalid types: (str)
Well, you are calling kernel_product(w1, x, s) where w1, x and s are torch Variable while the definition of the function is: kernel_product(x,y, mode = "gaussian", s = 1.). Seems like s should be a string specifying the mode.

Dictionary key from pdb file

I'm trying to go through a .pdb file, calculate distance between alpha carbon atoms from different residues on chains A and B of a protein complex, then store the distance in a dictionary, together with the chain identifier and residue number.
For example, if the first alpha carbon ("CA") is found on residue 100 on chain A and the one it binds to is on residue 123 on chain B I would want my dictionary to look something like d={(A, 100):[B, 123, distance_between_atoms]}
from Bio.PDB.PDBParser import PDBParser
parser=PDBParser()
struct = parser.get_structure("1trk", "1trk.pdb")
def getAlphaCarbons(chain):
vec = []
for residue in chain:
for atom in residue:
if atom.get_name() == "CA":
vec = vec + [atom.get_vector()]
return vec
def dist(a,b):
return (a-b).norm()
chainA = struct[0]['A']
chainB = struct[0]['B']
vecA = getAlphaCarbons(chainA)
vecB = getAlphaCarbons(chainB)
t={}
model=struct[0]
for model in struct:
for chain in model:
for residue in chain:
for a in vecA:
for b in vecB:
if dist(a,b)<=8:
t={(chain,residue):[(a, b, dist(a, b))]}
break
print t
It's been running the programme for ages and I had to abort the run (have I made an infinite loop somewhere??)
I was also trying to do this:
t = {i:[((a, b, dist(a,b)) for a in vecA) for b in vecB if dist(a, b) <= 8] for i in chainA}
print t
But it's printing info about residues in the following format:
<Residue PHE het= resseq=591 icode= >: []
It's not printing anything related to the distance.
Thanks a lot, I hope everything is clear.
Would strongly suggest using C libraries while calculating distances. I use mdtraj for this sort of thing and it works much quicker than all the for loops in BioPython.
To get all pairs of alpha-Carbons:
import mdtraj as md
def get_CA_pairs(self,pdbfile):
traj = md.load_pdb(pdbfile)
topology = traj.topology
CA_index = ([atom.index for atom in topology.atoms if (atom.name == 'CA')])
pairs=list(itertools.combinations(CA_index,2))
return pairs
Then, for quick computation of distances:
def get_distances(self,pdbfile,pairs):
#returns list of resid1, resid2,distances between CA-CA
traj = md.load_pdb(pdbfile)
pairs=self.get_CA_pairs(pdbfile)
dist=md.compute_distances(traj,pairs)
#make dictionary you desire.
dict=dict(zip(CA, pairs))
return dict
This includes all alpha-Carbons. There should be a chain identifier too in mdtraj to select CA's from each chain.

where do we use the methods __str__ and __repr__ in python3? [duplicate]

This question already has answers here:
What is the difference between __str__ and __repr__?
(28 answers)
Closed 2 years ago.
I really don't understand where are __str__ and __repr__ used in Python. I mean, I get that __str__ returns the string representation of an object. But why would I need that? In what use case scenario? Also, I read about the usage of __repr__
But what I don't understand is, where would I use them?
__repr__
Called by the repr() built-in function and by string conversions (reverse quotes) to compute the "official" string representation of an object. If at all possible, this should look like a valid Python expression that could be used to recreate an object with the same value (given an appropriate environment).
__str__
Called by the str() built-in function and by the print statement to compute the "informal" string representation of an object.
Use __str__ if you have a class, and you'll want an informative/informal output, whenever you use this object as part of string. E.g. you can define __str__ methods for Django models, which then gets rendered in the Django administration interface. Instead of something like <Model object> you'll get like first and last name of a person, the name and date of an event, etc.
__repr__ and __str__ are similar, in fact sometimes equal (Example from BaseSet class in sets.py from the standard library):
def __repr__(self):
"""Return string representation of a set.
This looks like 'Set([<list of elements>])'.
"""
return self._repr()
# __str__ is the same as __repr__
__str__ = __repr__
The one place where you use them both a lot is in an interactive session. If you print an object then its __str__ method will get called, whereas if you just use an object by itself then its __repr__ is shown:
>>> from decimal import Decimal
>>> a = Decimal(1.25)
>>> print(a)
1.25 <---- this is from __str__
>>> a
Decimal('1.25') <---- this is from __repr__
The __str__ is intended to be as human-readable as possible, whereas the __repr__ should aim to be something that could be used to recreate the object, although it often won't be exactly how it was created, as in this case.
It's also not unusual for both __str__ and __repr__ to return the same value (certainly for built-in types).
Building up and on the previous answers and showing some more examples. If used properly, the difference between str and repr is clear. In short repr should return a string that can be copy-pasted to rebuilt the exact state of the object, whereas str is useful for logging and observing debugging results. Here are some examples to see the different outputs for some known libraries.
Datetime
print repr(datetime.now()) #datetime.datetime(2017, 12, 12, 18, 49, 27, 134411)
print str(datetime.now()) #2017-12-12 18:49:27.134452
The str is good to print into a log file, where as repr can be re-purposed if you want to run it directly or dump it as commands into a file.
x = datetime.datetime(2017, 12, 12, 18, 49, 27, 134411)
Numpy
print repr(np.array([1,2,3,4,5])) #array([1, 2, 3, 4, 5])
print str(np.array([1,2,3,4,5])) #[1 2 3 4 5]
in Numpy the repr is again directly consumable.
Custom Vector3 example
class Vector3(object):
def __init__(self, args):
self.x = args[0]
self.y = args[1]
self.z = args[2]
def __str__(self):
return "x: {0}, y: {1}, z: {2}".format(self.x, self.y, self.z)
def __repr__(self):
return "Vector3([{0},{1},{2}])".format(self.x, self.y, self.z)
In this example, repr returns again a string that can be directly consumed/executed, whereas str is more useful as a debug output.
v = Vector3([1,2,3])
print str(v) #x: 1, y: 2, z: 3
print repr(v) #Vector3([1,2,3])
One thing to keep in mind, if str isn't defined but repr, str will automatically call repr. So, it's always good to at least define repr
Grasshopper, when in doubt go to the mountain and read the Ancient Texts. In them you will find that __repr__() should:
If at all possible, this should look like a valid Python expression that could be used to recreate an object with the same value.
Lets have a class without __str__ function.
class Employee:
def __init__(self, first, last, pay):
self.first = first
self.last = last
self.pay = pay
emp1 = Employee('Ivan', 'Smith', 90000)
print(emp1)
When we print this instance of the class, emp1, this is what we get:
<__main__.Employee object at 0x7ff6fc0a0e48>
This is not very helpful, and certainly this is not what we want printed if we are using it to display (like in html)
So now, the same class, but with __str__ function:
class Employee:
def __init__(self, first, last, pay):
self.first = first
self.last = last
self.pay = pay
def __str__(self):
return(f"The employee {self.first} {self.last} earns {self.pay}.")
# you can edit this and use any attributes of the class
emp2 = Employee('John', 'Williams', 90000)
print(emp2)
Now instead of printing that there is an object, we get what we specified with return of __str__ function:
The employee John Williams earns 90000
str will be informal and readable format whereas repr will give official object representation.
class Complex:
# Constructor
def __init__(self, real, imag):
self.real = real
self.imag = imag
# "official" string representation of an object
def __repr__(self):
return 'Rational(%s, %s)' % (self.real, self.imag)
# "informal" string representation of an object (readable)
def __str__(self):
return '%s + i%s' % (self.real, self.imag)
t = Complex(10, 20)
print (t) # this is usual way we print the object
print (str(t)) # this is str representation of object
print (repr(t)) # this is repr representation of object
Answers :
Rational(10, 20) # usual representation
10 + i20 # str representation
Rational(10, 20) # repr representation
str and repr are both ways to represent. You can use them when you are writing a class.
class Fraction:
def __init__(self, n, d):
self.n = n
self.d = d
def __repr__(self):
return "{}/{}".format(self.n, self.d)
for example when I print a instance of it, it returns things.
print(Fraction(1, 2))
results in
1/2
while
class Fraction:
def __init__(self, n, d):
self.n = n
self.d = d
def __str__(self):
return "{}/{}".format(self.n, self.d)
print(Fraction(1, 2))
also results in
1/2
But what if you write both of them, which one does python use?
class Fraction:
def __init__(self, n, d):
self.n = n
self.d = d
def __str__(self):
return "str"
def __repr__(self):
return "repr"
print(Fraction(None, None))
This results in
str
So python actually uses the str method not the repr method when both are written.
Suppose you have a class and wish to inspect an instance, you see the print doesn't give much useful information
class Person:
def __init__(self, name, age):
self.name = name
self.age = age
p1 = Person("John", 36)
print(p1) # <__main__.Animal object at 0x7f9060250410>
Now see a class with a str, it shows the instance information and with repr you even don't need the print. Nice no?
class Animal:
def __init__(self, color, age, breed):
self.color = color
self.age = age
self.breed = breed
def __str__(self):
return f"{self.color} {self.breed} of age {self.age}"
def __repr__(self):
return f"repr : {self.color} {self.breed} of age {self.age}"
a1 = Animal("Red", 36, "Dog")
a1 # repr : Red Dog of age 36
print(a1) # Red Dog of age 36

How can I use functools.partial on multiple methods on an object, and freeze parameters out of order?

I find functools.partial to be extremely useful, but I would like to be able to freeze arguments out of order (the argument you want to freeze is not always the first one) and I'd like to be able to apply it to several methods on a class at once, to make a proxy object that has the same methods as the underlying object except with some of its methods parameters being frozen (think of it as generalizing partial to apply to classes). And I'd prefer to do this without editing the original object, just like partial doesn't change its original function.
I've managed to scrap together a version of functools.partial called 'bind' that lets me specify parameters out of order by passing them by keyword argument. That part works:
>>> def foo(x, y):
... print x, y
...
>>> bar = bind(foo, y=3)
>>> bar(2)
2 3
But my proxy class does not work, and I'm not sure why:
>>> class Foo(object):
... def bar(self, x, y):
... print x, y
...
>>> a = Foo()
>>> b = PureProxy(a, bar=bind(Foo.bar, y=3))
>>> b.bar(2)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: bar() takes exactly 3 arguments (2 given)
I'm probably doing this all sorts of wrong because I'm just going by what I've pieced together from random documentation, blogs, and running dir() on all the pieces. Suggestions both on how to make this work and better ways to implement it would be appreciated ;) One detail I'm unsure about is how this should all interact with descriptors. Code follows.
from types import MethodType
class PureProxy(object):
def __init__(self, underlying, **substitutions):
self.underlying = underlying
for name in substitutions:
subst_attr = substitutions[name]
if hasattr(subst_attr, "underlying"):
setattr(self, name, MethodType(subst_attr, self, PureProxy))
def __getattribute__(self, name):
return getattr(object.__getattribute__(self, "underlying"), name)
def bind(f, *args, **kwargs):
""" Lets you freeze arguments of a function be certain values. Unlike
functools.partial, you can freeze arguments by name, which has the bonus
of letting you freeze them out of order. args will be treated just like
partial, but kwargs will properly take into account if you are specifying
a regular argument by name. """
argspec = inspect.getargspec(f)
argdict = copy(kwargs)
if hasattr(f, "im_func"):
f = f.im_func
args_idx = 0
for arg in argspec.args:
if args_idx >= len(args):
break
argdict[arg] = args[args_idx]
args_idx += 1
num_plugged = args_idx
def new_func(*inner_args, **inner_kwargs):
args_idx = 0
for arg in argspec.args[num_plugged:]:
if arg in argdict:
continue
if args_idx >= len(inner_args):
# We can't raise an error here because some remaining arguments
# may have been passed in by keyword.
break
argdict[arg] = inner_args[args_idx]
args_idx += 1
f(**dict(argdict, **inner_kwargs))
new_func.underlying = f
return new_func
Update: In case anyone can benefit, here's the final implementation I went with:
from types import MethodType
class PureProxy(object):
""" Intended usage:
>>> class Foo(object):
... def bar(self, x, y):
... print x, y
...
>>> a = Foo()
>>> b = PureProxy(a, bar=FreezeArgs(y=3))
>>> b.bar(1)
1 3
"""
def __init__(self, underlying, **substitutions):
self.underlying = underlying
for name in substitutions:
subst_attr = substitutions[name]
if isinstance(subst_attr, FreezeArgs):
underlying_func = getattr(underlying, name)
new_method_func = bind(underlying_func, *subst_attr.args, **subst_attr.kwargs)
setattr(self, name, MethodType(new_method_func, self, PureProxy))
def __getattr__(self, name):
return getattr(self.underlying, name)
class FreezeArgs(object):
def __init__(self, *args, **kwargs):
self.args = args
self.kwargs = kwargs
def bind(f, *args, **kwargs):
""" Lets you freeze arguments of a function be certain values. Unlike
functools.partial, you can freeze arguments by name, which has the bonus
of letting you freeze them out of order. args will be treated just like
partial, but kwargs will properly take into account if you are specifying
a regular argument by name. """
argspec = inspect.getargspec(f)
argdict = copy(kwargs)
if hasattr(f, "im_func"):
f = f.im_func
args_idx = 0
for arg in argspec.args:
if args_idx >= len(args):
break
argdict[arg] = args[args_idx]
args_idx += 1
num_plugged = args_idx
def new_func(*inner_args, **inner_kwargs):
args_idx = 0
for arg in argspec.args[num_plugged:]:
if arg in argdict:
continue
if args_idx >= len(inner_args):
# We can't raise an error here because some remaining arguments
# may have been passed in by keyword.
break
argdict[arg] = inner_args[args_idx]
args_idx += 1
f(**dict(argdict, **inner_kwargs))
return new_func
You're "binding too deep": change def __getattribute__(self, name): to def __getattr__(self, name): in class PureProxy. __getattribute__ intercepts every attribute access and so bypasses everything that you've set with setattr(self, name, ... making those setattr bereft of any effect, which obviously's not what you want; __getattr__ is called only for access to attributes not otherwise defined so those setattr calls become "operative" & useful.
In the body of that override, you can and should also change object.__getattribute__(self, "underlying") to self.underlying (since you're not overriding __getattribute__ any more). There are other changes I'd suggest (enumerate in lieu of the low-level logic you're using for counters, etc) but they wouldn't change the semantics.
With the change I suggest, your sample code works (you'll have to keep testing with more subtle cases of course). BTW, the way I debugged this was simply to stick in print statements in the appropriate places (a jurassic=era approach but still my favorite;-).

Resources