Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 69 additions & 38 deletions cancermuts/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,16 @@ class Sequence(object):
----------
source : :obj:`cancermuts.datasources.Datasource`
Source from where the protein sequence is downloaded
gene_id : :obj:`int`, optional
gene_id : :obj:`str`, optional
gene name to which the sequence to be downloaded belongs
sequence : :obj:`str`
protein sequence as obtained from the data source
sequence_numbering : :obj:`list of int`
list of integers, starting from one, each representing a position
properties : :obj:`dict`
dictionary including the downloaded protein-associated properties.
These can span one or more residues.
Dictionary mapping property names to one SequenceProperty object per
property type. Each SequenceProperty stores its own entries internally
by residue position
aliases : :obj:`dict`
This dictionary maps different "aliases" (i.e. identifiers) for the
protein of interest. The key should represent the type of identifier
Expand Down Expand Up @@ -96,7 +97,6 @@ def __init__(self, gene_id, uniprot_ac, sequence, source, isoform=None, is_canon
self.sequence_numbering = list(range(1, len(self.sequence) + 1))
self.properties = {}


def seq2index(self, seqn):
"""
Returns the 0-indexed position number in the sequence corresponding
Expand All @@ -121,26 +121,30 @@ def __iter__(self):
def __repr__(self):
return f"<Sequence gene_id={self.gene_id}, uniprot_ac={self.uniprot_ac}, isoform={self.isoform}, is_canonical={self.is_canonical}, source={self.source.name}, {len(self.sequence)} positions>"

def add_property(self, prop):
def add_property(self, property_name, positions, sources=None, **metadata):
"""
Adds sequence property to sequence object. If a property of the same
category is already present, the property will be added to the same
category; otherwise the category will be created anew
Add an entry to a sequence property.
A Sequence stores one SequenceProperty object per property name. This method
creates the property object if needed, then stores the provided entry inside
that object at the requested residue positions.

Parameters
----------
prop : :obj:`cancermuts.properties.SequenceProperty`
new property to be added
"""
if property_name not in sequence_properties_classes:
raise ValueError(f"Unknown property_name: {property_name}")

if prop.category in self.properties:
self.properties[prop.category].append(prop)
add_type = "appending"
self.log.debug("adding property %s to sequence of %s (appeding)" % (str(prop), self.gene_id))
else:
self.properties[prop.category] = [prop]
add_type = "new category"
self.log.debug("adding property %s to sequence of %s (%s)" % (str(prop), self.gene_id, add_type))
if positions is None:
positions = []
if not isinstance(positions, list):
raise TypeError("positions must be a list of 1-based residue numbers")

for position in positions:
if position not in self.sequence_numbering:
raise ValueError(f"Property {property_name} is outside sequence bounds: "
f"position {position} for sequence of length {len(self.sequence)}")
if property_name not in self.properties:
self.properties[property_name] = sequence_properties_classes[property_name]()

return self.properties[property_name].add(positions=positions, sources=sources, **metadata)

def _variant_wt_check(self, var):
if var.start < 1 or var.end > len(self.sequence):
Expand Down Expand Up @@ -185,31 +189,58 @@ def add_variant(self, var):
existing.metadata[k] = var.metadata[k]
self.log.debug(" metadata %s was added anew" % k)

def variants_at_position(self, position, variant_types=None):

if position not in self.sequence_numbering:
raise ValueError(f"Position {position} is outside sequence bounds: 1-{len(self.sequence)}")
if variant_types is None:
variant_types = ProteinVariant.VARIANT_TYPES
elif isinstance(variant_types, str):
variant_types = {variant_types}
else:
variant_types = set(variant_types)

invalid_variant_types = variant_types - ProteinVariant.VARIANT_TYPES
if invalid_variant_types:
raise ValueError(f"Invalid variant type(s): {invalid_variant_types}")
matching_variants = []
for variant in self.variants:
if variant.variant_type not in variant_types:
continue
if variant.variant_type == "insertion":
if position in {variant.start, variant.start + 1}:
matching_variants.append(variant)
elif variant.start <= position <= variant.end:
matching_variants.append(variant)
return matching_variants

def properties_at_position(self, position, property_name=None):
"""
If property_name is provided, return a dict of properties mapped
to the property category.
If property_name is None, return all property categories with their
properties.
Return property entries annotated at a residue position.
If property_name is provided, only entries for that property type are
returned. If property_name is None, entries for all property types present
at the position are returned.
"""
if position not in self.sequence_numbering:
raise ValueError(f"Position {position} is outside sequence bounds: 1-{len(self.sequence)}")

if property_name is not None:
properties = {}
if property_name is None:
property_names = self.properties.keys()
else:
if property_name not in sequence_properties_classes:
raise ValueError(f"Unknown property_name: {property_name}")
if property_name not in self.properties:
properties_to_check = {property_name: []}
else:
properties_to_check = {property_name: self.properties[property_name]}
else:
properties_to_check = self.properties
property_names = [property_name]

properties = {}
for this_property_name, props in properties_to_check.items():
matching_props = [prop for prop in props if position in prop.positions]
if matching_props:
properties[this_property_name] = matching_props
for property_name in property_names:
if property_name not in self.properties:
continue
entries = self.properties[property_name].get_entries_at(position)
if entries:
properties[property_name] = entries
return properties


class ProteinVariant(object):
"""This class describes in-frame protein variants on a reference sequence.

Expand All @@ -228,7 +259,7 @@ class ProteinVariant(object):
metadata : :obj:`dict`
Dictionary encoding variant-associated metadata.
"""

VARIANT_TYPES = {"missense", "deletion", "insertion", "delins"}
@logger_init
def __init__(self, start, end, ref, alt, sources=None, metadata=None):
"""Constructor for the ProteinVariant class.
Expand Down Expand Up @@ -432,7 +463,7 @@ def get_variant_table(self, variant_types=None, metadata=[], metadata_formatter=
variant_types = {variant_types}
else:
variant_types = set(variant_types)
invalid_var_types = variant_types - {"missense", "delins", "insertion", "deletion"}
invalid_var_types = variant_types - ProteinVariant.VARIANT_TYPES
if invalid_var_types:
raise ValueError(f"Invalid variant type(s): {invalid_var_types}")
variants = [v for v in variants if v.variant_type in variant_types]
Expand Down
132 changes: 33 additions & 99 deletions cancermuts/datasources.py
Original file line number Diff line number Diff line change
Expand Up @@ -1929,25 +1929,13 @@ def add_sequence_properties(self, sequence, properties=None):
if sequence.sequence[site_seq_idx] != wt:
self.log.warning("for PTM %s, residue %s is %s in wild-type sequence; it will be skipped" %(m, wt, sequence.sequence[site_seq_idx]))
continue

already_annotated = False
if self._ptm_types_to_classes[ptm] in sequence.properties:
for prop in sequence.properties[self._ptm_types_to_classes[ptm]]:
if prop.positions == [site]:
if self not in prop.sources:
prop.sources.append(self)
self.log.info("site %s already annotated as %s; source will be added" % (m, self._ptm_types_to_classes[ptm]))
already_annotated = True
property_obj = prop

if not already_annotated:
property_obj = sequence_properties_classes[self._ptm_types_to_classes[ptm]]( sources=[self],
positions=[site] )
sequence.add_property(property_obj)
self.log.info("adding %s to site %s" % (m, property_obj.name))
prop_name = self._ptm_types_to_classes[ptm]

if ptm == "O-GalNAc" or ptm == "O-GlcNAc":
property_obj.add_subtype(ptm)
sequence.add_property(prop_name, positions=[site], sources=[self], subtype=ptm)
else:
sequence.add_property(prop_name, positions=[site], sources=[self])
self.log.info("adding %s as %s" % (m, prop_name))

class dbPTM(DynamicSource, object):
@logger_init
Expand Down Expand Up @@ -2021,35 +2009,11 @@ def add_sequence_properties(self, sequence, properties=None):

prop_name = self._ptm_types_to_classes[ptm]

already_annotated = False
if prop_name in sequence.properties:
for prop in sequence.properties[prop_name]:
if prop.positions == [site]:
if self not in prop.sources:
prop.sources.append(self)
already_annotated = True
property_obj = prop

if not already_annotated:
property_obj = sequence_properties_classes[prop_name](
sources=[self],
positions=[site]
)
sequence.add_property(property_obj)

if ptm in self._glyco_subtypes:
sequence.add_property(prop_name, positions=[site], sources=[self], subtype=self._glyco_subtypes[ptm])
else:
sequence.add_property(prop_name, positions=[site], sources=[self])

new_subtype = self._glyco_subtypes[ptm]
existing_subtypes = property_obj.metadata.get("subtypes", [])
base = new_subtype.split("-")[0]

has_specific = any(
st.startswith(base + "-") and st != new_subtype
for st in existing_subtypes
)

if not has_specific and new_subtype not in existing_subtypes:
property_obj.add_subtype(new_subtype)

class NetPhos(StaticSource, object):
@logger_init
Expand Down Expand Up @@ -2117,9 +2081,11 @@ def _parse_db_file(self, sequence):

return sorted(sites, key=lambda x: int(x[1:]))

def add_position_properties(self, sequence, properties=None):
def add_sequence_properties(self, sequence, properties=None):
if properties is None:
properties = ["phosphorylation"]
elif isinstance(properties, str):
properties = [properties]

if "phosphorylation" not in properties:
return
Expand All @@ -2133,25 +2099,17 @@ def add_position_properties(self, sequence, properties=None):

try:
site_seq_idx = sequence.seq2index(site)
position = sequence.positions[site_seq_idx]
except Exception:
self.log.warning(f"NetPhos site {site_label} is outside the protein sequence; it will be skipped")
continue

if position.wt_residue_type != wt:
if sequence.sequence[site_seq_idx] != wt:
self.log.warning(f"for NetPhos site {site_label}, residue {wt} is "
f"{position.wt_residue_type} in wild-type sequence; it will be skipped")
f"{sequence.sequence[site_seq_idx]} in wild-type sequence; it will be skipped")
continue

if prop_name in position.properties:
prop = position.properties[prop_name]
if self not in prop.sources:
prop.sources.append(self)
self.log.info(f"site {site_label} already annotated as phosphorylation; source will be added")
else:
prop = position_properties_classes[prop_name](sources=[self], position=position)
position.add_property(prop)
self.log.info(f"adding {site_label} to site {prop.name}")
sequence.add_property(prop_name, positions=[site], sources=[self])
self.log.info(f"adding/updating {site_label} to site {prop_name}")

class GlyGen(StaticSource, object):
@logger_init
Expand Down Expand Up @@ -2183,20 +2141,7 @@ def add_sequence_properties(self, sequence):
site = int(row['glycosylation_site_uniprotkb'])
subtype = f"{row['glycosylation_type']}-{row['carb_name'].rstrip('.')}"


already_annotated = False
if prop_name in sequence.properties:
for prop in sequence.properties[prop_name]:
if prop.positions == [site]:
if self not in prop.sources:
prop.sources.append(self)
prop.add_subtype(subtype)
already_annotated = True

if not already_annotated:
property_obj = GlycosylationSite(positions=[site], sources=[self])
property_obj.add_subtype(subtype)
sequence.add_property(property_obj)
sequence.add_property(prop_name, positions=[site], sources=[self], subtype=subtype)

class MyVariant(DynamicSource, object):
@logger_init
Expand Down Expand Up @@ -2738,14 +2683,12 @@ def add_sequence_properties(self, sequence, exclude_elm_classes=r'{.*}', use_ali
sequence.seq2index(p)
this_positions.append(p)

property_obj = sequence_properties_classes['linear_motif'] (sources=[self],
positions=this_positions,
name=self._elm_classes[d[0]][0],
id=d[0])

property_obj.metadata['function'] = [self._elm_classes[d[0]][0]]
property_obj.metadata['ref'] = self.description
sequence.add_property(property_obj)
sequence.add_property("linear_motif", positions=this_positions,
sources=[self],
name=self._elm_classes[d[0]][0],
id=d[0],
function=[self._elm_classes[d[0]][0]],
ref=self.description)

class ggetELMPredictions(StaticSource, object):
@logger_init
Expand Down Expand Up @@ -2819,14 +2762,12 @@ def add_sequence_properties(self, sequence, exclude_elm_classes=r'{.*}'):
sequence.seq2index(p)
this_positions.append(p)

property_obj = sequence_properties_classes['linear_motif'] (sources=[self],
positions=this_positions,
name=r['FunctionalSiteName'],
id=r['ELMIdentifier'])

property_obj.metadata['function'] = [r['Description']]
property_obj.metadata['ref'] = self.description
sequence.add_property(property_obj)
sequence.add_property("linear_motif", positions=this_positions,
sources=[self],
name=r["FunctionalSiteName"],
id=r["ELMIdentifier"],
function=[r["Description"]],
ref=self.description)


class gnomAD(DynamicSource, object):
Expand Down Expand Up @@ -3337,11 +3278,7 @@ def _get_mobidb_disorder_predictions(self, sequence, *args, **kwargs):
return False

for i,a in enumerate(assignments):
property_obj = sequence_properties_classes['mobidb_disorder_propensity'](
positions=[i+1],
sources=[self],
disorder_state=a)
sequence.add_property(property_obj)
sequence.add_property("mobidb_disorder_propensity", positions=[i + 1], sources=[self], disorder_state=a)

def _get_mobidb_disorder_predictions_assignments(self, sequence, *args, **kwargs):
if 'use_alias' in kwargs:
Expand Down Expand Up @@ -3610,12 +3547,9 @@ def add_sequence_properties(self, sequence, prop=_supported_sequence_properties)
continue

if row['type'] in self._ptm_keywords:
property_obj = sequence_properties_classes[row['type']](sources=[self], positions=positions)
sequence.add_property(
row['type'], positions=positions, sources=[self], function=row['function'], reference=row['reference'])
else:
property_obj = sequence_properties_classes[row['type']](sources=[self], positions=positions,
name=row['name'])

property_obj.metadata['function'] = row['function']
property_obj.metadata['reference'] = row['reference']
sequence.add_property(row['type'], positions=positions, sources=[self], name=row['name'], function=row['function'],
reference=row['reference'])

sequence.add_property(property_obj)
Loading