diff --git a/docs-src/api.rst b/docs-src/api.rst index 28ae6a50..b38bbd41 100644 --- a/docs-src/api.rst +++ b/docs-src/api.rst @@ -11,6 +11,16 @@ Interval :special-members: :members: +DisjointIntervalSequence +======================== + +.. autoclass:: genome_kit.diseq.IndexDirection + :members: + +.. autoclass:: genome_kit.diseq.DisjointIntervalSequence + :special-members: + :members: + Variant ======= diff --git a/docs-src/diseq.rst b/docs-src/diseq.rst new file mode 100644 index 00000000..03903812 --- /dev/null +++ b/docs-src/diseq.rst @@ -0,0 +1,402 @@ +.. _diseq: + +------------------------------- +Disjoint Interval Sequences +------------------------------- + +Motivation +========== + +When working with transcripts, it is often necessary to operate on the exonic +sequence with introns (or other intervening regions) removed. The remaining exons +form a disjoint set of genomic :py:class:`~genome_kit.Interval` objects. +Operations such as indexing into the spliced sequence or defining sub-ranges +across exon boundaries require a coordinate system that accounts for the gaps +between these intervals. + +The :py:class:`~genome_kit.diseq.DisjointIntervalSequence` (DIS) class provides +this coordinate system. While it builds on :py:class:`~genome_kit.Interval`, a DIS +is conceptually distinct in several ways: + +- **Spliced coordinate space.** Positions in a DIS are offsets into the + concatenated exonic (or other) sequence, not genomic coordinates. +- **5'→3' index direction.** DIS indices always increase from 5' to 3' with + respect to the transcript. Index 0 corresponds to the transcript's 5' end + regardless of genomic strand. This contrasts with + :py:class:`~genome_kit.Interval`, where ``start < end`` always holds in + genomic coordinates, so on the ``-`` strand ``start`` is the 3' end. +- **Same-strand / opposite-strand semantics.** Because a DIS models spliced + RNA rather than raw DNA, the concept of ``+``/``-`` strand is replaced by + ``on_coordinate_strand`` (same strand as the transcript) versus opposite + strand. The underlying genomic strand is accessible via ``coord_strand``, + but intervals within the DIS are described relative to the coordinate + space rather than in absolute genomic terms. + + +Overview +======== + +A :py:class:`~genome_kit.diseq.DisjointIntervalSequence` (DIS) represents +a flattened coordinate system over a sequence of disjoint genomic intervals. +For example, the exons of a transcript form a disjoint set of genomic intervals that, +when concatenated, represent the spliced RNA sequence. + +A DIS has two aspects: + +- A **coordinate space**: the underlying genomic + :py:class:`~genome_kit.Interval` objects (e.g. exons) that define the + flattened index system. These intervals are sorted 5'→3' and must not overlap. +- An **interval**: a sub-range within that coordinate space, defined by + a start and end index, where start <= end. + +The following examples illustrate how the coordinate space and interval interact, +using both diagrams and code. + +Consider a transcript on the + strand with the following genomic layout: +:: + Genomic Coordinates: 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 + DNA Sequence: | A T G C C G C A T G C C G C | + | |<------->| |<------->| |<--->| |<----------->| |<--->| | + 5' Exon1 Intron1 Exon2 Intron2 Exon3 3' + +Extracting only the exons yields the following disjoint intervals: +:: + Genomic Coordinates: 153 154 155 159 160 165 166 167 + DNA Sequence: | A T G C A G C | + | |<------->| |<--->| |<--->| | + 5' Exon1 Exon2 Exon3 3' + +These exon intervals can be represented as :py:class:`~genome_kit.Interval` objects:: + + >>> from genome_kit import Interval + >>> from genome_kit.diseq import DisjointIntervalSequence + >>> exon1 = Interval("chr1", "+", 153, 156, "hg38") + >>> exon2 = Interval("chr1", "+", 159, 161, "hg38") + >>> exon3 = Interval("chr1", "+", 165, 167, "hg38") + +To define an interval spanning the full exonic sequence (from the start of Exon1 to +the end of Exon3), the exon intervals are first converted into a DIS coordinate space +:: + DIS Coordinates: 0 1 2 3 4 5 6 7 + DNA Sequence: | A T G C A G C | + | |<------->| |<--->| |<--->| | + 5' Exon1 Exon2 Exon3 3' + +The default interval spans the entire coordinate space +:: + DIS Coordinates: 0 1 2 3 4 5 6 7 + DNA Sequence: A T G C A G C + |<--------------------->| + end5 Interval end3 + Start Index: 0 + End Index: 7 + +The interval spans the full length of the coordinate space, with a start index of 0 +and an end index of 7:: + + >>> dis = DisjointIntervalSequence.from_intervals( + ... [exon1, exon2, exon3], coord_name="tx_example" + ... ) + >>> dis.start + 0 + >>> dis.end + 7 + >>> dis.on_coordinate_strand + True + +.. note:: + The disjoint interval follows the convention of + :py:class:`~genome_kit.Interval` where intervals are half-open + (the end index is exclusive). + +A DIS can also represent an interval on the strand opposite the coordinate space. +This is useful for modeling the complementary sequence or a binding partner. + +Starting from the coordinate space defined above +:: + DIS Coordinates: 0 1 2 3 4 5 6 7 + DNA Sequence: | A T G C A G C | + | |<----->| |<->| |<->| | + 5' Exon1 Exon2 Exon3 3' + +The opposite strand shares the same DIS coordinate indices +:: + 5' Positive strand 3' + DIS Coordinates: | 0 1 2 3 4 5 6 7 | + DNA Sequence (+): | A T G C A G C | + ----------------------------------------------------- + DNA Sequence (-): | T A C G T C G | + DIS Coordinates: | 0 1 2 3 4 5 6 7 | + 3' Negative Strand 5' + +The DIS coordinate indices are identical on both strands. To obtain the complement +of a given interval, the same start and end indices apply; only the +``on_coordinate_strand`` flag changes. The following shows the full-length interval +on the opposite strand +:: + 5' Coordinate Strand 3' + DIS Coordinates: | 0 1 2 3 4 5 6 7 | + DNA Sequence (+): | A T G C A G C | + ----------------------------------------------------- + DNA Sequence (-): T A C G T C G + DIS Coordinates: 0 1 2 3 4 5 6 7 + Opposite Strand + |<--------------------->| + end3 Interval end5 + Start Index: 0 + End Index: 7 + On Coordinate Strand: False + +The ``on_coordinate_strand`` flag distinguishes same-strand from opposite-strand +intervals, since the start and end indices alone do not encode strand information:: + + >>> dis_opp = DisjointIntervalSequence( + ... [exon1, exon2, exon3], + ... coord_name="tx_example", + ... on_coordinate_strand=False, + ... ) + >>> dis_opp.on_coordinate_strand + False + >>> dis_opp.end5_index + 7 + >>> dis_opp.end3_index + 0 + +.. note:: + The preceding examples used + strand coordinate intervals. When the coordinate intervals + lie on the **negative** strand, the DIS behaves differently from :py:class:`~genome_kit.Interval`: + in one key aspect. Indices still increase in the 5'→3' direction of the transcript.:: + + >>> iv = Interval("chr1", "-", 0, 100, genome.refg) + >>> assert iv.end5.start > iv.end3.start + >>> dis = DisjointIntervalSequence.from_intervals([iv]) + >>> assert dis.end5.start < dis.end3.start + +Consider a transcript on the negative strand: +:: + 3' Exon3 Intron2 Exon2 Intron1 Exon1 5' + | |<------->| |<------->| |<--->| |<----------->| |<--->| | + DNA Sequence (-): | G T C A G T C A G T C A G T | + Genomic Coordinates: 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 + Negative Strand +Taking just the exons: +:: + 3' Exon3 Exon2 Exon1 5' + | |<------->| |<--->| |<--->| | + DNA Sequence (-): | G T C C A G T | + Genomic Coordinates: 153 154 155 159 160 165 166 167 + Negative Strand + +Converting these exons into a DIS coordinate space: +:: + DIS Coordinates: 0 1 2 3 4 5 6 7 + DNA Sequence: | T G A C C T G | + | |<->| |<->| |<----->| | + 5' Exon1 Exon2 Exon3 3' + +The sequence appears reversed relative to genomic coordinates because the DIS +coordinate space is oriented 5'→3' with respect to the transcript, regardless of +genomic strand. Index 0 always corresponds to the transcript's 5' end, and the +largest index to the 3' end:: + + >>> neg_exon1 = Interval("chr1", "-", 165, 167, "hg38") + >>> neg_exon2 = Interval("chr1", "-", 159, 161, "hg38") + >>> neg_exon3 = Interval("chr1", "-", 153, 156, "hg38") + >>> dis_neg = DisjointIntervalSequence.from_intervals( + ... [neg_exon1, neg_exon2, neg_exon3], + ... coord_name="tx_neg_example", + ... ) + >>> dis_neg.coord_strand + '-' + >>> dis_neg.coordinate_length + 7 + +A full-length interval on the coordinate strand +:: + DIS Coordinates: 0 1 2 3 4 5 6 7 + DNA Sequence: T G A C C T G + |<--------------------->| + end5 Interval end3 + Start Index: 0 + End Index: 7 + On Coordinate Strand: True + +Despite creating the DIS from the negative strand, the full-length interval on the +coordinate strand is identical to the + strand example. When working with DIS +objects, strand is expressed only as "same strand" or "opposite strand":: + + >>> dis_neg.start + 0 + >>> dis_neg.end + 7 + >>> dis_neg.on_coordinate_strand + True + +The same coordinate space with an opposite-strand interval +:: + DIS Coordinates: 0 1 2 3 4 5 6 7 + DNA Sequence (-): T G A C C T G + ----------------------------------------------------- + DNA Sequence (+): A C T G G A C + DIS Coordinates: 0 1 2 3 4 5 6 7 + |<--------------------->| + end3 Interval end5 + Start Index: 0 + End Index: 7 + On Coordinate Strand: False + +:: + + >>> dis_neg_opp = DisjointIntervalSequence( + ... [neg_exon1, neg_exon2, neg_exon3], + ... coord_name="tx_neg_example", + ... on_coordinate_strand=False, + ... ) + >>> dis_neg_opp.on_coordinate_strand + False + >>> dis_neg_opp.end5_index + 7 + >>> dis_neg_opp.end3_index + 0 + +Construction +============ + +From a Transcript +~~~~~~~~~~~~~~~~~ + +The most common way to create a DIS is from a +:py:class:`~genome_kit.Transcript`:: + + >>> from genome_kit import Genome + >>> from genome_kit.diseq import DisjointIntervalSequence + >>> genome = Genome("gencode.v29") + >>> transcript = genome.transcripts[100] + >>> dis = DisjointIntervalSequence.from_transcript(transcript) + +By default, the coordinate space is built from the transcript's exons. +You can also specify a region to use CDS or UTR intervals:: + + >>> dis_cds = DisjointIntervalSequence.from_transcript(transcript, region="cds") + >>> dis_utr5 = DisjointIntervalSequence.from_transcript(transcript, region="utr5") + >>> dis_utr3 = DisjointIntervalSequence.from_transcript(transcript, region="utr3") + +The ``coord_name`` and ``interval_name`` default to ``transcript.id`` but can +be overridden:: + + >>> dis = DisjointIntervalSequence.from_transcript( + ... transcript, coord_name="my_coord", interval_name="my_interval") +From Intervals +~~~~~~~~~~~~~~ + +You can also construct a DIS from any sequence of +:py:class:`~genome_kit.Interval` objects:: + + >>> from genome_kit import Interval + >>> exon_intervals = [e.interval for e in transcript.exons] + >>> dis = DisjointIntervalSequence.from_intervals(exon_intervals, coord_name="my_coord") + +The intervals must all share the same chromosome, strand, reference +genome, and must not overlap. They are automatically sorted 5'→3'. + +Coordinate Space +================ + +The coordinate space is defined by the underlying genomic intervals, which +are accessible as a tuple:: + + >>> dis.coordinate_intervals + (Interval("chr1", "+", 100, 200, "hg38"), Interval("chr1", "+", 300, 450, "hg38")) + >>> dis.coordinate_length + 250 + +Metadata about the coordinate space is available through properties:: + + >>> dis.chromosome + 'chr1' + >>> dis.coord_strand + '+' + >>> dis.reference_genome + 'hg38' + >>> dis.coord_name + 'ENST00000...' + +Interval Start and End +====================== + +The interval within the coordinate space is defined by ``start`` and ``end`` +indices, following the same half-open convention as :py:class:`~genome_kit.Interval` +(``start <= end`` always):: + + >>> dis.start + 0 + >>> dis.end + 250 + >>> dis.length + 250 + >>> len(dis) + 250 + +By default, the interval spans the full coordinate space (``start=0``, +``end=coordinate_length``). Indices can extend beyond ``[0, coordinate_length]``, but +the DNA sequence returned by ``genome.dna()`` will be N-padded. + +End5 and End3 +~~~~~~~~~~~~~ + +The ``end5_index`` and ``end3_index`` properties give the 5' and 3' positions +of the interval. These are derived from ``start`` and ``end`` based on the +interval's strand. + +When ``on_coordinate_strand`` is ``True``, ``end5_index`` equals ``start`` and +``end3_index`` equals ``end``:: + + Start Index: 1 + End Index: 6 + DIS Coordinates: 0 1 2 3 4 5 6 7 + DNA Sequence: T G A C C T G + |<------------->| + ----------------------------------------------------- + DNA Sequence: A C T G G A C + DIS Coordinates: 0 1 2 3 4 5 6 7 + Opposite Strand + +:: + + >>> dis = DisjointIntervalSequence.from_transcript(transcript) + >>> dis.end5_index # same as start when on coordinate strand + 1 + >>> dis.end3_index # same as end when on coordinate strand + 6 + +When ``on_coordinate_strand`` is ``False``, the mapping reverses: +``end5_index`` equals ``end`` and ``end3_index`` equals ``start``:: + + Start Index: 3 + End Index: 7 + DIS Coordinates: 0 1 2 3 4 5 6 7 + DNA Sequence: T G A C C T G + ----------------------------------------------------- + DNA Sequence: A C T G G A C + DIS Coordinates: 0 1 2 3 4 5 6 7 + |<--------->| + Opposite Strand + +:: + + >>> opp = dis.as_opposite_strand() + >>> opp.end5_index # same as end when off coordinate strand + 7 + >>> opp.end3_index # same as start when off coordinate strand + 3 + +Boundary Properties +~~~~~~~~~~~~~~~~~~~ + +Zero-length DIS objects at the interval and coordinate boundaries are +available as properties:: + + >>> dis.end5 # 0-length DIS at the interval's 5' boundary + >>> dis.end3 # 0-length DIS at the interval's 3' boundary + >>> dis.coord_end5 # 0-length DIS at the coordinate space's 5' boundary + >>> dis.coord_end3 # 0-length DIS at the coordinate space's 3' boundary diff --git a/docs-src/index.rst b/docs-src/index.rst index 46b7b4b6..c6feb3fc 100644 --- a/docs-src/index.rst +++ b/docs-src/index.rst @@ -70,6 +70,7 @@ Contents: :maxdepth: 2 quickstart + diseq anchors api genomes diff --git a/genome_kit/__init__.py b/genome_kit/__init__.py index dc1f1cb3..4d4d22e3 100644 --- a/genome_kit/__init__.py +++ b/genome_kit/__init__.py @@ -5,6 +5,7 @@ from . import gk_data from .data_manager import DataManager, DefaultDataManager, GCSDataManager +from .diseq import DisjointIntervalSequence from .genome import Genome, ApprisNotAvailableError, ManeNotAvailableError from .genome_annotation import ( Cds, diff --git a/genome_kit/diseq.py b/genome_kit/diseq.py new file mode 100644 index 00000000..bff791da --- /dev/null +++ b/genome_kit/diseq.py @@ -0,0 +1,454 @@ +from copy import deepcopy +import enum +from dataclasses import dataclass +from typing import Sequence, Literal + +from .interval import Interval +from .genome_annotation import Transcript + + +class IndexDirection(enum.Enum): + """Controls how indices are assigned to the 5' and 3' ends of a coordinate space. + Changing this value will result in undefined behaviour for + :py:class:`DisjointIntervalSequence` objects created under a different convention. + + ``TRANSCRIPT_FIVE_TO_THREE`` + Index 0 is always at the coordinate transcript's 5' end, regardless of genomic strand. + + ``POSITIVE_STRAND_LEFT_TO_RIGHT`` + Index 0 is always at the leftmost genomic position relative to the positive strand. + On the negative strand, this means the 3' end is at index 0. + """ + + TRANSCRIPT_FIVE_TO_THREE = "transcript_five_to_three" + POSITIVE_STRAND_LEFT_TO_RIGHT = "positive_strand_left_to_right" + + +@dataclass(frozen=True) +class _CoordinateMetadata: + name: str | None + reference_genome: str + chromosome: str + transcript_strand: Literal["+", "-"] + + +@dataclass(frozen=True) +class _IntervalMetadata: + name: str | None + on_coordinate_strand: bool + + +class DisjointIntervalSequence: + """A flattened coordinate system over a sequence of disjoint genomic Intervals. + + A DIS represents two layers: + + - A **coordinate space** defined by a sequence of non-overlapping genomic + :py:class:`~genome_kit.Interval` objects (e.g. the exons of a transcript), + which are flattened into a contiguous 0-based index space. Indices for the + coordinate space are assigned according to the current :py:class:`IndexDirection` + value. + - An **interval** within that coordinate space, defined by a 5' and 3' index. + The interval may lie on the same, or opposite, strand as the coordinate space. + + Use :py:meth:`from_transcript` or :py:meth:`from_intervals` to construct + instances rather than calling the constructor directly. + """ + + _index_direction: IndexDirection = IndexDirection.TRANSCRIPT_FIVE_TO_THREE + + @classmethod + def set_index_direction(cls, direction: IndexDirection) -> None: + """Set the index direction convention for all DIS instances. + + Parameters + ---------- + direction : :py:class:`IndexDirection` + The index direction convention to use. + """ + cls._index_direction = direction + + @classmethod + def get_index_direction(cls) -> IndexDirection: + """Get the current index direction convention. + + Returns + ------- + :py:class:`IndexDirection` + """ + return cls._index_direction + + def __init__( + self, + coordinate_intervals: Sequence[Interval], + *, + coord_name: str | None = None, + interval_name: str | None = None, + on_coordinate_strand: bool = True, + start: int | None = None, + end: int | None = None, + ): + """Low-level constructor. + + Prefer :py:meth:`from_transcript` or :py:meth:`from_intervals` for + public construction. + + Parameters + ---------- + coordinate_intervals : Sequence[:py:class:`~genome_kit.Interval`] + Non-empty sequence of non-overlapping Intervals on the same + chromosome, strand, and reference genome. + coord_name : :py:class:`str` or None + Optional name for the coordinate space. + interval_name : :py:class:`str` or None + Optional name for the interval. + on_coordinate_strand : :py:class:`bool` + Whether the interval is on the same strand as the coordinate + intervals. Defaults to True. Can be used to represent a sequence that binds + to the transcript if set to False. + start : :py:class:`int` or None + start index of the interval in the coordinate space. Defaults to 0 + end : :py:class:`int` or None + end index of the interval in the coordinate space. Defaults to the length + of the coordinate space. + + Raises + ------ + ValueError + If intervals are empty, inconsistent, overlapping, or if start + is greater than end. + TypeError + If any element is not an Interval. + """ + if len(coordinate_intervals) == 0: + raise ValueError("coordinate_intervals must be non-empty") + + for i, iv in enumerate(coordinate_intervals): + if not isinstance(iv, Interval): + raise TypeError( + f"coordinate_intervals[{i}] is {type(iv).__name__}, expected Interval" + ) + + # Consistent chromosome, strand, reference_genome + iv0 = coordinate_intervals[0] + for iv in coordinate_intervals[1:]: + if iv.chromosome != iv0.chromosome: + raise ValueError( + f"All intervals must share the same chromosome, " + f"got {iv0.chromosome!r} and {iv.chromosome!r}" + ) + if iv.strand != iv0.strand: + raise ValueError( + f"All intervals must share the same strand, " + f"got {iv0.strand!r} and {iv.strand!r}" + ) + if iv.reference_genome != iv0.reference_genome: + raise ValueError( + f"All intervals must share the same reference genome, " + f"got {iv0.reference_genome!r} and {iv.reference_genome!r}" + ) + + # Sort 5'->3' + if iv0.strand == "+": + sorted_intervals = sorted(coordinate_intervals, key=lambda iv: iv.start) + else: + # On negative strand end is the 5' end since start < end. + # Sort by -end to get 5'->3' order. + sorted_intervals = sorted(coordinate_intervals, key=lambda iv: -iv.end) + + # No overlaps (adjacent/touching OK) + for i in range(len(sorted_intervals) - 1): + cur_iv, next_iv = sorted_intervals[i], sorted_intervals[i + 1] + plus_strand_overlap = iv0.strand == "+" and cur_iv.end > next_iv.start + minus_strand_overlap = iv0.strand == "-" and cur_iv.start < next_iv.end + if plus_strand_overlap or minus_strand_overlap: + raise ValueError( + f"Intervals must not overlap: [{cur_iv.start}, {cur_iv.end}) and [{next_iv.start}, {next_iv.end})" + ) + + self._coordinate_intervals: tuple[Interval, ...] = tuple(sorted_intervals) + + self._coord_metadata = _CoordinateMetadata( + name=coord_name, + reference_genome=iv0.reference_genome, + chromosome=iv0.chromosome, + transcript_strand=iv0.strand, + ) + self._interval_metadata = _IntervalMetadata( + name=interval_name, + on_coordinate_strand=on_coordinate_strand, + ) + + # Default interval start/end to span the full coordinate + if start is None: + start = 0 + if end is None: + end = self.coordinate_length + + # Validate that start is less than or equal to end + if start > end: + raise ValueError( + f"start index {start} cannot be greater than end index {end}" + ) + + self._start: int = start + self._end: int = end + + @classmethod + def from_intervals( + cls, + intervals: Sequence[Interval], + *, + coord_name: str | None = None, + interval_name: str | None = None, + ) -> "DisjointIntervalSequence": + """Construct a DIS from a sequence of Intervals + (or :py:class:`~genome_kit.Exon`/:py:class:`~genome_kit.Cds`/:py:class:`~genome_kit.Utr` objects). + + Parameters + ---------- + intervals : Sequence[:py:class:`~genome_kit.Interval`] + Sequence of Interval or annotation objects. + coord_name : :py:class:`str` or None + Optional name for the coordinate space. + interval_name : :py:class:`str` or None + Optional name for the interval. + + Returns + ------- + :py:class:`DisjointIntervalSequence` + """ + coord_intervals = [ + iv.interval if hasattr(iv, "interval") else iv for iv in intervals + ] + return cls(coord_intervals, coord_name=coord_name, interval_name=interval_name) + + @classmethod + def from_transcript( + cls, + transcript: Transcript, + *, + region: Literal["exons", "cds", "utr5", "utr3"] = "exons", + coord_name: str | None = None, + interval_name: str | None = None, + ) -> "DisjointIntervalSequence": + """Construct a DIS from a transcript's exons, CDS, or UTR regions. + + Parameters + ---------- + transcript : :py:class:`~genome_kit.Transcript` + The source Transcript object. + region : :py:class:`str` + Which region to extract — ``"exons"``, ``"cds"``, + ``"utr5"``, or ``"utr3"``. + coord_name : :py:class:`str` or None + Optional name for the coordinate space. Defaults to ``transcript.id``. + interval_name : :py:class:`str` or None + Optional name for the interval. Defaults to ``transcript.id``. + + Returns + ------- + :py:class:`DisjointIntervalSequence` + + Raises + ------ + ValueError + If region is not one of the allowed values. + """ + match region: + case "exons": + region_elements = transcript.exons + case "cds": + region_elements = transcript.cdss + case "utr5": + region_elements = transcript.utr5s + case "utr3": + region_elements = transcript.utr3s + case _: + raise ValueError(f"Invalid region: {region!r}") + coord_intervals = [element.interval for element in region_elements] + if coord_name is None: + coord_name = transcript.id + if interval_name is None: + interval_name = transcript.id + + return cls(coord_intervals, coord_name=coord_name, interval_name=interval_name) + + @property + def coord_name(self) -> str | None: + """Name of the coordinate space, or None.""" + return self._coord_metadata.name + + @property + def reference_genome(self) -> str: + """Reference genome of the coordinate intervals.""" + return self._coord_metadata.reference_genome + + @property + def chromosome(self) -> str: + """Chromosome of the coordinate intervals.""" + return self._coord_metadata.chromosome + + @property + def coord_strand(self) -> Literal["+", "-"]: + """Strand of the coordinate intervals (the transcript strand).""" + return self._coord_metadata.transcript_strand + + @property + def name(self) -> str | None: + """Name of the interval, or None.""" + return self._interval_metadata.name + + @property + def on_coordinate_strand(self) -> bool: + """True if the interval is on the same strand as the coordinate intervals.""" + return self._interval_metadata.on_coordinate_strand + + @property + def strand(self) -> Literal["+", "-"]: + """Effective strand of the interval, accounting for on_coordinate_strand.""" + if self.on_coordinate_strand: + return self.coord_strand + # Interval is on opposite strand + if self.coord_strand == "+": + return "-" + return "+" + + @property + def coordinate_end5_index(self) -> int: + """5' index of the coordinate space.""" + if self._index_direction == IndexDirection.TRANSCRIPT_FIVE_TO_THREE: + return 0 + if self.coord_strand == "+": + return 0 + return self.coordinate_length + + @property + def coordinate_end3_index(self) -> int: + """3' index of the coordinate space.""" + if self._index_direction == IndexDirection.TRANSCRIPT_FIVE_TO_THREE: + return self.coordinate_length + if self.coord_strand == "+": + return self.coordinate_length + return 0 + + @property + def end5_index(self) -> int: + """5' index of the interval.""" + if self._upstream_index_step() == -1: + return self._start + return self._end + + @property + def end3_index(self) -> int: + """3' index of the interval.""" + if self._upstream_index_step() == -1: + return self._end + return self._start + + @property + def start(self) -> int: + """Start index of the interval in the coordinate space.""" + return self._start + + @property + def end(self) -> int: + """End index of the interval in the coordinate space.""" + return self._end + + def _at_index( + self, idx: int, on_coordinate_strand: bool + ) -> "DisjointIntervalSequence": + """Return a 0-length DIS at the given index position.""" + return DisjointIntervalSequence( + self._coordinate_intervals, + coord_name=self._coord_metadata.name, + on_coordinate_strand=on_coordinate_strand, + start=idx, + end=idx, + ) + + @property + def end5(self) -> "DisjointIntervalSequence": + """0-length DIS at the interval's 5' end.""" + return self._at_index( + self.end5_index, on_coordinate_strand=self.on_coordinate_strand + ) + + @property + def end3(self) -> "DisjointIntervalSequence": + """0-length DIS at the interval's 3' end.""" + return self._at_index( + self.end3_index, on_coordinate_strand=self.on_coordinate_strand + ) + + @property + def coord_end5(self) -> "DisjointIntervalSequence": + """0-length DIS at the coordinate space's 5' end.""" + return self._at_index(self.coordinate_end5_index, on_coordinate_strand=True) + + @property + def coord_end3(self) -> "DisjointIntervalSequence": + """0-length DIS at the coordinate space's 3' end.""" + return self._at_index(self.coordinate_end3_index, on_coordinate_strand=True) + + @property + def coordinate_intervals(self) -> tuple[Interval, ...]: + """The underlying genomic intervals of the coordinate-space, sorted 5'->3'.""" + # Deepcopy to preserve imutability of this DIS + return deepcopy(self._coordinate_intervals) + + @property + def coordinate_length(self) -> int: + """Total length of the coordinate space in bases.""" + return sum(len(iv) for iv in self._coordinate_intervals) + + @property + def length(self) -> int: + """Length of the interval on the coordinate space.""" + return self.end - self.start + + def _upstream_index_step(self, on_coordinate_strand: bool | None = None) -> int: + """Return +1 or -1 indicating the upstream direction in index space. + + Args: + on_coordinate_strand: Override for which strand to compute the step for. + Defaults to this interval's on_coordinate_strand. + """ + if on_coordinate_strand is None: + on_coordinate_strand = self.on_coordinate_strand + if self._index_direction == IndexDirection.TRANSCRIPT_FIVE_TO_THREE: + return -1 if on_coordinate_strand else 1 + # POSITIVE_STRAND_LEFT_TO_RIGHT: effective strand determines direction + return -1 if self.strand == "+" else 1 + + def __len__(self) -> int: + """Return the length of the interval.""" + return self.length + + def __repr__(self) -> str: + """Return a human-readable representation.""" + return ( + f"DisjointIntervalSequence(" + f"coord_name={self._coord_metadata.name!r}, " + f"name={self._interval_metadata.name!r}, " + f"{self.chromosome}:{self.coord_strand}, " + f"len={self.length}, " + f"coord_intervals={self._coordinate_intervals}, " + f"start={self._start}, " + f"end={self._end}, " + f"end5={self.end5_index}, " + f"end3={self.end3_index})" + ) + + def __eq__(self, other: object) -> bool: + """Equality based on coordinate intervals, metadata, and index values.""" + if not isinstance(other, DisjointIntervalSequence): + return NotImplemented + return ( + self._coord_metadata == other._coord_metadata + and self._interval_metadata == other._interval_metadata + and self._start == other._start + and self._end == other._end + and self._coordinate_intervals == other._coordinate_intervals + ) diff --git a/tests/test_diseq.py b/tests/test_diseq.py new file mode 100644 index 00000000..322866fb --- /dev/null +++ b/tests/test_diseq.py @@ -0,0 +1,523 @@ +import unittest +from genome_kit import Interval +from tests import MiniGenome +from genome_kit.diseq import DisjointIntervalSequence + +REFG = "hg19.mini" + + +def _make_intervals(specs, refg = REFG): + """Helper: specs is list of (chrom, strand, start, end).""" + return [ + Interval(chrom, strand, start, end, refg) for chrom, strand, start, end in specs + ] + + +class TestInit(unittest.TestCase): + + def test_non_interval_raises(self): + with self.assertRaises(TypeError): + DisjointIntervalSequence(["not an interval"]) + with self.assertRaises(TypeError): + DisjointIntervalSequence([42]) + iv = Interval("chr1", "+", 100, 200, REFG) + with self.assertRaises(TypeError): + DisjointIntervalSequence([iv, "bad"]) + + def test_empty_list_raises(self): + with self.assertRaises(ValueError): + DisjointIntervalSequence([]) + + def test_mixed_chromosomes_raises(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr2", "+", 300, 400)]) + with self.assertRaises(ValueError): + DisjointIntervalSequence(ivs) + + def test_mixed_strands_raises(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "-", 300, 400)]) + with self.assertRaises(ValueError): + DisjointIntervalSequence(ivs) + + def test_mixed_reference_genomes_raises(self): + ivs = [ + Interval("chr2", "+", 100, 200, "hg38.p12.mini"), + Interval("chr2", "+", 300, 400, "hg38.p13.mini"), + ] + with self.assertRaises(ValueError): + DisjointIntervalSequence(ivs) + + def test_overlapping_intervals_raises(self): + ivs = _make_intervals([("chr1", "+", 100, 250), ("chr1", "+", 200, 400)]) + with self.assertRaises(ValueError): + DisjointIntervalSequence(ivs) + + def test_overlapping_intervals_negative_strand_raises(self): + ivs = _make_intervals([("chr1", "-", 100, 250), ("chr1", "-", 200, 400)]) + with self.assertRaises(ValueError): + DisjointIntervalSequence(ivs) + + def test_adjacent_intervals_ok(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 200, 300)]) + dis = DisjointIntervalSequence(ivs) + self.assertEqual(dis.coordinate_length, 200) + self.assertEqual(dis.coordinate_intervals, tuple(ivs)) + + def test_sorts_out_of_order_positive(self): + ivs = _make_intervals([("chr1", "+", 300, 400), ("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs) + self.assertEqual(dis.coordinate_intervals[0].start, 100) + self.assertEqual(dis.coordinate_intervals[1].start, 300) + + def test_sorts_out_of_order_negative(self): + ivs = _make_intervals([("chr1", "-", 100, 200), ("chr1", "-", 300, 400)]) + dis = DisjointIntervalSequence(ivs) + self.assertEqual(dis.coordinate_intervals[0].end, 400) + self.assertEqual(dis.coordinate_intervals[1].end, 200) + + def test_coordinate_length(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 450)]) + dis = DisjointIntervalSequence(ivs) + self.assertEqual(dis.coordinate_length, 250) + + def test_start_end_default_to_full_interval(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence(ivs) + self.assertEqual(dis.start, 0) + self.assertEqual(dis.end, 200) + + def test_out_of_bounds_indices_allowed(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) # coord_length=100 + dis = DisjointIntervalSequence(ivs, start=-10, end=50) + self.assertEqual(dis.start, -10) + dis2 = DisjointIntervalSequence(ivs, start=10, end=300) + self.assertEqual(dis2.end, 300) + dis3 = DisjointIntervalSequence(ivs, start=-100, end=300) + self.assertEqual(dis3.start, -100) + self.assertEqual(dis3.end, 300) + dis4 = DisjointIntervalSequence(ivs, start=300, end=300) + self.assertEqual(dis4.start, 300) + self.assertEqual(dis4.end, 300) + dis5 = DisjointIntervalSequence(ivs, start=-300, end=-300) + self.assertEqual(dis5.start, -300) + self.assertEqual(dis5.end, -300) + + def test_start_greater_than_end_raises(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + with self.assertRaises(ValueError): + DisjointIntervalSequence(ivs, start=80, end=10) + + def test_start_equals_end_allowed(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs, start=50, end=50) + self.assertEqual(dis.length, 0) + + def test_custom_interval_indices(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence(ivs, start=10, end=50) + self.assertEqual(dis.start, 10) + self.assertEqual(dis.end, 50) + self.assertEqual(dis.length, 40) + + +class TestFromIntervals(unittest.TestCase): + + def setUp(self): + self.genome = MiniGenome("gencode.v41") + self.transcript = self.genome.transcripts["ENST00000233331.12"] + + def test_happy_path(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence.from_intervals(ivs, coord_name="mycoord") + self.assertEqual(dis.coord_name, "mycoord") + self.assertEqual(dis.coordinate_length, 200) + self.assertEqual(dis.coordinate_intervals, tuple(ivs)) + + def test_coord_and_interval_id_independent(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence.from_intervals( + ivs, coord_name="c1", interval_name="i1" + ) + self.assertEqual(dis.coord_name, "c1") + self.assertEqual(dis.name, "i1") + + def test_single_interval(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence.from_intervals(ivs) + self.assertEqual(len(dis.coordinate_intervals), 1) + self.assertEqual(dis.coordinate_length, 100) + self.assertEqual(dis.coordinate_intervals, tuple(ivs)) + + def test_adjacent_intervals(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 200, 300)]) + dis = DisjointIntervalSequence.from_intervals(ivs) + self.assertEqual(dis.coordinate_length, 200) + self.assertEqual(dis.coordinate_intervals, tuple(ivs)) + + def test_extracts_interval_from_exon(self): + exons = list(self.transcript.exons) + dis = DisjointIntervalSequence.from_intervals(exons) + expected = tuple(e.interval for e in self.transcript.exons) + self.assertEqual(dis.coordinate_intervals, expected) + + def test_extracts_interval_from_cds(self): + cdss = list(self.transcript.cdss) + dis = DisjointIntervalSequence.from_intervals(cdss) + expected = tuple(c.interval for c in self.transcript.cdss) + self.assertEqual(dis.coordinate_intervals, expected) + + def test_extracts_interval_from_utr5(self): + utr5s = list(self.transcript.utr5s) + dis = DisjointIntervalSequence.from_intervals(utr5s) + expected = tuple(u.interval for u in self.transcript.utr5s) + self.assertEqual(dis.coordinate_intervals, expected) + + def test_extracts_interval_from_utr3(self): + utr3s = list(self.transcript.utr3s) + dis = DisjointIntervalSequence.from_intervals(utr3s) + expected = tuple(u.interval for u in self.transcript.utr3s) + self.assertEqual(dis.coordinate_intervals, expected) + + def test_metadata_defaults_to_none(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence.from_intervals(ivs) + self.assertEqual(dis.coord_name, None) + self.assertEqual(dis.name, None) + self.assertEqual(dis.reference_genome, REFG) + self.assertEqual(dis.chromosome, "chr1") + self.assertEqual(dis.coord_strand, "+") + + +class TestFromTranscript(unittest.TestCase): + + def setUp(self): + self.genome = MiniGenome("gencode.v41") + self.transcript = self.genome.transcripts["ENST00000233331.12"] + + def test_exons_region(self): + dis = DisjointIntervalSequence.from_transcript(self.transcript, region="exons") + expected = tuple(e.interval for e in self.transcript.exons) + self.assertEqual(dis.coordinate_intervals, expected) + + def test_cds_region(self): + dis = DisjointIntervalSequence.from_transcript(self.transcript, region="cds") + expected = tuple(c.interval for c in self.transcript.cdss) + self.assertEqual(dis.coordinate_intervals, expected) + + def test_utr5_region(self): + dis = DisjointIntervalSequence.from_transcript(self.transcript, region="utr5") + expected = tuple(u.interval for u in self.transcript.utr5s) + self.assertEqual(dis.coordinate_intervals, expected) + + def test_utr3_region(self): + dis = DisjointIntervalSequence.from_transcript(self.transcript, region="utr3") + expected = tuple(u.interval for u in self.transcript.utr3s) + self.assertEqual(dis.coordinate_intervals, expected) + + def test_metadata_defaults_to_transcript_id(self): + dis = DisjointIntervalSequence.from_transcript(self.transcript) + self.assertEqual(dis.coord_name, self.transcript.id) + self.assertEqual(dis.name, self.transcript.id) + self.assertEqual(dis.reference_genome, self.transcript.reference_genome) + self.assertEqual(dis.chromosome, self.transcript.chromosome) + self.assertEqual(dis.coord_strand, self.transcript.strand) + + def test_invalid_region_raises(self): + with self.assertRaises(ValueError): + DisjointIntervalSequence.from_transcript(self.transcript, region="invalid") + + def test_custom_id_overrides(self): + dis = DisjointIntervalSequence.from_transcript( + self.transcript, coord_name="custom_coord", interval_name="custom_iv" + ) + self.assertEqual(dis.coord_name, "custom_coord") + self.assertEqual(dis.name, "custom_iv") + + +class TestProperties(unittest.TestCase): + + def test_metadata_getters_positive(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs, coord_name="c", interval_name="i") + self.assertEqual(dis.coord_name, "c") + self.assertEqual(dis.name, "i") + self.assertEqual(dis.reference_genome, REFG) + self.assertEqual(dis.chromosome, "chr1") + self.assertEqual(dis.coord_strand, "+") + self.assertTrue(dis.on_coordinate_strand) + + def test_on_coordinate_strand_false(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs, on_coordinate_strand=False) + self.assertFalse(dis.on_coordinate_strand) + + def test_coord_transcript_strand_positive(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs) + self.assertEqual(dis.coord_strand, "+") + + def test_coord_transcript_strand_negative(self): + ivs = _make_intervals([("chr1", "-", 100, 200)]) + dis = DisjointIntervalSequence(ivs) + self.assertEqual(dis.coord_strand, "-") + + def test_coord_transcript_strand_unaffected_by_on_coordinate_strand(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs, on_coordinate_strand=False) + self.assertEqual(dis.coord_strand, "+") + + def test_transcript_strand_on_coordinate_strand_positive(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs, on_coordinate_strand=True) + self.assertEqual(dis.strand, "+") + + def test_transcript_strand_on_coordinate_strand_negative(self): + ivs = _make_intervals([("chr1", "-", 100, 200)]) + dis = DisjointIntervalSequence(ivs, on_coordinate_strand=True) + self.assertEqual(dis.strand, "-") + + def test_transcript_strand_off_coordinate_strand_positive(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs, on_coordinate_strand=False) + self.assertEqual(dis.strand, "-") + + def test_transcript_strand_off_coordinate_strand_negative(self): + ivs = _make_intervals([("chr1", "-", 100, 200)]) + dis = DisjointIntervalSequence(ivs, on_coordinate_strand=False) + self.assertEqual(dis.strand, "+") + + def test_start_and_end(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence(ivs, start=10, end=150) + self.assertEqual(dis.start, 10) + self.assertEqual(dis.end, 150) + + def test_coordinate_intervals(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence(ivs, start=10, end=150) + self.assertEqual(dis.coordinate_intervals, tuple(ivs)) + + def test_coordinate_length(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence(ivs) + self.assertEqual(dis.coordinate_length, 200) + + def test_length(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence(ivs, start=10, end=150) + self.assertEqual(dis.length, 140) + + def test_length_zero(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs, start=50, end=50) + self.assertEqual(dis.length, 0) + + +class TestEndProperties(unittest.TestCase): + + def test_end5_default(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence(ivs, coord_name="c", interval_name="i") + # On coordinate strand: end5_index == start (0), end3_index == end (200) + self.assertEqual(dis.end5_index, 0) + self.assertEqual(dis.end3_index, 200) + e5 = dis.end5 + self.assertEqual(len(e5), 0) + self.assertEqual(e5.start, 0) + self.assertEqual(e5.end, 0) + self.assertEqual(e5.coord_name, "c") + self.assertEqual(e5.name, None) + expected = DisjointIntervalSequence( + ivs, coord_name="c", interval_name=None, on_coordinate_strand=True, start=0, end=0 + ) + self.assertEqual(e5, expected) + + def test_end3_default(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence(ivs) + e3 = dis.end3 + self.assertEqual(len(e3), 0) + self.assertEqual(e3.start, 200) + self.assertEqual(e3.end, 200) + self.assertEqual(dis.end3_index, 200) + expected = DisjointIntervalSequence( + ivs, on_coordinate_strand=True, start=200, end=200 + ) + self.assertEqual(e3, expected) + + def test_end5_end3_with_custom_indices(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence(ivs, start=30, end=150) + # On coordinate strand: end5_index == start, end3_index == end + e5 = dis.end5 + e3 = dis.end3 + self.assertEqual(e5.start, 30) + self.assertEqual(e5.end, 30) + self.assertEqual(e3.start, 150) + self.assertEqual(e3.end, 150) + self.assertEqual(dis.end5_index, 30) + self.assertEqual(dis.end3_index, 150) + expected_e5 = DisjointIntervalSequence( + ivs, on_coordinate_strand=True, start=30, end=30 + ) + expected_e3 = DisjointIntervalSequence( + ivs, on_coordinate_strand=True, start=150, end=150 + ) + self.assertEqual(e5, expected_e5) + self.assertEqual(e3, expected_e3) + + def test_coord_end5(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs) + ce5 = dis.coord_end5 + self.assertEqual(len(ce5), 0) + self.assertEqual(ce5.start, 0) + self.assertEqual(ce5.end, 0) + self.assertEqual(ce5.end5_index, 0) + self.assertEqual(ce5.end3_index, 0) + expected = DisjointIntervalSequence( + ivs, on_coordinate_strand=True, start=0, end=0 + ) + self.assertEqual(ce5, expected) + + def test_coord_end3(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs) + ce3 = dis.coord_end3 + self.assertEqual(len(ce3), 0) + self.assertEqual(ce3.start, 100) + self.assertEqual(ce3.end, 100) + self.assertEqual(ce3.end5_index, 100) + self.assertEqual(ce3.end3_index, 100) + expected = DisjointIntervalSequence( + ivs, on_coordinate_strand=True, start=100, end=100 + ) + self.assertEqual(ce3, expected) + + def test_end_preserves_coordinate_intervals(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence(ivs) + self.assertEqual(dis.end5.coordinate_intervals, dis.coordinate_intervals) + self.assertEqual(dis.end3.coordinate_intervals, dis.coordinate_intervals) + self.assertEqual(dis.coord_end5.coordinate_intervals, dis.coordinate_intervals) + self.assertEqual(dis.coord_end3.coordinate_intervals, dis.coordinate_intervals) + + def test_end_preserves_on_coordinate_strand(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs, on_coordinate_strand=False) + self.assertFalse(dis.end5.on_coordinate_strand) + self.assertFalse(dis.end3.on_coordinate_strand) + + def test_coord_end_independent_of_on_coordinate_strand(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs, on_coordinate_strand=False) + self.assertTrue(dis.coord_end5.on_coordinate_strand) + self.assertTrue(dis.coord_end3.on_coordinate_strand) + + def test_end5_end3_opposite_strand(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence( + ivs, start=30, end=150, on_coordinate_strand=False + ) + # Off coord strand: end5 at end (150), end3 at start (30) + self.assertEqual(dis.end5_index, 150) + self.assertEqual(dis.end3_index, 30) + e5 = dis.end5 + e3 = dis.end3 + self.assertEqual(e5.start, 150) + self.assertEqual(e5.end, 150) + self.assertEqual(e3.start, 30) + self.assertEqual(e3.end, 30) + + +class TestDunderMethods(unittest.TestCase): + + def test_len(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence(ivs) + self.assertEqual(len(dis), 200) + + def test_len_with_custom_indices(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs, start=10, end=80) + self.assertEqual(len(dis), 70) + + def test_repr(self): + ivs = _make_intervals([("chr1", "+", 100, 200), ("chr1", "+", 300, 400)]) + dis = DisjointIntervalSequence(ivs, coord_name="ENST0001", interval_name="IV1") + r = repr(dis) + self.assertIn("DisjointIntervalSequence(", r) + self.assertIn("coord_name='ENST0001'", r) + self.assertIn("name='IV1'", r) + self.assertIn("chr1:+", r) + self.assertIn("len=200", r) + self.assertIn('coord_intervals=(Interval("chr1", "+", 100, 200, "hg19.mini"), Interval("chr1", "+", 300, 400, "hg19.mini"))', r) + self.assertIn("start=0", r) + self.assertIn("end=200", r) + self.assertIn("end5=0", r) + self.assertIn("end3=200", r) + + def test_eq_same(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + a = DisjointIntervalSequence(ivs, coord_name="x", interval_name="i") + b = DisjointIntervalSequence(ivs, coord_name="x", interval_name="i") + self.assertEqual(a, b) + + def test_eq_different_coord_id(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + a = DisjointIntervalSequence(ivs, coord_name="x") + b = DisjointIntervalSequence(ivs, coord_name="y") + self.assertNotEqual(a, b) + + def test_eq_different_interval_id(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + a = DisjointIntervalSequence(ivs, interval_name="x") + b = DisjointIntervalSequence(ivs, interval_name="y") + self.assertNotEqual(a, b) + + def test_eq_different_on_coordinate_strand(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + a = DisjointIntervalSequence(ivs, on_coordinate_strand=True) + b = DisjointIntervalSequence(ivs, on_coordinate_strand=False) + self.assertNotEqual(a, b) + + def test_eq_different_chrom(self): + a = DisjointIntervalSequence(_make_intervals([("chr1", "+", 100, 200)])) + b = DisjointIntervalSequence(_make_intervals([("chr2", "+", 100, 200)])) + self.assertNotEqual(a, b) + + def test_eq_different_refg(self): + a = DisjointIntervalSequence([Interval("chr2", "+", 100, 200, "hg38.p12.mini")]) + b = DisjointIntervalSequence([Interval("chr2", "+", 100, 200, "hg38.p13.mini")]) + self.assertNotEqual(a, b) + + def test_eq_different_strand(self): + a = DisjointIntervalSequence(_make_intervals([("chr1", "+", 100, 200)])) + b = DisjointIntervalSequence(_make_intervals([("chr1", "-", 100, 200)])) + self.assertNotEqual(a, b) + + def test_eq_different_coordinate_intervals(self): + a = DisjointIntervalSequence(_make_intervals([("chr1", "+", 100, 200)])) + b = DisjointIntervalSequence(_make_intervals([("chr1", "+", 100, 300)])) + self.assertNotEqual(a, b) + + def test_eq_different_start(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + a = DisjointIntervalSequence(ivs, start=0, end=50) + b = DisjointIntervalSequence(ivs, start=10, end=50) + self.assertNotEqual(a, b) + + def test_eq_different_end(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + a = DisjointIntervalSequence(ivs, start=0, end=50) + b = DisjointIntervalSequence(ivs, start=0, end=60) + self.assertNotEqual(a, b) + + def test_eq_non_dis(self): + ivs = _make_intervals([("chr1", "+", 100, 200)]) + dis = DisjointIntervalSequence(ivs) + self.assertNotEqual(dis, "not a DIS") + self.assertNotEqual(dis, 42) + + +if __name__ == "__main__": + unittest.main()