Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions src/Chromosome.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<?php declare(strict_types=1);

namespace MLL\Utils;

/**
 * A human chromosome: 1-22, X, Y or the mitochondrial chromosome.
 *
 * The name is stored without the "chr" prefix and in upper case, together with
 * the naming convention that was detected from the constructor input.
 */
class Chromosome
{
    /** Upper-case name without prefix; the mitochondrial chromosome is always stored as "M". */
    private string $value;

    /** Convention detected from the constructor input; used by toString() when none is passed. */
    private NamingConvention $namingConvention;

    /**
     * @param string $chromosomeAsString e.g. "chr11", "11", "chrX" or "MT"; matched case-insensitively
     *
     * @throws \InvalidArgumentException if the string is not a recognised chromosome name
     */
    public function __construct(string $chromosomeAsString)
    {
        // Matches human chromosomes with or without "chr" prefix: chr1-chr22, chrX, chrY, chrM, chrMT, or 1-22, X, Y, M, MT.
        if (\Safe\preg_match('/^(chr)?(1[0-9]|[1-9]|2[0-2]|X|Y|M|MT)$/i', $chromosomeAsString, $matches) === 0) {
            throw new \InvalidArgumentException("Invalid chromosome: {$chromosomeAsString}. Expected format: chr1-chr22, chrX, chrY, chrM, or without chr prefix.");
        }

        // The pattern is case-insensitive, so the captured prefix may be "CHR" or "Chr";
        // comparing it case-sensitively would misclassify such input as ENSEMBL.
        $hasChrPrefix = strtolower($matches[1]) === 'chr';
        $this->namingConvention = $hasChrPrefix
            ? new NamingConvention(NamingConvention::UCSC)
            : new NamingConvention(NamingConvention::ENSEMBL);

        // Store one canonical spelling for the mitochondrial chromosome so that
        // "M" and "MT" render identically ("chrM" for UCSC, "MT" for ENSEMBL).
        $name = strtoupper($matches[2]);
        $this->value = $name === 'MT' ? 'M' : $name;
    }

    /**
     * Renders the chromosome name in the given naming convention.
     *
     * @param NamingConvention|null $namingConvention convention to render with; defaults to the one detected at construction
     *
     * @throws \InvalidArgumentException if no rendering rule exists for the convention
     */
    public function toString(?NamingConvention $namingConvention = null): string
    {
        $namingConvention ??= $this->namingConvention;

        switch ($namingConvention->value) {
            case NamingConvention::ENSEMBL:
                return $this->value === 'M' ? 'MT' : $this->value;
            case NamingConvention::UCSC:
                return "chr{$this->value}";
            default:
                throw new \InvalidArgumentException("No toString logic implemented for valid naming convention: {$namingConvention->value}");
        }
    }
}
35 changes: 35 additions & 0 deletions src/GenomicPosition.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
<?php declare(strict_types=1);

namespace MLL\Utils;

/**
 * A single position on a chromosome. The position must be 1 or greater.
 */
class GenomicPosition
{
    public Chromosome $chromosome;

    public int $position;

    /**
     * @throws \InvalidArgumentException if $position is smaller than 1
     */
    public function __construct(Chromosome $chromosome, int $position)
    {
        if ($position <= 0) {
            throw new \InvalidArgumentException("Position must be positive, got: {$position}.");
        }

        $this->position = $position;
        $this->chromosome = $chromosome;
    }

    /**
     * Parses "<chromosome>:<position>", optionally with a "g." before the position.
     *
     * @example GenomicPosition::parse('chr1:123456')
     *
     * @throws \InvalidArgumentException if the string does not have the expected shape,
     *                                   or if the chromosome or position is rejected by the respective constructor
     */
    public static function parse(string $genomicPosition): self
    {
        $matchCount = \Safe\preg_match('/^(.+):(g\.|)(\d+)$/', $genomicPosition, $parts);
        if ($matchCount === 0) {
            throw new \InvalidArgumentException("Invalid genomic position format: {$genomicPosition}. Expected format: chr1:123456.");
        }

        // $parts[2] holds the optional "g." marker and is intentionally ignored.
        $chromosome = new Chromosome($parts[1]);
        $position = (int) $parts[3];

        return new self($chromosome, $position);
    }

    /**
     * Renders "<chromosome>:<position>", with the chromosome in the given naming
     * convention (or the chromosome's own default when null is passed).
     */
    public function toString(?NamingConvention $namingConvention = null): string
    {
        $chromosomeName = $this->chromosome->toString($namingConvention);

        return "{$chromosomeName}:{$this->position}";
    }
}
89 changes: 89 additions & 0 deletions src/GenomicRegion.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
<?php declare(strict_types=1);

namespace MLL\Utils;

use function Safe\preg_match;

final class GenomicRegion
{
public Chromosome $chromosome;

public int $start;

public int $end;

/**
 * Creates a region from $start to $end (both inclusive) on the given chromosome.
 *
 * A region where $start equals $end is accepted.
 *
 * @throws \InvalidArgumentException if $start or $end is smaller than 1, or if $start is greater than $end
 */
public function __construct(
    Chromosome $chromosome,
    int $start,
    int $end
) {
    if ($start <= 0) {
        throw new \InvalidArgumentException("Start must be positive, got: {$start}.");
    }

    if ($end <= 0) {
        throw new \InvalidArgumentException("End must be positive, got: {$end}.");
    }

    if ($end < $start) {
        throw new \InvalidArgumentException("End ({$end}) must be greater than start ({$start})");
    }

    $this->start = $start;
    $this->end = $end;
    $this->chromosome = $chromosome;
}

/**
 * Parses "<chromosome>:<start>-<end>", optionally with a "g." before the start.
 *
 * The "-<end>" part may be omitted; the region then starts and ends at the same position.
 *
 * @throws \InvalidArgumentException if the string does not have the expected shape,
 *                                   or if the parsed values are rejected by the constructors
 */
public static function parse(string $genomicRegion): self
{
    $matchCount = preg_match('/^(.+):(g\.|)(\d+)(-(\d+)|)$/', $genomicRegion, $parts);
    if ($matchCount === 0) {
        throw new \InvalidArgumentException("Invalid genomic region format: {$genomicRegion}. Expected format: chr1:123-456.");
    }

    $chromosome = new Chromosome($parts[1]);
    $start = (int) $parts[3];
    // Group 5 is absent from the match array when no "-<end>" was given.
    $end = isset($parts[5]) ? (int) $parts[5] : $start;

    return new self($chromosome, $start, $end);
}

/**
 * Tells whether the given position lies on this region's chromosome and within start..end (inclusive).
 *
 * Chromosomes are compared through Chromosome::toString() without an explicit naming convention.
 * NOTE(review): that presumably makes "chr11" and "11" compare as different — confirm this is intended.
 */
public function containsGenomicPosition(GenomicPosition $genomicPosition): bool
{
    $sameChromosome = $this->chromosome->toString() === $genomicPosition->chromosome->toString();
    if (! $sameChromosome) {
        return false;
    }

    return $this->positionIsBetweenStartAndEnd($genomicPosition->position);
}

/**
 * Tells whether the given region lies completely inside this region, i.e. on the
 * same chromosome and with both its start and its end within this region's bounds.
 *
 * Chromosomes are compared through Chromosome::toString() without an explicit naming convention.
 */
public function containsGenomicRegion(GenomicRegion $genomicRegion): bool
{
    $sameChromosome = $this->chromosome->toString() === $genomicRegion->chromosome->toString();
    if (! $sameChromosome) {
        return false;
    }

    if (! $this->positionIsBetweenStartAndEnd($genomicRegion->start)) {
        return false;
    }

    return $this->positionIsBetweenStartAndEnd($genomicRegion->end);
}

/**
 * Tells whether this region lies completely inside the given region: same chromosome,
 * the given region starts at or before this start and ends at or after this end.
 *
 * Chromosomes are compared through Chromosome::toString() without an explicit naming convention.
 */
public function isCoveredByGenomicRegion(GenomicRegion $genomicRegion): bool
{
    $sameChromosome = $this->chromosome->toString() === $genomicRegion->chromosome->toString();
    if (! $sameChromosome) {
        return false;
    }

    $startsAtOrBefore = $genomicRegion->start <= $this->start;
    $endsAtOrAfter = $genomicRegion->end >= $this->end;

    return $startsAtOrBefore && $endsAtOrAfter;
}

/**
 * Tells whether this region and the given region overlap on the same chromosome.
 *
 * The three alternatives together cover every overlap: the given region encloses
 * this one (isCoveredByGenomicRegion), or its start lies within this region, or
 * its end lies within this region. Boundaries are inclusive.
 *
 * Chromosomes are compared through Chromosome::toString() without an explicit naming convention.
 */
public function intersectsWithGenomicRegion(GenomicRegion $genomicRegion): bool
{
return $this->chromosome->toString() === $genomicRegion->chromosome->toString()
&& (
// Handles the case where the given region completely encloses this one,
// in which neither of its boundaries lies inside this region.
$this->isCoveredByGenomicRegion($genomicRegion)
|| $this->positionIsBetweenStartAndEnd($genomicRegion->start)
|| $this->positionIsBetweenStartAndEnd($genomicRegion->end)
);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Falsch bei vollständiger Überlappung

Wenn Region B die Region A komplett umschließt (z.B. A=chr11:20-30, B=chr11:10-40), liefert die Methode fälschlich false, weil weder B.start noch B.end innerhalb von A liegt.

Fix — die kanonische Intervall-Formel:

return $this->chromosome->equals($other->chromosome)
    && $this->start <= $other->end
    && $other->start <= $this->end;

}

/** Tells whether $position lies within this region's start..end range, boundaries included. */
private function positionIsBetweenStartAndEnd(int $position): bool
{
    if ($position < $this->start) {
        return false;
    }

    return $position <= $this->end;
}

/**
 * Renders "<chromosome>:<start>-<end>", with the chromosome in the given naming
 * convention (or the chromosome's own default when null is passed).
 */
public function toString(?NamingConvention $namingConvention = null): string
{
    $chromosomeName = $this->chromosome->toString($namingConvention);

    return "{$chromosomeName}:{$this->start}-{$this->end}";
}
}
23 changes: 23 additions & 0 deletions src/NamingConvention.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?php declare(strict_types=1);

namespace MLL\Utils;

/**
 * Wraps one of the supported naming convention identifiers.
 *
 * Only the values of the constants declared on this class are accepted.
 */
class NamingConvention
{
    public const ENSEMBL = 'ENSEMBL';
    public const UCSC = 'UCSC';

    /** Always equal to one of the constants declared on this class. */
    public string $value;

    /**
     * @throws \InvalidArgumentException if $value is not one of the supported identifiers
     */
    public function __construct(string $value)
    {
        $isSupported = $value === self::ENSEMBL || $value === self::UCSC;
        if (! $isSupported) {
            throw new \InvalidArgumentException("Invalid naming convention: {$value}");
        }

        $this->value = $value;
    }
}
40 changes: 40 additions & 0 deletions tests/ChromosomeTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
<?php declare(strict_types=1);

use MLL\Utils\Chromosome;
use MLL\Utils\NamingConvention;
use PHPUnit\Framework\TestCase;

/**
 * Covers construction of Chromosome from UCSC- and Ensembl-style names and
 * rendering in either naming convention.
 */
final class ChromosomeTest extends TestCase
{
    public function testToStringWithDefault(): void
    {
        $chromosome = new Chromosome('chr11');
        self::assertSame('chr11', $chromosome->toString());
    }

    public function testToStringForEnsembl(): void
    {
        $chromosome = new Chromosome('chr11');
        self::assertSame('11', $chromosome->toString(new NamingConvention(NamingConvention::ENSEMBL)));
    }

    /** A name without "chr" prefix is rendered without prefix by default. */
    public function testInitWithEnsembl(): void
    {
        $chromosome = new Chromosome('11');
        self::assertSame('11', $chromosome->toString());
    }

    public function testToStringForUCSC(): void
    {
        $chromosome = new Chromosome('11');
        self::assertSame('chr11', $chromosome->toString(new NamingConvention(NamingConvention::UCSC)));
    }

    public function testToStringWithUCSCAndMitochondrialChromosome(): void
    {
        $chromosome = new Chromosome('chrM');
        self::assertSame('MT', $chromosome->toString(new NamingConvention(NamingConvention::ENSEMBL)));
    }

    /** The chromosome name itself is matched case-insensitively and rendered in upper case. */
    public function testChromosomeNameIsUpperCased(): void
    {
        $chromosome = new Chromosome('chrx');
        self::assertSame('chrX', $chromosome->toString());
    }

    public function testFailedInit(): void
    {
        $chromosomeAsString = 'FOO11';
        self::expectException(\InvalidArgumentException::class);
        self::expectExceptionMessage("Invalid chromosome: {$chromosomeAsString}. Expected format: chr1-chr22, chrX, chrY, chrM, or without chr prefix.");
        new Chromosome($chromosomeAsString);
    }
}
33 changes: 33 additions & 0 deletions tests/GenomicPositionTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
<?php declare(strict_types=1);

use MLL\Utils\GenomicPosition;
use PHPUnit\Framework\TestCase;

/**
 * Covers GenomicPosition::parse() for prefixed, unprefixed and "g."-style input,
 * as well as rejection of malformed input.
 */
final class GenomicPositionTest extends TestCase
{
    public function testParseOnSuccessHG19(): void
    {
        $rendered = GenomicPosition::parse('chr11:1')->toString();

        self::assertSame('chr11:1', $rendered);
    }

    public function testParseOnSuccessGRC37(): void
    {
        $rendered = GenomicPosition::parse('11:1')->toString();

        self::assertSame('11:1', $rendered);
    }

    /** The "g." marker is accepted on input but not rendered on output. */
    public function testParseOnSuccessHGVSg(): void
    {
        $rendered = GenomicPosition::parse('chr11:g.1')->toString();

        self::assertSame('chr11:1', $rendered);
    }

    public function testParseOnError(): void
    {
        $genomicPositionAsString = '11:1test';

        self::expectException(\InvalidArgumentException::class);
        self::expectExceptionMessage("Invalid genomic position format: {$genomicPositionAsString}. Expected format: chr1:123456.");

        GenomicPosition::parse($genomicPositionAsString);
    }
}
84 changes: 84 additions & 0 deletions tests/GenomicRegionTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
<?php declare(strict_types=1);

use MLL\Utils\GenomicPosition;
use MLL\Utils\GenomicRegion;
use PHPUnit\Framework\TestCase;

/**
 * Covers parsing of GenomicRegion and its containment, coverage and
 * intersection checks.
 */
final class GenomicRegionTest extends TestCase
{
    public function testParseOnSuccessUCSC(): void
    {
        $genomicRegion = GenomicRegion::parse('chr11:1-2');
        self::assertSame('chr11:1-2', $genomicRegion->toString());
    }

    public function testParseOnSuccessEnsembl(): void
    {
        $genomicRegion = GenomicRegion::parse('11:1-2');
        self::assertSame('11:1-2', $genomicRegion->toString());
    }

    public function testParseOnSuccessHGVSg(): void
    {
        $genomicRegion = GenomicRegion::parse('chr11:g.1-2');
        self::assertSame('chr11:1-2', $genomicRegion->toString());
    }

    /** Without an explicit end the region starts and ends at the same position. */
    public function testParseSinglePosition(): void
    {
        $genomicRegion = GenomicRegion::parse('chr11:5');
        self::assertSame('chr11:5-5', $genomicRegion->toString());
    }

    public function testParseOnError(): void
    {
        $genomicRegionAsString = '11:1_2';
        self::expectException(\InvalidArgumentException::class);
        self::expectExceptionMessage("Invalid genomic region format: {$genomicRegionAsString}. Expected format: chr1:123-456.");
        GenomicRegion::parse($genomicRegionAsString);
    }

    public function testStartIsGreaterThanEnd(): void
    {
        $genomicRegionAsString = '11:2-1';
        self::expectException(\InvalidArgumentException::class);
        self::expectExceptionMessage('End (1) must be greater than start (2)');
        GenomicRegion::parse($genomicRegionAsString);
    }

    public function testContainsGenomicPositionIsTrue(): void
    {
        $genomicRegion = GenomicRegion::parse('chr11:g.1-20');
        self::assertTrue($genomicRegion->containsGenomicPosition(GenomicPosition::parse('chr11:20')));
    }

    public function testContainsGenomicPositionIsFalse(): void
    {
        $genomicRegion = GenomicRegion::parse('chr11:g.1-20');
        self::assertFalse($genomicRegion->containsGenomicPosition(GenomicPosition::parse('chr11:21')));
    }

    public function testContainsGenomicRegionIsTrue(): void
    {
        $genomicRegion = GenomicRegion::parse('chr11:g.1-20');
        self::assertTrue($genomicRegion->containsGenomicRegion(GenomicRegion::parse('chr11:19-20')));
    }

    public function testContainsGenomicRegionIsFalse(): void
    {
        $genomicRegion = GenomicRegion::parse('chr11:g.1-20');
        self::assertFalse($genomicRegion->containsGenomicRegion(GenomicRegion::parse('chr11:21-22')));
    }

    public function testIsCoveredByGenomicRegionIsTrue(): void
    {
        $genomicRegion = GenomicRegion::parse('chr11:g.20-30');
        self::assertTrue($genomicRegion->isCoveredByGenomicRegion(GenomicRegion::parse('chr11:g.15-35')));
    }

    public function testIsCoveredByGenomicRegionIsFalse(): void
    {
        $genomicRegion = GenomicRegion::parse('chr11:g.20-30');
        self::assertFalse($genomicRegion->isCoveredByGenomicRegion(GenomicRegion::parse('chr11:g.25-35')));
    }

    /** The other region encloses this one, so neither of its boundaries lies inside this region. */
    public function testIntersectsFullyWithGenomicRegionIsTrue(): void
    {
        $genomicRegion = GenomicRegion::parse('chr11:g.20-30');
        self::assertTrue($genomicRegion->intersectsWithGenomicRegion(GenomicRegion::parse('chr11:g.15-35')));
    }

    public function testIntersectsPartiallyAtStartIsTrue(): void
    {
        $genomicRegion = GenomicRegion::parse('chr11:g.20-30');
        self::assertTrue($genomicRegion->intersectsWithGenomicRegion(GenomicRegion::parse('chr11:g.15-25')));
    }

    public function testIntersectsPartiallyAtEndIsTrue(): void
    {
        $genomicRegion = GenomicRegion::parse('chr11:g.20-30');
        self::assertTrue($genomicRegion->intersectsWithGenomicRegion(GenomicRegion::parse('chr11:g.25-35')));
    }

    public function testIntersectsWithGenomicRegionIsFalse(): void
    {
        $genomicRegion = GenomicRegion::parse('chr11:g.20-30');
        self::assertFalse($genomicRegion->intersectsWithGenomicRegion(GenomicRegion::parse('chr11:15-19')));
    }

    public function testIntersectsWithGenomicRegionOnOtherChromosomeIsFalse(): void
    {
        $genomicRegion = GenomicRegion::parse('chr11:g.20-30');
        self::assertFalse($genomicRegion->intersectsWithGenomicRegion(GenomicRegion::parse('chr12:20-30')));
    }
}