From 2af581d3a6087b304369f37ca30e0efbf59a56e2 Mon Sep 17 00:00:00 2001 From: PrinsFrank <25006490+PrinsFrank@users.noreply.github.com> Date: Mon, 1 Dec 2025 20:46:10 +0100 Subject: [PATCH 1/3] Consider scaling when sorting text --- src/Document/ContentStream/ContentStream.php | 8 ++--- .../PositionedText/TransformationMatrix.php | 8 +++++ .../TransformationMatrixTest.php | 32 +++++++++++++++++++ 3 files changed, 44 insertions(+), 4 deletions(-) create mode 100644 tests/Unit/Document/ContentStream/PositionedText/TransformationMatrixTest.php diff --git a/src/Document/ContentStream/ContentStream.php b/src/Document/ContentStream/ContentStream.php index 2d01cfe5..f1f3c254 100644 --- a/src/Document/ContentStream/ContentStream.php +++ b/src/Document/ContentStream/ContentStream.php @@ -68,11 +68,11 @@ public function getPositionedTextElements(): array { usort( $positionedTextElements, static function (PositionedTextElement $a, PositionedTextElement $b): int { - if (($differenceY = $b->absoluteMatrix->offsetY <=> $a->absoluteMatrix->offsetY) !== 0) { + if (($differenceY = $b->absoluteMatrix->getAbsoluteY() <=> $a->absoluteMatrix->getAbsoluteY()) !== 0) { return $differenceY; } - return $a->absoluteMatrix->offsetX <=> $b->absoluteMatrix->offsetX; + return $a->absoluteMatrix->getAbsoluteX() <=> $b->absoluteMatrix->getAbsoluteX(); } ); @@ -85,9 +85,9 @@ public function getText(Document $document, Page $page): string { $previousPositionedTextElement = null; foreach ($this->getPositionedTextElements() as $positionedTextElement) { if ($previousPositionedTextElement !== null) { - if ($previousPositionedTextElement->absoluteMatrix->offsetY !== $positionedTextElement->absoluteMatrix->offsetY) { + if ($previousPositionedTextElement->absoluteMatrix->getAbsoluteY() !== $positionedTextElement->absoluteMatrix->getAbsoluteY()) { $text .= "\n"; - } elseif (($positionedTextElement->absoluteMatrix->offsetX - $previousPositionedTextElement->absoluteMatrix->offsetX - $positionedTextElement->getFont($document, $page)->getWidthForChars($previousPositionedTextElement->getCodePoints(), $previousPositionedTextElement->textState, $previousPositionedTextElement->absoluteMatrix)) >= ($previousPositionedTextElement->textState->fontSize ?? 10) * $previousPositionedTextElement->absoluteMatrix->scaleX * 0.40) { + } elseif (($positionedTextElement->absoluteMatrix->getAbsoluteX() - $previousPositionedTextElement->absoluteMatrix->getAbsoluteX() - $positionedTextElement->getFont($document, $page)->getWidthForChars($previousPositionedTextElement->getCodePoints(), $previousPositionedTextElement->textState, $previousPositionedTextElement->absoluteMatrix)) >= ($previousPositionedTextElement->textState->fontSize ?? 10) * $previousPositionedTextElement->absoluteMatrix->scaleX * 0.40) { $text .= ' '; } } diff --git a/src/Document/ContentStream/PositionedText/TransformationMatrix.php b/src/Document/ContentStream/PositionedText/TransformationMatrix.php index 0145b7cf..9d447ed0 100644 --- a/src/Document/ContentStream/PositionedText/TransformationMatrix.php +++ b/src/Document/ContentStream/PositionedText/TransformationMatrix.php @@ -13,6 +13,14 @@ public function __construct( ) { } + public function getAbsoluteY(): float { + return $this->offsetY * $this->scaleY; + } + + public function getAbsoluteX(): float { + return $this->offsetX * $this->scaleX; + } + /** Please note that a concatenated transformation matrix of A B !== B A */ public function multiplyWith(self $other): self { return new self( diff --git a/tests/Unit/Document/ContentStream/PositionedText/TransformationMatrixTest.php b/tests/Unit/Document/ContentStream/PositionedText/TransformationMatrixTest.php new file mode 100644 index 00000000..ce7996df --- /dev/null +++ b/tests/Unit/Document/ContentStream/PositionedText/TransformationMatrixTest.php @@ -0,0 +1,32 @@ +getAbsoluteY() + ); + static::assertSame( + 42.0, + (new TransformationMatrix(10, 10, 10, 6, 10, 7))->getAbsoluteY() + ); + } + + public function testGetAbsoluteX(): void { + static::assertSame( + 42.0, + (new TransformationMatrix(6, 0, 0, 0, 7, 0))->getAbsoluteX() + ); + static::assertSame( + 42.0, + (new TransformationMatrix(6, 10, 10, 10, 7, 10))->getAbsoluteX() + ); + } +} From 4d124cd42d2dcf35950cb27daaaa1cc92a9fe6a0 Mon Sep 17 00:00:00 2001 From: PrinsFrank <25006490+PrinsFrank@users.noreply.github.com> Date: Mon, 1 Dec 2025 20:55:05 +0100 Subject: [PATCH 2/3] Retune automatic space insertion between text objects and don't add spaces when previous object does end with them --- src/Document/ContentStream/ContentStream.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Document/ContentStream/ContentStream.php b/src/Document/ContentStream/ContentStream.php index f1f3c254..07f31e56 100644 --- a/src/Document/ContentStream/ContentStream.php +++ b/src/Document/ContentStream/ContentStream.php @@ -87,7 +87,7 @@ public function getText(Document $document, Page $page): string { if ($previousPositionedTextElement !== null) { if ($previousPositionedTextElement->absoluteMatrix->getAbsoluteY() !== $positionedTextElement->absoluteMatrix->getAbsoluteY()) { $text .= "\n"; - } elseif (($positionedTextElement->absoluteMatrix->getAbsoluteX() - $previousPositionedTextElement->absoluteMatrix->getAbsoluteX() - $positionedTextElement->getFont($document, $page)->getWidthForChars($previousPositionedTextElement->getCodePoints(), $previousPositionedTextElement->textState, $previousPositionedTextElement->absoluteMatrix)) >= ($previousPositionedTextElement->textState->fontSize ?? 10) * $previousPositionedTextElement->absoluteMatrix->scaleX * 0.40) { + } elseif (($positionedTextElement->absoluteMatrix->getAbsoluteX() - $previousPositionedTextElement->absoluteMatrix->getAbsoluteX() - $positionedTextElement->getFont($document, $page)->getWidthForChars($previousPositionedTextElement->getCodePoints(), $previousPositionedTextElement->textState, $previousPositionedTextElement->absoluteMatrix)) >= ($previousPositionedTextElement->textState->fontSize ?? 10) * $previousPositionedTextElement->absoluteMatrix->scaleX * 0.20 && str_ends_with($text, ' ') === false) { $text .= ' '; } } From dd70ae5e1cb6e0a97b7bfbdaaf40d98e80007426 Mon Sep 17 00:00:00 2001 From: PrinsFrank <25006490+PrinsFrank@users.noreply.github.com> Date: Mon, 1 Dec 2025 21:21:19 +0100 Subject: [PATCH 3/3] Consider variability when sorting items to account for small differences in baselines --- src/Document/ContentStream/ContentStream.php | 39 ++++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/src/Document/ContentStream/ContentStream.php b/src/Document/ContentStream/ContentStream.php index 07f31e56..8a381644 100644 --- a/src/Document/ContentStream/ContentStream.php +++ b/src/Document/ContentStream/ContentStream.php @@ -65,27 +65,44 @@ public function getPositionedTextElements(): array { } } + return $positionedTextElements; + } + + /** @throws PdfParserException */ + public function getText(Document $document, Page $page): string { + $positionedTextElements = $this->getPositionedTextElements(); + if ($positionedTextElements === []) { + return ''; + } + + $lowestY = min(array_map(static fn(PositionedTextElement $positionedTextElement): float => $positionedTextElement->absoluteMatrix->offsetY * $positionedTextElement->absoluteMatrix->scaleY, $positionedTextElements)); + $highestY = max(array_map(static fn(PositionedTextElement $positionedTextElement): float => $positionedTextElement->absoluteMatrix->offsetY * $positionedTextElement->absoluteMatrix->scaleY, $positionedTextElements)); + $variabilityY = $lowestY !== $highestY + ? ($highestY - $lowestY) / 200 + : 0; + usort( $positionedTextElements, - static function (PositionedTextElement $a, PositionedTextElement $b): int { - if (($differenceY = $b->absoluteMatrix->getAbsoluteY() <=> $a->absoluteMatrix->getAbsoluteY()) !== 0) { - return $differenceY; + static function (PositionedTextElement $a, PositionedTextElement $b) use ($variabilityY): int { + $differenceY = $b->absoluteMatrix->offsetY * $b->absoluteMatrix->scaleY - $a->absoluteMatrix->offsetY * $a->absoluteMatrix->scaleY; + if ($differenceY > $variabilityY) { + return 1; } - return $a->absoluteMatrix->getAbsoluteX() <=> $b->absoluteMatrix->getAbsoluteX(); + if ($differenceY < -$variabilityY) { + return -1; + } + + return $a->absoluteMatrix->offsetX <=> $b->absoluteMatrix->offsetX; } ); - return $positionedTextElements; - } - - /** @throws PdfParserException */ - public function getText(Document $document, Page $page): string { $text = ''; $previousPositionedTextElement = null; - foreach ($this->getPositionedTextElements() as $positionedTextElement) { + foreach ($positionedTextElements as $positionedTextElement) { if ($previousPositionedTextElement !== null) { - if ($previousPositionedTextElement->absoluteMatrix->getAbsoluteY() !== $positionedTextElement->absoluteMatrix->getAbsoluteY()) { + $diffY = $previousPositionedTextElement->absoluteMatrix->offsetY * $previousPositionedTextElement->absoluteMatrix->scaleY - $positionedTextElement->absoluteMatrix->offsetY * $positionedTextElement->absoluteMatrix->scaleY; + if ($diffY > $variabilityY || $diffY < -$variabilityY) { $text .= "\n"; } elseif (($positionedTextElement->absoluteMatrix->getAbsoluteX() - $previousPositionedTextElement->absoluteMatrix->getAbsoluteX() - $positionedTextElement->getFont($document, $page)->getWidthForChars($previousPositionedTextElement->getCodePoints(), $previousPositionedTextElement->textState, $previousPositionedTextElement->absoluteMatrix)) >= ($previousPositionedTextElement->textState->fontSize ?? 10) * $previousPositionedTextElement->absoluteMatrix->scaleX * 0.20 && str_ends_with($text, ' ') === false) { $text .= ' ';