Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 28 additions & 11 deletions src/Document/ContentStream/ContentStream.php
Original file line number Diff line number Diff line change
Expand Up @@ -65,29 +65,46 @@ public function getPositionedTextElements(): array {
}
}

return $positionedTextElements;
}

/** @throws PdfParserException */
public function getText(Document $document, Page $page): string {
$positionedTextElements = $this->getPositionedTextElements();
if ($positionedTextElements === []) {
return '';
}

$lowestY = min(array_map(static fn(PositionedTextElement $positionedTextElement): float => $positionedTextElement->absoluteMatrix->offsetY * $positionedTextElement->absoluteMatrix->scaleY, $positionedTextElements));
$highestY = max(array_map(static fn(PositionedTextElement $positionedTextElement): float => $positionedTextElement->absoluteMatrix->offsetY * $positionedTextElement->absoluteMatrix->scaleY, $positionedTextElements));
$variabilityY = $lowestY !== $highestY
? ($highestY - $lowestY) / 200
: 0;

usort(
$positionedTextElements,
static function (PositionedTextElement $a, PositionedTextElement $b): int {
if (($differenceY = $b->absoluteMatrix->offsetY <=> $a->absoluteMatrix->offsetY) !== 0) {
return $differenceY;
static function (PositionedTextElement $a, PositionedTextElement $b) use ($variabilityY): int {
$differenceY = $b->absoluteMatrix->offsetY * $b->absoluteMatrix->scaleY - $a->absoluteMatrix->offsetY * $a->absoluteMatrix->scaleY;
if ($differenceY > $variabilityY) {
return 1;
}

if ($differenceY < -$variabilityY) {
return -1;
}

return $a->absoluteMatrix->offsetX <=> $b->absoluteMatrix->offsetX;
}
);

return $positionedTextElements;
}

/** @throws PdfParserException */
public function getText(Document $document, Page $page): string {
$text = '';
$previousPositionedTextElement = null;
foreach ($this->getPositionedTextElements() as $positionedTextElement) {
foreach ($positionedTextElements as $positionedTextElement) {
if ($previousPositionedTextElement !== null) {
if ($previousPositionedTextElement->absoluteMatrix->offsetY !== $positionedTextElement->absoluteMatrix->offsetY) {
$diffY = $previousPositionedTextElement->absoluteMatrix->offsetY * $previousPositionedTextElement->absoluteMatrix->scaleY - $positionedTextElement->absoluteMatrix->offsetY * $positionedTextElement->absoluteMatrix->scaleY;
if ($diffY > $variabilityY || $diffY < -$variabilityY) {
$text .= "\n";
} elseif (($positionedTextElement->absoluteMatrix->offsetX - $previousPositionedTextElement->absoluteMatrix->offsetX - $positionedTextElement->getFont($document, $page)->getWidthForChars($previousPositionedTextElement->getCodePoints(), $previousPositionedTextElement->textState, $previousPositionedTextElement->absoluteMatrix)) >= ($previousPositionedTextElement->textState->fontSize ?? 10) * $previousPositionedTextElement->absoluteMatrix->scaleX * 0.40) {
} elseif (($positionedTextElement->absoluteMatrix->getAbsoluteX() - $previousPositionedTextElement->absoluteMatrix->getAbsoluteX() - $positionedTextElement->getFont($document, $page)->getWidthForChars($previousPositionedTextElement->getCodePoints(), $previousPositionedTextElement->textState, $previousPositionedTextElement->absoluteMatrix)) >= ($previousPositionedTextElement->textState->fontSize ?? 10) * $previousPositionedTextElement->absoluteMatrix->scaleX * 0.20 && str_ends_with($text, ' ') === false) {
$text .= ' ';
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@ public function __construct(
) {
}

/**
 * Returns the vertical offset of this matrix multiplied by its vertical scale.
 *
 * @return float product of the offsetY and scaleY components
 */
public function getAbsoluteY(): float {
    return $this->scaleY * $this->offsetY;
}

/**
 * Returns the horizontal offset of this matrix multiplied by its horizontal scale.
 *
 * @return float product of the offsetX and scaleX components
 */
public function getAbsoluteX(): float {
    return $this->scaleX * $this->offsetX;
}

/** Please note that a concatenated transformation matrix of A B !== B A */
public function multiplyWith(self $other): self {
return new self(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<?php declare(strict_types=1);

namespace PrinsFrank\PdfParser\Tests\Unit\Document\ContentStream\PositionedText;

use PHPUnit\Framework\Attributes\CoversClass;
use PHPUnit\Framework\TestCase;
use PrinsFrank\PdfParser\Document\ContentStream\PositionedText\TransformationMatrix;

#[CoversClass(TransformationMatrix::class)]
class TransformationMatrixTest extends TestCase {
    /**
     * getAbsoluteY() must yield 6 * 7 = 42.0 from the fourth and sixth constructor
     * arguments, regardless of whether the remaining arguments are zero or non-zero.
     */
    public function testGetAbsoluteY(): void {
        // Only the two arguments that feed the Y computation are non-zero.
        $remainingArgumentsZero = new TransformationMatrix(0, 0, 0, 6, 0, 7);
        static::assertSame(42.0, $remainingArgumentsZero->getAbsoluteY());

        // Same two arguments, but every other argument is set to 10: the result must not change.
        $remainingArgumentsSet = new TransformationMatrix(10, 10, 10, 6, 10, 7);
        static::assertSame(42.0, $remainingArgumentsSet->getAbsoluteY());
    }

    /**
     * getAbsoluteX() must yield 6 * 7 = 42.0 from the first and fifth constructor
     * arguments, regardless of whether the remaining arguments are zero or non-zero.
     */
    public function testGetAbsoluteX(): void {
        // Only the two arguments that feed the X computation are non-zero.
        $remainingArgumentsZero = new TransformationMatrix(6, 0, 0, 0, 7, 0);
        static::assertSame(42.0, $remainingArgumentsZero->getAbsoluteX());

        // Same two arguments, but every other argument is set to 10: the result must not change.
        $remainingArgumentsSet = new TransformationMatrix(6, 10, 10, 10, 7, 10);
        static::assertSame(42.0, $remainingArgumentsSet->getAbsoluteX());
    }
}
Loading