-
Notifications
You must be signed in to change notification settings - Fork 0
Ontology rendering #113
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Ontology rendering #113
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,175 @@ | ||
| from enum import Enum | ||
| from typing import Callable | ||
|
|
||
| import pymupdf | ||
| from pydantic import BaseModel, model_validator | ||
|
|
||
| from .ontology import Cited, PoliceReport, PoliceReportParseResult, SourceChunk | ||
|
|
||
|
|
||
class OntoPainterMark(Enum):
    """Kinds of mark that can be configured for a painted field."""

    # Rectangle mark; the only member defined here.
    RECT = "RECT"
|
|
||
|
|
||
# A callable that takes a PoliceReport and returns the list of Cited values to paint.
FieldAccessor = Callable[[PoliceReport], list[Cited]]
|
|
||
|
|
||
class OntoPainterFieldConfig(BaseModel):
    """Configuration describing how one report field is selected and marked.

    Exactly one of `field` (an attribute name looked up on the report) or
    `accessor` (a callable applied to the report) must be provided.
    """

    field: str | None = None  # TODO - validate against PoliceReport fields
    label: str | None = None
    mark: OntoPainterMark
    # Color triples; presumably RGB components in the 0..1 range - TODO confirm.
    fill: tuple[float, float, float] | None = None
    stroke: tuple[float, float, float] | None = None
    stroke_width: float = 0
    accessor: FieldAccessor | None = None

    @model_validator(mode="after")
    def validate_field_accessor(self) -> "OntoPainterFieldConfig":
        """Either field or accessor must be set, but not both."""
        has_field = self.field is not None
        has_accessor = self.accessor is not None
        # Both set or both missing are equally invalid.
        if has_field == has_accessor:
            raise ValueError("Either field or accessor must be set, but not both.")
        return self

    def get_value(self, report: PoliceReport) -> list[Cited]:
        """Return the values selected by this config as a flat list of `Cited`.

        The value is read from the report attribute named by `field` when it is
        set; otherwise it is produced by calling `accessor` with the report.

        Returns:
            An empty list for `None`, a one-item list for a single `Cited`, or
            the input list with its `None` entries removed.

        Raises:
            ValueError: If neither `field` nor `accessor` is usable, or if the
                selected value is not `None`, a `Cited`, or a list.
        """
        raw: Cited | list[Cited | None] | list[Cited] | None
        if self.field is not None:
            raw = getattr(report, self.field)
        elif self.accessor:
            raw = self.accessor(report)
        else:
            raise ValueError("Accessor is required if field is not set.")

        # Missing value: nothing to paint.
        if raw is None:
            return []
        # Single value: wrap it so callers always iterate a list.
        if isinstance(raw, Cited):
            return [raw]
        # List that may contain optional entries: keep only real values.
        if isinstance(raw, list):
            return [item for item in raw if item is not None]
        raise ValueError(f"Unexpected type: {type(raw)}")
|
|
||
|
|
||
| class OntoPainter(BaseModel): | ||
| fields: list[OntoPainterFieldConfig] | ||
|
|
||
| def paint( | ||
| self, | ||
| pdf: str | pymupdf.Document, | ||
| parse_result: PoliceReportParseResult, | ||
| pages: str | None = None, | ||
| ) -> pymupdf.Document: | ||
| """Paint a document annotated with the parse result.""" | ||
| # 1. Load the requested pages from the input path / doc | ||
| doc = self._load_pdf(pdf, pages) | ||
|
|
||
| # 2. Loop over field configs and paint each field. | ||
| for field_config in self.fields: | ||
| field_values = field_config.get_value(parse_result.report) | ||
| for i, field_value in enumerate(field_values): | ||
| for j, chunk_id in enumerate(field_value.ids): | ||
| chunk = parse_result.chunks[chunk_id] | ||
| self._paint_field( | ||
| doc, | ||
| field_config, | ||
| chunk, | ||
| label=f"{field_config.label} {i + 1}-{j + 1}", | ||
| ) | ||
|
|
||
| return doc | ||
|
|
||
| def _paint_field( | ||
| self, | ||
| doc: pymupdf.Document, | ||
| field_config: OntoPainterFieldConfig, | ||
| chunk: SourceChunk, | ||
| label: str | None = None, | ||
| ) -> None: | ||
| """Paint a field on a document.""" | ||
| match field_config.mark: | ||
| case OntoPainterMark.RECT: | ||
| self._paint_rect(doc, field_config, chunk) | ||
| case _: | ||
| raise ValueError(f"Unsupported mark: {field_config.mark}") | ||
| if label: | ||
| page = doc.load_page(chunk.regions[0].page) | ||
| page_width, page_height = page.mediabox[2:] | ||
| scaled_points = [ | ||
| (p[0] * page_width, p[1] * page_height) for p in chunk.regions[0].points | ||
| ] | ||
| x, y = scaled_points[0] | ||
| # Offset to avoid overlapping with bounding rectangle | ||
| y -= 2 | ||
| page.insert_text( | ||
| (x, y), label, fontsize=5, fill=field_config.stroke, color=(1, 1, 1) | ||
| ) | ||
|
|
||
| def _paint_rect( | ||
| self, | ||
| doc: pymupdf.Document, | ||
| field_config: OntoPainterFieldConfig, | ||
| chunk: SourceChunk, | ||
| ) -> None: | ||
| """Paint a rectangle on a document.""" | ||
| for region in chunk.regions: | ||
| # The coordinates come normalized in (0, 1) space. Project into page coords. | ||
| page = doc.load_page(region.page) | ||
| page_width, page_height = page.mediabox[2:] | ||
| shape = page.new_shape() | ||
| scaled_points = [ | ||
| (p[0] * page_width, p[1] * page_height) for p in region.points | ||
| ] | ||
| shape.draw_rect(pymupdf.Quad(*scaled_points).rect) | ||
| shape.finish(color=field_config.stroke, width=field_config.stroke_width) | ||
| shape.commit() | ||
|
|
||
| def _load_pdf( | ||
| self, doc: str | pymupdf.Document, pages: str | None = None | ||
| ) -> pymupdf.Document: | ||
| """Load a PDF document.""" | ||
| if isinstance(doc, str): | ||
| with open(doc, "rb") as f: | ||
| pdf_doc = pymupdf.open(f) | ||
| else: | ||
| pdf_doc = doc | ||
|
|
||
| filter_pages = _parse_pages_range(pages) | ||
| if filter_pages: | ||
| pdf_doc.select(filter_pages) | ||
|
|
||
| return pdf_doc | ||
|
|
||
|
|
||
| def _parse_pages_range(pages: str | None = None) -> list[int] | None: | ||
| """Parse page range specification as a list of page numbers. | ||
|
|
||
| If no spec is given, return None. | ||
|
|
||
| Spec looks like: | ||
| 1 Single page | ||
| 1-3 Range of pages | ||
| 1,2,3 List of pages | ||
| 1-3,5 Range and list of pages | ||
|
|
||
| Args: | ||
| pages: The page range specification, 1-indexed. | ||
|
|
||
| Returns: | ||
| A list of page numbers (0-indexed). | ||
| """ | ||
| if pages is None: | ||
| return None | ||
| page_list = list[int]() | ||
| for segment in pages.split(","): | ||
| if "-" in segment: | ||
| start, end = segment.split("-") | ||
| page_list.extend(range(int(start.strip()), int(end.strip()) + 1)) | ||
| else: | ||
| page_list.append(int(segment.strip())) | ||
| # Clean up duplicates and sort. | ||
| return sorted([x - 1 for x in set(page_list)]) | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
class Palette:
    """Named color constants as three-component float tuples.

    Each hue comes in three shades, numbered 1 to 3. Every constant has a
    distinct value so fields painted with different names stay distinguishable.
    Components are presumably RGB in the 0..1 range - TODO confirm with callers.
    """

    Red1 = (0.9, 0.2, 0.1)
    Red2 = (0.8, 0.1, 0.0)
    Red3 = (0.7, 0.0, 0.0)
    Orange1 = (0.9, 0.5, 0.0)
    Orange2 = (0.8, 0.4, 0.0)
    Orange3 = (0.7, 0.3, 0.0)
    Yellow1 = (0.9, 0.9, 0.0)
    Yellow2 = (0.8, 0.8, 0.0)
    Yellow3 = (0.7, 0.7, 0.0)
    Green1 = (0.2, 0.9, 0.1)
    Green2 = (0.1, 0.8, 0.0)
    Green3 = (0.0, 0.7, 0.0)
    Blue1 = (0.0, 0.5, 1.0)
    Blue2 = (0.0, 0.4, 0.8)
    Blue3 = (0.0, 0.3, 0.7)
    Purple1 = (0.9, 0.0, 0.9)
    Purple2 = (0.8, 0.0, 0.8)
    Purple3 = (0.7, 0.0, 0.7)
    # Previously identical to Purple1-3; given their own values.
    Pink1 = (0.9, 0.4, 0.7)
    Pink2 = (0.8, 0.3, 0.6)
    Pink3 = (0.7, 0.2, 0.5)
    Brown1 = (0.5, 0.25, 0.0)
    Brown2 = (0.4, 0.2, 0.0)
    Brown3 = (0.3, 0.15, 0.0)
    Cyan1 = (0.0, 0.9, 0.9)
    Cyan2 = (0.0, 0.8, 0.8)
    Cyan3 = (0.0, 0.7, 0.7)
    # Previously identical to Yellow1-3; given their own values.
    Lime1 = (0.6, 0.9, 0.0)
    Lime2 = (0.5, 0.8, 0.0)
    Lime3 = (0.4, 0.7, 0.0)
    Maroon1 = (0.5, 0.0, 0.0)
    Maroon2 = (0.4, 0.0, 0.0)
    Maroon3 = (0.3, 0.0, 0.0)
    Gray1 = (0.5, 0.5, 0.5)
    Gray2 = (0.4, 0.4, 0.4)
    Gray3 = (0.3, 0.3, 0.3)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| from abc import ABC, abstractmethod | ||
| from typing import Generic, TypeVar | ||
|
|
||
| from ..common.context import Context | ||
| from ..common.file import MemoryFile | ||
| from ..common.preprocess import PreprocessMixin | ||
|
|
||
# Type of the value produced by `preprocess` and consumed by `paint`.
T = TypeVar("T")
|
|
||
|
|
||
class BasePainter(ABC, Generic[T], PreprocessMixin[T]):
    """Abstract callable that paints analysis data onto the original input file."""

    def __call__(self, file: MemoryFile, context: Context) -> MemoryFile:
        """Preprocess the piped file, then paint the original input with it.

        `file` is the primary pipe value (e.g. a serialized ontology result).
        The original input file is read from `context.input_file`.
        """
        analysis = self.preprocess(file)
        source_file = context.input_file
        return self.paint(source_file, analysis)

    @abstractmethod
    def paint(self, original: MemoryFile, data: T) -> MemoryFile:
        """Return an annotated version of `original` based on `data`."""
        ...
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Bug: `pymupdf.Quad(*scaled_points)` crashes if `region.points` has anything other than exactly 4 points.
Severity: MEDIUM
Suggested Fix
Clamp `scaled_points` to exactly 4 points (e.g., use `scaled_points[:4]`) or construct the rect directly via `pymupdf.Rect(min_x, min_y, max_x, max_y)` computed from all points, avoiding the `Quad` constraint entirely.
Prompt for AI Agent