From 7ec16c1efe73d0e3d4b52a9893ab8d48cc523603 Mon Sep 17 00:00:00 2001 From: Joe Nudell Date: Wed, 15 Apr 2026 12:04:01 -0400 Subject: [PATCH 1/3] initial sketch of onto paint module, with changes to support this --- bc2/core/common/ontology.py | 39 ----- bc2/core/common/palette.py | 37 ++++ bc2/core/input/azureblob.py | 2 +- bc2/core/input/base.py | 11 +- bc2/core/input/file.py | 2 +- bc2/core/input/memory.py | 2 +- bc2/core/input/stdin.py | 2 +- bc2/core/paint/__init__.py | 0 bc2/core/paint/base.py | 20 +++ bc2/core/paint/ontology.py | 0 bc2/core/paint/ontopainter.py | 309 ++++++++++++++++++++++++++++++++++ 11 files changed, 380 insertions(+), 44 deletions(-) create mode 100644 bc2/core/common/palette.py create mode 100644 bc2/core/paint/__init__.py create mode 100644 bc2/core/paint/base.py create mode 100644 bc2/core/paint/ontology.py create mode 100644 bc2/core/paint/ontopainter.py diff --git a/bc2/core/common/ontology.py b/bc2/core/common/ontology.py index 797c801..dcff647 100644 --- a/bc2/core/common/ontology.py +++ b/bc2/core/common/ontology.py @@ -66,42 +66,3 @@ class KeyValuePair(TypedDict): Table = list[dict[str, str]] - - -class Palette: - Red1 = (0.9, 0.2, 0.1) - Red2 = (0.8, 0.1, 0.0) - Red3 = (0.7, 0.0, 0.0) - Orange1 = (0.9, 0.5, 0.0) - Orange2 = (0.8, 0.4, 0.0) - Orange3 = (0.7, 0.3, 0.0) - Yellow1 = (0.9, 0.9, 0.0) - Yellow2 = (0.8, 0.8, 0.0) - Yellow3 = (0.7, 0.7, 0.0) - Green1 = (0.2, 0.9, 0.1) - Green2 = (0.1, 0.8, 0.0) - Green3 = (0.0, 0.7, 0.0) - Blue1 = (0, 0.5, 1) - Blue2 = (0.0, 0.4, 0.8) - Blue3 = (0.0, 0.3, 0.7) - Purple1 = (0.9, 0.0, 0.9) - Purple2 = (0.8, 0.0, 0.8) - Purple3 = (0.7, 0.0, 0.7) - Pink1 = (0.9, 0.0, 0.9) - Pink2 = (0.8, 0.0, 0.8) - Pink3 = (0.7, 0.0, 0.7) - Brown1 = (0.5, 0.25, 0.0) - Brown2 = (0.4, 0.2, 0.0) - Brown3 = (0.3, 0.15, 0.0) - Cyan1 = (0.0, 0.9, 0.9) - Cyan2 = (0.0, 0.8, 0.8) - Cyan3 = (0.0, 0.7, 0.7) - Lime1 = (0.9, 0.9, 0.0) - Lime2 = (0.8, 0.8, 0.0) - Lime3 = (0.7, 0.7, 0.0) - Maroon1 = (0.5, 0.0, 0.0) - Maroon2 = (0.4, 0.0, 0.0) - Maroon3 = (0.3, 0.0, 0.0) - Gray1 = (0.5, 0.5, 0.5) - Gray2 = (0.4, 0.4, 0.4) - Gray3 = (0.3, 0.3, 0.3) diff --git a/bc2/core/common/palette.py b/bc2/core/common/palette.py new file mode 100644 index 0000000..deb93cd --- /dev/null +++ b/bc2/core/common/palette.py @@ -0,0 +1,37 @@ +class Palette: + Red1 = (0.9, 0.2, 0.1) + Red2 = (0.8, 0.1, 0.0) + Red3 = (0.7, 0.0, 0.0) + Orange1 = (0.9, 0.5, 0.0) + Orange2 = (0.8, 0.4, 0.0) + Orange3 = (0.7, 0.3, 0.0) + Yellow1 = (0.9, 0.9, 0.0) + Yellow2 = (0.8, 0.8, 0.0) + Yellow3 = (0.7, 0.7, 0.0) + Green1 = (0.2, 0.9, 0.1) + Green2 = (0.1, 0.8, 0.0) + Green3 = (0.0, 0.7, 0.0) + Blue1 = (0, 0.5, 1) + Blue2 = (0.0, 0.4, 0.8) + Blue3 = (0.0, 0.3, 0.7) + Purple1 = (0.9, 0.0, 0.9) + Purple2 = (0.8, 0.0, 0.8) + Purple3 = (0.7, 0.0, 0.7) + Pink1 = (0.9, 0.0, 0.9) + Pink2 = (0.8, 0.0, 0.8) + Pink3 = (0.7, 0.0, 0.7) + Brown1 = (0.5, 0.25, 0.0) + Brown2 = (0.4, 0.2, 0.0) + Brown3 = (0.3, 0.15, 0.0) + Cyan1 = (0.0, 0.9, 0.9) + Cyan2 = (0.0, 0.8, 0.8) + Cyan3 = (0.0, 0.7, 0.7) + Lime1 = (0.9, 0.9, 0.0) + Lime2 = (0.8, 0.8, 0.0) + Lime3 = (0.7, 0.7, 0.0) + Maroon1 = (0.5, 0.0, 0.0) + Maroon2 = (0.4, 0.0, 0.0) + Maroon3 = (0.3, 0.0, 0.0) + Gray1 = (0.5, 0.5, 0.5) + Gray2 = (0.4, 0.4, 0.4) + Gray3 = (0.3, 0.3, 0.3) diff --git a/bc2/core/input/azureblob.py b/bc2/core/input/azureblob.py index b311f3e..ae1f652 100644 --- a/bc2/core/input/azureblob.py +++ b/bc2/core/input/azureblob.py @@ -21,7 +21,7 @@ class AzureBlobInput(BaseInputDriver, AzureBlobDriver): def __init__(self, config: AzureBlobInputConfig): self.init_client(config) - def __call__(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile: + def load_file(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile: """Read from an Azure Blob.""" f = MemoryFile() full_path = f"{self.config.prefix}{path}" diff --git a/bc2/core/input/base.py b/bc2/core/input/base.py index 34b0be1..49a68c9 100644 --- a/bc2/core/input/base.py +++ b/bc2/core/input/base.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from typing import Literal +from ..common.context import Context from ..common.file import MemoryFile @@ -16,7 +17,15 @@ class BaseInputDriver(ABC): required: list[Literal["path"] | Literal["buffer"]] = [] - @abstractmethod def __call__( + self, context: Context, path: str = "", buffer: io.BytesIO | None = None + ) -> MemoryFile: + """Load a file from a path or buffer.""" + f = self.load_file(path=path, buffer=buffer) + context.input_file = f + return f + + @abstractmethod + def load_file( self, path: str = "", buffer: io.BytesIO | None = None ) -> MemoryFile: ... diff --git a/bc2/core/input/file.py b/bc2/core/input/file.py index b2515bd..3faee04 100644 --- a/bc2/core/input/file.py +++ b/bc2/core/input/file.py @@ -23,7 +23,7 @@ def __init__(self, config: FileInputConfig): required = ["path"] - def __call__(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile: + def load_file(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile: """Read from a file.""" if not path: raise ValueError("Path is required for file input.") diff --git a/bc2/core/input/memory.py b/bc2/core/input/memory.py index c016baa..0c48b17 100644 --- a/bc2/core/input/memory.py +++ b/bc2/core/input/memory.py @@ -23,7 +23,7 @@ def __init__(self, config: MemoryInputConfig): required = ["buffer"] - def __call__(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile: + def load_file(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile: """Read from a buffer in memory.""" if not buffer: raise ValueError("Buffer is required for memory input.") diff --git a/bc2/core/input/stdin.py b/bc2/core/input/stdin.py index a538c21..5c08ded 100644 --- a/bc2/core/input/stdin.py +++ b/bc2/core/input/stdin.py @@ -22,7 +22,7 @@ class StdinInput(BaseInputDriver): def __init__(self, config: StdinInputConfig): self.config = config - def __call__(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile: + def load_file(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile: """Read from stdin.""" f = MemoryFile() # Consume all the stdin pipe and write it to memory diff --git a/bc2/core/paint/__init__.py b/bc2/core/paint/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bc2/core/paint/base.py b/bc2/core/paint/base.py new file mode 100644 index 0000000..6056bce --- /dev/null +++ b/bc2/core/paint/base.py @@ -0,0 +1,20 @@ +from abc import ABC, abstractmethod +from typing import Generic, TypeVar + +from ..common.context import Context +from ..common.file import MemoryFile +from ..common.preprocess import PreprocessMixin + +T = TypeVar("T") + + +class BasePainter(ABC, Generic[T], PreprocessMixin[T]): + def __call__(self, file: MemoryFile, context: Context) -> MemoryFile: + """Paint a file, returning an annotated version.""" + current = self.preprocess(file) + return self.paint(context.input_file, current) + + @abstractmethod + def paint(self, original: MemoryFile, data: T) -> MemoryFile: + """Paint the preprocessed input, returning an annotated MemoryFile.""" + ... diff --git a/bc2/core/paint/ontology.py b/bc2/core/paint/ontology.py new file mode 100644 index 0000000..e69de29 diff --git a/bc2/core/paint/ontopainter.py b/bc2/core/paint/ontopainter.py new file mode 100644 index 0000000..005b815 --- /dev/null +++ b/bc2/core/paint/ontopainter.py @@ -0,0 +1,309 @@ +from enum import Enum +from typing import Callable + +import pymupdf +from pydantic import BaseModel, model_validator + +from ..common.ontology import Cited, PoliceReport, PoliceReportParseResult, SourceChunk +from ..common.palette import Palette + + +class OntoPainterMark(Enum): + RECT = "RECT" + + +FieldAccessor = Callable[[PoliceReport], list[Cited]] + + +class OntoPainterFieldConfig(BaseModel): + field: str | None = None # TODO - validate against PoliceReport fields + label: str | None = None + mark: OntoPainterMark + fill: tuple[float, float, float] | None = None + stroke: tuple[float, float, float] | None = None + stroke_width: float = 0 + accessor: FieldAccessor | None = None + + @model_validator(mode="after") + def validate_field_accessor(self) -> "OntoPainterFieldConfig": + """Either field or accessor must be set, but not both.""" + if not ((self.field is None) ^ (self.accessor is None)): + raise ValueError("Either field or accessor must be set, but not both.") + return self + + def get_value(self, report: PoliceReport) -> list[Cited]: + """Get value of a field from the report. + + Use the `field` attribute if set, otherwise use the `accessor` function. + """ + v: Cited | list[Cited | None] | list[Cited] | None = None + if self.field is None: + if not self.accessor: + raise ValueError("Accessor is required if field is not set.") + v = self.accessor(report) + else: + v = getattr(report, self.field) + + # Normalize none into an empty list. + if v is None: + return [] + # Normalize singular value to a one-item list. + elif isinstance(v, Cited): + return [v] + # Normalize list of Optional values to a list of Cited values. + elif isinstance(v, list): + return [v for v in v if v is not None] + raise ValueError(f"Unexpected type: {type(v)}") + + +class OntoPainter(BaseModel): + fields: list[OntoPainterFieldConfig] + + def paint( + self, + pdf_path: str, + parse_result: PoliceReportParseResult, + pages: str | None = None, + ) -> pymupdf.Document: + """Paint a document annotated with the parse result.""" + # 1. Load the input path. + doc = self._load_pdf(pdf_path, pages) + + # 2. Loop over field configs and paint each field. + for field_config in self.fields: + field_values = field_config.get_value(parse_result.report) + for i, field_value in enumerate(field_values): + for j, chunk_id in enumerate(field_value.ids): + chunk = parse_result.chunks[chunk_id] + self._paint_field( + doc, + field_config, + chunk, + label=f"{field_config.label} {i + 1}-{j + 1}", + ) + + return doc + + def _paint_field( + self, + doc: pymupdf.Document, + field_config: OntoPainterFieldConfig, + chunk: SourceChunk, + label: str | None = None, + ) -> None: + """Paint a field on a document.""" + match field_config.mark: + case OntoPainterMark.RECT: + self._paint_rect(doc, field_config, chunk) + case _: + raise ValueError(f"Unsupported mark: {field_config.mark}") + if label: + page = doc.load_page(chunk.regions[0].page) + page_width, page_height = page.mediabox[2:] + scaled_points = [ + (p[0] * page_width, p[1] * page_height) for p in chunk.regions[0].points + ] + x, y = scaled_points[0] + # Offset to avoid overlapping with bounding rectangle + y -= 2 + page.insert_text( + (x, y), label, fontsize=5, fill=field_config.stroke, color=(1, 1, 1) + ) + + def _paint_rect( + self, + doc: pymupdf.Document, + field_config: OntoPainterFieldConfig, + chunk: SourceChunk, + ) -> None: + """Paint a rectangle on a document.""" + for region in chunk.regions: + # The coordinates come normalized in (0, 1) space. Project into page coords. + page = doc.load_page(chunk.regions[0].page) + page_width, page_height = page.mediabox[2:] + shape = page.new_shape() + scaled_points = [ + (p[0] * page_width, p[1] * page_height) for p in region.points + ] + shape.draw_rect(pymupdf.Quad(*scaled_points).rect) + shape.finish(color=field_config.stroke, width=field_config.stroke_width) + shape.commit() + + def _load_pdf(self, pdf_path: str, pages: str | None = None) -> pymupdf.Document: + """Load a PDF document.""" + with open(pdf_path, "rb") as f: + doc = pymupdf.open(f) + + filter_pages = _parse_pages_range(pages) + if filter_pages: + doc.select(filter_pages) + + return doc + + +def _parse_pages_range(pages: str | None = None) -> list[int] | None: + """Parse page range specification as a list of page numbers. + + If no spec is given, return None. + + Spec looks like: + 1 Single page + 1-3 Range of pages + 1,2,3 List of pages + 1-3,5 Range and list of pages + + Args: + pages: The page range specification, 1-indexed. + + Returns: + A list of page numbers (0-indexed). + """ + if pages is None: + return None + page_list = list[int]() + for segment in pages.split(","): + if "-" in segment: + start, end = segment.split("-") + page_list.extend(range(int(start.strip()), int(end.strip()) + 1)) + else: + page_list.append(int(segment.strip())) + # Clean up duplicates and sort. + return sorted([x - 1 for x in set(page_list)]) + + +default_onto_painter = OntoPainter( + fields=[ + OntoPainterFieldConfig( + field="case_number", + label="Case Number", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Red1, + stroke_width=2, + ), + OntoPainterFieldConfig( + field="location", + label="Location", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Green1, + stroke_width=2, + ), + OntoPainterFieldConfig( + field="incident_type", + label="Incident Type", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Blue1, + stroke_width=2, + ), + OntoPainterFieldConfig( + field="reporting_agency", + label="Reporting Agency", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Purple1, + stroke_width=2, + ), + OntoPainterFieldConfig( + field="narratives", + label="Narrative", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Blue1, + stroke_width=2, + ), + # Subject fields: + # type, name, address, phone, race, sex, dob + OntoPainterFieldConfig( + accessor=lambda report: [subject.type for subject in report.subjects], + label="Subject Type", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [subject.name for subject in report.subjects], + label="Subject", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [subject.address for subject in report.subjects], + label="Subject Address", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [subject.phone for subject in report.subjects], + label="Subject Phone", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [subject.race for subject in report.subjects], + label="Subject Race", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [subject.sex for subject in report.subjects], + label="Subject Sex", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [subject.dob for subject in report.subjects], + label="Subject DOB", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [subject.dob for subject in report.subjects], + label="Subject DOB", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + # Offense fields: + # crime, code + OntoPainterFieldConfig( + accessor=lambda report: [offense.crime for offense in report.offenses], + label="Offense Crime", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Orange1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [offense.code for offense in report.offenses], + label="Offense Code", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Orange1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [offense.statute for offense in report.offenses], + label="Offense Statute", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Orange1, + stroke_width=2, + ), + ] +) From 1fabdccb464a3f9976a5abfaab208566a5e6b8e4 Mon Sep 17 00:00:00 2001 From: Joe Nudell Date: Wed, 15 Apr 2026 12:40:40 -0400 Subject: [PATCH 2/3] refactor painter --- bc2/core/{paint => common}/ontopainter.py | 162 ++------------------ bc2/core/paint/base.py | 12 +- bc2/core/paint/ontology.py | 173 ++++++++++++++++++++++ 3 files changed, 195 insertions(+), 152 deletions(-) rename bc2/core/{paint => common}/ontopainter.py (52%) diff --git a/bc2/core/paint/ontopainter.py b/bc2/core/common/ontopainter.py similarity index 52% rename from bc2/core/paint/ontopainter.py rename to bc2/core/common/ontopainter.py index 005b815..5d2bf0d 100644 --- a/bc2/core/paint/ontopainter.py +++ b/bc2/core/common/ontopainter.py @@ -4,8 +4,7 @@ import pymupdf from pydantic import BaseModel, model_validator -from ..common.ontology import Cited, PoliceReport, PoliceReportParseResult, SourceChunk -from ..common.palette import Palette +from .ontology import Cited, PoliceReport, PoliceReportParseResult, SourceChunk class OntoPainterMark(Enum): @@ -61,13 +60,13 @@ class OntoPainter(BaseModel): def paint( self, - pdf_path: str, + pdf: str | pymupdf.Document, parse_result: PoliceReportParseResult, pages: str | None = None, ) -> pymupdf.Document: """Paint a document annotated with the parse result.""" - # 1. Load the input path. - doc = self._load_pdf(pdf_path, pages) + # 1. Load the requested pages from the input path / doc + doc = self._load_pdf(pdf, pages) # 2. Loop over field configs and paint each field. for field_config in self.fields: @@ -129,16 +128,21 @@ def _paint_rect( shape.finish(color=field_config.stroke, width=field_config.stroke_width) shape.commit() - def _load_pdf(self, pdf_path: str, pages: str | None = None) -> pymupdf.Document: + def _load_pdf( + self, doc: str | pymupdf.Document, pages: str | None = None + ) -> pymupdf.Document: """Load a PDF document.""" - with open(pdf_path, "rb") as f: - doc = pymupdf.open(f) + if isinstance(doc, str): + with open(doc, "rb") as f: + pdf_doc = pymupdf.open(f) + else: + pdf_doc = doc filter_pages = _parse_pages_range(pages) if filter_pages: - doc.select(filter_pages) + pdf_doc.select(filter_pages) - return doc + return pdf_doc def _parse_pages_range(pages: str | None = None) -> list[int] | None: @@ -169,141 +173,3 @@ def _parse_pages_range(pages: str | None = None) -> list[int] | None: page_list.append(int(segment.strip())) # Clean up duplicates and sort. return sorted([x - 1 for x in set(page_list)]) - - -default_onto_painter = OntoPainter( - fields=[ - OntoPainterFieldConfig( - field="case_number", - label="Case Number", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Red1, - stroke_width=2, - ), - OntoPainterFieldConfig( - field="location", - label="Location", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Green1, - stroke_width=2, - ), - OntoPainterFieldConfig( - field="incident_type", - label="Incident Type", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Blue1, - stroke_width=2, - ), - OntoPainterFieldConfig( - field="reporting_agency", - label="Reporting Agency", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Purple1, - stroke_width=2, - ), - OntoPainterFieldConfig( - field="narratives", - label="Narrative", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Blue1, - stroke_width=2, - ), - # Subject fields: - # type, name, address, phone, race, sex, dob - OntoPainterFieldConfig( - accessor=lambda report: [subject.type for subject in report.subjects], - label="Subject Type", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Cyan1, - stroke_width=2, - ), - OntoPainterFieldConfig( - accessor=lambda report: [subject.name for subject in report.subjects], - label="Subject", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Cyan1, - stroke_width=2, - ), - OntoPainterFieldConfig( - accessor=lambda report: [subject.address for subject in report.subjects], - label="Subject Address", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Cyan1, - stroke_width=2, - ), - OntoPainterFieldConfig( - accessor=lambda report: [subject.phone for subject in report.subjects], - label="Subject Phone", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Cyan1, - stroke_width=2, - ), - OntoPainterFieldConfig( - accessor=lambda report: [subject.race for subject in report.subjects], - label="Subject Race", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Cyan1, - stroke_width=2, - ), - OntoPainterFieldConfig( - accessor=lambda report: [subject.sex for subject in report.subjects], - label="Subject Sex", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Cyan1, - stroke_width=2, - ), - OntoPainterFieldConfig( - accessor=lambda report: [subject.dob for subject in report.subjects], - label="Subject DOB", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Cyan1, - stroke_width=2, - ), - OntoPainterFieldConfig( - accessor=lambda report: [subject.dob for subject in report.subjects], - label="Subject DOB", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Cyan1, - stroke_width=2, - ), - # Offense fields: - # crime, code - OntoPainterFieldConfig( - accessor=lambda report: [offense.crime for offense in report.offenses], - label="Offense Crime", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Orange1, - stroke_width=2, - ), - OntoPainterFieldConfig( - accessor=lambda report: [offense.code for offense in report.offenses], - label="Offense Code", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Orange1, - stroke_width=2, - ), - OntoPainterFieldConfig( - accessor=lambda report: [offense.statute for offense in report.offenses], - label="Offense Statute", - mark=OntoPainterMark.RECT, - fill=None, - stroke=Palette.Orange1, - stroke_width=2, - ), - ] -) diff --git a/bc2/core/paint/base.py b/bc2/core/paint/base.py index 6056bce..f1c9460 100644 --- a/bc2/core/paint/base.py +++ b/bc2/core/paint/base.py @@ -10,11 +10,15 @@ class BasePainter(ABC, Generic[T], PreprocessMixin[T]): def __call__(self, file: MemoryFile, context: Context) -> MemoryFile: - """Paint a file, returning an annotated version.""" - current = self.preprocess(file) - return self.paint(context.input_file, current) + """Paint a file, returning an annotated version. + + `file` is the primary pipe value (e.g. a serialized ontology result). + The original input file is read from `context.input_file`. + """ + data = self.preprocess(file) + return self.paint(context.input_file, data) @abstractmethod def paint(self, original: MemoryFile, data: T) -> MemoryFile: - """Paint the preprocessed input, returning an annotated MemoryFile.""" + """Paint the input file using current analysis.""" ... diff --git a/bc2/core/paint/ontology.py b/bc2/core/paint/ontology.py index e69de29..928ab75 100644 --- a/bc2/core/paint/ontology.py +++ b/bc2/core/paint/ontology.py @@ -0,0 +1,173 @@ +from typing import Literal + +import pymupdf +from pydantic import BaseModel + +from ..common.file import MemoryFile +from ..common.ontology import PoliceReportParseResult +from ..common.ontopainter import OntoPainter, OntoPainterFieldConfig, OntoPainterMark +from ..common.palette import Palette +from ..common.preprocess import register_preprocessor +from .base import BasePainter + +painter = OntoPainter( + fields=[ + OntoPainterFieldConfig( + field="case_number", + label="Case Number", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Red1, + stroke_width=2, + ), + OntoPainterFieldConfig( + field="location", + label="Location", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Green1, + stroke_width=2, + ), + OntoPainterFieldConfig( + field="incident_type", + label="Incident Type", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Blue1, + stroke_width=2, + ), + OntoPainterFieldConfig( + field="reporting_agency", + label="Reporting Agency", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Purple1, + stroke_width=2, + ), + OntoPainterFieldConfig( + field="narratives", + label="Narrative", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Blue1, + stroke_width=2, + ), + # Subject fields: + # type, name, address, phone, race, sex, dob + OntoPainterFieldConfig( + accessor=lambda report: [subject.type for subject in report.subjects], + label="Subject Type", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [subject.name for subject in report.subjects], + label="Subject", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [subject.address for subject in report.subjects], + label="Subject Address", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [subject.phone for subject in report.subjects], + label="Subject Phone", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [subject.race for subject in report.subjects], + label="Subject Race", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [subject.sex for subject in report.subjects], + label="Subject Sex", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [subject.dob for subject in report.subjects], + label="Subject DOB", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [subject.dob for subject in report.subjects], + label="Subject DOB", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Cyan1, + stroke_width=2, + ), + # Offense fields: + # crime, code + OntoPainterFieldConfig( + accessor=lambda report: [offense.crime for offense in report.offenses], + label="Offense Crime", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Orange1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [offense.code for offense in report.offenses], + label="Offense Code", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Orange1, + stroke_width=2, + ), + OntoPainterFieldConfig( + accessor=lambda report: [offense.statute for offense in report.offenses], + label="Offense Statute", + mark=OntoPainterMark.RECT, + fill=None, + stroke=Palette.Orange1, + stroke_width=2, + ), + ] +) + + +class OntologyPainterConfig(BaseModel): + engine: Literal["paint:ontology"] = "paint:ontology" + + +class OntologyPainter(BasePainter[PoliceReportParseResult]): + @register_preprocessor(r"application/x-ontology") + def preprocess_ontology(self, file: MemoryFile) -> PoliceReportParseResult: + """Deserialize an ontology MemoryFile into a PoliceReportParseResult.""" + file.buffer.seek(0) + return PoliceReportParseResult.model_validate_json(file.buffer.read()) + + def paint(self, original: MemoryFile, data: PoliceReportParseResult) -> MemoryFile: + """Paint the original PDF with ontology annotations.""" + original.buffer.seek(0) + if original.mime_type != "application/pdf": + raise ValueError(f"Expected PDF, got {original.mime_type}") + doc = pymupdf.open(stream=original.buffer.read(), filetype="pdf") + + painted = painter.paint(doc, data) + + out = MemoryFile(mime_type="application/pdf") + out.writeb(painted.tobytes()) + return out From 392a36c7f60d56de91f482fb8f08108ab79b6de4 Mon Sep 17 00:00:00 2001 From: Joe Nudell Date: Wed, 15 Apr 2026 12:46:57 -0400 Subject: [PATCH 3/3] fix page region --- bc2/core/common/ontopainter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bc2/core/common/ontopainter.py b/bc2/core/common/ontopainter.py index 5d2bf0d..051b2ce 100644 --- a/bc2/core/common/ontopainter.py +++ b/bc2/core/common/ontopainter.py @@ -118,7 +118,7 @@ def _paint_rect( """Paint a rectangle on a document.""" for region in chunk.regions: # The coordinates come normalized in (0, 1) space. Project into page coords. - page = doc.load_page(chunk.regions[0].page) + page = doc.load_page(region.page) page_width, page_height = page.mediabox[2:] shape = page.new_shape() scaled_points = [