diff --git a/bc2/core/common/ontology.py b/bc2/core/common/ontology.py
index 797c801..dcff647 100644
--- a/bc2/core/common/ontology.py
+++ b/bc2/core/common/ontology.py
@@ -66,42 +66,3 @@ class KeyValuePair(TypedDict):
 
 
 Table = list[dict[str, str]]
-
-
-class Palette:
-    Red1 = (0.9, 0.2, 0.1)
-    Red2 = (0.8, 0.1, 0.0)
-    Red3 = (0.7, 0.0, 0.0)
-    Orange1 = (0.9, 0.5, 0.0)
-    Orange2 = (0.8, 0.4, 0.0)
-    Orange3 = (0.7, 0.3, 0.0)
-    Yellow1 = (0.9, 0.9, 0.0)
-    Yellow2 = (0.8, 0.8, 0.0)
-    Yellow3 = (0.7, 0.7, 0.0)
-    Green1 = (0.2, 0.9, 0.1)
-    Green2 = (0.1, 0.8, 0.0)
-    Green3 = (0.0, 0.7, 0.0)
-    Blue1 = (0, 0.5, 1)
-    Blue2 = (0.0, 0.4, 0.8)
-    Blue3 = (0.0, 0.3, 0.7)
-    Purple1 = (0.9, 0.0, 0.9)
-    Purple2 = (0.8, 0.0, 0.8)
-    Purple3 = (0.7, 0.0, 0.7)
-    Pink1 = (0.9, 0.0, 0.9)
-    Pink2 = (0.8, 0.0, 0.8)
-    Pink3 = (0.7, 0.0, 0.7)
-    Brown1 = (0.5, 0.25, 0.0)
-    Brown2 = (0.4, 0.2, 0.0)
-    Brown3 = (0.3, 0.15, 0.0)
-    Cyan1 = (0.0, 0.9, 0.9)
-    Cyan2 = (0.0, 0.8, 0.8)
-    Cyan3 = (0.0, 0.7, 0.7)
-    Lime1 = (0.9, 0.9, 0.0)
-    Lime2 = (0.8, 0.8, 0.0)
-    Lime3 = (0.7, 0.7, 0.0)
-    Maroon1 = (0.5, 0.0, 0.0)
-    Maroon2 = (0.4, 0.0, 0.0)
-    Maroon3 = (0.3, 0.0, 0.0)
-    Gray1 = (0.5, 0.5, 0.5)
-    Gray2 = (0.4, 0.4, 0.4)
-    Gray3 = (0.3, 0.3, 0.3)
diff --git a/bc2/core/common/ontopainter.py b/bc2/core/common/ontopainter.py
new file mode 100644
index 0000000..051b2ce
--- /dev/null
+++ b/bc2/core/common/ontopainter.py
@@ -0,0 +1,226 @@
+"""Paint ontology parse results onto a PDF document."""
+
+from enum import Enum
+from typing import Callable
+
+import pymupdf
+from pydantic import BaseModel, model_validator
+
+from .ontology import Cited, PoliceReport, PoliceReportParseResult, SourceChunk
+
+
+class OntoPainterMark(Enum):
+    """Kinds of mark that can be drawn for a field."""
+
+    RECT = "RECT"
+
+
+# Accessors pull the values of one logical field out of a report.
+FieldAccessor = Callable[[PoliceReport], list[Cited]]
+
+
+class OntoPainterFieldConfig(BaseModel):
+    """How to locate and draw one report field.
+
+    Exactly one of `field` (attribute name on the report) or `accessor`
+    (callable taking the report) must be provided.
+    """
+
+    field: str | None = None  # TODO - validate against PoliceReport fields
+    label: str | None = None
+    mark: OntoPainterMark
+    fill: tuple[float, float, float] | None = None
+    stroke: tuple[float, float, float] | None = None
+    stroke_width: float = 0
+    accessor: FieldAccessor | None = None
+
+    @model_validator(mode="after")
+    def validate_field_accessor(self) -> "OntoPainterFieldConfig":
+        """Either field or accessor must be set, but not both."""
+        if not ((self.field is None) ^ (self.accessor is None)):
+            raise ValueError("Either field or accessor must be set, but not both.")
+        return self
+
+    def get_value(self, report: PoliceReport) -> list[Cited]:
+        """Get value of a field from the report.
+
+        Use the `field` attribute if set, otherwise use the `accessor` function.
+
+        Returns:
+            The cited values with `None` entries dropped; empty when unset.
+
+        Raises:
+            ValueError: If the value has an unexpected type.
+        """
+        value: Cited | list[Cited | None] | list[Cited] | None
+        if self.field is not None:
+            value = getattr(report, self.field)
+        elif self.accessor is not None:
+            value = self.accessor(report)
+        else:
+            raise ValueError("Accessor is required if field is not set.")
+
+        # Normalize none into an empty list.
+        if value is None:
+            return []
+        # Normalize singular value to a one-item list.
+        if isinstance(value, Cited):
+            return [value]
+        # Normalize list of Optional values to a list of Cited values.
+        if isinstance(value, list):
+            return [item for item in value if item is not None]
+        raise ValueError(f"Unexpected type: {type(value)}")
+
+
+class OntoPainter(BaseModel):
+    """Draw the configured field marks onto a PDF."""
+
+    fields: list[OntoPainterFieldConfig]
+
+    def paint(
+        self,
+        pdf: str | pymupdf.Document,
+        parse_result: PoliceReportParseResult,
+        pages: str | None = None,
+    ) -> pymupdf.Document:
+        """Paint a document annotated with the parse result.
+
+        Args:
+            pdf: Path to a PDF, or an already opened document (which is
+                painted in place).
+            parse_result: Parsed report plus the chunks its citation ids refer to.
+            pages: Optional 1-indexed page spec, e.g. "1-3,5".
+
+        Returns:
+            The painted document.
+        """
+        # 1. Load the requested pages from the input path / doc
+        doc = self._load_pdf(pdf, pages)
+
+        # 2. Loop over field configs and paint each field.
+        for field_config in self.fields:
+            field_values = field_config.get_value(parse_result.report)
+            for i, field_value in enumerate(field_values):
+                for j, chunk_id in enumerate(field_value.ids):
+                    chunk = parse_result.chunks[chunk_id]
+                    # Only build a label when one is configured; formatting
+                    # a missing label would print the text "None 1-1".
+                    label = (
+                        f"{field_config.label} {i + 1}-{j + 1}"
+                        if field_config.label
+                        else None
+                    )
+                    self._paint_field(doc, field_config, chunk, label=label)
+
+        return doc
+
+    def _paint_field(
+        self,
+        doc: pymupdf.Document,
+        field_config: OntoPainterFieldConfig,
+        chunk: SourceChunk,
+        label: str | None = None,
+    ) -> None:
+        """Paint a field on a document, with an optional text label."""
+        match field_config.mark:
+            case OntoPainterMark.RECT:
+                self._paint_rect(doc, field_config, chunk)
+            case _:
+                raise ValueError(f"Unsupported mark: {field_config.mark}")
+        # Nothing to anchor the label to when the chunk has no regions.
+        if label and chunk.regions:
+            first_region = chunk.regions[0]
+            page = doc.load_page(first_region.page)
+            page_width, page_height = page.mediabox[2:]
+            # Anchor at the first point of the first region, scaled from
+            # normalized (0, 1) space into page coordinates.
+            x = first_region.points[0][0] * page_width
+            y = first_region.points[0][1] * page_height
+            # Offset to avoid overlapping with bounding rectangle
+            y -= 2
+            page.insert_text(
+                (x, y), label, fontsize=5, fill=field_config.stroke, color=(1, 1, 1)
+            )
+
+    def _paint_rect(
+        self,
+        doc: pymupdf.Document,
+        field_config: OntoPainterFieldConfig,
+        chunk: SourceChunk,
+    ) -> None:
+        """Paint a rectangle on a document for every region of the chunk."""
+        for region in chunk.regions:
+            # The coordinates come normalized in (0, 1) space. Project into page coords.
+            page = doc.load_page(region.page)
+            page_width, page_height = page.mediabox[2:]
+            shape = page.new_shape()
+            scaled_points = [
+                (p[0] * page_width, p[1] * page_height) for p in region.points
+            ]
+            shape.draw_rect(pymupdf.Quad(*scaled_points).rect)
+            # Honor the configured fill as well as the stroke; without this
+            # the `fill` setting is accepted by the config but never drawn.
+            shape.finish(
+                color=field_config.stroke,
+                fill=field_config.fill,
+                width=field_config.stroke_width,
+            )
+            shape.commit()
+
+    def _load_pdf(
+        self, doc: str | pymupdf.Document, pages: str | None = None
+    ) -> pymupdf.Document:
+        """Load a PDF document.
+
+        Args:
+            doc: Path to a PDF, or an already opened document.
+            pages: Optional 1-indexed page spec; when given, the document
+                is reduced to those pages (in place for an opened document).
+        """
+        # Open by path so the document is not tied to a closed file handle.
+        pdf_doc = pymupdf.open(doc) if isinstance(doc, str) else doc
+
+        filter_pages = _parse_pages_range(pages)
+        if filter_pages:
+            # NOTE(review): select() renumbers the remaining pages, while
+            # regions are later looked up by `region.page` - confirm region
+            # page indexes are relative to the selected pages.
+            pdf_doc.select(filter_pages)
+
+        return pdf_doc
+
+
+def _parse_pages_range(pages: str | None = None) -> list[int] | None:
+    """Parse page range specification as a list of page numbers.
+
+    If no spec is given, return None.
+
+    Spec looks like:
+        1       Single page
+        1-3     Range of pages
+        1,2,3   List of pages
+        1-3,5   Range and list of pages
+
+    Args:
+        pages: The page range specification, 1-indexed.
+
+    Returns:
+        A list of page numbers (0-indexed), sorted and without duplicates.
+
+    Raises:
+        ValueError: If a segment is not a number or a range, or names a
+            page below 1.
+    """
+    if pages is None:
+        return None
+    page_numbers: set[int] = set()
+    for segment in pages.split(","):
+        start_text, dash, end_text = segment.partition("-")
+        start = int(start_text.strip())
+        end = int(end_text.strip()) if dash else start
+        if start < 1:
+            # Page 0 would otherwise turn into index -1 after the shift below.
+            raise ValueError(f"Page numbers start at 1, got: {segment.strip()!r}")
+        page_numbers.update(range(start, end + 1))
+    # Shift to 0-indexed; the set has already removed duplicates.
+    return sorted(page - 1 for page in page_numbers)
diff --git a/bc2/core/common/palette.py b/bc2/core/common/palette.py
new file mode 100644
index 0000000..deb93cd
--- /dev/null
+++ b/bc2/core/common/palette.py
@@ -0,0 +1,41 @@
+class Palette:
+    """Named RGB colors as (red, green, blue) tuples with components in [0, 1]."""
+
+    Red1 = (0.9, 0.2, 0.1)
+    Red2 = (0.8, 0.1, 0.0)
+    Red3 = (0.7, 0.0, 0.0)
+    Orange1 = (0.9, 0.5, 0.0)
+    Orange2 = (0.8, 0.4, 0.0)
+    Orange3 = (0.7, 0.3, 0.0)
+    Yellow1 = (0.9, 0.9, 0.0)
+    Yellow2 = (0.8, 0.8, 0.0)
+    Yellow3 = (0.7, 0.7, 0.0)
+    Green1 = (0.2, 0.9, 0.1)
+    Green2 = (0.1, 0.8, 0.0)
+    Green3 = (0.0, 0.7, 0.0)
+    Blue1 = (0, 0.5, 1)
+    Blue2 = (0.0, 0.4, 0.8)
+    Blue3 = (0.0, 0.3, 0.7)
+    Purple1 = (0.9, 0.0, 0.9)
+    Purple2 = (0.8, 0.0, 0.8)
+    Purple3 = (0.7, 0.0, 0.7)
+    # NOTE(review): Pink* currently has the same values as Purple*.
+    Pink1 = (0.9, 0.0, 0.9)
+    Pink2 = (0.8, 0.0, 0.8)
+    Pink3 = (0.7, 0.0, 0.7)
+    Brown1 = (0.5, 0.25, 0.0)
+    Brown2 = (0.4, 0.2, 0.0)
+    Brown3 = (0.3, 0.15, 0.0)
+    Cyan1 = (0.0, 0.9, 0.9)
+    Cyan2 = (0.0, 0.8, 0.8)
+    Cyan3 = (0.0, 0.7, 0.7)
+    # NOTE(review): Lime* currently has the same values as Yellow*.
+    Lime1 = (0.9, 0.9, 0.0)
+    Lime2 = (0.8, 0.8, 0.0)
+    Lime3 = (0.7, 0.7, 0.0)
+    Maroon1 = (0.5, 0.0, 0.0)
+    Maroon2 = (0.4, 0.0, 0.0)
+    Maroon3 = (0.3, 0.0, 0.0)
+    Gray1 = (0.5, 0.5, 0.5)
+    Gray2 = (0.4, 0.4, 0.4)
+    Gray3 = (0.3, 0.3, 0.3)
diff --git a/bc2/core/input/azureblob.py b/bc2/core/input/azureblob.py
index b311f3e..ae1f652 100644
--- a/bc2/core/input/azureblob.py
+++ b/bc2/core/input/azureblob.py
@@ -21,7 +21,7 @@ class AzureBlobInput(BaseInputDriver, AzureBlobDriver):
     def __init__(self, config: AzureBlobInputConfig):
         self.init_client(config)
 
-    def __call__(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
+    def load_file(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
         """Read from an Azure Blob."""
         f = MemoryFile()
         full_path = f"{self.config.prefix}{path}"
diff --git a/bc2/core/input/base.py b/bc2/core/input/base.py
index 34b0be1..49a68c9 100644
--- a/bc2/core/input/base.py
+++ b/bc2/core/input/base.py
@@ -2,6 +2,7 @@
 from abc import ABC, abstractmethod
 from typing import Literal
 
+from ..common.context import Context
 from ..common.file import MemoryFile
 
 
@@ -16,7 +17,15 @@ class BaseInputDriver(ABC):
 
     required: list[Literal["path"] | Literal["buffer"]] = []
 
-    @abstractmethod
     def __call__(
+        self, context: Context, path: str = "", buffer: io.BytesIO | None = None
+    ) -> MemoryFile:
+        """Load a file from a path or buffer."""
+        f = self.load_file(path=path, buffer=buffer)
+        context.input_file = f
+        return f
+
+    @abstractmethod
+    def load_file(
         self, path: str = "", buffer: io.BytesIO | None = None
     ) -> MemoryFile: ...
diff --git a/bc2/core/input/file.py b/bc2/core/input/file.py
index b2515bd..3faee04 100644
--- a/bc2/core/input/file.py
+++ b/bc2/core/input/file.py
@@ -23,7 +23,7 @@ def __init__(self, config: FileInputConfig):
 
     required = ["path"]
 
-    def __call__(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
+    def load_file(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
         """Read from a file."""
         if not path:
             raise ValueError("Path is required for file input.")
diff --git a/bc2/core/input/memory.py b/bc2/core/input/memory.py
index c016baa..0c48b17 100644
--- a/bc2/core/input/memory.py
+++ b/bc2/core/input/memory.py
@@ -23,7 +23,7 @@ def __init__(self, config: MemoryInputConfig):
 
     required = ["buffer"]
 
-    def __call__(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
+    def load_file(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
         """Read from a buffer in memory."""
         if not buffer:
             raise ValueError("Buffer is required for memory input.")
diff --git a/bc2/core/input/stdin.py b/bc2/core/input/stdin.py
index a538c21..5c08ded 100644
--- a/bc2/core/input/stdin.py
+++ b/bc2/core/input/stdin.py
@@ -22,7 +22,7 @@ class StdinInput(BaseInputDriver):
     def __init__(self, config: StdinInputConfig):
         self.config = config
 
-    def __call__(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
+    def load_file(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
         """Read from stdin."""
         f = MemoryFile()
         # Consume all the stdin pipe and write it to memory
diff --git a/bc2/core/paint/__init__.py b/bc2/core/paint/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/bc2/core/paint/base.py b/bc2/core/paint/base.py
new file mode 100644
index 0000000..f1c9460
--- /dev/null
+++ b/bc2/core/paint/base.py
@@ -0,0 +1,35 @@
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+
+from ..common.context import Context
+from ..common.file import MemoryFile
+from ..common.preprocess import PreprocessMixin
+
+T = TypeVar("T")
+
+
+class BasePainter(ABC, Generic[T], PreprocessMixin[T]):
+    """Pipeline step that annotates the original input file.
+
+    `T` is the analysis type produced by preprocessing the piped file.
+    """
+
+    def __call__(self, file: MemoryFile, context: Context) -> MemoryFile:
+        """Paint a file, returning an annotated version.
+
+        `file` is the primary pipe value (e.g. a serialized ontology result).
+        The original input file is read from `context.input_file`.
+
+        Raises:
+            ValueError: If the context holds no input file.
+        """
+        original = context.input_file
+        if original is None:
+            raise ValueError("No input file in context; run an input driver first.")
+        data = self.preprocess(file)
+        return self.paint(original, data)
+
+    @abstractmethod
+    def paint(self, original: MemoryFile, data: T) -> MemoryFile:
+        """Paint the input file using current analysis."""
+        ...
diff --git a/bc2/core/paint/ontology.py b/bc2/core/paint/ontology.py
new file mode 100644
index 0000000..928ab75
--- /dev/null
+++ b/bc2/core/paint/ontology.py
@@ -0,0 +1,138 @@
+"""Painter that outlines parsed police-report fields on the source PDF."""
+
+from typing import Literal
+
+import pymupdf
+from pydantic import BaseModel
+
+from ..common.file import MemoryFile
+from ..common.ontology import PoliceReportParseResult
+from ..common.ontopainter import (
+    FieldAccessor,
+    OntoPainter,
+    OntoPainterFieldConfig,
+    OntoPainterMark,
+)
+from ..common.palette import Palette
+from ..common.preprocess import register_preprocessor
+from .base import BasePainter
+
+
+def _rect_field(
+    label: str,
+    stroke: tuple[float, float, float],
+    *,
+    field: str | None = None,
+    accessor: FieldAccessor | None = None,
+) -> OntoPainterFieldConfig:
+    """Build the outlined-rectangle config shared by every painted field.
+
+    Exactly one of `field` or `accessor` must be given; the config's own
+    validator rejects anything else.
+    """
+    return OntoPainterFieldConfig(
+        field=field,
+        accessor=accessor,
+        label=label,
+        mark=OntoPainterMark.RECT,
+        fill=None,
+        stroke=stroke,
+        stroke_width=2,
+    )
+
+
+painter = OntoPainter(
+    fields=[
+        _rect_field("Case Number", Palette.Red1, field="case_number"),
+        _rect_field("Location", Palette.Green1, field="location"),
+        _rect_field("Incident Type", Palette.Blue1, field="incident_type"),
+        _rect_field("Reporting Agency", Palette.Purple1, field="reporting_agency"),
+        _rect_field("Narrative", Palette.Blue1, field="narratives"),
+        # Subject fields:
+        # type, name, address, phone, race, sex, dob
+        _rect_field(
+            "Subject Type",
+            Palette.Cyan1,
+            accessor=lambda report: [subject.type for subject in report.subjects],
+        ),
+        _rect_field(
+            "Subject",
+            Palette.Cyan1,
+            accessor=lambda report: [subject.name for subject in report.subjects],
+        ),
+        _rect_field(
+            "Subject Address",
+            Palette.Cyan1,
+            accessor=lambda report: [subject.address for subject in report.subjects],
+        ),
+        _rect_field(
+            "Subject Phone",
+            Palette.Cyan1,
+            accessor=lambda report: [subject.phone for subject in report.subjects],
+        ),
+        _rect_field(
+            "Subject Race",
+            Palette.Cyan1,
+            accessor=lambda report: [subject.race for subject in report.subjects],
+        ),
+        _rect_field(
+            "Subject Sex",
+            Palette.Cyan1,
+            accessor=lambda report: [subject.sex for subject in report.subjects],
+        ),
+        _rect_field(
+            "Subject DOB",
+            Palette.Cyan1,
+            accessor=lambda report: [subject.dob for subject in report.subjects],
+        ),
+        # Offense fields:
+        # crime, code, statute
+        _rect_field(
+            "Offense Crime",
+            Palette.Orange1,
+            accessor=lambda report: [offense.crime for offense in report.offenses],
+        ),
+        _rect_field(
+            "Offense Code",
+            Palette.Orange1,
+            accessor=lambda report: [offense.code for offense in report.offenses],
+        ),
+        _rect_field(
+            "Offense Statute",
+            Palette.Orange1,
+            accessor=lambda report: [offense.statute for offense in report.offenses],
+        ),
+    ]
+)
+
+
+class OntologyPainterConfig(BaseModel):
+    """Configuration selecting the ontology painter."""
+
+    engine: Literal["paint:ontology"] = "paint:ontology"
+
+
+class OntologyPainter(BasePainter[PoliceReportParseResult]):
+    """Paint ontology parse results onto the original PDF input."""
+
+    @register_preprocessor(r"application/x-ontology")
+    def preprocess_ontology(self, file: MemoryFile) -> PoliceReportParseResult:
+        """Deserialize an ontology MemoryFile into a PoliceReportParseResult."""
+        file.buffer.seek(0)
+        return PoliceReportParseResult.model_validate_json(file.buffer.read())
+
+    def paint(self, original: MemoryFile, data: PoliceReportParseResult) -> MemoryFile:
+        """Paint the original PDF with ontology annotations.
+
+        Raises:
+            ValueError: If `original` is not a PDF.
+        """
+        if original.mime_type != "application/pdf":
+            raise ValueError(f"Expected PDF, got {original.mime_type}")
+        original.buffer.seek(0)
+        # Close the document once its bytes have been copied out.
+        with pymupdf.open(stream=original.buffer.read(), filetype="pdf") as doc:
+            painted = painter.paint(doc, data)
+            out = MemoryFile(mime_type="application/pdf")
+            out.writeb(painted.tobytes())
+        return out