Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 0 additions & 39 deletions bc2/core/common/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,42 +66,3 @@ class KeyValuePair(TypedDict):


Table = list[dict[str, str]]


# Named color constants. Every value is a 3-tuple of numbers in the 0-1 range
# (presumably RGB components -- TODO confirm against the consumer).
# Each color name comes in three numbered variants; no component grows from
# variant 1 to variant 3.
class Palette:
Red1 = (0.9, 0.2, 0.1)
Red2 = (0.8, 0.1, 0.0)
Red3 = (0.7, 0.0, 0.0)
Orange1 = (0.9, 0.5, 0.0)
Orange2 = (0.8, 0.4, 0.0)
Orange3 = (0.7, 0.3, 0.0)
Yellow1 = (0.9, 0.9, 0.0)
Yellow2 = (0.8, 0.8, 0.0)
Yellow3 = (0.7, 0.7, 0.0)
Green1 = (0.2, 0.9, 0.1)
Green2 = (0.1, 0.8, 0.0)
Green3 = (0.0, 0.7, 0.0)
# Blue1 is written with int components; it compares equal to (0.0, 0.5, 1.0).
Blue1 = (0, 0.5, 1)
Blue2 = (0.0, 0.4, 0.8)
Blue3 = (0.0, 0.3, 0.7)
Purple1 = (0.9, 0.0, 0.9)
Purple2 = (0.8, 0.0, 0.8)
Purple3 = (0.7, 0.0, 0.7)
# NOTE(review): Pink1-3 hold exactly the same values as Purple1-3 -- confirm
# whether distinct pink values were intended.
Pink1 = (0.9, 0.0, 0.9)
Pink2 = (0.8, 0.0, 0.8)
Pink3 = (0.7, 0.0, 0.7)
Brown1 = (0.5, 0.25, 0.0)
Brown2 = (0.4, 0.2, 0.0)
Brown3 = (0.3, 0.15, 0.0)
Cyan1 = (0.0, 0.9, 0.9)
Cyan2 = (0.0, 0.8, 0.8)
Cyan3 = (0.0, 0.7, 0.7)
# NOTE(review): Lime1-3 hold exactly the same values as Yellow1-3 -- confirm
# whether distinct lime values were intended.
Lime1 = (0.9, 0.9, 0.0)
Lime2 = (0.8, 0.8, 0.0)
Lime3 = (0.7, 0.7, 0.0)
Maroon1 = (0.5, 0.0, 0.0)
Maroon2 = (0.4, 0.0, 0.0)
Maroon3 = (0.3, 0.0, 0.0)
Gray1 = (0.5, 0.5, 0.5)
Gray2 = (0.4, 0.4, 0.4)
Gray3 = (0.3, 0.3, 0.3)
175 changes: 175 additions & 0 deletions bc2/core/common/ontopainter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
from enum import Enum
from typing import Callable

import pymupdf
from pydantic import BaseModel, model_validator

from .ontology import Cited, PoliceReport, PoliceReportParseResult, SourceChunk


# Enumerates the kinds of mark a field config can request.
# RECT is the only member defined here.
class OntoPainterMark(Enum):
RECT = "RECT"


FieldAccessor = Callable[[PoliceReport], list[Cited]]


class OntoPainterFieldConfig(BaseModel):
    """Describe how the values of one report field are located and drawn.

    The values come either from ``field`` (an attribute name read off the
    report with ``getattr``) or from ``accessor`` (a callable that receives
    the report). Exactly one of the two must be provided.
    """

    field: str | None = None  # TODO - validate against PoliceReport fields
    label: str | None = None
    mark: OntoPainterMark
    fill: tuple[float, float, float] | None = None
    stroke: tuple[float, float, float] | None = None
    stroke_width: float = 0
    accessor: FieldAccessor | None = None

    @model_validator(mode="after")
    def validate_field_accessor(self) -> "OntoPainterFieldConfig":
        """Reject configs that set both, or neither, of field and accessor.

        Raises:
            ValueError: If ``field`` and ``accessor`` are both set or both
                left as None.
        """
        has_field = self.field is not None
        has_accessor = self.accessor is not None
        # Equal flags mean "both" or "neither"; only a mismatch is valid.
        if has_field == has_accessor:
            raise ValueError("Either field or accessor must be set, but not both.")
        return self

    def get_value(self, report: PoliceReport) -> list[Cited]:
        """Return the cited values this config selects from ``report``.

        ``field`` takes precedence when set; otherwise ``accessor`` is called.
        The raw result is normalized so callers always receive a list:
        None becomes ``[]``, a single ``Cited`` becomes a one-item list, and
        None entries are dropped from a list.

        Raises:
            ValueError: If neither source is usable, or the raw value is not
                None, a ``Cited`` or a list.
        """
        raw: Cited | list[Cited | None] | list[Cited] | None
        if self.field is not None:
            raw = getattr(report, self.field)
        elif self.accessor:
            raw = self.accessor(report)
        else:
            raise ValueError("Accessor is required if field is not set.")

        if raw is None:
            return []
        if isinstance(raw, Cited):
            return [raw]
        if isinstance(raw, list):
            return [item for item in raw if item is not None]
        raise ValueError(f"Unexpected type: {type(raw)}")


class OntoPainter(BaseModel):
fields: list[OntoPainterFieldConfig]

def paint(
    self,
    pdf: str | pymupdf.Document,
    parse_result: PoliceReportParseResult,
    pages: str | None = None,
) -> pymupdf.Document:
    """Paint a document annotated with the parse result.

    For every configured field, each cited value is resolved to its source
    chunks (``parse_result.chunks[chunk_id]`` for every id in the value's
    ``ids``) and each chunk is drawn with that field's config.

    Args:
        pdf: Path to a PDF, or an already opened document.
        parse_result: Parsed report plus the chunks its citations refer to.
        pages: Optional 1-indexed page range spec (e.g. ``"1-3,5"``) that
            restricts the document before painting.

    Returns:
        The document with the marks drawn on it.
    """
    # 1. Load the requested pages from the input path / doc
    doc = self._load_pdf(pdf, pages)

    # 2. Loop over field configs and paint each field.
    for field_config in self.fields:
        field_values = field_config.get_value(parse_result.report)
        for i, field_value in enumerate(field_values):
            for j, chunk_id in enumerate(field_value.ids):
                chunk = parse_result.chunks[chunk_id]
                # `label` is optional on the config. Only build the caption
                # when one was configured; formatting None unconditionally
                # would paint the literal text "None 1-1" on the page.
                if field_config.label is not None:
                    # Caption is "<label> <value#>-<chunk#>", both 1-based.
                    label = f"{field_config.label} {i + 1}-{j + 1}"
                else:
                    label = None
                self._paint_field(doc, field_config, chunk, label=label)

    return doc

def _paint_field(
self,
doc: pymupdf.Document,
field_config: OntoPainterFieldConfig,
chunk: SourceChunk,
label: str | None = None,
) -> None:
"""Paint a field on a document."""
match field_config.mark:
case OntoPainterMark.RECT:
self._paint_rect(doc, field_config, chunk)
case _:
raise ValueError(f"Unsupported mark: {field_config.mark}")
if label:
page = doc.load_page(chunk.regions[0].page)
page_width, page_height = page.mediabox[2:]
scaled_points = [
(p[0] * page_width, p[1] * page_height) for p in chunk.regions[0].points
]
x, y = scaled_points[0]
# Offset to avoid overlapping with bounding rectangle
y -= 2
page.insert_text(
(x, y), label, fontsize=5, fill=field_config.stroke, color=(1, 1, 1)
)

def _paint_rect(
    self,
    doc: pymupdf.Document,
    field_config: OntoPainterFieldConfig,
    chunk: SourceChunk,
) -> None:
    """Paint a bounding rectangle around every region of a chunk.

    Args:
        doc: Document to draw on.
        field_config: Supplies the stroke color and stroke width.
        chunk: Source chunk whose regions are outlined.
    """
    for region in chunk.regions:
        if not region.points:
            # Nothing to outline for a region without any points.
            continue
        # The coordinates come normalized in (0, 1) space. Project into page coords.
        page = doc.load_page(region.page)
        page_width, page_height = page.mediabox[2:]
        xs = [p[0] * page_width for p in region.points]
        ys = [p[1] * page_height for p in region.points]
        # Build the bounding box from min/max of all points instead of
        # pymupdf.Quad(*points): Quad takes exactly four corners and fails
        # for any other point count, while `points` is an unconstrained
        # list. For four points this yields the same rectangle as Quad.rect.
        bounds = pymupdf.Rect(min(xs), min(ys), max(xs), max(ys))
        shape = page.new_shape()
        shape.draw_rect(bounds)
        shape.finish(color=field_config.stroke, width=field_config.stroke_width)
        shape.commit()

def _load_pdf(
    self, doc: str | pymupdf.Document, pages: str | None = None
) -> pymupdf.Document:
    """Return a document for ``doc``, optionally restricted to ``pages``.

    Args:
        doc: A filesystem path to open, or an existing document that is
            used as-is (not copied).
        pages: Optional page range spec understood by
            ``_parse_pages_range``.

    Returns:
        The opened (or passed-in) document. When the spec yields a
        non-empty page list, ``select`` is called on that same object, so a
        passed-in document is affected too.
    """
    if not isinstance(doc, str):
        loaded = doc
    else:
        with open(doc, "rb") as handle:
            loaded = pymupdf.open(handle)

    selection = _parse_pages_range(pages)
    if selection:
        loaded.select(selection)

    return loaded


def _parse_pages_range(pages: str | None = None) -> list[int] | None:
"""Parse page range specification as a list of page numbers.

If no spec is given, return None.

Spec looks like:
1 Single page
1-3 Range of pages
1,2,3 List of pages
1-3,5 Range and list of pages

Args:
pages: The page range specification, 1-indexed.

Returns:
A list of page numbers (0-indexed).
"""
if pages is None:
return None
page_list = list[int]()
for segment in pages.split(","):
if "-" in segment:
start, end = segment.split("-")
page_list.extend(range(int(start.strip()), int(end.strip()) + 1))
else:
page_list.append(int(segment.strip()))
# Clean up duplicates and sort.
return sorted([x - 1 for x in set(page_list)])
37 changes: 37 additions & 0 deletions bc2/core/common/palette.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
class Palette:
    """Named color constants for painting.

    Every value is a 3-tuple of floats in the 0-1 range. Each color name
    comes in three numbered variants, from the brightest (1) to the
    darkest (3).
    """

    Red1 = (0.9, 0.2, 0.1)
    Red2 = (0.8, 0.1, 0.0)
    Red3 = (0.7, 0.0, 0.0)
    Orange1 = (0.9, 0.5, 0.0)
    Orange2 = (0.8, 0.4, 0.0)
    Orange3 = (0.7, 0.3, 0.0)
    Yellow1 = (0.9, 0.9, 0.0)
    Yellow2 = (0.8, 0.8, 0.0)
    Yellow3 = (0.7, 0.7, 0.0)
    Green1 = (0.2, 0.9, 0.1)
    Green2 = (0.1, 0.8, 0.0)
    Green3 = (0.0, 0.7, 0.0)
    # Written as floats like every other entry (same value as (0, 0.5, 1)).
    Blue1 = (0.0, 0.5, 1.0)
    Blue2 = (0.0, 0.4, 0.8)
    Blue3 = (0.0, 0.3, 0.7)
    Purple1 = (0.9, 0.0, 0.9)
    Purple2 = (0.8, 0.0, 0.8)
    Purple3 = (0.7, 0.0, 0.7)
    # Pink used to be a verbatim copy of Purple, which made the two
    # indistinguishable when painted; it now has its own values.
    Pink1 = (0.9, 0.4, 0.6)
    Pink2 = (0.8, 0.3, 0.5)
    Pink3 = (0.7, 0.2, 0.4)
    Brown1 = (0.5, 0.25, 0.0)
    Brown2 = (0.4, 0.2, 0.0)
    Brown3 = (0.3, 0.15, 0.0)
    Cyan1 = (0.0, 0.9, 0.9)
    Cyan2 = (0.0, 0.8, 0.8)
    Cyan3 = (0.0, 0.7, 0.7)
    # Lime used to be a verbatim copy of Yellow; it now has its own values.
    Lime1 = (0.6, 0.9, 0.0)
    Lime2 = (0.5, 0.8, 0.0)
    Lime3 = (0.4, 0.7, 0.0)
    Maroon1 = (0.5, 0.0, 0.0)
    Maroon2 = (0.4, 0.0, 0.0)
    Maroon3 = (0.3, 0.0, 0.0)
    Gray1 = (0.5, 0.5, 0.5)
    Gray2 = (0.4, 0.4, 0.4)
    Gray3 = (0.3, 0.3, 0.3)
2 changes: 1 addition & 1 deletion bc2/core/input/azureblob.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class AzureBlobInput(BaseInputDriver, AzureBlobDriver):
def __init__(self, config: AzureBlobInputConfig):
self.init_client(config)

def __call__(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
def load_file(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
"""Read from an Azure Blob."""
f = MemoryFile()
full_path = f"{self.config.prefix}{path}"
Expand Down
11 changes: 10 additions & 1 deletion bc2/core/input/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from abc import ABC, abstractmethod
from typing import Literal

from ..common.context import Context
from ..common.file import MemoryFile


Expand All @@ -16,7 +17,15 @@ class BaseInputDriver(ABC):

required: list[Literal["path"] | Literal["buffer"]] = []

@abstractmethod
def __call__(
    self, context: Context, path: str = "", buffer: io.BytesIO | None = None
) -> MemoryFile:
    """Load the input and remember it on the context.

    Delegates the actual loading to ``load_file`` and stores the result as
    ``context.input_file`` before returning it.
    """
    loaded = self.load_file(path=path, buffer=buffer)
    context.input_file = loaded
    return loaded

# Driver-specific loading hook. It is abstract, so concrete drivers must
# provide it; __call__ invokes it with the `path` and `buffer` keyword
# arguments and expects the loaded MemoryFile back.
@abstractmethod
def load_file(
self, path: str = "", buffer: io.BytesIO | None = None
) -> MemoryFile: ...
2 changes: 1 addition & 1 deletion bc2/core/input/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, config: FileInputConfig):

required = ["path"]

def __call__(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
def load_file(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
"""Read from a file."""
if not path:
raise ValueError("Path is required for file input.")
Expand Down
2 changes: 1 addition & 1 deletion bc2/core/input/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, config: MemoryInputConfig):

required = ["buffer"]

def __call__(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
def load_file(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
"""Read from a buffer in memory."""
if not buffer:
raise ValueError("Buffer is required for memory input.")
Expand Down
2 changes: 1 addition & 1 deletion bc2/core/input/stdin.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class StdinInput(BaseInputDriver):
def __init__(self, config: StdinInputConfig):
self.config = config

def __call__(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
def load_file(self, path: str = "", buffer: io.BytesIO | None = None) -> MemoryFile:
"""Read from stdin."""
f = MemoryFile()
# Consume all the stdin pipe and write it to memory
Expand Down
Empty file added bc2/core/paint/__init__.py
Empty file.
24 changes: 24 additions & 0 deletions bc2/core/paint/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from abc import ABC, abstractmethod
from typing import Generic, TypeVar

from ..common.context import Context
from ..common.file import MemoryFile
from ..common.preprocess import PreprocessMixin

T = TypeVar("T")


class BasePainter(ABC, Generic[T], PreprocessMixin[T]):
    """Abstract base for painters that annotate the original input file."""

    def __call__(self, file: MemoryFile, context: Context) -> MemoryFile:
        """Run the painter as a pipeline step and return the annotated file.

        `file` is the primary pipe value (e.g. a serialized ontology result);
        it is passed through `preprocess` first. The file that actually gets
        painted is the original input, taken from `context.input_file`.
        """
        parsed = self.preprocess(file)
        return self.paint(context.input_file, parsed)

    @abstractmethod
    def paint(self, original: MemoryFile, data: T) -> MemoryFile:
        """Paint `original` using the preprocessed analysis in `data`."""
Loading
Loading