From 1457778abc29d8d1e930bf17f4d96f8a95d39cf5 Mon Sep 17 00:00:00 2001
From: Joe Barrow <joseph.d.barrow@gmail.com>
Date: Mon, 17 Nov 2025 08:35:52 -0500
Subject: [PATCH 1/5] add ffdetr inference code

---
 commonforms/__main__.py  |  1 +
 commonforms/inference.py | 74 ++++++++++++++++++++++++++++++++++++++--
 pyproject.toml           |  1 +
 3 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/commonforms/__main__.py b/commonforms/__main__.py
index f66df1a..08be6a6 100644
--- a/commonforms/__main__.py
+++ b/commonforms/__main__.py
@@ -58,6 +58,7 @@ def main():
 
     args = parser.parse_args()
 
+    print(f"**{args.confidence=}")
     prepare_form(
         args.input,
         args.output,
diff --git a/commonforms/inference.py b/commonforms/inference.py
index 7ac5827..10fa676 100644
--- a/commonforms/inference.py
+++ b/commonforms/inference.py
@@ -2,6 +2,7 @@
 from ultralytics import YOLO
 from pathlib import Path
 from huggingface_hub import hf_hub_download
+from rfdetr import RFDETRNano, RFDETRBase, RFDETRMedium, RFDETRLarge
 
 from commonforms.utils import BoundingBox, Page, Widget
 from commonforms.form_creator import PyPdfFormCreator
@@ -9,6 +10,7 @@
 
 import formalpdf
 import pypdfium2
+import PIL
 
 
 # our mapping from (model_name, fast) to (repo_id, filename) for the huggingface hub
@@ -17,9 +19,75 @@
     ("FFDNET-S", False): ("jbarrow/FFDNet-S", "FFDNet-S.pt"),
     ("FFDNET-L", True): ("jbarrow/FFDNet-L-cpu", "FFDNet-L.onnx"),
     ("FFDNET-L", False): ("jbarrow/FFDNet-L", "FFDNet-L.pt"),
+    ("FFDetr-Nano", False): ("./models/FFDetr-Nano", "checkpoint_best_ema.pth")
 }
 
 
+
+def batch(lst: list, n: int = 8):
+    l = len(lst)
+    for ndx in range(0, l, n):
+        yield lst[ndx:min(ndx + n, l)]
+
+
+
+class FFDetrDetector:
+    def __init__(
+        self, model_or_path: str, device: int | str = "cpu"
+    ) -> None:
+        self.device = device
+        self.model = RFDETRMedium(pretrain_weights=model_or_path, resolution=224*5, num_classes=2)
+
+        self.id_to_cls = {0: "TextBox", 1: "ChoiceButton"}
+
+    def resize(
+        self,
+        image: PIL.Image.Image,
+        size: tuple[int, int] | int,
+    ) -> PIL.Image.Image:
+        if isinstance(size, int):
+            size = (size, size)
+
+        return image.resize(size, PIL.Image.Resampling.LANCZOS)
+
+    def extract_widgets(
+        self, pages: list[Page], confidence: float = 0.2, image_size: int = 1120
+    ) -> dict[int, list[Widget]]:
+        image_size = 1024
+        results = []
+        for b in batch([p.image for p in pages], n=1):
+            results += [self.model.predict(b, threshold=confidence)]
+
+        widgets = {}
+
+        if len(pages) == 1:
+            results = [results]
+
+        for page_ix, detections in enumerate(results):
+            print(f"{page_ix}: {len(detections)} fields detected")
+            detections = detections.with_nms(threshold=0.1, class_agnostic=True)
+            print(f"{len(detections)} after nms")
+            widgets[page_ix] = []
+
+            for class_id, box in zip(detections.class_id, detections.xyxy):
+                x0, x1 = box[[0, 2]] / pages[page_ix].image.width
+                y0, y1 = box[[1, 3]] / pages[page_ix].image.height
+
+                widget_type = self.id_to_cls[class_id]
+
+                widgets[page_ix].append(
+                    Widget(
+                        widget_type=widget_type,
+                        bounding_box=BoundingBox(x0=x0, y0=y0, x1=x1, y1=y1),
+                        page=page_ix,
+                    )
+                )
+
+            widgets[page_ix] = sort_widgets(widgets[page_ix])
+
+        return widgets
+
+
 class FFDNetDetector:
     def __init__(
         self, model_or_path: str, device: int | str = "cpu", fast: bool = False
@@ -148,7 +216,8 @@ def render_pdf(pdf_path: str) -> list[Page]:
     doc = formalpdf.open(pdf_path)
     try:
         for page in doc:
-            image = page.render()
+            image = page.render(dpi=144)
+            print(image.width, image.height)
             pages.append(Page(image=image, width=image.width, height=image.height))
         return pages
     finally:
@@ -168,7 +237,8 @@ def prepare_form(
     fast: bool = False,
     multiline: bool = False,
 ):
-    detector = FFDNetDetector(model_or_path, device=device, fast=fast)
+    # detector = FFDNetDetector(model_or_path, device=device, fast=fast)
+    detector = FFDetrDetector("./models/FFDetr-Medium/checkpoint_best_ema.pth")
 
     try:
         pages = render_pdf(input_path)
diff --git a/pyproject.toml b/pyproject.toml
index a613152..10b5d04 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,7 @@ dependencies = [
     "pillow>=11.3.0",
     "pydantic>=2.11.9",
     "pypdf>=6.1.1",
+    "rfdetr>=1.3.0",
     "ultralytics>=8.3.204",
 ]
 

From cf1399b749f03db407c8bf4fcc05e28510e2c320 Mon Sep 17 00:00:00 2001
From: Joe Barrow <joseph.d.barrow@gmail.com>
Date: Tue, 25 Nov 2025 08:24:38 -0500
Subject: [PATCH 2/5] FFDetr inference

---
 commonforms/inference.py | 49 ++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 15 deletions(-)

diff --git a/commonforms/inference.py b/commonforms/inference.py
index 10fa676..38ef5ea 100644
--- a/commonforms/inference.py
+++ b/commonforms/inference.py
@@ -10,16 +10,22 @@
 
 import formalpdf
 import pypdfium2
+import logging
 import PIL
 
 
-# our mapping from (model_name, fast) to (repo_id, filename) for the huggingface hub
+logging.basicConfig(level=logging.INFO)
+
+
+# our mapping from (model_name_upper, fast) to (repo_id, filename) for the huggingface hub.
+# keeping it simple and declarative like this because it's not like we're adding a bunch
+# of models.
 models = {
     ("FFDNET-S", True): ("jbarrow/FFDNet-S-cpu", "FFDNet-S.onnx"),
     ("FFDNET-S", False): ("jbarrow/FFDNet-S", "FFDNet-S.pt"),
     ("FFDNET-L", True): ("jbarrow/FFDNet-L-cpu", "FFDNet-L.onnx"),
     ("FFDNET-L", False): ("jbarrow/FFDNet-L", "FFDNet-L.pt"),
-    ("FFDetr-Nano", False): ("./models/FFDetr-Nano", "checkpoint_best_ema.pth")
+    ("FFDETR", False): ("jbarrow/FFDetr", "FFDetr.pth")
 }
 
 
@@ -30,15 +36,25 @@ def batch(lst: list, n: int = 8):
         yield lst[ndx:min(ndx + n, l)]
 
 
-
 class FFDetrDetector:
     def __init__(
         self, model_or_path: str, device: int | str = "cpu"
     ) -> None:
         self.device = device
-        self.model = RFDETRMedium(pretrain_weights=model_or_path, resolution=224*5, num_classes=2)
+        self.model = RFDETRMedium(pretrain_weights=self.get_model_path(model_or_path))
+
+        self.id_to_cls = {0: "TextBox", 1: "ChoiceButton", 2: "Signature"}
+
+    def get_model_path(self, model_or_path: str) -> str:
+        model_upper = model_or_path.upper()
+        if model_upper in ["FFDETR"]:
+            # download the model, will just use the cached version if it already exists
+            repo_id, filename = models[(model_upper, False)] 
+            model_path = hf_hub_download(repo_id=repo_id, filename=filename) 
+        else:
+            model_path = model_or_path
 
-        self.id_to_cls = {0: "TextBox", 1: "ChoiceButton"}
+        return model_path
 
     def resize(
         self,
@@ -51,22 +67,26 @@ def resize(
         return image.resize(size, PIL.Image.Resampling.LANCZOS)
 
     def extract_widgets(
-        self, pages: list[Page], confidence: float = 0.2, image_size: int = 1120
+        self,
+        pages: list[Page],
+        confidence: float = 0.4,
+        image_size: int = 1120,
+        batch_size: int = 3,
     ) -> dict[int, list[Widget]]:
         image_size = 1024
         results = []
-        for b in batch([p.image for p in pages], n=1):
-            results += [self.model.predict(b, threshold=confidence)]
+        for b in batch([p.image for p in pages], n=batch_size):
+            predictions = self.model.predict(b, threshold=confidence)
+            if len(pages) == 1 or batch_size == 1:
+                predictions = [predictions]
+            results.extend(predictions)
 
         widgets = {}
 
-        if len(pages) == 1:
-            results = [results]
-
         for page_ix, detections in enumerate(results):
-            print(f"{page_ix}: {len(detections)} fields detected")
+            logging.info(f"  Page {page_ix}: {len(detections)} fields detected")
             detections = detections.with_nms(threshold=0.1, class_agnostic=True)
-            print(f"{len(detections)} after nms")
+            logging.info(f"\t\t{len(detections)} after nms")
             widgets[page_ix] = []
 
             for class_id, box in zip(detections.class_id, detections.xyxy):
@@ -217,7 +237,6 @@ def render_pdf(pdf_path: str) -> list[Page]:
     try:
         for page in doc:
             image = page.render(dpi=144)
-            print(image.width, image.height)
             pages.append(Page(image=image, width=image.width, height=image.height))
         return pages
     finally:
@@ -238,7 +257,7 @@ def prepare_form(
     multiline: bool = False,
 ):
     # detector = FFDNetDetector(model_or_path, device=device, fast=fast)
-    detector = FFDetrDetector("./models/FFDetr-Medium/checkpoint_best_ema.pth")
+    detector = FFDetrDetector("FFDetr")
 
     try:
         pages = render_pdf(input_path)

From 812ca5ccb1eb19e9b72d74344e36b03fc3d53869 Mon Sep 17 00:00:00 2001
From: Joe Barrow <joseph.d.barrow@gmail.com>
Date: Tue, 25 Nov 2025 08:42:31 -0500
Subject: [PATCH 3/5] test update to pyproject

---
 .github/workflows/ci.yml | 2 +-
 pyproject.toml           | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f83175f..5d5edaa 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -34,7 +34,7 @@ jobs:
             .venv
           key: uv-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml', 'uv.lock') }}
           restore-keys: |
-            uv-${{ runner.os }}-${{ matrix.python-version }}-
+            uv-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml', 'uv.lock') }}
 
       - name: Install (uv)
         run: |
diff --git a/pyproject.toml b/pyproject.toml
index 10b5d04..980f110 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
     "pydantic>=2.11.9",
     "pypdf>=6.1.1",
     "rfdetr>=1.3.0",
+    "transformers>=4.57"
     "ultralytics>=8.3.204",
 ]
 

From 4e6497d234e91999eb27f86ee0eb8ce8aaa9d4c0 Mon Sep 17 00:00:00 2001
From: Joe Barrow <joseph.d.barrow@gmail.com>
Date: Tue, 25 Nov 2025 08:43:22 -0500
Subject: [PATCH 4/5] test update to pyproject

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 980f110..bbca4b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ dependencies = [
     "pydantic>=2.11.9",
     "pypdf>=6.1.1",
     "rfdetr>=1.3.0",
-    "transformers>=4.57"
+    "transformers>=4.57",
     "ultralytics>=8.3.204",
 ]
 

From bf1ae45e8b046e3c5c0066dd077434aa9ff8caff Mon Sep 17 00:00:00 2001
From: Joe Barrow <joseph.d.barrow@gmail.com>
Date: Tue, 25 Nov 2025 21:00:33 -0500
Subject: [PATCH 5/5] add tests for ffdetr vs. ffdnet

---
 commonforms/inference.py | 36 +++++++++++++++++++-----------------
 tests/inference_test.py  | 28 ++++++++++++++++++++++------
 2 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/commonforms/inference.py b/commonforms/inference.py
index 38ef5ea..0d5ccc8 100644
--- a/commonforms/inference.py
+++ b/commonforms/inference.py
@@ -25,21 +25,18 @@
     ("FFDNET-S", False): ("jbarrow/FFDNet-S", "FFDNet-S.pt"),
     ("FFDNET-L", True): ("jbarrow/FFDNet-L-cpu", "FFDNet-L.onnx"),
     ("FFDNET-L", False): ("jbarrow/FFDNet-L", "FFDNet-L.pt"),
-    ("FFDETR", False): ("jbarrow/FFDetr", "FFDetr.pth")
+    ("FFDETR", False): ("jbarrow/FFDetr", "FFDetr.pth"),
 }
 
 
-
 def batch(lst: list, n: int = 8):
     l = len(lst)
     for ndx in range(0, l, n):
-        yield lst[ndx:min(ndx + n, l)]
+        yield lst[ndx : min(ndx + n, l)]
 
 
 class FFDetrDetector:
-    def __init__(
-        self, model_or_path: str, device: int | str = "cpu"
-    ) -> None:
+    def __init__(self, model_or_path: str, device: int | str = "cpu") -> None:
         self.device = device
         self.model = RFDETRMedium(pretrain_weights=self.get_model_path(model_or_path))
 
@@ -49,8 +46,8 @@ def get_model_path(self, model_or_path: str) -> str:
         model_upper = model_or_path.upper()
         if model_upper in ["FFDETR"]:
             # download the model, will just use the cached version if it already exists
-            repo_id, filename = models[(model_upper, False)] 
-            model_path = hf_hub_download(repo_id=repo_id, filename=filename) 
+            repo_id, filename = models[(model_upper, False)]
+            model_path = hf_hub_download(repo_id=repo_id, filename=filename)
         else:
             model_path = model_or_path
 
@@ -75,7 +72,7 @@ def extract_widgets(
     ) -> dict[int, list[Widget]]:
         image_size = 1024
         results = []
-        for b in batch([p.image for p in pages], n=batch_size):
+        for b in batch([self.resize(p.image, image_size) for p in pages], n=batch_size):
             predictions = self.model.predict(b, threshold=confidence)
             if len(pages) == 1 or batch_size == 1:
                 predictions = [predictions]
@@ -131,8 +128,8 @@ def get_model_path(
         model_upper = model_or_path.upper()
         if model_upper in ["FFDNET-S", "FFDNET-L"]:
             # download the model, will just use the cached version if it already exists
-            repo_id, filename = models[(model_upper, fast)] 
-            model_path = hf_hub_download(repo_id=repo_id, filename=filename) 
+            repo_id, filename = models[(model_upper, fast)]
+            model_path = hf_hub_download(repo_id=repo_id, filename=filename)
         else:
             model_path = model_or_path
 
@@ -247,17 +244,20 @@ def prepare_form(
     input_path: str | Path,
     output_path: str | Path,
     *,
-    model_or_path: str = "FFDNet-L",
+    model_or_path: str = "FFDetr",
     keep_existing_fields: bool = False,
     use_signature_fields: bool = False,
     device: int | str = "cpu",
-    image_size: int = 1600,
-    confidence: float = 0.3,
+    image_size: int = 1024,
+    confidence: float = 0.4,
     fast: bool = False,
     multiline: bool = False,
+    batch_size: int = 4,
 ):
-    # detector = FFDNetDetector(model_or_path, device=device, fast=fast)
-    detector = FFDetrDetector("FFDetr")
+    if "FFDNET" in model_or_path.upper():
+        detector = FFDNetDetector(model_or_path, device=device, fast=fast)
+    else:
+        detector = FFDetrDetector(model_or_path)
 
     try:
         pages = render_pdf(input_path)
@@ -277,7 +277,9 @@ def prepare_form(
             name = f"{widget.widget_type.lower()}_{widget.page}_{i}"
 
             if widget.widget_type == "TextBox":
-                writer.add_text_box(name, page_ix, widget.bounding_box, multiline=multiline)
+                writer.add_text_box(
+                    name, page_ix, widget.bounding_box, multiline=multiline
+                )
             elif widget.widget_type == "ChoiceButton":
                 writer.add_checkbox(name, page_ix, widget.bounding_box)
             elif widget.widget_type == "Signature":
diff --git a/tests/inference_test.py b/tests/inference_test.py
index 62b9474..5b8693b 100644
--- a/tests/inference_test.py
+++ b/tests/inference_test.py
@@ -8,7 +8,7 @@
 def test_inference(tmp_path):
     # tmp_path is a built-in pythest fixture where we'll write the outputs
     output_path = tmp_path / "output.pdf"
-    commonforms.prepare_form("./tests/resources/input.pdf", output_path)
+    commonforms.prepare_form("./tests/resources/input.pdf", output_path, model_or_path="FFDetr")
 
     assert output_path.exists()
 
@@ -20,7 +20,7 @@ def test_inference(tmp_path):
 
 def test_inference_fast(tmp_path):
     output_path = tmp_path / "output.pdf"
-    commonforms.prepare_form("./tests/resources/input.pdf", output_path, fast=True)
+    commonforms.prepare_form("./tests/resources/input.pdf", output_path, fast=True, model_or_path="FFDNet-L")
 
     assert output_path.exists()
 
@@ -32,7 +32,9 @@ def test_inference_fast(tmp_path):
 
 def test_mutlinline(tmp_path):
     output_path = tmp_path / "output.pdf"
-    commonforms.prepare_form("./tests/resources/input.pdf", output_path, fast=True, multiline=True)
+    commonforms.prepare_form(
+        "./tests/resources/input.pdf", output_path, fast=True, multiline=True
+    )
 
     assert output_path.exists()
 
@@ -42,7 +44,6 @@ def test_mutlinline(tmp_path):
     doc.document.close()
 
 
-
 def test_encrypted_failure(tmp_path):
     # Reminder to future Joe: password for encrypted PDF is "kanbanery"
     output_path = tmp_path / "output.pdf"
@@ -51,7 +52,22 @@ def test_encrypted_failure(tmp_path):
         commonforms.prepare_form("./tests/resources/encrypted.pdf", output_path)
 
 
+def test_inference_ffdetr(tmp_path):
+    # tmp_path is a built-in pytest fixture where we'll write the outputs
+    output_path = tmp_path / "output.pdf"
+    commonforms.prepare_form(
+        "./tests/resources/input.pdf", output_path, model_or_path="FFDetr"
+    )
+
+    assert output_path.exists()
+
+    doc = formalpdf.open(output_path)
+    assert len(doc[0].widgets()) > 0
+
+    doc.document.close()
+
+
 # TODO(joe): future tests around handling encrypted PDFs
 #   1. add a --password flag and test that inference doesn't fail
-#   2. if a password is provided, ensure that the _output_ PDF remains encrpyted 
-#      with the same password 
+#   2. if a password is provided, ensure that the _output_ PDF remains encrypted
+#      with the same password