From 57a239f9d4e5a804edacf99420219246a2e22fd8 Mon Sep 17 00:00:00 2001
From: SeanClay10 <zweihander555@gmail.com>
Date: Sun, 8 Feb 2026 13:48:48 -0800
Subject: [PATCH 1/4] Feat: Improve LLM module with regex matching and targeted
 search for predator-diet keywords.

---
 src/llm/local_llm.py                     | 297 ++++++++++++++++-------
 src/preprocessing/pdf_text_extraction.py |   2 +-
 2 files changed, 206 insertions(+), 93 deletions(-)

diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py
index 981ab46..67de232 100644
--- a/src/llm/local_llm.py
+++ b/src/llm/local_llm.py
@@ -1,19 +1,18 @@
-"""LLM-based metric extraction from preprocessed text files.
+"""Fixed LLM extraction with anti-hallucination measures.
 
-Usage:
-    python extract_metrics.py path/to/text_file.txt
-    python extract_metrics.py path/to/text_file.txt --model llama3.1:8b
-    python extract_metrics.py path/to/text_file.txt --output-dir results/
-
-This script uses Ollama to extract structured data from preprocessed predator diet
-surveys, including species name, study date, location, and stomach content data.
+This version:
+1. Fixes section extraction regex to actually capture content
+2. Falls back to the first 15000 characters when no keyword passages match
+3. Adds explicit anti-hallucination instructions
+4. Cross-checks LLM stomach counts against regex matches in the input text
 """
 
 import argparse
 import json
+import re
 import sys
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Dict
 
 from ollama import chat
 from pydantic import BaseModel, Field
@@ -30,66 +29,152 @@ class PredatorDietMetrics(BaseModel):
     sample_size: Optional[int] = Field(None, description="Total number of predators surveyed")
 
 
-def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> PredatorDietMetrics:
-    """Extract structured metrics from text using Ollama.
+def extract_stomach_counts_from_text(text: str) -> Dict[str, Optional[int]]:
+    """
+    Try to extract stomach counts from the text using regex patterns.
+    Any count the patterns do not find is left as None in the returned dict.
+    """
+    result = {"num_empty_stomachs": None, "num_nonempty_stomachs": None, "sample_size": None}
 
-    Args:
-        text: Preprocessed text content from a scientific publication
-        model: Name of the Ollama model to use
+    # Pattern 1: "X stomachs were found empty"
+    empty_pattern = r'(\d+)\s+stomachs?\s+(?:were\s+)?(?:found\s+)?empty'
+    empty_match = re.search(empty_pattern, text, re.IGNORECASE)
+    if empty_match:
+        result["num_empty_stomachs"] = int(empty_match.group(1))
 
-    Returns:
-        PredatorDietMetrics object with extracted data
-    """
-    prompt = f"""You are a scientific data extraction assistant specializing in predator diet surveys.
-
-Extract specific metrics from the text below. Focus on stomach content data where:
-- EMPTY stomachs = no food/prey
-- NON-EMPTY stomachs = contained food/prey
-- SAMPLE SIZE = total number of predators examined
-
-KEY INFORMATION TO FIND:
-- Species names are in Latin format (Genus species)
-- Look in tables, methods, and results sections
-- Empty stomachs: "empty", "vacant", "no prey"
-- Non-empty stomachs: "with prey", "fed", "containing food"
-
-EXTRACT:
-- species_name: Scientific name of PRIMARY predator studied (not prey)
-- study_location: Geographic location of sampling
-- study_date: Year or date range of collection
-- num_empty_stomachs: Number with empty stomachs
-- num_nonempty_stomachs: Number with food in stomachs
-- sample_size: Total number examined
-
-
-TEXT:
-{text}
+    # Pattern 2: "total of X stomachs", or "X stomachs" at the start of a line or after a period
+    total_pattern = r'(?:total\s+of\s+|^|\.\s+)(\d+)\s+stomachs'
+    total_match = re.search(total_pattern, text, re.IGNORECASE | re.MULTILINE)
+    if total_match:
+        result["sample_size"] = int(total_match.group(1))
+
+    # Calculate non-empty if we have both
+    if result["num_empty_stomachs"] and result["sample_size"]:
+        result["num_nonempty_stomachs"] = result["sample_size"] - result["num_empty_stomachs"]
+
+    return result
+
+
+def extract_metadata_with_search(text: str, model: str) -> Dict[str, Optional[str]]:
+    """Extract metadata in full text."""
+
+    # Get first 12000 chars which should include title, abstract, intro, methods
+    context = text[:12000]
+
+    prompt = f"""Extract metadata from this scientific paper about predator diet.
+
+**CRITICAL: You MUST extract information ONLY from the text provided below. If you cannot find something in the text, return null.**
+
+Find these 3 fields:
+
+1. species_name: The Latin name (Genus species) of the PREDATOR being studied
+   - Look in the title or first paragraph
+   - Format: "Genus species" (e.g., "Martes foina", not "stone marten")
+   
+2. study_location: Where specimens were collected
+   - Country, region, or coordinates
+   - Example: "Central Greece" or "38°44'N, 22°02'E"
+   
+3. study_date: When specimens were collected  
+   - Year or range: "2005" or "2003-2006"
+   - Look for phrases like "collected between", "during", "from...to"
+
+TEXT TO ANALYZE:
+{context}
+
+Remember: Extract ONLY from this text. If not clearly stated, return null.
 """
-    # Ollama call with structured schema output
-    response = chat(
-        messages=[
-            {
-                'role': 'user',
-                'content': prompt,
-            }
-        ],
-        model=model,
-        format=PredatorDietMetrics.model_json_schema(),
-    )
-
-    metrics = PredatorDietMetrics.model_validate_json(response.message.content)
-    return metrics
 
+    try:
+        response = chat(
+            messages=[{'role': 'user', 'content': prompt}],
+            model=model,
+            format={
+                "type": "object",
+                "properties": {"species_name": {"type": ["string", "null"]}, "study_location": {"type": ["string", "null"]}, "study_date": {"type": ["string", "null"]}},
+                "required": ["species_name", "study_location", "study_date"],
+            },
+        )
+
+        return json.loads(response.message.content)
+    except Exception as e:
+        print(f"[ERROR] Metadata extraction failed: {e}", file=sys.stderr)
+        return {"species_name": None, "study_location": None, "study_date": None}
 
-def validate_and_calculate(metrics: dict) -> dict:
-    """Validate extracted metrics and calculate derived values.
 
-    Args:
-        metrics: Dictionary of extracted metrics
+def extract_stomach_data_with_search(text: str, model: str) -> Dict[str, Optional[int]]:
+    """Extract stomach content counts with targeted search."""
 
-    Returns:
-        Dictionary with validated metrics and calculated fraction_feeding
-    """
+    # First try regex extraction as truth
+    regex_result = extract_stomach_counts_from_text(text)
+    print(f"[INFO] Regex found: {regex_result}", file=sys.stderr)
+
+    # Search for sections with stomach/empty keywords
+    stomach_pattern = r'.{0,800}(?:stomachs?|empty|sample\s+size|n\s*=).{0,800}'
+    matches = re.findall(stomach_pattern, text, re.IGNORECASE)
+    context = '\n\n---\n\n'.join(matches[:15])  # Up to 15 relevant passages
+
+    if not context:
+        context = text[:15000]  # Fallback to first part
+
+    prompt = f"""Extract stomach content counts from this predator diet study.
+
+**CRITICAL: Extract ONLY numbers that appear in the text below. DO NOT invent numbers.**
+
+Find these 3 numbers:
+
+1. num_empty_stomachs: How many predators had EMPTY stomachs
+   - Keywords: "empty", "vacant", "no prey", "unfed"
+   
+2. num_nonempty_stomachs: How many had NON-EMPTY stomachs (contained food)
+   - May need to calculate: total - empty = non-empty
+   
+3. sample_size: Total number of predators examined
+   - Keywords: "total", "n =", "sample size"
+
+**VALIDATION**: 
+- empty + non-empty should equal sample_size
+- If text says "14 stomachs were found empty" and "106 stomachs", then:
+  num_empty_stomachs = 14
+  num_nonempty_stomachs = 92 (calculated: 106 - 14)
+  sample_size = 106
+
+TEXT TO ANALYZE:
+{context}
+
+Extract the numbers. If a number is not stated or calculable, return null for that field.
+"""
+
+    try:
+        response = chat(
+            messages=[{'role': 'user', 'content': prompt}],
+            model=model,
+            format={
+                "type": "object",
+                "properties": {"num_empty_stomachs": {"type": ["integer", "null"]}, "num_nonempty_stomachs": {"type": ["integer", "null"]}, "sample_size": {"type": ["integer", "null"]}},
+                "required": ["num_empty_stomachs", "num_nonempty_stomachs", "sample_size"],
+            },
+        )
+
+        llm_result = json.loads(response.message.content)
+
+        # Validate: prefer regex if it found values and they differ from LLM
+        if regex_result["num_empty_stomachs"] and llm_result.get("num_empty_stomachs") != regex_result["num_empty_stomachs"]:
+            llm_result["num_empty_stomachs"] = regex_result["num_empty_stomachs"]
+
+        if regex_result["sample_size"] and llm_result.get("sample_size") != regex_result["sample_size"]:
+            llm_result["sample_size"] = regex_result["sample_size"]
+
+        return llm_result
+
+    except Exception as e:
+        print(f"[ERROR] Stomach data extraction failed: {e}", file=sys.stderr)
+        # Fall back to regex result if LLM fails
+        return regex_result if any(regex_result.values()) else {"num_empty_stomachs": None, "num_nonempty_stomachs": None, "sample_size": None}
+
+
+def validate_and_calculate(metrics: dict) -> dict:
+    """Validate extracted metrics and calculate derived values."""
     empty = metrics.get("num_empty_stomachs")
     nonempty = metrics.get("num_nonempty_stomachs")
     sample = metrics.get("sample_size")
@@ -98,10 +183,11 @@ def validate_and_calculate(metrics: dict) -> dict:
     if empty is not None and nonempty is not None:
         calculated_sample = empty + nonempty
         if sample is None:
+            print(f"[INFO] Calculated sample_size: {calculated_sample}", file=sys.stderr)
             metrics["sample_size"] = calculated_sample
             sample = calculated_sample
         elif sample != calculated_sample:
-            # LLM made an error, use calculated value
+            print(f"[WARN] Sample size mismatch: stated={sample}, calculated={calculated_sample}. Using calculated.", file=sys.stderr)
             metrics["sample_size"] = calculated_sample
             sample = calculated_sample
 
@@ -112,55 +198,82 @@ def validate_and_calculate(metrics: dict) -> dict:
 
     metrics["fraction_feeding"] = fraction_feeding
 
+    # Report completeness
+    null_count = sum(1 for k, v in metrics.items() if v is None and k != "fraction_feeding")
+    total_fields = 6
+    print(f"[INFO] Completeness: {total_fields - null_count}/{total_fields} fields filled", file=sys.stderr)
+
     return metrics
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Extract predator diet metrics from preprocessed text using LLM")
-    parser.add_argument("text_file", type=str, help="Path to the preprocessed text file")
-    parser.add_argument("--model", type=str, default="llama3.1:8b", help="Ollama model to use (default: llama3.1:8b)")
-    parser.add_argument("--output-dir", type=str, default="data/results", help="Output directory for JSON results (default: data/results)")
+    parser = argparse.ArgumentParser(description="Extract predator diet metrics from PDF (fixed version)")
+    parser.add_argument("pdf", type=str, help="Path to the PDF file")
+    parser.add_argument("--model", type=str, default="llama3.1:8b", help="Ollama model to use")
+    parser.add_argument("--output-dir", type=str, default="data/results", help="Output directory")
 
     args = parser.parse_args()
 
-    # Load text file
-    text_path = Path(args.text_file)
-    if not text_path.exists():
-        print(f"[ERROR] File not found: {text_path}", file=sys.stderr)
+    pdf_path = Path(args.pdf)
+    if not pdf_path.exists():
+        print(f"[ERROR] PDF not found: {pdf_path}", file=sys.stderr)
         sys.exit(1)
 
+    # Extract text from PDF
+    print(f"[INFO] Extracting text from: {pdf_path.name}", file=sys.stderr)
     try:
-        with open(text_path, "r", encoding="utf-8") as f:
-            text = f.read()
-    except Exception as e:
-        print(f"[ERROR] Failed to read file: {e}", file=sys.stderr)
-        sys.exit(1)
+        src_path = Path(__file__).resolve().parent.parent
+        if str(src_path) not in sys.path:
+            sys.path.insert(0, str(src_path))
+
+        from preprocessing.pdf_text_extraction import extract_text_from_pdf
+
+        text = extract_text_from_pdf(str(pdf_path))
+
+        if not text.strip():
+            print("[ERROR] No text extracted", file=sys.stderr)
+            sys.exit(1)
+
+        print(f"[INFO] Extracted {len(text)} characters", file=sys.stderr)
 
-    # Extract metrics
-    print(f"Extracting metrics from {text_path.name}...", file=sys.stderr)
-    try:
-        metrics = extract_metrics_from_text(text, model=args.model)
     except Exception as e:
-        print(f"[ERROR] Extraction failed: {e}", file=sys.stderr)
+        print(f"[ERROR] Text extraction failed: {e}", file=sys.stderr)
         sys.exit(1)
 
-    # Validate and calculate derived metrics
-    metrics_dict = metrics.model_dump()
+    # Extract metrics
+    print(f"\n{'='*60}", file=sys.stderr)
+    print(f"Extracting from: {pdf_path.name}", file=sys.stderr)
+    print(f"Model: {args.model}", file=sys.stderr)
+    print(f"{'='*60}\n", file=sys.stderr)
+
+    print("[1/2] Extracting metadata...", file=sys.stderr)
+    metadata = extract_metadata_with_search(text, args.model)
+    print(f"      Species: {metadata.get('species_name')}", file=sys.stderr)
+    print(f"      Location: {metadata.get('study_location')}", file=sys.stderr)
+    print(f"      Date: {metadata.get('study_date')}", file=sys.stderr)
+
+    print("\n[2/2] Extracting stomach data...", file=sys.stderr)
+    stomach_data = extract_stomach_data_with_search(text, args.model)
+    print(f"      Empty: {stomach_data.get('num_empty_stomachs')}", file=sys.stderr)
+    print(f"      Non-empty: {stomach_data.get('num_nonempty_stomachs')}", file=sys.stderr)
+    print(f"      Total: {stomach_data.get('sample_size')}", file=sys.stderr)
+
+    # Combine and validate
+    metrics_dict = {**metadata, **stomach_data}
     metrics_dict = validate_and_calculate(metrics_dict)
 
-    # Prepare output
-    result = {"source_file": text_path.name, "metrics": metrics_dict}
-
-    # Generate output filename: input_name_results.json
-    output_filename = text_path.stem + "_results.json"
-    output_path = Path(args.output_dir) / output_filename
-
     # Save results
+    result = {"source_file": pdf_path.name, "model_used": args.model, "metrics": metrics_dict}
+
+    output_path = Path(args.output_dir) / f"{pdf_path.stem}_results.json"
     output_path.parent.mkdir(parents=True, exist_ok=True)
+
     with open(output_path, "w", encoding="utf-8") as f:
-        json.dump(result, f, indent=2)
+        json.dump(result, f, indent=2, ensure_ascii=False)
 
-    print(f"Results saved to {output_path}", file=sys.stderr)
+    print(f"\n{'='*60}", file=sys.stderr)
+    print(f"Results saved to: {output_path}", file=sys.stderr)
+    print(f"{'='*60}\n", file=sys.stderr)
 
 
 if __name__ == "__main__":
diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py
index a51ee2c..27582b9 100644
--- a/src/preprocessing/pdf_text_extraction.py
+++ b/src/preprocessing/pdf_text_extraction.py
@@ -196,7 +196,7 @@ def save_to_file(text: str, output_path: str):
 def main():
     parser = argparse.ArgumentParser(description="Extract text and tables from PDF using PyMuPDF and camelot-py.")
     parser.add_argument("pdf", type=str, help="Path to the input PDF file.")
-    parser.add_argument("--output-dir", type=str, default="data/processed-text", help="Output directory for extracted text (default: data/processed-text)")
+    parser.add_argument("--output-dir", type=str, default="data/processed-tables", help="Output directory for extracted text (default: data/processed-tables)")
 
     args = parser.parse_args()
 

From c8d4b794b1039595df572d3f8117883ba6c59f7a Mon Sep 17 00:00:00 2001
From: SeanClay10 <zweihander555@gmail.com>
Date: Mon, 9 Feb 2026 08:57:17 -0800
Subject: [PATCH 2/4] Feat: Add extracted PDF tables to LLM prompts.

---
 src/llm/local_llm.py | 61 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 53 insertions(+), 8 deletions(-)

diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py
index 67de232..7c20298 100644
--- a/src/llm/local_llm.py
+++ b/src/llm/local_llm.py
@@ -12,7 +12,7 @@
 import re
 import sys
 from pathlib import Path
-from typing import Optional, Dict
+from typing import Optional, Dict, List
 
 from ollama import chat
 from pydantic import BaseModel, Field
@@ -54,13 +54,42 @@ def extract_stomach_counts_from_text(text: str) -> Dict[str, Optional[int]]:
 
     return result
 
-
-def extract_metadata_with_search(text: str, model: str) -> Dict[str, Optional[str]]:
+def format_tables_for_llm(tables: List[Dict]) -> str:
+    """Format extracted tables into readable text for LLM."""
+    if not tables:
+        return ""
+    
+    formatted = []
+    for table in tables:
+        if "error" in table:
+            continue
+            
+        cells = table.get("cells", [])
+        if not cells:
+            continue
+        
+        # Format as markdown-style table
+        table_text = f"\n--- {table['table_id']} (Page {table['page_number']}) ---\n"
+        
+        for row in cells:
+            # Clean and join cells with | separator
+            row_text = " | ".join(str(cell).strip() if cell else "" for cell in row)
+            table_text += row_text + "\n"
+        
+        formatted.append(table_text)
+    
+    return "\n".join(formatted)
+
+def extract_metadata_with_search(text: str, tables_text: str, model: str) -> Dict[str, Optional[str]]:
     """Extract metadata in full text."""
 
     # Get first 12000 chars which should include title, abstract, intro, methods
     context = text[:12000]
 
+    # Add tables if available
+    if tables_text:
+        context = f"TABLES:\n{tables_text}\n\nTEXT:\n{context}"
+
     prompt = f"""Extract metadata from this scientific paper about predator diet.
 
 **CRITICAL: You MUST extract information ONLY from the text provided below. If you cannot find something in the text, return null.**
@@ -102,7 +131,7 @@ def extract_metadata_with_search(text: str, model: str) -> Dict[str, Optional[st
         return {"species_name": None, "study_location": None, "study_date": None}
 
 
-def extract_stomach_data_with_search(text: str, model: str) -> Dict[str, Optional[int]]:
+def extract_stomach_data_with_search(text: str, tables_text: str, model: str) -> Dict[str, Optional[int]]:
     """Extract stomach content counts with targeted search."""
 
     # First try regex extraction as truth
@@ -117,6 +146,10 @@ def extract_stomach_data_with_search(text: str, model: str) -> Dict[str, Optiona
     if not context:
         context = text[:15000]  # Fallback to first part
 
+    # Prepend tables if available
+    if tables_text:
+        context = f"TABLES:\n{tables_text}\n\nTEXT:\n{context}"
+
     prompt = f"""Extract stomach content counts from this predator diet study.
 
 **CRITICAL: Extract ONLY numbers that appear in the text below. DO NOT invent numbers.**
@@ -226,7 +259,7 @@ def main():
         if str(src_path) not in sys.path:
             sys.path.insert(0, str(src_path))
 
-        from preprocessing.pdf_text_extraction import extract_text_from_pdf
+        from preprocessing.pdf_text_extraction import extract_text_from_pdf, extract_tables_from_pdf
 
         text = extract_text_from_pdf(str(pdf_path))
 
@@ -236,6 +269,13 @@ def main():
 
         print(f"[INFO] Extracted {len(text)} characters", file=sys.stderr)
 
+        # Extract tables
+        tables = extract_tables_from_pdf(str(pdf_path))
+        print(f"[INFO] Extracted {len(tables)} tables", file=sys.stderr)
+        
+        # Format tables for LLM
+        tables_text = format_tables_for_llm(tables)
+
     except Exception as e:
         print(f"[ERROR] Text extraction failed: {e}", file=sys.stderr)
         sys.exit(1)
@@ -247,13 +287,13 @@ def main():
     print(f"{'='*60}\n", file=sys.stderr)
 
     print("[1/2] Extracting metadata...", file=sys.stderr)
-    metadata = extract_metadata_with_search(text, args.model)
+    metadata = extract_metadata_with_search(text, tables_text, args.model)
     print(f"      Species: {metadata.get('species_name')}", file=sys.stderr)
     print(f"      Location: {metadata.get('study_location')}", file=sys.stderr)
     print(f"      Date: {metadata.get('study_date')}", file=sys.stderr)
 
     print("\n[2/2] Extracting stomach data...", file=sys.stderr)
-    stomach_data = extract_stomach_data_with_search(text, args.model)
+    stomach_data = extract_stomach_data_with_search(text, tables_text, args.model)
     print(f"      Empty: {stomach_data.get('num_empty_stomachs')}", file=sys.stderr)
     print(f"      Non-empty: {stomach_data.get('num_nonempty_stomachs')}", file=sys.stderr)
     print(f"      Total: {stomach_data.get('sample_size')}", file=sys.stderr)
@@ -263,7 +303,12 @@ def main():
     metrics_dict = validate_and_calculate(metrics_dict)
 
     # Save results
-    result = {"source_file": pdf_path.name, "model_used": args.model, "metrics": metrics_dict}
+    result = {
+        "source_file": pdf_path.name,
+        "model_used": args.model,
+        "metrics": metrics_dict,
+        "tables_found": len(tables)
+    }
 
     output_path = Path(args.output_dir) / f"{pdf_path.stem}_results.json"
     output_path.parent.mkdir(parents=True, exist_ok=True)

From 81de1e657e8083fd2b83b5c5f1b9d8fb235bbe0d Mon Sep 17 00:00:00 2001
From: SeanClay10 <zweihander555@gmail.com>
Date: Mon, 9 Feb 2026 09:03:54 -0800
Subject: [PATCH 3/4] Remove unnecessary tables_found field from JSON output.

---
 src/llm/local_llm.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py
index 7c20298..da460d7 100644
--- a/src/llm/local_llm.py
+++ b/src/llm/local_llm.py
@@ -306,8 +306,7 @@ def main():
     result = {
         "source_file": pdf_path.name,
         "model_used": args.model,
-        "metrics": metrics_dict,
-        "tables_found": len(tables)
+        "metrics": metrics_dict
     }
 
     output_path = Path(args.output_dir) / f"{pdf_path.stem}_results.json"

From 1b8bbb4e3ce51bfdcc362a294f3f5a14dde799b2 Mon Sep 17 00:00:00 2001
From: SeanClay10 <zweihander555@gmail.com>
Date: Mon, 9 Feb 2026 09:19:25 -0800
Subject: [PATCH 4/4] Fix: Linting.

---
 src/llm/local_llm.py | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py
index da460d7..cfd5d33 100644
--- a/src/llm/local_llm.py
+++ b/src/llm/local_llm.py
@@ -54,32 +54,34 @@ def extract_stomach_counts_from_text(text: str) -> Dict[str, Optional[int]]:
 
     return result
 
+
 def format_tables_for_llm(tables: List[Dict]) -> str:
     """Format extracted tables into readable text for LLM."""
     if not tables:
         return ""
-    
+
     formatted = []
     for table in tables:
         if "error" in table:
             continue
-            
+
         cells = table.get("cells", [])
         if not cells:
             continue
-        
+
         # Format as markdown-style table
         table_text = f"\n--- {table['table_id']} (Page {table['page_number']}) ---\n"
-        
+
         for row in cells:
             # Clean and join cells with | separator
             row_text = " | ".join(str(cell).strip() if cell else "" for cell in row)
             table_text += row_text + "\n"
-        
+
         formatted.append(table_text)
-    
+
     return "\n".join(formatted)
 
+
 def extract_metadata_with_search(text: str, tables_text: str, model: str) -> Dict[str, Optional[str]]:
     """Extract metadata in full text."""
 
@@ -272,7 +274,7 @@ def main():
         # Extract tables
         tables = extract_tables_from_pdf(str(pdf_path))
         print(f"[INFO] Extracted {len(tables)} tables", file=sys.stderr)
-        
+
         # Format tables for LLM
         tables_text = format_tables_for_llm(tables)
 
@@ -303,11 +305,7 @@ def main():
     metrics_dict = validate_and_calculate(metrics_dict)
 
     # Save results
-    result = {
-        "source_file": pdf_path.name,
-        "model_used": args.model,
-        "metrics": metrics_dict
-    }
+    result = {"source_file": pdf_path.name, "model_used": args.model, "metrics": metrics_dict}
 
     output_path = Path(args.output_dir) / f"{pdf_path.stem}_results.json"
     output_path.parent.mkdir(parents=True, exist_ok=True)