diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..3eab283 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,33 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e ".[dev]" + + - name: Lint with ruff + run: ruff check datascope/ tests/ + + - name: Run tests + run: pytest tests/ -q diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..e751de0 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,30 @@ +name: Publish to PyPI + +on: + push: + tags: + - "v*" + +jobs: + publish: + runs-on: ubuntu-latest + environment: pypi + permissions: + id-token: write + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install build tools + run: python -m pip install --upgrade pip build + + - name: Build package + run: python -m build + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/AUDIT.md b/AUDIT.md new file mode 100644 index 0000000..0af61bd --- /dev/null +++ b/AUDIT.md @@ -0,0 +1,333 @@ +# datascope — Project Audit + +Generated: 2026-05-15 + +--- + +## Phase 1: Baseline Assessment + +### What It Is Today + +**datascope v2.0.0** — a Python CLI tool that analyzes Excel/CSV files for hidden data quality issues and produces professional PDF diagnostic reports in plain English. + +**Core insight:** Most tools let pandas silently coerce types (485 numbers + 15 strings → all float64, strings become NaN). 
datascope reads each cell's actual Python type, detects quality issues, and explains them as "assumption vs. reality" findings for non-technical readers. + +### By the Numbers + +| Metric | Value | +|--------|-------| +| Production code | ~1,620 LOC across 15 files | +| Test code | ~3,450 LOC, 265 test cases | +| Test:code ratio | 2.1:1 | +| Dependencies | 4 runtime (pandas, openpyxl, reportlab, numpy) | +| Python target | 3.10+ | +| Git commits | 23 over 62 days | +| Open issues/PRs | 0 / 0 | +| License | MIT (Shawn, Lailara LLC) | + +### Architecture (5 layers) + +``` +INPUT (Excel/CSV) + → Loaders (235 LOC) — cell-level type preservation, no silent coercion + → Analyzers (633 LOC) — 5 detectors: type consistency, sentinels, leading zeros, mixed dates, cardinality + → Findings (239 LOC) — severity classification + plain-English template composition + → Reports (605 LOC) — professional PDF via reportlab, color-coded severity cards + → CLI (205 LOC) — argparse orchestration, stdout summary +OUTPUT (PDF + stdout) +``` + +### What's Strong + +1. **Architecture** — clean layer separation, extensible Finding data model, no monolith +2. **Test coverage** — 265 tests at 2.1:1 ratio; unit + integration + CLI tests +3. **Documentation** — portfolio-grade README, brainstorm/plan/learning docs in docs/ +4. **Code quality** — type hints throughout, ruff configured, zero TODO/FIXME/HACK comments +5. **Product thinking** — "assumption vs. reality" framing, non-technical audience focus, severity by downstream impact +6. **Packaging** — proper pyproject.toml, entry point, editable install + +### What's Missing or Weak + +1. **No CI/CD** — no GitHub Actions; quality gates are manual +2. **No CHANGELOG** — v1→v2 was a major rewrite with no migration record +3. **Legacy scorer.py** — v1 monolith (~31KB) still in root, not archived +4. **Single output format** — PDF only; no Excel/HTML/JSON export +5. **Limited input sources** — Excel and CSV only; no database, no Parquet, no API +6. 
**Stale README clone URL** — README git clone URL still says `field-story-scorer.git`, but the repo has been renamed to `datascope` +7. **No PyPI publishing** — local install only +8. **generate_sample.py in root** — utility script not in tools/ folder +9. **tools/render_strict_mode_comparison.py** — references v1 strict mode concept, may be stale + +### Project Identity + +- **Repo:** MsShawnP/datascope (renamed from field-story-scorer) +- **Audience:** Data consultants, developers, business analysts +- **Differentiator:** Cell-level type detection + plain-English diagnostic reports +- **Stage:** v2.0 shipped 2026-05-14, no users yet beyond author + +--- + +## Phase 2: Internal Review + +Five parallel reviews: architecture, testing, performance, security, UX/docs. Findings ranked by leverage — what moves the needle most for the least effort. + +### Tier 1 — High Leverage (fix before promoting the project) + +| # | Finding | Dimension | Why It Matters | +|---|---------|-----------|----------------| +| 1 | **README clone URL points to old repo name** `field-story-scorer.git` | UX | Every new user hits this immediately; the `cd` command also fails | +| 2 | **`samples/README.md` is entirely about v1** — references scorer.py, --strict-types, scoring numbers | UX | New users exploring samples/ get a completely misleading picture | +| 3 | **Missing `defusedxml` dependency** — openpyxl uses stdlib XML parser without it, exposing XML bomb/XXE risk | Security | One-line fix (`defusedxml>=0.7.0`) that closes a real attack vector for a tool that processes untrusted files | +| 4 | **Legacy `scorer.py` (791 LOC) still in root** — confusing, weaker security posture, imported by tools/ | Architecture | Confuses contributors, duplicates entry points, has unescaped user input in reportlab | +| 5 | **`generate_sample.py` references v1 CLI** — prints `python scorer.py --input ...` | UX | Unusable with v2; generates misleading instructions | +| 6 | **Backtick column names render as literal 
backticks in PDF** | UX | Every finding card in every report has this cosmetic defect — not "plain English" | +| 7 | **Mixed-dates template newlines ignored in PDF** — `\n`-joined list renders as run-on text | UX | Date format breakdown is unreadable in the actual report | + +### Tier 2 — Medium Leverage (meaningful improvements) + +| # | Finding | Dimension | Why It Matters | +|---|---------|-----------|----------------| +| 8 | **FindingType sub-types dispatched via evidence-key sniffing** — 6 places check magic dict keys | Architecture | Adding a new sub-type requires finding and updating all 6 locations; promote sub-types to first-class enum values | +| 9 | **No CI/CD** — no GitHub Actions, no automated test/lint gates | DevEx | Quality gates are entirely manual; one workflow file closes this | +| 10 | **Full materialization defeats openpyxl read_only=True** — `list(ws.iter_rows())` loads everything at once | Performance | 1M-row file → ~2.4GB RAM; the streaming flag is wasted | +| 11 | **CSV loads entire file into memory twice** — `list(reader)` + `inferred_rows` list | Performance | Same memory problem, compounded by the type inference copy | +| 12 | **CSV datetime inference brute-forces `strptime`** — up to 7 failed `strptime` calls per non-date cell (O(n × 7) calls overall) | Performance | 1M text-string cells → 7M failed strptime calls; regex pre-filter would cut this 10x | +| 13 | **No `--json` / `--format` output flag** | UX | Blocks pipeline integration, CI/CD usage, programmatic consumption | +| 14 | **Analyzer failures swallowed with one-line warning** — no traceback, can produce false-negative reports | UX | A "No issues detected" report when an analyzer actually crashed is dangerous | +| 15 | **No page numbers or running header in PDF** | UX | Multi-page professional deliverable without pagination | +| 16 | **`source_metadata` is untyped `dict[str, Any]`** — keys established by convention across 3 files | Architecture | Adding a new output format requires guessing which keys exist | +| 17 | 
**`normalize_type` creates cross-analyzer coupling** — sentinel.py and format_check.py import from type_consistency.py | Architecture | Extract to shared utility in analyzers/base.py | +| 18 | **Dependency version ranges fully open, no lock file** | Security | No reproducible builds; `pip audit` not configured | + +### Tier 3 — Polish (lower leverage but worth noting) + +| # | Finding | Dimension | Why It Matters | +|---|---------|-----------|----------------| +| 19 | **`cell_types` stores one `type` reference per cell** — near-doubles memory vs DataFrame | Performance | Could use run-length encoding or type codes instead | +| 20 | **CLI analyzer failure error path untested** (cli.py:184-187) | Testing | The only error-resilience mechanism in the pipeline has zero test coverage | +| 21 | **CSV `_infer_cell` never tested in isolation** — 6 inference branches, no direct unit tests | Testing | Leading-zero preservation, the tool's differentiating feature, is tested only indirectly | +| 22 | **Composer fallback branches untested** — 3 default cases in template dispatch | Testing | Silent wrong-template selection if a new sub-type is added | +| 23 | **No `--quiet` / `--verbose` flags** | UX | Blocks scripting (quiet) and debugging (verbose/traceback) | +| 24 | **`requirements.txt` duplicates `pyproject.toml`** — invites version drift | DevEx | Use only pyproject.toml; generate requirements.txt if needed | +| 25 | **`pyproject.toml` missing `authors`, `urls`, `readme` fields** | DevEx | Needed for PyPI publishing | +| 26 | **No mypy/pyright configuration** — type hints are documentation-only | DevEx | Extensive type hints exist but are never verified | +| 27 | **`Analyzer` type alias defined but never imported or used** | Architecture | Dead code in analyzers/base.py | +| 28 | **`numpy` listed as dependency but unused by v2 code** | Architecture | Only used by legacy scorer.py and generate_sample.py | +| 29 | **`--sheet` silently ignored for CSV files** | UX | No warning 
when the flag has no effect | +| 30 | **PDF health assessment doesn't mention total finding count** | UX | 25 info findings → "only informational" with no sense of volume | + +### Cross-Cutting Themes + +1. **v1 → v2 cleanup is incomplete.** scorer.py, generate_sample.py, samples/README.md, tools/render_strict_mode_comparison.py, and the README clone URL all reference v1 concepts. This is the single highest-leverage batch of fixes. + +2. **The architecture is sound but has one structural weakness.** The evidence-key sniffing pattern for sub-type dispatch (FORMAT_INCONSISTENCY and CARDINALITY_ANOMALY) creates a hidden coupling between analyzers and the findings layer. Promoting sub-types to first-class enum values eliminates this. + +3. **Performance is fine for the current audience (<10K rows) but has a hard wall.** Full materialization + cell_types doubling + strptime brute-force means the tool falls over around 100K rows. If the target audience ever includes "production data pipeline" users, this needs a streaming rewrite. + +4. **Test coverage is genuinely strong (2.1:1 ratio, 265 tests) but has specific blind spots.** The untested paths are exactly the defensive/error-handling code that matters most when things go wrong: analyzer failures, CSV type inference edge cases, composer fallbacks, PDF health assessment branches. + +5. **The PDF report has two rendering bugs** (backtick literals, newline collapse) that affect every report generated. These are quick fixes with high visible impact. 
+ +--- + +## Phase 3: Landscape Scan + +### Competitive Set (10 tools) + +| Tool | Stars | Type | Input | Output | Type Detection | Audience | Pricing | +|------|-------|------|-------|--------|----------------|----------|---------| +| **ydata-profiling** | 13.6k | Library | DataFrame | HTML | Column-level inference | Data scientists | Free | +| **Great Expectations** | 11.5k | Framework | DataFrame/SQL | HTML "Data Docs" | Rule-based (expectations) | Data engineers | Free + Cloud | +| **Pandera** | 4.3k | Library | DataFrame | Exceptions (no report) | Schema-as-code | Engineers | Free | +| **SweetViz** | 3.1k | Library | DataFrame | HTML | Column-level | DS/ML | Free | +| **whylogs** | 2.8k | Library | DataFrame | JSON profiles | Column-level stats | ML engineers | Freemium | +| **Soda Core** | 2.3k | CLI | SQL/databases | Pass/fail + Cloud UI | YAML check rules | Data engineers | Free + $750/mo Cloud | +| **DataPrep** | 2.2k | Library | DataFrame | HTML | Column-level (Dask) | Data scientists | Free | +| **Pointblank** | <1k | Library | DataFrame/SQL | HTML tables | Threshold-based validation | Analysts (newer) | Free | +| **DataProfiler** | 1.6k | Library | CSV/JSON/Parquet | JSON | Column-level + PII | Data/security analysts | Free | +| **Deepchecks** | — | Library | DataFrame | HTML | Column-ratio mixed-type check | ML engineers | Freemium | + +### Feature Matrix — Where datascope Sits + +| Capability | datascope | ydata-profiling | Great Expectations | Pandera | Soda Core | Pointblank | +|------------|:---------:|:---------------:|:------------------:|:-------:|:---------:|:----------:| +| **Cell-level type detection** | **Yes** | No | No | No | No | No | +| **Excel-native reading** (openpyxl, no pandas coercion) | **Yes** | No | No | No | No | No | +| **Plain-English narrative** | **Yes** | No | Partial (Data Docs) | No | No | Partial | +| **PDF report output** | **Yes** | No | No | No | No | No | +| **Zero-config CLI** (file in → report out) | **Yes** | No 
(1-liner but library) | No (expectations required) | No (schema required) | No (YAML required) | No (code required) | +| **CSV support** | Yes | Yes (via pandas) | Yes (via pandas) | Yes | Yes (via SQL) | Yes | +| **Statistical profiling** | No | **Yes** | No | No | No | No | +| **Custom validation rules** | No | No | **Yes** | **Yes** | **Yes** | **Yes** | +| **Pipeline integration** | No | Yes | **Yes** | **Yes** | **Yes** | Yes | +| **Database support** | No | No | Yes | No | **Yes** | Yes | +| **Parquet/Arrow support** | No | Yes | Yes | Yes | Yes | Yes | +| **JSON/machine-readable output** | No | Yes | Yes | Yes | Yes | Yes | +| **Large file performance** | Weak (>100K rows) | Weak | Good | **Good** | Good | Good | +| **Polars support** | No | No | No | Yes | No | **Yes** | +| **Community/stars** | New | 13.6k | 11.5k | 4.3k | 2.3k | <1k | + +### datascope's Position: What's Better, Worse, Unique, Missing + +**Unique (no competitor does this):** +1. **Cell-level type detection** — every other tool uses column-level inference after pandas/SQL coercion. datascope reads each cell's actual Python type via openpyxl before any coercion happens. This is the core technical moat. +2. **"Assumption vs. reality" narrative framing** — no tool produces prose explanations aimed at non-technical readers. The closest (GX Data Docs, Pointblank tables) are validation result tables, not narratives. +3. **Excel-native reading** — every competitor requires loading through pandas first, which is exactly where type coercion destroys the signal datascope detects. +4. **PDF as portable audit artifact** — no competitor outputs PDF. The inspection-report analogy: the artifact is passed to a client who doesn't control the toolchain. + +**Better than competitors:** +5. **Zero-config experience** — `datascope file.xlsx` produces a full report. GX requires expectation suites, Pandera requires schemas, Soda requires YAML. The setup cost for datascope is zero. +6. 
**Non-technical audience targeting** — while competitors target engineers, datascope targets consultants handing reports to clients. + +**Worse than competitors:** +7. **No machine-readable output** — every major competitor supports JSON/HTML/programmatic output. datascope has PDF + unstructured stdout only. +8. **No custom validation rules** — can't define domain-specific checks ("price must be positive", "date must be after 2020"). +9. **No pipeline integration** — can't embed in CI/CD, dbt, or Airflow workflows without parsing stdout. +10. **No statistical profiling** — no distributions, correlations, missing-value analysis beyond what the 5 analyzers detect. +11. **Performance ceiling** — falls over at ~100K rows due to full materialization and strptime brute-force. +12. **No community** — new project with zero external users/stars. + +**Missing (competitors have, datascope doesn't):** +13. **Database/Parquet/Arrow input** — limited to Excel + CSV. +14. **Polars support** — Polars is the growth vector in the Python data ecosystem. +15. **HTML report option** — for web/email embedding. +16. **Drift detection** — comparing two datasets or monitoring over time (whylogs, Soda territory). + +### Market Context + +**Where the market is going:** +- Pipeline-integrated observability platforms (Monte Carlo, Sifflet, Datafold — VC-funded) +- AI-augmented test generation (GX DraftValidation, DataOps TestGen) +- Validation-as-feature inside frameworks (Pydantic in FastAPI, dbt tests) + +**Where the market is NOT going:** +- Standalone CLI tools for one-shot file auditing +- Stakeholder-facing prose reports +- Excel-native anything + +**This is the opportunity.** The market is leaving the "consultant analyzes a client's messy Excel file and needs a professional report" use case completely unaddressed. Every tool is moving toward engineers, pipelines, and platforms. datascope occupies an empty niche. + +**The risk:** The niche may be empty because it's small. 
The growth path requires either (a) staying niche but being the definitive tool for data consultants, or (b) adding enough pipeline features (JSON output, CI integration) to serve both audiences. + +### Analogies That Clarify Position + +- **Building inspection reports** — inspectors don't hand homeowners JSON schemas. They produce written reports. datascope is the building inspector for data files. +- **Spell-checker UX** — surfaces problems inline, in the user's own document, with one-click fixes. No existing tool does this for data files. +- **Rust compiler errors** — the shift toward human-readable, actionable error messages ("expected integer, found string at column B row 14") maps directly onto datascope's narrative approach. + +--- + +## Phase 4: Synthesis & Next Moves + +### Strategic Frame + +datascope has a **genuine technical moat** (cell-level type detection) and a **genuine product moat** (plain-English narrative for non-technical readers). No competitor combines both. The architecture is sound, the test coverage is strong, and the code quality is portfolio-grade. + +But the project can't capitalize on either moat yet because: +1. **v1 artifacts confuse first impressions** — scorer.py, old README URL, stale samples +2. **PDF rendering bugs undermine the "professional report" value prop** — the core product has cosmetic defects +3. **No machine-readable output blocks the bridge audience** — engineers who'd champion datascope in their org can't integrate it +4. 
**No CI/CD or PyPI hurts credibility** — open-source adoption requires trust signals + +The synthesis produces four ranked move categories: **Clean → Polish → Bridge → Grow.** + +--- + +### Move 1: CLEAN — Ship-ready baseline (1-2 sessions) + +*Goal: A stranger who finds the repo can install, run, and trust what they see.* + +| Task | Internal Finding | Landscape Rationale | Effort | +|------|-----------------|---------------------|--------| +| Fix README clone URL + cd command | Phase 2 #1 | First-touch UX; every competitor has working install instructions | 5 min | +| Rewrite `samples/README.md` for v2 | Phase 2 #2 | Samples are the "try it yourself" onramp | 30 min | +| Delete `scorer.py` from root | Phase 2 #4 | Eliminates confusion, removes weaker security surface | 5 min | +| Update `generate_sample.py` for v2 CLI | Phase 2 #5 | Makes sample generation actually work | 15 min | +| Retire or port `tools/render_strict_mode_comparison.py` | Phase 2 #4 | Last v1 import reference | 15 min | +| Add `defusedxml>=0.7.0` to dependencies | Phase 2 #3 | One-line fix; closes XML bomb vector for a tool processing untrusted files | 2 min | +| Drop `numpy` from dependencies (unused by v2) | Phase 2 #28 | Smaller install footprint, honest dependency list | 2 min | + +**Total: ~75 minutes of focused work. 
Zero architectural risk.** + +--- + +### Move 2: POLISH — The report is the product (1-2 sessions) + +*Goal: Every PDF datascope produces is genuinely professional and correct.* + +| Task | Internal Finding | Landscape Rationale | Effort | +|------|-----------------|---------------------|--------| +| Fix backtick literals in PDF templates | Phase 2 #6 | The report is datascope's differentiator; literal backticks aren't "plain English" | 30 min | +| Fix newline collapse in mixed-dates template | Phase 2 #7 | Date format breakdown is unreadable as run-on text | 20 min | +| Add page numbers + running header to PDF | Phase 2 #15 | Every competitor's HTML report has navigation; PDF needs pagination | 45 min | +| Fix PDF health assessment to mention total count | Phase 2 #30 | "Only informational" with 25 findings buries the signal | 15 min | +| Regenerate v2 sample outputs in `samples/output/` | Phase 1 gap | Current samples are v1 artifacts | 15 min | + +**Total: ~2 hours. These are the changes users actually see.** + +The landscape confirms this priority: datascope's unique position is the **report**. ydata-profiling has better stats. GX has better rules. Pandera has better schemas. But none of them produce a report you'd hand to a non-technical client. If the PDF has cosmetic bugs, the entire value proposition is undermined. 
+ +--- + +### Move 3: BRIDGE — Serve both audiences (2-3 sessions) + +*Goal: Engineers can integrate datascope into pipelines; consultants still get their PDF.* + +| Task | Internal Finding | Landscape Rationale | Effort | +|------|-----------------|---------------------|--------| +| Add `--format json` output flag | Phase 2 #13 | Every competitor has machine-readable output; this is datascope's biggest functional gap | 2-3 hr | +| Add `--verbose` / `--quiet` flags | Phase 2 #23 | `--quiet` enables scripting (exit code only); `--verbose` enables debugging | 1 hr | +| Add GitHub Actions CI (pytest + ruff) | Phase 2 #9 | Table-stakes trust signal for open-source adoption | 30 min | +| Promote FindingType sub-types to first-class enums | Phase 2 #8 | Eliminates evidence-key sniffing in 6 locations; unblocks adding new analyzers | 2 hr | +| Type `source_metadata` as TypedDict | Phase 2 #16 | Unblocks adding new output formats without guessing keys | 30 min | +| Complete `pyproject.toml` metadata (authors, urls, readme) | Phase 2 #25 | Required for PyPI publishing | 10 min | +| Publish to PyPI | Phase 1 gap | `pip install datascope` is the expected install path; git clone is friction | 1 hr | + +**Why `--format json` is the single highest-leverage feature:** +- It's the bridge between datascope's consultant audience and the engineer audience +- Engineers who discover datascope via PyPI can plug it into CI/CD: `datascope data.csv --format json | jq '.findings[] | select(.severity == "CRITICAL")'` +- JSON output is the prerequisite for GitHub Actions integration, Slack alerts, dashboard embedding +- It costs 2-3 hours and doubles the addressable audience + +--- + +### Move 4: GROW — Expand the moat (future sessions) + +*Goal: Make datascope the definitive tool for its niche, then expand.* + +| Task | Landscape Rationale | Effort | +|------|---------------------|--------| +| **Add HTML report option** | Email-embeddable; web-viewable; complements PDF for different delivery 
contexts | 3-4 hr | +| **Add `--max-rows` / size guard** | Prevents OOM on large files; sets user expectations honestly | 1 hr | +| **Regex pre-filter for CSV datetime inference** | 10x speedup on text-heavy CSVs; moves the performance wall from 100K to 1M rows | 1 hr | +| **Add Parquet/CSV-from-stdin input** | Parquet is the growth vector; stdin enables piping from other tools | 2-3 hr | +| **Add a sixth analyzer: missing-value patterns** | Gap vs. ydata-profiling; "15% of rows have no email" is a finding consultants care about | 2-3 hr | +| **Add annotated Excel output** — highlight problem cells in the source file | The "spell-checker UX" analogy; no competitor does this; huge differentiation | 4-6 hr | +| **Stream-process loaders** | Eliminates the 100K-row memory wall entirely | 4-6 hr | +| **Lock file + `pip audit` in CI** | Reproducible builds + vulnerability scanning | 30 min | + +The annotated Excel output is the **long-term differentiator**. No tool in the landscape highlights problem cells in the user's own file. Combined with the PDF diagnostic report, this creates a two-artifact deliverable: "here's your file with problems highlighted, and here's the report explaining what each problem means." That's the building-inspection analogy made concrete. + +--- + +### Strategic Summary + +``` +Now Soon Next Later +───────────────────────────────────────────────────────── +CLEAN POLISH BRIDGE GROW +v1 artifacts PDF rendering --format json HTML reports +defusedxml page numbers CI/CD Parquet input +scorer.py sample outputs PyPI publish Annotated Excel +README URL health text enum sub-types Stream loaders + --verbose/--quiet Missing-value analyzer +``` + +**The thesis:** datascope's moat is the combination of cell-level detection and professional narrative output. Clean the repo (Move 1), make the report flawless (Move 2), then add JSON output to bridge the engineer audience (Move 3). Everything after that deepens the moat or expands the audience. 
+ +**What NOT to build:** +- Custom validation rules (GX/Pandera own this; don't compete on their turf) +- Statistical profiling (ydata-profiling owns this; datascope finds *problems*, not *statistics*) +- Database connectivity (Soda owns this; stay in the file-auditing lane) +- Drift detection (whylogs owns this; datascope is point-in-time, not longitudinal) +- Web UI / SaaS (premature; the CLI + report is the right form factor for now) diff --git a/PLAN.md b/PLAN.md new file mode 100644 index 0000000..5008b43 --- /dev/null +++ b/PLAN.md @@ -0,0 +1,184 @@ +# datascope — Improvement Plan + +Derived from full project audit (2026-05-15). See AUDIT.md for rationale. + +Tier: Medium +Current focus: Moves 1-3 complete. Move 4 (GROW) is next. + +--- + +## Move 1: CLEAN — Ship-ready baseline + +Goal: A stranger who finds the repo can install, run, and trust what they see. + +### 1A: Fix README install URL ✓ +- Depends on: none +- Change `field-story-scorer.git` → `datascope.git` and `cd field-story-scorer` → `cd datascope` in README.md:27-28 +- Done when: `grep -c "field-story-scorer" README.md` returns 0 + +### 1B: Add defusedxml, drop numpy from dependencies ✓ +- Depends on: none +- Add `defusedxml>=0.7.0` to pyproject.toml `[project.dependencies]` and requirements.txt +- Remove `numpy>=1.24.0` from both files (v2 code never imports numpy) +- Done when: `pip install -e .` succeeds; `python -c "import defusedxml"` succeeds; `grep numpy pyproject.toml` returns nothing + +### 1C: Delete scorer.py and update its dependents ✓ +- Depends on: none +- Delete `scorer.py` from repo root +- Update `tools/render_strict_mode_comparison.py`: either delete it (if stale) or port the `from scorer import analyze, load_strict` to v2 APIs (`from datascope.loaders import load_file` + v2 analyzer pipeline) +- Done when: `grep -r "from scorer" .` returns nothing; `python -m pytest` still passes + +### 1D: Update generate_sample.py for v2 ✓ +- Depends on: 1C (scorer.py must be gone so old 
instructions don't work) +- Change print statements at lines 87-88 from `python scorer.py --input ...` to `datascope <file> --output-dir ...` +- Move to `tools/` directory for consistency (optional, confirm with user) +- Done when: `python generate_sample.py` prints v2 CLI commands; no reference to `scorer.py` in file + +### 1E: Rewrite samples/README.md for v2 ✓ +- Depends on: 1C, 1D (need v1 artifacts gone before rewriting the guide) +- Replace entire file: describe v2 diagnostic reports, reference `datascope` CLI, update output file names, remove scoring numbers and --strict-types references +- Done when: `grep -c "scorer\|strict-types\|field-story-scorer\|field_report" samples/README.md` returns 0; file describes v2 outputs and commands + +### 1F: Integration verify ✓ +- Depends on: 1A-1E all complete +- Run full test suite: `python -m pytest` +- Run tool end-to-end: `datascope samples/input/sample_mixed_types.xlsx --output-dir /tmp/test` +- Verify PDF is produced and stdout summary prints correctly +- Done when: all tests pass; PDF exists and opens; no stderr warnings about missing imports + +--- + +## Move 2: POLISH — The report is the product + +Goal: Every PDF datascope produces is genuinely professional and correct. + +### 2A: Fix backtick literals in templates ✓ +- Depends on: none +- In `datascope/findings/templates.py`, replace backtick-wrapped field names (e.g., `` f"Column `{field_name}`" ``) with either bare names or bold tags reportlab understands (`<b>{field_name}</b>`) +- ~30 occurrences across 6 template functions +- Done when: `grep -c '`' datascope/findings/templates.py` returns 0 (for backtick-wrapped names); generate a test PDF and visually confirm field names render without literal backtick characters + +### 2B: Fix newline collapse in mixed-dates template ✓ +- Depends on: none +- In `datascope/findings/templates.py:224-225`, replace `"\n".join(format_parts)` with `"<br/>".join(format_parts)` so reportlab Paragraph renders line breaks +- Verify _safe() in pdf.py doesn't escape `<br/>` tags (it escapes `<` and `>` — need to handle this) +- Done when: generate a PDF from sample_mixed_types.xlsx; the date format breakdown in the mixed-dates finding renders as a vertical list, not run-on text + +### 2C: Add page numbers and running header to PDF ✓ +- Depends on: none +- In `datascope/reports/pdf.py`, pass an `onLaterPages` callback to `SimpleDocTemplate.build()` that renders "datascope diagnostic — {filename}" as a header and "Page N" as a footer +- Done when: generate a multi-page PDF; every page after the title has a header and page number + +### 2D: Fix health assessment total count ✓ +- Depends on: none +- In `datascope/reports/pdf.py:244-278`, update health assessment text branches to include total finding count (e.g., "25 informational observations were found" instead of "Only informational observations were found") +- Done when: test with a dataset that produces only info findings; health assessment text includes the count + +### 2E: Regenerate v2 sample outputs ✓ +- Depends on: 2A, 2B, 2C, 2D (want polished PDF before committing samples) +- Run `datascope samples/input/sample_mixed_types.xlsx --output-dir samples/output/` and `datascope samples/input/sample_sales.xlsx --output-dir samples/output/` +- Delete old v1 output files (`*_field_report.*`, `*_field_report_strict.*`) +- Update screenshots if applicable +- Done when: `samples/output/` contains only v2 diagnostic PDFs; no v1 artifacts remain + +--- + +## Move 3: BRIDGE — Serve both audiences + +Goal: Engineers can integrate datascope into pipelines; consultants still get their PDF. 
+ +### 3A: Promote FindingType sub-types to first-class enums ✓ +- Depends on: none +- Add `LEADING_ZEROS`, `MIXED_DATES`, `NEAR_CONSTANT`, `DUPLICATE_IDS` to `FindingType` enum in models.py +- Update analyzers to emit the specific type (format_check.py, cardinality.py) +- Remove evidence-key sniffing in severity.py (~4 helper functions + 2 branches) and composer.py (~2 branches) +- Update test assertions that reference the old generic types +- Done when: `grep -c "leading_zero_count.*in.*evidence\|date_formats.*in.*evidence\|near_constant\|suspected.*duplicate" datascope/findings/severity.py datascope/findings/composer.py` returns 0; all tests pass + +### 3B: Type source_metadata as TypedDict ✓ +- Depends on: none +- Add `SourceMetadata = TypedDict(...)` in models.py with keys: filename, sheet, row_count, column_count +- Update `LoaderResult.source_metadata` type annotation from `dict[str, Any]` to `SourceMetadata` +- Update loaders and pdf.py to use typed access +- Done when: `mypy datascope/models.py` passes (or `pyright` equivalent); no `dict[str, Any]` for source_metadata + +### 3C: Add `--format json` output flag ✓ +- Depends on: 3A (clean enum makes JSON serialization straightforward) +- Add `--format {pdf,json,both}` argument to cli.py (default: pdf for backward compat) +- JSON schema: `{"source": {...metadata}, "findings": [{severity, finding_type, field_name, assumption, reality, impact, fix, prevention, evidence}], "summary": {critical, warning, info, total}}` +- When format=json, write to `_diagnostic.json` alongside or instead of PDF +- Done when: `datascope samples/input/sample_mixed_types.xlsx --format json | python -m json.tool` produces valid JSON with all finding fields populated + +### 3D: Add `--verbose` / `--quiet` flags ✓ +- Depends on: none +- `--quiet`: suppress stdout summary, exit code only (0 = no critical, 1 = has critical findings) +- `--verbose`: print full traceback on analyzer failures instead of one-line warning +- Done when: 
`datascope file.xlsx --quiet` produces no stdout; `datascope file.xlsx --verbose` with a patched-to-fail analyzer shows full traceback + +### 3E: Add GitHub Actions CI workflow ✓ +- Depends on: none +- Create `.github/workflows/ci.yml`: pytest + ruff check on push/PR, Python 3.10-3.12 matrix +- Done when: push to a branch triggers CI; green check on passing tests + +### 3F: Complete pyproject.toml metadata ✓ +- Depends on: none +- Add `authors`, `urls` (homepage, repository, issues), `readme = "README.md"` fields +- Done when: `python -m build` produces a wheel whose metadata includes author, homepage URL, and rendered README + +### 3G: Publish to PyPI ✓ +- Depends on: 3E, 3F (CI must be green; metadata must be complete) +- Register `datascope` on PyPI (check name availability first — may need `datascope-dq` or similar) +- Add GitHub Actions publish workflow (on tag push) +- Done when: `pip install datascope` (or chosen name) from a fresh venv installs and runs successfully + +--- + +## Move 4: GROW — Expand the moat (future) + +Goal: Deepen the technical moat and expand the addressable audience. + +### 4A: Add `--max-rows` / file size guard +- Depends on: none +- After loading, check `row_count * column_count`; warn if > 500K cells; abort if > 5M cells (configurable via `--max-rows`) +- Done when: `datascope huge_file.csv` prints a warning at 500K cells and aborts at 5M with a clear message + +### 4B: Regex pre-filter for CSV datetime inference +- Depends on: none +- Port `_DATE_LIKE_RE` from format_check.py:139 to csv_loader.py's `_infer_cell`; skip strptime loop if regex doesn't match +- Done when: benchmark on a 100K-row CSV of text strings shows >5x speedup vs. 
current; all existing tests pass + +### 4C: Add HTML report option +- Depends on: 3A (clean enum types), 3C (JSON output as data source for HTML) +- New `datascope/reports/html.py` — Jinja2 template rendering the same finding data as the PDF +- Wire to `--format html` in cli.py +- Done when: `datascope file.xlsx --format html` produces a self-contained HTML file that opens in browser with styled finding cards + +### 4D: Add missing-value pattern analyzer +- Depends on: 3A (new FindingType enum value: `MISSING_VALUE_PATTERN`) +- New analyzer in `datascope/analyzers/missing_values.py` +- Detects: columns with >N% nulls, row-level patterns (all nulls in a row = likely empty row), correlated missingness +- Template in templates.py, severity rule in severity.py +- Done when: running on a dataset with a 40%-null column produces a finding with assumption/reality/impact text + +### 4E: Add annotated Excel output +- Depends on: none (but benefits from 3A for clean finding types) +- New `datascope/reports/annotated_excel.py` — copies input file, highlights problem cells with conditional formatting, adds a "Findings" sheet +- Wire to `--format annotated-excel` in cli.py +- Done when: `datascope file.xlsx --format annotated-excel` produces a copy of the input with problem cells highlighted in red/amber/blue + +### 4F: Add Parquet input support +- Depends on: none +- New `datascope/loaders/parquet.py` — reads via pyarrow, maps Arrow types to Python types for cell_types +- Add `pyarrow` as optional dependency (`pip install datascope[parquet]`) +- Done when: `datascope data.parquet` produces a diagnostic report; cell_types correctly maps Arrow schema types + +### 4G: Stream-process loaders +- Depends on: 4A (size guard provides fallback for unsupported streaming cases) +- Refactor excel.py and csv_loader.py to build DataFrame + cell_types in a single streaming pass without intermediate `list()` materialization +- Done when: benchmark on a 500K-row file shows <500MB peak memory 
(vs. current ~1.5GB); all existing tests pass + +### 4H: Lock file + pip-audit in CI +- Depends on: 3E (CI must exist) +- Generate `requirements.lock` via `pip-compile` or `uv pip compile` +- Add a `pip-audit` step to CI workflow +- Done when: `pip install -r requirements.lock` produces identical installs; CI fails on known-vulnerable dependencies diff --git a/README.md b/README.md index bb16ef6..92829c5 100644 --- a/README.md +++ b/README.md @@ -24,8 +24,14 @@ Each finding is expressed as **assumption vs. reality**: what the data *appears* ## Installation ```bash -git clone https://github.com/MsShawnP/field-story-scorer.git -cd field-story-scorer +pip install datascope-dq +``` + +Or install from source: + +```bash +git clone https://github.com/MsShawnP/datascope.git +cd datascope pip install -e . ``` @@ -140,7 +146,7 @@ datascope/ - pandas >= 2.0 - openpyxl >= 3.1 - reportlab >= 4.0 -- numpy >= 1.24 +- defusedxml >= 0.7 --- diff --git a/datascope/__init__.py b/datascope/__init__.py index 75d8914..c47ea74 100644 --- a/datascope/__init__.py +++ b/datascope/__init__.py @@ -2,4 +2,4 @@ from __future__ import annotations -__version__ = "2.0.0" +__version__ = "2.1.0" diff --git a/datascope/analyzers/cardinality.py b/datascope/analyzers/cardinality.py index dd2ea46..3ca1fdc 100644 --- a/datascope/analyzers/cardinality.py +++ b/datascope/analyzers/cardinality.py @@ -5,7 +5,8 @@ uniqueness). Produces :class:`~datascope.models.Finding` instances with -:attr:`~datascope.models.FindingType.CARDINALITY_ANOMALY`. +:attr:`~datascope.models.FindingType.NEAR_CONSTANT` or +:attr:`~datascope.models.FindingType.DUPLICATE_IDS`. Severity is *not* assigned here -- that is the severity classifier's job (U7). 
""" @@ -82,7 +83,7 @@ def analyze_cardinality(result: LoaderResult) -> list[Finding]: findings.append(Finding( field_name=col_name, - finding_type=FindingType.CARDINALITY_ANOMALY, + finding_type=FindingType.NEAR_CONSTANT, evidence=evidence, )) @@ -104,7 +105,7 @@ def analyze_cardinality(result: LoaderResult) -> list[Finding]: findings.append(Finding( field_name=col_name, - finding_type=FindingType.CARDINALITY_ANOMALY, + finding_type=FindingType.DUPLICATE_IDS, evidence=evidence, )) diff --git a/datascope/analyzers/format_check.py b/datascope/analyzers/format_check.py index b905ed7..70216bf 100644 --- a/datascope/analyzers/format_check.py +++ b/datascope/analyzers/format_check.py @@ -1,7 +1,8 @@ """Format-inconsistency detectors: leading zeros and mixed date formats. Produces :class:`~datascope.models.Finding` instances with -:attr:`~datascope.models.FindingType.FORMAT_INCONSISTENCY`. +:attr:`~datascope.models.FindingType.LEADING_ZEROS` or +:attr:`~datascope.models.FindingType.MIXED_DATES`. Severity is *not* assigned here -- that is the severity classifier's job (U7). """ @@ -105,7 +106,7 @@ def analyze_leading_zeros(result: LoaderResult) -> list[Finding]: findings.append(Finding( field_name=col_name, - finding_type=FindingType.FORMAT_INCONSISTENCY, + finding_type=FindingType.LEADING_ZEROS, evidence=evidence, )) @@ -224,7 +225,7 @@ def analyze_mixed_dates(result: LoaderResult) -> list[Finding]: findings.append(Finding( field_name=col_name, - finding_type=FindingType.FORMAT_INCONSISTENCY, + finding_type=FindingType.MIXED_DATES, evidence=evidence, )) diff --git a/datascope/cli.py b/datascope/cli.py index 03b814d..2f0221e 100644 --- a/datascope/cli.py +++ b/datascope/cli.py @@ -2,16 +2,18 @@ Usage:: - datascope [--output-dir DIR] [--sheet NAME_OR_INDEX] [--version] + datascope [--output-dir DIR] [--sheet NAME_OR_INDEX] [--format FMT] [--version] The CLI orchestrates the full analysis pipeline: load, analyse, classify, -compose, and render a PDF diagnostic report. 
+compose, and render diagnostic output. """ from __future__ import annotations import argparse +import json import sys +import traceback from pathlib import Path from datascope import __version__ @@ -49,6 +51,23 @@ def _build_parser() -> argparse.ArgumentParser: "Default: first sheet (index 0)." ), ) + parser.add_argument( + "--format", + choices=["pdf", "json", "both"], + default="pdf", + dest="output_format", + help="Output format (default: pdf).", + ) + parser.add_argument( + "--verbose", "-v", + action="store_true", + help="Show full tracebacks when an analyzer fails.", + ) + parser.add_argument( + "--quiet", "-q", + action="store_true", + help="Suppress stdout summary. Exit code 0 = no critical findings, 1 = critical findings present.", + ) parser.add_argument( "--version", action="version", @@ -125,6 +144,39 @@ def _format_summary(findings: list, source_metadata: dict, output_path: Path) -> return "\n".join(lines) +def _write_json(findings: list, source_metadata: dict, output_path: Path) -> None: + """Write findings as structured JSON.""" + from datascope.models import Severity + + counts: dict[str, int] = {"critical": 0, "warning": 0, "info": 0, "total": 0} + for f in findings: + if f.severity is not None: + counts[f.severity.name.lower()] += 1 + counts["total"] += 1 + + payload = { + "source": dict(source_metadata), + "summary": counts, + "findings": [ + { + "field_name": f.field_name, + "finding_type": f.finding_type.value, + "severity": f.severity.name.lower() if f.severity else None, + "assumption": f.assumption, + "reality": f.reality, + "impact": f.impact, + "fix_recommendation": f.fix_recommendation, + "prevention_rule": f.prevention_rule, + "evidence": f.evidence, + } + for f in findings + ], + } + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(payload, indent=2, default=str), encoding="utf-8") + + def main(argv: list[str] | None = None) -> None: """Entry point for the datascope CLI. 
@@ -184,7 +236,10 @@ def main(argv: list[str] | None = None) -> None: try: all_findings.extend(analyzer(result)) except Exception as exc: - print(f"Warning: {analyzer.__name__} failed: {exc}", file=sys.stderr) + if args.verbose: + traceback.print_exc(file=sys.stderr) + else: + print(f"Warning: {analyzer.__name__} failed: {exc}", file=sys.stderr) # --- process -------------------------------------------------------- from datascope.findings import process_findings @@ -192,14 +247,30 @@ def main(argv: list[str] | None = None) -> None: processed = process_findings(all_findings) # --- report --------------------------------------------------------- - from datascope.reports import write_pdf - output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) - output_name = f"{input_path.stem}_diagnostic.pdf" - output_path = output_dir / output_name - write_pdf(processed, result.source_metadata, output_path) + fmt = args.output_format + output_path = None + + if fmt in ("pdf", "both"): + from datascope.reports import write_pdf + + output_name = f"{input_path.stem}_diagnostic.pdf" + output_path = output_dir / output_name + write_pdf(processed, result.source_metadata, output_path) + + if fmt in ("json", "both"): + json_path = output_dir / f"{input_path.stem}_diagnostic.json" + _write_json(processed, result.source_metadata, json_path) + if output_path is None: + output_path = json_path # --- stdout summary ------------------------------------------------- + if args.quiet: + from datascope.models import Severity + + has_critical = any(f.severity is Severity.CRITICAL for f in processed) + sys.exit(1 if has_critical else 0) + print(_format_summary(processed, result.source_metadata, output_path)) diff --git a/datascope/findings/composer.py b/datascope/findings/composer.py index 3c26ae5..dd51705 100644 --- a/datascope/findings/composer.py +++ b/datascope/findings/composer.py @@ -1,8 +1,7 @@ """Template engine that populates the narrative text fields on a Finding. 
-Chooses the correct template function for each finding's sub-type (using -evidence keys to disambiguate within shared FindingType values) and writes -the five text fields onto the Finding in place. +Chooses the correct template function for each finding type and writes the +five text fields onto the Finding in place. """ from __future__ import annotations @@ -11,40 +10,14 @@ from datascope.findings import templates -# --------------------------------------------------------------------------- -# Sub-type dispatch -# --------------------------------------------------------------------------- - -def _select_template(finding: Finding): - """Return the template function for *finding*'s sub-type.""" - ft = finding.finding_type - ev = finding.evidence - - if ft is FindingType.TYPE_INCONSISTENCY: - return templates.type_inconsistency - - if ft is FindingType.SENTINEL_VALUE: - return templates.sentinel_value - - if ft is FindingType.FORMAT_INCONSISTENCY: - if "leading_zero_count" in ev: - return templates.leading_zeros - if "formats_found" in ev: - return templates.mixed_dates - # Fallback for unrecognised format sub-variant. - return templates.leading_zeros - - if ft is FindingType.CARDINALITY_ANOMALY: - if "top_values" in ev: - return templates.near_constant - if "duplicate_values" in ev: - return templates.suspected_duplicate_ids - # Fallback for unrecognised cardinality sub-variant. - return templates.near_constant - - # Unknown finding type -- use type_inconsistency as a safe fallback - # so that every finding always gets text populated. 
- return templates.type_inconsistency +_TEMPLATE_MAP = { + FindingType.TYPE_INCONSISTENCY: templates.type_inconsistency, + FindingType.SENTINEL_VALUE: templates.sentinel_value, + FindingType.LEADING_ZEROS: templates.leading_zeros, + FindingType.MIXED_DATES: templates.mixed_dates, + FindingType.NEAR_CONSTANT: templates.near_constant, + FindingType.DUPLICATE_IDS: templates.suspected_duplicate_ids, +} # --------------------------------------------------------------------------- @@ -70,7 +43,9 @@ def compose_finding(finding: Finding) -> Finding: Finding The same finding instance, now with all five text fields set. """ - template_fn = _select_template(finding) + template_fn = _TEMPLATE_MAP.get( + finding.finding_type, templates.type_inconsistency, + ) texts = template_fn(finding.field_name, finding.evidence) finding.assumption = texts["assumption"] diff --git a/datascope/findings/severity.py b/datascope/findings/severity.py index c3d1736..16ac33e 100644 --- a/datascope/findings/severity.py +++ b/datascope/findings/severity.py @@ -1,7 +1,7 @@ """Severity classifier for datascope findings. Maps each finding to a :class:`~datascope.models.Severity` level based on -the finding type and its evidence. The rules are impact-based: +the finding type. The rules are impact-based: * **CRITICAL** -- silent data loss or incorrect calculations downstream. * **WARNING** -- key mismatches or misinterpretation likely. @@ -13,48 +13,13 @@ from __future__ import annotations -from typing import Any - from datascope.models import Finding, FindingType, Severity -# Numeric types that can cause silent calculation errors when mixed. 
_NUMERIC_TYPES: frozenset[str] = frozenset({"numeric", "int", "float"}) -def _is_leading_zeros(evidence: dict[str, Any]) -> bool: - """Return True if evidence belongs to a leading-zero finding.""" - return "leading_zero_count" in evidence - - -def _is_mixed_dates(evidence: dict[str, Any]) -> bool: - """Return True if evidence belongs to a mixed-date finding.""" - return "formats_found" in evidence - - -def _is_near_constant(evidence: dict[str, Any]) -> bool: - """Return True if evidence belongs to a near-constant cardinality finding.""" - return "top_values" in evidence - - -def _is_suspected_duplicates(evidence: dict[str, Any]) -> bool: - """Return True if evidence belongs to a suspected-duplicate-ID finding.""" - return "duplicate_values" in evidence - - def classify_severity(finding: Finding) -> Severity: - """Determine the severity of *finding* based on its type and evidence. - - Parameters - ---------- - finding: - A :class:`~datascope.models.Finding` with ``finding_type`` and - ``evidence`` populated. - - Returns - ------- - Severity - The computed severity level. - """ + """Determine the severity of *finding* based on its type and evidence.""" ft = finding.finding_type ev = finding.evidence @@ -67,17 +32,16 @@ def classify_severity(finding: Finding) -> Severity: if ft is FindingType.SENTINEL_VALUE: return Severity.CRITICAL - if ft is FindingType.FORMAT_INCONSISTENCY: - # Both sub-variants are WARNING. + if ft is FindingType.LEADING_ZEROS: return Severity.WARNING - if ft is FindingType.CARDINALITY_ANOMALY: - if _is_near_constant(ev): - return Severity.INFO - if _is_suspected_duplicates(ev): - return Severity.WARNING - # Fallback for unrecognised cardinality evidence shape. + if ft is FindingType.MIXED_DATES: + return Severity.WARNING + + if ft is FindingType.NEAR_CONSTANT: return Severity.INFO - # Unknown finding type -- default conservatively. 
+ if ft is FindingType.DUPLICATE_IDS: + return Severity.WARNING + return Severity.INFO diff --git a/datascope/findings/templates.py b/datascope/findings/templates.py index 5522e18..f3dd114 100644 --- a/datascope/findings/templates.py +++ b/datascope/findings/templates.py @@ -70,7 +70,7 @@ def type_inconsistency(field_name: str, evidence: dict[str, Any]) -> dict[str, s example_str = _join_examples(all_examples) assumption = ( - f"Column `{field_name}` appears to be purely {majority}." + f"Column '{field_name}' appears to be purely {majority}." ) reality = ( f"However, {minority_desc} were found among {total} non-null values " @@ -80,24 +80,24 @@ def type_inconsistency(field_name: str, evidence: dict[str, Any]) -> dict[str, s if majority.lower() in ("numeric", "int", "float"): impact = ( - f"Rows with non-numeric values in `{field_name}` will be silently " + f"Rows with non-numeric values in '{field_name}' will be silently " f"dropped or converted to NaN during sums, averages, and other " f"calculations, producing incorrect results without any error message." ) else: impact = ( f"Downstream systems expecting a uniform {majority} type in " - f"`{field_name}` may misinterpret or reject the unexpected values, " + f"'{field_name}' may misinterpret or reject the unexpected values, " f"leading to key-lookup failures or broken transformations." ) fix_recommendation = ( - f"Review the non-{majority} values in `{field_name}` and decide " + f"Review the non-{majority} values in '{field_name}' and decide " f"whether they should be converted to {majority}, replaced with a " f"proper null, or moved to a separate column." ) prevention_rule = ( - f"Every value in `{field_name}` should be the same type ({majority}). " + f"Every value in '{field_name}' should be the same type ({majority}). " f"Add a type-check validation rule at data entry or ingestion time." 
) @@ -129,7 +129,7 @@ def sentinel_value(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: sentinel_desc = ", ".join(sentinel_counts) if sentinel_counts else "unknown sentinel values" assumption = ( - f"Column `{field_name}` appears to be a clean {majority_type} column." + f"Column '{field_name}' appears to be a clean {majority_type} column." ) reality = ( f"However, {_pct(sentinel_pct)} of values ({len(sentinels)} distinct sentinel " @@ -138,11 +138,11 @@ def sentinel_value(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: ) impact = ( f"Tools like pandas and Excel silently drop sentinel strings when " - f"computing sums or averages on `{field_name}`, making totals lower " + f"computing sums or averages on '{field_name}', making totals lower " f"than expected. No error is raised, so the data loss goes unnoticed." ) fix_recommendation = ( - f"Replace sentinel values in `{field_name}` with proper null/blank " + f"Replace sentinel values in '{field_name}' with proper null/blank " f"cells so that downstream tools handle missing data correctly and " f"row counts reflect reality." ) @@ -162,7 +162,7 @@ def sentinel_value(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: # --------------------------------------------------------------------------- -# FORMAT_INCONSISTENCY -- leading zeros +# LEADING_ZEROS # --------------------------------------------------------------------------- def leading_zeros(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: @@ -174,7 +174,7 @@ def leading_zeros(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: examples_without = evidence.get("examples_without_zeros", []) assumption = ( - f"Column `{field_name}` appears to use a single, consistent " + f"Column '{field_name}' appears to use a single, consistent " f"numeric-string format." ) reality = ( @@ -183,17 +183,17 @@ def leading_zeros(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: f"do not (e.g. 
{_join_examples(examples_without)})." ) impact = ( - f"Leading zeros in `{field_name}` will be stripped when values are " + f"Leading zeros in '{field_name}' will be stripped when values are " f"treated as numbers. This causes join or lookup failures because " f"'00123' no longer matches '123' as a key." ) fix_recommendation = ( - f"Standardize all values in `{field_name}` to the same format. If " + f"Standardize all values in '{field_name}' to the same format. If " f"leading zeros are meaningful (e.g. zip codes, product codes), store " f"them as text with consistent padding." ) prevention_rule = ( - f"Decide whether `{field_name}` is a number or a code. Numbers " + f"Decide whether '{field_name}' is a number or a code. Numbers " f"should never have leading zeros; codes should always be stored " f"as text with a fixed width." ) @@ -208,7 +208,7 @@ def leading_zeros(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: # --------------------------------------------------------------------------- -# FORMAT_INCONSISTENCY -- mixed dates +# MIXED_DATES # --------------------------------------------------------------------------- def mixed_dates(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: @@ -221,28 +221,28 @@ def mixed_dates(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: for fmt in formats: examples = examples_per.get(fmt, []) example_str = _join_examples(examples, limit=2) - format_parts.append(f" - {fmt}: {example_str}") - format_desc = "\n".join(format_parts) if format_parts else " (no formats)" + format_parts.append(f"{fmt} (e.g. {example_str})") + format_desc = "; ".join(format_parts) if format_parts else "(no formats detected)" assumption = ( - f"Column `{field_name}` appears to use a single date format." + f"Column '{field_name}' appears to use a single date format." 
) reality = ( f"However, {len(formats)} different date formats were found across " - f"{total} date values:\n{format_desc}" + f"{total} date values: {format_desc}." ) impact = ( - f"Mixed date formats in `{field_name}` cause parsing ambiguity. " + f"Mixed date formats in '{field_name}' cause parsing ambiguity. " f"For example, '01/02/2026' could be January 2 or February 1 " f"depending on the format. Tools may silently parse dates " f"incorrectly, producing wrong results." ) fix_recommendation = ( - f"Pick one date format for `{field_name}` (ISO 8601 YYYY-MM-DD is " + f"Pick one date format for '{field_name}' (ISO 8601 YYYY-MM-DD is " f"recommended) and convert all existing values to that format." ) prevention_rule = ( - f"All dates in `{field_name}` should use the same format. Add " + f"All dates in '{field_name}' should use the same format. Add " f"format validation at data entry time and reject values that " f"do not match." ) @@ -257,7 +257,7 @@ def mixed_dates(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: # --------------------------------------------------------------------------- -# CARDINALITY_ANOMALY -- near-constant +# NEAR_CONSTANT # --------------------------------------------------------------------------- def near_constant(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: @@ -274,7 +274,7 @@ def near_constant(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: top_desc = ", ".join(top_parts) if top_parts else "(no values)" assumption = ( - f"Column `{field_name}` is expected to carry meaningful, " + f"Column '{field_name}' is expected to carry meaningful, " f"varying data." ) reality = ( @@ -284,17 +284,17 @@ def near_constant(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: f"Most common: {top_desc}." ) impact = ( - f"A near-constant column like `{field_name}` adds no analytical " + f"A near-constant column like '{field_name}' adds no analytical " f"value. 
Including it in models or reports may mislead readers " f"into thinking the field varies when it does not." ) fix_recommendation = ( - f"Verify whether `{field_name}` should actually vary. If not, " + f"Verify whether '{field_name}' should actually vary. If not, " f"document it as a constant and consider removing it from analysis. " f"If it should vary, investigate why the data is uniform." ) prevention_rule = ( - f"If `{field_name}` is supposed to carry diverse values, add a " + f"If '{field_name}' is supposed to carry diverse values, add a " f"data-quality check that flags columns with fewer than 1% unique " f"values." ) @@ -309,7 +309,7 @@ def near_constant(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: # --------------------------------------------------------------------------- -# CARDINALITY_ANOMALY -- suspected duplicate IDs +# DUPLICATE_IDS # --------------------------------------------------------------------------- def suspected_duplicate_ids(field_name: str, evidence: dict[str, Any]) -> dict[str, str]: @@ -322,7 +322,7 @@ def suspected_duplicate_ids(field_name: str, evidence: dict[str, Any]) -> dict[s dup_str = _join_examples(duplicates, limit=5) assumption = ( - f"Column `{field_name}` appears to be a unique identifier " + f"Column '{field_name}' appears to be a unique identifier " f"(ID column)." ) reality = ( @@ -331,18 +331,18 @@ def suspected_duplicate_ids(field_name: str, evidence: dict[str, Any]) -> dict[s f"Duplicate values include: {dup_str}." ) impact = ( - f"Duplicate IDs in `{field_name}` cause row-level joins to fan out, " + f"Duplicate IDs in '{field_name}' cause row-level joins to fan out, " f"producing unexpected extra rows in merged datasets. Aggregations " f"that assume one row per ID will double-count affected records." 
) fix_recommendation = ( - f"Investigate the duplicate values in `{field_name}` to determine " + f"Investigate the duplicate values in '{field_name}' to determine " f"whether they are true duplicates (same record entered twice) or " f"legitimate repeats (one-to-many relationship). De-duplicate or " f"re-model accordingly." ) prevention_rule = ( - f"If `{field_name}` is meant to be a primary key, enforce a " + f"If '{field_name}' is meant to be a primary key, enforce a " f"uniqueness constraint at the database or validation layer." ) diff --git a/datascope/models.py b/datascope/models.py index 84d99a3..c49765c 100644 --- a/datascope/models.py +++ b/datascope/models.py @@ -8,7 +8,7 @@ import enum from dataclasses import dataclass, field -from typing import Any +from typing import Any, TypedDict import pandas as pd @@ -34,8 +34,10 @@ class FindingType(enum.Enum): TYPE_INCONSISTENCY = "type_inconsistency" SENTINEL_VALUE = "sentinel_value" - FORMAT_INCONSISTENCY = "format_inconsistency" - CARDINALITY_ANOMALY = "cardinality_anomaly" + LEADING_ZEROS = "leading_zeros" + MIXED_DATES = "mixed_dates" + NEAR_CONSTANT = "near_constant" + DUPLICATE_IDS = "duplicate_ids" # --------------------------------------------------------------------------- @@ -69,6 +71,15 @@ class Finding: prevention_rule: str | None = None +class SourceMetadata(TypedDict, total=False): + """Typed metadata about the data source.""" + + filename: str + sheet: str | int + row_count: int + column_count: int + + @dataclass class LoaderResult: """What the loader hands to the profiler and detectors. 
@@ -79,4 +90,4 @@ class LoaderResult: dataframe: pd.DataFrame cell_types: dict[str, list[type]] = field(default_factory=dict) - source_metadata: dict[str, Any] = field(default_factory=dict) + source_metadata: SourceMetadata = field(default_factory=dict) # type: ignore[assignment] diff --git a/datascope/reports/pdf.py b/datascope/reports/pdf.py index 78fc8ed..02066d8 100644 --- a/datascope/reports/pdf.py +++ b/datascope/reports/pdf.py @@ -79,8 +79,10 @@ _FINDING_TYPE_LABELS: dict[FindingType, str] = { FindingType.TYPE_INCONSISTENCY: "Type Inconsistency", FindingType.SENTINEL_VALUE: "Sentinel Value", - FindingType.FORMAT_INCONSISTENCY: "Format Inconsistency", - FindingType.CARDINALITY_ANOMALY: "Cardinality Anomaly", + FindingType.LEADING_ZEROS: "Leading Zeros", + FindingType.MIXED_DATES: "Mixed Date Formats", + FindingType.NEAR_CONSTANT: "Near-Constant Column", + FindingType.DUPLICATE_IDS: "Suspected Duplicate IDs", } # Page geometry. @@ -252,16 +254,19 @@ def _health_assessment(counts: dict[Severity, int]) -> str: "No data quality issues were detected. The dataset appears clean " "and ready for analysis." ) + info = counts[Severity.INFO] if crit == 0 and warn == 0: return ( - "Only informational observations were found. The dataset is in " - "good shape overall, with a few minor items worth noting." + f"{info} informational observation{'s were' if info != 1 else ' was'} " + f"found. The dataset is in good shape overall, with " + f"{'a few' if info <= 3 else 'some'} minor items worth noting." ) if crit == 0: return ( - "No critical issues were found, but there are warnings that " - "should be addressed before using this data in production. " - "Review the warnings below to prevent downstream problems." + f"No critical issues were found, but {warn} " + f"warning{'s' if warn != 1 else ''} and {info} informational " + f"observation{'s were' if info != 1 else ' was'} detected. " + f"Address the warnings before using this data in production." 
) if crit <= 2: return ( @@ -588,10 +593,26 @@ def write_pdf( pagesize=letter, rightMargin=0.5 * inch, leftMargin=0.5 * inch, - topMargin=0.5 * inch, - bottomMargin=0.5 * inch, + topMargin=0.6 * inch, + bottomMargin=0.6 * inch, ) + filename = source_metadata.get("filename", "Unknown source") + + def _on_later_pages(canvas, doc): + canvas.saveState() + canvas.setFont("Helvetica", 8) + canvas.setFillColor(colors.HexColor("#666666")) + canvas.drawString( + 0.5 * inch, letter[1] - 0.4 * inch, + f"datascope diagnostic — {filename}", + ) + canvas.drawRightString( + letter[0] - 0.5 * inch, 0.35 * inch, + f"Page {canvas.getPageNumber()}", + ) + canvas.restoreState() + styles = _build_styles() counts = _severity_counts(findings) story: list = [] @@ -601,5 +622,5 @@ def write_pdf( _build_findings_section(story, styles, findings) _build_field_inventory(story, styles, findings, source_metadata) - doc.build(story) + doc.build(story, onLaterPages=_on_later_pages) return output diff --git a/generate_sample.py b/generate_sample.py index a234eb8..b7ba094 100644 --- a/generate_sample.py +++ b/generate_sample.py @@ -1,10 +1,10 @@ """ -Generate sample xlsx files for testing field-story-scorer. +Generate sample xlsx files for datascope. Creates two files in samples/input/: sample_sales.xlsx — clean dataset with a range of field types sample_mixed_types.xlsx — dataset with genuine mixed-type cells in one column, - designed to demonstrate the --strict-types flag + designed to demonstrate cell-level type detection Usage: python generate_sample.py @@ -61,9 +61,9 @@ def make_mixed_type_dataset(n: int = 200) -> None: Write sample_mixed_types.xlsx using openpyxl directly so that cells in 'revenue_mixed' are genuine native types (float or str), not pandas-coerced. 
- This is the canonical test file for --strict-types mode: - - Standard run: revenue_mixed scores near-identical to revenue - - --strict-types run: revenue_mixed shows score penalty + type_mix breakdown + This is the canonical test file for datascope's cell-level type detection: + the 15 string "N/A" cells hidden in the numeric column are invisible to + pandas but detected by datascope. """ revenues = RNG.lognormal(mean=5, sigma=1.5, size=n).round(2).tolist() bad_rows = set(random.sample(range(n), 15)) @@ -83,9 +83,8 @@ def make_mixed_type_dataset(n: int = 200) -> None: wb.save(path) print(f"Created {path} ({n} rows, 15 intentional string cells in revenue_mixed)") print() - print("Demonstrate the difference:") - print(f" python scorer.py --input {path} --output-dir samples/output/") - print(f" python scorer.py --input {path} --output-dir samples/output/ --strict-types") + print("Run datascope on the sample:") + print(f" datascope {path} --output-dir samples/output/") if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 5dc8b9a..1906df6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,17 +3,20 @@ requires = ["setuptools>=68.0", "wheel"] build-backend = "setuptools.build_meta" [project] -name = "datascope" -version = "2.0.0" +name = "datascope-dq" +version = "2.1.0" description = "Data quality diagnostics for tabular datasets — surfaces hidden problems in plain English" +readme = "README.md" requires-python = ">=3.10" license = "MIT" +authors = [ + { name = "Shawn", email = "msshawnp@gmail.com" }, +] keywords = ["data-quality", "diagnostics", "csv", "excel", "type-detection", "data-validation"] classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -24,7 +27,7 @@ dependencies = [ "pandas>=2.0.0", 
"openpyxl>=3.1.0", "reportlab>=4.0.0", - "numpy>=1.24.0", + "defusedxml>=0.7.0", ] [project.optional-dependencies] @@ -33,6 +36,11 @@ dev = [ "ruff>=0.4.0", ] +[project.urls] +Homepage = "https://github.com/MsShawnP/datascope" +Repository = "https://github.com/MsShawnP/datascope" +Issues = "https://github.com/MsShawnP/datascope/issues" + [project.scripts] datascope = "datascope.cli:main" diff --git a/requirements.txt b/requirements.txt index d5774ef..13a0594 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ pandas>=2.0.0 openpyxl>=3.1.0 reportlab>=4.0.0 -numpy>=1.24.0 +defusedxml>=0.7.0 diff --git a/samples/README.md b/samples/README.md index 1f13b28..b126918 100644 --- a/samples/README.md +++ b/samples/README.md @@ -1,6 +1,6 @@ # Samples -Real outputs from `field-story-scorer`, committed so you can see what the tool produces without running it yourself. The strict-mode comparison on `sample_mixed_types.xlsx` is the headline — it's the clearest demonstration of what `--strict-types` actually catches. +Real outputs from datascope, committed so you can see what the tool produces without running it yourself. ## Layout @@ -9,47 +9,32 @@ samples/ ├── input/ # Synthetic source files (regenerate with `python generate_sample.py`) │ ├── sample_sales.xlsx │ └── sample_mixed_types.xlsx -└── output/ # Reports produced by scorer.py - ├── sample_sales_field_report.{xlsx,pdf} - ├── sample_mixed_types_field_report.{xlsx,pdf} - ├── sample_mixed_types_field_report_strict.{xlsx,pdf} - └── screenshots/ # Rendered previews of the Excel/PDF reports +└── output/ # Diagnostic reports produced by datascope + ├── sample_sales_diagnostic.pdf + └── sample_mixed_types_diagnostic.pdf ``` ## Inputs | File | Rows × Cols | What it exercises | |---|---|---| -| [sample_sales.xlsx](input/sample_sales.xlsx) | 500 × 15 | A varied but clean dataset — categorical, numeric, boolean, datetime-as-string, sparse, and constant columns. 
Shows the full range of field types the scorer classifies. | -| [sample_mixed_types.xlsx](input/sample_mixed_types.xlsx) | 200 × 4 | Engineered to demonstrate `--strict-types`. The `revenue_mixed` column has 185 floats and 15 cells containing the literal string `"N/A"` — pandas silently coerces these to `NaN` on load. | +| [sample_sales.xlsx](input/sample_sales.xlsx) | 500 × 15 | A varied but clean dataset — categorical, numeric, boolean, datetime-as-string, sparse, and constant columns. Shows the full range of field types datascope classifies. | +| [sample_mixed_types.xlsx](input/sample_mixed_types.xlsx) | 200 × 4 | Demonstrates cell-level type detection. The `revenue_mixed` column has 185 floats and 15 cells containing the literal string `"N/A"` — pandas silently coerces these to `NaN` on load, but datascope reads each cell's actual type and flags the inconsistency. | ## Outputs -| Output file | Source command | What it demonstrates | -|---|---|---| -| [sample_sales_field_report.xlsx](output/sample_sales_field_report.xlsx) | `scorer.py --input samples/input/sample_sales.xlsx` | Full Excel report on the clean dataset — all four tabs (Field Rankings, Field Profiles, Chart Recommendations, Correlation Matrix) with conditional formatting and data bars. | -| [sample_sales_field_report.pdf](output/sample_sales_field_report.pdf) | (same as above) | PDF version of the same report — landscape, color-coded score cells. | -| [sample_mixed_types_field_report.xlsx](output/sample_mixed_types_field_report.xlsx) | `scorer.py --input samples/input/sample_mixed_types.xlsx` | **Standard mode.** `revenue_mixed` scores **0.9775** and is classified as `numeric_continuous` — the 15 string cells are invisible. This is the false-positive story. | -| [sample_mixed_types_field_report.pdf](output/sample_mixed_types_field_report.pdf) | (same as above) | PDF of the standard-mode mixed-types run. 
| -| [sample_mixed_types_field_report_strict.xlsx](output/sample_mixed_types_field_report_strict.xlsx) | `scorer.py --input samples/input/sample_mixed_types.xlsx --strict-types` | **Strict mode.** Same input. `revenue_mixed` now scores **0.9708** (type contamination shows up in `type_consistency` 0.925 vs 1.0), and the new `type_mix` column shows `[numeric:185, str:15]`. The string contamination is exposed. | -| [sample_mixed_types_field_report_strict.pdf](output/sample_mixed_types_field_report_strict.pdf) | (same as above) | PDF of the strict-mode run — open this side-by-side with the non-strict PDF and look at the `type_mix` column. | - -## The strict-mode story, in one line - -| Mode | Score | Type | Type breakdown | -|---|---|---|---| -| Standard | 0.9775 | `numeric_continuous` | *(column not present — pandas inferred away)* | -| `--strict-types` | 0.9708 | `numeric_continuous` | `numeric:185, str:15` | +Each input file produces a PDF diagnostic report with: -Same file. Same column. Standard mode says "ship it" with no visibility into the cells. Strict mode keeps the score honest (only `type_consistency` is dinged, because the other dimensions still see legitimate numeric values for 92.5% of rows) and surfaces the new `type_mix` column so you can see exactly which cells are off — "this column has 15 sentinel strings, go talk to whoever exported it." 
+- **Executive summary** — overall health assessment, finding counts by severity +- **Findings by severity** — each finding as an assumption-vs-reality card with impact, fix, and prevention rule +- **Field inventory** — summary table of all columns with detected issue types ## Regenerating ```bash -python generate_sample.py # rewrite samples/input/ -python scorer.py --input samples/input/sample_sales.xlsx --output-dir samples/output/ -python scorer.py --input samples/input/sample_mixed_types.xlsx --output-dir samples/output/ -python scorer.py --input samples/input/sample_mixed_types.xlsx --output-dir samples/output/ --strict-types +python generate_sample.py # rewrite samples/input/ +datascope samples/input/sample_sales.xlsx --output-dir samples/output/ +datascope samples/input/sample_mixed_types.xlsx --output-dir samples/output/ ``` -The generator uses a fixed seed (`42`) so regenerated inputs are byte-stable. Outputs may differ slightly across `openpyxl` / `reportlab` versions. +The generator uses a fixed seed (`42`) so regenerated inputs are byte-stable. Report formatting may differ slightly across `reportlab` versions. 
diff --git a/samples/output/sample_mixed_types_diagnostic.pdf b/samples/output/sample_mixed_types_diagnostic.pdf new file mode 100644 index 0000000..75957d4 Binary files /dev/null and b/samples/output/sample_mixed_types_diagnostic.pdf differ diff --git a/samples/output/sample_mixed_types_field_report.pdf b/samples/output/sample_mixed_types_field_report.pdf deleted file mode 100644 index 948909e..0000000 Binary files a/samples/output/sample_mixed_types_field_report.pdf and /dev/null differ diff --git a/samples/output/sample_mixed_types_field_report.xlsx b/samples/output/sample_mixed_types_field_report.xlsx deleted file mode 100644 index b16bda9..0000000 Binary files a/samples/output/sample_mixed_types_field_report.xlsx and /dev/null differ diff --git a/samples/output/sample_mixed_types_field_report_strict.pdf b/samples/output/sample_mixed_types_field_report_strict.pdf deleted file mode 100644 index dd0af8b..0000000 Binary files a/samples/output/sample_mixed_types_field_report_strict.pdf and /dev/null differ diff --git a/samples/output/sample_mixed_types_field_report_strict.xlsx b/samples/output/sample_mixed_types_field_report_strict.xlsx deleted file mode 100644 index 1997bd3..0000000 Binary files a/samples/output/sample_mixed_types_field_report_strict.xlsx and /dev/null differ diff --git a/samples/output/sample_sales_diagnostic.pdf b/samples/output/sample_sales_diagnostic.pdf new file mode 100644 index 0000000..66b86f3 Binary files /dev/null and b/samples/output/sample_sales_diagnostic.pdf differ diff --git a/samples/output/sample_sales_field_report.pdf b/samples/output/sample_sales_field_report.pdf deleted file mode 100644 index cb4491e..0000000 Binary files a/samples/output/sample_sales_field_report.pdf and /dev/null differ diff --git a/samples/output/sample_sales_field_report.xlsx b/samples/output/sample_sales_field_report.xlsx deleted file mode 100644 index 81f790b..0000000 Binary files a/samples/output/sample_sales_field_report.xlsx and /dev/null differ diff 
--git a/samples/output/screenshots/correlation_matrix.png b/samples/output/screenshots/correlation_matrix.png deleted file mode 100644 index e1fe6bc..0000000 Binary files a/samples/output/screenshots/correlation_matrix.png and /dev/null differ diff --git a/samples/output/screenshots/excel_field_rankings.png b/samples/output/screenshots/excel_field_rankings.png deleted file mode 100644 index 6a6a110..0000000 Binary files a/samples/output/screenshots/excel_field_rankings.png and /dev/null differ diff --git a/samples/output/screenshots/strict_mode_comparison.png b/samples/output/screenshots/strict_mode_comparison.png deleted file mode 100644 index 65767e4..0000000 Binary files a/samples/output/screenshots/strict_mode_comparison.png and /dev/null differ diff --git a/scorer.py b/scorer.py deleted file mode 100644 index 0514c81..0000000 --- a/scorer.py +++ /dev/null @@ -1,791 +0,0 @@ -""" -field-story-scorer — Excel field profiling and quality scoring CLI. - -Scores every column in an xlsx file across five dimensions (Completeness, -Cardinality, Type Consistency, Distribution, Correlation) and outputs ranked -Excel and PDF reports. - -Usage: - python scorer.py --input data.xlsx --sheet Sheet1 --output-dir ./reports - python scorer.py --input data.xlsx --sheet Sheet1 --output-dir ./reports --strict-types - -See README.md for full documentation. -""" - -import argparse -import sys -import warnings -from datetime import date, datetime -from pathlib import Path - -import numpy as np -import pandas as pd - -warnings.filterwarnings("ignore") - -__version__ = "1.0.0" - - -# --------------------------------------------------------------------------- -# Strict-types loader (openpyxl cell-level) -# --------------------------------------------------------------------------- - -def load_strict(input_path: str, sheet) -> pd.DataFrame: - """ - Read an xlsx cell-by-cell via openpyxl, preserving each cell's native Python type - (int, float, str, bool, datetime, None). 
Pandas type inference is intentionally - bypassed so that mixed-type columns are not silently coerced. - - Args: - input_path: Path to the xlsx file. - sheet: Sheet name (str) or 0-based index (int). - - Returns: - DataFrame where every column is dtype=object and values are raw Python types. - """ - from openpyxl import load_workbook - - wb = load_workbook(input_path, data_only=True, read_only=True) - - if isinstance(sheet, int): - ws = wb.worksheets[sheet] - else: - ws = wb[sheet] - - rows = list(ws.iter_rows(values_only=True)) - wb.close() - - if not rows: - return pd.DataFrame() - - headers = [str(h) if h is not None else f"col_{i}" for i, h in enumerate(rows[0])] - data = [list(r) for r in rows[1:]] - # dtype=object is what makes strict mode strict: without it, pandas would - # infer float64 / string / datetime64 per column and the type_mix breakdown - # would only ever populate for genuinely-mixed columns. Passing dtype=object - # preserves each cell's raw Python type (int, float, str, bool, datetime). - return pd.DataFrame(data, columns=headers, dtype=object) - - -# --------------------------------------------------------------------------- -# Scoring Engine -# --------------------------------------------------------------------------- - -def score_completeness(series: pd.Series) -> float: - """Ratio of non-null values. 1.0 = fully populated.""" - if len(series) == 0: - return 0.0 - return round(series.notna().sum() / len(series), 4) - - -def score_cardinality(series: pd.Series) -> float: - """ - Normalized uniqueness. - - Near 0 → constant / near-constant (bad for analytics) - - Near 1 → every value unique (identifier column) - Sweet spot is in the middle for categorical fields. - We return raw ratio; interpretation lives in chart recommendations. 
- """ - filled = series.dropna() - if len(filled) == 0: - return 0.0 - return round(filled.nunique() / len(filled), 4) - - -def score_type_consistency(series: pd.Series) -> float: - """ - Proportion of non-null values that share the majority Python type. - - Standard mode: pandas has already inferred dtypes, so numeric/datetime columns - score 1.0 by definition. Object columns are inspected via type(). - - Strict mode (--strict-types): every column arrives as dtype=object with raw Python - types preserved. int/float variants are normalized to a single 'numeric' bucket so - that pandas integer-vs-float coercion differences don't inflate inconsistency. - """ - filled = series.dropna() - if len(filled) == 0: - return 1.0 - - # Already typed by pandas — trust it - if pd.api.types.is_numeric_dtype(series) or pd.api.types.is_datetime64_any_dtype(series): - return 1.0 - - # Object column: inspect raw Python types. - # Normalize int/float → 'numeric' so openpyxl int-vs-float noise doesn't penalize - # columns that are legitimately all-numeric. - def normalize_type(v): - t = type(v) - if t in (int, float): - return "numeric" - return t.__name__ - - type_counts = filled.map(normalize_type).value_counts() - majority = type_counts.iloc[0] - return round(majority / len(filled), 4) - - -def score_distribution(series: pd.Series, numeric_view: pd.Series = None) -> float: - """ - For numeric: normalized coefficient of variation capped at 1. - For categorical: normalized entropy (0=one category, 1=perfectly uniform). - Non-numeric / non-categorical gets 0.5 (neutral). - - numeric_view: optional coerced-numeric view of the column (used in strict mode - when the raw series is dtype=object but the column is majority-numeric). 
- """ - if numeric_view is not None: - filled = numeric_view.dropna() - if len(filled) < 2: - return 0.0 - mean = filled.mean() - std = filled.std() - if mean == 0: - return 0.0 if std == 0 else 1.0 - cv = abs(std / mean) - return round(min(cv, 1.0), 4) - - filled = series.dropna() - if len(filled) < 2: - return 0.0 - - if pd.api.types.is_numeric_dtype(series): - mean = filled.mean() - std = filled.std() - if mean == 0: - return 0.0 if std == 0 else 1.0 - cv = abs(std / mean) - return round(min(cv, 1.0), 4) - - if pd.api.types.is_object_dtype(series) or isinstance(series.dtype, pd.CategoricalDtype): - counts = filled.value_counts(normalize=True) - k = len(counts) - if k == 1: - return 0.0 - entropy = -np.sum(counts * np.log2(counts)) - max_entropy = np.log2(k) - return round(entropy / max_entropy if max_entropy > 0 else 0.0, 4) - - return 0.5 - - -def score_correlation(col_name, corr_matrix: pd.DataFrame) -> float: - """ - Mean absolute Pearson correlation of `col_name` with all other numeric columns. - - `corr_matrix` is precomputed in analyze() over either the dtype-selected numeric - columns (standard mode) or a coerced-numeric view of object columns that are - majority-numeric (strict mode). Columns not present in the matrix score 0.0. - """ - if corr_matrix.empty or col_name not in corr_matrix.columns: - return 0.0 - col_corr = corr_matrix[col_name].drop(col_name, errors="ignore").abs() - if len(col_corr) == 0: - return 0.0 - return round(col_corr.mean(), 4) - - -WEIGHTS = { - "completeness": 0.30, - "cardinality": 0.15, - "type_consistency": 0.25, - "distribution": 0.15, - "correlation": 0.15, -} - - -def composite_score(row: dict) -> float: - total = sum(WEIGHTS[k] * row[k] for k in WEIGHTS) - return round(total, 4) - - -def infer_field_type(series: pd.Series, numeric_view: pd.Series = None, - force_type: str = None) -> str: - """ - `numeric_view` (strict mode): coerced-numeric Series for columns that are - majority-numeric despite dtype=object. 
When supplied, the column is classified - as numeric_continuous / numeric_discrete based on the coerced cardinality. - - `force_type` (strict mode): explicit override for columns detected as bool or - datetime via cell-type sniffing. - """ - if force_type is not None: - return force_type - if numeric_view is not None: - card = numeric_view.dropna().nunique() - return "numeric_continuous" if card > 20 else "numeric_discrete" - if pd.api.types.is_datetime64_any_dtype(series): - return "datetime" - if pd.api.types.is_bool_dtype(series): - return "boolean" - if pd.api.types.is_numeric_dtype(series): - card = series.dropna().nunique() - return "numeric_continuous" if card > 20 else "numeric_discrete" - filled = series.dropna() - card = filled.nunique() - n = len(filled) - if n == 0: - return "unknown" - ratio = card / n - if ratio > 0.9: - return "identifier" - if card <= 20: - return "categorical_low" - return "categorical_high" - - -def recommend_chart(field_type: str, cardinality: float, distribution: float) -> str: - recs = { - "numeric_continuous": "Histogram, Box Plot, Violin Plot", - "numeric_discrete": "Bar Chart, Dot Plot", - "categorical_low": "Bar Chart, Pie Chart (if ≤6 cats), Treemap", - "categorical_high": "Horizontal Bar Chart (top N), Word Cloud", - "datetime": "Line Chart, Area Chart, Calendar Heatmap", - "boolean": "Stacked Bar, Donut Chart", - "identifier": "Count / Frequency Table — not recommended for visualization", - "unknown": "N/A — no data", - } - base = recs.get(field_type, "Bar Chart") - if cardinality == 1.0 and field_type != "identifier": - base += " ⚠ All unique — likely ID column" - elif cardinality < 0.01: - base += " ⚠ Near-constant — low analytical value" - return base - - -# --------------------------------------------------------------------------- -# Analysis Driver -# --------------------------------------------------------------------------- - -def _strict_majority_kind(series: pd.Series): - """ - For a strict-mode (dtype=object) 
column, sniff whether it is majority-bool - or majority-datetime based on raw Python cell types. Returns "boolean", - "datetime", or None. - """ - filled = series.dropna() - if len(filled) == 0: - return None - bool_share = filled.map(lambda v: isinstance(v, bool)).mean() - if bool_share > 0.5: - return "boolean" - dt_share = filled.map(lambda v: isinstance(v, (datetime, date)) and not isinstance(v, bool)).mean() - if dt_share > 0.5: - return "datetime" - return None - - -def analyze(df: pd.DataFrame, strict_types: bool = False) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Returns (rankings_df, profiles_df, chart_recs_df, corr_matrix_df). - - strict_types: when True, df arrived via load_strict() — columns may be dtype=object - with raw Python types. We pre-compute a coerced-numeric view of each object column - that is majority-numeric so that distribution, correlation, and type inference can - reason about numeric-ness despite the object dtype. - """ - # Pre-compute per-column numeric views and bool/datetime overrides. - # In standard mode the "numeric view" is just the dtype-selected numeric frame. 
- numeric_views: dict = {} - force_types: dict = {} - - if strict_types: - for col in df.columns: - series = df[col] - if pd.api.types.is_numeric_dtype(series): - numeric_views[col] = series - continue - if pd.api.types.is_object_dtype(series): - kind = _strict_majority_kind(series) - if kind is not None: - force_types[col] = kind - continue - coerced = pd.to_numeric(series, errors="coerce") - if coerced.notna().sum() > 0 and coerced.notna().mean() > 0.5: - numeric_views[col] = coerced - else: - for col in df.columns: - series = df[col] - if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series): - numeric_views[col] = series - - numeric_df = pd.DataFrame(numeric_views) if numeric_views else pd.DataFrame() - - if not numeric_df.empty and numeric_df.shape[1] >= 2: - corr_matrix = numeric_df.corr(method="pearson") - else: - corr_matrix = pd.DataFrame() - - rows = [] - for col in df.columns: - series = df[col] - numeric_view = numeric_views.get(col) - force_type = force_types.get(col) - - completeness = score_completeness(series) - cardinality = score_cardinality(series) - type_consistency = score_type_consistency(series) - distribution = score_distribution(series, numeric_view=numeric_view) - correlation = score_correlation(col, corr_matrix) - field_type = infer_field_type(series, numeric_view=numeric_view, force_type=force_type) - - # Human-readable type breakdown for strict mode (object columns only) - if strict_types and pd.api.types.is_object_dtype(series): - filled = series.dropna() - def norm(v): - if isinstance(v, bool): - return "bool" - if isinstance(v, (int, float)): - return "numeric" - return type(v).__name__ - tc = filled.map(norm).value_counts() - type_mix = ", ".join(f"{t}:{n}" for t, n in tc.items()) - else: - type_mix = "" - - dim_scores = { - "completeness": completeness, - "cardinality": cardinality, - "type_consistency": type_consistency, - "distribution": distribution, - "correlation": correlation, - } - comp = 
composite_score(dim_scores) - - rows.append({ - "field": col, - "field_type": field_type, - "composite_score": comp, - "completeness": completeness, - "cardinality": cardinality, - "type_consistency": type_consistency, - "distribution": distribution, - "correlation": correlation, - "null_count": int(series.isna().sum()), - "row_count": len(series), - "unique_count": int(series.nunique()), - "type_mix": type_mix, - "chart_recommendation": recommend_chart(field_type, cardinality, distribution), - }) - - all_df = pd.DataFrame(rows) - - rank_cols = ["field", "composite_score", "field_type", "null_count", "unique_count"] - if strict_types: - rank_cols.append("type_mix") - rankings_df = all_df[rank_cols].copy() - rankings_df = rankings_df.sort_values("composite_score", ascending=False).reset_index(drop=True) - rankings_df.index += 1 - rankings_df.index.name = "rank" - - profiles_df = all_df[["field", "completeness", "cardinality", "type_consistency", - "distribution", "correlation", "composite_score"]].copy() - profiles_df = profiles_df.sort_values("composite_score", ascending=False).reset_index(drop=True) - - chart_cols = ["field", "field_type", "chart_recommendation", "cardinality", "distribution"] - if strict_types: - chart_cols.insert(2, "type_mix") - chart_recs_df = all_df[chart_cols].copy() - chart_recs_df = chart_recs_df.sort_values("field").reset_index(drop=True) - - if not corr_matrix.empty: - corr_matrix_df = corr_matrix.round(4) - else: - corr_matrix_df = pd.DataFrame({"note": ["No numeric columns found for correlation"]}) - - return rankings_df, profiles_df, chart_recs_df, corr_matrix_df - - -# --------------------------------------------------------------------------- -# Excel Report -# --------------------------------------------------------------------------- - -def write_excel(rankings_df, profiles_df, chart_recs_df, corr_matrix_df, - output_path: str, source_name: str): - from openpyxl import Workbook - from openpyxl.styles import Alignment, Font, 
PatternFill, Border, Side - from openpyxl.utils import get_column_letter - from openpyxl.formatting.rule import ColorScaleRule, DataBarRule - - wb = Workbook() - wb.remove(wb.active) - - # Color palette - HDR_FILL = PatternFill("solid", start_color="1F3864") # dark navy - HDR_FONT = Font(name="Arial", bold=True, color="FFFFFF", size=10) - TITLE_FONT = Font(name="Arial", bold=True, size=13, color="1F3864") - BODY_FONT = Font(name="Arial", size=10) - ALT_FILL = PatternFill("solid", start_color="EEF2F7") - THIN = Side(style="thin", color="CCCCCC") - BORDER = Border(left=THIN, right=THIN, top=THIN, bottom=THIN) - CENTER = Alignment(horizontal="center", vertical="center", wrap_text=True) - LEFT = Alignment(horizontal="left", vertical="center", wrap_text=True) - - def style_header_row(ws, row_num, col_count, title=None): - for c in range(1, col_count + 1): - cell = ws.cell(row=row_num, column=c) - cell.fill = HDR_FILL - cell.font = HDR_FONT - cell.alignment = CENTER - cell.border = BORDER - - def style_data_rows(ws, start_row, end_row, col_count): - for r in range(start_row, end_row + 1): - fill = ALT_FILL if r % 2 == 0 else PatternFill("solid", start_color="FFFFFF") - for c in range(1, col_count + 1): - cell = ws.cell(row=r, column=c) - cell.fill = fill - cell.font = BODY_FONT - cell.border = BORDER - cell.alignment = CENTER if c > 1 else LEFT - - def add_title(ws, title_text, col_span): - ws.insert_rows(1) - ws.merge_cells(start_row=1, start_column=1, end_row=1, end_column=col_span) - t = ws.cell(row=1, column=1, value=title_text) - t.font = TITLE_FONT - t.alignment = LEFT - t.fill = PatternFill("solid", start_color="D9E1F2") - - def auto_width(ws, min_w=10, max_w=40): - for col in ws.columns: - max_len = 0 - col_letter = get_column_letter(col[0].column) - for cell in col: - try: - if cell.value: - max_len = max(max_len, len(str(cell.value))) - except (AttributeError, TypeError): - pass - ws.column_dimensions[col_letter].width = min(max(max_len + 2, min_w), max_w) - - 
def df_to_sheet(ws, df, include_index=False): - if include_index: - df = df.reset_index() - headers = list(df.columns) - ws.append(headers) - for _, row in df.iterrows(): - ws.append([str(v) if not isinstance(v, (int, float)) else v for v in row]) - return len(headers) - - # --- Tab 1: Field Rankings --- - ws1 = wb.create_sheet("Field Rankings") - col_count = df_to_sheet(ws1, rankings_df, include_index=True) - add_title(ws1, f"Field Rankings — {source_name}", col_count) - style_header_row(ws1, 2, col_count) - style_data_rows(ws1, 3, ws1.max_row, col_count) - auto_width(ws1) - - # Color scale on composite score column (col 3 after rank + field) - score_col = get_column_letter(3) - data_start = f"{score_col}3" - data_end = f"{score_col}{ws1.max_row}" - ws1.conditional_formatting.add( - f"{data_start}:{data_end}", - ColorScaleRule(start_type="num", start_value=0, start_color="F8696B", - mid_type="num", mid_value=0.5, mid_color="FFEB84", - end_type="num", end_value=1, end_color="63BE7B") - ) - ws1.freeze_panes = "A3" - - # --- Tab 2: Field Profiles --- - ws2 = wb.create_sheet("Field Profiles") - col_count2 = df_to_sheet(ws2, profiles_df) - add_title(ws2, f"Field Profiles — Dimension Breakdown — {source_name}", col_count2) - style_header_row(ws2, 2, col_count2) - style_data_rows(ws2, 3, ws2.max_row, col_count2) - auto_width(ws2) - - # Data bars on each score column (cols 2–7) - for c_idx in range(2, col_count2 + 1): - col_letter = get_column_letter(c_idx) - ws2.conditional_formatting.add( - f"{col_letter}3:{col_letter}{ws2.max_row}", - DataBarRule(start_type="num", start_value=0, end_type="num", - end_value=1, color="4472C4") - ) - ws2.freeze_panes = "B3" - - # --- Tab 3: Chart Recommendations --- - ws3 = wb.create_sheet("Chart Recommendations") - col_count3 = df_to_sheet(ws3, chart_recs_df) - add_title(ws3, f"Chart Recommendations — {source_name}", col_count3) - style_header_row(ws3, 2, col_count3) - style_data_rows(ws3, 3, ws3.max_row, col_count3) - auto_width(ws3) - 
ws3.freeze_panes = "A3" - - # --- Tab 4: Correlation Matrix --- - ws4 = wb.create_sheet("Correlation Matrix") - if "note" in corr_matrix_df.columns: - ws4.append(["note"]) - ws4.append([corr_matrix_df["note"].iloc[0]]) - else: - col_count4 = df_to_sheet(ws4, corr_matrix_df.reset_index()) - add_title(ws4, f"Pearson Correlation Matrix (Numeric Fields) — {source_name}", col_count4) - style_header_row(ws4, 2, col_count4) - style_data_rows(ws4, 3, ws4.max_row, col_count4) - auto_width(ws4) - # Diverging color scale - score_col_end = get_column_letter(col_count4) - ws4.conditional_formatting.add( - f"B3:{score_col_end}{ws4.max_row}", - ColorScaleRule(start_type="num", start_value=-1, start_color="F8696B", - mid_type="num", mid_value=0, mid_color="FFFFFF", - end_type="num", end_value=1, end_color="63BE7B") - ) - ws4.freeze_panes = "B3" - - wb.save(output_path) - - -# --------------------------------------------------------------------------- -# PDF Report -# --------------------------------------------------------------------------- - -def write_pdf(rankings_df, profiles_df, chart_recs_df, corr_matrix_df, - output_path: str, source_name: str): - from reportlab.lib.pagesizes import letter, landscape - from reportlab.lib import colors - from reportlab.lib.units import inch - from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle - from reportlab.platypus import (SimpleDocTemplate, Paragraph, Spacer, Table, - TableStyle, PageBreak, HRFlowable) - from reportlab.lib.enums import TA_LEFT, TA_CENTER - - PAGE_W, PAGE_H = landscape(letter) - NAVY = colors.HexColor("#1F3864") - LIGHT_BLUE = colors.HexColor("#D9E1F2") - ALT_ROW = colors.HexColor("#EEF2F7") - GREEN = colors.HexColor("#63BE7B") - RED = colors.HexColor("#F8696B") - YELLOW = colors.HexColor("#FFEB84") - - doc = SimpleDocTemplate( - output_path, - pagesize=landscape(letter), - rightMargin=0.5 * inch, - leftMargin=0.5 * inch, - topMargin=0.5 * inch, - bottomMargin=0.5 * inch, - ) - - styles = 
getSampleStyleSheet() - h1 = ParagraphStyle("h1", parent=styles["Heading1"], fontSize=16, - textColor=NAVY, spaceAfter=4) - h2 = ParagraphStyle("h2", parent=styles["Heading2"], fontSize=12, - textColor=NAVY, spaceAfter=4) - body = ParagraphStyle("body", parent=styles["Normal"], fontSize=9, spaceAfter=2) - caption = ParagraphStyle("caption", parent=styles["Normal"], fontSize=8, - textColor=colors.grey, spaceAfter=6) - - def score_color(val): - try: - v = float(val) - except (ValueError, TypeError): - return colors.white - if v >= 0.75: - return GREEN - if v >= 0.45: - return YELLOW - return RED - - def make_table(df, col_widths=None, score_cols=None): - data = [list(df.columns)] - for _, row in df.iterrows(): - data.append([str(v) if not isinstance(v, (int, float)) else round(v, 4) for v in row]) - - available_w = PAGE_W - inch - if col_widths is None: - w = available_w / len(df.columns) - col_widths = [w] * len(df.columns) - - style_cmds = [ - ("BACKGROUND", (0, 0), (-1, 0), NAVY), - ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), - ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), - ("FONTSIZE", (0, 0), (-1, -1), 8), - ("FONTNAME", (0, 1), (-1, -1), "Helvetica"), - ("ALIGN", (0, 0), (-1, -1), "CENTER"), - ("ALIGN", (0, 1), (0, -1), "LEFT"), - ("VALIGN", (0, 0), (-1, -1), "MIDDLE"), - ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, ALT_ROW]), - ("GRID", (0, 0), (-1, -1), 0.5, colors.HexColor("#CCCCCC")), - ("TOPPADDING", (0, 0), (-1, -1), 4), - ("BOTTOMPADDING", (0, 0), (-1, -1), 4), - ] - - if score_cols: - for r_idx, row in enumerate(data[1:], start=1): - for c_idx in score_cols: - if c_idx < len(row): - cell_val = row[c_idx] - bg = score_color(cell_val) - style_cmds.append(("BACKGROUND", (c_idx, r_idx), (c_idx, r_idx), bg)) - - t = Table(data, colWidths=col_widths, repeatRows=1) - t.setStyle(TableStyle(style_cmds)) - return t - - story = [] - - # Cover-style header - story.append(Paragraph(f"Field Story Scorer Report", h1)) - story.append(Paragraph(f"Source: 
{source_name}", body)) - story.append(HRFlowable(width="100%", thickness=2, color=NAVY)) - story.append(Spacer(1, 0.15 * inch)) - - # Tab 1: Rankings - story.append(Paragraph("Tab 1 — Field Rankings", h2)) - story.append(Paragraph( - "Fields ranked by composite score (weighted: Completeness 30%, Type Consistency 25%, " - "Cardinality 15%, Distribution 15%, Correlation 15%). " - "Green ≥ 0.75 | Yellow 0.45–0.74 | Red < 0.45.", caption)) - - r_df = rankings_df.reset_index() - n_cols = len(r_df.columns) - available = PAGE_W - inch - col_widths_r = [0.5 * inch, 2.2 * inch] + [(available - 2.7 * inch) / (n_cols - 2)] * (n_cols - 2) - story.append(make_table(r_df, col_widths_r, score_cols=[2])) - story.append(PageBreak()) - - # Tab 2: Profiles - story.append(Paragraph("Tab 2 — Field Profiles (Dimension Breakdown)", h2)) - story.append(Paragraph( - "Per-field scores across all five dimensions. Scores are 0–1 ratios. " - "Color coding matches the ranking table.", caption)) - p_df = profiles_df.copy() - n_cols_p = len(p_df.columns) - first_col_w = 2.0 * inch - rest_w = (PAGE_W - inch - first_col_w) / (n_cols_p - 1) - col_widths_p = [first_col_w] + [rest_w] * (n_cols_p - 1) - score_cols_p = list(range(1, n_cols_p)) - story.append(make_table(p_df, col_widths_p, score_cols=score_cols_p)) - story.append(PageBreak()) - - # Tab 3: Chart Recommendations - story.append(Paragraph("Tab 3 — Chart Recommendations", h2)) - story.append(Paragraph( - "Suggested visualization types per field, derived from inferred field type, " - "cardinality ratio, and distribution score.", caption)) - c_df = chart_recs_df.copy() - col_widths_c = [1.8 * inch, 1.5 * inch, 4.5 * inch, 1.0 * inch, 1.0 * inch] - total_c = sum(col_widths_c) - if total_c > PAGE_W - inch: - scale = (PAGE_W - inch) / total_c - col_widths_c = [w * scale for w in col_widths_c] - story.append(make_table(c_df, col_widths_c)) - story.append(PageBreak()) - - # Tab 4: Correlation Matrix - story.append(Paragraph("Tab 4 — Pearson 
Correlation Matrix (Numeric Fields)", h2)) - if "note" in corr_matrix_df.columns: - story.append(Paragraph(corr_matrix_df["note"].iloc[0], body)) - else: - story.append(Paragraph( - "Pearson correlation coefficients for all numeric fields. " - "Green = strong positive, Red = strong negative, White = near-zero.", caption)) - cm = corr_matrix_df.reset_index() - n_c = len(cm.columns) - cw = (PAGE_W - inch) / n_c - col_widths_cm = [cw] * n_c - score_cols_cm = list(range(1, n_c)) - story.append(make_table(cm, col_widths_cm, score_cols=score_cols_cm)) - - doc.build(story) - - -# --------------------------------------------------------------------------- -# CLI Entry Point -# --------------------------------------------------------------------------- - -def main(): - parser = argparse.ArgumentParser( - description="field-story-scorer: Score every column in an xlsx file.", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog="See README.md for full documentation and scoring methodology.", - ) - parser.add_argument("--version", action="version", version=f"field-story-scorer {__version__}") - parser.add_argument("--input", required=True, help="Path to input Excel file") - parser.add_argument("--sheet", default=0, help="Sheet name or index (default: first sheet)") - parser.add_argument("--output-dir", default="./reports", help="Directory for output files") - parser.add_argument( - "--strict-types", - action="store_true", - help=( - "Read cell values directly via openpyxl instead of letting pandas infer dtypes. " - "Use this when columns contain mixed types that pandas silently coerces — e.g. a " - "column that is mostly numeric but contains stray strings like 'N/A', 'TBD', or '—'. " - "Without this flag, pandas converts those strings to NaN and the column scores as " - "fully numeric. With this flag, every cell's raw Python type is preserved and " - "type_consistency scores reflect true cell-level heterogeneity. 
A 'type_mix' column " - "is added to the rankings and chart-recs tabs showing the exact type breakdown." - ), - ) - args = parser.parse_args() - - input_path = Path(args.input) - if not input_path.exists(): - print(f"ERROR: Input file not found: {input_path}", file=sys.stderr) - sys.exit(1) - - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - sheet_arg = args.sheet - try: - sheet_arg = int(sheet_arg) - except ValueError: - pass - - if args.strict_types: - print(f"Reading (strict-types): {input_path} | Sheet: {sheet_arg}") - try: - df = load_strict(str(input_path), sheet_arg) - except Exception as e: - print(f"ERROR reading file (strict mode): {e}", file=sys.stderr) - sys.exit(1) - else: - print(f"Reading: {input_path} | Sheet: {sheet_arg}") - try: - df = pd.read_excel(input_path, sheet_name=sheet_arg) - except Exception as e: - print(f"ERROR reading file: {e}", file=sys.stderr) - sys.exit(1) - - print(f" {len(df)} rows × {len(df.columns)} columns") - if args.strict_types: - print(" [strict-types] pandas dtype inference bypassed — raw cell types preserved") - - print("Scoring fields...") - rankings_df, profiles_df, chart_recs_df, corr_matrix_df = analyze(df, strict_types=args.strict_types) - - stem = input_path.stem - mode_tag = "_strict" if args.strict_types else "" - source_name = f"{stem} ({sheet_arg}){' [strict-types]' if args.strict_types else ''}" - - xlsx_out = output_dir / f"{stem}_field_report{mode_tag}.xlsx" - pdf_out = output_dir / f"{stem}_field_report{mode_tag}.pdf" - - print(f"Writing Excel → {xlsx_out}") - write_excel(rankings_df, profiles_df, chart_recs_df, corr_matrix_df, - str(xlsx_out), source_name) - - print(f"Writing PDF → {pdf_out}") - write_pdf(rankings_df, profiles_df, chart_recs_df, corr_matrix_df, - str(pdf_out), source_name) - - print("\nDone.") - print(f" Excel: {xlsx_out}") - print(f" PDF: {pdf_out}") - - print("\nTop 5 fields by composite score:") - top5 = rankings_df.head(5).reset_index() - for _, 
row in top5.iterrows(): - mix = f" mix=[{row['type_mix']}]" if args.strict_types and row.get("type_mix") else "" - print(f" #{row['rank']:>2} {row['field']:<30} score={row['composite_score']:.4f} type={row['field_type']}{mix}") - - -if __name__ == "__main__": - main() - diff --git a/tests/test_cardinality.py b/tests/test_cardinality.py index b14afaa..cb14b22 100644 --- a/tests/test_cardinality.py +++ b/tests/test_cardinality.py @@ -68,7 +68,7 @@ def test_finding_field_name(self, result): def test_finding_type(self, result): finding = analyze_cardinality(result)[0] - assert finding.finding_type is FindingType.CARDINALITY_ANOMALY + assert finding.finding_type is FindingType.DUPLICATE_IDS def test_severity_is_none(self, result): """Severity is assigned later by the classifier, not the detector.""" @@ -116,7 +116,7 @@ def test_finding_field_name(self, result): def test_finding_type(self, result): finding = analyze_cardinality(result)[0] - assert finding.finding_type is FindingType.CARDINALITY_ANOMALY + assert finding.finding_type is FindingType.NEAR_CONSTANT def test_evidence_unique_count(self, result): finding = analyze_cardinality(result)[0] diff --git a/tests/test_composer.py b/tests/test_composer.py index 611097f..fffe9e3 100644 --- a/tests/test_composer.py +++ b/tests/test_composer.py @@ -92,7 +92,7 @@ class TestComposeLeadingZeros: @pytest.fixture() def finding(self) -> Finding: f = _make_finding( - FindingType.FORMAT_INCONSISTENCY, + FindingType.LEADING_ZEROS, { "leading_zero_count": 10, "no_leading_zero_count": 40, @@ -148,7 +148,7 @@ def processed(self) -> list[Finding]: # 5 WARNING (leading zeros) for i in range(5): findings.append(_make_finding( - FindingType.FORMAT_INCONSISTENCY, + FindingType.LEADING_ZEROS, { "leading_zero_count": 5, "no_leading_zero_count": 15, @@ -162,7 +162,7 @@ def processed(self) -> list[Finding]: # 2 INFO (near-constant) for i in range(2): findings.append(_make_finding( - FindingType.CARDINALITY_ANOMALY, + FindingType.NEAR_CONSTANT, 
{ "unique_count": 1, "total_count": 1000, @@ -245,7 +245,7 @@ class TestComposeNearConstant: def test_near_constant_info_with_text(self): f = _make_finding( - FindingType.CARDINALITY_ANOMALY, + FindingType.NEAR_CONSTANT, { "unique_count": 2, "total_count": 5000, @@ -273,7 +273,7 @@ class TestComposeSuspectedDuplicates: def test_duplicates_warning_with_text(self): f = _make_finding( - FindingType.CARDINALITY_ANOMALY, + FindingType.DUPLICATE_IDS, { "unique_count": 980, "total_count": 1000, @@ -298,7 +298,7 @@ class TestComposeMixedDates: def test_mixed_dates_warning_with_text(self): f = _make_finding( - FindingType.FORMAT_INCONSISTENCY, + FindingType.MIXED_DATES, { "formats_found": ["%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y"], "examples_per_format": { @@ -364,7 +364,7 @@ def test_sentinel_empty_evidence(self): def test_leading_zeros_empty_evidence(self): f = _make_finding( - FindingType.FORMAT_INCONSISTENCY, + FindingType.LEADING_ZEROS, {"leading_zero_count": 0}, ) compose_finding(f) @@ -372,7 +372,7 @@ def test_leading_zeros_empty_evidence(self): def test_mixed_dates_empty_evidence(self): f = _make_finding( - FindingType.FORMAT_INCONSISTENCY, + FindingType.MIXED_DATES, {"formats_found": []}, ) compose_finding(f) @@ -380,7 +380,7 @@ def test_mixed_dates_empty_evidence(self): def test_near_constant_empty_evidence(self): f = _make_finding( - FindingType.CARDINALITY_ANOMALY, + FindingType.NEAR_CONSTANT, {"top_values": []}, ) compose_finding(f) @@ -388,7 +388,7 @@ def test_near_constant_empty_evidence(self): def test_duplicate_ids_empty_evidence(self): f = _make_finding( - FindingType.CARDINALITY_ANOMALY, + FindingType.DUPLICATE_IDS, {"duplicate_values": []}, ) compose_finding(f) @@ -443,7 +443,7 @@ def test_sentinel_long_name(self, long_name): def test_leading_zeros_long_name(self, long_name): f = _make_finding( - FindingType.FORMAT_INCONSISTENCY, + FindingType.LEADING_ZEROS, {"leading_zero_count": 1, "total_checked": 10}, field_name=long_name, ) @@ -453,7 +453,7 @@ def 
test_leading_zeros_long_name(self, long_name): def test_near_constant_long_name(self, long_name): f = _make_finding( - FindingType.CARDINALITY_ANOMALY, + FindingType.NEAR_CONSTANT, {"top_values": [{"value": "A", "count": 100}], "total_count": 100}, field_name=long_name, ) @@ -498,7 +498,7 @@ def test_process_findings_all_fields_non_none(self): field_name="cost", ), _make_finding( - FindingType.FORMAT_INCONSISTENCY, + FindingType.LEADING_ZEROS, { "leading_zero_count": 5, "no_leading_zero_count": 15, @@ -509,7 +509,7 @@ def test_process_findings_all_fields_non_none(self): field_name="zip", ), _make_finding( - FindingType.CARDINALITY_ANOMALY, + FindingType.NEAR_CONSTANT, { "unique_count": 1, "total_count": 500, @@ -532,7 +532,7 @@ def test_process_findings_sort_order(self): findings = [ # INFO (near-constant) -- will sort last _make_finding( - FindingType.CARDINALITY_ANOMALY, + FindingType.NEAR_CONSTANT, { "unique_count": 1, "total_count": 100, @@ -543,7 +543,7 @@ def test_process_findings_sort_order(self): ), # WARNING (leading zeros) -- will sort middle _make_finding( - FindingType.FORMAT_INCONSISTENCY, + FindingType.LEADING_ZEROS, { "leading_zero_count": 2, "no_leading_zero_count": 8, diff --git a/tests/test_format_check.py b/tests/test_format_check.py index 3a89a41..059a214 100644 --- a/tests/test_format_check.py +++ b/tests/test_format_check.py @@ -79,7 +79,7 @@ def test_finding_field_name(self, ae2_result): def test_finding_type(self, ae2_result): finding = analyze_leading_zeros(ae2_result)[0] - assert finding.finding_type is FindingType.FORMAT_INCONSISTENCY + assert finding.finding_type is FindingType.LEADING_ZEROS def test_severity_is_none(self, ae2_result): """Severity is assigned later by the classifier, not the detector.""" @@ -217,7 +217,7 @@ def test_finding_field_name(self, result): def test_finding_type(self, result): finding = analyze_mixed_dates(result)[0] - assert finding.finding_type is FindingType.FORMAT_INCONSISTENCY + assert finding.finding_type 
is FindingType.MIXED_DATES def test_severity_is_none(self, result): finding = analyze_mixed_dates(result)[0] diff --git a/tests/test_models.py b/tests/test_models.py index 2edbfb1..ad0a635 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -41,12 +41,14 @@ def test_sort_produces_critical_first(self): class TestFindingType: - def test_contains_all_four_types(self): + def test_contains_all_six_types(self): expected = { "TYPE_INCONSISTENCY", "SENTINEL_VALUE", - "FORMAT_INCONSISTENCY", - "CARDINALITY_ANOMALY", + "LEADING_ZEROS", + "MIXED_DATES", + "NEAR_CONSTANT", + "DUPLICATE_IDS", } assert {ft.name for ft in FindingType} == expected @@ -94,7 +96,7 @@ def test_pre_composition_state_has_none_text_fields(self): assert f.prevention_rule is None def test_evidence_defaults_to_empty_dict(self): - f = Finding(field_name="x", finding_type=FindingType.CARDINALITY_ANOMALY) + f = Finding(field_name="x", finding_type=FindingType.NEAR_CONSTANT) assert f.evidence == {} def test_sort_findings_by_severity(self): @@ -112,7 +114,7 @@ def test_sort_findings_by_severity(self): ), Finding( field_name="c", - finding_type=FindingType.FORMAT_INCONSISTENCY, + finding_type=FindingType.LEADING_ZEROS, severity=Severity.WARNING, ), ] diff --git a/tests/test_report_pdf.py b/tests/test_report_pdf.py index 825f1ef..ae5a484 100644 --- a/tests/test_report_pdf.py +++ b/tests/test_report_pdf.py @@ -54,7 +54,7 @@ def _critical_finding(field_name: str = "revenue") -> Finding: def _warning_finding(field_name: str = "zip_code") -> Finding: return _make_finding( - FindingType.FORMAT_INCONSISTENCY, + FindingType.LEADING_ZEROS, { "leading_zero_count": 10, "no_leading_zero_count": 40, @@ -68,7 +68,7 @@ def _warning_finding(field_name: str = "zip_code") -> Finding: def _info_finding(field_name: str = "status") -> Finding: return _make_finding( - FindingType.CARDINALITY_ANOMALY, + FindingType.NEAR_CONSTANT, { "unique_count": 2, "total_count": 5000, @@ -281,7 +281,7 @@ def test_full_pipeline(self, 
tmp_path: Path): ), Finding( field_name="zip_code", - finding_type=FindingType.FORMAT_INCONSISTENCY, + finding_type=FindingType.LEADING_ZEROS, evidence={ "leading_zero_count": 5, "no_leading_zero_count": 45, @@ -292,7 +292,7 @@ def test_full_pipeline(self, tmp_path: Path): ), Finding( field_name="invoice_date", - finding_type=FindingType.FORMAT_INCONSISTENCY, + finding_type=FindingType.MIXED_DATES, evidence={ "formats_found": ["%Y-%m-%d", "%m/%d/%Y"], "examples_per_format": { @@ -304,7 +304,7 @@ def test_full_pipeline(self, tmp_path: Path): ), Finding( field_name="country", - finding_type=FindingType.CARDINALITY_ANOMALY, + finding_type=FindingType.NEAR_CONSTANT, evidence={ "unique_count": 1, "total_count": 500, @@ -314,7 +314,7 @@ def test_full_pipeline(self, tmp_path: Path): ), Finding( field_name="order_id", - finding_type=FindingType.CARDINALITY_ANOMALY, + finding_type=FindingType.DUPLICATE_IDS, evidence={ "unique_count": 980, "total_count": 1000, diff --git a/tests/test_severity.py b/tests/test_severity.py index eab81aa..91c5ad8 100644 --- a/tests/test_severity.py +++ b/tests/test_severity.py @@ -146,7 +146,7 @@ def test_sentinel_in_bool_column_still_critical(self): # --------------------------------------------------------------------------- -# FORMAT_INCONSISTENCY severity +# LEADING_ZEROS / MIXED_DATES severity # --------------------------------------------------------------------------- class TestFormatInconsistency: @@ -154,7 +154,7 @@ class TestFormatInconsistency: def test_leading_zeros_is_warning(self): """Leading-zero inconsistency -> WARNING.""" finding = _make_finding( - FindingType.FORMAT_INCONSISTENCY, + FindingType.LEADING_ZEROS, { "leading_zero_count": 10, "no_leading_zero_count": 40, @@ -168,7 +168,7 @@ def test_leading_zeros_is_warning(self): def test_mixed_dates_is_warning(self): """Mixed date formats -> WARNING.""" finding = _make_finding( - FindingType.FORMAT_INCONSISTENCY, + FindingType.MIXED_DATES, { "formats_found": ["%Y-%m-%d", 
"%m/%d/%Y"], "examples_per_format": { @@ -182,7 +182,7 @@ def test_mixed_dates_is_warning(self): # --------------------------------------------------------------------------- -# CARDINALITY_ANOMALY severity +# NEAR_CONSTANT / DUPLICATE_IDS severity # --------------------------------------------------------------------------- class TestCardinalityAnomaly: @@ -190,7 +190,7 @@ class TestCardinalityAnomaly: def test_near_constant_is_info(self): """Near-constant column -> INFO.""" finding = _make_finding( - FindingType.CARDINALITY_ANOMALY, + FindingType.NEAR_CONSTANT, { "unique_count": 1, "total_count": 1000, @@ -203,7 +203,7 @@ def test_near_constant_is_info(self): def test_suspected_duplicate_ids_is_warning(self): """Suspected duplicate IDs -> WARNING.""" finding = _make_finding( - FindingType.CARDINALITY_ANOMALY, + FindingType.DUPLICATE_IDS, { "unique_count": 980, "total_count": 1000, @@ -244,16 +244,26 @@ def test_empty_evidence_sentinel(self): finding = _make_finding(FindingType.SENTINEL_VALUE, {}) assert classify_severity(finding) is Severity.CRITICAL - def test_empty_evidence_format(self): - """FORMAT_INCONSISTENCY with empty evidence -> WARNING.""" - finding = _make_finding(FindingType.FORMAT_INCONSISTENCY, {}) + def test_empty_evidence_leading_zeros(self): + """LEADING_ZEROS with empty evidence -> WARNING.""" + finding = _make_finding(FindingType.LEADING_ZEROS, {}) assert classify_severity(finding) is Severity.WARNING - def test_cardinality_empty_evidence_defaults_to_info(self): - """CARDINALITY_ANOMALY with no distinguishing keys -> INFO.""" - finding = _make_finding(FindingType.CARDINALITY_ANOMALY, {}) + def test_empty_evidence_mixed_dates(self): + """MIXED_DATES with empty evidence -> WARNING.""" + finding = _make_finding(FindingType.MIXED_DATES, {}) + assert classify_severity(finding) is Severity.WARNING + + def test_near_constant_empty_evidence_defaults_to_info(self): + """NEAR_CONSTANT with no distinguishing keys -> INFO.""" + finding = 
_make_finding(FindingType.NEAR_CONSTANT, {}) assert classify_severity(finding) is Severity.INFO + def test_duplicate_ids_empty_evidence_defaults_to_warning(self): + """DUPLICATE_IDS with empty evidence -> WARNING.""" + finding = _make_finding(FindingType.DUPLICATE_IDS, {}) + assert classify_severity(finding) is Severity.WARNING + def test_very_long_field_name(self): """Very long field name does not crash the classifier.""" long_name = "a" * 1000 diff --git a/tools/render_strict_mode_comparison.py b/tools/render_strict_mode_comparison.py deleted file mode 100644 index ba0fa80..0000000 --- a/tools/render_strict_mode_comparison.py +++ /dev/null @@ -1,250 +0,0 @@ -""" -Render samples/output/screenshots/strict_mode_comparison.png. - -Produces a side-by-side comparison of the Field Rankings tab in standard mode -vs --strict-types mode for sample_mixed_types.xlsx. The two tables are rendered -directly with Pillow so the screenshot stays in sync with the actual scorer -output without needing a screen recording / Excel. - -Usage: - python tools/render_strict_mode_comparison.py -""" - -import sys -from pathlib import Path - -from PIL import Image, ImageDraw, ImageFont - -ROOT = Path(__file__).resolve().parent.parent -OUT_PATH = ROOT / "samples" / "output" / "screenshots" / "strict_mode_comparison.png" - -# Make `scorer` importable so we can sanity-check the hardcoded numbers below -# against what the live code actually produces on the bundled sample. 
-sys.path.insert(0, str(ROOT)) - -NAVY = (31, 56, 100) -LIGHT_BLUE = (217, 225, 242) -GREEN = (99, 190, 123) -YELLOW = (255, 235, 132) -RED = (248, 105, 107) -ALT_ROW = (238, 242, 247) -WHITE = (255, 255, 255) -GREY_BORDER = (180, 180, 180) -TEXT = (33, 33, 33) -WHITE_TEXT = (255, 255, 255) -SUBTITLE = (80, 80, 80) - - -def score_color(value): - if value >= 0.75: - return GREEN - if value >= 0.45: - return YELLOW - return RED - - -def _font(size, bold=False): - path = ( - "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf" - if bold - else "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf" - ) - return ImageFont.truetype(path, size) - - -def _draw_cell(draw, x, y, w, h, text, *, fill, text_color, font, align="center"): - draw.rectangle([x, y, x + w, y + h], fill=fill, outline=GREY_BORDER, width=1) - if text is None or text == "": - return - bbox = draw.textbbox((0, 0), str(text), font=font) - tw = bbox[2] - bbox[0] - th = bbox[3] - bbox[1] - if align == "center": - tx = x + (w - tw) // 2 - elif align == "left": - tx = x + 8 - else: - tx = x + w - tw - 8 - ty = y + (h - th) // 2 - bbox[1] - draw.text((tx, ty), str(text), fill=text_color, font=font) - - -def _draw_table(draw, top_left, title, headers, rows, col_widths, score_col_idx): - x0, y0 = top_left - title_h = 32 - header_h = 28 - row_h = 26 - table_w = sum(col_widths) - - # Title bar - draw.rectangle([x0, y0, x0 + table_w, y0 + title_h], - fill=LIGHT_BLUE, outline=GREY_BORDER, width=1) - title_font = _font(13, bold=True) - bbox = draw.textbbox((0, 0), title, font=title_font) - th = bbox[3] - bbox[1] - draw.text((x0 + 10, y0 + (title_h - th) // 2 - bbox[1]), - title, fill=NAVY, font=title_font) - - # Header row - hy = y0 + title_h - header_font = _font(11, bold=True) - cx = x0 - for header, width in zip(headers, col_widths): - _draw_cell(draw, cx, hy, width, header_h, header, - fill=NAVY, text_color=WHITE_TEXT, font=header_font) - cx += width - - # Data rows - body_font = _font(11) - 
for r_idx, row in enumerate(rows): - ry = hy + header_h + r_idx * row_h - row_bg = ALT_ROW if r_idx % 2 == 1 else WHITE - cx = x0 - for c_idx, (value, width) in enumerate(zip(row, col_widths)): - if c_idx == score_col_idx: - try: - fill = score_color(float(value)) - except (ValueError, TypeError): - fill = row_bg - else: - fill = row_bg - align = "left" if c_idx == 1 else "center" - _draw_cell(draw, cx, ry, width, row_h, value, - fill=fill, text_color=TEXT, font=body_font, align=align) - cx += width - - return table_w, title_h + header_h + len(rows) * row_h - - -def main(): - # Source-of-truth rows are the actual current scorer outputs. - standard_headers = ["rank", "field", "composite_score", "field_type", - "null_count", "unique_count"] - standard_rows = [ - [1, "revenue", "1.0000", "numeric_continuous", 0, 200], - [2, "revenue_mixed", "0.9775", "numeric_continuous", 15, 185], - [3, "customer_id", "0.7750", "identifier", 0, 200], - [4, "region", "0.6287", "categorical_low", 0, 5], - ] - standard_widths = [50, 130, 130, 170, 90, 110] - - strict_headers = standard_headers + ["type_mix"] - strict_rows = [ - [1, "revenue", "1.0000", "numeric_continuous", 0, 200, "numeric:200"], - [2, "revenue_mixed", "0.9708", "numeric_continuous", 0, 186, "numeric:185, str:15"], - [3, "customer_id", "0.8500", "identifier", 0, 200, "str:200"], - [4, "region", "0.7036", "categorical_low", 0, 5, "str:200"], - ] - strict_widths = [50, 130, 130, 170, 90, 110, 175] - - pad = 24 - gap = 32 - caption_h = 60 - title_h_outer = 56 # top header strip ("Field Rankings — ...") - standard_w = sum(standard_widths) - strict_w = sum(strict_widths) - total_w = pad + standard_w + gap + strict_w + pad - total_h = pad + title_h_outer + (32 + 28 + len(standard_rows) * 26) + caption_h + pad - - img = Image.new("RGB", (total_w, total_h), WHITE) - draw = ImageDraw.Draw(img) - - title_font = _font(15, bold=True) - draw.text((pad, pad), - "Field Rankings — sample_mixed_types.xlsx (rank tab)", - fill=NAVY, 
font=title_font) - subtitle_font = _font(11) - draw.text((pad, pad + 24), - "Left: standard mode · Right: --strict-types " - "(same input, scored two ways)", - fill=SUBTITLE, font=subtitle_font) - - table_top = pad + title_h_outer - _draw_table( - draw, (pad, table_top), - "Field Rankings — sample_mixed_types.xlsx", - standard_headers, standard_rows, standard_widths, - score_col_idx=2, - ) - _draw_table( - draw, (pad + standard_w + gap, table_top), - "Field Rankings — sample_mixed_types.xlsx [strict-types]", - strict_headers, strict_rows, strict_widths, - score_col_idx=2, - ) - - # Caption beneath the tables - caption_y = table_top + 32 + 28 + len(standard_rows) * 26 + 16 - cap_font = _font(11) - cap_bold = _font(11, bold=True) - - line1 = "Standard mode: revenue_mixed scores 0.9775 and is classified as numeric_continuous —" - line1b = " the 15 string cells were silently converted to NaN." - draw.text((pad, caption_y), line1, fill=TEXT, font=cap_font) - bbox = draw.textbbox((0, 0), line1, font=cap_font) - draw.text((pad + (bbox[2] - bbox[0]), caption_y), line1b, fill=TEXT, font=cap_font) - - line2_a = "--strict-types: revenue_mixed scores 0.9708, still numeric_continuous," - line2_b = " and the new type_mix column exposes the breakdown: " - line2_c = "numeric:185, str:15" - line2_d = "." 
- y2 = caption_y + 20 - draw.text((pad, y2), line2_a, fill=TEXT, font=cap_font) - w = draw.textbbox((0, 0), line2_a, font=cap_font)[2] - draw.text((pad + w, y2), line2_b, fill=TEXT, font=cap_font) - w += draw.textbbox((0, 0), line2_b, font=cap_font)[2] - draw.text((pad + w, y2), line2_c, fill=NAVY, font=cap_bold) - w += draw.textbbox((0, 0), line2_c, font=cap_bold)[2] - draw.text((pad + w, y2), line2_d, fill=TEXT, font=cap_font) - - OUT_PATH.parent.mkdir(parents=True, exist_ok=True) - img.save(OUT_PATH, format="PNG") - print(f"Wrote {OUT_PATH.relative_to(ROOT)} ({img.size[0]}×{img.size[1]})") - - _verify_against_live_scorer(standard_rows, strict_rows) - - -def _verify_against_live_scorer(standard_rows, strict_rows): - """Run the scorer on the bundled sample and warn if any hardcoded row - diverges from what the live code produces. Keeps the script honest.""" - sample = ROOT / "samples" / "input" / "sample_mixed_types.xlsx" - if not sample.exists(): - print(f" (skip verification — {sample.relative_to(ROOT)} not present)") - return - try: - import pandas as pd - from scorer import analyze, load_strict - except ImportError as e: - print(f" (skip verification — {e})") - return - - std_actual, _, _, _ = analyze(pd.read_excel(sample), strict_types=False) - strict_actual, _, _, _ = analyze(load_strict(str(sample), 0), strict_types=True) - - mismatches = [] - for table_name, hardcoded, actual in ( - ("standard", standard_rows, std_actual), - ("strict", strict_rows, strict_actual), - ): - for row in hardcoded: - field, hc_score = row[1], row[2] - actual_row = actual[actual["field"] == field] - if actual_row.empty: - mismatches.append(f"{table_name}/{field}: missing in actual output") - continue - live_score = f"{float(actual_row.iloc[0]['composite_score']):.4f}" - if live_score != hc_score: - mismatches.append( - f"{table_name}/{field}: hardcoded {hc_score} vs live {live_score}" - ) - - if mismatches: - print(" WARNING: hardcoded rows are stale — rerender after updating:") 
- for m in mismatches: - print(f" - {m}") - else: - print(" Verified against live scorer output ✓") - - -if __name__ == "__main__": - main()