diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
new file mode 100644
index 0000000..da5e966
--- /dev/null
+++ b/.github/workflows/fuzz.yml
@@ -0,0 +1,68 @@
+# Fuzz the XLSX reader entry points with cargo-fuzz (one matrix job per target).
+# Triggers: manual dispatch with a configurable duration, a weekly schedule,
+# and pull requests that touch the reader or the fuzz harness.
+name: Fuzz
+
+on:
+  # Run on-demand
+  workflow_dispatch:
+    inputs:
+      duration:
+        description: "Fuzz duration in seconds per target"
+        required: false
+        default: "300"
+  # Weekly scheduled run
+  schedule:
+    - cron: "0 3 * * 1"  # Every Monday at 03:00 UTC
+  # Also run on PRs that touch the reader
+  pull_request:
+    paths:
+      - "src/reader/**"
+      - "fuzz/**"
+
+jobs:
+  fuzz:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        target:
+          - fuzz_read_xlsx
+          - fuzz_read_single_sheet
+          - fuzz_read_sheet_names
+          - fuzz_read_document_properties
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Install Rust nightly
+        uses: dtolnay/rust-toolchain@nightly
+
+      - name: Install cargo-fuzz
+        run: cargo install cargo-fuzz
+
+      - name: Cache fuzz corpus
+        uses: actions/cache@v4
+        with:
+          path: fuzz/corpus/${{ matrix.target }}
+          key: fuzz-corpus-${{ matrix.target }}-${{ github.sha }}
+          restore-keys: |
+            fuzz-corpus-${{ matrix.target }}-
+
+      - name: Run fuzzer
+        env:
+          # Scheduled and PR runs carry no dispatch input, so fall back to a
+          # short 60-second run. The value is passed through the environment
+          # instead of being interpolated into the script, so the free-form
+          # input text is never parsed as shell code.
+          DURATION: ${{ github.event.inputs.duration || '60' }}
+        run: |
+          cargo +nightly fuzz run ${{ matrix.target }} -- \
+            -max_total_time="$DURATION" \
+            -max_len=65536
+
+      - name: Upload crash artifacts
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: fuzz-crash-${{ matrix.target }}
+          path: fuzz/artifacts/${{ matrix.target }}
diff --git a/.gitignore b/.gitignore
index 6ad2ff2..bd02022 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,10 @@ env/
 *.swp
 *.swo
 
+# Fuzzing runtime output: crash artifacts and the working corpus (CI caches the corpus instead of committing it)
+fuzz/artifacts/
+fuzz/corpus/
+
 # OS
 .DS_Store
 Thumbs.db
diff --git a/docs/edge-cases.md b/docs/edge-cases.md
new file mode 100644
index 0000000..ab12e5d
--- /dev/null
+++ b/docs/edge-cases.md
@@ -0,0 +1,96 @@
+# Known Edge Cases and Limitations
+
+This document records edge cases discovered through fuzzing and
corpus
+testing, along with the reader's behavior in each case.
+
+## Security Limits
+
+The reader enforces hard limits to prevent denial-of-service:
+
+| Limit | Value | Effect |
+|-------|-------|--------|
+| Max ZIP entry size | 256 MB (decompressed) | Returns `XlsxError` |
+| Max shared strings | 2,000,000 | Returns `XlsxError` |
+| Max rows per sheet | 1,048,576 (Excel limit) | Returns `XlsxError` |
+
+## ZIP / Archive Edge Cases
+
+| Scenario | Behavior |
+|----------|----------|
+| Truncated ZIP | Clean `ZipError` |
+| Empty ZIP (no entries) | Error: missing `xl/workbook.xml` |
+| Non-ZIP data (random bytes) | Clean `ZipError` |
+| Missing `xl/workbook.xml` | Error: `InvalidStructure` |
+| Missing `_rels/.rels` | Falls back gracefully |
+| ZIP bomb (large compression ratio) | Caught by entry size limit |
+
+## XML Edge Cases
+
+| Scenario | Behavior |
+|----------|----------|
+| Malformed XML (unclosed tags) | Partial parse; may return truncated data |
+| Missing `<sheetData>` element | Empty rows returned |
+| Unknown XML elements | Ignored (forward-compatible) |
+| XML with BOM | Handled by quick-xml |
+| Very deeply nested XML | Parsed normally (no depth limit) |
+
+## Cell & Data Type Edge Cases
+
+| Scenario | Behavior |
+|----------|----------|
+| Shared string index out of bounds | Returns `"[invalid string index]"` |
+| Empty cell element `<c/>` | Returns `Empty` |
+| Cell with no `<v>` child | Returns `Empty` |
+| Boolean cell with value not 0 or 1 | Treated as truthy/falsy |
+| Inline string with rich text runs | Concatenated plain text |
+| Formula with cached value | Returns `Formula { formula, cached_value }` |
+| Error cell (`t="e"`) | Returns the error string (e.g.
`"#REF!"`) | +| Number format code detection | Date serial numbers with date format codes → `Date` | + +## Row & Structure Edge Cases + +| Scenario | Behavior | +|----------|----------| +| Row number 0 | Treated as row 0 (no crash) | +| Sparse rows (gaps in numbering) | Gaps filled with empty rows | +| Row at max (1,048,576) | Accepted (at limit) | +| Duplicate row numbers | Last row wins | +| Columns beyond XFD (16,384) | Parsed if present | +| Merged cells with no data in merge area | Merge range recorded, cells empty | + +## Sheet & Workbook Edge Cases + +| Scenario | Behavior | +|----------|----------| +| Hidden/veryHidden sheets | Parsed with `state` field set | +| Sheet with no rows | Empty `rows` list | +| 100+ sheets in one workbook | All parsed | +| Sheet name with special characters | Preserved as-is | +| Defined names with `hidden="1"` | Included in output | +| Missing `xl/_rels/workbook.xml.rels` | Error: cannot resolve sheet paths | + +## File Source Compatibility + +The test corpus includes files structured to match output from: + +- **Microsoft Excel** (standard OOXML) +- **LibreOffice Calc** (may use different XML namespaces) +- **Google Sheets** (export as .xlsx) +- **opensheet-core writer** (roundtrip testing) +- **Programmatically generated** (minimal valid XLSX) + +## Fuzzing + +Fuzz targets exercise all reader entry points: + +- `fuzz_read_xlsx` — Full workbook parse +- `fuzz_read_single_sheet` — Single sheet extraction +- `fuzz_read_sheet_names` — Sheet name enumeration +- `fuzz_read_document_properties` — Document metadata parse + +Run locally: +```bash +cargo +nightly fuzz run fuzz_read_xlsx -- -max_total_time=60 +``` + +CI runs fuzzing weekly and on PRs touching `src/reader/` or `fuzz/`. 
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml new file mode 100644 index 0000000..427e38c --- /dev/null +++ b/fuzz/Cargo.toml @@ -0,0 +1,40 @@ +[package] +name = "opensheet-core-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" + +[dependencies.opensheet-core] +path = ".." +default-features = false + +# --- Fuzz targets --------------------------------------------------- + +[[bin]] +name = "fuzz_read_xlsx" +path = "fuzz_targets/fuzz_read_xlsx.rs" +doc = false + +[[bin]] +name = "fuzz_read_single_sheet" +path = "fuzz_targets/fuzz_read_single_sheet.rs" +doc = false + +[[bin]] +name = "fuzz_read_sheet_names" +path = "fuzz_targets/fuzz_read_sheet_names.rs" +doc = false + +[[bin]] +name = "fuzz_read_document_properties" +path = "fuzz_targets/fuzz_read_document_properties.rs" +doc = false + +[workspace] +members = ["."] diff --git a/fuzz/fuzz_targets/fuzz_read_document_properties.rs b/fuzz/fuzz_targets/fuzz_read_document_properties.rs new file mode 100644 index 0000000..7b9b3a9 --- /dev/null +++ b/fuzz/fuzz_targets/fuzz_read_document_properties.rs @@ -0,0 +1,8 @@ +#![no_main] +use libfuzzer_sys::fuzz_target; +use std::io::Cursor; + +fuzz_target!(|data: &[u8]| { + let cursor = Cursor::new(data); + let _ = opensheet_core::reader::xlsx::read_document_properties(cursor); +}); diff --git a/fuzz/fuzz_targets/fuzz_read_sheet_names.rs b/fuzz/fuzz_targets/fuzz_read_sheet_names.rs new file mode 100644 index 0000000..2bd428f --- /dev/null +++ b/fuzz/fuzz_targets/fuzz_read_sheet_names.rs @@ -0,0 +1,8 @@ +#![no_main] +use libfuzzer_sys::fuzz_target; +use std::io::Cursor; + +fuzz_target!(|data: &[u8]| { + let cursor = Cursor::new(data); + let _ = opensheet_core::reader::xlsx::read_sheet_names(cursor); +}); diff --git a/fuzz/fuzz_targets/fuzz_read_single_sheet.rs b/fuzz/fuzz_targets/fuzz_read_single_sheet.rs new file mode 100644 index 0000000..75f2f14 --- /dev/null +++ 
b/fuzz/fuzz_targets/fuzz_read_single_sheet.rs @@ -0,0 +1,9 @@ +#![no_main] +use libfuzzer_sys::fuzz_target; +use std::io::Cursor; + +fuzz_target!(|data: &[u8]| { + let cursor = Cursor::new(data); + // Try reading the first sheet by index. + let _ = opensheet_core::reader::xlsx::read_single_sheet(cursor, None, Some(0)); +}); diff --git a/fuzz/fuzz_targets/fuzz_read_xlsx.rs b/fuzz/fuzz_targets/fuzz_read_xlsx.rs new file mode 100644 index 0000000..e647778 --- /dev/null +++ b/fuzz/fuzz_targets/fuzz_read_xlsx.rs @@ -0,0 +1,9 @@ +#![no_main] +use libfuzzer_sys::fuzz_target; +use std::io::Cursor; + +fuzz_target!(|data: &[u8]| { + let cursor = Cursor::new(data); + // We don't care about the result — only that it doesn't panic or hang. + let _ = opensheet_core::reader::xlsx::read_xlsx(cursor); +}); diff --git a/src/lib.rs b/src/lib.rs index 45a9043..aa04375 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,9 +6,9 @@ use pyo3::types::{ use std::fs::File; use std::io::{BufReader, BufWriter}; -mod reader; -mod types; -mod writer; +pub mod reader; +pub mod types; +pub mod writer; use types::CellValue; use writer::xlsx::StreamingXlsxWriter; diff --git a/tests/fixtures/auto_filter.xlsx b/tests/fixtures/auto_filter.xlsx new file mode 100644 index 0000000..c4e9bba Binary files /dev/null and b/tests/fixtures/auto_filter.xlsx differ diff --git a/tests/fixtures/column_widths.xlsx b/tests/fixtures/column_widths.xlsx new file mode 100644 index 0000000..a13255c Binary files /dev/null and b/tests/fixtures/column_widths.xlsx differ diff --git a/tests/fixtures/date_cells.xlsx b/tests/fixtures/date_cells.xlsx new file mode 100644 index 0000000..660547a Binary files /dev/null and b/tests/fixtures/date_cells.xlsx differ diff --git a/tests/fixtures/defined_names.xlsx b/tests/fixtures/defined_names.xlsx new file mode 100644 index 0000000..743749b Binary files /dev/null and b/tests/fixtures/defined_names.xlsx differ diff --git a/tests/fixtures/document_properties.xlsx 
b/tests/fixtures/document_properties.xlsx new file mode 100644 index 0000000..42b28b5 Binary files /dev/null and b/tests/fixtures/document_properties.xlsx differ diff --git a/tests/fixtures/empty_workbook.xlsx b/tests/fixtures/empty_workbook.xlsx new file mode 100644 index 0000000..5bdd58e Binary files /dev/null and b/tests/fixtures/empty_workbook.xlsx differ diff --git a/tests/fixtures/empty_zip.xlsx b/tests/fixtures/empty_zip.xlsx new file mode 100644 index 0000000..15cb0ec Binary files /dev/null and b/tests/fixtures/empty_zip.xlsx differ diff --git a/tests/fixtures/freeze_panes.xlsx b/tests/fixtures/freeze_panes.xlsx new file mode 100644 index 0000000..fc771e1 Binary files /dev/null and b/tests/fixtures/freeze_panes.xlsx differ diff --git a/tests/fixtures/huge_row_gap.xlsx b/tests/fixtures/huge_row_gap.xlsx new file mode 100644 index 0000000..c6dcc30 Binary files /dev/null and b/tests/fixtures/huge_row_gap.xlsx differ diff --git a/tests/fixtures/inline_strings.xlsx b/tests/fixtures/inline_strings.xlsx new file mode 100644 index 0000000..0954ee4 Binary files /dev/null and b/tests/fixtures/inline_strings.xlsx differ diff --git a/tests/fixtures/large_shared_strings.xlsx b/tests/fixtures/large_shared_strings.xlsx new file mode 100644 index 0000000..d00fbb5 Binary files /dev/null and b/tests/fixtures/large_shared_strings.xlsx differ diff --git a/tests/fixtures/malformed_xml.xlsx b/tests/fixtures/malformed_xml.xlsx new file mode 100644 index 0000000..6e3399e Binary files /dev/null and b/tests/fixtures/malformed_xml.xlsx differ diff --git a/tests/fixtures/many_data_types.xlsx b/tests/fixtures/many_data_types.xlsx new file mode 100644 index 0000000..f29449b Binary files /dev/null and b/tests/fixtures/many_data_types.xlsx differ diff --git a/tests/fixtures/merged_cells.xlsx b/tests/fixtures/merged_cells.xlsx new file mode 100644 index 0000000..bab00a0 Binary files /dev/null and b/tests/fixtures/merged_cells.xlsx differ diff --git a/tests/fixtures/missing_workbook.xlsx 
b/tests/fixtures/missing_workbook.xlsx new file mode 100644 index 0000000..de0d070 Binary files /dev/null and b/tests/fixtures/missing_workbook.xlsx differ diff --git a/tests/fixtures/multiple_sheets.xlsx b/tests/fixtures/multiple_sheets.xlsx new file mode 100644 index 0000000..026b293 Binary files /dev/null and b/tests/fixtures/multiple_sheets.xlsx differ diff --git a/tests/fixtures/negative_row_number.xlsx b/tests/fixtures/negative_row_number.xlsx new file mode 100644 index 0000000..6e7473e Binary files /dev/null and b/tests/fixtures/negative_row_number.xlsx differ diff --git a/tests/fixtures/rich_text_strings.xlsx b/tests/fixtures/rich_text_strings.xlsx new file mode 100644 index 0000000..e5df1b0 Binary files /dev/null and b/tests/fixtures/rich_text_strings.xlsx differ diff --git a/tests/fixtures/single_cell.xlsx b/tests/fixtures/single_cell.xlsx new file mode 100644 index 0000000..14f3970 Binary files /dev/null and b/tests/fixtures/single_cell.xlsx differ diff --git a/tests/fixtures/sparse_rows.xlsx b/tests/fixtures/sparse_rows.xlsx new file mode 100644 index 0000000..aaf6750 Binary files /dev/null and b/tests/fixtures/sparse_rows.xlsx differ diff --git a/tests/fixtures/truncated_zip.xlsx b/tests/fixtures/truncated_zip.xlsx new file mode 100644 index 0000000..efc602e Binary files /dev/null and b/tests/fixtures/truncated_zip.xlsx differ diff --git a/tests/fixtures/unicode_strings.xlsx b/tests/fixtures/unicode_strings.xlsx new file mode 100644 index 0000000..3af64b7 Binary files /dev/null and b/tests/fixtures/unicode_strings.xlsx differ diff --git a/tests/fixtures/wrong_string_index.xlsx b/tests/fixtures/wrong_string_index.xlsx new file mode 100644 index 0000000..bc9e0c5 Binary files /dev/null and b/tests/fixtures/wrong_string_index.xlsx differ diff --git a/tests/generate_corpus.py b/tests/generate_corpus.py new file mode 100644 index 0000000..eaf506a --- /dev/null +++ b/tests/generate_corpus.py @@ -0,0 +1,420 @@ +#!/usr/bin/env python3 +"""Generate diverse XLSX 
test corpus for broader coverage and fuzz seeding. + +Creates files that test edge cases across different Excel features, +data types, and structural variations. Each file is a valid XLSX that +exercises a specific code path in the reader. +""" + +import os +import struct +import zipfile +from io import BytesIO +from pathlib import Path + +FIXTURES = Path(__file__).parent / "fixtures" + + +def _minimal_xlsx(sheets_xml: dict[str, str], shared_strings: str | None = None, + workbook_xml: str | None = None, extra_files: dict[str, str] | None = None) -> bytes: + """Build a minimal XLSX from raw XML parts.""" + buf = BytesIO() + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + # [Content_Types].xml + overrides = "" + for name in sheets_xml: + idx = int(name.replace("sheet", "").replace(".xml", "")) + overrides += f'\n' + if shared_strings: + overrides += '\n' + zf.writestr("[Content_Types].xml", f""" + + + + +{overrides} +""") + + # _rels/.rels + zf.writestr("_rels/.rels", """ + + +""") + + # xl/_rels/workbook.xml.rels + rels = "" + for i, name in enumerate(sheets_xml, 1): + rels += f'\n' + if shared_strings: + rels += f'\n' + zf.writestr("xl/_rels/workbook.xml.rels", f""" + +{rels} +""") + + # xl/workbook.xml + if workbook_xml is None: + sheet_entries = "" + for i, name in enumerate(sheets_xml, 1): + sheet_entries += f'\n' + workbook_xml = f""" + +{sheet_entries} +""" + zf.writestr("xl/workbook.xml", workbook_xml) + + # Worksheets + for name, xml in sheets_xml.items(): + zf.writestr(f"xl/worksheets/{name}", xml) + + # Shared strings + if shared_strings: + zf.writestr("xl/sharedStrings.xml", shared_strings) + + # Extra files + if extra_files: + for path, content in extra_files.items(): + zf.writestr(path, content) + + return buf.getvalue() + + +def _sheet_xml(rows_xml: str, merges: str = "", extras: str = "") -> str: + """Wrap row XML in a full worksheet element.""" + merge_block = f"{merges}" if merges else "" + return f""" + +{extras} +{rows_xml} 
+{merge_block} +""" + + +def gen_empty_workbook(): + """Workbook with a single sheet containing no data.""" + return _minimal_xlsx({"sheet1.xml": _sheet_xml("")}) + + +def gen_single_cell(): + """Workbook with exactly one cell.""" + rows = '42' + return _minimal_xlsx({"sheet1.xml": _sheet_xml(rows)}) + + +def gen_many_data_types(): + """Cells with every data type: number, bool, string, inline string, formula, empty.""" + ss = """ + +hello +world +""" + rows = """ + + 3.14 + 1 + 0 + 1 + inline + + + SUM(A1:A1)3.14 + #REF! + +""" + return _minimal_xlsx({"sheet1.xml": _sheet_xml(rows)}, shared_strings=ss) + + +def gen_multiple_sheets(): + """Workbook with 5 sheets including hidden ones.""" + sheets = {} + for i in range(1, 6): + rows = f'{i}' + sheets[f"sheet{i}.xml"] = _sheet_xml(rows) + + sheet_entries = "" + for i in range(1, 6): + state = ' state="hidden"' if i == 3 else (' state="veryHidden"' if i == 5 else "") + sheet_entries += f'\n' + wb = f""" + +{sheet_entries} +""" + return _minimal_xlsx(sheets, workbook_xml=wb) + + +def gen_merged_cells(): + """Sheet with merged cell regions.""" + rows = """ +12 +3 +4""" + merges = '' + return _minimal_xlsx({"sheet1.xml": _sheet_xml(rows, merges=merges)}) + + +def gen_sparse_rows(): + """Sheet with gaps in row numbers (rows 1, 5, 1000).""" + rows = """ +1 +5 +999""" + return _minimal_xlsx({"sheet1.xml": _sheet_xml(rows)}) + + +def gen_unicode_strings(): + """Shared strings with Unicode: emoji, CJK, RTL, combining chars.""" + strings = [ + "Hello 🌍🎉", + "日本語テスト", + "العربية", + "café résumé naïve", + "Z̤͔ͧ̑a̴̬l̶̞g̗̲ͧo̙̫", # Combining characters + "", # Empty string + " ", # Space only + "\t\n\r", # Whitespace + "a" * 32767, # Max Excel cell length + ] + si_entries = "".join(f"{s}" for s in strings) + ss = f""" + +{si_entries} +""" + cells = "".join(f'{i}' for i in range(len(strings))) + rows = f'{cells}' + return _minimal_xlsx({"sheet1.xml": _sheet_xml(rows)}, shared_strings=ss) + + +def gen_rich_text_strings(): + """Shared 
strings with rich text runs ( elements).""" + ss = """ + +Bold Normal +Red Italic +""" + rows = '01' + return _minimal_xlsx({"sheet1.xml": _sheet_xml(rows)}, shared_strings=ss) + + +def gen_freeze_panes(): + """Sheet with freeze panes set.""" + extras = '' + rows = '1' + return _minimal_xlsx({"sheet1.xml": _sheet_xml(rows, extras=extras)}) + + +def gen_auto_filter(): + """Sheet with auto-filter on a range.""" + extras = "" + rows = """ +NameAge +Alice30""" + af = '' + sheet = f""" + +{rows} +{af} +""" + return _minimal_xlsx({"sheet1.xml": sheet}) + + +def gen_defined_names(): + """Workbook with named ranges / defined names.""" + sheets = {"sheet1.xml": _sheet_xml('1')} + wb = """ + + + +Sheet1!$A$1:$C$10 +Sheet1!$A$1:$Z$100 + + +""" + return _minimal_xlsx(sheets, workbook_xml=wb) + + +def gen_large_shared_strings(): + """Large shared string table (1000 entries).""" + entries = "".join(f"string_{i:04d}" for i in range(1000)) + ss = f""" + +{entries} +""" + cells = "".join(f'{i}' for i in range(100)) + rows = "" + for r in range(1, 5): + row_cells = "".join(f'{(r-1)*26+c}' for c in range(26)) + rows += f'{row_cells}' + return _minimal_xlsx({"sheet1.xml": _sheet_xml(rows)}, shared_strings=ss) + + +def gen_date_cells(): + """Cells with date serial numbers and a styles.xml with date formats.""" + styles = """ + + + + + + + + + + + +""" + rows = """ + + 44197 + 44197 + 1 + 44197 +""" + extra = {"xl/styles.xml": styles} + content_type_override = '' + return _minimal_xlsx({"sheet1.xml": _sheet_xml(rows)}, extra_files=extra) + + +def gen_column_widths(): + """Sheet with custom column widths.""" + extras = '' + rows = '1' + sheet = f""" + +{extras} +{rows} +""" + return _minimal_xlsx({"sheet1.xml": sheet}) + + +def gen_inline_strings(): + """Cells using inline string type (t="inlineStr") with ....""" + rows = """ + + Inline A + Inline B + Rich inline +""" + return _minimal_xlsx({"sheet1.xml": _sheet_xml(rows)}) + + +def gen_document_properties(): + """Workbook with core and 
custom document properties.""" + core = """ + +Test Workbook +Test Author +A test description +2024-01-15T10:30:00Z +""" + custom = """ + + +Engineering + +""" + extras = {"docProps/core.xml": core, "docProps/custom.xml": custom} + rows = '1' + return _minimal_xlsx({"sheet1.xml": _sheet_xml(rows)}, extra_files=extras) + + +# ---- Malformed / edge-case files for robustness testing ---- + +def gen_truncated_zip(): + """A valid ZIP header followed by truncated data.""" + valid = gen_single_cell() + return valid[:len(valid) // 2] # Cut in half + + +def gen_empty_zip(): + """A valid but empty ZIP file (no entries).""" + buf = BytesIO() + with zipfile.ZipFile(buf, "w"): + pass + return buf.getvalue() + + +def gen_missing_workbook(): + """ZIP with worksheet but no workbook.xml.""" + buf = BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("[Content_Types].xml", '') + zf.writestr("xl/worksheets/sheet1.xml", _sheet_xml('1')) + return buf.getvalue() + + +def gen_malformed_xml(): + """XLSX with syntactically invalid XML in the worksheet.""" + rows = '1>>"}) + + +def gen_wrong_string_index(): + """Shared string reference pointing beyond table size.""" + ss = """ + +only_one +""" + rows = '999' + return _minimal_xlsx({"sheet1.xml": _sheet_xml(rows)}, shared_strings=ss) + + +def gen_negative_row_number(): + """Row with negative or zero row number.""" + rows = '0' + return _minimal_xlsx({"sheet1.xml": _sheet_xml(rows)}) + + +def gen_huge_row_gap(): + """Rows at 1 and 1048576 (Excel max), testing sparse allocation.""" + rows = '12' + return _minimal_xlsx({"sheet1.xml": _sheet_xml(rows)}) + + +GENERATORS = { + # Valid files + "empty_workbook.xlsx": gen_empty_workbook, + "single_cell.xlsx": gen_single_cell, + "many_data_types.xlsx": gen_many_data_types, + "multiple_sheets.xlsx": gen_multiple_sheets, + "merged_cells.xlsx": gen_merged_cells, + "sparse_rows.xlsx": gen_sparse_rows, + "unicode_strings.xlsx": gen_unicode_strings, + "rich_text_strings.xlsx": 
gen_rich_text_strings, + "freeze_panes.xlsx": gen_freeze_panes, + "auto_filter.xlsx": gen_auto_filter, + "defined_names.xlsx": gen_defined_names, + "large_shared_strings.xlsx": gen_large_shared_strings, + "date_cells.xlsx": gen_date_cells, + "column_widths.xlsx": gen_column_widths, + "inline_strings.xlsx": gen_inline_strings, + "document_properties.xlsx": gen_document_properties, + # Malformed / edge cases + "truncated_zip.xlsx": gen_truncated_zip, + "empty_zip.xlsx": gen_empty_zip, + "missing_workbook.xlsx": gen_missing_workbook, + "malformed_xml.xlsx": gen_malformed_xml, + "wrong_string_index.xlsx": gen_wrong_string_index, + "negative_row_number.xlsx": gen_negative_row_number, + "huge_row_gap.xlsx": gen_huge_row_gap, +} + + +def main(): + FIXTURES.mkdir(parents=True, exist_ok=True) + for name, gen in GENERATORS.items(): + path = FIXTURES / name + data = gen() + path.write_bytes(data) + print(f" {name} ({len(data):,} bytes)") + print(f"\nGenerated {len(GENERATORS)} files in {FIXTURES}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_corpus.py b/tests/test_corpus.py new file mode 100644 index 0000000..ac481cb --- /dev/null +++ b/tests/test_corpus.py @@ -0,0 +1,236 @@ +"""Test the reader against a diverse corpus of XLSX files. + +Covers valid files (different features, data types, structural variations) +and malformed files (truncated, missing parts, invalid XML) to ensure +the reader either succeeds or returns a clean error — never panics. 
+""" + +import os +import pytest +from pathlib import Path + +import opensheet_core + +FIXTURES = Path(__file__).parent / "fixtures" + + +# ---- Helpers ---- + +def read_or_error(path: str): + """Try reading; return result or exception (never raises).""" + try: + return opensheet_core.read_xlsx(path) + except Exception as e: + return e + + +# ---- Valid corpus: must parse successfully ---- + +class TestValidCorpus: + """Files that are valid XLSX and must parse without error.""" + + def test_empty_workbook(self): + sheets = opensheet_core.read_xlsx(str(FIXTURES / "empty_workbook.xlsx")) + assert len(sheets) == 1 + assert sheets[0]["rows"] == [] + + def test_single_cell(self): + sheets = opensheet_core.read_xlsx(str(FIXTURES / "single_cell.xlsx")) + rows = sheets[0]["rows"] + assert len(rows) == 1 + assert rows[0][0] == 42.0 + + def test_many_data_types(self): + sheets = opensheet_core.read_xlsx(str(FIXTURES / "many_data_types.xlsx")) + sheet = sheets[0] + row1 = sheet["rows"][0] + # Number + assert row1[0] == pytest.approx(3.14) + # Bool + assert row1[1] is True + # Shared strings + assert row1[2] == "hello" + assert row1[3] == "world" + + def test_multiple_sheets(self): + sheets = opensheet_core.read_xlsx(str(FIXTURES / "multiple_sheets.xlsx")) + assert len(sheets) == 5 + # Check sheet states + states = [s["state"] for s in sheets] + assert states[0] == "visible" + assert states[2] == "hidden" + assert states[4] == "veryHidden" + + def test_merged_cells(self): + sheets = opensheet_core.read_xlsx(str(FIXTURES / "merged_cells.xlsx")) + sheet = sheets[0] + assert len(sheet["merges"]) == 2 + assert "A1:B2" in sheet["merges"] + + def test_sparse_rows(self): + """Sparse rows should produce a grid with gaps filled by None.""" + sheets = opensheet_core.read_xlsx(str(FIXTURES / "sparse_rows.xlsx")) + rows = sheets[0]["rows"] + assert len(rows) >= 1000 # Should have rows up to 1000 + + def test_unicode_strings(self): + sheets = opensheet_core.read_xlsx(str(FIXTURES / 
"unicode_strings.xlsx")) + row = sheets[0]["rows"][0] + assert "🌍" in row[0] # Emoji + assert "日本語" in row[1] # CJK + + def test_rich_text_strings(self): + sheets = opensheet_core.read_xlsx(str(FIXTURES / "rich_text_strings.xlsx")) + row = sheets[0]["rows"][0] + assert "Bold" in row[0] + assert "Normal" in row[0] + + def test_freeze_panes(self): + sheets = opensheet_core.read_xlsx(str(FIXTURES / "freeze_panes.xlsx")) + sheet = sheets[0] + assert sheet["freeze_pane"] is not None + + def test_auto_filter(self): + sheets = opensheet_core.read_xlsx(str(FIXTURES / "auto_filter.xlsx")) + sheet = sheets[0] + assert sheet["auto_filter"] == "A1:B2" + + def test_defined_names(self): + names = opensheet_core.defined_names(str(FIXTURES / "defined_names.xlsx")) + assert any(n["name"] == "MyRange" for n in names) + + def test_large_shared_strings(self): + sheets = opensheet_core.read_xlsx(str(FIXTURES / "large_shared_strings.xlsx")) + sheet = sheets[0] + assert len(sheet["rows"]) > 0 + + def test_date_cells(self): + sheets = opensheet_core.read_xlsx(str(FIXTURES / "date_cells.xlsx")) + rows = sheets[0]["rows"] + assert len(rows) > 0 + + def test_column_widths(self): + sheets = opensheet_core.read_xlsx(str(FIXTURES / "column_widths.xlsx")) + widths = sheets[0]["column_widths"] + assert len(widths) > 0 + + def test_inline_strings(self): + sheets = opensheet_core.read_xlsx(str(FIXTURES / "inline_strings.xlsx")) + row = sheets[0]["rows"][0] + assert row[0] == "Inline A" + assert row[1] == "Inline B" + + def test_document_properties(self): + sheets = opensheet_core.read_xlsx(str(FIXTURES / "document_properties.xlsx")) + assert len(sheets) > 0 + + def test_sheet_names(self): + """sheet_names() should work on all valid files.""" + for name in ["multiple_sheets.xlsx", "defined_names.xlsx", "empty_workbook.xlsx"]: + names = opensheet_core.sheet_names(str(FIXTURES / name)) + assert isinstance(names, list) + assert len(names) > 0 + + def test_read_sheet_by_index(self): + rows = 
opensheet_core.read_sheet(str(FIXTURES / "multiple_sheets.xlsx"), sheet_index=1) + assert isinstance(rows, list) + + def test_read_sheet_by_name(self): + rows = opensheet_core.read_sheet(str(FIXTURES / "multiple_sheets.xlsx"), sheet_name="Sheet3") + assert isinstance(rows, list) + + +# ---- Malformed corpus: must not panic ---- + +class TestMalformedCorpus: + """Files that are invalid or malformed. The reader must return + a clean error (exception), never panic or hang.""" + + def test_truncated_zip(self): + r = read_or_error(str(FIXTURES / "truncated_zip.xlsx")) + assert isinstance(r, Exception) + + def test_empty_zip(self): + r = read_or_error(str(FIXTURES / "empty_zip.xlsx")) + assert isinstance(r, Exception) + + def test_missing_workbook(self): + r = read_or_error(str(FIXTURES / "missing_workbook.xlsx")) + assert isinstance(r, Exception) + + def test_malformed_xml(self): + """Malformed XML should produce an error or degrade gracefully.""" + r = read_or_error(str(FIXTURES / "malformed_xml.xlsx")) + # May succeed partially or error — just must not panic + assert r is not None + + def test_wrong_string_index(self): + """Out-of-bounds shared string index should not panic.""" + r = read_or_error(str(FIXTURES / "wrong_string_index.xlsx")) + assert r is not None + + def test_negative_row_number(self): + """Row number 0 should not cause a panic.""" + r = read_or_error(str(FIXTURES / "negative_row_number.xlsx")) + assert r is not None + + def test_huge_row_gap(self): + """Rows at 1 and 1048576 should not OOM or panic.""" + r = read_or_error(str(FIXTURES / "huge_row_gap.xlsx")) + assert r is not None + + def test_random_bytes(self): + """Completely random data should produce a clean error.""" + import tempfile + with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as f: + f.write(os.urandom(1024)) + f.flush() + r = read_or_error(f.name) + assert isinstance(r, Exception) + os.unlink(f.name) + + def test_nonexistent_file(self): + r = 
read_or_error("/nonexistent/path/file.xlsx") + assert isinstance(r, Exception) + + +# ---- Roundtrip: write then read ---- + +class TestRoundtrip: + """Write diverse data and verify we can read it back.""" + + def test_roundtrip_all_types(self): + import tempfile + path = tempfile.mktemp(suffix=".xlsx") + try: + w = opensheet_core.XlsxWriter(path) + w.add_sheet("Types") + w.write_row([1, 2.5, True, False, "text", None, ""]) + w.write_row([0, -1, 1e10, 1e-10, "🎉", " ", "\t"]) + w.close() + + sheets = opensheet_core.read_xlsx(path) + rows = sheets[0]["rows"] + assert rows[0][0] == 1.0 + assert rows[0][2] is True + assert rows[0][4] == "text" + assert rows[1][4] == "🎉" + finally: + os.unlink(path) + + def test_roundtrip_many_sheets(self): + import tempfile + path = tempfile.mktemp(suffix=".xlsx") + try: + w = opensheet_core.XlsxWriter(path) + for i in range(10): + w.add_sheet(f"Sheet{i}") + w.write_row([i, f"data_{i}"]) + w.close() + + names = opensheet_core.sheet_names(path) + assert len(names) == 10 + assert names[0] == "Sheet0" + assert names[9] == "Sheet9" + finally: + os.unlink(path)