diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index 05cb5c9..7ac920e 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -10,6 +10,7 @@ {"id":"vgi-python-67w","title":"Create example function using DuckDB settings","description":"Create an example function that demonstrates using DuckDB settings to determine its output.\n\nRequirements:\n- Function declares required_settings in Meta\n- Output schema depends on a setting value (e.g., include extra column based on setting)\n- Clear documentation showing the pattern\n\nExample ideas:\n1. TimezoneAwareFunction: Output includes timezone info based on 'timezone' setting\n2. VerboseOutput: Adds debug columns when 'debug_mode' setting is true\n3. NumericPrecision: Uses 'numeric_precision' to determine output type precision\n\nAdd to vgi/examples/ and register in ExampleWorker.","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-04T13:05:48.503681-05:00","created_by":"rusty","updated_at":"2026-01-04T13:22:23.779895-05:00","closed_at":"2026-01-04T13:22:23.779895-05:00","close_reason":"Added SettingsAwareFunction example","dependencies":[{"issue_id":"vgi-python-67w","depends_on_id":"vgi-python-c2b","type":"blocks","created_at":"2026-01-04T13:06:13.865474-05:00","created_by":"rusty"},{"issue_id":"vgi-python-67w","depends_on_id":"vgi-python-ivf","type":"blocks","created_at":"2026-01-04T13:06:13.890269-05:00","created_by":"rusty"},{"issue_id":"vgi-python-67w","depends_on_id":"vgi-python-bqb","type":"blocks","created_at":"2026-01-04T13:06:13.912531-05:00","created_by":"rusty"},{"issue_id":"vgi-python-67w","depends_on_id":"vgi-python-a99","type":"blocks","created_at":"2026-01-04T13:06:13.936552-05:00","created_by":"rusty"},{"issue_id":"vgi-python-67w","depends_on_id":"vgi-python-j4t","type":"blocks","created_at":"2026-01-04T13:06:13.958494-05:00","created_by":"rusty"}]} {"id":"vgi-python-6kr","title":"Test RowCountMismatchError when output exceeds input rows","notes":"Coverage: 86% in vgi/scalar_function.py. Missing tests for:\n- Lines 134-142: Error message when output has MORE rows than input\n\nCurrent tests cover when output \u003c input but not output \u003e input.\nNeed a test that returns an array with more elements than input rows.","status":"closed","priority":3,"issue_type":"task","created_at":"2026-01-04T22:15:26.097532-05:00","created_by":"rusty","updated_at":"2026-01-04T22:32:00.720462-05:00","closed_at":"2026-01-04T22:32:00.720462-05:00","close_reason":"Added test for RowCountMismatchError when output exceeds input. Coverage improved from 86% to 93%."} {"id":"vgi-python-6o0","title":"Consolidate _OutputComplete classes into shared module","description":"Three nearly identical _OutputComplete classes exist in scalar_function.py:168-197 (_ScalarOutputComplete), table_function.py:136-175 (_OutputComplete), and table_in_out_function.py:356-400 (_OutputComplete). All are frozen dataclasses with batch field, log_message field, and from_process_result() classmethod. Extract to shared module (e.g., vgi/protocol_types.py) with a single parameterized class.","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-04T20:06:40.893139-05:00","created_by":"rusty","updated_at":"2026-01-04T21:18:34.529683-05:00","closed_at":"2026-01-04T21:18:34.529683-05:00","close_reason":"PR #5 created: https://github.com/Query-farm/vgi-python/pull/5"} +{"id":"vgi-python-790","title":"Add slots=True to ArgumentSpec dataclass","description":"ArgumentSpec is a frozen dataclass but doesn't use slots=True. Adding slots=True would reduce memory footprint and improve attribute access speed, which matters if many specs are created during introspection.","status":"open","priority":4,"issue_type":"task","created_at":"2026-01-05T11:51:20.675386-05:00","created_by":"rusty","updated_at":"2026-01-05T11:51:20.675386-05:00"} {"id":"vgi-python-79e","title":"Unify ProtocolInput classes with shared base","description":"ProtocolInput classes in scalar_function.py:151-166 and table_in_out_function.py:109-142 have similar structure with batch and metadata fields. The table_in_out version adds is_finalize logic. Create shared base ProtocolInput in protocol_types.py with table_in_out extending it.","status":"closed","priority":3,"issue_type":"task","created_at":"2026-01-04T20:06:41.31917-05:00","created_by":"rusty","updated_at":"2026-01-04T21:53:26.965345-05:00","closed_at":"2026-01-04T21:53:26.965345-05:00","close_reason":"PR #9 created - unified ProtocolInput with shared base in protocol_types.py"} {"id":"vgi-python-8ra","title":"Implement Arrow-based argument specification serialization","description":"## Overview\n\nImplement serialization and deserialization of function argument specifications using Apache Arrow schemas. This enables functions to describe their argument signatures (types, positions, special markers) in a format that can be transmitted over IPC and understood by DuckDB for function registration.\n\n## Design\n\nUses a **single Arrow schema** where:\n- Positional arguments come first (field order = position index)\n- Named arguments follow (marked with `vgi_arg=named` metadata)\n- Special types (TableInput, AnyArrow, varargs) use field metadata markers\n\n## Key Components\n\n1. `ArgumentSpec` dataclass - represents one argument's specification\n2. `argument_specs_to_schema()` - convert specs to Arrow schema\n3. `schema_to_argument_specs()` - convert schema back to specs\n4. `extract_argument_specs()` - extract specs from function class Arg descriptors\n\n## Metadata Keys\n\n| Key | Value | Meaning |\n|-----|-------|---------|\n| `vgi_arg` | `named` | Named argument (not positional) |\n| `vgi_type` | `table` | Receives table input (Arg[TableInput]) |\n| `vgi_type` | `any` | Accepts any Arrow type (Arg[AnyArrow]) |\n| `vgi_varargs` | `true` | Collects remaining positional args |\n\n## References\n\n- Plan file: `.claude/plans/purrfect-foraging-nygaard.md`\n- Arguments module: `vgi/arguments.py`","status":"closed","priority":2,"issue_type":"feature","created_at":"2026-01-05T11:18:01.05631-05:00","created_by":"rusty","updated_at":"2026-01-05T11:34:12.712096-05:00","closed_at":"2026-01-05T11:34:12.712096-05:00","close_reason":"Implemented Arrow-based argument specification serialization with tests and documentation"} {"id":"vgi-python-a99","title":"Add settings accessor to function base classes","description":"Add a property to access DuckDB settings values in function implementations.\n\nChanges needed:\n- Add 'settings: dict[str, str]' property to Function base class\n- Property should return self.invocation.duckdb_settings or empty dict\n- Add convenience method like 'get_setting(name, default=None)'\n- Update ScalarFunction, TableFunctionGenerator, TableInOutFunction\n\nExample usage in function:\ndef compute(self, batch):\n tz = self.get_setting('timezone', 'UTC')\n # or\n tz = self.settings.get('timezone', 'UTC')","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-04T13:05:48.221602-05:00","created_by":"rusty","updated_at":"2026-01-04T13:20:41.171991-05:00","closed_at":"2026-01-04T13:20:41.171991-05:00","close_reason":"Implementation complete, all tests pass","dependencies":[{"issue_id":"vgi-python-a99","depends_on_id":"vgi-python-aad","type":"blocks","created_at":"2026-01-04T13:06:13.738212-05:00","created_by":"rusty"}]} @@ -29,20 +30,27 @@ {"id":"vgi-python-e37","title":"move Invocation from function.py out to own file","description":"The Invocation clas is kind of seperate from functions, so it should be in its own file. Move it and all of its other associated classes like InvocationType to its own file","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-04T09:18:46.605941-05:00","created_by":"rusty","updated_at":"2026-01-04T09:24:37.922675-05:00","closed_at":"2026-01-04T09:24:37.922675-05:00","close_reason":"Closed"} {"id":"vgi-python-e9q","title":"Unify ProtocolOutput classes with shared base","description":"ProtocolOutput classes in table_function.py:177-224 and table_in_out_function.py:144-207 share similar metadata() method and from_process_result() classmethod. The table_in_out version adds status field. Create shared base with table_in_out extending it for status support.","status":"closed","priority":3,"issue_type":"task","created_at":"2026-01-04T20:06:41.45014-05:00","created_by":"rusty","updated_at":"2026-01-04T21:54:55.871986-05:00","closed_at":"2026-01-04T21:54:55.871986-05:00","close_reason":"Not warranted - dataclass inheritance with slots=True doesn't allow adding required field (status) between inherited fields. The classes have different semantics (table_in_out requires status for generator state tracking) making inheritance impractical."} {"id":"vgi-python-g1m","title":"Use sentinel type pattern instead of Any for _MISSING in arguments.py","notes":"Line 33: _MISSING: Any = object()\n\nReplace with proper sentinel type pattern:\n```python\nfrom typing import Final\n\nclass _Missing:\n __slots__ = ()\n def __repr__(self) -\u003e str:\n return '\u003cMISSING\u003e'\n\nMISSING: Final = _Missing()\n```\n\nThis removes 1 Any and provides better type safety for default value checking.\nPart of 26.89% imprecision in arguments.py (59 Anys total).","status":"closed","priority":4,"issue_type":"task","created_at":"2026-01-04T22:19:50.079174-05:00","created_by":"rusty","updated_at":"2026-01-04T22:35:56.508153-05:00","closed_at":"2026-01-04T22:35:56.508153-05:00","close_reason":"Replaced _MISSING: Any = object() with proper _MissingType sentinel class. Improves type safety and removes 1 Any."} +{"id":"vgi-python-g7i","title":"Add validation for contiguous positional indices","description":"Neither argument_specs_to_schema() nor schema_to_argument_specs() validates that positional argument indices are contiguous (0, 1, 2...). Gaps like (0, 2, 3) would serialize fine but might indicate a bug. Consider adding validation that positional indices form a contiguous sequence starting from 0.","status":"open","priority":3,"issue_type":"task","created_at":"2026-01-05T11:51:19.868862-05:00","created_by":"rusty","updated_at":"2026-01-05T11:51:19.868862-05:00"} {"id":"vgi-python-ivf","title":"Add required_settings to function Meta class","description":"Update function metadata to support declaring required DuckDB settings.\n\nChanges needed:\n- Add 'required_settings: list[str]' to FunctionMeta in vgi/metadata.py\n- Update Meta class resolution in vgi/function.py\n- Add validation that required_settings is a list of strings\n- Make it available via get_metadata() for introspection\n\nExample usage:\nclass MyFunction(TableInOutFunction):\n class Meta:\n required_settings = ['timezone', 'threads']","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-04T13:05:47.903747-05:00","created_by":"rusty","updated_at":"2026-01-04T13:20:41.169516-05:00","closed_at":"2026-01-04T13:20:41.169516-05:00","close_reason":"Implementation complete, all tests pass","dependencies":[{"issue_id":"vgi-python-ivf","depends_on_id":"vgi-python-aad","type":"blocks","created_at":"2026-01-04T13:06:13.690253-05:00","created_by":"rusty"}]} {"id":"vgi-python-j4t","title":"Update client to pass DuckDB settings in Invocation","description":"Update vgi/client/client.py to support passing DuckDB settings.\n\nChanges needed:\n- Add 'duckdb_settings: dict[str, str] | None = None' parameter to relevant methods\n- Include settings in Invocation creation\n- Add helper to query function's required_settings from metadata\n\nThe client needs to know what settings to pass. Options:\n1. Client queries worker for function metadata first\n2. Settings passed explicitly by caller\n3. Client introspects function class if available locally","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-04T13:05:48.358656-05:00","created_by":"rusty","updated_at":"2026-01-04T13:20:41.173178-05:00","closed_at":"2026-01-04T13:20:41.173178-05:00","close_reason":"Implementation complete, all tests pass","dependencies":[{"issue_id":"vgi-python-j4t","depends_on_id":"vgi-python-aad","type":"blocks","created_at":"2026-01-04T13:06:13.761572-05:00","created_by":"rusty"}]} +{"id":"vgi-python-j8a","title":"Investigate named argument field name ambiguity","description":"For named arguments, position is set to field.name (the SQL key). But there's potential ambiguity between the Python attribute name and the named argument key if they could differ. Currently they're the same, but if ArgumentSpec.name (attribute) ever differs from ArgumentSpec.position (key), the schema only preserves one. Investigate whether this is a real concern or document the assumption that they're always equal.","status":"open","priority":4,"issue_type":"task","created_at":"2026-01-05T11:51:20.257539-05:00","created_by":"rusty","updated_at":"2026-01-05T11:51:20.257539-05:00"} {"id":"vgi-python-j9k","title":"Add protocol types for IPC stream writers in cli.py","notes":"Line 53: self._writer: Any = None\n\nCould define a Protocol type for the IPC stream writer interface:\n```python\nclass IPCWriter(Protocol):\n def write_batch(self, batch: pa.RecordBatch) -\u003e None: ...\n def close(self) -\u003e None: ...\n```\n\nPart of 14.17% imprecision in cli.py (34 Anys total).","status":"closed","priority":4,"issue_type":"task","created_at":"2026-01-04T22:19:50.31711-05:00","created_by":"rusty","updated_at":"2026-01-04T22:37:01.488788-05:00","closed_at":"2026-01-04T22:37:01.488788-05:00","close_reason":"Replaced _writer: Any with _writer: pq.ParquetWriter | None. Removes 1 Any and provides proper type information."} {"id":"vgi-python-jrf","title":"Add varargs parameter to Arg descriptor","description":"In vgi/arguments.py:\n- Add varargs: bool = False to Arg.__init__ and __slots__\n- Update _resolve() to collect positional[position:] when varargs=True\n- Validate at least 1 value provided\n- Update _validate() to validate each element in tuple\n- Add Arguments.get_varargs(start, type=None) method\n- Update __repr__ to show varargs flag","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-05T10:49:20.012964-05:00","created_by":"rusty","updated_at":"2026-01-05T10:55:22.479344-05:00","closed_at":"2026-01-05T10:55:22.479344-05:00","close_reason":"Implemented varargs parameter in Arg descriptor with get_varargs() method and _validate_single()"} +{"id":"vgi-python-k7x","title":"Use Mapping instead of dict in extract_argument_specs signature","description":"The arg_types parameter in extract_argument_specs() is typed as dict[str, pa.DataType]. Using Mapping[str, pa.DataType] from collections.abc would be more flexible, accepting any mapping type.","status":"open","priority":4,"issue_type":"task","created_at":"2026-01-05T11:51:21.021496-05:00","created_by":"rusty","updated_at":"2026-01-05T11:51:21.021496-05:00"} {"id":"vgi-python-kz4","title":"Rename TableInOutGeneratorFunction to TableInOutGenerator for consistency","description":"Naming inconsistency: TableFunctionGenerator uses *Generator suffix, but TableInOutGeneratorFunction uses *GeneratorFunction suffix. Rename TableInOutGeneratorFunction to TableInOutGenerator for consistency. Also consider renaming ScalarFunctionGenerator if needed.","status":"closed","priority":3,"issue_type":"task","created_at":"2026-01-04T20:06:41.581028-05:00","created_by":"rusty","updated_at":"2026-01-04T21:43:58.141038-05:00","closed_at":"2026-01-04T21:43:58.141038-05:00","close_reason":"PR #7 created: https://github.com/Query-farm/vgi-python/pull/7"} +{"id":"vgi-python-l1u","title":"Consider custom __repr__ for ArgumentSpec","description":"The default dataclass __repr__ includes the full Arrow type repr which can be verbose. Consider a custom __repr__ that's more concise for debugging, e.g., 'ArgumentSpec(name=\"count\", pos=0, type=int64)' instead of showing the full pa.DataType object.","status":"open","priority":4,"issue_type":"task","created_at":"2026-01-05T11:51:21.415976-05:00","created_by":"rusty","updated_at":"2026-01-05T11:51:21.415976-05:00"} {"id":"vgi-python-lec","title":"Add test coverage for testing.py helper edge cases","notes":"Coverage: 89% in vgi/testing.py. Missing tests for:\n- Lines 421-422, 450-451: StopIteration handling in _process_batch\n- Lines 468-472: FINISHED status during data phase\n- Lines 485-486, 502-503: _finalize edge cases\n\nLow priority since these are test helpers.","status":"open","priority":4,"issue_type":"task","created_at":"2026-01-04T22:15:34.006563-05:00","created_by":"rusty","updated_at":"2026-01-04T22:16:18.592782-05:00"} +{"id":"vgi-python-lzc","title":"Extract duplicated sort_key function in argument_spec","description":"The sort_key function is duplicated at lines 139-142 and 309-312 in argument_spec.py. Extract it to a module-level function to follow DRY principles.","status":"open","priority":3,"issue_type":"task","created_at":"2026-01-05T11:51:19.141041-05:00","created_by":"rusty","updated_at":"2026-01-05T11:51:19.141041-05:00"} {"id":"vgi-python-m45","title":"Create tests/test_argument_spec.py","description":"## Overview\n\nCreate comprehensive tests for the argument specification serialization module.\n\n## File Location\n\n`tests/test_argument_spec.py`\n\n## Test Classes and Cases\n\n### TestArgumentSpecToSchema\n\nTest converting ArgumentSpec objects to Arrow schema.\n\n#### test_positional_arguments_preserve_order\n- Create specs with positions 0, 1, 2\n- Convert to schema\n- Verify field order matches position order\n- Verify field types are preserved\n\n#### test_named_arguments_have_metadata\n- Create spec with position='key' (named)\n- Convert to schema\n- Verify field has `vgi_arg=named` metadata\n\n#### test_mixed_positional_and_named\n- Create mix of positional (0, 1) and named ('format', 'verbose') specs\n- Convert to schema\n- Verify positional come first, then named\n- Verify named have correct metadata\n\n#### test_table_input_uses_null_type\n- Create spec with is_table_input=True\n- Convert to schema\n- Verify field type is pa.null()\n- Verify field has `vgi_type=table` metadata\n\n#### test_any_type_uses_null_type\n- Create spec with is_any_type=True\n- Convert to schema\n- Verify field type is pa.null()\n- Verify field has `vgi_type=any` metadata\n\n#### test_varargs_has_metadata\n- Create spec with is_varargs=True and arrow_type=pa.int64()\n- Convert to schema\n- Verify field type is pa.int64() (element type preserved)\n- Verify field has `vgi_varargs=true` metadata\n\n### TestSchemaToArgumentSpecs\n\nTest converting Arrow schema back to ArgumentSpec objects.\n\n#### test_positional_arguments_from_schema\n- Create schema with 3 fields (no metadata)\n- Convert to specs\n- Verify positions are 0, 1, 2\n\n#### test_named_arguments_from_metadata\n- Create schema with `vgi_arg=named` metadata on fields\n- Convert to specs\n- Verify position is field name string\n\n#### test_table_input_detected\n- Create schema with `vgi_type=table` metadata\n- Convert to specs\n- Verify is_table_input=True\n\n#### test_any_type_detected\n- Create schema with `vgi_type=any` metadata\n- Convert to specs\n- Verify is_any_type=True\n\n#### test_varargs_detected\n- Create schema with `vgi_varargs=true` metadata\n- Convert to specs\n- Verify is_varargs=True\n\n### TestRoundTrip\n\nTest that specs survive serialization round-trip.\n\n#### test_complex_arrow_types_preserved\nTest each of these types round-trips correctly:\n- pa.int64(), pa.float32(), pa.utf8()\n- pa.list_(pa.float64())\n- pa.struct([pa.field('a', pa.int32()), pa.field('b', pa.string())])\n- pa.map_(pa.string(), pa.int64())\n- pa.decimal128(10, 2)\n- pa.timestamp('us', tz='UTC')\n\n#### test_full_function_signature_roundtrip\n- Create specs matching a realistic function:\n - count: int, position 0\n - data: TableInput, position 1\n - extra: float varargs, position 2\n - format: str, named 'format'\n- Convert to schema, serialize to bytes, deserialize, convert back to specs\n- Verify all specs match original\n\n### TestExtractArgumentSpecs\n\nTest extracting specs from function classes.\n\n#### test_extract_from_simple_function\n- Define function class with Arg descriptors\n- Call extract_argument_specs with arg_types dict\n- Verify specs match descriptors\n\n#### test_extract_table_input\n- Define function with Arg[TableInput]\n- Extract specs\n- Verify is_table_input=True\n\n#### test_extract_any_arrow\n- Define function with Arg[AnyArrow]\n- Extract specs\n- Verify is_any_type=True\n\n#### test_extract_varargs\n- Define function with Arg[int](2, varargs=True)\n- Extract specs\n- Verify is_varargs=True\n\n### TestEdgeCases\n\n#### test_empty_schema\n- Convert empty list of specs to schema\n- Verify empty schema works\n- Convert back, verify empty list\n\n#### test_only_named_arguments\n- Create specs with only named arguments (no positional)\n- Round-trip and verify\n\n#### test_only_positional_arguments\n- Create specs with only positional arguments (no named)\n- Round-trip and verify\n\n## Test Utilities\n\nConsider creating fixtures for common patterns:\n- `make_spec()` helper for creating ArgumentSpec\n- Sample function classes for extraction tests","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-05T11:18:53.312911-05:00","created_by":"rusty","updated_at":"2026-01-05T11:32:35.580879-05:00","closed_at":"2026-01-05T11:32:35.580879-05:00","close_reason":"Created comprehensive tests with 43 passing test cases","dependencies":[{"issue_id":"vgi-python-m45","depends_on_id":"vgi-python-cd0","type":"blocks","created_at":"2026-01-05T11:19:30.779207-05:00","created_by":"rusty"}]} {"id":"vgi-python-odi","title":"Change max_processes from method to property in Function hierarchy","description":"Refactor max_processes from a method to a property across the Function class hierarchy (Function, ScalarFunction, TableFunctionGenerator, TableInOutFunction, etc.). This makes the API more consistent since max_processes is effectively a constant per function class and properties are more idiomatic for such values.","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-04T11:25:29.750648-05:00","created_by":"rusty","updated_at":"2026-01-04T11:50:57.566545-05:00","closed_at":"2026-01-04T11:50:57.566545-05:00","close_reason":"Closed"} {"id":"vgi-python-p91","title":"Move exception classes from function.py to own file","description":"Move InitIdentifierError and SchemaValidationError from vgi/function.py to a new vgi/exceptions.py file. Update imports in function.py and any other files that reference these exceptions.","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-04T09:12:28.058227-05:00","created_by":"rusty","updated_at":"2026-01-04T09:17:52.477661-05:00","closed_at":"2026-01-04T09:17:52.477661-05:00","close_reason":"Closed"} {"id":"vgi-python-qud","title":"Test FunctionStorageSqlite: global_delete, global_exists, queue_clear","notes":"Coverage: 83% in vgi/function_storage.py. Missing tests for:\n- Line 266: KeyError path in global_get (key not found)\n- Lines 273-278: global_delete method\n- Lines 282-290: global_exists method \n- Line 337: queue_push with empty list\n- Lines 376-385: queue_clear method\n\nThese storage operations need direct unit tests to ensure correctness.","status":"closed","priority":3,"issue_type":"task","created_at":"2026-01-04T22:15:25.982124-05:00","created_by":"rusty","updated_at":"2026-01-04T22:30:05.625934-05:00","closed_at":"2026-01-04T22:30:05.625934-05:00","close_reason":"Added comprehensive tests for FunctionStorageSqlite. Coverage improved from 83% to 98%."} {"id":"vgi-python-r3t","title":"Consolidate test client infrastructure in testing.py","description":"testing.py has three test client classes (FunctionTestClient, TableFunctionTestClient, ScalarFunctionTestClient) with shared infrastructure patterns. Extend _BaseTestClient pattern to reduce code duplication. Consider using a single unified client with method dispatch based on function type.","status":"closed","priority":3,"issue_type":"task","created_at":"2026-01-04T20:06:53.913912-05:00","created_by":"rusty","updated_at":"2026-01-04T22:02:51.368907-05:00","closed_at":"2026-01-04T22:02:51.368907-05:00","close_reason":"Not warranted - _BaseTestClient already provides shared infrastructure (context manager, log capture, logging). The three clients handle genuinely different protocols (TableInOut with finalize, TableFunction with no input, Scalar with different protocol). Unifying would add type detection complexity without real benefit."} {"id":"vgi-python-set","title":"Improve type annotations in testing.py test helpers","notes":"92.61% type coverage (70 Anys) in vgi/testing.py\n\nMain opportunities:\n- Lines 136-137, 641-642, 685-686, etc: `args: tuple[Any, ...]` and `kwargs: dict[str, Any]`\n Could use ParamSpec or more specific signatures\n- Lines 151-152: `positional: tuple[pa.Scalar[Any], ...]` - unavoidable (PyArrow)\n- Lines 761, 843: Log expectation dicts - could use TypedDict\n\nLower priority since these are test helpers and flexibility is intentional.","status":"open","priority":4,"issue_type":"task","created_at":"2026-01-04T22:19:50.204524-05:00","created_by":"rusty","updated_at":"2026-01-04T22:20:05.418044-05:00"} +{"id":"vgi-python-uq8","title":"Add validation for missing arg_types in extract_argument_specs","description":"In extract_argument_specs(), if an argument name is missing from arg_types dict, it silently defaults to pa.null(). This could mask bugs where the caller forgot to provide a type. Consider raising an error or logging a warning when a type mapping is missing.","status":"open","priority":2,"issue_type":"task","created_at":"2026-01-05T11:51:19.501577-05:00","created_by":"rusty","updated_at":"2026-01-05T11:51:19.501577-05:00"} {"id":"vgi-python-vir","title":"Add tests for varargs in Arg descriptor","description":"In tests/test_arguments.py:\n- Basic varargs: receives multiple values as tuple\n- Single value varargs: works with exactly 1 value \n- Empty varargs: raises error when 0 values\n- Type validation: each element validated\n- Varargs with constraints (ge, le, etc.): validates each element\n- Varargs with choices: validates each element against choices","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-05T10:49:20.285269-05:00","created_by":"rusty","updated_at":"2026-01-05T11:00:06.124186-05:00","closed_at":"2026-01-05T11:00:06.124186-05:00","close_reason":"Added comprehensive tests for varargs in Arg descriptor and Arguments.get_varargs()","dependencies":[{"issue_id":"vgi-python-vir","depends_on_id":"vgi-python-jrf","type":"blocks","created_at":"2026-01-05T10:49:26.454614-05:00","created_by":"rusty"}]} {"id":"vgi-python-vzg","title":"Update documentation for DuckDB settings feature","description":"Update documentation to cover DuckDB settings feature.\n\nFiles to update:\n1. docs/protocol.md - Add settings to protocol flow diagrams and Invocation fields\n2. docs/metadata.md - Document required_settings in Meta class\n3. CLAUDE.md - Add example showing settings usage pattern\n4. docs/lifecycle.md - Mention settings availability during bind\n\nInclude:\n- When settings are available (bind phase and later)\n- How to declare required settings\n- How to access settings in function code\n- Example patterns for settings-dependent output schemas","status":"closed","priority":3,"issue_type":"task","created_at":"2026-01-04T13:05:48.795757-05:00","created_by":"rusty","updated_at":"2026-01-04T13:28:21.005166-05:00","closed_at":"2026-01-04T13:28:21.005166-05:00","close_reason":"Documentation updated in protocol.md and CLAUDE.md","dependencies":[{"issue_id":"vgi-python-vzg","depends_on_id":"vgi-python-67w","type":"blocks","created_at":"2026-01-04T13:06:14.089469-05:00","created_by":"rusty"}]} +{"id":"vgi-python-xj7","title":"Export metadata constants in argument_spec __all__","description":"The metadata constants (VGI_ARG_KEY, VGI_ARG_NAMED, VGI_TYPE_KEY, VGI_TYPE_TABLE, VGI_TYPE_ANY, VGI_VARARGS_KEY, VGI_VARARGS_TRUE) are not exported in __all__. If external code needs to parse schemas, they'd need to hardcode these values. Add them to __all__ and re-export from vgi/__init__.py.","status":"open","priority":3,"issue_type":"task","created_at":"2026-01-05T11:51:18.760011-05:00","created_by":"rusty","updated_at":"2026-01-05T11:51:18.760011-05:00"} {"id":"vgi-python-ymf","title":"Add tests for AnyValue argument type","description":"Test AnyValue with various Arrow types (int, string, float, etc.) and ensure proper metadata extraction","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-05T10:41:41.787705-05:00","created_by":"rusty","updated_at":"2026-01-05T11:10:17.132018-05:00","closed_at":"2026-01-05T11:10:17.132018-05:00","close_reason":"Closed","dependencies":[{"issue_id":"vgi-python-ymf","depends_on_id":"vgi-python-ckg","type":"blocks","created_at":"2026-01-05T10:41:48.751019-05:00","created_by":"rusty"},{"issue_id":"vgi-python-ymf","depends_on_id":"vgi-python-bkz","type":"blocks","created_at":"2026-01-05T10:41:48.787562-05:00","created_by":"rusty"},{"issue_id":"vgi-python-ymf","depends_on_id":"vgi-python-dvo","type":"blocks","created_at":"2026-01-05T10:41:48.823096-05:00","created_by":"rusty"}]} {"id":"vgi-python-yzj","title":"Write end-to-end tests for DuckDB settings feature","description":"Create comprehensive tests for the DuckDB settings feature.\n\nTest scenarios:\n1. Function with required_settings receives settings correctly\n2. Function with no required_settings works without settings\n3. Missing required setting fails with clear error\n4. Setting value correctly affects output schema (bind phase)\n5. Setting value correctly affects function behavior (processing phase)\n6. Settings serialization/deserialization roundtrip\n7. Parallel workers all receive same settings\n\nAdd tests to tests/ directory covering:\n- Unit tests for Invocation serialization with settings\n- Unit tests for Meta required_settings\n- Integration tests using ExampleWorker with settings-dependent function","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-04T13:05:48.650173-05:00","created_by":"rusty","updated_at":"2026-01-04T13:26:24.713648-05:00","closed_at":"2026-01-04T13:26:24.713648-05:00","close_reason":"End-to-end tests written and passing","dependencies":[{"issue_id":"vgi-python-yzj","depends_on_id":"vgi-python-67w","type":"blocks","created_at":"2026-01-04T13:06:14.064084-05:00","created_by":"rusty"}]} {"id":"vgi-python-zf7","title":"Unify storage protocols into single FunctionStorage interface","description":"## Problem\n\nCurrently there are two separate storage protocols (InitStorage and WorkerStateStorage) with inconsistent naming:\n- `create` vs `store` (different verbs for similar operations)\n- `collect_and_delete` is verbose and describes implementation\n\nUsers wanting a custom storage backend (Redis, DynamoDB, etc.) must implement two separate classes.\n\n## Solution\n\nUnify into a single `FunctionStorage` protocol with consistent naming using prefixes to group related methods:\n\n```python\nclass FunctionStorage(Protocol):\n \"\"\"Storage protocol for VGI distributed function execution.\n \n Three access patterns:\n - Global state: Init data shared across all workers (key-value with auto-generated keys)\n - Worker state: Partial results per worker (collected during finalization)\n - Work queue: Atomic work distribution across workers (FIFO queue)\n \"\"\"\n \n # Global state (init data)\n def global_put(self, value: bytes) -\u003e bytes: ... # Returns auto-generated key\n def global_get(self, key: bytes) -\u003e bytes: ...\n def global_delete(self, key: bytes) -\u003e None: ...\n def global_exists(self, key: bytes) -\u003e bool: ...\n \n # Worker state (partial results per worker)\n def worker_put(self, invocation_id: bytes, worker_id: int, state: bytes) -\u003e None: ...\n def worker_collect(self, invocation_id: bytes) -\u003e list[bytes]: ... # Atomic collect+delete\n \n # Work queue (distributed work items)\n def queue_push(self, invocation_id: bytes, items: list[bytes]) -\u003e int: ...\n def queue_pop(self, invocation_id: bytes) -\u003e bytes | None: ... # Atomic claim\n def queue_clear(self, invocation_id: bytes) -\u003e int: ...\n```\n\n## Design Rationale\n\n- **Prefixes** (`global_`, `worker_`, `queue_`): Clear grouping, good autocomplete\n- **Consistent verbs**: `put/get` for storage, `push/pop` for queue\n- **Minimal interface**: 9 methods total (down from 9, but now unified)\n- **Single class variable** in Function: `storage: ClassVar[FunctionStorage]`\n\n## Files to Change\n\n- `vgi/function_storage.py`: New protocol + merged FunctionStorageSqlite\n- `vgi/function.py`: Single `storage` class variable, update all method calls","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-04T12:34:25.966005-05:00","created_by":"rusty","updated_at":"2026-01-04T12:58:16.913278-05:00","closed_at":"2026-01-04T12:58:16.913278-05:00","close_reason":"Closed"} diff --git a/docs/argument-serialization.md b/docs/argument-serialization.md new file mode 100644 index 0000000..dc753e6 --- /dev/null +++ b/docs/argument-serialization.md @@ -0,0 +1,259 @@ +# Argument Specification Serialization + +This document describes how VGI function argument specifications are serialized +to Apache Arrow schemas for IPC transmission and DuckDB function registration. + +## Quick Reference + +| Metadata Key | Value | Meaning | +|--------------|-------|---------| +| `vgi_arg` | `named` | Named argument (not positional) | +| `vgi_type` | `table` | Table input argument | +| `vgi_type` | `any` | Any Arrow type argument | +| `vgi_varargs` | `true` | Variable arguments | + +## Schema Format + +Arguments are serialized as a **single Arrow schema** where each field +represents one argument. + +### Field Order + +1. **Positional arguments** come first, in order (field index = position index) +2. **Named arguments** follow, marked with metadata + +### Field Components + +| Component | Source | +|-----------|--------| +| Field name | Python attribute name | +| Field type | Exact Arrow data type | +| Field metadata | Markers for named, table, any, varargs | + +## Positional Arguments + +Positional arguments have no special metadata. Their position index is +determined by their order in the schema. + +```python +# Arg[int](0) becomes: +pa.field("count", pa.int64()) + +# Arg[str](1) becomes: +pa.field("name", pa.utf8()) +``` + +## Named Arguments + +Named arguments have `vgi_arg=named` metadata. The field name is the +argument key used in SQL. + +```python +# Arg[str]("format") becomes: +pa.field("format", pa.utf8(), metadata={b"vgi_arg": b"named"}) +``` + +## Special Types + +### Table Input + +Table input arguments (`Arg[TableInput]`) receive streaming RecordBatches +rather than scalar values. + +- **Arrow type**: `pa.null()` +- **Metadata**: `{b"vgi_type": b"table"}` + +```python +# Arg[TableInput](1) becomes: +pa.field("data", pa.null(), metadata={b"vgi_type": b"table"}) +``` + +### Any Type + +Any-type arguments (`Arg[AnyArrow]`) accept any valid Arrow scalar type +at runtime. + +- **Arrow type**: `pa.null()` +- **Metadata**: `{b"vgi_type": b"any"}` + +```python +# Arg[AnyArrow](0) becomes: +pa.field("value", pa.null(), metadata={b"vgi_type": b"any"}) +``` + +### Variable Arguments + +Varargs arguments (`varargs=True`) collect all remaining positional +arguments from their position onwards. + +- **Arrow type**: The element type (e.g., `pa.int64()` for int varargs) +- **Metadata**: `{b"vgi_varargs": b"true"}` + +```python +# Arg[str](0, varargs=True) becomes: +pa.field("columns", pa.utf8(), metadata={b"vgi_varargs": b"true"}) +``` + +## Combined Metadata + +Fields can have multiple metadata keys. For example, a named argument +that accepts any type: + +```python +# Arg[AnyArrow]("threshold") becomes: +pa.field("threshold", pa.null(), metadata={ + b"vgi_arg": b"named", + b"vgi_type": b"any", +}) +``` + +## Complete Examples + +### Example 1: Simple Function + +```python +class MyFunction(TableInOutFunction): + count = Arg[int](0) # Positional 0 + name = Arg[str](1) # Positional 1 + verbose = Arg[bool]("verbose") # Named + +# Serializes to: +schema = pa.schema([ + pa.field("count", pa.int64()), + pa.field("name", pa.utf8()), + pa.field("verbose", pa.bool_(), metadata={b"vgi_arg": b"named"}), +]) +``` + +### Example 2: Function with Table Input + +```python +class TransformFunction(TableInOutFunction): + multiplier = Arg[float](0) + data: TableInput = Arg[TableInput](1) + +# Serializes to: +schema = pa.schema([ + pa.field("multiplier", pa.float64()), + pa.field("data", pa.null(), metadata={b"vgi_type": b"table"}), +]) +``` + +### Example 3: Function with Varargs + +```python +class SumColumnsFunction(TableInOutFunction): + columns = Arg[str](0, varargs=True) + +# Serializes to: +schema = pa.schema([ + pa.field("columns", pa.utf8(), metadata={b"vgi_varargs": b"true"}), +]) +``` + +### Example 4: Complex Function + +```python +class ComplexFunction(TableInOutFunction): + count = Arg[int](0) + data: TableInput = Arg[TableInput](1) + extra = Arg[float](2, varargs=True) + format = Arg[str]("format") + threshold: AnyArrow = Arg[AnyArrow]("threshold") + +# Serializes to: +schema = pa.schema([ + pa.field("count", pa.int64()), + pa.field("data", pa.null(), metadata={b"vgi_type": b"table"}), + pa.field("extra", pa.float64(), metadata={b"vgi_varargs": b"true"}), + pa.field("format", pa.utf8(), metadata={b"vgi_arg": b"named"}), + pa.field("threshold", pa.null(), metadata={ + b"vgi_arg": b"named", + b"vgi_type": b"any", + }), +]) +``` + +## Serialization Code + +### Serialize to Bytes + +```python +from vgi.argument_spec import argument_specs_to_schema + +# Create schema from specs +schema = argument_specs_to_schema(specs) + +# Serialize to bytes +schema_bytes = schema.serialize().to_pybytes() +``` + +### Deserialize from Bytes + +```python +import pyarrow as pa +from vgi.argument_spec import schema_to_argument_specs + +# Deserialize schema +schema = pa.ipc.read_schema(pa.py_buffer(schema_bytes)) + +# Convert to ArgumentSpec objects +specs = schema_to_argument_specs(schema) +``` + +## Parsing Algorithm + +To parse a schema back to argument specifications: + +1. Initialize `position_index = 0` +2. For each field in schema: + - Check if field has `vgi_arg=named` metadata + - If named: `position = field.name` (string) + - If positional: `position = position_index`, then increment `position_index` + - Check for `vgi_type` metadata (`table` or `any`) + - Check for `vgi_varargs` metadata + - Create `ArgumentSpec` with extracted info + +```python +def parse_schema(schema): + specs = [] + position_index = 0 + + for field in schema: + metadata = field.metadata or {} + + # Determine position type + if metadata.get(b"vgi_arg") == b"named": + position = field.name # Named argument + else: + position = position_index # Positional argument + position_index += 1 + + # Check special types + vgi_type = metadata.get(b"vgi_type") + is_table_input = (vgi_type == b"table") + is_any_type = (vgi_type == b"any") + is_varargs = (metadata.get(b"vgi_varargs") == b"true") + + specs.append(ArgumentSpec( + name=field.name, + position=position, + arrow_type=field.type, + is_table_input=is_table_input, + is_any_type=is_any_type, + is_varargs=is_varargs, + )) + + return specs +``` + +## Not Included + +The following are **not** serialized in the schema: + +- **Default values** - handled at runtime by the `Arg` descriptor +- **Validation constraints** (`ge`, `le`, `choices`, `pattern`) - Python-side validation +- **Documentation strings** - available via `ParameterInfo` in metadata + +These are implementation details of the Python function runtime, not part +of the argument type specification needed for function registration. diff --git a/tests/test_argument_spec.py b/tests/test_argument_spec.py new file mode 100644 index 0000000..6f787d6 --- /dev/null +++ b/tests/test_argument_spec.py @@ -0,0 +1,500 @@ +"""Tests for Arrow-based argument specification serialization.""" + +from __future__ import annotations + +from typing import Any + +import pyarrow as pa +import pytest + +from vgi.argument_spec import ( + VGI_ARG_KEY, + VGI_ARG_NAMED, + VGI_TYPE_ANY, + VGI_TYPE_KEY, + VGI_TYPE_TABLE, + VGI_VARARGS_KEY, + VGI_VARARGS_TRUE, + ArgumentSpec, + argument_specs_to_schema, + extract_argument_specs, + schema_to_argument_specs, +) +from vgi.arguments import AnyArrow, Arg, TableInput +from vgi.table_in_out_function import TableInOutFunction + + +class TestArgumentSpecToSchema: + """Test converting ArgumentSpec objects to Arrow schema.""" + + def test_positional_arguments_preserve_order(self) -> None: + """Positional arguments should maintain their order in schema.""" + specs = [ + ArgumentSpec(name="third", position=2, arrow_type=pa.float64()), + ArgumentSpec(name="first", position=0, arrow_type=pa.int64()), + ArgumentSpec(name="second", position=1, arrow_type=pa.utf8()), + ] + schema = argument_specs_to_schema(specs) + + assert len(schema) == 3 + assert schema.field(0).name == "first" + assert schema.field(1).name == "second" + assert schema.field(2).name == "third" + assert schema.field(0).type == pa.int64() + assert schema.field(1).type == pa.utf8() + assert schema.field(2).type == pa.float64() + + def test_named_arguments_have_metadata(self) -> None: + """Named arguments should have vgi_arg=named metadata.""" + specs = [ + ArgumentSpec(name="format", position="format", arrow_type=pa.utf8()), + ] + schema = argument_specs_to_schema(specs) + + assert len(schema) == 1 + field = schema.field(0) + assert field.name == "format" + assert field.metadata is not None + assert field.metadata.get(VGI_ARG_KEY) == VGI_ARG_NAMED + + def test_mixed_positional_and_named(self) -> None: + """Mixed args should have positional first, then named.""" + specs = [ + ArgumentSpec(name="verbose", position="verbose", arrow_type=pa.bool_()), + ArgumentSpec(name="count", position=0, arrow_type=pa.int64()), + ArgumentSpec(name="format", position="format", arrow_type=pa.utf8()), + ArgumentSpec(name="name", position=1, arrow_type=pa.utf8()), + ] + schema = argument_specs_to_schema(specs) + + # Positional come first (sorted by index) + assert schema.field(0).name == "count" + assert schema.field(1).name == "name" + # Named come after (sorted alphabetically) + assert schema.field(2).name == "format" + assert schema.field(3).name == "verbose" + + # Named args have metadata + assert schema.field(0).metadata is None + assert schema.field(1).metadata is None + field2_meta = schema.field(2).metadata + field3_meta = schema.field(3).metadata + assert field2_meta is not None + assert field3_meta is not None + assert field2_meta.get(VGI_ARG_KEY) == VGI_ARG_NAMED + assert field3_meta.get(VGI_ARG_KEY) == VGI_ARG_NAMED + + def test_table_input_uses_null_type_and_metadata(self) -> None: + """TableInput args should use pa.null() with vgi_type=table.""" + specs = [ + ArgumentSpec( + name="data", + position=0, + arrow_type=pa.null(), + is_table_input=True, + ), + ] + schema = argument_specs_to_schema(specs) + + field = schema.field(0) + assert field.type == pa.null() + assert field.metadata is not None + assert field.metadata.get(VGI_TYPE_KEY) == VGI_TYPE_TABLE + + def test_any_type_uses_null_type_and_metadata(self) -> None: + """AnyArrow args should use pa.null() with vgi_type=any.""" + specs = [ + ArgumentSpec( + name="value", + position=0, + arrow_type=pa.null(), + is_any_type=True, + ), + ] + schema = argument_specs_to_schema(specs) + + field = schema.field(0) + assert field.type == pa.null() + assert field.metadata is not None + assert field.metadata.get(VGI_TYPE_KEY) == VGI_TYPE_ANY + + def test_varargs_has_metadata(self) -> None: + """Varargs should preserve element type and have vgi_varargs=true.""" + specs = [ + ArgumentSpec( + name="columns", + position=0, + arrow_type=pa.utf8(), + is_varargs=True, + ), + ] + schema = argument_specs_to_schema(specs) + + field = schema.field(0) + assert field.type == pa.utf8() # Element type preserved + assert field.metadata is not None + assert field.metadata.get(VGI_VARARGS_KEY) == VGI_VARARGS_TRUE + + def test_empty_specs(self) -> None: + """Empty specs should produce empty schema.""" + schema = argument_specs_to_schema([]) + assert len(schema) == 0 + + +class TestSchemaToArgumentSpecs: + """Test converting Arrow schema back to ArgumentSpec objects.""" + + def test_positional_arguments_from_schema(self) -> None: + """Fields without named metadata should be positional.""" + fields: list[pa.Field[Any]] = [ + pa.field("a", pa.int64()), + pa.field("b", pa.utf8()), + pa.field("c", pa.float64()), + ] + schema = pa.schema(fields) + specs = schema_to_argument_specs(schema) + + assert len(specs) == 3 + assert specs[0].position == 0 + assert specs[1].position == 1 + assert specs[2].position == 2 + assert specs[0].name == "a" + assert specs[1].name == "b" + assert specs[2].name == "c" + + def test_named_arguments_from_metadata(self) -> None: + """Fields with vgi_arg=named should have string position.""" + schema = pa.schema( + [ + pa.field("format", pa.utf8(), metadata={VGI_ARG_KEY: VGI_ARG_NAMED}), + ] + ) + specs = schema_to_argument_specs(schema) + + assert len(specs) == 1 + assert specs[0].position == "format" + assert specs[0].name == "format" + + def test_table_input_detected(self) -> None: + """vgi_type=table metadata should set is_table_input.""" + schema = pa.schema( + [ + pa.field("data", pa.null(), metadata={VGI_TYPE_KEY: VGI_TYPE_TABLE}), + ] + ) + specs = schema_to_argument_specs(schema) + + assert specs[0].is_table_input is True + assert specs[0].is_any_type is False + + def test_any_type_detected(self) -> None: + """vgi_type=any metadata should set is_any_type.""" + schema = pa.schema( + [ + pa.field("value", pa.null(), metadata={VGI_TYPE_KEY: VGI_TYPE_ANY}), + ] + ) + specs = schema_to_argument_specs(schema) + + assert specs[0].is_any_type is True + assert specs[0].is_table_input is False + + def test_varargs_detected(self) -> None: + """vgi_varargs=true metadata should set is_varargs.""" + schema = pa.schema( + [ + pa.field( + "cols", pa.utf8(), metadata={VGI_VARARGS_KEY: VGI_VARARGS_TRUE} + ), + ] + ) + specs = schema_to_argument_specs(schema) + + assert specs[0].is_varargs is True + + def test_mixed_positional_and_named_positions(self) -> None: + """Position index should only increment for positional args.""" + fields: list[pa.Field[Any]] = [ + pa.field("a", pa.int64()), # positional 0 + pa.field("b", pa.utf8()), # positional 1 + pa.field("key", pa.bool_(), metadata={VGI_ARG_KEY: VGI_ARG_NAMED}), + ] + schema = pa.schema(fields) + specs = schema_to_argument_specs(schema) + + assert specs[0].position == 0 + assert specs[1].position == 1 + assert specs[2].position == "key" # named, not 2 + + +class TestRoundTrip: + """Test that specs survive serialization round-trip.""" + + def test_simple_round_trip(self) -> None: + """Basic specs should round-trip correctly.""" + original = [ + ArgumentSpec(name="count", position=0, arrow_type=pa.int64()), + ArgumentSpec(name="name", position=1, arrow_type=pa.utf8()), + ] + schema = argument_specs_to_schema(original) + restored = schema_to_argument_specs(schema) + + assert len(restored) == 2 + assert restored[0].name == "count" + assert restored[0].position == 0 + assert restored[0].arrow_type == pa.int64() + assert restored[1].name == "name" + assert restored[1].position == 1 + assert restored[1].arrow_type == pa.utf8() + + @pytest.mark.parametrize( + "arrow_type", + [ + pa.int64(), + pa.int32(), + pa.float32(), + pa.float64(), + pa.utf8(), + pa.bool_(), + pa.binary(), + pa.list_(pa.float64()), + pa.struct([pa.field("a", pa.int32()), pa.field("b", pa.string())]), + pa.map_(pa.string(), pa.int64()), + pa.decimal128(10, 2), + pa.timestamp("us", tz="UTC"), + pa.date32(), + pa.time64("us"), + pa.duration("ms"), + ], + ) + def test_complex_arrow_types_preserved(self, arrow_type: pa.DataType) -> None: + """Complex Arrow types should survive round-trip.""" + original = [ArgumentSpec(name="arg", position=0, arrow_type=arrow_type)] + schema = argument_specs_to_schema(original) + + # Serialize to bytes and back + schema_bytes = schema.serialize().to_pybytes() + restored_schema = pa.ipc.read_schema(pa.py_buffer(schema_bytes)) + + restored = schema_to_argument_specs(restored_schema) + assert restored[0].arrow_type == arrow_type + + def test_full_function_signature_roundtrip(self) -> None: + """Complete function signature should round-trip.""" + original = [ + ArgumentSpec(name="count", position=0, arrow_type=pa.int64()), + ArgumentSpec( + name="data", position=1, arrow_type=pa.null(), is_table_input=True + ), + ArgumentSpec( + name="extra", position=2, arrow_type=pa.float64(), is_varargs=True + ), + ArgumentSpec(name="format", position="format", arrow_type=pa.utf8()), + ArgumentSpec( + name="threshold", + position="threshold", + arrow_type=pa.null(), + is_any_type=True, + ), + ] + + schema = argument_specs_to_schema(original) + + # Full serialization round-trip + schema_bytes = schema.serialize().to_pybytes() + restored_schema = pa.ipc.read_schema(pa.py_buffer(schema_bytes)) + restored = schema_to_argument_specs(restored_schema) + + assert len(restored) == 5 + + # Positional args + assert restored[0].name == "count" + assert restored[0].position == 0 + assert restored[0].arrow_type == pa.int64() + + assert restored[1].name == "data" + assert restored[1].position == 1 + assert restored[1].is_table_input is True + + assert restored[2].name == "extra" + assert restored[2].position == 2 + assert restored[2].is_varargs is True + + # Named args + assert restored[3].name == "format" + assert restored[3].position == "format" + + assert restored[4].name == "threshold" + assert restored[4].position == "threshold" + assert restored[4].is_any_type is True + + +class TestExtractArgumentSpecs: + """Test extracting specs from function classes.""" + + def test_extract_from_simple_function(self) -> None: + """Extract specs from function with basic Arg descriptors.""" + + class SimpleFunction(TableInOutFunction): + count = Arg[int](0) + name = Arg[str](1) + + arg_types: dict[str, pa.DataType] = {"count": pa.int64(), "name": pa.utf8()} + specs = extract_argument_specs(SimpleFunction, arg_types) + + assert len(specs) == 2 + assert specs[0].name == "count" + assert specs[0].position == 0 + assert specs[0].arrow_type == pa.int64() + assert specs[1].name == "name" + assert specs[1].position == 1 + assert specs[1].arrow_type == pa.utf8() + + def test_extract_table_input(self) -> None: + """Extract specs should detect Arg[TableInput].""" + + class FunctionWithTable(TableInOutFunction): + multiplier = Arg[float](0) + data: TableInput = Arg[TableInput](1) # type: ignore[assignment] + + arg_types: dict[str, pa.DataType] = { + "multiplier": pa.float64(), + "data": pa.null(), + } + specs = extract_argument_specs(FunctionWithTable, arg_types) + + assert len(specs) == 2 + assert specs[1].name == "data" + assert specs[1].is_table_input is True + + def test_extract_any_arrow(self) -> None: + """Extract specs should detect Arg[AnyArrow].""" + + class FunctionWithAny(TableInOutFunction): + value: AnyArrow = Arg[AnyArrow](0) # type: ignore[assignment] + + arg_types: dict[str, pa.DataType] = {"value": pa.null()} + specs = extract_argument_specs(FunctionWithAny, arg_types) + + assert len(specs) == 1 + assert specs[0].is_any_type is True + + def test_extract_varargs(self) -> None: + """Extract specs should detect varargs=True.""" + + class FunctionWithVarargs(TableInOutFunction): + columns = Arg[str](0, varargs=True) + + arg_types: dict[str, pa.DataType] = {"columns": pa.utf8()} + specs = extract_argument_specs(FunctionWithVarargs, arg_types) + + assert len(specs) == 1 + assert specs[0].is_varargs is True + + def test_extract_named_arguments(self) -> None: + """Extract specs should handle named arguments.""" + + class FunctionWithNamed(TableInOutFunction): + count = Arg[int](0) + format = Arg[str]("format") + + arg_types: dict[str, pa.DataType] = {"count": pa.int64(), "format": pa.utf8()} + specs = extract_argument_specs(FunctionWithNamed, arg_types) + + assert len(specs) == 2 + assert specs[0].position == 0 + assert specs[1].position == "format" + + def test_extract_mixed_arguments(self) -> None: + """Extract specs should handle mixed positional and named args.""" + + class ComplexFunction(TableInOutFunction): + count = Arg[int](0) + data: TableInput = Arg[TableInput](1) # type: ignore[assignment] + extra = Arg[float](2, varargs=True) + format = Arg[str]("format") + threshold: AnyArrow = Arg[AnyArrow]("threshold") # type: ignore[assignment] + + arg_types: dict[str, pa.DataType] = { + "count": pa.int64(), + "data": pa.null(), + "extra": pa.float64(), + "format": pa.utf8(), + "threshold": pa.null(), + } + specs = extract_argument_specs(ComplexFunction, arg_types) + + assert len(specs) == 5 + + # Positional first + assert specs[0].name == "count" + assert specs[0].position == 0 + + assert specs[1].name == "data" + assert specs[1].position == 1 + assert specs[1].is_table_input is True + + assert specs[2].name == "extra" + assert specs[2].position == 2 + assert specs[2].is_varargs is True + + # Named after + assert specs[3].name == "format" + assert specs[3].position == "format" + + assert specs[4].name == "threshold" + assert specs[4].position == "threshold" + assert specs[4].is_any_type is True + + +class TestEdgeCases: + """Test edge cases and boundary conditions.""" + + def test_empty_schema_roundtrip(self) -> None: + """Empty specs/schema should round-trip.""" + schema = argument_specs_to_schema([]) + assert len(schema) == 0 + + specs = schema_to_argument_specs(schema) + assert specs == [] + + def test_only_named_arguments(self) -> None: + """Function with only named arguments should work.""" + specs = [ + ArgumentSpec(name="format", position="format", arrow_type=pa.utf8()), + ArgumentSpec(name="verbose", position="verbose", arrow_type=pa.bool_()), + ] + schema = argument_specs_to_schema(specs) + restored = schema_to_argument_specs(schema) + + assert len(restored) == 2 + assert all(isinstance(s.position, str) for s in restored) + + def test_only_positional_arguments(self) -> None: + """Function with only positional arguments should work.""" + specs = [ + ArgumentSpec(name="a", position=0, arrow_type=pa.int64()), + ArgumentSpec(name="b", position=1, arrow_type=pa.utf8()), + ] + schema = argument_specs_to_schema(specs) + restored = schema_to_argument_specs(schema) + + assert len(restored) == 2 + assert all(isinstance(s.position, int) for s in restored) + + def test_combined_metadata(self) -> None: + """Named argument with special type should have both metadata keys.""" + specs = [ + ArgumentSpec( + name="threshold", + position="threshold", + arrow_type=pa.null(), + is_any_type=True, + ), + ] + schema = argument_specs_to_schema(specs) + + field = schema.field(0) + assert field.metadata is not None + assert field.metadata.get(VGI_ARG_KEY) == VGI_ARG_NAMED + assert field.metadata.get(VGI_TYPE_KEY) == VGI_TYPE_ANY diff --git a/vgi/__init__.py b/vgi/__init__.py index 6e818f7..5295f13 100644 --- a/vgi/__init__.py +++ b/vgi/__init__.py @@ -134,6 +134,11 @@ class Meta: """ # Re-export commonly used classes for convenient imports +from vgi.argument_spec import ( + ArgumentSpec, + argument_specs_to_schema, + schema_to_argument_specs, +) from vgi.arguments import AnyArrow, Arg, Arguments, ArgumentValidationError, TableInput from vgi.invocation import Invocation from vgi.log import Level, Message @@ -174,8 +179,10 @@ class Meta: "AggregationFunction", "AnyArrow", "Arg", + "ArgumentSpec", "ArgumentValidationError", "Arguments", + "argument_specs_to_schema", "FilterFunction", "FunctionExample", "FunctionStability", @@ -204,6 +211,7 @@ class Meta: "hello", "schema", "schema_like", + "schema_to_argument_specs", "streaming", ] diff --git a/vgi/argument_spec.py b/vgi/argument_spec.py new file mode 100644 index 0000000..ac3359d --- /dev/null +++ b/vgi/argument_spec.py @@ -0,0 +1,314 @@ +"""Arrow-based serialization of function argument specifications. + +This module provides classes and functions for serializing function argument +specifications to Apache Arrow schemas. This enables functions to describe +their argument signatures (types, positions, special markers) in a format +that can be transmitted over IPC and understood by DuckDB for function +registration. + +The serialization uses a single Arrow schema where: +- Positional arguments come first (field order = position index) +- Named arguments follow (marked with metadata) +- Special types (TableInput, AnyArrow, varargs) use field metadata markers + +Example: + # Define argument specs + specs = [ + ArgumentSpec(name="count", position=0, arrow_type=pa.int64()), + ArgumentSpec( + name="data", position=1, arrow_type=pa.null(), is_table_input=True + ), + ArgumentSpec(name="format", position="format", arrow_type=pa.utf8()), + ] + + # Serialize to Arrow schema + schema = argument_specs_to_schema(specs) + + # Serialize schema to bytes for IPC + schema_bytes = schema.serialize().to_pybytes() + + # Deserialize + schema = pa.ipc.read_schema(pa.py_buffer(schema_bytes)) + specs = schema_to_argument_specs(schema) + +""" + +from collections.abc import Sequence +from dataclasses import dataclass +from typing import Any, get_type_hints + +import pyarrow as pa + +from vgi.arguments import AnyArrow, Arg, TableInput + +__all__ = [ + "ArgumentSpec", + "argument_specs_to_schema", + "extract_argument_specs", + "schema_to_argument_specs", +] + +# ============================================================================= +# Metadata Keys +# ============================================================================= + +# Key indicating a named argument (not positional) +VGI_ARG_KEY = b"vgi_arg" +VGI_ARG_NAMED = b"named" + +# Key indicating special argument types +VGI_TYPE_KEY = b"vgi_type" +VGI_TYPE_TABLE = b"table" +VGI_TYPE_ANY = b"any" + +# Key indicating varargs (collects remaining positional arguments) +VGI_VARARGS_KEY = b"vgi_varargs" +VGI_VARARGS_TRUE = b"true" + + +# ============================================================================= +# ArgumentSpec Dataclass +# ============================================================================= + + +@dataclass(frozen=True) +class ArgumentSpec: + """Specification for a single function argument. + + This represents one argument in a function's signature, capturing: + - The argument's name and position (positional index or named key) + - The exact Arrow data type + - Special markers for table input, any-type, and varargs + + Attributes: + name: Python attribute name for the argument. + position: Positional index (int) for positional args, or the named key + (str) for named arguments. + arrow_type: The Arrow data type. Use pa.null() for TableInput and + AnyArrow types. + is_table_input: True if this argument receives streaming table input + (Arg[TableInput]). + is_any_type: True if this argument accepts any Arrow type + (Arg[AnyArrow]). + is_varargs: True if this argument collects all remaining positional + arguments (varargs=True). + + """ + + name: str + position: int | str + arrow_type: pa.DataType + is_table_input: bool = False + is_any_type: bool = False + is_varargs: bool = False + + +# ============================================================================= +# Serialization Functions +# ============================================================================= + + +def argument_specs_to_schema(specs: Sequence[ArgumentSpec]) -> pa.Schema: + """Convert ArgumentSpecs to a single Arrow schema. + + The schema encodes the argument specifications as follows: + - Positional arguments come first, in order (field index = position index) + - Named arguments follow, each with metadata {b"vgi_arg": b"named"} + - Special types are indicated via metadata: + - TableInput: {b"vgi_type": b"table"} + - AnyArrow: {b"vgi_type": b"any"} + - varargs: {b"vgi_varargs": b"true"} + + Args: + specs: Sequence of ArgumentSpec objects to serialize. + + Returns: + Arrow schema with one field per argument. + + Example: + specs = [ + ArgumentSpec(name="count", position=0, arrow_type=pa.int64()), + ArgumentSpec(name="format", position="format", arrow_type=pa.utf8()), + ] + schema = argument_specs_to_schema(specs) + # schema has fields: count (int64), format (utf8 with vgi_arg=named) + + """ + + # Sort: positional first (by index), then named (alphabetically) + def sort_key(spec: ArgumentSpec) -> tuple[int, int | str]: + if isinstance(spec.position, int): + return (0, spec.position) + return (1, spec.position) + + sorted_specs = sorted(specs, key=sort_key) + + fields: list[pa.Field[Any]] = [] + for spec in sorted_specs: + # Build metadata dict + metadata: dict[bytes, bytes] = {} + + if isinstance(spec.position, str): + metadata[VGI_ARG_KEY] = VGI_ARG_NAMED + + if spec.is_table_input: + metadata[VGI_TYPE_KEY] = VGI_TYPE_TABLE + elif spec.is_any_type: + metadata[VGI_TYPE_KEY] = VGI_TYPE_ANY + + if spec.is_varargs: + metadata[VGI_VARARGS_KEY] = VGI_VARARGS_TRUE + + # Create field with or without metadata + field = pa.field( + spec.name, + spec.arrow_type, + metadata=metadata if metadata else None, + ) + fields.append(field) + + return pa.schema(fields) + + +def schema_to_argument_specs(schema: pa.Schema) -> list[ArgumentSpec]: + """Convert Arrow schema back to ArgumentSpecs. + + Parses the schema fields and their metadata to reconstruct the original + ArgumentSpec objects. + + Args: + schema: Arrow schema with argument fields. + + Returns: + List of ArgumentSpec objects in schema field order. + + Example: + schema = pa.schema([ + pa.field("count", pa.int64()), + pa.field("format", pa.utf8(), metadata={b"vgi_arg": b"named"}), + ]) + specs = schema_to_argument_specs(schema) + # specs[0].position == 0, specs[1].position == "format" + + """ + specs: list[ArgumentSpec] = [] + position_index = 0 + + for field in schema: + metadata = field.metadata or {} + + # Determine position + is_named = metadata.get(VGI_ARG_KEY) == VGI_ARG_NAMED + if is_named: + position: int | str = field.name + else: + position = position_index + position_index += 1 + + # Check special type markers + vgi_type = metadata.get(VGI_TYPE_KEY) + is_table_input = vgi_type == VGI_TYPE_TABLE + is_any_type = vgi_type == VGI_TYPE_ANY + + # Check varargs + is_varargs = metadata.get(VGI_VARARGS_KEY) == VGI_VARARGS_TRUE + + specs.append( + ArgumentSpec( + name=field.name, + position=position, + arrow_type=field.type, + is_table_input=is_table_input, + is_any_type=is_any_type, + is_varargs=is_varargs, + ) + ) + + return specs + + +# ============================================================================= +# Extraction from Function Classes +# ============================================================================= + + +def extract_argument_specs( + cls: type, + arg_types: dict[str, pa.DataType], +) -> list[ArgumentSpec]: + """Extract ArgumentSpecs from a function class with Arg descriptors. + + Walks the class hierarchy to find all Arg descriptors and creates + ArgumentSpec objects with the provided Arrow types. + + Args: + cls: Function class with Arg descriptors. + arg_types: Mapping from argument attribute names to their Arrow types. + For TableInput and AnyArrow arguments, use pa.null(). + + Returns: + List of ArgumentSpec objects, sorted by position (positional first, + then named). + + Example: + class MyFunction(TableInOutFunction): + count = Arg[int](0) + format = Arg[str]("format") + + arg_types = {"count": pa.int64(), "format": pa.utf8()} + specs = extract_argument_specs(MyFunction, arg_types) + + """ + specs: list[ArgumentSpec] = [] + seen_names: set[str] = set() + + # Get type hints for detecting TableInput/AnyArrow + try: + hints = get_type_hints(cls) + except (NameError, AttributeError): + hints = {} + + # Walk MRO to find all Arg descriptors + for klass in cls.__mro__: + if klass is object: + continue + + for attr_name, attr_value in vars(klass).items(): + if attr_name.startswith("_"): + continue + if attr_name in seen_names: + continue + + if isinstance(attr_value, Arg): + seen_names.add(attr_name) + arg: Arg[Any] = attr_value + + # Get Arrow type from provided mapping + arrow_type = arg_types.get(attr_name, pa.null()) + + # Check type hint for special types + hint = hints.get(attr_name) + is_table_input = hint is TableInput + is_any_type = hint is AnyArrow + + # Check varargs flag + is_varargs = arg.varargs + + specs.append( + ArgumentSpec( + name=attr_name, + position=arg.position, + arrow_type=arrow_type, + is_table_input=is_table_input, + is_any_type=is_any_type, + is_varargs=is_varargs, + ) + ) + + # Sort: positional first (by index), then named (alphabetically) + def sort_key(spec: ArgumentSpec) -> tuple[int, int | str]: + if isinstance(spec.position, int): + return (0, spec.position) + return (1, spec.position) + + return sorted(specs, key=sort_key)