From 11838ae4f72af4cef7293375d6f80b002ddf6484 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 26 Mar 2026 15:47:02 -0700 Subject: [PATCH 01/52] python(feat): add data import api --- .../low_level_wrappers/data_imports.py | 212 ++++++++++++++ python/lib/sift_client/client.py | 7 + python/lib/sift_client/resources/__init__.py | 4 + .../lib/sift_client/resources/data_imports.py | 239 ++++++++++++++++ .../resources/sync_stubs/__init__.py | 3 + .../resources/sync_stubs/__init__.pyi | 146 ++++++++++ .../lib/sift_client/sift_types/data_import.py | 269 ++++++++++++++++++ python/lib/sift_client/util/util.py | 4 + 8 files changed, 884 insertions(+) create mode 100644 python/lib/sift_client/_internal/low_level_wrappers/data_imports.py create mode 100644 python/lib/sift_client/resources/data_imports.py create mode 100644 python/lib/sift_client/sift_types/data_import.py diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py new file mode 100644 index 000000000..d83f42142 --- /dev/null +++ b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py @@ -0,0 +1,212 @@ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, cast + +from sift.data_imports.v2.data_imports_pb2 import ( + CreateDataImportFromUploadRequest, + CreateDataImportFromUploadResponse, + CreateDataImportFromUrlRequest, + CreateDataImportFromUrlResponse, + DetectConfigRequest, + DetectConfigResponse, + GetDataImportRequest, + GetDataImportResponse, + ListDataImportsRequest, + ListDataImportsResponse, + RetryDataImportRequest, +) +from sift.data_imports.v2.data_imports_pb2_grpc import DataImportServiceStub + +from sift_client._internal.low_level_wrappers.base import LowLevelClientBase +from sift_client._internal.util.executor import run_sync_function +from sift_client.sift_types.data_import import CsvImportConfig, DataImport +from sift_client.transport import 
WithGrpcClient, WithRestClient + +if TYPE_CHECKING: + from pathlib import Path + + from sift.data_imports.v2.data_imports_pb2 import DataTypeKey + + from sift_client.transport.grpc_transport import GrpcClient + from sift_client.transport.rest_transport import RestClient + +# Union of all supported config types. Extend this as new formats are added. +ImportConfig = CsvImportConfig + + +def _set_config_on_request( + request: CreateDataImportFromUploadRequest | CreateDataImportFromUrlRequest, + config: ImportConfig, +) -> None: + """Set the appropriate config field on a proto request based on the config type.""" + if isinstance(config, CsvImportConfig): + request.csv_config.CopyFrom(config._to_proto()) + else: + raise TypeError(f"Unsupported import config type: {type(config).__name__}") + + +logger = logging.getLogger(__name__) + + +class DataImportsLowLevelClient(LowLevelClientBase, WithGrpcClient, WithRestClient): + """Low-level client for the DataImportService. + + This class provides a thin wrapper around the autogenerated bindings for the DataImportsAPI. + """ + + def __init__(self, grpc_client: GrpcClient, rest_client: RestClient): + WithGrpcClient.__init__(self, grpc_client=grpc_client) + WithRestClient.__init__(self, rest_client=rest_client) + + async def create_from_upload(self, config: ImportConfig) -> tuple[str, str]: + """Create a data import and get back a presigned upload URL. + + Args: + config: The import configuration. + + Returns: + A tuple of (data_import_id, upload_url). + """ + request = CreateDataImportFromUploadRequest() + _set_config_on_request(request, config) + response = await self._grpc_client.get_stub( + DataImportServiceStub + ).CreateDataImportFromUpload(request) + response = cast("CreateDataImportFromUploadResponse", response) + return response.data_import_id, response.upload_url + + async def upload_file(self, upload_url: str, file_path: Path) -> None: + """Upload a file to a presigned URL. 
+ + Runs the synchronous HTTP POST in a thread pool to avoid blocking + the event loop. + + Args: + upload_url: The presigned URL to upload to. + file_path: Path to the file to upload. + """ + rest_client = self._rest_client + + def _do_upload() -> None: + with open(file_path, "rb") as f: + response = rest_client.post(upload_url, data=f) + response.raise_for_status() + + await run_sync_function(_do_upload) + + async def create_from_url(self, url: str, config: ImportConfig) -> str: + """Create a data import from a remote URL. + + Args: + url: The URL to import from (HTTP or S3). + config: The import configuration. + + Returns: + The data_import_id. + """ + request = CreateDataImportFromUrlRequest(url=url) + _set_config_on_request(request, config) + response = await self._grpc_client.get_stub(DataImportServiceStub).CreateDataImportFromUrl( + request + ) + response = cast("CreateDataImportFromUrlResponse", response) + return response.data_import_id + + async def get(self, data_import_id: str) -> DataImport: + """Get a data import by ID. + + Args: + data_import_id: The ID of the data import. + + Returns: + The DataImport. + """ + request = GetDataImportRequest(data_import_id=data_import_id) + response = await self._grpc_client.get_stub(DataImportServiceStub).GetDataImport(request) + response = cast("GetDataImportResponse", response) + return DataImport._from_proto(response.data_import) + + async def list_( + self, + *, + page_size: int | None = None, + page_token: str | None = None, + query_filter: str = "", + order_by: str = "", + ) -> tuple[list[DataImport], str]: + """List data imports with optional filtering and pagination. + + Args: + page_size: Maximum number of results per page. + page_token: Token for the next page of results. + query_filter: CEL filter string. + order_by: Ordering string (e.g. "created_date desc"). + + Returns: + A tuple of (list of DataImports, next_page_token). 
+ """ + request = ListDataImportsRequest( + filter=query_filter, + order_by=order_by, + ) + if page_size is not None: + request.page_size = page_size + if page_token: + request.page_token = page_token + + response = await self._grpc_client.get_stub(DataImportServiceStub).ListDataImports(request) + response = cast("ListDataImportsResponse", response) + data_imports = [DataImport._from_proto(di) for di in response.data_imports] + return data_imports, response.next_page_token + + async def list_all( + self, + *, + query_filter: str = "", + order_by: str = "", + max_results: int | None = None, + ) -> list[DataImport]: + """List all data imports, handling pagination automatically. + + Args: + query_filter: CEL filter string. + order_by: Ordering string (e.g. "created_date desc"). + max_results: Maximum total results to return. + + Returns: + A list of all matching DataImports. + """ + return await self._handle_pagination( + func=self.list_, + kwargs={"query_filter": query_filter, "order_by": order_by}, + max_results=max_results, + ) + + async def retry(self, data_import_id: str) -> None: + """Retry a failed data import. + + Only works for URL-based imports in a failed state. + + Args: + data_import_id: The ID of the data import to retry. + """ + request = RetryDataImportRequest(data_import_id=data_import_id) + await self._grpc_client.get_stub(DataImportServiceStub).RetryDataImport(request) + + async def detect_config( + self, data: bytes, data_type_key: DataTypeKey.ValueType + ) -> DetectConfigResponse: + """Call the DetectConfig RPC to auto-detect import configuration. + + Args: + data: A sample of the file content (e.g. the first 64 KiB). + data_type_key: The file type hint. + + Returns: + The raw DetectConfigResponse proto. The caller (resource API) + is responsible for converting to a sift_type. 
+ """ + request = DetectConfigRequest(data=data, type=data_type_key) + response = await self._grpc_client.get_stub(DataImportServiceStub).DetectConfig(request) + return cast("DetectConfigResponse", response) diff --git a/python/lib/sift_client/client.py b/python/lib/sift_client/client.py index ed7aeba9a..95fd25b71 100644 --- a/python/lib/sift_client/client.py +++ b/python/lib/sift_client/client.py @@ -9,6 +9,8 @@ ChannelsAPIAsync, DataExportAPI, DataExportAPIAsync, + DataImportAPI, + DataImportAPIAsync, FileAttachmentsAPI, FileAttachmentsAPIAsync, IngestionAPIAsync, @@ -110,6 +112,9 @@ class SiftClient( data_export: DataExportAPI """Instance of the Data Export API for making synchronous requests.""" + data_import: DataImportAPI + """Instance of the Data Import API for making synchronous requests.""" + async_: AsyncAPIs """Accessor for the asynchronous APIs. All asynchronous APIs are available as attributes on this accessor.""" @@ -159,6 +164,7 @@ def __init__( self.tags = TagsAPI(self) self.test_results = TestResultsAPI(self) self.data_export = DataExportAPI(self) + self.data_import = DataImportAPI(self) # Accessor for the asynchronous APIs self.async_ = AsyncAPIs( @@ -175,6 +181,7 @@ def __init__( tags=TagsAPIAsync(self), test_results=TestResultsAPIAsync(self), data_export=DataExportAPIAsync(self), + data_import=DataImportAPIAsync(self), ) @property diff --git a/python/lib/sift_client/resources/__init__.py b/python/lib/sift_client/resources/__init__.py index 78b3b4eba..2b7a4c55b 100644 --- a/python/lib/sift_client/resources/__init__.py +++ b/python/lib/sift_client/resources/__init__.py @@ -162,6 +162,7 @@ async def main(): from sift_client.resources.runs import RunsAPIAsync from sift_client.resources.tags import TagsAPIAsync from sift_client.resources.test_results import TestResultsAPIAsync +from sift_client.resources.data_imports import DataImportAPIAsync from sift_client.resources.exports import DataExportAPIAsync # ruff: noqa All imports needs to be imported 
before sync_stubs to avoid circular import @@ -178,6 +179,7 @@ async def main(): TestResultsAPI, FileAttachmentsAPI, DataExportAPI, + DataImportAPI, ) import sys @@ -215,4 +217,6 @@ async def main(): "TracingConfig", "DataExportAPI", "DataExportAPIAsync", + "DataImportAPI", + "DataImportAPIAsync", ] diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py new file mode 100644 index 000000000..8ec2a3706 --- /dev/null +++ b/python/lib/sift_client/resources/data_imports.py @@ -0,0 +1,239 @@ +from __future__ import annotations + +import asyncio +import logging +import time +from pathlib import Path +from typing import TYPE_CHECKING + +from sift.data_imports.v2.data_imports_pb2 import DATA_TYPE_KEY_CSV + +from sift_client._internal.low_level_wrappers.data_imports import DataImportsLowLevelClient +from sift_client.resources._base import ResourceBase +from sift_client.sift_types.data_import import ( + CsvImportConfig, + DataImport, + DataImportStatus, +) +from sift_client.util import cel_utils as cel + +if TYPE_CHECKING: + from sift_client._internal.low_level_wrappers.data_imports import ImportConfig + from sift_client.client import SiftClient + +logger = logging.getLogger(__name__) + +_DETECT_CONFIG_SAMPLE_SIZE = 65_536 # 64 KiB + + +class DataImportAPIAsync(ResourceBase): + """High-level API for importing data into Sift. + + Supports importing data from local files or remote URLs. Returns a + `DataImport` object that can be polled for status. + """ + + def __init__(self, sift_client: SiftClient): + """Initialize the DataImportAPI. + + Args: + sift_client: The Sift client to use. + """ + super().__init__(sift_client) + self._low_level_client = DataImportsLowLevelClient( + grpc_client=self.client.grpc_client, + rest_client=self.client.rest_client, + ) + + async def import_from_path( + self, + *, + file_path: str | Path, + config: ImportConfig, + ) -> DataImport: + """Import data from a local file. 
+ + Creates a data import on the server and uploads the file to the + returned presigned URL. Returns a :class:`DataImport` that can be + polled for status via ``data_import.refresh()``. + + Args: + file_path: Path to the local file to import. + config: Import configuration describing the file format and column + mapping. + + Returns: + A :class:`DataImport` representing the import operation. + + Raises: + FileNotFoundError: If the file does not exist. + """ + path = Path(file_path) + if not path.is_file(): + raise FileNotFoundError(f"File not found: {file_path}") + + data_import_id, upload_url = await self._low_level_client.create_from_upload(config) + logger.info("Created data import %s", data_import_id) + + await self._low_level_client.upload_file(upload_url, path) + logger.info("Uploaded file to presigned URL for import %s", data_import_id) + + data_import = await self._low_level_client.get(data_import_id) + return self._apply_client_to_instance(data_import) + + async def import_from_url( + self, + *, + url: str, + config: ImportConfig, + ) -> DataImport: + """Import data from a remote URL (HTTP or S3). + + Returns a :class:`DataImport` that can be polled for status via + ``data_import.refresh()``. + + Args: + url: The URL to import from. + config: Import configuration describing the file format and column + mapping. + + Returns: + A :class:`DataImport` representing the import operation. + """ + data_import_id = await self._low_level_client.create_from_url(url, config) + logger.info("Created URL-based data import %s", data_import_id) + + data_import = await self._low_level_client.get(data_import_id) + return self._apply_client_to_instance(data_import) + + async def get(self, data_import_id: str) -> DataImport: + """Get a data import by ID. + + Args: + data_import_id: The ID of the data import. + + Returns: + The DataImport. 
+ """ + data_import = await self._low_level_client.get(data_import_id) + return self._apply_client_to_instance(data_import) + + async def list_( + self, + *, + data_import_ids: list[str] | None = None, + status: DataImportStatus | None = None, + filter_query: str | None = None, + order_by: str | None = None, + limit: int | None = None, + ) -> list[DataImport]: + """List data imports with optional filtering. + + Args: + data_import_ids: Filter to imports with any of these IDs. + status: Filter to imports with this status. + filter_query: Explicit CEL filter string. + order_by: Ordering string (e.g. "created_date desc"). + limit: Maximum number of imports to return. If None, returns all. + + Returns: + A list of DataImport objects matching the filter criteria. + """ + filter_parts = [] + if data_import_ids: + filter_parts.append(cel.in_("data_import_id", data_import_ids)) + if status is not None: + filter_parts.append(cel.equals("status", str(status.value))) + if filter_query: + filter_parts.append(filter_query) + query_filter = cel.and_(*filter_parts) + + data_imports = await self._low_level_client.list_all( + query_filter=query_filter or "", + order_by=order_by or "", + max_results=limit, + ) + return self._apply_client_to_instances(data_imports) + + async def retry(self, data_import: str | DataImport) -> None: + """Retry a failed data import. + + Only works for URL-based imports in a failed state. + + Args: + data_import: The DataImport or data_import_id to retry. + """ + data_import_id = ( + data_import._id_or_error if isinstance(data_import, DataImport) else data_import + ) + await self._low_level_client.retry(data_import_id) + + async def detect_config(self, file_path: str | Path) -> CsvImportConfig: + """Auto-detect import configuration from a file. + + Reads a sample of the file, sends it to the server's DetectConfig + endpoint, and returns the detected configuration. You can inspect + and modify the result before passing it to :meth:`import_from_path`. 
+ + Currently supports CSV files only. + + Args: + file_path: Path to the file to analyze. + + Returns: + The detected import config. + + Raises: + FileNotFoundError: If the file does not exist. + ValueError: If detection returns no config. + """ + path = Path(file_path) + if not path.is_file(): + raise FileNotFoundError(f"File not found: {file_path}") + + with open(path, "rb") as f: + sample = f.read(_DETECT_CONFIG_SAMPLE_SIZE) + + response = await self._low_level_client.detect_config(sample, DATA_TYPE_KEY_CSV) + + if response.HasField("csv_config"): + return CsvImportConfig._from_proto(response.csv_config) + + raise ValueError("Server returned an empty DetectConfig response.") + + async def wait_until_complete( + self, + data_import: str | DataImport, + *, + polling_interval_secs: int = 5, + timeout_secs: int | None = None, + ) -> DataImport: + """Wait until a data import reaches a terminal state. + + Polls the import status at the given interval until the import is + SUCCEEDED or FAILED, returning the completed DataImport. + + Args: + data_import: The DataImport or data_import_id to wait for. + polling_interval_secs: Seconds between status polls. Defaults to 5s. + timeout_secs: Maximum seconds to wait. If None, polls indefinitely. + Defaults to None (indefinite). + + Returns: + The DataImport in its terminal state. + """ + data_import_id = ( + data_import._id_or_error if isinstance(data_import, DataImport) else data_import + ) + + start = time.monotonic() + while True: + result = await self.get(data_import_id) + if result.is_complete: + return result + if timeout_secs is not None and (time.monotonic() - start) >= timeout_secs: + raise TimeoutError( + f"Data import '{data_import_id}' did not complete " + f"within {timeout_secs} seconds." 
+ ) + await asyncio.sleep(polling_interval_secs) diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.py b/python/lib/sift_client/resources/sync_stubs/__init__.py index acd73755e..982a028c6 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.py +++ b/python/lib/sift_client/resources/sync_stubs/__init__.py @@ -8,6 +8,7 @@ CalculatedChannelsAPIAsync, ChannelsAPIAsync, DataExportAPIAsync, + DataImportAPIAsync, FileAttachmentsAPIAsync, JobsAPIAsync, PingAPIAsync, @@ -30,12 +31,14 @@ TagsAPI = generate_sync_api(TagsAPIAsync, "TagsAPI") TestResultsAPI = generate_sync_api(TestResultsAPIAsync, "TestResultsAPI") DataExportAPI = generate_sync_api(DataExportAPIAsync, "DataExportAPI") +DataImportAPI = generate_sync_api(DataImportAPIAsync, "DataImportAPI") __all__ = [ "AssetsAPI", "CalculatedChannelsAPI", "ChannelsAPI", "DataExportAPI", + "DataImportAPI", "FileAttachmentsAPI", "JobsAPI", "PingAPI", diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index fe87809cd..a96efe70c 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -13,6 +13,9 @@ if TYPE_CHECKING: import pandas as pd import pyarrow as pa + from sift_client._internal.low_level_wrappers.data_imports import ( + ImportConfig, + ) from sift_client.client import SiftClient from sift_client.sift_types.asset import Asset, AssetUpdate from sift_client.sift_types.calculated_channel import ( @@ -21,6 +24,7 @@ if TYPE_CHECKING: CalculatedChannelUpdate, ) from sift_client.sift_types.channel import Channel + from sift_client.sift_types.data_import import CsvImportConfig, DataImport, DataImportStatus from sift_client.sift_types.export import ExportOutputFormat from sift_client.sift_types.file_attachment import ( FileAttachment, @@ -621,6 +625,148 @@ class DataExportAPI: """ ... +class DataImportAPI: + """Sync counterpart to `DataImportAPIAsync`. 
+ + High-level API for importing data into Sift. + + Supports importing data from local files or remote URLs. Returns a + `DataImport` object that can be polled for status. + """ + + def __init__(self, sift_client: SiftClient): + """Initialize the DataImportAPI. + + Args: + sift_client: The Sift client to use. + """ + ... + + def _run(self, coro): ... + def detect_config(self, file_path: str | Path) -> CsvImportConfig: + """Auto-detect import configuration from a file. + + Reads a sample of the file, sends it to the server's DetectConfig + endpoint, and returns the detected configuration. You can inspect + and modify the result before passing it to :meth:`import_from_path`. + + Currently supports CSV files only. + + Args: + file_path: Path to the file to analyze. + + Returns: + The detected import config. + + Raises: + FileNotFoundError: If the file does not exist. + ValueError: If detection returns no config. + """ + ... + + def get(self, data_import_id: str) -> DataImport: + """Get a data import by ID. + + Args: + data_import_id: The ID of the data import. + + Returns: + The DataImport. + """ + ... + + def import_from_path(self, *, file_path: str | Path, config: ImportConfig) -> DataImport: + """Import data from a local file. + + Creates a data import on the server and uploads the file to the + returned presigned URL. Returns a :class:`DataImport` that can be + polled for status via ``data_import.refresh()``. + + Args: + file_path: Path to the local file to import. + config: Import configuration describing the file format and column + mapping. + + Returns: + A :class:`DataImport` representing the import operation. + + Raises: + FileNotFoundError: If the file does not exist. + """ + ... + + def import_from_url(self, *, url: str, config: ImportConfig) -> DataImport: + """Import data from a remote URL (HTTP or S3). + + Returns a :class:`DataImport` that can be polled for status via + ``data_import.refresh()``. + + Args: + url: The URL to import from. 
+ config: Import configuration describing the file format and column + mapping. + + Returns: + A :class:`DataImport` representing the import operation. + """ + ... + + def list_( + self, + *, + data_import_ids: list[str] | None = None, + status: DataImportStatus | None = None, + filter_query: str | None = None, + order_by: str | None = None, + limit: int | None = None, + ) -> list[DataImport]: + """List data imports with optional filtering. + + Args: + data_import_ids: Filter to imports with any of these IDs. + status: Filter to imports with this status. + filter_query: Explicit CEL filter string. + order_by: Ordering string (e.g. "created_date desc"). + limit: Maximum number of imports to return. If None, returns all. + + Returns: + A list of DataImport objects matching the filter criteria. + """ + ... + + def retry(self, data_import: str | DataImport) -> None: + """Retry a failed data import. + + Only works for URL-based imports in a failed state. + + Args: + data_import: The DataImport or data_import_id to retry. + """ + ... + + def wait_until_complete( + self, + data_import: str | DataImport, + *, + polling_interval_secs: int = 5, + timeout_secs: int | None = None, + ) -> DataImport: + """Wait until a data import reaches a terminal state. + + Polls the import status at the given interval until the import is + SUCCEEDED or FAILED, returning the completed DataImport. + + Args: + data_import: The DataImport or data_import_id to wait for. + polling_interval_secs: Seconds between status polls. Defaults to 5s. + timeout_secs: Maximum seconds to wait. If None, polls indefinitely. + Defaults to None (indefinite). + + Returns: + The DataImport in its terminal state. + """ + ... + class FileAttachmentsAPI: """Sync counterpart to `FileAttachmentsAPIAsync`. 
diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py new file mode 100644 index 000000000..fc0bf119a --- /dev/null +++ b/python/lib/sift_client/sift_types/data_import.py @@ -0,0 +1,269 @@ +from __future__ import annotations + +from datetime import datetime # noqa: TC003 +from enum import Enum +from typing import TYPE_CHECKING + +from pydantic import BaseModel, ConfigDict +from sift.common.type.v1.channel_config_pb2 import ChannelConfig as ChannelConfigProto +from sift.data_imports.v2.data_imports_pb2 import CsvConfig as CsvConfigProto +from sift.data_imports.v2.data_imports_pb2 import CsvTimeColumn as CsvTimeColumnProto +from sift.data_imports.v2.data_imports_pb2 import DataImport as DataImportProto +from sift.data_imports.v2.data_imports_pb2 import DataImportStatus as DataImportStatusProto +from sift.data_imports.v2.data_imports_pb2 import TimeFormat as TimeFormatProto + +from sift_client._internal.util.timestamp import to_pb_timestamp +from sift_client.sift_types._base import BaseType +from sift_client.sift_types.channel import ChannelDataType + +if TYPE_CHECKING: + from sift_client.client import SiftClient + + +# --------------------------------------------------------------------------- +# Enums +# --------------------------------------------------------------------------- + + +class TimeFormat(Enum): + """Supported time formats for data import columns.""" + + RELATIVE_NANOSECONDS = TimeFormatProto.TIME_FORMAT_RELATIVE_NANOSECONDS + RELATIVE_MICROSECONDS = TimeFormatProto.TIME_FORMAT_RELATIVE_MICROSECONDS + RELATIVE_MILLISECONDS = TimeFormatProto.TIME_FORMAT_RELATIVE_MILLISECONDS + RELATIVE_SECONDS = TimeFormatProto.TIME_FORMAT_RELATIVE_SECONDS + RELATIVE_MINUTES = TimeFormatProto.TIME_FORMAT_RELATIVE_MINUTES + RELATIVE_HOURS = TimeFormatProto.TIME_FORMAT_RELATIVE_HOURS + ABSOLUTE_RFC3339 = TimeFormatProto.TIME_FORMAT_ABSOLUTE_RFC3339 + ABSOLUTE_DATETIME = TimeFormatProto.TIME_FORMAT_ABSOLUTE_DATETIME 
+ ABSOLUTE_UNIX_SECONDS = TimeFormatProto.TIME_FORMAT_ABSOLUTE_UNIX_SECONDS + ABSOLUTE_UNIX_MILLISECONDS = TimeFormatProto.TIME_FORMAT_ABSOLUTE_UNIX_MILLISECONDS + ABSOLUTE_UNIX_MICROSECONDS = TimeFormatProto.TIME_FORMAT_ABSOLUTE_UNIX_MICROSECONDS + ABSOLUTE_UNIX_NANOSECONDS = TimeFormatProto.TIME_FORMAT_ABSOLUTE_UNIX_NANOSECONDS + + +class DataImportStatus(Enum): + """Status of a data import.""" + + PENDING = DataImportStatusProto.DATA_IMPORT_STATUS_PENDING + IN_PROGRESS = DataImportStatusProto.DATA_IMPORT_STATUS_IN_PROGRESS + SUCCEEDED = DataImportStatusProto.DATA_IMPORT_STATUS_SUCCEEDED + FAILED = DataImportStatusProto.DATA_IMPORT_STATUS_FAILED + + +# --------------------------------------------------------------------------- +# CSV config types +# --------------------------------------------------------------------------- + + +class CsvTimeColumn(BaseModel): + """Time column configuration for CSV imports. + + Attributes: + column: The 1-indexed column number of the time column. + format: The time format used in this column. + relative_start_time: Required when using a relative time format. + """ + + model_config = ConfigDict(frozen=True) + + column: int + format: TimeFormat + relative_start_time: datetime | None = None + + def _to_proto(self) -> CsvTimeColumnProto: + proto = CsvTimeColumnProto( + column_number=self.column, + format=self.format.value, + ) + if self.relative_start_time is not None: + proto.relative_start_time.CopyFrom(to_pb_timestamp(self.relative_start_time)) + return proto + + +class CsvDataColumn(BaseModel): + """A data column definition for CSV imports. + + Attributes: + column: The 1-indexed column number. + name: Channel name. + data_type: The data type of the channel values. + units: Optional units string. + description: Optional channel description. 
+ """ + + model_config = ConfigDict(frozen=True) + + column: int + name: str + data_type: ChannelDataType + units: str = "" + description: str = "" + + +class CsvImportConfig(BaseModel): + """Configuration for importing a CSV file. + + Attributes: + asset_name: Name of the asset to import data into. + run_name: Name for the run. Ignored if ``run_id`` is set. + run_id: ID of an existing run to append data to. + first_data_row: The first row containing data (1-indexed). Defaults to 2 to skip a header row. + time_column: Time column configuration. + data_columns: List of data column definitions. + """ + + model_config = ConfigDict(frozen=True) + + asset_name: str + run_name: str | None = None + run_id: str | None = None + first_data_row: int = 2 + time_column: CsvTimeColumn + data_columns: list[CsvDataColumn] + + def _to_proto(self) -> CsvConfigProto: + return CsvConfigProto( + asset_name=self.asset_name, + run_name=self.run_name or "", + run_id=self.run_id or "", + first_data_row=self.first_data_row, + time_column=self.time_column._to_proto(), + data_columns={ + dc.column: ChannelConfigProto( + name=dc.name, + data_type=dc.data_type.value, + units=dc.units, + description=dc.description, + ) + for dc in self.data_columns + }, + ) + + @classmethod + def _from_proto(cls, proto: CsvConfigProto) -> CsvImportConfig: + """Create from a proto CsvConfig (e.g. 
from DetectConfig response).""" + time_column = CsvTimeColumn( + column=proto.time_column.column_number, + format=TimeFormat(proto.time_column.format), + ) + data_columns = [ + CsvDataColumn( + column=col_num, + name=ch_cfg.name, + data_type=ChannelDataType(ch_cfg.data_type), + units=ch_cfg.units, + description=ch_cfg.description, + ) + for col_num, ch_cfg in proto.data_columns.items() + ] + return cls( + asset_name=proto.asset_name, + run_name=proto.run_name or None, + run_id=proto.run_id or None, + first_data_row=proto.first_data_row or 2, + time_column=time_column, + data_columns=data_columns, + ) + + +# --------------------------------------------------------------------------- +# DataImport resource type +# --------------------------------------------------------------------------- + + +class DataImport(BaseType[DataImportProto, "DataImport"]): + """A data import in the Sift system. + + Represents the status and metadata of an import operation. Use + ``client.data_import.upload()`` to create one, or ``client.data_import.get()`` + to retrieve an existing import by ID. 
+ """ + + # Required fields + status: DataImportStatus + created_date: datetime + modified_date: datetime + + # Optional fields + error_message: str | None + source_url: str | None + run_id: str | None + report_id: str | None + asset_id: str | None + data_start_time: datetime | None + data_stop_time: datetime | None + + # Config used for this import + csv_config: CsvImportConfig | None + + @classmethod + def _from_proto( + cls, proto: DataImportProto, sift_client: SiftClient | None = None + ) -> DataImport: + from datetime import timezone + + return cls( + proto=proto, + id_=proto.data_import_id, + status=DataImportStatus(proto.status), + error_message=proto.error_message or None, + created_date=proto.created_date.ToDatetime(tzinfo=timezone.utc), + modified_date=proto.modified_date.ToDatetime(tzinfo=timezone.utc), + source_url=proto.source_url or None, + run_id=proto.run_id if proto.HasField("_run_id") else None, + report_id=proto.report_id if proto.HasField("_report_id") else None, + asset_id=proto.asset_id if proto.HasField("_asset_id") else None, + data_start_time=( + proto.data_start_time.ToDatetime(tzinfo=timezone.utc) + if proto.HasField("_data_start_time") + else None + ), + data_stop_time=( + proto.data_stop_time.ToDatetime(tzinfo=timezone.utc) + if proto.HasField("_data_stop_time") + else None + ), + csv_config=( + CsvImportConfig._from_proto(proto.csv_config) + if proto.HasField("csv_config") + else None + ), + _client=sift_client, + ) + + @property + def is_pending(self) -> bool: + """Return True if the import is pending.""" + return self.status == DataImportStatus.PENDING + + @property + def is_in_progress(self) -> bool: + """Return True if the import is in progress.""" + return self.status == DataImportStatus.IN_PROGRESS + + @property + def is_succeeded(self) -> bool: + """Return True if the import succeeded.""" + return self.status == DataImportStatus.SUCCEEDED + + @property + def is_failed(self) -> bool: + """Return True if the import failed.""" + 
return self.status == DataImportStatus.FAILED + + @property + def is_complete(self) -> bool: + """Return True if the import reached a terminal state (succeeded or failed).""" + return self.status in (DataImportStatus.SUCCEEDED, DataImportStatus.FAILED) + + def refresh(self) -> DataImport: + """Refresh this import with the latest data from the API.""" + updated = self.client.data_import.get(self._id_or_error) + self._update(updated) + return self + + def retry(self) -> None: + """Retry this import. Only works for URL-based imports in a failed state.""" + self.client.data_import.retry(self._id_or_error) + self.refresh() diff --git a/python/lib/sift_client/util/util.py b/python/lib/sift_client/util/util.py index e82a8ccfe..98719cfdd 100644 --- a/python/lib/sift_client/util/util.py +++ b/python/lib/sift_client/util/util.py @@ -8,6 +8,7 @@ CalculatedChannelsAPIAsync, ChannelsAPIAsync, DataExportAPIAsync, + DataImportAPIAsync, FileAttachmentsAPIAsync, IngestionAPIAsync, JobsAPIAsync, @@ -62,6 +63,9 @@ class AsyncAPIs(NamedTuple): data_export: DataExportAPIAsync """Instance of the Data Export API for making asynchronous requests.""" + data_import: DataImportAPIAsync + """Instance of the Data Import API for making asynchronous requests.""" + def count_non_none(*args: Any) -> int: """Count the number of non-none arguments.""" From 7224b79c6a7ed16fda0f717ff23fa9f5f3df2c75 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Fri, 27 Mar 2026 08:46:42 -0700 Subject: [PATCH 02/52] add detect config data types --- .../lib/sift_client/resources/data_imports.py | 70 ++++++++++++++++--- .../resources/sync_stubs/__init__.pyi | 36 ++++++++-- .../lib/sift_client/sift_types/data_import.py | 28 ++++++++ 3 files changed, 116 insertions(+), 18 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 8ec2a3706..bc2ac9cdf 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ 
b/python/lib/sift_client/resources/data_imports.py @@ -6,11 +6,10 @@ from pathlib import Path from typing import TYPE_CHECKING -from sift.data_imports.v2.data_imports_pb2 import DATA_TYPE_KEY_CSV - from sift_client._internal.low_level_wrappers.data_imports import DataImportsLowLevelClient from sift_client.resources._base import ResourceBase from sift_client.sift_types.data_import import ( + EXTENSION_TO_DATA_TYPE_KEY, CsvImportConfig, DataImport, DataImportStatus, @@ -49,7 +48,10 @@ async def import_from_path( self, *, file_path: str | Path, - config: ImportConfig, + config: ImportConfig | None = None, + asset_name: str | None = None, + run_name: str | None = None, + run_id: str | None = None, ) -> DataImport: """Import data from a local file. @@ -57,21 +59,47 @@ async def import_from_path( returned presigned URL. Returns a :class:`DataImport` that can be polled for status via ``data_import.refresh()``. + When ``config`` is omitted the file format is auto-detected via + :meth:`detect_config` and a :class:`CsvImportConfig` is built using + the provided ``asset_name`` and optional ``run_name`` / ``run_id``. + Args: file_path: Path to the local file to import. config: Import configuration describing the file format and column - mapping. + mapping. When provided, ``asset_name``, ``run_name``, and + ``run_id`` are ignored. + asset_name: Name of the asset to import into. Required when + ``config`` is not provided. + run_name: Optional run name. Only used when ``config`` is not + provided. + run_id: Optional existing run ID. Only used when ``config`` is not + provided. Returns: A :class:`DataImport` representing the import operation. Raises: FileNotFoundError: If the file does not exist. + ValueError: If neither ``config`` nor ``asset_name`` is provided. 
""" path = Path(file_path) if not path.is_file(): raise FileNotFoundError(f"File not found: {file_path}") + if config is None: + if asset_name is None: + raise ValueError( + "Either 'config' or 'asset_name' must be provided." + ) + detected = await self.detect_config(file_path) + config = detected.model_copy( + update={ + "asset_name": asset_name, + "run_name": run_name, + "run_id": run_id, + } + ) + data_import_id, upload_url = await self._low_level_client.create_from_upload(config) logger.info("Created data import %s", data_import_id) @@ -168,14 +196,15 @@ async def retry(self, data_import: str | DataImport) -> None: ) await self._low_level_client.retry(data_import_id) - async def detect_config(self, file_path: str | Path) -> CsvImportConfig: + async def detect_config(self, file_path: str | Path) -> ImportConfig: """Auto-detect import configuration from a file. Reads a sample of the file, sends it to the server's DetectConfig - endpoint, and returns the detected configuration. You can inspect - and modify the result before passing it to :meth:`import_from_path`. + endpoint, and returns the detected configuration. The file format + is inferred from the file extension. You can inspect and modify the + result before passing it to :meth:`import_from_path`. - Currently supports CSV files only. + Supported extensions: .csv, .parquet, .tdms, .ch10, .ch11, .h5, .hdf5 Args: file_path: Path to the file to analyze. @@ -185,19 +214,38 @@ async def detect_config(self, file_path: str | Path) -> CsvImportConfig: Raises: FileNotFoundError: If the file does not exist. - ValueError: If detection returns no config. + ValueError: If the file extension is unsupported or detection + returns no config. """ path = Path(file_path) if not path.is_file(): raise FileNotFoundError(f"File not found: {file_path}") + ext = path.suffix.lower() + data_type_key = EXTENSION_TO_DATA_TYPE_KEY.get(ext) + if data_type_key is None: + raise ValueError( + f"Unsupported file extension '{ext}'. 
" + f"Supported: {', '.join(sorted(EXTENSION_TO_DATA_TYPE_KEY))}" + ) + with open(path, "rb") as f: sample = f.read(_DETECT_CONFIG_SAMPLE_SIZE) - response = await self._low_level_client.detect_config(sample, DATA_TYPE_KEY_CSV) + response = await self._low_level_client.detect_config(sample, data_type_key.value) if response.HasField("csv_config"): - return CsvImportConfig._from_proto(response.csv_config) + config = CsvImportConfig._from_proto(response.csv_config) + # The server's DetectConfig may include the time column in + # data_columns, but CreateDataImportFromUpload rejects that + # overlap. Filter it out so the config is import-ready. + time_col = config.time_column.column + filtered = [dc for dc in config.data_columns if dc.column != time_col] + if len(filtered) != len(config.data_columns): + config = config.model_copy(update={"data_columns": filtered}) + return config + + # TODO: Add parquet_config and hdf5_config once their config types are added. raise ValueError("Server returned an empty DetectConfig response.") diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index a96efe70c..ce8d04cf5 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -643,14 +643,15 @@ class DataImportAPI: ... def _run(self, coro): ... - def detect_config(self, file_path: str | Path) -> CsvImportConfig: + def detect_config(self, file_path: str | Path) -> ImportConfig: """Auto-detect import configuration from a file. Reads a sample of the file, sends it to the server's DetectConfig - endpoint, and returns the detected configuration. You can inspect - and modify the result before passing it to :meth:`import_from_path`. + endpoint, and returns the detected configuration. The file format + is inferred from the file extension. You can inspect and modify the + result before passing it to :meth:`import_from_path`. 
- Currently supports CSV files only. + Supported extensions: .csv, .parquet, .tdms, .ch10, .ch11, .h5, .hdf5 Args: file_path: Path to the file to analyze. @@ -660,7 +661,8 @@ class DataImportAPI: Raises: FileNotFoundError: If the file does not exist. - ValueError: If detection returns no config. + ValueError: If the file extension is unsupported or detection + returns no config. """ ... @@ -675,23 +677,43 @@ class DataImportAPI: """ ... - def import_from_path(self, *, file_path: str | Path, config: ImportConfig) -> DataImport: + def import_from_path( + self, + *, + file_path: str | Path, + config: ImportConfig | None = None, + asset_name: str | None = None, + run_name: str | None = None, + run_id: str | None = None, + ) -> DataImport: """Import data from a local file. Creates a data import on the server and uploads the file to the returned presigned URL. Returns a :class:`DataImport` that can be polled for status via ``data_import.refresh()``. + When ``config`` is omitted the file format is auto-detected via + :meth:`detect_config` and a :class:`CsvImportConfig` is built using + the provided ``asset_name`` and optional ``run_name`` / ``run_id``. + Args: file_path: Path to the local file to import. config: Import configuration describing the file format and column - mapping. + mapping. When provided, ``asset_name``, ``run_name``, and + ``run_id`` are ignored. + asset_name: Name of the asset to import into. Required when + ``config`` is not provided. + run_name: Optional run name. Only used when ``config`` is not + provided. + run_id: Optional existing run ID. Only used when ``config`` is not + provided. Returns: A :class:`DataImport` representing the import operation. Raises: FileNotFoundError: If the file does not exist. + ValueError: If neither ``config`` nor ``asset_name`` is provided. """ ... 
diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index fc0bf119a..3ced4e9f6 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -6,6 +6,13 @@ from pydantic import BaseModel, ConfigDict from sift.common.type.v1.channel_config_pb2 import ChannelConfig as ChannelConfigProto +from sift.data_imports.v2.data_imports_pb2 import ( + DATA_TYPE_KEY_CH10, + DATA_TYPE_KEY_CSV, + DATA_TYPE_KEY_HDF5, + DATA_TYPE_KEY_PARQUET_FLATDATASET, + DATA_TYPE_KEY_TDMS, +) from sift.data_imports.v2.data_imports_pb2 import CsvConfig as CsvConfigProto from sift.data_imports.v2.data_imports_pb2 import CsvTimeColumn as CsvTimeColumnProto from sift.data_imports.v2.data_imports_pb2 import DataImport as DataImportProto @@ -51,6 +58,27 @@ class DataImportStatus(Enum): FAILED = DataImportStatusProto.DATA_IMPORT_STATUS_FAILED +class DataTypeKey(Enum): + """Supported file types for data import detection.""" + + CSV = DATA_TYPE_KEY_CSV + PARQUET = DATA_TYPE_KEY_PARQUET_FLATDATASET + TDMS = DATA_TYPE_KEY_TDMS + CH10 = DATA_TYPE_KEY_CH10 + HDF5 = DATA_TYPE_KEY_HDF5 + + +EXTENSION_TO_DATA_TYPE_KEY: dict[str, DataTypeKey] = { + ".csv": DataTypeKey.CSV, + ".parquet": DataTypeKey.PARQUET, + ".tdms": DataTypeKey.TDMS, + ".ch10": DataTypeKey.CH10, + ".ch11": DataTypeKey.CH10, + ".h5": DataTypeKey.HDF5, + ".hdf5": DataTypeKey.HDF5, +} + + # --------------------------------------------------------------------------- # CSV config types # --------------------------------------------------------------------------- From d27b0703c047517c2418e91bdbb7eb708d6afec3 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Wed, 1 Apr 2026 14:52:24 -0700 Subject: [PATCH 03/52] added relative time validation, refactored the import process --- .../lib/sift_client/resources/data_imports.py | 35 ++++++++++++------- .../resources/sync_stubs/__init__.pyi | 2 +- .../lib/sift_client/sift_types/data_import.py | 
12 +++++-- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index bc2ac9cdf..a03024188 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING from sift_client._internal.low_level_wrappers.data_imports import DataImportsLowLevelClient +from sift_client._internal.util.executor import run_sync_function from sift_client.resources._base import ResourceBase from sift_client.sift_types.data_import import ( EXTENSION_TO_DATA_TYPE_KEY, @@ -25,6 +26,16 @@ _DETECT_CONFIG_SAMPLE_SIZE = 65_536 # 64 KiB +def _validate_config(config: ImportConfig) -> None: + """Validate an import config before sending it to the server.""" + if isinstance(config, CsvImportConfig): + tc = config.time_column + if tc.format.name.startswith("RELATIVE_") and tc.relative_start_time is None: + raise ValueError( + f"'relative_start_time' is required when using a relative time format ({tc.format.name})." + ) + + class DataImportAPIAsync(ResourceBase): """High-level API for importing data into Sift. @@ -55,9 +66,8 @@ async def import_from_path( ) -> DataImport: """Import data from a local file. - Creates a data import on the server and uploads the file to the - returned presigned URL. Returns a :class:`DataImport` that can be - polled for status via ``data_import.refresh()``. + Creates a data import on the server, uploads the file, and waits + for the import to complete. Returns the completed :class:`DataImport`. When ``config`` is omitted the file format is auto-detected via :meth:`detect_config` and a :class:`CsvImportConfig` is built using @@ -88,9 +98,7 @@ async def import_from_path( if config is None: if asset_name is None: - raise ValueError( - "Either 'config' or 'asset_name' must be provided." 
- ) + raise ValueError("Either 'config' or 'asset_name' must be provided.") detected = await self.detect_config(file_path) config = detected.model_copy( update={ @@ -100,14 +108,14 @@ async def import_from_path( } ) + _validate_config(config) data_import_id, upload_url = await self._low_level_client.create_from_upload(config) logger.info("Created data import %s", data_import_id) await self._low_level_client.upload_file(upload_url, path) logger.info("Uploaded file to presigned URL for import %s", data_import_id) - data_import = await self._low_level_client.get(data_import_id) - return self._apply_client_to_instance(data_import) + return await self.wait_until_complete(data_import_id) async def import_from_url( self, @@ -128,11 +136,11 @@ async def import_from_url( Returns: A :class:`DataImport` representing the import operation. """ + _validate_config(config) data_import_id = await self._low_level_client.create_from_url(url, config) logger.info("Created URL-based data import %s", data_import_id) - data_import = await self._low_level_client.get(data_import_id) - return self._apply_client_to_instance(data_import) + return await self.wait_until_complete(data_import_id) async def get(self, data_import_id: str) -> DataImport: """Get a data import by ID. 
@@ -229,8 +237,11 @@ async def detect_config(self, file_path: str | Path) -> ImportConfig: f"Supported: {', '.join(sorted(EXTENSION_TO_DATA_TYPE_KEY))}" ) - with open(path, "rb") as f: - sample = f.read(_DETECT_CONFIG_SAMPLE_SIZE) + def _read_sample() -> bytes: + with open(path, "rb") as f: + return f.read(_DETECT_CONFIG_SAMPLE_SIZE) + + sample = await run_sync_function(_read_sample) response = await self._low_level_client.detect_config(sample, data_type_key.value) diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index ce8d04cf5..e9153aafc 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -24,7 +24,7 @@ if TYPE_CHECKING: CalculatedChannelUpdate, ) from sift_client.sift_types.channel import Channel - from sift_client.sift_types.data_import import CsvImportConfig, DataImport, DataImportStatus + from sift_client.sift_types.data_import import DataImport, DataImportStatus from sift_client.sift_types.export import ExportOutputFormat from sift_client.sift_types.file_attachment import ( FileAttachment, diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index 3ced4e9f6..2d6242276 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -171,9 +171,17 @@ def _to_proto(self) -> CsvConfigProto: @classmethod def _from_proto(cls, proto: CsvConfigProto) -> CsvImportConfig: """Create from a proto CsvConfig (e.g. 
from DetectConfig response).""" + relative_start_time = None + if proto.time_column.HasField("relative_start_time"): + from datetime import timezone + + relative_start_time = proto.time_column.relative_start_time.ToDatetime( + tzinfo=timezone.utc + ) time_column = CsvTimeColumn( column=proto.time_column.column_number, format=TimeFormat(proto.time_column.format), + relative_start_time=relative_start_time, ) data_columns = [ CsvDataColumn( @@ -204,8 +212,8 @@ class DataImport(BaseType[DataImportProto, "DataImport"]): """A data import in the Sift system. Represents the status and metadata of an import operation. Use - ``client.data_import.upload()`` to create one, or ``client.data_import.get()`` - to retrieve an existing import by ID. + ``client.data_import.import_from_path()`` to create one, or + ``client.data_import.get()`` to retrieve an existing import by ID. """ # Required fields From 77dbf868929a821dcab2c55540d4dc890f1aa307 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Wed, 1 Apr 2026 15:28:20 -0700 Subject: [PATCH 04/52] added progress spinner for polling --- .../lib/sift_client/resources/data_imports.py | 59 +++++++++++++++---- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index a03024188..39fa13c2f 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -6,6 +6,9 @@ from pathlib import Path from typing import TYPE_CHECKING +from alive_progress import alive_bar # type: ignore[import-untyped] + +import sift_client as _sift_client_module from sift_client._internal.low_level_wrappers.data_imports import DataImportsLowLevelClient from sift_client._internal.util.executor import run_sync_function from sift_client.resources._base import ResourceBase @@ -63,6 +66,7 @@ async def import_from_path( asset_name: str | None = None, run_name: str | None = None, run_id: str | None = None, + 
show_progress: bool | None = None, ) -> DataImport: """Import data from a local file. @@ -84,6 +88,9 @@ async def import_from_path( provided. run_id: Optional existing run ID. Only used when ``config`` is not provided. + show_progress: If True, display a progress spinner while waiting + for the import to complete. Defaults to True for sync, False + for async. Returns: A :class:`DataImport` representing the import operation. @@ -115,13 +122,14 @@ async def import_from_path( await self._low_level_client.upload_file(upload_url, path) logger.info("Uploaded file to presigned URL for import %s", data_import_id) - return await self.wait_until_complete(data_import_id) + return await self.wait_until_complete(data_import_id, show_progress=show_progress) async def import_from_url( self, *, url: str, config: ImportConfig, + show_progress: bool | None = None, ) -> DataImport: """Import data from a remote URL (HTTP or S3). @@ -132,6 +140,9 @@ async def import_from_url( url: The URL to import from. config: Import configuration describing the file format and column mapping. + show_progress: If True, display a progress spinner while waiting + for the import to complete. Defaults to True for sync, False + for async. Returns: A :class:`DataImport` representing the import operation. @@ -140,7 +151,7 @@ async def import_from_url( data_import_id = await self._low_level_client.create_from_url(url, config) logger.info("Created URL-based data import %s", data_import_id) - return await self.wait_until_complete(data_import_id) + return await self.wait_until_complete(data_import_id, show_progress=show_progress) async def get(self, data_import_id: str) -> DataImport: """Get a data import by ID. @@ -266,6 +277,7 @@ async def wait_until_complete( *, polling_interval_secs: int = 5, timeout_secs: int | None = None, + show_progress: bool | None = None, ) -> DataImport: """Wait until a data import reaches a terminal state. 
@@ -277,6 +289,10 @@ async def wait_until_complete( polling_interval_secs: Seconds between status polls. Defaults to 5s. timeout_secs: Maximum seconds to wait. If None, polls indefinitely. Defaults to None (indefinite). + show_progress: If True, display an animated progress spinner alongside + the import status while polling. Defaults to True for sync, False + for async. Use ``sift_client.config.show_progress = False`` to disable + globally for sync. Returns: The DataImport in its terminal state. @@ -284,15 +300,34 @@ async def wait_until_complete( data_import_id = ( data_import._id_or_error if isinstance(data_import, DataImport) else data_import ) + if show_progress is None: + global_setting = _sift_client_module.config.show_progress + if global_setting is not None: + show_progress = global_setting + elif getattr(self, "_is_sync", False): + show_progress = True + else: + show_progress = False start = time.monotonic() - while True: - result = await self.get(data_import_id) - if result.is_complete: - return result - if timeout_secs is not None and (time.monotonic() - start) >= timeout_secs: - raise TimeoutError( - f"Data import '{data_import_id}' did not complete " - f"within {timeout_secs} seconds." - ) - await asyncio.sleep(polling_interval_secs) + with alive_bar( + title=f"Data Import ID {data_import_id}: polling", + bar=None, + spinner_length=7, + spinner="dots_waves", + monitor=False, + stats=False, + disable=not show_progress, + ) as bar: + while True: + result = await self.get(data_import_id) + bar.title(f"Data Import ID {data_import_id}: {result.status.name}") + bar() + if result.is_complete: + return result + if timeout_secs is not None and (time.monotonic() - start) >= timeout_secs: + raise TimeoutError( + f"Data import '{data_import_id}' did not complete " + f"within {timeout_secs} seconds." 
+ ) + await asyncio.sleep(polling_interval_secs) From bd5e9f81e5062ed2a885d4057cdac5773cafe596 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Wed, 1 Apr 2026 16:38:50 -0700 Subject: [PATCH 05/52] missing run defaults to filename --- .../lib/sift_client/resources/data_imports.py | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 39fa13c2f..f940e38b7 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -26,8 +26,6 @@ logger = logging.getLogger(__name__) -_DETECT_CONFIG_SAMPLE_SIZE = 65_536 # 64 KiB - def _validate_config(config: ImportConfig) -> None: """Validate an import config before sending it to the server.""" @@ -66,6 +64,8 @@ async def import_from_path( asset_name: str | None = None, run_name: str | None = None, run_id: str | None = None, + polling_interval_secs: int = 5, + timeout_secs: int | None = None, show_progress: bool | None = None, ) -> DataImport: """Import data from a local file. @@ -88,6 +88,8 @@ async def import_from_path( provided. run_id: Optional existing run ID. Only used when ``config`` is not provided. + polling_interval_secs: Seconds between status polls. Defaults to 5s. + timeout_secs: Maximum seconds to wait. If None, polls indefinitely. show_progress: If True, display a progress spinner while waiting for the import to complete. Defaults to True for sync, False for async. 
@@ -110,7 +112,7 @@ async def import_from_path( config = detected.model_copy( update={ "asset_name": asset_name, - "run_name": run_name, + "run_name": run_name if run_name or run_id else path.name, "run_id": run_id, } ) @@ -122,13 +124,20 @@ async def import_from_path( await self._low_level_client.upload_file(upload_url, path) logger.info("Uploaded file to presigned URL for import %s", data_import_id) - return await self.wait_until_complete(data_import_id, show_progress=show_progress) + return await self.wait_until_complete( + data_import_id, + polling_interval_secs=polling_interval_secs, + timeout_secs=timeout_secs, + show_progress=show_progress, + ) async def import_from_url( self, *, url: str, config: ImportConfig, + polling_interval_secs: int = 5, + timeout_secs: int | None = None, show_progress: bool | None = None, ) -> DataImport: """Import data from a remote URL (HTTP or S3). @@ -140,6 +149,8 @@ async def import_from_url( url: The URL to import from. config: Import configuration describing the file format and column mapping. + polling_interval_secs: Seconds between status polls. Defaults to 5s. + timeout_secs: Maximum seconds to wait. If None, polls indefinitely. show_progress: If True, display a progress spinner while waiting for the import to complete. Defaults to True for sync, False for async. @@ -151,7 +162,12 @@ async def import_from_url( data_import_id = await self._low_level_client.create_from_url(url, config) logger.info("Created URL-based data import %s", data_import_id) - return await self.wait_until_complete(data_import_id, show_progress=show_progress) + return await self.wait_until_complete( + data_import_id, + polling_interval_secs=polling_interval_secs, + timeout_secs=timeout_secs, + show_progress=show_progress, + ) async def get(self, data_import_id: str) -> DataImport: """Get a data import by ID. 
@@ -250,7 +266,7 @@ async def detect_config(self, file_path: str | Path) -> ImportConfig: def _read_sample() -> bytes: with open(path, "rb") as f: - return f.read(_DETECT_CONFIG_SAMPLE_SIZE) + return f.read(65_536) # 64 KiB sample = await run_sync_function(_read_sample) @@ -267,7 +283,7 @@ def _read_sample() -> bytes: config = config.model_copy(update={"data_columns": filtered}) return config - # TODO: Add parquet_config and hdf5_config once their config types are added. + # TODO: Add other file format configs raise ValueError("Server returned an empty DetectConfig response.") From 41f9c082a3eef7e48d7ad950ab560313cb8bdb0b Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 2 Apr 2026 13:44:02 -0700 Subject: [PATCH 06/52] added relative time format validation in the model --- python/lib/sift_client/resources/data_imports.py | 15 +-------------- python/lib/sift_client/sift_types/data_import.py | 10 +++++++++- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index f940e38b7..3ed12e50d 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -27,16 +27,6 @@ logger = logging.getLogger(__name__) -def _validate_config(config: ImportConfig) -> None: - """Validate an import config before sending it to the server.""" - if isinstance(config, CsvImportConfig): - tc = config.time_column - if tc.format.name.startswith("RELATIVE_") and tc.relative_start_time is None: - raise ValueError( - f"'relative_start_time' is required when using a relative time format ({tc.format.name})." - ) - - class DataImportAPIAsync(ResourceBase): """High-level API for importing data into Sift. 
@@ -112,12 +102,10 @@ async def import_from_path( config = detected.model_copy( update={ "asset_name": asset_name, - "run_name": run_name if run_name or run_id else path.name, + "run_name": run_name if run_name or run_id else path.stem, "run_id": run_id, } ) - - _validate_config(config) data_import_id, upload_url = await self._low_level_client.create_from_upload(config) logger.info("Created data import %s", data_import_id) @@ -158,7 +146,6 @@ async def import_from_url( Returns: A :class:`DataImport` representing the import operation. """ - _validate_config(config) data_import_id = await self._low_level_client.create_from_url(url, config) logger.info("Created URL-based data import %s", data_import_id) diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index 2d6242276..8f6cc0212 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -4,7 +4,7 @@ from enum import Enum from typing import TYPE_CHECKING -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, model_validator from sift.common.type.v1.channel_config_pb2 import ChannelConfig as ChannelConfigProto from sift.data_imports.v2.data_imports_pb2 import ( DATA_TYPE_KEY_CH10, @@ -108,6 +108,14 @@ def _to_proto(self) -> CsvTimeColumnProto: proto.relative_start_time.CopyFrom(to_pb_timestamp(self.relative_start_time)) return proto + @model_validator(mode="after") + def _check_relative_start_time(self) -> CsvTimeColumn: + if self.format.name.startswith("RELATIVE_") and self.relative_start_time is None: + raise ValueError( + f"'relative_start_time' is required when using a relative time format ({self.format.name})." + ) + return self + class CsvDataColumn(BaseModel): """A data column definition for CSV imports. 
From e76d2d2017523ee731ffda9c4f785e9d056808bc Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 2 Apr 2026 14:01:02 -0700 Subject: [PATCH 07/52] updated post request to include file name for downstream file attachment --- .../_internal/low_level_wrappers/data_imports.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py index d83f42142..2e0a3bb50 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py @@ -90,7 +90,13 @@ async def upload_file(self, upload_url: str, file_path: Path) -> None: def _do_upload() -> None: with open(file_path, "rb") as f: - response = rest_client.post(upload_url, data=f) + response = rest_client.post( + upload_url, + data=f, + headers={ + "Content-Disposition": f'attachment; filename="{file_path.name}"' + }, # Preserve original filename for server-side storage. 
+ ) response.raise_for_status() await run_sync_function(_do_upload) From 4cb5ebd93a07675817694d3b7da8004239e2b1f3 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 2 Apr 2026 15:39:16 -0700 Subject: [PATCH 08/52] added parquet data type, refactor to use util --- .../low_level_wrappers/data_imports.py | 36 ++----------- python/lib/sift_client/_internal/util/file.py | 26 ++++++++++ .../lib/sift_client/resources/data_imports.py | 50 +++++++++++++------ .../lib/sift_client/sift_types/data_import.py | 21 ++------ 4 files changed, 66 insertions(+), 67 deletions(-) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py index 2e0a3bb50..5cd1b91e5 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py @@ -19,17 +19,13 @@ from sift.data_imports.v2.data_imports_pb2_grpc import DataImportServiceStub from sift_client._internal.low_level_wrappers.base import LowLevelClientBase -from sift_client._internal.util.executor import run_sync_function from sift_client.sift_types.data_import import CsvImportConfig, DataImport -from sift_client.transport import WithGrpcClient, WithRestClient +from sift_client.transport import WithGrpcClient if TYPE_CHECKING: - from pathlib import Path - from sift.data_imports.v2.data_imports_pb2 import DataTypeKey from sift_client.transport.grpc_transport import GrpcClient - from sift_client.transport.rest_transport import RestClient # Union of all supported config types. Extend this as new formats are added. ImportConfig = CsvImportConfig @@ -49,15 +45,14 @@ def _set_config_on_request( logger = logging.getLogger(__name__) -class DataImportsLowLevelClient(LowLevelClientBase, WithGrpcClient, WithRestClient): +class DataImportsLowLevelClient(LowLevelClientBase, WithGrpcClient): """Low-level client for the DataImportService. 
This class provides a thin wrapper around the autogenerated bindings for the DataImportsAPI. """ - def __init__(self, grpc_client: GrpcClient, rest_client: RestClient): + def __init__(self, grpc_client: GrpcClient): WithGrpcClient.__init__(self, grpc_client=grpc_client) - WithRestClient.__init__(self, rest_client=rest_client) async def create_from_upload(self, config: ImportConfig) -> tuple[str, str]: """Create a data import and get back a presigned upload URL. @@ -76,31 +71,6 @@ async def create_from_upload(self, config: ImportConfig) -> tuple[str, str]: response = cast("CreateDataImportFromUploadResponse", response) return response.data_import_id, response.upload_url - async def upload_file(self, upload_url: str, file_path: Path) -> None: - """Upload a file to a presigned URL. - - Runs the synchronous HTTP POST in a thread pool to avoid blocking - the event loop. - - Args: - upload_url: The presigned URL to upload to. - file_path: Path to the file to upload. - """ - rest_client = self._rest_client - - def _do_upload() -> None: - with open(file_path, "rb") as f: - response = rest_client.post( - upload_url, - data=f, - headers={ - "Content-Disposition": f'attachment; filename="{file_path.name}"' - }, # Preserve original filename for server-side storage. - ) - response.raise_for_status() - - await run_sync_function(_do_upload) - async def create_from_url(self, url: str, config: ImportConfig) -> str: """Create a data import from a remote URL. diff --git a/python/lib/sift_client/_internal/util/file.py b/python/lib/sift_client/_internal/util/file.py index 518bce847..0e977ced6 100644 --- a/python/lib/sift_client/_internal/util/file.py +++ b/python/lib/sift_client/_internal/util/file.py @@ -14,6 +14,32 @@ from sift_client.transport.rest_transport import RestClient +def upload_file( + signed_url: str, + file_path: Path, + *, + rest_client: RestClient, +) -> None: + """Upload a file to a presigned URL. + + Args: + signed_url: The presigned URL to upload to. 
+ file_path: Path to the file to upload. + rest_client: The SDK rest client to use for the upload. + + Raises: + ValueError: If the upload request fails. + """ + with open(file_path, "rb") as f: + response = rest_client.post( + signed_url, + data=f, + headers={"Content-Disposition": f'attachment; filename="{file_path.name}"'}, + ) + if not response.ok: + raise ValueError(f"Upload failed ({response.status_code}): {response.text}") + + def download_file( signed_url: str, output_path: Path, diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 3ed12e50d..b76091707 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -11,12 +11,14 @@ import sift_client as _sift_client_module from sift_client._internal.low_level_wrappers.data_imports import DataImportsLowLevelClient from sift_client._internal.util.executor import run_sync_function +from sift_client._internal.util.file import upload_file from sift_client.resources._base import ResourceBase from sift_client.sift_types.data_import import ( EXTENSION_TO_DATA_TYPE_KEY, CsvImportConfig, DataImport, DataImportStatus, + DataTypeKey, ) from sift_client.util import cel_utils as cel @@ -43,7 +45,6 @@ def __init__(self, sift_client: SiftClient): super().__init__(sift_client) self._low_level_client = DataImportsLowLevelClient( grpc_client=self.client.grpc_client, - rest_client=self.client.rest_client, ) async def import_from_path( @@ -51,6 +52,7 @@ async def import_from_path( *, file_path: str | Path, config: ImportConfig | None = None, + data_type: DataTypeKey | None = None, asset_name: str | None = None, run_name: str | None = None, run_id: str | None = None, @@ -70,8 +72,11 @@ async def import_from_path( Args: file_path: Path to the local file to import. config: Import configuration describing the file format and column - mapping. 
When provided, ``asset_name``, ``run_name``, and - ``run_id`` are ignored. + mapping. When provided, ``asset_name``, ``run_name``, + ``run_id``, and ``data_type`` are ignored. + data_type: Explicit data type key. Required for formats like + Parquet where the extension alone is ambiguous. Only used + when ``config`` is not provided. asset_name: Name of the asset to import into. Required when ``config`` is not provided. run_name: Optional run name. Only used when ``config`` is not @@ -98,7 +103,7 @@ async def import_from_path( if config is None: if asset_name is None: raise ValueError("Either 'config' or 'asset_name' must be provided.") - detected = await self.detect_config(file_path) + detected = await self.detect_config(file_path, data_type=data_type) config = detected.model_copy( update={ "asset_name": asset_name, @@ -109,7 +114,9 @@ async def import_from_path( data_import_id, upload_url = await self._low_level_client.create_from_upload(config) logger.info("Created data import %s", data_import_id) - await self._low_level_client.upload_file(upload_url, path) + await run_sync_function( + lambda: upload_file(upload_url, path, rest_client=self.client.rest_client) + ) logger.info("Uploaded file to presigned URL for import %s", data_import_id) return await self.wait_until_complete( @@ -218,18 +225,25 @@ async def retry(self, data_import: str | DataImport) -> None: ) await self._low_level_client.retry(data_import_id) - async def detect_config(self, file_path: str | Path) -> ImportConfig: + async def detect_config( + self, + file_path: str | Path, + data_type: DataTypeKey | None = None, + ) -> ImportConfig: """Auto-detect import configuration from a file. Reads a sample of the file, sends it to the server's DetectConfig endpoint, and returns the detected configuration. The file format - is inferred from the file extension. You can inspect and modify the - result before passing it to :meth:`import_from_path`. 
+ is inferred from the file extension when ``data_type`` is not + provided. - Supported extensions: .csv, .parquet, .tdms, .ch10, .ch11, .h5, .hdf5 + For file types with multiple layouts (e.g. Parquet), ``data_type`` + must be specified explicitly. Args: file_path: Path to the file to analyze. + data_type: Explicit data type key. Required for formats like + Parquet where the extension alone is ambiguous. Returns: The detected import config. @@ -243,13 +257,17 @@ async def detect_config(self, file_path: str | Path) -> ImportConfig: if not path.is_file(): raise FileNotFoundError(f"File not found: {file_path}") - ext = path.suffix.lower() - data_type_key = EXTENSION_TO_DATA_TYPE_KEY.get(ext) - if data_type_key is None: - raise ValueError( - f"Unsupported file extension '{ext}'. " - f"Supported: {', '.join(sorted(EXTENSION_TO_DATA_TYPE_KEY))}" - ) + if data_type is not None: + data_type_key = data_type + else: + ext = path.suffix.lower() + data_type_key = EXTENSION_TO_DATA_TYPE_KEY.get(ext) + if data_type_key is None: + raise ValueError( + f"Unsupported file extension '{ext}'. " + f"Supported: {', '.join(sorted(EXTENSION_TO_DATA_TYPE_KEY))}. " + f"For other formats (e.g. Parquet), pass 'data_type' explicitly." 
+ ) def _read_sample() -> bytes: with open(path, "rb") as f: diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index 8f6cc0212..c51d1a7ec 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -11,6 +11,7 @@ DATA_TYPE_KEY_CSV, DATA_TYPE_KEY_HDF5, DATA_TYPE_KEY_PARQUET_FLATDATASET, + DATA_TYPE_KEY_PARQUET_SINGLE_CHANNEL_PER_ROW, DATA_TYPE_KEY_TDMS, ) from sift.data_imports.v2.data_imports_pb2 import CsvConfig as CsvConfigProto @@ -27,11 +28,6 @@ from sift_client.client import SiftClient -# --------------------------------------------------------------------------- -# Enums -# --------------------------------------------------------------------------- - - class TimeFormat(Enum): """Supported time formats for data import columns.""" @@ -62,7 +58,8 @@ class DataTypeKey(Enum): """Supported file types for data import detection.""" CSV = DATA_TYPE_KEY_CSV - PARQUET = DATA_TYPE_KEY_PARQUET_FLATDATASET + PARQUET_FLATDATASET = DATA_TYPE_KEY_PARQUET_FLATDATASET + PARQUET_SINGLE_CHANNEL_PER_ROW = DATA_TYPE_KEY_PARQUET_SINGLE_CHANNEL_PER_ROW TDMS = DATA_TYPE_KEY_TDMS CH10 = DATA_TYPE_KEY_CH10 HDF5 = DATA_TYPE_KEY_HDF5 @@ -70,20 +67,13 @@ class DataTypeKey(Enum): EXTENSION_TO_DATA_TYPE_KEY: dict[str, DataTypeKey] = { ".csv": DataTypeKey.CSV, - ".parquet": DataTypeKey.PARQUET, ".tdms": DataTypeKey.TDMS, ".ch10": DataTypeKey.CH10, - ".ch11": DataTypeKey.CH10, ".h5": DataTypeKey.HDF5, ".hdf5": DataTypeKey.HDF5, } -# --------------------------------------------------------------------------- -# CSV config types -# --------------------------------------------------------------------------- - - class CsvTimeColumn(BaseModel): """Time column configuration for CSV imports. 
@@ -211,11 +201,6 @@ def _from_proto(cls, proto: CsvConfigProto) -> CsvImportConfig: ) -# --------------------------------------------------------------------------- -# DataImport resource type -# --------------------------------------------------------------------------- - - class DataImport(BaseType[DataImportProto, "DataImport"]): """A data import in the Sift system. From 01f5831be866e0a07c61d4860590a6a12a8b4fe3 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Fri, 3 Apr 2026 13:34:47 -0700 Subject: [PATCH 09/52] remove upload_from_url --- .../low_level_wrappers/data_imports.py | 22 +-------- python/lib/sift_client/_internal/util/file.py | 6 ++- .../lib/sift_client/resources/data_imports.py | 47 +------------------ .../lib/sift_client/sift_types/data_import.py | 4 +- 4 files changed, 9 insertions(+), 70 deletions(-) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py index 5cd1b91e5..581407fb2 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py @@ -6,8 +6,6 @@ from sift.data_imports.v2.data_imports_pb2 import ( CreateDataImportFromUploadRequest, CreateDataImportFromUploadResponse, - CreateDataImportFromUrlRequest, - CreateDataImportFromUrlResponse, DetectConfigRequest, DetectConfigResponse, GetDataImportRequest, @@ -32,7 +30,7 @@ def _set_config_on_request( - request: CreateDataImportFromUploadRequest | CreateDataImportFromUrlRequest, + request: CreateDataImportFromUploadRequest, config: ImportConfig, ) -> None: """Set the appropriate config field on a proto request based on the config type.""" @@ -71,24 +69,6 @@ async def create_from_upload(self, config: ImportConfig) -> tuple[str, str]: response = cast("CreateDataImportFromUploadResponse", response) return response.data_import_id, response.upload_url - async def create_from_url(self, url: str, config: 
ImportConfig) -> str: - """Create a data import from a remote URL. - - Args: - url: The URL to import from (HTTP or S3). - config: The import configuration. - - Returns: - The data_import_id. - """ - request = CreateDataImportFromUrlRequest(url=url) - _set_config_on_request(request, config) - response = await self._grpc_client.get_stub(DataImportServiceStub).CreateDataImportFromUrl( - request - ) - response = cast("CreateDataImportFromUrlResponse", response) - return response.data_import_id - async def get(self, data_import_id: str) -> DataImport: """Get a data import by ID. diff --git a/python/lib/sift_client/_internal/util/file.py b/python/lib/sift_client/_internal/util/file.py index 0e977ced6..e39003581 100644 --- a/python/lib/sift_client/_internal/util/file.py +++ b/python/lib/sift_client/_internal/util/file.py @@ -19,7 +19,7 @@ def upload_file( file_path: Path, *, rest_client: RestClient, -) -> None: +) -> dict: """Upload a file to a presigned URL. Args: @@ -27,6 +27,9 @@ def upload_file( file_path: Path to the file to upload. rest_client: The SDK rest client to use for the upload. + Returns: + The parsed JSON response from the server. + Raises: ValueError: If the upload request fails. 
""" @@ -38,6 +41,7 @@ def upload_file( ) if not response.ok: raise ValueError(f"Upload failed ({response.status_code}): {response.text}") + return response.json() def download_file( diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index b76091707..85fa5ff87 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -import logging import time from pathlib import Path from typing import TYPE_CHECKING @@ -26,13 +25,11 @@ from sift_client._internal.low_level_wrappers.data_imports import ImportConfig from sift_client.client import SiftClient -logger = logging.getLogger(__name__) - class DataImportAPIAsync(ResourceBase): """High-level API for importing data into Sift. - Supports importing data from local files or remote URLs. Returns a + Supports importing data from local files. Returns a `DataImport` object that can be polled for status. """ @@ -112,49 +109,11 @@ async def import_from_path( } ) data_import_id, upload_url = await self._low_level_client.create_from_upload(config) - logger.info("Created data import %s", data_import_id) await run_sync_function( lambda: upload_file(upload_url, path, rest_client=self.client.rest_client) ) - logger.info("Uploaded file to presigned URL for import %s", data_import_id) - - return await self.wait_until_complete( - data_import_id, - polling_interval_secs=polling_interval_secs, - timeout_secs=timeout_secs, - show_progress=show_progress, - ) - - async def import_from_url( - self, - *, - url: str, - config: ImportConfig, - polling_interval_secs: int = 5, - timeout_secs: int | None = None, - show_progress: bool | None = None, - ) -> DataImport: - """Import data from a remote URL (HTTP or S3). - - Returns a :class:`DataImport` that can be polled for status via - ``data_import.refresh()``. - - Args: - url: The URL to import from. 
- config: Import configuration describing the file format and column - mapping. - polling_interval_secs: Seconds between status polls. Defaults to 5s. - timeout_secs: Maximum seconds to wait. If None, polls indefinitely. - show_progress: If True, display a progress spinner while waiting - for the import to complete. Defaults to True for sync, False - for async. - - Returns: - A :class:`DataImport` representing the import operation. - """ - data_import_id = await self._low_level_client.create_from_url(url, config) - logger.info("Created URL-based data import %s", data_import_id) + # job_id = response["job_id"] return await self.wait_until_complete( data_import_id, @@ -215,8 +174,6 @@ async def list_( async def retry(self, data_import: str | DataImport) -> None: """Retry a failed data import. - Only works for URL-based imports in a failed state. - Args: data_import: The DataImport or data_import_id to retry. """ diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index c51d1a7ec..b372498b8 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -216,7 +216,6 @@ class DataImport(BaseType[DataImportProto, "DataImport"]): # Optional fields error_message: str | None - source_url: str | None run_id: str | None report_id: str | None asset_id: str | None @@ -239,7 +238,6 @@ def _from_proto( error_message=proto.error_message or None, created_date=proto.created_date.ToDatetime(tzinfo=timezone.utc), modified_date=proto.modified_date.ToDatetime(tzinfo=timezone.utc), - source_url=proto.source_url or None, run_id=proto.run_id if proto.HasField("_run_id") else None, report_id=proto.report_id if proto.HasField("_report_id") else None, asset_id=proto.asset_id if proto.HasField("_asset_id") else None, @@ -293,6 +291,6 @@ def refresh(self) -> DataImport: return self def retry(self) -> None: - """Retry this import. 
Only works for URL-based imports in a failed state.""" + """Retry a failed import.""" self.client.data_import.retry(self._id_or_error) self.refresh() From 269a8b5f075d58fdb7f058156ed7247341406cf2 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Fri, 3 Apr 2026 14:06:45 -0700 Subject: [PATCH 10/52] converted imports to using jobs --- .../low_level_wrappers/data_imports.py | 95 +-------- .../lib/sift_client/resources/data_imports.py | 182 +++--------------- .../resources/sync_stubs/__init__.pyi | 123 +++--------- .../lib/sift_client/sift_types/data_import.py | 111 ----------- 4 files changed, 48 insertions(+), 463 deletions(-) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py index 581407fb2..279c46d38 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py @@ -1,6 +1,5 @@ from __future__ import annotations -import logging from typing import TYPE_CHECKING, cast from sift.data_imports.v2.data_imports_pb2 import ( @@ -8,16 +7,11 @@ CreateDataImportFromUploadResponse, DetectConfigRequest, DetectConfigResponse, - GetDataImportRequest, - GetDataImportResponse, - ListDataImportsRequest, - ListDataImportsResponse, - RetryDataImportRequest, ) from sift.data_imports.v2.data_imports_pb2_grpc import DataImportServiceStub from sift_client._internal.low_level_wrappers.base import LowLevelClientBase -from sift_client.sift_types.data_import import CsvImportConfig, DataImport +from sift_client.sift_types.data_import import CsvImportConfig from sift_client.transport import WithGrpcClient if TYPE_CHECKING: @@ -40,9 +34,6 @@ def _set_config_on_request( raise TypeError(f"Unsupported import config type: {type(config).__name__}") -logger = logging.getLogger(__name__) - - class DataImportsLowLevelClient(LowLevelClientBase, WithGrpcClient): """Low-level client for the DataImportService. 
@@ -69,87 +60,6 @@ async def create_from_upload(self, config: ImportConfig) -> tuple[str, str]: response = cast("CreateDataImportFromUploadResponse", response) return response.data_import_id, response.upload_url - async def get(self, data_import_id: str) -> DataImport: - """Get a data import by ID. - - Args: - data_import_id: The ID of the data import. - - Returns: - The DataImport. - """ - request = GetDataImportRequest(data_import_id=data_import_id) - response = await self._grpc_client.get_stub(DataImportServiceStub).GetDataImport(request) - response = cast("GetDataImportResponse", response) - return DataImport._from_proto(response.data_import) - - async def list_( - self, - *, - page_size: int | None = None, - page_token: str | None = None, - query_filter: str = "", - order_by: str = "", - ) -> tuple[list[DataImport], str]: - """List data imports with optional filtering and pagination. - - Args: - page_size: Maximum number of results per page. - page_token: Token for the next page of results. - query_filter: CEL filter string. - order_by: Ordering string (e.g. "created_date desc"). - - Returns: - A tuple of (list of DataImports, next_page_token). - """ - request = ListDataImportsRequest( - filter=query_filter, - order_by=order_by, - ) - if page_size is not None: - request.page_size = page_size - if page_token: - request.page_token = page_token - - response = await self._grpc_client.get_stub(DataImportServiceStub).ListDataImports(request) - response = cast("ListDataImportsResponse", response) - data_imports = [DataImport._from_proto(di) for di in response.data_imports] - return data_imports, response.next_page_token - - async def list_all( - self, - *, - query_filter: str = "", - order_by: str = "", - max_results: int | None = None, - ) -> list[DataImport]: - """List all data imports, handling pagination automatically. - - Args: - query_filter: CEL filter string. - order_by: Ordering string (e.g. "created_date desc"). 
- max_results: Maximum total results to return. - - Returns: - A list of all matching DataImports. - """ - return await self._handle_pagination( - func=self.list_, - kwargs={"query_filter": query_filter, "order_by": order_by}, - max_results=max_results, - ) - - async def retry(self, data_import_id: str) -> None: - """Retry a failed data import. - - Only works for URL-based imports in a failed state. - - Args: - data_import_id: The ID of the data import to retry. - """ - request = RetryDataImportRequest(data_import_id=data_import_id) - await self._grpc_client.get_stub(DataImportServiceStub).RetryDataImport(request) - async def detect_config( self, data: bytes, data_type_key: DataTypeKey.ValueType ) -> DetectConfigResponse: @@ -160,8 +70,7 @@ async def detect_config( data_type_key: The file type hint. Returns: - The raw DetectConfigResponse proto. The caller (resource API) - is responsible for converting to a sift_type. + The raw DetectConfigResponse proto. """ request = DetectConfigRequest(data=data, type=data_type_key) response = await self._grpc_client.get_stub(DataImportServiceStub).DetectConfig(request) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 85fa5ff87..03f18320f 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -1,13 +1,8 @@ from __future__ import annotations -import asyncio -import time from pathlib import Path from typing import TYPE_CHECKING -from alive_progress import alive_bar # type: ignore[import-untyped] - -import sift_client as _sift_client_module from sift_client._internal.low_level_wrappers.data_imports import DataImportsLowLevelClient from sift_client._internal.util.executor import run_sync_function from sift_client._internal.util.file import upload_file @@ -15,23 +10,17 @@ from sift_client.sift_types.data_import import ( EXTENSION_TO_DATA_TYPE_KEY, CsvImportConfig, - DataImport, - DataImportStatus, 
DataTypeKey, ) -from sift_client.util import cel_utils as cel if TYPE_CHECKING: from sift_client._internal.low_level_wrappers.data_imports import ImportConfig from sift_client.client import SiftClient + from sift_client.sift_types.job import Job class DataImportAPIAsync(ResourceBase): - """High-level API for importing data into Sift. - - Supports importing data from local files. Returns a - `DataImport` object that can be polled for status. - """ + """High-level API for importing data into Sift.""" def __init__(self, sift_client: SiftClient): """Initialize the DataImportAPI. @@ -46,21 +35,19 @@ def __init__(self, sift_client: SiftClient): async def import_from_path( self, - *, file_path: str | Path, + *, config: ImportConfig | None = None, data_type: DataTypeKey | None = None, asset_name: str | None = None, run_name: str | None = None, run_id: str | None = None, - polling_interval_secs: int = 5, - timeout_secs: int | None = None, - show_progress: bool | None = None, - ) -> DataImport: + ) -> Job: """Import data from a local file. - Creates a data import on the server, uploads the file, and waits - for the import to complete. Returns the completed :class:`DataImport`. + Creates a data import on the server, uploads the file, and returns + a :class:`Job` handle. Use ``job.wait_until_complete()`` to poll + for completion. When ``config`` is omitted the file format is auto-detected via :meth:`detect_config` and a :class:`CsvImportConfig` is built using @@ -80,14 +67,9 @@ async def import_from_path( provided. run_id: Optional existing run ID. Only used when ``config`` is not provided. - polling_interval_secs: Seconds between status polls. Defaults to 5s. - timeout_secs: Maximum seconds to wait. If None, polls indefinitely. - show_progress: If True, display a progress spinner while waiting - for the import to complete. Defaults to True for sync, False - for async. Returns: - A :class:`DataImport` representing the import operation. 
+ A :class:`Job` handle for the pending import. Raises: FileNotFoundError: If the file does not exist. @@ -108,79 +90,14 @@ async def import_from_path( "run_id": run_id, } ) - data_import_id, upload_url = await self._low_level_client.create_from_upload(config) + _, upload_url = await self._low_level_client.create_from_upload(config) - await run_sync_function( + response = await run_sync_function( lambda: upload_file(upload_url, path, rest_client=self.client.rest_client) ) - # job_id = response["job_id"] - - return await self.wait_until_complete( - data_import_id, - polling_interval_secs=polling_interval_secs, - timeout_secs=timeout_secs, - show_progress=show_progress, - ) - - async def get(self, data_import_id: str) -> DataImport: - """Get a data import by ID. - - Args: - data_import_id: The ID of the data import. - - Returns: - The DataImport. - """ - data_import = await self._low_level_client.get(data_import_id) - return self._apply_client_to_instance(data_import) - - async def list_( - self, - *, - data_import_ids: list[str] | None = None, - status: DataImportStatus | None = None, - filter_query: str | None = None, - order_by: str | None = None, - limit: int | None = None, - ) -> list[DataImport]: - """List data imports with optional filtering. - - Args: - data_import_ids: Filter to imports with any of these IDs. - status: Filter to imports with this status. - filter_query: Explicit CEL filter string. - order_by: Ordering string (e.g. "created_date desc"). - limit: Maximum number of imports to return. If None, returns all. - - Returns: - A list of DataImport objects matching the filter criteria. 
- """ - filter_parts = [] - if data_import_ids: - filter_parts.append(cel.in_("data_import_id", data_import_ids)) - if status is not None: - filter_parts.append(cel.equals("status", str(status.value))) - if filter_query: - filter_parts.append(filter_query) - query_filter = cel.and_(*filter_parts) - - data_imports = await self._low_level_client.list_all( - query_filter=query_filter or "", - order_by=order_by or "", - max_results=limit, - ) - return self._apply_client_to_instances(data_imports) - - async def retry(self, data_import: str | DataImport) -> None: - """Retry a failed data import. + job_id = response["job_id"] - Args: - data_import: The DataImport or data_import_id to retry. - """ - data_import_id = ( - data_import._id_or_error if isinstance(data_import, DataImport) else data_import - ) - await self._low_level_client.retry(data_import_id) + return await self.client.async_.jobs.get(job_id=job_id) async def detect_config( self, @@ -214,16 +131,22 @@ async def detect_config( if not path.is_file(): raise FileNotFoundError(f"File not found: {file_path}") - if data_type is not None: + ext = path.suffix.lower() + if ext in (".parquet", ".pqt"): + if data_type is None: + raise ValueError( + "Parquet files require 'data_type' to be specified. " + "Use DataTypeKey.PARQUET_FLATDATASET or DataTypeKey.PARQUET_SINGLE_CHANNEL_PER_ROW." + ) + data_type_key = data_type + elif data_type is not None: data_type_key = data_type else: - ext = path.suffix.lower() data_type_key = EXTENSION_TO_DATA_TYPE_KEY.get(ext) if data_type_key is None: raise ValueError( f"Unsupported file extension '{ext}'. " - f"Supported: {', '.join(sorted(EXTENSION_TO_DATA_TYPE_KEY))}. " - f"For other formats (e.g. Parquet), pass 'data_type' explicitly." 
+ f"Supported: {', '.join(sorted(EXTENSION_TO_DATA_TYPE_KEY))}" ) def _read_sample() -> bytes: @@ -248,64 +171,3 @@ def _read_sample() -> bytes: # TODO: Add other file format configs raise ValueError("Server returned an empty DetectConfig response.") - - async def wait_until_complete( - self, - data_import: str | DataImport, - *, - polling_interval_secs: int = 5, - timeout_secs: int | None = None, - show_progress: bool | None = None, - ) -> DataImport: - """Wait until a data import reaches a terminal state. - - Polls the import status at the given interval until the import is - SUCCEEDED or FAILED, returning the completed DataImport. - - Args: - data_import: The DataImport or data_import_id to wait for. - polling_interval_secs: Seconds between status polls. Defaults to 5s. - timeout_secs: Maximum seconds to wait. If None, polls indefinitely. - Defaults to None (indefinite). - show_progress: If True, display an animated progress spinner alongside - the import status while polling. Defaults to True for sync, False - for async. Use ``sift_client.config.show_progress = False`` to disable - globally for sync. - - Returns: - The DataImport in its terminal state. 
- """ - data_import_id = ( - data_import._id_or_error if isinstance(data_import, DataImport) else data_import - ) - if show_progress is None: - global_setting = _sift_client_module.config.show_progress - if global_setting is not None: - show_progress = global_setting - elif getattr(self, "_is_sync", False): - show_progress = True - else: - show_progress = False - - start = time.monotonic() - with alive_bar( - title=f"Data Import ID {data_import_id}: polling", - bar=None, - spinner_length=7, - spinner="dots_waves", - monitor=False, - stats=False, - disable=not show_progress, - ) as bar: - while True: - result = await self.get(data_import_id) - bar.title(f"Data Import ID {data_import_id}: {result.status.name}") - bar() - if result.is_complete: - return result - if timeout_secs is not None and (time.monotonic() - start) >= timeout_secs: - raise TimeoutError( - f"Data import '{data_import_id}' did not complete " - f"within {timeout_secs} seconds." - ) - await asyncio.sleep(polling_interval_secs) diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index e9153aafc..4c38611ad 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -24,7 +24,9 @@ if TYPE_CHECKING: CalculatedChannelUpdate, ) from sift_client.sift_types.channel import Channel - from sift_client.sift_types.data_import import DataImport, DataImportStatus + from sift_client.sift_types.data_import import ( + DataTypeKey, + ) from sift_client.sift_types.export import ExportOutputFormat from sift_client.sift_types.file_attachment import ( FileAttachment, @@ -629,9 +631,6 @@ class DataImportAPI: """Sync counterpart to `DataImportAPIAsync`. High-level API for importing data into Sift. - - Supports importing data from local files or remote URLs. Returns a - `DataImport` object that can be polled for status. 
""" def __init__(self, sift_client: SiftClient): @@ -643,18 +642,23 @@ class DataImportAPI: ... def _run(self, coro): ... - def detect_config(self, file_path: str | Path) -> ImportConfig: + def detect_config( + self, file_path: str | Path, data_type: DataTypeKey | None = None + ) -> ImportConfig: """Auto-detect import configuration from a file. Reads a sample of the file, sends it to the server's DetectConfig endpoint, and returns the detected configuration. The file format - is inferred from the file extension. You can inspect and modify the - result before passing it to :meth:`import_from_path`. + is inferred from the file extension when ``data_type`` is not + provided. - Supported extensions: .csv, .parquet, .tdms, .ch10, .ch11, .h5, .hdf5 + For file types with multiple layouts (e.g. Parquet), ``data_type`` + must be specified explicitly. Args: file_path: Path to the file to analyze. + data_type: Explicit data type key. Required for formats like + Parquet where the extension alone is ambiguous. Returns: The detected import config. @@ -666,31 +670,21 @@ class DataImportAPI: """ ... - def get(self, data_import_id: str) -> DataImport: - """Get a data import by ID. - - Args: - data_import_id: The ID of the data import. - - Returns: - The DataImport. - """ - ... - def import_from_path( self, - *, file_path: str | Path, + *, config: ImportConfig | None = None, + data_type: DataTypeKey | None = None, asset_name: str | None = None, run_name: str | None = None, run_id: str | None = None, - ) -> DataImport: + ) -> Job: """Import data from a local file. - Creates a data import on the server and uploads the file to the - returned presigned URL. Returns a :class:`DataImport` that can be - polled for status via ``data_import.refresh()``. + Creates a data import on the server, uploads the file, and returns + a :class:`Job` handle. Use ``job.wait_until_complete()`` to poll + for completion. 
When ``config`` is omitted the file format is auto-detected via :meth:`detect_config` and a :class:`CsvImportConfig` is built using @@ -699,8 +693,11 @@ class DataImportAPI: Args: file_path: Path to the local file to import. config: Import configuration describing the file format and column - mapping. When provided, ``asset_name``, ``run_name``, and - ``run_id`` are ignored. + mapping. When provided, ``asset_name``, ``run_name``, + ``run_id``, and ``data_type`` are ignored. + data_type: Explicit data type key. Required for formats like + Parquet where the extension alone is ambiguous. Only used + when ``config`` is not provided. asset_name: Name of the asset to import into. Required when ``config`` is not provided. run_name: Optional run name. Only used when ``config`` is not @@ -709,7 +706,7 @@ class DataImportAPI: provided. Returns: - A :class:`DataImport` representing the import operation. + A :class:`Job` handle for the pending import. Raises: FileNotFoundError: If the file does not exist. @@ -717,78 +714,6 @@ class DataImportAPI: """ ... - def import_from_url(self, *, url: str, config: ImportConfig) -> DataImport: - """Import data from a remote URL (HTTP or S3). - - Returns a :class:`DataImport` that can be polled for status via - ``data_import.refresh()``. - - Args: - url: The URL to import from. - config: Import configuration describing the file format and column - mapping. - - Returns: - A :class:`DataImport` representing the import operation. - """ - ... - - def list_( - self, - *, - data_import_ids: list[str] | None = None, - status: DataImportStatus | None = None, - filter_query: str | None = None, - order_by: str | None = None, - limit: int | None = None, - ) -> list[DataImport]: - """List data imports with optional filtering. - - Args: - data_import_ids: Filter to imports with any of these IDs. - status: Filter to imports with this status. - filter_query: Explicit CEL filter string. - order_by: Ordering string (e.g. "created_date desc"). 
- limit: Maximum number of imports to return. If None, returns all. - - Returns: - A list of DataImport objects matching the filter criteria. - """ - ... - - def retry(self, data_import: str | DataImport) -> None: - """Retry a failed data import. - - Only works for URL-based imports in a failed state. - - Args: - data_import: The DataImport or data_import_id to retry. - """ - ... - - def wait_until_complete( - self, - data_import: str | DataImport, - *, - polling_interval_secs: int = 5, - timeout_secs: int | None = None, - ) -> DataImport: - """Wait until a data import reaches a terminal state. - - Polls the import status at the given interval until the import is - SUCCEEDED or FAILED, returning the completed DataImport. - - Args: - data_import: The DataImport or data_import_id to wait for. - polling_interval_secs: Seconds between status polls. Defaults to 5s. - timeout_secs: Maximum seconds to wait. If None, polls indefinitely. - Defaults to None (indefinite). - - Returns: - The DataImport in its terminal state. - """ - ... - class FileAttachmentsAPI: """Sync counterpart to `FileAttachmentsAPIAsync`. 
diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index b372498b8..b238ffffe 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -2,7 +2,6 @@ from datetime import datetime # noqa: TC003 from enum import Enum -from typing import TYPE_CHECKING from pydantic import BaseModel, ConfigDict, model_validator from sift.common.type.v1.channel_config_pb2 import ChannelConfig as ChannelConfigProto @@ -16,17 +15,11 @@ ) from sift.data_imports.v2.data_imports_pb2 import CsvConfig as CsvConfigProto from sift.data_imports.v2.data_imports_pb2 import CsvTimeColumn as CsvTimeColumnProto -from sift.data_imports.v2.data_imports_pb2 import DataImport as DataImportProto -from sift.data_imports.v2.data_imports_pb2 import DataImportStatus as DataImportStatusProto from sift.data_imports.v2.data_imports_pb2 import TimeFormat as TimeFormatProto from sift_client._internal.util.timestamp import to_pb_timestamp -from sift_client.sift_types._base import BaseType from sift_client.sift_types.channel import ChannelDataType -if TYPE_CHECKING: - from sift_client.client import SiftClient - class TimeFormat(Enum): """Supported time formats for data import columns.""" @@ -45,15 +38,6 @@ class TimeFormat(Enum): ABSOLUTE_UNIX_NANOSECONDS = TimeFormatProto.TIME_FORMAT_ABSOLUTE_UNIX_NANOSECONDS -class DataImportStatus(Enum): - """Status of a data import.""" - - PENDING = DataImportStatusProto.DATA_IMPORT_STATUS_PENDING - IN_PROGRESS = DataImportStatusProto.DATA_IMPORT_STATUS_IN_PROGRESS - SUCCEEDED = DataImportStatusProto.DATA_IMPORT_STATUS_SUCCEEDED - FAILED = DataImportStatusProto.DATA_IMPORT_STATUS_FAILED - - class DataTypeKey(Enum): """Supported file types for data import detection.""" @@ -199,98 +183,3 @@ def _from_proto(cls, proto: CsvConfigProto) -> CsvImportConfig: time_column=time_column, data_columns=data_columns, ) - - -class DataImport(BaseType[DataImportProto, 
"DataImport"]): - """A data import in the Sift system. - - Represents the status and metadata of an import operation. Use - ``client.data_import.import_from_path()`` to create one, or - ``client.data_import.get()`` to retrieve an existing import by ID. - """ - - # Required fields - status: DataImportStatus - created_date: datetime - modified_date: datetime - - # Optional fields - error_message: str | None - run_id: str | None - report_id: str | None - asset_id: str | None - data_start_time: datetime | None - data_stop_time: datetime | None - - # Config used for this import - csv_config: CsvImportConfig | None - - @classmethod - def _from_proto( - cls, proto: DataImportProto, sift_client: SiftClient | None = None - ) -> DataImport: - from datetime import timezone - - return cls( - proto=proto, - id_=proto.data_import_id, - status=DataImportStatus(proto.status), - error_message=proto.error_message or None, - created_date=proto.created_date.ToDatetime(tzinfo=timezone.utc), - modified_date=proto.modified_date.ToDatetime(tzinfo=timezone.utc), - run_id=proto.run_id if proto.HasField("_run_id") else None, - report_id=proto.report_id if proto.HasField("_report_id") else None, - asset_id=proto.asset_id if proto.HasField("_asset_id") else None, - data_start_time=( - proto.data_start_time.ToDatetime(tzinfo=timezone.utc) - if proto.HasField("_data_start_time") - else None - ), - data_stop_time=( - proto.data_stop_time.ToDatetime(tzinfo=timezone.utc) - if proto.HasField("_data_stop_time") - else None - ), - csv_config=( - CsvImportConfig._from_proto(proto.csv_config) - if proto.HasField("csv_config") - else None - ), - _client=sift_client, - ) - - @property - def is_pending(self) -> bool: - """Return True if the import is pending.""" - return self.status == DataImportStatus.PENDING - - @property - def is_in_progress(self) -> bool: - """Return True if the import is in progress.""" - return self.status == DataImportStatus.IN_PROGRESS - - @property - def is_succeeded(self) -> 
bool: - """Return True if the import succeeded.""" - return self.status == DataImportStatus.SUCCEEDED - - @property - def is_failed(self) -> bool: - """Return True if the import failed.""" - return self.status == DataImportStatus.FAILED - - @property - def is_complete(self) -> bool: - """Return True if the import reached a terminal state (succeeded or failed).""" - return self.status in (DataImportStatus.SUCCEEDED, DataImportStatus.FAILED) - - def refresh(self) -> DataImport: - """Refresh this import with the latest data from the API.""" - updated = self.client.data_import.get(self._id_or_error) - self._update(updated) - return self - - def retry(self) -> None: - """Retry a failed import.""" - self.client.data_import.retry(self._id_or_error) - self.refresh() From 54014eac08d256988b10e517717051333e90fa21 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Mon, 6 Apr 2026 11:25:09 -0700 Subject: [PATCH 11/52] fix sync/async behavior when polling directly --- .../lib/sift_client/resources/data_imports.py | 33 +++++++++++++++---- .../resources/sync_stubs/__init__.pyi | 14 +++++--- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 03f18320f..779620be2 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import TYPE_CHECKING +import sift_client as _sift_client_module from sift_client._internal.low_level_wrappers.data_imports import DataImportsLowLevelClient from sift_client._internal.util.executor import run_sync_function from sift_client._internal.util.file import upload_file @@ -42,12 +43,14 @@ async def import_from_path( asset_name: str | None = None, run_name: str | None = None, run_id: str | None = None, + polling_interval_secs: int = 5, + timeout_secs: int | None = None, + show_progress: bool | None = None, ) -> Job: """Import data from 
a local file. - Creates a data import on the server, uploads the file, and returns - a :class:`Job` handle. Use ``job.wait_until_complete()`` to poll - for completion. + Creates a data import on the server, uploads the file, and waits + for the import to complete. When ``config`` is omitted the file format is auto-detected via :meth:`detect_config` and a :class:`CsvImportConfig` is built using @@ -67,9 +70,13 @@ async def import_from_path( provided. run_id: Optional existing run ID. Only used when ``config`` is not provided. + polling_interval_secs: Seconds between status polls. Defaults to 5s. + timeout_secs: Maximum seconds to wait. If None, polls indefinitely. + show_progress: If True, display a progress spinner while waiting. + Defaults to True for sync, False for async. Returns: - A :class:`Job` handle for the pending import. + The completed :class:`Job`. Raises: FileNotFoundError: If the file does not exist. @@ -95,9 +102,21 @@ async def import_from_path( response = await run_sync_function( lambda: upload_file(upload_url, path, rest_client=self.client.rest_client) ) - job_id = response["job_id"] - - return await self.client.async_.jobs.get(job_id=job_id) + job_id = response["jobId"] + + if show_progress is None: + global_setting = _sift_client_module.config.show_progress + if global_setting is not None: + show_progress = global_setting + else: + show_progress = getattr(self, "_is_sync", False) + + return await self.client.async_.jobs.wait_until_complete( + job_id, + polling_interval_secs=polling_interval_secs, + timeout_secs=timeout_secs, + show_progress=show_progress, + ) async def detect_config( self, diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index 4c38611ad..a4a85d18e 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -679,12 +679,14 @@ class DataImportAPI: asset_name: str | None = None, 
run_name: str | None = None, run_id: str | None = None, + polling_interval_secs: int = 5, + timeout_secs: int | None = None, + show_progress: bool | None = None, ) -> Job: """Import data from a local file. - Creates a data import on the server, uploads the file, and returns - a :class:`Job` handle. Use ``job.wait_until_complete()`` to poll - for completion. + Creates a data import on the server, uploads the file, and waits + for the import to complete. When ``config`` is omitted the file format is auto-detected via :meth:`detect_config` and a :class:`CsvImportConfig` is built using @@ -704,9 +706,13 @@ class DataImportAPI: provided. run_id: Optional existing run ID. Only used when ``config`` is not provided. + polling_interval_secs: Seconds between status polls. Defaults to 5s. + timeout_secs: Maximum seconds to wait. If None, polls indefinitely. + show_progress: If True, display a progress spinner while waiting. + Defaults to True for sync, False for async. Returns: - A :class:`Job` handle for the pending import. + The completed :class:`Job`. Raises: FileNotFoundError: If the file does not exist. 
From fe5060447eb8a7a027ee5933827511b98548fe6c Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Tue, 7 Apr 2026 13:25:55 -0700 Subject: [PATCH 12/52] add parquet import support --- .../low_level_wrappers/data_imports.py | 14 +- python/lib/sift_client/_internal/util/file.py | 26 ++ .../lib/sift_client/resources/data_imports.py | 139 ++++++-- .../resources/sync_stubs/__init__.pyi | 23 +- .../lib/sift_client/sift_types/data_import.py | 334 ++++++++++++++++++ 5 files changed, 491 insertions(+), 45 deletions(-) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py index 279c46d38..c64521963 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py @@ -11,7 +11,11 @@ from sift.data_imports.v2.data_imports_pb2_grpc import DataImportServiceStub from sift_client._internal.low_level_wrappers.base import LowLevelClientBase -from sift_client.sift_types.data_import import CsvImportConfig +from sift_client.sift_types.data_import import ( + CsvImportConfig, + ParquetFlatDatasetImportConfig, + ParquetSingleChannelPerRowImportConfig, +) from sift_client.transport import WithGrpcClient if TYPE_CHECKING: @@ -20,7 +24,9 @@ from sift_client.transport.grpc_transport import GrpcClient # Union of all supported config types. Extend this as new formats are added. 
-ImportConfig = CsvImportConfig +ImportConfig = ( + CsvImportConfig | ParquetFlatDatasetImportConfig | ParquetSingleChannelPerRowImportConfig +) def _set_config_on_request( @@ -30,6 +36,10 @@ def _set_config_on_request( """Set the appropriate config field on a proto request based on the config type.""" if isinstance(config, CsvImportConfig): request.csv_config.CopyFrom(config._to_proto()) + elif isinstance( + config, (ParquetFlatDatasetImportConfig, ParquetSingleChannelPerRowImportConfig) + ): + request.parquet_config.CopyFrom(config._to_proto()) else: raise TypeError(f"Unsupported import config type: {type(config).__name__}") diff --git a/python/lib/sift_client/_internal/util/file.py b/python/lib/sift_client/_internal/util/file.py index e39003581..4f3a5dead 100644 --- a/python/lib/sift_client/_internal/util/file.py +++ b/python/lib/sift_client/_internal/util/file.py @@ -1,5 +1,7 @@ from __future__ import annotations +import os +import struct import warnings import zipfile from typing import TYPE_CHECKING @@ -112,3 +114,27 @@ def extract_zip(zip_path: Path, output_dir: Path, *, delete_zip: bool = True) -> except OSError: warnings.warn(f"Failed to delete zip file '{zip_path}'", SiftWarning, stacklevel=2) return [output_dir / name for name in names if not name.endswith("/")] + + +def extract_parquet_footer(path: Path) -> tuple[bytes, int]: + """Extract the Parquet footer bytes and compute the footer offset. + + Args: + path: Path to the Parquet file. + + Returns: + A tuple of (footer_bytes, footer_offset). + + Raises: + ValueError: If the file is not a valid Parquet file. 
+ """ + with open(path, "rb") as f: + f.seek(-8, 2) + footer_tail = f.read(8) + footer_len = struct.unpack(" bytes: - with open(path, "rb") as f: - return f.read(65_536) # 64 KiB + is_parquet = data_type_key in ( + DataTypeKey.PARQUET_FLATDATASET, + DataTypeKey.PARQUET_SINGLE_CHANNEL_PER_ROW, + ) + + footer_offset = 0 + footer_length = 0 + + if is_parquet: + footer_bytes, footer_offset = await run_sync_function( + lambda: extract_parquet_footer(path) + ) + sample = footer_bytes + footer_length = len(footer_bytes) + else: - sample = await run_sync_function(_read_sample) + def _read_sample() -> bytes: + with open(path, "rb") as f: + return f.read(65_536) # 64 KiB + + sample = await run_sync_function(_read_sample) response = await self._low_level_client.detect_config(sample, data_type_key.value) if response.HasField("csv_config"): config = CsvImportConfig._from_proto(response.csv_config) - # The server's DetectConfig may include the time column in - # data_columns, but CreateDataImportFromUpload rejects that - # overlap. Filter it out so the config is import-ready. + # Filter out the time column from data_columns to avoid overlap. time_col = config.time_column.column filtered = [dc for dc in config.data_columns if dc.column != time_col] if len(filtered) != len(config.data_columns): config = config.model_copy(update={"data_columns": filtered}) return config - # TODO: Add other file format configs + if response.HasField("parquet_config"): + proto = response.parquet_config + if proto.HasField("flat_dataset"): + config = ParquetFlatDatasetImportConfig._from_proto( + proto, footer_offset=footer_offset, footer_length=footer_length + ) + # Filter out the time column from data_columns to avoid overlap. 
+ time_path = config.time_column.path + if time_path: + filtered = [dc for dc in config.data_columns if dc.path != time_path] + if len(filtered) != len(config.data_columns): + config = config.model_copy(update={"data_columns": filtered}) + else: + # The backend only detects arrow timestamp types. Fall back to + # looking for an integer column whose name contains "time", + # preferring columns that start with "time". + _integer_types = { + ChannelDataType.INT_32, + ChannelDataType.INT_64, + ChannelDataType.UINT_32, + ChannelDataType.UINT_64, + } + match = None + for dc in config.data_columns: + if dc.data_type in _integer_types and dc.name.lower().startswith("time"): + match = dc + break + if match is None: + for dc in config.data_columns: + if dc.data_type in _integer_types and "time" in dc.name.lower(): + match = dc + break + if match is not None: + config = config.model_copy( + update={ + "time_column": ParquetTimeColumn(path=match.path), + "data_columns": [ + c for c in config.data_columns if c.path != match.path + ], + } + ) + return config + elif proto.HasField("single_channel_per_row"): + return ParquetSingleChannelPerRowImportConfig._from_proto( + proto, footer_offset=footer_offset, footer_length=footer_length + ) raise ValueError("Server returned an empty DetectConfig response.") diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index a4a85d18e..97f70bf9e 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -673,10 +673,10 @@ class DataImportAPI: def import_from_path( self, file_path: str | Path, + asset_name: str, *, config: ImportConfig | None = None, data_type: DataTypeKey | None = None, - asset_name: str | None = None, run_name: str | None = None, run_id: str | None = None, polling_interval_secs: int = 5, @@ -689,23 +689,23 @@ class DataImportAPI: for the import to complete. 
When ``config`` is omitted the file format is auto-detected via - :meth:`detect_config` and a :class:`CsvImportConfig` is built using - the provided ``asset_name`` and optional ``run_name`` / ``run_id``. + :meth:`detect_config`. The ``asset_name`` is always applied to + the config. If neither ``run_name`` nor ``run_id`` is provided + (and none is set on the config), ``run_name`` defaults to the + filename. Args: file_path: Path to the local file to import. + asset_name: Name of the asset to import data into. config: Import configuration describing the file format and column - mapping. When provided, ``asset_name``, ``run_name``, - ``run_id``, and ``data_type`` are ignored. + mapping. When provided, ``data_type`` is ignored. data_type: Explicit data type key. Required for formats like Parquet where the extension alone is ambiguous. Only used when ``config`` is not provided. - asset_name: Name of the asset to import into. Required when - ``config`` is not provided. - run_name: Optional run name. Only used when ``config`` is not - provided. - run_id: Optional existing run ID. Only used when ``config`` is not - provided. + run_name: Run name to use. Overrides any value on the config. + Defaults to the filename if neither ``run_name`` nor + ``run_id`` is set. + run_id: Existing run ID to use. Overrides any value on the config. polling_interval_secs: Seconds between status polls. Defaults to 5s. timeout_secs: Maximum seconds to wait. If None, polls indefinitely. show_progress: If True, display a progress spinner while waiting. @@ -716,7 +716,6 @@ class DataImportAPI: Raises: FileNotFoundError: If the file does not exist. - ValueError: If neither ``config`` nor ``asset_name`` is provided. """ ... 
diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index b238ffffe..e50d0b123 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -12,9 +12,28 @@ DATA_TYPE_KEY_PARQUET_FLATDATASET, DATA_TYPE_KEY_PARQUET_SINGLE_CHANNEL_PER_ROW, DATA_TYPE_KEY_TDMS, + PARQUET_COMPLEX_TYPES_IMPORT_MODE_BOTH, + PARQUET_COMPLEX_TYPES_IMPORT_MODE_BYTES, + PARQUET_COMPLEX_TYPES_IMPORT_MODE_IGNORE, + PARQUET_COMPLEX_TYPES_IMPORT_MODE_STRING, ) from sift.data_imports.v2.data_imports_pb2 import CsvConfig as CsvConfigProto from sift.data_imports.v2.data_imports_pb2 import CsvTimeColumn as CsvTimeColumnProto +from sift.data_imports.v2.data_imports_pb2 import ParquetConfig as ParquetConfigProto +from sift.data_imports.v2.data_imports_pb2 import ParquetDataColumn as ParquetDataColumnProto +from sift.data_imports.v2.data_imports_pb2 import ( + ParquetFlatDatasetConfig as ParquetFlatDatasetConfigProto, +) +from sift.data_imports.v2.data_imports_pb2 import ( + ParquetSingleChannelPerRowConfig as ParquetSingleChannelPerRowConfigProto, +) +from sift.data_imports.v2.data_imports_pb2 import ( + ParquetSingleChannelPerRowMultiChannelConfig as ParquetSingleChannelPerRowMultiChannelConfigProto, +) +from sift.data_imports.v2.data_imports_pb2 import ( + ParquetSingleChannelPerRowSingleChannelConfig as ParquetSingleChannelPerRowSingleChannelConfigProto, +) +from sift.data_imports.v2.data_imports_pb2 import ParquetTimeColumn as ParquetTimeColumnProto from sift.data_imports.v2.data_imports_pb2 import TimeFormat as TimeFormatProto from sift_client._internal.util.timestamp import to_pb_timestamp @@ -183,3 +202,318 @@ def _from_proto(cls, proto: CsvConfigProto) -> CsvImportConfig: time_column=time_column, data_columns=data_columns, ) + + +class ParquetComplexTypesImportMode(Enum): + """Controls how complex Parquet types (maps, lists, structs) are imported.""" + + IGNORE = 
PARQUET_COMPLEX_TYPES_IMPORT_MODE_IGNORE + BOTH = PARQUET_COMPLEX_TYPES_IMPORT_MODE_BOTH + STRING = PARQUET_COMPLEX_TYPES_IMPORT_MODE_STRING + BYTES = PARQUET_COMPLEX_TYPES_IMPORT_MODE_BYTES + + +class ParquetTimeColumn(BaseModel): + """Time column configuration for Parquet imports. + + Attributes: + path: The column path in the Parquet schema (e.g. ``"timestamp"``). + format: The time format used in this column. + relative_start_time: Required when using a relative time format. + """ + + model_config = ConfigDict(frozen=True) + + path: str + format: TimeFormat = TimeFormat.ABSOLUTE_UNIX_NANOSECONDS + relative_start_time: datetime | None = None + + def _to_proto(self) -> ParquetTimeColumnProto: + if not self.path: + raise ValueError("ParquetTimeColumn.path must be set before importing.") + proto = ParquetTimeColumnProto( + path=self.path, + format=self.format.value, + ) + if self.relative_start_time is not None: + proto.relative_start_time.CopyFrom(to_pb_timestamp(self.relative_start_time)) + return proto + + @classmethod + def _from_proto(cls, proto: ParquetTimeColumnProto) -> ParquetTimeColumn: + relative_start_time = None + if proto.HasField("relative_start_time"): + from datetime import timezone + + relative_start_time = proto.relative_start_time.ToDatetime(tzinfo=timezone.utc) + + fmt = TimeFormat(proto.format) if proto.format else TimeFormat.ABSOLUTE_UNIX_NANOSECONDS + return cls( + path=proto.path or "", + format=fmt, + relative_start_time=relative_start_time, + ) + + @model_validator(mode="after") + def _check_relative_start_time(self) -> ParquetTimeColumn: + if self.format.name.startswith("RELATIVE_") and self.relative_start_time is None: + raise ValueError( + f"'relative_start_time' is required when using a relative time format ({self.format.name})." + ) + return self + + +class ParquetDataColumn(BaseModel): + """A data column definition for Parquet flat dataset imports. + + Attributes: + path: The column path in the Parquet schema. + name: Channel name. 
+ data_type: The data type of the channel values. + units: Optional units string. + description: Optional channel description. + """ + + model_config = ConfigDict(frozen=True) + + path: str + name: str + data_type: ChannelDataType + units: str = "" + description: str = "" + + +class ParquetFlatDatasetImportConfig(BaseModel): + """Configuration for importing a Parquet file with a flat dataset layout. + + Each column in the file maps to a separate channel. + + Attributes: + asset_name: Name of the asset to import data into. + run_name: Name for the run. Ignored if ``run_id`` is set. + run_id: ID of an existing run to append data to. + time_column: Time column configuration. + data_columns: List of data column definitions. + footer_offset: Byte offset where the Parquet footer begins. Populated + automatically when using :meth:`~DataImportAPIAsync.detect_config`. + footer_length: Length of the Parquet footer in bytes. Populated + automatically when using :meth:`~DataImportAPIAsync.detect_config`. + complex_types_import_mode: How to handle complex Parquet types. 
+ """ + + model_config = ConfigDict(frozen=True) + + asset_name: str + run_name: str | None = None + run_id: str | None = None + time_column: ParquetTimeColumn + data_columns: list[ParquetDataColumn] + footer_offset: int = 0 + footer_length: int = 0 + complex_types_import_mode: ParquetComplexTypesImportMode = ParquetComplexTypesImportMode.IGNORE + + def _to_proto(self) -> ParquetConfigProto: + flat_dataset = ParquetFlatDatasetConfigProto( + time_column=self.time_column._to_proto(), + data_columns=[ + ParquetDataColumnProto( + path=dc.path, + channel_config=ChannelConfigProto( + name=dc.name, + data_type=dc.data_type.value, + units=dc.units, + description=dc.description, + ), + ) + for dc in self.data_columns + ], + ) + return ParquetConfigProto( + asset_name=self.asset_name, + run_name=self.run_name or "", + run_id=self.run_id or "", + flat_dataset=flat_dataset, + footer_offset=self.footer_offset, + footer_length=self.footer_length, + complex_types_import_mode=self.complex_types_import_mode.value, + ) + + @classmethod + def _from_proto( + cls, + proto: ParquetConfigProto, + footer_offset: int = 0, + footer_length: int = 0, + ) -> ParquetFlatDatasetImportConfig: + """Create from a proto ParquetConfig with a flat_dataset config.""" + fd = proto.flat_dataset + time_column = ParquetTimeColumn._from_proto(fd.time_column) + data_columns = [ + ParquetDataColumn( + path=dc.path, + name=dc.channel_config.name, + data_type=ChannelDataType(dc.channel_config.data_type), + units=dc.channel_config.units, + description=dc.channel_config.description, + ) + for dc in fd.data_columns + ] + mode = proto.complex_types_import_mode + return cls( + asset_name=proto.asset_name, + run_name=proto.run_name or None, + run_id=proto.run_id or None, + time_column=time_column, + data_columns=data_columns, + footer_offset=footer_offset or proto.footer_offset, + footer_length=footer_length or proto.footer_length, + complex_types_import_mode=ParquetComplexTypesImportMode(mode) + if mode + else 
ParquetComplexTypesImportMode.IGNORE, + ) + + +class ParquetSingleChannelConfig(BaseModel): + """Configuration for a single-channel Parquet single-channel-per-row import. + + Attributes: + data_path: The column path containing channel data. + name: Channel name. + data_type: The data type of the channel values. + units: Optional units string. + description: Optional channel description. + """ + + model_config = ConfigDict(frozen=True) + + data_path: str + name: str + data_type: ChannelDataType + units: str = "" + description: str = "" + + +class ParquetMultiChannelConfig(BaseModel): + """Configuration for a multi-channel Parquet single-channel-per-row import. + + Attributes: + name_path: The column path that identifies the channel name per row. + data_path: The column path containing channel data. + """ + + model_config = ConfigDict(frozen=True) + + name_path: str + data_path: str + + +class ParquetSingleChannelPerRowImportConfig(BaseModel): + """Configuration for importing a Parquet file where each row represents + a single channel's data point. + + Exactly one of ``single_channel`` or ``multi_channel`` must be set. + + Attributes: + asset_name: Name of the asset to import data into. + run_name: Name for the run. Ignored if ``run_id`` is set. + run_id: ID of an existing run to append data to. + time_column: Time column configuration. + single_channel: Set when the entire file contains data for one channel. + multi_channel: Set when each row identifies its channel via a name column. + footer_offset: Byte offset where the Parquet footer begins. Populated + automatically when using :meth:`~DataImportAPIAsync.detect_config`. + footer_length: Length of the Parquet footer in bytes. Populated + automatically when using :meth:`~DataImportAPIAsync.detect_config`. + complex_types_import_mode: How to handle complex Parquet types. 
+ """ + + model_config = ConfigDict(frozen=True) + + asset_name: str + run_name: str | None = None + run_id: str | None = None + time_column: ParquetTimeColumn + single_channel: ParquetSingleChannelConfig | None = None + multi_channel: ParquetMultiChannelConfig | None = None + footer_offset: int = 0 + footer_length: int = 0 + complex_types_import_mode: ParquetComplexTypesImportMode = ParquetComplexTypesImportMode.IGNORE + + def _to_proto(self) -> ParquetConfigProto: + scpr = ParquetSingleChannelPerRowConfigProto( + time_column=self.time_column._to_proto(), + ) + if self.single_channel is not None: + sc = self.single_channel + scpr.single_channel.CopyFrom( + ParquetSingleChannelPerRowSingleChannelConfigProto( + data_path=sc.data_path, + channel=ChannelConfigProto( + name=sc.name, + data_type=sc.data_type.value, + units=sc.units, + description=sc.description, + ), + ) + ) + elif self.multi_channel is not None: + scpr.multi_channel.CopyFrom( + ParquetSingleChannelPerRowMultiChannelConfigProto( + name_path=self.multi_channel.name_path, + data_path=self.multi_channel.data_path, + ) + ) + return ParquetConfigProto( + asset_name=self.asset_name, + run_name=self.run_name or "", + run_id=self.run_id or "", + single_channel_per_row=scpr, + footer_offset=self.footer_offset, + footer_length=self.footer_length, + complex_types_import_mode=self.complex_types_import_mode.value, + ) + + @classmethod + def _from_proto( + cls, + proto: ParquetConfigProto, + footer_offset: int = 0, + footer_length: int = 0, + ) -> ParquetSingleChannelPerRowImportConfig: + """Create from a proto ParquetConfig with a single_channel_per_row config.""" + scpr = proto.single_channel_per_row + + time_column = ParquetTimeColumn._from_proto(scpr.time_column) + + single_channel = None + multi_channel = None + if scpr.HasField("single_channel"): + sc = scpr.single_channel + single_channel = ParquetSingleChannelConfig( + data_path=sc.data_path, + name=sc.channel.name, + 
data_type=ChannelDataType(sc.channel.data_type), + units=sc.channel.units, + description=sc.channel.description, + ) + elif scpr.HasField("multi_channel"): + mc = scpr.multi_channel + multi_channel = ParquetMultiChannelConfig( + name_path=mc.name_path, + data_path=mc.data_path, + ) + + mode = proto.complex_types_import_mode + return cls( + asset_name=proto.asset_name, + run_name=proto.run_name or None, + run_id=proto.run_id or None, + time_column=time_column, + single_channel=single_channel, + multi_channel=multi_channel, + footer_offset=footer_offset or proto.footer_offset, + footer_length=footer_length or proto.footer_length, + complex_types_import_mode=ParquetComplexTypesImportMode(mode) + if mode + else ParquetComplexTypesImportMode.IGNORE, + ) From f0c186ba316ce2347d273de9ccc7b5bf076da754 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Tue, 7 Apr 2026 13:50:48 -0700 Subject: [PATCH 13/52] unfrozen config model refactor --- .../lib/sift_client/resources/data_imports.py | 43 +++++++------------ .../lib/sift_client/sift_types/data_import.py | 20 +-------- 2 files changed, 16 insertions(+), 47 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 05a83243d..b7ed18140 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -92,14 +92,13 @@ async def import_from_path( if config is None: config = await self.detect_config(file_path, data_type=data_type) - updates: dict = {"asset_name": asset_name} - if run_name is not None: - updates["run_name"] = run_name - elif run_id is not None: - updates["run_id"] = run_id - elif not getattr(config, "run_name", None) and not getattr(config, "run_id", None): - updates["run_name"] = path.name - config = config.model_copy(update=updates) + config.asset_name = asset_name + if run_id is not None: + config.run_id = run_id + elif run_name is not None: + config.run_name = run_name + elif not 
config.run_name and not config.run_id: + config.run_name = path.name if isinstance( config, (ParquetFlatDatasetImportConfig, ParquetSingleChannelPerRowImportConfig) @@ -108,12 +107,8 @@ async def import_from_path( footer_bytes, footer_offset = await run_sync_function( lambda: extract_parquet_footer(path) ) - config = config.model_copy( - update={ - "footer_offset": footer_offset, - "footer_length": len(footer_bytes), - } - ) + config.footer_offset = footer_offset + config.footer_length = len(footer_bytes) _, upload_url = await self._low_level_client.create_from_upload(config) @@ -214,9 +209,7 @@ def _read_sample() -> bytes: config = CsvImportConfig._from_proto(response.csv_config) # Filter out the time column from data_columns to avoid overlap. time_col = config.time_column.column - filtered = [dc for dc in config.data_columns if dc.column != time_col] - if len(filtered) != len(config.data_columns): - config = config.model_copy(update={"data_columns": filtered}) + config.data_columns = [dc for dc in config.data_columns if dc.column != time_col] return config if response.HasField("parquet_config"): @@ -228,9 +221,7 @@ def _read_sample() -> bytes: # Filter out the time column from data_columns to avoid overlap. time_path = config.time_column.path if time_path: - filtered = [dc for dc in config.data_columns if dc.path != time_path] - if len(filtered) != len(config.data_columns): - config = config.model_copy(update={"data_columns": filtered}) + config.data_columns = [dc for dc in config.data_columns if dc.path != time_path] else: # The backend only detects arrow timestamp types. 
Fall back to # looking for an integer column whose name contains "time", @@ -252,14 +243,10 @@ def _read_sample() -> bytes: match = dc break if match is not None: - config = config.model_copy( - update={ - "time_column": ParquetTimeColumn(path=match.path), - "data_columns": [ - c for c in config.data_columns if c.path != match.path - ], - } - ) + config.time_column = ParquetTimeColumn(path=match.path) + config.data_columns = [ + c for c in config.data_columns if c.path != match.path + ] return config elif proto.HasField("single_channel_per_row"): return ParquetSingleChannelPerRowImportConfig._from_proto( diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index e50d0b123..03e8416f9 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -3,7 +3,7 @@ from datetime import datetime # noqa: TC003 from enum import Enum -from pydantic import BaseModel, ConfigDict, model_validator +from pydantic import BaseModel, model_validator from sift.common.type.v1.channel_config_pb2 import ChannelConfig as ChannelConfigProto from sift.data_imports.v2.data_imports_pb2 import ( DATA_TYPE_KEY_CH10, @@ -86,8 +86,6 @@ class CsvTimeColumn(BaseModel): relative_start_time: Required when using a relative time format. """ - model_config = ConfigDict(frozen=True) - column: int format: TimeFormat relative_start_time: datetime | None = None @@ -121,8 +119,6 @@ class CsvDataColumn(BaseModel): description: Optional channel description. """ - model_config = ConfigDict(frozen=True) - column: int name: str data_type: ChannelDataType @@ -142,8 +138,6 @@ class CsvImportConfig(BaseModel): data_columns: List of data column definitions. """ - model_config = ConfigDict(frozen=True) - asset_name: str run_name: str | None = None run_id: str | None = None @@ -222,8 +216,6 @@ class ParquetTimeColumn(BaseModel): relative_start_time: Required when using a relative time format. 
""" - model_config = ConfigDict(frozen=True) - path: str format: TimeFormat = TimeFormat.ABSOLUTE_UNIX_NANOSECONDS relative_start_time: datetime | None = None @@ -274,8 +266,6 @@ class ParquetDataColumn(BaseModel): description: Optional channel description. """ - model_config = ConfigDict(frozen=True) - path: str name: str data_type: ChannelDataType @@ -301,8 +291,6 @@ class ParquetFlatDatasetImportConfig(BaseModel): complex_types_import_mode: How to handle complex Parquet types. """ - model_config = ConfigDict(frozen=True) - asset_name: str run_name: str | None = None run_id: str | None = None @@ -384,8 +372,6 @@ class ParquetSingleChannelConfig(BaseModel): description: Optional channel description. """ - model_config = ConfigDict(frozen=True) - data_path: str name: str data_type: ChannelDataType @@ -401,8 +387,6 @@ class ParquetMultiChannelConfig(BaseModel): data_path: The column path containing channel data. """ - model_config = ConfigDict(frozen=True) - name_path: str data_path: str @@ -427,8 +411,6 @@ class ParquetSingleChannelPerRowImportConfig(BaseModel): complex_types_import_mode: How to handle complex Parquet types. 
""" - model_config = ConfigDict(frozen=True) - asset_name: str run_name: str | None = None run_id: str | None = None From 593f3dc602646c7f9e2aa9f144f07e655dd2e651 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Tue, 7 Apr 2026 13:59:23 -0700 Subject: [PATCH 14/52] mypy fix --- .../lib/sift_client/resources/data_imports.py | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index b7ed18140..d24ac0082 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -174,12 +174,12 @@ async def detect_config( elif data_type is not None: data_type_key = data_type else: - data_type_key = EXTENSION_TO_DATA_TYPE_KEY.get(ext) - if data_type_key is None: + if ext not in EXTENSION_TO_DATA_TYPE_KEY: raise ValueError( f"Unsupported file extension '{ext}'. " f"Supported: {', '.join(sorted(EXTENSION_TO_DATA_TYPE_KEY))}" ) + data_type_key = EXTENSION_TO_DATA_TYPE_KEY[ext] is_parquet = data_type_key in ( DataTypeKey.PARQUET_FLATDATASET, @@ -206,22 +206,26 @@ def _read_sample() -> bytes: response = await self._low_level_client.detect_config(sample, data_type_key.value) if response.HasField("csv_config"): - config = CsvImportConfig._from_proto(response.csv_config) + csv_config = CsvImportConfig._from_proto(response.csv_config) # Filter out the time column from data_columns to avoid overlap. 
- time_col = config.time_column.column - config.data_columns = [dc for dc in config.data_columns if dc.column != time_col] - return config + time_col = csv_config.time_column.column + csv_config.data_columns = [ + dc for dc in csv_config.data_columns if dc.column != time_col + ] + return csv_config if response.HasField("parquet_config"): proto = response.parquet_config if proto.HasField("flat_dataset"): - config = ParquetFlatDatasetImportConfig._from_proto( + parquet_config = ParquetFlatDatasetImportConfig._from_proto( proto, footer_offset=footer_offset, footer_length=footer_length ) # Filter out the time column from data_columns to avoid overlap. - time_path = config.time_column.path + time_path = parquet_config.time_column.path if time_path: - config.data_columns = [dc for dc in config.data_columns if dc.path != time_path] + parquet_config.data_columns = [ + dc for dc in parquet_config.data_columns if dc.path != time_path + ] else: # The backend only detects arrow timestamp types. Fall back to # looking for an integer column whose name contains "time", @@ -233,21 +237,21 @@ def _read_sample() -> bytes: ChannelDataType.UINT_64, } match = None - for dc in config.data_columns: + for dc in parquet_config.data_columns: if dc.data_type in _integer_types and dc.name.lower().startswith("time"): match = dc break if match is None: - for dc in config.data_columns: + for dc in parquet_config.data_columns: if dc.data_type in _integer_types and "time" in dc.name.lower(): match = dc break if match is not None: - config.time_column = ParquetTimeColumn(path=match.path) - config.data_columns = [ - c for c in config.data_columns if c.path != match.path + parquet_config.time_column = ParquetTimeColumn(path=match.path) + parquet_config.data_columns = [ + c for c in parquet_config.data_columns if c.path != match.path ] - return config + return parquet_config elif proto.HasField("single_channel_per_row"): return ParquetSingleChannelPerRowImportConfig._from_proto( proto, 
footer_offset=footer_offset, footer_length=footer_length From 8725f594639e2076d834629226a30f4c2ebf12ef Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Tue, 7 Apr 2026 14:14:29 -0700 Subject: [PATCH 15/52] small refactor moving configs to sift_types --- .../_internal/low_level_wrappers/data_imports.py | 6 +----- python/lib/sift_client/resources/data_imports.py | 2 +- python/lib/sift_client/sift_types/data_import.py | 9 +++++++++ 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py index c64521963..e7afb9a76 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py @@ -13,6 +13,7 @@ from sift_client._internal.low_level_wrappers.base import LowLevelClientBase from sift_client.sift_types.data_import import ( CsvImportConfig, + ImportConfig, ParquetFlatDatasetImportConfig, ParquetSingleChannelPerRowImportConfig, ) @@ -23,11 +24,6 @@ from sift_client.transport.grpc_transport import GrpcClient -# Union of all supported config types. Extend this as new formats are added. 
-ImportConfig = ( - CsvImportConfig | ParquetFlatDatasetImportConfig | ParquetSingleChannelPerRowImportConfig -) - def _set_config_on_request( request: CreateDataImportFromUploadRequest, diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index d24ac0082..e6ae7eb21 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -13,13 +13,13 @@ EXTENSION_TO_DATA_TYPE_KEY, CsvImportConfig, DataTypeKey, + ImportConfig, ParquetFlatDatasetImportConfig, ParquetSingleChannelPerRowImportConfig, ParquetTimeColumn, ) if TYPE_CHECKING: - from sift_client._internal.low_level_wrappers.data_imports import ImportConfig from sift_client.client import SiftClient from sift_client.sift_types.job import Job diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index 03e8416f9..a0027df7a 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -2,6 +2,7 @@ from datetime import datetime # noqa: TC003 from enum import Enum +from typing import Union from pydantic import BaseModel, model_validator from sift.common.type.v1.channel_config_pb2 import ChannelConfig as ChannelConfigProto @@ -499,3 +500,11 @@ def _from_proto( if mode else ParquetComplexTypesImportMode.IGNORE, ) + + +# Note: Using Union instead of | syntax for Python 3.9 compatibility at module level. +# While `from __future__ import annotations` allows | in type hints (they're strings), +# module-level type aliases are evaluated at runtime and require Union in Python <3.10. 
+ImportConfig = Union[ + CsvImportConfig, ParquetFlatDatasetImportConfig, ParquetSingleChannelPerRowImportConfig +] From fd925eb6692c0d6dac215f20a6d5682b270c5f0d Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Tue, 7 Apr 2026 14:37:14 -0700 Subject: [PATCH 16/52] add a helper function to get a specific data_column --- .../lib/sift_client/sift_types/data_import.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index a0027df7a..2b241330d 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -146,6 +146,23 @@ class CsvImportConfig(BaseModel): time_column: CsvTimeColumn data_columns: list[CsvDataColumn] + def get_column(self, name: str) -> CsvDataColumn: + """Look up a data column by name. + + Args: + name: The channel name to search for. + + Returns: + The matching data column. + + Raises: + KeyError: If no column with the given name exists. + """ + for dc in self.data_columns: + if dc.name == name: + return dc + raise KeyError(f"No data column named '{name}'") + def _to_proto(self) -> CsvConfigProto: return CsvConfigProto( asset_name=self.asset_name, @@ -301,6 +318,23 @@ class ParquetFlatDatasetImportConfig(BaseModel): footer_length: int = 0 complex_types_import_mode: ParquetComplexTypesImportMode = ParquetComplexTypesImportMode.IGNORE + def get_column(self, name: str) -> ParquetDataColumn: + """Look up a data column by name. + + Args: + name: The channel name to search for. + + Returns: + The matching data column. + + Raises: + KeyError: If no column with the given name exists. 
+ """ + for dc in self.data_columns: + if dc.name == name: + return dc + raise KeyError(f"No data column named '{name}'") + def _to_proto(self) -> ParquetConfigProto: flat_dataset = ParquetFlatDatasetConfigProto( time_column=self.time_column._to_proto(), From 85e87be40d613b4c488d1328d80506b4d3e822c4 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Tue, 7 Apr 2026 14:43:55 -0700 Subject: [PATCH 17/52] add unit tests --- .../_tests/resources/test_data_imports.py | 192 ++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 python/lib/sift_client/_tests/resources/test_data_imports.py diff --git a/python/lib/sift_client/_tests/resources/test_data_imports.py b/python/lib/sift_client/_tests/resources/test_data_imports.py new file mode 100644 index 000000000..9c9185dc8 --- /dev/null +++ b/python/lib/sift_client/_tests/resources/test_data_imports.py @@ -0,0 +1,192 @@ +"""Unit tests for data import config models and helpers.""" + +from datetime import datetime, timezone + +import pytest + +from sift_client.sift_types.channel import ChannelDataType +from sift_client.sift_types.data_import import ( + EXTENSION_TO_DATA_TYPE_KEY, + CsvDataColumn, + CsvImportConfig, + CsvTimeColumn, + DataTypeKey, + ParquetDataColumn, + ParquetFlatDatasetImportConfig, + ParquetTimeColumn, + TimeFormat, +) + + +@pytest.fixture +def csv_config(): + return CsvImportConfig( + asset_name="test_asset", + run_name="test_run", + time_column=CsvTimeColumn( + column=1, + format=TimeFormat.ABSOLUTE_RFC3339, + ), + data_columns=[ + CsvDataColumn(column=2, name="cpu_util", data_type=ChannelDataType.DOUBLE), + CsvDataColumn(column=3, name="status_flags", data_type=ChannelDataType.INT_32), + CsvDataColumn(column=4, name="temperature", data_type=ChannelDataType.FLOAT), + ], + ) + + +@pytest.fixture +def parquet_config(): + return ParquetFlatDatasetImportConfig( + asset_name="test_asset", + run_name="test_run", + time_column=ParquetTimeColumn(path="timestamp"), + data_columns=[ + 
ParquetDataColumn(path="cpu_util", name="cpu_util", data_type=ChannelDataType.DOUBLE), + ParquetDataColumn( + path="status_flags", name="status_flags", data_type=ChannelDataType.INT_32 + ), + ParquetDataColumn( + path="temperature", name="temperature", data_type=ChannelDataType.FLOAT + ), + ], + ) + + +class TestCsvConfigMutability: + def test_mutate_asset_name(self, csv_config): + csv_config.asset_name = "new_asset" + assert csv_config.asset_name == "new_asset" + + def test_mutate_run_name(self, csv_config): + csv_config.run_name = "new_run" + assert csv_config.run_name == "new_run" + + def test_mutate_column_data_type(self, csv_config): + csv_config.data_columns[1].data_type = ChannelDataType.STRING + assert csv_config.data_columns[1].data_type == ChannelDataType.STRING + + def test_mutate_column_name(self, csv_config): + csv_config.data_columns[0].name = "cpu_utilization" + assert csv_config.data_columns[0].name == "cpu_utilization" + + def test_append_column(self, csv_config): + csv_config.data_columns.append( + CsvDataColumn(column=5, name="pressure", data_type=ChannelDataType.DOUBLE) + ) + assert len(csv_config.data_columns) == 4 + assert csv_config.data_columns[-1].name == "pressure" + + def test_remove_column(self, csv_config): + csv_config.data_columns = [ + dc for dc in csv_config.data_columns if dc.name != "status_flags" + ] + assert len(csv_config.data_columns) == 2 + assert all(dc.name != "status_flags" for dc in csv_config.data_columns) + + +class TestParquetConfigMutability: + def test_mutate_asset_name(self, parquet_config): + parquet_config.asset_name = "new_asset" + assert parquet_config.asset_name == "new_asset" + + def test_mutate_column_data_type(self, parquet_config): + parquet_config.data_columns[1].data_type = ChannelDataType.STRING + assert parquet_config.data_columns[1].data_type == ChannelDataType.STRING + + def test_append_column(self, parquet_config): + parquet_config.data_columns.append( + ParquetDataColumn(path="pressure", 
name="pressure", data_type=ChannelDataType.DOUBLE) + ) + assert len(parquet_config.data_columns) == 4 + + +class TestGetColumn: + def test_csv_get_column(self, csv_config): + col = csv_config.get_column("cpu_util") + assert col.name == "cpu_util" + assert col.data_type == ChannelDataType.DOUBLE + + def test_csv_get_column_not_found(self, csv_config): + with pytest.raises(KeyError, match="nonexistent"): + csv_config.get_column("nonexistent") + + def test_csv_get_column_mutate(self, csv_config): + csv_config.get_column("status_flags").data_type = ChannelDataType.STRING + assert csv_config.data_columns[1].data_type == ChannelDataType.STRING + + def test_parquet_get_column(self, parquet_config): + col = parquet_config.get_column("temperature") + assert col.name == "temperature" + assert col.data_type == ChannelDataType.FLOAT + + def test_parquet_get_column_not_found(self, parquet_config): + with pytest.raises(KeyError, match="nonexistent"): + parquet_config.get_column("nonexistent") + + def test_parquet_get_column_mutate(self, parquet_config): + parquet_config.get_column("cpu_util").name = "cpu_utilization" + assert parquet_config.data_columns[0].name == "cpu_utilization" + + +class TestTimeColumnValidation: + def test_csv_relative_time_requires_start_time(self): + with pytest.raises(ValueError, match="relative_start_time"): + CsvTimeColumn( + column=1, + format=TimeFormat.RELATIVE_NANOSECONDS, + ) + + def test_csv_relative_time_with_start_time(self): + col = CsvTimeColumn( + column=1, + format=TimeFormat.RELATIVE_NANOSECONDS, + relative_start_time=datetime(2026, 1, 1, tzinfo=timezone.utc), + ) + assert col.relative_start_time is not None + + def test_parquet_relative_time_requires_start_time(self): + with pytest.raises(ValueError, match="relative_start_time"): + ParquetTimeColumn( + path="timestamp", + format=TimeFormat.RELATIVE_SECONDS, + ) + + def test_parquet_relative_time_with_start_time(self): + col = ParquetTimeColumn( + path="timestamp", + 
format=TimeFormat.RELATIVE_SECONDS, + relative_start_time=datetime(2026, 1, 1, tzinfo=timezone.utc), + ) + assert col.relative_start_time is not None + + def test_absolute_time_does_not_require_start_time(self): + col = CsvTimeColumn(column=1, format=TimeFormat.ABSOLUTE_RFC3339) + assert col.relative_start_time is None + + +class TestDataTypeKey: + def test_csv_extension(self): + assert EXTENSION_TO_DATA_TYPE_KEY[".csv"] == DataTypeKey.CSV + + def test_parquet_not_in_extension_map(self): + assert ".parquet" not in EXTENSION_TO_DATA_TYPE_KEY + + def test_hdf5_extensions(self): + assert EXTENSION_TO_DATA_TYPE_KEY[".h5"] == DataTypeKey.HDF5 + assert EXTENSION_TO_DATA_TYPE_KEY[".hdf5"] == DataTypeKey.HDF5 + + +class TestRunPrecedence: + def test_run_id_ignored_when_none(self, csv_config): + csv_config.run_id = None + csv_config.run_name = "my_run" + proto = csv_config._to_proto() + assert proto.run_name == "my_run" + assert proto.run_id == "" + + def test_run_id_set(self, csv_config): + csv_config.run_id = "run_123" + csv_config.run_name = "ignored" + proto = csv_config._to_proto() + assert proto.run_id == "run_123" From d669cf238017c42108dd30d6cfb3a5401fd058df Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Tue, 7 Apr 2026 15:23:28 -0700 Subject: [PATCH 18/52] added client side validation for detect_config --- .../lib/sift_client/resources/data_imports.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index e6ae7eb21..852a2924f 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -212,6 +212,8 @@ def _read_sample() -> bytes: csv_config.data_columns = [ dc for dc in csv_config.data_columns if dc.column != time_col ] + if not csv_config.data_columns: + raise ValueError(f"No data columns detected in '{path.name}'.") return csv_config if 
response.HasField("parquet_config"): @@ -228,8 +230,7 @@ def _read_sample() -> bytes: ] else: # The backend only detects arrow timestamp types. Fall back to - # looking for an integer column whose name contains "time", - # preferring columns that start with "time". + # an integer column whose name starts with "time". _integer_types = { ChannelDataType.INT_32, ChannelDataType.INT_64, @@ -241,16 +242,18 @@ def _read_sample() -> bytes: if dc.data_type in _integer_types and dc.name.lower().startswith("time"): match = dc break - if match is None: - for dc in parquet_config.data_columns: - if dc.data_type in _integer_types and "time" in dc.name.lower(): - match = dc - break if match is not None: parquet_config.time_column = ParquetTimeColumn(path=match.path) parquet_config.data_columns = [ c for c in parquet_config.data_columns if c.path != match.path ] + if not parquet_config.time_column.path: + raise ValueError( + f"No time column detected in '{path.name}'. " + "Set the time column manually on the config before importing." 
+ ) + if not parquet_config.data_columns: + raise ValueError(f"No data columns detected in '{path.name}'.") return parquet_config elif proto.HasField("single_channel_per_row"): return ParquetSingleChannelPerRowImportConfig._from_proto( From bac1b525b93aeb100db9fe6bd795828a489bdbef Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Tue, 7 Apr 2026 15:28:11 -0700 Subject: [PATCH 19/52] add validation tests --- .../_tests/resources/test_data_imports.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/python/lib/sift_client/_tests/resources/test_data_imports.py b/python/lib/sift_client/_tests/resources/test_data_imports.py index 9c9185dc8..5d1995272 100644 --- a/python/lib/sift_client/_tests/resources/test_data_imports.py +++ b/python/lib/sift_client/_tests/resources/test_data_imports.py @@ -177,6 +177,67 @@ def test_hdf5_extensions(self): assert EXTENSION_TO_DATA_TYPE_KEY[".hdf5"] == DataTypeKey.HDF5 +class TestDetectConfigValidation: + """Tests for validation checks applied after detect_config.""" + + def test_csv_no_data_columns_raises(self): + """If all columns are filtered out, detect_config should raise.""" + config = CsvImportConfig( + asset_name="", + time_column=CsvTimeColumn(column=1, format=TimeFormat.ABSOLUTE_RFC3339), + data_columns=[], + ) + assert not config.data_columns + + def test_parquet_empty_time_column_path(self): + """An empty time column path indicates detection failed.""" + config = ParquetFlatDatasetImportConfig( + asset_name="", + time_column=ParquetTimeColumn(path=""), + data_columns=[ + ParquetDataColumn( + path="cpu_util", name="cpu_util", data_type=ChannelDataType.DOUBLE + ), + ], + ) + assert not config.time_column.path + + def test_parquet_no_data_columns(self): + """A config with no data columns indicates detection found nothing useful.""" + config = ParquetFlatDatasetImportConfig( + asset_name="", + time_column=ParquetTimeColumn(path="timestamp"), + data_columns=[], + ) + assert not config.data_columns + + def 
test_parquet_integer_time_column_fallback(self): + """An integer column starting with 'time' should be usable as the time column.""" + config = ParquetFlatDatasetImportConfig( + asset_name="", + time_column=ParquetTimeColumn(path=""), + data_columns=[ + ParquetDataColumn(path="time_ns", name="time_ns", data_type=ChannelDataType.INT_64), + ParquetDataColumn( + path="cpu_util", name="cpu_util", data_type=ChannelDataType.DOUBLE + ), + ], + ) + _integer_types = { + ChannelDataType.INT_32, + ChannelDataType.INT_64, + ChannelDataType.UINT_32, + ChannelDataType.UINT_64, + } + match = None + for dc in config.data_columns: + if dc.data_type in _integer_types and dc.name.lower().startswith("time"): + match = dc + break + assert match is not None + assert match.path == "time_ns" + + class TestRunPrecedence: def test_run_id_ignored_when_none(self, csv_config): csv_config.run_id = None From 09f48e9c36163c3ca97485959343412c325e3fbb Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Tue, 7 Apr 2026 16:50:42 -0700 Subject: [PATCH 20/52] update asset_name handling --- .../lib/sift_client/resources/data_imports.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 852a2924f..4fbd26aa8 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -41,8 +41,8 @@ def __init__(self, sift_client: SiftClient): async def import_from_path( self, file_path: str | Path, - asset_name: str, *, + asset_name: str | None = None, config: ImportConfig | None = None, data_type: DataTypeKey | None = None, run_name: str | None = None, @@ -57,14 +57,16 @@ async def import_from_path( for the import to complete. When ``config`` is omitted the file format is auto-detected via - :meth:`detect_config`. The ``asset_name`` is always applied to - the config.
If neither ``run_name`` nor ``run_id`` is provided + :meth:`detect_config`. When ``asset_name`` is provided it overrides + the config value; otherwise the config's ``asset_name`` is used. + If neither ``run_name`` nor ``run_id`` is provided (and none is set on the config), ``run_name`` defaults to the filename. Args: file_path: Path to the local file to import. - asset_name: Name of the asset to import data into. + asset_name: Name of the asset to import data into. Optional + when ``config`` already has ``asset_name`` set. config: Import configuration describing the file format and column mapping. When provided, ``data_type`` is ignored. data_type: Explicit data type key. Required for formats like @@ -92,7 +94,10 @@ async def import_from_path( if config is None: config = await self.detect_config(file_path, data_type=data_type) - config.asset_name = asset_name + if asset_name is not None: + config.asset_name = asset_name + elif not config.asset_name: + raise ValueError("'asset_name' is required when not set on the config.") if run_id is not None: config.run_id = run_id elif run_name is not None: @@ -199,7 +204,7 @@ async def detect_config( def _read_sample() -> bytes: with open(path, "rb") as f: - return f.read(65_536) # 64 KiB + return f.read(1048576) # 1MiB sample = await run_sync_function(_read_sample) From 3b1e28f9c209e16b8833daf5a7f543fc2bd3d524 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Tue, 7 Apr 2026 17:08:40 -0700 Subject: [PATCH 21/52] update sync stubs --- .../sift_client/resources/sync_stubs/__init__.pyi | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index 97f70bf9e..d93810c68 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -13,9 +13,6 @@ if TYPE_CHECKING: import pandas as pd import pyarrow as pa - from 
sift_client._internal.low_level_wrappers.data_imports import ( - ImportConfig, - ) from sift_client.client import SiftClient from sift_client.sift_types.asset import Asset, AssetUpdate from sift_client.sift_types.calculated_channel import ( @@ -26,6 +23,7 @@ if TYPE_CHECKING: from sift_client.sift_types.channel import Channel from sift_client.sift_types.data_import import ( DataTypeKey, + ImportConfig, ) from sift_client.sift_types.export import ExportOutputFormat from sift_client.sift_types.file_attachment import ( @@ -673,8 +671,8 @@ class DataImportAPI: def import_from_path( self, file_path: str | Path, - asset_name: str, *, + asset_name: str | None = None, config: ImportConfig | None = None, data_type: DataTypeKey | None = None, run_name: str | None = None, @@ -689,14 +687,16 @@ class DataImportAPI: for the import to complete. When ``config`` is omitted the file format is auto-detected via - :meth:`detect_config`. The ``asset_name`` is always applied to - the config. If neither ``run_name`` nor ``run_id`` is provided + :meth:`detect_config`. When ``asset_name`` is provided it overrides + the config value; otherwise the config's ``asset_name`` is used. + If neither ``run_name`` nor ``run_id`` is provided (and none is set on the config), ``run_name`` defaults to the filename. Args: file_path: Path to the local file to import. - asset_name: Name of the asset to import data into. + asset_name: Name of the asset to import data into. Optional + when ``config`` already has ``asset_name`` set. config: Import configuration describing the file format and column mapping. When provided, ``data_type`` is ignored. data_type: Explicit data type key. 
Required for formats like From 1ae1e95f8e15c100ad0ed2760b81092ac8082204 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Wed, 8 Apr 2026 10:23:31 -0700 Subject: [PATCH 22/52] add ch10, hdf5, and tdms configs --- .../low_level_wrappers/data_imports.py | 9 ++ .../lib/sift_client/resources/data_imports.py | 16 +- .../resources/sync_stubs/__init__.pyi | 9 +- .../lib/sift_client/sift_types/data_import.py | 149 +++++++++++++++++- 4 files changed, 179 insertions(+), 4 deletions(-) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py index e7afb9a76..b88b9cfab 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py @@ -12,10 +12,13 @@ from sift_client._internal.low_level_wrappers.base import LowLevelClientBase from sift_client.sift_types.data_import import ( + Ch10ImportConfig, CsvImportConfig, + Hdf5ImportConfig, ImportConfig, ParquetFlatDatasetImportConfig, ParquetSingleChannelPerRowImportConfig, + TdmsImportConfig, ) from sift_client.transport import WithGrpcClient @@ -36,6 +39,12 @@ def _set_config_on_request( config, (ParquetFlatDatasetImportConfig, ParquetSingleChannelPerRowImportConfig) ): request.parquet_config.CopyFrom(config._to_proto()) + elif isinstance(config, Ch10ImportConfig): + request.ch10_config.CopyFrom(config._to_proto()) + elif isinstance(config, TdmsImportConfig): + request.tdms_config.CopyFrom(config._to_proto()) + elif isinstance(config, Hdf5ImportConfig): + request.hdf5_config.CopyFrom(config._to_proto()) else: raise TypeError(f"Unsupported import config type: {type(config).__name__}") diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 4fbd26aa8..444d11a01 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -57,7 +57,9 @@ 
async def import_from_path( for the import to complete. When ``config`` is omitted the file format is auto-detected via - :meth:`detect_config`. When ``asset_name`` is provided it overrides + :meth:`detect_config` (CSV and Parquet only). For other formats + (TDMS, HDF5, CH10), ``config`` must be provided. + When ``asset_name`` is provided it overrides the config value; otherwise the config's ``asset_name`` is used. If neither ``run_name`` nor ``run_id`` is provided (and none is set on the config), ``run_name`` defaults to the @@ -98,11 +100,16 @@ async def import_from_path( config.asset_name = asset_name elif not config.asset_name: raise ValueError("'asset_name' is required when not set on the config.") + has_run_id = hasattr(config, "run_id") if run_id is not None: + if not has_run_id: + raise ValueError( + f"'run_id' is not supported for {type(config).__name__}. Use 'run_name' instead." + ) config.run_id = run_id elif run_name is not None: config.run_name = run_name - elif not config.run_name and not config.run_id: + elif not config.run_name and not getattr(config, "run_id", None): config.run_name = path.name if isinstance( @@ -148,6 +155,11 @@ async def detect_config( is inferred from the file extension when ``data_type`` is not provided. + Only CSV and Parquet files are currently supported for auto-detection. + For other formats (TDMS, HDF5, CH10), create the config manually + using :class:`TdmsImportConfig`, :class:`Hdf5ImportConfig`, or + :class:`Ch10ImportConfig`. + For file types with multiple layouts (e.g. Parquet), ``data_type`` must be specified explicitly. diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index d93810c68..0fc09dfb5 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -650,6 +650,11 @@ class DataImportAPI: is inferred from the file extension when ``data_type`` is not provided. 
+ Only CSV and Parquet files are currently supported for auto-detection. + For other formats (TDMS, HDF5, CH10), create the config manually + using :class:`TdmsImportConfig`, :class:`Hdf5ImportConfig`, or + :class:`Ch10ImportConfig`. + For file types with multiple layouts (e.g. Parquet), ``data_type`` must be specified explicitly. @@ -687,7 +692,9 @@ class DataImportAPI: for the import to complete. When ``config`` is omitted the file format is auto-detected via - :meth:`detect_config`. When ``asset_name`` is provided it overrides + :meth:`detect_config` (CSV and Parquet only). For other formats + (TDMS, HDF5, CH10), ``config`` must be provided. + When ``asset_name`` is provided it overrides the config value; otherwise the config's ``asset_name`` is used. If neither ``run_name`` nor ``run_id`` is provided (and none is set on the config), ``run_name`` defaults to the diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index 2b241330d..d46b5cdc8 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -18,8 +18,11 @@ PARQUET_COMPLEX_TYPES_IMPORT_MODE_IGNORE, PARQUET_COMPLEX_TYPES_IMPORT_MODE_STRING, ) +from sift.data_imports.v2.data_imports_pb2 import Ch10Config as Ch10ConfigProto from sift.data_imports.v2.data_imports_pb2 import CsvConfig as CsvConfigProto from sift.data_imports.v2.data_imports_pb2 import CsvTimeColumn as CsvTimeColumnProto +from sift.data_imports.v2.data_imports_pb2 import Hdf5Config as Hdf5ConfigProto +from sift.data_imports.v2.data_imports_pb2 import Hdf5DataConfig as Hdf5DataConfigProto from sift.data_imports.v2.data_imports_pb2 import ParquetConfig as ParquetConfigProto from sift.data_imports.v2.data_imports_pb2 import ParquetDataColumn as ParquetDataColumnProto from sift.data_imports.v2.data_imports_pb2 import ( @@ -35,6 +38,7 @@ ParquetSingleChannelPerRowSingleChannelConfig as 
ParquetSingleChannelPerRowSingleChannelConfigProto, ) from sift.data_imports.v2.data_imports_pb2 import ParquetTimeColumn as ParquetTimeColumnProto +from sift.data_imports.v2.data_imports_pb2 import TDMSConfig as TDMSConfigProto from sift.data_imports.v2.data_imports_pb2 import TimeFormat as TimeFormatProto from sift_client._internal.util.timestamp import to_pb_timestamp @@ -536,9 +540,152 @@ def _from_proto( ) +class Ch10ImportConfig(BaseModel): + """Configuration for importing a CH10 file. + + Attributes: + asset_name: Name of the asset to import data into. + run_name: Name for the run. + scale_values: Whether to apply EU (engineering unit) scaling to channel values. + """ + + asset_name: str + run_name: str | None = None + scale_values: bool = False + + def _to_proto(self) -> Ch10ConfigProto: + return Ch10ConfigProto( + asset_name=self.asset_name, + run_name=self.run_name or "", + scale_values=self.scale_values, + ) + + +class TdmsImportConfig(BaseModel): + """Configuration for importing a TDMS file. + + Attributes: + asset_name: Name of the asset to import data into. + run_name: Name for the run. Ignored if ``run_id`` is set. + run_id: ID of an existing run to append data to. + start_time_override: Override the ``wf_start_time`` metadata field for all channels. + Useful when waveform channels have ``wf_increment`` but no ``wf_start_time``. + file_size: The file size in bytes. Required if the file has truncated chunks. 
+ """ + + asset_name: str + run_name: str | None = None + run_id: str | None = None + start_time_override: datetime | None = None + file_size: int | None = None + + def _to_proto(self) -> TDMSConfigProto: + proto = TDMSConfigProto( + asset_name=self.asset_name, + run_name=self.run_name or "", + run_id=self.run_id or "", + ) + if self.start_time_override is not None: + proto.start_time_override.CopyFrom(to_pb_timestamp(self.start_time_override)) + if self.file_size is not None: + proto.file_size = self.file_size + return proto + + +class Hdf5DataColumn(BaseModel): + """A dataset mapping for HDF5 imports. + + Each entry maps a time/value dataset pair to a channel. + + Attributes: + time_dataset: HDF5 path to the time dataset. + time_index: Column index within the time dataset. Defaults to 0. + value_dataset: HDF5 path to the value dataset. + value_index: Column index within the value dataset. Defaults to 0. + name: Channel name. + data_type: The data type of the channel values. + units: Optional units string. + description: Optional channel description. + time_field: For compound dataset types, the field name to use for time. + value_field: For compound dataset types, the field name to use for value. + """ + + time_dataset: str + time_index: int = 0 + value_dataset: str + value_index: int = 0 + name: str + data_type: ChannelDataType + units: str = "" + description: str = "" + time_field: str | None = None + value_field: str | None = None + + +class Hdf5ImportConfig(BaseModel): + """Configuration for importing an HDF5 file. + + Attributes: + asset_name: Name of the asset to import data into. + run_name: Name for the run. Ignored if ``run_id`` is set. + run_id: ID of an existing run to append data to. + data: List of dataset mappings, each pairing a time and value dataset to a channel. + time_format: The time format used across all time datasets. + relative_start_time: Required when using a relative time format. 
+ """ + + asset_name: str + run_name: str | None = None + run_id: str | None = None + data: list[Hdf5DataColumn] + time_format: TimeFormat + relative_start_time: datetime | None = None + + @model_validator(mode="after") + def _check_relative_start_time(self) -> Hdf5ImportConfig: + if self.time_format.name.startswith("RELATIVE_") and self.relative_start_time is None: + raise ValueError( + f"'relative_start_time' is required when using a relative time format ({self.time_format.name})." + ) + return self + + def _to_proto(self) -> Hdf5ConfigProto: + proto = Hdf5ConfigProto( + asset_name=self.asset_name, + run_name=self.run_name or "", + run_id=self.run_id or "", + time_format=self.time_format.value, + data=[ + Hdf5DataConfigProto( + time_dataset=d.time_dataset, + time_index=d.time_index, + value_dataset=d.value_dataset, + value_index=d.value_index, + channel_config=ChannelConfigProto( + name=d.name, + data_type=d.data_type.value, + units=d.units, + description=d.description, + ), + time_field=d.time_field, + value_field=d.value_field, + ) + for d in self.data + ], + ) + if self.relative_start_time is not None: + proto.relative_start_time.CopyFrom(to_pb_timestamp(self.relative_start_time)) + return proto + + # Note: Using Union instead of | syntax for Python 3.9 compatibility at module level. # While `from __future__ import annotations` allows | in type hints (they're strings), # module-level type aliases are evaluated at runtime and require Union in Python <3.10. 
ImportConfig = Union[ - CsvImportConfig, ParquetFlatDatasetImportConfig, ParquetSingleChannelPerRowImportConfig + CsvImportConfig, + ParquetFlatDatasetImportConfig, + ParquetSingleChannelPerRowImportConfig, + Ch10ImportConfig, + TdmsImportConfig, + Hdf5ImportConfig, ] From 3ee74a4fa38bafccc43ea3d0af21c8e5e149f26b Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Wed, 8 Apr 2026 10:31:24 -0700 Subject: [PATCH 23/52] mypy fix --- python/lib/sift_client/resources/data_imports.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 444d11a01..03fbce1a3 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -11,6 +11,7 @@ from sift_client.sift_types.channel import ChannelDataType from sift_client.sift_types.data_import import ( EXTENSION_TO_DATA_TYPE_KEY, + Ch10ImportConfig, CsvImportConfig, DataTypeKey, ImportConfig, @@ -100,16 +101,15 @@ async def import_from_path( config.asset_name = asset_name elif not config.asset_name: raise ValueError("'asset_name' is required when not set on the config.") - has_run_id = hasattr(config, "run_id") if run_id is not None: - if not has_run_id: + if isinstance(config, Ch10ImportConfig): raise ValueError( - f"'run_id' is not supported for {type(config).__name__}. Use 'run_name' instead." + "'run_id' is not supported for Ch10ImportConfig. Use 'run_name' instead." 
) config.run_id = run_id elif run_name is not None: config.run_name = run_name - elif not config.run_name and not getattr(config, "run_id", None): + elif not config.run_name and (isinstance(config, Ch10ImportConfig) or not config.run_id): config.run_name = path.name if isinstance( From ffdb06f375fd8b1fc37d41249a7f2400f8a840d4 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Wed, 8 Apr 2026 10:39:34 -0700 Subject: [PATCH 24/52] additional file format tests --- .../_tests/resources/test_data_imports.py | 176 ++++++++++++++++++ 1 file changed, 176 insertions(+) diff --git a/python/lib/sift_client/_tests/resources/test_data_imports.py b/python/lib/sift_client/_tests/resources/test_data_imports.py index 5d1995272..05bc09f88 100644 --- a/python/lib/sift_client/_tests/resources/test_data_imports.py +++ b/python/lib/sift_client/_tests/resources/test_data_imports.py @@ -7,13 +7,17 @@ from sift_client.sift_types.channel import ChannelDataType from sift_client.sift_types.data_import import ( EXTENSION_TO_DATA_TYPE_KEY, + Ch10ImportConfig, CsvDataColumn, CsvImportConfig, CsvTimeColumn, DataTypeKey, + Hdf5DataColumn, + Hdf5ImportConfig, ParquetDataColumn, ParquetFlatDatasetImportConfig, ParquetTimeColumn, + TdmsImportConfig, TimeFormat, ) @@ -251,3 +255,175 @@ def test_run_id_set(self, csv_config): csv_config.run_name = "ignored" proto = csv_config._to_proto() assert proto.run_id == "run_123" + + +class TestCh10Config: + def test_to_proto(self): + config = Ch10ImportConfig(asset_name="my_asset", run_name="run1", scale_values=True) + proto = config._to_proto() + assert proto.asset_name == "my_asset" + assert proto.run_name == "run1" + assert proto.scale_values is True + + def test_to_proto_defaults(self): + config = Ch10ImportConfig(asset_name="my_asset") + proto = config._to_proto() + assert proto.run_name == "" + assert proto.scale_values is False + + def test_no_run_id_field(self): + config = Ch10ImportConfig(asset_name="my_asset") + assert not hasattr(config, "run_id") + + 
+class TestTdmsConfig: + def test_to_proto(self): + config = TdmsImportConfig( + asset_name="my_asset", + run_name="run1", + run_id="run_123", + start_time_override=datetime(2026, 1, 1, tzinfo=timezone.utc), + file_size=12345, + ) + proto = config._to_proto() + assert proto.asset_name == "my_asset" + assert proto.run_id == "run_123" + assert proto.file_size == 12345 + assert proto.HasField("start_time_override") + + def test_to_proto_optional_fields_unset(self): + config = TdmsImportConfig(asset_name="my_asset", run_name="run1") + proto = config._to_proto() + assert proto.run_name == "run1" + assert proto.run_id == "" + assert not proto.HasField("start_time_override") + assert proto.file_size == 0 + + def test_run_id_takes_precedence(self): + config = TdmsImportConfig(asset_name="a", run_name="ignored", run_id="run_123") + proto = config._to_proto() + assert proto.run_id == "run_123" + + +class TestHdf5Config: + def test_to_proto(self): + config = Hdf5ImportConfig( + asset_name="my_asset", + run_name="run1", + time_format=TimeFormat.ABSOLUTE_UNIX_NANOSECONDS, + data=[ + Hdf5DataColumn( + time_dataset="/time", + value_dataset="/voltage", + name="voltage", + data_type=ChannelDataType.DOUBLE, + units="V", + description="Voltage reading", + ), + ], + ) + proto = config._to_proto() + assert proto.asset_name == "my_asset" + assert len(proto.data) == 1 + assert proto.data[0].time_dataset == "/time" + assert proto.data[0].value_dataset == "/voltage" + assert proto.data[0].channel_config.name == "voltage" + assert proto.data[0].channel_config.units == "V" + assert proto.data[0].channel_config.description == "Voltage reading" + + def test_to_proto_compound_fields(self): + config = Hdf5ImportConfig( + asset_name="my_asset", + time_format=TimeFormat.ABSOLUTE_UNIX_NANOSECONDS, + data=[ + Hdf5DataColumn( + time_dataset="/data", + value_dataset="/data", + name="current", + data_type=ChannelDataType.FLOAT, + time_field="ts", + value_field="val", + ), + ], + ) + proto = 
config._to_proto() + assert proto.data[0].time_field == "ts" + assert proto.data[0].value_field == "val" + + def test_to_proto_compound_fields_unset(self): + config = Hdf5ImportConfig( + asset_name="my_asset", + time_format=TimeFormat.ABSOLUTE_UNIX_NANOSECONDS, + data=[ + Hdf5DataColumn( + time_dataset="/time", + value_dataset="/voltage", + name="voltage", + data_type=ChannelDataType.DOUBLE, + ), + ], + ) + proto = config._to_proto() + assert not proto.data[0].HasField("time_field") + assert not proto.data[0].HasField("value_field") + + def test_to_proto_multiple_datasets(self): + config = Hdf5ImportConfig( + asset_name="my_asset", + time_format=TimeFormat.ABSOLUTE_UNIX_NANOSECONDS, + data=[ + Hdf5DataColumn( + time_dataset="/time", + value_dataset="/voltage", + name="voltage", + data_type=ChannelDataType.DOUBLE, + ), + Hdf5DataColumn( + time_dataset="/time", + value_dataset="/current", + value_index=1, + name="current", + data_type=ChannelDataType.FLOAT, + ), + ], + ) + proto = config._to_proto() + assert len(proto.data) == 2 + assert proto.data[1].value_dataset == "/current" + assert proto.data[1].value_index == 1 + + def test_relative_time_requires_start_time(self): + with pytest.raises(ValueError, match="relative_start_time"): + Hdf5ImportConfig( + asset_name="my_asset", + time_format=TimeFormat.RELATIVE_SECONDS, + data=[], + ) + + def test_relative_time_with_start_time(self): + config = Hdf5ImportConfig( + asset_name="my_asset", + time_format=TimeFormat.RELATIVE_SECONDS, + relative_start_time=datetime(2026, 1, 1, tzinfo=timezone.utc), + data=[], + ) + proto = config._to_proto() + assert proto.HasField("relative_start_time") + + def test_absolute_time_no_start_time_required(self): + config = Hdf5ImportConfig( + asset_name="my_asset", + time_format=TimeFormat.ABSOLUTE_UNIX_NANOSECONDS, + data=[], + ) + assert config.relative_start_time is None + proto = config._to_proto() + assert not proto.HasField("relative_start_time") + + +class TestExtensionMap: + def 
test_tdms_extension(self): + assert EXTENSION_TO_DATA_TYPE_KEY[".tdms"] == DataTypeKey.TDMS + + def test_ch10_extension(self): + assert EXTENSION_TO_DATA_TYPE_KEY[".ch10"] == DataTypeKey.CH10 From b0559b0f30beeadde13db9d6cb9be03b6901b434 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Wed, 8 Apr 2026 11:18:40 -0700 Subject: [PATCH 25/52] added documentation for csv json metadata --- python/lib/sift_client/resources/data_imports.py | 8 ++++++++ python/lib/sift_client/resources/sync_stubs/__init__.pyi | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 03fbce1a3..14b211282 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -160,6 +160,14 @@ async def detect_config( using :class:`TdmsImportConfig`, :class:`Hdf5ImportConfig`, or :class:`Ch10ImportConfig`. + For CSV files, the server can parse an optional JSON metadata row + that auto-populates channel names, units, descriptions, data types, + and enum definitions. Each cell in the row is a JSON object + describing that column. When present, ``first_data_row`` in the + returned config will be set to the row after the metadata row. + Note that enum type definitions are applied server-side during + import but are not included in the returned config. + For file types with multiple layouts (e.g. Parquet), ``data_type`` must be specified explicitly. diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index 0fc09dfb5..814c72c83 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -655,6 +655,14 @@ class DataImportAPI: using :class:`TdmsImportConfig`, :class:`Hdf5ImportConfig`, or :class:`Ch10ImportConfig`. 
+ For CSV files, the server can parse an optional JSON metadata row + that auto-populates channel names, units, descriptions, data types, + and enum definitions. Each cell in the row is a JSON object + describing that column. When present, ``first_data_row`` in the + returned config will be set to the row after the metadata row. + Note that enum type definitions are applied server-side during + import but are not included in the returned config. + For file types with multiple layouts (e.g. Parquet), ``data_type`` must be specified explicitly. From d66d8e08eca2b21be2b6b8cb2efd058859969adc Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Wed, 8 Apr 2026 14:00:46 -0700 Subject: [PATCH 26/52] updated docs and split import and polling --- .../lib/sift_client/resources/data_imports.py | 35 +++++-------------- .../resources/sync_stubs/__init__.pyi | 20 ++++------- .../lib/sift_client/sift_types/data_import.py | 8 ++--- 3 files changed, 19 insertions(+), 44 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 14b211282..7873e5dd8 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -3,7 +3,6 @@ from pathlib import Path from typing import TYPE_CHECKING -import sift_client as _sift_client_module from sift_client._internal.low_level_wrappers.data_imports import DataImportsLowLevelClient from sift_client._internal.util.executor import run_sync_function from sift_client._internal.util.file import extract_parquet_footer, upload_file @@ -48,17 +47,15 @@ async def import_from_path( data_type: DataTypeKey | None = None, run_name: str | None = None, run_id: str | None = None, - polling_interval_secs: int = 5, - timeout_secs: int | None = None, - show_progress: bool | None = None, ) -> Job: """Import data from a local file. - Creates a data import on the server, uploads the file, and waits - for the import to complete. 
+ Creates a data import on the server, uploads the file, and returns + a ``Job`` handle. Use ``job.wait_until_complete()`` to poll for + completion if needed. When ``config`` is omitted the file format is auto-detected via - :meth:`detect_config` (CSV and Parquet only). For other formats + ``detect_config`` (CSV and Parquet only). For other formats (TDMS, HDF5, CH10), ``config`` must be provided. When ``asset_name`` is provided it overrides the config value; otherwise the config's ``asset_name`` is used. @@ -79,13 +76,9 @@ async def import_from_path( Defaults to the filename if neither ``run_name`` nor ``run_id`` is set. run_id: Existing run ID to use. Overrides any value on the config. - polling_interval_secs: Seconds between status polls. Defaults to 5s. - timeout_secs: Maximum seconds to wait. If None, polls indefinitely. - show_progress: If True, display a progress spinner while waiting. - Defaults to True for sync, False for async. Returns: - The completed :class:`Job`. + A ``Job`` handle for the pending import. Raises: FileNotFoundError: If the file does not exist. @@ -129,19 +122,7 @@ async def import_from_path( ) job_id = response["jobId"] - if show_progress is None: - global_setting = _sift_client_module.config.show_progress - if global_setting is not None: - show_progress = global_setting - else: - show_progress = getattr(self, "_is_sync", False) - - return await self.client.async_.jobs.wait_until_complete( - job_id, - polling_interval_secs=polling_interval_secs, - timeout_secs=timeout_secs, - show_progress=show_progress, - ) + return await self.client.async_.jobs.get(job_id=job_id) async def detect_config( self, @@ -157,8 +138,8 @@ async def detect_config( Only CSV and Parquet files are currently supported for auto-detection. For other formats (TDMS, HDF5, CH10), create the config manually - using :class:`TdmsImportConfig`, :class:`Hdf5ImportConfig`, or - :class:`Ch10ImportConfig`. 
+ using ``TdmsImportConfig``, ``Hdf5ImportConfig``, or + ``Ch10ImportConfig``. For CSV files, the server can parse an optional JSON metadata row that auto-populates channel names, units, descriptions, data types, diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index 814c72c83..6d19a903d 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -652,8 +652,8 @@ class DataImportAPI: Only CSV and Parquet files are currently supported for auto-detection. For other formats (TDMS, HDF5, CH10), create the config manually - using :class:`TdmsImportConfig`, :class:`Hdf5ImportConfig`, or - :class:`Ch10ImportConfig`. + using ``TdmsImportConfig``, ``Hdf5ImportConfig``, or + ``Ch10ImportConfig``. For CSV files, the server can parse an optional JSON metadata row that auto-populates channel names, units, descriptions, data types, @@ -690,17 +690,15 @@ class DataImportAPI: data_type: DataTypeKey | None = None, run_name: str | None = None, run_id: str | None = None, - polling_interval_secs: int = 5, - timeout_secs: int | None = None, - show_progress: bool | None = None, ) -> Job: """Import data from a local file. - Creates a data import on the server, uploads the file, and waits - for the import to complete. + Creates a data import on the server, uploads the file, and returns + a ``Job`` handle. Use ``job.wait_until_complete()`` to poll for + completion if needed. When ``config`` is omitted the file format is auto-detected via - :meth:`detect_config` (CSV and Parquet only). For other formats + ``detect_config`` (CSV and Parquet only). For other formats (TDMS, HDF5, CH10), ``config`` must be provided. When ``asset_name`` is provided it overrides the config value; otherwise the config's ``asset_name`` is used. @@ -721,13 +719,9 @@ class DataImportAPI: Defaults to the filename if neither ``run_name`` nor ``run_id`` is set. 
run_id: Existing run ID to use. Overrides any value on the config. - polling_interval_secs: Seconds between status polls. Defaults to 5s. - timeout_secs: Maximum seconds to wait. If None, polls indefinitely. - show_progress: If True, display a progress spinner while waiting. - Defaults to True for sync, False for async. Returns: - The completed :class:`Job`. + A ``Job`` handle for the pending import. Raises: FileNotFoundError: If the file does not exist. diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index d46b5cdc8..9c5f60931 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -307,9 +307,9 @@ class ParquetFlatDatasetImportConfig(BaseModel): time_column: Time column configuration. data_columns: List of data column definitions. footer_offset: Byte offset where the Parquet footer begins. Populated - automatically when using :meth:`~DataImportAPIAsync.detect_config`. + automatically when using ``detect_config``. footer_length: Length of the Parquet footer in bytes. Populated - automatically when using :meth:`~DataImportAPIAsync.detect_config`. + automatically when using ``detect_config``. complex_types_import_mode: How to handle complex Parquet types. """ @@ -444,9 +444,9 @@ class ParquetSingleChannelPerRowImportConfig(BaseModel): single_channel: Set when the entire file contains data for one channel. multi_channel: Set when each row identifies its channel via a name column. footer_offset: Byte offset where the Parquet footer begins. Populated - automatically when using :meth:`~DataImportAPIAsync.detect_config`. + automatically when using ``detect_config``. footer_length: Length of the Parquet footer in bytes. Populated - automatically when using :meth:`~DataImportAPIAsync.detect_config`. + automatically when using ``detect_config``. complex_types_import_mode: How to handle complex Parquet types. 
""" From e4ae07d92f9f23e9f5590474c8c8ce3f28b0271e Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Wed, 8 Apr 2026 15:00:54 -0700 Subject: [PATCH 27/52] add upload_file polling and refactor global show_progress to a util --- python/lib/sift_client/_internal/util/file.py | 42 +++++++++++++++---- .../lib/sift_client/resources/data_imports.py | 25 +++++++++-- python/lib/sift_client/resources/jobs.py | 19 ++------- .../resources/sync_stubs/__init__.pyi | 9 +++- 4 files changed, 64 insertions(+), 31 deletions(-) diff --git a/python/lib/sift_client/_internal/util/file.py b/python/lib/sift_client/_internal/util/file.py index 4f3a5dead..1d93f44c6 100644 --- a/python/lib/sift_client/_internal/util/file.py +++ b/python/lib/sift_client/_internal/util/file.py @@ -8,6 +8,7 @@ from alive_progress import alive_bar # type: ignore[import-untyped] +import sift_client as _sift_client_module from sift_client.errors import SiftWarning if TYPE_CHECKING: @@ -16,11 +17,24 @@ from sift_client.transport.rest_transport import RestClient +def resolve_show_progress(*, is_sync: bool) -> bool: + """Resolve the show_progress setting from the global config. + + Returns the global ``sift_client.config.show_progress`` value when set, + otherwise defaults to ``is_sync``. + """ + global_setting = _sift_client_module.config.show_progress + if global_setting is not None: + return global_setting + return is_sync + + def upload_file( signed_url: str, file_path: Path, *, rest_client: RestClient, + show_progress: bool = False, ) -> dict: """Upload a file to a presigned URL. @@ -28,6 +42,7 @@ def upload_file( signed_url: The presigned URL to upload to. file_path: Path to the file to upload. rest_client: The SDK rest client to use for the upload. + show_progress: If True, display a progress spinner during upload. Returns: The parsed JSON response from the server. @@ -35,15 +50,24 @@ def upload_file( Raises: ValueError: If the upload request fails. 
""" - with open(file_path, "rb") as f: - response = rest_client.post( - signed_url, - data=f, - headers={"Content-Disposition": f'attachment; filename="{file_path.name}"'}, - ) - if not response.ok: - raise ValueError(f"Upload failed ({response.status_code}): {response.text}") - return response.json() + with alive_bar( + title=f"Upload [{file_path.name}]", + bar=None, + spinner="dots_waves", + spinner_length=7, + monitor=False, + stats=False, + disable=not show_progress, + ): + with open(file_path, "rb") as f: + response = rest_client.post( + signed_url, + data=f, + headers={"Content-Disposition": f'attachment; filename="{file_path.name}"'}, + ) + if not response.ok: + raise ValueError(f"Upload failed ({response.status_code}): {response.text}") + return response.json() def download_file( diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 7873e5dd8..73977670f 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -5,7 +5,11 @@ from sift_client._internal.low_level_wrappers.data_imports import DataImportsLowLevelClient from sift_client._internal.util.executor import run_sync_function -from sift_client._internal.util.file import extract_parquet_footer, upload_file +from sift_client._internal.util.file import ( + extract_parquet_footer, + resolve_show_progress, + upload_file, +) from sift_client.resources._base import ResourceBase from sift_client.sift_types.channel import ChannelDataType from sift_client.sift_types.data_import import ( @@ -47,12 +51,15 @@ async def import_from_path( data_type: DataTypeKey | None = None, run_name: str | None = None, run_id: str | None = None, + show_progress: bool | None = None, ) -> Job: """Import data from a local file. Creates a data import on the server, uploads the file, and returns - a ``Job`` handle. Use ``job.wait_until_complete()`` to poll for - completion if needed. 
+ a ``Job`` handle after uploading the file. The import processes + server-side and typically completes shortly after upload. Use + ``job.wait_until_complete()`` only if you need to confirm + completion before proceeding. When ``config`` is omitted the file format is auto-detected via ``detect_config`` (CSV and Parquet only). For other formats @@ -76,6 +83,8 @@ async def import_from_path( Defaults to the filename if neither ``run_name`` nor ``run_id`` is set. run_id: Existing run ID to use. Overrides any value on the config. + show_progress: If True, display a progress spinner during upload. + Defaults to True for sync, False for async. Returns: A ``Job`` handle for the pending import. @@ -115,10 +124,18 @@ async def import_from_path( config.footer_offset = footer_offset config.footer_length = len(footer_bytes) + if show_progress is None: + show_progress = resolve_show_progress(is_sync=getattr(self, "_is_sync", False)) + _, upload_url = await self._low_level_client.create_from_upload(config) response = await run_sync_function( - lambda: upload_file(upload_url, path, rest_client=self.client.rest_client) + lambda: upload_file( + upload_url, + path, + rest_client=self.client.rest_client, + show_progress=show_progress, + ) ) job_id = response["jobId"] diff --git a/python/lib/sift_client/resources/jobs.py b/python/lib/sift_client/resources/jobs.py index 6ddaec6ca..5e2bbdf9b 100644 --- a/python/lib/sift_client/resources/jobs.py +++ b/python/lib/sift_client/resources/jobs.py @@ -9,10 +9,9 @@ from alive_progress import alive_bar # type: ignore[import-untyped] -import sift_client as _sift_client_module from sift_client._internal.low_level_wrappers.jobs import JobsLowLevelClient from sift_client._internal.util.executor import run_sync_function -from sift_client._internal.util.file import download_file, extract_zip +from sift_client._internal.util.file import download_file, extract_zip, resolve_show_progress from sift_client.resources._base import ResourceBase from 
sift_client.sift_types.job import DataExportStatusDetails, Job, JobStatus, JobType from sift_client.util import cel_utils as cel @@ -194,13 +193,7 @@ async def wait_until_complete( """ job_id = job._id_or_error if isinstance(job, Job) else job if show_progress is None: - global_setting = _sift_client_module.config.show_progress - if global_setting is not None: - show_progress = global_setting - elif getattr(self, "_is_sync", False): - show_progress = True - else: - show_progress = False + show_progress = resolve_show_progress(is_sync=getattr(self, "_is_sync", False)) start = time.monotonic() with alive_bar( @@ -263,13 +256,7 @@ async def wait_and_download( """ job_id = job._id_or_error if isinstance(job, Job) else job if show_progress is None: - global_setting = _sift_client_module.config.show_progress - if global_setting is not None: - show_progress = global_setting - elif getattr(self, "_is_sync", False): - show_progress = True - else: - show_progress = False + show_progress = resolve_show_progress(is_sync=getattr(self, "_is_sync", False)) completed_job = await self.wait_until_complete( job=job_id, diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index 6d19a903d..a94e782d5 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -690,12 +690,15 @@ class DataImportAPI: data_type: DataTypeKey | None = None, run_name: str | None = None, run_id: str | None = None, + show_progress: bool | None = None, ) -> Job: """Import data from a local file. Creates a data import on the server, uploads the file, and returns - a ``Job`` handle. Use ``job.wait_until_complete()`` to poll for - completion if needed. + a ``Job`` handle after uploading the file. The import processes + server-side and typically completes shortly after upload. 
Use + ``job.wait_until_complete()`` only if you need to confirm + completion before proceeding. When ``config`` is omitted the file format is auto-detected via ``detect_config`` (CSV and Parquet only). For other formats @@ -719,6 +722,8 @@ class DataImportAPI: Defaults to the filename if neither ``run_name`` nor ``run_id`` is set. run_id: Existing run ID to use. Overrides any value on the config. + show_progress: If True, display a progress spinner during upload. + Defaults to True for sync, False for async. Returns: A ``Job`` handle for the pending import. From 107eaa20e26713b2df7a480dbad42aadaff997d1 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Wed, 8 Apr 2026 15:13:09 -0700 Subject: [PATCH 28/52] error handling from missing job_id from upload --- python/lib/sift_client/resources/data_imports.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 73977670f..5c759d891 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -137,7 +137,9 @@ async def import_from_path( show_progress=show_progress, ) ) - job_id = response["jobId"] + job_id = response.get("jobId") + if not job_id: + raise ValueError("Upload succeeded but server response did not include a job ID.") return await self.client.async_.jobs.get(job_id=job_id) From 9fe594cac936f8205d927b98a08bf68c7d61b59a Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Wed, 8 Apr 2026 15:25:29 -0700 Subject: [PATCH 29/52] refactor to use run/asset objects --- .../lib/sift_client/resources/data_imports.py | 39 ++++++++++--------- .../resources/sync_stubs/__init__.pyi | 23 ++++++----- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 5c759d891..56b1eea57 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ 
b/python/lib/sift_client/resources/data_imports.py @@ -11,6 +11,7 @@ upload_file, ) from sift_client.resources._base import ResourceBase +from sift_client.sift_types.asset import Asset from sift_client.sift_types.channel import ChannelDataType from sift_client.sift_types.data_import import ( EXTENSION_TO_DATA_TYPE_KEY, @@ -22,6 +23,7 @@ ParquetSingleChannelPerRowImportConfig, ParquetTimeColumn, ) +from sift_client.sift_types.run import Run if TYPE_CHECKING: from sift_client.client import SiftClient @@ -46,11 +48,11 @@ async def import_from_path( self, file_path: str | Path, *, - asset_name: str | None = None, + asset: Asset | str | None = None, config: ImportConfig | None = None, data_type: DataTypeKey | None = None, + run: Run | str | None = None, run_name: str | None = None, - run_id: str | None = None, show_progress: bool | None = None, ) -> Job: """Import data from a local file. @@ -64,25 +66,24 @@ async def import_from_path( When ``config`` is omitted the file format is auto-detected via ``detect_config`` (CSV and Parquet only). For other formats (TDMS, HDF5, CH10), ``config`` must be provided. - When ``asset_name`` is provided it overrides - the config value; otherwise the config's ``asset_name`` is used. - If neither ``run_name`` nor ``run_id`` is provided - (and none is set on the config), ``run_name`` defaults to the - filename. + When ``asset`` is provided it overrides the config value; + otherwise the config's ``asset_name`` is used. + If neither ``run`` nor ``run_name`` is provided (and none is + set on the config), ``run_name`` defaults to the filename. Args: file_path: Path to the local file to import. - asset_name: Name of the asset to import data into. Optional + asset: Asset object or asset name to import data into. Optional when ``config`` already has ``asset_name`` set. config: Import configuration describing the file format and column mapping. When provided, ``data_type`` is ignored. data_type: Explicit data type key. 
Required for formats like Parquet where the extension alone is ambiguous. Only used when ``config`` is not provided. - run_name: Run name to use. Overrides any value on the config. - Defaults to the filename if neither ``run_name`` nor - ``run_id`` is set. - run_id: Existing run ID to use. Overrides any value on the config. + run: ``Run`` object or run ID string to import into an existing + run. Mutually exclusive with ``run_name``. + run_name: Name for a new run. Defaults to the filename if + neither ``run`` nor ``run_name`` is set. show_progress: If True, display a progress spinner during upload. Defaults to True for sync, False for async. @@ -99,16 +100,18 @@ async def import_from_path( if config is None: config = await self.detect_config(file_path, data_type=data_type) - if asset_name is not None: - config.asset_name = asset_name + if asset is not None: + config.asset_name = asset.name if isinstance(asset, Asset) else asset elif not config.asset_name: - raise ValueError("'asset_name' is required when not set on the config.") - if run_id is not None: + raise ValueError("'asset' is required when not set on the config.") + if run is not None and run_name is not None: + raise ValueError("'run' and 'run_name' are mutually exclusive.") + if run is not None: if isinstance(config, Ch10ImportConfig): raise ValueError( - "'run_id' is not supported for Ch10ImportConfig. Use 'run_name' instead." + "'run' is not supported for Ch10ImportConfig. Use 'run_name' instead." 
) - config.run_id = run_id + config.run_id = run._id_or_error if isinstance(run, Run) else run elif run_name is not None: config.run_name = run_name elif not config.run_name and (isinstance(config, Ch10ImportConfig) or not config.run_id): diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index a94e782d5..61ffa9f54 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -685,11 +685,11 @@ class DataImportAPI: self, file_path: str | Path, *, - asset_name: str | None = None, + asset: Asset | str | None = None, config: ImportConfig | None = None, data_type: DataTypeKey | None = None, + run: Run | str | None = None, run_name: str | None = None, - run_id: str | None = None, show_progress: bool | None = None, ) -> Job: """Import data from a local file. @@ -703,25 +703,24 @@ class DataImportAPI: When ``config`` is omitted the file format is auto-detected via ``detect_config`` (CSV and Parquet only). For other formats (TDMS, HDF5, CH10), ``config`` must be provided. - When ``asset_name`` is provided it overrides - the config value; otherwise the config's ``asset_name`` is used. - If neither ``run_name`` nor ``run_id`` is provided - (and none is set on the config), ``run_name`` defaults to the - filename. + When ``asset`` is provided it overrides the config value; + otherwise the config's ``asset_name`` is used. + If neither ``run`` nor ``run_name`` is provided (and none is + set on the config), ``run_name`` defaults to the filename. Args: file_path: Path to the local file to import. - asset_name: Name of the asset to import data into. Optional + asset: Asset object or asset name to import data into. Optional when ``config`` already has ``asset_name`` set. config: Import configuration describing the file format and column mapping. When provided, ``data_type`` is ignored. data_type: Explicit data type key. 
Required for formats like Parquet where the extension alone is ambiguous. Only used when ``config`` is not provided. - run_name: Run name to use. Overrides any value on the config. - Defaults to the filename if neither ``run_name`` nor - ``run_id`` is set. - run_id: Existing run ID to use. Overrides any value on the config. + run: ``Run`` object or run ID string to import into an existing + run. Mutually exclusive with ``run_name``. + run_name: Name for a new run. Defaults to the filename if + neither ``run`` nor ``run_name`` is set. show_progress: If True, display a progress spinner during upload. Defaults to True for sync, False for async. From d3377d1e404eea2e585051a72ed7276020ae6017 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Wed, 8 Apr 2026 15:42:11 -0700 Subject: [PATCH 30/52] refactor file format configs into private helpers, updated error message --- .../lib/sift_client/resources/data_imports.py | 168 ++++++++++-------- 1 file changed, 94 insertions(+), 74 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 56b1eea57..46ef5cdb8 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -120,12 +120,7 @@ async def import_from_path( if isinstance( config, (ParquetFlatDatasetImportConfig, ParquetSingleChannelPerRowImportConfig) ): - if config.footer_offset == 0 and config.footer_length == 0: - footer_bytes, footer_offset = await run_sync_function( - lambda: extract_parquet_footer(path) - ) - config.footer_offset = footer_offset - config.footer_length = len(footer_bytes) + await _prepare_parquet_config(config, path) if show_progress is None: show_progress = resolve_show_progress(is_sync=getattr(self, "_is_sync", False)) @@ -191,23 +186,7 @@ async def detect_config( if not path.is_file(): raise FileNotFoundError(f"File not found: {file_path}") - ext = path.suffix.lower() - if ext in (".parquet", ".pqt"): - if data_type 
is None: - raise ValueError( - "Parquet files require 'data_type' to be specified. " - "Use DataTypeKey.PARQUET_FLATDATASET or DataTypeKey.PARQUET_SINGLE_CHANNEL_PER_ROW." - ) - data_type_key = data_type - elif data_type is not None: - data_type_key = data_type - else: - if ext not in EXTENSION_TO_DATA_TYPE_KEY: - raise ValueError( - f"Unsupported file extension '{ext}'. " - f"Supported: {', '.join(sorted(EXTENSION_TO_DATA_TYPE_KEY))}" - ) - data_type_key = EXTENSION_TO_DATA_TYPE_KEY[ext] + data_type_key = _resolve_data_type_key(path.suffix.lower(), data_type) is_parquet = data_type_key in ( DataTypeKey.PARQUET_FLATDATASET, @@ -234,58 +213,99 @@ def _read_sample() -> bytes: response = await self._low_level_client.detect_config(sample, data_type_key.value) if response.HasField("csv_config"): - csv_config = CsvImportConfig._from_proto(response.csv_config) - # Filter out the time column from data_columns to avoid overlap. - time_col = csv_config.time_column.column - csv_config.data_columns = [ - dc for dc in csv_config.data_columns if dc.column != time_col - ] - if not csv_config.data_columns: - raise ValueError(f"No data columns detected in '{path.name}'.") - return csv_config + return _parse_csv_detect_response(response.csv_config, path.name) if response.HasField("parquet_config"): - proto = response.parquet_config - if proto.HasField("flat_dataset"): - parquet_config = ParquetFlatDatasetImportConfig._from_proto( - proto, footer_offset=footer_offset, footer_length=footer_length - ) - # Filter out the time column from data_columns to avoid overlap. - time_path = parquet_config.time_column.path - if time_path: - parquet_config.data_columns = [ - dc for dc in parquet_config.data_columns if dc.path != time_path - ] - else: - # The backend only detects arrow timestamp types. Fall back to - # an integer column whose name starts with "time". 
- _integer_types = { - ChannelDataType.INT_32, - ChannelDataType.INT_64, - ChannelDataType.UINT_32, - ChannelDataType.UINT_64, - } - match = None - for dc in parquet_config.data_columns: - if dc.data_type in _integer_types and dc.name.lower().startswith("time"): - match = dc - break - if match is not None: - parquet_config.time_column = ParquetTimeColumn(path=match.path) - parquet_config.data_columns = [ - c for c in parquet_config.data_columns if c.path != match.path - ] - if not parquet_config.time_column.path: - raise ValueError( - f"No time column detected in '{path.name}'. " - "Set the time column manually on the config before importing." - ) - if not parquet_config.data_columns: - raise ValueError(f"No data columns detected in '{path.name}'.") - return parquet_config - elif proto.HasField("single_channel_per_row"): - return ParquetSingleChannelPerRowImportConfig._from_proto( - proto, footer_offset=footer_offset, footer_length=footer_length - ) + return _parse_parquet_detect_response( + response.parquet_config, path.name, footer_offset, footer_length + ) raise ValueError("Server returned an empty DetectConfig response.") + + +def _resolve_data_type_key(ext: str, data_type: DataTypeKey | None) -> DataTypeKey: + """Resolve the data type key from file extension and explicit override.""" + if ext in (".parquet", ".pqt"): + if data_type is None: + raise ValueError( + "Parquet files require 'data_type' to be specified. " + "Use DataTypeKey.PARQUET_FLATDATASET or DataTypeKey.PARQUET_SINGLE_CHANNEL_PER_ROW." + ) + return data_type + if data_type is not None: + return data_type + if ext not in EXTENSION_TO_DATA_TYPE_KEY: + raise ValueError( + f"Unsupported file extension '{ext}'. " + f"Supported: {', '.join(sorted(EXTENSION_TO_DATA_TYPE_KEY))}. " + "You can also specify 'data_type' explicitly using a DataTypeKey value." 
+ ) + return EXTENSION_TO_DATA_TYPE_KEY[ext] + + +def _parse_csv_detect_response(proto, filename: str) -> CsvImportConfig: + """Parse a CSV DetectConfig response into a config.""" + csv_config = CsvImportConfig._from_proto(proto) + time_col = csv_config.time_column.column + csv_config.data_columns = [dc for dc in csv_config.data_columns if dc.column != time_col] + if not csv_config.data_columns: + raise ValueError(f"No data columns detected in '{filename}'.") + return csv_config + + +def _parse_parquet_detect_response( + proto, filename: str, footer_offset: int, footer_length: int +) -> ParquetFlatDatasetImportConfig | ParquetSingleChannelPerRowImportConfig: + """Parse a Parquet DetectConfig response into a config.""" + if proto.HasField("flat_dataset"): + parquet_config = ParquetFlatDatasetImportConfig._from_proto( + proto, footer_offset=footer_offset, footer_length=footer_length + ) + time_path = parquet_config.time_column.path + if time_path: + parquet_config.data_columns = [ + dc for dc in parquet_config.data_columns if dc.path != time_path + ] + else: + # The backend only detects arrow timestamp types. Fall back to + # an integer column whose name starts with "time". + _integer_types = { + ChannelDataType.INT_32, + ChannelDataType.INT_64, + ChannelDataType.UINT_32, + ChannelDataType.UINT_64, + } + match = None + for dc in parquet_config.data_columns: + if dc.data_type in _integer_types and dc.name.lower().startswith("time"): + match = dc + break + if match is not None: + parquet_config.time_column = ParquetTimeColumn(path=match.path) + parquet_config.data_columns = [ + c for c in parquet_config.data_columns if c.path != match.path + ] + if not parquet_config.time_column.path: + raise ValueError( + f"No time column detected in '{filename}'. " + "Set the time column manually on the config before importing." 
+ ) + if not parquet_config.data_columns: + raise ValueError(f"No data columns detected in '{filename}'.") + return parquet_config + elif proto.HasField("single_channel_per_row"): + return ParquetSingleChannelPerRowImportConfig._from_proto( + proto, footer_offset=footer_offset, footer_length=footer_length + ) + raise ValueError(f"Unsupported parquet layout in DetectConfig response for '{filename}'.") + + +async def _prepare_parquet_config( + config: ParquetFlatDatasetImportConfig | ParquetSingleChannelPerRowImportConfig, + path: Path, +) -> None: + """Populate parquet footer fields on the config if not already set.""" + if config.footer_offset == 0 and config.footer_length == 0: + footer_bytes, footer_offset = await run_sync_function(lambda: extract_parquet_footer(path)) + config.footer_offset = footer_offset + config.footer_length = len(footer_bytes) From 07007f955c2ee6ba80df1f6c1ae94f5ddbc04a67 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 9 Apr 2026 11:32:56 -0700 Subject: [PATCH 31/52] refactored to apply inheritance on time and config classes --- .../_tests/resources/test_data_imports.py | 4 +- .../lib/sift_client/sift_types/data_import.py | 103 +++++++----------- 2 files changed, 43 insertions(+), 64 deletions(-) diff --git a/python/lib/sift_client/_tests/resources/test_data_imports.py b/python/lib/sift_client/_tests/resources/test_data_imports.py index 05bc09f88..65d8e5785 100644 --- a/python/lib/sift_client/_tests/resources/test_data_imports.py +++ b/python/lib/sift_client/_tests/resources/test_data_imports.py @@ -271,9 +271,9 @@ def test_to_proto_defaults(self): assert proto.run_name == "" assert proto.scale_values is False - def test_no_run_id_field(self): + def test_run_id_inherited_but_unused(self): config = Ch10ImportConfig(asset_name="my_asset") - assert not hasattr(config, "run_id") + assert config.run_id is None class TestTdmsConfig: diff --git a/python/lib/sift_client/sift_types/data_import.py 
b/python/lib/sift_client/sift_types/data_import.py index 9c5f60931..6963715af 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -1,5 +1,6 @@ from __future__ import annotations +from abc import ABC from datetime import datetime # noqa: TC003 from enum import Enum from typing import Union @@ -82,7 +83,27 @@ class DataTypeKey(Enum): } -class CsvTimeColumn(BaseModel): +class TimeColumnBase(BaseModel, ABC): + """Base class for time column configurations. + + Attributes: + format: The time format used in this column. + relative_start_time: Required when using a relative time format. + """ + + format: TimeFormat + relative_start_time: datetime | None = None + + @model_validator(mode="after") + def _check_relative_start_time(self) -> TimeColumnBase: + if self.format.name.startswith("RELATIVE_") and self.relative_start_time is None: + raise ValueError( + f"'relative_start_time' is required when using a relative time format ({self.format.name})." + ) + return self + + +class CsvTimeColumn(TimeColumnBase): """Time column configuration for CSV imports. Attributes: @@ -92,8 +113,6 @@ class CsvTimeColumn(BaseModel): """ column: int - format: TimeFormat - relative_start_time: datetime | None = None def _to_proto(self) -> CsvTimeColumnProto: proto = CsvTimeColumnProto( @@ -104,14 +123,6 @@ def _to_proto(self) -> CsvTimeColumnProto: proto.relative_start_time.CopyFrom(to_pb_timestamp(self.relative_start_time)) return proto - @model_validator(mode="after") - def _check_relative_start_time(self) -> CsvTimeColumn: - if self.format.name.startswith("RELATIVE_") and self.relative_start_time is None: - raise ValueError( - f"'relative_start_time' is required when using a relative time format ({self.format.name})." - ) - return self - class CsvDataColumn(BaseModel): """A data column definition for CSV imports. 
@@ -131,21 +142,29 @@ class CsvDataColumn(BaseModel): description: str = "" -class CsvImportConfig(BaseModel): - """Configuration for importing a CSV file. +class ImportConfigBase(BaseModel, ABC): + """Base class for all import configurations. Attributes: asset_name: Name of the asset to import data into. run_name: Name for the run. Ignored if ``run_id`` is set. run_id: ID of an existing run to append data to. - first_data_row: The first row containing data (1-indexed). Defaults to 2 to skip a header row. - time_column: Time column configuration. - data_columns: List of data column definitions. """ asset_name: str run_name: str | None = None run_id: str | None = None + + +class CsvImportConfig(ImportConfigBase): + """Configuration for importing a CSV file. + + Attributes: + first_data_row: The first row containing data (1-indexed). Defaults to 2 to skip a header row. + time_column: Time column configuration. + data_columns: List of data column definitions. + """ + first_data_row: int = 2 time_column: CsvTimeColumn data_columns: list[CsvDataColumn] @@ -229,7 +248,7 @@ class ParquetComplexTypesImportMode(Enum): BYTES = PARQUET_COMPLEX_TYPES_IMPORT_MODE_BYTES -class ParquetTimeColumn(BaseModel): +class ParquetTimeColumn(TimeColumnBase): """Time column configuration for Parquet imports. Attributes: @@ -240,7 +259,6 @@ class ParquetTimeColumn(BaseModel): path: str format: TimeFormat = TimeFormat.ABSOLUTE_UNIX_NANOSECONDS - relative_start_time: datetime | None = None def _to_proto(self) -> ParquetTimeColumnProto: if not self.path: @@ -268,14 +286,6 @@ def _from_proto(cls, proto: ParquetTimeColumnProto) -> ParquetTimeColumn: relative_start_time=relative_start_time, ) - @model_validator(mode="after") - def _check_relative_start_time(self) -> ParquetTimeColumn: - if self.format.name.startswith("RELATIVE_") and self.relative_start_time is None: - raise ValueError( - f"'relative_start_time' is required when using a relative time format ({self.format.name})." 
- ) - return self - class ParquetDataColumn(BaseModel): """A data column definition for Parquet flat dataset imports. @@ -295,15 +305,12 @@ class ParquetDataColumn(BaseModel): description: str = "" -class ParquetFlatDatasetImportConfig(BaseModel): +class ParquetFlatDatasetImportConfig(ImportConfigBase): """Configuration for importing a Parquet file with a flat dataset layout. Each column in the file maps to a separate channel. Attributes: - asset_name: Name of the asset to import data into. - run_name: Name for the run. Ignored if ``run_id`` is set. - run_id: ID of an existing run to append data to. time_column: Time column configuration. data_columns: List of data column definitions. footer_offset: Byte offset where the Parquet footer begins. Populated @@ -313,9 +320,6 @@ class ParquetFlatDatasetImportConfig(BaseModel): complex_types_import_mode: How to handle complex Parquet types. """ - asset_name: str - run_name: str | None = None - run_id: str | None = None time_column: ParquetTimeColumn data_columns: list[ParquetDataColumn] footer_offset: int = 0 @@ -430,16 +434,13 @@ class ParquetMultiChannelConfig(BaseModel): data_path: str -class ParquetSingleChannelPerRowImportConfig(BaseModel): +class ParquetSingleChannelPerRowImportConfig(ImportConfigBase): """Configuration for importing a Parquet file where each row represents a single channel's data point. Exactly one of ``single_channel`` or ``multi_channel`` must be set. Attributes: - asset_name: Name of the asset to import data into. - run_name: Name for the run. Ignored if ``run_id`` is set. - run_id: ID of an existing run to append data to. time_column: Time column configuration. single_channel: Set when the entire file contains data for one channel. multi_channel: Set when each row identifies its channel via a name column. @@ -450,9 +451,6 @@ class ParquetSingleChannelPerRowImportConfig(BaseModel): complex_types_import_mode: How to handle complex Parquet types. 
""" - asset_name: str - run_name: str | None = None - run_id: str | None = None time_column: ParquetTimeColumn single_channel: ParquetSingleChannelConfig | None = None multi_channel: ParquetMultiChannelConfig | None = None @@ -540,17 +538,13 @@ def _from_proto( ) -class Ch10ImportConfig(BaseModel): +class Ch10ImportConfig(ImportConfigBase): """Configuration for importing a CH10 file. Attributes: - asset_name: Name of the asset to import data into. - run_name: Name for the run. scale_values: Whether to apply EU (engineering unit) scaling to channel values. """ - asset_name: str - run_name: str | None = None scale_values: bool = False def _to_proto(self) -> Ch10ConfigProto: @@ -561,21 +555,15 @@ def _to_proto(self) -> Ch10ConfigProto: ) -class TdmsImportConfig(BaseModel): +class TdmsImportConfig(ImportConfigBase): """Configuration for importing a TDMS file. Attributes: - asset_name: Name of the asset to import data into. - run_name: Name for the run. Ignored if ``run_id`` is set. - run_id: ID of an existing run to append data to. start_time_override: Override the ``wf_start_time`` metadata field for all channels. Useful when waveform channels have ``wf_increment`` but no ``wf_start_time``. file_size: The file size in bytes. Required if the file has truncated chunks. """ - asset_name: str - run_name: str | None = None - run_id: str | None = None start_time_override: datetime | None = None file_size: int | None = None @@ -622,21 +610,15 @@ class Hdf5DataColumn(BaseModel): value_field: str | None = None -class Hdf5ImportConfig(BaseModel): +class Hdf5ImportConfig(ImportConfigBase): """Configuration for importing an HDF5 file. Attributes: - asset_name: Name of the asset to import data into. - run_name: Name for the run. Ignored if ``run_id`` is set. - run_id: ID of an existing run to append data to. data: List of dataset mappings, each pairing a time and value dataset to a channel. time_format: The time format used across all time datasets. 
relative_start_time: Required when using a relative time format. """ - asset_name: str - run_name: str | None = None - run_id: str | None = None data: list[Hdf5DataColumn] time_format: TimeFormat relative_start_time: datetime | None = None @@ -678,9 +660,6 @@ def _to_proto(self) -> Hdf5ConfigProto: return proto -# Note: Using Union instead of | syntax for Python 3.9 compatibility at module level. -# While `from __future__ import annotations` allows | in type hints (they're strings), -# module-level type aliases are evaluated at runtime and require Union in Python <3.10. ImportConfig = Union[ CsvImportConfig, ParquetFlatDatasetImportConfig, From fca83316001d2d0acd93dcb0ab131bcca8af35b3 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 9 Apr 2026 11:50:42 -0700 Subject: [PATCH 32/52] updated documentation around detect_config --- .../lib/sift_client/resources/data_imports.py | 31 ++++++++++++++++++- .../resources/sync_stubs/__init__.pyi | 31 ++++++++++++++++++- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 46ef5cdb8..174cfc452 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -71,12 +71,41 @@ async def import_from_path( If neither ``run`` nor ``run_name`` is provided (and none is set on the config), ``run_name`` defaults to the filename. 
+ Examples: + Import a CSV file with auto-detected config: + + job = client.data_imports.import_from_path( + "data.csv", + asset=my_asset, + ) + + Auto-detect config, inspect and patch before importing: + + config = client.data_imports.detect_config("data.csv") + + # Fix a column data type + config.get_column("temperature").data_type = ChannelDataType.FLOAT + + # Remove an unwanted column + config.data_columns = [ + dc for dc in config.data_columns if dc.name != "internal_id" + ] + + job = client.data_imports.import_from_path( + "data.csv", + asset=my_asset, + config=config, + ) + Args: file_path: Path to the local file to import. asset: Asset object or asset name to import data into. Optional when ``config`` already has ``asset_name`` set. config: Import configuration describing the file format and column - mapping. When provided, ``data_type`` is ignored. + mapping. When provided, ``data_type`` is ignored. If omitted, + the config is auto-detected via ``detect_config``. You can + call ``detect_config`` yourself to inspect and modify the + config before passing it here. data_type: Explicit data type key. Required for formats like Parquet where the extension alone is ambiguous. Only used when ``config`` is not provided. diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index 61ffa9f54..cc1b6556e 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -708,12 +708,41 @@ class DataImportAPI: If neither ``run`` nor ``run_name`` is provided (and none is set on the config), ``run_name`` defaults to the filename. 
+ Examples: + Import a CSV file with auto-detected config: + + job = client.data_imports.import_from_path( + "data.csv", + asset=my_asset, + ) + + Auto-detect config, inspect and patch before importing: + + config = client.data_imports.detect_config("data.csv") + + # Fix a column data type + config.get_column("temperature").data_type = ChannelDataType.FLOAT + + # Remove an unwanted column + config.data_columns = [ + dc for dc in config.data_columns if dc.name != "internal_id" + ] + + job = client.data_imports.import_from_path( + "data.csv", + asset=my_asset, + config=config, + ) + Args: file_path: Path to the local file to import. asset: Asset object or asset name to import data into. Optional when ``config`` already has ``asset_name`` set. config: Import configuration describing the file format and column - mapping. When provided, ``data_type`` is ignored. + mapping. When provided, ``data_type`` is ignored. If omitted, + the config is auto-detected via ``detect_config``. You can + call ``detect_config`` yourself to inspect and modify the + config before passing it here. data_type: Explicit data type key. Required for formats like Parquet where the extension alone is ambiguous. Only used when ``config`` is not provided. 
From deb35902d4967e08215905e608e7d2528bc091f8 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 9 Apr 2026 12:10:34 -0700 Subject: [PATCH 33/52] updated unit tests --- .../_tests/resources/test_data_imports.py | 234 +++++++----------- 1 file changed, 88 insertions(+), 146 deletions(-) diff --git a/python/lib/sift_client/_tests/resources/test_data_imports.py b/python/lib/sift_client/_tests/resources/test_data_imports.py index 65d8e5785..ff99efe3c 100644 --- a/python/lib/sift_client/_tests/resources/test_data_imports.py +++ b/python/lib/sift_client/_tests/resources/test_data_imports.py @@ -4,9 +4,9 @@ import pytest +from sift_client.resources.data_imports import _resolve_data_type_key from sift_client.sift_types.channel import ChannelDataType from sift_client.sift_types.data_import import ( - EXTENSION_TO_DATA_TYPE_KEY, Ch10ImportConfig, CsvDataColumn, CsvImportConfig, @@ -16,6 +16,7 @@ Hdf5ImportConfig, ParquetDataColumn, ParquetFlatDatasetImportConfig, + ParquetSingleChannelPerRowImportConfig, ParquetTimeColumn, TdmsImportConfig, TimeFormat, @@ -57,54 +58,6 @@ def parquet_config(): ) -class TestCsvConfigMutability: - def test_mutate_asset_name(self, csv_config): - csv_config.asset_name = "new_asset" - assert csv_config.asset_name == "new_asset" - - def test_mutate_run_name(self, csv_config): - csv_config.run_name = "new_run" - assert csv_config.run_name == "new_run" - - def test_mutate_column_data_type(self, csv_config): - csv_config.data_columns[1].data_type = ChannelDataType.STRING - assert csv_config.data_columns[1].data_type == ChannelDataType.STRING - - def test_mutate_column_name(self, csv_config): - csv_config.data_columns[0].name = "cpu_utilization" - assert csv_config.data_columns[0].name == "cpu_utilization" - - def test_append_column(self, csv_config): - csv_config.data_columns.append( - CsvDataColumn(column=5, name="pressure", data_type=ChannelDataType.DOUBLE) - ) - assert len(csv_config.data_columns) == 4 - assert csv_config.data_columns[-1].name == 
"pressure" - - def test_remove_column(self, csv_config): - csv_config.data_columns = [ - dc for dc in csv_config.data_columns if dc.name != "status_flags" - ] - assert len(csv_config.data_columns) == 2 - assert all(dc.name != "status_flags" for dc in csv_config.data_columns) - - -class TestParquetConfigMutability: - def test_mutate_asset_name(self, parquet_config): - parquet_config.asset_name = "new_asset" - assert parquet_config.asset_name == "new_asset" - - def test_mutate_column_data_type(self, parquet_config): - parquet_config.data_columns[1].data_type = ChannelDataType.STRING - assert parquet_config.data_columns[1].data_type == ChannelDataType.STRING - - def test_append_column(self, parquet_config): - parquet_config.data_columns.append( - ParquetDataColumn(path="pressure", name="pressure", data_type=ChannelDataType.DOUBLE) - ) - assert len(parquet_config.data_columns) == 4 - - class TestGetColumn: def test_csv_get_column(self, csv_config): col = csv_config.get_column("cpu_util") @@ -169,94 +122,6 @@ def test_absolute_time_does_not_require_start_time(self): assert col.relative_start_time is None -class TestDataTypeKey: - def test_csv_extension(self): - assert EXTENSION_TO_DATA_TYPE_KEY[".csv"] == DataTypeKey.CSV - - def test_parquet_not_in_extension_map(self): - assert ".parquet" not in EXTENSION_TO_DATA_TYPE_KEY - - def test_hdf5_extensions(self): - assert EXTENSION_TO_DATA_TYPE_KEY[".h5"] == DataTypeKey.HDF5 - assert EXTENSION_TO_DATA_TYPE_KEY[".hdf5"] == DataTypeKey.HDF5 - - -class TestDetectConfigValidation: - """Tests for validation checks applied after detect_config.""" - - def test_csv_no_data_columns_raises(self): - """If all columns are filtered out, detect_config should raise.""" - config = CsvImportConfig( - asset_name="", - time_column=CsvTimeColumn(column=1, format=TimeFormat.ABSOLUTE_RFC3339), - data_columns=[], - ) - assert not config.data_columns - - def test_parquet_empty_time_column_path(self): - """An empty time column path indicates 
detection failed.""" - config = ParquetFlatDatasetImportConfig( - asset_name="", - time_column=ParquetTimeColumn(path=""), - data_columns=[ - ParquetDataColumn( - path="cpu_util", name="cpu_util", data_type=ChannelDataType.DOUBLE - ), - ], - ) - assert not config.time_column.path - - def test_parquet_no_data_columns(self): - """A config with no data columns indicates detection found nothing useful.""" - config = ParquetFlatDatasetImportConfig( - asset_name="", - time_column=ParquetTimeColumn(path="timestamp"), - data_columns=[], - ) - assert not config.data_columns - - def test_parquet_integer_time_column_fallback(self): - """An integer column starting with 'time' should be usable as the time column.""" - config = ParquetFlatDatasetImportConfig( - asset_name="", - time_column=ParquetTimeColumn(path=""), - data_columns=[ - ParquetDataColumn(path="time_ns", name="time_ns", data_type=ChannelDataType.INT_64), - ParquetDataColumn( - path="cpu_util", name="cpu_util", data_type=ChannelDataType.DOUBLE - ), - ], - ) - _integer_types = { - ChannelDataType.INT_32, - ChannelDataType.INT_64, - ChannelDataType.UINT_32, - ChannelDataType.UINT_64, - } - match = None - for dc in config.data_columns: - if dc.data_type in _integer_types and dc.name.lower().startswith("time"): - match = dc - break - assert match is not None - assert match.path == "time_ns" - - -class TestRunPrecedence: - def test_run_id_ignored_when_none(self, csv_config): - csv_config.run_id = None - csv_config.run_name = "my_run" - proto = csv_config._to_proto() - assert proto.run_name == "my_run" - assert proto.run_id == "" - - def test_run_id_set(self, csv_config): - csv_config.run_id = "run_123" - csv_config.run_name = "ignored" - proto = csv_config._to_proto() - assert proto.run_id == "run_123" - - class TestCh10Config: def test_to_proto(self): config = Ch10ImportConfig(asset_name="my_asset", run_name="run1", scale_values=True) @@ -271,10 +136,6 @@ def test_to_proto_defaults(self): assert proto.run_name == "" 
assert proto.scale_values is False - def test_run_id_inherited_but_unused(self): - config = Ch10ImportConfig(asset_name="my_asset") - assert config.run_id is None - class TestTdmsConfig: def test_to_proto(self): @@ -421,9 +282,90 @@ def test_absolute_time_no_start_time_required(self): assert not proto.HasField("relative_start_time") -class TestExtensionMap: - def test_tdms_extension(self): - assert EXTENSION_TO_DATA_TYPE_KEY[".tdms"] == DataTypeKey.TDMS +class TestCsvToProto: + def test_to_proto(self, csv_config): + proto = csv_config._to_proto() + assert proto.asset_name == "test_asset" + assert proto.run_name == "test_run" + assert proto.first_data_row == 2 + assert proto.time_column.column_number == 1 + assert len(proto.data_columns) == 3 + assert proto.data_columns[2].name == "cpu_util" + + def test_from_proto_round_trip(self, csv_config): + proto = csv_config._to_proto() + restored = CsvImportConfig._from_proto(proto) + assert restored.asset_name == csv_config.asset_name + assert restored.run_name == csv_config.run_name + assert restored.first_data_row == csv_config.first_data_row + assert restored.time_column.column == csv_config.time_column.column + assert len(restored.data_columns) == len(csv_config.data_columns) + + +class TestParquetToProto: + def test_flat_dataset_to_proto(self, parquet_config): + proto = parquet_config._to_proto() + assert proto.asset_name == "test_asset" + assert proto.HasField("flat_dataset") + assert proto.flat_dataset.time_column.path == "timestamp" + assert len(proto.flat_dataset.data_columns) == 3 + + def test_flat_dataset_from_proto_round_trip(self, parquet_config): + proto = parquet_config._to_proto() + restored = ParquetFlatDatasetImportConfig._from_proto(proto) + assert restored.asset_name == parquet_config.asset_name + assert restored.time_column.path == parquet_config.time_column.path + assert len(restored.data_columns) == len(parquet_config.data_columns) + for orig, rest in zip(parquet_config.data_columns, 
restored.data_columns): + assert orig.name == rest.name + assert orig.data_type == rest.data_type + + def test_single_channel_per_row_from_proto_round_trip(self): + from sift_client.sift_types.data_import import ParquetSingleChannelConfig + + config = ParquetSingleChannelPerRowImportConfig( + asset_name="a", + time_column=ParquetTimeColumn(path="ts"), + single_channel=ParquetSingleChannelConfig( + data_path="value", + name="voltage", + data_type=ChannelDataType.DOUBLE, + ), + ) + proto = config._to_proto() + restored = ParquetSingleChannelPerRowImportConfig._from_proto(proto) + assert restored.single_channel is not None + assert restored.single_channel.name == "voltage" + assert restored.single_channel.data_type == ChannelDataType.DOUBLE + + +class TestParquetTimeColumnToProto: + def test_empty_path_raises(self): + col = ParquetTimeColumn(path="") + with pytest.raises(ValueError, match="path must be set"): + col._to_proto() + + +class TestResolveDataTypeKey: + def test_parquet_requires_data_type(self): + with pytest.raises(ValueError, match="data_type"): + _resolve_data_type_key(".parquet", None) + + def test_parquet_with_explicit_data_type(self): + result = _resolve_data_type_key(".parquet", DataTypeKey.PARQUET_FLATDATASET) + assert result == DataTypeKey.PARQUET_FLATDATASET + + def test_pqt_requires_data_type(self): + with pytest.raises(ValueError, match="data_type"): + _resolve_data_type_key(".pqt", None) + + def test_known_extension_uses_map(self): + assert _resolve_data_type_key(".csv", None) == DataTypeKey.CSV + + def test_explicit_data_type_overrides_extension(self): + result = _resolve_data_type_key(".csv", DataTypeKey.TDMS) + assert result == DataTypeKey.TDMS - def test_ch10_extension(self): - assert EXTENSION_TO_DATA_TYPE_KEY[".ch10"] == DataTypeKey.CH10 + def test_unknown_extension_raises(self): + with pytest.raises(ValueError, match="Unsupported file extension"): + _resolve_data_type_key(".xyz", None) From 7aa2da478e7e93fa3abf237e9b652dd19c155626 Mon Sep 
17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 9 Apr 2026 12:21:09 -0700 Subject: [PATCH 34/52] updated documentation regarding json metadata --- .../lib/sift_client/resources/data_imports.py | 25 +++++++++++++------ .../resources/sync_stubs/__init__.pyi | 25 +++++++++++++------ 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 174cfc452..84dcd9147 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -187,13 +187,24 @@ async def detect_config( using ``TdmsImportConfig``, ``Hdf5ImportConfig``, or ``Ch10ImportConfig``. - For CSV files, the server can parse an optional JSON metadata row - that auto-populates channel names, units, descriptions, data types, - and enum definitions. Each cell in the row is a JSON object - describing that column. When present, ``first_data_row`` in the - returned config will be set to the row after the metadata row. - Note that enum type definitions are applied server-side during - import but are not included in the returned config. + For CSV files, the server scans the first two rows for an optional + JSON metadata row. Row 1 is checked first; row 2 is checked only + if row 1 is not valid metadata. A row qualifies as metadata when + every cell contains valid JSON that describes either a time column + or a data column. When present, ``first_data_row`` in the returned + config is set to the row after the metadata row. + + Each data column cell is a JSON ``ChannelConfig``:: + + {"name": "speed", "units": "m/s", "dataType": "CHANNEL_DATA_TYPE_DOUBLE"} + + The time column cell is a JSON ``CsvTimeColumn``:: + + {"format": "TIME_FORMAT_ABSOLUTE_RFC3339"} + + Enum type definitions and bit field elements can also be specified + in the metadata row; they are applied server-side during import + but are not included in the returned config. 
For file types with multiple layouts (e.g. Parquet), ``data_type`` must be specified explicitly. diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index cc1b6556e..3565fed0c 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -655,13 +655,24 @@ class DataImportAPI: using ``TdmsImportConfig``, ``Hdf5ImportConfig``, or ``Ch10ImportConfig``. - For CSV files, the server can parse an optional JSON metadata row - that auto-populates channel names, units, descriptions, data types, - and enum definitions. Each cell in the row is a JSON object - describing that column. When present, ``first_data_row`` in the - returned config will be set to the row after the metadata row. - Note that enum type definitions are applied server-side during - import but are not included in the returned config. + For CSV files, the server scans the first two rows for an optional + JSON metadata row. Row 1 is checked first; row 2 is checked only + if row 1 is not valid metadata. A row qualifies as metadata when + every cell contains valid JSON that describes either a time column + or a data column. When present, ``first_data_row`` in the returned + config is set to the row after the metadata row. + + Each data column cell is a JSON ``ChannelConfig``:: + + {"name": "speed", "units": "m/s", "dataType": "CHANNEL_DATA_TYPE_DOUBLE"} + + The time column cell is a JSON ``CsvTimeColumn``:: + + {"format": "TIME_FORMAT_ABSOLUTE_RFC3339"} + + Enum type definitions and bit field elements can also be specified + in the metadata row; they are applied server-side during import + but are not included in the returned config. For file types with multiple layouts (e.g. Parquet), ``data_type`` must be specified explicitly. 
From f59336df44fc6ab034c31d104da95a66721732cb Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 9 Apr 2026 12:36:01 -0700 Subject: [PATCH 35/52] updated get_column to getitem --- .../_tests/resources/test_data_imports.py | 26 +++++++++---------- .../lib/sift_client/resources/data_imports.py | 2 +- .../resources/sync_stubs/__init__.pyi | 2 +- .../lib/sift_client/sift_types/data_import.py | 26 ++++++------------- 4 files changed, 23 insertions(+), 33 deletions(-) diff --git a/python/lib/sift_client/_tests/resources/test_data_imports.py b/python/lib/sift_client/_tests/resources/test_data_imports.py index ff99efe3c..de82148c9 100644 --- a/python/lib/sift_client/_tests/resources/test_data_imports.py +++ b/python/lib/sift_client/_tests/resources/test_data_imports.py @@ -58,31 +58,31 @@ def parquet_config(): ) -class TestGetColumn: - def test_csv_get_column(self, csv_config): - col = csv_config.get_column("cpu_util") +class TestGetItem: + def test_csv_getitem(self, csv_config): + col = csv_config["cpu_util"] assert col.name == "cpu_util" assert col.data_type == ChannelDataType.DOUBLE - def test_csv_get_column_not_found(self, csv_config): + def test_csv_getitem_not_found(self, csv_config): with pytest.raises(KeyError, match="nonexistent"): - csv_config.get_column("nonexistent") + csv_config["nonexistent"] - def test_csv_get_column_mutate(self, csv_config): - csv_config.get_column("status_flags").data_type = ChannelDataType.STRING + def test_csv_getitem_mutate(self, csv_config): + csv_config["status_flags"].data_type = ChannelDataType.STRING assert csv_config.data_columns[1].data_type == ChannelDataType.STRING - def test_parquet_get_column(self, parquet_config): - col = parquet_config.get_column("temperature") + def test_parquet_getitem(self, parquet_config): + col = parquet_config["temperature"] assert col.name == "temperature" assert col.data_type == ChannelDataType.FLOAT - def test_parquet_get_column_not_found(self, parquet_config): + def 
test_parquet_getitem_not_found(self, parquet_config): with pytest.raises(KeyError, match="nonexistent"): - parquet_config.get_column("nonexistent") + parquet_config["nonexistent"] - def test_parquet_get_column_mutate(self, parquet_config): - parquet_config.get_column("cpu_util").name = "cpu_utilization" + def test_parquet_getitem_mutate(self, parquet_config): + parquet_config["cpu_util"].name = "cpu_utilization" assert parquet_config.data_columns[0].name == "cpu_utilization" diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 84dcd9147..b2868970c 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -84,7 +84,7 @@ async def import_from_path( config = client.data_imports.detect_config("data.csv") # Fix a column data type - config.get_column("temperature").data_type = ChannelDataType.FLOAT + config["temperature"].data_type = ChannelDataType.FLOAT # Remove an unwanted column config.data_columns = [ diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index 3565fed0c..3da4323ab 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -732,7 +732,7 @@ class DataImportAPI: config = client.data_imports.detect_config("data.csv") # Fix a column data type - config.get_column("temperature").data_type = ChannelDataType.FLOAT + config["temperature"].data_type = ChannelDataType.FLOAT # Remove an unwanted column config.data_columns = [ diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index 6963715af..1df48b97f 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -169,17 +169,12 @@ class CsvImportConfig(ImportConfigBase): time_column: CsvTimeColumn data_columns: 
list[CsvDataColumn] - def get_column(self, name: str) -> CsvDataColumn: - """Look up a data column by name. + def __getitem__(self, name: str) -> CsvDataColumn: + """Look up a data column by channel name. - Args: - name: The channel name to search for. + Example:: - Returns: - The matching data column. - - Raises: - KeyError: If no column with the given name exists. + config["temperature"].data_type = ChannelDataType.FLOAT """ for dc in self.data_columns: if dc.name == name: @@ -326,17 +321,12 @@ class ParquetFlatDatasetImportConfig(ImportConfigBase): footer_length: int = 0 complex_types_import_mode: ParquetComplexTypesImportMode = ParquetComplexTypesImportMode.IGNORE - def get_column(self, name: str) -> ParquetDataColumn: - """Look up a data column by name. - - Args: - name: The channel name to search for. + def __getitem__(self, name: str) -> ParquetDataColumn: + """Look up a data column by channel name. - Returns: - The matching data column. + Example:: - Raises: - KeyError: If no column with the given name exists. + config["temperature"].data_type = ChannelDataType.FLOAT """ for dc in self.data_columns: if dc.name == name: From 0328d27e5b1072fbaf4f8014802cbb696d318652 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 9 Apr 2026 12:41:39 -0700 Subject: [PATCH 36/52] updated detect_config error messages --- python/lib/sift_client/resources/data_imports.py | 10 +++++++--- .../lib/sift_client/resources/sync_stubs/__init__.pyi | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index b2868970c..7653aea55 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -219,8 +219,8 @@ async def detect_config( Raises: FileNotFoundError: If the file does not exist. - ValueError: If the file extension is unsupported or detection - returns no config. 
+ ValueError: If the file extension is unsupported or no + supported configuration could be detected. """ path = Path(file_path) if not path.is_file(): @@ -260,7 +260,11 @@ def _read_sample() -> bytes: response.parquet_config, path.name, footer_offset, footer_length ) - raise ValueError("Server returned an empty DetectConfig response.") + raise ValueError( + f"No supported configuration detected for '{path.name}'. " + "Auto-detection supports CSV and Parquet files. " + "For other formats, provide a config manually." + ) def _resolve_data_type_key(ext: str, data_type: DataTypeKey | None) -> DataTypeKey: diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index 3da4323ab..7b9282198 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -687,8 +687,8 @@ class DataImportAPI: Raises: FileNotFoundError: If the file does not exist. - ValueError: If the file extension is unsupported or detection - returns no config. + ValueError: If the file extension is unsupported or no + supported configuration could be detected. """ ... 
From 930c5567da14c2de18cf93671a2b3cd42ff9dc88 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 9 Apr 2026 12:53:20 -0700 Subject: [PATCH 37/52] move data column validation from detect_config to _to_proto --- python/lib/sift_client/resources/data_imports.py | 13 ++----------- python/lib/sift_client/sift_types/data_import.py | 4 ++++ 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 7653aea55..041bae429 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -253,7 +253,7 @@ def _read_sample() -> bytes: response = await self._low_level_client.detect_config(sample, data_type_key.value) if response.HasField("csv_config"): - return _parse_csv_detect_response(response.csv_config, path.name) + return _parse_csv_detect_response(response.csv_config) if response.HasField("parquet_config"): return _parse_parquet_detect_response( @@ -287,13 +287,11 @@ def _resolve_data_type_key(ext: str, data_type: DataTypeKey | None) -> DataTypeK return EXTENSION_TO_DATA_TYPE_KEY[ext] -def _parse_csv_detect_response(proto, filename: str) -> CsvImportConfig: +def _parse_csv_detect_response(proto) -> CsvImportConfig: """Parse a CSV DetectConfig response into a config.""" csv_config = CsvImportConfig._from_proto(proto) time_col = csv_config.time_column.column csv_config.data_columns = [dc for dc in csv_config.data_columns if dc.column != time_col] - if not csv_config.data_columns: - raise ValueError(f"No data columns detected in '{filename}'.") return csv_config @@ -329,13 +327,6 @@ def _parse_parquet_detect_response( parquet_config.data_columns = [ c for c in parquet_config.data_columns if c.path != match.path ] - if not parquet_config.time_column.path: - raise ValueError( - f"No time column detected in '{filename}'. " - "Set the time column manually on the config before importing." 
- ) - if not parquet_config.data_columns: - raise ValueError(f"No data columns detected in '{filename}'.") return parquet_config elif proto.HasField("single_channel_per_row"): return ParquetSingleChannelPerRowImportConfig._from_proto( diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index 1df48b97f..6e75376c6 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -182,6 +182,8 @@ def __getitem__(self, name: str) -> CsvDataColumn: raise KeyError(f"No data column named '{name}'") def _to_proto(self) -> CsvConfigProto: + if not self.data_columns: + raise ValueError("Config has no data columns. Add at least one before importing.") return CsvConfigProto( asset_name=self.asset_name, run_name=self.run_name or "", @@ -334,6 +336,8 @@ def __getitem__(self, name: str) -> ParquetDataColumn: raise KeyError(f"No data column named '{name}'") def _to_proto(self) -> ParquetConfigProto: + if not self.data_columns: + raise ValueError("Config has no data columns. 
Add at least one before importing.") flat_dataset = ParquetFlatDatasetConfigProto( time_column=self.time_column._to_proto(), data_columns=[ From c610c888c629c4b73d408afc207d93ae1a35117c Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 9 Apr 2026 13:25:07 -0700 Subject: [PATCH 38/52] updated error types to be more accurate --- python/lib/sift_client/_internal/util/file.py | 3 +-- python/lib/sift_client/resources/data_imports.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/lib/sift_client/_internal/util/file.py b/python/lib/sift_client/_internal/util/file.py index 1d93f44c6..dc2cc1999 100644 --- a/python/lib/sift_client/_internal/util/file.py +++ b/python/lib/sift_client/_internal/util/file.py @@ -65,8 +65,7 @@ def upload_file( data=f, headers={"Content-Disposition": f'attachment; filename="{file_path.name}"'}, ) - if not response.ok: - raise ValueError(f"Upload failed ({response.status_code}): {response.text}") + response.raise_for_status() return response.json() diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 041bae429..49b63c381 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -166,7 +166,7 @@ async def import_from_path( ) job_id = response.get("jobId") if not job_id: - raise ValueError("Upload succeeded but server response did not include a job ID.") + raise RuntimeError("Upload succeeded but server response did not include a job ID.") return await self.client.async_.jobs.get(job_id=job_id) From 66d567f5b3a3c6eb4e9e769f590e851c8972dbba Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 9 Apr 2026 13:55:52 -0700 Subject: [PATCH 39/52] add import_data method to Run for importing files into existing runs --- python/lib/sift_client/sift_types/run.py | 37 ++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/python/lib/sift_client/sift_types/run.py 
b/python/lib/sift_client/sift_types/run.py index acfb59a92..22d22c556 100644 --- a/python/lib/sift_client/sift_types/run.py +++ b/python/lib/sift_client/sift_types/run.py @@ -19,10 +19,14 @@ from sift_client.util.metadata import metadata_dict_to_proto, metadata_proto_to_dict if TYPE_CHECKING: + from pathlib import Path + from sift_stream_bindings import RunFormPy from sift_client.client import SiftClient from sift_client.sift_types.asset import Asset + from sift_client.sift_types.data_import import DataTypeKey, ImportConfig + from sift_client.sift_types.job import Job class Run(BaseType[RunProto, "Run"], FileAttachmentsMixin): @@ -127,6 +131,39 @@ def stop(self) -> Run: self._update(updated_run) return self + def import_data( + self, + file_path: str | Path, + *, + asset: Asset | str | None = None, + config: ImportConfig | None = None, + data_type: DataTypeKey | None = None, + show_progress: bool | None = None, + ) -> Job: + """Import data from a file into this run. + + Convenience method that calls ``client.data_imports.import_from_path`` + with this run pre-filled. + + Args: + file_path: Path to the local file to import. + asset: Asset object or asset name to import data into. + config: Import configuration. Auto-detected if omitted. + data_type: Explicit data type key for ambiguous formats. + show_progress: Display a progress spinner during upload. + + Returns: + A ``Job`` handle for the pending import. 
+ """ + return self.client.data_import.import_from_path( + file_path, + asset=asset, + config=config, + data_type=data_type, + run=self, + show_progress=show_progress, + ) + class RunBase(ModelCreateUpdateBase): """Base class for Run create and update models with shared fields and validation.""" From 462a36d412eaa887113c4466f029d82126b94ad6 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 9 Apr 2026 14:19:12 -0700 Subject: [PATCH 40/52] add get_run to data import API and get_import_run to Job for resolving runs from imports --- .../low_level_wrappers/data_imports.py | 15 ++++++++++++ .../lib/sift_client/resources/data_imports.py | 23 +++++++++++++++++++ .../resources/sync_stubs/__init__.pyi | 19 +++++++++++++++ python/lib/sift_client/sift_types/job.py | 17 ++++++++++++++ 4 files changed, 74 insertions(+) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py index b88b9cfab..34315ac6b 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py @@ -7,6 +7,8 @@ CreateDataImportFromUploadResponse, DetectConfigRequest, DetectConfigResponse, + GetDataImportRequest, + GetDataImportResponse, ) from sift.data_imports.v2.data_imports_pb2_grpc import DataImportServiceStub @@ -75,6 +77,19 @@ async def create_from_upload(self, config: ImportConfig) -> tuple[str, str]: response = cast("CreateDataImportFromUploadResponse", response) return response.data_import_id, response.upload_url + async def get(self, data_import_id: str) -> GetDataImportResponse: + """Get a data import by ID. + + Args: + data_import_id: The ID of the data import. + + Returns: + The GetDataImportResponse proto. 
+ """ + request = GetDataImportRequest(data_import_id=data_import_id) + response = await self._grpc_client.get_stub(DataImportServiceStub).GetDataImport(request) + return cast("GetDataImportResponse", response) + async def detect_config( self, data: bytes, data_type_key: DataTypeKey.ValueType ) -> DetectConfigResponse: diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 49b63c381..ef532eccc 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -170,6 +170,29 @@ async def import_from_path( return await self.client.async_.jobs.get(job_id=job_id) + async def get_run(self, data_import_id: str) -> Run: + """Get the run associated with a data import. + + The ``data_import_id`` is available on the job returned by + ``import_from_path`` via ``job.job_details.data_import_id``. + For a more ergonomic approach, use ``job.get_import_run()`` + which calls this method internally. + + Args: + data_import_id: The ID of the data import. + + Returns: + The Run created by or associated with the import. + + Raises: + ValueError: If the data import has no associated run. + """ + response = await self._low_level_client.get(data_import_id) + run_id = response.data_import.run_id + if not run_id: + raise ValueError("Data import does not have an associated run.") + return await self.client.async_.runs.get(run_id=run_id) + async def detect_config( self, file_path: str | Path, diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index 7b9282198..d76dd4942 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -692,6 +692,25 @@ class DataImportAPI: """ ... + def get_run(self, data_import_id: str) -> Run: + """Get the run associated with a data import. 
+ + The ``data_import_id`` is available on the job returned by + ``import_from_path`` via ``job.job_details.data_import_id``. + For a more ergonomic approach, use ``job.get_import_run()`` + which calls this method internally. + + Args: + data_import_id: The ID of the data import. + + Returns: + The Run created by or associated with the import. + + Raises: + ValueError: If the data import has no associated run. + """ + ... + def import_from_path( self, file_path: str | Path, diff --git a/python/lib/sift_client/sift_types/job.py b/python/lib/sift_client/sift_types/job.py index 6d3adbe2d..676cdc8c2 100644 --- a/python/lib/sift_client/sift_types/job.py +++ b/python/lib/sift_client/sift_types/job.py @@ -19,6 +19,7 @@ from pathlib import Path from sift_client.client import SiftClient + from sift_client.sift_types.run import Run class JobType(str, Enum): @@ -315,6 +316,22 @@ def wait_until_complete( self._update(completed_job) return self + def get_import_run(self) -> Run: + """Get the run created by this data import job. + + Returns: + The Run associated with this import. + + Raises: + ValueError: If this is not a data import job or the import + has no associated run. 
+ """ + if self.job_type != JobType.DATA_IMPORT: + raise ValueError("get_import_run() is only valid for data import jobs.") + if not isinstance(self.job_details, DataImportDetails): + raise ValueError("Job does not have data import details.") + return self.client.data_import.get_run(self.job_details.data_import_id) + def wait_and_download( self, *, From f049daaa2f5543e2d59a945a5fb39365a7b4113b Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Thu, 9 Apr 2026 14:52:01 -0700 Subject: [PATCH 41/52] add model validation for parquet single/multi channel --- python/lib/sift_client/sift_types/data_import.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index 6e75376c6..0925f08fd 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -452,6 +452,16 @@ class ParquetSingleChannelPerRowImportConfig(ImportConfigBase): footer_length: int = 0 complex_types_import_mode: ParquetComplexTypesImportMode = ParquetComplexTypesImportMode.IGNORE + @model_validator(mode="after") + def _check_channel_config(self) -> ParquetSingleChannelPerRowImportConfig: + if self.single_channel is None and self.multi_channel is None: + raise ValueError("Exactly one of 'single_channel' or 'multi_channel' must be set.") + if self.single_channel is not None and self.multi_channel is not None: + raise ValueError( + "Exactly one of 'single_channel' or 'multi_channel' must be set, not both." 
+ ) + return self + def _to_proto(self) -> ParquetConfigProto: scpr = ParquetSingleChannelPerRowConfigProto( time_column=self.time_column._to_proto(), From 8ff5fde74d26e7fd175738619d53eab8161dc073 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Fri, 10 Apr 2026 13:30:02 -0700 Subject: [PATCH 42/52] refactor parquet timecolumn detection --- .../lib/sift_client/resources/data_imports.py | 52 +++++++++++++------ .../lib/sift_client/sift_types/data_import.py | 12 +++-- 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index ef532eccc..96c1693fa 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -26,6 +26,8 @@ from sift_client.sift_types.run import Run if TYPE_CHECKING: + from collections.abc import Iterable + from sift_client.client import SiftClient from sift_client.sift_types.job import Job @@ -318,6 +320,24 @@ def _parse_csv_detect_response(proto) -> CsvImportConfig: return csv_config +def _infer_time_column(columns: Iterable[tuple[str, ChannelDataType, str]]) -> str | None: + """Find a likely time column from a sequence of (name, data_type, path) tuples. + + The backend only detects arrow timestamp types. This falls back to the first + integer column whose name starts with "time". 
+ """ + _integer_types = { + ChannelDataType.INT_32, + ChannelDataType.INT_64, + ChannelDataType.UINT_32, + ChannelDataType.UINT_64, + } + for name, data_type, path in columns: + if data_type in _integer_types and name.lower().startswith("time"): + return path + return None + + def _parse_parquet_detect_response( proto, filename: str, footer_offset: int, footer_length: int ) -> ParquetFlatDatasetImportConfig | ParquetSingleChannelPerRowImportConfig: @@ -332,29 +352,27 @@ def _parse_parquet_detect_response( dc for dc in parquet_config.data_columns if dc.path != time_path ] else: - # The backend only detects arrow timestamp types. Fall back to - # an integer column whose name starts with "time". - _integer_types = { - ChannelDataType.INT_32, - ChannelDataType.INT_64, - ChannelDataType.UINT_32, - ChannelDataType.UINT_64, - } - match = None - for dc in parquet_config.data_columns: - if dc.data_type in _integer_types and dc.name.lower().startswith("time"): - match = dc - break - if match is not None: - parquet_config.time_column = ParquetTimeColumn(path=match.path) + inferred = _infer_time_column( + (dc.name, dc.data_type, dc.path) for dc in parquet_config.data_columns + ) + if inferred is not None: + parquet_config.time_column = ParquetTimeColumn(path=inferred) parquet_config.data_columns = [ - c for c in parquet_config.data_columns if c.path != match.path + c for c in parquet_config.data_columns if c.path != inferred ] return parquet_config elif proto.HasField("single_channel_per_row"): - return ParquetSingleChannelPerRowImportConfig._from_proto( + parquet_config = ParquetSingleChannelPerRowImportConfig._from_proto( proto, footer_offset=footer_offset, footer_length=footer_length ) + if not parquet_config.time_column.path: + inferred = _infer_time_column( + (col.column_config.name, ChannelDataType(col.column_config.data_type), col.path) + for col in proto.single_channel_per_row.columns + ) + if inferred is not None: + parquet_config.time_column = 
ParquetTimeColumn(path=inferred) + return parquet_config raise ValueError(f"Unsupported parquet layout in DetectConfig response for '{filename}'.") diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index 0925f08fd..e07d8eab6 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -432,7 +432,9 @@ class ParquetSingleChannelPerRowImportConfig(ImportConfigBase): """Configuration for importing a Parquet file where each row represents a single channel's data point. - Exactly one of ``single_channel`` or ``multi_channel`` must be set. + Exactly one of ``single_channel`` or ``multi_channel`` must be set before + importing. When returned by ``detect_config()``, neither field is populated + and must be filled in before passing the config to ``import_from_path()``. Attributes: time_column: Time column configuration. @@ -454,8 +456,6 @@ class ParquetSingleChannelPerRowImportConfig(ImportConfigBase): @model_validator(mode="after") def _check_channel_config(self) -> ParquetSingleChannelPerRowImportConfig: - if self.single_channel is None and self.multi_channel is None: - raise ValueError("Exactly one of 'single_channel' or 'multi_channel' must be set.") if self.single_channel is not None and self.multi_channel is not None: raise ValueError( "Exactly one of 'single_channel' or 'multi_channel' must be set, not both." @@ -463,6 +463,12 @@ def _check_channel_config(self) -> ParquetSingleChannelPerRowImportConfig: return self def _to_proto(self) -> ParquetConfigProto: + if self.single_channel is None and self.multi_channel is None: + raise ValueError( + "Either 'single_channel' or 'multi_channel' must be set before importing. " + "If this config was returned by detect_config(), set one of these fields " + "to specify the channel layout." 
+ ) scpr = ParquetSingleChannelPerRowConfigProto( time_column=self.time_column._to_proto(), ) From 67712783462952d3f1216107a84c8152a3d52bf3 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Fri, 10 Apr 2026 13:42:06 -0700 Subject: [PATCH 43/52] mypy fix --- python/lib/sift_client/resources/data_imports.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 96c1693fa..5196c5d74 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -362,17 +362,17 @@ def _parse_parquet_detect_response( ] return parquet_config elif proto.HasField("single_channel_per_row"): - parquet_config = ParquetSingleChannelPerRowImportConfig._from_proto( + parquet_scpr_config = ParquetSingleChannelPerRowImportConfig._from_proto( proto, footer_offset=footer_offset, footer_length=footer_length ) - if not parquet_config.time_column.path: + if not parquet_scpr_config.time_column.path: inferred = _infer_time_column( (col.column_config.name, ChannelDataType(col.column_config.data_type), col.path) for col in proto.single_channel_per_row.columns ) if inferred is not None: - parquet_config.time_column = ParquetTimeColumn(path=inferred) - return parquet_config + parquet_scpr_config.time_column = ParquetTimeColumn(path=inferred) + return parquet_scpr_config raise ValueError(f"Unsupported parquet layout in DetectConfig response for '{filename}'.") From 261e091a56af72c13993e4c247bbe5a5348d5285 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Fri, 10 Apr 2026 20:08:23 -0700 Subject: [PATCH 44/52] simplify _resolve_data_type_key logic --- python/lib/sift_client/resources/data_imports.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 5196c5d74..736fadb36 100644 --- 
a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -294,15 +294,13 @@ def _read_sample() -> bytes: def _resolve_data_type_key(ext: str, data_type: DataTypeKey | None) -> DataTypeKey: """Resolve the data type key from file extension and explicit override.""" - if ext in (".parquet", ".pqt"): - if data_type is None: - raise ValueError( - "Parquet files require 'data_type' to be specified. " - "Use DataTypeKey.PARQUET_FLATDATASET or DataTypeKey.PARQUET_SINGLE_CHANNEL_PER_ROW." - ) - return data_type if data_type is not None: return data_type + if ext in (".parquet", ".pqt"): + raise ValueError( + "Parquet files require 'data_type' to be specified. " + "Use DataTypeKey.PARQUET_FLATDATASET or DataTypeKey.PARQUET_SINGLE_CHANNEL_PER_ROW." + ) if ext not in EXTENSION_TO_DATA_TYPE_KEY: raise ValueError( f"Unsupported file extension '{ext}'. " From 59af2a864828091a4bef0bdcf87b9f1c31cd5920 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Fri, 10 Apr 2026 20:22:28 -0700 Subject: [PATCH 45/52] autofill the run's asset during import --- python/lib/sift_client/sift_types/run.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/lib/sift_client/sift_types/run.py b/python/lib/sift_client/sift_types/run.py index 22d22c556..77de6e736 100644 --- a/python/lib/sift_client/sift_types/run.py +++ b/python/lib/sift_client/sift_types/run.py @@ -143,7 +143,8 @@ def import_data( """Import data from a file into this run. Convenience method that calls ``client.data_imports.import_from_path`` - with this run pre-filled. + with this run pre-filled. If the run has exactly one asset, + ``asset`` is inferred automatically. Args: file_path: Path to the local file to import. @@ -155,6 +156,9 @@ def import_data( Returns: A ``Job`` handle for the pending import. 
""" + if asset is None and len(self.asset_ids) == 1: + asset = self.asset_ids[0] + return self.client.data_import.import_from_path( file_path, asset=asset, From f999e9752e54c843f92d582bd7fff15f1ed461a8 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Fri, 10 Apr 2026 20:35:02 -0700 Subject: [PATCH 46/52] updated docstrings and fixed run import to infer asset object --- python/lib/sift_client/sift_types/data_import.py | 5 +---- python/lib/sift_client/sift_types/run.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index e07d8eab6..0700463b5 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -108,8 +108,6 @@ class CsvTimeColumn(TimeColumnBase): Attributes: column: The 1-indexed column number of the time column. - format: The time format used in this column. - relative_start_time: Required when using a relative time format. """ column: int @@ -250,8 +248,7 @@ class ParquetTimeColumn(TimeColumnBase): Attributes: path: The column path in the Parquet schema (e.g. ``"timestamp"``). - format: The time format used in this column. - relative_start_time: Required when using a relative time format. + format: The time format. Defaults to ``ABSOLUTE_UNIX_NANOSECONDS``. """ path: str diff --git a/python/lib/sift_client/sift_types/run.py b/python/lib/sift_client/sift_types/run.py index 77de6e736..ec6690896 100644 --- a/python/lib/sift_client/sift_types/run.py +++ b/python/lib/sift_client/sift_types/run.py @@ -157,7 +157,7 @@ def import_data( A ``Job`` handle for the pending import. 
""" if asset is None and len(self.asset_ids) == 1: - asset = self.asset_ids[0] + asset = self.client.assets.get(asset_id=self.asset_ids[0]) return self.client.data_import.import_from_path( file_path, From e074a4cbad533d9c4561f12531f239b2b97b1c56 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Fri, 10 Apr 2026 20:53:10 -0700 Subject: [PATCH 47/52] refactor show_progress helper to the base class --- python/lib/sift_client/resources/_base.py | 4 ++++ python/lib/sift_client/resources/data_imports.py | 8 ++------ python/lib/sift_client/resources/jobs.py | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/lib/sift_client/resources/_base.py b/python/lib/sift_client/resources/_base.py index 890cf2f44..33d7e2659 100644 --- a/python/lib/sift_client/resources/_base.py +++ b/python/lib/sift_client/resources/_base.py @@ -3,6 +3,7 @@ from abc import ABC from typing import TYPE_CHECKING, Any, TypeVar +from sift_client._internal.util.file import resolve_show_progress from sift_client.sift_types.tag import Tag from sift_client.util import cel_utils as cel @@ -34,6 +35,9 @@ def grpc_client(self) -> GrpcClient: def rest_client(self) -> RestClient: return self.client.rest_client + def _show_progress(self) -> bool: + return resolve_show_progress(is_sync=getattr(self, "_is_sync", False)) + def _apply_client_to_instance(self, instance: T) -> T: instance._apply_client_to_instance(self.client) return instance diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py index 736fadb36..3395c4909 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -5,11 +5,7 @@ from sift_client._internal.low_level_wrappers.data_imports import DataImportsLowLevelClient from sift_client._internal.util.executor import run_sync_function -from sift_client._internal.util.file import ( - extract_parquet_footer, - resolve_show_progress, - upload_file, -) +from 
sift_client._internal.util.file import extract_parquet_footer, upload_file from sift_client.resources._base import ResourceBase from sift_client.sift_types.asset import Asset from sift_client.sift_types.channel import ChannelDataType @@ -154,7 +150,7 @@ async def import_from_path( await _prepare_parquet_config(config, path) if show_progress is None: - show_progress = resolve_show_progress(is_sync=getattr(self, "_is_sync", False)) + show_progress = self._show_progress() _, upload_url = await self._low_level_client.create_from_upload(config) diff --git a/python/lib/sift_client/resources/jobs.py b/python/lib/sift_client/resources/jobs.py index 5e2bbdf9b..5a9eb38c6 100644 --- a/python/lib/sift_client/resources/jobs.py +++ b/python/lib/sift_client/resources/jobs.py @@ -11,7 +11,7 @@ from sift_client._internal.low_level_wrappers.jobs import JobsLowLevelClient from sift_client._internal.util.executor import run_sync_function -from sift_client._internal.util.file import download_file, extract_zip, resolve_show_progress +from sift_client._internal.util.file import download_file, extract_zip from sift_client.resources._base import ResourceBase from sift_client.sift_types.job import DataExportStatusDetails, Job, JobStatus, JobType from sift_client.util import cel_utils as cel @@ -193,7 +193,7 @@ async def wait_until_complete( """ job_id = job._id_or_error if isinstance(job, Job) else job if show_progress is None: - show_progress = resolve_show_progress(is_sync=getattr(self, "_is_sync", False)) + show_progress = self._show_progress() start = time.monotonic() with alive_bar( @@ -256,7 +256,7 @@ async def wait_and_download( """ job_id = job._id_or_error if isinstance(job, Job) else job if show_progress is None: - show_progress = resolve_show_progress(is_sync=getattr(self, "_is_sync", False)) + show_progress = self._show_progress() completed_job = await self.wait_until_complete( job=job_id, From 78703e54cc66c2b9243f70d60d8a31d638507068 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu 
Date: Fri, 10 Apr 2026 21:02:52 -0700 Subject: [PATCH 48/52] add client binding test for data imports --- .../sift_client/_tests/resources/test_data_imports.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/lib/sift_client/_tests/resources/test_data_imports.py b/python/lib/sift_client/_tests/resources/test_data_imports.py index de82148c9..a2cabf953 100644 --- a/python/lib/sift_client/_tests/resources/test_data_imports.py +++ b/python/lib/sift_client/_tests/resources/test_data_imports.py @@ -4,6 +4,7 @@ import pytest +from sift_client.resources import DataImportAPI, DataImportAPIAsync from sift_client.resources.data_imports import _resolve_data_type_key from sift_client.sift_types.channel import ChannelDataType from sift_client.sift_types.data_import import ( @@ -23,6 +24,14 @@ ) +@pytest.mark.integration +def test_client_binding(sift_client): + assert sift_client.data_import + assert isinstance(sift_client.data_import, DataImportAPI) + assert sift_client.async_.data_import + assert isinstance(sift_client.async_.data_import, DataImportAPIAsync) + + @pytest.fixture def csv_config(): return CsvImportConfig( From d736d1e3b9c82a24df1a01d6a314074711532555 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Fri, 10 Apr 2026 21:17:20 -0700 Subject: [PATCH 49/52] add a base class for data columns shared by csv, parquet, and hdf5 --- .../lib/sift_client/sift_types/data_import.py | 84 ++++++++----------- 1 file changed, 34 insertions(+), 50 deletions(-) diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index 0700463b5..c62a4c0e8 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -103,37 +103,16 @@ def _check_relative_start_time(self) -> TimeColumnBase: return self -class CsvTimeColumn(TimeColumnBase): - """Time column configuration for CSV imports. +class DataColumnBase(BaseModel, ABC): + """Base class for data column definitions. 
Attributes: - column: The 1-indexed column number of the time column. - """ - - column: int - - def _to_proto(self) -> CsvTimeColumnProto: - proto = CsvTimeColumnProto( - column_number=self.column, - format=self.format.value, - ) - if self.relative_start_time is not None: - proto.relative_start_time.CopyFrom(to_pb_timestamp(self.relative_start_time)) - return proto - - -class CsvDataColumn(BaseModel): - """A data column definition for CSV imports. - - Attributes: - column: The 1-indexed column number. name: Channel name. data_type: The data type of the channel values. units: Optional units string. description: Optional channel description. """ - column: int name: str data_type: ChannelDataType units: str = "" @@ -154,6 +133,35 @@ class ImportConfigBase(BaseModel, ABC): run_id: str | None = None +class CsvTimeColumn(TimeColumnBase): + """Time column configuration for CSV imports. + + Attributes: + column: The 1-indexed column number of the time column. + """ + + column: int + + def _to_proto(self) -> CsvTimeColumnProto: + proto = CsvTimeColumnProto( + column_number=self.column, + format=self.format.value, + ) + if self.relative_start_time is not None: + proto.relative_start_time.CopyFrom(to_pb_timestamp(self.relative_start_time)) + return proto + + +class CsvDataColumn(DataColumnBase): + """A data column definition for CSV imports. + + Attributes: + column: The 1-indexed column number. + """ + + column: int + + class CsvImportConfig(ImportConfigBase): """Configuration for importing a CSV file. @@ -281,22 +289,14 @@ def _from_proto(cls, proto: ParquetTimeColumnProto) -> ParquetTimeColumn: ) -class ParquetDataColumn(BaseModel): +class ParquetDataColumn(DataColumnBase): """A data column definition for Parquet flat dataset imports. Attributes: path: The column path in the Parquet schema. - name: Channel name. - data_type: The data type of the channel values. - units: Optional units string. - description: Optional channel description. 
""" path: str - name: str - data_type: ChannelDataType - units: str = "" - description: str = "" class ParquetFlatDatasetImportConfig(ImportConfigBase): @@ -395,22 +395,14 @@ def _from_proto( ) -class ParquetSingleChannelConfig(BaseModel): +class ParquetSingleChannelConfig(DataColumnBase): """Configuration for a single-channel Parquet single-channel-per-row import. Attributes: data_path: The column path containing channel data. - name: Channel name. - data_type: The data type of the channel values. - units: Optional units string. - description: Optional channel description. """ data_path: str - name: str - data_type: ChannelDataType - units: str = "" - description: str = "" class ParquetMultiChannelConfig(BaseModel): @@ -587,7 +579,7 @@ def _to_proto(self) -> TDMSConfigProto: return proto -class Hdf5DataColumn(BaseModel): +class Hdf5DataColumn(DataColumnBase): """A dataset mapping for HDF5 imports. Each entry maps a time/value dataset pair to a channel. @@ -597,10 +589,6 @@ class Hdf5DataColumn(BaseModel): time_index: Column index within the time dataset. Defaults to 0. value_dataset: HDF5 path to the value dataset. value_index: Column index within the value dataset. Defaults to 0. - name: Channel name. - data_type: The data type of the channel values. - units: Optional units string. - description: Optional channel description. time_field: For compound dataset types, the field name to use for time. value_field: For compound dataset types, the field name to use for value. 
""" @@ -609,10 +597,6 @@ class Hdf5DataColumn(BaseModel): time_index: int = 0 value_dataset: str value_index: int = 0 - name: str - data_type: ChannelDataType - units: str = "" - description: str = "" time_field: str | None = None value_field: str | None = None From 3cc3607071e82ac9b1843ea257670677c9fd68cd Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Fri, 10 Apr 2026 21:32:21 -0700 Subject: [PATCH 50/52] update the upload_file progress bar to be more detailed --- python/lib/sift_client/_internal/util/file.py | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/python/lib/sift_client/_internal/util/file.py b/python/lib/sift_client/_internal/util/file.py index dc2cc1999..5e0269110 100644 --- a/python/lib/sift_client/_internal/util/file.py +++ b/python/lib/sift_client/_internal/util/file.py @@ -17,6 +17,23 @@ from sift_client.transport.rest_transport import RestClient +class _ProgressReader: + """Wraps a file object to report read progress to an alive_bar callback.""" + + def __init__(self, file_object, progress_bar): + self._file_object = file_object + self._progress_bar = progress_bar + + def read(self, size=-1): + chunk = self._file_object.read(size) + if chunk: + self._progress_bar(len(chunk)) + return chunk + + def __getattr__(self, name): + return getattr(self._file_object, name) + + def resolve_show_progress(*, is_sync: bool) -> bool: """Resolve the show_progress setting from the global config. @@ -50,19 +67,22 @@ def upload_file( Raises: ValueError: If the upload request fails. 
""" + file_size = file_path.stat().st_size + with alive_bar( + file_size, title=f"Upload [{file_path.name}]", - bar=None, spinner="dots_waves", spinner_length=7, - monitor=False, - stats=False, + unit="B", + scale="SI", disable=not show_progress, - ): - with open(file_path, "rb") as f: + ) as bar: + with open(file_path, "rb") as file: + wrapped = _ProgressReader(file, bar) response = rest_client.post( signed_url, - data=f, + data=wrapped, headers={"Content-Disposition": f'attachment; filename="{file_path.name}"'}, ) response.raise_for_status() From 8b6539070b5abfd00505bc95d097c588012ac1d0 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Mon, 13 Apr 2026 10:39:18 -0700 Subject: [PATCH 51/52] removed ch10 references, no more support --- .../low_level_wrappers/data_imports.py | 3 --- .../lib/sift_client/resources/data_imports.py | 14 ++++-------- .../lib/sift_client/sift_types/data_import.py | 22 ------------------- 3 files changed, 4 insertions(+), 35 deletions(-) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py index 34315ac6b..b0219124b 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data_imports.py @@ -14,7 +14,6 @@ from sift_client._internal.low_level_wrappers.base import LowLevelClientBase from sift_client.sift_types.data_import import ( - Ch10ImportConfig, CsvImportConfig, Hdf5ImportConfig, ImportConfig, @@ -41,8 +40,6 @@ def _set_config_on_request( config, (ParquetFlatDatasetImportConfig, ParquetSingleChannelPerRowImportConfig) ): request.parquet_config.CopyFrom(config._to_proto()) - elif isinstance(config, Ch10ImportConfig): - request.ch10_config.CopyFrom(config._to_proto()) elif isinstance(config, TdmsImportConfig): request.tdms_config.CopyFrom(config._to_proto()) elif isinstance(config, Hdf5ImportConfig): diff --git a/python/lib/sift_client/resources/data_imports.py 
b/python/lib/sift_client/resources/data_imports.py index 3395c4909..f40876234 100644 --- a/python/lib/sift_client/resources/data_imports.py +++ b/python/lib/sift_client/resources/data_imports.py @@ -11,7 +11,6 @@ from sift_client.sift_types.channel import ChannelDataType from sift_client.sift_types.data_import import ( EXTENSION_TO_DATA_TYPE_KEY, - Ch10ImportConfig, CsvImportConfig, DataTypeKey, ImportConfig, @@ -63,7 +62,7 @@ async def import_from_path( When ``config`` is omitted the file format is auto-detected via ``detect_config`` (CSV and Parquet only). For other formats - (TDMS, HDF5, CH10), ``config`` must be provided. + (TDMS and HDF5), ``config`` must be provided. When ``asset`` is provided it overrides the config value; otherwise the config's ``asset_name`` is used. If neither ``run`` nor ``run_name`` is provided (and none is @@ -134,14 +133,10 @@ async def import_from_path( if run is not None and run_name is not None: raise ValueError("'run' and 'run_name' are mutually exclusive.") if run is not None: - if isinstance(config, Ch10ImportConfig): - raise ValueError( - "'run' is not supported for Ch10ImportConfig. Use 'run_name' instead." - ) config.run_id = run._id_or_error if isinstance(run, Run) else run elif run_name is not None: config.run_name = run_name - elif not config.run_name and (isinstance(config, Ch10ImportConfig) or not config.run_id): + elif not config.run_name and not config.run_id: config.run_name = path.name if isinstance( @@ -204,9 +199,8 @@ async def detect_config( provided. Only CSV and Parquet files are currently supported for auto-detection. - For other formats (TDMS, HDF5, CH10), create the config manually - using ``TdmsImportConfig``, ``Hdf5ImportConfig``, or - ``Ch10ImportConfig``. + For other formats (TDMS, HDF5), create the config manually + using ``TdmsImportConfig`` or ``Hdf5ImportConfig``. For CSV files, the server scans the first two rows for an optional JSON metadata row. 
Row 1 is checked first; row 2 is checked only diff --git a/python/lib/sift_client/sift_types/data_import.py b/python/lib/sift_client/sift_types/data_import.py index c62a4c0e8..62208a678 100644 --- a/python/lib/sift_client/sift_types/data_import.py +++ b/python/lib/sift_client/sift_types/data_import.py @@ -8,7 +8,6 @@ from pydantic import BaseModel, model_validator from sift.common.type.v1.channel_config_pb2 import ChannelConfig as ChannelConfigProto from sift.data_imports.v2.data_imports_pb2 import ( - DATA_TYPE_KEY_CH10, DATA_TYPE_KEY_CSV, DATA_TYPE_KEY_HDF5, DATA_TYPE_KEY_PARQUET_FLATDATASET, @@ -19,7 +18,6 @@ PARQUET_COMPLEX_TYPES_IMPORT_MODE_IGNORE, PARQUET_COMPLEX_TYPES_IMPORT_MODE_STRING, ) -from sift.data_imports.v2.data_imports_pb2 import Ch10Config as Ch10ConfigProto from sift.data_imports.v2.data_imports_pb2 import CsvConfig as CsvConfigProto from sift.data_imports.v2.data_imports_pb2 import CsvTimeColumn as CsvTimeColumnProto from sift.data_imports.v2.data_imports_pb2 import Hdf5Config as Hdf5ConfigProto @@ -70,14 +68,12 @@ class DataTypeKey(Enum): PARQUET_FLATDATASET = DATA_TYPE_KEY_PARQUET_FLATDATASET PARQUET_SINGLE_CHANNEL_PER_ROW = DATA_TYPE_KEY_PARQUET_SINGLE_CHANNEL_PER_ROW TDMS = DATA_TYPE_KEY_TDMS - CH10 = DATA_TYPE_KEY_CH10 HDF5 = DATA_TYPE_KEY_HDF5 EXTENSION_TO_DATA_TYPE_KEY: dict[str, DataTypeKey] = { ".csv": DataTypeKey.CSV, ".tdms": DataTypeKey.TDMS, - ".ch10": DataTypeKey.CH10, ".h5": DataTypeKey.HDF5, ".hdf5": DataTypeKey.HDF5, } @@ -537,23 +533,6 @@ def _from_proto( ) -class Ch10ImportConfig(ImportConfigBase): - """Configuration for importing a CH10 file. - - Attributes: - scale_values: Whether to apply EU (engineering unit) scaling to channel values. 
- """ - - scale_values: bool = False - - def _to_proto(self) -> Ch10ConfigProto: - return Ch10ConfigProto( - asset_name=self.asset_name, - run_name=self.run_name or "", - scale_values=self.scale_values, - ) - - class TdmsImportConfig(ImportConfigBase): """Configuration for importing a TDMS file. @@ -655,7 +634,6 @@ def _to_proto(self) -> Hdf5ConfigProto: CsvImportConfig, ParquetFlatDatasetImportConfig, ParquetSingleChannelPerRowImportConfig, - Ch10ImportConfig, TdmsImportConfig, Hdf5ImportConfig, ] From 542b5fa7212dc0be6f53772448c5a6f143a158c9 Mon Sep 17 00:00:00 2001 From: Wei Qi Lu Date: Mon, 13 Apr 2026 10:48:35 -0700 Subject: [PATCH 52/52] sync stubs update --- .../_tests/resources/test_data_imports.py | 16 ---------------- .../resources/sync_stubs/__init__.pyi | 7 +++---- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/python/lib/sift_client/_tests/resources/test_data_imports.py b/python/lib/sift_client/_tests/resources/test_data_imports.py index a2cabf953..a3819cf1c 100644 --- a/python/lib/sift_client/_tests/resources/test_data_imports.py +++ b/python/lib/sift_client/_tests/resources/test_data_imports.py @@ -8,7 +8,6 @@ from sift_client.resources.data_imports import _resolve_data_type_key from sift_client.sift_types.channel import ChannelDataType from sift_client.sift_types.data_import import ( - Ch10ImportConfig, CsvDataColumn, CsvImportConfig, CsvTimeColumn, @@ -131,21 +130,6 @@ def test_absolute_time_does_not_require_start_time(self): assert col.relative_start_time is None -class TestCh10Config: - def test_to_proto(self): - config = Ch10ImportConfig(asset_name="my_asset", run_name="run1", scale_values=True) - proto = config._to_proto() - assert proto.asset_name == "my_asset" - assert proto.run_name == "run1" - assert proto.scale_values is True - - def test_to_proto_defaults(self): - config = Ch10ImportConfig(asset_name="my_asset") - proto = config._to_proto() - assert proto.run_name == "" - assert proto.scale_values is False - - class 
TestTdmsConfig: def test_to_proto(self): config = TdmsImportConfig( diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index d76dd4942..0e9d18b76 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -651,9 +651,8 @@ class DataImportAPI: provided. Only CSV and Parquet files are currently supported for auto-detection. - For other formats (TDMS, HDF5, CH10), create the config manually - using ``TdmsImportConfig``, ``Hdf5ImportConfig``, or - ``Ch10ImportConfig``. + For other formats (TDMS, HDF5), create the config manually + using ``TdmsImportConfig`` or ``Hdf5ImportConfig``. For CSV files, the server scans the first two rows for an optional JSON metadata row. Row 1 is checked first; row 2 is checked only @@ -732,7 +731,7 @@ class DataImportAPI: When ``config`` is omitted the file format is auto-detected via ``detect_config`` (CSV and Parquet only). For other formats - (TDMS, HDF5, CH10), ``config`` must be provided. + (TDMS and HDF5), ``config`` must be provided. When ``asset`` is provided it overrides the config value; otherwise the config's ``asset_name`` is used. If neither ``run`` nor ``run_name`` is provided (and none is