diff --git a/docs/explanations/manifest_csv.md b/docs/explanations/manifest_csv.md index e46d52f4a..e4b9dee45 100644 --- a/docs/explanations/manifest_csv.md +++ b/docs/explanations/manifest_csv.md @@ -1,6 +1,6 @@ # Manifest CSV -The manifest is a CSV file with file locations and metadata used to bulk upload and download files in Synapse. It is the standard manifest format used by `Project.sync_from_synapse`, `Project.sync_to_synapse`, `Folder.sync_from_synapse`, `Folder.sync_to_synapse`, the Synapse UI download cart, and the `synapse get-download-list` CLI command. +The manifest is a CSV file with file locations and metadata used to bulk upload and download files in Synapse. It is the standard manifest format used by `Project.sync_from_synapse`, `Project.sync_to_synapse`, `Folder.sync_from_synapse`, `Folder.sync_to_synapse`, `Project.generate_sync_manifest`, `Folder.generate_sync_manifest`, the Synapse UI download cart, and the `synapse get-download-list` CLI command. !!! note This CSV manifest replaces the legacy TSV manifest produced by `synapseutils.syncFromSynapse`. The `syncFromSynapse` and `syncToSynapse` utility functions are deprecated and will be removed in v5.0.0. Use `Project.sync_from_synapse` / `Folder.sync_from_synapse` and `Project.sync_to_synapse` / `Folder.sync_to_synapse` instead. See the [legacy TSV manifest documentation](manifest_tsv.md) for details on the old format. @@ -81,8 +81,6 @@ This is an annotation with 3 values: |-----------------|----------|--------------| | /path/file1.txt | syn1243 | "[a,b,c]" | - - ### Dates in the manifest file Dates within the manifest file will always be written as [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) format in UTC without milliseconds. For example: `2023-12-20T16:55:08Z`. @@ -93,11 +91,12 @@ Dates can be written in other formats specified in ISO 8601 and they will be rec The CSV manifest format is shared across multiple tools: -| Source | Filename | -|----------------------------------------------------------|---------------------------------| -| `Project.sync_from_synapse` / `Folder.sync_from_synapse` | manifest.csv | -| Synapse UI download cart | manifest.csv | -| CLI `synapse get-download-list` | `manifest_.csv` | +| Source | Filename | +|----------------------------------------------------------------------|---------------------------------| +| `Project.sync_from_synapse` / `Folder.sync_from_synapse` | manifest.csv | +| `Project.generate_sync_manifest` / `Folder.generate_sync_manifest` | user-specified `manifest_path` | +| Synapse UI download cart | manifest.csv | +| CLI `synapse get-download-list` | `manifest_.csv` | A manifest generated by any of these sources can be used as input to `sync_to_synapse`, provided the `path` column is present with valid local file paths. Manifests from the Synapse UI do not include a `path` column by default, so users must add it before uploading. @@ -113,7 +112,9 @@ A manifest generated by any of these sources can be used as input to `sync_to_sy - [Project.sync_from_synapse][synapseclient.models.Project.sync_from_synapse] - [Project.sync_to_synapse][synapseclient.models.Project.sync_to_synapse] +- [Project.generate_sync_manifest][synapseclient.models.Project.generate_sync_manifest] - [Folder.sync_from_synapse][synapseclient.models.Folder.sync_from_synapse] - [Folder.sync_to_synapse][synapseclient.models.Folder.sync_to_synapse] +- [Folder.generate_sync_manifest][synapseclient.models.Folder.generate_sync_manifest] - [Manifest TSV (legacy)](manifest_tsv.md) - [Managing custom metadata at scale](https://help.synapse.org/docs/Managing-Custom-Metadata-at-Scale.2004254976.html#ManagingCustomMetadataatScale-BatchUploadFileswithAnnotations) diff --git a/docs/reference/experimental/async/folder.md b/docs/reference/experimental/async/folder.md index 561c5ead8..d864d3cdc 100644 --- a/docs/reference/experimental/async/folder.md +++ b/docs/reference/experimental/async/folder.md @@ -17,6 +17,7 @@ at your own risk. - walk_async - sync_from_synapse_async - sync_to_synapse_async + - generate_sync_manifest_async - flatten_file_list - map_directory_to_all_contained_files - get_permissions_async diff --git a/docs/reference/experimental/async/project.md b/docs/reference/experimental/async/project.md index d17e8c4ef..6c948268b 100644 --- a/docs/reference/experimental/async/project.md +++ b/docs/reference/experimental/async/project.md @@ -16,6 +16,7 @@ at your own risk. - walk_async - sync_from_synapse_async - sync_to_synapse_async + - generate_sync_manifest_async - flatten_file_list - map_directory_to_all_contained_files - get_permissions_async diff --git a/docs/reference/experimental/sync/folder.md b/docs/reference/experimental/sync/folder.md index cb1b67579..9246f0cf4 100644 --- a/docs/reference/experimental/sync/folder.md +++ b/docs/reference/experimental/sync/folder.md @@ -28,6 +28,7 @@ at your own risk. - walk - sync_from_synapse - sync_to_synapse + - generate_sync_manifest - flatten_file_list - map_directory_to_all_contained_files - get_permissions diff --git a/docs/reference/experimental/sync/project.md b/docs/reference/experimental/sync/project.md index ee55c9e3b..3df9de76b 100644 --- a/docs/reference/experimental/sync/project.md +++ b/docs/reference/experimental/sync/project.md @@ -27,6 +27,7 @@ at your own risk. - walk - sync_from_synapse - sync_to_synapse + - generate_sync_manifest - flatten_file_list - map_directory_to_all_contained_files - get_permissions diff --git a/docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py b/docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py index fa3c1ffab..7d7d6f45d 100644 --- a/docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py +++ b/docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py @@ -2,6 +2,7 @@ Here is where you'll find the code for the uploading data in bulk tutorial. """ +# --8<-- [start:imports_and_constants] import pandas as pd import synapseclient @@ -14,34 +15,26 @@ DIRECTORY_FOR_MY_PROJECT = "test_folder" # This should exist with your files in it PATH_TO_MANIFEST_FILE = "test_manifest.csv" # This doesn't need to exist yet SYNAPSE_PROJECT_ID = "" # Put your Synapse project ID here. This is the project where you want to upload your data. +project = Project(id=SYNAPSE_PROJECT_ID) +# --8<-- [end:imports_and_constants] -# TODO switch to using new version of synapseutils/sync.py.generate_sync_manifest -# https://sagebionetworks.jira.com/browse/SYNPY-1809 - +# --8<-- [start:generate_manifest] # Step 2: Create a manifest CSV file with the paths to the files and their parent folders # Note: When this command is run it will re-create your directory structure within # Synapse. Be aware of this before running this command. # If folders with the exact names already exists in Synapse, those folders will be used. - - -# old function generates a TSV -from synapseutils import generate_sync_manifest - -generate_sync_manifest( - syn=syn, +project.generate_sync_manifest( directory_path=DIRECTORY_FOR_MY_PROJECT, - parent_id=SYNAPSE_PROJECT_ID, manifest_path=PATH_TO_MANIFEST_FILE, ) -# reformat the manifest file to work with sync_to_synapse -manifest_df = pd.read_csv(PATH_TO_MANIFEST_FILE, sep="\t") -manifest_df.rename(columns={"parent": "parentId"}, inplace=True) -manifest_df.to_csv(PATH_TO_MANIFEST_FILE, index=False) +# --8<-- [end:generate_manifest] +# --8<-- [start:sync_to_synapse] # Step 3: After generating the manifest file, we can upload the data in bulk -project = Project(id=SYNAPSE_PROJECT_ID) project.sync_to_synapse(manifest_path=PATH_TO_MANIFEST_FILE, send_messages=False) +# --8<-- [end:sync_to_synapse] +# --8<-- [start:add_annotation] # Step 4: Let's add an annotation to our manifest file # Pandas is a powerful data manipulation library in Python, although it is not required # for this tutorial, it is used here to demonstrate how you can manipulate the manifest @@ -57,7 +50,9 @@ df.to_csv(PATH_TO_MANIFEST_FILE, index=False) project.sync_to_synapse(manifest_path=PATH_TO_MANIFEST_FILE, send_messages=False) +# --8<-- [end:add_annotation] +# --8<-- [start:add_provenance] # Step 5: Let's create an Activity/Provenance # First let's find the row in the CSV we want to update. This code finds the row number # that we would like to update. @@ -86,3 +81,4 @@ df.to_csv(PATH_TO_MANIFEST_FILE, index=False) project.sync_to_synapse(manifest_path=PATH_TO_MANIFEST_FILE, send_messages=False) +# --8<-- [end:add_provenance] diff --git a/docs/tutorials/python/upload_data_in_bulk.md b/docs/tutorials/python/upload_data_in_bulk.md index be61524bb..7fc6b13d2 100644 --- a/docs/tutorials/python/upload_data_in_bulk.md +++ b/docs/tutorials/python/upload_data_in_bulk.md @@ -57,18 +57,18 @@ tools to open and manipulate CSV files. First let's set up some constants we'll use in this script ```python -{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=5-16} +--8<-- "docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py:imports_and_constants" ``` ## 2. Create a manifest CSV file to upload data in bulk -We use `synapseutils.generate_sync_manifest` to walk our local directory and produce a -manifest that maps each file to the correct parent folder in Synapse (creating folders -as needed). The output is a TSV with a `parent` column, so we convert it to CSV and -rename `parent` to `parentId` for use with `sync_to_synapse`. +We call `Project.generate_sync_manifest` on the project we want to mirror into. +It walks our local directory and produces a CSV manifest that maps each file to +the correct parent folder in Synapse (creating folders as needed). The output +is ready to hand directly to `sync_to_synapse`. ```python -{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=20-39} +--8<-- "docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py:generate_manifest" ```
@@ -89,7 +89,7 @@ path,parentId ## 3. Upload the data in bulk ```python -{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=41-43} +--8<-- "docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py:sync_to_synapse" ``` @@ -115,7 +115,7 @@ you are not comfortable with pandas you may use any tool that can open and manip CSV files such as Excel or Google Sheets. ```python -{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=45-59} +--8<-- "docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py:add_annotation" ``` Now that you have uploaded and annotated your files you'll be able to inspect your data @@ -137,7 +137,7 @@ Synapse. Additionally we'll link off to a sample URL that describes a process th may have executed to generate the file. ```python -{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=61-88} +--8<-- "docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py:add_provenance" ``` After running this code we may again inspect the synapse web UI. In this screenshot i've @@ -156,14 +156,14 @@ navigated to the Files tab and selected the file that we added a Provenance reco Click to show me ```python -{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!} +--8<-- "docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py" ```
## References used in this tutorial - [syn.login][synapseclient.Synapse.login] -- [synapseutils.generate_sync_manifest][] +- [Project.generate_sync_manifest][synapseclient.models.mixins.StorableContainer.generate_sync_manifest] - [Project.sync_to_synapse][synapseclient.models.mixins.StorableContainer.sync_to_synapse] - [Manifest CSV format](../../explanations/manifest_csv.md) - [Activity/Provenance](../../explanations/domain_models_of_synapse.md#activityprovenance) diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py index 54123f89b..ba375653f 100644 --- a/synapseclient/models/mixins/storable_container.py +++ b/synapseclient/models/mixins/storable_container.py @@ -50,6 +50,7 @@ ) from synapseclient.models.services.manifest import ( generate_manifest_csv, + generate_sync_manifest, read_manifest_for_upload, upload_sync_files, ) @@ -702,6 +703,80 @@ async def main(): progress_bar.close() return uploaded_files + @otel_trace_method( + method_to_trace_name=lambda self, **kwargs: f"{self.__class__.__name__}_generate_sync_manifest: {self.id}" + ) + async def generate_sync_manifest_async( + self: Self, + directory_path: str, + manifest_path: str, + *, + synapse_client: Synapse | None = None, + ) -> None: + """Walk a local directory, mirror its folder hierarchy under this + container in Synapse, and write a CSV manifest ready for + [sync_to_synapse_async][synapseclient.models.mixins.StorableContainer.sync_to_synapse_async]. + + The manifest has two columns: path (absolute, symlink-resolved) and + parentId (the Synapse ID of the file's containing folder). Existing + Synapse folders with matching names and parents are reused. Directory + symlinks inside directory_path are not followed; file symlinks record + the symlink path and upload the target's contents. Zero-byte files are + skipped with a warning — Synapse rejects empty files. I/O errors during + walk are logged and skipped; an empty source directory produces a + warning and a header-only manifest. + + Arguments: + directory_path: Path to the local directory to be pushed to + Synapse. + manifest_path: Path where the generated manifest CSV will be + written. + synapse_client: If not passed in and caching was not disabled by + Synapse.allow_client_caching(False) this will use the last + created instance from the Synapse class constructor. + + Raises: + ValueError: If this container's id is None, or if directory_path + does not exist or is not a directory, or if this container's + id exists in Synapse but is not a Folder or Project. + SynapseHTTPError: If this container's id does not exist in + Synapse. + + Example: Generate a manifest and upload the files + Mirror ./my_data under a Synapse project and then upload it. + + ```python + import asyncio + from synapseclient import Synapse + from synapseclient.models import Project + + async def main(): + syn = Synapse() + syn.login() + + project = Project(id="syn12345") + await project.generate_sync_manifest_async( + directory_path="./my_data", + manifest_path="manifest.csv", + ) + await project.sync_to_synapse_async(manifest_path="manifest.csv") + + asyncio.run(main()) + ``` + """ + if self.id is None: + raise ValueError( + f"Cannot generate a sync manifest for a {type(self).__name__}" + " that has not been stored in Synapse. Set id on this" + " container (or store it) first." + ) + await generate_sync_manifest( + directory_path=directory_path, + parent_id=self.id, + manifest_path=manifest_path, + synapse_client=synapse_client, + ) + def flatten_file_list(self) -> List["File"]: """ Recursively loop over all of the already retrieved files and folders and return diff --git a/synapseclient/models/protocols/storable_container_protocol.py b/synapseclient/models/protocols/storable_container_protocol.py index c86a099f8..79d50c788 100644 --- a/synapseclient/models/protocols/storable_container_protocol.py +++ b/synapseclient/models/protocols/storable_container_protocol.py @@ -300,3 +300,59 @@ def sync_to_synapse( ``` """ return [] + + def generate_sync_manifest( + self: Self, + directory_path: str, + manifest_path: str, + *, + synapse_client: Synapse | None = None, + ) -> None: + """Walk a local directory, mirror its folder hierarchy under this + container in Synapse, and write a CSV manifest ready for + [sync_to_synapse][synapseclient.models.mixins.StorableContainer.sync_to_synapse]. + + The manifest has two columns: path (absolute, symlink-resolved) and + parentId (the Synapse ID of the file's containing folder). Existing + Synapse folders with matching names and parents are reused. Directory + symlinks inside directory_path are not followed; file symlinks record + the symlink path and upload the target's contents. Zero-byte files are + skipped with a warning — Synapse rejects empty files. I/O errors during + walk are logged and skipped; an empty source directory produces a + warning and a header-only manifest. + + Arguments: + directory_path: Path to the local directory to be pushed to + Synapse. + manifest_path: Path where the generated manifest CSV will be + written. + synapse_client: If not passed in and caching was not disabled by + Synapse.allow_client_caching(False) this will use the last + created instance from the Synapse class constructor. + + Raises: + ValueError: If this container's id is None, or if directory_path + does not exist or is not a directory, or if this container's + id exists in Synapse but is not a Folder or Project. + SynapseHTTPError: If this container's id does not exist in + Synapse. + + Example: Generate a manifest and upload the files + Mirror ./my_data under a Synapse project and then upload it. + + ```python + from synapseclient import Synapse + from synapseclient.models import Project + + syn = Synapse() + syn.login() + + project = Project(id="syn12345") + project.generate_sync_manifest( + directory_path="./my_data", + manifest_path="manifest.csv", + ) + project.sync_to_synapse(manifest_path="manifest.csv") + ``` + """ + return None diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index bd81bde6c..3647307da 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -1,4 +1,5 @@ -"""Services for reading and writing Synapse manifest CSV files. +"""Services for reading and generating Synapse manifest CSV files used to +drive bulk upload via Project.sync_to_synapse / Folder.sync_to_synapse. This includes reading a manifest CSV file and preparing it for upload, as well as writing a manifest CSV file from a list of File entities. @@ -10,11 +11,12 @@ import asyncio import csv import datetime +import functools import io import os import re from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Iterable, NamedTuple, Union +from typing import TYPE_CHECKING, Any, Iterable, NamedTuple, TypedDict, Union from synapseclient import Synapse from synapseclient.core import utils @@ -32,14 +34,14 @@ test_import_pandas, topolgical_sort, ) -from synapseclient.operations.factory_operations import FileOptions -from synapseclient.operations.factory_operations import get_async as factory_get_async +from synapseclient.operations.factory_operations import FileOptions, get_async if TYPE_CHECKING: from pandas import DataFrame, Series from synapseclient.models import UsedEntity, UsedURL from synapseclient.models.file import File + from synapseclient.models.folder import Folder MANIFEST_CSV_FILENAME = "manifest.csv" @@ -708,7 +710,7 @@ async def _check_one(syn_id: str) -> None: if not syn_id: return try: - container = await factory_get_async( + container = await get_async( synapse_id=syn_id, file_options=FileOptions(download_file=False), synapse_client=syn, @@ -1528,3 +1530,378 @@ async def _resolve_local_file_provenance( f"The provenance record for file: {owner_path} is incorrect.\n" f"Specifically {absolute_path} is not being uploaded and is not in Synapse." ) from e + + +GENERATED_MANIFEST_COLUMNS = ["path", "parentId"] + + +class ManifestRow(TypedDict): + """Shape of a single row in the generated manifest CSV. + + Keys mirror GENERATED_MANIFEST_COLUMNS so the dict can be passed + directly to csv.DictWriter without conversion. + """ + + path: str + parentId: str + + +async def generate_sync_manifest( + directory_path: str, + parent_id: str, + manifest_path: str, + *, + synapse_client: Synapse | None = None, +) -> None: + """Walk a local directory, mirror its folder hierarchy under parent_id in + Synapse, and write a CSV manifest ready for sync_to_synapse_async. + + The generated manifest has two columns: path (always an absolute, + symlink-resolved path — directory_path is resolved via os.path.realpath + before walking, so the manifest can be consumed from any working + directory and remains valid even if the original symlink is later + removed or re-pointed) and parentId (the Synapse ID of the file's + containing folder). Folders that already exist in Synapse with the same + name and parent are reused rather than re-created. + + Sibling folders at the same depth are created concurrently to reduce + latency on wide trees. Directories and files are traversed in sorted + order so manifest output is deterministic across runs and platforms. + Directory symlinks encountered inside directory_path are not followed. + File symlinks are not pruned: the symlink's path is recorded in the + manifest, and the underlying target's contents are uploaded when the + manifest is consumed. The root directory_path itself may be a symlink; + if so, it is resolved to its target and the target is walked. Zero-byte + files are skipped with a warning, since Synapse rejects empty files. + I/O errors raised by os.walk (for example, unreadable subdirectories) + are logged and skipped. If no uploadable files are found under + directory_path, a warning is logged and a header-only manifest is + written. If a folder store call fails, the exception propagates and no + manifest is written; any folders already created remain in Synapse and + will be reused on a retry. + + Arguments: + directory_path: Path to the local directory to be pushed to Synapse. + parent_id: Synapse ID of the Folder or Project to mirror the + directory hierarchy under. + manifest_path: Path where the generated manifest CSV will be written. + synapse_client: If not passed in and caching was not disabled by + Synapse.allow_client_caching(False) this will use the last + created instance from the Synapse class constructor. + + Returns: + None + + Raises: + ValueError: If the parent directory of manifest_path does not exist, + if manifest_path is an existing directory, if directory_path + does not exist or is not a directory, or if parent_id exists in + Synapse but is not a Folder or Project. + SynapseHTTPError: If parent_id does not exist in Synapse. + """ + client = Synapse.get_client(synapse_client=synapse_client) + + manifest_path = _expand_path(manifest_path) + manifest_parent = os.path.dirname(manifest_path) or "." + if not os.path.isdir(manifest_parent): + raise ValueError(f"Manifest output directory does not exist: {manifest_parent}") + if os.path.isdir(manifest_path): + raise ValueError( + f"Manifest output path is an existing directory, not a file: {manifest_path}" + ) + directory_path = await _resolve_and_validate_directory_path_async(directory_path) + await _validate_target_container_async(parent_id, client=client) + + rows = await _collect_manifest_rows_async(directory_path, parent_id, client) + + if not rows: + client.logger.warning( + f"No uploadable files found under {directory_path};" + " generated manifest contains only the header row." + ) + _write_manifest_csv(manifest_path, rows) + + +async def _resolve_and_validate_directory_path_async(directory_path: str) -> str: + """Resolve symlinks on directory_path and verify it is an existing + directory. + + Uses realpath (not abspath) so that if directory_path is itself a + symlink, the manifest records paths under the resolved target. That + keeps the manifest valid if the original symlink is later removed or + re-pointed. + + Arguments: + directory_path: Path to validate. + + Returns: + The realpath-resolved absolute directory path. + + Raises: + ValueError: If the resolved path does not exist or is not a + directory. + """ + directory_path = os.path.realpath(_expand_path(directory_path)) + if not os.path.isdir(directory_path): + raise ValueError(f"{directory_path} is not a directory or does not exist") + return directory_path + + +async def _validate_target_container_async(parent_id: str, client: Synapse) -> None: + """Verify that parent_id resolves to a Folder or Project in Synapse. + + Raises: + ValueError: If parent_id exists but is not a Folder or Project. + """ + container = await get_async( + synapse_id=parent_id, + file_options=FileOptions(download_file=False), + synapse_client=client, + ) + + from synapseclient.models import Folder, Project + + if not isinstance(container, (Folder, Project)): + raise ValueError(f"Container {parent_id} is not a Folder or Project") + + +async def _collect_manifest_rows_async( + directory_path: str, parent_id: str, client: Synapse +) -> list[ManifestRow]: + """Walk directory_path and produce manifest rows for every uploadable file. + + Orchestrates the layer between os.walk (filesystem traversal) and the + Synapse folder/file model. Called once by generate_sync_manifest, after + path validation and before _write_manifest_csv. + + Algorithm: + + 1. Initialize a local-to-Synapse parent map seeded with + directory_path → parent_id. Subfolder IDs are added as they are + created so descendants can look up their Synapse parent. + 2. Walk top-down with os.walk. Top-down order matters — a directory's + parent is always yielded (and registered) before the directory + itself is visited. Walk errors (permission denied, broken paths) + are logged via _log_walk_error rather than aborting traversal. + + For each directory: + + 3. Prune symlinks and sort dirnames in place via + _prune_symlinks_and_sort_dirnames so Synapse folders aren't created + for directories os.walk(followlinks=False) won't visit, and + descent is deterministic. + 4. Look up current_parent_id from the map (guaranteed present by the + top-down invariant). + 5. Create sibling folders concurrently via _create_child_folders_async + (asyncio.gather over Folder.store_async). Existing Synapse folders + with the same name and parent are reused, not duplicated. + 6. Register each new folder's Synapse ID in the map keyed by its + absolute local path so step 4 succeeds when os.walk descends into + it on a later iteration. + 7. Build manifest rows for files in the current directory via + _build_manifest_rows. Unreadable and zero-byte files are filtered + out with warnings inside that helper. + + Side effect vs return value: the return value is the flat list of + ManifestRow entries, but the side effect — creating the Synapse + folder hierarchy under parent_id to mirror the local tree — is what + makes the rows usable. Without it the parentId values in the rows + would point at folders that don't exist yet. + + Async because _create_child_folders_async hits the Synapse API and + parallelizes sibling-folder creation. The rest (walking, sorting, + building rows) is local I/O and would otherwise be sync. + + Arguments: + directory_path: Realpath-resolved local directory to walk. + parent_id: Synapse ID of the container that maps to directory_path. + client: Authenticated Synapse client. + + Returns: + A list of ManifestRow entries, one per uploadable file. + """ + rows: list[ManifestRow] = [] + # Step 1: seed the local-to-Synapse parent map with the root mapping. + local_to_synapse_parent: dict[str, str] = {directory_path: parent_id} + + # Step 2: walk the input dir. + for dirpath, dirnames, filenames in os.walk( + directory_path, onerror=functools.partial(_log_walk_error, client) + ): + # Step 3: prune symlinked dirs and sort in place + _prune_symlinks_and_sort_dirnames(dirnames, dirpath) + + # Step 4: look up the Synapse parent for the current local dir + current_parent_id = local_to_synapse_parent[dirpath] + + # Step 5: create sibling folders concurrently. + created = await _create_child_folders_async( + parent_id=current_parent_id, dirnames=dirnames, client=client + ) + # Step 6: register each new folder's Synapse ID. + for dirname, folder in created.items(): + local_to_synapse_parent[os.path.join(dirpath, dirname)] = folder.id + + # Step 7: build rows for files in this directory and accumulate. + rows.extend(_build_manifest_rows(dirpath, filenames, current_parent_id, client)) + + return rows + + +def _log_walk_error(client: Synapse, err: OSError) -> None: + """Turn an os.walk I/O error from fatal into logged-and-skipped. + + By default os.walk silently ignores any OSError it hits while listing a + directory (e.g., permission denied, vanished symlink target, dead mount + point). Silent skipping is dangerous during manifest generation because + the user would get an incomplete manifest with no indication that some + subtree was missed. os.walk accepts an onerror callback that, if + provided, is invoked with the OSError instead — which is what + _collect_manifest_rows_async wires up via functools.partial. + + This callback logs a warning through the Synapse client's logger naming + the offending path (err.filename) and the underlying error message, then + returns. Returning normally (rather than raising) is the contract that + tells os.walk to keep going, so traversal continues into the rest of the + tree. + + The client argument is first only so the caller can bind it via + functools.partial, leaving the (err: OSError) signature os.walk requires. + The client is needed solely to reach client.logger. + + Net effect: unreadable directories produce a visible warning but do not + abort manifest generation; readable parts of the tree still produce + manifest rows. This mirrors _build_manifest_rows, which also + warns-and-skips on unreadable or zero-byte files rather than failing the + whole run. + + Arguments: + client: Authenticated Synapse client, used only for its logger. + err: The OSError raised by os.walk while listing a directory. + """ + client.logger.warning( + f"Skipping unreadable path during manifest generation:" + f" {err.filename} ({err})" + ) + + +def _prune_symlinks_and_sort_dirnames(dirnames: list[str], dirpath: str) -> None: + """Prune symlinked subdirectories and sort the rest in place. + + Mutates dirnames in place — the documented os.walk hook for both + pruning the traversal and forcing deterministic descent order. + Symlinked subdirectories are dropped so we don't create Synapse + folders for directories whose contents os.walk (followlinks=False) + won't visit. + + Arguments: + dirnames: The dirnames list yielded by os.walk for the current dirpath. + dirpath: The current directory being walked. + """ + dirnames[:] = [d for d in dirnames if not os.path.islink(os.path.join(dirpath, d))] + dirnames.sort() + + +async def _create_child_folders_async( + parent_id: str, dirnames: list[str], client: Synapse +) -> dict[str, Folder]: + """Create sibling folders concurrently under a shared Synapse parent. + + Sibling folders have no ordering dependency on each other, so they are + gathered in a single batch rather than awaited one at a time. Each task + returns its own dirname alongside the resulting Folder so the caller's + name-to-Folder mapping does not depend on asyncio.gather preserving + submission order. + + Returns: + A dict mapping each input dirname to the Folder that was created or + reused for it. + """ + from synapseclient.models.folder import Folder + + semaphore = asyncio.Semaphore(max(client.max_threads * 2, 1)) + + # Each task carries its dirname through to the result so the caller can + # build a name-to-Folder mapping without relying on asyncio.gather's + # input-order guarantee. + # Pair-tagged results remain correct under any completion order. + async def _store(dirname: str) -> tuple[str, Folder]: + async with semaphore: + folder = await Folder(name=dirname, parent_id=parent_id).store_async( + synapse_client=client + ) + return dirname, folder + + pairs = await asyncio.gather(*[_store(d) for d in dirnames]) + return dict(pairs) + + +def _build_manifest_rows( + dirpath: str, + filenames: Iterable[str], + parent_id: str, + client: Synapse, +) -> list[ManifestRow]: + """Build manifest rows for the uploadable files in a single directory. + + Called once per directory by _collect_manifest_rows_async during the + os.walk traversal. All files in a single call share the same parent_id + (the Synapse folder corresponding to dirpath). Sync because everything + it does is local I/O with no Synapse API calls; the async parent only + awaits when creating Synapse folders, not when scanning files. + + Filenames are sorted before iteration so output is deterministic across + runs and platforms (os.walk does not guarantee filename order). Each + name is joined with dirpath to form an absolute filepath, then filtered + through _is_uploadable_file, which logs and drops unreadable files + (broken symlinks, permission errors) and zero-byte files. + + Arguments: + dirpath: Absolute directory path currently being walked. + filenames: Filenames yielded by os.walk for dirpath. + parent_id: Synapse ID of the folder that maps to dirpath. + client: Authenticated Synapse client, used only for logging. + + Returns: + A list of ManifestRow entries (path, parentId) ready to be written + into the manifest CSV (matching GENERATED_MANIFEST_COLUMNS), one + per uploadable file. + """ + rows: list[ManifestRow] = [] + for filename in sorted(filenames): + filepath = os.path.join(dirpath, filename) + if _is_uploadable_file(filepath, client): + rows.append({"path": filepath, "parentId": parent_id}) + return rows + + +def _write_manifest_csv(manifest_path: str, rows: list[ManifestRow]) -> None: + """Write generated manifest rows to a CSV at manifest_path.""" + with open(manifest_path, "w", encoding="utf-8", newline="") as fp: + writer = csv.DictWriter(fp, fieldnames=GENERATED_MANIFEST_COLUMNS) + writer.writeheader() + writer.writerows(rows) + + +def _is_uploadable_file(filepath: str, client: Synapse) -> bool: + """Return True if filepath can be included in a generated manifest. + + Logs a warning and returns False for files that cannot be uploaded: + unreadable files (broken symlinks, permission errors, races) and + zero-byte files (rejected by Synapse). + """ + try: + size = os.stat(filepath).st_size + except OSError as err: + client.logger.warning( + f"Skipping unreadable file during manifest generation:" + f" {filepath} ({err})" + ) + return False + if size == 0: + client.logger.warning( + f"Skipping zero-byte file (empty files cannot be" + f" uploaded to Synapse): {filepath}" + ) + return False + return True diff --git a/synapseutils/sync.py b/synapseutils/sync.py index 47f4a22a1..31c45f132 100644 --- a/synapseutils/sync.py +++ b/synapseutils/sync.py @@ -1559,11 +1559,28 @@ def _check_size_each_file(df): ) +@deprecated( + version="4.13.0", + reason=( + "To be removed in 5.0.0. Use Project.generate_sync_manifest or " + "Folder.generate_sync_manifest instead, which write a CSV manifest " + "compatible with the OOP Project.sync_to_synapse / " + "Folder.sync_to_synapse methods." + ), +) def generate_sync_manifest(syn, directory_path, parent_id, manifest_path) -> None: """Generate manifest for [syncToSynapse][synapseutils.sync.syncToSynapse] from a local directory. [Read more about the manifest file format](../../explanations/manifest_tsv/) + .. deprecated:: 4.13.0 + To be removed in 5.0.0. Use + [Project.generate_sync_manifest][synapseclient.models.mixins.StorableContainer.generate_sync_manifest] + or + [Folder.generate_sync_manifest][synapseclient.models.mixins.StorableContainer.generate_sync_manifest] + instead, which produce a CSV manifest (with a parentId column) that + works directly with Project.sync_to_synapse and Folder.sync_to_synapse. + Arguments: syn: A Synapse object with user's login, e.g. syn = synapseclient.login() directory_path: Path to local directory to be pushed to Synapse. diff --git a/tests/integration/synapseclient/models/async/test_storable_container_async.py b/tests/integration/synapseclient/models/async/test_storable_container_async.py index 2c54dc453..5b8af5266 100644 --- a/tests/integration/synapseclient/models/async/test_storable_container_async.py +++ b/tests/integration/synapseclient/models/async/test_storable_container_async.py @@ -1,14 +1,19 @@ """Integration tests for StorableContainer""" import csv +import os +import platform import uuid from pathlib import Path from typing import Callable +import pandas as pd import pytest +import pytest_asyncio import synapseclient.core.utils as utils from synapseclient import Synapse +from synapseclient.core.exceptions import SynapseHTTPError from synapseclient.models import File, Folder, Project from synapseclient.models.activity import UsedURL @@ -396,3 +401,170 @@ async def test_error_column_rows_skipped( # THEN only the valid row was uploaded assert len(uploaded_files) == 1 assert uploaded_files[0].name == unique_name + + +class TestGenerateSyncManifest: + """Integration tests for StorableContainer.generate_sync_manifest_async + against the live Synapse API.""" + + @pytest_asyncio.fixture(loop_scope="session", scope="function") + async def scope_folder( + self, + syn: Synapse, + project_model: Project, + schedule_for_cleanup: Callable[..., None], + request: pytest.FixtureRequest, + ) -> Folder: + """A fresh Folder under the worker-scoped project per test, so + assertions can reference the folder's full child state without + interference from sibling tests. A Folder is cheaper to create than + a Project while providing equivalent isolation for these tests. + """ + folder = await Folder( + name=f"{request.node.name}_{uuid.uuid4()}", + parent_id=project_model.id, + ).store_async(synapse_client=syn) + schedule_for_cleanup(folder) + return folder + + async def test_flat_directory_uses_parent_id( + self, + syn: Synapse, + scope_folder: Folder, + tmp_path: Path, + ) -> None: + """Flat directories should produce a manifest where every file points + directly at the container's id, and empty files should be skipped. + No Synapse folders should be created when there are no subdirectories + to mirror. + """ + # GIVEN a flat directory of non-empty files plus one empty file + src = tmp_path / "flat" + src.mkdir() + (src / "a.txt").write_text("alpha") + (src / "b.txt").write_text("bravo") + (src / "empty.txt").write_text("") + manifest = tmp_path / "manifest.csv" + + # WHEN I generate a sync manifest on the scope folder + await scope_folder.generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=syn, + ) + + # THEN the manifest only contains the non-empty files, all pointing + # at the scope folder as their parent, and paths are absolute + df = pd.read_csv(manifest) + assert list(df.columns) == ["path", "parentId"] + assert sorted(os.path.basename(p) for p in df["path"]) == ["a.txt", "b.txt"] + assert (df["parentId"] == scope_folder.id).all() + for path in df["path"]: + assert os.path.isabs(path) + + # AND no folders were created under the scope folder + await scope_folder.sync_from_synapse_async( + download_file=False, recursive=False, synapse_client=syn + ) + assert scope_folder.folders == [] + + async def test_nested_directory_creates_folders( + self, + syn: Synapse, + scope_folder: Folder, + tmp_path: Path, + ) -> None: + """Nested directory trees should create matching Synapse folders at + each level, and the manifest parentId for each file should be the ID + of the Synapse folder corresponding to the file's on-disk directory. + """ + # GIVEN a nested directory tree with sibling folders at the root and a + # deeper leaf folder + src = tmp_path / "root" + sibling_a = src / "sibling_a" + sibling_b = src / "sibling_b" + deep = sibling_a / "deep" + deep.mkdir(parents=True) + sibling_b.mkdir() + + (src / "root.txt").write_text("at root") + (sibling_a / "a.txt").write_text("sibling a") + (sibling_b / "b.txt").write_text("sibling b") + (deep / "deep.txt").write_text("deep file") + manifest = tmp_path / "manifest.csv" + + # WHEN I generate a sync manifest on the scope folder + await scope_folder.generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=syn, + ) + + # THEN each manifest row's parentId identifies the Synapse folder that + # contains the file on disk + df = pd.read_csv(manifest) + by_basename = { + os.path.basename(p): pid for p, pid in zip(df["path"], df["parentId"]) + } + assert by_basename["root.txt"] == scope_folder.id + sibling_a_id = by_basename["a.txt"] + sibling_b_id = by_basename["b.txt"] + deep_id = by_basename["deep.txt"] + + for path in df["path"]: + assert os.path.isabs(path) + + # AND the Synapse tree matches the local layout + await scope_folder.sync_from_synapse_async( + download_file=False, recursive=True, synapse_client=syn + ) + top_level = {f.name: f for f in scope_folder.folders} + assert sorted(top_level) == ["sibling_a", "sibling_b"] + assert top_level["sibling_a"].id == sibling_a_id + assert top_level["sibling_b"].id == sibling_b_id + assert [(f.name, f.id) for f in top_level["sibling_a"].folders] == [ + ("deep", deep_id) + ] + assert top_level["sibling_b"].folders == [] + + async def test_existing_folders_are_reused( + self, + syn: Synapse, + scope_folder: Folder, + tmp_path: Path, + ) -> None: + """When a Synapse folder with a matching name already exists under + the container, the method should reuse its ID instead of creating a + new folder or raising a conflict. + """ + # GIVEN a folder that already exists in Synapse under the scope folder + folder_name = "preexisting" + existing = await Folder( + name=folder_name, parent_id=scope_folder.id + ).store_async(synapse_client=syn) + + # AND a local directory that mirrors that folder's name with a file inside + src = tmp_path / "root" + child = src / folder_name + child.mkdir(parents=True) + (child / "payload.txt").write_text("payload") + manifest = tmp_path / "manifest.csv" + + # WHEN I generate a sync manifest + await scope_folder.generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=syn, + ) + + # THEN the manifest reuses the existing folder's Synapse ID + df = pd.read_csv(manifest) + assert len(df) == 1 + assert df["parentId"].iloc[0] == existing.id + + # AND the scope folder has exactly the one pre-existing child folder + await scope_folder.sync_from_synapse_async( + download_file=False, recursive=False, synapse_client=syn + ) + assert len(scope_folder.folders) == 1 + assert scope_folder.folders[0].id == existing.id diff --git a/tests/unit/synapseclient/models/async/unit_test_manifest_async.py b/tests/unit/synapseclient/models/async/unit_test_manifest_async.py index 783b40365..63577acfd 100644 --- a/tests/unit/synapseclient/models/async/unit_test_manifest_async.py +++ b/tests/unit/synapseclient/models/async/unit_test_manifest_async.py @@ -815,7 +815,7 @@ async def test_valid_container_passes(self, container_cls: str) -> None: cls = {"Project": Project, "Folder": Folder}[container_cls] mock_container = MagicMock(spec=cls) with patch( - "synapseclient.models.services.manifest.factory_get_async", + "synapseclient.models.services.manifest.get_async", new=AsyncMock(return_value=mock_container), ): await _check_parent_containers_async(["syn1"], syn=self.syn) @@ -824,7 +824,7 @@ async def test_non_container_raises(self) -> None: """A parent ID that resolves to a non-container Synapse entity raises ValueError.""" mock_entity = MagicMock() # not a Folder or Project with patch( - "synapseclient.models.services.manifest.factory_get_async", + "synapseclient.models.services.manifest.get_async", new=AsyncMock(return_value=mock_entity), ): with pytest.raises(ValueError, match="not a Folder or Project"): @@ -833,7 +833,7 @@ async def test_non_container_raises(self) -> None: async def test_empty_parent_id_skipped(self) -> None: """An empty parent ID string is skipped without calling the Synapse API.""" with patch( - "synapseclient.models.services.manifest.factory_get_async", + "synapseclient.models.services.manifest.get_async", new=AsyncMock(), ) as mock_get: await _check_parent_containers_async([""], syn=self.syn) @@ -844,7 +844,7 @@ async def test_nonexistent_parent_id_reraises_http_error(self) -> None: from synapseclient.core.exceptions import SynapseHTTPError with patch( - "synapseclient.models.services.manifest.factory_get_async", + "synapseclient.models.services.manifest.get_async", new=AsyncMock(side_effect=SynapseHTTPError("Not found")), ): with pytest.raises(SynapseHTTPError): diff --git a/tests/unit/synapseclient/models/async/unit_test_storable_container_async.py b/tests/unit/synapseclient/models/async/unit_test_storable_container_async.py index a954b6408..2df97faac 100644 --- a/tests/unit/synapseclient/models/async/unit_test_storable_container_async.py +++ b/tests/unit/synapseclient/models/async/unit_test_storable_container_async.py @@ -1,14 +1,18 @@ """Unit tests for StorableContainer""" import csv +import os +import platform import uuid from pathlib import Path -from unittest.mock import patch +from typing import Any +from unittest.mock import AsyncMock, patch import pytest from synapseclient import Synapse -from synapseclient.models import Project +from synapseclient.models import File, Folder, Project +from synapseclient.models.services import manifest as manifest_module def _write_manifest(rows: list[dict], tmp_path: Path) -> Path: @@ -81,3 +85,677 @@ async def test_all_rows_have_errors_no_crash(self, tmp_path: Path) -> None: ) mock_upload.assert_not_called() assert result == [] + + +class TestGenerateSyncManifest: + """Tests for StorableContainer.generate_sync_manifest_async.""" + + async def test_unstored_container_raises( + self, tmp_path: Path, syn: Synapse + ) -> None: + """Calling the method on a container whose id is None is a caller bug + and must surface as a ValueError rather than a confusing downstream + error. The integration suite cannot exercise this case because every + container fixture has already been stored.""" + # GIVEN a local directory with one file + src = tmp_path / "src" + src.mkdir() + (src / "a.txt").write_text("hello") + manifest = tmp_path / "manifest.csv" + + # WHEN generate_sync_manifest_async is called on a Project that has + # never been stored (id is None) + # THEN a ValueError is raised explaining the container is not stored + with pytest.raises(ValueError, match="has not been stored in Synapse"): + await Project(name="unstored").generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=syn, + ) + + async def test_empty_directory_logs_warning( + self, tmp_path: Path, syn: Synapse + ) -> None: + """An empty directory logs a 'No uploadable files found' warning so + the caller knows why their manifest is empty. The integration suite + covers the resulting header-only file content.""" + # GIVEN an empty local directory + empty_dir = tmp_path / "empty" + empty_dir.mkdir() + manifest = tmp_path / "manifest.csv" + + # WHEN we generate a manifest for it (parent-id validation is stubbed + # so this stays offline) + with ( + patch( + "synapseclient.models.services.manifest._validate_target_container_async", + new=AsyncMock(return_value=None), + ), + patch.object(syn.logger, "warning") as mock_warning, + ): + await Folder(id="syn1").generate_sync_manifest_async( + directory_path=str(empty_dir), + manifest_path=str(manifest), + synapse_client=syn, + ) + + # THEN a "no uploadable files" warning is logged so the caller knows + # why their manifest is empty + assert any( + "No uploadable files found" in call.args[0] + for call in mock_warning.call_args_list + ) + + @pytest.mark.parametrize( + "kind", + ["missing-path", "file-path"], + ) + async def test_invalid_directory_path_raises( + self, tmp_path: Path, syn: Synapse, kind: str + ) -> None: + """The public surface raises ValueError for both a non-existent path + and a regular file path, and writes no manifest. The integration suite + only exercises the missing-path case, so the file-path case is covered + here.""" + # GIVEN either a non-existent path or a regular file path + if kind == "missing-path": + path = tmp_path / "does_not_exist" + else: + path = tmp_path / "a.txt" + path.write_text("hello") + manifest = tmp_path / "manifest.csv" + + # WHEN we try to generate a manifest for it + # THEN a ValueError is raised + with pytest.raises(ValueError, match="is not a directory or does not exist"): + await Folder(id="syn1").generate_sync_manifest_async( + directory_path=str(path), + manifest_path=str(manifest), + synapse_client=syn, + ) + + # AND no manifest file is written (failure happens before any output) + assert not manifest.exists() + + async def test_parent_id_validated_upfront( + self, tmp_path: Path, syn: Synapse + ) -> None: + """The container's id is validated via _validate_target_container_async + before the directory is walked. This ordering invariant cannot be + verified through the live API; integration tests can only observe the + end state.""" + # GIVEN a directory with one file + src = tmp_path / "src" + src.mkdir() + (src / "a.txt").write_text("hello") + manifest = tmp_path / "manifest.csv" + + # WHEN we generate a manifest under a Folder with id "synROOT" with + # the parent-id validation stubbed so it stays offline + with patch( + "synapseclient.models.services.manifest._validate_target_container_async", + new_callable=AsyncMock, + ) as mock_validate: + await Folder(id="synROOT").generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=syn, + ) + + # THEN _validate_target_container_async was awaited exactly once + # with that id, ensuring the parent check happens before traversal + mock_validate.assert_awaited_once_with("synROOT", client=syn) + + +class TestResolveAndValidateDirectoryPath: + """Tests for manifest._resolve_and_validate_directory_path_async.""" + + async def test_valid_directory_returns_realpath(self, tmp_path: Path) -> None: + """A valid directory returns the realpath-resolved absolute path.""" + # GIVEN a real directory on disk + src = tmp_path / "src" + src.mkdir() + + # WHEN we validate the directory path + result = await manifest_module._resolve_and_validate_directory_path_async( + directory_path=str(src) + ) + + # THEN the realpath-resolved absolute path is returned + assert result == os.path.realpath(str(src)) + + @pytest.mark.parametrize( + "kind", + ["missing-path", "file-path"], + ) + async def test_non_directory_path_raises(self, tmp_path: Path, kind: str) -> None: + """Both a non-existent path and a regular file path raise ValueError.""" + # GIVEN either a non-existent path or a regular file path + if kind == "missing-path": + path = tmp_path / "missing" + else: + path = tmp_path / "f.txt" + path.write_text("hello") + + # WHEN we validate the directory path + # THEN a ValueError is raised explaining the path is not a directory + with pytest.raises(ValueError, match="is not a directory or does not exist"): + await manifest_module._resolve_and_validate_directory_path_async( + directory_path=str(path) + ) + + @pytest.mark.skipif( + platform.system() == "Windows", + reason="Symlink creation requires elevated privileges on Windows.", + ) + async def test_symlink_resolved_to_target(self, tmp_path: Path) -> None: + """If directory_path is a symlink, the resolved target path is returned + so the manifest survives the original symlink being removed/redirected.""" + # GIVEN a target directory and a symlink pointing to it + target = tmp_path / "target" + target.mkdir() + link = tmp_path / "link" + os.symlink(str(target), str(link)) + + # WHEN we validate the symlink path + result = await manifest_module._resolve_and_validate_directory_path_async( + directory_path=str(link) + ) + + # THEN the realpath of the symlink target is returned (not the link) + assert result == os.path.realpath(str(target)) + + +class TestValidateTargetContainer: + """Tests for manifest._validate_target_container_async.""" + + @pytest.mark.parametrize( + "container", + [ + pytest.param(Folder(id="syn1", name="folder"), id="folder"), + pytest.param(Project(id="syn1", name="project"), id="project"), + ], + ) + async def test_container_passes( + self, syn: Synapse, container: Folder | Project + ) -> None: + """Both Folder and Project pass validation""" + # GIVEN get_async is stubbed to return either a Folder or Project + with patch.object( + manifest_module, + "get_async", + new_callable=AsyncMock, + return_value=container, + ): + # WHEN we validate the target container by id + result = await manifest_module._validate_target_container_async( + "syn1", client=syn + ) + # THEN validation returns None (no error raised) + assert result is None + + async def test_non_container_raises_value_error(self, syn: Synapse) -> None: + """A File (or any non-container) raises ValueError naming the id.""" + # GIVEN get_async is stubbed to return a File (non-container) entity + not_container = File(id="syn1", name="file") + with patch.object( + manifest_module, + "get_async", + new_callable=AsyncMock, + return_value=not_container, + ): + # WHEN we validate the target container by id + # THEN a ValueError is raised naming the id and rejecting the type + with pytest.raises( + ValueError, match=r"Container syn1 is not a Folder or Project" + ): + await manifest_module._validate_target_container_async( + "syn1", client=syn + ) + + +class TestCollectManifestRows: + """Tests for manifest._collect_manifest_rows_async.""" + + async def test_empty_directory_returns_empty_list( + self, tmp_path: Path, syn: Synapse + ) -> None: + """An empty directory produces no rows.""" + # GIVEN an empty local directory + empty = tmp_path / "empty" + empty.mkdir() + + # WHEN we collect manifest rows for it + rows = await manifest_module._collect_manifest_rows_async( + directory_path=str(empty), parent_id="syn1", client=syn + ) + + # THEN the result is an empty list + assert rows == [] + + async def test_flat_directory_uses_root_parent_id( + self, tmp_path: Path, syn: Synapse + ) -> None: + """Files at the top level all share the supplied parent_id and are + emitted in sorted order. No Synapse folders are created when the + local tree has no subdirectories.""" + # GIVEN a flat directory containing two files in non-sorted order + src = tmp_path / "src" + src.mkdir() + (src / "b.txt").write_text("two") + (src / "a.txt").write_text("one") + + # WHEN we collect manifest rows under a root parent_id + with patch( + "synapseclient.models.Folder.store_async", + new_callable=AsyncMock, + ) as mock_store_async: + rows = await manifest_module._collect_manifest_rows_async( + directory_path=str(src), parent_id="synROOT", client=syn + ) + + # THEN the rows are emitted in sorted filename order + assert [os.path.basename(row["path"]) for row in rows] == ["a.txt", "b.txt"] + # AND every row uses the root parent_id + assert all(row["parentId"] == "synROOT" for row in rows) + # AND no Synapse folders are created (no subdirectories to mirror) + mock_store_async.assert_not_called() + + async def test_nested_directories_use_created_folder_ids( + self, tmp_path: Path, syn: Synapse + ) -> None: + """Files in nested subdirectories receive the id of the Synapse folder + created for their innermost local directory.""" + # GIVEN a nested directory tree with one file at the deepest level + src = tmp_path / "src" + (src / "level1" / "level2").mkdir(parents=True) + (src / "level1" / "level2" / "deep.txt").write_text("content") + + # AND a fake _create_child_folders_async that returns Folders with + # deterministic ids (syn_) so the test can verify how the + # production code threads parent ids from one walk iteration to + # the next. Records (parent_id, dirnames) per call to assert call + # order AND that each call's parent_id is the fake id assigned to + # the previously-created folder. + observed: list[tuple[str, list[str]]] = [] + + async def fake_create_child_folders_async( + parent_id: str, dirnames: list[str], client: Synapse + ) -> dict[str, Folder]: + observed.append((parent_id, list(dirnames))) + return {d: Folder(name=d, id=f"syn_{d}") for d in dirnames} + + # WHEN we collect manifest rows under the root parent_id + with patch.object( + manifest_module, + "_create_child_folders_async", + new=fake_create_child_folders_async, + ): + rows = await manifest_module._collect_manifest_rows_async( + directory_path=str(src), parent_id="synROOT", client=syn + ) + + # AND folders are created top-down: level1 under root, then level2 + # under the freshly-created level1 (id syn_level1), and a final + # no-op call for the leaf directory. + assert observed == [ + ("synROOT", ["level1"]), + ("syn_level1", ["level2"]), + ("syn_level2", []), + ] + # AND the deep file's parentId is the innermost folder's id + assert rows == [ + { + "path": os.path.join(str(src), "level1", "level2", "deep.txt"), + "parentId": "syn_level2", + } + ] + + async def test_mixed_flat_and_nested_uses_correct_parent_ids( + self, tmp_path: Path, syn: Synapse + ) -> None: + """Files at the root use the supplied parent_id while files inside + subdirectories use the id of the folder created for their containing + directory. The flat-only and nested-only tests each cover one branch + in isolation; this combined case asserts both work together in a + single walk.""" + # GIVEN a tree with one file at the root AND a subdir with a file + src = tmp_path / "src" + src.mkdir() + (src / "root.txt").write_text("root-data") + (src / "child").mkdir() + (src / "child" / "nested.txt").write_text("nested-data") + + # AND a fake _create_child_folders_async that assigns deterministic + # ids so the test can distinguish root vs. nested rows by parentId + async def fake_create_child_folders_async( + parent_id: str, dirnames: list[str], client: Synapse + ) -> dict[str, Folder]: + return {d: Folder(name=d, id=f"syn_{d}") for d in dirnames} + + # WHEN we collect manifest rows under the root parent_id + with patch.object( + manifest_module, + "_create_child_folders_async", + new=fake_create_child_folders_async, + ): + rows = await manifest_module._collect_manifest_rows_async( + directory_path=str(src), parent_id="synROOT", client=syn + ) + + # THEN the root-level file uses the root parent_id and the nested + # file uses the id assigned to its containing folder + by_basename = {os.path.basename(r["path"]): r["parentId"] for r in rows} + assert by_basename == { + "root.txt": "synROOT", + "nested.txt": "syn_child", + } + + async def test_multiple_siblings_share_parent_id_in_one_batch( + self, tmp_path: Path, syn: Synapse + ) -> None: + """Sibling directories at the same depth are passed to + _create_child_folders_async in a single batch (one call), all sharing + the same parent_id.""" + # GIVEN three sibling directories at the same depth, each with a + # uniquely-named file so rows can be matched back to their folder + src = tmp_path / "src" + src.mkdir() + for name in ("alpha", "beta", "gamma"): + (src / name).mkdir() + (src / name / f"{name}.txt").write_text("data") + + # AND a fake _create_child_folders_async that records each call's + # (parent_id, dirnames) so we can assert there's exactly one batch + # for the three siblings + observed: list[tuple[str, list[str]]] = [] + + async def fake_create_child_folders_async( + parent_id: str, dirnames: list[str], client: Synapse + ) -> dict[str, Folder]: + observed.append((parent_id, list(dirnames))) + return {d: Folder(name=d, id=f"syn_{d}") for d in dirnames} + + # WHEN we collect manifest rows under the root parent_id + with patch.object( + manifest_module, + "_create_child_folders_async", + new=fake_create_child_folders_async, + ): + rows = await manifest_module._collect_manifest_rows_async( + directory_path=str(src), parent_id="synROOT", client=syn + ) + + # THEN the first call passes all three siblings in sorted order + # under the root parent_id (one batch, not three) + assert observed[0] == ("synROOT", ["alpha", "beta", "gamma"]) + # AND each subsequent call (one per sibling, for its own children) + # passes an empty dirnames list because each sibling is a leaf dir + assert all(call[1] == [] for call in observed[1:]) + # AND every file's parentId matches the id of its containing folder + assert {os.path.basename(r["path"]): r["parentId"] for r in rows} == { + "alpha.txt": "syn_alpha", + "beta.txt": "syn_beta", + "gamma.txt": "syn_gamma", + } + + +class TestLogWalkError: + """Tests for manifest._log_walk_error.""" + + def test_logs_warning_with_filename_and_message(self, syn: Synapse) -> None: + """The warning message references the offending filename and the + underlying OSError representation.""" + # GIVEN an OSError carrying an errno, message, and filename + err = OSError(13, "permission denied", "/tmp/unreadable") + + # WHEN _log_walk_error is invoked with that error + with patch.object(syn.logger, "warning") as mock_warning: + manifest_module._log_walk_error(syn, err) + + # THEN logger.warning is called exactly once + mock_warning.assert_called_once() + # AND the message references the offending filename and OSError text + message = mock_warning.call_args.args[0] + assert "/tmp/unreadable" in message + assert "permission denied" in message + + def test_logs_warning_with_no_filename(self, syn: Synapse) -> None: + """An OSError without a filename still produces a warning rather than + raising (best-effort during traversal).""" + # GIVEN an OSError without a filename attribute + err = OSError("io failure") + + # WHEN _log_walk_error is invoked with that error + with patch.object(syn.logger, "warning") as mock_warning: + manifest_module._log_walk_error(syn, err) + + # THEN logger.warning is called once (no exception is raised) + mock_warning.assert_called_once() + + +class TestCreateChildFolders: + """Tests for manifest._create_child_folders_async.""" + + async def test_empty_dirnames_returns_empty_dict(self, syn: Synapse) -> None: + """No dirnames produces no folders and never invokes Folder.store_async.""" + # GIVEN an empty list of dirnames + dirnames = [] + # WHEN we ask for child folders + with patch( + "synapseclient.models.Folder.store_async", + new_callable=AsyncMock, + ) as mock_store_async: + result = await manifest_module._create_child_folders_async( + parent_id="synROOT", dirnames=dirnames, client=syn + ) + + # THEN the result is an empty dict and no Synapse calls were made + assert result == {} + mock_store_async.assert_not_called() + + async def test_returns_dirname_to_folder_mapping(self, syn: Synapse) -> None: + """Each input dirname maps to the Folder returned by store_async, all + sharing the supplied parent_id, regardless of completion order.""" + # GIVEN three sibling dirnames to create under a single parent + dirnames = ["alpha", "beta", "gamma"] + + # AND a fake Folder.store_async that assigns a deterministic id per + # call so the returned Folders can be asserted against by id. + # Patched onto Folder.store_async as an unbound method, so the bound + # Folder instance arrives as `self`. Mutating self.id and returning + # self matches the real signature without hitting the network. + observed_parent_ids: list[str] = [] + + async def fake_store_async(self: Any, *args: Any, **kwargs: Any) -> Any: + observed_parent_ids.append(self.parent_id) + self.id = f"syn_{self.name}" + return self + + # WHEN we create the child folders concurrently + with patch( + "synapseclient.models.Folder.store_async", + new=fake_store_async, + ): + result = await manifest_module._create_child_folders_async( + parent_id="synROOT", dirnames=dirnames, client=syn + ) + + # THEN every dirname maps to its own Folder with the expected id + assert set(result.keys()) == set(dirnames) + for dirname in dirnames: + assert result[dirname].name == dirname + assert result[dirname].id == f"syn_{dirname}" + + # AND every store call used the same parent_id + assert observed_parent_ids == ["synROOT"] * len(dirnames) + + +class TestPruneSymlinksAndSortDirnames: + """Tests for manifest._prune_symlinks_and_sort_dirnames.""" + + def test_sorts_dirnames_in_place(self, tmp_path: Path) -> None: + """Plain directories are sorted in place; the original list object is + mutated (os.walk relies on identity, not return value).""" + # GIVEN three real directories and an unsorted dirnames list referring to them + for name in ("c", "a", "b"): + (tmp_path / name).mkdir() + dirnames = ["c", "a", "b"] + + # WHEN we prune and sort + manifest_module._prune_symlinks_and_sort_dirnames(dirnames, str(tmp_path)) + + # AND the caller's list is sorted alphabetically + assert dirnames == ["a", "b", "c"] + + @pytest.mark.skipif( + platform.system() == "Windows", + reason="Symlink creation requires elevated privileges on Windows.", + ) + def test_drops_symlinked_subdirectories(self, tmp_path: Path) -> None: + """Symlinked subdirectories are pruned so we don't mirror folders whose + contents os.walk(followlinks=False) won't visit.""" + # GIVEN one real subdirectory and one symlink pointing at another + # directory, both listed in dirnames + real = tmp_path / "real" + real.mkdir() + target = tmp_path / "target" + target.mkdir() + link = tmp_path / "link" + os.symlink(str(target), str(link)) + + dirnames = ["link", "real"] + + # WHEN we prune and sort + manifest_module._prune_symlinks_and_sort_dirnames(dirnames, str(tmp_path)) + + # THEN only the real directory remains; the symlink was dropped + assert dirnames == ["real"] + + def test_empty_list_unchanged(self, tmp_path: Path) -> None: + """An empty list remains empty.""" + # GIVEN an empty dirnames list + dirnames: list[str] = [] + # WHEN we prune and sort + manifest_module._prune_symlinks_and_sort_dirnames(dirnames, str(tmp_path)) + # THEN it is still empty + assert dirnames == [] + + @pytest.mark.skipif( + platform.system() == "Windows", + reason="Symlink creation requires elevated privileges on Windows.", + ) + def test_prunes_symlinks_and_sorts_in_one_call(self, tmp_path: Path) -> None: + """Symlinks are pruned AND the surviving entries are sorted in a + single call. The prune-only and sort-only tests cover each behavior + in isolation, so this guards against a future change where one step + accidentally undoes the other (e.g., sort happening before prune + and reintroducing the link, or prune leaving the list in walk + order).""" + # GIVEN two real subdirs and one symlinked subdir, in non-sorted + # order with the symlink between them + for name in ("c", "a"): + (tmp_path / name).mkdir() + target = tmp_path / "target" + target.mkdir() + link = tmp_path / "link" + os.symlink(str(target), str(link)) + + dirnames = ["c", "link", "a"] + + # WHEN we prune and sort + manifest_module._prune_symlinks_and_sort_dirnames(dirnames, str(tmp_path)) + + # THEN the symlink is gone and the remaining real dirs are sorted + assert dirnames == ["a", "c"] + + +class TestBuildManifestRows: + """Tests for manifest._build_manifest_rows.""" + + def test_returns_rows_in_sorted_order(self, tmp_path: Path, syn: Synapse) -> None: + """Filenames are visited in sorted order so manifest output is + deterministic regardless of filesystem yield order.""" + # GIVEN three real files and an unsorted filenames list + for name in ("c.txt", "a.txt", "b.txt"): + (tmp_path / name).write_text("data") + + # WHEN we build manifest rows under "syn1" + rows = manifest_module._build_manifest_rows( + dirpath=str(tmp_path), + filenames=["c.txt", "a.txt", "b.txt"], + parent_id="syn1", + client=syn, + ) + + # THEN the rows are emitted in sorted filename order + assert [os.path.basename(row["path"]) for row in rows] == [ + "a.txt", + "b.txt", + "c.txt", + ] + # AND every row uses the supplied parent_id + assert all(row["parentId"] == "syn1" for row in rows) + + def test_skips_zero_byte_and_missing(self, tmp_path: Path, syn: Synapse) -> None: + """Zero-byte files and unreadable/missing files are dropped via + _is_uploadable_file.""" + # GIVEN one real non-empty file and one zero-byte file (and a + # filename for a file that doesn't exist on disk) + (tmp_path / "ok.txt").write_text("hello") + (tmp_path / "empty.txt").write_text("") + + # WHEN we build manifest rows for all three filenames + with patch.object(syn.logger, "warning"): + rows = manifest_module._build_manifest_rows( + dirpath=str(tmp_path), + filenames=["ok.txt", "empty.txt", "missing.txt"], + parent_id="syn1", + client=syn, + ) + + # THEN only the non-empty real file produces a row; the zero-byte + # and missing files are filtered out + assert [os.path.basename(row["path"]) for row in rows] == ["ok.txt"] + + def test_empty_filenames_returns_empty(self, tmp_path: Path, syn: Synapse) -> None: + """No filenames produce no rows.""" + # GIVEN an empty filenames list + # WHEN we build manifest rows + rows = manifest_module._build_manifest_rows( + dirpath=str(tmp_path), + filenames=[], + parent_id="syn1", + client=syn, + ) + # THEN the result is empty + assert rows == [] + + +class TestIsUploadableFile: + """Tests for manifest._is_uploadable_file.""" + + def test_regular_file_is_uploadable(self, tmp_path: Path, syn: Synapse) -> None: + """A non-empty readable file returns True.""" + # GIVEN a regular non-empty readable file + f = tmp_path / "ok.txt" + f.write_text("hello") + # THEN the result is True + assert manifest_module._is_uploadable_file(str(f), syn) is True + + def test_zero_byte_file_skipped(self, tmp_path: Path, syn: Synapse) -> None: + """A zero-byte file is skipped because Synapse rejects empty uploads.""" + # GIVEN a zero-byte file + f = tmp_path / "empty.txt" + f.write_text("") + # THEN the result is False (Synapse rejects empty uploads) + assert manifest_module._is_uploadable_file(str(f), syn) is False + + def test_missing_file_skipped(self, tmp_path: Path, syn: Synapse) -> None: + """A path that doesn't exist (e.g. broken symlink, race) is skipped + rather than raising OSError up to the caller.""" + # GIVEN a path that doesn't exist on disk (e.g. broken symlink, + # race against rmdir) + missing = tmp_path / "nope.txt" + # THEN the result is False rather than raising OSError + assert manifest_module._is_uploadable_file(str(missing), syn) is False