From 62fbf56f562e25c55a68328b3a204429675eccd3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 4 May 2026 11:36:21 -0700 Subject: [PATCH 01/11] added generate_sync_manifest method --- docs/reference/experimental/async/folder.md | 1 + docs/reference/experimental/async/project.md | 1 + docs/reference/experimental/sync/folder.md | 1 + docs/reference/experimental/sync/project.md | 1 + .../tutorial_scripts/upload_data_in_bulk.py | 28 +- docs/tutorials/python/upload_data_in_bulk.md | 22 +- .../models/mixins/storable_container.py | 95 ++++++ .../protocols/storable_container_protocol.py | 76 +++++ synapseclient/models/services/manifest.py | 172 +++++++++- synapseclient/synapsePythonClient | 2 +- synapseutils/sync.py | 17 + .../async/test_storable_container_async.py | 319 ++++++++++++++++++ .../unit_test_storable_container_async.py | 208 +++++++++++- 13 files changed, 912 insertions(+), 31 deletions(-) diff --git a/docs/reference/experimental/async/folder.md b/docs/reference/experimental/async/folder.md index 7135546b4..6f49eaba1 100644 --- a/docs/reference/experimental/async/folder.md +++ b/docs/reference/experimental/async/folder.md @@ -17,6 +17,7 @@ at your own risk. - walk_async - sync_from_synapse_async - sync_to_synapse_async + - generate_sync_manifest_async - flatten_file_list - map_directory_to_all_contained_files - get_permissions_async diff --git a/docs/reference/experimental/async/project.md b/docs/reference/experimental/async/project.md index c7f8bcb01..2a0535689 100644 --- a/docs/reference/experimental/async/project.md +++ b/docs/reference/experimental/async/project.md @@ -16,6 +16,7 @@ at your own risk. - walk_async - sync_from_synapse_async - sync_to_synapse_async + - generate_sync_manifest_async - flatten_file_list - map_directory_to_all_contained_files - get_permissions_async diff --git a/docs/reference/experimental/sync/folder.md b/docs/reference/experimental/sync/folder.md index 9703680f9..c2e64e04c 100644 --- a/docs/reference/experimental/sync/folder.md +++ b/docs/reference/experimental/sync/folder.md @@ -28,6 +28,7 @@ at your own risk. - walk - sync_from_synapse - sync_to_synapse + - generate_sync_manifest - flatten_file_list - map_directory_to_all_contained_files - get_permissions diff --git a/docs/reference/experimental/sync/project.md b/docs/reference/experimental/sync/project.md index b83bb7b4a..53b100b61 100644 --- a/docs/reference/experimental/sync/project.md +++ b/docs/reference/experimental/sync/project.md @@ -27,6 +27,7 @@ at your own risk. - walk - sync_from_synapse - sync_to_synapse + - generate_sync_manifest - flatten_file_list - map_directory_to_all_contained_files - get_permissions diff --git a/docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py b/docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py index fa3c1ffab..7d7d6f45d 100644 --- a/docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py +++ b/docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py @@ -2,6 +2,7 @@ Here is where you'll find the code for the uploading data in bulk tutorial. """ +# --8<-- [start:imports_and_constants] import pandas as pd import synapseclient @@ -14,34 +15,26 @@ DIRECTORY_FOR_MY_PROJECT = "test_folder" # This should exist with your files in it PATH_TO_MANIFEST_FILE = "test_manifest.csv" # This doesn't need to exist yet SYNAPSE_PROJECT_ID = "" # Put your Synapse project ID here. This is the project where you want to upload your data. +project = Project(id=SYNAPSE_PROJECT_ID) +# --8<-- [end:imports_and_constants] -# TODO switch to using new version of synapseutils/sync.py.generate_sync_manifest -# https://sagebionetworks.jira.com/browse/SYNPY-1809 - +# --8<-- [start:generate_manifest] # Step 2: Create a manifest CSV file with the paths to the files and their parent folders # Note: When this command is run it will re-create your directory structure within # Synapse. Be aware of this before running this command. # If folders with the exact names already exists in Synapse, those folders will be used. - - -# old function generates a TSV -from synapseutils import generate_sync_manifest - -generate_sync_manifest( - syn=syn, +project.generate_sync_manifest( directory_path=DIRECTORY_FOR_MY_PROJECT, - parent_id=SYNAPSE_PROJECT_ID, manifest_path=PATH_TO_MANIFEST_FILE, ) -# reformat the manifest file to work with sync_to_synapse -manifest_df = pd.read_csv(PATH_TO_MANIFEST_FILE, sep="\t") -manifest_df.rename(columns={"parent": "parentId"}, inplace=True) -manifest_df.to_csv(PATH_TO_MANIFEST_FILE, index=False) +# --8<-- [end:generate_manifest] +# --8<-- [start:sync_to_synapse] # Step 3: After generating the manifest file, we can upload the data in bulk -project = Project(id=SYNAPSE_PROJECT_ID) project.sync_to_synapse(manifest_path=PATH_TO_MANIFEST_FILE, send_messages=False) +# --8<-- [end:sync_to_synapse] +# --8<-- [start:add_annotation] # Step 4: Let's add an annotation to our manifest file # Pandas is a powerful data manipulation library in Python, although it is not required # for this tutorial, it is used here to demonstrate how you can manipulate the manifest @@ -57,7 +50,9 @@ df.to_csv(PATH_TO_MANIFEST_FILE, index=False) project.sync_to_synapse(manifest_path=PATH_TO_MANIFEST_FILE, send_messages=False) +# --8<-- [end:add_annotation] +# --8<-- [start:add_provenance] # Step 5: Let's create an Activity/Provenance # First let's find the row in the CSV we want to update. This code finds the row number # that we would like to update. @@ -86,3 +81,4 @@ df.to_csv(PATH_TO_MANIFEST_FILE, index=False) project.sync_to_synapse(manifest_path=PATH_TO_MANIFEST_FILE, send_messages=False) +# --8<-- [end:add_provenance] diff --git a/docs/tutorials/python/upload_data_in_bulk.md b/docs/tutorials/python/upload_data_in_bulk.md index be61524bb..7fc6b13d2 100644 --- a/docs/tutorials/python/upload_data_in_bulk.md +++ b/docs/tutorials/python/upload_data_in_bulk.md @@ -57,18 +57,18 @@ tools to open and manipulate CSV files. First let's set up some constants we'll use in this script ```python -{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=5-16} +--8<-- "docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py:imports_and_constants" ``` ## 2. Create a manifest CSV file to upload data in bulk -We use `synapseutils.generate_sync_manifest` to walk our local directory and produce a -manifest that maps each file to the correct parent folder in Synapse (creating folders -as needed). The output is a TSV with a `parent` column, so we convert it to CSV and -rename `parent` to `parentId` for use with `sync_to_synapse`. +We call `Project.generate_sync_manifest` on the project we want to mirror into. +It walks our local directory and produces a CSV manifest that maps each file to +the correct parent folder in Synapse (creating folders as needed). The output +is ready to hand directly to `sync_to_synapse`. ```python -{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=20-39} +--8<-- "docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py:generate_manifest" ```
@@ -89,7 +89,7 @@ path,parentId ## 3. Upload the data in bulk ```python -{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=41-43} +--8<-- "docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py:sync_to_synapse" ``` @@ -115,7 +115,7 @@ you are not comfortable with pandas you may use any tool that can open and manip CSV files such as Excel or Google Sheets. ```python -{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=45-59} +--8<-- "docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py:add_annotation" ``` Now that you have uploaded and annotated your files you'll be able to inspect your data @@ -137,7 +137,7 @@ Synapse. Additionally we'll link off to a sample URL that describes a process th may have executed to generate the file. ```python -{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=61-88} +--8<-- "docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py:add_provenance" ``` After running this code we may again inspect the synapse web UI. In this screenshot i've @@ -156,14 +156,14 @@ navigated to the Files tab and selected the file that we added a Provenance reco Click to show me ```python -{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!} +--8<-- "docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py" ```
## References used in this tutorial - [syn.login][synapseclient.Synapse.login] -- [synapseutils.generate_sync_manifest][] +- [Project.generate_sync_manifest][synapseclient.models.mixins.StorableContainer.generate_sync_manifest] - [Project.sync_to_synapse][synapseclient.models.mixins.StorableContainer.sync_to_synapse] - [Manifest CSV format](../../explanations/manifest_csv.md) - [Activity/Provenance](../../explanations/domain_models_of_synapse.md#activityprovenance) diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py index c19ee7ed0..71231a74a 100644 --- a/synapseclient/models/mixins/storable_container.py +++ b/synapseclient/models/mixins/storable_container.py @@ -48,6 +48,7 @@ StorableContainerSynchronousProtocol, ) from synapseclient.models.services.manifest import ( + _generate_sync_manifest_async, read_manifest_for_upload, upload_sync_files, ) @@ -622,6 +623,100 @@ async def main(): progress_bar.close() return uploaded_files + @otel_trace_method( + method_to_trace_name=lambda self, **kwargs: f"{self.__class__.__name__}_generate_sync_manifest: {self.id}" + ) + async def generate_sync_manifest_async( + self: Self, + directory_path: str, + manifest_path: str, + *, + synapse_client: Synapse | None = None, + ) -> None: + """Walk a local directory, mirror its folder hierarchy under this + container in Synapse, and write a CSV manifest ready for + [sync_to_synapse_async][synapseclient.models.mixins.StorableContainer.sync_to_synapse_async]. + + The generated manifest has two columns: path (always an absolute, + symlink-resolved path — the input directory_path is resolved via + os.path.realpath before walking, so the manifest can be consumed + from any working directory and remains valid even if the original + symlink is later removed or repointed) and parentId (the Synapse ID + of the file's containing folder). Folders that already exist in + Synapse with the same name and parent are reused rather than + re-created. + + Sibling folders at the same depth are created concurrently to reduce + latency on wide trees. Directories and files are traversed in sorted + order so manifest output is deterministic across runs and platforms. + Directory symlinks encountered inside directory_path are not + followed. File symlinks are not pruned: the symlink's path is + recorded in the manifest, and the underlying target's contents + are uploaded when the manifest is consumed. The root + directory_path itself may be a symlink; if so, it is resolved to + its target and the target is walked. Zero-byte files are skipped with a + warning, since Synapse rejects empty files. I/O errors raised by + os.walk (for example, unreadable subdirectories) are logged and + skipped. If no uploadable files are found under directory_path, a + warning is logged and a header-only manifest is written. If a + folder store_async call fails, the exception propagates and no + manifest is written; any folders already created remain in Synapse + and will be reused on a retry. + + Arguments: + directory_path: Path to the local directory to be pushed to + Synapse. + manifest_path: Path where the generated manifest CSV will be + written. + synapse_client: If not passed in and caching was not disabled by + Synapse.allow_client_caching(False) this will use the last + created instance from the Synapse class constructor. + + Returns: + None + + Raises: + ValueError: If this container's id is None, or if directory_path + does not exist or is not a directory, or if this container's + id exists in Synapse but is not a Folder or Project. + SynapseHTTPError: If this container's id does not exist in + Synapse. + + Example: Generate a manifest and upload the files + Mirror ./my_data under a Synapse project and then upload it. + + ```python + import asyncio + from synapseclient import Synapse + from synapseclient.models import Project + + async def main(): + syn = Synapse() + syn.login() + + project = Project(id="syn12345") + await project.generate_sync_manifest_async( + directory_path="./my_data", + manifest_path="manifest.csv", + ) + await project.sync_to_synapse_async(manifest_path="manifest.csv") + + asyncio.run(main()) + ``` + """ + if self.id is None: + raise ValueError( + f"Cannot generate a sync manifest for a {type(self).__name__}" + " that has not been stored in Synapse. Set id on this" + " container (or store it) first." + ) + await _generate_sync_manifest_async( + directory_path=directory_path, + parent_id=self.id, + manifest_path=manifest_path, + synapse_client=synapse_client, + ) + def flatten_file_list(self) -> List["File"]: """ Recursively loop over all of the already retrieved files and folders and return diff --git a/synapseclient/models/protocols/storable_container_protocol.py b/synapseclient/models/protocols/storable_container_protocol.py index cfbdf5f1d..496070315 100644 --- a/synapseclient/models/protocols/storable_container_protocol.py +++ b/synapseclient/models/protocols/storable_container_protocol.py @@ -291,3 +291,79 @@ def sync_to_synapse( ``` """ return [] + + def generate_sync_manifest( + self: Self, + directory_path: str, + manifest_path: str, + *, + synapse_client: Synapse | None = None, + ) -> None: + """Walk a local directory, mirror its folder hierarchy under this + container in Synapse, and write a CSV manifest ready for + [sync_to_synapse][synapseclient.models.mixins.StorableContainer.sync_to_synapse]. + + The generated manifest has two columns: path (always an absolute, + symlink-resolved path — the input directory_path is resolved via + os.path.realpath before walking, so the manifest can be consumed + from any working directory and remains valid even if the original + symlink is later removed or repointed) and parentId (the Synapse ID + of the file's containing folder). Folders that already exist in + Synapse with the same name and parent are reused rather than + re-created. + + Sibling folders at the same depth are created concurrently to reduce + latency on wide trees. Directories and files are traversed in sorted + order so manifest output is deterministic across runs and platforms. + Directory symlinks encountered inside directory_path are not + followed. File symlinks are not pruned: the symlink's path is + recorded in the manifest, and the underlying target's contents + are uploaded when the manifest is consumed. The root + directory_path itself may be a symlink; if so, it is resolved to + its target and the target is walked. Zero-byte files are skipped with a + warning, since Synapse rejects empty files. I/O errors raised by + os.walk (for example, unreadable subdirectories) are logged and + skipped. If no uploadable files are found under directory_path, a + warning is logged and a header-only manifest is written. If a + folder store call fails, the exception propagates and no manifest + is written; any folders already created remain in Synapse and will + be reused on a retry. + + Arguments: + directory_path: Path to the local directory to be pushed to + Synapse. + manifest_path: Path where the generated manifest CSV will be + written. + synapse_client: If not passed in and caching was not disabled by + Synapse.allow_client_caching(False) this will use the last + created instance from the Synapse class constructor. + + Returns: + None + + Raises: + ValueError: If this container's id is None, or if directory_path + does not exist or is not a directory, or if this container's + id exists in Synapse but is not a Folder or Project. + SynapseHTTPError: If this container's id does not exist in + Synapse. + + Example: Generate a manifest and upload the files + Mirror ./my_data under a Synapse project and then upload it. + + ```python + from synapseclient import Synapse + from synapseclient.models import Project + + syn = Synapse() + syn.login() + + project = Project(id="syn12345") + project.generate_sync_manifest( + directory_path="./my_data", + manifest_path="manifest.csv", + ) + project.sync_to_synapse(manifest_path="manifest.csv") + ``` + """ + return None diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index 0a9547ee3..2e1f0ded5 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -1,10 +1,13 @@ -"""Services for reading a Synapse manifest CSV file and preparing it for upload.""" +"""Services for reading and generating Synapse manifest CSV files used to +drive bulk upload via Project.sync_to_synapse / Folder.sync_to_synapse.""" from __future__ import annotations import ast import asyncio +import csv import datetime +import functools import os import re from dataclasses import dataclass @@ -33,6 +36,7 @@ from synapseclient.models import UsedEntity, UsedURL from synapseclient.models.file import File + from synapseclient.models.folder import Folder #: Scalar types that Synapse supports as annotation values. SynapseAnnotationType = datetime.datetime | float | int | bool | str @@ -1281,3 +1285,169 @@ async def _resolve_local_file_provenance( f"The provenance record for file: {owner_path} is incorrect.\n" f"Specifically {absolute_path} is not being uploaded and is not in Synapse." ) from e + + +GENERATED_MANIFEST_COLUMNS = ["path", "parentId"] + + +async def _generate_sync_manifest_async( + directory_path: str, + parent_id: str, + manifest_path: str, + *, + synapse_client: Synapse | None = None, +) -> None: + """Implementation backing + [StorableContainer.generate_sync_manifest_async][synapseclient.models.mixins.StorableContainer.generate_sync_manifest_async]. + + See the mixin method docstring for behavior, arguments, and examples. + """ + client = Synapse.get_client(synapse_client=synapse_client) + # realpath (not abspath) so that if directory_path is itself a symlink, + # the manifest records paths under the resolved target. That keeps the + # manifest valid if the original symlink is later removed or repointed. + directory_path = os.path.realpath(directory_path) + if not os.path.isdir(directory_path): + raise ValueError(f"{directory_path} is not a directory or does not exist") + await _validate_target_container_async(parent_id, client=client) + rows: list[dict[str, str]] = [] + # os.walk descends top-down, so a directory's parent is always registered + # here before the directory itself is visited. + local_to_synapse_parent: dict[str, str] = {directory_path: parent_id} + + for dirpath, dirnames, filenames in os.walk( + directory_path, onerror=functools.partial(_log_walk_error, client) + ): + # Drop symlinked dirs so we don't create Synapse folders for + # directories whose contents os.walk (followlinks=False) won't + # visit. Mutating dirnames in place is the documented os.walk hook + # for pruning the traversal. + dirnames[:] = [ + d for d in dirnames if not os.path.islink(os.path.join(dirpath, d)) + ] + # Sort in place so os.walk also descends in deterministic order. + dirnames.sort() + current_parent_id = local_to_synapse_parent[dirpath] + + created = await _create_child_folders_async( + parent_id=current_parent_id, dirnames=dirnames, client=client + ) + for dirname, folder in zip(dirnames, created): + local_to_synapse_parent[os.path.join(dirpath, dirname)] = folder.id + + rows.extend( + _collect_uploadable_rows(dirpath, filenames, current_parent_id, client) + ) + + if not rows: + client.logger.warning( + f"No uploadable files found under {directory_path};" + " generated manifest contains only the header row." + ) + _write_manifest_csv(manifest_path, rows) + + +async def _validate_target_container_async(parent_id: str, client: Synapse) -> None: + """Verify that parent_id resolves to a Folder or Project in Synapse. + + Dedicated to manifest generation so error messages reference the + container the user is writing into, not a manifest parentId column. + + Raises: + SynapseHTTPError: If parent_id does not exist in Synapse. + ValueError: If parent_id exists but is not a Folder or Project. + """ + try: + container = await factory_get_async( + synapse_id=parent_id, + file_options=FileOptions(download_file=False), + synapse_client=client, + ) + except SynapseHTTPError as err: + if getattr(err.response, "status_code", None) == 404: + client.logger.warning(f"{parent_id} is not a valid Synapse Id") + raise + + from synapseclient.models.folder import Folder + from synapseclient.models.project import Project + + if not isinstance(container, (Folder, Project)): + raise ValueError(f"Container {parent_id} is not a Folder or Project") + + +def _log_walk_error(client: Synapse, err: OSError) -> None: + """Log and skip an os.walk I/O error during manifest generation.""" + client.logger.warning( + f"Skipping unreadable path during manifest generation:" + f" {err.filename} ({err})" + ) + + +async def _create_child_folders_async( + parent_id: str, dirnames: list[str], client: Synapse +) -> list[Folder]: + """Create sibling folders concurrently under a shared Synapse parent. + + Sibling folders have no ordering dependency on each other, so they are + gathered in a single batch rather than awaited one at a time. + """ + from synapseclient.models.folder import Folder + + return await asyncio.gather( + *[ + Folder(name=dirname, parent_id=parent_id).store_async(synapse_client=client) + for dirname in dirnames + ] + ) + + +def _collect_uploadable_rows( + dirpath: str, + filenames: Iterable[str], + parent_id: str, + client: Synapse, +) -> list[dict[str, str]]: + """Build manifest rows for the uploadable files in a single directory. + + Files are visited in sorted order so manifest output is deterministic. + Unreadable and zero-byte files are logged and dropped by + _is_uploadable_file. + """ + rows: list[dict[str, str]] = [] + for filename in sorted(filenames): + filepath = os.path.join(dirpath, filename) + if _is_uploadable_file(filepath, client): + rows.append({"path": filepath, "parentId": parent_id}) + return rows + + +def _write_manifest_csv(manifest_path: str, rows: list[dict[str, str]]) -> None: + """Write generated manifest rows to a CSV at manifest_path.""" + with open(manifest_path, "w", encoding="utf-8", newline="") as fp: + writer = csv.DictWriter(fp, fieldnames=GENERATED_MANIFEST_COLUMNS) + writer.writeheader() + writer.writerows(rows) + + +def _is_uploadable_file(filepath: str, client: Synapse) -> bool: + """Return True if filepath can be included in a generated manifest. + + Logs a warning and returns False for files that cannot be uploaded: + unreadable files (broken symlinks, permission errors, races) and + zero-byte files (rejected by Synapse). + """ + try: + size = os.stat(filepath).st_size + except OSError as err: + client.logger.warning( + f"Skipping unreadable file during manifest generation:" + f" {filepath} ({err})" + ) + return False + if size == 0: + client.logger.warning( + f"Skipping zero-byte file (empty files cannot be" + f" uploaded to Synapse): {filepath}" + ) + return False + return True diff --git a/synapseclient/synapsePythonClient b/synapseclient/synapsePythonClient index a28e64ee8..5131531b8 100644 --- a/synapseclient/synapsePythonClient +++ b/synapseclient/synapsePythonClient @@ -1,6 +1,6 @@ { "client": "synapsePythonClient", - "latestVersion": "4.11.0", + "latestVersion": "4.12.0", "blacklist": [ "0.0.0", "0.4.1", diff --git a/synapseutils/sync.py b/synapseutils/sync.py index b7fd34023..6859b6b5f 100644 --- a/synapseutils/sync.py +++ b/synapseutils/sync.py @@ -1552,11 +1552,28 @@ def _check_size_each_file(df): ) +@deprecated( + version="4.13.0", + reason=( + "To be removed in 5.0.0. Use Project.generate_sync_manifest or " + "Folder.generate_sync_manifest instead, which write a CSV manifest " + "compatible with the OOP Project.sync_to_synapse / " + "Folder.sync_to_synapse methods." + ), +) def generate_sync_manifest(syn, directory_path, parent_id, manifest_path) -> None: """Generate manifest for [syncToSynapse][synapseutils.sync.syncToSynapse] from a local directory. [Read more about the manifest file format](../../explanations/manifest_tsv/) + .. deprecated:: 4.13.0 + To be removed in 5.0.0. Use + [Project.generate_sync_manifest][synapseclient.models.mixins.StorableContainer.generate_sync_manifest] + or + [Folder.generate_sync_manifest][synapseclient.models.mixins.StorableContainer.generate_sync_manifest] + instead, which produce a CSV manifest (with a parentId column) that + works directly with Project.sync_to_synapse and Folder.sync_to_synapse. + Arguments: syn: A Synapse object with user's login, e.g. syn = synapseclient.login() directory_path: Path to local directory to be pushed to Synapse. diff --git a/tests/integration/synapseclient/models/async/test_storable_container_async.py b/tests/integration/synapseclient/models/async/test_storable_container_async.py index 2c54dc453..87a1aca87 100644 --- a/tests/integration/synapseclient/models/async/test_storable_container_async.py +++ b/tests/integration/synapseclient/models/async/test_storable_container_async.py @@ -1,14 +1,19 @@ """Integration tests for StorableContainer""" import csv +import os +import platform import uuid from pathlib import Path from typing import Callable +import pandas as pd import pytest +import pytest_asyncio import synapseclient.core.utils as utils from synapseclient import Synapse +from synapseclient.core.exceptions import SynapseHTTPError from synapseclient.models import File, Folder, Project from synapseclient.models.activity import UsedURL @@ -396,3 +401,317 @@ async def test_error_column_rows_skipped( # THEN only the valid row was uploaded assert len(uploaded_files) == 1 assert uploaded_files[0].name == unique_name + + +class TestGenerateSyncManifest: + """Integration tests for StorableContainer.generate_sync_manifest_async + against the live Synapse API. + + These tests walk a local temporary directory and verify that the method + creates matching Synapse folders under a real container and writes a + manifest CSV that references the newly-created folder IDs. + """ + + @pytest_asyncio.fixture(loop_scope="session", scope="function") + async def scope_folder( + self, + syn: Synapse, + project_model: Project, + schedule_for_cleanup: Callable[..., None], + request: pytest.FixtureRequest, + ) -> Folder: + """A fresh Folder under the worker-scoped project per test, so + assertions can reference the folder's full child state without + interference from sibling tests. A Folder is cheaper to create than + a Project while providing equivalent isolation for these tests. + """ + folder = await Folder( + name=f"{request.node.name}_{uuid.uuid4()}", + parent_id=project_model.id, + ).store_async(synapse_client=syn) + schedule_for_cleanup(folder) + return folder + + async def test_flat_directory_uses_parent_id( + self, + syn: Synapse, + scope_folder: Folder, + tmp_path: Path, + ) -> None: + """Flat directories should produce a manifest where every file points + directly at the container's id, and empty files should be skipped. + No Synapse folders should be created when there are no subdirectories + to mirror. + """ + # GIVEN a flat directory of non-empty files plus one empty file + src = tmp_path / "flat" + src.mkdir() + (src / "a.txt").write_text("alpha") + (src / "b.txt").write_text("bravo") + (src / "empty.txt").write_text("") + manifest = tmp_path / "manifest.csv" + + # WHEN I generate a sync manifest on the scope folder + await scope_folder.generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=syn, + ) + + # THEN the manifest only contains the non-empty files, all pointing + # at the scope folder as their parent, and paths are absolute + df = pd.read_csv(manifest) + assert list(df.columns) == ["path", "parentId"] + assert sorted(os.path.basename(p) for p in df["path"]) == ["a.txt", "b.txt"] + assert (df["parentId"] == scope_folder.id).all() + for path in df["path"]: + assert os.path.isabs(path) + + # AND no folders were created under the scope folder + await scope_folder.sync_from_synapse_async( + download_file=False, recursive=False, synapse_client=syn + ) + assert scope_folder.folders == [] + + async def test_nested_directory_creates_folders( + self, + syn: Synapse, + scope_folder: Folder, + tmp_path: Path, + ) -> None: + """Nested directory trees should create matching Synapse folders at + each level, and the manifest parentId for each file should be the ID + of the Synapse folder corresponding to the file's on-disk directory. + """ + # GIVEN a nested directory tree with sibling folders at the root and a + # deeper leaf folder + src = tmp_path / "root" + sibling_a = src / "sibling_a" + sibling_b = src / "sibling_b" + deep = sibling_a / "deep" + deep.mkdir(parents=True) + sibling_b.mkdir() + + (src / "root.txt").write_text("at root") + (sibling_a / "a.txt").write_text("sibling a") + (sibling_b / "b.txt").write_text("sibling b") + (deep / "deep.txt").write_text("deep file") + manifest = tmp_path / "manifest.csv" + + # WHEN I generate a sync manifest on the scope folder + await scope_folder.generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=syn, + ) + + # THEN each manifest row's parentId identifies the Synapse folder that + # contains the file on disk + df = pd.read_csv(manifest) + by_basename = { + os.path.basename(p): pid for p, pid in zip(df["path"], df["parentId"]) + } + assert by_basename["root.txt"] == scope_folder.id + sibling_a_id = by_basename["a.txt"] + sibling_b_id = by_basename["b.txt"] + deep_id = by_basename["deep.txt"] + + for path in df["path"]: + assert os.path.isabs(path) + + # AND the Synapse tree matches the local layout + await scope_folder.sync_from_synapse_async( + download_file=False, recursive=True, synapse_client=syn + ) + top_level = {f.name: f for f in scope_folder.folders} + assert sorted(top_level) == ["sibling_a", "sibling_b"] + assert top_level["sibling_a"].id == sibling_a_id + assert top_level["sibling_b"].id == sibling_b_id + assert [(f.name, f.id) for f in top_level["sibling_a"].folders] == [ + ("deep", deep_id) + ] + assert top_level["sibling_b"].folders == [] + + async def test_existing_folders_are_reused( + self, + syn: Synapse, + scope_folder: Folder, + tmp_path: Path, + ) -> None: + """When a Synapse folder with a matching name already exists under + the container, the method should reuse its ID instead of creating a + new folder or raising a conflict. + """ + # GIVEN a folder that already exists in Synapse under the scope folder + folder_name = "preexisting" + existing = await Folder( + name=folder_name, parent_id=scope_folder.id + ).store_async(synapse_client=syn) + + # AND a local directory that mirrors that folder's name with a file inside + src = tmp_path / "root" + child = src / folder_name + child.mkdir(parents=True) + (child / "payload.txt").write_text("payload") + manifest = tmp_path / "manifest.csv" + + # WHEN I generate a sync manifest + await scope_folder.generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=syn, + ) + + # THEN the manifest reuses the existing folder's Synapse ID + df = pd.read_csv(manifest) + assert len(df) == 1 + assert df["parentId"].iloc[0] == existing.id + + # AND the scope folder has exactly the one pre-existing child folder + await scope_folder.sync_from_synapse_async( + download_file=False, recursive=False, synapse_client=syn + ) + assert len(scope_folder.folders) == 1 + assert scope_folder.folders[0].id == existing.id + + async def test_empty_directory_writes_header_only( + self, syn: Synapse, scope_folder: Folder, tmp_path: Path + ) -> None: + """An empty source directory should produce a manifest containing + only the CSV header row.""" + empty = tmp_path / "empty" + empty.mkdir() + manifest = tmp_path / "manifest.csv" + + await scope_folder.generate_sync_manifest_async( + directory_path=str(empty), + manifest_path=str(manifest), + synapse_client=syn, + ) + + assert manifest.read_text().strip().splitlines() == ["path,parentId"] + + async def test_missing_directory_raises( + self, syn: Synapse, project_model: Project, tmp_path: Path + ) -> None: + """A directory_path that does not exist on disk should raise + ValueError before any manifest file is written.""" + missing = tmp_path / "does_not_exist" + manifest = tmp_path / "manifest.csv" + + with pytest.raises(ValueError, match="is not a directory or does not exist"): + await project_model.generate_sync_manifest_async( + directory_path=str(missing), + manifest_path=str(manifest), + synapse_client=syn, + ) + assert not manifest.exists() + + async def test_invalid_parent_id_raises_http_error( + self, syn: Synapse, tmp_path: Path + ) -> None: + """A container whose id does not resolve to any Synapse entity should + surface as a SynapseHTTPError from the server, and no manifest file + should be written. + """ + src = tmp_path / "src" + src.mkdir() + (src / "a.txt").write_text("hello") + manifest = tmp_path / "manifest.csv" + + with pytest.raises(SynapseHTTPError): + await Folder(id="syn00000000").generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=syn, + ) + assert not manifest.exists() + + async def test_output_is_sorted_deterministically( + self, + syn: Synapse, + scope_folder: Folder, + tmp_path: Path, + ) -> None: + """Directories and files should be traversed in sorted order so the + generated manifest is deterministic regardless of filesystem + iteration order. + """ + # GIVEN subdirs and files created in non-alphabetical order + src = tmp_path / "root" + for dirname in ["charlie", "alpha", "bravo"]: + subdir = src / dirname + subdir.mkdir(parents=True) + for filename in ["z.txt", "a.txt", "m.txt"]: + (subdir / filename).write_text("payload") + manifest = tmp_path / "manifest.csv" + + await scope_folder.generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=syn, + ) + + # THEN manifest rows are sorted first by directory name, then by file + # name within each directory + df = pd.read_csv(manifest) + ordered = [ + (os.path.basename(os.path.dirname(p)), os.path.basename(p)) + for p in df["path"] + ] + assert ordered == [ + ("alpha", "a.txt"), + ("alpha", "m.txt"), + ("alpha", "z.txt"), + ("bravo", "a.txt"), + ("bravo", "m.txt"), + ("bravo", "z.txt"), + ("charlie", "a.txt"), + ("charlie", "m.txt"), + ("charlie", "z.txt"), + ] + + @pytest.mark.skipif( + platform.system() == "Windows", + reason="Creating symlinks on Windows requires elevated privileges.", + ) + async def test_directory_symlinks_are_not_followed( + self, + syn: Synapse, + scope_folder: Folder, + tmp_path: Path, + ) -> None: + """Symlinks to directories should not be recursed into — files under + a symlinked directory must not appear in the generated manifest, and + no Synapse folder should be created for the symlinked directory. + """ + # GIVEN a tree where a sibling directory is a symlink to a real + # directory that contains a file + src = tmp_path / "root" + real = src / "real" + real.mkdir(parents=True) + (real / "real_file.txt").write_text("real") + os.symlink(real, src / "link", target_is_directory=True) + manifest = tmp_path / "manifest.csv" + + await scope_folder.generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=syn, + ) + + # THEN only the file in the real directory appears in the manifest + df = pd.read_csv(manifest) + parents_and_names = [ + (os.path.basename(os.path.dirname(p)), os.path.basename(p)) + for p in df["path"] + ] + assert parents_and_names == [("real", "real_file.txt")] + + # AND no Synapse folder is created for the symlinked directory: + # only the on-disk "real" sibling is mirrored; the "link" symlink + # was pruned during the walk and produced no Synapse folder. + await scope_folder.sync_from_synapse_async( + download_file=False, recursive=False, synapse_client=syn + ) + assert sorted(f.name for f in scope_folder.folders) == ["real"] diff --git a/tests/unit/synapseclient/models/async/unit_test_storable_container_async.py b/tests/unit/synapseclient/models/async/unit_test_storable_container_async.py index a954b6408..bc0d047e3 100644 --- a/tests/unit/synapseclient/models/async/unit_test_storable_container_async.py +++ b/tests/unit/synapseclient/models/async/unit_test_storable_container_async.py @@ -1,14 +1,17 @@ """Unit tests for StorableContainer""" import csv +import os import uuid from pathlib import Path -from unittest.mock import patch +from typing import Any +from unittest.mock import AsyncMock, patch +import pandas as pd import pytest from synapseclient import Synapse -from synapseclient.models import Project +from synapseclient.models import Folder, Project def _write_manifest(rows: list[dict], tmp_path: Path) -> Path: @@ -81,3 +84,204 @@ async def test_all_rows_have_errors_no_crash(self, tmp_path: Path) -> None: ) mock_upload.assert_not_called() assert result == [] + + +class TestGenerateSyncManifest: + """Tests for StorableContainer.generate_sync_manifest_async.""" + + @pytest.fixture(autouse=True) + def init_syn(self, syn: Synapse) -> None: + self.syn = syn + + @pytest.fixture(autouse=True) + def stub_parent_check(self) -> Any: + """Bypass the upfront parent_id validation so tests don't hit the network. + Tests that exercise the validation itself patch this explicitly.""" + with patch( + "synapseclient.models.services.manifest._validate_target_container_async", + new=AsyncMock(return_value=None), + ) as mock: + yield mock + + async def test_unstored_container_raises(self, tmp_path: Path) -> None: + """Calling the method on a container whose id is None is a caller bug + and must surface as a ValueError rather than a confusing downstream + error.""" + src = tmp_path / "src" + src.mkdir() + (src / "a.txt").write_text("hello") + manifest = tmp_path / "manifest.csv" + + with pytest.raises(ValueError, match="has not been stored in Synapse"): + await Project(name="unstored").generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=self.syn, + ) + + async def test_empty_directory_writes_header_only(self, tmp_path: Path) -> None: + """An empty directory produces a manifest with only the header row and + logs a warning so the caller is not left wondering why no files uploaded.""" + empty_dir = tmp_path / "empty" + empty_dir.mkdir() + manifest = tmp_path / "manifest.csv" + + with patch.object(self.syn.logger, "warning") as mock_warning: + await Folder(id="syn1").generate_sync_manifest_async( + directory_path=str(empty_dir), + manifest_path=str(manifest), + synapse_client=self.syn, + ) + + assert manifest.read_text().strip().splitlines() == ["path,parentId"] + assert any( + "No uploadable files found" in call.args[0] + for call in mock_warning.call_args_list + ) + + async def test_flat_directory_lists_files_skips_empty(self, tmp_path: Path) -> None: + """Non-empty files each produce a manifest row sharing the container's + id as parentId; zero-byte files are skipped.""" + src = tmp_path / "src" + src.mkdir() + (src / "a.txt").write_text("hello") + (src / "b.txt").write_text("world") + (src / "empty.txt").write_text("") + manifest = tmp_path / "manifest.csv" + + await Folder(id="syn42").generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=self.syn, + ) + + df = pd.read_csv(manifest) + assert list(df.columns) == ["path", "parentId"] + assert sorted(os.path.basename(p) for p in df["path"]) == ["a.txt", "b.txt"] + assert (df["parentId"] == "syn42").all() + + async def test_nested_directories_thread_parent_ids(self, tmp_path: Path) -> None: + """Files in nested directories get the Synapse ID of their innermost + Synapse folder as parentId, and Folder.store_async is called once per + subdirectory with the correct name/parent_id.""" + src = tmp_path / "src" + (src / "level1" / "level2").mkdir(parents=True) + (src / "level1" / "level2" / "deep.txt").write_text("content") + manifest = tmp_path / "manifest.csv" + + folder_id_iter = iter(["syn100", "syn200"]) + observed: list[tuple[str, str]] = [] + + async def fake_store_async(self: Any, *args: Any, **kwargs: Any) -> Any: + observed.append((self.name, self.parent_id)) + self.id = next(folder_id_iter) + return self + + with patch( + "synapseclient.models.Folder.store_async", + new=fake_store_async, + ): + await Folder(id="synROOT").generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=self.syn, + ) + + assert observed == [("level1", "synROOT"), ("level2", "syn100")] + df = pd.read_csv(manifest) + assert df["path"].tolist() == [ + os.path.join(os.path.realpath(str(src)), "level1", "level2", "deep.txt") + ] + assert df["parentId"].tolist() == ["syn200"] + + async def test_output_is_csv_not_tsv(self, tmp_path: Path) -> None: + """Output uses comma delimiter (CSV), matching the new manifest reader. + + The legacy synapseutils.generate_sync_manifest wrote TSV with a + 'parent' column; the OOP sync_to_synapse expects CSV with 'parentId'. + """ + src = tmp_path / "src" + src.mkdir() + (src / "file.txt").write_text("data") + manifest = tmp_path / "manifest.csv" + + await Folder(id="syn1").generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=self.syn, + ) + + text = manifest.read_text() + assert "\t" not in text + assert text.startswith("path,parentId") + + async def test_missing_directory_raises(self, tmp_path: Path) -> None: + """A directory_path that does not exist raises ValueError instead of + silently writing an empty manifest.""" + missing = tmp_path / "does_not_exist" + manifest = tmp_path / "manifest.csv" + + with pytest.raises(ValueError, match="is not a directory or does not exist"): + await Folder(id="syn1").generate_sync_manifest_async( + directory_path=str(missing), + manifest_path=str(manifest), + synapse_client=self.syn, + ) + + assert not manifest.exists() + + async def test_file_path_raises(self, tmp_path: Path) -> None: + """A directory_path pointing at a file (not a directory) raises ValueError.""" + file_path = tmp_path / "a.txt" + file_path.write_text("hello") + manifest = tmp_path / "manifest.csv" + + with pytest.raises(ValueError, match="is not a directory or does not exist"): + await Folder(id="syn1").generate_sync_manifest_async( + directory_path=str(file_path), + manifest_path=str(manifest), + synapse_client=self.syn, + ) + + async def test_parent_id_validated_upfront( + self, tmp_path: Path, stub_parent_check: AsyncMock + ) -> None: + """The container's id is validated via _validate_target_container_async + before the directory is walked.""" + src = tmp_path / "src" + src.mkdir() + (src / "a.txt").write_text("hello") + manifest = tmp_path / "manifest.csv" + + await Folder(id="synROOT").generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=self.syn, + ) + + stub_parent_check.assert_awaited_once_with("synROOT", client=self.syn) + + async def test_invalid_parent_id_propagates_http_error( + self, tmp_path: Path + ) -> None: + """If the upfront parent_id check raises SynapseHTTPError, no manifest + is written and the error propagates to the caller.""" + from synapseclient.core.exceptions import SynapseHTTPError + + src = tmp_path / "src" + src.mkdir() + (src / "a.txt").write_text("hello") + manifest = tmp_path / "manifest.csv" + + with patch( + "synapseclient.models.services.manifest._validate_target_container_async", + new=AsyncMock(side_effect=SynapseHTTPError("Not found")), + ): + with pytest.raises(SynapseHTTPError): + await Folder(id="synBOGUS").generate_sync_manifest_async( + directory_path=str(src), + manifest_path=str(manifest), + synapse_client=self.syn, + ) + + assert not manifest.exists() From 063674bb6b1fd62f66e1b52db0f780a9000d5c41 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 5 May 2026 12:10:09 -0700 Subject: [PATCH 02/11] rename function --- synapseclient/models/mixins/storable_container.py | 4 ++-- synapseclient/models/services/manifest.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py index 71231a74a..1287495d2 100644 --- a/synapseclient/models/mixins/storable_container.py +++ b/synapseclient/models/mixins/storable_container.py @@ -48,7 +48,7 @@ StorableContainerSynchronousProtocol, ) from synapseclient.models.services.manifest import ( - _generate_sync_manifest_async, + generate_sync_manifest, read_manifest_for_upload, upload_sync_files, ) @@ -710,7 +710,7 @@ async def main(): " that has not been stored in Synapse. Set id on this" " container (or store it) first." ) - await _generate_sync_manifest_async( + await generate_sync_manifest( directory_path=directory_path, parent_id=self.id, manifest_path=manifest_path, diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index 2e1f0ded5..60709f96e 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -1290,7 +1290,7 @@ async def _resolve_local_file_provenance( GENERATED_MANIFEST_COLUMNS = ["path", "parentId"] -async def _generate_sync_manifest_async( +async def generate_sync_manifest( directory_path: str, parent_id: str, manifest_path: str, From edf9c1e6ab4f579b67a2aa05b7d56552284faa80 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 5 May 2026 12:14:00 -0700 Subject: [PATCH 03/11] add docstring --- synapseclient/models/services/manifest.py | 45 +++++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index 60709f96e..2e592035d 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -1297,10 +1297,49 @@ async def generate_sync_manifest( *, synapse_client: Synapse | None = None, ) -> None: - """Implementation backing - [StorableContainer.generate_sync_manifest_async][synapseclient.models.mixins.StorableContainer.generate_sync_manifest_async]. + """Walk a local directory, mirror its folder hierarchy under parent_id in + Synapse, and write a CSV manifest ready for sync_to_synapse_async. + + The generated manifest has two columns: path (always an absolute, + symlink-resolved path — directory_path is resolved via os.path.realpath + before walking, so the manifest can be consumed from any working + directory and remains valid even if the original symlink is later + removed or repointed) and parentId (the Synapse ID of the file's + containing folder). Folders that already exist in Synapse with the same + name and parent are reused rather than re-created. + + Sibling folders at the same depth are created concurrently to reduce + latency on wide trees. Directories and files are traversed in sorted + order so manifest output is deterministic across runs and platforms. + Directory symlinks encountered inside directory_path are not followed. + File symlinks are not pruned: the symlink's path is recorded in the + manifest, and the underlying target's contents are uploaded when the + manifest is consumed. The root directory_path itself may be a symlink; + if so, it is resolved to its target and the target is walked. Zero-byte + files are skipped with a warning, since Synapse rejects empty files. + I/O errors raised by os.walk (for example, unreadable subdirectories) + are logged and skipped. If no uploadable files are found under + directory_path, a warning is logged and a header-only manifest is + written. If a folder store call fails, the exception propagates and no + manifest is written; any folders already created remain in Synapse and + will be reused on a retry. - See the mixin method docstring for behavior, arguments, and examples. + Arguments: + directory_path: Path to the local directory to be pushed to Synapse. + parent_id: Synapse ID of the Folder or Project to mirror the + directory hierarchy under. + manifest_path: Path where the generated manifest CSV will be written. + synapse_client: If not passed in and caching was not disabled by + Synapse.allow_client_caching(False) this will use the last + created instance from the Synapse class constructor. + + Returns: + None + + Raises: + ValueError: If directory_path does not exist or is not a directory, + or if parent_id exists in Synapse but is not a Folder or Project. + SynapseHTTPError: If parent_id does not exist in Synapse. """ client = Synapse.get_client(synapse_client=synapse_client) # realpath (not abspath) so that if directory_path is itself a symlink, From f4a3138a80883dbae16b307caab56a34ae417c83 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 6 May 2026 07:57:42 -0700 Subject: [PATCH 04/11] refactoring to improve readability --- synapseclient/models/services/manifest.py | 135 ++++++++++++++++------ 1 file changed, 100 insertions(+), 35 deletions(-) diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index 2e592035d..b9380d379 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -1337,46 +1337,20 @@ async def generate_sync_manifest( None Raises: - ValueError: If directory_path does not exist or is not a directory, - or if parent_id exists in Synapse but is not a Folder or Project. + ValueError: If the parent directory of manifest_path does not exist, + if directory_path does not exist or is not a directory, or if + parent_id exists in Synapse but is not a Folder or Project. SynapseHTTPError: If parent_id does not exist in Synapse. """ client = Synapse.get_client(synapse_client=synapse_client) - # realpath (not abspath) so that if directory_path is itself a symlink, - # the manifest records paths under the resolved target. That keeps the - # manifest valid if the original symlink is later removed or repointed. - directory_path = os.path.realpath(directory_path) - if not os.path.isdir(directory_path): - raise ValueError(f"{directory_path} is not a directory or does not exist") - await _validate_target_container_async(parent_id, client=client) - rows: list[dict[str, str]] = [] - # os.walk descends top-down, so a directory's parent is always registered - # here before the directory itself is visited. - local_to_synapse_parent: dict[str, str] = {directory_path: parent_id} - for dirpath, dirnames, filenames in os.walk( - directory_path, onerror=functools.partial(_log_walk_error, client) - ): - # Drop symlinked dirs so we don't create Synapse folders for - # directories whose contents os.walk (followlinks=False) won't - # visit. Mutating dirnames in place is the documented os.walk hook - # for pruning the traversal. - dirnames[:] = [ - d for d in dirnames if not os.path.islink(os.path.join(dirpath, d)) - ] - # Sort in place so os.walk also descends in deterministic order. - dirnames.sort() - current_parent_id = local_to_synapse_parent[dirpath] + manifest_path = _expand_path(manifest_path) + manifest_parent = os.path.dirname(manifest_path) or "." + if not os.path.isdir(manifest_parent): + raise ValueError(f"Manifest output directory does not exist: {manifest_parent}") + directory_path = await _validate_directory_path(directory_path, parent_id, client) - created = await _create_child_folders_async( - parent_id=current_parent_id, dirnames=dirnames, client=client - ) - for dirname, folder in zip(dirnames, created): - local_to_synapse_parent[os.path.join(dirpath, dirname)] = folder.id - - rows.extend( - _collect_uploadable_rows(dirpath, filenames, current_parent_id, client) - ) + rows = await _collect_manifest_rows(directory_path, parent_id, client) if not rows: client.logger.warning( @@ -1386,6 +1360,37 @@ async def generate_sync_manifest( _write_manifest_csv(manifest_path, rows) +async def _validate_directory_path( + directory_path: str, parent_id: str, client: Synapse +) -> str: + """Resolve symlinks on directory_path, verify it is an existing directory, + and confirm parent_id points to a Folder or Project in Synapse. + + Uses realpath (not abspath) so that if directory_path is itself a + symlink, the manifest records paths under the resolved target. That + keeps the manifest valid if the original symlink is later removed or + re-pointed. + + Arguments: + directory_path: Path to validate. + parent_id: Synapse ID of the target container to validate. + client: Authenticated Synapse client. + + Returns: + The realpath-resolved absolute directory path. + + Raises: + ValueError: If the resolved path does not exist or is not a + directory, or if parent_id exists but is not a Folder or Project. + SynapseHTTPError: If parent_id does not exist in Synapse. + """ + directory_path = os.path.realpath(_expand_path(directory_path)) + if not os.path.isdir(directory_path): + raise ValueError(f"{directory_path} is not a directory or does not exist") + await _validate_target_container_async(parent_id, client=client) + return directory_path + + async def _validate_target_container_async(parent_id: str, client: Synapse) -> None: """Verify that parent_id resolves to a Folder or Project in Synapse. @@ -1422,6 +1427,66 @@ def _log_walk_error(client: Synapse, err: OSError) -> None: ) +async def _collect_manifest_rows( + directory_path: str, parent_id: str, client: Synapse +) -> list[dict[str, str]]: + """Walk directory_path and produce manifest rows for every uploadable file. + + Mirrors the local folder hierarchy under parent_id in Synapse as a side + effect: sibling folders at each depth are created concurrently, and + existing Synapse folders with the same name and parent are reused. + Directory symlinks are pruned so os.walk's traversal stays consistent + with the folders actually created in Synapse. + + Arguments: + directory_path: Realpath-resolved local directory to walk. + parent_id: Synapse ID of the container that maps to directory_path. + client: Authenticated Synapse client. + + Returns: + A list of dicts with path and parentId keys, one per uploadable file. + """ + rows: list[dict[str, str]] = [] + # os.walk descends top-down, so a directory's parent is always registered + # here before the directory itself is visited. + local_to_synapse_parent: dict[str, str] = {directory_path: parent_id} + + for dirpath, dirnames, filenames in os.walk( + directory_path, onerror=functools.partial(_log_walk_error, client) + ): + _prepare_dirnames(dirnames, dirpath) + current_parent_id = local_to_synapse_parent[dirpath] + + created = await _create_child_folders_async( + parent_id=current_parent_id, dirnames=dirnames, client=client + ) + for dirname, folder in zip(dirnames, created): + local_to_synapse_parent[os.path.join(dirpath, dirname)] = folder.id + + rows.extend( + _collect_uploadable_rows(dirpath, filenames, current_parent_id, client) + ) + + return rows + + +def _prepare_dirnames(dirnames: list[str], dirpath: str) -> None: + """Prune symlinked subdirectories and sort the rest in place. + + Mutates dirnames in place — the documented os.walk hook for both + pruning the traversal and forcing deterministic descent order. + Symlinked subdirectories are dropped so we don't create Synapse + folders for directories whose contents os.walk (followlinks=False) + won't visit. + + Arguments: + dirnames: The dirnames list yielded by os.walk for the current dirpath. + dirpath: The current directory being walked. + """ + dirnames[:] = [d for d in dirnames if not os.path.islink(os.path.join(dirpath, d))] + dirnames.sort() + + async def _create_child_folders_async( parent_id: str, dirnames: list[str], client: Synapse ) -> list[Folder]: From f0c16cb7ac8c931bc29f2e1687ac51e5b8450aad Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 6 May 2026 08:52:00 -0700 Subject: [PATCH 05/11] improve readability --- synapseclient/models/services/manifest.py | 49 +++++++++++++++-------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index 7f1269060..513cf7632 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -34,8 +34,7 @@ test_import_pandas, topolgical_sort, ) -from synapseclient.operations.factory_operations import FileOptions -from synapseclient.operations.factory_operations import get_async as factory_get_async +from synapseclient.operations.factory_operations import FileOptions, get_async if TYPE_CHECKING: from pandas import DataFrame, Series @@ -711,7 +710,7 @@ async def _check_one(syn_id: str) -> None: if not syn_id: return try: - container = await factory_get_async( + container = await get_async( synapse_id=syn_id, file_options=FileOptions(download_file=False), synapse_client=syn, @@ -1550,7 +1549,7 @@ async def generate_sync_manifest( symlink-resolved path — directory_path is resolved via os.path.realpath before walking, so the manifest can be consumed from any working directory and remains valid even if the original symlink is later - removed or repointed) and parentId (the Synapse ID of the file's + removed or re-pointed) and parentId (the Synapse ID of the file's containing folder). Folders that already exist in Synapse with the same name and parent are reused rather than re-created. @@ -1584,8 +1583,9 @@ async def generate_sync_manifest( Raises: ValueError: If the parent directory of manifest_path does not exist, - if directory_path does not exist or is not a directory, or if - parent_id exists in Synapse but is not a Folder or Project. + if manifest_path is an existing directory, if directory_path + does not exist or is not a directory, or if parent_id exists in + Synapse but is not a Folder or Project. SynapseHTTPError: If parent_id does not exist in Synapse. """ client = Synapse.get_client(synapse_client=synapse_client) @@ -1594,6 +1594,10 @@ async def generate_sync_manifest( manifest_parent = os.path.dirname(manifest_path) or "." if not os.path.isdir(manifest_parent): raise ValueError(f"Manifest output directory does not exist: {manifest_parent}") + if os.path.isdir(manifest_path): + raise ValueError( + f"Manifest output path is an existing directory, not a file: {manifest_path}" + ) directory_path = await _validate_directory_path(directory_path, parent_id, client) rows = await _collect_manifest_rows(directory_path, parent_id, client) @@ -1648,7 +1652,7 @@ async def _validate_target_container_async(parent_id: str, client: Synapse) -> N ValueError: If parent_id exists but is not a Folder or Project. """ try: - container = await factory_get_async( + container = await get_async( synapse_id=parent_id, file_options=FileOptions(download_file=False), synapse_client=client, @@ -1706,7 +1710,7 @@ async def _collect_manifest_rows( created = await _create_child_folders_async( parent_id=current_parent_id, dirnames=dirnames, client=client ) - for dirname, folder in zip(dirnames, created): + for dirname, folder in created.items(): local_to_synapse_parent[os.path.join(dirpath, dirname)] = folder.id rows.extend( @@ -1735,20 +1739,33 @@ def _prepare_dirnames(dirnames: list[str], dirpath: str) -> None: async def _create_child_folders_async( parent_id: str, dirnames: list[str], client: Synapse -) -> list[Folder]: +) -> dict[str, Folder]: """Create sibling folders concurrently under a shared Synapse parent. Sibling folders have no ordering dependency on each other, so they are - gathered in a single batch rather than awaited one at a time. + gathered in a single batch rather than awaited one at a time. Each task + returns its own dirname alongside the resulting Folder so the caller's + name-to-Folder mapping does not depend on asyncio.gather preserving + submission order. + + Returns: + A dict mapping each input dirname to the Folder that was created or + reused for it. """ from synapseclient.models.folder import Folder - return await asyncio.gather( - *[ - Folder(name=dirname, parent_id=parent_id).store_async(synapse_client=client) - for dirname in dirnames - ] - ) + # Each task carries its dirname through to the result so the caller can + # build a name-to-Folder mapping without relying on asyncio.gather's + # input-order guarantee. + # Pair-tagged results remain correct under any completion order. + async def _store(dirname: str) -> tuple[str, Folder]: + folder = await Folder(name=dirname, parent_id=parent_id).store_async( + synapse_client=client + ) + return dirname, folder + + pairs = await asyncio.gather(*[_store(d) for d in dirnames]) + return dict(pairs) def _collect_uploadable_rows( From 9c56b71740ce97b89c9e0a4f7ed4e62b44585df7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 6 May 2026 13:17:04 -0700 Subject: [PATCH 06/11] added unit tests --- synapseclient/models/services/manifest.py | 218 ++++-- .../unit_test_storable_container_async.py | 734 ++++++++++++++---- 2 files changed, 755 insertions(+), 197 deletions(-) diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index 513cf7632..b8d63c9ee 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -16,7 +16,7 @@ import os import re from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Iterable, NamedTuple, Union +from typing import TYPE_CHECKING, Any, Iterable, NamedTuple, TypedDict, Union from synapseclient import Synapse from synapseclient.core import utils @@ -1535,6 +1535,17 @@ async def _resolve_local_file_provenance( GENERATED_MANIFEST_COLUMNS = ["path", "parentId"] +class ManifestRow(TypedDict): + """Shape of a single row in the generated manifest CSV. + + Keys mirror GENERATED_MANIFEST_COLUMNS so the dict can be passed + directly to csv.DictWriter without conversion. + """ + + path: str + parentId: str + + async def generate_sync_manifest( directory_path: str, parent_id: str, @@ -1598,9 +1609,10 @@ async def generate_sync_manifest( raise ValueError( f"Manifest output path is an existing directory, not a file: {manifest_path}" ) - directory_path = await _validate_directory_path(directory_path, parent_id, client) + directory_path = await _resolve_and_validate_directory_path_async(directory_path) + await _validate_target_container_async(parent_id, client=client) - rows = await _collect_manifest_rows(directory_path, parent_id, client) + rows = await _collect_manifest_rows_async(directory_path, parent_id, client) if not rows: client.logger.warning( @@ -1610,11 +1622,9 @@ async def generate_sync_manifest( _write_manifest_csv(manifest_path, rows) -async def _validate_directory_path( - directory_path: str, parent_id: str, client: Synapse -) -> str: - """Resolve symlinks on directory_path, verify it is an existing directory, - and confirm parent_id points to a Folder or Project in Synapse. +async def _resolve_and_validate_directory_path_async(directory_path: str) -> str: + """Resolve symlinks on directory_path and verify it is an existing + directory. Uses realpath (not abspath) so that if directory_path is itself a symlink, the manifest records paths under the resolved target. That @@ -1623,104 +1633,159 @@ async def _validate_directory_path( Arguments: directory_path: Path to validate. - parent_id: Synapse ID of the target container to validate. - client: Authenticated Synapse client. Returns: The realpath-resolved absolute directory path. Raises: ValueError: If the resolved path does not exist or is not a - directory, or if parent_id exists but is not a Folder or Project. - SynapseHTTPError: If parent_id does not exist in Synapse. + directory. """ directory_path = os.path.realpath(_expand_path(directory_path)) if not os.path.isdir(directory_path): raise ValueError(f"{directory_path} is not a directory or does not exist") - await _validate_target_container_async(parent_id, client=client) return directory_path async def _validate_target_container_async(parent_id: str, client: Synapse) -> None: """Verify that parent_id resolves to a Folder or Project in Synapse. - Dedicated to manifest generation so error messages reference the - container the user is writing into, not a manifest parentId column. - Raises: - SynapseHTTPError: If parent_id does not exist in Synapse. ValueError: If parent_id exists but is not a Folder or Project. """ - try: - container = await get_async( - synapse_id=parent_id, - file_options=FileOptions(download_file=False), - synapse_client=client, - ) - except SynapseHTTPError as err: - if getattr(err.response, "status_code", None) == 404: - client.logger.warning(f"{parent_id} is not a valid Synapse Id") - raise + container = await get_async( + synapse_id=parent_id, + file_options=FileOptions(download_file=False), + synapse_client=client, + ) - from synapseclient.models.folder import Folder - from synapseclient.models.project import Project + from synapseclient.models import Folder, Project if not isinstance(container, (Folder, Project)): raise ValueError(f"Container {parent_id} is not a Folder or Project") -def _log_walk_error(client: Synapse, err: OSError) -> None: - """Log and skip an os.walk I/O error during manifest generation.""" - client.logger.warning( - f"Skipping unreadable path during manifest generation:" - f" {err.filename} ({err})" - ) - - -async def _collect_manifest_rows( +async def _collect_manifest_rows_async( directory_path: str, parent_id: str, client: Synapse -) -> list[dict[str, str]]: +) -> list[ManifestRow]: """Walk directory_path and produce manifest rows for every uploadable file. - Mirrors the local folder hierarchy under parent_id in Synapse as a side - effect: sibling folders at each depth are created concurrently, and - existing Synapse folders with the same name and parent are reused. - Directory symlinks are pruned so os.walk's traversal stays consistent - with the folders actually created in Synapse. - - Arguments: - directory_path: Realpath-resolved local directory to walk. - parent_id: Synapse ID of the container that maps to directory_path. - client: Authenticated Synapse client. - - Returns: - A list of dicts with path and parentId keys, one per uploadable file. + Orchestrates the layer between os.walk (filesystem traversal) and the + Synapse folder/file model. Called once by generate_sync_manifest, after + path validation and before _write_manifest_csv. + + Algorithm: + + 1. Initialize a local-to-Synapse parent map seeded with + directory_path → parent_id. Subfolder IDs are added as they are + created so descendants can look up their Synapse parent. + 2. Walk top-down with os.walk. Top-down order matters — a directory's + parent is always yielded (and registered) before the directory + itself is visited. Walk errors (permission denied, broken paths) + are logged via _log_walk_error rather than aborting traversal. + + For each directory: + + 3. Prune symlinks and sort dirnames in place via + _prune_symlinks_and_sort_dirnames so Synapse folders aren't created + for directories os.walk(followlinks=False) won't visit, and + descent is deterministic. + 4. Look up current_parent_id from the map (guaranteed present by the + top-down invariant). + 5. Create sibling folders concurrently via _create_child_folders_async + (asyncio.gather over Folder.store_async). Existing Synapse folders + with the same name and parent are reused, not duplicated. + 6. Register each new folder's Synapse ID in the map keyed by its + absolute local path so step 4 succeeds when os.walk descends into + it on a later iteration. + 7. Build manifest rows for files in the current directory via + _build_manifest_rows. Unreadable and zero-byte files are filtered + out with warnings inside that helper. + + Side effect vs return value: the return value is the flat list of + ManifestRow entries, but the side effect — creating the Synapse + folder hierarchy under parent_id to mirror the local tree — is what + makes the rows usable. Without it the parentId values in the rows + would point at folders that don't exist yet. + + Async because _create_child_folders_async hits the Synapse API and + parallelizes sibling-folder creation. The rest (walking, sorting, + building rows) is local I/O and would otherwise be sync. + + Arguments: + directory_path: Realpath-resolved local directory to walk. + parent_id: Synapse ID of the container that maps to directory_path. + client: Authenticated Synapse client. + + Returns: + A list of ManifestRow entries, one per uploadable file. """ - rows: list[dict[str, str]] = [] - # os.walk descends top-down, so a directory's parent is always registered - # here before the directory itself is visited. + rows: list[ManifestRow] = [] + # Step 1: seed the local-to-Synapse parent map with the root mapping. local_to_synapse_parent: dict[str, str] = {directory_path: parent_id} + # Step 2: walk the input dir. for dirpath, dirnames, filenames in os.walk( directory_path, onerror=functools.partial(_log_walk_error, client) ): - _prepare_dirnames(dirnames, dirpath) + # Step 3: prune symlinked dirs and sort in place + _prune_symlinks_and_sort_dirnames(dirnames, dirpath) + + # Step 4: look up the Synapse parent for the current local dir current_parent_id = local_to_synapse_parent[dirpath] + # Step 5: create sibling folders concurrently. created = await _create_child_folders_async( parent_id=current_parent_id, dirnames=dirnames, client=client ) + # Step 6: register each new folder's Synapse ID. for dirname, folder in created.items(): local_to_synapse_parent[os.path.join(dirpath, dirname)] = folder.id - rows.extend( - _collect_uploadable_rows(dirpath, filenames, current_parent_id, client) - ) + # Step 7: build rows for files in this directory and accumulate. + rows.extend(_build_manifest_rows(dirpath, filenames, current_parent_id, client)) return rows -def _prepare_dirnames(dirnames: list[str], dirpath: str) -> None: +def _log_walk_error(client: Synapse, err: OSError) -> None: + """Turn an os.walk I/O error from fatal into logged-and-skipped. + + By default os.walk silently ignores any OSError it hits while listing a + directory (e.g., permission denied, vanished symlink target, dead mount + point). Silent skipping is dangerous during manifest generation because + the user would get an incomplete manifest with no indication that some + subtree was missed. os.walk accepts an onerror callback that, if + provided, is invoked with the OSError instead — which is what + _collect_manifest_rows_async wires up via functools.partial. + + This callback logs a warning through the Synapse client's logger naming + the offending path (err.filename) and the underlying error message, then + returns. Returning normally (rather than raising) is the contract that + tells os.walk to keep going, so traversal continues into the rest of the + tree. + + The client argument is first only so the caller can bind it via + functools.partial, leaving the (err: OSError) signature os.walk requires. + The client is needed solely to reach client.logger. + + Net effect: unreadable directories produce a visible warning but do not + abort manifest generation; readable parts of the tree still produce + manifest rows. This mirrors _build_manifest_rows, which also + warns-and-skips on unreadable or zero-byte files rather than failing the + whole run. + + Arguments: + client: Authenticated Synapse client, used only for its logger. + err: The OSError raised by os.walk while listing a directory. + """ + client.logger.warning( + f"Skipping unreadable path during manifest generation:" + f" {err.filename} ({err})" + ) + + +def _prune_symlinks_and_sort_dirnames(dirnames: list[str], dirpath: str) -> None: """Prune symlinked subdirectories and sort the rest in place. Mutates dirnames in place — the documented os.walk hook for both @@ -1768,19 +1833,38 @@ async def _store(dirname: str) -> tuple[str, Folder]: return dict(pairs) -def _collect_uploadable_rows( +def _build_manifest_rows( dirpath: str, filenames: Iterable[str], parent_id: str, client: Synapse, -) -> list[dict[str, str]]: +) -> list[ManifestRow]: """Build manifest rows for the uploadable files in a single directory. - Files are visited in sorted order so manifest output is deterministic. - Unreadable and zero-byte files are logged and dropped by - _is_uploadable_file. + Called once per directory by _collect_manifest_rows_async during the + os.walk traversal. All files in a single call share the same parent_id + (the Synapse folder corresponding to dirpath). Sync because everything + it does is local I/O with no Synapse API calls; the async parent only + awaits when creating Synapse folders, not when scanning files. + + Filenames are sorted before iteration so output is deterministic across + runs and platforms (os.walk does not guarantee filename order). Each + name is joined with dirpath to form an absolute filepath, then filtered + through _is_uploadable_file, which logs and drops unreadable files + (broken symlinks, permission errors) and zero-byte files. + + Arguments: + dirpath: Absolute directory path currently being walked. + filenames: Filenames yielded by os.walk for dirpath. + parent_id: Synapse ID of the folder that maps to dirpath. + client: Authenticated Synapse client, used only for logging. + + Returns: + A list of ManifestRow entries (path, parentId) ready to be written + into the manifest CSV (matching GENERATED_MANIFEST_COLUMNS), one + per uploadable file. """ - rows: list[dict[str, str]] = [] + rows: list[ManifestRow] = [] for filename in sorted(filenames): filepath = os.path.join(dirpath, filename) if _is_uploadable_file(filepath, client): @@ -1788,7 +1872,7 @@ def _collect_uploadable_rows( return rows -def _write_manifest_csv(manifest_path: str, rows: list[dict[str, str]]) -> None: +def _write_manifest_csv(manifest_path: str, rows: list[ManifestRow]) -> None: """Write generated manifest rows to a CSV at manifest_path.""" with open(manifest_path, "w", encoding="utf-8", newline="") as fp: writer = csv.DictWriter(fp, fieldnames=GENERATED_MANIFEST_COLUMNS) diff --git a/tests/unit/synapseclient/models/async/unit_test_storable_container_async.py b/tests/unit/synapseclient/models/async/unit_test_storable_container_async.py index bc0d047e3..2df97faac 100644 --- a/tests/unit/synapseclient/models/async/unit_test_storable_container_async.py +++ b/tests/unit/synapseclient/models/async/unit_test_storable_container_async.py @@ -2,16 +2,17 @@ import csv import os +import platform import uuid from pathlib import Path from typing import Any from unittest.mock import AsyncMock, patch -import pandas as pd import pytest from synapseclient import Synapse -from synapseclient.models import Folder, Project +from synapseclient.models import File, Folder, Project +from synapseclient.models.services import manifest as manifest_module def _write_manifest(rows: list[dict], tmp_path: Path) -> Path: @@ -89,199 +90,672 @@ async def test_all_rows_have_errors_no_crash(self, tmp_path: Path) -> None: class TestGenerateSyncManifest: """Tests for StorableContainer.generate_sync_manifest_async.""" - @pytest.fixture(autouse=True) - def init_syn(self, syn: Synapse) -> None: - self.syn = syn - - @pytest.fixture(autouse=True) - def stub_parent_check(self) -> Any: - """Bypass the upfront parent_id validation so tests don't hit the network. - Tests that exercise the validation itself patch this explicitly.""" - with patch( - "synapseclient.models.services.manifest._validate_target_container_async", - new=AsyncMock(return_value=None), - ) as mock: - yield mock - - async def test_unstored_container_raises(self, tmp_path: Path) -> None: + async def test_unstored_container_raises( + self, tmp_path: Path, syn: Synapse + ) -> None: """Calling the method on a container whose id is None is a caller bug and must surface as a ValueError rather than a confusing downstream - error.""" + error. The integration suite cannot exercise this case because every + container fixture has already been stored.""" + # GIVEN a local directory with one file src = tmp_path / "src" src.mkdir() (src / "a.txt").write_text("hello") manifest = tmp_path / "manifest.csv" + # WHEN generate_sync_manifest_async is called on a Project that has + # never been stored (id is None) + # THEN a ValueError is raised explaining the container is not stored with pytest.raises(ValueError, match="has not been stored in Synapse"): await Project(name="unstored").generate_sync_manifest_async( directory_path=str(src), manifest_path=str(manifest), - synapse_client=self.syn, + synapse_client=syn, ) - async def test_empty_directory_writes_header_only(self, tmp_path: Path) -> None: - """An empty directory produces a manifest with only the header row and - logs a warning so the caller is not left wondering why no files uploaded.""" + async def test_empty_directory_logs_warning( + self, tmp_path: Path, syn: Synapse + ) -> None: + """An empty directory logs a 'No uploadable files found' warning so + the caller knows why their manifest is empty. The integration suite + covers the resulting header-only file content.""" + # GIVEN an empty local directory empty_dir = tmp_path / "empty" empty_dir.mkdir() manifest = tmp_path / "manifest.csv" - with patch.object(self.syn.logger, "warning") as mock_warning: + # WHEN we generate a manifest for it (parent-id validation is stubbed + # so this stays offline) + with ( + patch( + "synapseclient.models.services.manifest._validate_target_container_async", + new=AsyncMock(return_value=None), + ), + patch.object(syn.logger, "warning") as mock_warning, + ): await Folder(id="syn1").generate_sync_manifest_async( directory_path=str(empty_dir), manifest_path=str(manifest), - synapse_client=self.syn, + synapse_client=syn, ) - assert manifest.read_text().strip().splitlines() == ["path,parentId"] + # THEN a "no uploadable files" warning is logged so the caller knows + # why their manifest is empty assert any( "No uploadable files found" in call.args[0] for call in mock_warning.call_args_list ) - async def test_flat_directory_lists_files_skips_empty(self, tmp_path: Path) -> None: - """Non-empty files each produce a manifest row sharing the container's - id as parentId; zero-byte files are skipped.""" - src = tmp_path / "src" - src.mkdir() - (src / "a.txt").write_text("hello") - (src / "b.txt").write_text("world") - (src / "empty.txt").write_text("") + @pytest.mark.parametrize( + "kind", + ["missing-path", "file-path"], + ) + async def test_invalid_directory_path_raises( + self, tmp_path: Path, syn: Synapse, kind: str + ) -> None: + """The public surface raises ValueError for both a non-existent path + and a regular file path, and writes no manifest. The integration suite + only exercises the missing-path case, so the file-path case is covered + here.""" + # GIVEN either a non-existent path or a regular file path + if kind == "missing-path": + path = tmp_path / "does_not_exist" + else: + path = tmp_path / "a.txt" + path.write_text("hello") manifest = tmp_path / "manifest.csv" - await Folder(id="syn42").generate_sync_manifest_async( - directory_path=str(src), - manifest_path=str(manifest), - synapse_client=self.syn, - ) + # WHEN we try to generate a manifest for it + # THEN a ValueError is raised + with pytest.raises(ValueError, match="is not a directory or does not exist"): + await Folder(id="syn1").generate_sync_manifest_async( + directory_path=str(path), + manifest_path=str(manifest), + synapse_client=syn, + ) - df = pd.read_csv(manifest) - assert list(df.columns) == ["path", "parentId"] - assert sorted(os.path.basename(p) for p in df["path"]) == ["a.txt", "b.txt"] - assert (df["parentId"] == "syn42").all() + # AND no manifest file is written (failure happens before any output) + assert not manifest.exists() - async def test_nested_directories_thread_parent_ids(self, tmp_path: Path) -> None: - """Files in nested directories get the Synapse ID of their innermost - Synapse folder as parentId, and Folder.store_async is called once per - subdirectory with the correct name/parent_id.""" + async def test_parent_id_validated_upfront( + self, tmp_path: Path, syn: Synapse + ) -> None: + """The container's id is validated via _validate_target_container_async + before the directory is walked. This ordering invariant cannot be + verified through the live API; integration tests can only observe the + end state.""" + # GIVEN a directory with one file src = tmp_path / "src" - (src / "level1" / "level2").mkdir(parents=True) - (src / "level1" / "level2" / "deep.txt").write_text("content") + src.mkdir() + (src / "a.txt").write_text("hello") manifest = tmp_path / "manifest.csv" - folder_id_iter = iter(["syn100", "syn200"]) - observed: list[tuple[str, str]] = [] - - async def fake_store_async(self: Any, *args: Any, **kwargs: Any) -> Any: - observed.append((self.name, self.parent_id)) - self.id = next(folder_id_iter) - return self - + # WHEN we generate a manifest under a Folder with id "synROOT" with + # the parent-id validation stubbed so it stays offline with patch( - "synapseclient.models.Folder.store_async", - new=fake_store_async, - ): + "synapseclient.models.services.manifest._validate_target_container_async", + new_callable=AsyncMock, + ) as mock_validate: await Folder(id="synROOT").generate_sync_manifest_async( directory_path=str(src), manifest_path=str(manifest), - synapse_client=self.syn, + synapse_client=syn, ) - assert observed == [("level1", "synROOT"), ("level2", "syn100")] - df = pd.read_csv(manifest) - assert df["path"].tolist() == [ - os.path.join(os.path.realpath(str(src)), "level1", "level2", "deep.txt") - ] - assert df["parentId"].tolist() == ["syn200"] + # THEN _validate_target_container_async was awaited exactly once + # with that id, ensuring the parent check happens before traversal + mock_validate.assert_awaited_once_with("synROOT", client=syn) + - async def test_output_is_csv_not_tsv(self, tmp_path: Path) -> None: - """Output uses comma delimiter (CSV), matching the new manifest reader. +class TestResolveAndValidateDirectoryPath: + """Tests for manifest._resolve_and_validate_directory_path_async.""" - The legacy synapseutils.generate_sync_manifest wrote TSV with a - 'parent' column; the OOP sync_to_synapse expects CSV with 'parentId'. - """ + async def test_valid_directory_returns_realpath(self, tmp_path: Path) -> None: + """A valid directory returns the realpath-resolved absolute path.""" + # GIVEN a real directory on disk src = tmp_path / "src" src.mkdir() - (src / "file.txt").write_text("data") - manifest = tmp_path / "manifest.csv" - await Folder(id="syn1").generate_sync_manifest_async( - directory_path=str(src), - manifest_path=str(manifest), - synapse_client=self.syn, + # WHEN we validate the directory path + result = await manifest_module._resolve_and_validate_directory_path_async( + directory_path=str(src) ) - text = manifest.read_text() - assert "\t" not in text - assert text.startswith("path,parentId") - - async def test_missing_directory_raises(self, tmp_path: Path) -> None: - """A directory_path that does not exist raises ValueError instead of - silently writing an empty manifest.""" - missing = tmp_path / "does_not_exist" - manifest = tmp_path / "manifest.csv" - + # THEN the realpath-resolved absolute path is returned + assert result == os.path.realpath(str(src)) + + @pytest.mark.parametrize( + "kind", + ["missing-path", "file-path"], + ) + async def test_non_directory_path_raises(self, tmp_path: Path, kind: str) -> None: + """Both a non-existent path and a regular file path raise ValueError.""" + # GIVEN either a non-existent path or a regular file path + if kind == "missing-path": + path = tmp_path / "missing" + else: + path = tmp_path / "f.txt" + path.write_text("hello") + + # WHEN we validate the directory path + # THEN a ValueError is raised explaining the path is not a directory with pytest.raises(ValueError, match="is not a directory or does not exist"): - await Folder(id="syn1").generate_sync_manifest_async( - directory_path=str(missing), - manifest_path=str(manifest), - synapse_client=self.syn, + await manifest_module._resolve_and_validate_directory_path_async( + directory_path=str(path) ) - assert not manifest.exists() + @pytest.mark.skipif( + platform.system() == "Windows", + reason="Symlink creation requires elevated privileges on Windows.", + ) + async def test_symlink_resolved_to_target(self, tmp_path: Path) -> None: + """If directory_path is a symlink, the resolved target path is returned + so the manifest survives the original symlink being removed/redirected.""" + # GIVEN a target directory and a symlink pointing to it + target = tmp_path / "target" + target.mkdir() + link = tmp_path / "link" + os.symlink(str(target), str(link)) + + # WHEN we validate the symlink path + result = await manifest_module._resolve_and_validate_directory_path_async( + directory_path=str(link) + ) - async def test_file_path_raises(self, tmp_path: Path) -> None: - """A directory_path pointing at a file (not a directory) raises ValueError.""" - file_path = tmp_path / "a.txt" - file_path.write_text("hello") - manifest = tmp_path / "manifest.csv" + # THEN the realpath of the symlink target is returned (not the link) + assert result == os.path.realpath(str(target)) - with pytest.raises(ValueError, match="is not a directory or does not exist"): - await Folder(id="syn1").generate_sync_manifest_async( - directory_path=str(file_path), - manifest_path=str(manifest), - synapse_client=self.syn, + +class TestValidateTargetContainer: + """Tests for manifest._validate_target_container_async.""" + + @pytest.mark.parametrize( + "container", + [ + pytest.param(Folder(id="syn1", name="folder"), id="folder"), + pytest.param(Project(id="syn1", name="project"), id="project"), + ], + ) + async def test_container_passes( + self, syn: Synapse, container: Folder | Project + ) -> None: + """Both Folder and Project pass validation""" + # GIVEN get_async is stubbed to return either a Folder or Project + with patch.object( + manifest_module, + "get_async", + new_callable=AsyncMock, + return_value=container, + ): + # WHEN we validate the target container by id + result = await manifest_module._validate_target_container_async( + "syn1", client=syn ) + # THEN validation returns None (no error raised) + assert result is None + + async def test_non_container_raises_value_error(self, syn: Synapse) -> None: + """A File (or any non-container) raises ValueError naming the id.""" + # GIVEN get_async is stubbed to return a File (non-container) entity + not_container = File(id="syn1", name="file") + with patch.object( + manifest_module, + "get_async", + new_callable=AsyncMock, + return_value=not_container, + ): + # WHEN we validate the target container by id + # THEN a ValueError is raised naming the id and rejecting the type + with pytest.raises( + ValueError, match=r"Container syn1 is not a Folder or Project" + ): + await manifest_module._validate_target_container_async( + "syn1", client=syn + ) - async def test_parent_id_validated_upfront( - self, tmp_path: Path, stub_parent_check: AsyncMock + +class TestCollectManifestRows: + """Tests for manifest._collect_manifest_rows_async.""" + + async def test_empty_directory_returns_empty_list( + self, tmp_path: Path, syn: Synapse ) -> None: - """The container's id is validated via _validate_target_container_async - before the directory is walked.""" + """An empty directory produces no rows.""" + # GIVEN an empty local directory + empty = tmp_path / "empty" + empty.mkdir() + + # WHEN we collect manifest rows for it + rows = await manifest_module._collect_manifest_rows_async( + directory_path=str(empty), parent_id="syn1", client=syn + ) + + # THEN the result is an empty list + assert rows == [] + + async def test_flat_directory_uses_root_parent_id( + self, tmp_path: Path, syn: Synapse + ) -> None: + """Files at the top level all share the supplied parent_id and are + emitted in sorted order. No Synapse folders are created when the + local tree has no subdirectories.""" + # GIVEN a flat directory containing two files in non-sorted order src = tmp_path / "src" src.mkdir() - (src / "a.txt").write_text("hello") - manifest = tmp_path / "manifest.csv" + (src / "b.txt").write_text("two") + (src / "a.txt").write_text("one") - await Folder(id="synROOT").generate_sync_manifest_async( - directory_path=str(src), - manifest_path=str(manifest), - synapse_client=self.syn, - ) + # WHEN we collect manifest rows under a root parent_id + with patch( + "synapseclient.models.Folder.store_async", + new_callable=AsyncMock, + ) as mock_store_async: + rows = await manifest_module._collect_manifest_rows_async( + directory_path=str(src), parent_id="synROOT", client=syn + ) - stub_parent_check.assert_awaited_once_with("synROOT", client=self.syn) + # THEN the rows are emitted in sorted filename order + assert [os.path.basename(row["path"]) for row in rows] == ["a.txt", "b.txt"] + # AND every row uses the root parent_id + assert all(row["parentId"] == "synROOT" for row in rows) + # AND no Synapse folders are created (no subdirectories to mirror) + mock_store_async.assert_not_called() - async def test_invalid_parent_id_propagates_http_error( - self, tmp_path: Path + async def test_nested_directories_use_created_folder_ids( + self, tmp_path: Path, syn: Synapse ) -> None: - """If the upfront parent_id check raises SynapseHTTPError, no manifest - is written and the error propagates to the caller.""" - from synapseclient.core.exceptions import SynapseHTTPError + """Files in nested subdirectories receive the id of the Synapse folder + created for their innermost local directory.""" + # GIVEN a nested directory tree with one file at the deepest level + src = tmp_path / "src" + (src / "level1" / "level2").mkdir(parents=True) + (src / "level1" / "level2" / "deep.txt").write_text("content") + + # AND a fake _create_child_folders_async that returns Folders with + # deterministic ids (syn_) so the test can verify how the + # production code threads parent ids from one walk iteration to + # the next. Records (parent_id, dirnames) per call to assert call + # order AND that each call's parent_id is the fake id assigned to + # the previously-created folder. + observed: list[tuple[str, list[str]]] = [] + + async def fake_create_child_folders_async( + parent_id: str, dirnames: list[str], client: Synapse + ) -> dict[str, Folder]: + observed.append((parent_id, list(dirnames))) + return {d: Folder(name=d, id=f"syn_{d}") for d in dirnames} + + # WHEN we collect manifest rows under the root parent_id + with patch.object( + manifest_module, + "_create_child_folders_async", + new=fake_create_child_folders_async, + ): + rows = await manifest_module._collect_manifest_rows_async( + directory_path=str(src), parent_id="synROOT", client=syn + ) + + # AND folders are created top-down: level1 under root, then level2 + # under the freshly-created level1 (id syn_level1), and a final + # no-op call for the leaf directory. + assert observed == [ + ("synROOT", ["level1"]), + ("syn_level1", ["level2"]), + ("syn_level2", []), + ] + # AND the deep file's parentId is the innermost folder's id + assert rows == [ + { + "path": os.path.join(str(src), "level1", "level2", "deep.txt"), + "parentId": "syn_level2", + } + ] + async def test_mixed_flat_and_nested_uses_correct_parent_ids( + self, tmp_path: Path, syn: Synapse + ) -> None: + """Files at the root use the supplied parent_id while files inside + subdirectories use the id of the folder created for their containing + directory. The flat-only and nested-only tests each cover one branch + in isolation; this combined case asserts both work together in a + single walk.""" + # GIVEN a tree with one file at the root AND a subdir with a file src = tmp_path / "src" src.mkdir() - (src / "a.txt").write_text("hello") - manifest = tmp_path / "manifest.csv" + (src / "root.txt").write_text("root-data") + (src / "child").mkdir() + (src / "child" / "nested.txt").write_text("nested-data") + + # AND a fake _create_child_folders_async that assigns deterministic + # ids so the test can distinguish root vs. nested rows by parentId + async def fake_create_child_folders_async( + parent_id: str, dirnames: list[str], client: Synapse + ) -> dict[str, Folder]: + return {d: Folder(name=d, id=f"syn_{d}") for d in dirnames} + + # WHEN we collect manifest rows under the root parent_id + with patch.object( + manifest_module, + "_create_child_folders_async", + new=fake_create_child_folders_async, + ): + rows = await manifest_module._collect_manifest_rows_async( + directory_path=str(src), parent_id="synROOT", client=syn + ) + + # THEN the root-level file uses the root parent_id and the nested + # file uses the id assigned to its containing folder + by_basename = {os.path.basename(r["path"]): r["parentId"] for r in rows} + assert by_basename == { + "root.txt": "synROOT", + "nested.txt": "syn_child", + } + async def test_multiple_siblings_share_parent_id_in_one_batch( + self, tmp_path: Path, syn: Synapse + ) -> None: + """Sibling directories at the same depth are passed to + _create_child_folders_async in a single batch (one call), all sharing + the same parent_id.""" + # GIVEN three sibling directories at the same depth, each with a + # uniquely-named file so rows can be matched back to their folder + src = tmp_path / "src" + src.mkdir() + for name in ("alpha", "beta", "gamma"): + (src / name).mkdir() + (src / name / f"{name}.txt").write_text("data") + + # AND a fake _create_child_folders_async that records each call's + # (parent_id, dirnames) so we can assert there's exactly one batch + # for the three siblings + observed: list[tuple[str, list[str]]] = [] + + async def fake_create_child_folders_async( + parent_id: str, dirnames: list[str], client: Synapse + ) -> dict[str, Folder]: + observed.append((parent_id, list(dirnames))) + return {d: Folder(name=d, id=f"syn_{d}") for d in dirnames} + + # WHEN we collect manifest rows under the root parent_id + with patch.object( + manifest_module, + "_create_child_folders_async", + new=fake_create_child_folders_async, + ): + rows = await manifest_module._collect_manifest_rows_async( + directory_path=str(src), parent_id="synROOT", client=syn + ) + + # THEN the first call passes all three siblings in sorted order + # under the root parent_id (one batch, not three) + assert observed[0] == ("synROOT", ["alpha", "beta", "gamma"]) + # AND each subsequent call (one per sibling, for its own children) + # passes an empty dirnames list because each sibling is a leaf dir + assert all(call[1] == [] for call in observed[1:]) + # AND every file's parentId matches the id of its containing folder + assert {os.path.basename(r["path"]): r["parentId"] for r in rows} == { + "alpha.txt": "syn_alpha", + "beta.txt": "syn_beta", + "gamma.txt": "syn_gamma", + } + + +class TestLogWalkError: + """Tests for manifest._log_walk_error.""" + + def test_logs_warning_with_filename_and_message(self, syn: Synapse) -> None: + """The warning message references the offending filename and the + underlying OSError representation.""" + # GIVEN an OSError carrying an errno, message, and filename + err = OSError(13, "permission denied", "/tmp/unreadable") + + # WHEN _log_walk_error is invoked with that error + with patch.object(syn.logger, "warning") as mock_warning: + manifest_module._log_walk_error(syn, err) + + # THEN logger.warning is called exactly once + mock_warning.assert_called_once() + # AND the message references the offending filename and OSError text + message = mock_warning.call_args.args[0] + assert "/tmp/unreadable" in message + assert "permission denied" in message + + def test_logs_warning_with_no_filename(self, syn: Synapse) -> None: + """An OSError without a filename still produces a warning rather than + raising (best-effort during traversal).""" + # GIVEN an OSError without a filename attribute + err = OSError("io failure") + + # WHEN _log_walk_error is invoked with that error + with patch.object(syn.logger, "warning") as mock_warning: + manifest_module._log_walk_error(syn, err) + + # THEN logger.warning is called once (no exception is raised) + mock_warning.assert_called_once() + + +class TestCreateChildFolders: + """Tests for manifest._create_child_folders_async.""" + + async def test_empty_dirnames_returns_empty_dict(self, syn: Synapse) -> None: + """No dirnames produces no folders and never invokes Folder.store_async.""" + # GIVEN an empty list of dirnames + dirnames = [] + # WHEN we ask for child folders with patch( - "synapseclient.models.services.manifest._validate_target_container_async", - new=AsyncMock(side_effect=SynapseHTTPError("Not found")), + "synapseclient.models.Folder.store_async", + new_callable=AsyncMock, + ) as mock_store_async: + result = await manifest_module._create_child_folders_async( + parent_id="synROOT", dirnames=dirnames, client=syn + ) + + # THEN the result is an empty dict and no Synapse calls were made + assert result == {} + mock_store_async.assert_not_called() + + async def test_returns_dirname_to_folder_mapping(self, syn: Synapse) -> None: + """Each input dirname maps to the Folder returned by store_async, all + sharing the supplied parent_id, regardless of completion order.""" + # GIVEN three sibling dirnames to create under a single parent + dirnames = ["alpha", "beta", "gamma"] + + # AND a fake Folder.store_async that assigns a deterministic id per + # call so the returned Folders can be asserted against by id. + # Patched onto Folder.store_async as an unbound method, so the bound + # Folder instance arrives as `self`. Mutating self.id and returning + # self matches the real signature without hitting the network. + observed_parent_ids: list[str] = [] + + async def fake_store_async(self: Any, *args: Any, **kwargs: Any) -> Any: + observed_parent_ids.append(self.parent_id) + self.id = f"syn_{self.name}" + return self + + # WHEN we create the child folders concurrently + with patch( + "synapseclient.models.Folder.store_async", + new=fake_store_async, ): - with pytest.raises(SynapseHTTPError): - await Folder(id="synBOGUS").generate_sync_manifest_async( - directory_path=str(src), - manifest_path=str(manifest), - synapse_client=self.syn, - ) + result = await manifest_module._create_child_folders_async( + parent_id="synROOT", dirnames=dirnames, client=syn + ) - assert not manifest.exists() + # THEN every dirname maps to its own Folder with the expected id + assert set(result.keys()) == set(dirnames) + for dirname in dirnames: + assert result[dirname].name == dirname + assert result[dirname].id == f"syn_{dirname}" + + # AND every store call used the same parent_id + assert observed_parent_ids == ["synROOT"] * len(dirnames) + + +class TestPruneSymlinksAndSortDirnames: + """Tests for manifest._prune_symlinks_and_sort_dirnames.""" + + def test_sorts_dirnames_in_place(self, tmp_path: Path) -> None: + """Plain directories are sorted in place; the original list object is + mutated (os.walk relies on identity, not return value).""" + # GIVEN three real directories and an unsorted dirnames list referring to them + for name in ("c", "a", "b"): + (tmp_path / name).mkdir() + dirnames = ["c", "a", "b"] + + # WHEN we prune and sort + manifest_module._prune_symlinks_and_sort_dirnames(dirnames, str(tmp_path)) + + # AND the caller's list is sorted alphabetically + assert dirnames == ["a", "b", "c"] + + @pytest.mark.skipif( + platform.system() == "Windows", + reason="Symlink creation requires elevated privileges on Windows.", + ) + def test_drops_symlinked_subdirectories(self, tmp_path: Path) -> None: + """Symlinked subdirectories are pruned so we don't mirror folders whose + contents os.walk(followlinks=False) won't visit.""" + # GIVEN one real subdirectory and one symlink pointing at another + # directory, both listed in dirnames + real = tmp_path / "real" + real.mkdir() + target = tmp_path / "target" + target.mkdir() + link = tmp_path / "link" + os.symlink(str(target), str(link)) + + dirnames = ["link", "real"] + + # WHEN we prune and sort + manifest_module._prune_symlinks_and_sort_dirnames(dirnames, str(tmp_path)) + + # THEN only the real directory remains; the symlink was dropped + assert dirnames == ["real"] + + def test_empty_list_unchanged(self, tmp_path: Path) -> None: + """An empty list remains empty.""" + # GIVEN an empty dirnames list + dirnames: list[str] = [] + # WHEN we prune and sort + manifest_module._prune_symlinks_and_sort_dirnames(dirnames, str(tmp_path)) + # THEN it is still empty + assert dirnames == [] + + @pytest.mark.skipif( + platform.system() == "Windows", + reason="Symlink creation requires elevated privileges on Windows.", + ) + def test_prunes_symlinks_and_sorts_in_one_call(self, tmp_path: Path) -> None: + """Symlinks are pruned AND the surviving entries are sorted in a + single call. The prune-only and sort-only tests cover each behavior + in isolation, so this guards against a future change where one step + accidentally undoes the other (e.g., sort happening before prune + and reintroducing the link, or prune leaving the list in walk + order).""" + # GIVEN two real subdirs and one symlinked subdir, in non-sorted + # order with the symlink between them + for name in ("c", "a"): + (tmp_path / name).mkdir() + target = tmp_path / "target" + target.mkdir() + link = tmp_path / "link" + os.symlink(str(target), str(link)) + + dirnames = ["c", "link", "a"] + + # WHEN we prune and sort + manifest_module._prune_symlinks_and_sort_dirnames(dirnames, str(tmp_path)) + + # THEN the symlink is gone and the remaining real dirs are sorted + assert dirnames == ["a", "c"] + + +class TestBuildManifestRows: + """Tests for manifest._build_manifest_rows.""" + + def test_returns_rows_in_sorted_order(self, tmp_path: Path, syn: Synapse) -> None: + """Filenames are visited in sorted order so manifest output is + deterministic regardless of filesystem yield order.""" + # GIVEN three real files and an unsorted filenames list + for name in ("c.txt", "a.txt", "b.txt"): + (tmp_path / name).write_text("data") + + # WHEN we build manifest rows under "syn1" + rows = manifest_module._build_manifest_rows( + dirpath=str(tmp_path), + filenames=["c.txt", "a.txt", "b.txt"], + parent_id="syn1", + client=syn, + ) + + # THEN the rows are emitted in sorted filename order + assert [os.path.basename(row["path"]) for row in rows] == [ + "a.txt", + "b.txt", + "c.txt", + ] + # AND every row uses the supplied parent_id + assert all(row["parentId"] == "syn1" for row in rows) + + def test_skips_zero_byte_and_missing(self, tmp_path: Path, syn: Synapse) -> None: + """Zero-byte files and unreadable/missing files are dropped via + _is_uploadable_file.""" + # GIVEN one real non-empty file and one zero-byte file (and a + # filename for a file that doesn't exist on disk) + (tmp_path / "ok.txt").write_text("hello") + (tmp_path / "empty.txt").write_text("") + + # WHEN we build manifest rows for all three filenames + with patch.object(syn.logger, "warning"): + rows = manifest_module._build_manifest_rows( + dirpath=str(tmp_path), + filenames=["ok.txt", "empty.txt", "missing.txt"], + parent_id="syn1", + client=syn, + ) + + # THEN only the non-empty real file produces a row; the zero-byte + # and missing files are filtered out + assert [os.path.basename(row["path"]) for row in rows] == ["ok.txt"] + + def test_empty_filenames_returns_empty(self, tmp_path: Path, syn: Synapse) -> None: + """No filenames produce no rows.""" + # GIVEN an empty filenames list + # WHEN we build manifest rows + rows = manifest_module._build_manifest_rows( + dirpath=str(tmp_path), + filenames=[], + parent_id="syn1", + client=syn, + ) + # THEN the result is empty + assert rows == [] + + +class TestIsUploadableFile: + """Tests for manifest._is_uploadable_file.""" + + def test_regular_file_is_uploadable(self, tmp_path: Path, syn: Synapse) -> None: + """A non-empty readable file returns True.""" + # GIVEN a regular non-empty readable file + f = tmp_path / "ok.txt" + f.write_text("hello") + # THEN the result is True + assert manifest_module._is_uploadable_file(str(f), syn) is True + + def test_zero_byte_file_skipped(self, tmp_path: Path, syn: Synapse) -> None: + """A zero-byte file is skipped because Synapse rejects empty uploads.""" + # GIVEN a zero-byte file + f = tmp_path / "empty.txt" + f.write_text("") + # THEN the result is False (Synapse rejects empty uploads) + assert manifest_module._is_uploadable_file(str(f), syn) is False + + def test_missing_file_skipped(self, tmp_path: Path, syn: Synapse) -> None: + """A path that doesn't exist (e.g. broken symlink, race) is skipped + rather than raising OSError up to the caller.""" + # GIVEN a path that doesn't exist on disk (e.g. broken symlink, + # race against rmdir) + missing = tmp_path / "nope.txt" + # THEN the result is False rather than raising OSError + assert manifest_module._is_uploadable_file(str(missing), syn) is False From 2b0ebfaccb590bfd8330a6738b6df28eadfd85a4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 6 May 2026 13:22:27 -0700 Subject: [PATCH 07/11] update doc --- docs/explanations/manifest_csv.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/explanations/manifest_csv.md b/docs/explanations/manifest_csv.md index e46d52f4a..e4b9dee45 100644 --- a/docs/explanations/manifest_csv.md +++ b/docs/explanations/manifest_csv.md @@ -1,6 +1,6 @@ # Manifest CSV -The manifest is a CSV file with file locations and metadata used to bulk upload and download files in Synapse. It is the standard manifest format used by `Project.sync_from_synapse`, `Project.sync_to_synapse`, `Folder.sync_from_synapse`, `Folder.sync_to_synapse`, the Synapse UI download cart, and the `synapse get-download-list` CLI command. +The manifest is a CSV file with file locations and metadata used to bulk upload and download files in Synapse. It is the standard manifest format used by `Project.sync_from_synapse`, `Project.sync_to_synapse`, `Folder.sync_from_synapse`, `Folder.sync_to_synapse`, `Project.generate_sync_manifest`, `Folder.generate_sync_manifest`, the Synapse UI download cart, and the `synapse get-download-list` CLI command. !!! note This CSV manifest replaces the legacy TSV manifest produced by `synapseutils.syncFromSynapse`. The `syncFromSynapse` and `syncToSynapse` utility functions are deprecated and will be removed in v5.0.0. Use `Project.sync_from_synapse` / `Folder.sync_from_synapse` and `Project.sync_to_synapse` / `Folder.sync_to_synapse` instead. See the [legacy TSV manifest documentation](manifest_tsv.md) for details on the old format. @@ -81,8 +81,6 @@ This is an annotation with 3 values: |-----------------|----------|--------------| | /path/file1.txt | syn1243 | "[a,b,c]" | - - ### Dates in the manifest file Dates within the manifest file will always be written as [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) format in UTC without milliseconds. For example: `2023-12-20T16:55:08Z`. @@ -93,11 +91,12 @@ Dates can be written in other formats specified in ISO 8601 and they will be rec The CSV manifest format is shared across multiple tools: -| Source | Filename | -|----------------------------------------------------------|---------------------------------| -| `Project.sync_from_synapse` / `Folder.sync_from_synapse` | manifest.csv | -| Synapse UI download cart | manifest.csv | -| CLI `synapse get-download-list` | `manifest_.csv` | +| Source | Filename | +|----------------------------------------------------------------------|---------------------------------| +| `Project.sync_from_synapse` / `Folder.sync_from_synapse` | manifest.csv | +| `Project.generate_sync_manifest` / `Folder.generate_sync_manifest` | user-specified `manifest_path` | +| Synapse UI download cart | manifest.csv | +| CLI `synapse get-download-list` | `manifest_.csv` | A manifest generated by any of these sources can be used as input to `sync_to_synapse`, provided the `path` column is present with valid local file paths. Manifests from the Synapse UI do not include a `path` column by default, so users must add it before uploading. @@ -113,7 +112,9 @@ A manifest generated by any of these sources can be used as input to `sync_to_sy - [Project.sync_from_synapse][synapseclient.models.Project.sync_from_synapse] - [Project.sync_to_synapse][synapseclient.models.Project.sync_to_synapse] +- [Project.generate_sync_manifest][synapseclient.models.Project.generate_sync_manifest] - [Folder.sync_from_synapse][synapseclient.models.Folder.sync_from_synapse] - [Folder.sync_to_synapse][synapseclient.models.Folder.sync_to_synapse] +- [Folder.generate_sync_manifest][synapseclient.models.Folder.generate_sync_manifest] - [Manifest TSV (legacy)](manifest_tsv.md) - [Managing custom metadata at scale](https://help.synapse.org/docs/Managing-Custom-Metadata-at-Scale.2004254976.html#ManagingCustomMetadataatScale-BatchUploadFileswithAnnotations) From ff49ffbfa9fbeeb7ea2218a610533574997fbfd4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 6 May 2026 13:27:35 -0700 Subject: [PATCH 08/11] remove tests that didnt need to be integration tests --- .../async/test_storable_container_async.py | 149 +----------------- 1 file changed, 1 insertion(+), 148 deletions(-) diff --git a/tests/integration/synapseclient/models/async/test_storable_container_async.py b/tests/integration/synapseclient/models/async/test_storable_container_async.py index 87a1aca87..5b8af5266 100644 --- a/tests/integration/synapseclient/models/async/test_storable_container_async.py +++ b/tests/integration/synapseclient/models/async/test_storable_container_async.py @@ -405,12 +405,7 @@ async def test_error_column_rows_skipped( class TestGenerateSyncManifest: """Integration tests for StorableContainer.generate_sync_manifest_async - against the live Synapse API. - - These tests walk a local temporary directory and verify that the method - creates matching Synapse folders under a real container and writes a - manifest CSV that references the newly-created folder IDs. - """ + against the live Synapse API.""" @pytest_asyncio.fixture(loop_scope="session", scope="function") async def scope_folder( @@ -573,145 +568,3 @@ async def test_existing_folders_are_reused( ) assert len(scope_folder.folders) == 1 assert scope_folder.folders[0].id == existing.id - - async def test_empty_directory_writes_header_only( - self, syn: Synapse, scope_folder: Folder, tmp_path: Path - ) -> None: - """An empty source directory should produce a manifest containing - only the CSV header row.""" - empty = tmp_path / "empty" - empty.mkdir() - manifest = tmp_path / "manifest.csv" - - await scope_folder.generate_sync_manifest_async( - directory_path=str(empty), - manifest_path=str(manifest), - synapse_client=syn, - ) - - assert manifest.read_text().strip().splitlines() == ["path,parentId"] - - async def test_missing_directory_raises( - self, syn: Synapse, project_model: Project, tmp_path: Path - ) -> None: - """A directory_path that does not exist on disk should raise - ValueError before any manifest file is written.""" - missing = tmp_path / "does_not_exist" - manifest = tmp_path / "manifest.csv" - - with pytest.raises(ValueError, match="is not a directory or does not exist"): - await project_model.generate_sync_manifest_async( - directory_path=str(missing), - manifest_path=str(manifest), - synapse_client=syn, - ) - assert not manifest.exists() - - async def test_invalid_parent_id_raises_http_error( - self, syn: Synapse, tmp_path: Path - ) -> None: - """A container whose id does not resolve to any Synapse entity should - surface as a SynapseHTTPError from the server, and no manifest file - should be written. - """ - src = tmp_path / "src" - src.mkdir() - (src / "a.txt").write_text("hello") - manifest = tmp_path / "manifest.csv" - - with pytest.raises(SynapseHTTPError): - await Folder(id="syn00000000").generate_sync_manifest_async( - directory_path=str(src), - manifest_path=str(manifest), - synapse_client=syn, - ) - assert not manifest.exists() - - async def test_output_is_sorted_deterministically( - self, - syn: Synapse, - scope_folder: Folder, - tmp_path: Path, - ) -> None: - """Directories and files should be traversed in sorted order so the - generated manifest is deterministic regardless of filesystem - iteration order. - """ - # GIVEN subdirs and files created in non-alphabetical order - src = tmp_path / "root" - for dirname in ["charlie", "alpha", "bravo"]: - subdir = src / dirname - subdir.mkdir(parents=True) - for filename in ["z.txt", "a.txt", "m.txt"]: - (subdir / filename).write_text("payload") - manifest = tmp_path / "manifest.csv" - - await scope_folder.generate_sync_manifest_async( - directory_path=str(src), - manifest_path=str(manifest), - synapse_client=syn, - ) - - # THEN manifest rows are sorted first by directory name, then by file - # name within each directory - df = pd.read_csv(manifest) - ordered = [ - (os.path.basename(os.path.dirname(p)), os.path.basename(p)) - for p in df["path"] - ] - assert ordered == [ - ("alpha", "a.txt"), - ("alpha", "m.txt"), - ("alpha", "z.txt"), - ("bravo", "a.txt"), - ("bravo", "m.txt"), - ("bravo", "z.txt"), - ("charlie", "a.txt"), - ("charlie", "m.txt"), - ("charlie", "z.txt"), - ] - - @pytest.mark.skipif( - platform.system() == "Windows", - reason="Creating symlinks on Windows requires elevated privileges.", - ) - async def test_directory_symlinks_are_not_followed( - self, - syn: Synapse, - scope_folder: Folder, - tmp_path: Path, - ) -> None: - """Symlinks to directories should not be recursed into — files under - a symlinked directory must not appear in the generated manifest, and - no Synapse folder should be created for the symlinked directory. - """ - # GIVEN a tree where a sibling directory is a symlink to a real - # directory that contains a file - src = tmp_path / "root" - real = src / "real" - real.mkdir(parents=True) - (real / "real_file.txt").write_text("real") - os.symlink(real, src / "link", target_is_directory=True) - manifest = tmp_path / "manifest.csv" - - await scope_folder.generate_sync_manifest_async( - directory_path=str(src), - manifest_path=str(manifest), - synapse_client=syn, - ) - - # THEN only the file in the real directory appears in the manifest - df = pd.read_csv(manifest) - parents_and_names = [ - (os.path.basename(os.path.dirname(p)), os.path.basename(p)) - for p in df["path"] - ] - assert parents_and_names == [("real", "real_file.txt")] - - # AND no Synapse folder is created for the symlinked directory: - # only the on-disk "real" sibling is mirrored; the "link" symlink - # was pruned during the walk and produced no Synapse folder. - await scope_folder.sync_from_synapse_async( - download_file=False, recursive=False, synapse_client=syn - ) - assert sorted(f.name for f in scope_folder.folders) == ["real"] From 8593579e590c3ee5ff7889b723dd28885d01a6db Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 6 May 2026 13:33:37 -0700 Subject: [PATCH 09/11] fix errors due to import name change --- .../models/async/unit_test_manifest_async.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/synapseclient/models/async/unit_test_manifest_async.py b/tests/unit/synapseclient/models/async/unit_test_manifest_async.py index 783b40365..63577acfd 100644 --- a/tests/unit/synapseclient/models/async/unit_test_manifest_async.py +++ b/tests/unit/synapseclient/models/async/unit_test_manifest_async.py @@ -815,7 +815,7 @@ async def test_valid_container_passes(self, container_cls: str) -> None: cls = {"Project": Project, "Folder": Folder}[container_cls] mock_container = MagicMock(spec=cls) with patch( - "synapseclient.models.services.manifest.factory_get_async", + "synapseclient.models.services.manifest.get_async", new=AsyncMock(return_value=mock_container), ): await _check_parent_containers_async(["syn1"], syn=self.syn) @@ -824,7 +824,7 @@ async def test_non_container_raises(self) -> None: """A parent ID that resolves to a non-container Synapse entity raises ValueError.""" mock_entity = MagicMock() # not a Folder or Project with patch( - "synapseclient.models.services.manifest.factory_get_async", + "synapseclient.models.services.manifest.get_async", new=AsyncMock(return_value=mock_entity), ): with pytest.raises(ValueError, match="not a Folder or Project"): @@ -833,7 +833,7 @@ async def test_non_container_raises(self) -> None: async def test_empty_parent_id_skipped(self) -> None: """An empty parent ID string is skipped without calling the Synapse API.""" with patch( - "synapseclient.models.services.manifest.factory_get_async", + "synapseclient.models.services.manifest.get_async", new=AsyncMock(), ) as mock_get: await _check_parent_containers_async([""], syn=self.syn) @@ -844,7 +844,7 @@ async def test_nonexistent_parent_id_reraises_http_error(self) -> None: from synapseclient.core.exceptions import SynapseHTTPError with patch( - "synapseclient.models.services.manifest.factory_get_async", + "synapseclient.models.services.manifest.get_async", new=AsyncMock(side_effect=SynapseHTTPError("Not found")), ): with pytest.raises(SynapseHTTPError): From 04b31874b1fe96bc792c561f4aa6231f70cd79f4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 7 May 2026 08:02:23 -0700 Subject: [PATCH 10/11] shortened docstrings --- .../models/mixins/storable_container.py | 36 +++++-------------- .../protocols/storable_container_protocol.py | 36 +++++-------------- 2 files changed, 16 insertions(+), 56 deletions(-) diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py index 5a3267e5a..ba375653f 100644 --- a/synapseclient/models/mixins/storable_container.py +++ b/synapseclient/models/mixins/storable_container.py @@ -717,31 +717,14 @@ async def generate_sync_manifest_async( container in Synapse, and write a CSV manifest ready for [sync_to_synapse_async][synapseclient.models.mixins.StorableContainer.sync_to_synapse_async]. - The generated manifest has two columns: path (always an absolute, - symlink-resolved path — the input directory_path is resolved via - os.path.realpath before walking, so the manifest can be consumed - from any working directory and remains valid even if the original - symlink is later removed or repointed) and parentId (the Synapse ID - of the file's containing folder). Folders that already exist in - Synapse with the same name and parent are reused rather than - re-created. - - Sibling folders at the same depth are created concurrently to reduce - latency on wide trees. Directories and files are traversed in sorted - order so manifest output is deterministic across runs and platforms. - Directory symlinks encountered inside directory_path are not - followed. File symlinks are not pruned: the symlink's path is - recorded in the manifest, and the underlying target's contents - are uploaded when the manifest is consumed. The root - directory_path itself may be a symlink; if so, it is resolved to - its target and the target is walked. Zero-byte files are skipped with a - warning, since Synapse rejects empty files. I/O errors raised by - os.walk (for example, unreadable subdirectories) are logged and - skipped. If no uploadable files are found under directory_path, a - warning is logged and a header-only manifest is written. If a - folder store_async call fails, the exception propagates and no - manifest is written; any folders already created remain in Synapse - and will be reused on a retry. + The manifest has two columns: path (absolute, symlink-resolved) and + parentId (the Synapse ID of the file's containing folder). Existing + Synapse folders with matching names and parents are reused. Directory + symlinks inside directory_path are not followed; file symlinks record + the symlink path and upload the target's contents. Zero-byte files are + skipped with a warning — Synapse rejects empty files. I/O errors during + walk are logged and skipped; an empty source directory produces a + warning and a header-only manifest. Arguments: directory_path: Path to the local directory to be pushed to @@ -752,9 +735,6 @@ async def generate_sync_manifest_async( Synapse.allow_client_caching(False) this will use the last created instance from the Synapse class constructor. - Returns: - None - Raises: ValueError: If this container's id is None, or if directory_path does not exist or is not a directory, or if this container's diff --git a/synapseclient/models/protocols/storable_container_protocol.py b/synapseclient/models/protocols/storable_container_protocol.py index 75c4d058b..79d50c788 100644 --- a/synapseclient/models/protocols/storable_container_protocol.py +++ b/synapseclient/models/protocols/storable_container_protocol.py @@ -312,31 +312,14 @@ def generate_sync_manifest( container in Synapse, and write a CSV manifest ready for [sync_to_synapse][synapseclient.models.mixins.StorableContainer.sync_to_synapse]. - The generated manifest has two columns: path (always an absolute, - symlink-resolved path — the input directory_path is resolved via - os.path.realpath before walking, so the manifest can be consumed - from any working directory and remains valid even if the original - symlink is later removed or repointed) and parentId (the Synapse ID - of the file's containing folder). Folders that already exist in - Synapse with the same name and parent are reused rather than - re-created. - - Sibling folders at the same depth are created concurrently to reduce - latency on wide trees. Directories and files are traversed in sorted - order so manifest output is deterministic across runs and platforms. - Directory symlinks encountered inside directory_path are not - followed. File symlinks are not pruned: the symlink's path is - recorded in the manifest, and the underlying target's contents - are uploaded when the manifest is consumed. The root - directory_path itself may be a symlink; if so, it is resolved to - its target and the target is walked. Zero-byte files are skipped with a - warning, since Synapse rejects empty files. I/O errors raised by - os.walk (for example, unreadable subdirectories) are logged and - skipped. If no uploadable files are found under directory_path, a - warning is logged and a header-only manifest is written. If a - folder store call fails, the exception propagates and no manifest - is written; any folders already created remain in Synapse and will - be reused on a retry. + The manifest has two columns: path (absolute, symlink-resolved) and + parentId (the Synapse ID of the file's containing folder). Existing + Synapse folders with matching names and parents are reused. Directory + symlinks inside directory_path are not followed; file symlinks record + the symlink path and upload the target's contents. Zero-byte files are + skipped with a warning — Synapse rejects empty files. I/O errors during + walk are logged and skipped; an empty source directory produces a + warning and a header-only manifest. Arguments: directory_path: Path to the local directory to be pushed to @@ -347,9 +330,6 @@ def generate_sync_manifest( Synapse.allow_client_caching(False) this will use the last created instance from the Synapse class constructor. - Returns: - None - Raises: ValueError: If this container's id is None, or if directory_path does not exist or is not a directory, or if this container's From 7ae2fc1f8ba442a26e6567e9ac81a58107ce40bb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 7 May 2026 08:06:04 -0700 Subject: [PATCH 11/11] added semaphore --- synapseclient/models/services/manifest.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index b8d63c9ee..3647307da 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -1819,15 +1819,18 @@ async def _create_child_folders_async( """ from synapseclient.models.folder import Folder + semaphore = asyncio.Semaphore(max(client.max_threads * 2, 1)) + # Each task carries its dirname through to the result so the caller can # build a name-to-Folder mapping without relying on asyncio.gather's # input-order guarantee. # Pair-tagged results remain correct under any completion order. async def _store(dirname: str) -> tuple[str, Folder]: - folder = await Folder(name=dirname, parent_id=parent_id).store_async( - synapse_client=client - ) - return dirname, folder + async with semaphore: + folder = await Folder(name=dirname, parent_id=parent_id).store_async( + synapse_client=client + ) + return dirname, folder pairs = await asyncio.gather(*[_store(d) for d in dirnames]) return dict(pairs)