Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion S3MP/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Semantic versioning for S3MP."""

__version__ = "0.5.3"
__version__ = "0.6.0"
33 changes: 10 additions & 23 deletions S3MP/mirror_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,11 @@ def s3_key(self) -> str:
"""Get s3 key."""
ret_key = "/".join([str(s.name) for s in self.key_segments])
# We'll infer folder/file based on extension
return ret_key if '.' in self.key_segments[-1].name else f"{ret_key}/"
# HACK to catch case where the "extension" is actually a part of the folder name
# (eg, a folder named "v0.1.0"), we check if the extension is actually a number
name = self.key_segments[-1].name
ext = name.split('.')[-1]
return ret_key if (ext is not name and not ext.isdigit()) else f"{ret_key}/"
Comment on lines 37 to +44
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we just check the is_file property on the last key segment, or is it possible that it's inconsistent?
Technically S3 objects don't need extensions at all, so checking for "." seems problematic (which is my own fault).

Copy link
Member Author

@TrevorBurgoyne TrevorBurgoyne Jan 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah as far as i can tell the is_file property has to be explicitly set currently (it isn't set when initing a MirrorPath)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😭


@property
def local_path(self) -> Path:
Expand Down Expand Up @@ -86,6 +90,10 @@ def is_file_on_s3(self) -> bool:
"""Check if is a file on s3."""
return key_is_file_on_s3(self.s3_key)

def is_file_and_exists_on_s3(self) -> bool:
"""Check if is a file and exists on s3."""
return self.exists_on_s3() and self.is_file_on_s3()

def update_callback_on_skipped_transfer(self):
"""Update the current global callback if the transfer gets skipped."""
if S3MPConfig.callback and self in S3MPConfig.callback._transfer_objs:
Expand Down Expand Up @@ -151,28 +159,7 @@ def get_child(self, child_name: str) -> MirrorPath:

def get_children_on_s3(self) -> List[MirrorPath]:
"""Get all children on s3."""
child_s3_keys = []
continuation_token = None

while True:
resp = s3_list_child_keys(
self.s3_key, continuation_token=continuation_token
)

# Collect keys from the current response
if "Contents" in resp:
child_s3_keys.extend(
obj["Key"] for obj in resp["Contents"] if obj["Key"] != self.s3_key
)
if "CommonPrefixes" in resp:
child_s3_keys.extend(obj["Prefix"] for obj in resp["CommonPrefixes"])

# Check if there are more pages to fetch
if "NextContinuationToken" in resp:
continuation_token = resp["NextContinuationToken"]
else:
break
return [MirrorPath.from_s3_key(key) for key in child_s3_keys]
return [MirrorPath.from_s3_key(key) for key in s3_list_child_keys(self.s3_key)]

def get_parent(self) -> MirrorPath:
"""Get the parent of this file."""
Expand Down
73 changes: 48 additions & 25 deletions S3MP/utils/s3_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
from pathlib import Path
from S3MP.global_config import S3MPConfig
from S3MP.types import S3Bucket, S3Client, S3ListObjectV2Output
from botocore.exceptions import ClientError

from typing import List

def s3_list_single_key(
key: str,
Expand All @@ -20,10 +19,9 @@ def s3_list_single_key(

def s3_list_child_keys(
key: str,
bucket=None,
client=None,
continuation_token=None,
):
bucket: S3Bucket = None,
client: S3Client = None,
) -> List[str]:
"""List details of all child keys on S3."""
if not key.endswith("/"):
warnings.warn(f"Listing child keys of {key} - key does not end with '/'")
Expand All @@ -34,9 +32,29 @@ def s3_list_child_keys(
"Prefix": key,
"Delimiter": "/",
}
if continuation_token:
params["ContinuationToken"] = continuation_token
return client.list_objects_v2(**params)
child_s3_keys: List[str] = []
continuation_token: str = None
while True:
if continuation_token:
params["ContinuationToken"] = continuation_token

resp = client.list_objects_v2(**params)

# Collect keys from the current response
if "Contents" in resp:
child_s3_keys.extend(
obj["Key"] for obj in resp["Contents"] if obj["Key"] != key
)
if "CommonPrefixes" in resp:
child_s3_keys.extend(obj["Prefix"] for obj in resp["CommonPrefixes"])

# Check if there are more pages to fetch
if "NextContinuationToken" in resp:
continuation_token = resp["NextContinuationToken"]
else:
break

return child_s3_keys

def download_key(
key: str,
Expand All @@ -47,12 +65,18 @@ def download_key(
"""Download a key from S3."""
bucket = bucket or S3MPConfig.bucket
client = client or S3MPConfig.s3_client

if not key_exists_on_s3(key, bucket, client):
raise ValueError(f"Key {key} does not exist on S3")

# If the key is a file, download it
# Otherwise, download all child keys
if key_is_file_on_s3(key, bucket, client):
local_path.parent.mkdir(parents=True, exist_ok=True)
client.download_file(bucket.name, key, str(local_path), Callback=S3MPConfig.callback, Config=S3MPConfig.transfer_config)
else:
for obj in s3_list_child_keys(key, bucket, client)["Contents"]:
download_key(obj["Key"], local_path / obj["Key"].replace(key, ""))
for child_key in s3_list_child_keys(key, bucket, client):
download_key(child_key, local_path / child_key.replace(key, ""))

def upload_to_key(
key: str,
Expand Down Expand Up @@ -86,20 +110,19 @@ def key_is_file_on_s3(
bucket: S3Bucket = None,
client: S3Client = None,
) -> bool:
"""Check if a key is a file on S3, returns false if it is a folder. Raises an error if the key does not exist."""
"""Check if a key is a file on S3 by using head_object."""
bucket = bucket or S3MPConfig.bucket
client = client or S3MPConfig.s3_client
if not key_exists_on_s3(key, bucket, client):
raise ValueError(f"Key {key} does not exist on S3")
res = s3_list_single_key(key, bucket, client)
# Handle case of trailing slash, but still verify
if (
key[-1] == "/"
and len(res['Contents']) == 1
and res['Contents'][0]['Key'] == key
):
return False
return "Contents" in res

try:
client.head_object(Bucket=bucket.name, Key=key)
return True
except Exception as e:
# 404 occurs if the key is a "folder" or does not exist
if "404" in str(e):
return False
Comment on lines +120 to +123
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a specific exception that could be caught?
Unlikely case but a different error with a key that contains "404" could still hit this, maybe.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

probably yeah, i initially tried using the botocore ClientError exception but for some reason it wasn't catching properly

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll just tag #23 and let it slow-cook on the backburner

else:
raise e


def key_size_on_s3(
Expand All @@ -124,8 +147,8 @@ def delete_child_keys_on_s3(
"""Delete all keys that are children of a key on S3."""
bucket = bucket or S3MPConfig.bucket
client = client or S3MPConfig.s3_client
for obj in s3_list_child_keys(key, bucket, client)["Contents"]:
client.delete_object(Bucket=bucket.name, Key=obj["Key"])
for child_key in s3_list_child_keys(key, bucket, client):
client.delete_object(Bucket=bucket.name, Key=child_key)


def delete_key_on_s3(
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "S3MP"
version = "0.5.3"
version = "0.6.0"
description = ""
authors = [
{name = "Joshua Dean", email = "joshua.dean@sentera.com"}
Expand Down
Loading