diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 43bee95..c8e0dc8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,7 +37,7 @@ jobs: fetch-depth: 0 - name: Install uv (official Astral action) - uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0 + uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 with: # Update this as needed: version: "0.10.2" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 02d4951..55ce67f 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -24,7 +24,7 @@ jobs: fetch-depth: 0 - name: Install uv (official Astral action) - uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0 + uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 with: version: "0.10.2" enable-cache: true @@ -52,7 +52,7 @@ jobs: # Pinned to commit SHA because this is a single-maintainer third-party action # that runs alongside a publish step with write permissions. Update the SHA # together with the version comment when bumping. - uses: softprops/action-gh-release@3bb12739c298aeb8a4eeaf626c5b8d85266b0e65 # v2.6.2 + uses: softprops/action-gh-release@b4309332981a82ec1c5618f44dd2e27cc8bfbfda # v3.0.0 with: generate_release_notes: true prerelease: ${{ contains(github.ref_name, '-') }} diff --git a/README.md b/README.md index 61544ed..19d74d1 100644 --- a/README.md +++ b/README.md @@ -7,14 +7,16 @@ It is simply a few functions and tricks that have repeatedly shown value in vari projects. The goal is not to give a comprehensive suite of utilities but simply to complement the standard libraries and fill in a few gaps. -✨ **NEW:** **Version 3.0** is out and has additions and updates for Python 3.10-3.13! ✨ +✨ **NEW:** **Version 3.1** adds `atomic_write_text()`/`atomic_write_bytes()`, exposes +`__version__`, and supports Python 3.10-3.14. 
✨ ## Key Features - **Atomic file operations** with handling of parent directories and backups. This is essential for thread safety and good hygiene so partial or corrupt outputs are never present in final file locations, even in case a program crashes. - See `atomic_output_file()`, `copyfile_atomic()`. + See `atomic_output_file()`, `atomic_write_text()`, `atomic_write_bytes()`, + `copyfile_atomic()`. - **Abbreviate and quote strings**, which is useful for logging in a clean way. See `abbrev_str()`, `single_line()`, `quote_if_needed()`. @@ -43,6 +45,27 @@ The libs are all small so see pydoc strings or code for full docs. > that has some extra functions for pretty, human-readable outputs for objects, sizes, > times and dates, etc. +## Using strif with LLM Agents + +Strif is handy for code that generates files, which is increasingly often AI agent code. + +- **Atomic writes for streamed or generated output.** If a generation is interrupted or + crashes mid-write, you never leave a truncated or corrupt file in its final location. + `atomic_write_text("out.md", content)` is a one-liner for the common case. + +- **Content hashing for caching and dedup.** Use `hash_file()` or `hash_string()` to key + a cache on file contents, or `file_mtime_hash()` for a fast (content-free) cache key. + +- **Sortable, readable run ids.** `new_timestamped_uid()` gives ids that sort by creation + time, which is convenient for logs and scratch directories. + +```python +from strif import atomic_write_text + +# Safe even if the process dies partway through writing: +atomic_write_text("some-dir/output.md", generated_text, make_parents=True) +``` + ## Installation ```sh @@ -178,6 +201,13 @@ pip install strif Moves a file to a new location, automatically creating parent directories and optionally keeping a backup of the destination if it already exists. 
+- **`atomic_write_text(dest_path, text, make_parents=False, backup_suffix=None, + encoding='utf-8')`** and **`atomic_write_bytes(dest_path, data, make_parents=False, + backup_suffix=None)`** + + Convenience wrappers around `atomic_output_file()` for the common case of writing a + whole string or bytes value atomically in a single call. + For example, it is generally a good idea to wrap an `open()` call with `atomic_output_file()`: @@ -187,6 +217,12 @@ with atomic_output_file("some-dir/my-final-output.txt") as temp_target: f.write("some contents") ``` +Or, for the common whole-value case, just: + +```python +atomic_write_text("some-dir/my-final-output.txt", "some contents") +``` + And this can (and in most cases should) be used in place of `shutil.copyfile`: ```python @@ -204,7 +240,7 @@ There are also some handy additional options: with atomic_output_file("some-dir/my-final-output.txt", make_parents=True, backup_suffix=".old.{timestamp}") as temp_target: with open(temp_target, "w") as f: - sf.write("some contents") + f.write("some contents") ``` This creates parent folders as needed (a major convenience). @@ -310,6 +346,9 @@ Examples: ## Multiple String Replacements +`Insertion` and `Replacement` are `NamedTuple`s, so you can use named fields +(`Insertion(offset, text)`, `Replacement(start, end, text)`) or plain positional tuples. + - **`insert_multiple(text: str, insertions: list[Insertion]) -> str`** Insert multiple strings into `text` at the given offsets, at once. @@ -317,7 +356,7 @@ Examples: - **`replace_multiple(text: str, replacements: list[Replacement]) -> str`** Replace multiple substrings in `text` with new strings, simultaneously. - The replacements are a list of tuples (start_offset, end_offset, new_string). + Each `Replacement` is `(start_offset, end_offset, new_string)`. 
## FAQ diff --git a/pyproject.toml b/pyproject.toml index e4195ae..35975a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,10 @@ bump = true # The source location for the package. packages = ["src/strif"] +[tool.hatch.build.targets.sdist] +# Keep agent/tooling state out of the published source distribution. +exclude = [".claude", ".tbd", ".github", ".copier-answers.yml", "attic"] + # ---- Settings ---- diff --git a/src/strif/__init__.py b/src/strif/__init__.py index 90e0a41..cd2ec05 100644 --- a/src/strif/__init__.py +++ b/src/strif/__init__.py @@ -1,4 +1,7 @@ +from importlib.metadata import PackageNotFoundError, version + __all__ = ( # noqa: F405 + "__version__", # atomic_var.py "AtomicVar", # strif.py @@ -11,6 +14,7 @@ "clean_alphanum_hash", "file_mtime_hash", "base36_encode", + "HashAlgorithm", "Hash", "hash_string", "hash_file", @@ -25,6 +29,8 @@ "move_file", "make_parent_dirs", "atomic_output_file", + "atomic_write_text", + "atomic_write_bytes", "temp_output_file", "temp_output_dir", "copyfile_atomic", @@ -41,7 +47,13 @@ "StringTemplate", ) -from .atomic_var import * # noqa: F403 -from .strif import * # noqa: F403 -from .string_replace import * # noqa: F403 -from .string_template import * # noqa: F403 +try: + __version__ = version("strif") +except PackageNotFoundError: + # Running from a source tree that isn't installed. 
+ __version__ = "0.0.0.dev0" + +from .atomic_var import * # noqa: F403, E402 +from .strif import * # noqa: F403, E402 +from .string_replace import * # noqa: F403, E402 +from .string_template import * # noqa: F403, E402 diff --git a/src/strif/strif.py b/src/strif/strif.py index c374298..33d89eb 100644 --- a/src/strif/strif.py +++ b/src/strif/strif.py @@ -18,7 +18,7 @@ from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path -from typing import Any +from typing import Any, Literal __all__ = ( "DEV_NULL", @@ -30,6 +30,7 @@ "clean_alphanum_hash", "file_mtime_hash", "base36_encode", + "HashAlgorithm", "Hash", "hash_string", "hash_file", @@ -44,6 +45,8 @@ "move_file", "make_parent_dirs", "atomic_output_file", + "atomic_write_text", + "atomic_write_bytes", "temp_output_file", "temp_output_dir", "copyfile_atomic", @@ -183,6 +186,10 @@ def base36_encode(n: int) -> str: return encoded +HashAlgorithm = Literal["sha1", "sha256", "sha384", "sha512", "md5", "blake2b", "blake2s"] +"""Common hash algorithms, for autocompletion. Any name `hashlib` accepts also works.""" + + @dataclass(frozen=True) class Hash: """ @@ -222,7 +229,7 @@ def with_prefix(self) -> str: return f"{self.algorithm}:{self.hex}" -def hash_string(string: str, algorithm: str = "sha1") -> Hash: +def hash_string(string: str, algorithm: HashAlgorithm | str = "sha1") -> Hash: """ Flexible hash of a string. """ @@ -231,13 +238,10 @@ def hash_string(string: str, algorithm: str = "sha1") -> Hash: return Hash(algorithm, hasher.digest()) -def hash_file(file_path: str | Path, algorithm: str = "sha1") -> Hash: +def hash_file(file_path: str | Path, algorithm: HashAlgorithm | str = "sha1") -> Hash: """ Hash the content of a file. 
""" - if algorithm not in hashlib.algorithms_available: - raise ValueError(f"Unsupported hash algorithm: {algorithm}") - hasher = hashlib.new(algorithm) file_path = Path(file_path) with file_path.open("rb") as file: @@ -286,13 +290,6 @@ def abbrev_list( return joiner.join(shortened) -abbreviate_str = abbrev_str -"""Deprecated. Use `abbrev_str()` instead.""" - -abbreviate_list = abbrev_list -"""Deprecated. Use `abbrev_list()` instead.""" - - def single_line(text: str) -> str: """ Convert newlines and other whitespace to spaces. @@ -569,6 +566,39 @@ def atomic_output_file( tmp_path.replace(dest_path) +def atomic_write_text( + dest_path: str | Path, + text: str, + make_parents: bool = False, + backup_suffix: str | None = None, + encoding: str = "utf-8", +) -> None: + """ + Atomically write a string to a file, so a partial or corrupt file never appears + at `dest_path`. Convenience wrapper around `atomic_output_file()`. + """ + with atomic_output_file( + dest_path, make_parents=make_parents, backup_suffix=backup_suffix + ) as tmp_path: + tmp_path.write_text(text, encoding=encoding) + + +def atomic_write_bytes( + dest_path: str | Path, + data: bytes, + make_parents: bool = False, + backup_suffix: str | None = None, +) -> None: + """ + Atomically write bytes to a file, so a partial or corrupt file never appears + at `dest_path`. Convenience wrapper around `atomic_output_file()`. 
+ """ + with atomic_output_file( + dest_path, make_parents=make_parents, backup_suffix=backup_suffix + ) as tmp_path: + tmp_path.write_bytes(data) + + @contextmanager def temp_output_file( prefix: str = "tmp", diff --git a/src/strif/string_replace.py b/src/strif/string_replace.py index 51a7b5d..f7ae81a 100644 --- a/src/strif/string_replace.py +++ b/src/strif/string_replace.py @@ -1,8 +1,13 @@ -from typing import TypeAlias +from __future__ import annotations + +from typing import NamedTuple __all__ = ["Insertion", "insert_multiple", "Replacement", "replace_multiple"] -Insertion = tuple[int, str] + +class Insertion(NamedTuple): + offset: int + text: str def insert_multiple(text: str, insertions: list[Insertion]) -> str: @@ -19,7 +24,10 @@ def insert_multiple(text: str, insertions: list[Insertion]) -> str: return "".join(chunks) -Replacement: TypeAlias = tuple[int, int, str] +class Replacement(NamedTuple): + start: int + end: int + text: str def replace_multiple(text: str, replacements: list[Replacement]) -> str: diff --git a/tests/test_atomic_var.py b/tests/test_atomic_var.py new file mode 100644 index 0000000..c6a7e1a --- /dev/null +++ b/tests/test_atomic_var.py @@ -0,0 +1,91 @@ +import threading +from dataclasses import dataclass + +from strif import AtomicVar +from strif.atomic_var import value_is_immutable + + +def test_atomic_var_set_swap_update(): + var = AtomicVar(0) + var.set(5) + assert var.value == 5 + assert var.swap(10) == 5 + assert var.value == 10 + # update() with a returning function. + assert var.update(lambda x: x + 1) == 11 + # update() with an in-place mutation returns None -> value unchanged reference. + lst = AtomicVar([1, 2]) + assert lst.update(lambda x: x.append(3)) == [1, 2, 3] + + +def test_atomic_var_copy_independence(): + var = AtomicVar([[1], [2]]) + shallow = var.copy() + deep = var.deepcopy() + var.value[0].append(99) + # Shallow copy shares inner lists; deep copy does not. 
+ assert shallow[0] == [1, 99] + assert deep[0] == [1] + + +def test_atomic_var_updates_context_manager(): + var = AtomicVar([1, 2, 3]) + with var.updates() as value: + value.append(4) + assert var.value == [1, 2, 3, 4] + + +def test_atomic_var_updates_rejects_immutable(): + var = AtomicVar(0) + try: + with var.updates(): + pass + raise AssertionError("updates() should reject immutable values") + except ValueError: + pass + + +def test_atomic_var_truthiness(): + assert not AtomicVar(0) + assert AtomicVar(1) + assert not AtomicVar([]) + assert AtomicVar([1]) + + +def test_value_is_immutable(): + assert value_is_immutable(0) + assert value_is_immutable("x") + assert value_is_immutable((1, 2)) + assert not value_is_immutable([1, 2]) + assert not value_is_immutable({}) + + @dataclass(frozen=True) + class Frozen: + x: int + + @dataclass + class Mutable: + x: int + + assert value_is_immutable(Frozen(1)) + assert not value_is_immutable(Mutable(1)) + + +def test_atomic_var_concurrent_updates(): + # Without the lock serializing the read-modify-write, the final count would be + # less than the expected total due to lost updates. 
+ var = AtomicVar(0) + threads_count = 10 + increments = 1000 + + def worker(): + for _ in range(increments): + var.update(lambda x: x + 1) + + threads = [threading.Thread(target=worker) for _ in range(threads_count)] + for t in threads: + t.start() + for t in threads: + t.join() + + assert var.value == threads_count * increments diff --git a/tests/test_files.py b/tests/test_files.py index 2120fcd..43ffb77 100644 --- a/tests/test_files.py +++ b/tests/test_files.py @@ -5,6 +5,8 @@ from strif import ( atomic_output_file, + atomic_write_bytes, + atomic_write_text, copy_to_backup, is_truthy, move_file, @@ -65,6 +67,37 @@ def test_atomic_output_file_force_replaces_dir(tmp_path: Path): assert target.read_text() == "now a file" +def test_atomic_output_file_timestamp_backups_do_not_clobber(tmp_path: Path): + out = tmp_path / "out.txt" + out.write_text("v1") + with atomic_output_file(out, backup_suffix="{timestamp}.bak") as tmp: + tmp.write_text("v2") + with atomic_output_file(out, backup_suffix="{timestamp}.bak") as tmp: + tmp.write_text("v3") + # Each write keeps its own uniquely-named backup, so both prior versions survive. 
+ backups = sorted(p.read_text() for p in tmp_path.glob("out.txt*bak")) + assert out.read_text() == "v3" + assert backups == ["v1", "v2"] + + +def test_atomic_write_text_and_bytes(tmp_path: Path): + text_path = tmp_path / "a.txt" + atomic_write_text(text_path, "hello") + assert text_path.read_text() == "hello" + + bytes_path = tmp_path / "b.bin" + atomic_write_bytes(bytes_path, b"\x00\x01\x02") + assert bytes_path.read_bytes() == b"\x00\x01\x02" + + +def test_atomic_write_text_make_parents_and_backup(tmp_path: Path): + nested = tmp_path / "sub" / "a.txt" + atomic_write_text(nested, "first", make_parents=True) + atomic_write_text(nested, "second", backup_suffix=".bak") + assert nested.read_text() == "second" + assert (tmp_path / "sub" / "a.txt.bak").read_text() == "first" + + @pytest.mark.skipif(not os.path.exists("/proc/self/fd"), reason="Linux-only fd accounting") def test_temp_output_file_no_fd_leak(): before = len(os.listdir("/proc/self/fd")) diff --git a/tests/test_hash.py b/tests/test_hash.py index 52f43c3..c4ecdc0 100644 --- a/tests/test_hash.py +++ b/tests/test_hash.py @@ -1,14 +1,14 @@ -import os +import hashlib +from pathlib import Path -from strif import hash_file, hash_string +import pytest +from strif import hash_file, hash_string -def test_hash_file(): - os.makedirs("tmp", exist_ok=True) - file_path = "tmp/test_file.txt" - with open(file_path, "w") as f: - f.write("Hello, World!") +def test_hash_file(tmp_path: Path): + file_path = tmp_path / "test_file.txt" + file_path.write_text("Hello, World!") result_hash = hash_file(file_path, "sha1").with_prefix assert result_hash == "sha1:0a0a9f2a6772942557ab5355d76af442f8f65e01" @@ -16,3 +16,20 @@ def test_hash_file(): assert hash_string("Hello, World!").with_prefix == result_hash assert hash_string("Hello, World!").base64 == "CgqfKmdylCVXq1NV12r0Qvj2XgE=" + + +def test_hash_file_binary_and_chunked(tmp_path: Path): + # Non-UTF8 bytes plus a payload larger than the 8192-byte read chunk, to exercise + # the 
chunked read loop and confirm binary content hashes identically to hashlib. + payload = bytes(range(256)) * 500 # 128 KB + file_path = tmp_path / "blob.bin" + file_path.write_bytes(payload) + + assert hash_file(file_path, "sha256").hex == hashlib.sha256(payload).hexdigest() + + +def test_hash_file_unsupported_algorithm(tmp_path: Path): + file_path = tmp_path / "f.txt" + file_path.write_text("x") + with pytest.raises(ValueError): + hash_file(file_path, "not-a-real-algo") diff --git a/tests/test_ids.py b/tests/test_ids.py new file mode 100644 index 0000000..1e7a8d1 --- /dev/null +++ b/tests/test_ids.py @@ -0,0 +1,27 @@ +import re + +from strif import new_timestamped_uid, new_uid + +_BASE36 = re.compile(r"^[0-9a-z]+$") + + +def test_new_uid_charset_and_length(): + assert _BASE36.match(new_uid()) + # Length follows int(bits / 5.16) + 1 over the 36-char alphabet. + assert len(new_uid(32)) == int(32 / 5.16) + 1 + assert len(new_uid(64)) == int(64 / 5.16) + 1 + assert len(new_uid(128)) > len(new_uid(64)) + + +def test_new_uid_is_random(): + assert new_uid() != new_uid() + + +def test_new_timestamped_uid_format(): + uid = new_timestamped_uid() + # The id starts with a fixed-width UTC timestamp (e.g. 20150912T084555Z-...), which + # is what makes these ids sort by creation time lexically. A random suffix follows. 
+ prefix, _, suffix = uid.partition("-") + assert re.match(r"^\d{8}T\d{6}", prefix) + assert _BASE36.match(suffix.rsplit("-", 1)[-1]) + assert new_timestamped_uid() != new_timestamped_uid() diff --git a/tests/test_string_replace.py b/tests/test_string_replace.py index 2a0e816..a0fcc65 100644 --- a/tests/test_string_replace.py +++ b/tests/test_string_replace.py @@ -1,53 +1,46 @@ -from strif.string_replace import Insertion, Replacement, insert_multiple, replace_multiple +import pytest +from strif.string_replace import Insertion, Replacement, insert_multiple, replace_multiple -def test_insert_multiple(): - text = "hello world" - insertions: list[Insertion] = [(5, ",")] - expected = "hello, world" - assert insert_multiple(text, insertions) == expected, "Single insertion failed" - text = "hello world" - insertions = [(0, "Start "), (11, " End")] - expected = "Start hello world End" - assert insert_multiple(text, insertions) == expected, "Multiple insertions failed" +def test_named_tuple_fields_and_positional_compat(): + # NamedTuple gives named access while staying tuple-compatible, so both the named + # and positional/unpacking APIs must work. 
+ ins = Insertion(offset=5, text=",") + assert ins.offset == 5 and ins.text == "," + assert ins == (5, ",") - text = "short" - insertions = [(10, " end")] - expected = "short end" - assert insert_multiple(text, insertions) == expected, "Out of bounds insertion failed" + start, end, text = Replacement(0, 3, "x") + assert (start, end, text) == (0, 3, "x") - text = "negative test" - insertions = [(-1, "ss")] - expected = "negative tessst" - assert insert_multiple(text, insertions) == expected, "Negative offset insertion failed" - text = "no change" - insertions = [] - expected = "no change" - assert insert_multiple(text, insertions) == expected, "Empty insertions failed" +def test_insert_multiple(): + assert insert_multiple("hello world", [Insertion(5, ",")]) == "hello, world" + assert ( + insert_multiple("hello world", [Insertion(0, "Start "), Insertion(11, " End")]) + == "Start hello world End" + ) + # Out-of-bounds offset clamps to the end. + assert insert_multiple("short", [Insertion(10, " end")]) == "short end" + # Negative offset indexes from the end. 
+ assert insert_multiple("negative test", [Insertion(-1, "ss")]) == "negative tessst" + assert insert_multiple("no change", []) == "no change" def test_replace_multiple(): - text = "The quick brown fox" - replacements: list[Replacement] = [(4, 9, "slow"), (16, 19, "dog")] - expected = "The slow brown dog" - assert replace_multiple(text, replacements) == expected, "Multiple replacements failed" - - text = "overlap test" - replacements = [(0, 6, "start"), (5, 10, "end")] - try: - replace_multiple(text, replacements) - raise AssertionError("Overlapping replacements did not raise ValueError") - except ValueError: - pass # Expected exception - - text = "short text" - replacements = [(5, 10, " longer text")] - expected = "short longer text" - assert replace_multiple(text, replacements) == expected, "Out of bounds replacement failed" - - text = "no change" - replacements = [] - expected = "no change" - assert replace_multiple(text, replacements) == expected, "Empty replacements failed" + assert ( + replace_multiple( + "The quick brown fox", [Replacement(4, 9, "slow"), Replacement(16, 19, "dog")] + ) + == "The slow brown dog" + ) + # Out-of-bounds end clamps to the end. + assert ( + replace_multiple("short text", [Replacement(5, 10, " longer text")]) == "short longer text" + ) + assert replace_multiple("no change", []) == "no change" + + +def test_replace_multiple_rejects_overlap(): + with pytest.raises(ValueError): + replace_multiple("overlap test", [Replacement(0, 6, "start"), Replacement(5, 10, "end")])