diff --git a/poetry.lock b/poetry.lock index d83268cc..6f5b3c5e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -33,8 +33,8 @@ torch = ">=1.10.0" [package.extras] deepspeed = ["deepspeed"] -dev = ["bitsandbytes", "black (~=23.1)", "datasets", "diffusers", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "rich", "ruff (~=0.6.4)", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] -quality = ["black (~=23.1)", "hf-doc-builder (>=0.3.0)", "ruff (~=0.6.4)"] +dev = ["bitsandbytes", "black (>=23.1,<24.0)", "datasets", "diffusers", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.6.4,<0.7.0)", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] +quality = ["black (>=23.1,<24.0)", "hf-doc-builder (>=0.3.0)", "ruff (>=0.6.4,<0.7.0)"] rich = ["rich"] sagemaker = ["sagemaker"] test-dev = ["bitsandbytes", "datasets", "diffusers", "evaluate", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] @@ -5179,13 +5179,13 @@ type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.12 [[package]] name = "sil-machine" -version = "1.8.7" +version = "1.8.8" description = "A natural language processing library that is focused on providing tools for resource-poor languages." optional = false python-versions = "<3.14,>=3.10" files = [ - {file = "sil_machine-1.8.7-py3-none-any.whl", hash = "sha256:c6fdd64b0a071bdeb9ca6d999d3685b15d0ff5d9104758eb90e54ab37dc1d885"}, - {file = "sil_machine-1.8.7.tar.gz", hash = "sha256:5a6e7830ad609ec3b0c1b9c6bb1eedd22b63c3eb28493d2c6c18790439f2a654"}, + {file = "sil_machine-1.8.8-py3-none-any.whl", hash = "sha256:cd2479b3ac12b6421b44774f386f8da9e5fc84e4db446722890613c4bafdacc4"}, + {file = "sil_machine-1.8.8.tar.gz", hash = "sha256:ec8623c9acb593907e7cdfaf3ca0c54b6a4a87c596c24c50d7736fe10842c505"}, ] [package.dependencies] @@ -5373,6 +5373,7 @@ huggingface-hub = ">=0.16.4,<1.0" dev = ["tokenizers[testing]"] docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"] testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"] + [[package]] name = "toml" version = "0.10.2" @@ -5534,17 +5535,17 @@ tqdm = ">=4.27" [package.extras] accelerate = ["accelerate (>=0.26.0)"] -agents = ["Pillow (<=15.0,>=10.0.1)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (!=0.1.92,>=0.1.91)", "torch (>=2.0)"] -all = ["Pillow (<=15.0,>=10.0.1)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (>=2.8.1)", "flax (<=0.7.0,>=0.4.1)", "jax (<=0.4.13,>=0.4.1)", "jaxlib (<=0.4.13,>=0.4.1)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (<=0.1.4,>=0.0.8)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (!=0.1.92,>=0.1.91)", "sigopt", "tensorflow (<2.16,>2.9)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"] +agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=2.0)"] +all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] benchmark = ["optimum-benchmark (>=0.3.0)"] codecarbon = ["codecarbon (>=2.8.1)"] deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (<8.0.0,>=7.2.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (!=0.1.92,>=0.1.91)", "tensorboard", "timeout-decorator"] -dev = ["GitPython (<3.1.19)", "Pillow (<=15.0,>=10.0.1)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (<=0.7.0,>=0.4.1)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (<=0.4.13,>=0.4.1)", "jaxlib (<=0.4.13,>=0.4.1)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (<=0.1.4,>=0.0.8)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pydantic", "pytest (<8.0.0,>=7.2.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (!=0.1.92,>=0.1.91)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (<2.16,>2.9)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (<=15.0,>=10.0.1)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (<8.0.0,>=7.2.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (!=0.1.92,>=0.1.91)", "tensorboard", "tensorflow (<2.16,>2.9)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "Pillow (<=15.0,>=10.0.1)", "isort (>=5.5.4)", "urllib3 (<2.0.0)", "libcst", "rich", "scikit-learn", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "librosa", "pyctcdecode (>=0.4.0)", "phonemizer", "kenlm"] -dev-torch = ["GitPython (<3.1.19)", "Pillow (<=15.0,>=10.0.1)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (<8.0.0,>=7.2.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (!=0.1.92,>=0.1.91)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -flax = ["flax (<=0.7.0,>=0.4.1)", "jax (<=0.4.13,>=0.4.1)", "jaxlib (<=0.4.13,>=0.4.1)", "optax (<=0.1.4,>=0.0.8)", "scipy (<1.13.0)"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"] flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] ftfy = ["ftfy"] integrations = ["optuna", "ray[tune] (>=2.7.0)", "sigopt"] @@ -5559,13 +5560,13 @@ ray = ["ray[tune] (>=2.7.0)"] retrieval = ["datasets (!=2.5.0)", "faiss-cpu"] ruff = ["ruff (==0.5.1)"] sagemaker = ["sagemaker (>=2.31.0)"] -sentencepiece = ["protobuf", "sentencepiece (!=0.1.92,>=0.1.91)"] +sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] serving = ["fastapi", "pydantic", "starlette", "uvicorn"] sigopt = ["sigopt"] sklearn = ["scikit-learn"] speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "parameterized", "psutil", "pydantic", "pytest (<8.0.0,>=7.2.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (!=0.1.92,>=0.1.91)", "tensorboard", "timeout-decorator"] -tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (<2.16,>2.9)", "tensorflow-text (<2.16)", "tf2onnx"] +testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"] tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] tiktoken = ["blobfile", "tiktoken"] @@ -5573,10 +5574,11 @@ timm = ["timm (<=1.0.11)"] tokenizers = ["tokenizers (>=0.21,<0.22)"] torch = ["accelerate (>=0.26.0)", "torch (>=2.0)"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -torch-vision = ["Pillow (<=15.0,>=10.0.1)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=0.24.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (!=0.1.92,>=0.1.91)", "torch (>=2.0)", "tokenizers (>=0.21,<0.22)", "tqdm (>=4.27)"] +torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] +torchhub = ["filelock", "huggingface-hub (>=0.24.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"] video = ["av (==9.2.0)"] -vision = ["Pillow (<=15.0,>=10.0.1)"] +vision = ["Pillow (>=10.0.1,<=15.0)"] + [[package]] name = "triton" version = "3.0.0" @@ -6141,4 +6143,4 @@ eflomal = ["eflomal"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.11" -content-hash = "d62919aa6e21a77234e7a3adc039a4a39bd3938a6921762738beab2fda9570f0" +content-hash = "8c59c48c6055ee5bcb03ced812d963c5151bf8e03672aaf0d188cbbb444785e1" diff --git a/pyproject.toml b/pyproject.toml index bf0f4987..7e94be7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,7 +70,7 @@ tqdm = "^4.62.2" sacrebleu = "^2.3.1" ctranslate2 = "^3.5.1" libclang = "14.0.6" -sil-machine = {extras = ["thot"], version = "1.8.7"} +sil-machine = {extras = ["thot"], version = "1.8.8"} datasets = "^2.7.1" torch = {version = "^2.4", source = "torch"} sacremoses = "^0.0.53" diff --git a/silnlp/common/translator.py b/silnlp/common/translator.py index 6dc98c0e..8e3b6da4 100644 --- a/silnlp/common/translator.py +++ b/silnlp/common/translator.py @@ -336,7 +336,7 @@ def translate_book( trg_iso: str, produce_multiple_translations: bool = False, save_confidences: bool = False, - chapters: List[int] = [], + chapters: Optional[List[int]] = None, trg_project: Optional[str] = None, postprocess_handler: Optional[PostprocessHandler] = None, experiment_ckpt_str: str = "", @@ -372,7 +372,7 @@ def translate_usfm( trg_iso: str, produce_multiple_translations: bool = False, save_confidences: bool = False, - chapters: List[int] = [], + chapters: Optional[List[int]] = None, trg_project: Optional[str] = None, postprocess_handler: Optional[PostprocessHandler] = None, experiment_ckpt_str: str = "", @@ -440,10 +440,21 @@ def translate_usfm( for config in postprocess_handler.configs: # Compile draft remarks - draft_src_str = f"project {src_file_text.project}" if src_from_project else f"file {src_file_path.name}" - draft_remark = f"This draft of {sentences.get_book()} was machine translated on {date.today()} from {draft_src_str} using model {experiment_ckpt_str}. It should be reviewed and edited carefully." + remarks: List[Tuple[int, str]] = [] postprocess_remark = config.get_postprocess_remark() - remarks = [draft_remark] + ([postprocess_remark] if postprocess_remark else []) + draft_src_str = f"project {src_file_text.project}" if src_from_project else f"file {src_file_path.name}" + chapters_for_remarks = ( + chapters if chapters else sorted({sr.verse_ref.chapter_num for sr in scripture_refs}) + ) + for chapter_num in chapters_for_remarks: + draft_remark = ( + f"This draft of {sentences.get_book()} {chapter_num} was machine translated on " + f"{date.today()} from {draft_src_str} using model {experiment_ckpt_str}. It should be " + f"reviewed and edited carefully." + ) + remarks.append((chapter_num, draft_remark)) + if postprocess_remark: + remarks.append((chapter_num, postprocess_remark)) # Insert translation into the USFM structure of an existing project # If the target project is not the same as the translated file's original project, @@ -460,6 +471,7 @@ def translate_usfm( usfm_out = dest_updater.update_usfm( book_id=src_file_text.id, rows=config.rows, + chapters=chapters, text_behavior=text_behavior, paragraph_behavior=config.get_paragraph_behavior(), embed_behavior=config.get_embed_behavior(), diff --git a/silnlp/nmt/translate.py b/silnlp/nmt/translate.py index 5b258d1a..12a42c92 100644 --- a/silnlp/nmt/translate.py +++ b/silnlp/nmt/translate.py @@ -163,7 +163,7 @@ def translate_books( trg_iso, produce_multiple_translations, save_confidences, - chapters, + chapters if chapters else None, trg_project, postprocess_handler, experiment_ckpt_str,