From aa17eb96fd7c22c5a41eabfbd285f7370f911010 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Mar 2026 13:12:10 +0100 Subject: [PATCH 01/42] Add lazy spaCy CLI loading and static launcher --- MANIFEST.in | 1 + setup.cfg | 2 +- setup.py | 2 +- spacy/cli/__init__.py | 110 +++++++++++++++++++--------- spacy/cli/_dispatch.py | 105 +++++++++++++++++++++++++++ spacy/cli/_util.py | 18 ++--- spacy/tests/test_cli_app.py | 4 ++ spacy/tests/test_cli_launcher.py | 102 ++++++++++++++++++++++++++ spacy_cli/__init__.py | 1 + spacy_cli/build_manifest.py | 99 ++++++++++++++++++++++++++ spacy_cli/cli_manifest.json | 118 +++++++++++++++++++++++++++++++ spacy_cli/main.py | 69 ++++++++++++++++++ spacy_cli/static.py | 24 +++++++ 13 files changed, 610 insertions(+), 45 deletions(-) create mode 100644 spacy/cli/_dispatch.py create mode 100644 spacy/tests/test_cli_launcher.py create mode 100644 spacy_cli/__init__.py create mode 100644 spacy_cli/build_manifest.py create mode 100644 spacy_cli/cli_manifest.json create mode 100644 spacy_cli/main.py create mode 100644 spacy_cli/static.py diff --git a/MANIFEST.in b/MANIFEST.in index 1caf758464f..36465ea94a0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml *.hh +recursive-include spacy_cli *.json include LICENSE include README.md include pyproject.toml diff --git a/setup.cfg b/setup.cfg index c4928af9224..a0cfac9749a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -68,7 +68,7 @@ install_requires = [options.entry_points] console_scripts = - spacy = spacy.cli:setup_cli + spacy = spacy_cli.main:main [options.extras_require] lookups = diff --git a/setup.py b/setup.py index 33178662df4..4243f8731f3 100755 --- a/setup.py +++ b/setup.py @@ -213,7 +213,7 @@ def setup_package(): version=about["__version__"], ext_modules=ext_modules, cmdclass={"build_ext": build_ext_subclass}, - package_data={"": ["*.pyx", "*.pxd", "*.pxi"]}, + package_data={"": ["*.pyx", "*.pxd", 
"*.pxi"], "spacy_cli": ["*.json"]}, ) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 3095778fe22..dcfb4b8a92e 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,40 +1,82 @@ +import sys +from importlib import import_module +from typing import Iterable + +from typer.main import get_command from wasabi import msg -# Needed for testing -from . import download as download_module # noqa: F401 -from ._util import app, setup_cli # noqa: F401 -from .apply import apply # noqa: F401 -from .assemble import assemble_cli # noqa: F401 - -# These are the actual functions, NOT the wrapped CLI commands. The CLI commands -# are registered automatically and won't have to be imported here. -from .benchmark_speed import benchmark_speed_cli # noqa: F401 -from .convert import convert # noqa: F401 -from .debug_config import debug_config # noqa: F401 -from .debug_data import debug_data # noqa: F401 -from .debug_diff import debug_diff # noqa: F401 -from .debug_model import debug_model # noqa: F401 -from .download import download # noqa: F401 -from .evaluate import evaluate # noqa: F401 -from .find_function import find_function # noqa: F401 -from .find_threshold import find_threshold # noqa: F401 -from .info import info # noqa: F401 -from .init_config import fill_config, init_config # noqa: F401 -from .init_pipeline import init_pipeline_cli # noqa: F401 -from .package import package # noqa: F401 -from .pretrain import pretrain # noqa: F401 -from .profile import profile # noqa: F401 -from .project.assets import project_assets # type: ignore[attr-defined] # noqa: F401 -from .project.clone import project_clone # type: ignore[attr-defined] # noqa: F401 -from .project.document import ( # type: ignore[attr-defined] # noqa: F401 - project_document, +from ..util import registry +from ._dispatch import ( + GROUP_MODULES, + PUBLIC_ATTRS, + SUBCOMMAND_MODULES, + TOP_LEVEL_MODULES, ) -from .project.dvc import project_update_dvc # type: ignore[attr-defined] # noqa: F401 
-from .project.pull import project_pull # type: ignore[attr-defined] # noqa: F401 -from .project.push import project_push # type: ignore[attr-defined] # noqa: F401 -from .project.run import project_run # type: ignore[attr-defined] # noqa: F401 -from .train import train_cli # type: ignore[attr-defined] # noqa: F401 -from .validate import validate # type: ignore[attr-defined] # noqa: F401 +from ._dispatch import iter_builtin_modules +from ._util import COMMAND, add_project_cli, app + +HELP_OPTIONS = {"--help", "-h"} +ROOT_OPTIONS = HELP_OPTIONS | {"--install-completion", "--show-completion"} + +__all__ = [ + "app", + "load_all_commands", + "load_for_argv", + "setup_cli", + *sorted(PUBLIC_ATTRS), +] + + +def _import_modules(module_names: Iterable[str]) -> None: + for module_name in module_names: + import_module(module_name) + + +def load_all_commands() -> None: + _import_modules(iter_builtin_modules()) + add_project_cli() + + +def load_for_argv(argv: Iterable[str]) -> None: + args = list(argv) + if not args or args[0] in ROOT_OPTIONS or args[0].startswith("-"): + load_all_commands() + return + command = args[0] + if command == "project": + add_project_cli() + return + if command in GROUP_MODULES: + subcommand = args[1] if len(args) > 1 and not args[1].startswith("-") else None + if subcommand is not None and (command, subcommand) in SUBCOMMAND_MODULES: + _import_modules(SUBCOMMAND_MODULES[(command, subcommand)]) + return + _import_modules(GROUP_MODULES[command]) + return + if command in TOP_LEVEL_MODULES: + _import_modules(TOP_LEVEL_MODULES[command]) + + +def setup_cli() -> None: + # Make sure entry-point CLI integrations are imported before command dispatch. 
+ registry.cli.get_all() + load_for_argv(sys.argv[1:]) + command = get_command(app) + command(prog_name=COMMAND) + + +def __getattr__(name: str): + if name not in PUBLIC_ATTRS: + raise AttributeError(f"module 'spacy.cli' has no attribute {name!r}") + module_name, attr_name = PUBLIC_ATTRS[name] + module = import_module(module_name) + value = module if attr_name is None else getattr(module, attr_name) + globals()[name] = value + return value + + +def __dir__(): + return sorted(set(globals()) | set(PUBLIC_ATTRS)) @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/_dispatch.py b/spacy/cli/_dispatch.py new file mode 100644 index 00000000000..e1975dd7e1d --- /dev/null +++ b/spacy/cli/_dispatch.py @@ -0,0 +1,105 @@ +from typing import Dict, Iterable, Optional, Tuple + + +CommandPath = Tuple[str, ...] + + +TOP_LEVEL_MODULES: Dict[str, Tuple[str, ...]] = { + "apply": ("spacy.cli.apply",), + "assemble": ("spacy.cli.assemble",), + "convert": ("spacy.cli.convert",), + "debug-data": ("spacy.cli.debug_data",), + "download": ("spacy.cli.download",), + "evaluate": ("spacy.cli.evaluate",), + "find-function": ("spacy.cli.find_function",), + "find-threshold": ("spacy.cli.find_threshold",), + "info": ("spacy.cli.info",), + "package": ("spacy.cli.package",), + "pretrain": ("spacy.cli.pretrain",), + "profile": ("spacy.cli.profile",), + "train": ("spacy.cli.train",), + "validate": ("spacy.cli.validate",), +} + + +GROUP_MODULES: Dict[str, Tuple[str, ...]] = { + "benchmark": ( + "spacy.cli.benchmark_speed", + "spacy.cli.evaluate", + ), + "debug": ( + "spacy.cli.debug_config", + "spacy.cli.debug_data", + "spacy.cli.debug_diff", + "spacy.cli.debug_model", + "spacy.cli.profile", + ), + "init": ( + "spacy.cli.init_config", + "spacy.cli.init_pipeline", + ), +} + + +SUBCOMMAND_MODULES: Dict[CommandPath, Tuple[str, ...]] = { + ("benchmark", "accuracy"): ("spacy.cli.evaluate",), + ("benchmark", "speed"): ("spacy.cli.benchmark_speed",), + ("debug", 
"config"): ("spacy.cli.debug_config",), + ("debug", "data"): ("spacy.cli.debug_data",), + ("debug", "diff-config"): ("spacy.cli.debug_diff",), + ("debug", "model"): ("spacy.cli.debug_model",), + ("debug", "profile"): ("spacy.cli.profile",), + ("init", "config"): ("spacy.cli.init_config",), + ("init", "fill-config"): ("spacy.cli.init_config",), + ("init", "labels"): ("spacy.cli.init_pipeline",), + ("init", "nlp"): ("spacy.cli.init_pipeline",), + ("init", "vectors"): ("spacy.cli.init_pipeline",), +} + + +PUBLIC_ATTRS: Dict[str, Tuple[str, Optional[str]]] = { + "app": ("spacy.cli._util", "app"), + "apply": ("spacy.cli.apply", "apply"), + "assemble_cli": ("spacy.cli.assemble", "assemble_cli"), + "benchmark_speed_cli": ("spacy.cli.benchmark_speed", "benchmark_speed_cli"), + "convert": ("spacy.cli.convert", "convert"), + "debug_config": ("spacy.cli.debug_config", "debug_config"), + "debug_data": ("spacy.cli.debug_data", "debug_data"), + "debug_diff": ("spacy.cli.debug_diff", "debug_diff"), + "debug_model": ("spacy.cli.debug_model", "debug_model"), + "download": ("spacy.cli.download", "download"), + "download_module": ("spacy.cli.download", None), + "evaluate": ("spacy.cli.evaluate", "evaluate"), + "fill_config": ("spacy.cli.init_config", "fill_config"), + "find_function": ("spacy.cli.find_function", "find_function"), + "find_threshold": ("spacy.cli.find_threshold", "find_threshold"), + "info": ("spacy.cli.info", "info"), + "init_config": ("spacy.cli.init_config", "init_config"), + "init_pipeline_cli": ("spacy.cli.init_pipeline", "init_pipeline_cli"), + "package": ("spacy.cli.package", "package"), + "pretrain": ("spacy.cli.pretrain", "pretrain"), + "profile": ("spacy.cli.profile", "profile"), + "project_assets": ("spacy.cli.project.assets", "project_assets"), + "project_clone": ("spacy.cli.project.clone", "project_clone"), + "project_document": ("spacy.cli.project.document", "project_document"), + "project_pull": ("spacy.cli.project.pull", "project_pull"), + 
"project_push": ("spacy.cli.project.push", "project_push"), + "project_run": ("spacy.cli.project.run", "project_run"), + "project_update_dvc": ("spacy.cli.project.dvc", "project_update_dvc"), + "train_cli": ("spacy.cli.train", "train_cli"), + "validate": ("spacy.cli.validate", "validate"), +} + + +def iter_builtin_modules() -> Iterable[str]: + seen = set() + for modules in TOP_LEVEL_MODULES.values(): + for module in modules: + if module not in seen: + seen.add(module) + yield module + for modules in GROUP_MODULES.values(): + for module in modules: + if module not in seen: + seen.add(module) + yield module diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 309b6b1e79a..75b21dd3775 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -23,9 +23,7 @@ from click.shell_completion import split_arg_string from thinc.api import Config, ConfigValidationError, require_gpu from thinc.util import gpu_is_available -from typer.main import get_command from wasabi import Printer, msg -from weasel import app as project_cli from .. import about from ..compat import Literal @@ -72,19 +70,21 @@ benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True) debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True) init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True) +_PROJECT_CLI_ADDED = False -app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True) app.add_typer(debug_cli) app.add_typer(benchmark_cli) app.add_typer(init_cli) -def setup_cli() -> None: - # Make sure the entry-point for CLI runs, so that they get imported. 
- registry.cli.get_all() - # Ensure that the help messages always display the correct prompt - command = get_command(app) - command(prog_name=COMMAND) +def add_project_cli() -> None: + global _PROJECT_CLI_ADDED + if _PROJECT_CLI_ADDED: + return + from weasel import app as project_cli + + app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True) + _PROJECT_CLI_ADDED = True def parse_config_overrides( diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 1789d60ea4c..031de1fc25a 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -6,12 +6,16 @@ import srsly from typer.testing import CliRunner +from spacy.cli import load_all_commands from spacy.cli._util import app, get_git_version from spacy.tokens import Doc, DocBin, Span from .util import make_tempdir, normalize_whitespace +load_all_commands() + + def has_git(): try: get_git_version() diff --git a/spacy/tests/test_cli_launcher.py b/spacy/tests/test_cli_launcher.py new file mode 100644 index 00000000000..392572fdeec --- /dev/null +++ b/spacy/tests/test_cli_launcher.py @@ -0,0 +1,102 @@ +import importlib +import subprocess +import sys + +import pytest + +from spacy_cli.build_manifest import build_manifest +from spacy_cli.static import load_manifest + +launcher_module = importlib.import_module("spacy_cli.main") + + +def _run_python(code: str) -> str: + result = subprocess.run( + [sys.executable, "-c", code], + check=True, + capture_output=True, + text=True, + ) + return result.stdout.strip() + + +def test_cli_package_import_is_lazy(): + output = _run_python( + "import sys; import spacy.cli; " + "print('spacy.cli.train' in sys.modules); print('weasel' in sys.modules)" + ) + assert output.splitlines() == ["False", "False"] + + +def test_load_for_argv_imports_only_requested_command(): + output = _run_python( + "import sys; from spacy.cli import load_for_argv; " + "load_for_argv(['train', '--help']); " + "print('spacy.cli.train' in sys.modules); 
print('weasel' in sys.modules)" + ) + assert output.splitlines() == ["True", "False"] + + +def test_load_for_argv_imports_project_on_demand(): + output = _run_python( + "import sys; from spacy.cli import load_for_argv; " + "load_for_argv(['project', '--help']); print('weasel' in sys.modules)" + ) + assert output == "True" + + +def test_manifest_is_current(): + assert build_manifest() == load_manifest() + + +def test_launcher_root_help_uses_static(capsys, monkeypatch): + monkeypatch.setattr( + launcher_module, "_run_live", lambda: (_ for _ in ()).throw(AssertionError) + ) + with pytest.raises(SystemExit) as exc: + launcher_module.main(["--help"]) + assert exc.value.code == 0 + assert capsys.readouterr().out == load_manifest()["root_help"] + + +def test_launcher_command_help_uses_static(capsys, monkeypatch): + monkeypatch.setattr( + launcher_module, "_run_live", lambda: (_ for _ in ()).throw(AssertionError) + ) + with pytest.raises(SystemExit) as exc: + launcher_module.main(["train", "--help"]) + assert exc.value.code == 0 + assert capsys.readouterr().out == load_manifest()["command_help"]["train"] + + +def test_launcher_unknown_command_uses_static_error(capsys, monkeypatch): + monkeypatch.setattr( + launcher_module, "_run_live", lambda: (_ for _ in ()).throw(AssertionError) + ) + with pytest.raises(SystemExit) as exc: + launcher_module.main(["definitely-not-a-command"]) + assert exc.value.code == 2 + assert "No such command 'definitely-not-a-command'" in capsys.readouterr().out + + +def test_launcher_non_help_command_falls_back_to_live(monkeypatch): + called = [] + + def fake_run_live(): + called.append(True) + + monkeypatch.setattr(launcher_module, "_run_live", fake_run_live) + launcher_module.main(["train", "config.cfg"]) + assert called == [True] + + +def test_launcher_root_help_falls_back_with_plugins(monkeypatch): + called = [] + + def fake_run_live(): + called.append(True) + + monkeypatch.setattr(launcher_module, "_run_live", fake_run_live) + 
monkeypatch.setattr(launcher_module, "get_plugin_command_names", lambda: {"custom"}) + launcher_module.main(["--help"]) + assert called == [True] diff --git a/spacy_cli/__init__.py b/spacy_cli/__init__.py new file mode 100644 index 00000000000..a2cb1f66b78 --- /dev/null +++ b/spacy_cli/__init__.py @@ -0,0 +1 @@ +"""Lightweight launcher package for the spaCy console script.""" diff --git a/spacy_cli/build_manifest.py b/spacy_cli/build_manifest.py new file mode 100644 index 00000000000..6e019bdcb95 --- /dev/null +++ b/spacy_cli/build_manifest.py @@ -0,0 +1,99 @@ +import json +from pathlib import Path +from typing import Dict, Iterable, List + +from typer.main import get_command +from typer.testing import CliRunner + +from spacy.cli import load_all_commands +from spacy.cli._util import COMMAND, app + +from .static import MANIFEST_FILE, UNKNOWN_COMMAND_TOKEN, UNKNOWN_SUBCOMMAND_TOKEN + +DEFAULT_ENV = {"COLUMNS": "100", "LINES": "40", "TERM": "xterm-256color"} + + +def _invoke(runner: CliRunner, cli, args: Iterable[str]): + return runner.invoke(cli, list(args), prog_name=COMMAND, env=DEFAULT_ENV) + + +def _get_help(runner: CliRunner, cli, args: Iterable[str]) -> str: + result = _invoke(runner, cli, [*list(args), "--help"]) + if result.exit_code != 0: + err = f"Could not render help for: {' '.join(args) or ''}" + raise RuntimeError(err) + return result.stdout + + +def _maybe_get_help(runner: CliRunner, cli, args: Iterable[str]): + result = _invoke(runner, cli, [*list(args), "--help"]) + if result.exit_code != 0: + return None + return result.stdout + + +def build_manifest() -> Dict[str, object]: + load_all_commands() + cli = get_command(app) + runner = CliRunner() + known_top_level: List[str] = sorted(cli.commands.keys()) + known_groups: Dict[str, List[str]] = {} + hidden_top_level: List[str] = [] + hidden_group_commands: Dict[str, List[str]] = {} + group_help: Dict[str, str] = {} + command_help: Dict[str, str] = {} + unknown_subcommand: Dict[str, str] = {} + + for name, 
command in cli.commands.items(): + if getattr(command, "hidden", False): + hidden_top_level.append(name) + if hasattr(command, "commands"): + subcommands = sorted(command.commands.keys()) + known_groups[name] = subcommands + hidden_group_commands[name] = sorted( + sub_name + for sub_name, sub_cmd in command.commands.items() + if getattr(sub_cmd, "hidden", False) + ) + group_help[name] = _get_help(runner, app, [name]) + unknown_subcommand[name] = _invoke( + runner, app, [name, UNKNOWN_SUBCOMMAND_TOKEN] + ).stdout + for sub_name in subcommands: + help_text = _maybe_get_help(runner, app, [name, sub_name]) + if help_text is not None: + command_help[f"{name} {sub_name}"] = help_text + else: + command_help[name] = _get_help(runner, app, [name]) + + return { + "command": COMMAND, + "known_top_level": known_top_level, + "known_groups": known_groups, + "hidden_top_level": hidden_top_level, + "hidden_group_commands": hidden_group_commands, + "root_help": _get_help(runner, app, []), + "group_help": group_help, + "command_help": command_help, + "errors": { + "missing_command": _invoke(runner, app, []).stdout, + "unknown_command": _invoke(runner, app, [UNKNOWN_COMMAND_TOKEN]).stdout, + "unknown_subcommand": unknown_subcommand, + }, + } + + +def write_manifest(path: Path) -> Path: + data = build_manifest() + path.write_text( + json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True) + "\n" + ) + return path + + +def main() -> None: + write_manifest(Path(__file__).with_name(MANIFEST_FILE)) + + +if __name__ == "__main__": + main() diff --git a/spacy_cli/cli_manifest.json b/spacy_cli/cli_manifest.json new file mode 100644 index 00000000000..361a10dca16 --- /dev/null +++ b/spacy_cli/cli_manifest.json @@ -0,0 +1,118 @@ +{ + "command": "python -m spacy", + "command_help": { + "apply": " \n Usage: python -m spacy apply [OPTIONS] MODEL DATA_PATH OUTPUT_FILE \n \n Apply a trained pipeline to documents to get predictions. 
Expects a loadable spaCy pipeline and \n path to the data, which can be a directory or a file. The data files can be provided in multiple \n formats: 1. .spacy files 2. .jsonl files with a specified \"field\" to read the text from. \n 3. Files with any other extension are assumed to be containing a single document. DOCS: \n https://spacy.io/api/cli#apply \n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of the documents to predict on. Can be a single file in │\n│ .spacy format or a .jsonl file. Files with other extensions are │\n│ treated as single plain text documents. If a directory is provided │\n│ it is traversed recursively to grab all files to be processed. The │\n│ files can be a mixture of .spacy, .jsonl and text files. If .jsonl │\n│ is provided the specified field is going to be grabbed (\"text\" by │\n│ default). │\n│ [default: None] │\n│ [required] │\n│ * output_file FILE Path to save the resulting .spacy file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ [default: None] │\n│ --text-key -tk TEXT Key containing text string for JSONL [default: text] │\n│ --force -F Force overwriting the output file │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU. [default: -1] │\n│ --batch-size -b INTEGER Batch size. [default: 1] │\n│ --n-process -n INTEGER number of processors to use. [default: 1] │\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "assemble": " \n Usage: python -m spacy assemble [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n Assemble a spaCy pipeline from a config file. The config file includes all settings for \n initializing the pipeline. To override settings in the config, e.g. settings that point to local \n paths or that you want to experiment with, you can override them as command line options. The \n --code argument lets you pass in a Python file that can be used to register custom functions that \n are referenced in the config. \n \n DOCS: https://spacy.io/api/cli#assemble \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * output_path PATH Output directory to store assembled pipeline in [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) to │\n│ be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "benchmark accuracy": " \n Usage: python -m spacy benchmark accuracy [OPTIONS] MODEL DATA_PATH \n \n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in the binary \n .spacy format. The --gold-preproc option sets up the evaluation examples with gold-standard \n sentences and tokens for the predictions. Gold preprocessing helps the annotations align to the \n tokenization, and may result in sequences of more consistent length. 
However, it may reduce \n runtime accuracy due to train/test skew. To render a sample of dependency parses in a HTML file, \n set as output directory as the displacy_path argument. \n \n DOCS: https://spacy.io/api/cli#benchmark-accuracy \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o FILE Output JSON file for metrics [default: None] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --displacy-path -dp DIRECTORY Directory to output rendered parses as HTML │\n│ [default: None] │\n│ --displacy-limit -dl INTEGER Limit of parses to render as HTML [default: 25] │\n│ --per-component -P Return scores per component, only applicable when an │\n│ output JSON file is specified. │\n│ --spans-key -sk TEXT Spans key to use when evaluating Doc.spans [default: sc] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "benchmark speed": " \n Usage: python -m spacy benchmark speed [OPTIONS] MODEL DATA_PATH \n \n Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in the binary .spacy \n format. 
\n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --batch-size -b INTEGER RANGE [x>=1] Override the pipeline batch size [default: None] │\n│ --no-shuffle Do not shuffle benchmark data │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --batches INTEGER RANGE [x>=30] Minimum number of batches to benchmark │\n│ [default: 50] │\n│ --warmup -w INTEGER RANGE [x>=0] Number of iterations over the data for warmup │\n│ [default: 3] │\n│ --code -c PATH Path to Python file with additional code │\n│ (registered functions) to be imported │\n│ [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "convert": " \n Usage: python -m spacy convert [OPTIONS] INPUT_PATH [OUTPUT_DIR] \n \n Convert files into json or DocBin format for training. The resulting .spacy file can be used with \n the train command and other experiment management functions. \n \n If no output_dir is specified and the output format is JSON, the data is written to stdout, so you \n can pipe them forward to a JSON file: $ spacy convert some_file.conllu --file-type json > \n some_file.json \n DOCS: https://spacy.io/api/cli#convert \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * input_path TEXT Input file or directory [default: None] [required] │\n│ output_dir [OUTPUT_DIR] Output directory. '-' for stdout. 
[default: -] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --file-type -t [json|spacy] Type of data to produce [default: spacy] │\n│ --n-sents -n INTEGER Number of sentences per doc (0 to disable) │\n│ [default: 1] │\n│ --seg-sents -s Segment sentences (for -c ner) │\n│ --model,--base -b TEXT Trained spaCy pipeline for sentence segmentation to │\n│ use as base (for --seg-sents) │\n│ [default: None] │\n│ --morphology -m Enable appending morphology to tags │\n│ --merge-subtokens -T Merge CoNLL-U subtokens │\n│ --converter -c TEXT Converter: ('conllubio', 'conllu', 'conll', 'ner', │\n│ 'iob', 'json') │\n│ [default: auto] │\n│ --ner-map -nm PATH NER tag mapping (as JSON-encoded dict of entity types) │\n│ [default: None] │\n│ --lang -l TEXT Language (if tokenizer required) [default: None] │\n│ --concatenate -C Concatenate output to a single file │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug config": " \n Usage: python -m spacy debug config [OPTIONS] CONFIG_PATH \n \n Debug a config file and show validation errors. The command will create all objects in the tree \n and validate them. Note that some config validation errors are blocking and will prevent the rest \n of the config from being resolved. This means that you may not see all validation errors at once \n and some issues are only shown once previous errors have been fixed. Similar as with the 'train' \n command, you can override settings from the config as command line options. For instance, \n --training.batch_size 128 overrides the value of \"batch_size\" in the block \"[training]\". 
\n \n DOCS: https://spacy.io/api/cli#debug-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --show-functions -F Show an overview of all registered functions used in the │\n│ config and where they come from (modules, files etc.) │\n│ --show-variables -V Show an overview of all variables referenced in the config and │\n│ their values. This will also reflect variables overwritten on │\n│ the CLI. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug data": " \n Usage: python -m spacy debug data [OPTIONS] CONFIG_PATH \n \n Analyze, debug and validate your training and development data. Outputs useful stats, and can help \n you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. 
\n \n DOCS: https://spacy.io/api/cli#debug-data \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --ignore-warnings -IW Ignore warnings, only show stats and errors │\n│ --verbose -V Print additional information and explanations │\n│ --no-format -NF Don't pretty-print the results │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug diff-config": " \n Usage: python -m spacy debug diff-config [OPTIONS] CONFIG_PATH \n \n Show a diff of a config file with respect to spaCy's defaults or another config file. If \n additional settings were used in the creation of the config file, then you must supply these as \n extra parameters to the command when comparing to the default settings. The generated diff can \n also be used when posting to the discussion forum to provide more information for the maintainers. \n \n The `optimize`, `gpu`, and `pretraining` options are only relevant when comparing against the \n default configuration (or specifically when `compare_to` is None). 
\n DOCS: https://spacy.io/api/cli#debug-diff \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --compare-to PATH Path to a config file to diff against, or │\n│ `None` to compare against default settings │\n│ [default: None] │\n│ --optimize -o [efficiency|accuracy] Whether the user config was optimized for │\n│ efficiency or accuracy. Only relevant when │\n│ comparing against the default config. │\n│ [default: efficiency] │\n│ --gpu -G Whether the original config can run on a │\n│ GPU. Only relevant when comparing against │\n│ the default config. │\n│ --pretraining,--pt Whether to compare on a config with │\n│ pretraining involved. Only relevant when │\n│ comparing against the default config. │\n│ --markdown -md Generate Markdown for GitHub issues │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug model": " \n Usage: python -m spacy debug model [OPTIONS] CONFIG_PATH COMPONENT \n \n Analyze a Thinc model implementation. Includes checks for internal structure and activations \n during training. 
\n \n DOCS: https://spacy.io/api/cli#debug-model \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * component TEXT Name of the pipeline component of which the model should be analysed │\n│ [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --layers -l TEXT Comma-separated names of layer IDs to print │\n│ --dimensions -DIM Show dimensions │\n│ --parameters -PAR Show parameters │\n│ --gradients -GRAD Show gradients │\n│ --attributes -ATTR Show attributes │\n│ --print-step0 -P0 Print model before training │\n│ --print-step1 -P1 Print model after initialization │\n│ --print-step2 -P2 Print model after training │\n│ --print-step3 -P3 Print final predictions │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug profile": " \n Usage: python -m spacy debug profile [OPTIONS] MODEL [INPUTS] \n \n Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one \n JSON object per line with a key \"text\". It can either be provided as a JSONL file, or be read from \n sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. \n \n DOCS: https://spacy.io/api/cli#debug-profile \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Trained pipeline to load [default: None] [required] │\n│ inputs [INPUTS] Location of input file. '-' for stdin. 
[default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n-texts -n INTEGER Maximum number of texts to use if available [default: 10000] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug-data": " \n Usage: python -m spacy debug-data [OPTIONS] CONFIG_PATH \n \n Analyze, debug and validate your training and development data. Outputs useful stats, and can help \n you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. \n \n DOCS: https://spacy.io/api/cli#debug-data \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --ignore-warnings -IW Ignore warnings, only show stats and errors │\n│ --verbose -V Print additional information and explanations │\n│ --no-format -NF Don't pretty-print the results │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "download": " \n Usage: python -m spacy download [OPTIONS] MODEL \n \n Download compatible trained pipeline from the default download path using pip. If --direct flag is \n set, the command expects the full package name with version. For direct downloads, the \n compatibility check will be skipped. 
All additional arguments provided to this command will be \n passed to `pip install` on package installation. \n \n DOCS: https://spacy.io/api/cli#download AVAILABLE PACKAGES: https://spacy.io/models \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Name of pipeline package to download [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --direct -d,-D Force direct download of name + version │\n│ --sdist -S Download sdist (.tar.gz) archive instead of pre-built binary wheel │\n│ --url -U TEXT Download from given url [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "evaluate": " \n Usage: python -m spacy evaluate [OPTIONS] MODEL DATA_PATH \n \n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in the binary \n .spacy format. The --gold-preproc option sets up the evaluation examples with gold-standard \n sentences and tokens for the predictions. Gold preprocessing helps the annotations align to the \n tokenization, and may result in sequences of more consistent length. However, it may reduce \n runtime accuracy due to train/test skew. To render a sample of dependency parses in a HTML file, \n set as output directory as the displacy_path argument. 
\n \n DOCS: https://spacy.io/api/cli#benchmark-accuracy \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o FILE Output JSON file for metrics [default: None] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --displacy-path -dp DIRECTORY Directory to output rendered parses as HTML │\n│ [default: None] │\n│ --displacy-limit -dl INTEGER Limit of parses to render as HTML [default: 25] │\n│ --per-component -P Return scores per component, only applicable when an │\n│ output JSON file is specified. │\n│ --spans-key -sk TEXT Spans key to use when evaluating Doc.spans [default: sc] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "find-function": " \n Usage: python -m spacy find-function [OPTIONS] FUNC_NAME \n \n Find the module, path and line number to the file the registered function is defined in, if \n available. \n \n func_name (str): Name of the registered function. registry_name (Optional[str]): Name of the \n catalogue registry. \n DOCS: https://spacy.io/api/cli#find-function \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * func_name TEXT Name of the registered function. 
[default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --registry -r TEXT Name of the catalogue registry. [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "find-threshold": " \n Usage: python -m spacy find-threshold [OPTIONS] MODEL DATA_PATH PIPE_NAME \n THRESHOLD_KEY SCORES_KEY \n \n Runs prediction trials for a trained model with varying thresholds to maximize the specified \n metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. \n Results are displayed in a table on `stdout` (the corresponding API call to \n `spacy.cli.find_threshold.find_threshold()` returns all results). \n \n This is applicable only for components whose predictions are influenced by thresholds - e.g. \n `textcat_multilabel` and `spancat`, but not `textcat`. Note that the full path to the \n corresponding threshold attribute in the config has to be provided. 
\n DOCS: https://spacy.io/api/cli#find-threshold \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format │\n│ [default: None] │\n│ [required] │\n│ * pipe_name TEXT Name of pipe to examine thresholds for [default: None] [required] │\n│ * threshold_key TEXT Key of threshold attribute in component's configuration │\n│ [default: None] │\n│ [required] │\n│ * scores_key TEXT Metric to optimize [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n_trials -n INTEGER Number of trials to determine optimal thresholds │\n│ [default: 11] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "info": " \n Usage: python -m spacy info [OPTIONS] [MODEL] \n \n Print info about spaCy installation. If a pipeline is specified as an argument, print its meta \n information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. \n \n Flag --url prints only the download URL of the most recent compatible version of the pipeline. 
\n DOCS: https://spacy.io/api/cli#info \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ model [MODEL] Optional loadable spaCy pipeline [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --markdown -md Generate Markdown for GitHub issues │\n│ --silent -s,-S Don't print anything (just return) │\n│ --exclude -e TEXT Comma-separated keys to exclude from the print-out │\n│ [default: labels] │\n│ --url -u Print the URL to download the most recent compatible version of the │\n│ pipeline │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init config": " \n Usage: python -m spacy init config [OPTIONS] OUTPUT_FILE \n \n Generate a starter config file for training. Based on your requirements specified via the CLI \n arguments, this command generates a config with the optimal settings for your use case. This \n includes the choice of architecture, pretrained weights and related hyperparameters. 
\n \n DOCS: https://spacy.io/api/cli#init-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * output_file PATH File to save the config to or - for stdout (will only output config │\n│ and no additional logging info) │\n│ [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --lang -l TEXT Two-letter code of the language to use │\n│ [default: en] │\n│ --pipeline -p TEXT Comma-separated names of trainable pipeline │\n│ components to include (without 'tok2vec' or │\n│ 'transformer') │\n│ [default: tagger,parser,ner] │\n│ --optimize -o [efficiency|accuracy] Whether to optimize for efficiency (faster │\n│ inference, smaller model, lower memory │\n│ consumption) or higher accuracy (potentially │\n│ larger and slower model). This will impact the │\n│ choice of architecture, pretrained weights and │\n│ related hyperparameters. │\n│ [default: efficiency] │\n│ --gpu -G Whether the model can run on GPU. This will │\n│ impact the choice of architecture, pretrained │\n│ weights and related hyperparameters. │\n│ --pretraining -pt Include config for pretraining (with 'spacy │\n│ pretrain') │\n│ --force -F Force overwriting the output file │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init fill-config": " \n Usage: python -m spacy init fill-config [OPTIONS] BASE_PATH [OUTPUT_FILE] \n \n Fill partial config file with default values. Will add all missing settings from the default \n config and will create all objects, check the registered functions for their default values and \n update the base config. 
This command can be used with a config generated via the training \n quickstart widget: https://spacy.io/usage/training#quickstart \n \n DOCS: https://spacy.io/api/cli#init-fill-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * base_path FILE Path to base config to fill [default: None] [required] │\n│ output_file [OUTPUT_FILE] Path to output .cfg file (or - for stdout) [default: -] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --pretraining -pt Include config for pretraining (with 'spacy pretrain') │\n│ --diff -D Print a visual diff highlighting the changes │\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init labels": " \n Usage: python -m spacy init labels [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n Generate JSON files for the labels in the data. This helps speed up the training process, since \n spaCy won't have to preprocess the data to extract the labels. 
\n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * output_path PATH Output directory for the labels [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init nlp": " \n Usage: python -m spacy init nlp [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * output_path PATH Output directory for the prepared data [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init vectors": " \n Usage: python -m spacy init vectors [OPTIONS] LANG VECTORS_LOC OUTPUT_DIR \n \n Convert word vectors for use with spaCy. 
Will export an nlp object that you can use in the \n [initialize] block of your config to initialize a model with vectors. \n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * lang TEXT The language of the nlp object to create [default: None] [required] │\n│ * vectors_loc PATH Vectors file in Word2Vec format [default: None] [required] │\n│ * output_dir PATH Pipeline output directory [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --prune -p INTEGER Optional number of vectors to prune to [default: -1] │\n│ --truncate -t INTEGER Optional number of vectors to truncate to when reading in │\n│ vectors file │\n│ [default: 0] │\n│ --mode -m TEXT Vectors mode: default or floret [default: default] │\n│ --name -n TEXT Optional name for the word vectors, e.g. en_core_web_lg.vectors │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --attr -a TEXT Optional token attribute to use for vectors, e.g. LOWER or NORM │\n│ [default: ORTH] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "link": " \n Usage: python -m spacy link [OPTIONS] ARGS KWARGS \n \n (deprecated) \n As of spaCy v3.0, symlinks like \"en\" are not supported anymore. You can load trained pipeline \n packages using their full names or from a directory path. 
\n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * args TEXT [default: None] [required] │\n│ * kwargs TEXT [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "package": " \n Usage: python -m spacy package [OPTIONS] INPUT_DIR OUTPUT_DIR \n \n Generate an installable Python package for a pipeline. Includes binary data, meta and required \n installation files. A new directory will be created in the specified output directory, and the \n data will be copied over. If --create-meta is set and a meta.json already exists in the output \n directory, the existing values will be used as the defaults in the command-line prompt. After \n packaging, \"python -m build --sdist\" is run in the package directory, which will create a .tar.gz \n archive that can be installed via \"pip install\". \n \n If additional code files are provided (e.g. Python files containing custom registered functions \n like pipeline components), they are copied into the package and imported in the __init__.py. 
\n DOCS: https://spacy.io/api/cli#package \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * input_dir DIRECTORY Directory with pipeline data [default: None] [required] │\n│ * output_dir DIRECTORY Output parent directory [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c TEXT Comma-separated paths to Python file │\n│ with additional code (registered │\n│ functions) to be included in the package │\n│ --meta-path,--meta -m FILE Path to meta.json [default: None] │\n│ --create-meta -C Create meta.json, even if one exists │\n│ --name -n TEXT Package name to override meta │\n│ [default: None] │\n│ --version -v TEXT Package version to override meta │\n│ [default: None] │\n│ --build -b TEXT Comma-separated formats to build: sdist │\n│ and/or wheel, or none. │\n│ [default: sdist] │\n│ --force -f,-F Force overwriting existing data in │\n│ output directory │\n│ --require-parent -R,-R --no-require-parent Include the parent package (e.g. spacy) │\n│ in the requirements │\n│ [default: require-parent] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "pretrain": " \n Usage: python -m spacy pretrain [OPTIONS] CONFIG_PATH OUTPUT_DIR \n \n Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using an approximate \n language-modelling objective. Two objective types are available, vector-based and character-based. \n \n In the vector-based objective, we load word vectors that have been trained using a word2vec-style \n distributional similarity algorithm, and train a component like a CNN, BiLSTM, etc to predict \n vectors which match the pretrained ones. The weights are saved to a directory after each epoch. 
\n You can then pass a path to one of these pretrained weights files to the 'spacy train' command. \n This technique may be especially helpful if you have little labelled data. However, it's still \n quite experimental, so your mileage may vary. \n To load the weights back in during 'spacy train', you need to ensure all settings are the same \n between pretraining and training. Ideally, this is done by using the same config file for both \n commands. \n DOCS: https://spacy.io/api/cli#pretrain \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path FILE Path to config file [default: None] [required] │\n│ * output_dir PATH Directory to write weights to on each epoch [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --resume-path -r PATH Path to pretrained weights from which to resume pretraining │\n│ [default: None] │\n│ --epoch-resume -er INTEGER The epoch to resume counting from when using --resume-path. │\n│ Prevents unintended overwriting of existing weight files. │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --skip-last -L Skip saving model-last.bin │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "profile": " \n Usage: python -m spacy profile [OPTIONS] MODEL [INPUTS] \n \n Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one \n JSON object per line with a key \"text\". It can either be provided as a JSONL file, or be read from \n sys.sytdin. 
If no input file is specified, the IMDB dataset is loaded via Thinc. \n \n DOCS: https://spacy.io/api/cli#debug-profile \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Trained pipeline to load [default: None] [required] │\n│ inputs [INPUTS] Location of input file. '-' for stdin. [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n-texts -n INTEGER Maximum number of texts to use if available [default: 10000] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project assets": " \n Usage: python -m spacy project assets [OPTIONS] [PROJECT_DIR] \n \n Fetch project assets like datasets and pretrained weights. Assets are defined in the \"assets\" \n section of the project.yml. If a checksum is provided in the project.yml, the file is only \n downloaded if no local file with the same checksum exists. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/tutorial/directory-and-assets.md \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Path to cloned project. Defaults to current working directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --sparse -S Use sparse checkout for assets provided via Git, to only check out and clone │\n│ the files needed. Requires Git v22.2+. │\n│ --extra -e Download all assets, including those marked as 'extra'. │\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project clone": " \n Usage: python -m spacy project clone [OPTIONS] NAME [DEST] \n \n Clone a project template from a repository. Calls into \"git\" and will only download the files from \n the given subdirectory. The GitHub repo defaults to the official Weasel template repo, but can be \n customized (including using a private repo). \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#clipboard-clone \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * name TEXT The name of the template to clone [default: None] [required] │\n│ dest [DEST] Where to clone the project. Defaults to current working directory │\n│ [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --repo -r TEXT The repository to clone from │\n│ [default: https://github.com/explosion/projects] │\n│ --branch -b TEXT The branch to clone from. If not provided, will attempt main, master │\n│ [default: None] │\n│ --sparse -S Use sparse Git checkout to only check out and clone the files needed. │\n│ Requires Git v22.2+. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project document": " \n Usage: python -m spacy project document [OPTIONS] [PROJECT_DIR] \n \n Auto-generate a README.md for a project. If the content is saved to a file, hidden markers are \n added so you can add custom content before or after the auto-generated section and only the \n auto-generated docs will be replaced when you re-run the command. 
\n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#closed_book-document \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Path to cloned project. Defaults to current working directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o PATH Path to output Markdown file for output. Defaults to - for standard │\n│ output │\n│ [default: -] │\n│ --no-emoji -NE Don't use emoji │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project dvc": " \n Usage: python -m spacy project dvc [OPTIONS] [PROJECT_DIR] [WORKFLOW] \n \n Auto-generate Data Version Control (DVC) config. A DVC project can only define one pipeline, so \n you need to specify one workflow defined in the project.yml. If no workflow is specified, the \n first defined workflow is used. The DVC config will only be updated if the project.yml changed. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#repeat-dvc \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n│ workflow [WORKFLOW] Name of workflow defined in project.yml. Defaults to first │\n│ workflow if not set. 
│\n│ [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --verbose -V Print more info │\n│ --quiet -q Print less info │\n│ --force -F Force update DVC config │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project pull": " \n Usage: python -m spacy project pull [OPTIONS] [REMOTE] [PROJECT_DIR] \n \n Retrieve available precomputed outputs from a remote storage. You can alias remotes in your \n project.yml by mapping them to storage paths. A storage can be anything that the smart_open \n library can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_down-push \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ remote [REMOTE] Name or path of remote storage [default: default] │\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project push": " \n Usage: python -m spacy project push [OPTIONS] [REMOTE] [PROJECT_DIR] \n \n Persist outputs to a remote storage. You can alias remotes in your project.yml by mapping them to \n storage paths. A storage can be anything that the smart_open library can upload to, e.g. AWS, \n Google Cloud Storage, SSH, local directories etc. 
\n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_up-push \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ remote [REMOTE] Name or path of remote storage [default: default] │\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "train": " \n Usage: python -m spacy train [OPTIONS] CONFIG_PATH \n \n Train or update a spaCy pipeline. Requires data in spaCy's binary format. To convert data from \n other formats, use the `spacy convert` command. The config file includes all settings and \n hyperparameters used during training. To override settings in the config, e.g. settings that point \n to local paths or that you want to experiment with, you can override them as command line options. \n For instance, --training.batch_size 128 overrides the value of \"batch_size\" in the block \n \"[training]\". The --code argument lets you pass in a Python file that's imported before training. \n It can be used to register custom functions and architectures that can then be referenced in the \n config. 
\n \n DOCS: https://spacy.io/api/cli#train \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output,--output-path -o PATH Output directory to store trained pipeline in │\n│ [default: None] │\n│ --code -c PATH Path to Python file with additional code │\n│ (registered functions) to be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "validate": " \n Usage: python -m spacy validate [OPTIONS] \n \n Validate the currently installed pipeline packages and spaCy version. Checks if the installed \n packages are compatible and shows upgrade instructions if available. Should be run after `pip \n install -U spacy`. \n \n DOCS: https://spacy.io/api/cli#validate \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + }, + "errors": { + "missing_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ Missing command. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "unknown_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_COMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "unknown_subcommand": { + "benchmark": " \n Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]... \n \n Commands for benchmarking pipelines. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ accuracy Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in │\n│ the binary .spacy format. The --gold-preproc option sets up the evaluation examples │\n│ with gold-standard sentences and tokens for the predictions. Gold preprocessing helps │\n│ the annotations align to the tokenization, and may result in sequences of more │\n│ consistent length. However, it may reduce runtime accuracy due to train/test skew. To │\n│ render a sample of dependency parses in a HTML file, set as output directory as the │\n│ displacy_path argument. │\n│ speed Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in the │\n│ binary .spacy format. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug": " \n Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]... \n \n Suite of helpful commands for debugging and profiling. 
Includes commands to check and validate \n your config files, training and evaluation data, and custom model implementations. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ data Analyze, debug and validate your training and development data. Outputs useful │\n│ stats, and can help you find problems like invalid entity annotations, cyclic │\n│ dependencies, low data labels and more. │\n│ profile Profile which functions take the most time in a spaCy pipeline. Input should be │\n│ formatted as one JSON object per line with a key \"text\". It can either be provided │\n│ as a JSONL file, or be read from sys.sytdin. If no input file is specified, the │\n│ IMDB dataset is loaded via Thinc. │\n│ config Debug a config file and show validation errors. The command will create all │\n│ objects in the tree and validate them. Note that some config validation errors are │\n│ blocking and will prevent the rest of the config from being resolved. This means │\n│ that you may not see all validation errors at once and some issues are only shown │\n│ once previous errors have been fixed. Similar as with the 'train' command, you can │\n│ override settings from the config as command line options. For instance, │\n│ --training.batch_size 128 overrides the value of \"batch_size\" in the block │\n│ \"[training]\". │\n│ diff-config Show a diff of a config file with respect to spaCy's defaults or another config │\n│ file. If additional settings were used in the creation of the config file, then │\n│ you must supply these as extra parameters to the command when comparing to the │\n│ default settings. 
The generated diff can also be used when posting to the │\n│ discussion forum to provide more information for the maintainers. │\n│ model Analyze a Thinc model implementation. Includes checks for internal structure and │\n│ activations during training. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init": " \n Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]... \n \n Commands for initializing configs and pipeline packages. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ config Generate a starter config file for training. Based on your requirements specified │\n│ via the CLI arguments, this command generates a config with the optimal settings │\n│ for your use case. This includes the choice of architecture, pretrained weights │\n│ and related hyperparameters. │\n│ fill-config Fill partial config file with default values. Will add all missing settings from │\n│ the default config and will create all objects, check the registered functions for │\n│ their default values and update the base config. This command can be used with a │\n│ config generated via the training quickstart widget: │\n│ https://spacy.io/usage/training#quickstart │\n│ vectors Convert word vectors for use with spaCy. Will export an nlp object that you can │\n│ use in the [initialize] block of your config to initialize a model with vectors. │\n│ labels Generate JSON files for the labels in the data. This helps speed up the training │\n│ process, since spaCy won't have to preprocess the data to extract the labels. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project": " \n Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]... \n \n Command-line interface for spaCy projects and templates. You'd typically start by cloning a \n project template to a local directory and fetching its assets like datasets etc. See the project's \n project.yml for the available commands. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ assets Fetch project assets like datasets and pretrained weights. Assets are defined in the │\n│ \"assets\" section of the project.yml. If a checksum is provided in the project.yml, │\n│ the file is only downloaded if no local file with the same checksum exists. │\n│ clone Clone a project template from a repository. Calls into \"git\" and will only download │\n│ the files from the given subdirectory. The GitHub repo defaults to the official │\n│ Weasel template repo, but can be customized (including using a private repo). │\n│ document Auto-generate a README.md for a project. If the content is saved to a file, hidden │\n│ markers are added so you can add custom content before or after the auto-generated │\n│ section and only the auto-generated docs will be replaced when you re-run the │\n│ command. │\n│ dvc Auto-generate Data Version Control (DVC) config. A DVC project can only define one │\n│ pipeline, so you need to specify one workflow defined in the project.yml. If no │\n│ workflow is specified, the first defined workflow is used. The DVC config will only │\n│ be updated if the project.yml changed. │\n│ run Run a named command or workflow defined in the project.yml. 
If a workflow name is │\n│ specified, all commands in the workflow are run, in order. If commands define │\n│ dependencies and/or outputs, they will only be re-run if state has changed. │\n│ pull Retrieve available precomputed outputs from a remote storage. You can alias remotes │\n│ in your project.yml by mapping them to storage paths. A storage can be anything that │\n│ the smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH, local │\n│ directories etc. │\n│ push Persist outputs to a remote storage. You can alias remotes in your project.yml by │\n│ mapping them to storage paths. A storage can be anything that the smart_open library │\n│ can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + } + }, + "group_help": { + "benchmark": " \n Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]... \n \n Commands for benchmarking pipelines. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ accuracy Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in │\n│ the binary .spacy format. The --gold-preproc option sets up the evaluation examples │\n│ with gold-standard sentences and tokens for the predictions. Gold preprocessing helps │\n│ the annotations align to the tokenization, and may result in sequences of more │\n│ consistent length. However, it may reduce runtime accuracy due to train/test skew. To │\n│ render a sample of dependency parses in a HTML file, set as output directory as the │\n│ displacy_path argument. │\n│ speed Benchmark a pipeline. 
Expects a loadable spaCy pipeline and benchmark data in the │\n│ binary .spacy format. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug": " \n Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]... \n \n Suite of helpful commands for debugging and profiling. Includes commands to check and validate \n your config files, training and evaluation data, and custom model implementations. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ data Analyze, debug and validate your training and development data. Outputs useful │\n│ stats, and can help you find problems like invalid entity annotations, cyclic │\n│ dependencies, low data labels and more. │\n│ profile Profile which functions take the most time in a spaCy pipeline. Input should be │\n│ formatted as one JSON object per line with a key \"text\". It can either be provided │\n│ as a JSONL file, or be read from sys.sytdin. If no input file is specified, the │\n│ IMDB dataset is loaded via Thinc. │\n│ config Debug a config file and show validation errors. The command will create all │\n│ objects in the tree and validate them. Note that some config validation errors are │\n│ blocking and will prevent the rest of the config from being resolved. This means │\n│ that you may not see all validation errors at once and some issues are only shown │\n│ once previous errors have been fixed. Similar as with the 'train' command, you can │\n│ override settings from the config as command line options. For instance, │\n│ --training.batch_size 128 overrides the value of \"batch_size\" in the block │\n│ \"[training]\". 
│\n│ diff-config Show a diff of a config file with respect to spaCy's defaults or another config │\n│ file. If additional settings were used in the creation of the config file, then │\n│ you must supply these as extra parameters to the command when comparing to the │\n│ default settings. The generated diff can also be used when posting to the │\n│ discussion forum to provide more information for the maintainers. │\n│ model Analyze a Thinc model implementation. Includes checks for internal structure and │\n│ activations during training. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init": " \n Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]... \n \n Commands for initializing configs and pipeline packages. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ config Generate a starter config file for training. Based on your requirements specified │\n│ via the CLI arguments, this command generates a config with the optimal settings │\n│ for your use case. This includes the choice of architecture, pretrained weights │\n│ and related hyperparameters. │\n│ fill-config Fill partial config file with default values. Will add all missing settings from │\n│ the default config and will create all objects, check the registered functions for │\n│ their default values and update the base config. This command can be used with a │\n│ config generated via the training quickstart widget: │\n│ https://spacy.io/usage/training#quickstart │\n│ vectors Convert word vectors for use with spaCy. Will export an nlp object that you can │\n│ use in the [initialize] block of your config to initialize a model with vectors. 
│\n│ labels Generate JSON files for the labels in the data. This helps speed up the training │\n│ process, since spaCy won't have to preprocess the data to extract the labels. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project": " \n Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]... \n \n Command-line interface for spaCy projects and templates. You'd typically start by cloning a \n project template to a local directory and fetching its assets like datasets etc. See the project's \n project.yml for the available commands. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ assets Fetch project assets like datasets and pretrained weights. Assets are defined in the │\n│ \"assets\" section of the project.yml. If a checksum is provided in the project.yml, │\n│ the file is only downloaded if no local file with the same checksum exists. │\n│ clone Clone a project template from a repository. Calls into \"git\" and will only download │\n│ the files from the given subdirectory. The GitHub repo defaults to the official │\n│ Weasel template repo, but can be customized (including using a private repo). │\n│ document Auto-generate a README.md for a project. If the content is saved to a file, hidden │\n│ markers are added so you can add custom content before or after the auto-generated │\n│ section and only the auto-generated docs will be replaced when you re-run the │\n│ command. │\n│ dvc Auto-generate Data Version Control (DVC) config. A DVC project can only define one │\n│ pipeline, so you need to specify one workflow defined in the project.yml. 
If no │\n│ workflow is specified, the first defined workflow is used. The DVC config will only │\n│ be updated if the project.yml changed. │\n│ run Run a named command or workflow defined in the project.yml. If a workflow name is │\n│ specified, all commands in the workflow are run, in order. If commands define │\n│ dependencies and/or outputs, they will only be re-run if state has changed. │\n│ pull Retrieve available precomputed outputs from a remote storage. You can alias remotes │\n│ in your project.yml by mapping them to storage paths. A storage can be anything that │\n│ the smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH, local │\n│ directories etc. │\n│ push Persist outputs to a remote storage. You can alias remotes in your project.yml by │\n│ mapping them to storage paths. A storage can be anything that the smart_open library │\n│ can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + }, + "hidden_group_commands": { + "benchmark": [], + "debug": [], + "init": [ + "nlp" + ], + "project": [] + }, + "hidden_top_level": [ + "link", + "debug-data", + "profile" + ], + "known_groups": { + "benchmark": [ + "accuracy", + "speed" + ], + "debug": [ + "config", + "data", + "diff-config", + "model", + "profile" + ], + "init": [ + "config", + "fill-config", + "labels", + "nlp", + "vectors" + ], + "project": [ + "assets", + "clone", + "document", + "dvc", + "pull", + "push", + "run" + ] + }, + "known_top_level": [ + "apply", + "assemble", + "benchmark", + "convert", + "debug", + "debug-data", + "download", + "evaluate", + "find-function", + "find-threshold", + "info", + "init", + "link", + "package", + "pretrain", + "profile", + "project", + "train", + "validate" + ], + "root_help": " \n Usage: python -m spacy [OPTIONS] COMMAND [ARGS]... 
\n \n spaCy Command-line Interface \n \n DOCS: https://spacy.io/api/cli \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --install-completion Install completion for the current shell. │\n│ --show-completion Show completion for the current shell, to copy it or customize the │\n│ installation. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ download Download compatible trained pipeline from the default download path using pip. │\n│ If --direct flag is set, the command expects the full package name with │\n│ version. For direct downloads, the compatibility check will be skipped. All │\n│ additional arguments provided to this command will be passed to `pip install` │\n│ on package installation. │\n│ info Print info about spaCy installation. If a pipeline is specified as an argument, │\n│ print its meta information. Flag --markdown prints details in Markdown for easy │\n│ copy-pasting to GitHub issues. │\n│ apply Apply a trained pipeline to documents to get predictions. Expects a loadable │\n│ spaCy pipeline and path to the data, which can be a directory or a file. The │\n│ data files can be provided in multiple formats: 1. .spacy files 2. │\n│ .jsonl files with a specified \"field\" to read the text from. 3. Files with │\n│ any other extension are assumed to be containing a single document. │\n│ DOCS: https://spacy.io/api/cli#apply │\n│ assemble Assemble a spaCy pipeline from a config file. The config file includes all │\n│ settings for initializing the pipeline. To override settings in the config, │\n│ e.g. settings that point to local paths or that you want to experiment with, │\n│ you can override them as command line options. 
The --code argument lets you │\n│ pass in a Python file that can be used to register custom functions that are │\n│ referenced in the config. │\n│ convert Convert files into json or DocBin format for training. The resulting .spacy │\n│ file can be used with the train command and other experiment management │\n│ functions. │\n│ evaluate Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation │\n│ data in the binary .spacy format. The --gold-preproc option sets up the │\n│ evaluation examples with gold-standard sentences and tokens for the │\n│ predictions. Gold preprocessing helps the annotations align to the │\n│ tokenization, and may result in sequences of more consistent length. However, │\n│ it may reduce runtime accuracy due to train/test skew. To render a sample of │\n│ dependency parses in a HTML file, set as output directory as the displacy_path │\n│ argument. │\n│ find-function Find the module, path and line number to the file the registered function is │\n│ defined in, if available. │\n│ find-threshold Runs prediction trials for a trained model with varying thresholds to maximize │\n│ the specified metric. The search space for the threshold is traversed linearly │\n│ from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` │\n│ (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` │\n│ returns all results). │\n│ package Generate an installable Python package for a pipeline. Includes binary data, │\n│ meta and required installation files. A new directory will be created in the │\n│ specified output directory, and the data will be copied over. If --create-meta │\n│ is set and a meta.json already exists in the output directory, the existing │\n│ values will be used as the defaults in the command-line prompt. After │\n│ packaging, \"python -m build --sdist\" is run in the package directory, which │\n│ will create a .tar.gz archive that can be installed via \"pip install\". 
│\n│ pretrain Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using │\n│ an approximate language-modelling objective. Two objective types are available, │\n│ vector-based and character-based. │\n│ train Train or update a spaCy pipeline. Requires data in spaCy's binary format. To │\n│ convert data from other formats, use the `spacy convert` command. The config │\n│ file includes all settings and hyperparameters used during training. To │\n│ override settings in the config, e.g. settings that point to local paths or │\n│ that you want to experiment with, you can override them as command line │\n│ options. For instance, --training.batch_size 128 overrides the value of │\n│ \"batch_size\" in the block \"[training]\". The --code argument lets you pass in a │\n│ Python file that's imported before training. It can be used to register custom │\n│ functions and architectures that can then be referenced in the config. │\n│ validate Validate the currently installed pipeline packages and spaCy version. Checks if │\n│ the installed packages are compatible and shows upgrade instructions if │\n│ available. Should be run after `pip install -U spacy`. │\n│ debug Suite of helpful commands for debugging and profiling. Includes commands to │\n│ check and validate your config files, training and evaluation data, and custom │\n│ model implementations. │\n│ benchmark Commands for benchmarking pipelines. │\n│ init Commands for initializing configs and pipeline packages. │\n│ project Command-line interface for spaCy projects and templates. You'd typically start │\n│ by cloning a project template to a local directory and fetching its assets like │\n│ datasets etc. See the project's project.yml for the available commands. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" +} diff --git a/spacy_cli/main.py b/spacy_cli/main.py new file mode 100644 index 00000000000..cbe6e376c58 --- /dev/null +++ b/spacy_cli/main.py @@ -0,0 +1,69 @@ +import sys +from typing import Iterable, Optional + +from .static import HELP_OPTIONS, UNKNOWN_COMMAND_TOKEN, UNKNOWN_SUBCOMMAND_TOKEN +from .static import get_plugin_command_names, load_manifest + + +def _write_output(text: str) -> None: + sys.stdout.write(text) + if not text.endswith("\n"): + sys.stdout.write("\n") + + +def _run_live() -> None: + from spacy.cli import setup_cli + + setup_cli() + + +def _try_static(argv: Iterable[str]): + args = list(argv) + manifest = load_manifest() + plugin_command_names = get_plugin_command_names() + known_groups = manifest["known_groups"] + known_top_level = set(manifest["known_top_level"]) + if not args: + return manifest["errors"]["missing_command"], 2 + first = args[0] + if first in HELP_OPTIONS: + if plugin_command_names: + return None + return manifest["root_help"], 0 + if first.startswith("-"): + return None + if first not in known_top_level: + if first in plugin_command_names: + return None + template = manifest["errors"]["unknown_command"] + return template.replace(UNKNOWN_COMMAND_TOKEN, first), 2 + if first in known_groups: + if len(args) == 1 or args[1] in HELP_OPTIONS: + if plugin_command_names: + return None + return manifest["group_help"][first], 0 + second = args[1] + if second not in known_groups[first]: + if plugin_command_names: + return None + template = manifest["errors"]["unknown_subcommand"][first] + return template.replace(UNKNOWN_SUBCOMMAND_TOKEN, second), 2 + if any(arg in HELP_OPTIONS for arg in args[2:]): + return manifest["command_help"][f"{first} {second}"], 0 + return None + if any(arg in HELP_OPTIONS for arg in args[1:]): + return manifest["command_help"][first], 0 + return None + + +def main(argv: Optional[Iterable[str]] = None) 
-> None: + args = sys.argv[1:] if argv is None else list(argv) + try: + static_result = _try_static(args) + except Exception: + return _run_live() + if static_result is None: + return _run_live() + text, code = static_result + _write_output(text) + raise SystemExit(code) diff --git a/spacy_cli/static.py b/spacy_cli/static.py new file mode 100644 index 00000000000..51594ceef9a --- /dev/null +++ b/spacy_cli/static.py @@ -0,0 +1,24 @@ +import json +from functools import lru_cache +from importlib.metadata import entry_points +from importlib.resources import files +from typing import Any, Dict, Set + + +HELP_OPTIONS = {"--help", "-h"} +PLUGIN_ENTRY_POINT_GROUP = "spacy_cli" +MANIFEST_FILE = "cli_manifest.json" +UNKNOWN_COMMAND_TOKEN = "__SPACY_UNKNOWN_COMMAND__" +UNKNOWN_SUBCOMMAND_TOKEN = "__SPACY_UNKNOWN_SUBCOMMAND__" + + +@lru_cache(maxsize=1) +def load_manifest() -> Dict[str, Any]: + data = files("spacy_cli").joinpath(MANIFEST_FILE).read_text(encoding="utf8") + return json.loads(data) + + +def get_plugin_command_names() -> Set[str]: + return { + entry_point.name for entry_point in entry_points(group=PLUGIN_ENTRY_POINT_GROUP) + } From c7d7a724f22e352547d3706dfdcc35e6487a0c5d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Mar 2026 14:32:31 +0100 Subject: [PATCH 02/42] Fix lazy load on modules where the function shadows --- spacy/cli/__init__.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index dcfb4b8a92e..ded45efe9f7 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,4 +1,5 @@ import sys +import types from importlib import import_module from typing import Iterable @@ -79,6 +80,19 @@ def __dir__(): return sorted(set(globals()) | set(PUBLIC_ATTRS)) +class _CLIModule(types.ModuleType): + def __setattr__(self, name, value): + if isinstance(value, types.ModuleType) and name in PUBLIC_ATTRS: + _, attr_name = PUBLIC_ATTRS[name] + if attr_name is not None: + 
super().__setattr__(name, getattr(value, attr_name)) + return + super().__setattr__(name, value) + + +sys.modules[__name__].__class__ = _CLIModule + + @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) def link(*args, **kwargs): """As of spaCy v3.0, symlinks like "en" are not supported anymore. You can load trained From 126deace20c1e9382917e675361b6448188cfcf5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Mar 2026 14:32:41 +0100 Subject: [PATCH 03/42] Update manifest --- spacy_cli/cli_manifest.json | 78 ++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/spacy_cli/cli_manifest.json b/spacy_cli/cli_manifest.json index 361a10dca16..12d760416c5 100644 --- a/spacy_cli/cli_manifest.json +++ b/spacy_cli/cli_manifest.json @@ -1,55 +1,55 @@ { "command": "python -m spacy", "command_help": { - "apply": " \n Usage: python -m spacy apply [OPTIONS] MODEL DATA_PATH OUTPUT_FILE \n \n Apply a trained pipeline to documents to get predictions. Expects a loadable spaCy pipeline and \n path to the data, which can be a directory or a file. The data files can be provided in multiple \n formats: 1. .spacy files 2. .jsonl files with a specified \"field\" to read the text from. \n 3. Files with any other extension are assumed to be containing a single document. DOCS: \n https://spacy.io/api/cli#apply \n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of the documents to predict on. Can be a single file in │\n│ .spacy format or a .jsonl file. Files with other extensions are │\n│ treated as single plain text documents. If a directory is provided │\n│ it is traversed recursively to grab all files to be processed. The │\n│ files can be a mixture of .spacy, .jsonl and text files. 
If .jsonl │\n│ is provided the specified field is going to be grabbed (\"text\" by │\n│ default). │\n│ [default: None] │\n│ [required] │\n│ * output_file FILE Path to save the resulting .spacy file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ [default: None] │\n│ --text-key -tk TEXT Key containing text string for JSONL [default: text] │\n│ --force -F Force overwriting the output file │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU. [default: -1] │\n│ --batch-size -b INTEGER Batch size. [default: 1] │\n│ --n-process -n INTEGER number of processors to use. [default: 1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "assemble": " \n Usage: python -m spacy assemble [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n Assemble a spaCy pipeline from a config file. The config file includes all settings for \n initializing the pipeline. To override settings in the config, e.g. settings that point to local \n paths or that you want to experiment with, you can override them as command line options. The \n --code argument lets you pass in a Python file that can be used to register custom functions that \n are referenced in the config. 
\n \n DOCS: https://spacy.io/api/cli#assemble \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * output_path PATH Output directory to store assembled pipeline in [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) to │\n│ be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "benchmark accuracy": " \n Usage: python -m spacy benchmark accuracy [OPTIONS] MODEL DATA_PATH \n \n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in the binary \n .spacy format. The --gold-preproc option sets up the evaluation examples with gold-standard \n sentences and tokens for the predictions. Gold preprocessing helps the annotations align to the \n tokenization, and may result in sequences of more consistent length. However, it may reduce \n runtime accuracy due to train/test skew. To render a sample of dependency parses in a HTML file, \n set as output directory as the displacy_path argument. 
\n \n DOCS: https://spacy.io/api/cli#benchmark-accuracy \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o FILE Output JSON file for metrics [default: None] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --displacy-path -dp DIRECTORY Directory to output rendered parses as HTML │\n│ [default: None] │\n│ --displacy-limit -dl INTEGER Limit of parses to render as HTML [default: 25] │\n│ --per-component -P Return scores per component, only applicable when an │\n│ output JSON file is specified. │\n│ --spans-key -sk TEXT Spans key to use when evaluating Doc.spans [default: sc] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "benchmark speed": " \n Usage: python -m spacy benchmark speed [OPTIONS] MODEL DATA_PATH \n \n Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in the binary .spacy \n format. 
\n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --batch-size -b INTEGER RANGE [x>=1] Override the pipeline batch size [default: None] │\n│ --no-shuffle Do not shuffle benchmark data │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --batches INTEGER RANGE [x>=30] Minimum number of batches to benchmark │\n│ [default: 50] │\n│ --warmup -w INTEGER RANGE [x>=0] Number of iterations over the data for warmup │\n│ [default: 3] │\n│ --code -c PATH Path to Python file with additional code │\n│ (registered functions) to be imported │\n│ [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "convert": " \n Usage: python -m spacy convert [OPTIONS] INPUT_PATH [OUTPUT_DIR] \n \n Convert files into json or DocBin format for training. The resulting .spacy file can be used with \n the train command and other experiment management functions. \n \n If no output_dir is specified and the output format is JSON, the data is written to stdout, so you \n can pipe them forward to a JSON file: $ spacy convert some_file.conllu --file-type json > \n some_file.json \n DOCS: https://spacy.io/api/cli#convert \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * input_path TEXT Input file or directory [default: None] [required] │\n│ output_dir [OUTPUT_DIR] Output directory. '-' for stdout. 
[default: -] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --file-type -t [json|spacy] Type of data to produce [default: spacy] │\n│ --n-sents -n INTEGER Number of sentences per doc (0 to disable) │\n│ [default: 1] │\n│ --seg-sents -s Segment sentences (for -c ner) │\n│ --model,--base -b TEXT Trained spaCy pipeline for sentence segmentation to │\n│ use as base (for --seg-sents) │\n│ [default: None] │\n│ --morphology -m Enable appending morphology to tags │\n│ --merge-subtokens -T Merge CoNLL-U subtokens │\n│ --converter -c TEXT Converter: ('conllubio', 'conllu', 'conll', 'ner', │\n│ 'iob', 'json') │\n│ [default: auto] │\n│ --ner-map -nm PATH NER tag mapping (as JSON-encoded dict of entity types) │\n│ [default: None] │\n│ --lang -l TEXT Language (if tokenizer required) [default: None] │\n│ --concatenate -C Concatenate output to a single file │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug config": " \n Usage: python -m spacy debug config [OPTIONS] CONFIG_PATH \n \n Debug a config file and show validation errors. The command will create all objects in the tree \n and validate them. Note that some config validation errors are blocking and will prevent the rest \n of the config from being resolved. This means that you may not see all validation errors at once \n and some issues are only shown once previous errors have been fixed. Similar as with the 'train' \n command, you can override settings from the config as command line options. For instance, \n --training.batch_size 128 overrides the value of \"batch_size\" in the block \"[training]\". 
\n \n DOCS: https://spacy.io/api/cli#debug-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --show-functions -F Show an overview of all registered functions used in the │\n│ config and where they come from (modules, files etc.) │\n│ --show-variables -V Show an overview of all variables referenced in the config and │\n│ their values. This will also reflect variables overwritten on │\n│ the CLI. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug data": " \n Usage: python -m spacy debug data [OPTIONS] CONFIG_PATH \n \n Analyze, debug and validate your training and development data. Outputs useful stats, and can help \n you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. 
\n \n DOCS: https://spacy.io/api/cli#debug-data \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --ignore-warnings -IW Ignore warnings, only show stats and errors │\n│ --verbose -V Print additional information and explanations │\n│ --no-format -NF Don't pretty-print the results │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug diff-config": " \n Usage: python -m spacy debug diff-config [OPTIONS] CONFIG_PATH \n \n Show a diff of a config file with respect to spaCy's defaults or another config file. If \n additional settings were used in the creation of the config file, then you must supply these as \n extra parameters to the command when comparing to the default settings. The generated diff can \n also be used when posting to the discussion forum to provide more information for the maintainers. \n \n The `optimize`, `gpu`, and `pretraining` options are only relevant when comparing against the \n default configuration (or specifically when `compare_to` is None). 
\n DOCS: https://spacy.io/api/cli#debug-diff \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --compare-to PATH Path to a config file to diff against, or │\n│ `None` to compare against default settings │\n│ [default: None] │\n│ --optimize -o [efficiency|accuracy] Whether the user config was optimized for │\n│ efficiency or accuracy. Only relevant when │\n│ comparing against the default config. │\n│ [default: efficiency] │\n│ --gpu -G Whether the original config can run on a │\n│ GPU. Only relevant when comparing against │\n│ the default config. │\n│ --pretraining,--pt Whether to compare on a config with │\n│ pretraining involved. Only relevant when │\n│ comparing against the default config. │\n│ --markdown -md Generate Markdown for GitHub issues │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug model": " \n Usage: python -m spacy debug model [OPTIONS] CONFIG_PATH COMPONENT \n \n Analyze a Thinc model implementation. Includes checks for internal structure and activations \n during training. 
\n \n DOCS: https://spacy.io/api/cli#debug-model \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * component TEXT Name of the pipeline component of which the model should be analysed │\n│ [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --layers -l TEXT Comma-separated names of layer IDs to print │\n│ --dimensions -DIM Show dimensions │\n│ --parameters -PAR Show parameters │\n│ --gradients -GRAD Show gradients │\n│ --attributes -ATTR Show attributes │\n│ --print-step0 -P0 Print model before training │\n│ --print-step1 -P1 Print model after initialization │\n│ --print-step2 -P2 Print model after training │\n│ --print-step3 -P3 Print final predictions │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug profile": " \n Usage: python -m spacy debug profile [OPTIONS] MODEL [INPUTS] \n \n Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one \n JSON object per line with a key \"text\". It can either be provided as a JSONL file, or be read from \n sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. \n \n DOCS: https://spacy.io/api/cli#debug-profile \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Trained pipeline to load [default: None] [required] │\n│ inputs [INPUTS] Location of input file. '-' for stdin. 
[default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n-texts -n INTEGER Maximum number of texts to use if available [default: 10000] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug-data": " \n Usage: python -m spacy debug-data [OPTIONS] CONFIG_PATH \n \n Analyze, debug and validate your training and development data. Outputs useful stats, and can help \n you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. \n \n DOCS: https://spacy.io/api/cli#debug-data \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --ignore-warnings -IW Ignore warnings, only show stats and errors │\n│ --verbose -V Print additional information and explanations │\n│ --no-format -NF Don't pretty-print the results │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "download": " \n Usage: python -m spacy download [OPTIONS] MODEL \n \n Download compatible trained pipeline from the default download path using pip. If --direct flag is \n set, the command expects the full package name with version. For direct downloads, the \n compatibility check will be skipped. 
All additional arguments provided to this command will be \n passed to `pip install` on package installation. \n \n DOCS: https://spacy.io/api/cli#download AVAILABLE PACKAGES: https://spacy.io/models \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Name of pipeline package to download [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --direct -d,-D Force direct download of name + version │\n│ --sdist -S Download sdist (.tar.gz) archive instead of pre-built binary wheel │\n│ --url -U TEXT Download from given url [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "evaluate": " \n Usage: python -m spacy evaluate [OPTIONS] MODEL DATA_PATH \n \n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in the binary \n .spacy format. The --gold-preproc option sets up the evaluation examples with gold-standard \n sentences and tokens for the predictions. Gold preprocessing helps the annotations align to the \n tokenization, and may result in sequences of more consistent length. However, it may reduce \n runtime accuracy due to train/test skew. To render a sample of dependency parses in a HTML file, \n set as output directory as the displacy_path argument. 
\n \n DOCS: https://spacy.io/api/cli#benchmark-accuracy \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o FILE Output JSON file for metrics [default: None] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --displacy-path -dp DIRECTORY Directory to output rendered parses as HTML │\n│ [default: None] │\n│ --displacy-limit -dl INTEGER Limit of parses to render as HTML [default: 25] │\n│ --per-component -P Return scores per component, only applicable when an │\n│ output JSON file is specified. │\n│ --spans-key -sk TEXT Spans key to use when evaluating Doc.spans [default: sc] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "find-function": " \n Usage: python -m spacy find-function [OPTIONS] FUNC_NAME \n \n Find the module, path and line number to the file the registered function is defined in, if \n available. \n \n func_name (str): Name of the registered function. registry_name (Optional[str]): Name of the \n catalogue registry. \n DOCS: https://spacy.io/api/cli#find-function \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * func_name TEXT Name of the registered function. 
[default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --registry -r TEXT Name of the catalogue registry. [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "find-threshold": " \n Usage: python -m spacy find-threshold [OPTIONS] MODEL DATA_PATH PIPE_NAME \n THRESHOLD_KEY SCORES_KEY \n \n Runs prediction trials for a trained model with varying thresholds to maximize the specified \n metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. \n Results are displayed in a table on `stdout` (the corresponding API call to \n `spacy.cli.find_threshold.find_threshold()` returns all results). \n \n This is applicable only for components whose predictions are influenced by thresholds - e.g. \n `textcat_multilabel` and `spancat`, but not `textcat`. Note that the full path to the \n corresponding threshold attribute in the config has to be provided. 
\n DOCS: https://spacy.io/api/cli#find-threshold \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format │\n│ [default: None] │\n│ [required] │\n│ * pipe_name TEXT Name of pipe to examine thresholds for [default: None] [required] │\n│ * threshold_key TEXT Key of threshold attribute in component's configuration │\n│ [default: None] │\n│ [required] │\n│ * scores_key TEXT Metric to optimize [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n_trials -n INTEGER Number of trials to determine optimal thresholds │\n│ [default: 11] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "info": " \n Usage: python -m spacy info [OPTIONS] [MODEL] \n \n Print info about spaCy installation. If a pipeline is specified as an argument, print its meta \n information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. \n \n Flag --url prints only the download URL of the most recent compatible version of the pipeline. 
\n DOCS: https://spacy.io/api/cli#info \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ model [MODEL] Optional loadable spaCy pipeline [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --markdown -md Generate Markdown for GitHub issues │\n│ --silent -s,-S Don't print anything (just return) │\n│ --exclude -e TEXT Comma-separated keys to exclude from the print-out │\n│ [default: labels] │\n│ --url -u Print the URL to download the most recent compatible version of the │\n│ pipeline │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init config": " \n Usage: python -m spacy init config [OPTIONS] OUTPUT_FILE \n \n Generate a starter config file for training. Based on your requirements specified via the CLI \n arguments, this command generates a config with the optimal settings for your use case. This \n includes the choice of architecture, pretrained weights and related hyperparameters. 
\n \n DOCS: https://spacy.io/api/cli#init-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * output_file PATH File to save the config to or - for stdout (will only output config │\n│ and no additional logging info) │\n│ [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --lang -l TEXT Two-letter code of the language to use │\n│ [default: en] │\n│ --pipeline -p TEXT Comma-separated names of trainable pipeline │\n│ components to include (without 'tok2vec' or │\n│ 'transformer') │\n│ [default: tagger,parser,ner] │\n│ --optimize -o [efficiency|accuracy] Whether to optimize for efficiency (faster │\n│ inference, smaller model, lower memory │\n│ consumption) or higher accuracy (potentially │\n│ larger and slower model). This will impact the │\n│ choice of architecture, pretrained weights and │\n│ related hyperparameters. │\n│ [default: efficiency] │\n│ --gpu -G Whether the model can run on GPU. This will │\n│ impact the choice of architecture, pretrained │\n│ weights and related hyperparameters. │\n│ --pretraining -pt Include config for pretraining (with 'spacy │\n│ pretrain') │\n│ --force -F Force overwriting the output file │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init fill-config": " \n Usage: python -m spacy init fill-config [OPTIONS] BASE_PATH [OUTPUT_FILE] \n \n Fill partial config file with default values. Will add all missing settings from the default \n config and will create all objects, check the registered functions for their default values and \n update the base config. 
This command can be used with a config generated via the training \n quickstart widget: https://spacy.io/usage/training#quickstart \n \n DOCS: https://spacy.io/api/cli#init-fill-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * base_path FILE Path to base config to fill [default: None] [required] │\n│ output_file [OUTPUT_FILE] Path to output .cfg file (or - for stdout) [default: -] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --pretraining -pt Include config for pretraining (with 'spacy pretrain') │\n│ --diff -D Print a visual diff highlighting the changes │\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init labels": " \n Usage: python -m spacy init labels [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n Generate JSON files for the labels in the data. This helps speed up the training process, since \n spaCy won't have to preprocess the data to extract the labels. 
\n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * output_path PATH Output directory for the labels [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init nlp": " \n Usage: python -m spacy init nlp [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * output_path PATH Output directory for the prepared data [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init vectors": " \n Usage: python -m spacy init vectors [OPTIONS] LANG VECTORS_LOC OUTPUT_DIR \n \n Convert word vectors for use with spaCy. 
Will export an nlp object that you can use in the \n [initialize] block of your config to initialize a model with vectors. \n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * lang TEXT The language of the nlp object to create [default: None] [required] │\n│ * vectors_loc PATH Vectors file in Word2Vec format [default: None] [required] │\n│ * output_dir PATH Pipeline output directory [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --prune -p INTEGER Optional number of vectors to prune to [default: -1] │\n│ --truncate -t INTEGER Optional number of vectors to truncate to when reading in │\n│ vectors file │\n│ [default: 0] │\n│ --mode -m TEXT Vectors mode: default or floret [default: default] │\n│ --name -n TEXT Optional name for the word vectors, e.g. en_core_web_lg.vectors │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --attr -a TEXT Optional token attribute to use for vectors, e.g. LOWER or NORM │\n│ [default: ORTH] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "link": " \n Usage: python -m spacy link [OPTIONS] ARGS KWARGS \n \n (deprecated) \n As of spaCy v3.0, symlinks like \"en\" are not supported anymore. You can load trained pipeline \n packages using their full names or from a directory path. 
\n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * args TEXT [default: None] [required] │\n│ * kwargs TEXT [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "package": " \n Usage: python -m spacy package [OPTIONS] INPUT_DIR OUTPUT_DIR \n \n Generate an installable Python package for a pipeline. Includes binary data, meta and required \n installation files. A new directory will be created in the specified output directory, and the \n data will be copied over. If --create-meta is set and a meta.json already exists in the output \n directory, the existing values will be used as the defaults in the command-line prompt. After \n packaging, \"python -m build --sdist\" is run in the package directory, which will create a .tar.gz \n archive that can be installed via \"pip install\". \n \n If additional code files are provided (e.g. Python files containing custom registered functions \n like pipeline components), they are copied into the package and imported in the __init__.py. 
\n DOCS: https://spacy.io/api/cli#package \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * input_dir DIRECTORY Directory with pipeline data [default: None] [required] │\n│ * output_dir DIRECTORY Output parent directory [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c TEXT Comma-separated paths to Python file │\n│ with additional code (registered │\n│ functions) to be included in the package │\n│ --meta-path,--meta -m FILE Path to meta.json [default: None] │\n│ --create-meta -C Create meta.json, even if one exists │\n│ --name -n TEXT Package name to override meta │\n│ [default: None] │\n│ --version -v TEXT Package version to override meta │\n│ [default: None] │\n│ --build -b TEXT Comma-separated formats to build: sdist │\n│ and/or wheel, or none. │\n│ [default: sdist] │\n│ --force -f,-F Force overwriting existing data in │\n│ output directory │\n│ --require-parent -R,-R --no-require-parent Include the parent package (e.g. spacy) │\n│ in the requirements │\n│ [default: require-parent] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "pretrain": " \n Usage: python -m spacy pretrain [OPTIONS] CONFIG_PATH OUTPUT_DIR \n \n Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using an approximate \n language-modelling objective. Two objective types are available, vector-based and character-based. \n \n In the vector-based objective, we load word vectors that have been trained using a word2vec-style \n distributional similarity algorithm, and train a component like a CNN, BiLSTM, etc to predict \n vectors which match the pretrained ones. The weights are saved to a directory after each epoch. 
\n You can then pass a path to one of these pretrained weights files to the 'spacy train' command. \n This technique may be especially helpful if you have little labelled data. However, it's still \n quite experimental, so your mileage may vary. \n To load the weights back in during 'spacy train', you need to ensure all settings are the same \n between pretraining and training. Ideally, this is done by using the same config file for both \n commands. \n DOCS: https://spacy.io/api/cli#pretrain \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path FILE Path to config file [default: None] [required] │\n│ * output_dir PATH Directory to write weights to on each epoch [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --resume-path -r PATH Path to pretrained weights from which to resume pretraining │\n│ [default: None] │\n│ --epoch-resume -er INTEGER The epoch to resume counting from when using --resume-path. │\n│ Prevents unintended overwriting of existing weight files. │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --skip-last -L Skip saving model-last.bin │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "profile": " \n Usage: python -m spacy profile [OPTIONS] MODEL [INPUTS] \n \n Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one \n JSON object per line with a key \"text\". It can either be provided as a JSONL file, or be read from \n sys.sytdin. 
If no input file is specified, the IMDB dataset is loaded via Thinc. \n \n DOCS: https://spacy.io/api/cli#debug-profile \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Trained pipeline to load [default: None] [required] │\n│ inputs [INPUTS] Location of input file. '-' for stdin. [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n-texts -n INTEGER Maximum number of texts to use if available [default: 10000] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "apply": " \n Usage: python -m spacy apply [OPTIONS] MODEL DATA_PATH OUTPUT_FILE \n \n Apply a trained pipeline to documents to get predictions. Expects a loadable spaCy pipeline and \n path to the data, which can be a directory or a file. The data files can be provided in multiple \n formats: 1. .spacy files 2. .jsonl files with a specified \"field\" to read the text from. \n 3. Files with any other extension are assumed to be containing a single document. DOCS: \n https://spacy.io/api/cli#apply \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of the documents to predict on. Can be a single file in │\n│ .spacy format or a .jsonl file. Files with other extensions are │\n│ treated as single plain text documents. If a directory is provided │\n│ it is traversed recursively to grab all files to be processed. The │\n│ files can be a mixture of .spacy, .jsonl and text files. If .jsonl │\n│ is provided the specified field is going to be grabbed (\"text\" by │\n│ default). 
│\n│ [required] │\n│ * output_file FILE Path to save the resulting .spacy file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ --text-key -tk TEXT Key containing text string for JSONL [default: text] │\n│ --force -F Force overwriting the output file │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU. [default: -1] │\n│ --batch-size -b INTEGER Batch size. [default: 1] │\n│ --n-process -n INTEGER number of processors to use. [default: 1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "assemble": " \n Usage: python -m spacy assemble [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n Assemble a spaCy pipeline from a config file. The config file includes all settings for \n initializing the pipeline. To override settings in the config, e.g. settings that point to local \n paths or that you want to experiment with, you can override them as command line options. The \n --code argument lets you pass in a Python file that can be used to register custom functions that \n are referenced in the config. 
\n \n DOCS: https://spacy.io/api/cli#assemble \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n│ * output_path PATH Output directory to store assembled pipeline in [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) to │\n│ be imported │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "benchmark accuracy": " \n Usage: python -m spacy benchmark accuracy [OPTIONS] MODEL DATA_PATH \n \n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in the binary \n .spacy format. The --gold-preproc option sets up the evaluation examples with gold-standard \n sentences and tokens for the predictions. Gold preprocessing helps the annotations align to the \n tokenization, and may result in sequences of more consistent length. However, it may reduce \n runtime accuracy due to train/test skew. To render a sample of dependency parses in a HTML file, \n set as output directory as the displacy_path argument. 
\n \n DOCS: https://spacy.io/api/cli#benchmark-accuracy \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o FILE Output JSON file for metrics │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --displacy-path -dp DIRECTORY Directory to output rendered parses as HTML │\n│ --displacy-limit -dl INTEGER Limit of parses to render as HTML [default: 25] │\n│ --per-component -P Return scores per component, only applicable when an │\n│ output JSON file is specified. │\n│ --spans-key -sk TEXT Spans key to use when evaluating Doc.spans [default: sc] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "benchmark speed": " \n Usage: python -m spacy benchmark speed [OPTIONS] MODEL DATA_PATH \n \n Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in the binary .spacy \n format. 
\n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --batch-size -b INTEGER RANGE [x>=1] Override the pipeline batch size │\n│ --no-shuffle Do not shuffle benchmark data │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --batches INTEGER RANGE [x>=30] Minimum number of batches to benchmark │\n│ [default: 50] │\n│ --warmup -w INTEGER RANGE [x>=0] Number of iterations over the data for warmup │\n│ [default: 3] │\n│ --code -c PATH Path to Python file with additional code │\n│ (registered functions) to be imported │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "convert": " \n Usage: python -m spacy convert [OPTIONS] INPUT_PATH [OUTPUT_DIR] \n \n Convert files into json or DocBin format for training. The resulting .spacy file can be used with \n the train command and other experiment management functions. \n \n If no output_dir is specified and the output format is JSON, the data \n is written to stdout, so you can pipe them forward to a JSON file: \n $ spacy convert some_file.conllu --file-type json > some_file.json \n \n DOCS: https://spacy.io/api/cli#convert \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * input_path TEXT Input file or directory [required] │\n│ output_dir [OUTPUT_DIR] Output directory. '-' for stdout. 
[default: -] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --file-type -t [json|spacy] Type of data to produce [default: spacy] │\n│ --n-sents -n INTEGER Number of sentences per doc (0 to disable) │\n│ [default: 1] │\n│ --seg-sents -s Segment sentences (for -c ner) │\n│ --model,--base -b TEXT Trained spaCy pipeline for sentence segmentation to │\n│ use as base (for --seg-sents) │\n│ --morphology -m Enable appending morphology to tags │\n│ --merge-subtokens -T Merge CoNLL-U subtokens │\n│ --converter -c TEXT Converter: ('conllubio', 'conllu', 'conll', 'ner', │\n│ 'iob', 'json') │\n│ [default: auto] │\n│ --ner-map -nm PATH NER tag mapping (as JSON-encoded dict of entity types) │\n│ --lang -l TEXT Language (if tokenizer required) │\n│ --concatenate -C Concatenate output to a single file │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug config": " \n Usage: python -m spacy debug config [OPTIONS] CONFIG_PATH \n \n Debug a config file and show validation errors. The command will create all objects in the tree \n and validate them. Note that some config validation errors are blocking and will prevent the rest \n of the config from being resolved. This means that you may not see all validation errors at once \n and some issues are only shown once previous errors have been fixed. Similar as with the 'train' \n command, you can override settings from the config as command line options. For instance, \n --training.batch_size 128 overrides the value of \"batch_size\" in the block \"\". 
\n \n DOCS: https://spacy.io/api/cli#debug-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --show-functions -F Show an overview of all registered functions used in the │\n│ config and where they come from (modules, files etc.) │\n│ --show-variables -V Show an overview of all variables referenced in the config and │\n│ their values. This will also reflect variables overwritten on │\n│ the CLI. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug data": " \n Usage: python -m spacy debug data [OPTIONS] CONFIG_PATH \n \n Analyze, debug and validate your training and development data. Outputs useful stats, and can help \n you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. 
\n \n DOCS: https://spacy.io/api/cli#debug-data \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --ignore-warnings -IW Ignore warnings, only show stats and errors │\n│ --verbose -V Print additional information and explanations │\n│ --no-format -NF Don't pretty-print the results │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug diff-config": " \n Usage: python -m spacy debug diff-config [OPTIONS] CONFIG_PATH \n \n Show a diff of a config file with respect to spaCy's defaults or another config file. If \n additional settings were used in the creation of the config file, then you must supply these as \n extra parameters to the command when comparing to the default settings. The generated diff can \n also be used when posting to the discussion forum to provide more information for the maintainers. \n \n The `optimize`, `gpu`, and `pretraining` options are only relevant when \n comparing against the default configuration (or specifically when `compare_to` is None). 
\n \n DOCS: https://spacy.io/api/cli#debug-diff \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --compare-to PATH Path to a config file to diff against, or │\n│ `None` to compare against default settings │\n│ --optimize -o [efficiency|accuracy] Whether the user config was optimized for │\n│ efficiency or accuracy. Only relevant when │\n│ comparing against the default config. │\n│ [default: efficiency] │\n│ --gpu -G Whether the original config can run on a │\n│ GPU. Only relevant when comparing against │\n│ the default config. │\n│ --pretraining,--pt Whether to compare on a config with │\n│ pretraining involved. Only relevant when │\n│ comparing against the default config. │\n│ --markdown -md Generate Markdown for GitHub issues │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug model": " \n Usage: python -m spacy debug model [OPTIONS] CONFIG_PATH COMPONENT \n \n Analyze a Thinc model implementation. Includes checks for internal structure and activations \n during training. 
\n \n DOCS: https://spacy.io/api/cli#debug-model \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n│ * component TEXT Name of the pipeline component of which the model should be analysed │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --layers -l TEXT Comma-separated names of layer IDs to print │\n│ --dimensions -DIM Show dimensions │\n│ --parameters -PAR Show parameters │\n│ --gradients -GRAD Show gradients │\n│ --attributes -ATTR Show attributes │\n│ --print-step0 -P0 Print model before training │\n│ --print-step1 -P1 Print model after initialization │\n│ --print-step2 -P2 Print model after training │\n│ --print-step3 -P3 Print final predictions │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug profile": " \n Usage: python -m spacy debug profile [OPTIONS] MODEL [INPUTS] \n \n Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one \n JSON object per line with a key \"text\". It can either be provided as a JSONL file, or be read from \n sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. \n \n DOCS: https://spacy.io/api/cli#debug-profile \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Trained pipeline to load [required] │\n│ inputs [INPUTS] Location of input file. '-' for stdin. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n-texts -n INTEGER Maximum number of texts to use if available [default: 10000] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug-data": " \n Usage: python -m spacy debug-data [OPTIONS] CONFIG_PATH \n \n Analyze, debug and validate your training and development data. Outputs useful stats, and can help \n you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. \n \n DOCS: https://spacy.io/api/cli#debug-data \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --ignore-warnings -IW Ignore warnings, only show stats and errors │\n│ --verbose -V Print additional information and explanations │\n│ --no-format -NF Don't pretty-print the results │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "download": " \n Usage: python -m spacy download [OPTIONS] MODEL \n \n Download compatible trained pipeline from the default download path using pip. If --direct flag is \n set, the command expects the full package name with version. For direct downloads, the \n compatibility check will be skipped. All additional arguments provided to this command will be \n passed to `pip install` on package installation. 
\n \n DOCS: https://spacy.io/api/cli#download \n AVAILABLE PACKAGES: https://spacy.io/models \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Name of pipeline package to download [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --direct -d,-D Force direct download of name + version │\n│ --sdist -S Download sdist (.tar.gz) archive instead of pre-built binary wheel │\n│ --url -U TEXT Download from given url │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "evaluate": " \n Usage: python -m spacy evaluate [OPTIONS] MODEL DATA_PATH \n \n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in the binary \n .spacy format. The --gold-preproc option sets up the evaluation examples with gold-standard \n sentences and tokens for the predictions. Gold preprocessing helps the annotations align to the \n tokenization, and may result in sequences of more consistent length. However, it may reduce \n runtime accuracy due to train/test skew. To render a sample of dependency parses in a HTML file, \n set as output directory as the displacy_path argument. 
\n \n DOCS: https://spacy.io/api/cli#benchmark-accuracy \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o FILE Output JSON file for metrics │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --displacy-path -dp DIRECTORY Directory to output rendered parses as HTML │\n│ --displacy-limit -dl INTEGER Limit of parses to render as HTML [default: 25] │\n│ --per-component -P Return scores per component, only applicable when an │\n│ output JSON file is specified. │\n│ --spans-key -sk TEXT Spans key to use when evaluating Doc.spans [default: sc] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "find-function": " \n Usage: python -m spacy find-function [OPTIONS] FUNC_NAME \n \n Find the module, path and line number to the file the registered function is defined in, if \n available. \n \n func_name (str): Name of the registered function. \n registry_name (Optional): Name of the catalogue registry. \n \n DOCS: https://spacy.io/api/cli#find-function \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * func_name TEXT Name of the registered function. 
[required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --registry -r TEXT Name of the catalogue registry. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "find-threshold": " \n Usage: python -m spacy find-threshold [OPTIONS] MODEL DATA_PATH PIPE_NAME \n THRESHOLD_KEY SCORES_KEY \n \n Runs prediction trials for a trained model with varying thresholds to maximize the specified \n metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. \n Results are displayed in a table on `stdout` (the corresponding API call to \n `spacy.cli.find_threshold.find_threshold()` returns all results). \n \n This is applicable only for components whose predictions are influenced by \n thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note \n that the full path to the corresponding threshold attribute in the config has to \n be provided. 
\n \n DOCS: https://spacy.io/api/cli#find-threshold \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [required] │\n│ * pipe_name TEXT Name of pipe to examine thresholds for [required] │\n│ * threshold_key TEXT Key of threshold attribute in component's configuration [required] │\n│ * scores_key TEXT Metric to optimize [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n_trials -n INTEGER Number of trials to determine optimal thresholds │\n│ [default: 11] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "info": " \n Usage: python -m spacy info [OPTIONS] [MODEL] \n \n Print info about spaCy installation. If a pipeline is specified as an argument, print its meta \n information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. \n \n Flag --url prints only the download URL of the most recent compatible \n version of the pipeline. 
\n \n DOCS: https://spacy.io/api/cli#info \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ model [MODEL] Optional loadable spaCy pipeline │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --markdown -md Generate Markdown for GitHub issues │\n│ --silent -s,-S Don't print anything (just return) │\n│ --exclude -e TEXT Comma-separated keys to exclude from the print-out │\n│ [default: labels] │\n│ --url -u Print the URL to download the most recent compatible version of the │\n│ pipeline │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init config": " \n Usage: python -m spacy init config [OPTIONS] OUTPUT_FILE \n \n Generate a starter config file for training. Based on your requirements specified via the CLI \n arguments, this command generates a config with the optimal settings for your use case. This \n includes the choice of architecture, pretrained weights and related hyperparameters. 
\n \n DOCS: https://spacy.io/api/cli#init-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * output_file PATH File to save the config to or - for stdout (will only output config │\n│ and no additional logging info) │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --lang -l TEXT Two-letter code of the language to use │\n│ [default: en] │\n│ --pipeline -p TEXT Comma-separated names of trainable pipeline │\n│ components to include (without 'tok2vec' or │\n│ 'transformer') │\n│ [default: tagger,parser,ner] │\n│ --optimize -o [efficiency|accuracy] Whether to optimize for efficiency (faster │\n│ inference, smaller model, lower memory │\n│ consumption) or higher accuracy (potentially │\n│ larger and slower model). This will impact the │\n│ choice of architecture, pretrained weights and │\n│ related hyperparameters. │\n│ [default: efficiency] │\n│ --gpu -G Whether the model can run on GPU. This will │\n│ impact the choice of architecture, pretrained │\n│ weights and related hyperparameters. │\n│ --pretraining -pt Include config for pretraining (with 'spacy │\n│ pretrain') │\n│ --force -F Force overwriting the output file │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init fill-config": " \n Usage: python -m spacy init fill-config [OPTIONS] BASE_PATH [OUTPUT_FILE] \n \n Fill partial config file with default values. Will add all missing settings from the default \n config and will create all objects, check the registered functions for their default values and \n update the base config. 
This command can be used with a config generated via the training \n quickstart widget: https://spacy.io/usage/training#quickstart \n \n DOCS: https://spacy.io/api/cli#init-fill-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * base_path FILE Path to base config to fill [required] │\n│ output_file [OUTPUT_FILE] Path to output .cfg file (or - for stdout) [default: -] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --pretraining -pt Include config for pretraining (with 'spacy pretrain') │\n│ --diff -D Print a visual diff highlighting the changes │\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init labels": " \n Usage: python -m spacy init labels [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n Generate JSON files for the labels in the data. This helps speed up the training process, since \n spaCy won't have to preprocess the data to extract the labels. 
\n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n│ * output_path PATH Output directory for the labels [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init nlp": " \n Usage: python -m spacy init nlp [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n│ * output_path PATH Output directory for the prepared data [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init vectors": " \n Usage: python -m spacy init vectors [OPTIONS] LANG VECTORS_LOC OUTPUT_DIR \n \n Convert word vectors for use with spaCy. Will export an nlp object that you can use in the block \n of your config to initialize a model with vectors. 
\n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * lang TEXT The language of the nlp object to create [required] │\n│ * vectors_loc PATH Vectors file in Word2Vec format [required] │\n│ * output_dir PATH Pipeline output directory [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --prune -p INTEGER Optional number of vectors to prune to [default: -1] │\n│ --truncate -t INTEGER Optional number of vectors to truncate to when reading in │\n│ vectors file │\n│ [default: 0] │\n│ --mode -m TEXT Vectors mode: default or floret [default: default] │\n│ --name -n TEXT Optional name for the word vectors, e.g. en_core_web_lg.vectors │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --attr -a TEXT Optional token attribute to use for vectors, e.g. LOWER or NORM │\n│ [default: ORTH] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "link": " \n Usage: python -m spacy link [OPTIONS] ARGS KWARGS \n \n (deprecated) \n As of spaCy v3.0, symlinks like \"en\" are not supported anymore. You can load trained pipeline \n packages using their full names or from a directory path. \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * args TEXT [required] │\n│ * kwargs TEXT [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "package": " \n Usage: python -m spacy package [OPTIONS] INPUT_DIR OUTPUT_DIR \n \n Generate an installable Python package for a pipeline. Includes binary data, meta and required \n installation files. A new directory will be created in the specified output directory, and the \n data will be copied over. If --create-meta is set and a meta.json already exists in the output \n directory, the existing values will be used as the defaults in the command-line prompt. After \n packaging, \"python -m build --sdist\" is run in the package directory, which will create a .tar.gz \n archive that can be installed via \"pip install\". \n \n If additional code files are provided (e.g. Python files containing custom \n registered functions like pipeline components), they are copied into the \n package and imported in the __init__.py. \n \n DOCS: https://spacy.io/api/cli#package \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * input_dir DIRECTORY Directory with pipeline data [required] │\n│ * output_dir DIRECTORY Output parent directory [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c TEXT Comma-separated paths to Python file │\n│ with additional code (registered │\n│ functions) to be included in the package │\n│ --meta-path,--meta -m FILE Path to meta.json │\n│ --create-meta -C Create meta.json, even if one exists │\n│ --name -n TEXT Package name to override meta │\n│ --version -v TEXT Package version to override meta │\n│ --build -b TEXT Comma-separated formats to build: sdist │\n│ and/or wheel, or none. 
│\n│ [default: sdist] │\n│ --force -f,-F Force overwriting existing data in │\n│ output directory │\n│ --require-parent -R,-R --no-require-parent Include the parent package (e.g. spacy) │\n│ in the requirements │\n│ [default: require-parent] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "pretrain": " \n Usage: python -m spacy pretrain [OPTIONS] CONFIG_PATH OUTPUT_DIR \n \n Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using an approximate \n language-modelling objective. Two objective types are available, vector-based and character-based. \n \n In the vector-based objective, we load word vectors that have been trained \n using a word2vec-style distributional similarity algorithm, and train a \n component like a CNN, BiLSTM, etc to predict vectors which match the \n pretrained ones. The weights are saved to a directory after each epoch. You \n can then pass a path to one of these pretrained weights files to the \n 'spacy train' command. \n \n This technique may be especially helpful if you have little labelled data. \n However, it's still quite experimental, so your mileage may vary. \n \n To load the weights back in during 'spacy train', you need to ensure \n all settings are the same between pretraining and training. Ideally, \n this is done by using the same config file for both commands. 
\n \n DOCS: https://spacy.io/api/cli#pretrain \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path FILE Path to config file [required] │\n│ * output_dir PATH Directory to write weights to on each epoch [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --resume-path -r PATH Path to pretrained weights from which to resume pretraining │\n│ --epoch-resume -er INTEGER The epoch to resume counting from when using --resume-path. │\n│ Prevents unintended overwriting of existing weight files. │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --skip-last -L Skip saving model-last.bin │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "profile": " \n Usage: python -m spacy profile [OPTIONS] MODEL [INPUTS] \n \n Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one \n JSON object per line with a key \"text\". It can either be provided as a JSONL file, or be read from \n sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. \n \n DOCS: https://spacy.io/api/cli#debug-profile \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Trained pipeline to load [required] │\n│ inputs [INPUTS] Location of input file. '-' for stdin. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n-texts -n INTEGER Maximum number of texts to use if available [default: 10000] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", "project assets": " \n Usage: python -m spacy project assets [OPTIONS] [PROJECT_DIR] \n \n Fetch project assets like datasets and pretrained weights. Assets are defined in the \"assets\" \n section of the project.yml. If a checksum is provided in the project.yml, the file is only \n downloaded if no local file with the same checksum exists. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/tutorial/directory-and-assets.md \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Path to cloned project. Defaults to current working directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --sparse -S Use sparse checkout for assets provided via Git, to only check out and clone │\n│ the files needed. Requires Git v22.2+. │\n│ --extra -e Download all assets, including those marked as 'extra'. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project clone": " \n Usage: python -m spacy project clone [OPTIONS] NAME [DEST] \n \n Clone a project template from a repository. Calls into \"git\" and will only download the files from \n the given subdirectory. 
The GitHub repo defaults to the official Weasel template repo, but can be \n customized (including using a private repo). \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#clipboard-clone \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * name TEXT The name of the template to clone [default: None] [required] │\n│ dest [DEST] Where to clone the project. Defaults to current working directory │\n│ [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --repo -r TEXT The repository to clone from │\n│ [default: https://github.com/explosion/projects] │\n│ --branch -b TEXT The branch to clone from. If not provided, will attempt main, master │\n│ [default: None] │\n│ --sparse -S Use sparse Git checkout to only check out and clone the files needed. │\n│ Requires Git v22.2+. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project clone": " \n Usage: python -m spacy project clone [OPTIONS] NAME [DEST] \n \n Clone a project template from a repository. Calls into \"git\" and will only download the files from \n the given subdirectory. The GitHub repo defaults to the official Weasel template repo, but can be \n customized (including using a private repo). \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#clipboard-clone \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * name TEXT The name of the template to clone [required] │\n│ dest [DEST] Where to clone the project. 
Defaults to current working directory │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --repo -r TEXT The repository to clone from │\n│ [default: https://github.com/explosion/projects] │\n│ --branch -b TEXT The branch to clone from. If not provided, will attempt main, master │\n│ --sparse -S Use sparse Git checkout to only check out and clone the files needed. │\n│ Requires Git v22.2+. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", "project document": " \n Usage: python -m spacy project document [OPTIONS] [PROJECT_DIR] \n \n Auto-generate a README.md for a project. If the content is saved to a file, hidden markers are \n added so you can add custom content before or after the auto-generated section and only the \n auto-generated docs will be replaced when you re-run the command. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#closed_book-document \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Path to cloned project. Defaults to current working directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o PATH Path to output Markdown file for output. Defaults to - for standard │\n│ output │\n│ [default: -] │\n│ --no-emoji -NE Don't use emoji │\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project dvc": " \n Usage: python -m spacy project dvc [OPTIONS] [PROJECT_DIR] [WORKFLOW] \n \n Auto-generate Data Version Control (DVC) config. A DVC project can only define one pipeline, so \n you need to specify one workflow defined in the project.yml. If no workflow is specified, the \n first defined workflow is used. The DVC config will only be updated if the project.yml changed. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#repeat-dvc \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n│ workflow [WORKFLOW] Name of workflow defined in project.yml. Defaults to first │\n│ workflow if not set. │\n│ [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --verbose -V Print more info │\n│ --quiet -q Print less info │\n│ --force -F Force update DVC config │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project dvc": " \n Usage: python -m spacy project dvc [OPTIONS] [PROJECT_DIR] [WORKFLOW] \n \n Auto-generate Data Version Control (DVC) config. A DVC project can only define one pipeline, so \n you need to specify one workflow defined in the project.yml. If no workflow is specified, the \n first defined workflow is used. The DVC config will only be updated if the project.yml changed. 
\n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#repeat-dvc \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n│ workflow [WORKFLOW] Name of workflow defined in project.yml. Defaults to first │\n│ workflow if not set. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --verbose -V Print more info │\n│ --quiet -q Print less info │\n│ --force -F Force update DVC config │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", "project pull": " \n Usage: python -m spacy project pull [OPTIONS] [REMOTE] [PROJECT_DIR] \n \n Retrieve available precomputed outputs from a remote storage. You can alias remotes in your \n project.yml by mapping them to storage paths. A storage can be anything that the smart_open \n library can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_down-push \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ remote [REMOTE] Name or path of remote storage [default: default] │\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", "project push": " \n Usage: python -m spacy project push [OPTIONS] [REMOTE] [PROJECT_DIR] \n \n Persist outputs to a remote storage. You can alias remotes in your project.yml by mapping them to \n storage paths. A storage can be anything that the smart_open library can upload to, e.g. AWS, \n Google Cloud Storage, SSH, local directories etc. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_up-push \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ remote [REMOTE] Name or path of remote storage [default: default] │\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "train": " \n Usage: python -m spacy train [OPTIONS] CONFIG_PATH \n \n Train or update a spaCy pipeline. Requires data in spaCy's binary format. To convert data from \n other formats, use the `spacy convert` command. The config file includes all settings and \n hyperparameters used during training. To override settings in the config, e.g. settings that point \n to local paths or that you want to experiment with, you can override them as command line options. \n For instance, --training.batch_size 128 overrides the value of \"batch_size\" in the block \n \"[training]\". The --code argument lets you pass in a Python file that's imported before training. \n It can be used to register custom functions and architectures that can then be referenced in the \n config. 
\n \n DOCS: https://spacy.io/api/cli#train \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output,--output-path -o PATH Output directory to store trained pipeline in │\n│ [default: None] │\n│ --code -c PATH Path to Python file with additional code │\n│ (registered functions) to be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "train": " \n Usage: python -m spacy train [OPTIONS] CONFIG_PATH \n \n Train or update a spaCy pipeline. Requires data in spaCy's binary format. To convert data from \n other formats, use the `spacy convert` command. The config file includes all settings and \n hyperparameters used during training. To override settings in the config, e.g. settings that point \n to local paths or that you want to experiment with, you can override them as command line options. \n For instance, --training.batch_size 128 overrides the value of \"batch_size\" in the block \"\". The \n --code argument lets you pass in a Python file that's imported before training. It can be used to \n register custom functions and architectures that can then be referenced in the config. 
\n \n DOCS: https://spacy.io/api/cli#train \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output,--output-path -o PATH Output directory to store trained pipeline in │\n│ --code -c PATH Path to Python file with additional code │\n│ (registered functions) to be imported │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", "validate": " \n Usage: python -m spacy validate [OPTIONS] \n \n Validate the currently installed pipeline packages and spaCy version. Checks if the installed \n packages are compatible and shows upgrade instructions if available. Should be run after `pip \n install -U spacy`. \n \n DOCS: https://spacy.io/api/cli#validate \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" }, "errors": { - "missing_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ Missing command. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", - "unknown_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_COMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "missing_command": "", + "unknown_command": "", "unknown_subcommand": { - "benchmark": " \n Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]... \n \n Commands for benchmarking pipelines. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ accuracy Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in │\n│ the binary .spacy format. The --gold-preproc option sets up the evaluation examples │\n│ with gold-standard sentences and tokens for the predictions. Gold preprocessing helps │\n│ the annotations align to the tokenization, and may result in sequences of more │\n│ consistent length. However, it may reduce runtime accuracy due to train/test skew. To │\n│ render a sample of dependency parses in a HTML file, set as output directory as the │\n│ displacy_path argument. │\n│ speed Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in the │\n│ binary .spacy format. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug": " \n Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]... \n \n Suite of helpful commands for debugging and profiling. 
Includes commands to check and validate \n your config files, training and evaluation data, and custom model implementations. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ data Analyze, debug and validate your training and development data. Outputs useful │\n│ stats, and can help you find problems like invalid entity annotations, cyclic │\n│ dependencies, low data labels and more. │\n│ profile Profile which functions take the most time in a spaCy pipeline. Input should be │\n│ formatted as one JSON object per line with a key \"text\". It can either be provided │\n│ as a JSONL file, or be read from sys.sytdin. If no input file is specified, the │\n│ IMDB dataset is loaded via Thinc. │\n│ config Debug a config file and show validation errors. The command will create all │\n│ objects in the tree and validate them. Note that some config validation errors are │\n│ blocking and will prevent the rest of the config from being resolved. This means │\n│ that you may not see all validation errors at once and some issues are only shown │\n│ once previous errors have been fixed. Similar as with the 'train' command, you can │\n│ override settings from the config as command line options. For instance, │\n│ --training.batch_size 128 overrides the value of \"batch_size\" in the block │\n│ \"[training]\". │\n│ diff-config Show a diff of a config file with respect to spaCy's defaults or another config │\n│ file. If additional settings were used in the creation of the config file, then │\n│ you must supply these as extra parameters to the command when comparing to the │\n│ default settings. 
The generated diff can also be used when posting to the │\n│ discussion forum to provide more information for the maintainers. │\n│ model Analyze a Thinc model implementation. Includes checks for internal structure and │\n│ activations during training. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init": " \n Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]... \n \n Commands for initializing configs and pipeline packages. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ config Generate a starter config file for training. Based on your requirements specified │\n│ via the CLI arguments, this command generates a config with the optimal settings │\n│ for your use case. This includes the choice of architecture, pretrained weights │\n│ and related hyperparameters. │\n│ fill-config Fill partial config file with default values. Will add all missing settings from │\n│ the default config and will create all objects, check the registered functions for │\n│ their default values and update the base config. This command can be used with a │\n│ config generated via the training quickstart widget: │\n│ https://spacy.io/usage/training#quickstart │\n│ vectors Convert word vectors for use with spaCy. Will export an nlp object that you can │\n│ use in the [initialize] block of your config to initialize a model with vectors. │\n│ labels Generate JSON files for the labels in the data. This helps speed up the training │\n│ process, since spaCy won't have to preprocess the data to extract the labels. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project": " \n Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]... \n \n Command-line interface for spaCy projects and templates. You'd typically start by cloning a \n project template to a local directory and fetching its assets like datasets etc. See the project's \n project.yml for the available commands. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ assets Fetch project assets like datasets and pretrained weights. Assets are defined in the │\n│ \"assets\" section of the project.yml. If a checksum is provided in the project.yml, │\n│ the file is only downloaded if no local file with the same checksum exists. │\n│ clone Clone a project template from a repository. Calls into \"git\" and will only download │\n│ the files from the given subdirectory. The GitHub repo defaults to the official │\n│ Weasel template repo, but can be customized (including using a private repo). │\n│ document Auto-generate a README.md for a project. If the content is saved to a file, hidden │\n│ markers are added so you can add custom content before or after the auto-generated │\n│ section and only the auto-generated docs will be replaced when you re-run the │\n│ command. │\n│ dvc Auto-generate Data Version Control (DVC) config. A DVC project can only define one │\n│ pipeline, so you need to specify one workflow defined in the project.yml. If no │\n│ workflow is specified, the first defined workflow is used. The DVC config will only │\n│ be updated if the project.yml changed. │\n│ run Run a named command or workflow defined in the project.yml. 
If a workflow name is │\n│ specified, all commands in the workflow are run, in order. If commands define │\n│ dependencies and/or outputs, they will only be re-run if state has changed. │\n│ pull Retrieve available precomputed outputs from a remote storage. You can alias remotes │\n│ in your project.yml by mapping them to storage paths. A storage can be anything that │\n│ the smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH, local │\n│ directories etc. │\n│ push Persist outputs to a remote storage. You can alias remotes in your project.yml by │\n│ mapping them to storage paths. A storage can be anything that the smart_open library │\n│ can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + "benchmark": "", + "debug": "", + "init": "", + "project": "" } }, "group_help": { - "benchmark": " \n Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]... \n \n Commands for benchmarking pipelines. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ accuracy Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in │\n│ the binary .spacy format. The --gold-preproc option sets up the evaluation examples │\n│ with gold-standard sentences and tokens for the predictions. Gold preprocessing helps │\n│ the annotations align to the tokenization, and may result in sequences of more │\n│ consistent length. However, it may reduce runtime accuracy due to train/test skew. To │\n│ render a sample of dependency parses in a HTML file, set as output directory as the │\n│ displacy_path argument. 
│\n│ speed Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in the │\n│ binary .spacy format. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug": " \n Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]... \n \n Suite of helpful commands for debugging and profiling. Includes commands to check and validate \n your config files, training and evaluation data, and custom model implementations. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ data Analyze, debug and validate your training and development data. Outputs useful │\n│ stats, and can help you find problems like invalid entity annotations, cyclic │\n│ dependencies, low data labels and more. │\n│ profile Profile which functions take the most time in a spaCy pipeline. Input should be │\n│ formatted as one JSON object per line with a key \"text\". It can either be provided │\n│ as a JSONL file, or be read from sys.sytdin. If no input file is specified, the │\n│ IMDB dataset is loaded via Thinc. │\n│ config Debug a config file and show validation errors. The command will create all │\n│ objects in the tree and validate them. Note that some config validation errors are │\n│ blocking and will prevent the rest of the config from being resolved. This means │\n│ that you may not see all validation errors at once and some issues are only shown │\n│ once previous errors have been fixed. Similar as with the 'train' command, you can │\n│ override settings from the config as command line options. For instance, │\n│ --training.batch_size 128 overrides the value of \"batch_size\" in the block │\n│ \"[training]\". 
│\n│ diff-config Show a diff of a config file with respect to spaCy's defaults or another config │\n│ file. If additional settings were used in the creation of the config file, then │\n│ you must supply these as extra parameters to the command when comparing to the │\n│ default settings. The generated diff can also be used when posting to the │\n│ discussion forum to provide more information for the maintainers. │\n│ model Analyze a Thinc model implementation. Includes checks for internal structure and │\n│ activations during training. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init": " \n Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]... \n \n Commands for initializing configs and pipeline packages. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ config Generate a starter config file for training. Based on your requirements specified │\n│ via the CLI arguments, this command generates a config with the optimal settings │\n│ for your use case. This includes the choice of architecture, pretrained weights │\n│ and related hyperparameters. │\n│ fill-config Fill partial config file with default values. Will add all missing settings from │\n│ the default config and will create all objects, check the registered functions for │\n│ their default values and update the base config. This command can be used with a │\n│ config generated via the training quickstart widget: │\n│ https://spacy.io/usage/training#quickstart │\n│ vectors Convert word vectors for use with spaCy. Will export an nlp object that you can │\n│ use in the [initialize] block of your config to initialize a model with vectors. 
│\n│ labels Generate JSON files for the labels in the data. This helps speed up the training │\n│ process, since spaCy won't have to preprocess the data to extract the labels. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project": " \n Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]... \n \n Command-line interface for spaCy projects and templates. You'd typically start by cloning a \n project template to a local directory and fetching its assets like datasets etc. See the project's \n project.yml for the available commands. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ assets Fetch project assets like datasets and pretrained weights. Assets are defined in the │\n│ \"assets\" section of the project.yml. If a checksum is provided in the project.yml, │\n│ the file is only downloaded if no local file with the same checksum exists. │\n│ clone Clone a project template from a repository. Calls into \"git\" and will only download │\n│ the files from the given subdirectory. The GitHub repo defaults to the official │\n│ Weasel template repo, but can be customized (including using a private repo). │\n│ document Auto-generate a README.md for a project. If the content is saved to a file, hidden │\n│ markers are added so you can add custom content before or after the auto-generated │\n│ section and only the auto-generated docs will be replaced when you re-run the │\n│ command. │\n│ dvc Auto-generate Data Version Control (DVC) config. A DVC project can only define one │\n│ pipeline, so you need to specify one workflow defined in the project.yml. 
If no │\n│ workflow is specified, the first defined workflow is used. The DVC config will only │\n│ be updated if the project.yml changed. │\n│ run Run a named command or workflow defined in the project.yml. If a workflow name is │\n│ specified, all commands in the workflow are run, in order. If commands define │\n│ dependencies and/or outputs, they will only be re-run if state has changed. │\n│ pull Retrieve available precomputed outputs from a remote storage. You can alias remotes │\n│ in your project.yml by mapping them to storage paths. A storage can be anything that │\n│ the smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH, local │\n│ directories etc. │\n│ push Persist outputs to a remote storage. You can alias remotes in your project.yml by │\n│ mapping them to storage paths. A storage can be anything that the smart_open library │\n│ can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + "benchmark": " \n Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]... \n \n Commands for benchmarking pipelines. \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ accuracy Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation │\n│ data in the binary .spacy format. The --gold-preproc option sets up the │\n│ evaluation examples with gold-standard sentences and tokens for the │\n│ predictions. Gold preprocessing helps the annotations align to the │\n│ tokenization, and may result in sequences of more consistent length. However, │\n│ it may reduce runtime accuracy due to train/test skew. 
To render a sample of │\n│ dependency parses in a HTML file, set as output directory as the │\n│ displacy_path argument. │\n│ speed Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark │\n│ data in the binary .spacy format. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug": " \n Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]... \n \n Suite of helpful commands for debugging and profiling. Includes commands to check and validate \n your config files, training and evaluation data, and custom model implementations. \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ data Analyze, debug and validate your training and development data. Outputs │\n│ useful stats, and can help you find problems like invalid entity annotations, │\n│ cyclic dependencies, low data labels and more. │\n│ profile Profile which functions take the most time in a spaCy pipeline. │\n│ Input should be formatted as one JSON object per line with a key \"text\". │\n│ It can either be provided as a JSONL file, or be read from sys.sytdin. │\n│ If no input file is specified, the IMDB dataset is loaded via Thinc. │\n│ config Debug a config file and show validation errors. The command will │\n│ create all objects in the tree and validate them. Note that some config │\n│ validation errors are blocking and will prevent the rest of the config from │\n│ being resolved. This means that you may not see all validation errors at │\n│ once and some issues are only shown once previous errors have been fixed. │\n│ Similar as with the 'train' command, you can override settings from the config │\n│ as command line options. 
For instance, --training.batch_size 128 overrides │\n│ the value of \"batch_size\" in the block \"\". │\n│ diff-config Show a diff of a config file with respect to spaCy's defaults or another config │\n│ file. If │\n│ additional settings were used in the creation of the config file, then you │\n│ must supply these as extra parameters to the command when comparing to the default │\n│ settings. The generated diff │\n│ can also be used when posting to the discussion forum to provide more │\n│ information for the maintainers. │\n│ model Analyze a Thinc model implementation. Includes checks for internal structure │\n│ and activations during training. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init": " \n Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]... \n \n Commands for initializing configs and pipeline packages. \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ config Generate a starter config file for training. Based on your requirements │\n│ specified via the CLI arguments, this command generates a config with the │\n│ optimal settings for your use case. This includes the choice of architecture, │\n│ pretrained weights and related hyperparameters. │\n│ fill-config Fill partial config file with default values. Will add all missing settings │\n│ from the default config and will create all objects, check the registered │\n│ functions for their default values and update the base config. This command │\n│ can be used with a config generated via the training quickstart widget: │\n│ https://spacy.io/usage/training#quickstart │\n│ vectors Convert word vectors for use with spaCy. 
Will export an nlp object that │\n│ you can use in the block of your config to initialize │\n│ a model with vectors. │\n│ labels Generate JSON files for the labels in the data. This helps speed up the │\n│ training process, since spaCy won't have to preprocess the data to │\n│ extract the labels. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project": " \n Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]... \n \n Command-line interface for spaCy projects and templates. You'd typically start by cloning a \n project template to a local directory and fetching its assets like datasets etc. See the project's \n project.yml for the available commands. \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ assets Fetch project assets like datasets and pretrained weights. Assets are │\n│ defined in the \"assets\" section of the project.yml. If a checksum is │\n│ provided in the project.yml, the file is only downloaded if no local file │\n│ with the same checksum exists. │\n│ clone Clone a project template from a repository. Calls into \"git\" and will │\n│ only download the files from the given subdirectory. The GitHub repo │\n│ defaults to the official Weasel template repo, but can be customized │\n│ (including using a private repo). │\n│ document Auto-generate a README.md for a project. If the content is saved to a file, │\n│ hidden markers are added so you can add custom content before or after the │\n│ auto-generated section and only the auto-generated docs will be replaced │\n│ when you re-run the command. │\n│ dvc Auto-generate Data Version Control (DVC) config. 
A DVC │\n│ project can only define one pipeline, so you need to specify one workflow │\n│ defined in the project.yml. If no workflow is specified, the first defined │\n│ workflow is used. The DVC config will only be updated if the project.yml │\n│ changed. │\n│ run Run a named command or workflow defined in the project.yml. If a workflow │\n│ name is specified, all commands in the workflow are run, in order. If │\n│ commands define dependencies and/or outputs, they will only be re-run if │\n│ state has changed. │\n│ pull Retrieve available precomputed outputs from a remote storage. │\n│ You can alias remotes in your project.yml by mapping them to storage paths. │\n│ A storage can be anything that the smart_open library can upload to, e.g. │\n│ AWS, Google Cloud Storage, SSH, local directories etc. │\n│ push Persist outputs to a remote storage. You can alias remotes in your │\n│ project.yml by mapping them to storage paths. A storage can be anything that │\n│ the smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH, │\n│ local directories etc. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" }, "hidden_group_commands": { "benchmark": [], @@ -114,5 +114,5 @@ "train", "validate" ], - "root_help": " \n Usage: python -m spacy [OPTIONS] COMMAND [ARGS]... \n \n spaCy Command-line Interface \n \n DOCS: https://spacy.io/api/cli \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --install-completion Install completion for the current shell. │\n│ --show-completion Show completion for the current shell, to copy it or customize the │\n│ installation. │\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ download Download compatible trained pipeline from the default download path using pip. │\n│ If --direct flag is set, the command expects the full package name with │\n│ version. For direct downloads, the compatibility check will be skipped. All │\n│ additional arguments provided to this command will be passed to `pip install` │\n│ on package installation. │\n│ info Print info about spaCy installation. If a pipeline is specified as an argument, │\n│ print its meta information. Flag --markdown prints details in Markdown for easy │\n│ copy-pasting to GitHub issues. │\n│ apply Apply a trained pipeline to documents to get predictions. Expects a loadable │\n│ spaCy pipeline and path to the data, which can be a directory or a file. The │\n│ data files can be provided in multiple formats: 1. .spacy files 2. │\n│ .jsonl files with a specified \"field\" to read the text from. 3. Files with │\n│ any other extension are assumed to be containing a single document. │\n│ DOCS: https://spacy.io/api/cli#apply │\n│ assemble Assemble a spaCy pipeline from a config file. The config file includes all │\n│ settings for initializing the pipeline. To override settings in the config, │\n│ e.g. settings that point to local paths or that you want to experiment with, │\n│ you can override them as command line options. The --code argument lets you │\n│ pass in a Python file that can be used to register custom functions that are │\n│ referenced in the config. │\n│ convert Convert files into json or DocBin format for training. The resulting .spacy │\n│ file can be used with the train command and other experiment management │\n│ functions. │\n│ evaluate Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation │\n│ data in the binary .spacy format. 
The --gold-preproc option sets up the │\n│ evaluation examples with gold-standard sentences and tokens for the │\n│ predictions. Gold preprocessing helps the annotations align to the │\n│ tokenization, and may result in sequences of more consistent length. However, │\n│ it may reduce runtime accuracy due to train/test skew. To render a sample of │\n│ dependency parses in a HTML file, set as output directory as the displacy_path │\n│ argument. │\n│ find-function Find the module, path and line number to the file the registered function is │\n│ defined in, if available. │\n│ find-threshold Runs prediction trials for a trained model with varying thresholds to maximize │\n│ the specified metric. The search space for the threshold is traversed linearly │\n│ from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` │\n│ (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` │\n│ returns all results). │\n│ package Generate an installable Python package for a pipeline. Includes binary data, │\n│ meta and required installation files. A new directory will be created in the │\n│ specified output directory, and the data will be copied over. If --create-meta │\n│ is set and a meta.json already exists in the output directory, the existing │\n│ values will be used as the defaults in the command-line prompt. After │\n│ packaging, \"python -m build --sdist\" is run in the package directory, which │\n│ will create a .tar.gz archive that can be installed via \"pip install\". │\n│ pretrain Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using │\n│ an approximate language-modelling objective. Two objective types are available, │\n│ vector-based and character-based. │\n│ train Train or update a spaCy pipeline. Requires data in spaCy's binary format. To │\n│ convert data from other formats, use the `spacy convert` command. The config │\n│ file includes all settings and hyperparameters used during training. 
To │\n│ override settings in the config, e.g. settings that point to local paths or │\n│ that you want to experiment with, you can override them as command line │\n│ options. For instance, --training.batch_size 128 overrides the value of │\n│ \"batch_size\" in the block \"[training]\". The --code argument lets you pass in a │\n│ Python file that's imported before training. It can be used to register custom │\n│ functions and architectures that can then be referenced in the config. │\n│ validate Validate the currently installed pipeline packages and spaCy version. Checks if │\n│ the installed packages are compatible and shows upgrade instructions if │\n│ available. Should be run after `pip install -U spacy`. │\n│ debug Suite of helpful commands for debugging and profiling. Includes commands to │\n│ check and validate your config files, training and evaluation data, and custom │\n│ model implementations. │\n│ benchmark Commands for benchmarking pipelines. │\n│ init Commands for initializing configs and pipeline packages. │\n│ project Command-line interface for spaCy projects and templates. You'd typically start │\n│ by cloning a project template to a local directory and fetching its assets like │\n│ datasets etc. See the project's project.yml for the available commands. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + "root_help": " \n Usage: python -m spacy [OPTIONS] COMMAND [ARGS]... \n \n spaCy Command-line Interface \n \n DOCS: https://spacy.io/api/cli \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --install-completion Install completion for the current shell. │\n│ --show-completion Show completion for the current shell, to copy it or customize the │\n│ installation. │\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ download Download compatible trained pipeline from the default download path using │\n│ pip. If --direct flag is set, the command expects the full package name with │\n│ version. For direct downloads, the compatibility check will be skipped. All │\n│ additional arguments provided to this command will be passed to `pip install` │\n│ on package installation. │\n│ info Print info about spaCy installation. If a pipeline is specified as an argument, │\n│ print its meta information. Flag --markdown prints details in Markdown for easy │\n│ copy-pasting to GitHub issues. │\n│ apply Apply a trained pipeline to documents to get predictions. │\n│ Expects a loadable spaCy pipeline and path to the data, which │\n│ can be a directory or a file. │\n│ The data files can be provided in multiple formats: │\n│ 1. .spacy files │\n│ 2. .jsonl files with a specified \"field\" to read the text from. │\n│ 3. Files with any other extension are assumed to be containing │\n│ a single document. │\n│ DOCS: https://spacy.io/api/cli#apply │\n│ assemble Assemble a spaCy pipeline from a config file. The config file includes │\n│ all settings for initializing the pipeline. To override settings in the │\n│ config, e.g. settings that point to local paths or that you want to │\n│ experiment with, you can override them as command line options. The │\n│ --code argument lets you pass in a Python file that can be used to │\n│ register custom functions that are referenced in the config. │\n│ convert Convert files into json or DocBin format for training. The resulting .spacy │\n│ file can be used with the train command and other experiment management │\n│ functions. │\n│ evaluate Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation │\n│ data in the binary .spacy format. 
The --gold-preproc option sets up the │\n│ evaluation examples with gold-standard sentences and tokens for the │\n│ predictions. Gold preprocessing helps the annotations align to the │\n│ tokenization, and may result in sequences of more consistent length. However, │\n│ it may reduce runtime accuracy due to train/test skew. To render a sample of │\n│ dependency parses in a HTML file, set as output directory as the │\n│ displacy_path argument. │\n│ find-function Find the module, path and line number to the file the registered │\n│ function is defined in, if available. │\n│ find-threshold Runs prediction trials for a trained model with varying thresholds to maximize │\n│ the specified metric. The search space for the threshold is traversed linearly │\n│ from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` │\n│ (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` │\n│ returns all results). │\n│ package Generate an installable Python package for a pipeline. Includes binary data, │\n│ meta and required installation files. A new directory will be created in the │\n│ specified output directory, and the data will be copied over. If │\n│ --create-meta is set and a meta.json already exists in the output directory, │\n│ the existing values will be used as the defaults in the command-line prompt. │\n│ After packaging, \"python -m build --sdist\" is run in the package directory, │\n│ which will create a .tar.gz archive that can be installed via \"pip install\". │\n│ pretrain Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, │\n│ using an approximate language-modelling objective. Two objective types │\n│ are available, vector-based and character-based. │\n│ train Train or update a spaCy pipeline. Requires data in spaCy's binary format. To │\n│ convert data from other formats, use the `spacy convert` command. The │\n│ config file includes all settings and hyperparameters used during training. 
│\n│ To override settings in the config, e.g. settings that point to local │\n│ paths or that you want to experiment with, you can override them as │\n│ command line options. For instance, --training.batch_size 128 overrides │\n│ the value of \"batch_size\" in the block \"\". The --code argument │\n│ lets you pass in a Python file that's imported before training. It can be │\n│ used to register custom functions and architectures that can then be │\n│ referenced in the config. │\n│ validate Validate the currently installed pipeline packages and spaCy version. Checks │\n│ if the installed packages are compatible and shows upgrade instructions if │\n│ available. Should be run after `pip install -U spacy`. │\n│ debug Suite of helpful commands for debugging and profiling. Includes │\n│ commands to check and validate your config files, training and evaluation data, │\n│ and custom model implementations. │\n│ benchmark Commands for benchmarking pipelines. │\n│ init Commands for initializing configs and pipeline packages. │\n│ project Command-line interface for spaCy projects and templates. │\n│ You'd typically start by cloning a project template to a local directory and │\n│ fetching its assets like datasets etc. See the project's project.yml for the │\n│ available commands. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" } From 5c559fc0175dc2eaaee097c2bb888f04b72bbe23 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Mar 2026 16:12:26 +0100 Subject: [PATCH 04/42] Fix manifest --- spacy_cli/build_manifest.py | 6 +++--- spacy_cli/cli_manifest.json | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/spacy_cli/build_manifest.py b/spacy_cli/build_manifest.py index 6e019bdcb95..71982d82d77 100644 --- a/spacy_cli/build_manifest.py +++ b/spacy_cli/build_manifest.py @@ -58,7 +58,7 @@ def build_manifest() -> Dict[str, object]: group_help[name] = _get_help(runner, app, [name]) unknown_subcommand[name] = _invoke( runner, app, [name, UNKNOWN_SUBCOMMAND_TOKEN] - ).stdout + ).output for sub_name in subcommands: help_text = _maybe_get_help(runner, app, [name, sub_name]) if help_text is not None: @@ -76,8 +76,8 @@ def build_manifest() -> Dict[str, object]: "group_help": group_help, "command_help": command_help, "errors": { - "missing_command": _invoke(runner, app, []).stdout, - "unknown_command": _invoke(runner, app, [UNKNOWN_COMMAND_TOKEN]).stdout, + "missing_command": _invoke(runner, app, []).output, + "unknown_command": _invoke(runner, app, [UNKNOWN_COMMAND_TOKEN]).output, "unknown_subcommand": unknown_subcommand, }, } diff --git a/spacy_cli/cli_manifest.json b/spacy_cli/cli_manifest.json index 12d760416c5..9aa7da9b973 100644 --- a/spacy_cli/cli_manifest.json +++ b/spacy_cli/cli_manifest.json @@ -36,13 +36,13 @@ "validate": " \n Usage: python -m spacy validate [OPTIONS] \n \n Validate the currently installed pipeline packages and spaCy version. Checks if the installed \n packages are compatible and shows upgrade instructions if available. Should be run after `pip \n install -U spacy`. 
\n \n DOCS: https://spacy.io/api/cli#validate \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" }, "errors": { - "missing_command": "", - "unknown_command": "", + "missing_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ Missing command. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "unknown_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_COMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "unknown_subcommand": { - "benchmark": "", - "debug": "", - "init": "", - "project": "" + "benchmark": "Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy benchmark --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_SUBCOMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "debug": "Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy debug --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_SUBCOMMAND__'. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "init": "Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy init --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_SUBCOMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "project": "Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy project --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_SUBCOMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n" } }, "group_help": { From cfa1d3a59abd8ebb859070f97d681120ac9d07a7 Mon Sep 17 00:00:00 2001 From: Kaushik Rajan Date: Sun, 15 Mar 2026 15:59:09 +0530 Subject: [PATCH 05/42] fix: ensure memory_zone cleanup runs on exception (#13924) (#13932) Wrap yield in try/finally in StringStore.memory_zone and Vocab.memory_zone so transient state is always cleaned up, even when an exception propagates through the context manager. 
--- spacy/strings.pyx | 12 ++++++---- spacy/tests/vocab_vectors/test_memory_zone.py | 23 +++++++++++++++++++ spacy/vocab.pyx | 8 ++++--- 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 65e851cae4e..a65cdb6fc62 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -205,11 +205,13 @@ cdef class StringStore: if mem is None: mem = Pool() self.mem = mem - yield mem - for key in self._transient_keys: - map_clear(self._map.c_map, key) - self._transient_keys.clear() - self.mem = self._non_temp_mem + try: + yield mem + finally: + for key in self._transient_keys: + map_clear(self._map.c_map, key) + self._transient_keys.clear() + self.mem = self._non_temp_mem def add(self, string: str, allow_transient: Optional[bool] = None) -> int: """Add a string to the StringStore. diff --git a/spacy/tests/vocab_vectors/test_memory_zone.py b/spacy/tests/vocab_vectors/test_memory_zone.py index 910d2664eb4..f718afa2f6e 100644 --- a/spacy/tests/vocab_vectors/test_memory_zone.py +++ b/spacy/tests/vocab_vectors/test_memory_zone.py @@ -34,3 +34,26 @@ def test_memory_zone_redundant_insertion(): _ = vocab["dog"] assert "dog" in vocab assert "horse" not in vocab + + +def test_memory_zone_exception_cleanup(): + """Test that if an exception occurs inside a memory zone, the vocab + is properly cleaned up and remains usable afterward.""" + vocab = Vocab() + _ = vocab["dog"] + assert "dog" in vocab + try: + with vocab.memory_zone(): + _ = vocab["horse"] + raise ValueError("simulated error") + except ValueError: + pass + # Vocab should not be stuck in memory zone state + assert not vocab.in_memory_zone + # Pre-existing words should still work + assert "dog" in vocab + # Transient word from failed zone should be cleaned up + assert "horse" not in vocab + # Vocab should be fully usable for new operations + lex = vocab["cat"] + assert lex.text == "cat" diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 11043c17ae7..4bf80c85d8e 100644 --- 
a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -150,9 +150,11 @@ cdef class Vocab: if hasattr(self._vectors, "memory_zone"): contexts.append(stack.enter_context(self._vectors.memory_zone(mem))) self.mem = mem - yield mem - self._clear_transient_orths() - self.mem = self._non_temp_mem + try: + yield mem + finally: + self._clear_transient_orths() + self.mem = self._non_temp_mem def add_flag(self, flag_getter, int flag_id=-1): """Set a new boolean flag to words in the vocabulary. From 4168448124be73654069e77e7d48b59847d0d137 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 09:12:08 +0100 Subject: [PATCH 06/42] Update test_cli_launcher --- spacy/tests/test_cli_launcher.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/spacy/tests/test_cli_launcher.py b/spacy/tests/test_cli_launcher.py index 392572fdeec..1fd6e3a6fc6 100644 --- a/spacy/tests/test_cli_launcher.py +++ b/spacy/tests/test_cli_launcher.py @@ -4,7 +4,6 @@ import pytest -from spacy_cli.build_manifest import build_manifest from spacy_cli.static import load_manifest launcher_module = importlib.import_module("spacy_cli.main") @@ -46,7 +45,21 @@ def test_load_for_argv_imports_project_on_demand(): def test_manifest_is_current(): - assert build_manifest() == load_manifest() + # Run in a subprocess to avoid command registration order being affected + # by other test modules importing CLI submodules (which register commands + # as a side effect of import). 
+ result = subprocess.run( + [ + sys.executable, + "-c", + "from spacy_cli.build_manifest import build_manifest; " + "from spacy_cli.static import load_manifest; " + "assert build_manifest() == load_manifest()", + ], + capture_output=True, + text=True, + ) + assert result.returncode == 0, result.stderr def test_launcher_root_help_uses_static(capsys, monkeypatch): From 37b4a74fa72bfe607e5050b95c1c0b4897589b97 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 20 Mar 2026 09:14:19 +0100 Subject: [PATCH 07/42] Switch dependency back from `typer-slim` to `typer` (#13922) * change typer-slim dependency to typer * set rich_markup_mode to None to preserve behaviour --- requirements.txt | 2 +- setup.cfg | 2 +- spacy/cli/_util.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6e79ed526bd..0162c694286 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 -typer-slim>=0.3.0,<1.0.0 +typer>=0.3.0,<1.0.0 weasel>=0.4.2,<0.5.0 # Third party dependencies numpy>=2.0.0,<3.0.0 diff --git a/setup.cfg b/setup.cfg index c4928af9224..57a6d2f29db 100644 --- a/setup.cfg +++ b/setup.cfg @@ -55,7 +55,7 @@ install_requires = catalogue>=2.0.6,<2.1.0 weasel>=0.4.2,<0.5.0 # Third-party dependencies - typer-slim>=0.3.0,<1.0.0 + typer>=0.3.0,<1.0.0 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 309b6b1e79a..5057640a5b9 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -68,7 +68,7 @@ Arg = typer.Argument Opt = typer.Option -app = typer.Typer(name=NAME, help=HELP) +app = typer.Typer(name=NAME, help=HELP, rich_markup_mode=None) benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True) debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True) init_cli = 
typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True) From 2f6142b43e9f7dcb2762429924b48a3ef78556f7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 09:17:28 +0100 Subject: [PATCH 08/42] Require weasel 1.0 --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0162c694286..c850cfb77c5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<1.0.0 -weasel>=0.4.2,<0.5.0 +weasel>=1.0.0,<1.1.0 # Third party dependencies numpy>=2.0.0,<3.0.0 requests>=2.13.0,<3.0.0 diff --git a/setup.cfg b/setup.cfg index 57a6d2f29db..a3df44a6f2d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,7 +53,7 @@ install_requires = wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 - weasel>=0.4.2,<0.5.0 + weasel>=1.0.0,<1.1.0 # Third-party dependencies typer>=0.3.0,<1.0.0 tqdm>=4.38.0,<5.0.0 From c6c78d6c330c5ff4efbf113f0eac416e5031d047 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 09:20:32 +0100 Subject: [PATCH 09/42] Allow use of uv as a fallback to pip in spacy download --- spacy/cli/download.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 8104fd2d285..120616753b8 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,3 +1,4 @@ +import shutil import sys from typing import Optional, Sequence from urllib.parse import urljoin @@ -176,5 +177,19 @@ def download_model( if not download_url.startswith(about.__download_url__): raise ValueError(f"Download from {filename} rejected. 
Was it a relative path?") pip_args = list(user_pip_args) if user_pip_args is not None else [] - cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url] + cmd = _get_pip_install_cmd() + pip_args + [download_url] run_command(cmd) + + +def _get_pip_install_cmd() -> list: + if shutil.which("pip"): + return [sys.executable, "-m", "pip", "install"] + elif shutil.which("uv"): + return ["uv", "pip", "install"] + else: + msg.fail( + "No package installer found", + "spaCy requires either pip or uv to download models. " + "Please install one of them and try again.", + exits=1, + ) From ed20f79abe543ea4033be5b1d7950d2bb8fa5cf0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 09:43:20 +0100 Subject: [PATCH 10/42] Require confection --- requirements.txt | 1 + setup.cfg | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index c850cfb77c5..19b803325b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -36,3 +36,4 @@ types-setuptools>=57.0.0 black>=25.0.0 cython-lint>=0.15.0 isort>=5.0,<6.0 +confection>=1.0.0,<2.0.0 diff --git a/setup.cfg b/setup.cfg index a3df44a6f2d..56dcc343aa0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -54,6 +54,7 @@ install_requires = srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 weasel>=1.0.0,<1.1.0 + confection>=1.0.0,<1.1.0 # Third-party dependencies typer>=0.3.0,<1.0.0 tqdm>=4.38.0,<5.0.0 From f22ff91476c1052f9b71218144e7e88841b5703a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 09:55:17 +0100 Subject: [PATCH 11/42] Fix vuln scan by not calling test file requirements requirements.txt --- setup.py | 12 ++++++------ spacy/tests/package/test_requirements.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 33178662df4..6f28b7340d0 100755 --- a/setup.py +++ b/setup.py @@ -82,9 +82,9 @@ } # Files to copy into the package that are otherwise not included COPY_FILES = { - ROOT / "setup.cfg": PACKAGE_ROOT / 
"tests" / "package", - ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package", - ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package", + ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package" / "test.cfg", + ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package" / "test.toml", + ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package" / "test.txt", } @@ -173,10 +173,10 @@ def setup_package(): about = {} exec(f.read(), about) - for copy_file, target_dir in COPY_FILES.items(): + for copy_file, target_file in COPY_FILES.items(): if copy_file.exists(): - shutil.copy(str(copy_file), str(target_dir)) - print(f"Copied {copy_file} -> {target_dir}") + shutil.copyfile(str(copy_file), str(target_file)) + print(f"Copied {copy_file} -> {target_file}") include_dirs = [ numpy.get_include(), diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index ff07c5b454a..e7df02e9769 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -37,7 +37,7 @@ def test_build_dependencies(): req_dict = {} root_dir = Path(__file__).parent - req_file = root_dir / "requirements.txt" + req_file = root_dir / "test.txt" with req_file.open() as f: lines = f.readlines() for line in lines: @@ -48,7 +48,7 @@ def test_build_dependencies(): req_dict[lib] = v # check setup.cfg and compare to requirements.txt # also fails when there are missing or additional libs - setup_file = root_dir / "setup.cfg" + setup_file = root_dir / "test.cfg" with setup_file.open() as f: lines = f.readlines() @@ -73,7 +73,7 @@ def test_build_dependencies(): # check pyproject.toml and compare the versions of the libs to requirements.txt # does not fail when there are missing or additional libs - toml_file = root_dir / "pyproject.toml" + toml_file = root_dir / "test.toml" with toml_file.open() as f: lines = f.readlines() for line in lines: From 21439ec09d181229aae51639aa3049a7a7b107fa Mon Sep 17 00:00:00 2001 From: Matthew 
Honnibal Date: Fri, 20 Mar 2026 11:10:26 +0100 Subject: [PATCH 12/42] Migrate from pydantic v1 to v2, require pydantic>=2.0.0 Replace all pydantic.v1 compat imports with direct pydantic v2 imports. Migrate schemas to v2 API: ConfigDict instead of inner Config class, field_validator instead of validator, RootModel instead of __root__, model_dump() instead of dict(), model_validate() instead of parse_obj(), Annotated[str, StringConstraints()] instead of ConstrainedStr, min_length instead of min_items, populate_by_name instead of allow_population_by_field_name. --- requirements.txt | 2 +- setup.cfg | 2 +- spacy/cli/init_config.py | 2 +- spacy/matcher/phrasematcher.pyx | 2 +- .../pipeline/_edit_tree_internals/schemas.py | 32 ++-- spacy/schemas.py | 155 +++++++----------- spacy/tests/pipeline/test_initialize.py | 5 +- spacy/tests/pipeline/test_pipe_factories.py | 5 +- spacy/tests/test_misc.py | 5 +- 9 files changed, 84 insertions(+), 126 deletions(-) diff --git a/requirements.txt b/requirements.txt index 19b803325b8..e2a4a181813 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ weasel>=1.0.0,<1.1.0 numpy>=2.0.0,<3.0.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 -pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 +pydantic>=2.0.0,<3.0.0 jinja2 # Official Python utilities setuptools diff --git a/setup.cfg b/setup.cfg index 56dcc343aa0..f44d661feec 100644 --- a/setup.cfg +++ b/setup.cfg @@ -61,7 +61,7 @@ install_requires = numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" requests>=2.13.0,<3.0.0 - pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 + pydantic>=2.0.0,<3.0.0 jinja2 # Official Python utilities setuptools diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index a7fb2b5b81f..c2c26ca56ec 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -168,7 +168,7 @@ def init_config( # Filter out duplicates since tok2vec and transformer are added by template pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", 
"transformer")] defaults = RECOMMENDATIONS["__default__"] - reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).dict() + reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).model_dump() variables = { "lang": lang, "components": pipeline, diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index ccc830e35c1..a71f85f6e63 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -57,7 +57,7 @@ cdef class PhraseMatcher: attr = "ORTH" if attr == "IS_SENT_START": attr = "SENT_START" - if attr.lower() not in TokenPattern().dict(): + if attr.lower() not in TokenPattern().model_dump(): raise ValueError(Errors.E152.format(attr=attr)) self.attr = IDS.get(attr) diff --git a/spacy/pipeline/_edit_tree_internals/schemas.py b/spacy/pipeline/_edit_tree_internals/schemas.py index 89f2861ceac..4afb6b3747c 100644 --- a/spacy/pipeline/_edit_tree_internals/schemas.py +++ b/spacy/pipeline/_edit_tree_internals/schemas.py @@ -1,34 +1,36 @@ from collections import defaultdict from typing import Any, Dict, List, Union -try: - from pydantic.v1 import BaseModel, Field, ValidationError - from pydantic.v1.types import StrictBool, StrictInt, StrictStr -except ImportError: - from pydantic import BaseModel, Field, ValidationError # type: ignore - from pydantic.types import StrictBool, StrictInt, StrictStr # type: ignore +from pydantic import ( + BaseModel, + ConfigDict, + Field, + RootModel, + StrictBool, + StrictInt, + StrictStr, + ValidationError, +) class MatchNodeSchema(BaseModel): + model_config = ConfigDict(extra="forbid") + prefix_len: StrictInt = Field(..., title="Prefix length") suffix_len: StrictInt = Field(..., title="Suffix length") prefix_tree: StrictInt = Field(..., title="Prefix tree") suffix_tree: StrictInt = Field(..., title="Suffix tree") - class Config: - extra = "forbid" - class SubstNodeSchema(BaseModel): + model_config = ConfigDict(extra="forbid") + orig: Union[int, StrictStr] = Field(..., 
title="Original substring") subst: Union[int, StrictStr] = Field(..., title="Replacement substring") - class Config: - extra = "forbid" - -class EditTreeSchema(BaseModel): - __root__: Union[MatchNodeSchema, SubstNodeSchema] +class EditTreeSchema(RootModel[Union[MatchNodeSchema, SubstNodeSchema]]): + pass def validate_edit_tree(obj: Dict[str, Any]) -> List[str]: @@ -38,7 +40,7 @@ def validate_edit_tree(obj: Dict[str, Any]) -> List[str]: RETURNS (List[str]): A list of error messages, if available. """ try: - EditTreeSchema.parse_obj(obj) + EditTreeSchema.model_validate(obj) return [] except ValidationError as e: errors = e.errors() diff --git a/spacy/schemas.py b/spacy/schemas.py index fa987b90f19..b72037a0a8a 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -4,6 +4,7 @@ from enum import Enum from typing import ( TYPE_CHECKING, + Annotated, Any, Callable, Dict, @@ -16,34 +17,20 @@ Union, ) -try: - from pydantic.v1 import ( - BaseModel, - ConstrainedStr, - Field, - StrictBool, - StrictFloat, - StrictInt, - StrictStr, - ValidationError, - create_model, - validator, - ) - from pydantic.v1.main import ModelMetaclass -except ImportError: - from pydantic import ( # type: ignore - BaseModel, - ConstrainedStr, - Field, - StrictBool, - StrictFloat, - StrictInt, - StrictStr, - ValidationError, - create_model, - validator, - ) - from pydantic.main import ModelMetaclass # type: ignore +from pydantic import ( + BaseModel, + ConfigDict, + Field, + RootModel, + StrictBool, + StrictFloat, + StrictInt, + StrictStr, + StringConstraints, + ValidationError, + create_model, + field_validator, +) from thinc.api import ConfigValidationError, Model, Optimizer from thinc.config import Promise @@ -89,14 +76,8 @@ def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]: # Initialization -class ArgSchemaConfig: - extra = "forbid" - arbitrary_types_allowed = True - - -class ArgSchemaConfigExtra: - extra = "forbid" - arbitrary_types_allowed = True +_ARG_SCHEMA_CONFIG = 
ConfigDict(extra="forbid", arbitrary_types_allowed=True) +_ARG_SCHEMA_CONFIG_EXTRA = ConfigDict(extra="allow", arbitrary_types_allowed=True) def get_arg_model( @@ -105,7 +86,7 @@ def get_arg_model( exclude: Iterable[str] = tuple(), name: str = "ArgModel", strict: bool = True, -) -> ModelMetaclass: +) -> Type[BaseModel]: """Generate a pydantic model for function arguments. func (Callable): The function to generate the schema for. @@ -113,15 +94,15 @@ def get_arg_model( name (str): Name of created model class. strict (bool): Don't allow extra arguments if no variable keyword arguments are allowed on the function. - RETURNS (ModelMetaclass): A pydantic model. + RETURNS (Type[BaseModel]): A pydantic model. """ - sig_args = {} + sig_args: Dict[str, Any] = {} try: sig = inspect.signature(func) except ValueError: # Typically happens if the method is part of a Cython module without # binding=True. Here we just use an empty model that allows everything. - return create_model(name, __config__=ArgSchemaConfigExtra) # type: ignore[arg-type, return-value] + return create_model(name, __config__=_ARG_SCHEMA_CONFIG_EXTRA) # type: ignore[call-overload] has_variable = False for param in sig.parameters.values(): if param.name in exclude: @@ -141,8 +122,8 @@ def get_arg_model( default = param.default if param.default != param.empty else default_empty sig_args[param.name] = (annotation, default) is_strict = strict and not has_variable - sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra # type: ignore[assignment] - return create_model(name, **sig_args) # type: ignore[call-overload, arg-type, return-value] + config = _ARG_SCHEMA_CONFIG if is_strict else _ARG_SCHEMA_CONFIG_EXTRA + return create_model(name, __config__=config, **sig_args) # type: ignore[call-overload] def validate_init_settings( @@ -167,7 +148,7 @@ def validate_init_settings( """ schema = get_arg_model(func, exclude=exclude, name="InitArgModel") try: - return schema(**settings).dict() + return 
schema(**settings).model_dump() except ValidationError as e: block = "initialize" if not section else f"initialize.{section}" title = f"Error validating initialization settings in [{block}]" @@ -193,6 +174,8 @@ def validate_token_pattern(obj: list) -> List[str]: class TokenPatternString(BaseModel): + model_config = ConfigDict(extra="forbid", populate_by_name=True) + REGEX: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="regex") IN: Optional[List[StrictStr]] = Field(None, alias="in") NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") @@ -228,18 +211,17 @@ class TokenPatternString(BaseModel): None, alias="fuzzy9" ) - class Config: - extra = "forbid" - allow_population_by_field_name = True # allow alias and field name - - @validator("*", pre=True, each_item=True, allow_reuse=True) - def raise_for_none(cls, v): + @field_validator("*", mode="before") + @classmethod + def raise_for_none(cls, v: Any) -> Any: if v is None: raise ValueError("None / null is not allowed") return v class TokenPatternNumber(BaseModel): + model_config = ConfigDict(extra="forbid", populate_by_name=True) + REGEX: Optional[StrictStr] = Field(None, alias="regex") IN: Optional[List[StrictInt]] = Field(None, alias="in") NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in") @@ -253,26 +235,24 @@ class TokenPatternNumber(BaseModel): GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">") LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<") - class Config: - extra = "forbid" - allow_population_by_field_name = True # allow alias and field name - - @validator("*", pre=True, each_item=True, allow_reuse=True) - def raise_for_none(cls, v): + @field_validator("*", mode="before") + @classmethod + def raise_for_none(cls, v: Any) -> Any: if v is None: raise ValueError("None / null is not allowed") return v class TokenPatternOperatorSimple(str, Enum): - plus: StrictStr = StrictStr("+") - star: StrictStr = StrictStr("*") - question: StrictStr 
= StrictStr("?") - exclamation: StrictStr = StrictStr("!") + plus = "+" + star = "*" + question = "?" + exclamation = "!" -class TokenPatternOperatorMinMax(ConstrainedStr): - regex = re.compile(r"^({\d+}|{\d+,\d*}|{\d*,\d+})$") +TokenPatternOperatorMinMax = Annotated[ + str, StringConstraints(pattern=r"^({\d+}|{\d+,\d*}|{\d*,\d+})$") +] TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax] @@ -285,6 +265,12 @@ class TokenPatternOperatorMinMax(ConstrainedStr): class TokenPattern(BaseModel): + model_config = ConfigDict( + extra="forbid", + populate_by_name=True, + alias_generator=lambda value: value.upper(), + ) + orth: Optional[StringValue] = None text: Optional[StringValue] = None lower: Optional[StringValue] = None @@ -323,23 +309,18 @@ class TokenPattern(BaseModel): op: Optional[TokenPatternOperator] = None underscore: Optional[Dict[StrictStr, UnderscoreValue]] = Field(None, alias="_") - class Config: - extra = "forbid" - allow_population_by_field_name = True - alias_generator = lambda value: value.upper() - - @validator("*", pre=True, allow_reuse=True) - def raise_for_none(cls, v): + @field_validator("*", mode="before") + @classmethod + def raise_for_none(cls, v: Any) -> Any: if v is None: raise ValueError("None / null is not allowed") return v class TokenPatternSchema(BaseModel): - pattern: List[TokenPattern] = Field(..., min_items=1) + model_config = ConfigDict(extra="forbid") - class Config: - extra = "forbid" + pattern: List[TokenPattern] = Field(..., min_length=1) # Model meta @@ -376,6 +357,7 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) # fmt: off dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data") train_corpus: StrictStr = Field(..., title="Path in the config to the training data") @@ -397,12 +379,9 @@ class ConfigSchemaTraining(BaseModel): before_update: Optional[Callable[["Language", Dict[str, 
Any]], None]] = Field(..., title="Optional callback that is invoked at the start of each training step") # fmt: on - class Config: - extra = "forbid" - arbitrary_types_allowed = True - class ConfigSchemaNlp(BaseModel): + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) # fmt: off lang: StrictStr = Field(..., title="The base language to use") pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order") @@ -415,17 +394,13 @@ class ConfigSchemaNlp(BaseModel): vectors: Callable = Field(..., title="Vectors implementation") # fmt: on - class Config: - extra = "forbid" - arbitrary_types_allowed = True - class ConfigSchemaPretrainEmpty(BaseModel): - class Config: - extra = "forbid" + model_config = ConfigDict(extra="forbid") class ConfigSchemaPretrain(BaseModel): + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) # fmt: off max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for") dropout: StrictFloat = Field(..., title="Dropout rate") @@ -439,29 +414,23 @@ class ConfigSchemaPretrain(BaseModel): objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.") # fmt: on - class Config: - extra = "forbid" - arbitrary_types_allowed = True - class ConfigSchemaInit(BaseModel): + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) # fmt: off vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file") lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. 
lexeme normalization") vectors: Optional[StrictStr] = Field(..., title="Path to vectors") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") - tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize") - components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component") + tokenizer: Dict[StrictStr, Any] = Field(..., description="Arguments to be passed into Tokenizer.initialize") + components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., description="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component") before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization") after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization") # fmt: on - class Config: - extra = "forbid" - arbitrary_types_allowed = True - class ConfigSchema(BaseModel): + model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True) training: ConfigSchemaTraining nlp: ConfigSchemaNlp pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} # type: ignore[assignment] @@ -469,10 +438,6 @@ class ConfigSchema(BaseModel): corpora: Dict[str, Reader] initialize: ConfigSchemaInit - class Config: - extra = "allow" - arbitrary_types_allowed = True - CONFIG_SCHEMAS = { "nlp": ConfigSchemaNlp, diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py index 9854b391e60..546068800fa 100644 --- a/spacy/tests/pipeline/test_initialize.py +++ b/spacy/tests/pipeline/test_initialize.py @@ -1,9 +1,6 @@ import pytest -try: - from pydantic.v1 import StrictBool -except ImportError: - from pydantic import StrictBool # type: ignore +from pydantic import StrictBool from thinc.api import 
ConfigValidationError diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index b355379bfd0..f8f9ec4b10d 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -1,9 +1,6 @@ import pytest -try: - from pydantic.v1 import StrictInt, StrictStr -except ImportError: - from pydantic import StrictInt, StrictStr # type: ignore +from pydantic import StrictInt, StrictStr from thinc.api import ConfigValidationError, Linear, Model diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index d2a41ff0fed..2c26952891d 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -4,10 +4,7 @@ import pytest -try: - from pydantic.v1 import ValidationError -except ImportError: - from pydantic import ValidationError # type: ignore +from pydantic import ValidationError from thinc.api import ( Config, From d41afc2966037c15a136c8f070b2ef6aefaafc88 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 11:13:21 +0100 Subject: [PATCH 13/42] isort --- spacy/tests/pipeline/test_initialize.py | 2 -- spacy/tests/pipeline/test_pipe_factories.py | 2 -- spacy/tests/test_misc.py | 2 -- 3 files changed, 6 deletions(-) diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py index 546068800fa..6dd4114f1cd 100644 --- a/spacy/tests/pipeline/test_initialize.py +++ b/spacy/tests/pipeline/test_initialize.py @@ -1,7 +1,5 @@ import pytest - from pydantic import StrictBool - from thinc.api import ConfigValidationError from spacy.lang.en import English diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index f8f9ec4b10d..a8a6c7d136a 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -1,7 +1,5 @@ import pytest - from pydantic import StrictInt, StrictStr - from thinc.api import ConfigValidationError, Linear, Model import 
spacy diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 2c26952891d..309c57b0926 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -3,9 +3,7 @@ from pathlib import Path import pytest - from pydantic import ValidationError - from thinc.api import ( Config, ConfigValidationError, From 2afc3fd41c93af48f1f4d366cda301fb067b87c9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 11:34:12 +0100 Subject: [PATCH 14/42] Escape braces in TokenPatternOperatorMinMax regex for Rust regex engine --- spacy/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/schemas.py b/spacy/schemas.py index b72037a0a8a..079e3aa853a 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -251,7 +251,7 @@ class TokenPatternOperatorSimple(str, Enum): TokenPatternOperatorMinMax = Annotated[ - str, StringConstraints(pattern=r"^({\d+}|{\d+,\d*}|{\d*,\d+})$") + str, StringConstraints(pattern=r"^(\{\d+\}|\{\d+,\d*\}|\{\d*,\d+\})$") ] From 60a19cb800e84553298170e56f72cb6695360671 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 14:02:03 +0100 Subject: [PATCH 15/42] Revert to confection <1 and allow pydantic v1 --- requirements.txt | 4 ++-- setup.cfg | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index e2a4a181813..6e3fbdf71d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ weasel>=1.0.0,<1.1.0 numpy>=2.0.0,<3.0.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 -pydantic>=2.0.0,<3.0.0 +pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 jinja2 # Official Python utilities setuptools @@ -36,4 +36,4 @@ types-setuptools>=57.0.0 black>=25.0.0 cython-lint>=0.15.0 isort>=5.0,<6.0 -confection>=1.0.0,<2.0.0 +confection>=0.0.4,<1.0.0 diff --git a/setup.cfg b/setup.cfg index f44d661feec..cfc6209c706 100644 --- a/setup.cfg +++ b/setup.cfg @@ -54,14 +54,14 @@ install_requires = srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 weasel>=1.0.0,<1.1.0 
- confection>=1.0.0,<1.1.0 + confection>=0.0.4,<1.0.0 # Third-party dependencies typer>=0.3.0,<1.0.0 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" requests>=2.13.0,<3.0.0 - pydantic>=2.0.0,<3.0.0 + pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 jinja2 # Official Python utilities setuptools From 4f19800b4e972755cbb2f597c20fdf42acecb60c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 14:04:12 +0100 Subject: [PATCH 16/42] Revert to weasel <0.5 --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6e3fbdf71d0..895bc3f7c07 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<1.0.0 -weasel>=1.0.0,<1.1.0 +weasel>=0.4.2,<0.5.0 # Third party dependencies numpy>=2.0.0,<3.0.0 requests>=2.13.0,<3.0.0 diff --git a/setup.cfg b/setup.cfg index cfc6209c706..585c2694fe2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,7 +53,7 @@ install_requires = wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 - weasel>=1.0.0,<1.1.0 + weasel>=0.4.2,<0.5.0 confection>=0.0.4,<1.0.0 # Third-party dependencies typer>=0.3.0,<1.0.0 From 3154ede82a1bc1e1a0198233b3adcb2b9c242b5d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 14:09:37 +0100 Subject: [PATCH 17/42] Revert pydantic v2 migration, restore v1 compat imports --- spacy/cli/init_config.py | 2 +- spacy/matcher/phrasematcher.pyx | 2 +- .../pipeline/_edit_tree_internals/schemas.py | 32 ++-- spacy/schemas.py | 155 +++++++++++------- spacy/tests/pipeline/test_initialize.py | 7 +- spacy/tests/pipeline/test_pipe_factories.py | 7 +- spacy/tests/test_misc.py | 7 +- 7 files changed, 130 insertions(+), 82 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index c2c26ca56ec..a7fb2b5b81f 100644 --- a/spacy/cli/init_config.py +++ 
b/spacy/cli/init_config.py @@ -168,7 +168,7 @@ def init_config( # Filter out duplicates since tok2vec and transformer are added by template pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")] defaults = RECOMMENDATIONS["__default__"] - reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).model_dump() + reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).dict() variables = { "lang": lang, "components": pipeline, diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index a71f85f6e63..ccc830e35c1 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -57,7 +57,7 @@ cdef class PhraseMatcher: attr = "ORTH" if attr == "IS_SENT_START": attr = "SENT_START" - if attr.lower() not in TokenPattern().model_dump(): + if attr.lower() not in TokenPattern().dict(): raise ValueError(Errors.E152.format(attr=attr)) self.attr = IDS.get(attr) diff --git a/spacy/pipeline/_edit_tree_internals/schemas.py b/spacy/pipeline/_edit_tree_internals/schemas.py index 4afb6b3747c..89f2861ceac 100644 --- a/spacy/pipeline/_edit_tree_internals/schemas.py +++ b/spacy/pipeline/_edit_tree_internals/schemas.py @@ -1,36 +1,34 @@ from collections import defaultdict from typing import Any, Dict, List, Union -from pydantic import ( - BaseModel, - ConfigDict, - Field, - RootModel, - StrictBool, - StrictInt, - StrictStr, - ValidationError, -) +try: + from pydantic.v1 import BaseModel, Field, ValidationError + from pydantic.v1.types import StrictBool, StrictInt, StrictStr +except ImportError: + from pydantic import BaseModel, Field, ValidationError # type: ignore + from pydantic.types import StrictBool, StrictInt, StrictStr # type: ignore class MatchNodeSchema(BaseModel): - model_config = ConfigDict(extra="forbid") - prefix_len: StrictInt = Field(..., title="Prefix length") suffix_len: StrictInt = Field(..., title="Suffix length") prefix_tree: StrictInt = Field(..., title="Prefix tree") suffix_tree: 
StrictInt = Field(..., title="Suffix tree") + class Config: + extra = "forbid" -class SubstNodeSchema(BaseModel): - model_config = ConfigDict(extra="forbid") +class SubstNodeSchema(BaseModel): orig: Union[int, StrictStr] = Field(..., title="Original substring") subst: Union[int, StrictStr] = Field(..., title="Replacement substring") + class Config: + extra = "forbid" + -class EditTreeSchema(RootModel[Union[MatchNodeSchema, SubstNodeSchema]]): - pass +class EditTreeSchema(BaseModel): + __root__: Union[MatchNodeSchema, SubstNodeSchema] def validate_edit_tree(obj: Dict[str, Any]) -> List[str]: @@ -40,7 +38,7 @@ def validate_edit_tree(obj: Dict[str, Any]) -> List[str]: RETURNS (List[str]): A list of error messages, if available. """ try: - EditTreeSchema.model_validate(obj) + EditTreeSchema.parse_obj(obj) return [] except ValidationError as e: errors = e.errors() diff --git a/spacy/schemas.py b/spacy/schemas.py index 079e3aa853a..fa987b90f19 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -4,7 +4,6 @@ from enum import Enum from typing import ( TYPE_CHECKING, - Annotated, Any, Callable, Dict, @@ -17,20 +16,34 @@ Union, ) -from pydantic import ( - BaseModel, - ConfigDict, - Field, - RootModel, - StrictBool, - StrictFloat, - StrictInt, - StrictStr, - StringConstraints, - ValidationError, - create_model, - field_validator, -) +try: + from pydantic.v1 import ( + BaseModel, + ConstrainedStr, + Field, + StrictBool, + StrictFloat, + StrictInt, + StrictStr, + ValidationError, + create_model, + validator, + ) + from pydantic.v1.main import ModelMetaclass +except ImportError: + from pydantic import ( # type: ignore + BaseModel, + ConstrainedStr, + Field, + StrictBool, + StrictFloat, + StrictInt, + StrictStr, + ValidationError, + create_model, + validator, + ) + from pydantic.main import ModelMetaclass # type: ignore from thinc.api import ConfigValidationError, Model, Optimizer from thinc.config import Promise @@ -76,8 +89,14 @@ def validate(schema: Type[BaseModel], obj: 
Dict[str, Any]) -> List[str]: # Initialization -_ARG_SCHEMA_CONFIG = ConfigDict(extra="forbid", arbitrary_types_allowed=True) -_ARG_SCHEMA_CONFIG_EXTRA = ConfigDict(extra="allow", arbitrary_types_allowed=True) +class ArgSchemaConfig: + extra = "forbid" + arbitrary_types_allowed = True + + +class ArgSchemaConfigExtra: + extra = "forbid" + arbitrary_types_allowed = True def get_arg_model( @@ -86,7 +105,7 @@ def get_arg_model( exclude: Iterable[str] = tuple(), name: str = "ArgModel", strict: bool = True, -) -> Type[BaseModel]: +) -> ModelMetaclass: """Generate a pydantic model for function arguments. func (Callable): The function to generate the schema for. @@ -94,15 +113,15 @@ def get_arg_model( name (str): Name of created model class. strict (bool): Don't allow extra arguments if no variable keyword arguments are allowed on the function. - RETURNS (Type[BaseModel]): A pydantic model. + RETURNS (ModelMetaclass): A pydantic model. """ - sig_args: Dict[str, Any] = {} + sig_args = {} try: sig = inspect.signature(func) except ValueError: # Typically happens if the method is part of a Cython module without # binding=True. Here we just use an empty model that allows everything. 
- return create_model(name, __config__=_ARG_SCHEMA_CONFIG_EXTRA) # type: ignore[call-overload] + return create_model(name, __config__=ArgSchemaConfigExtra) # type: ignore[arg-type, return-value] has_variable = False for param in sig.parameters.values(): if param.name in exclude: @@ -122,8 +141,8 @@ def get_arg_model( default = param.default if param.default != param.empty else default_empty sig_args[param.name] = (annotation, default) is_strict = strict and not has_variable - config = _ARG_SCHEMA_CONFIG if is_strict else _ARG_SCHEMA_CONFIG_EXTRA - return create_model(name, __config__=config, **sig_args) # type: ignore[call-overload] + sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra # type: ignore[assignment] + return create_model(name, **sig_args) # type: ignore[call-overload, arg-type, return-value] def validate_init_settings( @@ -148,7 +167,7 @@ def validate_init_settings( """ schema = get_arg_model(func, exclude=exclude, name="InitArgModel") try: - return schema(**settings).model_dump() + return schema(**settings).dict() except ValidationError as e: block = "initialize" if not section else f"initialize.{section}" title = f"Error validating initialization settings in [{block}]" @@ -174,8 +193,6 @@ def validate_token_pattern(obj: list) -> List[str]: class TokenPatternString(BaseModel): - model_config = ConfigDict(extra="forbid", populate_by_name=True) - REGEX: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="regex") IN: Optional[List[StrictStr]] = Field(None, alias="in") NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") @@ -211,17 +228,18 @@ class TokenPatternString(BaseModel): None, alias="fuzzy9" ) - @field_validator("*", mode="before") - @classmethod - def raise_for_none(cls, v: Any) -> Any: + class Config: + extra = "forbid" + allow_population_by_field_name = True # allow alias and field name + + @validator("*", pre=True, each_item=True, allow_reuse=True) + def raise_for_none(cls, v): if v is 
None: raise ValueError("None / null is not allowed") return v class TokenPatternNumber(BaseModel): - model_config = ConfigDict(extra="forbid", populate_by_name=True) - REGEX: Optional[StrictStr] = Field(None, alias="regex") IN: Optional[List[StrictInt]] = Field(None, alias="in") NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in") @@ -235,24 +253,26 @@ class TokenPatternNumber(BaseModel): GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">") LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<") - @field_validator("*", mode="before") - @classmethod - def raise_for_none(cls, v: Any) -> Any: + class Config: + extra = "forbid" + allow_population_by_field_name = True # allow alias and field name + + @validator("*", pre=True, each_item=True, allow_reuse=True) + def raise_for_none(cls, v): if v is None: raise ValueError("None / null is not allowed") return v class TokenPatternOperatorSimple(str, Enum): - plus = "+" - star = "*" - question = "?" - exclamation = "!" 
+ plus: StrictStr = StrictStr("+") + star: StrictStr = StrictStr("*") + question: StrictStr = StrictStr("?") + exclamation: StrictStr = StrictStr("!") -TokenPatternOperatorMinMax = Annotated[ - str, StringConstraints(pattern=r"^(\{\d+\}|\{\d+,\d*\}|\{\d*,\d+\})$") -] +class TokenPatternOperatorMinMax(ConstrainedStr): + regex = re.compile(r"^({\d+}|{\d+,\d*}|{\d*,\d+})$") TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax] @@ -265,12 +285,6 @@ class TokenPatternOperatorSimple(str, Enum): class TokenPattern(BaseModel): - model_config = ConfigDict( - extra="forbid", - populate_by_name=True, - alias_generator=lambda value: value.upper(), - ) - orth: Optional[StringValue] = None text: Optional[StringValue] = None lower: Optional[StringValue] = None @@ -309,18 +323,23 @@ class TokenPattern(BaseModel): op: Optional[TokenPatternOperator] = None underscore: Optional[Dict[StrictStr, UnderscoreValue]] = Field(None, alias="_") - @field_validator("*", mode="before") - @classmethod - def raise_for_none(cls, v: Any) -> Any: + class Config: + extra = "forbid" + allow_population_by_field_name = True + alias_generator = lambda value: value.upper() + + @validator("*", pre=True, allow_reuse=True) + def raise_for_none(cls, v): if v is None: raise ValueError("None / null is not allowed") return v class TokenPatternSchema(BaseModel): - model_config = ConfigDict(extra="forbid") + pattern: List[TokenPattern] = Field(..., min_items=1) - pattern: List[TokenPattern] = Field(..., min_length=1) + class Config: + extra = "forbid" # Model meta @@ -357,7 +376,6 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): - model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) # fmt: off dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data") train_corpus: StrictStr = Field(..., title="Path in the config to the training data") @@ -379,9 +397,12 @@ class ConfigSchemaTraining(BaseModel): before_update: 
Optional[Callable[["Language", Dict[str, Any]], None]] = Field(..., title="Optional callback that is invoked at the start of each training step") # fmt: on + class Config: + extra = "forbid" + arbitrary_types_allowed = True + class ConfigSchemaNlp(BaseModel): - model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) # fmt: off lang: StrictStr = Field(..., title="The base language to use") pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order") @@ -394,13 +415,17 @@ class ConfigSchemaNlp(BaseModel): vectors: Callable = Field(..., title="Vectors implementation") # fmt: on + class Config: + extra = "forbid" + arbitrary_types_allowed = True + class ConfigSchemaPretrainEmpty(BaseModel): - model_config = ConfigDict(extra="forbid") + class Config: + extra = "forbid" class ConfigSchemaPretrain(BaseModel): - model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) # fmt: off max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for") dropout: StrictFloat = Field(..., title="Dropout rate") @@ -414,23 +439,29 @@ class ConfigSchemaPretrain(BaseModel): objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.") # fmt: on + class Config: + extra = "forbid" + arbitrary_types_allowed = True + class ConfigSchemaInit(BaseModel): - model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) # fmt: off vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file") lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. 
lexeme normalization") vectors: Optional[StrictStr] = Field(..., title="Path to vectors") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") - tokenizer: Dict[StrictStr, Any] = Field(..., description="Arguments to be passed into Tokenizer.initialize") - components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., description="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component") + tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize") + components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component") before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization") after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization") # fmt: on + class Config: + extra = "forbid" + arbitrary_types_allowed = True + class ConfigSchema(BaseModel): - model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True) training: ConfigSchemaTraining nlp: ConfigSchemaNlp pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} # type: ignore[assignment] @@ -438,6 +469,10 @@ class ConfigSchema(BaseModel): corpora: Dict[str, Reader] initialize: ConfigSchemaInit + class Config: + extra = "allow" + arbitrary_types_allowed = True + CONFIG_SCHEMAS = { "nlp": ConfigSchemaNlp, diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py index 6dd4114f1cd..9854b391e60 100644 --- a/spacy/tests/pipeline/test_initialize.py +++ b/spacy/tests/pipeline/test_initialize.py @@ -1,5 +1,10 @@ import pytest -from pydantic import StrictBool + +try: + from pydantic.v1 import StrictBool +except ImportError: + from pydantic import StrictBool # type: ignore + from thinc.api import 
ConfigValidationError from spacy.lang.en import English diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index a8a6c7d136a..b355379bfd0 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -1,5 +1,10 @@ import pytest -from pydantic import StrictInt, StrictStr + +try: + from pydantic.v1 import StrictInt, StrictStr +except ImportError: + from pydantic import StrictInt, StrictStr # type: ignore + from thinc.api import ConfigValidationError, Linear, Model import spacy diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 309c57b0926..d2a41ff0fed 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -3,7 +3,12 @@ from pathlib import Path import pytest -from pydantic import ValidationError + +try: + from pydantic.v1 import ValidationError +except ImportError: + from pydantic import ValidationError # type: ignore + from thinc.api import ( Config, ConfigValidationError, From d5f67dc92daeabe906158311ef483072ce10f1e0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 14:31:59 +0100 Subject: [PATCH 18/42] Update spaCy pydantic imports from v1 compat to v2 native API --- .../pipeline/_edit_tree_internals/schemas.py | 19 +-- spacy/schemas.py | 110 ++++++------------ spacy/tests/pipeline/test_initialize.py | 5 +- spacy/tests/pipeline/test_pipe_factories.py | 5 +- spacy/tests/test_misc.py | 5 +- 5 files changed, 45 insertions(+), 99 deletions(-) diff --git a/spacy/pipeline/_edit_tree_internals/schemas.py b/spacy/pipeline/_edit_tree_internals/schemas.py index 89f2861ceac..d20945dc5f1 100644 --- a/spacy/pipeline/_edit_tree_internals/schemas.py +++ b/spacy/pipeline/_edit_tree_internals/schemas.py @@ -1,12 +1,7 @@ from collections import defaultdict from typing import Any, Dict, List, Union -try: - from pydantic.v1 import BaseModel, Field, ValidationError - from pydantic.v1.types import StrictBool, StrictInt, StrictStr 
-except ImportError: - from pydantic import BaseModel, Field, ValidationError # type: ignore - from pydantic.types import StrictBool, StrictInt, StrictStr # type: ignore +from pydantic import BaseModel, ConfigDict, Field, RootModel, StrictBool, StrictInt, StrictStr, ValidationError class MatchNodeSchema(BaseModel): @@ -15,20 +10,18 @@ class MatchNodeSchema(BaseModel): prefix_tree: StrictInt = Field(..., title="Prefix tree") suffix_tree: StrictInt = Field(..., title="Suffix tree") - class Config: - extra = "forbid" + model_config = ConfigDict(extra="forbid") class SubstNodeSchema(BaseModel): orig: Union[int, StrictStr] = Field(..., title="Original substring") subst: Union[int, StrictStr] = Field(..., title="Replacement substring") - class Config: - extra = "forbid" + model_config = ConfigDict(extra="forbid") -class EditTreeSchema(BaseModel): - __root__: Union[MatchNodeSchema, SubstNodeSchema] +class EditTreeSchema(RootModel[Union[MatchNodeSchema, SubstNodeSchema]]): + pass def validate_edit_tree(obj: Dict[str, Any]) -> List[str]: @@ -38,7 +31,7 @@ def validate_edit_tree(obj: Dict[str, Any]) -> List[str]: RETURNS (List[str]): A list of error messages, if available. 
""" try: - EditTreeSchema.parse_obj(obj) + EditTreeSchema.model_validate(obj) return [] except ValidationError as e: errors = e.errors() diff --git a/spacy/schemas.py b/spacy/schemas.py index fa987b90f19..c0ecee314ee 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,5 +1,4 @@ import inspect -import re from collections import defaultdict from enum import Enum from typing import ( @@ -16,34 +15,19 @@ Union, ) -try: - from pydantic.v1 import ( - BaseModel, - ConstrainedStr, - Field, - StrictBool, - StrictFloat, - StrictInt, - StrictStr, - ValidationError, - create_model, - validator, - ) - from pydantic.v1.main import ModelMetaclass -except ImportError: - from pydantic import ( # type: ignore - BaseModel, - ConstrainedStr, - Field, - StrictBool, - StrictFloat, - StrictInt, - StrictStr, - ValidationError, - create_model, - validator, - ) - from pydantic.main import ModelMetaclass # type: ignore +from pydantic import ( + BaseModel, + ConfigDict, + Field, + StrictBool, + StrictFloat, + StrictInt, + StrictStr, + ValidationError, + constr, + create_model, + field_validator, +) from thinc.api import ConfigValidationError, Model, Optimizer from thinc.config import Promise @@ -89,14 +73,9 @@ def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]: # Initialization -class ArgSchemaConfig: - extra = "forbid" - arbitrary_types_allowed = True - +ArgSchemaConfig = ConfigDict(extra="forbid", arbitrary_types_allowed=True) -class ArgSchemaConfigExtra: - extra = "forbid" - arbitrary_types_allowed = True +ArgSchemaConfigExtra = ConfigDict(extra="forbid", arbitrary_types_allowed=True) def get_arg_model( @@ -105,7 +84,7 @@ def get_arg_model( exclude: Iterable[str] = tuple(), name: str = "ArgModel", strict: bool = True, -) -> ModelMetaclass: +) -> type[BaseModel]: """Generate a pydantic model for function arguments. func (Callable): The function to generate the schema for. @@ -113,7 +92,7 @@ def get_arg_model( name (str): Name of created model class. 
strict (bool): Don't allow extra arguments if no variable keyword arguments are allowed on the function. - RETURNS (ModelMetaclass): A pydantic model. + RETURNS (type[BaseModel]): A pydantic model. """ sig_args = {} try: @@ -167,7 +146,7 @@ def validate_init_settings( """ schema = get_arg_model(func, exclude=exclude, name="InitArgModel") try: - return schema(**settings).dict() + return schema.model_validate(settings).model_dump() except ValidationError as e: block = "initialize" if not section else f"initialize.{section}" title = f"Error validating initialization settings in [{block}]" @@ -228,11 +207,10 @@ class TokenPatternString(BaseModel): None, alias="fuzzy9" ) - class Config: - extra = "forbid" - allow_population_by_field_name = True # allow alias and field name + model_config = ConfigDict(extra="forbid", populate_by_name=True) - @validator("*", pre=True, each_item=True, allow_reuse=True) + @field_validator("*", mode="before") + @classmethod def raise_for_none(cls, v): if v is None: raise ValueError("None / null is not allowed") @@ -253,11 +231,10 @@ class TokenPatternNumber(BaseModel): GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">") LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<") - class Config: - extra = "forbid" - allow_population_by_field_name = True # allow alias and field name + model_config = ConfigDict(extra="forbid", populate_by_name=True) - @validator("*", pre=True, each_item=True, allow_reuse=True) + @field_validator("*", mode="before") + @classmethod def raise_for_none(cls, v): if v is None: raise ValueError("None / null is not allowed") @@ -271,8 +248,7 @@ class TokenPatternOperatorSimple(str, Enum): exclamation: StrictStr = StrictStr("!") -class TokenPatternOperatorMinMax(ConstrainedStr): - regex = re.compile(r"^({\d+}|{\d+,\d*}|{\d*,\d+})$") +TokenPatternOperatorMinMax = constr(pattern=r"^(\{\d+\}|\{\d+,\d*\}|\{\d*,\d+\})$") TokenPatternOperator = Union[TokenPatternOperatorSimple, 
TokenPatternOperatorMinMax] @@ -323,12 +299,10 @@ class TokenPattern(BaseModel): op: Optional[TokenPatternOperator] = None underscore: Optional[Dict[StrictStr, UnderscoreValue]] = Field(None, alias="_") - class Config: - extra = "forbid" - allow_population_by_field_name = True - alias_generator = lambda value: value.upper() + model_config = ConfigDict(extra="forbid", populate_by_name=True, alias_generator=lambda value: value.upper()) - @validator("*", pre=True, allow_reuse=True) + @field_validator("*", mode="before") + @classmethod def raise_for_none(cls, v): if v is None: raise ValueError("None / null is not allowed") @@ -336,10 +310,9 @@ def raise_for_none(cls, v): class TokenPatternSchema(BaseModel): - pattern: List[TokenPattern] = Field(..., min_items=1) + pattern: List[TokenPattern] = Field(..., min_length=1) - class Config: - extra = "forbid" + model_config = ConfigDict(extra="forbid") # Model meta @@ -397,9 +370,7 @@ class ConfigSchemaTraining(BaseModel): before_update: Optional[Callable[["Language", Dict[str, Any]], None]] = Field(..., title="Optional callback that is invoked at the start of each training step") # fmt: on - class Config: - extra = "forbid" - arbitrary_types_allowed = True + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) class ConfigSchemaNlp(BaseModel): @@ -415,14 +386,11 @@ class ConfigSchemaNlp(BaseModel): vectors: Callable = Field(..., title="Vectors implementation") # fmt: on - class Config: - extra = "forbid" - arbitrary_types_allowed = True + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) class ConfigSchemaPretrainEmpty(BaseModel): - class Config: - extra = "forbid" + model_config = ConfigDict(extra="forbid") class ConfigSchemaPretrain(BaseModel): @@ -439,9 +407,7 @@ class ConfigSchemaPretrain(BaseModel): objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.") # fmt: on - class Config: - extra = "forbid" - 
arbitrary_types_allowed = True + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) class ConfigSchemaInit(BaseModel): @@ -456,9 +422,7 @@ class ConfigSchemaInit(BaseModel): after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization") # fmt: on - class Config: - extra = "forbid" - arbitrary_types_allowed = True + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) class ConfigSchema(BaseModel): @@ -469,9 +433,7 @@ class ConfigSchema(BaseModel): corpora: Dict[str, Reader] initialize: ConfigSchemaInit - class Config: - extra = "allow" - arbitrary_types_allowed = True + model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True) CONFIG_SCHEMAS = { diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py index 9854b391e60..546068800fa 100644 --- a/spacy/tests/pipeline/test_initialize.py +++ b/spacy/tests/pipeline/test_initialize.py @@ -1,9 +1,6 @@ import pytest -try: - from pydantic.v1 import StrictBool -except ImportError: - from pydantic import StrictBool # type: ignore +from pydantic import StrictBool from thinc.api import ConfigValidationError diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index b355379bfd0..f8f9ec4b10d 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -1,9 +1,6 @@ import pytest -try: - from pydantic.v1 import StrictInt, StrictStr -except ImportError: - from pydantic import StrictInt, StrictStr # type: ignore +from pydantic import StrictInt, StrictStr from thinc.api import ConfigValidationError, Linear, Model diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index d2a41ff0fed..2c26952891d 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -4,10 +4,7 @@ import pytest -try: - from pydantic.v1 import ValidationError -except ImportError: 
- from pydantic import ValidationError # type: ignore +from pydantic import ValidationError from thinc.api import ( Config, From f835985f3f517d7a83dea6b71716a2b32870fdd1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 14:38:21 +0100 Subject: [PATCH 19/42] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index a93d91532b6..df33ff96bfe 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,5 +1,5 @@ # fmt: off __title__ = "spacy" -__version__ = "3.8.11" +__version__ = "3.8.12" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From b8bade1a570bbbc41e0d0ebf6ef5300c5d064b3d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 15:03:41 +0100 Subject: [PATCH 20/42] Update spaCy to pydantic v2 native API - Replace pydantic.v1 compat imports with direct v2 imports - Replace class Config with model_config = ConfigDict(...) 
- Replace @validator with @field_validator - Replace ConstrainedStr with constr() - Replace min_items with min_length, allow_population_by_field_name with populate_by_name - Add model_rebuild() calls in __init__.py for forward ref resolution - Update test error type assertions for v2 --- spacy/__init__.py | 21 +++++++++++++++++++++ spacy/tests/pipeline/test_initialize.py | 4 ++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 8bb8b49498e..eeab3773591 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -21,6 +21,27 @@ from .util import logger, registry # noqa: F401 from .vocab import Vocab +# Rebuild pydantic v2 schemas that use forward references to Language/Vocab +from .schemas import ( # noqa: F401 + ConfigSchema, + ConfigSchemaInit, + ConfigSchemaNlp, + ConfigSchemaPretrain, + ConfigSchemaTraining, +) + +from .training import Example # noqa: F401 + +_rebuild_ns = {"Language": Language, "Vocab": Vocab, "Example": Example} +for _schema in ( + ConfigSchemaTraining, + ConfigSchemaNlp, + ConfigSchemaPretrain, + ConfigSchemaInit, + ConfigSchema, +): + _schema.model_rebuild(_types_namespace=_rebuild_ns) + if sys.maxunicode == 65535: raise SystemError(Errors.E130) diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py index 546068800fa..acb1c6faa45 100644 --- a/spacy/tests/pipeline/test_initialize.py +++ b/spacy/tests/pipeline/test_initialize.py @@ -48,7 +48,7 @@ def initialize( errors = e.value.errors assert len(errors) == 1 assert errors[0]["loc"] == ("custom1",) - assert errors[0]["type"] == "value_error.missing" + assert errors[0]["type"] == "missing" init_cfg = { "tokenizer": {"custom": 1}, "components": {name: {"custom1": "x", "custom2": 1}}, @@ -60,7 +60,7 @@ def initialize( errors = e.value.errors assert len(errors) == 1 assert errors[0]["loc"] == ("custom2",) - assert errors[0]["type"] == "value_error.strictbool" + assert errors[0]["type"] == "bool_type" 
init_cfg = { "tokenizer": {"custom": 1}, "components": {name: {"custom1": "x"}}, From 501ccfd5fb19d708714dc7d90dd7c9a289661ef9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 20:50:56 +0100 Subject: [PATCH 21/42] Fix pydantic v2 pattern validation error counts and attributeruler type annotation - Update expected error counts in test_pattern_validation.py for pydantic v2 (v2 reports errors for all union members, increasing counts for OP and nested pattern validation) - Fix AttributeRulerPatternType to include List[MatcherPatternType] in the union (v2 is strict about nested list-of-list-of-dict types that v1 accepted laxly) --- spacy/pipeline/attributeruler.py | 2 +- .../tests/matcher/test_pattern_validation.py | 34 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index cc1e2e37a64..fd037480cb2 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -19,7 +19,7 @@ from .pipe import Pipe MatcherPatternType = List[Dict[Union[int, str], Any]] -AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]] +AttributeRulerPatternType = Dict[str, Union[List[MatcherPatternType], MatcherPatternType, Dict, int]] TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]] MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 45f9f4ee718..554522fe8a1 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -10,30 +10,30 @@ # Bad patterns flagged in all cases ([{"XX": "foo"}], 1, 1), ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2, 1), - ([{"IS_PUNCT": True, "OP": "$"}], 1, 1), + ([{"IS_PUNCT": True, "OP": "$"}], 2, 1), # v2: union reports 2 errors (enum + pattern) ([{"_": "foo"}], 1, 1), ('[{"TEXT": "foo"}, 
{"LOWER": "bar"}]', 1, 1), ([{"ENT_IOB": "foo"}], 1, 1), ([1, 2, 3], 3, 1), - ([{"TEXT": "foo", "OP": "{,}"}], 1, 1), - ([{"TEXT": "foo", "OP": "{,4}4"}], 1, 1), - ([{"TEXT": "foo", "OP": "{a,3}"}], 1, 1), - ([{"TEXT": "foo", "OP": "{a}"}], 1, 1), - ([{"TEXT": "foo", "OP": "{,a}"}], 1, 1), - ([{"TEXT": "foo", "OP": "{1,2,3}"}], 1, 1), - ([{"TEXT": "foo", "OP": "{1, 3}"}], 1, 1), - ([{"TEXT": "foo", "OP": "{-2}"}], 1, 1), + ([{"TEXT": "foo", "OP": "{,}"}], 2, 1), # v2: union reports 2 errors + ([{"TEXT": "foo", "OP": "{,4}4"}], 2, 1), # v2: union reports 2 errors + ([{"TEXT": "foo", "OP": "{a,3}"}], 2, 1), # v2: union reports 2 errors + ([{"TEXT": "foo", "OP": "{a}"}], 2, 1), # v2: union reports 2 errors + ([{"TEXT": "foo", "OP": "{,a}"}], 2, 1), # v2: union reports 2 errors + ([{"TEXT": "foo", "OP": "{1,2,3}"}], 2, 1), # v2: union reports 2 errors + ([{"TEXT": "foo", "OP": "{1, 3}"}], 2, 1), # v2: union reports 2 errors + ([{"TEXT": "foo", "OP": "{-2}"}], 2, 1), # v2: union reports 2 errors # Bad patterns flagged outside of Matcher - ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0) + ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 7, 0), # v2: more detailed union errors # Bad patterns not flagged with minimal checks - ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 0), - ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 4, 0), # prev: (2, 0) - ([{"LENGTH": {"VALUE": 5}}], 2, 0), # prev: (1, 0) - ([{"TEXT": {"VALUE": "foo"}}], 2, 0), # prev: (1, 0) + ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 5, 0), # v2: more detailed union errors + ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 5, 0), # v2: more detailed union errors + ([{"LENGTH": {"VALUE": 5}}], 3, 0), # v2: more detailed union errors + ([{"TEXT": {"VALUE": "foo"}}], 2, 0), ([{"IS_DIGIT": -1}], 1, 0), - ([{"ORTH": -1}], 1, 0), - ([{"ENT_ID": -1}], 1, 0), - ([{"ENT_KB_ID": -1}], 1, 0), + ([{"ORTH": -1}], 2, 0), # v2: union reports 2 errors + ([{"ENT_ID": 
-1}], 2, 0), # v2: union reports 2 errors + ([{"ENT_KB_ID": -1}], 2, 0), # v2: union reports 2 errors # Good patterns ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0), ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0), From fd99ed312b792931d773d3334c92b13ca8c07961 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 21 Mar 2026 08:36:11 +0100 Subject: [PATCH 22/42] Replace black, isort, and flake8 with ruff for linting and formatting - requirements.txt: remove black, isort, flake8; add ruff - pyproject.toml: replace [tool.isort] with [tool.ruff] config - setup.cfg: remove [flake8] section (rules moved to pyproject.toml) - .pre-commit-config.yaml: replace black/flake8 hooks with ruff/ruff-format --- .pre-commit-config.yaml | 16 +++++----------- pyproject.toml | 9 ++++++++- requirements.txt | 4 +--- setup.cfg | 10 ---------- 4 files changed, 14 insertions(+), 25 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e2c5e98fd97..7d57c3a0c56 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,13 +1,7 @@ repos: -- repo: https://github.com/ambv/black - rev: 22.3.0 +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.9.0 hooks: - - id: black - language_version: python3.7 - additional_dependencies: ['click==8.0.4'] -- repo: https://github.com/pycqa/flake8 - rev: 5.0.4 - hooks: - - id: flake8 - args: - - "--config=setup.cfg" + - id: ruff + args: ['--fix'] + - id: ruff-format diff --git a/pyproject.toml b/pyproject.toml index 64b71429e6e..be47f69a005 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,5 +62,12 @@ repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest [tool.cibuildwheel.pyodide] -[tool.isort] +[tool.ruff] +line-length = 88 + +[tool.ruff.lint] +select = ["E", "F", "W", "C", "B", "B9"] +ignore = ["E203", "E266", "E501", "E731", "W503", "E741", "F541"] + +[tool.ruff.lint.isort] profile = "black" diff --git a/requirements.txt b/requirements.txt index 
895bc3f7c07..866128b31b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,14 +26,12 @@ cython>=3.0,<4.0 pytest>=5.2.0,!=7.1.0 pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 -flake8>=3.8.0,<6.0.0 hypothesis>=3.27.0,<7.0.0 mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8" types-mock>=0.1.1 types-setuptools>=57.0.0 types-requests types-setuptools>=57.0.0 -black>=25.0.0 +ruff>=0.9.0 cython-lint>=0.15.0 -isort>=5.0,<6.0 confection>=0.0.4,<1.0.0 diff --git a/setup.cfg b/setup.cfg index 585c2694fe2..1ef8e303125 100644 --- a/setup.cfg +++ b/setup.cfg @@ -131,16 +131,6 @@ universal = false [sdist] formats = gztar -[flake8] -ignore = E203, E266, E501, E731, W503, E741, F541 -max-line-length = 80 -select = B,C,E,F,W,T4,B9 -exclude = - .env, - .git, - __pycache__, - _tokenizer_exceptions_list.py, - [tool:pytest] markers = slow: mark a test as slow From adeb1620ec1082a0e4ebf0d052587236457bd9b0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 21 Mar 2026 08:36:53 +0100 Subject: [PATCH 23/42] Remove W503 from ruff ignore list (not a valid ruff rule) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index be47f69a005..fe2f45eb80a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,7 +67,7 @@ line-length = 88 [tool.ruff.lint] select = ["E", "F", "W", "C", "B", "B9"] -ignore = ["E203", "E266", "E501", "E731", "W503", "E741", "F541"] +ignore = ["E203", "E266", "E501", "E731", "E741", "F541"] [tool.ruff.lint.isort] profile = "black" From a7f629bb917d0a440cf6f236bb06fd5f1e88acfb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 21 Mar 2026 08:37:22 +0100 Subject: [PATCH 24/42] Fix ruff isort config: replace unsupported profile with equivalent settings --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fe2f45eb80a..9e6cab69da3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,4 
+70,5 @@ select = ["E", "F", "W", "C", "B", "B9"] ignore = ["E203", "E266", "E501", "E731", "E741", "F541"] [tool.ruff.lint.isort] -profile = "black" +combine-as-imports = true +split-on-trailing-comma = true From 32c4b638ae0e042264fadc3ceb3ab67147c7b6f5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 21 Mar 2026 08:38:53 +0100 Subject: [PATCH 25/42] Format with ruff --- spacy/cli/apply.py | 15 +- spacy/cli/assemble.py | 23 +- spacy/cli/benchmark_speed.py | 28 ++- spacy/cli/convert.py | 52 ++++- spacy/cli/debug_config.py | 26 ++- spacy/cli/debug_data.py | 29 ++- spacy/cli/debug_diff.py | 36 +++- spacy/cli/debug_model.py | 18 +- spacy/cli/download.py | 13 +- spacy/cli/evaluate.py | 45 +++- spacy/cli/find_function.py | 4 +- spacy/cli/find_threshold.py | 38 +++- spacy/cli/info.py | 22 +- spacy/cli/init_config.py | 72 +++++-- spacy/cli/init_pipeline.py | 83 ++++++-- spacy/cli/package.py | 68 ++++-- spacy/cli/pretrain.py | 29 ++- spacy/cli/profile.py | 11 +- spacy/cli/train.py | 29 ++- spacy/lang/af/stop_words.py | 6 +- spacy/lang/am/lex_attrs.py | 2 +- spacy/lang/am/stop_words.py | 6 +- spacy/lang/ar/lex_attrs.py | 12 +- spacy/lang/ar/stop_words.py | 6 +- spacy/lang/az/stop_words.py | 6 +- spacy/lang/bg/stop_words.py | 6 +- spacy/lang/bn/stop_words.py | 6 +- spacy/lang/bo/stop_words.py | 6 +- spacy/lang/ca/stop_words.py | 6 +- spacy/lang/cs/stop_words.py | 6 +- spacy/lang/da/stop_words.py | 6 +- spacy/lang/de/stop_words.py | 6 +- spacy/lang/dsb/stop_words.py | 6 +- spacy/lang/el/stop_words.py | 6 +- spacy/lang/el/tokenizer_exceptions.py | 1 - spacy/lang/en/stop_words.py | 6 +- spacy/lang/es/lemmatizer.py | 5 +- spacy/lang/es/stop_words.py | 6 +- spacy/lang/et/stop_words.py | 6 +- spacy/lang/eu/stop_words.py | 6 +- spacy/lang/fa/lex_attrs.py | 12 +- spacy/lang/fa/stop_words.py | 6 +- spacy/lang/fi/stop_words.py | 6 +- spacy/lang/fr/lex_attrs.py | 12 +- spacy/lang/fr/stop_words.py | 6 +- spacy/lang/ga/stop_words.py | 6 +- spacy/lang/gd/stop_words.py | 6 +- 
spacy/lang/grc/stop_words.py | 6 +- spacy/lang/gu/stop_words.py | 6 +- spacy/lang/he/stop_words.py | 6 +- spacy/lang/hi/stop_words.py | 6 +- spacy/lang/hr/stop_words.py | 6 +- spacy/lang/hsb/stop_words.py | 6 +- spacy/lang/ht/lex_attrs.py | 12 +- spacy/lang/ht/stop_words.py | 6 +- spacy/lang/hu/stop_words.py | 6 +- spacy/lang/hy/stop_words.py | 6 +- spacy/lang/id/_tokenizer_exceptions_list.py | 6 +- spacy/lang/id/stop_words.py | 6 +- spacy/lang/id/tokenizer_exceptions.py | 4 +- spacy/lang/is/stop_words.py | 6 +- spacy/lang/it/stop_words.py | 6 +- spacy/lang/ja/stop_words.py | 6 +- spacy/lang/kmr/stop_words.py | 6 +- spacy/lang/kn/stop_words.py | 6 +- spacy/lang/ko/stop_words.py | 6 +- spacy/lang/ky/stop_words.py | 6 +- spacy/lang/la/stop_words.py | 6 +- spacy/lang/lb/lex_attrs.py | 12 +- spacy/lang/lb/stop_words.py | 6 +- spacy/lang/lg/stop_words.py | 6 +- spacy/lang/lij/stop_words.py | 6 +- spacy/lang/lv/stop_words.py | 6 +- spacy/lang/mk/stop_words.py | 6 +- spacy/lang/ml/stop_words.py | 6 +- spacy/lang/mr/stop_words.py | 6 +- spacy/lang/ms/_tokenizer_exceptions_list.py | 6 +- spacy/lang/ms/examples.py | 2 +- spacy/lang/ms/stop_words.py | 6 +- spacy/lang/nb/stop_words.py | 6 +- spacy/lang/ne/stop_words.py | 6 +- spacy/lang/nl/lex_attrs.py | 12 +- spacy/lang/nl/stop_words.py | 6 +- spacy/lang/pl/stop_words.py | 6 +- spacy/lang/pt/stop_words.py | 6 +- spacy/lang/ro/lex_attrs.py | 12 +- spacy/lang/ro/stop_words.py | 6 +- spacy/lang/ru/lex_attrs.py | 8 +- spacy/lang/ru/stop_words.py | 6 +- spacy/lang/sa/stop_words.py | 6 +- spacy/lang/si/stop_words.py | 6 +- spacy/lang/sk/stop_words.py | 6 +- spacy/lang/sl/lex_attrs.py | 18 +- spacy/lang/sl/stop_words.py | 6 +- spacy/lang/sq/stop_words.py | 6 +- spacy/lang/sr/stop_words.py | 6 +- spacy/lang/sv/stop_words.py | 6 +- spacy/lang/ta/stop_words.py | 6 +- spacy/lang/te/stop_words.py | 6 +- spacy/lang/th/stop_words.py | 6 +- spacy/lang/ti/stop_words.py | 6 +- spacy/lang/tl/stop_words.py | 6 +- spacy/lang/tn/stop_words.py | 6 
+- spacy/lang/tokenizer_exceptions.py | 16 +- spacy/lang/tr/stop_words.py | 6 +- spacy/lang/tt/stop_words.py | 6 +- spacy/lang/uk/stop_words.py | 6 +- spacy/lang/ur/lex_attrs.py | 4 +- spacy/lang/ur/stop_words.py | 6 +- spacy/lang/vi/stop_words.py | 6 +- spacy/lang/zh/stop_words.py | 6 +- spacy/language.py | 10 +- spacy/matcher/dependencymatcher.pyi | 6 +- spacy/matcher/matcher.pyi | 6 +- spacy/ml/models/entity_linker.py | 6 +- spacy/ml/staticvectors.py | 2 +- .../pipeline/_edit_tree_internals/schemas.py | 11 +- spacy/pipeline/attributeruler.py | 7 +- spacy/schemas.py | 6 +- spacy/tests/lang/et/test_tokenizer.py | 3 +- .../tests/matcher/test_pattern_validation.py | 24 ++- spacy/tests/package/test_requirements.py | 6 +- spacy/tests/pipeline/test_entity_linker.py | 12 +- spacy/tests/pipeline/test_sentencizer.py | 40 +++- spacy/tests/pipeline/test_tok2vec.py | 58 ++++- spacy/tests/test_cli.py | 198 ++++++++++++++++-- spacy/tests/test_factory_imports.py | 12 +- spacy/tests/test_factory_registrations.py | 6 +- spacy/tests/test_registry_population.py | 6 +- spacy/tokens/doc.pyi | 10 +- spacy/tokens/span.pyi | 8 +- spacy/tokens/span_group.pyi | 2 +- spacy/tokens/token.pyi | 8 +- spacy/training/batchers.py | 8 +- .../training/converters/conll_ner_to_docs.py | 3 +- spacy/training/converters/conllu_to_docs.py | 2 +- spacy/training/loggers.py | 2 +- spacy/ty.py | 4 +- spacy/util.py | 4 +- 139 files changed, 1279 insertions(+), 432 deletions(-) diff --git a/spacy/cli/apply.py b/spacy/cli/apply.py index ffd8105060a..7671026f488 100644 --- a/spacy/cli/apply.py +++ b/spacy/cli/apply.py @@ -22,7 +22,7 @@ out_help = "Path to save the resulting .spacy file" code_help = ( - "Path to Python file with additional " "code (registered functions) to be imported" + "Path to Python file with additional code (registered functions) to be imported" ) gold_help = "Use gold preprocessing provided in the .spacy files" force_msg = ( @@ -72,11 +72,15 @@ def apply_cli( data_path: Path = Arg(..., 
help=path_help, exists=True), output_file: Path = Arg(..., help=out_help, dir_okay=False), code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help), - text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"), - force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"), + text_key: str = Opt( + "text", "--text-key", "-tk", help="Key containing text string for JSONL" + ), + force_overwrite: bool = Opt( + False, "--force", "-F", help="Force overwriting the output file" + ), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."), batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."), - n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.") + n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use."), ): """ Apply a trained pipeline to documents to get predictions. @@ -114,8 +118,7 @@ def apply( if len(paths) == 0: docbin.to_disk(output_file) msg.warn( - "Did not find data to process," - f" {data_path} seems to be an empty directory." + f"Did not find data to process, {data_path} seems to be an empty directory." 
) return nlp = load_model(model) diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py index f74bbacb555..bc97a9d594f 100644 --- a/spacy/cli/assemble.py +++ b/spacy/cli/assemble.py @@ -24,10 +24,25 @@ def assemble_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), + output_path: Path = Arg( + ..., help="Output directory to store assembled pipeline in" + ), + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + verbose: bool = Opt( + False, + "--verbose", + "-V", + "-VV", + help="Display more information for debugging purposes", + ), # fmt: on ): """ diff --git a/spacy/cli/benchmark_speed.py b/spacy/cli/benchmark_speed.py index 4dd10049cda..052e7d43416 100644 --- a/spacy/cli/benchmark_speed.py +++ b/spacy/cli/benchmark_speed.py @@ -24,13 +24,29 @@ def benchmark_speed_cli( # fmt: off ctx: typer.Context, model: str = Arg(..., help="Model name or path"), - data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True), - batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"), + data_path: Path = Arg( + ..., help="Location of binary evaluation data in .spacy format", exists=True + ), + batch_size: Optional[int] = Opt( + None, "--batch-size", "-b", min=1, help="Override the pipeline batch size" + ), no_shuffle: bool = 
Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), - n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,), - warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + n_batches: int = Opt( + 50, + "--batches", + help="Minimum number of batches to benchmark", + min=30, + ), + warmup_epochs: int = Opt( + 3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup" + ), + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), # fmt: on ): """ @@ -151,7 +167,7 @@ def print_mean_with_ci(sample: numpy.ndarray): low = bootstrap_means[int(len(bootstrap_means) * 0.025)] high = bootstrap_means[int(len(bootstrap_means) * 0.975)] - print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})") + print(f"Mean: {mean:.1f} words/s (95% CI: {low - mean:.1f} +{high - mean:.1f})") def print_outliers(sample: numpy.ndarray): diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index a66a68133b3..140999207f3 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -48,17 +48,47 @@ class FileTypes(str, Enum): def convert_cli( # fmt: off input_path: str = Arg(..., help="Input file or directory", exists=True), - output_dir: Path = Arg("-", help="Output directory. 
'-' for stdout.", allow_dash=True, exists=True), - file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"), - n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"), - seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"), - model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"), - morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"), - merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"), - converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), - ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), - lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), - concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"), + output_dir: Path = Arg( + "-", help="Output directory. 
'-' for stdout.", allow_dash=True, exists=True + ), + file_type: FileTypes = Opt( + "spacy", "--file-type", "-t", help="Type of data to produce" + ), + n_sents: int = Opt( + 1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)" + ), + seg_sents: bool = Opt( + False, "--seg-sents", "-s", help="Segment sentences (for -c ner)" + ), + model: Optional[str] = Opt( + None, + "--model", + "--base", + "-b", + help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)", + ), + morphology: bool = Opt( + False, "--morphology", "-m", help="Enable appending morphology to tags" + ), + merge_subtokens: bool = Opt( + False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens" + ), + converter: str = Opt( + AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}" + ), + ner_map: Optional[Path] = Opt( + None, + "--ner-map", + "-nm", + help="NER tag mapping (as JSON-encoded dict of entity types)", + exists=True, + ), + lang: Optional[str] = Opt( + None, "--lang", "-l", help="Language (if tokenizer required)" + ), + concatenate: bool = Opt( + None, "--concatenate", "-C", help="Concatenate output to a single file" + ), # fmt: on ): """ diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index 0e5382cd956..f049d7fd149 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -26,10 +26,28 @@ def debug_config_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"), - show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview 
of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.") + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), + code_path: Optional[Path] = Opt( + None, + "--code-path", + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + show_funcs: bool = Opt( + False, + "--show-functions", + "-F", + help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)", + ), + show_vars: bool = Opt( + False, + "--show-variables", + "-V", + help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.", + ), # fmt: on ): """Debug a config file and show validation errors. The command will diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 1c9c0e0ea3a..df52250cf90 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -71,11 +71,28 @@ def debug_data_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"), - verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"), - no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"), + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), + code_path: Optional[Path] = Opt( + None, + "--code-path", + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + 
ignore_warnings: bool = Opt( + False, + "--ignore-warnings", + "-IW", + help="Ignore warnings, only show stats and errors", + ), + verbose: bool = Opt( + False, "--verbose", "-V", help="Print additional information and explanations" + ), + no_format: bool = Opt( + False, "--no-format", "-NF", help="Don't pretty-print the results" + ), # fmt: on ): """ @@ -708,7 +725,7 @@ def debug_data( if len(dev_not_train) != 0: pct = len(dev_not_train) / len(trees_dev) msg.info( - f"{len(dev_not_train)} lemmatizer trees ({pct*100:.1f}% of dev trees)" + f"{len(dev_not_train)} lemmatizer trees ({pct * 100:.1f}% of dev trees)" " were found exclusively in the dev data." ) else: diff --git a/spacy/cli/debug_diff.py b/spacy/cli/debug_diff.py index c53b0acab50..08c31b32df3 100644 --- a/spacy/cli/debug_diff.py +++ b/spacy/cli/debug_diff.py @@ -17,12 +17,36 @@ def debug_diff_cli( # fmt: off ctx: typer.Context, - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True), - optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."), - gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."), - pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. 
Only relevant when comparing against the default config."), - markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues") + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), + compare_to: Optional[Path] = Opt( + None, + help="Path to a config file to diff against, or `None` to compare against default settings", + exists=True, + allow_dash=True, + ), + optimize: Optimizations = Opt( + Optimizations.efficiency.value, + "--optimize", + "-o", + help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config.", + ), + gpu: bool = Opt( + False, + "--gpu", + "-G", + help="Whether the original config can run on a GPU. Only relevant when comparing against the default config.", + ), + pretraining: bool = Opt( + False, + "--pretraining", + "--pt", + help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config.", + ), + markdown: bool = Opt( + False, "--markdown", "-md", help="Generate Markdown for GitHub issues" + ), # fmt: on ): """Show a diff of a config file with respect to spaCy's defaults or another config file. 
If diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 3c667e42a2b..ec7ffe099d7 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -36,18 +36,26 @@ def debug_model_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - component: str = Arg(..., help="Name of the pipeline component of which the model should be analysed"), - layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"), + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), + component: str = Arg( + ..., help="Name of the pipeline component of which the model should be analysed" + ), + layers: str = Opt( + "", "--layers", "-l", help="Comma-separated names of layer IDs to print" + ), dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"), parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"), gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"), attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"), P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"), - P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"), + P1: bool = Opt( + False, "--print-step1", "-P1", help="Print model after initialization" + ), P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"), P3: bool = Opt(False, "--print-step3", "-P3", help="Print final predictions"), - use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), # fmt: on ): """ diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 120616753b8..8a1110dcef3 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -28,9 +28,16 @@ def download_cli( # fmt: off 
ctx: typer.Context, model: str = Arg(..., help="Name of pipeline package to download"), - direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"), - sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel"), - url: str = Opt(None, "--url", "-U", help="Download from given url") + direct: bool = Opt( + False, "--direct", "-d", "-D", help="Force direct download of name + version" + ), + sdist: bool = Opt( + False, + "--sdist", + "-S", + help="Download sdist (.tar.gz) archive instead of pre-built binary wheel", + ), + url: str = Opt(None, "--url", "-U", help="Download from given url"), # fmt: on ): """ diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 2276ca6b0d4..62131fe13ea 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -20,15 +20,42 @@ def evaluate_cli( # fmt: off model: str = Arg(..., help="Model name or path"), - data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True), - output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + data_path: Path = Arg( + ..., help="Location of binary evaluation data in .spacy format", exists=True + ), + output: Optional[Path] = Opt( + None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False + ), + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), - gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), - displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, 
file_okay=False), - displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), - per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."), - spans_key: str = Opt("sc", "--spans-key", "-sk", help="Spans key to use when evaluating Doc.spans"), + gold_preproc: bool = Opt( + False, "--gold-preproc", "-G", help="Use gold preprocessing" + ), + displacy_path: Optional[Path] = Opt( + None, + "--displacy-path", + "-dp", + help="Directory to output rendered parses as HTML", + exists=True, + file_okay=False, + ), + displacy_limit: int = Opt( + 25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML" + ), + per_component: bool = Opt( + False, + "--per-component", + "-P", + help="Return scores per component, only applicable when an output JSON file is specified.", + ), + spans_key: str = Opt( + "sc", "--spans-key", "-sk", help="Spans key to use when evaluating Doc.spans" + ), # fmt: on ): """ @@ -123,7 +150,7 @@ def evaluate( if key == "speed": results[metric] = f"{scores[key]:.0f}" else: - results[metric] = f"{scores[key]*100:.2f}" + results[metric] = f"{scores[key] * 100:.2f}" else: results[metric] = "-" data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] diff --git a/spacy/cli/find_function.py b/spacy/cli/find_function.py index f99ce2adc9f..3b3b333337b 100644 --- a/spacy/cli/find_function.py +++ b/spacy/cli/find_function.py @@ -11,7 +11,9 @@ def find_function_cli( # fmt: off func_name: str = Arg(..., help="Name of the registered function."), - registry_name: Optional[str] = Opt(None, "--registry", "-r", help="Name of the catalogue registry."), + registry_name: Optional[str] = Opt( + None, "--registry", "-r", help="Name of the catalogue registry." 
+ ), # fmt: on ): """ diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index ff7af32e6f6..d89b4c27d55 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -27,15 +27,39 @@ def find_threshold_cli( # fmt: off model: str = Arg(..., help="Model name or path"), - data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True), + data_path: Path = Arg( + ..., help="Location of binary evaluation data in .spacy format", exists=True + ), pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"), - threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"), + threshold_key: str = Arg( + ..., help="Key of threshold attribute in component's configuration" + ), scores_key: str = Arg(..., help="Metric to optimize"), - n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"), - gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), + n_trials: int = Opt( + _DEFAULTS["n_trials"], + "--n_trials", + "-n", + help="Number of trials to determine optimal thresholds", + ), + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + use_gpu: int = Opt( + _DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU" + ), + gold_preproc: bool = Opt( + _DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing" + ), + verbose: bool = Opt( + False, + "--verbose", + "-V", + "-VV", + 
help="Display more information for debugging purposes", + ), # fmt: on ): """ diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 8bfc6b54f15..ed2394c564e 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -16,10 +16,24 @@ def info_cli( # fmt: off model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"), - markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"), - silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"), - exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"), - url: bool = Opt(False, "--url", "-u", help="Print the URL to download the most recent compatible version of the pipeline"), + markdown: bool = Opt( + False, "--markdown", "-md", help="Generate Markdown for GitHub issues" + ), + silent: bool = Opt( + False, "--silent", "-s", "-S", help="Don't print anything (just return)" + ), + exclude: str = Opt( + "labels", + "--exclude", + "-e", + help="Comma-separated keys to exclude from the print-out", + ), + url: bool = Opt( + False, + "--url", + "-u", + help="Print the URL to download the most recent compatible version of the pipeline", + ), # fmt: on ): """ diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index a7fb2b5b81f..2cb39056d5e 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -49,13 +49,44 @@ class InitValues: @init_cli.command("config") def init_config_cli( # fmt: off - output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), - lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"), - pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"), - optimize: Optimizations = 
Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."), - gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."), - pretraining: bool = Opt(InitValues.pretraining, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), - force_overwrite: bool = Opt(InitValues.force_overwrite, "--force", "-F", help="Force overwriting the output file"), + output_file: Path = Arg( + ..., + help="File to save the config to or - for stdout (will only output config and no additional logging info)", + allow_dash=True, + ), + lang: str = Opt( + InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use" + ), + pipeline: str = Opt( + ",".join(InitValues.pipeline), + "--pipeline", + "-p", + help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')", + ), + optimize: Optimizations = Opt( + InitValues.optimize, + "--optimize", + "-o", + help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters.", + ), + gpu: bool = Opt( + InitValues.gpu, + "--gpu", + "-G", + help="Whether the model can run on GPU. 
This will impact the choice of architecture, pretrained weights and related hyperparameters.", + ), + pretraining: bool = Opt( + InitValues.pretraining, + "--pretraining", + "-pt", + help="Include config for pretraining (with 'spacy pretrain')", + ), + force_overwrite: bool = Opt( + InitValues.force_overwrite, + "--force", + "-F", + help="Force overwriting the output file", + ), # fmt: on ): """ @@ -88,11 +119,28 @@ def init_config_cli( @init_cli.command("fill-config") def init_fill_config_cli( # fmt: off - base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False), - output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True), - pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), - diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"), - code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + base_path: Path = Arg( + ..., help="Path to base config to fill", exists=True, dir_okay=False + ), + output_file: Path = Arg( + "-", help="Path to output .cfg file (or - for stdout)", allow_dash=True + ), + pretraining: bool = Opt( + False, + "--pretraining", + "-pt", + help="Include config for pretraining (with 'spacy pretrain')", + ), + diff: bool = Opt( + False, "--diff", "-D", help="Print a visual diff highlighting the changes" + ), + code_path: Optional[Path] = Opt( + None, + "--code-path", + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), # fmt: on ): """ diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 21eea8edf2f..1c0ff526235 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -26,13 +26,42 @@ def init_vectors_cli( lang: str = Arg(..., help="The language of the nlp object to 
create"), vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True), output_dir: Path = Arg(..., help="Pipeline output directory"), - prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), - truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), + prune: int = Opt( + -1, "--prune", "-p", help="Optional number of vectors to prune to" + ), + truncate: int = Opt( + 0, + "--truncate", + "-t", + help="Optional number of vectors to truncate to when reading in vectors file", + ), mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"), - name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True), - attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"), + name: Optional[str] = Opt( + None, + "--name", + "-n", + help="Optional name for the word vectors, e.g. en_core_web_lg.vectors", + ), + verbose: bool = Opt( + False, + "--verbose", + "-V", + "-VV", + help="Display more information for debugging purposes", + ), + jsonl_loc: Optional[Path] = Opt( + None, + "--lexemes-jsonl", + "-j", + help="Location of JSONL-formatted attributes file", + hidden=True, + ), + attr: str = Opt( + "ORTH", + "--attr", + "-a", + help="Optional token attribute to use for vectors, e.g. LOWER or NORM", + ), # fmt: on ): """Convert word vectors for use with spaCy. 
Will export an nlp object that @@ -81,11 +110,24 @@ def update_lexemes(nlp: Language, jsonl_loc: Path) -> None: def init_pipeline_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), output_path: Path = Arg(..., help="Output directory for the prepared data"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + verbose: bool = Opt( + False, + "--verbose", + "-V", + "-VV", + help="Display more information for debugging purposes", + ), + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), # fmt: on ): if verbose: @@ -108,11 +150,24 @@ def init_pipeline_cli( def init_labels_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), output_path: Path = Arg(..., help="Output directory for the labels"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to 
Python file with additional code (registered functions) to be imported", + ), + verbose: bool = Opt( + False, + "--verbose", + "-V", + "-VV", + help="Display more information for debugging purposes", + ), + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), # fmt: on ): """Generate JSON files for the labels in the data. This helps speed up the diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 67b1d318651..9291aae2827 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -21,16 +21,56 @@ @app.command("package") def package_cli( # fmt: off - input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False), - output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), - code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"), - meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False), - create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"), - name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"), - version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"), - build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."), - force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"), - require_parent: bool = Opt(True, "--require-parent/--no-require-parent", "-R", "-R", help="Include the parent package (e.g. 
spacy) in the requirements"), + input_dir: Path = Arg( + ..., help="Directory with pipeline data", exists=True, file_okay=False + ), + output_dir: Path = Arg( + ..., help="Output parent directory", exists=True, file_okay=False + ), + code_paths: str = Opt( + "", + "--code", + "-c", + help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package", + ), + meta_path: Optional[Path] = Opt( + None, + "--meta-path", + "--meta", + "-m", + help="Path to meta.json", + exists=True, + dir_okay=False, + ), + create_meta: bool = Opt( + False, "--create-meta", "-C", help="Create meta.json, even if one exists" + ), + name: Optional[str] = Opt( + None, "--name", "-n", help="Package name to override meta" + ), + version: Optional[str] = Opt( + None, "--version", "-v", help="Package version to override meta" + ), + build: str = Opt( + "sdist", + "--build", + "-b", + help="Comma-separated formats to build: sdist and/or wheel, or none.", + ), + force: bool = Opt( + False, + "--force", + "-f", + "-F", + help="Force overwriting existing data in output directory", + ), + require_parent: bool = Opt( + True, + "--require-parent/--no-require-parent", + "-R", + "-R", + help="Include the parent package (e.g. 
spacy) in the requirements", + ), # fmt: on ): """ @@ -410,7 +450,7 @@ def generate_readme(meta: Dict[str, Any]) -> str: pipeline = ", ".join([md.code(p) for p in meta.get("pipeline", [])]) components = ", ".join([md.code(p) for p in meta.get("components", [])]) vecs = meta.get("vectors", {}) - vectors = f"{vecs.get('keys', 0)} keys, {vecs.get('vectors', 0)} unique vectors ({ vecs.get('width', 0)} dimensions)" + vectors = f"{vecs.get('keys', 0)} keys, {vecs.get('vectors', 0)} unique vectors ({vecs.get('width', 0)} dimensions)" author = meta.get("author") or "n/a" notes = meta.get("notes", "") license_name = meta.get("license") @@ -469,7 +509,7 @@ def _format_accuracy(data: Dict[str, Any], exclude: List[str] = ["speed"]) -> st md = MarkdownRenderer() scalars = [(k, v) for k, v in data.items() if isinstance(v, (int, float))] scores = [ - (md.code(acc.upper()), f"{score*100:.2f}") + (md.code(acc.upper()), f"{score * 100:.2f}") for acc, score in scalars if acc not in exclude ] @@ -488,9 +528,7 @@ def _format_label_scheme(data: Dict[str, Any]) -> str: if not labels: continue col1 = md.bold(md.code(pipe)) - col2 = ", ".join( - [md.code(str(label).replace("|", "\\|")) for label in labels] - ) # noqa: W605 + col2 = ", ".join([md.code(str(label).replace("|", "\\|")) for label in labels]) # noqa: W605 label_data.append((col1, col2)) n_labels += len(labels) n_pipes += 1 diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 446c40510df..daea861a952 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -25,13 +25,32 @@ def pretrain_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True), + config_path: Path = Arg( + ..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True + ), output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"), - code_path: Optional[Path] = Opt(None, 
"--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), - epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + resume_path: Optional[Path] = Opt( + None, + "--resume-path", + "-r", + help="Path to pretrained weights from which to resume pretraining", + ), + epoch_resume: Optional[int] = Opt( + None, + "--epoch-resume", + "-er", + help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files.", + ), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), - skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"), + skip_last: bool = Opt( + False, "--skip-last", "-L", help="Skip saving model-last.bin" + ), # fmt: on ): """ diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index e5b8f11939f..03f7127149e 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -21,8 +21,15 @@ def profile_cli( # fmt: off ctx: typer.Context, # This is only used to read current calling context model: str = Arg(..., help="Trained pipeline to load"), - inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True), - n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"), + inputs: Optional[Path] = Arg( + None, + help="Location of input file. 
'-' for stdin.", + exists=True, + allow_dash=True, + ), + n_texts: int = Opt( + 10000, "--n-texts", "-n", help="Maximum number of texts to use if available" + ), # fmt: on ): """ diff --git a/spacy/cli/train.py b/spacy/cli/train.py index c72e13b2681..379268286ee 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -26,11 +26,30 @@ def train_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), + output_path: Optional[Path] = Opt( + None, + "--output", + "--output-path", + "-o", + help="Output directory to store trained pipeline in", + ), + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + verbose: bool = Opt( + False, + "--verbose", + "-V", + "-VV", + help="Display more information for debugging purposes", + ), + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), # fmt: on ): """ diff --git a/spacy/lang/af/stop_words.py b/spacy/lang/af/stop_words.py index 337afb57f8c..4b5a04a5eca 100644 --- a/spacy/lang/af/stop_words.py +++ b/spacy/lang/af/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/stopwords-iso/stopwords-af -STOP_WORDS = set(""" +STOP_WORDS = set( + """ 'n aan af @@ -52,4 +53,5 @@ was wat ʼn -""".split()) +""".split() +) diff --git 
a/spacy/lang/am/lex_attrs.py b/spacy/lang/am/lex_attrs.py index 9e111b8d5eb..c7b2aab35bf 100644 --- a/spacy/lang/am/lex_attrs.py +++ b/spacy/lang/am/lex_attrs.py @@ -60,7 +60,7 @@ "አስራ ስምንተኛ", "አስራ ዘጠነኛ", "ሃያኛ", - "ሰላሳኛ" "አርባኛ", + "ሰላሳኛአርባኛ", "አምሳኛ", "ስድሳኛ", "ሰባኛ", diff --git a/spacy/lang/am/stop_words.py b/spacy/lang/am/stop_words.py index 8a04c555f74..5487ada5aeb 100644 --- a/spacy/lang/am/stop_words.py +++ b/spacy/lang/am/stop_words.py @@ -1,7 +1,8 @@ # Stop words by Teshome Kassie http://etd.aau.edu.et/bitstream/handle/123456789/3315/Teshome%20Kassie.pdf?sequence=1&isAllowed=y # Stop words by Tihitina Petros http://etd.aau.edu.et/bitstream/handle/123456789/3384/Tihitina%20Petros.pdf?sequence=1&isAllowed=y -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ግን አንቺ አንተ እናንተ ያንተ ያንቺ የናንተ ራስህን ራስሽን ራሳችሁን ሁሉ ኋላ በሰሞኑ አሉ በኋላ ሁኔታ በኩል አስታውቀዋል ሆነ በውስጥ አስታውሰዋል ሆኑ ባጣም እስካሁን ሆኖም በተለይ አሳሰበ ሁል በተመለከተ @@ -28,4 +29,5 @@ በዚህም መሆን ምንጊዜም እነዚህም በዚህና ያለ ስም ሲኖር ከዚህም መሆኑን በሁኔታው የማያንስ እነዚህኑ ማንም ከነዚሁ ያላቸውን እጅግ ሲሆኑ ለሆኑ ሊሆን ለማናቸውም -""".split()) +""".split() +) diff --git a/spacy/lang/ar/lex_attrs.py b/spacy/lang/ar/lex_attrs.py index 6e943d064ee..54ad7a8c363 100644 --- a/spacy/lang/ar/lex_attrs.py +++ b/spacy/lang/ar/lex_attrs.py @@ -1,6 +1,7 @@ from ...attrs import LIKE_NUM -_num_words = set(""" +_num_words = set( + """ صفر واحد إثنان @@ -50,9 +51,11 @@ مليون مليار مليارات -""".split()) +""".split() +) -_ordinal_words = set(""" +_ordinal_words = set( + """ اول أول حاد @@ -67,7 +70,8 @@ ثامن تاسع عاشر -""".split()) +""".split() +) def like_num(text): diff --git a/spacy/lang/ar/stop_words.py b/spacy/lang/ar/stop_words.py index 65c8992cbd6..f4da54dda29 100644 --- a/spacy/lang/ar/stop_words.py +++ b/spacy/lang/ar/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ من نحو لعل @@ -385,4 +386,5 @@ وإن ولو يا -""".split()) +""".split() +) diff --git a/spacy/lang/az/stop_words.py b/spacy/lang/az/stop_words.py index 8beffa998da..2114939ba11 100644 --- 
a/spacy/lang/az/stop_words.py +++ b/spacy/lang/az/stop_words.py @@ -1,5 +1,6 @@ # Source: https://github.com/eliasdabbas/advertools/blob/master/advertools/stopwords.py -STOP_WORDS = set(""" +STOP_WORDS = set( + """ amma arasında artıq @@ -140,4 +141,5 @@ əlbəttə ən əslində -""".split()) +""".split() +) diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py index 7d3e756054d..061850da594 100644 --- a/spacy/lang/bg/stop_words.py +++ b/spacy/lang/bg/stop_words.py @@ -4,7 +4,8 @@ https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it. """ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ а автентичен аз ако ала бе без беше би бивш бивша бившо бивши бил била били било благодаря близо бъдат @@ -75,4 +76,5 @@ юмрук я як -""".split()) +""".split() +) diff --git a/spacy/lang/bn/stop_words.py b/spacy/lang/bn/stop_words.py index 5aec18b7f5b..bf38e32545e 100644 --- a/spacy/lang/bn/stop_words.py +++ b/spacy/lang/bn/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে আগামী আগে আগেই আছে আজ আদ্যভাগে আপনার আপনি আবার আমরা আমাকে আমাদের আমার আমি আর আরও ইত্যাদি ইহা @@ -37,4 +38,5 @@ সাধারণ সামনে সঙ্গে সঙ্গেও সব সবার সমস্ত সম্প্রতি সময় সহ সহিত সাথে সুতরাং সে সেই সেখান সেখানে সেটা সেটাই সেটাও সেটি স্পষ্ট স্বয়ং হইতে হইবে হইয়া হওয়া হওয়ায় হওয়ার হচ্ছে হত হতে হতেই হন হবে হবেন হয় হয়তো হয়নি হয়ে হয়েই হয়েছিল হয়েছে হাজার হয়েছেন হল হলে হলেই হলেও হলো হিসাবে হিসেবে হৈলে হোক হয় হয়ে হয়েছে হৈতে হইয়া হয়েছিল হয়েছেন হয়নি হয়েই হয়তো হওয়া হওয়ার হওয়ায় -""".split()) +""".split() +) diff --git a/spacy/lang/bo/stop_words.py b/spacy/lang/bo/stop_words.py index 158e148b00b..407242c849b 100644 --- a/spacy/lang/bo/stop_words.py +++ b/spacy/lang/bo/stop_words.py @@ -1,6 +1,7 @@ # Source: https://zenodo.org/records/10148636 -STOP_WORDS = set(""" +STOP_WORDS = set( + """ འི་ ། དུ་ @@ -193,4 +194,5 @@ གིང་ ཚ་ ཀྱང -""".split()) 
+""".split() +) diff --git a/spacy/lang/ca/stop_words.py b/spacy/lang/ca/stop_words.py index 90cce5de885..1a87b2f9dbe 100644 --- a/spacy/lang/ca/stop_words.py +++ b/spacy/lang/ca/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò als altra altre altres amb ambdues ambdós anar ans apa aquell aquella aquelles aquells aquest aquesta aquestes aquests aquí @@ -47,4 +48,5 @@ va vaig vam van vas veu vosaltres vostra vostre vostres -""".split()) +""".split() +) diff --git a/spacy/lang/cs/stop_words.py b/spacy/lang/cs/stop_words.py index 35db9fedc86..f61f424f6f4 100644 --- a/spacy/lang/cs/stop_words.py +++ b/spacy/lang/cs/stop_words.py @@ -1,7 +1,8 @@ # Source: https://github.com/Alir3z4/stop-words # Source: https://github.com/stopwords-iso/stopwords-cs/blob/master/stopwords-cs.txt -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a aby ahoj @@ -360,4 +361,5 @@ zatímco ze že -""".split()) +""".split() +) diff --git a/spacy/lang/da/stop_words.py b/spacy/lang/da/stop_words.py index 0e71dfde739..05b2084dde3 100644 --- a/spacy/lang/da/stop_words.py +++ b/spacy/lang/da/stop_words.py @@ -1,6 +1,7 @@ # Source: Handpicked by Jens Dahl Møllerhøj. 
-STOP_WORDS = set(""" +STOP_WORDS = set( + """ af aldrig alene alle allerede alligevel alt altid anden andet andre at bag begge blandt blev blive bliver burde bør @@ -40,4 +41,5 @@ var ved vi via vil ville vore vores vær være været øvrigt -""".split()) +""".split() +) diff --git a/spacy/lang/de/stop_words.py b/spacy/lang/de/stop_words.py index 5fbd7428757..f52687eb9b3 100644 --- a/spacy/lang/de/stop_words.py +++ b/spacy/lang/de/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ á a ab aber ach acht achte achten achter achtes ag alle allein allem allen aller allerdings alles allgemeinen als also am an andere anderen anderem andern anders auch auf aus ausser außer ausserdem außerdem @@ -73,4 +74,5 @@ zehn zehnte zehnten zehnter zehntes zeit zu zuerst zugleich zum zunächst zur zurück zusammen zwanzig zwar zwei zweite zweiten zweiter zweites zwischen -""".split()) +""".split() +) diff --git a/spacy/lang/dsb/stop_words.py b/spacy/lang/dsb/stop_words.py index 90735a6236a..376e04aa6e5 100644 --- a/spacy/lang/dsb/stop_words.py +++ b/spacy/lang/dsb/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a abo aby ako ale až daniž dokulaž @@ -10,4 +11,5 @@ pak pótom teke togodla -""".split()) +""".split() +) diff --git a/spacy/lang/el/stop_words.py b/spacy/lang/el/stop_words.py index b5c1c36c41f..7c436219fa9 100644 --- a/spacy/lang/el/stop_words.py +++ b/spacy/lang/el/stop_words.py @@ -1,6 +1,7 @@ # Stop words # Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0 -STOP_WORDS = set(""" +STOP_WORDS = set( + """ αδιάκοπα αι ακόμα ακόμη ακριβώς άλλα αλλά αλλαχού άλλες άλλη άλλην άλλης αλλιώς αλλιώτικα άλλο άλλοι αλλοιώς αλλοιώτικα άλλον άλλος άλλοτε αλλού άλλους άλλων άμα άμεσα αμέσως αν ανά ανάμεσα αναμεταξύ άνευ αντί αντίπερα αντίς @@ -82,4 +83,5 @@ χωρίς χωριστά ω ως ωσάν ωσότου ώσπου ώστε ωστόσο ωχ -""".split()) +""".split() +) diff --git a/spacy/lang/el/tokenizer_exceptions.py 
b/spacy/lang/el/tokenizer_exceptions.py index 41317ba9770..d88d4837e2a 100644 --- a/spacy/lang/el/tokenizer_exceptions.py +++ b/spacy/lang/el/tokenizer_exceptions.py @@ -128,7 +128,6 @@ _exc.update(_other_exc) for h in range(1, 12 + 1): - for period in ["π.μ.", "πμ"]: _exc[f"{h}{period}"] = [ {ORTH: f"{h}"}, diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py index cbce281b491..1ca5cbc1670 100644 --- a/spacy/lang/en/stop_words.py +++ b/spacy/lang/en/stop_words.py @@ -1,5 +1,6 @@ # Stop words -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a about above across after afterwards again against all almost alone along already also although always am among amongst amount an and another any anyhow anyone anything anyway anywhere are around as at @@ -61,7 +62,8 @@ whither who whoever whole whom whose why will with within without would yet you your yours yourself yourselves -""".split()) +""".split() +) contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"] STOP_WORDS.update(contractions) diff --git a/spacy/lang/es/lemmatizer.py b/spacy/lang/es/lemmatizer.py index ee5d38e8466..05238a75b70 100644 --- a/spacy/lang/es/lemmatizer.py +++ b/spacy/lang/es/lemmatizer.py @@ -415,7 +415,10 @@ def lemmatize_verb_pron( else: rule = self.select_rule("verb", features) verb_lemma = self.lemmatize_verb( - verb, features - {"PronType=Prs"}, rule, index # type: ignore[operator] + verb, + features - {"PronType=Prs"}, + rule, + index, # type: ignore[operator] )[0] pron_lemmas = [] for pron in prons: diff --git a/spacy/lang/es/stop_words.py b/spacy/lang/es/stop_words.py index 5099359e843..6d28854810a 100644 --- a/spacy/lang/es/stop_words.py +++ b/spacy/lang/es/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a acuerdo adelante ademas además afirmó agregó ahi ahora ahí al algo alguna algunas alguno algunos algún alli allí alrededor ambos ante anterior antes apenas aproximadamente aquel aquella aquellas aquello aquellos aqui aquél @@ -75,4 +76,5 
@@ vosotras vosotros voy vuestra vuestras vuestro vuestros y ya yo -""".split()) +""".split() +) diff --git a/spacy/lang/et/stop_words.py b/spacy/lang/et/stop_words.py index 248bcb61f08..e1da1f14d5e 100644 --- a/spacy/lang/et/stop_words.py +++ b/spacy/lang/et/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/stopwords-iso/stopwords-et -STOP_WORDS = set(""" +STOP_WORDS = set( + """ aga ei et @@ -36,4 +37,5 @@ ta te ära -""".split()) +""".split() +) diff --git a/spacy/lang/eu/stop_words.py b/spacy/lang/eu/stop_words.py index 4a6661e7d20..d213b5b81a5 100644 --- a/spacy/lang/eu/stop_words.py +++ b/spacy/lang/eu/stop_words.py @@ -1,7 +1,8 @@ # Source: https://github.com/stopwords-iso/stopwords-eu # https://www.ranks.nl/stopwords/basque # https://www.mustgo.com/worldlanguages/basque/ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ al anitz arabera @@ -100,4 +101,5 @@ zuek zuen zuten -""".split()) +""".split() +) diff --git a/spacy/lang/fa/lex_attrs.py b/spacy/lang/fa/lex_attrs.py index 9b0ff546e0d..065e81bd6af 100644 --- a/spacy/lang/fa/lex_attrs.py +++ b/spacy/lang/fa/lex_attrs.py @@ -5,7 +5,8 @@ YE_NUN = "ین" -_num_words = set(""" +_num_words = set( + """ صفر یک دو @@ -62,12 +63,15 @@ کوادریلیون کادریلیارد کوینتیلیون -""".split()) +""".split() +) -_ordinal_words = set(""" +_ordinal_words = set( + """ اول سوم -سی‌ام""".split()) +سی‌ام""".split() +) _ordinal_words.update({num + MIM for num in _num_words}) _ordinal_words.update({num + ZWNJ_O_MIM for num in _num_words}) diff --git a/spacy/lang/fa/stop_words.py b/spacy/lang/fa/stop_words.py index 93738c89263..f462f2e7a5d 100644 --- a/spacy/lang/fa/stop_words.py +++ b/spacy/lang/fa/stop_words.py @@ -1,5 +1,6 @@ # Stop words from HAZM package -STOP_WORDS = set(""" +STOP_WORDS = set( + """ و در به @@ -388,4 +389,5 @@ لذا زاده گردد -اینجا""".split()) +اینجا""".split() +) diff --git a/spacy/lang/fi/stop_words.py b/spacy/lang/fi/stop_words.py index 742cacc2689..8e8dcfa565d 100644 --- a/spacy/lang/fi/stop_words.py +++ 
b/spacy/lang/fi/stop_words.py @@ -1,6 +1,7 @@ # Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt # Reformatted with some minor corrections -STOP_WORDS = set(""" +STOP_WORDS = set( + """ aiemmin aika aikaa aikaan aikaisemmin aikaisin aikana aikoina aikoo aikovat aina ainakaan ainakin ainoa ainoat aiomme aion aiotte aivan ajan alas alemmas alkuisin alkuun alla alle aloitamme aloitan aloitat aloitatte aloitattivat @@ -105,4 +106,5 @@ ympäri älköön älä -""".split()) +""".split() +) diff --git a/spacy/lang/fr/lex_attrs.py b/spacy/lang/fr/lex_attrs.py index 8a9dfb82a8b..9cf508a07b9 100644 --- a/spacy/lang/fr/lex_attrs.py +++ b/spacy/lang/fr/lex_attrs.py @@ -1,20 +1,24 @@ from ...attrs import LIKE_NUM -_num_words = set(""" +_num_words = set( + """ zero un une deux trois quatre cinq six sept huit neuf dix onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf vingt trente quarante cinquante soixante soixante-dix septante quatre-vingt huitante quatre-vingt-dix nonante cent mille mil million milliard billion quadrillion quintillion sextillion septillion octillion nonillion decillion -""".split()) +""".split() +) -_ordinal_words = set(""" +_ordinal_words = set( + """ premier première deuxième second seconde troisième quatrième cinquième sixième septième huitième neuvième dixième onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième vingtième trentième quarantième cinquantième soixantième soixante-dixième septantième quatre-vingtième huitantième quatre-vingt-dixième nonantième centième millième millionnième milliardième billionnième quadrillionnième quintillionnième sextillionnième septillionnième octillionnième nonillionnième decillionnième -""".split()) +""".split() +) def like_num(text): diff --git a/spacy/lang/fr/stop_words.py b/spacy/lang/fr/stop_words.py index 85ffe47baef..b32ee3d7173 100644 --- a/spacy/lang/fr/stop_words.py +++ b/spacy/lang/fr/stop_words.py @@ -1,4 +1,5 @@ 
-STOP_WORDS = set(""" +STOP_WORDS = set( + """ a à â abord afin ah ai aie ainsi ait allaient allons alors anterieur anterieure anterieures antérieur antérieure antérieures apres après as assez attendu au @@ -79,4 +80,5 @@ y -""".split()) +""".split() +) diff --git a/spacy/lang/ga/stop_words.py b/spacy/lang/ga/stop_words.py index e32ad6431f6..4ef052ca58a 100644 --- a/spacy/lang/ga/stop_words.py +++ b/spacy/lang/ga/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a ach ag agus an aon ar arna as ba beirt bhúr @@ -38,4 +39,5 @@ í ó ón óna ónár -""".split()) +""".split() +) diff --git a/spacy/lang/gd/stop_words.py b/spacy/lang/gd/stop_words.py index 6f2c2856bec..d5132c35e31 100644 --- a/spacy/lang/gd/stop_words.py +++ b/spacy/lang/gd/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ 'ad 'ar 'd # iad @@ -381,4 +382,5 @@ ì ò ó -""".split("\n")) +""".split("\n") +) diff --git a/spacy/lang/grc/stop_words.py b/spacy/lang/grc/stop_words.py index 51f5e9d9dac..cbb766a8ce1 100644 --- a/spacy/lang/grc/stop_words.py +++ b/spacy/lang/grc/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ αὐτῷ αὐτοῦ αὐτῆς αὐτόν αὐτὸν αὐτῶν αὐτὸς αὐτὸ αὐτό αὐτός αὐτὴν αὐτοῖς αὐτοὺς αὔτ' αὐτὰ αὐτῇ αὐτὴ αὐτὼ αὑταὶ καὐτὸς αὐτά αὑτός αὐτοῖσι αὐτοῖσιν αὑτὸς αὐτήν αὐτοῖσί αὐτοί αὐτοὶ αὐτοῖο αὐτάων αὐτὰς αὐτέων αὐτώ αὐτάς αὐτούς αὐτή αὐταί αὐταὶ αὐτῇσιν τὠυτῷ τὠυτὸ ταὐτὰ ταύτῃ αὐτῇσι αὐτῇς αὐταῖς αὐτᾶς αὐτὰν ταὐτὸν @@ -56,4 +57,5 @@ ὣς ὡς ὥς ὧς ὥστ' ὥστε ὥσθ' ὤ ὢ - """.split()) + """.split() +) diff --git a/spacy/lang/gu/stop_words.py b/spacy/lang/gu/stop_words.py index 1d11a3ebd96..2c859681b05 100644 --- a/spacy/lang/gu/stop_words.py +++ b/spacy/lang/gu/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ એમ આ એ @@ -83,4 +84,5 @@ દર એટલો પરંતુ -""".split()) +""".split() +) diff --git a/spacy/lang/he/stop_words.py b/spacy/lang/he/stop_words.py index ea486722475..23bb5176de9 100644 --- 
a/spacy/lang/he/stop_words.py +++ b/spacy/lang/he/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ אני את אתה @@ -217,4 +218,5 @@ אחרות אשר או -""".split()) +""".split() +) diff --git a/spacy/lang/hi/stop_words.py b/spacy/lang/hi/stop_words.py index 9bc57bd3136..475b07da152 100644 --- a/spacy/lang/hi/stop_words.py +++ b/spacy/lang/hi/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6 -STOP_WORDS = set(""" +STOP_WORDS = set( + """ अंदर अत अदि @@ -234,4 +235,5 @@ होते होना होने -""".split()) +""".split() +) diff --git a/spacy/lang/hr/stop_words.py b/spacy/lang/hr/stop_words.py index 769ebe4db53..dd10f792d01 100644 --- a/spacy/lang/hr/stop_words.py +++ b/spacy/lang/hr/stop_words.py @@ -1,5 +1,6 @@ # Source: https://github.com/stopwords-iso/stopwords-hr -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a ah aha @@ -339,4 +340,5 @@ željeo zimus zum -""".split()) +""".split() +) diff --git a/spacy/lang/hsb/stop_words.py b/spacy/lang/hsb/stop_words.py index 86021f555c1..e6fedaf4c92 100644 --- a/spacy/lang/hsb/stop_words.py +++ b/spacy/lang/hsb/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a abo ale ani dokelž @@ -14,4 +15,5 @@ tež tohodla zo zoby -""".split()) +""".split() +) diff --git a/spacy/lang/ht/lex_attrs.py b/spacy/lang/ht/lex_attrs.py index 27a535dd746..ab1a39a8234 100644 --- a/spacy/lang/ht/lex_attrs.py +++ b/spacy/lang/ht/lex_attrs.py @@ -1,20 +1,24 @@ from ...attrs import LIKE_NUM, NORM # Cardinal numbers in Creole -_num_words = set(""" +_num_words = set( + """ zewo youn en de twa kat senk sis sèt uit nèf dis onz douz trèz katoz kenz sèz disèt dizwit diznèf vent trant karant sinkant swasant swasann-dis san mil milyon milya -""".split()) +""".split() +) # Ordinal numbers in Creole (some are French-influenced, some simplified) -_ordinal_words = set(""" 
+_ordinal_words = set( + """ premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm ventyèm trantyèm karantyèm sinkantyèm swasantyèm swasann-disyèm santyèm milyèm milyonnyèm milyadyèm -""".split()) +""".split() +) NORM_MAP = { "'m": "mwen", diff --git a/spacy/lang/ht/stop_words.py b/spacy/lang/ht/stop_words.py index fd85c2a197f..50998e0e5ff 100644 --- a/spacy/lang/ht/stop_words.py +++ b/spacy/lang/ht/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a ak an ankò ant apre ap atò avan avanlè byen bò byenke @@ -38,7 +39,8 @@ men mèsi oswa osinon -""".split()) +""".split() +) # Add common contractions, with and without apostrophe variants contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"] diff --git a/spacy/lang/hu/stop_words.py b/spacy/lang/hu/stop_words.py index 1841557073a..e39a26d35ae 100644 --- a/spacy/lang/hu/stop_words.py +++ b/spacy/lang/hu/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben amelyeket amelyet amelynek ami amikor amit amolyan amíg annak arra arról az azok azon azonban azt aztán azután azzal azért @@ -57,4 +58,5 @@ úgy új újabb újra ő őket -""".split()) +""".split() +) diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py index 1bfd09a4b29..46d0f6b511c 100644 --- a/spacy/lang/hy/stop_words.py +++ b/spacy/lang/hy/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ նա ողջը այստեղ @@ -102,4 +103,5 @@ այս մեջ թ -""".split()) +""".split() +) diff --git a/spacy/lang/id/_tokenizer_exceptions_list.py b/spacy/lang/id/_tokenizer_exceptions_list.py index 11220a61e5b..a0b35fa1a2b 100644 --- a/spacy/lang/id/_tokenizer_exceptions_list.py +++ b/spacy/lang/id/_tokenizer_exceptions_list.py @@ -1,4 +1,5 @@ -ID_BASE_EXCEPTIONS = set(""" +ID_BASE_EXCEPTIONS = set( + """ aba-aba abah-abah abal-abal @@ 
-3897,4 +3898,5 @@ yo-yo zam-zam zig-zag -""".split()) +""".split() +) diff --git a/spacy/lang/id/stop_words.py b/spacy/lang/id/stop_words.py index fc85f83679a..b1bfaea796e 100644 --- a/spacy/lang/id/stop_words.py +++ b/spacy/lang/id/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal @@ -113,4 +114,5 @@ waduh wah wahai waktu waktunya walau walaupun wong yaitu yakin yakni yang -""".split()) +""".split() +) diff --git a/spacy/lang/id/tokenizer_exceptions.py b/spacy/lang/id/tokenizer_exceptions.py index 8dea4e97fd1..8e206262c10 100644 --- a/spacy/lang/id/tokenizer_exceptions.py +++ b/spacy/lang/id/tokenizer_exceptions.py @@ -156,7 +156,7 @@ "S.T.", "S.T.Han", "S.Th.", - "S.Th.I" "S.TI.", + "S.Th.IS.TI.", "S.T.P.", "S.TrK", "S.Tekp.", @@ -210,7 +210,7 @@ "hlm.", "i/o", "n.b.", - "p.p." 
"pjs.", + "p.p.pjs.", "s.d.", "tel.", "u.p.", diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/is/stop_words.py index 79f84ee6000..917fb6df444 100644 --- a/spacy/lang/is/stop_words.py +++ b/spacy/lang/is/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/Xangis/extra-stopwords -STOP_WORDS = set(""" +STOP_WORDS = set( + """ afhverju aftan aftur @@ -153,4 +154,5 @@ því þær ætti -""".split()) +""".split() +) diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py index 2a37236a9b9..42adc7904c8 100644 --- a/spacy/lang/it/stop_words.py +++ b/spacy/lang/it/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl agli ahime ahimè ai al alcuna alcuni alcuno all alla alle allo allora altri altrimenti altro altrove altrui anche ancora anni anno ansa anticipo assai @@ -78,4 +79,5 @@ v' va vale vari varia varie vario verso vi via vicino visto vita voi volta volte vostra vostre vostri vostro -""".split()) +""".split() +) diff --git a/spacy/lang/ja/stop_words.py b/spacy/lang/ja/stop_words.py index 661b5183594..98560d7e28b 100644 --- a/spacy/lang/ja/stop_words.py +++ b/spacy/lang/ja/stop_words.py @@ -2,7 +2,8 @@ # filtering out everything that wasn't hiragana. ー (one) was also added. # Considered keeping some non-hiragana words but too many place names were # present. 
-STOP_WORDS = set(""" +STOP_WORDS = set( + """ あ あっ あまり あり ある あるいは あれ い いい いう いく いずれ いっ いつ いる いわ うち @@ -43,4 +44,5 @@ を ん 一 -""".split()) +""".split() +) diff --git a/spacy/lang/kmr/stop_words.py b/spacy/lang/kmr/stop_words.py index 93e6ea27f0c..aee33c2b748 100644 --- a/spacy/lang/kmr/stop_words.py +++ b/spacy/lang/kmr/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ û li bi @@ -39,4 +40,5 @@ hemû kes tişt -""".split()) +""".split() +) diff --git a/spacy/lang/kn/stop_words.py b/spacy/lang/kn/stop_words.py index 528e5e3a8a8..dba9740af91 100644 --- a/spacy/lang/kn/stop_words.py +++ b/spacy/lang/kn/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ಹಲವು ಮೂಲಕ ಹಾಗೂ @@ -81,4 +82,5 @@ ಎಂದು ನನ್ನ ಮೇಲೆ -""".split()) +""".split() +) diff --git a/spacy/lang/ko/stop_words.py b/spacy/lang/ko/stop_words.py index d4cdbc7a112..3eba9fc8299 100644 --- a/spacy/lang/ko/stop_words.py +++ b/spacy/lang/ko/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ 이 있 하 @@ -62,4 +63,5 @@ 원 잘 놓 -""".split()) +""".split() +) diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py index fb8e2c84b95..ea40bdfa222 100644 --- a/spacy/lang/ky/stop_words.py +++ b/spacy/lang/ky/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ага адам айтты айтымында айтып ал алар алардын алган алуу алып анда андан аны анын ар @@ -37,4 +38,5 @@ үч үчүн өз -""".split()) +""".split() +) diff --git a/spacy/lang/la/stop_words.py b/spacy/lang/la/stop_words.py index 47abf7384f4..8b590bb67b3 100644 --- a/spacy/lang/la/stop_words.py +++ b/spacy/lang/la/stop_words.py @@ -1,6 +1,7 @@ # Corrected Perseus list, cf. 
https://wiki.digitalclassicist.org/Stopwords_for_Greek_and_Latin -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ab ac ad adhuc aliqui aliquis an ante apud at atque aut autem cum cur @@ -32,4 +33,5 @@ ubi uel uero vel vero -""".split()) +""".split() +) diff --git a/spacy/lang/lb/lex_attrs.py b/spacy/lang/lb/lex_attrs.py index bbef72b9bb3..11923137418 100644 --- a/spacy/lang/lb/lex_attrs.py +++ b/spacy/lang/lb/lex_attrs.py @@ -1,18 +1,22 @@ from ...attrs import LIKE_NUM -_num_words = set(""" +_num_words = set( + """ null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng véierzéng foffzéng siechzéng siwwenzéng uechtzeng uechzeng nonnzéng nongzéng zwanzeg drësseg véierzeg foffzeg sechzeg siechzeg siwenzeg achtzeg achzeg uechtzeg uechzeg nonnzeg honnert dausend millioun milliard billioun billiard trillioun triliard -""".split()) +""".split() +) -_ordinal_words = set(""" +_ordinal_words = set( + """ éischten zweeten drëtten véierten fënneften sechsten siwenten aachten néngten zéngten eeleften zwieleften dräizéngten véierzéngten foffzéngten siechzéngten uechtzéngen uechzéngten nonnzéngten nongzéngten zwanzegsten drëssegsten véierzegsten foffzegsten siechzegsten siwenzegsten uechzegsten nonnzegsten honnertsten dausendsten milliounsten milliardsten billiounsten billiardsten trilliounsten trilliardsten -""".split()) +""".split() +) def like_num(text): diff --git a/spacy/lang/lb/stop_words.py b/spacy/lang/lb/stop_words.py index 386ce1222af..8f22ea6e694 100644 --- a/spacy/lang/lb/stop_words.py +++ b/spacy/lang/lb/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a à äis @@ -206,4 +207,5 @@ zu zum zwar -""".split()) +""".split() +) diff --git a/spacy/lang/lg/stop_words.py b/spacy/lang/lg/stop_words.py index a9f99cbf40f..7bad59344fb 100644 --- a/spacy/lang/lg/stop_words.py +++ b/spacy/lang/lg/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ abadde abalala abamu abangi abava ajja ali alina ani 
anti ateekeddwa atewamu atya awamu aweebwa ayinza ba baali babadde babalina bajja bajjanewankubade bali balina bandi bangi bano bateekeddwa baweebwa bayina bebombi beera bibye @@ -14,4 +15,5 @@ tetuteekeddwa tewali teyalina teyayina tolina tu tuyina tulina tuyina twafuna twetaaga wa wabula wabweru wadde waggulunnina wakati waliwobangi waliyo wandi wange wano wansi weebwa yabadde yaffe ye yenna yennyini yina yonna ziba zijja zonna -""".split()) +""".split() +) diff --git a/spacy/lang/lij/stop_words.py b/spacy/lang/lij/stop_words.py index 37eb163ffe7..1d6f09d27ca 100644 --- a/spacy/lang/lij/stop_words.py +++ b/spacy/lang/lij/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei bella belle belli bello ben @@ -34,4 +35,5 @@ un uña unn' unna za zu -""".split()) +""".split() +) diff --git a/spacy/lang/lv/stop_words.py b/spacy/lang/lv/stop_words.py index 4ed61996ac1..2685c243083 100644 --- a/spacy/lang/lv/stop_words.py +++ b/spacy/lang/lv/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/stopwords-iso/stopwords-lv -STOP_WORDS = set(""" +STOP_WORDS = set( + """ aiz ap apakš @@ -162,4 +163,5 @@ zem ārpus šaipus -""".split()) +""".split() +) diff --git a/spacy/lang/mk/stop_words.py b/spacy/lang/mk/stop_words.py index 90a27179852..312a456c5db 100644 --- a/spacy/lang/mk/stop_words.py +++ b/spacy/lang/mk/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ а абре aв @@ -810,4 +811,5 @@ џагара-магара џанам џив-џив - """.split()) + """.split() +) diff --git a/spacy/lang/ml/stop_words.py b/spacy/lang/ml/stop_words.py index 64b9acc1025..441e9358699 100644 --- a/spacy/lang/ml/stop_words.py +++ b/spacy/lang/ml/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ അത് ഇത് ആയിരുന്നു @@ -8,4 +9,5 @@ അന്ന് ഇന്ന് ആണ് -""".split()) +""".split() +) diff --git a/spacy/lang/mr/stop_words.py b/spacy/lang/mr/stop_words.py 
index 3c9c6208916..9b0cee951ab 100644 --- a/spacy/lang/mr/stop_words.py +++ b/spacy/lang/mr/stop_words.py @@ -1,5 +1,6 @@ # Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json -STOP_WORDS = set(""" +STOP_WORDS = set( + """ न अतरी तो @@ -187,4 +188,5 @@ होता होती होते -""".split()) +""".split() +) diff --git a/spacy/lang/ms/_tokenizer_exceptions_list.py b/spacy/lang/ms/_tokenizer_exceptions_list.py index e579e316ae9..fba1dd70f94 100644 --- a/spacy/lang/ms/_tokenizer_exceptions_list.py +++ b/spacy/lang/ms/_tokenizer_exceptions_list.py @@ -1,6 +1,7 @@ # from https://prpm.dbp.gov.my/cari1?keyword= # dbp https://en.wikipedia.org/wiki/Dewan_Bahasa_dan_Pustaka -MS_BASE_EXCEPTIONS = set(""" +MS_BASE_EXCEPTIONS = set( + """ aba-aba abah-abah abar-abar @@ -1938,4 +1939,5 @@ water-cooled world-class yang-yang -""".split()) +""".split() +) diff --git a/spacy/lang/ms/examples.py b/spacy/lang/ms/examples.py index 1af439d4a5b..236e0c0f660 100644 --- a/spacy/lang/ms/examples.py +++ b/spacy/lang/ms/examples.py @@ -10,7 +10,7 @@ "Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?", "Pengeluaran makanan berasal dari beberapa lokasi termasuk Cameron Highlands, Johor Bahru, dan Kuching.", "Syarikat XYZ telah menghasilkan 20,000 unit produk baharu dalam setahun terakhir", - "Kuala Lumpur merupakan ibu negara Malaysia." 
"Kau berada di mana semalam?", + "Kuala Lumpur merupakan ibu negara Malaysia.Kau berada di mana semalam?", "Siapa yang akan memimpin projek itu?", "Siapa perdana menteri Malaysia sekarang?", ] diff --git a/spacy/lang/ms/stop_words.py b/spacy/lang/ms/stop_words.py index fc85f83679a..b1bfaea796e 100644 --- a/spacy/lang/ms/stop_words.py +++ b/spacy/lang/ms/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal @@ -113,4 +114,5 @@ waduh wah wahai waktu waktunya walau walaupun wong yaitu yakin yakni yang -""".split()) +""".split() +) diff --git a/spacy/lang/nb/stop_words.py b/spacy/lang/nb/stop_words.py index bc1c54a4af3..d9ed414efdf 100644 --- a/spacy/lang/nb/stop_words.py +++ b/spacy/lang/nb/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ alle allerede alt and andre annen annet at av bak bare bedre beste blant ble bli blir blitt bris by både @@ -45,4 +46,5 @@ å år ønsker -""".split()) +""".split() +) diff --git a/spacy/lang/ne/stop_words.py b/spacy/lang/ne/stop_words.py index 95d7a375821..8470297b9f0 100644 --- a/spacy/lang/ne/stop_words.py +++ b/spacy/lang/ne/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt -STOP_WORDS = set(""" +STOP_WORDS = set( + """ अक्सर अगाडि अगाडी @@ -489,4 +490,5 @@ होइन होकि होला -""".split()) +""".split() +) diff --git a/spacy/lang/nl/lex_attrs.py b/spacy/lang/nl/lex_attrs.py index 1b8602831ae..488224c2f20 100644 --- a/spacy/lang/nl/lex_attrs.py +++ b/spacy/lang/nl/lex_attrs.py @@ -1,17 +1,21 @@ from ...attrs import LIKE_NUM -_num_words = set(""" +_num_words = set( + """ nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien veertien twintig dertig veertig vijftig zestig 
zeventig tachtig negentig honderd duizend miljoen miljard biljoen biljard triljoen triljard -""".split()) +""".split() +) -_ordinal_words = set(""" +_ordinal_words = set( + """ eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste miljardste biljoenste biljardste triljoenste triljardste -""".split()) +""".split() +) def like_num(text): diff --git a/spacy/lang/nl/stop_words.py b/spacy/lang/nl/stop_words.py index a88c2905199..cd4fdefdf58 100644 --- a/spacy/lang/nl/stop_words.py +++ b/spacy/lang/nl/stop_words.py @@ -13,7 +13,8 @@ # should have a Dutch counterpart here. -STOP_WORDS = set(""" +STOP_WORDS = set( + """ aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna afgelopen aldus alhoewel anderzijds @@ -67,4 +68,5 @@ zal ze zei zelf zich zij zijn zo zonder zou zeer zeker zekere zelfde zelfs zichzelf zijnde zijne zo’n zoals zodra zouden zoveel zowat zulk zulke zulks zullen zult -""".split()) +""".split() +) diff --git a/spacy/lang/pl/stop_words.py b/spacy/lang/pl/stop_words.py index 4418deedc0b..075aec39167 100644 --- a/spacy/lang/pl/stop_words.py +++ b/spacy/lang/pl/stop_words.py @@ -1,6 +1,7 @@ # sources: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt and https://github.com/stopwords-iso/stopwords-pl -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a aby ach acz aczkolwiek aj albo ale alez ależ ani az aż @@ -73,4 +74,5 @@ z za zaden zadna zadne zadnych zapewne zawsze zaś ze zeby znow znowu znów zostal został -żaden żadna żadne żadnych że żeby""".split()) +żaden żadna żadne żadnych że żeby""".split() +) diff --git a/spacy/lang/pt/stop_words.py b/spacy/lang/pt/stop_words.py index 722aef80236..ce3c86ff570 100644 --- a/spacy/lang/pt/stop_words.py +++ b/spacy/lang/pt/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = 
set(""" +STOP_WORDS = set( + """ a à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes ao aos apenas apoia apoio apontar após aquela aquelas aquele aqueles aqui aquilo as assim através atrás até aí @@ -61,4 +62,5 @@ vossas vosso vossos vários vão vêm vós zero -""".split()) +""".split() +) diff --git a/spacy/lang/ro/lex_attrs.py b/spacy/lang/ro/lex_attrs.py index a5880fc2fac..736aa911ac6 100644 --- a/spacy/lang/ro/lex_attrs.py +++ b/spacy/lang/ro/lex_attrs.py @@ -1,13 +1,16 @@ from ...attrs import LIKE_NUM -_num_words = set(""" +_num_words = set( + """ zero unu doi două trei patru cinci șase șapte opt nouă zece unsprezece doisprezece douăsprezece treisprezece patrusprezece cincisprezece șaisprezece șaptesprezece optsprezece nouăsprezece douăzeci treizeci patruzeci cincizeci șaizeci șaptezeci optzeci nouăzeci sută mie milion miliard bilion trilion cvadrilion catralion cvintilion sextilion septilion enșpemii -""".split()) +""".split() +) -_ordinal_words = set(""" +_ordinal_words = set( + """ primul doilea treilea patrulea cincilea șaselea șaptelea optulea nouălea zecelea prima doua treia patra cincia șasea șaptea opta noua zecea unsprezecelea doisprezecelea treisprezecelea patrusprezecelea cincisprezecelea șaisprezecelea șaptesprezecelea optsprezecelea nouăsprezecelea @@ -15,7 +18,8 @@ douăzecilea treizecilea patruzecilea cincizecilea șaizecilea șaptezecilea optzecilea nouăzecilea sutălea douăzecea treizecea patruzecea cincizecea șaizecea șaptezecea optzecea nouăzecea suta miilea mielea mia milionulea milioana miliardulea miliardelea miliarda enșpemia -""".split()) +""".split() +) def like_num(text): diff --git a/spacy/lang/ro/stop_words.py b/spacy/lang/ro/stop_words.py index c7c0801f171..d68a81c4569 100644 --- a/spacy/lang/ro/stop_words.py +++ b/spacy/lang/ro/stop_words.py @@ -1,5 +1,6 @@ # Source: https://github.com/stopwords-iso/stopwords-ro -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a abia acea @@ -494,4 +495,5 @@ știu ți 
ție -""".split()) +""".split() +) diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py index 63b1cead810..e0b35bdc07f 100644 --- a/spacy/lang/ru/lex_attrs.py +++ b/spacy/lang/ru/lex_attrs.py @@ -1,6 +1,8 @@ from ...attrs import LIKE_NUM -_num_words = list(set(""" +_num_words = list( + set( + """ ноль ноля нолю нолём ноле нулевой нулевого нулевому нулевым нулевом нулевая нулевую нулевое нулевые нулевых нулевыми четверть четверти четвертью четвертей четвертям четвертями четвертях @@ -201,7 +203,9 @@ квинтиллиону квинтиллионов квинтлн i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx xxi xxii xxiii xxiv xxv xxvi xxvii xxvii xxix -""".split())) +""".split() + ) +) def like_num(text): diff --git a/spacy/lang/ru/stop_words.py b/spacy/lang/ru/stop_words.py index 3040adb52b1..d6ea6b42af9 100644 --- a/spacy/lang/ru/stop_words.py +++ b/spacy/lang/ru/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ а авось ага агу аж ай али алло ау ах ая б будем будет будете будешь буду будут будучи будь будьте бы был была были было @@ -106,4 +107,5 @@ ю я явно явных яко якобы якоже -""".split()) +""".split() +) diff --git a/spacy/lang/sa/stop_words.py b/spacy/lang/sa/stop_words.py index eaf0ffaa2c9..30302a14dcb 100644 --- a/spacy/lang/sa/stop_words.py +++ b/spacy/lang/sa/stop_words.py @@ -1,6 +1,7 @@ # Source: https://gist.github.com/Akhilesh28/fe8b8e180f64b72e64751bc31cb6d323 -STOP_WORDS = set(""" +STOP_WORDS = set( + """ अहम् आवाम् वयम् @@ -510,4 +511,5 @@ ह हन्त हि -""".split()) +""".split() +) diff --git a/spacy/lang/si/stop_words.py b/spacy/lang/si/stop_words.py index acae5763b52..7d29bc1b4d8 100644 --- a/spacy/lang/si/stop_words.py +++ b/spacy/lang/si/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ සහ සමග සමඟ @@ -190,4 +191,5 @@ ලෙස පරිදි එහෙත් -""".split()) +""".split() +) diff --git a/spacy/lang/sk/stop_words.py b/spacy/lang/sk/stop_words.py index 6ef4818c3a2..017e7beef39 100644 --- 
a/spacy/lang/sk/stop_words.py +++ b/spacy/lang/sk/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/Ardevop-sk/stopwords-sk -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a aby aj @@ -419,4 +420,5 @@ ňou ňu že -""".split()) +""".split() +) diff --git a/spacy/lang/sl/lex_attrs.py b/spacy/lang/sl/lex_attrs.py index 6d6b40b4546..3c1493050a1 100644 --- a/spacy/lang/sl/lex_attrs.py +++ b/spacy/lang/sl/lex_attrs.py @@ -2,7 +2,8 @@ from ...attrs import IS_CURRENCY, LIKE_NUM -_num_words = set(""" +_num_words = set( + """ nula ničla nič ena dva tri štiri pet šest sedem osem devet deset enajst dvanajst trinajst štirinajst petnajst šestnajst sedemnajst osemnajst devetnajst dvajset trideset štirideset @@ -17,9 +18,11 @@ šestnajstih šestnajstim šestnajstimi petnajstih petnajstim petnajstimi sedemnajstih sedemnajstim sedemnajstimi osemnajstih osemnajstim osemnajstimi devetnajstih devetnajstim devetnajstimi dvajsetih dvajsetim dvajsetimi - """.split()) + """.split() +) -_ordinal_words = set(""" +_ordinal_words = set( + """ prvi drugi tretji četrti peti šesti sedmi osmi deveti deseti enajsti dvanajsti trinajsti štirinajsti petnajsti šestnajsti sedemnajsti osemnajsti devetnajsti @@ -89,9 +92,11 @@ osemnajstimi devetnajstimi dvajsetimi tridesetimi štiridesetimi petdesetimi šestdesetimi sedemdesetimi osemdesetimi devetdesetimi stotimi tisočimi milijontimi bilijontimi trilijontimi kvadrilijontimi neštetimi - """.split()) + """.split() +) -_currency_words = set(""" +_currency_words = set( + """ evro evra evru evrom evrov evroma evrih evrom evre evri evr eur cent centa centu cenom centov centoma centih centom cente centi dolar dolarja dolarji dolarju dolarjem dolarjev dolarjema dolarjih dolarje usd @@ -104,7 +109,8 @@ jen jena jeni jenu jenom jenov jenoma jenih jene kuna kuni kune kuno kun kunama kunah kunam kunami marka marki marke markama markah markami - """.split()) + """.split() +) def like_num(text): diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py 
index a81c00db269..8491efcb580 100644 --- a/spacy/lang/sl/stop_words.py +++ b/spacy/lang/sl/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/stopwords-iso/stopwords-sl -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a ali b bi bil bila bile bili bilo biti blizu bo bodo bojo bolj bom bomo @@ -79,4 +80,5 @@ z za zadaj zadnji zakaj zaprta zaprti zaprto zdaj zelo zunaj ž že -""".split()) +""".split() +) diff --git a/spacy/lang/sq/stop_words.py b/spacy/lang/sq/stop_words.py index bf1c7a7039c..f2b1a4f4a7b 100644 --- a/spacy/lang/sq/stop_words.py +++ b/spacy/lang/sq/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/andrixh/index-albanian -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a afert ai @@ -224,4 +225,5 @@ vjen yne zakonisht -""".split()) +""".split() +) diff --git a/spacy/lang/sr/stop_words.py b/spacy/lang/sr/stop_words.py index 758964a5853..5df5509d2c4 100644 --- a/spacy/lang/sr/stop_words.py +++ b/spacy/lang/sr/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ а авај ако @@ -388,4 +389,5 @@ ћете ћеш ћу -""".split()) +""".split() +) diff --git a/spacy/lang/sv/stop_words.py b/spacy/lang/sv/stop_words.py index 08251bcff32..2422b2a9e5a 100644 --- a/spacy/lang/sv/stop_words.py +++ b/spacy/lang/sv/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras annan annat ännu artonde arton åtminstone att åtta åttio åttionde åttonde av även @@ -61,4 +62,5 @@ vad vänster vänstra var vår vara våra varför varifrån varit varken värre varsågod vart vårt vem vems verkligen vi vid vidare viktig viktigare viktigast viktigt vilka vilken vilket vill -""".split()) +""".split() +) diff --git a/spacy/lang/ta/stop_words.py b/spacy/lang/ta/stop_words.py index d6ef21f3b0a..abbff949d79 100644 --- a/spacy/lang/ta/stop_words.py +++ b/spacy/lang/ta/stop_words.py @@ -1,6 +1,7 @@ # Stop words -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ஒரு என்று மற்றும் 
@@ -126,4 +127,5 @@ வரையில் சற்று எனக் -""".split()) +""".split() +) diff --git a/spacy/lang/te/stop_words.py b/spacy/lang/te/stop_words.py index d2834260898..b18dab697da 100644 --- a/spacy/lang/te/stop_words.py +++ b/spacy/lang/te/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/Xangis/extra-stopwords (MIT License) -STOP_WORDS = set(""" +STOP_WORDS = set( + """ అందరూ అందుబాటులో అడగండి @@ -51,4 +52,5 @@ వేరుగా వ్యతిరేకంగా సంబంధం -""".split()) +""".split() +) diff --git a/spacy/lang/th/stop_words.py b/spacy/lang/th/stop_words.py index 3dd6e56525b..2823281ce95 100644 --- a/spacy/lang/th/stop_words.py +++ b/spacy/lang/th/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ทั้งนี้ ดัง ขอ รวม หลังจาก เป็น หลัง หรือ ๆ เกี่ยวกับ ซึ่งได้แก่ ด้วยเพราะ ด้วยว่า ด้วยเหตุเพราะ ด้วยเหตุว่า สุดๆ เสร็จแล้ว เช่น เข้า ถ้า ถูก ถึง ต่างๆ ใคร เปิดเผย ครา รือ ตาม ใน ได้แก่ ได้แต่ ได้ที่ ตลอดถึง นอกจากว่า นอกนั้น จริง อย่างดี ส่วน เพียงเพื่อ เดียว จัด ทั้งที ทั้งคน ทั้งตัว ไกลๆ @@ -70,4 +71,5 @@ แห่งนี้ แห่งโน้น แห่งไหน แหละ ให้แก่ ใหญ่ ใหญ่โต อย่างมาก อย่างยิ่ง อย่างไรก็ อย่างไรก็ได้ อย่างไรเสีย อย่างละ อย่างหนึ่ง อย่างๆ อัน อันจะ อันได้แก่ อันที่ อันที่จริง อันที่จะ อันเนื่องมาจาก อันละ อันๆ อาจจะ อาจเป็น อาจเป็นด้วย อื่น อื่นๆ เอ็ง เอา ฯ ฯล ฯลฯ 555 กำ ขอโทษ เยี่ยม นี่คือ -""".split()) +""".split() +) diff --git a/spacy/lang/ti/stop_words.py b/spacy/lang/ti/stop_words.py index e0aaf47d3fe..9bd7122007a 100644 --- a/spacy/lang/ti/stop_words.py +++ b/spacy/lang/ti/stop_words.py @@ -1,7 +1,8 @@ # Stop words from Tigrinya Wordcount: https://github.com/fgaim/Tigrinya-WordCount/blob/main/ti_stop_words.txt # Stop words -STOP_WORDS = set(""" +STOP_WORDS = set( + """ 'ምበር 'ሞ 'ቲ 'ታ 'ኳ 'ውን 'ዚ 'የ 'ዩ 'ያ 'ዮም 'ዮን ልዕሊ ሒዙ ሒዛ ሕጂ መበል መን መንጎ መጠን ማለት ምስ ምባል ምእንቲ ምኽንያቱ ምኽንያት ምዃኑ ምዃንና ምዃኖም @@ -22,4 +23,5 @@ ዝነበረ ዝነበረት ዝነበሩ ዝካየድ ዝኸውን ዝኽእል ዝኾነ ዝዀነ የለን ይቕረብ ይብል ይኸውን ይኹን ይኽእል ደኣ ድሕሪ ድማ ገለ ገሊጹ ገና ገይሩ ግና ግን ጥራይ -""".split()) +""".split() +) diff --git a/spacy/lang/tl/stop_words.py 
b/spacy/lang/tl/stop_words.py index a7bf541990a..2560cdaed6a 100644 --- a/spacy/lang/tl/stop_words.py +++ b/spacy/lang/tl/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ akin aking ako @@ -146,4 +147,5 @@ tungkol una walang -""".split()) +""".split() +) diff --git a/spacy/lang/tn/stop_words.py b/spacy/lang/tn/stop_words.py index a63a455f754..f614771dd11 100644 --- a/spacy/lang/tn/stop_words.py +++ b/spacy/lang/tn/stop_words.py @@ -1,5 +1,6 @@ # Stop words -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ke gareng ga selekanyo tlhwatlhwa yo mongwe se sengwe fa go le jalo gongwe ba na mo tikologong jaaka kwa morago nna gonne ka sa pele nako teng @@ -15,4 +16,5 @@ bonala e tshwanang bogolo tsenya tsweetswee karolo sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa tlhano lesometlhano botlalo lekgolo -""".split()) +""".split() +) diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index e80423e5150..b7d91d86f0d 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -34,11 +34,11 @@ # host & domain names # mods: match is case-sensitive, so include [A-Z] r"(?:" # noqa: E131 - r"(?:" # noqa: E131 - r"[A-Za-z0-9\u00a1-\uffff]" # noqa: E131 - r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}" - r")?" - r"[A-Za-z0-9\u00a1-\uffff]\." + r"(?:" # noqa: E131 + r"[A-Za-z0-9\u00a1-\uffff]" # noqa: E131 + r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}" + r")?" + r"[A-Za-z0-9\u00a1-\uffff]\." 
r")+" # TLD identifier # mods: use ALPHA_LOWER instead of a wider range so that this doesn't match @@ -111,7 +111,8 @@ BASE_EXCEPTIONS[orth] = [{ORTH: orth}] -emoticons = set(r""" +emoticons = set( + r""" :) :-) :)) @@ -242,7 +243,8 @@ ¯\(ツ)/¯ (╯°□°)╯︵┻━┻ ><(((*> -""".split()) +""".split() +) for orth in emoticons: diff --git a/spacy/lang/tr/stop_words.py b/spacy/lang/tr/stop_words.py index 5323cf32d9c..85dcff6a53a 100644 --- a/spacy/lang/tr/stop_words.py +++ b/spacy/lang/tr/stop_words.py @@ -1,5 +1,6 @@ # Source: https://github.com/stopwords-iso/stopwords-tr -STOP_WORDS = set(""" +STOP_WORDS = set( + """ acaba acep adamakıllı @@ -552,4 +553,5 @@ zaten zati zira -""".split()) +""".split() +) diff --git a/spacy/lang/tt/stop_words.py b/spacy/lang/tt/stop_words.py index 8f146d9150a..44169b757e5 100644 --- a/spacy/lang/tt/stop_words.py +++ b/spacy/lang/tt/stop_words.py @@ -1,6 +1,7 @@ # Tatar stopwords are from https://github.com/aliiae/stopwords-tt -STOP_WORDS = set("""алай алайса алар аларга аларда алардан аларны аларның аларча +STOP_WORDS = set( + """алай алайса алар аларга аларда алардан аларны аларның аларча алары аларын аларынга аларында аларыннан аларының алтмыш алтмышынчы алтмышынчыга алтмышынчыда алтмышынчыдан алтмышынчылар алтмышынчыларга алтмышынчыларда алтмышынчылардан алтмышынчыларны алтмышынчыларның алтмышынчыны алтмышынчының @@ -168,4 +169,5 @@ өстәп өч өчен өченче өченчегә өченчедә өченчедән өченчеләр өченчеләргә өченчеләрдә өченчеләрдән өченчеләрне өченчеләрнең өченчене өченченең өчләп -өчәрләп""".split()) +өчәрләп""".split() +) diff --git a/spacy/lang/uk/stop_words.py b/spacy/lang/uk/stop_words.py index 517c300070a..b11d7a044a3 100644 --- a/spacy/lang/uk/stop_words.py +++ b/spacy/lang/uk/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set("""а +STOP_WORDS = set( + """а або адже аж @@ -464,4 +465,5 @@ якій якого якої -якщо""".split()) +якщо""".split() +) diff --git a/spacy/lang/ur/lex_attrs.py b/spacy/lang/ur/lex_attrs.py index 916a47bfd19..e590ed3e303 
100644 --- a/spacy/lang/ur/lex_attrs.py +++ b/spacy/lang/ur/lex_attrs.py @@ -5,8 +5,7 @@ # https://en.wikibooks.org/wiki/Urdu/Vocabulary/Numbers # https://www.urdu-english.com/lessons/beginner/numbers -_num_words = ( - """ایک دو تین چار پانچ چھ سات آٹھ نو دس گیارہ بارہ تیرہ چودہ پندرہ سولہ سترہ +_num_words = """ایک دو تین چار پانچ چھ سات آٹھ نو دس گیارہ بارہ تیرہ چودہ پندرہ سولہ سترہ اٹهارا انیس بیس اکیس بائیس تئیس چوبیس پچیس چھببیس ستایس اٹھائس انتيس تیس اکتیس بتیس تینتیس چونتیس پینتیس چھتیس سینتیس ارتیس انتالیس چالیس اکتالیس بیالیس تیتالیس @@ -18,7 +17,6 @@ سٹیاسی اٹھیاسی نواسی نوے اکانوے بانوے ترانوے چورانوے پچانوے چھیانوے ستانوے اٹھانوے ننانوے سو """.split() -) # source https://www.google.com/intl/ur/inputtools/try/ diff --git a/spacy/lang/ur/stop_words.py b/spacy/lang/ur/stop_words.py index 00f0dd2d6b4..abfa3649713 100644 --- a/spacy/lang/ur/stop_words.py +++ b/spacy/lang/ur/stop_words.py @@ -1,5 +1,6 @@ # Source: collected from different resource on internet -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ثھی خو گی @@ -508,4 +509,5 @@ ہورہی ثبعث ضت -""".split()) +""".split() +) diff --git a/spacy/lang/vi/stop_words.py b/spacy/lang/vi/stop_words.py index 9163e10938e..3481701d5ea 100644 --- a/spacy/lang/vi/stop_words.py +++ b/spacy/lang/vi/stop_words.py @@ -1,5 +1,6 @@ # Source: https://github.com/stopwords/vietnamese-stopwords -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a_lô a_ha ai @@ -1942,4 +1943,5 @@ ừ_ào ừ_ừ ử -""".split("\n")) +""".split("\n") +) diff --git a/spacy/lang/zh/stop_words.py b/spacy/lang/zh/stop_words.py index d54fe689504..42ae4a1de04 100644 --- a/spacy/lang/zh/stop_words.py +++ b/spacy/lang/zh/stop_words.py @@ -1,6 +1,7 @@ # stop words as whitespace-separated list # Chinese stop words,maybe not enough -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ! 
" # @@ -1894,4 +1895,5 @@ ~± ~+ ¥ -""".split()) +""".split() +) diff --git a/spacy/language.py b/spacy/language.py index dcf436c65fe..ea9ba3cae8b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1589,9 +1589,7 @@ def pipe( # noqa: F811 if batch_size is None: batch_size = self.batch_size - pipes = ( - [] - ) # contains functools.partial objects to easily create multiprocess worker. + pipes = [] # contains functools.partial objects to easily create multiprocess worker. for name, proc in self.pipeline: if name in disable: continue @@ -1626,7 +1624,11 @@ def _has_gpu_model(self, disable: Iterable[str]): if name in disable or not is_trainable: continue - if hasattr(proc, "model") and hasattr(proc.model, "ops") and isinstance(proc.model.ops, CupyOps): # type: ignore + if ( + hasattr(proc, "model") + and hasattr(proc.model, "ops") + and isinstance(proc.model.ops, CupyOps) + ): # type: ignore return True return False diff --git a/spacy/matcher/dependencymatcher.pyi b/spacy/matcher/dependencymatcher.pyi index d84a30a58b0..3d744dfce4b 100644 --- a/spacy/matcher/dependencymatcher.pyi +++ b/spacy/matcher/dependencymatcher.pyi @@ -48,10 +48,12 @@ class DependencyMatcher: *, on_match: Optional[ Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any] - ] = ... + ] = ..., ) -> None: ... def has_key(self, key: Union[str, int]) -> bool: ... - def get(self, key: Union[str, int], default: Optional[Any] = ...) -> Tuple[ + def get( + self, key: Union[str, int], default: Optional[Any] = ... + ) -> Tuple[ Optional[ Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any] ], diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index c33b534cbd2..e474d250d22 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -33,7 +33,7 @@ class Matcher: on_match: Optional[ Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any] ] = ..., - greedy: Optional[str] = ... + greedy: Optional[str] = ..., ) -> None: ... 
def remove(self, key: str) -> None: ... def has_key(self, key: Union[str, int]) -> bool: ... @@ -56,7 +56,7 @@ class Matcher: *, as_spans: Literal[False] = ..., allow_missing: bool = ..., - with_alignments: bool = ... + with_alignments: bool = ..., ) -> List[Tuple[int, int, int]]: ... @overload def __call__( @@ -65,6 +65,6 @@ class Matcher: *, as_spans: Literal[True], allow_missing: bool = ..., - with_alignments: bool = ... + with_alignments: bool = ..., ) -> List[Span]: ... def _normalize_key(self, key: Any) -> Any: ... diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 8b12720db20..752d1c4433c 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -122,7 +122,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: return get_candidates -def create_candidates_batch() -> ( - Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]] -): +def create_candidates_batch() -> Callable[ + [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] +]: return get_candidates_batch diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index 122ef379544..bc69e53ab24 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -19,7 +19,7 @@ def StaticVectors( *, dropout: Optional[float] = None, init_W: Callable = glorot_uniform_init, - key_attr: str = "ORTH" + key_attr: str = "ORTH", ) -> Model[List[Doc], Ragged]: """Embed Doc objects with their vocab's vectors table, applying a learned linear projection to control the dimensionality. 
If a dropout rate is diff --git a/spacy/pipeline/_edit_tree_internals/schemas.py b/spacy/pipeline/_edit_tree_internals/schemas.py index d20945dc5f1..ef7a076b6cd 100644 --- a/spacy/pipeline/_edit_tree_internals/schemas.py +++ b/spacy/pipeline/_edit_tree_internals/schemas.py @@ -1,7 +1,16 @@ from collections import defaultdict from typing import Any, Dict, List, Union -from pydantic import BaseModel, ConfigDict, Field, RootModel, StrictBool, StrictInt, StrictStr, ValidationError +from pydantic import ( + BaseModel, + ConfigDict, + Field, + RootModel, + StrictBool, + StrictInt, + StrictStr, + ValidationError, +) class MatchNodeSchema(BaseModel): diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index fd037480cb2..aeb0672c7c7 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -19,7 +19,9 @@ from .pipe import Pipe MatcherPatternType = List[Dict[Union[int, str], Any]] -AttributeRulerPatternType = Dict[str, Union[List[MatcherPatternType], MatcherPatternType, Dict, int]] +AttributeRulerPatternType = Dict[ + str, Union[List[MatcherPatternType], MatcherPatternType, Dict, int] +] TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]] MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] @@ -137,7 +139,8 @@ def match(self, doc: Doc): matches = self.matcher(doc, allow_missing=True, as_spans=False) # Sort by the attribute ID, so that later rules have precedence matches = [ - (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches # type: ignore + (int(self.vocab.strings[m_id]), m_id, s, e) + for m_id, s, e in matches # type: ignore ] matches.sort() return matches diff --git a/spacy/schemas.py b/spacy/schemas.py index c0ecee314ee..e3200348013 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -299,7 +299,11 @@ class TokenPattern(BaseModel): op: Optional[TokenPatternOperator] = None underscore: Optional[Dict[StrictStr, UnderscoreValue]] = Field(None, alias="_") - 
model_config = ConfigDict(extra="forbid", populate_by_name=True, alias_generator=lambda value: value.upper()) + model_config = ConfigDict( + extra="forbid", + populate_by_name=True, + alias_generator=lambda value: value.upper(), + ) @field_validator("*", mode="before") @classmethod diff --git a/spacy/tests/lang/et/test_tokenizer.py b/spacy/tests/lang/et/test_tokenizer.py index f0f8079cae8..8bee2288033 100644 --- a/spacy/tests/lang/et/test_tokenizer.py +++ b/spacy/tests/lang/et/test_tokenizer.py @@ -2,8 +2,7 @@ ET_BASIC_TOKENIZATION_TESTS = [ ( - "Kedagi ei või piinata ega ebainimlikult või alandavalt kohelda " - "ega karistada.", + "Kedagi ei või piinata ega ebainimlikult või alandavalt kohelda ega karistada.", [ "Kedagi", "ei", diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 554522fe8a1..e0dc7d5a1dd 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -10,7 +10,11 @@ # Bad patterns flagged in all cases ([{"XX": "foo"}], 1, 1), ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2, 1), - ([{"IS_PUNCT": True, "OP": "$"}], 2, 1), # v2: union reports 2 errors (enum + pattern) + ( + [{"IS_PUNCT": True, "OP": "$"}], + 2, + 1, + ), # v2: union reports 2 errors (enum + pattern) ([{"_": "foo"}], 1, 1), ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1), ([{"ENT_IOB": "foo"}], 1, 1), @@ -24,10 +28,22 @@ ([{"TEXT": "foo", "OP": "{1, 3}"}], 2, 1), # v2: union reports 2 errors ([{"TEXT": "foo", "OP": "{-2}"}], 2, 1), # v2: union reports 2 errors # Bad patterns flagged outside of Matcher - ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 7, 0), # v2: more detailed union errors + ( + [{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], + 7, + 0, + ), # v2: more detailed union errors # Bad patterns not flagged with minimal checks - ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 5, 0), # v2: more detailed union errors - ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": 
{"IN": "VERB"}}], 5, 0), # v2: more detailed union errors + ( + [{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], + 5, + 0, + ), # v2: more detailed union errors + ( + [{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], + 5, + 0, + ), # v2: more detailed union errors ([{"LENGTH": {"VALUE": 5}}], 3, 0), # v2: more detailed union errors ([{"TEXT": {"VALUE": "foo"}}], 2, 0), ([{"IS_DIGIT": -1}], 1, 0), diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index e7df02e9769..b9276441222 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -59,9 +59,9 @@ def test_build_dependencies(): lib, v = _parse_req(line) if lib and not lib.startswith("cupy") and lib not in libs_ignore_setup: req_v = req_dict.get(lib, None) - assert ( - req_v is not None - ), "{} in setup.cfg but not in requirements.txt".format(lib) + assert req_v is not None, ( + "{} in setup.cfg but not in requirements.txt".format(lib) + ) assert (lib + v) == (lib + req_v), ( "{} has different version in setup.cfg and in requirements.txt: " "{} and {} respectively".format(lib, v, req_v) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 1b6f49f4cde..5e50a4d2801 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -496,15 +496,15 @@ def get_lowercased_candidates_batch(kb, spans): return [get_lowercased_candidates(kb, span) for span in spans] @registry.misc("spacy.LowercaseCandidateGenerator.v1") - def create_candidates() -> ( - Callable[[InMemoryLookupKB, "Span"], Iterable[Candidate]] - ): + def create_candidates() -> Callable[ + [InMemoryLookupKB, "Span"], Iterable[Candidate] + ]: return get_lowercased_candidates @registry.misc("spacy.LowercaseCandidateBatchGenerator.v1") - def create_candidates_batch() -> ( - Callable[[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]] - ): + def 
create_candidates_batch() -> Callable[ + [InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]] + ]: return get_lowercased_candidates_batch # replace the pipe with a new one with with a different candidate generator diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index 9b1ddd53012..826086fc7fe 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -135,14 +135,38 @@ def test_sentencizer_serialize_bytes(en_vocab): # fmt: off "lang,text", [ - ('bn', 'বাংলা ভাষা (বাঙলা, বাঙ্গলা, তথা বাঙ্গালা নামগুলোতেও পরিচিত) একটি ইন্দো-আর্য ভাষা, যা দক্ষিণ এশিয়ার বাঙালি জাতির প্রধান কথ্য ও লেখ্য ভাষা। মাতৃভাষীর সংখ্যায় বাংলা ইন্দো-ইউরোপীয় ভাষা পরিবারের চতুর্থ ও বিশ্বের ষষ্ঠ বৃহত্তম ভাষা।[৫] মোট ব্যবহারকারীর সংখ্যা অনুসারে বাংলা বিশ্বের সপ্তম বৃহত্তম ভাষা। বাংলা সার্বভৌম ভাষাভিত্তিক জাতিরাষ্ট্র বাংলাদেশের একমাত্র রাষ্ট্রভাষা তথা সরকারি ভাষা[৬] এবং ভারতের পশ্চিমবঙ্গ, ত্রিপুরা, আসামের বরাক উপত্যকার সরকারি ভাষা। বঙ্গোপসাগরে অবস্থিত আন্দামান দ্বীপপুঞ্জের প্রধান কথ্য ভাষা বাংলা। এছাড়া ভারতের ঝাড়খণ্ড, বিহার, মেঘালয়, মিজোরাম, উড়িষ্যা রাজ্যগুলোতে উল্লেখযোগ্য পরিমাণে বাংলাভাষী জনগণ রয়েছে। ভারতে হিন্দির পরেই সর্বাধিক প্রচলিত ভাষা বাংলা।[৭][৮] এছাড়াও মধ্য প্রাচ্য, আমেরিকা ও ইউরোপে উল্লেখযোগ্য পরিমাণে বাংলাভাষী অভিবাসী রয়েছে।[৯] সারা বিশ্বে সব মিলিয়ে ২৬ কোটির অধিক লোক দৈনন্দিন জীবনে বাংলা ব্যবহার করে।[২] বাংলাদেশের জাতীয় সঙ্গীত এবং ভারতের জাতীয় সঙ্গীত ও স্তোত্র বাংলাতে রচিত।'), - ('de', 'Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ]; abgekürzt dt. oder dtsch.) ist eine westgermanische Sprache. Ihr Sprachraum umfasst Deutschland, Österreich, die Deutschschweiz, Liechtenstein, Luxemburg, Ostbelgien, Südtirol, das Elsass und Lothringen sowie Nordschleswig. Außerdem ist sie eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern, z. B. 
in Rumänien und Südafrika, sowie Nationalsprache im afrikanischen Namibia.'), - ('hi', 'हिन्दी विश्व की एक प्रमुख भाषा है एवं भारत की राजभाषा है। केन्द्रीय स्तर पर भारत में दूसरी आधिकारिक भाषा अंग्रेजी है। यह हिंदुस्तानी भाषा की एक मानकीकृत रूप है जिसमें संस्कृत के तत्सम तथा तद्भव शब्दों का प्रयोग अधिक है और अरबी-फ़ारसी शब्द कम हैं। हिंदी संवैधानिक रूप से भारत की राजभाषा और भारत की सबसे अधिक बोली और समझी जाने वाली भाषा है। हालाँकि, हिन्दी भारत की राष्ट्रभाषा नहीं है,[3] क्योंकि भारत के संविधान में कोई भी भाषा को ऐसा दर्जा नहीं दिया गया था।[4][5] चीनी के बाद यह विश्व में सबसे अधिक बोली जाने वाली भाषा भी है। विश्व आर्थिक मंच की गणना के अनुसार यह विश्व की दस शक्तिशाली भाषाओं में से एक है।[6]'), - ('kn', 'ದ್ರಾವಿಡ ಭಾಷೆಗಳಲ್ಲಿ ಪ್ರಾಮುಖ್ಯವುಳ್ಳ ಭಾಷೆಯೂ ಭಾರತದ ಪುರಾತನವಾದ ಭಾಷೆಗಳಲ್ಲಿ ಒಂದೂ ಆಗಿರುವ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಅದರ ವಿವಿಧ ರೂಪಗಳಲ್ಲಿ ಸುಮಾರು ೪೫ ದಶಲಕ್ಷ ಜನರು ಆಡು ನುಡಿಯಾಗಿ ಬಳಸುತ್ತಲಿದ್ದಾರೆ. ಕನ್ನಡ ಕರ್ನಾಟಕ ರಾಜ್ಯದ ಆಡಳಿತ ಭಾಷೆ.[೧೧] ಜಗತ್ತಿನಲ್ಲಿ ಅತ್ಯಂತ ಹೆಚ್ಚು ಮಂದಿ ಮಾತನಾಡುವ ಭಾಷೆಯೆಂಬ ನೆಲೆಯಲ್ಲಿ ಇಪ್ಪತೊಂಬತ್ತನೆಯ ಸ್ಥಾನ ಕನ್ನಡಕ್ಕಿದೆ. ೨೦೧೧ರ ಜನಗಣತಿಯ ಪ್ರಕಾರ ಜಗತ್ತಿನಲ್ಲಿ ೬.೪ ಕೋಟಿ ಜನಗಳು ಕನ್ನಡ ಮಾತನಾಡುತ್ತಾರೆ ಎಂದು ತಿಳಿದುಬಂದಿದೆ. ಇವರಲ್ಲಿ ೫.೫ ಕೋಟಿ ಜನಗಳ ಮಾತೃಭಾಷೆ ಕನ್ನಡವಾಗಿದೆ. ಬ್ರಾಹ್ಮಿ ಲಿಪಿಯಿಂದ ರೂಪುಗೊಂಡ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಉಪಯೋಗಿಸಿ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಬರೆಯಲಾಗುತ್ತದೆ. ಕನ್ನಡ ಬರಹದ ಮಾದರಿಗಳಿಗೆ ಸಾವಿರದ ಐನೂರು ವರುಷಗಳ ಚರಿತ್ರೆಯಿದೆ. ಕ್ರಿ.ಶ. ಆರನೆಯ ಶತಮಾನದ ಪಶ್ಚಿಮ ಗಂಗ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ [೧೨] ಮತ್ತು ಒಂಬತ್ತನೆಯ ಶತಮಾನದ ರಾಷ್ಟ್ರಕೂಟ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ ಹಳಗನ್ನಡ ಸಾಹಿತ್ಯ ಅತ್ಯಂತ ಹೆಚ್ಚಿನ ರಾಜಾಶ್ರಯ ಪಡೆಯಿತು.[೧೩][೧೪] ಅದಲ್ಲದೆ ಸಾವಿರ ವರುಷಗಳ ಸಾಹಿತ್ಯ ಪರಂಪರೆ ಕನ್ನಡಕ್ಕಿದೆ.[೧೫]ವಿನೋಬಾ ಭಾವೆ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಲಿಪಿಗಳ ರಾಣಿಯೆಂದು ಹೊಗಳಿದ್ದಾರೆ.[ಸೂಕ್ತ ಉಲ್ಲೇಖನ ಬೇಕು]'), - ('si', 'ශ්‍රී ලංකාවේ ප්‍රධාන ජාතිය වන සිංහල ජනයාගේ මව් බස සිංහල වෙයි. අද වන විට මිලියන 20 කට අධික සිංහල සහ මිලියන 3කට අධික සිංහල නොවන ජනගහනයක් සිංහල භාෂාව භාවිත කරති. සිංහල‍ ඉන්දු-යුරෝපීය භාෂාවල උප ගණයක් වන ඉන්දු-ආර්ය භාෂා ගණයට අයිති වන අතර මාල දිවයින භාවිත කරන දිවෙහි භාෂාව සිංහලයෙන් පැවත එන්නකි. 
සිංහල ශ්‍රී ලංකාවේ නිල භාෂාවයි .'), - ('ta', 'தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும். தமிழ் திராவிட மொழிக் குடும்பத்தின் முதன்மையான மொழிகளில் ஒன்றும் செம்மொழியும் ஆகும். இந்தியா, இலங்கை, மலேசியா, சிங்கப்பூர் ஆகிய நாடுகளில் அதிக அளவிலும், ஐக்கிய அரபு அமீரகம், தென்னாப்பிரிக்கா, மொரிசியசு, பிஜி, ரீயூனியன், டிரினிடாட் போன்ற நாடுகளில் சிறிய அளவிலும் தமிழ் பேசப்படுகிறது. 1997ஆம் ஆண்டுப் புள்ளி விவரப்படி உலகம் முழுவதிலும் 8 கோடி (80 மில்லியன்) மக்களால் பேசப்படும் தமிழ்[13], ஒரு மொழியைத் தாய்மொழியாகக் கொண்டு பேசும் மக்களின் எண்ணிக்கை அடிப்படையில் பதினெட்டாவது இடத்தில் உள்ளது.[14] இணையத்தில் அதிகம் பயன்படுத்தப்படும் இந்திய மொழிகளில் தமிழ் முதன்மையாக உள்ளதாக 2017 ஆவது ஆண்டில் நடைபெற்ற கூகுள் கணக்கெடுப்பில் தெரிய வந்தது.[15]'), - ('te', 'ఆంధ్ర ప్రదేశ్, తెలంగాణ రాష్ట్రాల అధికార భాష తెలుగు. భారత దేశంలో తెలుగు మాతృభాషగా మాట్లాడే 8.7 కోట్ల (2001) జనాభాతో [1] ప్రాంతీయ భాషలలో మొదటి స్థానంలో ఉంది. ప్రపంచంలోని ప్రజలు అత్యధికముగా మాట్లాడే భాషలలో 15 స్థానములోనూ, భారత దేశములో హిందీ, తర్వాత స్థానములోనూ నిలుస్తుంది. పాతవైన ప్రపంచ భాష గణాంకాల (ఎథ్నోలాగ్) ప్రకారం ప్రపంచవ్యాప్తంగా 7.4 కోట్లు మందికి మాతృభాషగా ఉంది.[2] మొదటి భాషగా మాట్లాడతారు. 
అతి ప్రాచీన దేశ భాషలలో సంస్కృతము తమిళముతో బాటు తెలుగు భాషను 2008 అక్టోబరు 31న భారత ప్రభుత్వము గుర్తించింది.'), - ('ur', 'اُردُو لشکری زبان[8] (یا جدید معیاری اردو) برصغیر کی معیاری زبانوں میں سے ایک ہے۔ یہ پاکستان کی قومی اور رابطہ عامہ کی زبان ہے، جبکہ بھارت کی چھے ریاستوں کی دفتری زبان کا درجہ رکھتی ہے۔ آئین ہند کے مطابق اسے 22 دفتری شناخت زبانوں میں شامل کیا جاچکا ہے۔ 2001ء کی مردم شماری کے مطابق اردو کو بطور مادری زبان بھارت میں 5.01% فیصد لوگ بولتے ہیں اور اس لحاظ سے یہ بھارت کی چھٹی بڑی زبان ہے جبکہ پاکستان میں اسے بطور مادری زبان 7.59% فیصد لوگ استعمال کرتے ہیں، یہ پاکستان کی پانچویں بڑی زبان ہے۔ اردو تاریخی طور پر ہندوستان کی مسلم آبادی سے جڑی ہے۔[حوالہ درکار] بعض ذخیرہ الفاظ کے علاوہ یہ زبان معیاری ہندی سے قابل فہم ہے جو اس خطے کی ہندوؤں سے منسوب ہے۔[حوالہ درکار] زبانِ اردو کو پہچان و ترقی اس وقت ملی جب برطانوی دور میں انگریز حکمرانوں نے اسے فارسی کی بجائے انگریزی کے ساتھ شمالی ہندوستان کے علاقوں اور جموں و کشمیر میں اسے سنہ 1846ء اور پنجاب میں سنہ 1849ء میں بطور دفتری زبان نافذ کیا۔ اس کے علاوہ خلیجی، یورپی، ایشیائی اور امریکی علاقوں میں اردو بولنے والوں کی ایک بڑی تعداد آباد ہے جو بنیادی طور پر جنوبی ایشیاء سے کوچ کرنے والے اہلِ اردو ہیں۔ 1999ء کے اعداد وشمار کے مطابق اردو زبان کے مجموعی متکلمین کی تعداد دس کروڑ ساٹھ لاکھ کے لگ بھگ تھی۔ اس لحاظ سے یہ دنیا کی نویں بڑی زبان ہے۔'), + ( + "bn", + "বাংলা ভাষা (বাঙলা, বাঙ্গলা, তথা বাঙ্গালা নামগুলোতেও পরিচিত) একটি ইন্দো-আর্য ভাষা, যা দক্ষিণ এশিয়ার বাঙালি জাতির প্রধান কথ্য ও লেখ্য ভাষা। মাতৃভাষীর সংখ্যায় বাংলা ইন্দো-ইউরোপীয় ভাষা পরিবারের চতুর্থ ও বিশ্বের ষষ্ঠ বৃহত্তম ভাষা।[৫] মোট ব্যবহারকারীর সংখ্যা অনুসারে বাংলা বিশ্বের সপ্তম বৃহত্তম ভাষা। বাংলা সার্বভৌম ভাষাভিত্তিক জাতিরাষ্ট্র বাংলাদেশের একমাত্র রাষ্ট্রভাষা তথা সরকারি ভাষা[৬] এবং ভারতের পশ্চিমবঙ্গ, ত্রিপুরা, আসামের বরাক উপত্যকার সরকারি ভাষা। বঙ্গোপসাগরে অবস্থিত আন্দামান দ্বীপপুঞ্জের প্রধান কথ্য ভাষা বাংলা। এছাড়া ভারতের ঝাড়খণ্ড, বিহার, মেঘালয়, মিজোরাম, উড়িষ্যা রাজ্যগুলোতে উল্লেখযোগ্য পরিমাণে বাংলাভাষী জনগণ রয়েছে। ভারতে হিন্দির পরেই সর্বাধিক প্রচলিত ভাষা 
বাংলা।[৭][৮] এছাড়াও মধ্য প্রাচ্য, আমেরিকা ও ইউরোপে উল্লেখযোগ্য পরিমাণে বাংলাভাষী অভিবাসী রয়েছে।[৯] সারা বিশ্বে সব মিলিয়ে ২৬ কোটির অধিক লোক দৈনন্দিন জীবনে বাংলা ব্যবহার করে।[২] বাংলাদেশের জাতীয় সঙ্গীত এবং ভারতের জাতীয় সঙ্গীত ও স্তোত্র বাংলাতে রচিত।", + ), + ( + "de", + "Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ]; abgekürzt dt. oder dtsch.) ist eine westgermanische Sprache. Ihr Sprachraum umfasst Deutschland, Österreich, die Deutschschweiz, Liechtenstein, Luxemburg, Ostbelgien, Südtirol, das Elsass und Lothringen sowie Nordschleswig. Außerdem ist sie eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern, z. B. in Rumänien und Südafrika, sowie Nationalsprache im afrikanischen Namibia.", + ), + ( + "hi", + "हिन्दी विश्व की एक प्रमुख भाषा है एवं भारत की राजभाषा है। केन्द्रीय स्तर पर भारत में दूसरी आधिकारिक भाषा अंग्रेजी है। यह हिंदुस्तानी भाषा की एक मानकीकृत रूप है जिसमें संस्कृत के तत्सम तथा तद्भव शब्दों का प्रयोग अधिक है और अरबी-फ़ारसी शब्द कम हैं। हिंदी संवैधानिक रूप से भारत की राजभाषा और भारत की सबसे अधिक बोली और समझी जाने वाली भाषा है। हालाँकि, हिन्दी भारत की राष्ट्रभाषा नहीं है,[3] क्योंकि भारत के संविधान में कोई भी भाषा को ऐसा दर्जा नहीं दिया गया था।[4][5] चीनी के बाद यह विश्व में सबसे अधिक बोली जाने वाली भाषा भी है। विश्व आर्थिक मंच की गणना के अनुसार यह विश्व की दस शक्तिशाली भाषाओं में से एक है।[6]", + ), + ( + "kn", + "ದ್ರಾವಿಡ ಭಾಷೆಗಳಲ್ಲಿ ಪ್ರಾಮುಖ್ಯವುಳ್ಳ ಭಾಷೆಯೂ ಭಾರತದ ಪುರಾತನವಾದ ಭಾಷೆಗಳಲ್ಲಿ ಒಂದೂ ಆಗಿರುವ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಅದರ ವಿವಿಧ ರೂಪಗಳಲ್ಲಿ ಸುಮಾರು ೪೫ ದಶಲಕ್ಷ ಜನರು ಆಡು ನುಡಿಯಾಗಿ ಬಳಸುತ್ತಲಿದ್ದಾರೆ. ಕನ್ನಡ ಕರ್ನಾಟಕ ರಾಜ್ಯದ ಆಡಳಿತ ಭಾಷೆ.[೧೧] ಜಗತ್ತಿನಲ್ಲಿ ಅತ್ಯಂತ ಹೆಚ್ಚು ಮಂದಿ ಮಾತನಾಡುವ ಭಾಷೆಯೆಂಬ ನೆಲೆಯಲ್ಲಿ ಇಪ್ಪತೊಂಬತ್ತನೆಯ ಸ್ಥಾನ ಕನ್ನಡಕ್ಕಿದೆ. ೨೦೧೧ರ ಜನಗಣತಿಯ ಪ್ರಕಾರ ಜಗತ್ತಿನಲ್ಲಿ ೬.೪ ಕೋಟಿ ಜನಗಳು ಕನ್ನಡ ಮಾತನಾಡುತ್ತಾರೆ ಎಂದು ತಿಳಿದುಬಂದಿದೆ. ಇವರಲ್ಲಿ ೫.೫ ಕೋಟಿ ಜನಗಳ ಮಾತೃಭಾಷೆ ಕನ್ನಡವಾಗಿದೆ. ಬ್ರಾಹ್ಮಿ ಲಿಪಿಯಿಂದ ರೂಪುಗೊಂಡ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಉಪಯೋಗಿಸಿ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಬರೆಯಲಾಗುತ್ತದೆ. ಕನ್ನಡ ಬರಹದ ಮಾದರಿಗಳಿಗೆ ಸಾವಿರದ ಐನೂರು ವರುಷಗಳ ಚರಿತ್ರೆಯಿದೆ. ಕ್ರಿ.ಶ. 
ಆರನೆಯ ಶತಮಾನದ ಪಶ್ಚಿಮ ಗಂಗ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ [೧೨] ಮತ್ತು ಒಂಬತ್ತನೆಯ ಶತಮಾನದ ರಾಷ್ಟ್ರಕೂಟ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ ಹಳಗನ್ನಡ ಸಾಹಿತ್ಯ ಅತ್ಯಂತ ಹೆಚ್ಚಿನ ರಾಜಾಶ್ರಯ ಪಡೆಯಿತು.[೧೩][೧೪] ಅದಲ್ಲದೆ ಸಾವಿರ ವರುಷಗಳ ಸಾಹಿತ್ಯ ಪರಂಪರೆ ಕನ್ನಡಕ್ಕಿದೆ.[೧೫]ವಿನೋಬಾ ಭಾವೆ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಲಿಪಿಗಳ ರಾಣಿಯೆಂದು ಹೊಗಳಿದ್ದಾರೆ.[ಸೂಕ್ತ ಉಲ್ಲೇಖನ ಬೇಕು]", + ), + ( + "si", + "ශ්‍රී ලංකාවේ ප්‍රධාන ජාතිය වන සිංහල ජනයාගේ මව් බස සිංහල වෙයි. අද වන විට මිලියන 20 කට අධික සිංහල සහ මිලියන 3කට අධික සිංහල නොවන ජනගහනයක් සිංහල භාෂාව භාවිත කරති. සිංහල‍ ඉන්දු-යුරෝපීය භාෂාවල උප ගණයක් වන ඉන්දු-ආර්ය භාෂා ගණයට අයිති වන අතර මාල දිවයින භාවිත කරන දිවෙහි භාෂාව සිංහලයෙන් පැවත එන්නකි. සිංහල ශ්‍රී ලංකාවේ නිල භාෂාවයි .", + ), + ( + "ta", + "தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும். தமிழ் திராவிட மொழிக் குடும்பத்தின் முதன்மையான மொழிகளில் ஒன்றும் செம்மொழியும் ஆகும். இந்தியா, இலங்கை, மலேசியா, சிங்கப்பூர் ஆகிய நாடுகளில் அதிக அளவிலும், ஐக்கிய அரபு அமீரகம், தென்னாப்பிரிக்கா, மொரிசியசு, பிஜி, ரீயூனியன், டிரினிடாட் போன்ற நாடுகளில் சிறிய அளவிலும் தமிழ் பேசப்படுகிறது. 1997ஆம் ஆண்டுப் புள்ளி விவரப்படி உலகம் முழுவதிலும் 8 கோடி (80 மில்லியன்) மக்களால் பேசப்படும் தமிழ்[13], ஒரு மொழியைத் தாய்மொழியாகக் கொண்டு பேசும் மக்களின் எண்ணிக்கை அடிப்படையில் பதினெட்டாவது இடத்தில் உள்ளது.[14] இணையத்தில் அதிகம் பயன்படுத்தப்படும் இந்திய மொழிகளில் தமிழ் முதன்மையாக உள்ளதாக 2017 ஆவது ஆண்டில் நடைபெற்ற கூகுள் கணக்கெடுப்பில் தெரிய வந்தது.[15]", + ), + ( + "te", + "ఆంధ్ర ప్రదేశ్, తెలంగాణ రాష్ట్రాల అధికార భాష తెలుగు. భారత దేశంలో తెలుగు మాతృభాషగా మాట్లాడే 8.7 కోట్ల (2001) జనాభాతో [1] ప్రాంతీయ భాషలలో మొదటి స్థానంలో ఉంది. ప్రపంచంలోని ప్రజలు అత్యధికముగా మాట్లాడే భాషలలో 15 స్థానములోనూ, భారత దేశములో హిందీ, తర్వాత స్థానములోనూ నిలుస్తుంది. పాతవైన ప్రపంచ భాష గణాంకాల (ఎథ్నోలాగ్) ప్రకారం ప్రపంచవ్యాప్తంగా 7.4 కోట్లు మందికి మాతృభాషగా ఉంది.[2] మొదటి భాషగా మాట్లాడతారు. 
అతి ప్రాచీన దేశ భాషలలో సంస్కృతము తమిళముతో బాటు తెలుగు భాషను 2008 అక్టోబరు 31న భారత ప్రభుత్వము గుర్తించింది.", + ), + ( + "ur", + "اُردُو لشکری زبان[8] (یا جدید معیاری اردو) برصغیر کی معیاری زبانوں میں سے ایک ہے۔ یہ پاکستان کی قومی اور رابطہ عامہ کی زبان ہے، جبکہ بھارت کی چھے ریاستوں کی دفتری زبان کا درجہ رکھتی ہے۔ آئین ہند کے مطابق اسے 22 دفتری شناخت زبانوں میں شامل کیا جاچکا ہے۔ 2001ء کی مردم شماری کے مطابق اردو کو بطور مادری زبان بھارت میں 5.01% فیصد لوگ بولتے ہیں اور اس لحاظ سے یہ بھارت کی چھٹی بڑی زبان ہے جبکہ پاکستان میں اسے بطور مادری زبان 7.59% فیصد لوگ استعمال کرتے ہیں، یہ پاکستان کی پانچویں بڑی زبان ہے۔ اردو تاریخی طور پر ہندوستان کی مسلم آبادی سے جڑی ہے۔[حوالہ درکار] بعض ذخیرہ الفاظ کے علاوہ یہ زبان معیاری ہندی سے قابل فہم ہے جو اس خطے کی ہندوؤں سے منسوب ہے۔[حوالہ درکار] زبانِ اردو کو پہچان و ترقی اس وقت ملی جب برطانوی دور میں انگریز حکمرانوں نے اسے فارسی کی بجائے انگریزی کے ساتھ شمالی ہندوستان کے علاقوں اور جموں و کشمیر میں اسے سنہ 1846ء اور پنجاب میں سنہ 1849ء میں بطور دفتری زبان نافذ کیا۔ اس کے علاوہ خلیجی، یورپی، ایشیائی اور امریکی علاقوں میں اردو بولنے والوں کی ایک بڑی تعداد آباد ہے جو بنیادی طور پر جنوبی ایشیاء سے کوچ کرنے والے اہلِ اردو ہیں۔ 1999ء کے اعداد وشمار کے مطابق اردو زبان کے مجموعی متکلمین کی تعداد دس کروڑ ساٹھ لاکھ کے لگ بھگ تھی۔ اس لحاظ سے یہ دنیا کی نویں بڑی زبان ہے۔", + ), ], # fmt: on ) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 998f0472c7e..ddd9a990c65 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -65,10 +65,30 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): "embed_arch,embed_config", # fmt: off [ - ("spacy.MultiHashEmbed.v1", {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}), - ("spacy.MultiHashEmbed.v1", {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}), - ("spacy.CharacterEmbed.v1", {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}), 
- ("spacy.CharacterEmbed.v1", {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}), + ( + "spacy.MultiHashEmbed.v1", + { + "rows": [100, 100], + "attrs": ["SHAPE", "LOWER"], + "include_static_vectors": False, + }, + ), + ( + "spacy.MultiHashEmbed.v1", + { + "rows": [100, 20], + "attrs": ["ORTH", "PREFIX"], + "include_static_vectors": False, + }, + ), + ( + "spacy.CharacterEmbed.v1", + {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, + ), + ( + "spacy.CharacterEmbed.v1", + {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, + ), ], # fmt: on ) @@ -76,10 +96,26 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): "tok2vec_arch,encode_arch,encode_config", # fmt: off [ - ("spacy.Tok2Vec.v1", "spacy.MaxoutWindowEncoder.v1", {"window_size": 1, "maxout_pieces": 3, "depth": 2}), - ("spacy.Tok2Vec.v2", "spacy.MaxoutWindowEncoder.v2", {"window_size": 1, "maxout_pieces": 3, "depth": 2}), - ("spacy.Tok2Vec.v1", "spacy.MishWindowEncoder.v1", {"window_size": 1, "depth": 6}), - ("spacy.Tok2Vec.v2", "spacy.MishWindowEncoder.v2", {"window_size": 1, "depth": 6}), + ( + "spacy.Tok2Vec.v1", + "spacy.MaxoutWindowEncoder.v1", + {"window_size": 1, "maxout_pieces": 3, "depth": 2}, + ), + ( + "spacy.Tok2Vec.v2", + "spacy.MaxoutWindowEncoder.v2", + {"window_size": 1, "maxout_pieces": 3, "depth": 2}, + ), + ( + "spacy.Tok2Vec.v1", + "spacy.MishWindowEncoder.v1", + {"window_size": 1, "depth": 6}, + ), + ( + "spacy.Tok2Vec.v2", + "spacy.MishWindowEncoder.v2", + {"window_size": 1, "depth": 6}, + ), ], # fmt: on ) @@ -164,9 +200,9 @@ def test_init_tok2vec(): @pytest.mark.parametrize("with_vectors", (False, True)) def test_tok2vec_listener(with_vectors): orig_config = Config().from_str(cfg_string) - orig_config["components"]["tok2vec"]["model"]["embed"][ - "include_static_vectors" - ] = with_vectors + orig_config["components"]["tok2vec"]["model"]["embed"]["include_static_vectors"] = ( + with_vectors + ) nlp = 
util.load_model_from_config(orig_config, auto_fill=True, validate=True) if with_vectors: diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 43d5f62837a..8415e5c92ff 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -153,21 +153,171 @@ def test_issue12566(factory: str, output_file: str): "Briana McNaira - Cultural Chaos .", "tokens": [ # fmt: off - {"id": 0, "start": 0, "end": 8, "tag": "ADV", "pos": "ADV", "morph": "Degree=Pos", "lemma": "niedawno", "dep": "advmod", "head": 1, }, - {"id": 1, "start": 9, "end": 15, "tag": "PRAET", "pos": "VERB", "morph": "Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", "lemma": "czytać", "dep": "ROOT", "head": 1, }, - {"id": 2, "start": 16, "end": 18, "tag": "AGLT", "pos": "NOUN", "morph": "Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing", "lemma": "em", "dep": "iobj", "head": 1, }, - {"id": 3, "start": 19, "end": 23, "tag": "ADJ", "pos": "ADJ", "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing", "lemma": "nowy", "dep": "amod", "head": 4, }, - {"id": 4, "start": 24, "end": 31, "tag": "SUBST", "pos": "NOUN", "morph": "Case=Acc|Gender=Fem|Number=Sing", "lemma": "książka", "dep": "obj", "head": 1, }, - {"id": 5, "start": 32, "end": 43, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Nhum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "znakomit", "dep": "acl", "head": 4, }, - {"id": 6, "start": 44, "end": 54, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Hum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "szkockiy", "dep": "amod", "head": 7, }, - {"id": 7, "start": 55, "end": 66, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "medioznawca", "dep": "iobj", "head": 5, }, - {"id": 8, "start": 67, "end": 68, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Comm", "lemma": ",", "dep": "punct", "head": 9, }, - {"id": 9, "start": 69, "end": 75, "tag": "SUBST", "pos": "PROPN", "morph": 
"Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "Brian", "dep": "nmod", "head": 4, }, - {"id": 10, "start": 76, "end": 83, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "McNair", "dep": "flat", "head": 9, }, - {"id": 11, "start": 84, "end": 85, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Dash", "lemma": "-", "dep": "punct", "head": 12, }, - {"id": 12, "start": 86, "end": 94, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Cultural", "dep": "conj", "head": 4, }, - {"id": 13, "start": 95, "end": 100, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Chaos", "dep": "flat", "head": 12, }, - {"id": 14, "start": 101, "end": 102, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Peri", "lemma": ".", "dep": "punct", "head": 1, }, + { + "id": 0, + "start": 0, + "end": 8, + "tag": "ADV", + "pos": "ADV", + "morph": "Degree=Pos", + "lemma": "niedawno", + "dep": "advmod", + "head": 1, + }, + { + "id": 1, + "start": 9, + "end": 15, + "tag": "PRAET", + "pos": "VERB", + "morph": "Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", + "lemma": "czytać", + "dep": "ROOT", + "head": 1, + }, + { + "id": 2, + "start": 16, + "end": 18, + "tag": "AGLT", + "pos": "NOUN", + "morph": "Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing", + "lemma": "em", + "dep": "iobj", + "head": 1, + }, + { + "id": 3, + "start": 19, + "end": 23, + "tag": "ADJ", + "pos": "ADJ", + "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing", + "lemma": "nowy", + "dep": "amod", + "head": 4, + }, + { + "id": 4, + "start": 24, + "end": 31, + "tag": "SUBST", + "pos": "NOUN", + "morph": "Case=Acc|Gender=Fem|Number=Sing", + "lemma": "książka", + "dep": "obj", + "head": 1, + }, + { + "id": 5, + "start": 32, + "end": 43, + "tag": "ADJ", + "pos": "ADJ", + "morph": 
"Animacy=Nhum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", + "lemma": "znakomit", + "dep": "acl", + "head": 4, + }, + { + "id": 6, + "start": 44, + "end": 54, + "tag": "ADJ", + "pos": "ADJ", + "morph": "Animacy=Hum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", + "lemma": "szkockiy", + "dep": "amod", + "head": 7, + }, + { + "id": 7, + "start": 55, + "end": 66, + "tag": "SUBST", + "pos": "NOUN", + "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", + "lemma": "medioznawca", + "dep": "iobj", + "head": 5, + }, + { + "id": 8, + "start": 67, + "end": 68, + "tag": "INTERP", + "pos": "PUNCT", + "morph": "PunctType=Comm", + "lemma": ",", + "dep": "punct", + "head": 9, + }, + { + "id": 9, + "start": 69, + "end": 75, + "tag": "SUBST", + "pos": "PROPN", + "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", + "lemma": "Brian", + "dep": "nmod", + "head": 4, + }, + { + "id": 10, + "start": 76, + "end": 83, + "tag": "SUBST", + "pos": "PROPN", + "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", + "lemma": "McNair", + "dep": "flat", + "head": 9, + }, + { + "id": 11, + "start": 84, + "end": 85, + "tag": "INTERP", + "pos": "PUNCT", + "morph": "PunctType=Dash", + "lemma": "-", + "dep": "punct", + "head": 12, + }, + { + "id": 12, + "start": 86, + "end": 94, + "tag": "SUBST", + "pos": "PROPN", + "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", + "lemma": "Cultural", + "dep": "conj", + "head": 4, + }, + { + "id": 13, + "start": 95, + "end": 100, + "tag": "SUBST", + "pos": "NOUN", + "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", + "lemma": "Chaos", + "dep": "flat", + "head": 12, + }, + { + "id": 14, + "start": 101, + "end": 102, + "tag": "INTERP", + "pos": "PUNCT", + "morph": "PunctType=Peri", + "lemma": ".", + "dep": "punct", + "head": 1, + }, # fmt: on ], } @@ -420,8 +570,14 @@ def test_cli_converters_conll_ner_to_docs(): (["--x.foo=bar"], {"x.foo": "bar"}), (["--x.foo", "--x.bar", "baz"], {"x.foo": True, "x.bar": "baz"}), (["--x.foo", 
"--x.bar=baz"], {"x.foo": True, "x.bar": "baz"}), - (["--x.foo", "10.1", "--x.bar", "--x.baz", "false"], {"x.foo": 10.1, "x.bar": True, "x.baz": False}), - (["--x.foo", "10.1", "--x.bar", "--x.baz=false"], {"x.foo": 10.1, "x.bar": True, "x.baz": False}) + ( + ["--x.foo", "10.1", "--x.bar", "--x.baz", "false"], + {"x.foo": 10.1, "x.bar": True, "x.baz": False}, + ), + ( + ["--x.foo", "10.1", "--x.bar", "--x.baz=false"], + {"x.foo": 10.1, "x.bar": True, "x.baz": False}, + ), # fmt: on ], ) @@ -499,11 +655,11 @@ def test_model_recommendations(): # fmt: off "parser,textcat,tagger", " parser, textcat ,tagger ", - 'parser,textcat,tagger', - ' parser, textcat ,tagger ', + "parser,textcat,tagger", + " parser, textcat ,tagger ", ' "parser"," textcat " ,"tagger "', " 'parser',' textcat ' ,'tagger '", - '[parser,textcat,tagger]', + "[parser,textcat,tagger]", '["parser","textcat","tagger"]', '[" parser" ,"textcat ", " tagger " ]', "[parser,textcat,tagger]", @@ -522,7 +678,7 @@ def test_string_to_list(value): [ # fmt: off "1,2,3", - '[1,2,3]', + "[1,2,3]", '["1","2","3"]', '[" 1" ,"2 ", " 3 " ]', "[' 1' , '2', ' 3 ' ]", diff --git a/spacy/tests/test_factory_imports.py b/spacy/tests/test_factory_imports.py index a975af0bbd2..7a1b4a769a8 100644 --- a/spacy/tests/test_factory_imports.py +++ b/spacy/tests/test_factory_imports.py @@ -67,16 +67,16 @@ def test_factory_import_compatibility(factory_name, original_module, compat_modu # Import from the original module (registrations.py) original_module_obj = importlib.import_module(original_module) original_factory = getattr(original_module_obj, factory_name) - assert ( - original_factory is not None - ), f"Could not import {factory_name} from {original_module}" + assert original_factory is not None, ( + f"Could not import {factory_name} from {original_module}" + ) # Import from the compatibility module (component file) compat_module_obj = importlib.import_module(compat_module) compat_factory = getattr(compat_module_obj, factory_name) - 
assert ( - compat_factory is not None - ), f"Could not import {factory_name} from {compat_module}" + assert compat_factory is not None, ( + f"Could not import {factory_name} from {compat_module}" + ) # Test that they're the same function (identity) assert original_factory is compat_factory, ( diff --git a/spacy/tests/test_factory_registrations.py b/spacy/tests/test_factory_registrations.py index 8e93f54f0b0..ab604c3a6d5 100644 --- a/spacy/tests/test_factory_registrations.py +++ b/spacy/tests/test_factory_registrations.py @@ -82,9 +82,9 @@ def test_factory_registrations_preserved(reference_factory_registrations): missing_registrations = set(reference_factory_registrations.keys()) - set( current_registrations.keys() ) - assert ( - not missing_registrations - ), f"Missing factory registrations: {', '.join(sorted(missing_registrations))}" + assert not missing_registrations, ( + f"Missing factory registrations: {', '.join(sorted(missing_registrations))}" + ) # Check for new registrations (not an error, but informative) new_registrations = set(current_registrations.keys()) - set( diff --git a/spacy/tests/test_registry_population.py b/spacy/tests/test_registry_population.py index 592e74dd20a..c67136f9c31 100644 --- a/spacy/tests/test_registry_population.py +++ b/spacy/tests/test_registry_population.py @@ -50,6 +50,6 @@ def test_registry_entries(reference_registry): # Check for missing entries - these would indicate our new registry population # mechanism is missing something missing_entries = expected_set - current_set - assert ( - not missing_entries - ), f"Registry '{registry_name}' missing entries: {', '.join(missing_entries)}" + assert not missing_entries, ( + f"Registry '{registry_name}' missing entries: {', '.join(missing_entries)}" + ) diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index d92f04d0564..b8b26ce8b0d 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -57,7 +57,9 @@ class Doc: force: bool = ..., ) -> None: ... 
@classmethod - def get_extension(cls, name: str) -> Tuple[ + def get_extension( + cls, name: str + ) -> Tuple[ Optional[Any], Optional[DocMethod], Optional[Callable[[Doc], Any]], @@ -66,7 +68,9 @@ class Doc: @classmethod def has_extension(cls, name: str) -> bool: ... @classmethod - def remove_extension(cls, name: str) -> Tuple[ + def remove_extension( + cls, name: str + ) -> Tuple[ Optional[Any], Optional[DocMethod], Optional[Callable[[Doc], Any]], @@ -144,7 +148,7 @@ class Doc: blocked: Optional[List[Span]] = ..., missing: Optional[List[Span]] = ..., outside: Optional[List[Span]] = ..., - default: str = ... + default: str = ..., ) -> None: ... @property def noun_chunks(self) -> Iterator[Span]: ... diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 070aaffb3a8..b982eb810b8 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -23,7 +23,9 @@ class Span: force: bool = ..., ) -> None: ... @classmethod - def get_extension(cls, name: str) -> Tuple[ + def get_extension( + cls, name: str + ) -> Tuple[ Optional[Any], Optional[SpanMethod], Optional[Callable[[Span], Any]], @@ -32,7 +34,9 @@ class Span: @classmethod def has_extension(cls, name: str) -> bool: ... @classmethod - def remove_extension(cls, name: str) -> Tuple[ + def remove_extension( + cls, name: str + ) -> Tuple[ Optional[Any], Optional[SpanMethod], Optional[Callable[[Span], Any]], diff --git a/spacy/tokens/span_group.pyi b/spacy/tokens/span_group.pyi index d063bb59533..3bd2b6788fb 100644 --- a/spacy/tokens/span_group.pyi +++ b/spacy/tokens/span_group.pyi @@ -12,7 +12,7 @@ class SpanGroup: *, name: str = ..., attrs: Dict[str, Any] = ..., - spans: Iterable[Span] = ... + spans: Iterable[Span] = ..., ) -> None: ... def __repr__(self) -> str: ... @property diff --git a/spacy/tokens/token.pyi b/spacy/tokens/token.pyi index 7e56ae3bccd..435ace52707 100644 --- a/spacy/tokens/token.pyi +++ b/spacy/tokens/token.pyi @@ -27,7 +27,9 @@ class Token: force: bool = ..., ) -> None: ... 
@classmethod - def get_extension(cls, name: str) -> Tuple[ + def get_extension( + cls, name: str + ) -> Tuple[ Optional[Any], Optional[TokenMethod], Optional[Callable[[Token], Any]], @@ -36,7 +38,9 @@ class Token: @classmethod def has_extension(cls, name: str) -> bool: ... @classmethod - def remove_extension(cls, name: str) -> Tuple[ + def remove_extension( + cls, name: str + ) -> Tuple[ Optional[Any], Optional[TokenMethod], Optional[Callable[[Token], Any]], diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py index 4a1dfa94515..4f3ac5de795 100644 --- a/spacy/training/batchers.py +++ b/spacy/training/batchers.py @@ -24,7 +24,7 @@ def configure_minibatch_by_padded_size( size: Sizing, buffer: int, discard_oversize: bool, - get_length: Optional[Callable[[ItemT], int]] = None + get_length: Optional[Callable[[ItemT], int]] = None, ) -> BatcherT: """Create a batcher that uses the `batch_by_padded_size` strategy. @@ -49,7 +49,7 @@ def configure_minibatch_by_padded_size( size=size, buffer=buffer, discard_oversize=discard_oversize, - **optionals + **optionals, ) @@ -58,7 +58,7 @@ def configure_minibatch_by_words( size: Sizing, tolerance: float, discard_oversize: bool, - get_length: Optional[Callable[[ItemT], int]] = None + get_length: Optional[Callable[[ItemT], int]] = None, ) -> BatcherT: """Create a batcher that uses the "minibatch by words" strategy. @@ -76,7 +76,7 @@ def configure_minibatch_by_words( size=size, tolerance=tolerance, discard_oversize=discard_oversize, - **optionals + **optionals, ) diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py index b19d1791b27..e66a8a8dfed 100644 --- a/spacy/training/converters/conll_ner_to_docs.py +++ b/spacy/training/converters/conll_ner_to_docs.py @@ -74,8 +74,7 @@ def conll_ner_to_docs( # provide warnings for problematic data if "\n\n" not in input_data: msg.warn( - "No sentence boundaries found. Use `-s` to automatically segment " - "sentences." 
+ "No sentence boundaries found. Use `-s` to automatically segment sentences." ) if doc_delimiter not in input_data: msg.warn( diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py index bda5c88c3d4..3a60c4e024b 100644 --- a/spacy/training/converters/conllu_to_docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -15,7 +15,7 @@ def conllu_to_docs( ner_map=None, merge_subtokens=False, no_print=False, - **_ + **_, ): """ Convert conllu files into JSON format for use with train cli. diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 488ca4a7136..05c5d3bb681 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -176,7 +176,7 @@ def log_step(info: Optional[Dict[str, Any]]) -> None: initial = info["step"] else: total = eval_frequency - desc = f"Epoch {info['epoch']+1}" + desc = f"Epoch {info['epoch'] + 1}" initial = 0 # Set disable=None, so that it disables on non-TTY progress = tqdm.tqdm( diff --git a/spacy/ty.py b/spacy/ty.py index b37f2e18a1f..c18ce284dc0 100644 --- a/spacy/ty.py +++ b/spacy/ty.py @@ -29,7 +29,7 @@ def update( *, drop: float = 0.0, sgd: Optional[Optimizer] = None, - losses: Optional[Dict[str, float]] = None + losses: Optional[Dict[str, float]] = None, ) -> Dict[str, float]: ... def finish_update(self, sgd: Optimizer) -> None: ... @@ -41,7 +41,7 @@ def initialize( self, get_examples: Callable[[], Iterable["Example"]], nlp: "Language", - **kwargs: Any + **kwargs: Any, ): ... diff --git a/spacy/util.py b/spacy/util.py index ad5a7e0bada..14d7b539994 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -557,7 +557,9 @@ def load_model_from_package( RETURNS (Language): The loaded nlp object. 
""" cls = importlib.import_module(name) - return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config) # type: ignore[attr-defined] + return cls.load( + vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config + ) # type: ignore[attr-defined] def load_model_from_path( From 79b5f811bf6e403528e6e7e84307af4951df1a03 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 21 Mar 2026 08:39:43 +0100 Subject: [PATCH 26/42] Update CI validation workflow: replace black, isort, flake8 with ruff --- .github/workflows/tests.yml | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index bb4eb278131..c967be0d128 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -33,24 +33,13 @@ jobs: with: python-version: "3.10" - - name: black - run: | - python -m pip install black -c requirements.txt - python -m black spacy --check - - name: isort - run: | - python -m pip install isort -c requirements.txt - python -m isort spacy --check - - name: flake8 - run: | - python -m pip install flake8==5.0.4 - python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics - # Unfortunately cython-lint isn't working after the shift to Cython 3. 
- #- name: cython-lint - # run: | - # python -m pip install cython-lint -c requirements.txt - # # E501: line too log, W291: trailing whitespace, E266: too many leading '#' for block comment - # cython-lint spacy --ignore E501,W291,E266 + - name: ruff format + run: | + python -m pip install ruff -c requirements.txt + python -m ruff format spacy --check + - name: ruff lint + run: | + python -m ruff check spacy tests: name: Test From 86f7ce303a2ebedf562227229f75b43bcac6cf46 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 21 Mar 2026 08:41:25 +0100 Subject: [PATCH 27/42] Limit CI ruff lint to isort-only checks for now --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c967be0d128..adfce07f50b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -37,9 +37,9 @@ jobs: run: | python -m pip install ruff -c requirements.txt python -m ruff format spacy --check - - name: ruff lint + - name: ruff isort run: | - python -m ruff check spacy + python -m ruff check spacy --select I tests: name: Test From 47b5504e90cf30f1675df112aed1af9330c07986 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 21 Mar 2026 08:43:16 +0100 Subject: [PATCH 28/42] Autofix autofixable things from ruff --- spacy/cli/_util.py | 11 +---------- spacy/cli/debug_diff.py | 3 +-- spacy/cli/evaluate.py | 3 +-- spacy/cli/find_threshold.py | 6 +----- spacy/lang/fa/generate_verbs_exc.py | 6 +++--- spacy/lang/ga/lemmatizer.py | 2 +- spacy/lang/ht/lemmatizer.py | 1 - spacy/lang/sl/punctuation.py | 2 -- spacy/matcher/phrasematcher.pyi | 2 +- spacy/ml/_character_embed.py | 1 - spacy/ml/_precomputable_affine.py | 1 - spacy/ml/callbacks.py | 4 +--- spacy/ml/extract_ngrams.py | 1 - spacy/ml/extract_spans.py | 1 - spacy/ml/featureextractor.py | 2 +- spacy/ml/models/entity_linker.py | 1 - spacy/ml/models/multi_task.py | 6 +++--- spacy/ml/models/parser.py | 3 +-- 
spacy/ml/models/span_finder.py | 1 - spacy/ml/models/spancat.py | 1 - spacy/ml/models/tagger.py | 1 - spacy/ml/models/textcat.py | 1 - spacy/ml/models/tok2vec.py | 3 +-- spacy/ml/staticvectors.py | 4 ++-- spacy/ml/tb_framework.py | 1 - spacy/pipe_analysis.py | 4 ++-- spacy/pipeline/_edit_tree_internals/schemas.py | 1 - spacy/pipeline/attributeruler.py | 3 +-- spacy/pipeline/edit_tree_lemmatizer.py | 1 - spacy/pipeline/entity_linker.py | 4 +--- spacy/pipeline/entityruler.py | 3 +-- spacy/pipeline/functions.py | 1 - spacy/pipeline/lemmatizer.py | 3 +-- spacy/pipeline/pipe.pyi | 1 - spacy/pipeline/span_finder.py | 2 -- spacy/pipeline/span_ruler.py | 3 +-- spacy/pipeline/spancat.py | 2 -- spacy/pipeline/textcat.py | 5 +---- spacy/pipeline/textcat_multilabel.py | 6 +----- spacy/pipeline/tok2vec.py | 1 - spacy/tests/doc/test_doc_api.py | 6 +++--- spacy/tests/lang/bg/test_tokenizer.py | 1 - spacy/tests/lang/es/test_noun_chunks.py | 10 +++++----- spacy/tests/lang/fr/test_noun_chunks.py | 12 ++++++------ spacy/tests/lang/it/test_noun_chunks.py | 12 ++++++------ spacy/tests/lang/la/test_exception.py | 1 - spacy/tests/lang/pt/test_noun_chunks.py | 10 +++++----- spacy/tests/lang/sl/test_text.py | 1 - spacy/tests/lang/sq/test_text.py | 1 - spacy/tests/lang/xx/test_text.py | 1 - spacy/tests/pipeline/test_entity_linker.py | 4 ++-- spacy/tests/pipeline/test_textcat.py | 1 - spacy/tests/test_cli_app.py | 1 - spacy/tests/test_factory_registrations.py | 3 --- spacy/tests/test_models.py | 2 +- spacy/tests/test_registry_population.py | 1 - spacy/tests/training/test_corpus.py | 3 +-- spacy/tests/vocab_vectors/test_lexeme.py | 1 - spacy/tokens/_serialize.py | 2 +- spacy/training/augment.py | 1 - spacy/training/batchers.py | 5 ++--- spacy/training/callbacks.py | 2 +- spacy/training/loggers.py | 1 - spacy/vocab.pyi | 1 - 64 files changed, 60 insertions(+), 131 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 5057640a5b9..757c418440b 100644 --- a/spacy/cli/_util.py 
+++ b/spacy/cli/_util.py @@ -1,15 +1,11 @@ -import hashlib import os -import shutil import sys from configparser import InterpolationError from contextlib import contextmanager from pathlib import Path from typing import ( - TYPE_CHECKING, Any, Dict, - Iterable, List, Optional, Tuple, @@ -21,22 +17,17 @@ import typer from click import NoSuchOption from click.shell_completion import split_arg_string -from thinc.api import Config, ConfigValidationError, require_gpu +from thinc.api import ConfigValidationError, require_gpu from thinc.util import gpu_is_available from typer.main import get_command from wasabi import Printer, msg from weasel import app as project_cli -from .. import about from ..compat import Literal -from ..schemas import validate from ..util import ( ENV_VARS, - SimpleFrozenDict, import_file, - is_compatible_version, logger, - make_tempdir, registry, run_command, ) diff --git a/spacy/cli/debug_diff.py b/spacy/cli/debug_diff.py index 08c31b32df3..71d8826bc5e 100644 --- a/spacy/cli/debug_diff.py +++ b/spacy/cli/debug_diff.py @@ -2,11 +2,10 @@ from typing import Optional import typer -from thinc.api import Config from wasabi import MarkdownRenderer, Printer, diff_strings from ..util import load_config -from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error +from ._util import Arg, Opt, debug_cli, show_validation_error from .init_config import Optimizations, init_config diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 62131fe13ea..9704ea44413 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,13 +1,12 @@ import re from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional import srsly from thinc.api import fix_random_seed from wasabi import Printer from .. 
import displacy, util -from ..scorer import Scorer from ..tokens import Doc from ..training import Corpus from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index d89b4c27d55..7b2c5f98051 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -207,11 +207,7 @@ def filter_config( ), ) if hasattr(pipe, "cfg"): - setattr( - nlp.get_pipe(pipe_name), - "cfg", - set_nested_item(getattr(pipe, "cfg"), config_keys, threshold), - ) + nlp.get_pipe(pipe_name).cfg = set_nested_item(pipe.cfg, config_keys, threshold) eval_scores = nlp.evaluate(dev_dataset) if scores_key not in eval_scores: diff --git a/spacy/lang/fa/generate_verbs_exc.py b/spacy/lang/fa/generate_verbs_exc.py index a6d79a386df..7ef82c3e8a9 100644 --- a/spacy/lang/fa/generate_verbs_exc.py +++ b/spacy/lang/fa/generate_verbs_exc.py @@ -611,8 +611,8 @@ present_ends = ["م", "ی", "د", "یم", "ید", "ند"] # special case of '#هست': -VERBS_EXC.update({conj: "هست" for conj in ["هست" + end for end in simple_ends]}) -VERBS_EXC.update({conj: "هست" for conj in ["نیست" + end for end in simple_ends]}) +VERBS_EXC.update(dict.fromkeys(["هست" + end for end in simple_ends], "هست")) +VERBS_EXC.update(dict.fromkeys(["نیست" + end for end in simple_ends], "هست")) for verb_root in verb_roots: conjugations = [] @@ -648,4 +648,4 @@ ) ) - VERBS_EXC.update({conj: (past,) if past else present for conj in conjugations}) + VERBS_EXC.update(dict.fromkeys(conjugations, (past,) if past else present)) diff --git a/spacy/lang/ga/lemmatizer.py b/spacy/lang/ga/lemmatizer.py index c9fbfbc193a..cffcf1d3c49 100644 --- a/spacy/lang/ga/lemmatizer.py +++ b/spacy/lang/ga/lemmatizer.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Tuple +from typing import List, Tuple from ...pipeline import Lemmatizer from ...tokens import Token diff --git a/spacy/lang/ht/lemmatizer.py b/spacy/lang/ht/lemmatizer.py index 52bf23d2390..7687865c300 100644 --- 
a/spacy/lang/ht/lemmatizer.py +++ b/spacy/lang/ht/lemmatizer.py @@ -1,6 +1,5 @@ from typing import List, Tuple -from ...lookups import Lookups from ...pipeline import Lemmatizer from ...tokens import Token diff --git a/spacy/lang/sl/punctuation.py b/spacy/lang/sl/punctuation.py index dadb54d315c..3be83eba382 100644 --- a/spacy/lang/sl/punctuation.py +++ b/spacy/lang/sl/punctuation.py @@ -5,14 +5,12 @@ CONCAT_QUOTES, CURRENCY, HYPHENS, - LIST_CURRENCY, LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES, PUNCT, UNITS, - merge_chars, ) from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi index 27f6ba373fc..0f56699d63f 100644 --- a/spacy/matcher/phrasematcher.pyi +++ b/spacy/matcher/phrasematcher.pyi @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, List, Optional, Tuple, Union, overload +from typing import Any, Callable, List, Optional, Tuple, Union, overload from ..compat import Literal from ..tokens import Doc, Span diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py index fde73f35b5b..8cc4d25743e 100644 --- a/spacy/ml/_character_embed.py +++ b/spacy/ml/_character_embed.py @@ -4,7 +4,6 @@ from thinc.types import Floats2d from ..tokens import Doc -from ..util import registry def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]: diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index cdcac0c3812..ac2f6bbd3fa 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -1,6 +1,5 @@ from thinc.api import Model, normal_init -from ..util import registry def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1): diff --git a/spacy/ml/callbacks.py b/spacy/ml/callbacks.py index fefb170ba21..d9976cea80a 100644 --- a/spacy/ml/callbacks.py +++ b/spacy/ml/callbacks.py @@ -2,14 +2,12 @@ import inspect import types import warnings -from typing import TYPE_CHECKING, Callable, Dict, 
List, Optional, Set, Type +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set from thinc.layers import with_nvtx_range -from thinc.model import Model, wrap_model_recursive from thinc.util import use_nvtx_range from ..errors import Warnings -from ..util import registry if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py index d571973122e..9f54b48899e 100644 --- a/spacy/ml/extract_ngrams.py +++ b/spacy/ml/extract_ngrams.py @@ -1,7 +1,6 @@ from thinc.api import Model from ..attrs import LOWER -from ..util import registry def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model: diff --git a/spacy/ml/extract_spans.py b/spacy/ml/extract_spans.py index d3456b705a6..2351ddc2cf0 100644 --- a/spacy/ml/extract_spans.py +++ b/spacy/ml/extract_spans.py @@ -3,7 +3,6 @@ from thinc.api import Model, to_numpy from thinc.types import Ints1d, Ragged -from ..util import registry def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]: diff --git a/spacy/ml/featureextractor.py b/spacy/ml/featureextractor.py index fb4e3c39aea..ad376e15f25 100644 --- a/spacy/ml/featureextractor.py +++ b/spacy/ml/featureextractor.py @@ -1,6 +1,6 @@ from typing import Callable, List, Tuple, Union -from thinc.api import Model, registry +from thinc.api import Model from thinc.types import Ints2d from ..tokens import Doc diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 752d1c4433c..05ad9a27287 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -23,7 +23,6 @@ get_candidates_batch, ) from ...tokens import Doc, Span -from ...util import registry from ...vocab import Vocab from ..extract_spans import extract_spans diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 7c68fe48126..9beecf878ad 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ 
-1,5 +1,5 @@ from functools import partial -from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Optional, Tuple, cast +from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Tuple, cast import numpy from thinc.api import ( @@ -21,7 +21,7 @@ from ...attrs import ID, ORTH from ...errors import Errors -from ...util import OOV_RANK, registry +from ...util import OOV_RANK from ...vectors import Mode as VectorsMode if TYPE_CHECKING: @@ -199,7 +199,7 @@ def mlm_initialize(model: Model, X=None, Y=None): layers=[wrapped_model], init=mlm_initialize, refs={"wrapped": wrapped_model}, - dims={dim: None for dim in wrapped_model.dim_names}, + dims=dict.fromkeys(wrapped_model.dim_names), ) mlm_model.set_ref("wrapped", wrapped_model) return mlm_model diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 9ff0ac8ba3c..20b8f6d6e80 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -1,4 +1,4 @@ -from typing import List, Optional, cast +from typing import List, Optional from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init from thinc.types import Floats2d @@ -6,7 +6,6 @@ from ...compat import Literal from ...errors import Errors from ...tokens import Doc -from ...util import registry from .._precomputable_affine import PrecomputableAffine from ..tb_framework import TransitionModel diff --git a/spacy/ml/models/span_finder.py b/spacy/ml/models/span_finder.py index 8081ed92b70..226b736c7eb 100644 --- a/spacy/ml/models/span_finder.py +++ b/spacy/ml/models/span_finder.py @@ -4,7 +4,6 @@ from thinc.types import Floats1d, Floats2d from ...tokens import Doc -from ...util import registry InT = List[Doc] OutT = Floats2d diff --git a/spacy/ml/models/spancat.py b/spacy/ml/models/spancat.py index 91dfb41ed7f..697d1df4d35 100644 --- a/spacy/ml/models/spancat.py +++ b/spacy/ml/models/spancat.py @@ -18,7 +18,6 @@ from thinc.types import Floats2d, Ragged from ...tokens import Doc -from ...util import registry from 
..extract_spans import extract_spans diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index aec4276dbd8..d3b090de005 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -4,7 +4,6 @@ from thinc.types import Floats2d from ...tokens import Doc -from ...util import registry def build_tagger_model( diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 49c0dd7077c..8194ab3101e 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -36,7 +36,6 @@ from ...attrs import ORTH from ...errors import Errors from ...tokens import Doc -from ...util import registry from ..extract_ngrams import extract_ngrams from ..staticvectors import StaticVectors from .tok2vec import get_tok2vec_width diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index b2b803b6ed0..ade84274475 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -17,14 +17,13 @@ with_array, with_padded, ) -from thinc.types import Floats2d, Ints1d, Ints2d, Ragged +from thinc.types import Floats2d, Ints2d, Ragged from ...attrs import intify_attr from ...errors import Errors from ...ml import _character_embed from ...pipeline.tok2vec import Tok2VecListener from ...tokens import Doc -from ...util import registry from ..featureextractor import FeatureExtractor from ..staticvectors import StaticVectors diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index bc69e53ab24..d90acdaf008 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -1,7 +1,7 @@ import warnings -from typing import Callable, List, Optional, Sequence, Tuple, cast +from typing import Callable, List, Optional, Tuple, cast -from thinc.api import Model, Ops, registry +from thinc.api import Model, Ops from thinc.initializers import glorot_uniform_init from thinc.types import Floats1d, Floats2d, Ints1d, Ragged from thinc.util import partial diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 
16c894f6c5c..e538b9e88c0 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -1,6 +1,5 @@ from thinc.api import Model, noop -from ..util import registry from .parser_model import ParserStepModel diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py index d26884487d3..b564b466e50 100644 --- a/spacy/pipe_analysis.py +++ b/spacy/pipe_analysis.py @@ -23,7 +23,7 @@ def validate_attrs(values: Iterable[str]) -> Iterable[str]: values (Iterable[str]): The string attributes to check, e.g. `["token.pos"]`. RETURNS (Iterable[str]): The checked attributes. """ - data = dot_to_dict({value: True for value in values}) + data = dot_to_dict(dict.fromkeys(values, True)) objs = {"doc": Doc, "token": Token, "span": Span} for obj_key, attrs in data.items(): if obj_key == "span": @@ -100,7 +100,7 @@ def analyze_pipes( all_attrs.update(meta.requires) result["summary"][name] = {key: getattr(meta, key, None) for key in keys} prev_pipes = nlp.pipeline[:i] - requires = {annot: False for annot in meta.requires} + requires = dict.fromkeys(meta.requires, False) if requires: for prev_name, prev_pipe in prev_pipes: prev_meta = nlp.get_pipe_meta(prev_name) diff --git a/spacy/pipeline/_edit_tree_internals/schemas.py b/spacy/pipeline/_edit_tree_internals/schemas.py index ef7a076b6cd..1f12b63607c 100644 --- a/spacy/pipeline/_edit_tree_internals/schemas.py +++ b/spacy/pipeline/_edit_tree_internals/schemas.py @@ -6,7 +6,6 @@ ConfigDict, Field, RootModel, - StrictBool, StrictInt, StrictStr, ValidationError, diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index aeb0672c7c7..d4f96ec014b 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -1,5 +1,4 @@ import importlib -import sys from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union @@ -14,7 +13,7 @@ from ..tokens import Doc, Span from ..tokens._retokenize import normalize_token_attrs, set_token_attrs from 
..training import Example -from ..util import SimpleFrozenList, registry +from ..util import SimpleFrozenList from ..vocab import Vocab from .pipe import Pipe diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index 0941b43c1ce..77f033b1c48 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -1,5 +1,4 @@ import importlib -import sys from collections import Counter from itertools import islice from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 6a1ed11dfc5..4b23fee6249 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,6 +1,5 @@ import importlib import random -import sys from itertools import islice from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Union @@ -16,9 +15,8 @@ from ..scorer import Scorer from ..tokens import Doc, Span from ..training import Example, validate_examples, validate_get_examples -from ..util import SimpleFrozenList, registry +from ..util import SimpleFrozenList from ..vocab import Vocab -from .legacy.entity_linker import EntityLinker_v1 from .pipe import deserialize_config from .trainable_pipe import TrainablePipe diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 2b8c9830720..0728c3f0006 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -1,5 +1,4 @@ import importlib -import sys import warnings from collections import defaultdict from pathlib import Path @@ -14,7 +13,7 @@ from ..scorer import get_ner_prf from ..tokens import Doc, Span from ..training import Example -from ..util import SimpleFrozenList, ensure_path, from_disk, registry, to_disk +from ..util import SimpleFrozenList, ensure_path, from_disk, to_disk from .pipe import Pipe DEFAULT_ENT_ID_SEP = "||" diff --git a/spacy/pipeline/functions.py 
b/spacy/pipeline/functions.py index e4a3d6d1d5b..b2aa8b708c8 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -1,5 +1,4 @@ import importlib -import sys import warnings from typing import Any, Dict diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index e8d467ef8db..f518e1072ac 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -1,5 +1,4 @@ import importlib -import sys import warnings from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union @@ -13,7 +12,7 @@ from ..scorer import Scorer from ..tokens import Doc, Token from ..training import Example -from ..util import SimpleFrozenList, logger, registry +from ..util import SimpleFrozenList, logger from ..vocab import Vocab from .pipe import Pipe diff --git a/spacy/pipeline/pipe.pyi b/spacy/pipeline/pipe.pyi index 9a1c11cefea..55cfd1fec95 100644 --- a/spacy/pipeline/pipe.pyi +++ b/spacy/pipeline/pipe.pyi @@ -7,7 +7,6 @@ from typing import ( Iterator, List, NoReturn, - Optional, Tuple, Union, ) diff --git a/spacy/pipeline/span_finder.py b/spacy/pipeline/span_finder.py index 26c9efb6a9d..7ee19de04b0 100644 --- a/spacy/pipeline/span_finder.py +++ b/spacy/pipeline/span_finder.py @@ -1,5 +1,4 @@ import importlib -import sys from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple from thinc.api import Config, Model, Optimizer, set_dropout_rate @@ -10,7 +9,6 @@ from ..scorer import Scorer from ..tokens import Doc, Span from ..training import Example -from ..util import registry from .spancat import DEFAULT_SPANS_KEY from .trainable_pipe import TrainablePipe diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py index 98287ba1d22..703eda61561 100644 --- a/spacy/pipeline/span_ruler.py +++ b/spacy/pipeline/span_ruler.py @@ -1,5 +1,4 @@ import importlib -import sys import warnings from functools import partial from pathlib import Path @@ -27,7 +26,7 @@ from ..scorer 
import Scorer from ..tokens import Doc, Span from ..training import Example -from ..util import SimpleFrozenList, ensure_path, registry +from ..util import SimpleFrozenList, ensure_path from .pipe import Pipe PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 805a0538f01..9b945df35b5 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -1,5 +1,4 @@ import importlib -import sys from dataclasses import dataclass from functools import partial from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast @@ -14,7 +13,6 @@ from ..scorer import Scorer from ..tokens import Doc, Span, SpanGroup from ..training import Example, validate_examples -from ..util import registry from ..vocab import Vocab from .trainable_pipe import TrainablePipe diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 36b569edc63..7b03c7e81d4 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,18 +1,15 @@ import importlib -import sys from itertools import islice from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple import numpy -from thinc.api import Config, Model, Optimizer, get_array_module, set_dropout_rate -from thinc.types import Floats2d +from thinc.api import Config, Model, Optimizer, set_dropout_rate from ..errors import Errors from ..language import Language from ..scorer import Scorer from ..tokens import Doc from ..training import Example, validate_examples, validate_get_examples -from ..util import registry from ..vocab import Vocab from .trainable_pipe import TrainablePipe diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index 32845490d4e..cc094bf6197 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -1,17 +1,13 @@ import importlib -import sys from itertools import islice -from typing import Any, Callable, Dict, 
Iterable, List, Optional +from typing import Any, Callable, Dict, Iterable, Optional from thinc.api import Config, Model -from thinc.types import Floats2d from ..errors import Errors from ..language import Language from ..scorer import Scorer -from ..tokens import Doc from ..training import Example, validate_get_examples -from ..util import registry from ..vocab import Vocab from .textcat import TextCategorizer diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index ce0296bf5f3..4e2e5af846f 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -1,5 +1,4 @@ import importlib -import sys from itertools import islice from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index d72c916efb0..ef098ec1a9f 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -60,12 +60,12 @@ def test_issue1757(): """Test comparison against None doesn't cause segfault.""" doc = Doc(Vocab(), words=["a", "b", "c"]) assert not doc[0] < None - assert not doc[0] is None + assert doc[0] is not None assert doc[0] >= None assert not doc[:2] < None - assert not doc[:2] is None + assert doc[:2] is not None assert doc[:2] >= None - assert not doc.vocab["a"] is None + assert doc.vocab["a"] is not None assert not doc.vocab["a"] < None diff --git a/spacy/tests/lang/bg/test_tokenizer.py b/spacy/tests/lang/bg/test_tokenizer.py index 2e2c45001ef..ec575ec9838 100644 --- a/spacy/tests/lang/bg/test_tokenizer.py +++ b/spacy/tests/lang/bg/test_tokenizer.py @@ -1,4 +1,3 @@ -import pytest def test_bg_tokenizer_handles_final_diacritics(bg_tokenizer): diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py index 8e5fe83540c..50d49fcc28e 100644 --- a/spacy/tests/lang/es/test_noun_chunks.py +++ b/spacy/tests/lang/es/test_noun_chunks.py @@ -48,13 +48,13 @@ [(0,4)] ), # Tengo un gato y un perro -> un gato, un perro - 
( + ( ["Tengo", "un", "gato", "y", "un", "perro"], [0, 2, 0, 5, 5, 0], ["ROOT", "det", "obj", "cc", "det", "conj"], ["VERB", "DET", "NOUN", "CCONJ", "DET", "NOUN"], [(1,3), (4,6)] - + ), # Dom Pedro II -> Dom Pedro II ( @@ -101,11 +101,11 @@ [1, 1, 3, 1, 5, 1], ['det', 'ROOT', 'case', 'nmod', 'case', 'nmod'], ['DET', 'NOUN', 'ADP', 'PROPN', 'ADP', 'NOUN'], - [(0,2), (3,4), (5,6)] - + [(0,2), (3,4), (5,6)] + ), # El gato regordete de Susana y su amigo -> el gato regordete, Susana, su amigo - ( + ( ['El', 'gato', 'regordete', 'de', 'Susana', 'y', 'su', 'amigo'], [1, 1, 1, 4, 1, 7, 7, 1], ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'conj'], diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index 436e07b29d0..d413f1f2211 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -35,7 +35,7 @@ [(0, 2)], ), # det + adj + noun - # Le vieux Londres -> Le vieux Londres + # Le vieux Londres -> Le vieux Londres ( ['Les', 'vieux', 'Londres'], [2, 2, 2], @@ -144,13 +144,13 @@ ), # Two NPs conjuncted # Il a un chien et un chat -> Il, un chien, un chat - ( + ( ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'], [1, 1, 3, 1, 6, 6, 3], ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'], ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], [(0,1), (2,4), (5,7)] - + ), # Two NPs together # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado @@ -195,12 +195,12 @@ [0, 2, 0, 4, 2], ['ROOT', 'case', 'nmod', 'case', 'nmod'], ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], - [(0,1), (2,3), (4,5)] - + [(0,1), (2,3), (4,5)] + ), # Several NPs # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie - ( + ( ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'], [2, 2, 2, 4, 2, 7, 7, 2], ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'], diff --git a/spacy/tests/lang/it/test_noun_chunks.py b/spacy/tests/lang/it/test_noun_chunks.py index 
7f6659ee7bd..5fd39ab01b9 100644 --- a/spacy/tests/lang/it/test_noun_chunks.py +++ b/spacy/tests/lang/it/test_noun_chunks.py @@ -62,7 +62,7 @@ [(0,3)], ), # noun + adj plural - # mucche bianche + # mucche bianche ( ["mucche", "bianche"], [0, 0], @@ -117,13 +117,13 @@ ), # Two NPs conjuncted # Ho un cane e un gatto -> un cane, un gatto - ( + ( ['Ho', 'un', 'cane', 'e', 'un', 'gatto'], [0, 2, 0, 5, 5, 0], ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'], ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], [(1,3), (4,6)] - + ), # Two NPs together # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado @@ -177,12 +177,12 @@ [0, 2, 0, 4, 2], ['ROOT', 'case', 'nmod', 'case', 'nmod'], ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], - [(0,1), (2,3), (4,5)] - + [(0,1), (2,3), (4,5)] + ), # Several NPs # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica - ( + ( ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'], [1, 1, 1, 4, 1, 8, 8, 8, 1], ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'], diff --git a/spacy/tests/lang/la/test_exception.py b/spacy/tests/lang/la/test_exception.py index 966ae22cfec..a7fb7c85254 100644 --- a/spacy/tests/lang/la/test_exception.py +++ b/spacy/tests/lang/la/test_exception.py @@ -1,4 +1,3 @@ -import pytest def test_la_tokenizer_handles_exc_in_text(la_tokenizer): diff --git a/spacy/tests/lang/pt/test_noun_chunks.py b/spacy/tests/lang/pt/test_noun_chunks.py index eee96d593b1..5dd7bfd3b82 100644 --- a/spacy/tests/lang/pt/test_noun_chunks.py +++ b/spacy/tests/lang/pt/test_noun_chunks.py @@ -126,13 +126,13 @@ ), # Two NPs conjuncted # Eu tenho um cachorro e um gato -> Eu, um cacharo, um gato - ( + ( ["Eu", "tenho", "um", "cachorro", "e", "um", "gato"], [1, 1, 3, 1, 6, 6, 3], ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'], ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], [(0,1), (2,4), (5,7)] - + ), # Two NPs together # o escritor brasileiro Aníbal Machado -> 
o escritor brasileiro, Aníbal Machado @@ -186,12 +186,12 @@ [0, 2, 0, 4, 2], ['ROOT', 'case', 'nmod', 'case', 'nmod'], ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], - [(0,1), (2,3), (4,5)] - + [(0,1), (2,3), (4,5)] + ), # Several NPs # O gato gordo da Susana e seu amigo -> O gato gordo, Susana, seu amigo - ( + ( ['O', 'gato', 'gordo', 'da', 'Susana', 'e', 'seu', 'amigo'], [1, 1, 1, 4, 1, 7, 7, 1], ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'conj'], diff --git a/spacy/tests/lang/sl/test_text.py b/spacy/tests/lang/sl/test_text.py index a2a93207729..741e5cc1003 100644 --- a/spacy/tests/lang/sl/test_text.py +++ b/spacy/tests/lang/sl/test_text.py @@ -1,4 +1,3 @@ -import pytest def test_long_text(sl_tokenizer): diff --git a/spacy/tests/lang/sq/test_text.py b/spacy/tests/lang/sq/test_text.py index 44eedaa5487..368bb6c4157 100644 --- a/spacy/tests/lang/sq/test_text.py +++ b/spacy/tests/lang/sq/test_text.py @@ -1,4 +1,3 @@ -import pytest def test_long_text(sq_tokenizer): diff --git a/spacy/tests/lang/xx/test_text.py b/spacy/tests/lang/xx/test_text.py index 477f0ebe271..c28692ee57e 100644 --- a/spacy/tests/lang/xx/test_text.py +++ b/spacy/tests/lang/xx/test_text.py @@ -1,4 +1,3 @@ -import pytest def test_long_text(xx_tokenizer): diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 5e50a4d2801..74dd026e716 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,9 +1,9 @@ -from typing import Any, Callable, Dict, Iterable, Tuple +from typing import Any, Callable, Dict, Iterable import pytest from numpy.testing import assert_equal -from spacy import Language, registry, util +from spacy import registry, util from spacy.attrs import ENT_KB_ID from spacy.compat import pickle from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase, get_candidates diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 
4310e41ab47..f2bfe003d39 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -29,7 +29,6 @@ from spacy.training.initialize import init_nlp # Ensure that the architecture gets added to the registry. -from ..tok2vec import build_lazy_init_tok2vec as _ from ..util import make_tempdir TRAIN_DATA_SINGLE_LABEL = [ diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 1789d60ea4c..5fe9e9bcf01 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -1,5 +1,4 @@ import os -import sys from pathlib import Path import pytest diff --git a/spacy/tests/test_factory_registrations.py b/spacy/tests/test_factory_registrations.py index ab604c3a6d5..eb69265e3f3 100644 --- a/spacy/tests/test_factory_registrations.py +++ b/spacy/tests/test_factory_registrations.py @@ -1,17 +1,14 @@ -import inspect import json from pathlib import Path import pytest -from spacy.language import Language from spacy.util import registry # Path to the reference factory registrations, relative to this file REFERENCE_FILE = Path(__file__).parent / "factory_registrations.json" # Monkey patch the util.is_same_func to handle Cython functions -import inspect from spacy import util diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 5228b4544fd..706203ffd63 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -95,7 +95,7 @@ def test_multi_hash_embed(): hash_embeds = [node for node in embed.walk() if node.name == "hashembed"] assert len(hash_embeds) == 3 # Check they look at different columns. 
- assert list(sorted(he.attrs["column"] for he in hash_embeds)) == [0, 1, 2] + assert sorted(he.attrs["column"] for he in hash_embeds) == [0, 1, 2] # Check they use different seeds assert len(set(he.attrs["seed"] for he in hash_embeds)) == 3 # Check they all have the same number of rows diff --git a/spacy/tests/test_registry_population.py b/spacy/tests/test_registry_population.py index c67136f9c31..e72f3d9f8e8 100644 --- a/spacy/tests/test_registry_population.py +++ b/spacy/tests/test_registry_population.py @@ -1,5 +1,4 @@ import json -import os from pathlib import Path import pytest diff --git a/spacy/tests/training/test_corpus.py b/spacy/tests/training/test_corpus.py index e7cae989384..ded6a53833c 100644 --- a/spacy/tests/training/test_corpus.py +++ b/spacy/tests/training/test_corpus.py @@ -1,7 +1,6 @@ -import tempfile from contextlib import contextmanager from pathlib import Path -from typing import IO, Generator, Iterable, List, TextIO, Tuple +from typing import Generator, Iterable, List, Tuple import pytest diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index 156e3391aa2..3c01055b552 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -2,7 +2,6 @@ import pytest from spacy.attrs import IS_ALPHA, IS_DIGIT -from spacy.lookups import Lookups from spacy.tokens import Doc from spacy.util import OOV_RANK from spacy.vocab import Vocab diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 873d85835f0..06d1791ade0 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -207,7 +207,7 @@ def to_bytes(self) -> bytes: "tokens": tokens.tobytes("C"), "spaces": spaces.tobytes("C"), "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), - "strings": list(sorted(self.strings)), + "strings": sorted(self.strings), "cats": self.cats, "flags": self.flags, "span_groups": self.span_groups, diff --git a/spacy/training/augment.py 
b/spacy/training/augment.py index da5ae3d087a..ba4368acef1 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -3,7 +3,6 @@ from functools import partial from typing import TYPE_CHECKING, Callable, Dict, Iterator, List, Optional, Tuple -from ..util import registry from .example import Example from .iob_utils import _doc_to_biluo_tags_with_partial, split_bilu_label diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py index 4f3ac5de795..8158e9fdf8f 100644 --- a/spacy/training/batchers.py +++ b/spacy/training/batchers.py @@ -4,7 +4,6 @@ Any, Callable, Iterable, - Iterator, List, Optional, Sequence, @@ -12,7 +11,7 @@ Union, ) -from ..util import minibatch, registry +from ..util import minibatch Sizing = Union[Sequence[int], int] ItemT = TypeVar("ItemT") @@ -232,6 +231,6 @@ def _batch_by_length( batches.append(batch) # Check lengths match assert sum(len(b) for b in batches) == len(seqs) - batches = [list(sorted(batch)) for batch in batches] + batches = [sorted(batch) for batch in batches] batches.reverse() return batches diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py index 714deea6dcd..19382757a95 100644 --- a/spacy/training/callbacks.py +++ b/spacy/training/callbacks.py @@ -1,7 +1,7 @@ from typing import TYPE_CHECKING, Callable, Optional from ..errors import Errors -from ..util import load_model, logger, registry +from ..util import load_model, logger if TYPE_CHECKING: from ..language import Language diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 05c5d3bb681..7f200545ca0 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -8,7 +8,6 @@ from .. 
import util from ..errors import Errors -from ..util import registry if TYPE_CHECKING: from ..language import Language # noqa: F401 diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi index ee7636f02c8..906a4c0d978 100644 --- a/spacy/vocab.pyi +++ b/spacy/vocab.pyi @@ -5,7 +5,6 @@ from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Unio from cymem.cymem import Pool from thinc.types import Floats1d, FloatsXd -from . import Language from .lexeme import Lexeme from .lookups import Lookups from .morphology import Morphology From 8e6bd6d1c5ff008adf3ce8a0bb07250e1c6b68ae Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 21 Mar 2026 08:43:52 +0100 Subject: [PATCH 29/42] Apply ruff formatting to 8 files --- spacy/cli/find_threshold.py | 4 +++- spacy/ml/_precomputable_affine.py | 1 - spacy/ml/extract_spans.py | 1 - spacy/tests/lang/bg/test_tokenizer.py | 2 -- spacy/tests/lang/la/test_exception.py | 2 -- spacy/tests/lang/sl/test_text.py | 2 -- spacy/tests/lang/sq/test_text.py | 2 -- spacy/tests/lang/xx/test_text.py | 2 -- 8 files changed, 3 insertions(+), 13 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 7b2c5f98051..05b6fdc9517 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -207,7 +207,9 @@ def filter_config( ), ) if hasattr(pipe, "cfg"): - nlp.get_pipe(pipe_name).cfg = set_nested_item(pipe.cfg, config_keys, threshold) + nlp.get_pipe(pipe_name).cfg = set_nested_item( + pipe.cfg, config_keys, threshold + ) eval_scores = nlp.evaluate(dev_dataset) if scores_key not in eval_scores: diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index ac2f6bbd3fa..464c32594dc 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -1,7 +1,6 @@ from thinc.api import Model, normal_init - def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1): model = Model( "precomputable_affine", diff --git a/spacy/ml/extract_spans.py 
b/spacy/ml/extract_spans.py index 2351ddc2cf0..925bfd45c31 100644 --- a/spacy/ml/extract_spans.py +++ b/spacy/ml/extract_spans.py @@ -4,7 +4,6 @@ from thinc.types import Ints1d, Ragged - def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]: """Extract spans from a sequence of source arrays, as specified by an array of (start, end) indices. The output is a ragged array of the diff --git a/spacy/tests/lang/bg/test_tokenizer.py b/spacy/tests/lang/bg/test_tokenizer.py index ec575ec9838..b16ef12d880 100644 --- a/spacy/tests/lang/bg/test_tokenizer.py +++ b/spacy/tests/lang/bg/test_tokenizer.py @@ -1,5 +1,3 @@ - - def test_bg_tokenizer_handles_final_diacritics(bg_tokenizer): text = "Ня̀маше яйца̀. Ня̀маше яйца̀." tokens = bg_tokenizer(text) diff --git a/spacy/tests/lang/la/test_exception.py b/spacy/tests/lang/la/test_exception.py index a7fb7c85254..9a6e6a422c5 100644 --- a/spacy/tests/lang/la/test_exception.py +++ b/spacy/tests/lang/la/test_exception.py @@ -1,5 +1,3 @@ - - def test_la_tokenizer_handles_exc_in_text(la_tokenizer): text = "scio te omnia facturum, ut nobiscum quam primum sis" tokens = la_tokenizer(text) diff --git a/spacy/tests/lang/sl/test_text.py b/spacy/tests/lang/sl/test_text.py index 741e5cc1003..4781bebcdcb 100644 --- a/spacy/tests/lang/sl/test_text.py +++ b/spacy/tests/lang/sl/test_text.py @@ -1,5 +1,3 @@ - - def test_long_text(sl_tokenizer): # Excerpt: European Convention on Human Rights text = """ diff --git a/spacy/tests/lang/sq/test_text.py b/spacy/tests/lang/sq/test_text.py index 368bb6c4157..24d60afdf20 100644 --- a/spacy/tests/lang/sq/test_text.py +++ b/spacy/tests/lang/sq/test_text.py @@ -1,5 +1,3 @@ - - def test_long_text(sq_tokenizer): # Excerpt: European Convention on Human Rights text = """ diff --git a/spacy/tests/lang/xx/test_text.py b/spacy/tests/lang/xx/test_text.py index c28692ee57e..a4eafdcb98e 100644 --- a/spacy/tests/lang/xx/test_text.py +++ b/spacy/tests/lang/xx/test_text.py @@ -1,5 +1,3 @@ - - def 
test_long_text(xx_tokenizer): # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi text = """ From 24255bd1e25c7275553325d989bfcfd3a41fc1d2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 21 Mar 2026 08:44:43 +0100 Subject: [PATCH 30/42] Fix import sorting for ruff isort compliance --- spacy/__init__.py | 11 ++++++----- spacy/compat.py | 4 +++- spacy/lang/pt/punctuation.py | 8 +++++--- spacy/pipeline/factories.py | 2 +- spacy/tests/pipeline/test_initialize.py | 2 -- spacy/tests/pipeline/test_pipe_factories.py | 2 -- spacy/tests/test_misc.py | 2 -- spacy/tokens/_serialize.py | 3 +-- spacy/training/initialize.py | 3 +-- 9 files changed, 17 insertions(+), 20 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index eeab3773591..9d7fdf29ba7 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -10,16 +10,16 @@ # These are imported as part of the API from thinc.api import Config, prefer_gpu, require_cpu, require_gpu # noqa: F401 -from . import pipeline # noqa: F401 -from . import util +from . 
import ( + pipeline, # noqa: F401 + util, +) from .about import __version__ # noqa: F401 from .cli.info import info # noqa: F401 from .errors import Errors from .glossary import explain # noqa: F401 from .language import Language from .registrations import REGISTRY_POPULATED, populate_registry -from .util import logger, registry # noqa: F401 -from .vocab import Vocab # Rebuild pydantic v2 schemas that use forward references to Language/Vocab from .schemas import ( # noqa: F401 @@ -29,8 +29,9 @@ ConfigSchemaPretrain, ConfigSchemaTraining, ) - from .training import Example # noqa: F401 +from .util import logger, registry # noqa: F401 +from .vocab import Vocab _rebuild_ns = {"Language": Language, "Vocab": Vocab, "Example": Example} for _schema in ( diff --git a/spacy/compat.py b/spacy/compat.py index a9e7d5a20b9..abaca49302f 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -35,7 +35,9 @@ try: # Python 3.8+ import importlib.metadata as importlib_metadata except ImportError: - from catalogue import _importlib_metadata as importlib_metadata # type: ignore[no-redef] # noqa: F401 + from catalogue import ( + _importlib_metadata as importlib_metadata, # type: ignore[no-redef] # noqa: F401 + ) from thinc.api import Optimizer # noqa: F401 diff --git a/spacy/lang/pt/punctuation.py b/spacy/lang/pt/punctuation.py index b2d63cb3d63..60bd50da1eb 100644 --- a/spacy/lang/pt/punctuation.py +++ b/spacy/lang/pt/punctuation.py @@ -1,6 +1,8 @@ -from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES -from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES -from ..punctuation import TOKENIZER_SUFFIXES as BASE_TOKENIZER_SUFFIXES +from ..punctuation import ( + TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES, + TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES, + TOKENIZER_SUFFIXES as BASE_TOKENIZER_SUFFIXES, +) _prefixes = [r"\w{1,3}\$"] + BASE_TOKENIZER_PREFIXES diff --git a/spacy/pipeline/factories.py b/spacy/pipeline/factories.py index f796f2dc8a5..e76704237ee 
100644 --- a/spacy/pipeline/factories.py +++ b/spacy/pipeline/factories.py @@ -24,8 +24,8 @@ from ..pipeline.sentencizer import Sentencizer from ..pipeline.senter import DEFAULT_SENTER_MODEL, SentenceRecognizer from ..pipeline.span_finder import DEFAULT_SPAN_FINDER_MODEL, SpanFinder -from ..pipeline.span_ruler import DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY from ..pipeline.span_ruler import ( + DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY, SpanRuler, prioritize_existing_ents_filter, prioritize_new_ents_filter, diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py index acb1c6faa45..71b12227f2c 100644 --- a/spacy/tests/pipeline/test_initialize.py +++ b/spacy/tests/pipeline/test_initialize.py @@ -1,7 +1,5 @@ import pytest - from pydantic import StrictBool - from thinc.api import ConfigValidationError from spacy.lang.en import English diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index f8f9ec4b10d..a8a6c7d136a 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -1,7 +1,5 @@ import pytest - from pydantic import StrictInt, StrictStr - from thinc.api import ConfigValidationError, Linear, Model import spacy diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 2c26952891d..309c57b0926 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -3,9 +3,7 @@ from pathlib import Path import pytest - from pydantic import ValidationError - from thinc.api import ( Config, ConfigValidationError, diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 06d1791ade0..51f5740c25c 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -13,8 +13,7 @@ from ..util import SimpleFrozenList, ensure_path from ..vocab import Vocab from ._dict_proxies import SpanGroups -from .doc import DOCBIN_ALL_ATTRS as ALL_ATTRS -from .doc import Doc +from .doc import 
DOCBIN_ALL_ATTRS as ALL_ATTRS, Doc class DocBin: diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 0621702214c..64cdddf9c43 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -25,8 +25,7 @@ registry, resolve_dot_names, ) -from ..vectors import Mode as VectorsMode -from ..vectors import Vectors +from ..vectors import Mode as VectorsMode, Vectors from .pretrain import get_tok2vec_ref if TYPE_CHECKING: From f175a51e2df36f16d473c7b62a0b9b8563ebb773 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 23 Mar 2026 13:45:02 +0100 Subject: [PATCH 31/42] Fully migrate to Pydantic v2 (#13940) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use confection v1.3 and Thinc v8.3.13, which implement custom validation logic in place of Pydantic, allowing us to properly adopt Pydantic v2 and provide full Python 3.14 support. Our dependency tree used Pydantic v1 in unusual ways, and relied on behaviours that Pydantic v2 reformed. In the time since Pydantic v2 was released there were a few attempts to migrate over to it, but the task has been complicated by the fact that the confection library has a fairly tangled implementation and I had reduced availability for open-source work in 2024 and 2025. Specifically, our library confection provides the extensible configuration system we use in spaCy and Thinc. The config system allows you to refer to values that will be supplied by arbitrary functions, that e.g. define some neural network model or its sublayers. The functionality in confection is complicated because we aggressively prioritised user experience in the specification, even if it required increased implementation complexity. Confection's original implementation built a dynamic Pydantic v1 schema for function-supplied values ("promises"). 
We validate the schema before calling any promises, and then validate the schema again after calling all the promises and substituting in their values. The variable-interpolation system adds further difficulties to the implementation, and we have to do it all while subclassing the Python built-in configparser, which ties us to implementation choices I'd do differently if I had a clean slate. Here's one summary of Pydantic v1-specific behaviours that made the migration to v2 particularly difficult for us. This particular summary was produced during a session with Claude Code Opus 4.6, so nuances of it might be wrong. The full history of attempts at doing this spans over different refactors separated by a few months at a time, so I don't have a full record of all the things that I struggled with. It's possible some details of this summary are incorrect though. The core problem we kept hitting: Pydantic v2 compiles validation schemas upfront and has much stricter immutability. The whole session has been a series of workarounds for this: ``` 1. Schema mutation — v1 let you mutate __fields__ in place; v2 needs model_rebuild() which loses forward ref namespaces, or create_model subclasses which don't propagate to parent schemas. 2. model_dump vs dict — v2 converts dataclasses to dicts, breaking resolved objects. Needed a custom _model_to_dict helper. 3. model_construct drops extras — v2 silently drops fields with extra="forbid", needed manual workarounds. 4. Strict coercion — v2 coerces ndarray to List[Floats1d] via iteration, needed strict=True. 5. Forward refs — Every schema with TYPE_CHECKING imports needs model_rebuild() with the right namespace, and that breaks when confection re-rebuilds later. In order to adjust for behavioural differences like this, I'd refactored confection to build the different versions of the schema in multiple passes, instead of building all the representations together as we'd been doing. 
However, this refactor itself had problems, further complicating the migration. ``` ~I've now bitten the bullet and rolled back the refactor I'd been attempting of confection, and instead replaced the Pydantic validation with custom logic. This allows Confection to remove Pydantic as a dependency entirely.~ Update: Actually I went back and got the refactor working. All much nicer now. I've gone to some lengths to explain this because migrating off a dependency after breaking changes can be a sensitive topic. I want to stress that the changes Pydantic made from v1 to v2 are very good, and I greatly appreciate them as a user of FastAPI in our services. It would be very bad for the ecosystem if Pydantic pinned themselves to exactly matching the behaviours they had in v1 just to avoid breaking support for the sort of thing we'd been doing. Instead users who were relying on those behaviours like us should just find some way to adapt --- either vendor the v1 version we need, or change our behaviours, or implement an alternative. I would have liked to do this sooner but we've ultimately gone with the third option. 
--- .github/workflows/cibuildwheel.yml | 5 ++++- .github/workflows/explosionbot.yml | 10 +++++++--- .github/workflows/issue-manager.yml | 6 +++++- .github/workflows/lock.yml | 2 +- .github/workflows/publish_pypi.yml | 4 +++- .github/workflows/spacy_universe_alert.yml | 13 ++++--------- .github/workflows/tests.yml | 18 ++++++++++-------- .github/workflows/universe_validation.yml | 6 ++++-- pyproject.toml | 2 +- requirements.txt | 12 ++++++------ setup.cfg | 16 ++++++++++------ spacy/__init__.py | 2 +- spacy/cli/debug_config.py | 4 ++-- spacy/cli/debug_data.py | 2 +- spacy/cli/debug_model.py | 2 +- spacy/cli/find_threshold.py | 6 ++++-- spacy/cli/init_config.py | 2 +- spacy/compat.py | 4 ++-- spacy/lang/es/lemmatizer.py | 2 +- spacy/language.py | 10 ++++++---- spacy/matcher/phrasematcher.pyx | 2 +- spacy/pipeline/_edit_tree_internals/schemas.py | 1 + spacy/pipeline/factories.py | 3 ++- spacy/schemas.py | 6 +++--- spacy/tests/lang/zh/test_tokenizer.py | 2 +- spacy/tests/package/test_requirements.py | 1 + spacy/tests/pipeline/test_textcat.py | 1 + spacy/tests/test_cli.py | 3 +++ spacy/training/batchers.py | 1 + spacy/training/corpus.py | 3 ++- spacy/training/initialize.py | 2 +- spacy/training/loop.py | 2 +- spacy/training/pretrain.py | 2 +- 33 files changed, 93 insertions(+), 64 deletions(-) diff --git a/.github/workflows/cibuildwheel.yml b/.github/workflows/cibuildwheel.yml index 5f8ba9285ac..5f731a31595 100644 --- a/.github/workflows/cibuildwheel.yml +++ b/.github/workflows/cibuildwheel.yml @@ -7,9 +7,12 @@ on: # ** matches 'zero or more of any character' - 'release-v[0-9]+.[0-9]+.[0-9]+**' - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**' + +permissions: {} + jobs: build_wheels: - uses: explosion/gha-cibuildwheel/.github/workflows/cibuildwheel.yml@main + uses: explosion/gha-cibuildwheel/.github/workflows/cibuildwheel.yml@2c98f757f13d112cf73fcf4b627249f1fffb5aae # main permissions: contents: write actions: read diff --git a/.github/workflows/explosionbot.yml 
b/.github/workflows/explosionbot.yml index 78a27cfa3ba..979385ccb90 100644 --- a/.github/workflows/explosionbot.yml +++ b/.github/workflows/explosionbot.yml @@ -6,6 +6,8 @@ on: - created - edited +permissions: {} + jobs: explosion-bot: if: github.repository_owner == 'explosion' @@ -15,13 +17,15 @@ jobs: env: GITHUB_CONTEXT: ${{ toJson(github) }} run: echo "$GITHUB_CONTEXT" - - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 - name: Install and run explosion-bot run: | - pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot + git config --global url."https://x-access-token:${EXPLOSIONBOT_TOKEN}@github.com/".insteadOf "https://github.com/" + pip install git+https://github.com/explosion/explosion-bot python -m explosionbot env: + EXPLOSIONBOT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }} INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }} INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }} ENABLED_COMMANDS: "test_gpu,test_slow,test_slow_gpu" diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index 6c7d7d5a6f8..264707485e7 100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -11,12 +11,16 @@ on: types: - labeled +permissions: {} + jobs: issue-manager: + permissions: + issues: write if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - - uses: tiangolo/issue-manager@0.4.0 + - uses: tiangolo/issue-manager@4d1b7e05935a404dc8337d30bd23be46be8bb8e5 # 0.4.0 with: token: ${{ secrets.GITHUB_TOKEN }} config: > diff --git a/.github/workflows/lock.yml b/.github/workflows/lock.yml index 2bbdd64c771..8fcf3028476 100644 --- a/.github/workflows/lock.yml +++ b/.github/workflows/lock.yml @@ -16,7 +16,7 @@ jobs: if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - - uses: 
dessant/lock-threads@v5 + - uses: dessant/lock-threads@1bf7ec25051fe7c00bdd17e6a7cf3d7bfb7dc771 # v5 with: process-only: 'issues' issue-inactive-days: '30' diff --git a/.github/workflows/publish_pypi.yml b/.github/workflows/publish_pypi.yml index 9f432874cc2..fcc6f2a9999 100644 --- a/.github/workflows/publish_pypi.yml +++ b/.github/workflows/publish_pypi.yml @@ -8,6 +8,8 @@ on: types: - published +permissions: {} + jobs: upload_pypi: runs-on: ubuntu-latest @@ -21,7 +23,7 @@ jobs: # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this) # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') steps: - - uses: robinraju/release-downloader@v1 + - uses: robinraju/release-downloader@daf26c55d821e836577a15f77d86ddc078948b05 # v1 with: tag: ${{ github.event.release.tag_name }} fileName: '*' diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml index 01731ffe0d7..ec0230699be 100644 --- a/.github/workflows/spacy_universe_alert.yml +++ b/.github/workflows/spacy_universe_alert.yml @@ -5,21 +5,16 @@ on: paths: - "website/meta/universe.json" +permissions: {} + jobs: build: if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - - name: Dump GitHub context - env: - GITHUB_CONTEXT: ${{ toJson(github) }} - PR_NUMBER: ${{github.event.number}} - run: | - echo "$GITHUB_CONTEXT" - - - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: '3.10' - name: Install Bernadette app dependency and send an alert diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index adfce07f50b..b20dba12f04 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -19,6 +19,8 @@ on: - "*.mdx" - "website/**" +permissions: {} + jobs: validate: name: Validate @@ 
-26,10 +28,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - name: Configure Python version - uses: actions/setup-python@v4 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: "3.10" @@ -45,19 +47,19 @@ jobs: name: Test needs: Validate strategy: - fail-fast: true + fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python_version: ["3.10", "3.11", "3.12", "3.13"] + python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"] runs-on: ${{ matrix.os }} steps: - name: Check out repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - name: Configure Python version - uses: actions/setup-python@v4 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: ${{ matrix.python_version }} @@ -93,7 +95,7 @@ jobs: shell: bash - name: Test import - run: python -W error -c "import spacy" + run: python -W error -W 'ignore:Core Pydantic V1:UserWarning:pydantic' -c "import spacy" - name: "Test download CLI" run: | @@ -154,7 +156,7 @@ jobs: - name: "Run CPU tests" run: | - python -m pytest --pyargs spacy -W error + python -m pytest --pyargs spacy -W error -W 'ignore:Core Pydantic V1:UserWarning:pydantic' if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')" - name: "Run CPU tests with thinc-apple-ops" diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml index ce7df49dbae..e97850cd4b0 100644 --- a/.github/workflows/universe_validation.yml +++ b/.github/workflows/universe_validation.yml @@ -13,6 +13,8 @@ on: paths: - "website/meta/universe.json" +permissions: {} + jobs: validate: name: Validate @@ -20,10 +22,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out repo - uses: actions/checkout@v4 + uses: 
actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - name: Configure Python version - uses: actions/setup-python@v4 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: "3.7" diff --git a/pyproject.toml b/pyproject.toml index 9e6cab69da3..395c2f7a108 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.3.4,<8.4.0", + "thinc>=8.3.12,<8.4.0", "numpy>=2.0.0,<3.0.0" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 866128b31b6..50c6382bea3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,19 +3,19 @@ spacy-legacy>=3.0.11,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.3.4,<8.4.0 -ml_datasets>=0.2.0,<0.3.0 +thinc>=8.3.12,<8.4.0 +ml_datasets>=0.2.1,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.2.0 -srsly>=2.4.3,<3.0.0 +srsly>=2.5.3,<3.0.0 catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<1.0.0 -weasel>=0.4.2,<0.5.0 +weasel>=1.0.0,<2.0.0 # Third party dependencies numpy>=2.0.0,<3.0.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 -pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 +pydantic>=2.0.0,<3.0.0 jinja2 # Official Python utilities setuptools @@ -34,4 +34,4 @@ types-requests types-setuptools>=57.0.0 ruff>=0.9.0 cython-lint>=0.15.0 -confection>=0.0.4,<1.0.0 +confection>=1.1.0,<2.0.0 diff --git a/setup.cfg b/setup.cfg index 1ef8e303125..66cf942218b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,6 +22,7 @@ classifiers = Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3.14 Topic :: Scientific/Engineering project_urls = Release notes = https://github.com/explosion/spaCy/releases @@ -41,7 +42,7 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.3.4,<8.4.0 + thinc>=8.3.12,<8.4.0 
install_requires = # Our libraries spacy-legacy>=3.0.11,<3.1.0 @@ -49,19 +50,19 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.3.4,<8.4.0 + thinc>=8.3.12,<8.4.0 wasabi>=0.9.1,<1.2.0 - srsly>=2.4.3,<3.0.0 + srsly>=2.5.3,<3.0.0 catalogue>=2.0.6,<2.1.0 - weasel>=0.4.2,<0.5.0 - confection>=0.0.4,<1.0.0 + weasel>=1.0.0,<2.0.0 + confection>=1.1.0,<2.0.0 # Third-party dependencies typer>=0.3.0,<1.0.0 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" requests>=2.13.0,<3.0.0 - pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 + pydantic>=2.0.0,<3.0.0 jinja2 # Official Python utilities setuptools @@ -135,6 +136,9 @@ formats = gztar markers = slow: mark a test as slow issue: reference specific issue +filterwarnings = + error + ignore:Core Pydantic V1:UserWarning:pydantic [mypy] ignore_missing_imports = True diff --git a/spacy/__init__.py b/spacy/__init__.py index 9d7fdf29ba7..5b3ff25c872 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -41,7 +41,7 @@ ConfigSchemaInit, ConfigSchema, ): - _schema.model_rebuild(_types_namespace=_rebuild_ns) + _schema.model_rebuild(_types_namespace=_rebuild_ns) # type: ignore[attr-defined] if sys.maxunicode == 65535: raise SystemError(Errors.E130) diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index f049d7fd149..4876b6ff9e1 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -82,10 +82,10 @@ def debug_config( config = nlp.config.interpolate() msg.divider("Config validation for [initialize]") with show_validation_error(config_path): - T = registry.resolve(config["initialize"], schema=ConfigSchemaInit) + T = registry.resolve(config["initialize"], schema=ConfigSchemaInit) # type: ignore[arg-type] msg.divider("Config validation for [training]") with show_validation_error(config_path): - T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + T = registry.resolve(config["training"], 
schema=ConfigSchemaTraining) # type: ignore[arg-type] dot_names = [T["train_corpus"], T["dev_corpus"]] util.resolve_dot_names(config, dot_names) msg.good("Config is valid") diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index df52250cf90..6ba18e7f224 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -137,7 +137,7 @@ def debug_data( cfg = util.load_config(config_path, overrides=config_overrides) nlp = util.load_model_from_config(cfg) config = nlp.config.interpolate() - T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) # type: ignore[arg-type] # Use original config here, not resolved version sourced_components = get_sourced_components(cfg) frozen_components = T["frozen_components"] diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index ec7ffe099d7..dc0de3e1489 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -89,7 +89,7 @@ def debug_model_cli( with show_validation_error(config_path): nlp = util.load_model_from_config(raw_config) config = nlp.config.interpolate() - T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) # type: ignore[arg-type] seed = T["seed"] if seed is not None: msg.info(f"Fixing random seed: {seed}") diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 05b6fdc9517..1873f476fcd 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -207,8 +207,10 @@ def filter_config( ), ) if hasattr(pipe, "cfg"): - nlp.get_pipe(pipe_name).cfg = set_nested_item( - pipe.cfg, config_keys, threshold + nlp.get_pipe(pipe_name).cfg = set_nested_item( # type: ignore[attr-defined] + pipe.cfg, + config_keys, + threshold, # type: ignore[attr-defined] ) eval_scores = nlp.evaluate(dev_dataset) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 
2cb39056d5e..c7081040280 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -216,7 +216,7 @@ def init_config( # Filter out duplicates since tok2vec and transformer are added by template pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")] defaults = RECOMMENDATIONS["__default__"] - reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).dict() + reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).model_dump() variables = { "lang": lang, "components": pipeline, diff --git a/spacy/compat.py b/spacy/compat.py index abaca49302f..828ed1ba62e 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -35,8 +35,8 @@ try: # Python 3.8+ import importlib.metadata as importlib_metadata except ImportError: - from catalogue import ( - _importlib_metadata as importlib_metadata, # type: ignore[no-redef] # noqa: F401 + from catalogue import ( # type: ignore[no-redef] + _importlib_metadata as importlib_metadata, # noqa: F401 ) from thinc.api import Optimizer # noqa: F401 diff --git a/spacy/lang/es/lemmatizer.py b/spacy/lang/es/lemmatizer.py index 05238a75b70..3102f3b9bc4 100644 --- a/spacy/lang/es/lemmatizer.py +++ b/spacy/lang/es/lemmatizer.py @@ -416,7 +416,7 @@ def lemmatize_verb_pron( rule = self.select_rule("verb", features) verb_lemma = self.lemmatize_verb( verb, - features - {"PronType=Prs"}, + features - {"PronType=Prs"}, # type: ignore[operator] rule, index, # type: ignore[operator] )[0] diff --git a/spacy/language.py b/spacy/language.py index ea9ba3cae8b..8e91018254e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1323,7 +1323,7 @@ def get_examples(): # Make sure the config is interpolated so we can resolve subsections config = self.config.interpolate() # These are the settings provided in the [initialize] block in the config - I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) + I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) # type: ignore[arg-type] 
before_init = I["before_init"] if before_init is not None: before_init(self) @@ -1353,7 +1353,7 @@ def get_examples(): proc.initialize(get_examples, nlp=self, **p_settings) pretrain_cfg = config.get("pretraining") if pretrain_cfg: - P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain) + P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain) # type: ignore[arg-type] init_tok2vec(self, P, I) self._link_components() self._optimizer = sgd @@ -1823,7 +1823,7 @@ def from_config( orig_pretraining = config.pop("pretraining", None) config["components"] = {} if auto_fill: - filled = registry.fill(config, validate=validate, schema=ConfigSchema) + filled = registry.fill(config, validate=validate, schema=ConfigSchema) # type: ignore[arg-type] else: filled = config filled["components"] = orig_pipeline @@ -1832,7 +1832,9 @@ def from_config( filled["pretraining"] = orig_pretraining config["pretraining"] = orig_pretraining resolved_nlp = registry.resolve( - filled["nlp"], validate=validate, schema=ConfigSchemaNlp + filled["nlp"], + validate=validate, + schema=ConfigSchemaNlp, # type: ignore[arg-type] ) create_tokenizer = resolved_nlp["tokenizer"] create_vectors = resolved_nlp["vectors"] diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index ccc830e35c1..a71f85f6e63 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -57,7 +57,7 @@ cdef class PhraseMatcher: attr = "ORTH" if attr == "IS_SENT_START": attr = "SENT_START" - if attr.lower() not in TokenPattern().dict(): + if attr.lower() not in TokenPattern().model_dump(): raise ValueError(Errors.E152.format(attr=attr)) self.attr = IDS.get(attr) diff --git a/spacy/pipeline/_edit_tree_internals/schemas.py b/spacy/pipeline/_edit_tree_internals/schemas.py index 1f12b63607c..ef7a076b6cd 100644 --- a/spacy/pipeline/_edit_tree_internals/schemas.py +++ b/spacy/pipeline/_edit_tree_internals/schemas.py @@ -6,6 +6,7 @@ ConfigDict, Field, RootModel, + StrictBool, 
StrictInt, StrictStr, ValidationError, diff --git a/spacy/pipeline/factories.py b/spacy/pipeline/factories.py index e76704237ee..8c71067b32e 100644 --- a/spacy/pipeline/factories.py +++ b/spacy/pipeline/factories.py @@ -14,9 +14,10 @@ ) # Import factory default configurations -from ..pipeline.entity_linker import DEFAULT_NEL_MODEL, EntityLinker, EntityLinker_v1 +from ..pipeline.entity_linker import DEFAULT_NEL_MODEL, EntityLinker from ..pipeline.entityruler import DEFAULT_ENT_ID_SEP, EntityRuler from ..pipeline.functions import DocCleaner, TokenSplitter +from ..pipeline.legacy import EntityLinker_v1 from ..pipeline.lemmatizer import Lemmatizer from ..pipeline.morphologizer import DEFAULT_MORPH_MODEL, Morphologizer from ..pipeline.multitask import DEFAULT_MT_MODEL, MultitaskObjective diff --git a/spacy/schemas.py b/spacy/schemas.py index e3200348013..359c3fd0f83 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -251,7 +251,7 @@ class TokenPatternOperatorSimple(str, Enum): TokenPatternOperatorMinMax = constr(pattern=r"^(\{\d+\}|\{\d+,\d*\}|\{\d*,\d+\})$") -TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax] +TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax] # type: ignore[valid-type] StringValue = Union[TokenPatternString, StrictStr] NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat] UnderscoreValue = Union[ @@ -420,8 +420,8 @@ class ConfigSchemaInit(BaseModel): lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. 
lexeme normalization") vectors: Optional[StrictStr] = Field(..., title="Path to vectors") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") - tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize") - components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component") + tokenizer: Dict[StrictStr, Any] = Field(..., title="Arguments to be passed into Tokenizer.initialize") + components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component") before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization") after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization") # fmt: on diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py index cdba5e39709..cb9b4ec539a 100644 --- a/spacy/tests/lang/zh/test_tokenizer.py +++ b/spacy/tests/lang/zh/test_tokenizer.py @@ -1,5 +1,5 @@ import pytest -from thinc.api import ConfigValidationError +from confection import ConfigValidationError from spacy.lang.zh import Chinese, _get_pkuseg_trie_data diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index b9276441222..f4c6f056aa2 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -15,6 +15,7 @@ def test_build_dependencies(): "cython-lint", "black", "isort", + "ruff", "mypy", "types-dataclasses", "types-mock", diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index f2bfe003d39..e7499404f63 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -24,6 +24,7 @@ ) 
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer +from spacy.tests.tok2vec import build_lazy_init_tok2vec as _ # noqa: F401 from spacy.tokens import Doc, DocBin from spacy.training import Example from spacy.training.initialize import init_nlp diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 8415e5c92ff..4bac40f0b89 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1229,6 +1229,9 @@ def test_download_rejects_relative_urls(monkeypatch): relative path in the filename""" monkeypatch.setattr(download_module, "run_command", lambda cmd: None) + monkeypatch.setattr( + download_module, "_get_pip_install_cmd", lambda: ["pip", "install"] + ) # Check that normal download works download_module.download("en_core_web_sm-3.7.1", direct=True) diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py index 8158e9fdf8f..40e437dcc8c 100644 --- a/spacy/training/batchers.py +++ b/spacy/training/batchers.py @@ -4,6 +4,7 @@ Any, Callable, Iterable, + Iterator, List, Optional, Sequence, diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 5cc2733a540..30e32911e6b 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -50,9 +50,10 @@ def create_jsonl_reader( @util.registry.readers("spacy.read_labels.v1") -def read_labels(path: Path, *, require: bool = False): +def read_labels(path: Union[str, Path], *, require: bool = False): # I decided not to give this a generic name, because I don't want people to # use it for arbitrary stuff, as I want this require arg with default False. 
+ path = Path(path) if not require and not path.exists(): return None return srsly.read_json(path) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 64cdddf9c43..164a0867494 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -50,7 +50,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": logger.info("Set up nlp object from config") config = nlp.config.interpolate() # Resolve all training-relevant sections using the filled nlp config - T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) # type: ignore[arg-type] dot_names = [T["train_corpus"], T["dev_corpus"]] if not isinstance(T["train_corpus"], str): raise ConfigValidationError( diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 6f5099858f1..d6f1ad7d608 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -62,7 +62,7 @@ def train( allocator = config["training"]["gpu_allocator"] if use_gpu >= 0 and allocator: set_gpu_allocator(allocator) - T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) # type: ignore[arg-type] dot_names = [T["train_corpus"], T["dev_corpus"]] train_corpus, dev_corpus = resolve_dot_names(config, dot_names) optimizer = T["optimizer"] diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index 14a813a0993..32eada4d749 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -42,7 +42,7 @@ def pretrain( config["initialize"]["init_tok2vec"] = None nlp = load_model_from_config(config) _config = nlp.config.interpolate() - P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) + P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) # type: ignore[arg-type] corpus = dot_to_object(_config, P["corpus"]) corpus = registry.resolve({"corpus": 
corpus})["corpus"] batcher = P["batcher"] From 188c90d72a7dde6fd4d504bb49e24bc315d570bc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Mar 2026 13:12:10 +0100 Subject: [PATCH 32/42] Add lazy spaCy CLI loading and static launcher --- MANIFEST.in | 1 + setup.cfg | 2 +- setup.py | 2 +- spacy/cli/__init__.py | 110 +++++++++++++++++++--------- spacy/cli/_dispatch.py | 105 +++++++++++++++++++++++++++ spacy/cli/_util.py | 18 ++--- spacy/tests/test_cli_app.py | 4 ++ spacy/tests/test_cli_launcher.py | 102 ++++++++++++++++++++++++++ spacy_cli/__init__.py | 1 + spacy_cli/build_manifest.py | 99 ++++++++++++++++++++++++++ spacy_cli/cli_manifest.json | 118 +++++++++++++++++++++++++++++++ spacy_cli/main.py | 69 ++++++++++++++++++ spacy_cli/static.py | 24 +++++++ 13 files changed, 610 insertions(+), 45 deletions(-) create mode 100644 spacy/cli/_dispatch.py create mode 100644 spacy/tests/test_cli_launcher.py create mode 100644 spacy_cli/__init__.py create mode 100644 spacy_cli/build_manifest.py create mode 100644 spacy_cli/cli_manifest.json create mode 100644 spacy_cli/main.py create mode 100644 spacy_cli/static.py diff --git a/MANIFEST.in b/MANIFEST.in index 1caf758464f..36465ea94a0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml *.hh +recursive-include spacy_cli *.json include LICENSE include README.md include pyproject.toml diff --git a/setup.cfg b/setup.cfg index 66cf942218b..83147ad0d48 100644 --- a/setup.cfg +++ b/setup.cfg @@ -70,7 +70,7 @@ install_requires = [options.entry_points] console_scripts = - spacy = spacy.cli:setup_cli + spacy = spacy_cli.main:main [options.extras_require] lookups = diff --git a/setup.py b/setup.py index 6f28b7340d0..2de619c720a 100755 --- a/setup.py +++ b/setup.py @@ -213,7 +213,7 @@ def setup_package(): version=about["__version__"], ext_modules=ext_modules, cmdclass={"build_ext": build_ext_subclass}, - package_data={"": ["*.pyx", "*.pxd", "*.pxi"]}, + 
package_data={"": ["*.pyx", "*.pxd", "*.pxi"], "spacy_cli": ["*.json"]}, ) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 3095778fe22..dcfb4b8a92e 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,40 +1,82 @@ +import sys +from importlib import import_module +from typing import Iterable + +from typer.main import get_command from wasabi import msg -# Needed for testing -from . import download as download_module # noqa: F401 -from ._util import app, setup_cli # noqa: F401 -from .apply import apply # noqa: F401 -from .assemble import assemble_cli # noqa: F401 - -# These are the actual functions, NOT the wrapped CLI commands. The CLI commands -# are registered automatically and won't have to be imported here. -from .benchmark_speed import benchmark_speed_cli # noqa: F401 -from .convert import convert # noqa: F401 -from .debug_config import debug_config # noqa: F401 -from .debug_data import debug_data # noqa: F401 -from .debug_diff import debug_diff # noqa: F401 -from .debug_model import debug_model # noqa: F401 -from .download import download # noqa: F401 -from .evaluate import evaluate # noqa: F401 -from .find_function import find_function # noqa: F401 -from .find_threshold import find_threshold # noqa: F401 -from .info import info # noqa: F401 -from .init_config import fill_config, init_config # noqa: F401 -from .init_pipeline import init_pipeline_cli # noqa: F401 -from .package import package # noqa: F401 -from .pretrain import pretrain # noqa: F401 -from .profile import profile # noqa: F401 -from .project.assets import project_assets # type: ignore[attr-defined] # noqa: F401 -from .project.clone import project_clone # type: ignore[attr-defined] # noqa: F401 -from .project.document import ( # type: ignore[attr-defined] # noqa: F401 - project_document, +from ..util import registry +from ._dispatch import ( + GROUP_MODULES, + PUBLIC_ATTRS, + SUBCOMMAND_MODULES, + TOP_LEVEL_MODULES, ) -from .project.dvc import project_update_dvc # 
type: ignore[attr-defined] # noqa: F401 -from .project.pull import project_pull # type: ignore[attr-defined] # noqa: F401 -from .project.push import project_push # type: ignore[attr-defined] # noqa: F401 -from .project.run import project_run # type: ignore[attr-defined] # noqa: F401 -from .train import train_cli # type: ignore[attr-defined] # noqa: F401 -from .validate import validate # type: ignore[attr-defined] # noqa: F401 +from ._dispatch import iter_builtin_modules +from ._util import COMMAND, add_project_cli, app + +HELP_OPTIONS = {"--help", "-h"} +ROOT_OPTIONS = HELP_OPTIONS | {"--install-completion", "--show-completion"} + +__all__ = [ + "app", + "load_all_commands", + "load_for_argv", + "setup_cli", + *sorted(PUBLIC_ATTRS), +] + + +def _import_modules(module_names: Iterable[str]) -> None: + for module_name in module_names: + import_module(module_name) + + +def load_all_commands() -> None: + _import_modules(iter_builtin_modules()) + add_project_cli() + + +def load_for_argv(argv: Iterable[str]) -> None: + args = list(argv) + if not args or args[0] in ROOT_OPTIONS or args[0].startswith("-"): + load_all_commands() + return + command = args[0] + if command == "project": + add_project_cli() + return + if command in GROUP_MODULES: + subcommand = args[1] if len(args) > 1 and not args[1].startswith("-") else None + if subcommand is not None and (command, subcommand) in SUBCOMMAND_MODULES: + _import_modules(SUBCOMMAND_MODULES[(command, subcommand)]) + return + _import_modules(GROUP_MODULES[command]) + return + if command in TOP_LEVEL_MODULES: + _import_modules(TOP_LEVEL_MODULES[command]) + + +def setup_cli() -> None: + # Make sure entry-point CLI integrations are imported before command dispatch. 
+ registry.cli.get_all() + load_for_argv(sys.argv[1:]) + command = get_command(app) + command(prog_name=COMMAND) + + +def __getattr__(name: str): + if name not in PUBLIC_ATTRS: + raise AttributeError(f"module 'spacy.cli' has no attribute {name!r}") + module_name, attr_name = PUBLIC_ATTRS[name] + module = import_module(module_name) + value = module if attr_name is None else getattr(module, attr_name) + globals()[name] = value + return value + + +def __dir__(): + return sorted(set(globals()) | set(PUBLIC_ATTRS)) @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/_dispatch.py b/spacy/cli/_dispatch.py new file mode 100644 index 00000000000..e1975dd7e1d --- /dev/null +++ b/spacy/cli/_dispatch.py @@ -0,0 +1,105 @@ +from typing import Dict, Iterable, Optional, Tuple + + +CommandPath = Tuple[str, ...] + + +TOP_LEVEL_MODULES: Dict[str, Tuple[str, ...]] = { + "apply": ("spacy.cli.apply",), + "assemble": ("spacy.cli.assemble",), + "convert": ("spacy.cli.convert",), + "debug-data": ("spacy.cli.debug_data",), + "download": ("spacy.cli.download",), + "evaluate": ("spacy.cli.evaluate",), + "find-function": ("spacy.cli.find_function",), + "find-threshold": ("spacy.cli.find_threshold",), + "info": ("spacy.cli.info",), + "package": ("spacy.cli.package",), + "pretrain": ("spacy.cli.pretrain",), + "profile": ("spacy.cli.profile",), + "train": ("spacy.cli.train",), + "validate": ("spacy.cli.validate",), +} + + +GROUP_MODULES: Dict[str, Tuple[str, ...]] = { + "benchmark": ( + "spacy.cli.benchmark_speed", + "spacy.cli.evaluate", + ), + "debug": ( + "spacy.cli.debug_config", + "spacy.cli.debug_data", + "spacy.cli.debug_diff", + "spacy.cli.debug_model", + "spacy.cli.profile", + ), + "init": ( + "spacy.cli.init_config", + "spacy.cli.init_pipeline", + ), +} + + +SUBCOMMAND_MODULES: Dict[CommandPath, Tuple[str, ...]] = { + ("benchmark", "accuracy"): ("spacy.cli.evaluate",), + ("benchmark", "speed"): ("spacy.cli.benchmark_speed",), + ("debug", 
"config"): ("spacy.cli.debug_config",), + ("debug", "data"): ("spacy.cli.debug_data",), + ("debug", "diff-config"): ("spacy.cli.debug_diff",), + ("debug", "model"): ("spacy.cli.debug_model",), + ("debug", "profile"): ("spacy.cli.profile",), + ("init", "config"): ("spacy.cli.init_config",), + ("init", "fill-config"): ("spacy.cli.init_config",), + ("init", "labels"): ("spacy.cli.init_pipeline",), + ("init", "nlp"): ("spacy.cli.init_pipeline",), + ("init", "vectors"): ("spacy.cli.init_pipeline",), +} + + +PUBLIC_ATTRS: Dict[str, Tuple[str, Optional[str]]] = { + "app": ("spacy.cli._util", "app"), + "apply": ("spacy.cli.apply", "apply"), + "assemble_cli": ("spacy.cli.assemble", "assemble_cli"), + "benchmark_speed_cli": ("spacy.cli.benchmark_speed", "benchmark_speed_cli"), + "convert": ("spacy.cli.convert", "convert"), + "debug_config": ("spacy.cli.debug_config", "debug_config"), + "debug_data": ("spacy.cli.debug_data", "debug_data"), + "debug_diff": ("spacy.cli.debug_diff", "debug_diff"), + "debug_model": ("spacy.cli.debug_model", "debug_model"), + "download": ("spacy.cli.download", "download"), + "download_module": ("spacy.cli.download", None), + "evaluate": ("spacy.cli.evaluate", "evaluate"), + "fill_config": ("spacy.cli.init_config", "fill_config"), + "find_function": ("spacy.cli.find_function", "find_function"), + "find_threshold": ("spacy.cli.find_threshold", "find_threshold"), + "info": ("spacy.cli.info", "info"), + "init_config": ("spacy.cli.init_config", "init_config"), + "init_pipeline_cli": ("spacy.cli.init_pipeline", "init_pipeline_cli"), + "package": ("spacy.cli.package", "package"), + "pretrain": ("spacy.cli.pretrain", "pretrain"), + "profile": ("spacy.cli.profile", "profile"), + "project_assets": ("spacy.cli.project.assets", "project_assets"), + "project_clone": ("spacy.cli.project.clone", "project_clone"), + "project_document": ("spacy.cli.project.document", "project_document"), + "project_pull": ("spacy.cli.project.pull", "project_pull"), + 
"project_push": ("spacy.cli.project.push", "project_push"), + "project_run": ("spacy.cli.project.run", "project_run"), + "project_update_dvc": ("spacy.cli.project.dvc", "project_update_dvc"), + "train_cli": ("spacy.cli.train", "train_cli"), + "validate": ("spacy.cli.validate", "validate"), +} + + +def iter_builtin_modules() -> Iterable[str]: + seen = set() + for modules in TOP_LEVEL_MODULES.values(): + for module in modules: + if module not in seen: + seen.add(module) + yield module + for modules in GROUP_MODULES.values(): + for module in modules: + if module not in seen: + seen.add(module) + yield module diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 757c418440b..8a8b4e0f417 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -19,9 +19,7 @@ from click.shell_completion import split_arg_string from thinc.api import ConfigValidationError, require_gpu from thinc.util import gpu_is_available -from typer.main import get_command from wasabi import Printer, msg -from weasel import app as project_cli from ..compat import Literal from ..util import ( @@ -63,19 +61,21 @@ benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True) debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True) init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True) +_PROJECT_CLI_ADDED = False -app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True) app.add_typer(debug_cli) app.add_typer(benchmark_cli) app.add_typer(init_cli) -def setup_cli() -> None: - # Make sure the entry-point for CLI runs, so that they get imported. 
- registry.cli.get_all() - # Ensure that the help messages always display the correct prompt - command = get_command(app) - command(prog_name=COMMAND) +def add_project_cli() -> None: + global _PROJECT_CLI_ADDED + if _PROJECT_CLI_ADDED: + return + from weasel import app as project_cli + + app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True) + _PROJECT_CLI_ADDED = True def parse_config_overrides( diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 5fe9e9bcf01..7ff3f755a31 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -5,12 +5,16 @@ import srsly from typer.testing import CliRunner +from spacy.cli import load_all_commands from spacy.cli._util import app, get_git_version from spacy.tokens import Doc, DocBin, Span from .util import make_tempdir, normalize_whitespace +load_all_commands() + + def has_git(): try: get_git_version() diff --git a/spacy/tests/test_cli_launcher.py b/spacy/tests/test_cli_launcher.py new file mode 100644 index 00000000000..392572fdeec --- /dev/null +++ b/spacy/tests/test_cli_launcher.py @@ -0,0 +1,102 @@ +import importlib +import subprocess +import sys + +import pytest + +from spacy_cli.build_manifest import build_manifest +from spacy_cli.static import load_manifest + +launcher_module = importlib.import_module("spacy_cli.main") + + +def _run_python(code: str) -> str: + result = subprocess.run( + [sys.executable, "-c", code], + check=True, + capture_output=True, + text=True, + ) + return result.stdout.strip() + + +def test_cli_package_import_is_lazy(): + output = _run_python( + "import sys; import spacy.cli; " + "print('spacy.cli.train' in sys.modules); print('weasel' in sys.modules)" + ) + assert output.splitlines() == ["False", "False"] + + +def test_load_for_argv_imports_only_requested_command(): + output = _run_python( + "import sys; from spacy.cli import load_for_argv; " + "load_for_argv(['train', '--help']); " + "print('spacy.cli.train' in sys.modules); 
print('weasel' in sys.modules)" + ) + assert output.splitlines() == ["True", "False"] + + +def test_load_for_argv_imports_project_on_demand(): + output = _run_python( + "import sys; from spacy.cli import load_for_argv; " + "load_for_argv(['project', '--help']); print('weasel' in sys.modules)" + ) + assert output == "True" + + +def test_manifest_is_current(): + assert build_manifest() == load_manifest() + + +def test_launcher_root_help_uses_static(capsys, monkeypatch): + monkeypatch.setattr( + launcher_module, "_run_live", lambda: (_ for _ in ()).throw(AssertionError) + ) + with pytest.raises(SystemExit) as exc: + launcher_module.main(["--help"]) + assert exc.value.code == 0 + assert capsys.readouterr().out == load_manifest()["root_help"] + + +def test_launcher_command_help_uses_static(capsys, monkeypatch): + monkeypatch.setattr( + launcher_module, "_run_live", lambda: (_ for _ in ()).throw(AssertionError) + ) + with pytest.raises(SystemExit) as exc: + launcher_module.main(["train", "--help"]) + assert exc.value.code == 0 + assert capsys.readouterr().out == load_manifest()["command_help"]["train"] + + +def test_launcher_unknown_command_uses_static_error(capsys, monkeypatch): + monkeypatch.setattr( + launcher_module, "_run_live", lambda: (_ for _ in ()).throw(AssertionError) + ) + with pytest.raises(SystemExit) as exc: + launcher_module.main(["definitely-not-a-command"]) + assert exc.value.code == 2 + assert "No such command 'definitely-not-a-command'" in capsys.readouterr().out + + +def test_launcher_non_help_command_falls_back_to_live(monkeypatch): + called = [] + + def fake_run_live(): + called.append(True) + + monkeypatch.setattr(launcher_module, "_run_live", fake_run_live) + launcher_module.main(["train", "config.cfg"]) + assert called == [True] + + +def test_launcher_root_help_falls_back_with_plugins(monkeypatch): + called = [] + + def fake_run_live(): + called.append(True) + + monkeypatch.setattr(launcher_module, "_run_live", fake_run_live) + 
monkeypatch.setattr(launcher_module, "get_plugin_command_names", lambda: {"custom"}) + launcher_module.main(["--help"]) + assert called == [True] diff --git a/spacy_cli/__init__.py b/spacy_cli/__init__.py new file mode 100644 index 00000000000..a2cb1f66b78 --- /dev/null +++ b/spacy_cli/__init__.py @@ -0,0 +1 @@ +"""Lightweight launcher package for the spaCy console script.""" diff --git a/spacy_cli/build_manifest.py b/spacy_cli/build_manifest.py new file mode 100644 index 00000000000..6e019bdcb95 --- /dev/null +++ b/spacy_cli/build_manifest.py @@ -0,0 +1,99 @@ +import json +from pathlib import Path +from typing import Dict, Iterable, List + +from typer.main import get_command +from typer.testing import CliRunner + +from spacy.cli import load_all_commands +from spacy.cli._util import COMMAND, app + +from .static import MANIFEST_FILE, UNKNOWN_COMMAND_TOKEN, UNKNOWN_SUBCOMMAND_TOKEN + +DEFAULT_ENV = {"COLUMNS": "100", "LINES": "40", "TERM": "xterm-256color"} + + +def _invoke(runner: CliRunner, cli, args: Iterable[str]): + return runner.invoke(cli, list(args), prog_name=COMMAND, env=DEFAULT_ENV) + + +def _get_help(runner: CliRunner, cli, args: Iterable[str]) -> str: + result = _invoke(runner, cli, [*list(args), "--help"]) + if result.exit_code != 0: + err = f"Could not render help for: {' '.join(args) or ''}" + raise RuntimeError(err) + return result.stdout + + +def _maybe_get_help(runner: CliRunner, cli, args: Iterable[str]): + result = _invoke(runner, cli, [*list(args), "--help"]) + if result.exit_code != 0: + return None + return result.stdout + + +def build_manifest() -> Dict[str, object]: + load_all_commands() + cli = get_command(app) + runner = CliRunner() + known_top_level: List[str] = sorted(cli.commands.keys()) + known_groups: Dict[str, List[str]] = {} + hidden_top_level: List[str] = [] + hidden_group_commands: Dict[str, List[str]] = {} + group_help: Dict[str, str] = {} + command_help: Dict[str, str] = {} + unknown_subcommand: Dict[str, str] = {} + + for name, 
command in cli.commands.items(): + if getattr(command, "hidden", False): + hidden_top_level.append(name) + if hasattr(command, "commands"): + subcommands = sorted(command.commands.keys()) + known_groups[name] = subcommands + hidden_group_commands[name] = sorted( + sub_name + for sub_name, sub_cmd in command.commands.items() + if getattr(sub_cmd, "hidden", False) + ) + group_help[name] = _get_help(runner, app, [name]) + unknown_subcommand[name] = _invoke( + runner, app, [name, UNKNOWN_SUBCOMMAND_TOKEN] + ).stdout + for sub_name in subcommands: + help_text = _maybe_get_help(runner, app, [name, sub_name]) + if help_text is not None: + command_help[f"{name} {sub_name}"] = help_text + else: + command_help[name] = _get_help(runner, app, [name]) + + return { + "command": COMMAND, + "known_top_level": known_top_level, + "known_groups": known_groups, + "hidden_top_level": hidden_top_level, + "hidden_group_commands": hidden_group_commands, + "root_help": _get_help(runner, app, []), + "group_help": group_help, + "command_help": command_help, + "errors": { + "missing_command": _invoke(runner, app, []).stdout, + "unknown_command": _invoke(runner, app, [UNKNOWN_COMMAND_TOKEN]).stdout, + "unknown_subcommand": unknown_subcommand, + }, + } + + +def write_manifest(path: Path) -> Path: + data = build_manifest() + path.write_text( + json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True) + "\n" + ) + return path + + +def main() -> None: + write_manifest(Path(__file__).with_name(MANIFEST_FILE)) + + +if __name__ == "__main__": + main() diff --git a/spacy_cli/cli_manifest.json b/spacy_cli/cli_manifest.json new file mode 100644 index 00000000000..361a10dca16 --- /dev/null +++ b/spacy_cli/cli_manifest.json @@ -0,0 +1,118 @@ +{ + "command": "python -m spacy", + "command_help": { + "apply": " \n Usage: python -m spacy apply [OPTIONS] MODEL DATA_PATH OUTPUT_FILE \n \n Apply a trained pipeline to documents to get predictions. 
Expects a loadable spaCy pipeline and \n path to the data, which can be a directory or a file. The data files can be provided in multiple \n formats: 1. .spacy files 2. .jsonl files with a specified \"field\" to read the text from. \n 3. Files with any other extension are assumed to be containing a single document. DOCS: \n https://spacy.io/api/cli#apply \n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of the documents to predict on. Can be a single file in │\n│ .spacy format or a .jsonl file. Files with other extensions are │\n│ treated as single plain text documents. If a directory is provided │\n│ it is traversed recursively to grab all files to be processed. The │\n│ files can be a mixture of .spacy, .jsonl and text files. If .jsonl │\n│ is provided the specified field is going to be grabbed (\"text\" by │\n│ default). │\n│ [default: None] │\n│ [required] │\n│ * output_file FILE Path to save the resulting .spacy file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ [default: None] │\n│ --text-key -tk TEXT Key containing text string for JSONL [default: text] │\n│ --force -F Force overwriting the output file │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU. [default: -1] │\n│ --batch-size -b INTEGER Batch size. [default: 1] │\n│ --n-process -n INTEGER number of processors to use. [default: 1] │\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "assemble": " \n Usage: python -m spacy assemble [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n Assemble a spaCy pipeline from a config file. The config file includes all settings for \n initializing the pipeline. To override settings in the config, e.g. settings that point to local \n paths or that you want to experiment with, you can override them as command line options. The \n --code argument lets you pass in a Python file that can be used to register custom functions that \n are referenced in the config. \n \n DOCS: https://spacy.io/api/cli#assemble \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * output_path PATH Output directory to store assembled pipeline in [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) to │\n│ be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "benchmark accuracy": " \n Usage: python -m spacy benchmark accuracy [OPTIONS] MODEL DATA_PATH \n \n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in the binary \n .spacy format. The --gold-preproc option sets up the evaluation examples with gold-standard \n sentences and tokens for the predictions. Gold preprocessing helps the annotations align to the \n tokenization, and may result in sequences of more consistent length. 
However, it may reduce \n runtime accuracy due to train/test skew. To render a sample of dependency parses in a HTML file, \n set as output directory as the displacy_path argument. \n \n DOCS: https://spacy.io/api/cli#benchmark-accuracy \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o FILE Output JSON file for metrics [default: None] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --displacy-path -dp DIRECTORY Directory to output rendered parses as HTML │\n│ [default: None] │\n│ --displacy-limit -dl INTEGER Limit of parses to render as HTML [default: 25] │\n│ --per-component -P Return scores per component, only applicable when an │\n│ output JSON file is specified. │\n│ --spans-key -sk TEXT Spans key to use when evaluating Doc.spans [default: sc] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "benchmark speed": " \n Usage: python -m spacy benchmark speed [OPTIONS] MODEL DATA_PATH \n \n Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in the binary .spacy \n format. 
\n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --batch-size -b INTEGER RANGE [x>=1] Override the pipeline batch size [default: None] │\n│ --no-shuffle Do not shuffle benchmark data │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --batches INTEGER RANGE [x>=30] Minimum number of batches to benchmark │\n│ [default: 50] │\n│ --warmup -w INTEGER RANGE [x>=0] Number of iterations over the data for warmup │\n│ [default: 3] │\n│ --code -c PATH Path to Python file with additional code │\n│ (registered functions) to be imported │\n│ [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "convert": " \n Usage: python -m spacy convert [OPTIONS] INPUT_PATH [OUTPUT_DIR] \n \n Convert files into json or DocBin format for training. The resulting .spacy file can be used with \n the train command and other experiment management functions. \n \n If no output_dir is specified and the output format is JSON, the data is written to stdout, so you \n can pipe them forward to a JSON file: $ spacy convert some_file.conllu --file-type json > \n some_file.json \n DOCS: https://spacy.io/api/cli#convert \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * input_path TEXT Input file or directory [default: None] [required] │\n│ output_dir [OUTPUT_DIR] Output directory. '-' for stdout. 
[default: -] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --file-type -t [json|spacy] Type of data to produce [default: spacy] │\n│ --n-sents -n INTEGER Number of sentences per doc (0 to disable) │\n│ [default: 1] │\n│ --seg-sents -s Segment sentences (for -c ner) │\n│ --model,--base -b TEXT Trained spaCy pipeline for sentence segmentation to │\n│ use as base (for --seg-sents) │\n│ [default: None] │\n│ --morphology -m Enable appending morphology to tags │\n│ --merge-subtokens -T Merge CoNLL-U subtokens │\n│ --converter -c TEXT Converter: ('conllubio', 'conllu', 'conll', 'ner', │\n│ 'iob', 'json') │\n│ [default: auto] │\n│ --ner-map -nm PATH NER tag mapping (as JSON-encoded dict of entity types) │\n│ [default: None] │\n│ --lang -l TEXT Language (if tokenizer required) [default: None] │\n│ --concatenate -C Concatenate output to a single file │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug config": " \n Usage: python -m spacy debug config [OPTIONS] CONFIG_PATH \n \n Debug a config file and show validation errors. The command will create all objects in the tree \n and validate them. Note that some config validation errors are blocking and will prevent the rest \n of the config from being resolved. This means that you may not see all validation errors at once \n and some issues are only shown once previous errors have been fixed. Similar as with the 'train' \n command, you can override settings from the config as command line options. For instance, \n --training.batch_size 128 overrides the value of \"batch_size\" in the block \"[training]\". 
\n \n DOCS: https://spacy.io/api/cli#debug-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --show-functions -F Show an overview of all registered functions used in the │\n│ config and where they come from (modules, files etc.) │\n│ --show-variables -V Show an overview of all variables referenced in the config and │\n│ their values. This will also reflect variables overwritten on │\n│ the CLI. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug data": " \n Usage: python -m spacy debug data [OPTIONS] CONFIG_PATH \n \n Analyze, debug and validate your training and development data. Outputs useful stats, and can help \n you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. 
\n \n DOCS: https://spacy.io/api/cli#debug-data \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --ignore-warnings -IW Ignore warnings, only show stats and errors │\n│ --verbose -V Print additional information and explanations │\n│ --no-format -NF Don't pretty-print the results │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug diff-config": " \n Usage: python -m spacy debug diff-config [OPTIONS] CONFIG_PATH \n \n Show a diff of a config file with respect to spaCy's defaults or another config file. If \n additional settings were used in the creation of the config file, then you must supply these as \n extra parameters to the command when comparing to the default settings. The generated diff can \n also be used when posting to the discussion forum to provide more information for the maintainers. \n \n The `optimize`, `gpu`, and `pretraining` options are only relevant when comparing against the \n default configuration (or specifically when `compare_to` is None). 
\n DOCS: https://spacy.io/api/cli#debug-diff \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --compare-to PATH Path to a config file to diff against, or │\n│ `None` to compare against default settings │\n│ [default: None] │\n│ --optimize -o [efficiency|accuracy] Whether the user config was optimized for │\n│ efficiency or accuracy. Only relevant when │\n│ comparing against the default config. │\n│ [default: efficiency] │\n│ --gpu -G Whether the original config can run on a │\n│ GPU. Only relevant when comparing against │\n│ the default config. │\n│ --pretraining,--pt Whether to compare on a config with │\n│ pretraining involved. Only relevant when │\n│ comparing against the default config. │\n│ --markdown -md Generate Markdown for GitHub issues │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug model": " \n Usage: python -m spacy debug model [OPTIONS] CONFIG_PATH COMPONENT \n \n Analyze a Thinc model implementation. Includes checks for internal structure and activations \n during training. 
\n \n DOCS: https://spacy.io/api/cli#debug-model \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * component TEXT Name of the pipeline component of which the model should be analysed │\n│ [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --layers -l TEXT Comma-separated names of layer IDs to print │\n│ --dimensions -DIM Show dimensions │\n│ --parameters -PAR Show parameters │\n│ --gradients -GRAD Show gradients │\n│ --attributes -ATTR Show attributes │\n│ --print-step0 -P0 Print model before training │\n│ --print-step1 -P1 Print model after initialization │\n│ --print-step2 -P2 Print model after training │\n│ --print-step3 -P3 Print final predictions │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug profile": " \n Usage: python -m spacy debug profile [OPTIONS] MODEL [INPUTS] \n \n Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one \n JSON object per line with a key \"text\". It can either be provided as a JSONL file, or be read from \n sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. \n \n DOCS: https://spacy.io/api/cli#debug-profile \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Trained pipeline to load [default: None] [required] │\n│ inputs [INPUTS] Location of input file. '-' for stdin. 
[default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n-texts -n INTEGER Maximum number of texts to use if available [default: 10000] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug-data": " \n Usage: python -m spacy debug-data [OPTIONS] CONFIG_PATH \n \n Analyze, debug and validate your training and development data. Outputs useful stats, and can help \n you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. \n \n DOCS: https://spacy.io/api/cli#debug-data \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --ignore-warnings -IW Ignore warnings, only show stats and errors │\n│ --verbose -V Print additional information and explanations │\n│ --no-format -NF Don't pretty-print the results │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "download": " \n Usage: python -m spacy download [OPTIONS] MODEL \n \n Download compatible trained pipeline from the default download path using pip. If --direct flag is \n set, the command expects the full package name with version. For direct downloads, the \n compatibility check will be skipped. 
All additional arguments provided to this command will be \n passed to `pip install` on package installation. \n \n DOCS: https://spacy.io/api/cli#download AVAILABLE PACKAGES: https://spacy.io/models \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Name of pipeline package to download [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --direct -d,-D Force direct download of name + version │\n│ --sdist -S Download sdist (.tar.gz) archive instead of pre-built binary wheel │\n│ --url -U TEXT Download from given url [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "evaluate": " \n Usage: python -m spacy evaluate [OPTIONS] MODEL DATA_PATH \n \n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in the binary \n .spacy format. The --gold-preproc option sets up the evaluation examples with gold-standard \n sentences and tokens for the predictions. Gold preprocessing helps the annotations align to the \n tokenization, and may result in sequences of more consistent length. However, it may reduce \n runtime accuracy due to train/test skew. To render a sample of dependency parses in a HTML file, \n set as output directory as the displacy_path argument. 
\n \n DOCS: https://spacy.io/api/cli#benchmark-accuracy \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o FILE Output JSON file for metrics [default: None] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --displacy-path -dp DIRECTORY Directory to output rendered parses as HTML │\n│ [default: None] │\n│ --displacy-limit -dl INTEGER Limit of parses to render as HTML [default: 25] │\n│ --per-component -P Return scores per component, only applicable when an │\n│ output JSON file is specified. │\n│ --spans-key -sk TEXT Spans key to use when evaluating Doc.spans [default: sc] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "find-function": " \n Usage: python -m spacy find-function [OPTIONS] FUNC_NAME \n \n Find the module, path and line number to the file the registered function is defined in, if \n available. \n \n func_name (str): Name of the registered function. registry_name (Optional[str]): Name of the \n catalogue registry. \n DOCS: https://spacy.io/api/cli#find-function \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * func_name TEXT Name of the registered function. 
[default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --registry -r TEXT Name of the catalogue registry. [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "find-threshold": " \n Usage: python -m spacy find-threshold [OPTIONS] MODEL DATA_PATH PIPE_NAME \n THRESHOLD_KEY SCORES_KEY \n \n Runs prediction trials for a trained model with varying thresholds to maximize the specified \n metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. \n Results are displayed in a table on `stdout` (the corresponding API call to \n `spacy.cli.find_threshold.find_threshold()` returns all results). \n \n This is applicable only for components whose predictions are influenced by thresholds - e.g. \n `textcat_multilabel` and `spancat`, but not `textcat`. Note that the full path to the \n corresponding threshold attribute in the config has to be provided. 
\n DOCS: https://spacy.io/api/cli#find-threshold \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format │\n│ [default: None] │\n│ [required] │\n│ * pipe_name TEXT Name of pipe to examine thresholds for [default: None] [required] │\n│ * threshold_key TEXT Key of threshold attribute in component's configuration │\n│ [default: None] │\n│ [required] │\n│ * scores_key TEXT Metric to optimize [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n_trials -n INTEGER Number of trials to determine optimal thresholds │\n│ [default: 11] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "info": " \n Usage: python -m spacy info [OPTIONS] [MODEL] \n \n Print info about spaCy installation. If a pipeline is specified as an argument, print its meta \n information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. \n \n Flag --url prints only the download URL of the most recent compatible version of the pipeline. 
\n DOCS: https://spacy.io/api/cli#info \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ model [MODEL] Optional loadable spaCy pipeline [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --markdown -md Generate Markdown for GitHub issues │\n│ --silent -s,-S Don't print anything (just return) │\n│ --exclude -e TEXT Comma-separated keys to exclude from the print-out │\n│ [default: labels] │\n│ --url -u Print the URL to download the most recent compatible version of the │\n│ pipeline │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init config": " \n Usage: python -m spacy init config [OPTIONS] OUTPUT_FILE \n \n Generate a starter config file for training. Based on your requirements specified via the CLI \n arguments, this command generates a config with the optimal settings for your use case. This \n includes the choice of architecture, pretrained weights and related hyperparameters. 
\n \n DOCS: https://spacy.io/api/cli#init-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * output_file PATH File to save the config to or - for stdout (will only output config │\n│ and no additional logging info) │\n│ [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --lang -l TEXT Two-letter code of the language to use │\n│ [default: en] │\n│ --pipeline -p TEXT Comma-separated names of trainable pipeline │\n│ components to include (without 'tok2vec' or │\n│ 'transformer') │\n│ [default: tagger,parser,ner] │\n│ --optimize -o [efficiency|accuracy] Whether to optimize for efficiency (faster │\n│ inference, smaller model, lower memory │\n│ consumption) or higher accuracy (potentially │\n│ larger and slower model). This will impact the │\n│ choice of architecture, pretrained weights and │\n│ related hyperparameters. │\n│ [default: efficiency] │\n│ --gpu -G Whether the model can run on GPU. This will │\n│ impact the choice of architecture, pretrained │\n│ weights and related hyperparameters. │\n│ --pretraining -pt Include config for pretraining (with 'spacy │\n│ pretrain') │\n│ --force -F Force overwriting the output file │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init fill-config": " \n Usage: python -m spacy init fill-config [OPTIONS] BASE_PATH [OUTPUT_FILE] \n \n Fill partial config file with default values. Will add all missing settings from the default \n config and will create all objects, check the registered functions for their default values and \n update the base config. 
This command can be used with a config generated via the training \n quickstart widget: https://spacy.io/usage/training#quickstart \n \n DOCS: https://spacy.io/api/cli#init-fill-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * base_path FILE Path to base config to fill [default: None] [required] │\n│ output_file [OUTPUT_FILE] Path to output .cfg file (or - for stdout) [default: -] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --pretraining -pt Include config for pretraining (with 'spacy pretrain') │\n│ --diff -D Print a visual diff highlighting the changes │\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init labels": " \n Usage: python -m spacy init labels [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n Generate JSON files for the labels in the data. This helps speed up the training process, since \n spaCy won't have to preprocess the data to extract the labels. 
\n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * output_path PATH Output directory for the labels [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init nlp": " \n Usage: python -m spacy init nlp [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * output_path PATH Output directory for the prepared data [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init vectors": " \n Usage: python -m spacy init vectors [OPTIONS] LANG VECTORS_LOC OUTPUT_DIR \n \n Convert word vectors for use with spaCy. 
Will export an nlp object that you can use in the \n [initialize] block of your config to initialize a model with vectors. \n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * lang TEXT The language of the nlp object to create [default: None] [required] │\n│ * vectors_loc PATH Vectors file in Word2Vec format [default: None] [required] │\n│ * output_dir PATH Pipeline output directory [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --prune -p INTEGER Optional number of vectors to prune to [default: -1] │\n│ --truncate -t INTEGER Optional number of vectors to truncate to when reading in │\n│ vectors file │\n│ [default: 0] │\n│ --mode -m TEXT Vectors mode: default or floret [default: default] │\n│ --name -n TEXT Optional name for the word vectors, e.g. en_core_web_lg.vectors │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --attr -a TEXT Optional token attribute to use for vectors, e.g. LOWER or NORM │\n│ [default: ORTH] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "link": " \n Usage: python -m spacy link [OPTIONS] ARGS KWARGS \n \n (deprecated) \n As of spaCy v3.0, symlinks like \"en\" are not supported anymore. You can load trained pipeline \n packages using their full names or from a directory path. 
\n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * args TEXT [default: None] [required] │\n│ * kwargs TEXT [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "package": " \n Usage: python -m spacy package [OPTIONS] INPUT_DIR OUTPUT_DIR \n \n Generate an installable Python package for a pipeline. Includes binary data, meta and required \n installation files. A new directory will be created in the specified output directory, and the \n data will be copied over. If --create-meta is set and a meta.json already exists in the output \n directory, the existing values will be used as the defaults in the command-line prompt. After \n packaging, \"python -m build --sdist\" is run in the package directory, which will create a .tar.gz \n archive that can be installed via \"pip install\". \n \n If additional code files are provided (e.g. Python files containing custom registered functions \n like pipeline components), they are copied into the package and imported in the __init__.py. 
\n DOCS: https://spacy.io/api/cli#package \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * input_dir DIRECTORY Directory with pipeline data [default: None] [required] │\n│ * output_dir DIRECTORY Output parent directory [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c TEXT Comma-separated paths to Python file │\n│ with additional code (registered │\n│ functions) to be included in the package │\n│ --meta-path,--meta -m FILE Path to meta.json [default: None] │\n│ --create-meta -C Create meta.json, even if one exists │\n│ --name -n TEXT Package name to override meta │\n│ [default: None] │\n│ --version -v TEXT Package version to override meta │\n│ [default: None] │\n│ --build -b TEXT Comma-separated formats to build: sdist │\n│ and/or wheel, or none. │\n│ [default: sdist] │\n│ --force -f,-F Force overwriting existing data in │\n│ output directory │\n│ --require-parent -R,-R --no-require-parent Include the parent package (e.g. spacy) │\n│ in the requirements │\n│ [default: require-parent] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "pretrain": " \n Usage: python -m spacy pretrain [OPTIONS] CONFIG_PATH OUTPUT_DIR \n \n Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using an approximate \n language-modelling objective. Two objective types are available, vector-based and character-based. \n \n In the vector-based objective, we load word vectors that have been trained using a word2vec-style \n distributional similarity algorithm, and train a component like a CNN, BiLSTM, etc to predict \n vectors which match the pretrained ones. The weights are saved to a directory after each epoch. 
\n You can then pass a path to one of these pretrained weights files to the 'spacy train' command. \n This technique may be especially helpful if you have little labelled data. However, it's still \n quite experimental, so your mileage may vary. \n To load the weights back in during 'spacy train', you need to ensure all settings are the same \n between pretraining and training. Ideally, this is done by using the same config file for both \n commands. \n DOCS: https://spacy.io/api/cli#pretrain \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path FILE Path to config file [default: None] [required] │\n│ * output_dir PATH Directory to write weights to on each epoch [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --resume-path -r PATH Path to pretrained weights from which to resume pretraining │\n│ [default: None] │\n│ --epoch-resume -er INTEGER The epoch to resume counting from when using --resume-path. │\n│ Prevents unintended overwriting of existing weight files. │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --skip-last -L Skip saving model-last.bin │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "profile": " \n Usage: python -m spacy profile [OPTIONS] MODEL [INPUTS] \n \n Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one \n JSON object per line with a key \"text\". It can either be provided as a JSONL file, or be read from \n sys.sytdin. 
If no input file is specified, the IMDB dataset is loaded via Thinc. \n \n DOCS: https://spacy.io/api/cli#debug-profile \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Trained pipeline to load [default: None] [required] │\n│ inputs [INPUTS] Location of input file. '-' for stdin. [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n-texts -n INTEGER Maximum number of texts to use if available [default: 10000] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project assets": " \n Usage: python -m spacy project assets [OPTIONS] [PROJECT_DIR] \n \n Fetch project assets like datasets and pretrained weights. Assets are defined in the \"assets\" \n section of the project.yml. If a checksum is provided in the project.yml, the file is only \n downloaded if no local file with the same checksum exists. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/tutorial/directory-and-assets.md \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Path to cloned project. Defaults to current working directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --sparse -S Use sparse checkout for assets provided via Git, to only check out and clone │\n│ the files needed. Requires Git v22.2+. │\n│ --extra -e Download all assets, including those marked as 'extra'. │\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project clone": " \n Usage: python -m spacy project clone [OPTIONS] NAME [DEST] \n \n Clone a project template from a repository. Calls into \"git\" and will only download the files from \n the given subdirectory. The GitHub repo defaults to the official Weasel template repo, but can be \n customized (including using a private repo). \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#clipboard-clone \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * name TEXT The name of the template to clone [default: None] [required] │\n│ dest [DEST] Where to clone the project. Defaults to current working directory │\n│ [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --repo -r TEXT The repository to clone from │\n│ [default: https://github.com/explosion/projects] │\n│ --branch -b TEXT The branch to clone from. If not provided, will attempt main, master │\n│ [default: None] │\n│ --sparse -S Use sparse Git checkout to only check out and clone the files needed. │\n│ Requires Git v22.2+. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project document": " \n Usage: python -m spacy project document [OPTIONS] [PROJECT_DIR] \n \n Auto-generate a README.md for a project. If the content is saved to a file, hidden markers are \n added so you can add custom content before or after the auto-generated section and only the \n auto-generated docs will be replaced when you re-run the command. 
\n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#closed_book-document \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Path to cloned project. Defaults to current working directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o PATH Path to output Markdown file for output. Defaults to - for standard │\n│ output │\n│ [default: -] │\n│ --no-emoji -NE Don't use emoji │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project dvc": " \n Usage: python -m spacy project dvc [OPTIONS] [PROJECT_DIR] [WORKFLOW] \n \n Auto-generate Data Version Control (DVC) config. A DVC project can only define one pipeline, so \n you need to specify one workflow defined in the project.yml. If no workflow is specified, the \n first defined workflow is used. The DVC config will only be updated if the project.yml changed. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#repeat-dvc \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n│ workflow [WORKFLOW] Name of workflow defined in project.yml. Defaults to first │\n│ workflow if not set. 
│\n│ [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --verbose -V Print more info │\n│ --quiet -q Print less info │\n│ --force -F Force update DVC config │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project pull": " \n Usage: python -m spacy project pull [OPTIONS] [REMOTE] [PROJECT_DIR] \n \n Retrieve available precomputed outputs from a remote storage. You can alias remotes in your \n project.yml by mapping them to storage paths. A storage can be anything that the smart_open \n library can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_down-push \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ remote [REMOTE] Name or path of remote storage [default: default] │\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project push": " \n Usage: python -m spacy project push [OPTIONS] [REMOTE] [PROJECT_DIR] \n \n Persist outputs to a remote storage. You can alias remotes in your project.yml by mapping them to \n storage paths. A storage can be anything that the smart_open library can upload to, e.g. AWS, \n Google Cloud Storage, SSH, local directories etc. 
\n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_up-push \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ remote [REMOTE] Name or path of remote storage [default: default] │\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "train": " \n Usage: python -m spacy train [OPTIONS] CONFIG_PATH \n \n Train or update a spaCy pipeline. Requires data in spaCy's binary format. To convert data from \n other formats, use the `spacy convert` command. The config file includes all settings and \n hyperparameters used during training. To override settings in the config, e.g. settings that point \n to local paths or that you want to experiment with, you can override them as command line options. \n For instance, --training.batch_size 128 overrides the value of \"batch_size\" in the block \n \"[training]\". The --code argument lets you pass in a Python file that's imported before training. \n It can be used to register custom functions and architectures that can then be referenced in the \n config. 
\n \n DOCS: https://spacy.io/api/cli#train \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output,--output-path -o PATH Output directory to store trained pipeline in │\n│ [default: None] │\n│ --code -c PATH Path to Python file with additional code │\n│ (registered functions) to be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "validate": " \n Usage: python -m spacy validate [OPTIONS] \n \n Validate the currently installed pipeline packages and spaCy version. Checks if the installed \n packages are compatible and shows upgrade instructions if available. Should be run after `pip \n install -U spacy`. \n \n DOCS: https://spacy.io/api/cli#validate \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + }, + "errors": { + "missing_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ Missing command. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "unknown_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_COMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "unknown_subcommand": { + "benchmark": " \n Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]... \n \n Commands for benchmarking pipelines. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ accuracy Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in │\n│ the binary .spacy format. The --gold-preproc option sets up the evaluation examples │\n│ with gold-standard sentences and tokens for the predictions. Gold preprocessing helps │\n│ the annotations align to the tokenization, and may result in sequences of more │\n│ consistent length. However, it may reduce runtime accuracy due to train/test skew. To │\n│ render a sample of dependency parses in a HTML file, set as output directory as the │\n│ displacy_path argument. │\n│ speed Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in the │\n│ binary .spacy format. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug": " \n Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]... \n \n Suite of helpful commands for debugging and profiling. 
Includes commands to check and validate \n your config files, training and evaluation data, and custom model implementations. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ data Analyze, debug and validate your training and development data. Outputs useful │\n│ stats, and can help you find problems like invalid entity annotations, cyclic │\n│ dependencies, low data labels and more. │\n│ profile Profile which functions take the most time in a spaCy pipeline. Input should be │\n│ formatted as one JSON object per line with a key \"text\". It can either be provided │\n│ as a JSONL file, or be read from sys.sytdin. If no input file is specified, the │\n│ IMDB dataset is loaded via Thinc. │\n│ config Debug a config file and show validation errors. The command will create all │\n│ objects in the tree and validate them. Note that some config validation errors are │\n│ blocking and will prevent the rest of the config from being resolved. This means │\n│ that you may not see all validation errors at once and some issues are only shown │\n│ once previous errors have been fixed. Similar as with the 'train' command, you can │\n│ override settings from the config as command line options. For instance, │\n│ --training.batch_size 128 overrides the value of \"batch_size\" in the block │\n│ \"[training]\". │\n│ diff-config Show a diff of a config file with respect to spaCy's defaults or another config │\n│ file. If additional settings were used in the creation of the config file, then │\n│ you must supply these as extra parameters to the command when comparing to the │\n│ default settings. 
The generated diff can also be used when posting to the │\n│ discussion forum to provide more information for the maintainers. │\n│ model Analyze a Thinc model implementation. Includes checks for internal structure and │\n│ activations during training. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init": " \n Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]... \n \n Commands for initializing configs and pipeline packages. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ config Generate a starter config file for training. Based on your requirements specified │\n│ via the CLI arguments, this command generates a config with the optimal settings │\n│ for your use case. This includes the choice of architecture, pretrained weights │\n│ and related hyperparameters. │\n│ fill-config Fill partial config file with default values. Will add all missing settings from │\n│ the default config and will create all objects, check the registered functions for │\n│ their default values and update the base config. This command can be used with a │\n│ config generated via the training quickstart widget: │\n│ https://spacy.io/usage/training#quickstart │\n│ vectors Convert word vectors for use with spaCy. Will export an nlp object that you can │\n│ use in the [initialize] block of your config to initialize a model with vectors. │\n│ labels Generate JSON files for the labels in the data. This helps speed up the training │\n│ process, since spaCy won't have to preprocess the data to extract the labels. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project": " \n Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]... \n \n Command-line interface for spaCy projects and templates. You'd typically start by cloning a \n project template to a local directory and fetching its assets like datasets etc. See the project's \n project.yml for the available commands. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ assets Fetch project assets like datasets and pretrained weights. Assets are defined in the │\n│ \"assets\" section of the project.yml. If a checksum is provided in the project.yml, │\n│ the file is only downloaded if no local file with the same checksum exists. │\n│ clone Clone a project template from a repository. Calls into \"git\" and will only download │\n│ the files from the given subdirectory. The GitHub repo defaults to the official │\n│ Weasel template repo, but can be customized (including using a private repo). │\n│ document Auto-generate a README.md for a project. If the content is saved to a file, hidden │\n│ markers are added so you can add custom content before or after the auto-generated │\n│ section and only the auto-generated docs will be replaced when you re-run the │\n│ command. │\n│ dvc Auto-generate Data Version Control (DVC) config. A DVC project can only define one │\n│ pipeline, so you need to specify one workflow defined in the project.yml. If no │\n│ workflow is specified, the first defined workflow is used. The DVC config will only │\n│ be updated if the project.yml changed. │\n│ run Run a named command or workflow defined in the project.yml. 
If a workflow name is │\n│ specified, all commands in the workflow are run, in order. If commands define │\n│ dependencies and/or outputs, they will only be re-run if state has changed. │\n│ pull Retrieve available precomputed outputs from a remote storage. You can alias remotes │\n│ in your project.yml by mapping them to storage paths. A storage can be anything that │\n│ the smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH, local │\n│ directories etc. │\n│ push Persist outputs to a remote storage. You can alias remotes in your project.yml by │\n│ mapping them to storage paths. A storage can be anything that the smart_open library │\n│ can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + } + }, + "group_help": { + "benchmark": " \n Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]... \n \n Commands for benchmarking pipelines. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ accuracy Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in │\n│ the binary .spacy format. The --gold-preproc option sets up the evaluation examples │\n│ with gold-standard sentences and tokens for the predictions. Gold preprocessing helps │\n│ the annotations align to the tokenization, and may result in sequences of more │\n│ consistent length. However, it may reduce runtime accuracy due to train/test skew. To │\n│ render a sample of dependency parses in a HTML file, set as output directory as the │\n│ displacy_path argument. │\n│ speed Benchmark a pipeline. 
Expects a loadable spaCy pipeline and benchmark data in the │\n│ binary .spacy format. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug": " \n Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]... \n \n Suite of helpful commands for debugging and profiling. Includes commands to check and validate \n your config files, training and evaluation data, and custom model implementations. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ data Analyze, debug and validate your training and development data. Outputs useful │\n│ stats, and can help you find problems like invalid entity annotations, cyclic │\n│ dependencies, low data labels and more. │\n│ profile Profile which functions take the most time in a spaCy pipeline. Input should be │\n│ formatted as one JSON object per line with a key \"text\". It can either be provided │\n│ as a JSONL file, or be read from sys.sytdin. If no input file is specified, the │\n│ IMDB dataset is loaded via Thinc. │\n│ config Debug a config file and show validation errors. The command will create all │\n│ objects in the tree and validate them. Note that some config validation errors are │\n│ blocking and will prevent the rest of the config from being resolved. This means │\n│ that you may not see all validation errors at once and some issues are only shown │\n│ once previous errors have been fixed. Similar as with the 'train' command, you can │\n│ override settings from the config as command line options. For instance, │\n│ --training.batch_size 128 overrides the value of \"batch_size\" in the block │\n│ \"[training]\". 
│\n│ diff-config Show a diff of a config file with respect to spaCy's defaults or another config │\n│ file. If additional settings were used in the creation of the config file, then │\n│ you must supply these as extra parameters to the command when comparing to the │\n│ default settings. The generated diff can also be used when posting to the │\n│ discussion forum to provide more information for the maintainers. │\n│ model Analyze a Thinc model implementation. Includes checks for internal structure and │\n│ activations during training. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init": " \n Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]... \n \n Commands for initializing configs and pipeline packages. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ config Generate a starter config file for training. Based on your requirements specified │\n│ via the CLI arguments, this command generates a config with the optimal settings │\n│ for your use case. This includes the choice of architecture, pretrained weights │\n│ and related hyperparameters. │\n│ fill-config Fill partial config file with default values. Will add all missing settings from │\n│ the default config and will create all objects, check the registered functions for │\n│ their default values and update the base config. This command can be used with a │\n│ config generated via the training quickstart widget: │\n│ https://spacy.io/usage/training#quickstart │\n│ vectors Convert word vectors for use with spaCy. Will export an nlp object that you can │\n│ use in the [initialize] block of your config to initialize a model with vectors. 
│\n│ labels Generate JSON files for the labels in the data. This helps speed up the training │\n│ process, since spaCy won't have to preprocess the data to extract the labels. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project": " \n Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]... \n \n Command-line interface for spaCy projects and templates. You'd typically start by cloning a \n project template to a local directory and fetching its assets like datasets etc. See the project's \n project.yml for the available commands. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ assets Fetch project assets like datasets and pretrained weights. Assets are defined in the │\n│ \"assets\" section of the project.yml. If a checksum is provided in the project.yml, │\n│ the file is only downloaded if no local file with the same checksum exists. │\n│ clone Clone a project template from a repository. Calls into \"git\" and will only download │\n│ the files from the given subdirectory. The GitHub repo defaults to the official │\n│ Weasel template repo, but can be customized (including using a private repo). │\n│ document Auto-generate a README.md for a project. If the content is saved to a file, hidden │\n│ markers are added so you can add custom content before or after the auto-generated │\n│ section and only the auto-generated docs will be replaced when you re-run the │\n│ command. │\n│ dvc Auto-generate Data Version Control (DVC) config. A DVC project can only define one │\n│ pipeline, so you need to specify one workflow defined in the project.yml. 
If no │\n│ workflow is specified, the first defined workflow is used. The DVC config will only │\n│ be updated if the project.yml changed. │\n│ run Run a named command or workflow defined in the project.yml. If a workflow name is │\n│ specified, all commands in the workflow are run, in order. If commands define │\n│ dependencies and/or outputs, they will only be re-run if state has changed. │\n│ pull Retrieve available precomputed outputs from a remote storage. You can alias remotes │\n│ in your project.yml by mapping them to storage paths. A storage can be anything that │\n│ the smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH, local │\n│ directories etc. │\n│ push Persist outputs to a remote storage. You can alias remotes in your project.yml by │\n│ mapping them to storage paths. A storage can be anything that the smart_open library │\n│ can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + }, + "hidden_group_commands": { + "benchmark": [], + "debug": [], + "init": [ + "nlp" + ], + "project": [] + }, + "hidden_top_level": [ + "link", + "debug-data", + "profile" + ], + "known_groups": { + "benchmark": [ + "accuracy", + "speed" + ], + "debug": [ + "config", + "data", + "diff-config", + "model", + "profile" + ], + "init": [ + "config", + "fill-config", + "labels", + "nlp", + "vectors" + ], + "project": [ + "assets", + "clone", + "document", + "dvc", + "pull", + "push", + "run" + ] + }, + "known_top_level": [ + "apply", + "assemble", + "benchmark", + "convert", + "debug", + "debug-data", + "download", + "evaluate", + "find-function", + "find-threshold", + "info", + "init", + "link", + "package", + "pretrain", + "profile", + "project", + "train", + "validate" + ], + "root_help": " \n Usage: python -m spacy [OPTIONS] COMMAND [ARGS]... 
\n \n spaCy Command-line Interface \n \n DOCS: https://spacy.io/api/cli \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --install-completion Install completion for the current shell. │\n│ --show-completion Show completion for the current shell, to copy it or customize the │\n│ installation. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ download Download compatible trained pipeline from the default download path using pip. │\n│ If --direct flag is set, the command expects the full package name with │\n│ version. For direct downloads, the compatibility check will be skipped. All │\n│ additional arguments provided to this command will be passed to `pip install` │\n│ on package installation. │\n│ info Print info about spaCy installation. If a pipeline is specified as an argument, │\n│ print its meta information. Flag --markdown prints details in Markdown for easy │\n│ copy-pasting to GitHub issues. │\n│ apply Apply a trained pipeline to documents to get predictions. Expects a loadable │\n│ spaCy pipeline and path to the data, which can be a directory or a file. The │\n│ data files can be provided in multiple formats: 1. .spacy files 2. │\n│ .jsonl files with a specified \"field\" to read the text from. 3. Files with │\n│ any other extension are assumed to be containing a single document. │\n│ DOCS: https://spacy.io/api/cli#apply │\n│ assemble Assemble a spaCy pipeline from a config file. The config file includes all │\n│ settings for initializing the pipeline. To override settings in the config, │\n│ e.g. settings that point to local paths or that you want to experiment with, │\n│ you can override them as command line options. 
The --code argument lets you │\n│ pass in a Python file that can be used to register custom functions that are │\n│ referenced in the config. │\n│ convert Convert files into json or DocBin format for training. The resulting .spacy │\n│ file can be used with the train command and other experiment management │\n│ functions. │\n│ evaluate Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation │\n│ data in the binary .spacy format. The --gold-preproc option sets up the │\n│ evaluation examples with gold-standard sentences and tokens for the │\n│ predictions. Gold preprocessing helps the annotations align to the │\n│ tokenization, and may result in sequences of more consistent length. However, │\n│ it may reduce runtime accuracy due to train/test skew. To render a sample of │\n│ dependency parses in a HTML file, set as output directory as the displacy_path │\n│ argument. │\n│ find-function Find the module, path and line number to the file the registered function is │\n│ defined in, if available. │\n│ find-threshold Runs prediction trials for a trained model with varying thresholds to maximize │\n│ the specified metric. The search space for the threshold is traversed linearly │\n│ from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` │\n│ (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` │\n│ returns all results). │\n│ package Generate an installable Python package for a pipeline. Includes binary data, │\n│ meta and required installation files. A new directory will be created in the │\n│ specified output directory, and the data will be copied over. If --create-meta │\n│ is set and a meta.json already exists in the output directory, the existing │\n│ values will be used as the defaults in the command-line prompt. After │\n│ packaging, \"python -m build --sdist\" is run in the package directory, which │\n│ will create a .tar.gz archive that can be installed via \"pip install\". 
│\n│ pretrain Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using │\n│ an approximate language-modelling objective. Two objective types are available, │\n│ vector-based and character-based. │\n│ train Train or update a spaCy pipeline. Requires data in spaCy's binary format. To │\n│ convert data from other formats, use the `spacy convert` command. The config │\n│ file includes all settings and hyperparameters used during training. To │\n│ override settings in the config, e.g. settings that point to local paths or │\n│ that you want to experiment with, you can override them as command line │\n│ options. For instance, --training.batch_size 128 overrides the value of │\n│ \"batch_size\" in the block \"[training]\". The --code argument lets you pass in a │\n│ Python file that's imported before training. It can be used to register custom │\n│ functions and architectures that can then be referenced in the config. │\n│ validate Validate the currently installed pipeline packages and spaCy version. Checks if │\n│ the installed packages are compatible and shows upgrade instructions if │\n│ available. Should be run after `pip install -U spacy`. │\n│ debug Suite of helpful commands for debugging and profiling. Includes commands to │\n│ check and validate your config files, training and evaluation data, and custom │\n│ model implementations. │\n│ benchmark Commands for benchmarking pipelines. │\n│ init Commands for initializing configs and pipeline packages. │\n│ project Command-line interface for spaCy projects and templates. You'd typically start │\n│ by cloning a project template to a local directory and fetching its assets like │\n│ datasets etc. See the project's project.yml for the available commands. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" +} diff --git a/spacy_cli/main.py b/spacy_cli/main.py new file mode 100644 index 00000000000..cbe6e376c58 --- /dev/null +++ b/spacy_cli/main.py @@ -0,0 +1,69 @@ +import sys +from typing import Iterable, Optional + +from .static import HELP_OPTIONS, UNKNOWN_COMMAND_TOKEN, UNKNOWN_SUBCOMMAND_TOKEN +from .static import get_plugin_command_names, load_manifest + + +def _write_output(text: str) -> None: + sys.stdout.write(text) + if not text.endswith("\n"): + sys.stdout.write("\n") + + +def _run_live() -> None: + from spacy.cli import setup_cli + + setup_cli() + + +def _try_static(argv: Iterable[str]): + args = list(argv) + manifest = load_manifest() + plugin_command_names = get_plugin_command_names() + known_groups = manifest["known_groups"] + known_top_level = set(manifest["known_top_level"]) + if not args: + return manifest["errors"]["missing_command"], 2 + first = args[0] + if first in HELP_OPTIONS: + if plugin_command_names: + return None + return manifest["root_help"], 0 + if first.startswith("-"): + return None + if first not in known_top_level: + if first in plugin_command_names: + return None + template = manifest["errors"]["unknown_command"] + return template.replace(UNKNOWN_COMMAND_TOKEN, first), 2 + if first in known_groups: + if len(args) == 1 or args[1] in HELP_OPTIONS: + if plugin_command_names: + return None + return manifest["group_help"][first], 0 + second = args[1] + if second not in known_groups[first]: + if plugin_command_names: + return None + template = manifest["errors"]["unknown_subcommand"][first] + return template.replace(UNKNOWN_SUBCOMMAND_TOKEN, second), 2 + if any(arg in HELP_OPTIONS for arg in args[2:]): + return manifest["command_help"][f"{first} {second}"], 0 + return None + if any(arg in HELP_OPTIONS for arg in args[1:]): + return manifest["command_help"][first], 0 + return None + + +def main(argv: Optional[Iterable[str]] = None) 
-> None: + args = sys.argv[1:] if argv is None else list(argv) + try: + static_result = _try_static(args) + except Exception: + return _run_live() + if static_result is None: + return _run_live() + text, code = static_result + _write_output(text) + raise SystemExit(code) diff --git a/spacy_cli/static.py b/spacy_cli/static.py new file mode 100644 index 00000000000..51594ceef9a --- /dev/null +++ b/spacy_cli/static.py @@ -0,0 +1,24 @@ +import json +from functools import lru_cache +from importlib.metadata import entry_points +from importlib.resources import files +from typing import Any, Dict, Set + + +HELP_OPTIONS = {"--help", "-h"} +PLUGIN_ENTRY_POINT_GROUP = "spacy_cli" +MANIFEST_FILE = "cli_manifest.json" +UNKNOWN_COMMAND_TOKEN = "__SPACY_UNKNOWN_COMMAND__" +UNKNOWN_SUBCOMMAND_TOKEN = "__SPACY_UNKNOWN_SUBCOMMAND__" + + +@lru_cache(maxsize=1) +def load_manifest() -> Dict[str, Any]: + data = files("spacy_cli").joinpath(MANIFEST_FILE).read_text(encoding="utf8") + return json.loads(data) + + +def get_plugin_command_names() -> Set[str]: + return { + entry_point.name for entry_point in entry_points(group=PLUGIN_ENTRY_POINT_GROUP) + } From 0a45289512d668a1beddbf11c2b163a0533e3a7a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Mar 2026 14:32:31 +0100 Subject: [PATCH 33/42] Fix lazy load on modules where the function shadows --- spacy/cli/__init__.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index dcfb4b8a92e..ded45efe9f7 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,4 +1,5 @@ import sys +import types from importlib import import_module from typing import Iterable @@ -79,6 +80,19 @@ def __dir__(): return sorted(set(globals()) | set(PUBLIC_ATTRS)) +class _CLIModule(types.ModuleType): + def __setattr__(self, name, value): + if isinstance(value, types.ModuleType) and name in PUBLIC_ATTRS: + _, attr_name = PUBLIC_ATTRS[name] + if attr_name is not None: + 
super().__setattr__(name, getattr(value, attr_name)) + return + super().__setattr__(name, value) + + +sys.modules[__name__].__class__ = _CLIModule + + @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) def link(*args, **kwargs): """As of spaCy v3.0, symlinks like "en" are not supported anymore. You can load trained From f0abcf7bc0d3c072a3a4c3dff4fc9329ba88d5f1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Mar 2026 14:32:41 +0100 Subject: [PATCH 34/42] Update manifest --- spacy_cli/cli_manifest.json | 78 ++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/spacy_cli/cli_manifest.json b/spacy_cli/cli_manifest.json index 361a10dca16..12d760416c5 100644 --- a/spacy_cli/cli_manifest.json +++ b/spacy_cli/cli_manifest.json @@ -1,55 +1,55 @@ { "command": "python -m spacy", "command_help": { - "apply": " \n Usage: python -m spacy apply [OPTIONS] MODEL DATA_PATH OUTPUT_FILE \n \n Apply a trained pipeline to documents to get predictions. Expects a loadable spaCy pipeline and \n path to the data, which can be a directory or a file. The data files can be provided in multiple \n formats: 1. .spacy files 2. .jsonl files with a specified \"field\" to read the text from. \n 3. Files with any other extension are assumed to be containing a single document. DOCS: \n https://spacy.io/api/cli#apply \n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of the documents to predict on. Can be a single file in │\n│ .spacy format or a .jsonl file. Files with other extensions are │\n│ treated as single plain text documents. If a directory is provided │\n│ it is traversed recursively to grab all files to be processed. The │\n│ files can be a mixture of .spacy, .jsonl and text files. 
If .jsonl │\n│ is provided the specified field is going to be grabbed (\"text\" by │\n│ default). │\n│ [default: None] │\n│ [required] │\n│ * output_file FILE Path to save the resulting .spacy file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ [default: None] │\n│ --text-key -tk TEXT Key containing text string for JSONL [default: text] │\n│ --force -F Force overwriting the output file │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU. [default: -1] │\n│ --batch-size -b INTEGER Batch size. [default: 1] │\n│ --n-process -n INTEGER number of processors to use. [default: 1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "assemble": " \n Usage: python -m spacy assemble [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n Assemble a spaCy pipeline from a config file. The config file includes all settings for \n initializing the pipeline. To override settings in the config, e.g. settings that point to local \n paths or that you want to experiment with, you can override them as command line options. The \n --code argument lets you pass in a Python file that can be used to register custom functions that \n are referenced in the config. 
\n \n DOCS: https://spacy.io/api/cli#assemble \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * output_path PATH Output directory to store assembled pipeline in [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) to │\n│ be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "benchmark accuracy": " \n Usage: python -m spacy benchmark accuracy [OPTIONS] MODEL DATA_PATH \n \n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in the binary \n .spacy format. The --gold-preproc option sets up the evaluation examples with gold-standard \n sentences and tokens for the predictions. Gold preprocessing helps the annotations align to the \n tokenization, and may result in sequences of more consistent length. However, it may reduce \n runtime accuracy due to train/test skew. To render a sample of dependency parses in a HTML file, \n set as output directory as the displacy_path argument. 
\n \n DOCS: https://spacy.io/api/cli#benchmark-accuracy \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o FILE Output JSON file for metrics [default: None] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --displacy-path -dp DIRECTORY Directory to output rendered parses as HTML │\n│ [default: None] │\n│ --displacy-limit -dl INTEGER Limit of parses to render as HTML [default: 25] │\n│ --per-component -P Return scores per component, only applicable when an │\n│ output JSON file is specified. │\n│ --spans-key -sk TEXT Spans key to use when evaluating Doc.spans [default: sc] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "benchmark speed": " \n Usage: python -m spacy benchmark speed [OPTIONS] MODEL DATA_PATH \n \n Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in the binary .spacy \n format. 
\n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --batch-size -b INTEGER RANGE [x>=1] Override the pipeline batch size [default: None] │\n│ --no-shuffle Do not shuffle benchmark data │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --batches INTEGER RANGE [x>=30] Minimum number of batches to benchmark │\n│ [default: 50] │\n│ --warmup -w INTEGER RANGE [x>=0] Number of iterations over the data for warmup │\n│ [default: 3] │\n│ --code -c PATH Path to Python file with additional code │\n│ (registered functions) to be imported │\n│ [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "convert": " \n Usage: python -m spacy convert [OPTIONS] INPUT_PATH [OUTPUT_DIR] \n \n Convert files into json or DocBin format for training. The resulting .spacy file can be used with \n the train command and other experiment management functions. \n \n If no output_dir is specified and the output format is JSON, the data is written to stdout, so you \n can pipe them forward to a JSON file: $ spacy convert some_file.conllu --file-type json > \n some_file.json \n DOCS: https://spacy.io/api/cli#convert \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * input_path TEXT Input file or directory [default: None] [required] │\n│ output_dir [OUTPUT_DIR] Output directory. '-' for stdout. 
[default: -] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --file-type -t [json|spacy] Type of data to produce [default: spacy] │\n│ --n-sents -n INTEGER Number of sentences per doc (0 to disable) │\n│ [default: 1] │\n│ --seg-sents -s Segment sentences (for -c ner) │\n│ --model,--base -b TEXT Trained spaCy pipeline for sentence segmentation to │\n│ use as base (for --seg-sents) │\n│ [default: None] │\n│ --morphology -m Enable appending morphology to tags │\n│ --merge-subtokens -T Merge CoNLL-U subtokens │\n│ --converter -c TEXT Converter: ('conllubio', 'conllu', 'conll', 'ner', │\n│ 'iob', 'json') │\n│ [default: auto] │\n│ --ner-map -nm PATH NER tag mapping (as JSON-encoded dict of entity types) │\n│ [default: None] │\n│ --lang -l TEXT Language (if tokenizer required) [default: None] │\n│ --concatenate -C Concatenate output to a single file │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug config": " \n Usage: python -m spacy debug config [OPTIONS] CONFIG_PATH \n \n Debug a config file and show validation errors. The command will create all objects in the tree \n and validate them. Note that some config validation errors are blocking and will prevent the rest \n of the config from being resolved. This means that you may not see all validation errors at once \n and some issues are only shown once previous errors have been fixed. Similar as with the 'train' \n command, you can override settings from the config as command line options. For instance, \n --training.batch_size 128 overrides the value of \"batch_size\" in the block \"[training]\". 
\n \n DOCS: https://spacy.io/api/cli#debug-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --show-functions -F Show an overview of all registered functions used in the │\n│ config and where they come from (modules, files etc.) │\n│ --show-variables -V Show an overview of all variables referenced in the config and │\n│ their values. This will also reflect variables overwritten on │\n│ the CLI. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug data": " \n Usage: python -m spacy debug data [OPTIONS] CONFIG_PATH \n \n Analyze, debug and validate your training and development data. Outputs useful stats, and can help \n you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. 
\n \n DOCS: https://spacy.io/api/cli#debug-data \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --ignore-warnings -IW Ignore warnings, only show stats and errors │\n│ --verbose -V Print additional information and explanations │\n│ --no-format -NF Don't pretty-print the results │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug diff-config": " \n Usage: python -m spacy debug diff-config [OPTIONS] CONFIG_PATH \n \n Show a diff of a config file with respect to spaCy's defaults or another config file. If \n additional settings were used in the creation of the config file, then you must supply these as \n extra parameters to the command when comparing to the default settings. The generated diff can \n also be used when posting to the discussion forum to provide more information for the maintainers. \n \n The `optimize`, `gpu`, and `pretraining` options are only relevant when comparing against the \n default configuration (or specifically when `compare_to` is None). 
\n DOCS: https://spacy.io/api/cli#debug-diff \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --compare-to PATH Path to a config file to diff against, or │\n│ `None` to compare against default settings │\n│ [default: None] │\n│ --optimize -o [efficiency|accuracy] Whether the user config was optimized for │\n│ efficiency or accuracy. Only relevant when │\n│ comparing against the default config. │\n│ [default: efficiency] │\n│ --gpu -G Whether the original config can run on a │\n│ GPU. Only relevant when comparing against │\n│ the default config. │\n│ --pretraining,--pt Whether to compare on a config with │\n│ pretraining involved. Only relevant when │\n│ comparing against the default config. │\n│ --markdown -md Generate Markdown for GitHub issues │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug model": " \n Usage: python -m spacy debug model [OPTIONS] CONFIG_PATH COMPONENT \n \n Analyze a Thinc model implementation. Includes checks for internal structure and activations \n during training. 
\n \n DOCS: https://spacy.io/api/cli#debug-model \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * component TEXT Name of the pipeline component of which the model should be analysed │\n│ [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --layers -l TEXT Comma-separated names of layer IDs to print │\n│ --dimensions -DIM Show dimensions │\n│ --parameters -PAR Show parameters │\n│ --gradients -GRAD Show gradients │\n│ --attributes -ATTR Show attributes │\n│ --print-step0 -P0 Print model before training │\n│ --print-step1 -P1 Print model after initialization │\n│ --print-step2 -P2 Print model after training │\n│ --print-step3 -P3 Print final predictions │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug profile": " \n Usage: python -m spacy debug profile [OPTIONS] MODEL [INPUTS] \n \n Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one \n JSON object per line with a key \"text\". It can either be provided as a JSONL file, or be read from \n sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. \n \n DOCS: https://spacy.io/api/cli#debug-profile \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Trained pipeline to load [default: None] [required] │\n│ inputs [INPUTS] Location of input file. '-' for stdin. 
[default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n-texts -n INTEGER Maximum number of texts to use if available [default: 10000] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug-data": " \n Usage: python -m spacy debug-data [OPTIONS] CONFIG_PATH \n \n Analyze, debug and validate your training and development data. Outputs useful stats, and can help \n you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. \n \n DOCS: https://spacy.io/api/cli#debug-data \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --ignore-warnings -IW Ignore warnings, only show stats and errors │\n│ --verbose -V Print additional information and explanations │\n│ --no-format -NF Don't pretty-print the results │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "download": " \n Usage: python -m spacy download [OPTIONS] MODEL \n \n Download compatible trained pipeline from the default download path using pip. If --direct flag is \n set, the command expects the full package name with version. For direct downloads, the \n compatibility check will be skipped. 
All additional arguments provided to this command will be \n passed to `pip install` on package installation. \n \n DOCS: https://spacy.io/api/cli#download AVAILABLE PACKAGES: https://spacy.io/models \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Name of pipeline package to download [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --direct -d,-D Force direct download of name + version │\n│ --sdist -S Download sdist (.tar.gz) archive instead of pre-built binary wheel │\n│ --url -U TEXT Download from given url [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "evaluate": " \n Usage: python -m spacy evaluate [OPTIONS] MODEL DATA_PATH \n \n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in the binary \n .spacy format. The --gold-preproc option sets up the evaluation examples with gold-standard \n sentences and tokens for the predictions. Gold preprocessing helps the annotations align to the \n tokenization, and may result in sequences of more consistent length. However, it may reduce \n runtime accuracy due to train/test skew. To render a sample of dependency parses in a HTML file, \n set as output directory as the displacy_path argument. 
\n \n DOCS: https://spacy.io/api/cli#benchmark-accuracy \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o FILE Output JSON file for metrics [default: None] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --displacy-path -dp DIRECTORY Directory to output rendered parses as HTML │\n│ [default: None] │\n│ --displacy-limit -dl INTEGER Limit of parses to render as HTML [default: 25] │\n│ --per-component -P Return scores per component, only applicable when an │\n│ output JSON file is specified. │\n│ --spans-key -sk TEXT Spans key to use when evaluating Doc.spans [default: sc] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "find-function": " \n Usage: python -m spacy find-function [OPTIONS] FUNC_NAME \n \n Find the module, path and line number to the file the registered function is defined in, if \n available. \n \n func_name (str): Name of the registered function. registry_name (Optional[str]): Name of the \n catalogue registry. \n DOCS: https://spacy.io/api/cli#find-function \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * func_name TEXT Name of the registered function. 
[default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --registry -r TEXT Name of the catalogue registry. [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "find-threshold": " \n Usage: python -m spacy find-threshold [OPTIONS] MODEL DATA_PATH PIPE_NAME \n THRESHOLD_KEY SCORES_KEY \n \n Runs prediction trials for a trained model with varying thresholds to maximize the specified \n metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. \n Results are displayed in a table on `stdout` (the corresponding API call to \n `spacy.cli.find_threshold.find_threshold()` returns all results). \n \n This is applicable only for components whose predictions are influenced by thresholds - e.g. \n `textcat_multilabel` and `spancat`, but not `textcat`. Note that the full path to the \n corresponding threshold attribute in the config has to be provided. 
\n DOCS: https://spacy.io/api/cli#find-threshold \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [default: None] [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format │\n│ [default: None] │\n│ [required] │\n│ * pipe_name TEXT Name of pipe to examine thresholds for [default: None] [required] │\n│ * threshold_key TEXT Key of threshold attribute in component's configuration │\n│ [default: None] │\n│ [required] │\n│ * scores_key TEXT Metric to optimize [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n_trials -n INTEGER Number of trials to determine optimal thresholds │\n│ [default: 11] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "info": " \n Usage: python -m spacy info [OPTIONS] [MODEL] \n \n Print info about spaCy installation. If a pipeline is specified as an argument, print its meta \n information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. \n \n Flag --url prints only the download URL of the most recent compatible version of the pipeline. 
\n DOCS: https://spacy.io/api/cli#info \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ model [MODEL] Optional loadable spaCy pipeline [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --markdown -md Generate Markdown for GitHub issues │\n│ --silent -s,-S Don't print anything (just return) │\n│ --exclude -e TEXT Comma-separated keys to exclude from the print-out │\n│ [default: labels] │\n│ --url -u Print the URL to download the most recent compatible version of the │\n│ pipeline │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init config": " \n Usage: python -m spacy init config [OPTIONS] OUTPUT_FILE \n \n Generate a starter config file for training. Based on your requirements specified via the CLI \n arguments, this command generates a config with the optimal settings for your use case. This \n includes the choice of architecture, pretrained weights and related hyperparameters. 
\n \n DOCS: https://spacy.io/api/cli#init-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * output_file PATH File to save the config to or - for stdout (will only output config │\n│ and no additional logging info) │\n│ [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --lang -l TEXT Two-letter code of the language to use │\n│ [default: en] │\n│ --pipeline -p TEXT Comma-separated names of trainable pipeline │\n│ components to include (without 'tok2vec' or │\n│ 'transformer') │\n│ [default: tagger,parser,ner] │\n│ --optimize -o [efficiency|accuracy] Whether to optimize for efficiency (faster │\n│ inference, smaller model, lower memory │\n│ consumption) or higher accuracy (potentially │\n│ larger and slower model). This will impact the │\n│ choice of architecture, pretrained weights and │\n│ related hyperparameters. │\n│ [default: efficiency] │\n│ --gpu -G Whether the model can run on GPU. This will │\n│ impact the choice of architecture, pretrained │\n│ weights and related hyperparameters. │\n│ --pretraining -pt Include config for pretraining (with 'spacy │\n│ pretrain') │\n│ --force -F Force overwriting the output file │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init fill-config": " \n Usage: python -m spacy init fill-config [OPTIONS] BASE_PATH [OUTPUT_FILE] \n \n Fill partial config file with default values. Will add all missing settings from the default \n config and will create all objects, check the registered functions for their default values and \n update the base config. 
This command can be used with a config generated via the training \n quickstart widget: https://spacy.io/usage/training#quickstart \n \n DOCS: https://spacy.io/api/cli#init-fill-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * base_path FILE Path to base config to fill [default: None] [required] │\n│ output_file [OUTPUT_FILE] Path to output .cfg file (or - for stdout) [default: -] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --pretraining -pt Include config for pretraining (with 'spacy pretrain') │\n│ --diff -D Print a visual diff highlighting the changes │\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init labels": " \n Usage: python -m spacy init labels [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n Generate JSON files for the labels in the data. This helps speed up the training process, since \n spaCy won't have to preprocess the data to extract the labels. 
\n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * output_path PATH Output directory for the labels [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init nlp": " \n Usage: python -m spacy init nlp [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n│ * output_path PATH Output directory for the prepared data [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init vectors": " \n Usage: python -m spacy init vectors [OPTIONS] LANG VECTORS_LOC OUTPUT_DIR \n \n Convert word vectors for use with spaCy. 
Will export an nlp object that you can use in the \n [initialize] block of your config to initialize a model with vectors. \n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * lang TEXT The language of the nlp object to create [default: None] [required] │\n│ * vectors_loc PATH Vectors file in Word2Vec format [default: None] [required] │\n│ * output_dir PATH Pipeline output directory [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --prune -p INTEGER Optional number of vectors to prune to [default: -1] │\n│ --truncate -t INTEGER Optional number of vectors to truncate to when reading in │\n│ vectors file │\n│ [default: 0] │\n│ --mode -m TEXT Vectors mode: default or floret [default: default] │\n│ --name -n TEXT Optional name for the word vectors, e.g. en_core_web_lg.vectors │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --attr -a TEXT Optional token attribute to use for vectors, e.g. LOWER or NORM │\n│ [default: ORTH] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "link": " \n Usage: python -m spacy link [OPTIONS] ARGS KWARGS \n \n (deprecated) \n As of spaCy v3.0, symlinks like \"en\" are not supported anymore. You can load trained pipeline \n packages using their full names or from a directory path. 
\n \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * args TEXT [default: None] [required] │\n│ * kwargs TEXT [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "package": " \n Usage: python -m spacy package [OPTIONS] INPUT_DIR OUTPUT_DIR \n \n Generate an installable Python package for a pipeline. Includes binary data, meta and required \n installation files. A new directory will be created in the specified output directory, and the \n data will be copied over. If --create-meta is set and a meta.json already exists in the output \n directory, the existing values will be used as the defaults in the command-line prompt. After \n packaging, \"python -m build --sdist\" is run in the package directory, which will create a .tar.gz \n archive that can be installed via \"pip install\". \n \n If additional code files are provided (e.g. Python files containing custom registered functions \n like pipeline components), they are copied into the package and imported in the __init__.py. 
\n DOCS: https://spacy.io/api/cli#package \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * input_dir DIRECTORY Directory with pipeline data [default: None] [required] │\n│ * output_dir DIRECTORY Output parent directory [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c TEXT Comma-separated paths to Python file │\n│ with additional code (registered │\n│ functions) to be included in the package │\n│ --meta-path,--meta -m FILE Path to meta.json [default: None] │\n│ --create-meta -C Create meta.json, even if one exists │\n│ --name -n TEXT Package name to override meta │\n│ [default: None] │\n│ --version -v TEXT Package version to override meta │\n│ [default: None] │\n│ --build -b TEXT Comma-separated formats to build: sdist │\n│ and/or wheel, or none. │\n│ [default: sdist] │\n│ --force -f,-F Force overwriting existing data in │\n│ output directory │\n│ --require-parent -R,-R --no-require-parent Include the parent package (e.g. spacy) │\n│ in the requirements │\n│ [default: require-parent] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "pretrain": " \n Usage: python -m spacy pretrain [OPTIONS] CONFIG_PATH OUTPUT_DIR \n \n Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using an approximate \n language-modelling objective. Two objective types are available, vector-based and character-based. \n \n In the vector-based objective, we load word vectors that have been trained using a word2vec-style \n distributional similarity algorithm, and train a component like a CNN, BiLSTM, etc to predict \n vectors which match the pretrained ones. The weights are saved to a directory after each epoch. 
\n You can then pass a path to one of these pretrained weights files to the 'spacy train' command. \n This technique may be especially helpful if you have little labelled data. However, it's still \n quite experimental, so your mileage may vary. \n To load the weights back in during 'spacy train', you need to ensure all settings are the same \n between pretraining and training. Ideally, this is done by using the same config file for both \n commands. \n DOCS: https://spacy.io/api/cli#pretrain \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path FILE Path to config file [default: None] [required] │\n│ * output_dir PATH Directory to write weights to on each epoch [default: None] │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ [default: None] │\n│ --resume-path -r PATH Path to pretrained weights from which to resume pretraining │\n│ [default: None] │\n│ --epoch-resume -er INTEGER The epoch to resume counting from when using --resume-path. │\n│ Prevents unintended overwriting of existing weight files. │\n│ [default: None] │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --skip-last -L Skip saving model-last.bin │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "profile": " \n Usage: python -m spacy profile [OPTIONS] MODEL [INPUTS] \n \n Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one \n JSON object per line with a key \"text\". It can either be provided as a JSONL file, or be read from \n sys.sytdin. 
If no input file is specified, the IMDB dataset is loaded via Thinc. \n \n DOCS: https://spacy.io/api/cli#debug-profile \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Trained pipeline to load [default: None] [required] │\n│ inputs [INPUTS] Location of input file. '-' for stdin. [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n-texts -n INTEGER Maximum number of texts to use if available [default: 10000] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "apply": " \n Usage: python -m spacy apply [OPTIONS] MODEL DATA_PATH OUTPUT_FILE \n \n Apply a trained pipeline to documents to get predictions. Expects a loadable spaCy pipeline and \n path to the data, which can be a directory or a file. The data files can be provided in multiple \n formats: 1. .spacy files 2. .jsonl files with a specified \"field\" to read the text from. \n 3. Files with any other extension are assumed to be containing a single document. DOCS: \n https://spacy.io/api/cli#apply \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of the documents to predict on. Can be a single file in │\n│ .spacy format or a .jsonl file. Files with other extensions are │\n│ treated as single plain text documents. If a directory is provided │\n│ it is traversed recursively to grab all files to be processed. The │\n│ files can be a mixture of .spacy, .jsonl and text files. If .jsonl │\n│ is provided the specified field is going to be grabbed (\"text\" by │\n│ default). 
│\n│ [required] │\n│ * output_file FILE Path to save the resulting .spacy file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ --text-key -tk TEXT Key containing text string for JSONL [default: text] │\n│ --force -F Force overwriting the output file │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU. [default: -1] │\n│ --batch-size -b INTEGER Batch size. [default: 1] │\n│ --n-process -n INTEGER number of processors to use. [default: 1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "assemble": " \n Usage: python -m spacy assemble [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n Assemble a spaCy pipeline from a config file. The config file includes all settings for \n initializing the pipeline. To override settings in the config, e.g. settings that point to local \n paths or that you want to experiment with, you can override them as command line options. The \n --code argument lets you pass in a Python file that can be used to register custom functions that \n are referenced in the config. 
\n \n DOCS: https://spacy.io/api/cli#assemble \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n│ * output_path PATH Output directory to store assembled pipeline in [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) to │\n│ be imported │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "benchmark accuracy": " \n Usage: python -m spacy benchmark accuracy [OPTIONS] MODEL DATA_PATH \n \n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in the binary \n .spacy format. The --gold-preproc option sets up the evaluation examples with gold-standard \n sentences and tokens for the predictions. Gold preprocessing helps the annotations align to the \n tokenization, and may result in sequences of more consistent length. However, it may reduce \n runtime accuracy due to train/test skew. To render a sample of dependency parses in a HTML file, \n set as output directory as the displacy_path argument. 
\n \n DOCS: https://spacy.io/api/cli#benchmark-accuracy \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o FILE Output JSON file for metrics │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --displacy-path -dp DIRECTORY Directory to output rendered parses as HTML │\n│ --displacy-limit -dl INTEGER Limit of parses to render as HTML [default: 25] │\n│ --per-component -P Return scores per component, only applicable when an │\n│ output JSON file is specified. │\n│ --spans-key -sk TEXT Spans key to use when evaluating Doc.spans [default: sc] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "benchmark speed": " \n Usage: python -m spacy benchmark speed [OPTIONS] MODEL DATA_PATH \n \n Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in the binary .spacy \n format. 
\n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --batch-size -b INTEGER RANGE [x>=1] Override the pipeline batch size │\n│ --no-shuffle Do not shuffle benchmark data │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --batches INTEGER RANGE [x>=30] Minimum number of batches to benchmark │\n│ [default: 50] │\n│ --warmup -w INTEGER RANGE [x>=0] Number of iterations over the data for warmup │\n│ [default: 3] │\n│ --code -c PATH Path to Python file with additional code │\n│ (registered functions) to be imported │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "convert": " \n Usage: python -m spacy convert [OPTIONS] INPUT_PATH [OUTPUT_DIR] \n \n Convert files into json or DocBin format for training. The resulting .spacy file can be used with \n the train command and other experiment management functions. \n \n If no output_dir is specified and the output format is JSON, the data \n is written to stdout, so you can pipe them forward to a JSON file: \n $ spacy convert some_file.conllu --file-type json > some_file.json \n \n DOCS: https://spacy.io/api/cli#convert \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * input_path TEXT Input file or directory [required] │\n│ output_dir [OUTPUT_DIR] Output directory. '-' for stdout. 
[default: -] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --file-type -t [json|spacy] Type of data to produce [default: spacy] │\n│ --n-sents -n INTEGER Number of sentences per doc (0 to disable) │\n│ [default: 1] │\n│ --seg-sents -s Segment sentences (for -c ner) │\n│ --model,--base -b TEXT Trained spaCy pipeline for sentence segmentation to │\n│ use as base (for --seg-sents) │\n│ --morphology -m Enable appending morphology to tags │\n│ --merge-subtokens -T Merge CoNLL-U subtokens │\n│ --converter -c TEXT Converter: ('conllubio', 'conllu', 'conll', 'ner', │\n│ 'iob', 'json') │\n│ [default: auto] │\n│ --ner-map -nm PATH NER tag mapping (as JSON-encoded dict of entity types) │\n│ --lang -l TEXT Language (if tokenizer required) │\n│ --concatenate -C Concatenate output to a single file │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug config": " \n Usage: python -m spacy debug config [OPTIONS] CONFIG_PATH \n \n Debug a config file and show validation errors. The command will create all objects in the tree \n and validate them. Note that some config validation errors are blocking and will prevent the rest \n of the config from being resolved. This means that you may not see all validation errors at once \n and some issues are only shown once previous errors have been fixed. Similar as with the 'train' \n command, you can override settings from the config as command line options. For instance, \n --training.batch_size 128 overrides the value of \"batch_size\" in the block \"\". 
\n \n DOCS: https://spacy.io/api/cli#debug-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --show-functions -F Show an overview of all registered functions used in the │\n│ config and where they come from (modules, files etc.) │\n│ --show-variables -V Show an overview of all variables referenced in the config and │\n│ their values. This will also reflect variables overwritten on │\n│ the CLI. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug data": " \n Usage: python -m spacy debug data [OPTIONS] CONFIG_PATH \n \n Analyze, debug and validate your training and development data. Outputs useful stats, and can help \n you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. 
\n \n DOCS: https://spacy.io/api/cli#debug-data \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --ignore-warnings -IW Ignore warnings, only show stats and errors │\n│ --verbose -V Print additional information and explanations │\n│ --no-format -NF Don't pretty-print the results │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug diff-config": " \n Usage: python -m spacy debug diff-config [OPTIONS] CONFIG_PATH \n \n Show a diff of a config file with respect to spaCy's defaults or another config file. If \n additional settings were used in the creation of the config file, then you must supply these as \n extra parameters to the command when comparing to the default settings. The generated diff can \n also be used when posting to the discussion forum to provide more information for the maintainers. \n \n The `optimize`, `gpu`, and `pretraining` options are only relevant when \n comparing against the default configuration (or specifically when `compare_to` is None). 
\n \n DOCS: https://spacy.io/api/cli#debug-diff \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --compare-to PATH Path to a config file to diff against, or │\n│ `None` to compare against default settings │\n│ --optimize -o [efficiency|accuracy] Whether the user config was optimized for │\n│ efficiency or accuracy. Only relevant when │\n│ comparing against the default config. │\n│ [default: efficiency] │\n│ --gpu -G Whether the original config can run on a │\n│ GPU. Only relevant when comparing against │\n│ the default config. │\n│ --pretraining,--pt Whether to compare on a config with │\n│ pretraining involved. Only relevant when │\n│ comparing against the default config. │\n│ --markdown -md Generate Markdown for GitHub issues │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug model": " \n Usage: python -m spacy debug model [OPTIONS] CONFIG_PATH COMPONENT \n \n Analyze a Thinc model implementation. Includes checks for internal structure and activations \n during training. 
\n \n DOCS: https://spacy.io/api/cli#debug-model \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n│ * component TEXT Name of the pipeline component of which the model should be analysed │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --layers -l TEXT Comma-separated names of layer IDs to print │\n│ --dimensions -DIM Show dimensions │\n│ --parameters -PAR Show parameters │\n│ --gradients -GRAD Show gradients │\n│ --attributes -ATTR Show attributes │\n│ --print-step0 -P0 Print model before training │\n│ --print-step1 -P1 Print model after initialization │\n│ --print-step2 -P2 Print model after training │\n│ --print-step3 -P3 Print final predictions │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug profile": " \n Usage: python -m spacy debug profile [OPTIONS] MODEL [INPUTS] \n \n Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one \n JSON object per line with a key \"text\". It can either be provided as a JSONL file, or be read from \n sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. \n \n DOCS: https://spacy.io/api/cli#debug-profile \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Trained pipeline to load [required] │\n│ inputs [INPUTS] Location of input file. '-' for stdin. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n-texts -n INTEGER Maximum number of texts to use if available [default: 10000] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug-data": " \n Usage: python -m spacy debug-data [OPTIONS] CONFIG_PATH \n \n Analyze, debug and validate your training and development data. Outputs useful stats, and can help \n you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. \n \n DOCS: https://spacy.io/api/cli#debug-data \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --ignore-warnings -IW Ignore warnings, only show stats and errors │\n│ --verbose -V Print additional information and explanations │\n│ --no-format -NF Don't pretty-print the results │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "download": " \n Usage: python -m spacy download [OPTIONS] MODEL \n \n Download compatible trained pipeline from the default download path using pip. If --direct flag is \n set, the command expects the full package name with version. For direct downloads, the \n compatibility check will be skipped. All additional arguments provided to this command will be \n passed to `pip install` on package installation. 
\n \n DOCS: https://spacy.io/api/cli#download \n AVAILABLE PACKAGES: https://spacy.io/models \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Name of pipeline package to download [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --direct -d,-D Force direct download of name + version │\n│ --sdist -S Download sdist (.tar.gz) archive instead of pre-built binary wheel │\n│ --url -U TEXT Download from given url │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "evaluate": " \n Usage: python -m spacy evaluate [OPTIONS] MODEL DATA_PATH \n \n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in the binary \n .spacy format. The --gold-preproc option sets up the evaluation examples with gold-standard \n sentences and tokens for the predictions. Gold preprocessing helps the annotations align to the \n tokenization, and may result in sequences of more consistent length. However, it may reduce \n runtime accuracy due to train/test skew. To render a sample of dependency parses in a HTML file, \n set as output directory as the displacy_path argument. 
\n \n DOCS: https://spacy.io/api/cli#benchmark-accuracy \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o FILE Output JSON file for metrics │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --displacy-path -dp DIRECTORY Directory to output rendered parses as HTML │\n│ --displacy-limit -dl INTEGER Limit of parses to render as HTML [default: 25] │\n│ --per-component -P Return scores per component, only applicable when an │\n│ output JSON file is specified. │\n│ --spans-key -sk TEXT Spans key to use when evaluating Doc.spans [default: sc] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "find-function": " \n Usage: python -m spacy find-function [OPTIONS] FUNC_NAME \n \n Find the module, path and line number to the file the registered function is defined in, if \n available. \n \n func_name (str): Name of the registered function. \n registry_name (Optional): Name of the catalogue registry. \n \n DOCS: https://spacy.io/api/cli#find-function \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * func_name TEXT Name of the registered function. 
[required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --registry -r TEXT Name of the catalogue registry. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "find-threshold": " \n Usage: python -m spacy find-threshold [OPTIONS] MODEL DATA_PATH PIPE_NAME \n THRESHOLD_KEY SCORES_KEY \n \n Runs prediction trials for a trained model with varying thresholds to maximize the specified \n metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. \n Results are displayed in a table on `stdout` (the corresponding API call to \n `spacy.cli.find_threshold.find_threshold()` returns all results). \n \n This is applicable only for components whose predictions are influenced by \n thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note \n that the full path to the corresponding threshold attribute in the config has to \n be provided. 
\n \n DOCS: https://spacy.io/api/cli#find-threshold \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [required] │\n│ * pipe_name TEXT Name of pipe to examine thresholds for [required] │\n│ * threshold_key TEXT Key of threshold attribute in component's configuration [required] │\n│ * scores_key TEXT Metric to optimize [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n_trials -n INTEGER Number of trials to determine optimal thresholds │\n│ [default: 11] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "info": " \n Usage: python -m spacy info [OPTIONS] [MODEL] \n \n Print info about spaCy installation. If a pipeline is specified as an argument, print its meta \n information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. \n \n Flag --url prints only the download URL of the most recent compatible \n version of the pipeline. 
\n \n DOCS: https://spacy.io/api/cli#info \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ model [MODEL] Optional loadable spaCy pipeline │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --markdown -md Generate Markdown for GitHub issues │\n│ --silent -s,-S Don't print anything (just return) │\n│ --exclude -e TEXT Comma-separated keys to exclude from the print-out │\n│ [default: labels] │\n│ --url -u Print the URL to download the most recent compatible version of the │\n│ pipeline │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init config": " \n Usage: python -m spacy init config [OPTIONS] OUTPUT_FILE \n \n Generate a starter config file for training. Based on your requirements specified via the CLI \n arguments, this command generates a config with the optimal settings for your use case. This \n includes the choice of architecture, pretrained weights and related hyperparameters. 
\n \n DOCS: https://spacy.io/api/cli#init-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * output_file PATH File to save the config to or - for stdout (will only output config │\n│ and no additional logging info) │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --lang -l TEXT Two-letter code of the language to use │\n│ [default: en] │\n│ --pipeline -p TEXT Comma-separated names of trainable pipeline │\n│ components to include (without 'tok2vec' or │\n│ 'transformer') │\n│ [default: tagger,parser,ner] │\n│ --optimize -o [efficiency|accuracy] Whether to optimize for efficiency (faster │\n│ inference, smaller model, lower memory │\n│ consumption) or higher accuracy (potentially │\n│ larger and slower model). This will impact the │\n│ choice of architecture, pretrained weights and │\n│ related hyperparameters. │\n│ [default: efficiency] │\n│ --gpu -G Whether the model can run on GPU. This will │\n│ impact the choice of architecture, pretrained │\n│ weights and related hyperparameters. │\n│ --pretraining -pt Include config for pretraining (with 'spacy │\n│ pretrain') │\n│ --force -F Force overwriting the output file │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init fill-config": " \n Usage: python -m spacy init fill-config [OPTIONS] BASE_PATH [OUTPUT_FILE] \n \n Fill partial config file with default values. Will add all missing settings from the default \n config and will create all objects, check the registered functions for their default values and \n update the base config. 
This command can be used with a config generated via the training \n quickstart widget: https://spacy.io/usage/training#quickstart \n \n DOCS: https://spacy.io/api/cli#init-fill-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * base_path FILE Path to base config to fill [required] │\n│ output_file [OUTPUT_FILE] Path to output .cfg file (or - for stdout) [default: -] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --pretraining -pt Include config for pretraining (with 'spacy pretrain') │\n│ --diff -D Print a visual diff highlighting the changes │\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init labels": " \n Usage: python -m spacy init labels [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n Generate JSON files for the labels in the data. This helps speed up the training process, since \n spaCy won't have to preprocess the data to extract the labels. 
\n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n│ * output_path PATH Output directory for the labels [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init nlp": " \n Usage: python -m spacy init nlp [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n│ * output_path PATH Output directory for the prepared data [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init vectors": " \n Usage: python -m spacy init vectors [OPTIONS] LANG VECTORS_LOC OUTPUT_DIR \n \n Convert word vectors for use with spaCy. Will export an nlp object that you can use in the block \n of your config to initialize a model with vectors. 
\n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * lang TEXT The language of the nlp object to create [required] │\n│ * vectors_loc PATH Vectors file in Word2Vec format [required] │\n│ * output_dir PATH Pipeline output directory [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --prune -p INTEGER Optional number of vectors to prune to [default: -1] │\n│ --truncate -t INTEGER Optional number of vectors to truncate to when reading in │\n│ vectors file │\n│ [default: 0] │\n│ --mode -m TEXT Vectors mode: default or floret [default: default] │\n│ --name -n TEXT Optional name for the word vectors, e.g. en_core_web_lg.vectors │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --attr -a TEXT Optional token attribute to use for vectors, e.g. LOWER or NORM │\n│ [default: ORTH] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "link": " \n Usage: python -m spacy link [OPTIONS] ARGS KWARGS \n \n (deprecated) \n As of spaCy v3.0, symlinks like \"en\" are not supported anymore. You can load trained pipeline \n packages using their full names or from a directory path. \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * args TEXT [required] │\n│ * kwargs TEXT [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "package": " \n Usage: python -m spacy package [OPTIONS] INPUT_DIR OUTPUT_DIR \n \n Generate an installable Python package for a pipeline. Includes binary data, meta and required \n installation files. A new directory will be created in the specified output directory, and the \n data will be copied over. If --create-meta is set and a meta.json already exists in the output \n directory, the existing values will be used as the defaults in the command-line prompt. After \n packaging, \"python -m build --sdist\" is run in the package directory, which will create a .tar.gz \n archive that can be installed via \"pip install\". \n \n If additional code files are provided (e.g. Python files containing custom \n registered functions like pipeline components), they are copied into the \n package and imported in the __init__.py. \n \n DOCS: https://spacy.io/api/cli#package \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * input_dir DIRECTORY Directory with pipeline data [required] │\n│ * output_dir DIRECTORY Output parent directory [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c TEXT Comma-separated paths to Python file │\n│ with additional code (registered │\n│ functions) to be included in the package │\n│ --meta-path,--meta -m FILE Path to meta.json │\n│ --create-meta -C Create meta.json, even if one exists │\n│ --name -n TEXT Package name to override meta │\n│ --version -v TEXT Package version to override meta │\n│ --build -b TEXT Comma-separated formats to build: sdist │\n│ and/or wheel, or none. 
│\n│ [default: sdist] │\n│ --force -f,-F Force overwriting existing data in │\n│ output directory │\n│ --require-parent -R,-R --no-require-parent Include the parent package (e.g. spacy) │\n│ in the requirements │\n│ [default: require-parent] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "pretrain": " \n Usage: python -m spacy pretrain [OPTIONS] CONFIG_PATH OUTPUT_DIR \n \n Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using an approximate \n language-modelling objective. Two objective types are available, vector-based and character-based. \n \n In the vector-based objective, we load word vectors that have been trained \n using a word2vec-style distributional similarity algorithm, and train a \n component like a CNN, BiLSTM, etc to predict vectors which match the \n pretrained ones. The weights are saved to a directory after each epoch. You \n can then pass a path to one of these pretrained weights files to the \n 'spacy train' command. \n \n This technique may be especially helpful if you have little labelled data. \n However, it's still quite experimental, so your mileage may vary. \n \n To load the weights back in during 'spacy train', you need to ensure \n all settings are the same between pretraining and training. Ideally, \n this is done by using the same config file for both commands. 
\n \n DOCS: https://spacy.io/api/cli#pretrain \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path FILE Path to config file [required] │\n│ * output_dir PATH Directory to write weights to on each epoch [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --resume-path -r PATH Path to pretrained weights from which to resume pretraining │\n│ --epoch-resume -er INTEGER The epoch to resume counting from when using --resume-path. │\n│ Prevents unintended overwriting of existing weight files. │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --skip-last -L Skip saving model-last.bin │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "profile": " \n Usage: python -m spacy profile [OPTIONS] MODEL [INPUTS] \n \n Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one \n JSON object per line with a key \"text\". It can either be provided as a JSONL file, or be read from \n sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. \n \n DOCS: https://spacy.io/api/cli#debug-profile \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Trained pipeline to load [required] │\n│ inputs [INPUTS] Location of input file. '-' for stdin. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n-texts -n INTEGER Maximum number of texts to use if available [default: 10000] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", "project assets": " \n Usage: python -m spacy project assets [OPTIONS] [PROJECT_DIR] \n \n Fetch project assets like datasets and pretrained weights. Assets are defined in the \"assets\" \n section of the project.yml. If a checksum is provided in the project.yml, the file is only \n downloaded if no local file with the same checksum exists. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/tutorial/directory-and-assets.md \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Path to cloned project. Defaults to current working directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --sparse -S Use sparse checkout for assets provided via Git, to only check out and clone │\n│ the files needed. Requires Git v22.2+. │\n│ --extra -e Download all assets, including those marked as 'extra'. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project clone": " \n Usage: python -m spacy project clone [OPTIONS] NAME [DEST] \n \n Clone a project template from a repository. Calls into \"git\" and will only download the files from \n the given subdirectory. 
The GitHub repo defaults to the official Weasel template repo, but can be \n customized (including using a private repo). \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#clipboard-clone \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * name TEXT The name of the template to clone [default: None] [required] │\n│ dest [DEST] Where to clone the project. Defaults to current working directory │\n│ [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --repo -r TEXT The repository to clone from │\n│ [default: https://github.com/explosion/projects] │\n│ --branch -b TEXT The branch to clone from. If not provided, will attempt main, master │\n│ [default: None] │\n│ --sparse -S Use sparse Git checkout to only check out and clone the files needed. │\n│ Requires Git v22.2+. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project clone": " \n Usage: python -m spacy project clone [OPTIONS] NAME [DEST] \n \n Clone a project template from a repository. Calls into \"git\" and will only download the files from \n the given subdirectory. The GitHub repo defaults to the official Weasel template repo, but can be \n customized (including using a private repo). \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#clipboard-clone \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * name TEXT The name of the template to clone [required] │\n│ dest [DEST] Where to clone the project. 
Defaults to current working directory │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --repo -r TEXT The repository to clone from │\n│ [default: https://github.com/explosion/projects] │\n│ --branch -b TEXT The branch to clone from. If not provided, will attempt main, master │\n│ --sparse -S Use sparse Git checkout to only check out and clone the files needed. │\n│ Requires Git v22.2+. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", "project document": " \n Usage: python -m spacy project document [OPTIONS] [PROJECT_DIR] \n \n Auto-generate a README.md for a project. If the content is saved to a file, hidden markers are \n added so you can add custom content before or after the auto-generated section and only the \n auto-generated docs will be replaced when you re-run the command. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#closed_book-document \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Path to cloned project. Defaults to current working directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o PATH Path to output Markdown file for output. Defaults to - for standard │\n│ output │\n│ [default: -] │\n│ --no-emoji -NE Don't use emoji │\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project dvc": " \n Usage: python -m spacy project dvc [OPTIONS] [PROJECT_DIR] [WORKFLOW] \n \n Auto-generate Data Version Control (DVC) config. A DVC project can only define one pipeline, so \n you need to specify one workflow defined in the project.yml. If no workflow is specified, the \n first defined workflow is used. The DVC config will only be updated if the project.yml changed. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#repeat-dvc \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n│ workflow [WORKFLOW] Name of workflow defined in project.yml. Defaults to first │\n│ workflow if not set. │\n│ [default: None] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --verbose -V Print more info │\n│ --quiet -q Print less info │\n│ --force -F Force update DVC config │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project dvc": " \n Usage: python -m spacy project dvc [OPTIONS] [PROJECT_DIR] [WORKFLOW] \n \n Auto-generate Data Version Control (DVC) config. A DVC project can only define one pipeline, so \n you need to specify one workflow defined in the project.yml. If no workflow is specified, the \n first defined workflow is used. The DVC config will only be updated if the project.yml changed. 
\n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#repeat-dvc \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n│ workflow [WORKFLOW] Name of workflow defined in project.yml. Defaults to first │\n│ workflow if not set. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --verbose -V Print more info │\n│ --quiet -q Print less info │\n│ --force -F Force update DVC config │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", "project pull": " \n Usage: python -m spacy project pull [OPTIONS] [REMOTE] [PROJECT_DIR] \n \n Retrieve available precomputed outputs from a remote storage. You can alias remotes in your \n project.yml by mapping them to storage paths. A storage can be anything that the smart_open \n library can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_down-push \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ remote [REMOTE] Name or path of remote storage [default: default] │\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", "project push": " \n Usage: python -m spacy project push [OPTIONS] [REMOTE] [PROJECT_DIR] \n \n Persist outputs to a remote storage. You can alias remotes in your project.yml by mapping them to \n storage paths. A storage can be anything that the smart_open library can upload to, e.g. AWS, \n Google Cloud Storage, SSH, local directories etc. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_up-push \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ remote [REMOTE] Name or path of remote storage [default: default] │\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "train": " \n Usage: python -m spacy train [OPTIONS] CONFIG_PATH \n \n Train or update a spaCy pipeline. Requires data in spaCy's binary format. To convert data from \n other formats, use the `spacy convert` command. The config file includes all settings and \n hyperparameters used during training. To override settings in the config, e.g. settings that point \n to local paths or that you want to experiment with, you can override them as command line options. \n For instance, --training.batch_size 128 overrides the value of \"batch_size\" in the block \n \"[training]\". The --code argument lets you pass in a Python file that's imported before training. \n It can be used to register custom functions and architectures that can then be referenced in the \n config. 
\n \n DOCS: https://spacy.io/api/cli#train \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [default: None] [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output,--output-path -o PATH Output directory to store trained pipeline in │\n│ [default: None] │\n│ --code -c PATH Path to Python file with additional code │\n│ (registered functions) to be imported │\n│ [default: None] │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "train": " \n Usage: python -m spacy train [OPTIONS] CONFIG_PATH \n \n Train or update a spaCy pipeline. Requires data in spaCy's binary format. To convert data from \n other formats, use the `spacy convert` command. The config file includes all settings and \n hyperparameters used during training. To override settings in the config, e.g. settings that point \n to local paths or that you want to experiment with, you can override them as command line options. \n For instance, --training.batch_size 128 overrides the value of \"batch_size\" in the block \"\". The \n --code argument lets you pass in a Python file that's imported before training. It can be used to \n register custom functions and architectures that can then be referenced in the config. 
\n \n DOCS: https://spacy.io/api/cli#train \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output,--output-path -o PATH Output directory to store trained pipeline in │\n│ --code -c PATH Path to Python file with additional code │\n│ (registered functions) to be imported │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", "validate": " \n Usage: python -m spacy validate [OPTIONS] \n \n Validate the currently installed pipeline packages and spaCy version. Checks if the installed \n packages are compatible and shows upgrade instructions if available. Should be run after `pip \n install -U spacy`. \n \n DOCS: https://spacy.io/api/cli#validate \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" }, "errors": { - "missing_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ Missing command. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", - "unknown_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_COMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "missing_command": "", + "unknown_command": "", "unknown_subcommand": { - "benchmark": " \n Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]... \n \n Commands for benchmarking pipelines. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ accuracy Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in │\n│ the binary .spacy format. The --gold-preproc option sets up the evaluation examples │\n│ with gold-standard sentences and tokens for the predictions. Gold preprocessing helps │\n│ the annotations align to the tokenization, and may result in sequences of more │\n│ consistent length. However, it may reduce runtime accuracy due to train/test skew. To │\n│ render a sample of dependency parses in a HTML file, set as output directory as the │\n│ displacy_path argument. │\n│ speed Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in the │\n│ binary .spacy format. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug": " \n Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]... \n \n Suite of helpful commands for debugging and profiling. 
Includes commands to check and validate \n your config files, training and evaluation data, and custom model implementations. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ data Analyze, debug and validate your training and development data. Outputs useful │\n│ stats, and can help you find problems like invalid entity annotations, cyclic │\n│ dependencies, low data labels and more. │\n│ profile Profile which functions take the most time in a spaCy pipeline. Input should be │\n│ formatted as one JSON object per line with a key \"text\". It can either be provided │\n│ as a JSONL file, or be read from sys.sytdin. If no input file is specified, the │\n│ IMDB dataset is loaded via Thinc. │\n│ config Debug a config file and show validation errors. The command will create all │\n│ objects in the tree and validate them. Note that some config validation errors are │\n│ blocking and will prevent the rest of the config from being resolved. This means │\n│ that you may not see all validation errors at once and some issues are only shown │\n│ once previous errors have been fixed. Similar as with the 'train' command, you can │\n│ override settings from the config as command line options. For instance, │\n│ --training.batch_size 128 overrides the value of \"batch_size\" in the block │\n│ \"[training]\". │\n│ diff-config Show a diff of a config file with respect to spaCy's defaults or another config │\n│ file. If additional settings were used in the creation of the config file, then │\n│ you must supply these as extra parameters to the command when comparing to the │\n│ default settings. 
The generated diff can also be used when posting to the │\n│ discussion forum to provide more information for the maintainers. │\n│ model Analyze a Thinc model implementation. Includes checks for internal structure and │\n│ activations during training. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init": " \n Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]... \n \n Commands for initializing configs and pipeline packages. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ config Generate a starter config file for training. Based on your requirements specified │\n│ via the CLI arguments, this command generates a config with the optimal settings │\n│ for your use case. This includes the choice of architecture, pretrained weights │\n│ and related hyperparameters. │\n│ fill-config Fill partial config file with default values. Will add all missing settings from │\n│ the default config and will create all objects, check the registered functions for │\n│ their default values and update the base config. This command can be used with a │\n│ config generated via the training quickstart widget: │\n│ https://spacy.io/usage/training#quickstart │\n│ vectors Convert word vectors for use with spaCy. Will export an nlp object that you can │\n│ use in the [initialize] block of your config to initialize a model with vectors. │\n│ labels Generate JSON files for the labels in the data. This helps speed up the training │\n│ process, since spaCy won't have to preprocess the data to extract the labels. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project": " \n Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]... \n \n Command-line interface for spaCy projects and templates. You'd typically start by cloning a \n project template to a local directory and fetching its assets like datasets etc. See the project's \n project.yml for the available commands. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ assets Fetch project assets like datasets and pretrained weights. Assets are defined in the │\n│ \"assets\" section of the project.yml. If a checksum is provided in the project.yml, │\n│ the file is only downloaded if no local file with the same checksum exists. │\n│ clone Clone a project template from a repository. Calls into \"git\" and will only download │\n│ the files from the given subdirectory. The GitHub repo defaults to the official │\n│ Weasel template repo, but can be customized (including using a private repo). │\n│ document Auto-generate a README.md for a project. If the content is saved to a file, hidden │\n│ markers are added so you can add custom content before or after the auto-generated │\n│ section and only the auto-generated docs will be replaced when you re-run the │\n│ command. │\n│ dvc Auto-generate Data Version Control (DVC) config. A DVC project can only define one │\n│ pipeline, so you need to specify one workflow defined in the project.yml. If no │\n│ workflow is specified, the first defined workflow is used. The DVC config will only │\n│ be updated if the project.yml changed. │\n│ run Run a named command or workflow defined in the project.yml. 
If a workflow name is │\n│ specified, all commands in the workflow are run, in order. If commands define │\n│ dependencies and/or outputs, they will only be re-run if state has changed. │\n│ pull Retrieve available precomputed outputs from a remote storage. You can alias remotes │\n│ in your project.yml by mapping them to storage paths. A storage can be anything that │\n│ the smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH, local │\n│ directories etc. │\n│ push Persist outputs to a remote storage. You can alias remotes in your project.yml by │\n│ mapping them to storage paths. A storage can be anything that the smart_open library │\n│ can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + "benchmark": "", + "debug": "", + "init": "", + "project": "" } }, "group_help": { - "benchmark": " \n Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]... \n \n Commands for benchmarking pipelines. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ accuracy Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in │\n│ the binary .spacy format. The --gold-preproc option sets up the evaluation examples │\n│ with gold-standard sentences and tokens for the predictions. Gold preprocessing helps │\n│ the annotations align to the tokenization, and may result in sequences of more │\n│ consistent length. However, it may reduce runtime accuracy due to train/test skew. To │\n│ render a sample of dependency parses in a HTML file, set as output directory as the │\n│ displacy_path argument. 
│\n│ speed Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in the │\n│ binary .spacy format. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug": " \n Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]... \n \n Suite of helpful commands for debugging and profiling. Includes commands to check and validate \n your config files, training and evaluation data, and custom model implementations. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ data Analyze, debug and validate your training and development data. Outputs useful │\n│ stats, and can help you find problems like invalid entity annotations, cyclic │\n│ dependencies, low data labels and more. │\n│ profile Profile which functions take the most time in a spaCy pipeline. Input should be │\n│ formatted as one JSON object per line with a key \"text\". It can either be provided │\n│ as a JSONL file, or be read from sys.sytdin. If no input file is specified, the │\n│ IMDB dataset is loaded via Thinc. │\n│ config Debug a config file and show validation errors. The command will create all │\n│ objects in the tree and validate them. Note that some config validation errors are │\n│ blocking and will prevent the rest of the config from being resolved. This means │\n│ that you may not see all validation errors at once and some issues are only shown │\n│ once previous errors have been fixed. Similar as with the 'train' command, you can │\n│ override settings from the config as command line options. For instance, │\n│ --training.batch_size 128 overrides the value of \"batch_size\" in the block │\n│ \"[training]\". 
│\n│ diff-config Show a diff of a config file with respect to spaCy's defaults or another config │\n│ file. If additional settings were used in the creation of the config file, then │\n│ you must supply these as extra parameters to the command when comparing to the │\n│ default settings. The generated diff can also be used when posting to the │\n│ discussion forum to provide more information for the maintainers. │\n│ model Analyze a Thinc model implementation. Includes checks for internal structure and │\n│ activations during training. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init": " \n Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]... \n \n Commands for initializing configs and pipeline packages. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ config Generate a starter config file for training. Based on your requirements specified │\n│ via the CLI arguments, this command generates a config with the optimal settings │\n│ for your use case. This includes the choice of architecture, pretrained weights │\n│ and related hyperparameters. │\n│ fill-config Fill partial config file with default values. Will add all missing settings from │\n│ the default config and will create all objects, check the registered functions for │\n│ their default values and update the base config. This command can be used with a │\n│ config generated via the training quickstart widget: │\n│ https://spacy.io/usage/training#quickstart │\n│ vectors Convert word vectors for use with spaCy. Will export an nlp object that you can │\n│ use in the [initialize] block of your config to initialize a model with vectors. 
│\n│ labels Generate JSON files for the labels in the data. This helps speed up the training │\n│ process, since spaCy won't have to preprocess the data to extract the labels. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project": " \n Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]... \n \n Command-line interface for spaCy projects and templates. You'd typically start by cloning a \n project template to a local directory and fetching its assets like datasets etc. See the project's \n project.yml for the available commands. \n \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ assets Fetch project assets like datasets and pretrained weights. Assets are defined in the │\n│ \"assets\" section of the project.yml. If a checksum is provided in the project.yml, │\n│ the file is only downloaded if no local file with the same checksum exists. │\n│ clone Clone a project template from a repository. Calls into \"git\" and will only download │\n│ the files from the given subdirectory. The GitHub repo defaults to the official │\n│ Weasel template repo, but can be customized (including using a private repo). │\n│ document Auto-generate a README.md for a project. If the content is saved to a file, hidden │\n│ markers are added so you can add custom content before or after the auto-generated │\n│ section and only the auto-generated docs will be replaced when you re-run the │\n│ command. │\n│ dvc Auto-generate Data Version Control (DVC) config. A DVC project can only define one │\n│ pipeline, so you need to specify one workflow defined in the project.yml. 
If no │\n│ workflow is specified, the first defined workflow is used. The DVC config will only │\n│ be updated if the project.yml changed. │\n│ run Run a named command or workflow defined in the project.yml. If a workflow name is │\n│ specified, all commands in the workflow are run, in order. If commands define │\n│ dependencies and/or outputs, they will only be re-run if state has changed. │\n│ pull Retrieve available precomputed outputs from a remote storage. You can alias remotes │\n│ in your project.yml by mapping them to storage paths. A storage can be anything that │\n│ the smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH, local │\n│ directories etc. │\n│ push Persist outputs to a remote storage. You can alias remotes in your project.yml by │\n│ mapping them to storage paths. A storage can be anything that the smart_open library │\n│ can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + "benchmark": " \n Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]... \n \n Commands for benchmarking pipelines. \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ accuracy Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation │\n│ data in the binary .spacy format. The --gold-preproc option sets up the │\n│ evaluation examples with gold-standard sentences and tokens for the │\n│ predictions. Gold preprocessing helps the annotations align to the │\n│ tokenization, and may result in sequences of more consistent length. However, │\n│ it may reduce runtime accuracy due to train/test skew. 
To render a sample of │\n│ dependency parses in a HTML file, set as output directory as the │\n│ displacy_path argument. │\n│ speed Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark │\n│ data in the binary .spacy format. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "debug": " \n Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]... \n \n Suite of helpful commands for debugging and profiling. Includes commands to check and validate \n your config files, training and evaluation data, and custom model implementations. \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ data Analyze, debug and validate your training and development data. Outputs │\n│ useful stats, and can help you find problems like invalid entity annotations, │\n│ cyclic dependencies, low data labels and more. │\n│ profile Profile which functions take the most time in a spaCy pipeline. │\n│ Input should be formatted as one JSON object per line with a key \"text\". │\n│ It can either be provided as a JSONL file, or be read from sys.sytdin. │\n│ If no input file is specified, the IMDB dataset is loaded via Thinc. │\n│ config Debug a config file and show validation errors. The command will │\n│ create all objects in the tree and validate them. Note that some config │\n│ validation errors are blocking and will prevent the rest of the config from │\n│ being resolved. This means that you may not see all validation errors at │\n│ once and some issues are only shown once previous errors have been fixed. │\n│ Similar as with the 'train' command, you can override settings from the config │\n│ as command line options. 
For instance, --training.batch_size 128 overrides │\n│ the value of \"batch_size\" in the block \"\". │\n│ diff-config Show a diff of a config file with respect to spaCy's defaults or another config │\n│ file. If │\n│ additional settings were used in the creation of the config file, then you │\n│ must supply these as extra parameters to the command when comparing to the default │\n│ settings. The generated diff │\n│ can also be used when posting to the discussion forum to provide more │\n│ information for the maintainers. │\n│ model Analyze a Thinc model implementation. Includes checks for internal structure │\n│ and activations during training. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "init": " \n Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]... \n \n Commands for initializing configs and pipeline packages. \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ config Generate a starter config file for training. Based on your requirements │\n│ specified via the CLI arguments, this command generates a config with the │\n│ optimal settings for your use case. This includes the choice of architecture, │\n│ pretrained weights and related hyperparameters. │\n│ fill-config Fill partial config file with default values. Will add all missing settings │\n│ from the default config and will create all objects, check the registered │\n│ functions for their default values and update the base config. This command │\n│ can be used with a config generated via the training quickstart widget: │\n│ https://spacy.io/usage/training#quickstart │\n│ vectors Convert word vectors for use with spaCy. 
Will export an nlp object that │\n│ you can use in the block of your config to initialize │\n│ a model with vectors. │\n│ labels Generate JSON files for the labels in the data. This helps speed up the │\n│ training process, since spaCy won't have to preprocess the data to │\n│ extract the labels. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", + "project": " \n Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]... \n \n Command-line interface for spaCy projects and templates. You'd typically start by cloning a \n project template to a local directory and fetching its assets like datasets etc. See the project's \n project.yml for the available commands. \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ assets Fetch project assets like datasets and pretrained weights. Assets are │\n│ defined in the \"assets\" section of the project.yml. If a checksum is │\n│ provided in the project.yml, the file is only downloaded if no local file │\n│ with the same checksum exists. │\n│ clone Clone a project template from a repository. Calls into \"git\" and will │\n│ only download the files from the given subdirectory. The GitHub repo │\n│ defaults to the official Weasel template repo, but can be customized │\n│ (including using a private repo). │\n│ document Auto-generate a README.md for a project. If the content is saved to a file, │\n│ hidden markers are added so you can add custom content before or after the │\n│ auto-generated section and only the auto-generated docs will be replaced │\n│ when you re-run the command. │\n│ dvc Auto-generate Data Version Control (DVC) config. 
A DVC │\n│ project can only define one pipeline, so you need to specify one workflow │\n│ defined in the project.yml. If no workflow is specified, the first defined │\n│ workflow is used. The DVC config will only be updated if the project.yml │\n│ changed. │\n│ run Run a named command or workflow defined in the project.yml. If a workflow │\n│ name is specified, all commands in the workflow are run, in order. If │\n│ commands define dependencies and/or outputs, they will only be re-run if │\n│ state has changed. │\n│ pull Retrieve available precomputed outputs from a remote storage. │\n│ You can alias remotes in your project.yml by mapping them to storage paths. │\n│ A storage can be anything that the smart_open library can upload to, e.g. │\n│ AWS, Google Cloud Storage, SSH, local directories etc. │\n│ push Persist outputs to a remote storage. You can alias remotes in your │\n│ project.yml by mapping them to storage paths. A storage can be anything that │\n│ the smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH, │\n│ local directories etc. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" }, "hidden_group_commands": { "benchmark": [], @@ -114,5 +114,5 @@ "train", "validate" ], - "root_help": " \n Usage: python -m spacy [OPTIONS] COMMAND [ARGS]... \n \n spaCy Command-line Interface \n \n DOCS: https://spacy.io/api/cli \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --install-completion Install completion for the current shell. │\n│ --show-completion Show completion for the current shell, to copy it or customize the │\n│ installation. │\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ download Download compatible trained pipeline from the default download path using pip. │\n│ If --direct flag is set, the command expects the full package name with │\n│ version. For direct downloads, the compatibility check will be skipped. All │\n│ additional arguments provided to this command will be passed to `pip install` │\n│ on package installation. │\n│ info Print info about spaCy installation. If a pipeline is specified as an argument, │\n│ print its meta information. Flag --markdown prints details in Markdown for easy │\n│ copy-pasting to GitHub issues. │\n│ apply Apply a trained pipeline to documents to get predictions. Expects a loadable │\n│ spaCy pipeline and path to the data, which can be a directory or a file. The │\n│ data files can be provided in multiple formats: 1. .spacy files 2. │\n│ .jsonl files with a specified \"field\" to read the text from. 3. Files with │\n│ any other extension are assumed to be containing a single document. │\n│ DOCS: https://spacy.io/api/cli#apply │\n│ assemble Assemble a spaCy pipeline from a config file. The config file includes all │\n│ settings for initializing the pipeline. To override settings in the config, │\n│ e.g. settings that point to local paths or that you want to experiment with, │\n│ you can override them as command line options. The --code argument lets you │\n│ pass in a Python file that can be used to register custom functions that are │\n│ referenced in the config. │\n│ convert Convert files into json or DocBin format for training. The resulting .spacy │\n│ file can be used with the train command and other experiment management │\n│ functions. │\n│ evaluate Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation │\n│ data in the binary .spacy format. 
The --gold-preproc option sets up the │\n│ evaluation examples with gold-standard sentences and tokens for the │\n│ predictions. Gold preprocessing helps the annotations align to the │\n│ tokenization, and may result in sequences of more consistent length. However, │\n│ it may reduce runtime accuracy due to train/test skew. To render a sample of │\n│ dependency parses in a HTML file, set as output directory as the displacy_path │\n│ argument. │\n│ find-function Find the module, path and line number to the file the registered function is │\n│ defined in, if available. │\n│ find-threshold Runs prediction trials for a trained model with varying thresholds to maximize │\n│ the specified metric. The search space for the threshold is traversed linearly │\n│ from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` │\n│ (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` │\n│ returns all results). │\n│ package Generate an installable Python package for a pipeline. Includes binary data, │\n│ meta and required installation files. A new directory will be created in the │\n│ specified output directory, and the data will be copied over. If --create-meta │\n│ is set and a meta.json already exists in the output directory, the existing │\n│ values will be used as the defaults in the command-line prompt. After │\n│ packaging, \"python -m build --sdist\" is run in the package directory, which │\n│ will create a .tar.gz archive that can be installed via \"pip install\". │\n│ pretrain Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using │\n│ an approximate language-modelling objective. Two objective types are available, │\n│ vector-based and character-based. │\n│ train Train or update a spaCy pipeline. Requires data in spaCy's binary format. To │\n│ convert data from other formats, use the `spacy convert` command. The config │\n│ file includes all settings and hyperparameters used during training. 
To │\n│ override settings in the config, e.g. settings that point to local paths or │\n│ that you want to experiment with, you can override them as command line │\n│ options. For instance, --training.batch_size 128 overrides the value of │\n│ \"batch_size\" in the block \"[training]\". The --code argument lets you pass in a │\n│ Python file that's imported before training. It can be used to register custom │\n│ functions and architectures that can then be referenced in the config. │\n│ validate Validate the currently installed pipeline packages and spaCy version. Checks if │\n│ the installed packages are compatible and shows upgrade instructions if │\n│ available. Should be run after `pip install -U spacy`. │\n│ debug Suite of helpful commands for debugging and profiling. Includes commands to │\n│ check and validate your config files, training and evaluation data, and custom │\n│ model implementations. │\n│ benchmark Commands for benchmarking pipelines. │\n│ init Commands for initializing configs and pipeline packages. │\n│ project Command-line interface for spaCy projects and templates. You'd typically start │\n│ by cloning a project template to a local directory and fetching its assets like │\n│ datasets etc. See the project's project.yml for the available commands. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + "root_help": " \n Usage: python -m spacy [OPTIONS] COMMAND [ARGS]... \n \n spaCy Command-line Interface \n \n DOCS: https://spacy.io/api/cli \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --install-completion Install completion for the current shell. │\n│ --show-completion Show completion for the current shell, to copy it or customize the │\n│ installation. │\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ download Download compatible trained pipeline from the default download path using │\n│ pip. If --direct flag is set, the command expects the full package name with │\n│ version. For direct downloads, the compatibility check will be skipped. All │\n│ additional arguments provided to this command will be passed to `pip install` │\n│ on package installation. │\n│ info Print info about spaCy installation. If a pipeline is specified as an argument, │\n│ print its meta information. Flag --markdown prints details in Markdown for easy │\n│ copy-pasting to GitHub issues. │\n│ apply Apply a trained pipeline to documents to get predictions. │\n│ Expects a loadable spaCy pipeline and path to the data, which │\n│ can be a directory or a file. │\n│ The data files can be provided in multiple formats: │\n│ 1. .spacy files │\n│ 2. .jsonl files with a specified \"field\" to read the text from. │\n│ 3. Files with any other extension are assumed to be containing │\n│ a single document. │\n│ DOCS: https://spacy.io/api/cli#apply │\n│ assemble Assemble a spaCy pipeline from a config file. The config file includes │\n│ all settings for initializing the pipeline. To override settings in the │\n│ config, e.g. settings that point to local paths or that you want to │\n│ experiment with, you can override them as command line options. The │\n│ --code argument lets you pass in a Python file that can be used to │\n│ register custom functions that are referenced in the config. │\n│ convert Convert files into json or DocBin format for training. The resulting .spacy │\n│ file can be used with the train command and other experiment management │\n│ functions. │\n│ evaluate Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation │\n│ data in the binary .spacy format. 
The --gold-preproc option sets up the │\n│ evaluation examples with gold-standard sentences and tokens for the │\n│ predictions. Gold preprocessing helps the annotations align to the │\n│ tokenization, and may result in sequences of more consistent length. However, │\n│ it may reduce runtime accuracy due to train/test skew. To render a sample of │\n│ dependency parses in a HTML file, set as output directory as the │\n│ displacy_path argument. │\n│ find-function Find the module, path and line number to the file the registered │\n│ function is defined in, if available. │\n│ find-threshold Runs prediction trials for a trained model with varying thresholds to maximize │\n│ the specified metric. The search space for the threshold is traversed linearly │\n│ from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` │\n│ (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` │\n│ returns all results). │\n│ package Generate an installable Python package for a pipeline. Includes binary data, │\n│ meta and required installation files. A new directory will be created in the │\n│ specified output directory, and the data will be copied over. If │\n│ --create-meta is set and a meta.json already exists in the output directory, │\n│ the existing values will be used as the defaults in the command-line prompt. │\n│ After packaging, \"python -m build --sdist\" is run in the package directory, │\n│ which will create a .tar.gz archive that can be installed via \"pip install\". │\n│ pretrain Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, │\n│ using an approximate language-modelling objective. Two objective types │\n│ are available, vector-based and character-based. │\n│ train Train or update a spaCy pipeline. Requires data in spaCy's binary format. To │\n│ convert data from other formats, use the `spacy convert` command. The │\n│ config file includes all settings and hyperparameters used during training. 
│\n│ To override settings in the config, e.g. settings that point to local │\n│ paths or that you want to experiment with, you can override them as │\n│ command line options. For instance, --training.batch_size 128 overrides │\n│ the value of \"batch_size\" in the block \"\". The --code argument │\n│ lets you pass in a Python file that's imported before training. It can be │\n│ used to register custom functions and architectures that can then be │\n│ referenced in the config. │\n│ validate Validate the currently installed pipeline packages and spaCy version. Checks │\n│ if the installed packages are compatible and shows upgrade instructions if │\n│ available. Should be run after `pip install -U spacy`. │\n│ debug Suite of helpful commands for debugging and profiling. Includes │\n│ commands to check and validate your config files, training and evaluation data, │\n│ and custom model implementations. │\n│ benchmark Commands for benchmarking pipelines. │\n│ init Commands for initializing configs and pipeline packages. │\n│ project Command-line interface for spaCy projects and templates. │\n│ You'd typically start by cloning a project template to a local directory and │\n│ fetching its assets like datasets etc. See the project's project.yml for the │\n│ available commands. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" } From 8a318dbae5eaa8119a7f0be9ee8a3f168221ae27 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Mar 2026 16:12:26 +0100 Subject: [PATCH 35/42] Fix manifest --- spacy_cli/build_manifest.py | 6 +++--- spacy_cli/cli_manifest.json | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/spacy_cli/build_manifest.py b/spacy_cli/build_manifest.py index 6e019bdcb95..71982d82d77 100644 --- a/spacy_cli/build_manifest.py +++ b/spacy_cli/build_manifest.py @@ -58,7 +58,7 @@ def build_manifest() -> Dict[str, object]: group_help[name] = _get_help(runner, app, [name]) unknown_subcommand[name] = _invoke( runner, app, [name, UNKNOWN_SUBCOMMAND_TOKEN] - ).stdout + ).output for sub_name in subcommands: help_text = _maybe_get_help(runner, app, [name, sub_name]) if help_text is not None: @@ -76,8 +76,8 @@ def build_manifest() -> Dict[str, object]: "group_help": group_help, "command_help": command_help, "errors": { - "missing_command": _invoke(runner, app, []).stdout, - "unknown_command": _invoke(runner, app, [UNKNOWN_COMMAND_TOKEN]).stdout, + "missing_command": _invoke(runner, app, []).output, + "unknown_command": _invoke(runner, app, [UNKNOWN_COMMAND_TOKEN]).output, "unknown_subcommand": unknown_subcommand, }, } diff --git a/spacy_cli/cli_manifest.json b/spacy_cli/cli_manifest.json index 12d760416c5..9aa7da9b973 100644 --- a/spacy_cli/cli_manifest.json +++ b/spacy_cli/cli_manifest.json @@ -36,13 +36,13 @@ "validate": " \n Usage: python -m spacy validate [OPTIONS] \n \n Validate the currently installed pipeline packages and spaCy version. Checks if the installed \n packages are compatible and shows upgrade instructions if available. Should be run after `pip \n install -U spacy`. 
\n \n DOCS: https://spacy.io/api/cli#validate \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" }, "errors": { - "missing_command": "", - "unknown_command": "", + "missing_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ Missing command. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "unknown_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_COMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "unknown_subcommand": { - "benchmark": "", - "debug": "", - "init": "", - "project": "" + "benchmark": "Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy benchmark --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_SUBCOMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "debug": "Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy debug --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_SUBCOMMAND__'. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "init": "Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy init --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_SUBCOMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "project": "Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy project --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_SUBCOMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n" } }, "group_help": { From 4967496dbdd7696334cb8e5489d853711247eb7f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Mar 2026 09:12:08 +0100 Subject: [PATCH 36/42] Update test_cli_launcher --- spacy/tests/test_cli_launcher.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/spacy/tests/test_cli_launcher.py b/spacy/tests/test_cli_launcher.py index 392572fdeec..1fd6e3a6fc6 100644 --- a/spacy/tests/test_cli_launcher.py +++ b/spacy/tests/test_cli_launcher.py @@ -4,7 +4,6 @@ import pytest -from spacy_cli.build_manifest import build_manifest from spacy_cli.static import load_manifest launcher_module = importlib.import_module("spacy_cli.main") @@ -46,7 +45,21 @@ def test_load_for_argv_imports_project_on_demand(): def test_manifest_is_current(): - assert build_manifest() == load_manifest() + # Run in a subprocess to avoid command registration order being affected + # by other test modules importing CLI submodules (which register commands + # as a side effect of import). 
+ result = subprocess.run( + [ + sys.executable, + "-c", + "from spacy_cli.build_manifest import build_manifest; " + "from spacy_cli.static import load_manifest; " + "assert build_manifest() == load_manifest()", + ], + capture_output=True, + text=True, + ) + assert result.returncode == 0, result.stderr def test_launcher_root_help_uses_static(capsys, monkeypatch): From 93547fe2e8ceae6cae66f3caf31c0148b42d9dea Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 23 Mar 2026 14:56:27 +0100 Subject: [PATCH 37/42] Fix lint issues across PR files - setup.py: rename loop variable shadowing parameter (B020) - _util.py: remove unused registry import (F401), use specific except clause (E722, B904) - test_cli_app.py: use dict literals instead of dict() (C408) - main.py: extract _try_static_group to reduce complexity (C901) --- setup.py | 8 +++---- spacy/cli/_util.py | 5 ++-- spacy/tests/test_cli_app.py | 48 ++++++++++++++++++------------------- spacy_cli/main.py | 30 +++++++++++++---------- 4 files changed, 47 insertions(+), 44 deletions(-) diff --git a/setup.py b/setup.py index 2de619c720a..e18e98b9249 100755 --- a/setup.py +++ b/setup.py @@ -158,10 +158,10 @@ def _minimal_ext_cmd(cmd): def clean(path): - for path in path.glob("**/*"): - if path.is_file() and path.suffix in (".so", ".cpp", ".html"): - print(f"Deleting {path.name}") - path.unlink() + for child in path.glob("**/*"): + if child.is_file() and child.suffix in (".so", ".cpp", ".html"): + print(f"Deleting {child.name}") + child.unlink() def setup_package(): diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 8a8b4e0f417..35f899b2cf8 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -26,7 +26,6 @@ ENV_VARS, import_file, logger, - registry, run_command, ) @@ -206,8 +205,8 @@ def get_git_version( """ try: ret = run_command("git --version", capture=True) - except: - raise RuntimeError(error) + except Exception as err: + raise RuntimeError(error) from err stdout = ret.stdout.strip() if not stdout 
or not stdout.startswith("git version"): return 0, 0 diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 7ff3f755a31..2b3aed04060 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -288,30 +288,30 @@ def test_find_function_invalid(): example_ents = ["O", "O", "I-ANIMAL"] example_spans = [(2, 3, "ANIMAL")] -TRAIN_EXAMPLE_1 = dict( - words=example_words_1, - lemmas=example_lemmas_1, - tags=example_tags, - morphs=example_morphs, - deps=example_deps, - heads=[1, 1, 1], - pos=example_pos, - ents=example_ents, - spans=example_spans, - cats={"CAT": 1.0, "DOG": 0.0}, -) -TRAIN_EXAMPLE_2 = dict( - words=example_words_2, - lemmas=example_lemmas_2, - tags=example_tags, - morphs=example_morphs, - deps=example_deps, - heads=[1, 1, 1], - pos=example_pos, - ents=example_ents, - spans=example_spans, - cats={"CAT": 0.0, "DOG": 1.0}, -) +TRAIN_EXAMPLE_1 = { + "words": example_words_1, + "lemmas": example_lemmas_1, + "tags": example_tags, + "morphs": example_morphs, + "deps": example_deps, + "heads": [1, 1, 1], + "pos": example_pos, + "ents": example_ents, + "spans": example_spans, + "cats": {"CAT": 1.0, "DOG": 0.0}, +} +TRAIN_EXAMPLE_2 = { + "words": example_words_2, + "lemmas": example_lemmas_2, + "tags": example_tags, + "morphs": example_morphs, + "deps": example_deps, + "heads": [1, 1, 1], + "pos": example_pos, + "ents": example_ents, + "spans": example_spans, + "cats": {"CAT": 0.0, "DOG": 1.0}, +} @pytest.mark.slow diff --git a/spacy_cli/main.py b/spacy_cli/main.py index cbe6e376c58..f8e6cabe808 100644 --- a/spacy_cli/main.py +++ b/spacy_cli/main.py @@ -38,24 +38,28 @@ def _try_static(argv: Iterable[str]): template = manifest["errors"]["unknown_command"] return template.replace(UNKNOWN_COMMAND_TOKEN, first), 2 if first in known_groups: - if len(args) == 1 or args[1] in HELP_OPTIONS: - if plugin_command_names: - return None - return manifest["group_help"][first], 0 - second = args[1] - if second not in known_groups[first]: - if 
plugin_command_names: - return None - template = manifest["errors"]["unknown_subcommand"][first] - return template.replace(UNKNOWN_SUBCOMMAND_TOKEN, second), 2 - if any(arg in HELP_OPTIONS for arg in args[2:]): - return manifest["command_help"][f"{first} {second}"], 0 - return None + return _try_static_group(args, first, manifest, known_groups, plugin_command_names) if any(arg in HELP_OPTIONS for arg in args[1:]): return manifest["command_help"][first], 0 return None +def _try_static_group(args, first, manifest, known_groups, plugin_command_names): + if len(args) == 1 or args[1] in HELP_OPTIONS: + if plugin_command_names: + return None + return manifest["group_help"][first], 0 + second = args[1] + if second not in known_groups[first]: + if plugin_command_names: + return None + template = manifest["errors"]["unknown_subcommand"][first] + return template.replace(UNKNOWN_SUBCOMMAND_TOKEN, second), 2 + if any(arg in HELP_OPTIONS for arg in args[2:]): + return manifest["command_help"][f"{first} {second}"], 0 + return None + + def main(argv: Optional[Iterable[str]] = None) -> None: args = sys.argv[1:] if argv is None else list(argv) try: From c5bcffdbf89df6a3c262467d93d30d7a15e6203a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 23 Mar 2026 14:58:26 +0100 Subject: [PATCH 38/42] Fix import sorting (ruff I001) for CI validation --- spacy/cli/__init__.py | 2 +- spacy/cli/_dispatch.py | 1 - spacy/tests/test_cli_app.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index ded45efe9f7..f176a2eabad 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -12,8 +12,8 @@ PUBLIC_ATTRS, SUBCOMMAND_MODULES, TOP_LEVEL_MODULES, + iter_builtin_modules, ) -from ._dispatch import iter_builtin_modules from ._util import COMMAND, add_project_cli, app HELP_OPTIONS = {"--help", "-h"} diff --git a/spacy/cli/_dispatch.py b/spacy/cli/_dispatch.py index e1975dd7e1d..5ee4f654d39 100644 --- 
a/spacy/cli/_dispatch.py +++ b/spacy/cli/_dispatch.py @@ -1,6 +1,5 @@ from typing import Dict, Iterable, Optional, Tuple - CommandPath = Tuple[str, ...] diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 2b3aed04060..c72e26c3444 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -11,7 +11,6 @@ from .util import make_tempdir, normalize_whitespace - load_all_commands() From ec786c85ad8252b9e93351a17ea9886f310205d0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 23 Mar 2026 14:59:58 +0100 Subject: [PATCH 39/42] Add local lint script matching CI validate + mypy checks --- lint.sh | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100755 lint.sh diff --git a/lint.sh b/lint.sh new file mode 100755 index 00000000000..0ec0bda3f6b --- /dev/null +++ b/lint.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# Local lint script matching the CI Validate job + mypy type checks. +# Fixes formatting and import sorting in-place, then re-verifies in +# check mode to catch any conflicts between the two, and runs mypy. +set -euo pipefail + +err=0 + +echo "==> ruff format (auto-fixing)" +python -m ruff format spacy + +echo "==> ruff isort (auto-fixing)" +python -m ruff check spacy --select I --fix + +echo "==> ruff format (verify)" +if ! python -m ruff format spacy --check; then + echo "FAIL: isort fix broke formatting" + err=1 +fi + +echo "==> ruff isort (verify)" +if ! python -m ruff check spacy --select I; then + echo "FAIL: format fix broke import sorting" + err=1 +fi + +echo "==> mypy" +if ! 
python -m mypy spacy; then + err=1 +fi + +if [ "$err" -ne 0 ]; then + echo "FAIL: see errors above" + exit 1 +fi + +echo "OK: all checks passed" From cb67fe175af7becd715db442a84d1de2a892509f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 23 Mar 2026 15:19:01 +0100 Subject: [PATCH 40/42] Regenerate CLI manifest for typer 0.24.1 plain-text output --- spacy_cli/cli_manifest.json | 88 ++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/spacy_cli/cli_manifest.json b/spacy_cli/cli_manifest.json index 9aa7da9b973..e756c058eb3 100644 --- a/spacy_cli/cli_manifest.json +++ b/spacy_cli/cli_manifest.json @@ -1,55 +1,55 @@ { "command": "python -m spacy", "command_help": { - "apply": " \n Usage: python -m spacy apply [OPTIONS] MODEL DATA_PATH OUTPUT_FILE \n \n Apply a trained pipeline to documents to get predictions. Expects a loadable spaCy pipeline and \n path to the data, which can be a directory or a file. The data files can be provided in multiple \n formats: 1. .spacy files 2. .jsonl files with a specified \"field\" to read the text from. \n 3. Files with any other extension are assumed to be containing a single document. DOCS: \n https://spacy.io/api/cli#apply \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of the documents to predict on. Can be a single file in │\n│ .spacy format or a .jsonl file. Files with other extensions are │\n│ treated as single plain text documents. If a directory is provided │\n│ it is traversed recursively to grab all files to be processed. The │\n│ files can be a mixture of .spacy, .jsonl and text files. If .jsonl │\n│ is provided the specified field is going to be grabbed (\"text\" by │\n│ default). 
│\n│ [required] │\n│ * output_file FILE Path to save the resulting .spacy file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ --text-key -tk TEXT Key containing text string for JSONL [default: text] │\n│ --force -F Force overwriting the output file │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU. [default: -1] │\n│ --batch-size -b INTEGER Batch size. [default: 1] │\n│ --n-process -n INTEGER number of processors to use. [default: 1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "assemble": " \n Usage: python -m spacy assemble [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n Assemble a spaCy pipeline from a config file. The config file includes all settings for \n initializing the pipeline. To override settings in the config, e.g. settings that point to local \n paths or that you want to experiment with, you can override them as command line options. The \n --code argument lets you pass in a Python file that can be used to register custom functions that \n are referenced in the config. 
\n \n DOCS: https://spacy.io/api/cli#assemble \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n│ * output_path PATH Output directory to store assembled pipeline in [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) to │\n│ be imported │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "benchmark accuracy": " \n Usage: python -m spacy benchmark accuracy [OPTIONS] MODEL DATA_PATH \n \n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in the binary \n .spacy format. The --gold-preproc option sets up the evaluation examples with gold-standard \n sentences and tokens for the predictions. Gold preprocessing helps the annotations align to the \n tokenization, and may result in sequences of more consistent length. However, it may reduce \n runtime accuracy due to train/test skew. To render a sample of dependency parses in a HTML file, \n set as output directory as the displacy_path argument. 
\n \n DOCS: https://spacy.io/api/cli#benchmark-accuracy \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o FILE Output JSON file for metrics │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --displacy-path -dp DIRECTORY Directory to output rendered parses as HTML │\n│ --displacy-limit -dl INTEGER Limit of parses to render as HTML [default: 25] │\n│ --per-component -P Return scores per component, only applicable when an │\n│ output JSON file is specified. │\n│ --spans-key -sk TEXT Spans key to use when evaluating Doc.spans [default: sc] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "benchmark speed": " \n Usage: python -m spacy benchmark speed [OPTIONS] MODEL DATA_PATH \n \n Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in the binary .spacy \n format. 
\n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --batch-size -b INTEGER RANGE [x>=1] Override the pipeline batch size │\n│ --no-shuffle Do not shuffle benchmark data │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --batches INTEGER RANGE [x>=30] Minimum number of batches to benchmark │\n│ [default: 50] │\n│ --warmup -w INTEGER RANGE [x>=0] Number of iterations over the data for warmup │\n│ [default: 3] │\n│ --code -c PATH Path to Python file with additional code │\n│ (registered functions) to be imported │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "convert": " \n Usage: python -m spacy convert [OPTIONS] INPUT_PATH [OUTPUT_DIR] \n \n Convert files into json or DocBin format for training. The resulting .spacy file can be used with \n the train command and other experiment management functions. \n \n If no output_dir is specified and the output format is JSON, the data \n is written to stdout, so you can pipe them forward to a JSON file: \n $ spacy convert some_file.conllu --file-type json > some_file.json \n \n DOCS: https://spacy.io/api/cli#convert \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * input_path TEXT Input file or directory [required] │\n│ output_dir [OUTPUT_DIR] Output directory. '-' for stdout. 
[default: -] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --file-type -t [json|spacy] Type of data to produce [default: spacy] │\n│ --n-sents -n INTEGER Number of sentences per doc (0 to disable) │\n│ [default: 1] │\n│ --seg-sents -s Segment sentences (for -c ner) │\n│ --model,--base -b TEXT Trained spaCy pipeline for sentence segmentation to │\n│ use as base (for --seg-sents) │\n│ --morphology -m Enable appending morphology to tags │\n│ --merge-subtokens -T Merge CoNLL-U subtokens │\n│ --converter -c TEXT Converter: ('conllubio', 'conllu', 'conll', 'ner', │\n│ 'iob', 'json') │\n│ [default: auto] │\n│ --ner-map -nm PATH NER tag mapping (as JSON-encoded dict of entity types) │\n│ --lang -l TEXT Language (if tokenizer required) │\n│ --concatenate -C Concatenate output to a single file │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug config": " \n Usage: python -m spacy debug config [OPTIONS] CONFIG_PATH \n \n Debug a config file and show validation errors. The command will create all objects in the tree \n and validate them. Note that some config validation errors are blocking and will prevent the rest \n of the config from being resolved. This means that you may not see all validation errors at once \n and some issues are only shown once previous errors have been fixed. Similar as with the 'train' \n command, you can override settings from the config as command line options. For instance, \n --training.batch_size 128 overrides the value of \"batch_size\" in the block \"\". 
\n \n DOCS: https://spacy.io/api/cli#debug-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --show-functions -F Show an overview of all registered functions used in the │\n│ config and where they come from (modules, files etc.) │\n│ --show-variables -V Show an overview of all variables referenced in the config and │\n│ their values. This will also reflect variables overwritten on │\n│ the CLI. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug data": " \n Usage: python -m spacy debug data [OPTIONS] CONFIG_PATH \n \n Analyze, debug and validate your training and development data. Outputs useful stats, and can help \n you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. 
\n \n DOCS: https://spacy.io/api/cli#debug-data \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --ignore-warnings -IW Ignore warnings, only show stats and errors │\n│ --verbose -V Print additional information and explanations │\n│ --no-format -NF Don't pretty-print the results │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug diff-config": " \n Usage: python -m spacy debug diff-config [OPTIONS] CONFIG_PATH \n \n Show a diff of a config file with respect to spaCy's defaults or another config file. If \n additional settings were used in the creation of the config file, then you must supply these as \n extra parameters to the command when comparing to the default settings. The generated diff can \n also be used when posting to the discussion forum to provide more information for the maintainers. \n \n The `optimize`, `gpu`, and `pretraining` options are only relevant when \n comparing against the default configuration (or specifically when `compare_to` is None). 
\n \n DOCS: https://spacy.io/api/cli#debug-diff \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --compare-to PATH Path to a config file to diff against, or │\n│ `None` to compare against default settings │\n│ --optimize -o [efficiency|accuracy] Whether the user config was optimized for │\n│ efficiency or accuracy. Only relevant when │\n│ comparing against the default config. │\n│ [default: efficiency] │\n│ --gpu -G Whether the original config can run on a │\n│ GPU. Only relevant when comparing against │\n│ the default config. │\n│ --pretraining,--pt Whether to compare on a config with │\n│ pretraining involved. Only relevant when │\n│ comparing against the default config. │\n│ --markdown -md Generate Markdown for GitHub issues │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug model": " \n Usage: python -m spacy debug model [OPTIONS] CONFIG_PATH COMPONENT \n \n Analyze a Thinc model implementation. Includes checks for internal structure and activations \n during training. 
\n \n DOCS: https://spacy.io/api/cli#debug-model \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n│ * component TEXT Name of the pipeline component of which the model should be analysed │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --layers -l TEXT Comma-separated names of layer IDs to print │\n│ --dimensions -DIM Show dimensions │\n│ --parameters -PAR Show parameters │\n│ --gradients -GRAD Show gradients │\n│ --attributes -ATTR Show attributes │\n│ --print-step0 -P0 Print model before training │\n│ --print-step1 -P1 Print model after initialization │\n│ --print-step2 -P2 Print model after training │\n│ --print-step3 -P3 Print final predictions │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug profile": " \n Usage: python -m spacy debug profile [OPTIONS] MODEL [INPUTS] \n \n Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one \n JSON object per line with a key \"text\". It can either be provided as a JSONL file, or be read from \n sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. \n \n DOCS: https://spacy.io/api/cli#debug-profile \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Trained pipeline to load [required] │\n│ inputs [INPUTS] Location of input file. '-' for stdin. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n-texts -n INTEGER Maximum number of texts to use if available [default: 10000] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug-data": " \n Usage: python -m spacy debug-data [OPTIONS] CONFIG_PATH \n \n Analyze, debug and validate your training and development data. Outputs useful stats, and can help \n you find problems like invalid entity annotations, cyclic dependencies, low data labels and more. \n \n DOCS: https://spacy.io/api/cli#debug-data \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --ignore-warnings -IW Ignore warnings, only show stats and errors │\n│ --verbose -V Print additional information and explanations │\n│ --no-format -NF Don't pretty-print the results │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "download": " \n Usage: python -m spacy download [OPTIONS] MODEL \n \n Download compatible trained pipeline from the default download path using pip. If --direct flag is \n set, the command expects the full package name with version. For direct downloads, the \n compatibility check will be skipped. All additional arguments provided to this command will be \n passed to `pip install` on package installation. 
\n \n DOCS: https://spacy.io/api/cli#download \n AVAILABLE PACKAGES: https://spacy.io/models \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Name of pipeline package to download [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --direct -d,-D Force direct download of name + version │\n│ --sdist -S Download sdist (.tar.gz) archive instead of pre-built binary wheel │\n│ --url -U TEXT Download from given url │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "evaluate": " \n Usage: python -m spacy evaluate [OPTIONS] MODEL DATA_PATH \n \n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation data in the binary \n .spacy format. The --gold-preproc option sets up the evaluation examples with gold-standard \n sentences and tokens for the predictions. Gold preprocessing helps the annotations align to the \n tokenization, and may result in sequences of more consistent length. However, it may reduce \n runtime accuracy due to train/test skew. To render a sample of dependency parses in a HTML file, \n set as output directory as the displacy_path argument. 
\n \n DOCS: https://spacy.io/api/cli#benchmark-accuracy \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o FILE Output JSON file for metrics │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --displacy-path -dp DIRECTORY Directory to output rendered parses as HTML │\n│ --displacy-limit -dl INTEGER Limit of parses to render as HTML [default: 25] │\n│ --per-component -P Return scores per component, only applicable when an │\n│ output JSON file is specified. │\n│ --spans-key -sk TEXT Spans key to use when evaluating Doc.spans [default: sc] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "find-function": " \n Usage: python -m spacy find-function [OPTIONS] FUNC_NAME \n \n Find the module, path and line number to the file the registered function is defined in, if \n available. \n \n func_name (str): Name of the registered function. \n registry_name (Optional): Name of the catalogue registry. \n \n DOCS: https://spacy.io/api/cli#find-function \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * func_name TEXT Name of the registered function. 
[required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --registry -r TEXT Name of the catalogue registry. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "find-threshold": " \n Usage: python -m spacy find-threshold [OPTIONS] MODEL DATA_PATH PIPE_NAME \n THRESHOLD_KEY SCORES_KEY \n \n Runs prediction trials for a trained model with varying thresholds to maximize the specified \n metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. \n Results are displayed in a table on `stdout` (the corresponding API call to \n `spacy.cli.find_threshold.find_threshold()` returns all results). \n \n This is applicable only for components whose predictions are influenced by \n thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note \n that the full path to the corresponding threshold attribute in the config has to \n be provided. 
\n \n DOCS: https://spacy.io/api/cli#find-threshold \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Model name or path [required] │\n│ * data_path PATH Location of binary evaluation data in .spacy format [required] │\n│ * pipe_name TEXT Name of pipe to examine thresholds for [required] │\n│ * threshold_key TEXT Key of threshold attribute in component's configuration [required] │\n│ * scores_key TEXT Metric to optimize [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n_trials -n INTEGER Number of trials to determine optimal thresholds │\n│ [default: 11] │\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --gold-preproc -G Use gold preprocessing │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "info": " \n Usage: python -m spacy info [OPTIONS] [MODEL] \n \n Print info about spaCy installation. If a pipeline is specified as an argument, print its meta \n information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. \n \n Flag --url prints only the download URL of the most recent compatible \n version of the pipeline. 
\n \n DOCS: https://spacy.io/api/cli#info \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ model [MODEL] Optional loadable spaCy pipeline │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --markdown -md Generate Markdown for GitHub issues │\n│ --silent -s,-S Don't print anything (just return) │\n│ --exclude -e TEXT Comma-separated keys to exclude from the print-out │\n│ [default: labels] │\n│ --url -u Print the URL to download the most recent compatible version of the │\n│ pipeline │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init config": " \n Usage: python -m spacy init config [OPTIONS] OUTPUT_FILE \n \n Generate a starter config file for training. Based on your requirements specified via the CLI \n arguments, this command generates a config with the optimal settings for your use case. This \n includes the choice of architecture, pretrained weights and related hyperparameters. 
\n \n DOCS: https://spacy.io/api/cli#init-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * output_file PATH File to save the config to or - for stdout (will only output config │\n│ and no additional logging info) │\n│ [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --lang -l TEXT Two-letter code of the language to use │\n│ [default: en] │\n│ --pipeline -p TEXT Comma-separated names of trainable pipeline │\n│ components to include (without 'tok2vec' or │\n│ 'transformer') │\n│ [default: tagger,parser,ner] │\n│ --optimize -o [efficiency|accuracy] Whether to optimize for efficiency (faster │\n│ inference, smaller model, lower memory │\n│ consumption) or higher accuracy (potentially │\n│ larger and slower model). This will impact the │\n│ choice of architecture, pretrained weights and │\n│ related hyperparameters. │\n│ [default: efficiency] │\n│ --gpu -G Whether the model can run on GPU. This will │\n│ impact the choice of architecture, pretrained │\n│ weights and related hyperparameters. │\n│ --pretraining -pt Include config for pretraining (with 'spacy │\n│ pretrain') │\n│ --force -F Force overwriting the output file │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init fill-config": " \n Usage: python -m spacy init fill-config [OPTIONS] BASE_PATH [OUTPUT_FILE] \n \n Fill partial config file with default values. Will add all missing settings from the default \n config and will create all objects, check the registered functions for their default values and \n update the base config. 
This command can be used with a config generated via the training \n quickstart widget: https://spacy.io/usage/training#quickstart \n \n DOCS: https://spacy.io/api/cli#init-fill-config \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * base_path FILE Path to base config to fill [required] │\n│ output_file [OUTPUT_FILE] Path to output .cfg file (or - for stdout) [default: -] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --pretraining -pt Include config for pretraining (with 'spacy pretrain') │\n│ --diff -D Print a visual diff highlighting the changes │\n│ --code-path,--code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init labels": " \n Usage: python -m spacy init labels [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n Generate JSON files for the labels in the data. This helps speed up the training process, since \n spaCy won't have to preprocess the data to extract the labels. 
\n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n│ * output_path PATH Output directory for the labels [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init nlp": " \n Usage: python -m spacy init nlp [OPTIONS] CONFIG_PATH OUTPUT_PATH \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n│ * output_path PATH Output directory for the prepared data [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered functions) │\n│ to be imported │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init vectors": " \n Usage: python -m spacy init vectors [OPTIONS] LANG VECTORS_LOC OUTPUT_DIR \n \n Convert word vectors for use with spaCy. Will export an nlp object that you can use in the block \n of your config to initialize a model with vectors. 
\n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * lang TEXT The language of the nlp object to create [required] │\n│ * vectors_loc PATH Vectors file in Word2Vec format [required] │\n│ * output_dir PATH Pipeline output directory [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --prune -p INTEGER Optional number of vectors to prune to [default: -1] │\n│ --truncate -t INTEGER Optional number of vectors to truncate to when reading in │\n│ vectors file │\n│ [default: 0] │\n│ --mode -m TEXT Vectors mode: default or floret [default: default] │\n│ --name -n TEXT Optional name for the word vectors, e.g. en_core_web_lg.vectors │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --attr -a TEXT Optional token attribute to use for vectors, e.g. LOWER or NORM │\n│ [default: ORTH] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "link": " \n Usage: python -m spacy link [OPTIONS] ARGS KWARGS \n \n (deprecated) \n As of spaCy v3.0, symlinks like \"en\" are not supported anymore. You can load trained pipeline \n packages using their full names or from a directory path. \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * args TEXT [required] │\n│ * kwargs TEXT [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "package": " \n Usage: python -m spacy package [OPTIONS] INPUT_DIR OUTPUT_DIR \n \n Generate an installable Python package for a pipeline. Includes binary data, meta and required \n installation files. A new directory will be created in the specified output directory, and the \n data will be copied over. If --create-meta is set and a meta.json already exists in the output \n directory, the existing values will be used as the defaults in the command-line prompt. After \n packaging, \"python -m build --sdist\" is run in the package directory, which will create a .tar.gz \n archive that can be installed via \"pip install\". \n \n If additional code files are provided (e.g. Python files containing custom \n registered functions like pipeline components), they are copied into the \n package and imported in the __init__.py. \n \n DOCS: https://spacy.io/api/cli#package \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * input_dir DIRECTORY Directory with pipeline data [required] │\n│ * output_dir DIRECTORY Output parent directory [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c TEXT Comma-separated paths to Python file │\n│ with additional code (registered │\n│ functions) to be included in the package │\n│ --meta-path,--meta -m FILE Path to meta.json │\n│ --create-meta -C Create meta.json, even if one exists │\n│ --name -n TEXT Package name to override meta │\n│ --version -v TEXT Package version to override meta │\n│ --build -b TEXT Comma-separated formats to build: sdist │\n│ and/or wheel, or none. 
│\n│ [default: sdist] │\n│ --force -f,-F Force overwriting existing data in │\n│ output directory │\n│ --require-parent -R,-R --no-require-parent Include the parent package (e.g. spacy) │\n│ in the requirements │\n│ [default: require-parent] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "pretrain": " \n Usage: python -m spacy pretrain [OPTIONS] CONFIG_PATH OUTPUT_DIR \n \n Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using an approximate \n language-modelling objective. Two objective types are available, vector-based and character-based. \n \n In the vector-based objective, we load word vectors that have been trained \n using a word2vec-style distributional similarity algorithm, and train a \n component like a CNN, BiLSTM, etc to predict vectors which match the \n pretrained ones. The weights are saved to a directory after each epoch. You \n can then pass a path to one of these pretrained weights files to the \n 'spacy train' command. \n \n This technique may be especially helpful if you have little labelled data. \n However, it's still quite experimental, so your mileage may vary. \n \n To load the weights back in during 'spacy train', you need to ensure \n all settings are the same between pretraining and training. Ideally, \n this is done by using the same config file for both commands. 
\n \n DOCS: https://spacy.io/api/cli#pretrain \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path FILE Path to config file [required] │\n│ * output_dir PATH Directory to write weights to on each epoch [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --code -c PATH Path to Python file with additional code (registered │\n│ functions) to be imported │\n│ --resume-path -r PATH Path to pretrained weights from which to resume pretraining │\n│ --epoch-resume -er INTEGER The epoch to resume counting from when using --resume-path. │\n│ Prevents unintended overwriting of existing weight files. │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --skip-last -L Skip saving model-last.bin │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "profile": " \n Usage: python -m spacy profile [OPTIONS] MODEL [INPUTS] \n \n Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one \n JSON object per line with a key \"text\". It can either be provided as a JSONL file, or be read from \n sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. \n \n DOCS: https://spacy.io/api/cli#debug-profile \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * model TEXT Trained pipeline to load [required] │\n│ inputs [INPUTS] Location of input file. '-' for stdin. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --n-texts -n INTEGER Maximum number of texts to use if available [default: 10000] │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project assets": " \n Usage: python -m spacy project assets [OPTIONS] [PROJECT_DIR] \n \n Fetch project assets like datasets and pretrained weights. Assets are defined in the \"assets\" \n section of the project.yml. If a checksum is provided in the project.yml, the file is only \n downloaded if no local file with the same checksum exists. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/tutorial/directory-and-assets.md \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Path to cloned project. Defaults to current working directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --sparse -S Use sparse checkout for assets provided via Git, to only check out and clone │\n│ the files needed. Requires Git v22.2+. │\n│ --extra -e Download all assets, including those marked as 'extra'. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project clone": " \n Usage: python -m spacy project clone [OPTIONS] NAME [DEST] \n \n Clone a project template from a repository. Calls into \"git\" and will only download the files from \n the given subdirectory. 
The GitHub repo defaults to the official Weasel template repo, but can be \n customized (including using a private repo). \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#clipboard-clone \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * name TEXT The name of the template to clone [required] │\n│ dest [DEST] Where to clone the project. Defaults to current working directory │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --repo -r TEXT The repository to clone from │\n│ [default: https://github.com/explosion/projects] │\n│ --branch -b TEXT The branch to clone from. If not provided, will attempt main, master │\n│ --sparse -S Use sparse Git checkout to only check out and clone the files needed. │\n│ Requires Git v22.2+. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project document": " \n Usage: python -m spacy project document [OPTIONS] [PROJECT_DIR] \n \n Auto-generate a README.md for a project. If the content is saved to a file, hidden markers are \n added so you can add custom content before or after the auto-generated section and only the \n auto-generated docs will be replaced when you re-run the command. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#closed_book-document \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Path to cloned project. Defaults to current working directory. 
│\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output -o PATH Path to output Markdown file for output. Defaults to - for standard │\n│ output │\n│ [default: -] │\n│ --no-emoji -NE Don't use emoji │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project dvc": " \n Usage: python -m spacy project dvc [OPTIONS] [PROJECT_DIR] [WORKFLOW] \n \n Auto-generate Data Version Control (DVC) config. A DVC project can only define one pipeline, so \n you need to specify one workflow defined in the project.yml. If no workflow is specified, the \n first defined workflow is used. The DVC config will only be updated if the project.yml changed. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#repeat-dvc \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n│ workflow [WORKFLOW] Name of workflow defined in project.yml. Defaults to first │\n│ workflow if not set. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --verbose -V Print more info │\n│ --quiet -q Print less info │\n│ --force -F Force update DVC config │\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project pull": " \n Usage: python -m spacy project pull [OPTIONS] [REMOTE] [PROJECT_DIR] \n \n Retrieve available precomputed outputs from a remote storage. You can alias remotes in your \n project.yml by mapping them to storage paths. A storage can be anything that the smart_open \n library can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_down-push \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ remote [REMOTE] Name or path of remote storage [default: default] │\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. │\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project push": " \n Usage: python -m spacy project push [OPTIONS] [REMOTE] [PROJECT_DIR] \n \n Persist outputs to a remote storage. You can alias remotes in your project.yml by mapping them to \n storage paths. A storage can be anything that the smart_open library can upload to, e.g. AWS, \n Google Cloud Storage, SSH, local directories etc. \n \n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_up-push \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ remote [REMOTE] Name or path of remote storage [default: default] │\n│ project_dir [PROJECT_DIR] Location of project directory. Defaults to current working │\n│ directory. 
│\n│ [default: /Users/matt/repos/explosion/spaCy] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "train": " \n Usage: python -m spacy train [OPTIONS] CONFIG_PATH \n \n Train or update a spaCy pipeline. Requires data in spaCy's binary format. To convert data from \n other formats, use the `spacy convert` command. The config file includes all settings and \n hyperparameters used during training. To override settings in the config, e.g. settings that point \n to local paths or that you want to experiment with, you can override them as command line options. \n For instance, --training.batch_size 128 overrides the value of \"batch_size\" in the block \"\". The \n --code argument lets you pass in a Python file that's imported before training. It can be used to \n register custom functions and architectures that can then be referenced in the config. \n \n DOCS: https://spacy.io/api/cli#train \n \n╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮\n│ * config_path PATH Path to config file [required] │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --output,--output-path -o PATH Output directory to store trained pipeline in │\n│ --code -c PATH Path to Python file with additional code │\n│ (registered functions) to be imported │\n│ --verbose -V,-VV Display more information for debugging purposes │\n│ --gpu-id -g INTEGER GPU ID or -1 for CPU [default: -1] │\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "validate": " \n Usage: python -m spacy validate [OPTIONS] \n \n Validate the currently installed pipeline packages and spaCy version. Checks if the installed \n packages are compatible and shows upgrade instructions if available. Should be run after `pip \n install -U spacy`. \n \n DOCS: https://spacy.io/api/cli#validate \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + "apply": "Usage: python -m spacy apply [OPTIONS] MODEL DATA_PATH OUTPUT_FILE\n\n Apply a trained pipeline to documents to get predictions. Expects a loadable\n spaCy pipeline and path to the data, which can be a directory or a file. The\n data files can be provided in multiple formats: 1. .spacy files 2.\n .jsonl files with a specified \"field\" to read the text from. 3. Files with\n any other extension are assumed to be containing a single document.\n DOCS: https://spacy.io/api/cli#apply\n\nArguments:\n MODEL Model name or path [required]\n DATA_PATH Location of the documents to predict on. Can be a single file in\n .spacy format or a .jsonl file. Files with other extensions are\n treated as single plain text documents. If a directory is\n provided it is traversed recursively to grab all files to be\n processed. The files can be a mixture of .spacy, .jsonl and text\n files. If .jsonl is provided the specified field is going to be\n grabbed (\"text\" by default). 
[required]\n OUTPUT_FILE Path to save the resulting .spacy file [required]\n\nOptions:\n -c, --code PATH Path to Python file with additional code (registered\n functions) to be imported\n -tk, --text-key TEXT Key containing text string for JSONL [default:\n text]\n -F, --force Force overwriting the output file\n -g, --gpu-id INTEGER GPU ID or -1 for CPU. [default: -1]\n -b, --batch-size INTEGER Batch size. [default: 1]\n -n, --n-process INTEGER number of processors to use. [default: 1]\n --help Show this message and exit.\n", + "assemble": "Usage: python -m spacy assemble [OPTIONS] CONFIG_PATH OUTPUT_PATH\n\n Assemble a spaCy pipeline from a config file. The config file includes all\n settings for initializing the pipeline. To override settings in the config,\n e.g. settings that point to local paths or that you want to experiment with,\n you can override them as command line options. The --code argument lets you\n pass in a Python file that can be used to register custom functions that are\n referenced in the config.\n\n DOCS: https://spacy.io/api/cli#assemble\n\nArguments:\n CONFIG_PATH Path to config file [required]\n OUTPUT_PATH Output directory to store assembled pipeline in [required]\n\nOptions:\n -c, --code PATH Path to Python file with additional code (registered\n functions) to be imported\n -V, -VV, --verbose Display more information for debugging purposes\n --help Show this message and exit.\n", + "benchmark accuracy": "Usage: python -m spacy benchmark accuracy [OPTIONS] MODEL DATA_PATH\n\n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation\n data in the binary .spacy format. The --gold-preproc option sets up the\n evaluation examples with gold-standard sentences and tokens for the\n predictions. Gold preprocessing helps the annotations align to the\n tokenization, and may result in sequences of more consistent length. However,\n it may reduce runtime accuracy due to train/test skew. 
To render a sample of\n dependency parses in a HTML file, set as output directory as the displacy_path\n argument.\n\n DOCS: https://spacy.io/api/cli#benchmark-accuracy\n\nArguments:\n MODEL Model name or path [required]\n DATA_PATH Location of binary evaluation data in .spacy format [required]\n\nOptions:\n -o, --output FILE Output JSON file for metrics\n -c, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n -G, --gold-preproc Use gold preprocessing\n -dp, --displacy-path DIRECTORY Directory to output rendered parses as HTML\n -dl, --displacy-limit INTEGER Limit of parses to render as HTML [default:\n 25]\n -P, --per-component Return scores per component, only applicable\n when an output JSON file is specified.\n -sk, --spans-key TEXT Spans key to use when evaluating Doc.spans\n [default: sc]\n --help Show this message and exit.\n", + "benchmark speed": "Usage: python -m spacy benchmark speed [OPTIONS] MODEL DATA_PATH\n\n Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in\n the binary .spacy format.\n\nArguments:\n MODEL Model name or path [required]\n DATA_PATH Location of binary evaluation data in .spacy format [required]\n\nOptions:\n -b, --batch-size INTEGER RANGE Override the pipeline batch size [x>=1]\n --no-shuffle Do not shuffle benchmark data\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n --batches INTEGER RANGE Minimum number of batches to benchmark\n [default: 50; x>=30]\n -w, --warmup INTEGER RANGE Number of iterations over the data for warmup\n [default: 3; x>=0]\n -c, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n --help Show this message and exit.\n", + "convert": "Usage: python -m spacy convert [OPTIONS] INPUT_PATH [OUTPUT_DIR]\n\n Convert files into json or DocBin format for training. 
The resulting .spacy\n file can be used with the train command and other experiment management\n functions.\n\n If no output_dir is specified and the output format is JSON, the data is\n written to stdout, so you can pipe them forward to a JSON file: $ spacy\n convert some_file.conllu --file-type json > some_file.json\n\n DOCS: https://spacy.io/api/cli#convert\n\nArguments:\n INPUT_PATH Input file or directory [required]\n [OUTPUT_DIR] Output directory. '-' for stdout. [default: -]\n\nOptions:\n -t, --file-type [json|spacy] Type of data to produce [default: spacy]\n -n, --n-sents INTEGER Number of sentences per doc (0 to disable)\n [default: 1]\n -s, --seg-sents Segment sentences (for -c ner)\n -b, --model, --base TEXT Trained spaCy pipeline for sentence segmentation\n to use as base (for --seg-sents)\n -m, --morphology Enable appending morphology to tags\n -T, --merge-subtokens Merge CoNLL-U subtokens\n -c, --converter TEXT Converter: ('conllubio', 'conllu', 'conll',\n 'ner', 'iob', 'json') [default: auto]\n -nm, --ner-map PATH NER tag mapping (as JSON-encoded dict of entity\n types)\n -l, --lang TEXT Language (if tokenizer required)\n -C, --concatenate Concatenate output to a single file\n --help Show this message and exit.\n", + "debug config": "Usage: python -m spacy debug config [OPTIONS] CONFIG_PATH\n\n Debug a config file and show validation errors. The command will create all\n objects in the tree and validate them. Note that some config validation errors\n are blocking and will prevent the rest of the config from being resolved. This\n means that you may not see all validation errors at once and some issues are\n only shown once previous errors have been fixed. 
Similar as with the 'train'\n command, you can override settings from the config as command line options.\n For instance, --training.batch_size 128 overrides the value of \"batch_size\" in\n the block \"[training]\".\n\n DOCS: https://spacy.io/api/cli#debug-config\n\nArguments:\n CONFIG_PATH Path to config file [required]\n\nOptions:\n -c, --code-path, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n -F, --show-functions Show an overview of all registered functions\n used in the config and where they come from\n (modules, files etc.)\n -V, --show-variables Show an overview of all variables referenced in\n the config and their values. This will also\n reflect variables overwritten on the CLI.\n --help Show this message and exit.\n", + "debug data": "Usage: python -m spacy debug data [OPTIONS] CONFIG_PATH\n\n Analyze, debug and validate your training and development data. Outputs useful\n stats, and can help you find problems like invalid entity annotations, cyclic\n dependencies, low data labels and more.\n\n DOCS: https://spacy.io/api/cli#debug-data\n\nArguments:\n CONFIG_PATH Path to config file [required]\n\nOptions:\n -c, --code-path, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n -IW, --ignore-warnings Ignore warnings, only show stats and errors\n -V, --verbose Print additional information and explanations\n -NF, --no-format Don't pretty-print the results\n --help Show this message and exit.\n", + "debug diff-config": "Usage: python -m spacy debug diff-config [OPTIONS] CONFIG_PATH\n\n Show a diff of a config file with respect to spaCy's defaults or another\n config file. If additional settings were used in the creation of the config\n file, then you must supply these as extra parameters to the command when\n comparing to the default settings. 
The generated diff can also be used when\n posting to the discussion forum to provide more information for the\n maintainers.\n\n The `optimize`, `gpu`, and `pretraining` options are only relevant when\n comparing against the default configuration (or specifically when `compare_to`\n is None).\n\n DOCS: https://spacy.io/api/cli#debug-diff\n\nArguments:\n CONFIG_PATH Path to config file [required]\n\nOptions:\n --compare-to PATH Path to a config file to diff against, or\n `None` to compare against default settings\n -o, --optimize [efficiency|accuracy]\n Whether the user config was optimized for\n efficiency or accuracy. Only relevant when\n comparing against the default config.\n [default: efficiency]\n -G, --gpu Whether the original config can run on a GPU.\n Only relevant when comparing against the\n default config.\n --pretraining, --pt Whether to compare on a config with\n pretraining involved. Only relevant when\n comparing against the default config.\n -md, --markdown Generate Markdown for GitHub issues\n --help Show this message and exit.\n", + "debug model": "Usage: python -m spacy debug model [OPTIONS] CONFIG_PATH COMPONENT\n\n Analyze a Thinc model implementation. 
Includes checks for internal structure\n and activations during training.\n\n DOCS: https://spacy.io/api/cli#debug-model\n\nArguments:\n CONFIG_PATH Path to config file [required]\n COMPONENT Name of the pipeline component of which the model should be\n analysed [required]\n\nOptions:\n -l, --layers TEXT Comma-separated names of layer IDs to print\n -DIM, --dimensions Show dimensions\n -PAR, --parameters Show parameters\n -GRAD, --gradients Show gradients\n -ATTR, --attributes Show attributes\n -P0, --print-step0 Print model before training\n -P1, --print-step1 Print model after initialization\n -P2, --print-step2 Print model after training\n -P3, --print-step3 Print final predictions\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n --help Show this message and exit.\n", + "debug profile": "Usage: python -m spacy debug profile [OPTIONS] MODEL [INPUTS]\n\n Profile which functions take the most time in a spaCy pipeline. Input should\n be formatted as one JSON object per line with a key \"text\". It can either be\n provided as a JSONL file, or be read from sys.sytdin. If no input file is\n specified, the IMDB dataset is loaded via Thinc.\n\n DOCS: https://spacy.io/api/cli#debug-profile\n\nArguments:\n MODEL Trained pipeline to load [required]\n [INPUTS] Location of input file. '-' for stdin.\n\nOptions:\n -n, --n-texts INTEGER Maximum number of texts to use if available [default:\n 10000]\n --help Show this message and exit.\n", + "debug-data": "Usage: python -m spacy debug-data [OPTIONS] CONFIG_PATH\n\n Analyze, debug and validate your training and development data. 
Outputs useful\n stats, and can help you find problems like invalid entity annotations, cyclic\n dependencies, low data labels and more.\n\n DOCS: https://spacy.io/api/cli#debug-data\n\nArguments:\n CONFIG_PATH Path to config file [required]\n\nOptions:\n -c, --code-path, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n -IW, --ignore-warnings Ignore warnings, only show stats and errors\n -V, --verbose Print additional information and explanations\n -NF, --no-format Don't pretty-print the results\n --help Show this message and exit.\n", + "download": "Usage: python -m spacy download [OPTIONS] MODEL\n\n Download compatible trained pipeline from the default download path using pip.\n If --direct flag is set, the command expects the full package name with\n version. For direct downloads, the compatibility check will be skipped. All\n additional arguments provided to this command will be passed to `pip install`\n on package installation.\n\n DOCS: https://spacy.io/api/cli#download AVAILABLE PACKAGES:\n https://spacy.io/models\n\nArguments:\n MODEL Name of pipeline package to download [required]\n\nOptions:\n -d, -D, --direct Force direct download of name + version\n -S, --sdist Download sdist (.tar.gz) archive instead of pre-built binary\n wheel\n -U, --url TEXT Download from given url\n --help Show this message and exit.\n", + "evaluate": "Usage: python -m spacy evaluate [OPTIONS] MODEL DATA_PATH\n\n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation\n data in the binary .spacy format. The --gold-preproc option sets up the\n evaluation examples with gold-standard sentences and tokens for the\n predictions. Gold preprocessing helps the annotations align to the\n tokenization, and may result in sequences of more consistent length. However,\n it may reduce runtime accuracy due to train/test skew. 
To render a sample of\n dependency parses in a HTML file, set as output directory as the displacy_path\n argument.\n\n DOCS: https://spacy.io/api/cli#benchmark-accuracy\n\nArguments:\n MODEL Model name or path [required]\n DATA_PATH Location of binary evaluation data in .spacy format [required]\n\nOptions:\n -o, --output FILE Output JSON file for metrics\n -c, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n -G, --gold-preproc Use gold preprocessing\n -dp, --displacy-path DIRECTORY Directory to output rendered parses as HTML\n -dl, --displacy-limit INTEGER Limit of parses to render as HTML [default:\n 25]\n -P, --per-component Return scores per component, only applicable\n when an output JSON file is specified.\n -sk, --spans-key TEXT Spans key to use when evaluating Doc.spans\n [default: sc]\n --help Show this message and exit.\n", + "find-function": "Usage: python -m spacy find-function [OPTIONS] FUNC_NAME\n\n Find the module, path and line number to the file the registered function is\n defined in, if available.\n\n func_name (str): Name of the registered function. registry_name\n (Optional[str]): Name of the catalogue registry.\n\n DOCS: https://spacy.io/api/cli#find-function\n\nArguments:\n FUNC_NAME Name of the registered function. [required]\n\nOptions:\n -r, --registry TEXT Name of the catalogue registry.\n --help Show this message and exit.\n", + "find-threshold": "Usage: python -m spacy find-threshold [OPTIONS] MODEL DATA_PATH PIPE_NAME\n THRESHOLD_KEY SCORES_KEY\n\n Runs prediction trials for a trained model with varying thresholds to maximize\n the specified metric. The search space for the threshold is traversed linearly\n from 0 to 1 in `n_trials` steps. 
Results are displayed in a table on `stdout`\n (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`\n returns all results).\n\n This is applicable only for components whose predictions are influenced by\n thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note\n that the full path to the corresponding threshold attribute in the config has\n to be provided.\n\n DOCS: https://spacy.io/api/cli#find-threshold\n\nArguments:\n MODEL Model name or path [required]\n DATA_PATH Location of binary evaluation data in .spacy format [required]\n PIPE_NAME Name of pipe to examine thresholds for [required]\n THRESHOLD_KEY Key of threshold attribute in component's configuration\n [required]\n SCORES_KEY Metric to optimize [required]\n\nOptions:\n -n, --n_trials INTEGER Number of trials to determine optimal thresholds\n [default: 11]\n -c, --code PATH Path to Python file with additional code (registered\n functions) to be imported\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n -G, --gold-preproc Use gold preprocessing\n -V, -VV, --verbose Display more information for debugging purposes\n --help Show this message and exit.\n", + "info": "Usage: python -m spacy info [OPTIONS] [MODEL]\n\n Print info about spaCy installation. If a pipeline is specified as an\n argument, print its meta information. 
Flag --markdown prints details in\n Markdown for easy copy-pasting to GitHub issues.\n\n Flag --url prints only the download URL of the most recent compatible version\n of the pipeline.\n\n DOCS: https://spacy.io/api/cli#info\n\nArguments:\n [MODEL] Optional loadable spaCy pipeline\n\nOptions:\n -md, --markdown Generate Markdown for GitHub issues\n -s, -S, --silent Don't print anything (just return)\n -e, --exclude TEXT Comma-separated keys to exclude from the print-out\n [default: labels]\n -u, --url Print the URL to download the most recent compatible\n version of the pipeline\n --help Show this message and exit.\n", + "init config": "Usage: python -m spacy init config [OPTIONS] OUTPUT_FILE\n\n Generate a starter config file for training. Based on your requirements\n specified via the CLI arguments, this command generates a config with the\n optimal settings for your use case. This includes the choice of architecture,\n pretrained weights and related hyperparameters.\n\n DOCS: https://spacy.io/api/cli#init-config\n\nArguments:\n OUTPUT_FILE File to save the config to or - for stdout (will only output\n config and no additional logging info) [required]\n\nOptions:\n -l, --lang TEXT Two-letter code of the language to use\n [default: en]\n -p, --pipeline TEXT Comma-separated names of trainable pipeline\n components to include (without 'tok2vec' or\n 'transformer') [default: tagger,parser,ner]\n -o, --optimize [efficiency|accuracy]\n Whether to optimize for efficiency (faster\n inference, smaller model, lower memory\n consumption) or higher accuracy (potentially\n larger and slower model). This will impact the\n choice of architecture, pretrained weights and\n related hyperparameters. [default:\n efficiency]\n -G, --gpu Whether the model can run on GPU. 
This will\n impact the choice of architecture, pretrained\n weights and related hyperparameters.\n -pt, --pretraining Include config for pretraining (with 'spacy\n pretrain')\n -F, --force Force overwriting the output file\n --help Show this message and exit.\n", + "init fill-config": "Usage: python -m spacy init fill-config [OPTIONS] BASE_PATH [OUTPUT_FILE]\n\n Fill partial config file with default values. Will add all missing settings\n from the default config and will create all objects, check the registered\n functions for their default values and update the base config. This command\n can be used with a config generated via the training quickstart widget:\n https://spacy.io/usage/training#quickstart\n\n DOCS: https://spacy.io/api/cli#init-fill-config\n\nArguments:\n BASE_PATH Path to base config to fill [required]\n [OUTPUT_FILE] Path to output .cfg file (or - for stdout) [default: -]\n\nOptions:\n -pt, --pretraining Include config for pretraining (with 'spacy\n pretrain')\n -D, --diff Print a visual diff highlighting the changes\n -c, --code-path, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n --help Show this message and exit.\n", + "init labels": "Usage: python -m spacy init labels [OPTIONS] CONFIG_PATH OUTPUT_PATH\n\n Generate JSON files for the labels in the data. 
This helps speed up the\n training process, since spaCy won't have to preprocess the data to extract the\n labels.\n\nArguments:\n CONFIG_PATH Path to config file [required]\n OUTPUT_PATH Output directory for the labels [required]\n\nOptions:\n -c, --code PATH Path to Python file with additional code (registered\n functions) to be imported\n -V, -VV, --verbose Display more information for debugging purposes\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n --help Show this message and exit.\n", + "init nlp": "Usage: python -m spacy init nlp [OPTIONS] CONFIG_PATH OUTPUT_PATH\n\nArguments:\n CONFIG_PATH Path to config file [required]\n OUTPUT_PATH Output directory for the prepared data [required]\n\nOptions:\n -c, --code PATH Path to Python file with additional code (registered\n functions) to be imported\n -V, -VV, --verbose Display more information for debugging purposes\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n --help Show this message and exit.\n", + "init vectors": "Usage: python -m spacy init vectors [OPTIONS] LANG VECTORS_LOC OUTPUT_DIR\n\n Convert word vectors for use with spaCy. 
Will export an nlp object that you\n can use in the [initialize] block of your config to initialize a model with\n vectors.\n\nArguments:\n LANG The language of the nlp object to create [required]\n VECTORS_LOC Vectors file in Word2Vec format [required]\n OUTPUT_DIR Pipeline output directory [required]\n\nOptions:\n -p, --prune INTEGER Optional number of vectors to prune to [default: -1]\n -t, --truncate INTEGER Optional number of vectors to truncate to when reading\n in vectors file [default: 0]\n -m, --mode TEXT Vectors mode: default or floret [default: default]\n -n, --name TEXT Optional name for the word vectors, e.g.\n en_core_web_lg.vectors\n -V, -VV, --verbose Display more information for debugging purposes\n -a, --attr TEXT Optional token attribute to use for vectors, e.g.\n LOWER or NORM [default: ORTH]\n --help Show this message and exit.\n", + "link": "Usage: python -m spacy link [OPTIONS] ARGS KWARGS\n\n As of spaCy v3.0, symlinks like \"en\" are not supported anymore. You can load\n trained pipeline packages using their full names or from a directory path.\n (DEPRECATED)\n\nArguments:\n ARGS [required]\n KWARGS [required]\n\nOptions:\n --help Show this message and exit.\n", + "package": "Usage: python -m spacy package [OPTIONS] INPUT_DIR OUTPUT_DIR\n\n Generate an installable Python package for a pipeline. Includes binary data,\n meta and required installation files. A new directory will be created in the\n specified output directory, and the data will be copied over. If --create-meta\n is set and a meta.json already exists in the output directory, the existing\n values will be used as the defaults in the command-line prompt. After\n packaging, \"python -m build --sdist\" is run in the package directory, which\n will create a .tar.gz archive that can be installed via \"pip install\".\n\n If additional code files are provided (e.g. 
Python files containing custom\n registered functions like pipeline components), they are copied into the\n package and imported in the __init__.py.\n\n DOCS: https://spacy.io/api/cli#package\n\nArguments:\n INPUT_DIR Directory with pipeline data [required]\n OUTPUT_DIR Output parent directory [required]\n\nOptions:\n -c, --code TEXT Comma-separated paths to Python file with\n additional code (registered functions) to be\n included in the package\n -m, --meta-path, --meta FILE Path to meta.json\n -C, --create-meta Create meta.json, even if one exists\n -n, --name TEXT Package name to override meta\n -v, --version TEXT Package version to override meta\n -b, --build TEXT Comma-separated formats to build: sdist and/or\n wheel, or none. [default: sdist]\n -f, -F, --force Force overwriting existing data in output\n directory\n -R, -R, --require-parent / --no-require-parent\n Include the parent package (e.g. spacy) in the\n requirements [default: require-parent]\n --help Show this message and exit.\n", + "pretrain": "Usage: python -m spacy pretrain [OPTIONS] CONFIG_PATH OUTPUT_DIR\n\n Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using\n an approximate language-modelling objective. Two objective types are\n available, vector-based and character-based.\n\n In the vector-based objective, we load word vectors that have been trained\n using a word2vec-style distributional similarity algorithm, and train a\n component like a CNN, BiLSTM, etc to predict vectors which match the\n pretrained ones. The weights are saved to a directory after each epoch. You\n can then pass a path to one of these pretrained weights files to the 'spacy\n train' command.\n\n This technique may be especially helpful if you have little labelled data.\n However, it's still quite experimental, so your mileage may vary.\n\n To load the weights back in during 'spacy train', you need to ensure all\n settings are the same between pretraining and training. 
Ideally, this is done\n by using the same config file for both commands.\n\n DOCS: https://spacy.io/api/cli#pretrain\n\nArguments:\n CONFIG_PATH Path to config file [required]\n OUTPUT_DIR Directory to write weights to on each epoch [required]\n\nOptions:\n -c, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n -r, --resume-path PATH Path to pretrained weights from which to resume\n pretraining\n -er, --epoch-resume INTEGER The epoch to resume counting from when using\n --resume-path. Prevents unintended overwriting of\n existing weight files.\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n -L, --skip-last Skip saving model-last.bin\n --help Show this message and exit.\n", + "profile": "Usage: python -m spacy profile [OPTIONS] MODEL [INPUTS]\n\n Profile which functions take the most time in a spaCy pipeline. Input should\n be formatted as one JSON object per line with a key \"text\". It can either be\n provided as a JSONL file, or be read from sys.sytdin. If no input file is\n specified, the IMDB dataset is loaded via Thinc.\n\n DOCS: https://spacy.io/api/cli#debug-profile\n\nArguments:\n MODEL Trained pipeline to load [required]\n [INPUTS] Location of input file. '-' for stdin.\n\nOptions:\n -n, --n-texts INTEGER Maximum number of texts to use if available [default:\n 10000]\n --help Show this message and exit.\n", + "project assets": "Usage: python -m spacy project assets [OPTIONS] [PROJECT_DIR]\n\n Fetch project assets like datasets and pretrained weights. Assets are defined\n in the \"assets\" section of the project.yml. If a checksum is provided in the\n project.yml, the file is only downloaded if no local file with the same\n checksum exists.\n\n DOCS: https://github.com/explosion/weasel/tree/main/docs/tutorial/directory-\n and-assets.md\n\nArguments:\n [PROJECT_DIR] Path to cloned project. 
Defaults to current working directory.\n [default: /Users/matt/repos/spacy-monorepo/spacy]\n\nOptions:\n -S, --sparse Use sparse checkout for assets provided via Git, to only check\n out and clone the files needed. Requires Git v22.2+.\n -e, --extra Download all assets, including those marked as 'extra'.\n --help Show this message and exit.\n", + "project clone": "Usage: python -m spacy project clone [OPTIONS] NAME [DEST]\n\n Clone a project template from a repository. Calls into \"git\" and will only\n download the files from the given subdirectory. The GitHub repo defaults to\n the official Weasel template repo, but can be customized (including using a\n private repo).\n\n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#clipboard-\n clone\n\nArguments:\n NAME The name of the template to clone [required]\n [DEST] Where to clone the project. Defaults to current working directory\n\nOptions:\n -r, --repo TEXT The repository to clone from [default:\n https://github.com/explosion/projects]\n -b, --branch TEXT The branch to clone from. If not provided, will attempt\n main, master\n -S, --sparse Use sparse Git checkout to only check out and clone the\n files needed. Requires Git v22.2+.\n --help Show this message and exit.\n", + "project document": "Usage: python -m spacy project document [OPTIONS] [PROJECT_DIR]\n\n Auto-generate a README.md for a project. If the content is saved to a file,\n hidden markers are added so you can add custom content before or after the\n auto-generated section and only the auto-generated docs will be replaced when\n you re-run the command.\n\n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#closed_book-\n document\n\nArguments:\n [PROJECT_DIR] Path to cloned project. Defaults to current working directory.\n [default: /Users/matt/repos/spacy-monorepo/spacy]\n\nOptions:\n -o, --output PATH Path to output Markdown file for output. 
Defaults to - for\n standard output [default: -]\n -NE, --no-emoji Don't use emoji\n --help Show this message and exit.\n", + "project dvc": "Usage: python -m spacy project dvc [OPTIONS] [PROJECT_DIR] [WORKFLOW]\n\n Auto-generate Data Version Control (DVC) config. A DVC project can only define\n one pipeline, so you need to specify one workflow defined in the project.yml.\n If no workflow is specified, the first defined workflow is used. The DVC\n config will only be updated if the project.yml changed.\n\n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#repeat-dvc\n\nArguments:\n [PROJECT_DIR] Location of project directory. Defaults to current working\n directory. [default: /Users/matt/repos/spacy-monorepo/spacy]\n [WORKFLOW] Name of workflow defined in project.yml. Defaults to first\n workflow if not set.\n\nOptions:\n -V, --verbose Print more info\n -q, --quiet Print less info\n -F, --force Force update DVC config\n --help Show this message and exit.\n", + "project pull": "Usage: python -m spacy project pull [OPTIONS] [REMOTE] [PROJECT_DIR]\n\n Retrieve available precomputed outputs from a remote storage. You can alias\n remotes in your project.yml by mapping them to storage paths. A storage can be\n anything that the smart_open library can upload to, e.g. AWS, Google Cloud\n Storage, SSH, local directories etc.\n\n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_down-\n push\n\nArguments:\n [REMOTE] Name or path of remote storage [default: default]\n [PROJECT_DIR] Location of project directory. Defaults to current working\n directory. [default: /Users/matt/repos/spacy-monorepo/spacy]\n\nOptions:\n --help Show this message and exit.\n", + "project push": "Usage: python -m spacy project push [OPTIONS] [REMOTE] [PROJECT_DIR]\n\n Persist outputs to a remote storage. You can alias remotes in your project.yml\n by mapping them to storage paths. A storage can be anything that the\n smart_open library can upload to, e.g. 
AWS, Google Cloud Storage, SSH, local\n directories etc.\n\n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_up-push\n\nArguments:\n [REMOTE] Name or path of remote storage [default: default]\n [PROJECT_DIR] Location of project directory. Defaults to current working\n directory. [default: /Users/matt/repos/spacy-monorepo/spacy]\n\nOptions:\n --help Show this message and exit.\n", + "train": "Usage: python -m spacy train [OPTIONS] CONFIG_PATH\n\n Train or update a spaCy pipeline. Requires data in spaCy's binary format. To\n convert data from other formats, use the `spacy convert` command. The config\n file includes all settings and hyperparameters used during training. To\n override settings in the config, e.g. settings that point to local paths or\n that you want to experiment with, you can override them as command line\n options. For instance, --training.batch_size 128 overrides the value of\n \"batch_size\" in the block \"[training]\". The --code argument lets you pass in a\n Python file that's imported before training. It can be used to register custom\n functions and architectures that can then be referenced in the config.\n\n DOCS: https://spacy.io/api/cli#train\n\nArguments:\n CONFIG_PATH Path to config file [required]\n\nOptions:\n -o, --output, --output-path PATH\n Output directory to store trained pipeline in\n -c, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n -V, -VV, --verbose Display more information for debugging\n purposes\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n --help Show this message and exit.\n", + "validate": "Usage: python -m spacy validate [OPTIONS]\n\n Validate the currently installed pipeline packages and spaCy version. Checks\n if the installed packages are compatible and shows upgrade instructions if\n available. 
Should be run after `pip install -U spacy`.\n\n DOCS: https://spacy.io/api/cli#validate\n\nOptions:\n --help Show this message and exit.\n" }, "errors": { - "missing_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ Missing command. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", - "unknown_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_COMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "missing_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n\nError: Missing command.\n", + "unknown_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n\nError: No such command '__SPACY_UNKNOWN_COMMAND__'.\n", "unknown_subcommand": { - "benchmark": "Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy benchmark --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_SUBCOMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", - "debug": "Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy debug --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_SUBCOMMAND__'. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", - "init": "Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy init --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_SUBCOMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", - "project": "Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy project --help' for help.\n╭─ Error ──────────────────────────────────────────────────────────────────────────────────────────╮\n│ No such command '__SPACY_UNKNOWN_SUBCOMMAND__'. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + "benchmark": "Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy benchmark --help' for help.\n\nError: No such command '__SPACY_UNKNOWN_SUBCOMMAND__'.\n", + "debug": "Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy debug --help' for help.\n\nError: No such command '__SPACY_UNKNOWN_SUBCOMMAND__'.\n", + "init": "Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy init --help' for help.\n\nError: No such command '__SPACY_UNKNOWN_SUBCOMMAND__'.\n", + "project": "Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy project --help' for help.\n\nError: No such command '__SPACY_UNKNOWN_SUBCOMMAND__'.\n" } }, "group_help": { - "benchmark": " \n Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]... \n \n Commands for benchmarking pipelines. \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ accuracy Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation │\n│ data in the binary .spacy format. The --gold-preproc option sets up the │\n│ evaluation examples with gold-standard sentences and tokens for the │\n│ predictions. Gold preprocessing helps the annotations align to the │\n│ tokenization, and may result in sequences of more consistent length. However, │\n│ it may reduce runtime accuracy due to train/test skew. To render a sample of │\n│ dependency parses in a HTML file, set as output directory as the │\n│ displacy_path argument. │\n│ speed Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark │\n│ data in the binary .spacy format. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "debug": " \n Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]... \n \n Suite of helpful commands for debugging and profiling. Includes commands to check and validate \n your config files, training and evaluation data, and custom model implementations. \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ data Analyze, debug and validate your training and development data. Outputs │\n│ useful stats, and can help you find problems like invalid entity annotations, │\n│ cyclic dependencies, low data labels and more. │\n│ profile Profile which functions take the most time in a spaCy pipeline. │\n│ Input should be formatted as one JSON object per line with a key \"text\". 
│\n│ It can either be provided as a JSONL file, or be read from sys.sytdin. │\n│ If no input file is specified, the IMDB dataset is loaded via Thinc. │\n│ config Debug a config file and show validation errors. The command will │\n│ create all objects in the tree and validate them. Note that some config │\n│ validation errors are blocking and will prevent the rest of the config from │\n│ being resolved. This means that you may not see all validation errors at │\n│ once and some issues are only shown once previous errors have been fixed. │\n│ Similar as with the 'train' command, you can override settings from the config │\n│ as command line options. For instance, --training.batch_size 128 overrides │\n│ the value of \"batch_size\" in the block \"\". │\n│ diff-config Show a diff of a config file with respect to spaCy's defaults or another config │\n│ file. If │\n│ additional settings were used in the creation of the config file, then you │\n│ must supply these as extra parameters to the command when comparing to the default │\n│ settings. The generated diff │\n│ can also be used when posting to the discussion forum to provide more │\n│ information for the maintainers. │\n│ model Analyze a Thinc model implementation. Includes checks for internal structure │\n│ and activations during training. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "init": " \n Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]... \n \n Commands for initializing configs and pipeline packages. \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ config Generate a starter config file for training. 
Based on your requirements │\n│ specified via the CLI arguments, this command generates a config with the │\n│ optimal settings for your use case. This includes the choice of architecture, │\n│ pretrained weights and related hyperparameters. │\n│ fill-config Fill partial config file with default values. Will add all missing settings │\n│ from the default config and will create all objects, check the registered │\n│ functions for their default values and update the base config. This command │\n│ can be used with a config generated via the training quickstart widget: │\n│ https://spacy.io/usage/training#quickstart │\n│ vectors Convert word vectors for use with spaCy. Will export an nlp object that │\n│ you can use in the block of your config to initialize │\n│ a model with vectors. │\n│ labels Generate JSON files for the labels in the data. This helps speed up the │\n│ training process, since spaCy won't have to preprocess the data to │\n│ extract the labels. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n", - "project": " \n Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]... \n \n Command-line interface for spaCy projects and templates. You'd typically start by cloning a \n project template to a local directory and fetching its assets like datasets etc. See the project's \n project.yml for the available commands. \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ assets Fetch project assets like datasets and pretrained weights. Assets are │\n│ defined in the \"assets\" section of the project.yml. 
If a checksum is │\n│ provided in the project.yml, the file is only downloaded if no local file │\n│ with the same checksum exists. │\n│ clone Clone a project template from a repository. Calls into \"git\" and will │\n│ only download the files from the given subdirectory. The GitHub repo │\n│ defaults to the official Weasel template repo, but can be customized │\n│ (including using a private repo). │\n│ document Auto-generate a README.md for a project. If the content is saved to a file, │\n│ hidden markers are added so you can add custom content before or after the │\n│ auto-generated section and only the auto-generated docs will be replaced │\n│ when you re-run the command. │\n│ dvc Auto-generate Data Version Control (DVC) config. A DVC │\n│ project can only define one pipeline, so you need to specify one workflow │\n│ defined in the project.yml. If no workflow is specified, the first defined │\n│ workflow is used. The DVC config will only be updated if the project.yml │\n│ changed. │\n│ run Run a named command or workflow defined in the project.yml. If a workflow │\n│ name is specified, all commands in the workflow are run, in order. If │\n│ commands define dependencies and/or outputs, they will only be re-run if │\n│ state has changed. │\n│ pull Retrieve available precomputed outputs from a remote storage. │\n│ You can alias remotes in your project.yml by mapping them to storage paths. │\n│ A storage can be anything that the smart_open library can upload to, e.g. │\n│ AWS, Google Cloud Storage, SSH, local directories etc. │\n│ push Persist outputs to a remote storage. You can alias remotes in your │\n│ project.yml by mapping them to storage paths. A storage can be anything that │\n│ the smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH, │\n│ local directories etc. 
│\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + "benchmark": "Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]...\n\n Commands for benchmarking pipelines.\n\nOptions:\n --help Show this message and exit.\n\nCommands:\n accuracy Evaluate a trained pipeline.\n speed Benchmark a pipeline.\n", + "debug": "Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]...\n\n Suite of helpful commands for debugging and profiling. Includes commands to\n check and validate your config files, training and evaluation data, and custom\n model implementations.\n\nOptions:\n --help Show this message and exit.\n\nCommands:\n data Analyze, debug and validate your training and development data.\n profile Profile which functions take the most time in a spaCy pipeline.\n config Debug a config file and show validation errors.\n diff-config Show a diff of a config file with respect to spaCy's...\n model Analyze a Thinc model implementation.\n", + "init": "Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]...\n\n Commands for initializing configs and pipeline packages.\n\nOptions:\n --help Show this message and exit.\n\nCommands:\n config Generate a starter config file for training.\n fill-config Fill partial config file with default values.\n vectors Convert word vectors for use with spaCy.\n labels Generate JSON files for the labels in the data.\n", + "project": "Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]...\n\n Command-line interface for spaCy projects and templates. You'd typically start\n by cloning a project template to a local directory and fetching its assets\n like datasets etc. 
See the project's project.yml for the available commands.\n\nOptions:\n --help Show this message and exit.\n\nCommands:\n assets Fetch project assets like datasets and pretrained weights.\n clone Clone a project template from a repository.\n document Auto-generate a README.md for a project.\n dvc Auto-generate Data Version Control (DVC) config.\n run Run a named command or workflow defined in the project.yml.\n pull Retrieve available precomputed outputs from a remote storage.\n push Persist outputs to a remote storage.\n" }, "hidden_group_commands": { "benchmark": [], @@ -114,5 +114,5 @@ "train", "validate" ], - "root_help": " \n Usage: python -m spacy [OPTIONS] COMMAND [ARGS]... \n \n spaCy Command-line Interface \n \n DOCS: https://spacy.io/api/cli \n \n╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮\n│ --install-completion Install completion for the current shell. │\n│ --show-completion Show completion for the current shell, to copy it or customize the │\n│ installation. │\n│ --help Show this message and exit. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────╮\n│ download Download compatible trained pipeline from the default download path using │\n│ pip. If --direct flag is set, the command expects the full package name with │\n│ version. For direct downloads, the compatibility check will be skipped. All │\n│ additional arguments provided to this command will be passed to `pip install` │\n│ on package installation. │\n│ info Print info about spaCy installation. If a pipeline is specified as an argument, │\n│ print its meta information. Flag --markdown prints details in Markdown for easy │\n│ copy-pasting to GitHub issues. │\n│ apply Apply a trained pipeline to documents to get predictions. 
│\n│ Expects a loadable spaCy pipeline and path to the data, which │\n│ can be a directory or a file. │\n│ The data files can be provided in multiple formats: │\n│ 1. .spacy files │\n│ 2. .jsonl files with a specified \"field\" to read the text from. │\n│ 3. Files with any other extension are assumed to be containing │\n│ a single document. │\n│ DOCS: https://spacy.io/api/cli#apply │\n│ assemble Assemble a spaCy pipeline from a config file. The config file includes │\n│ all settings for initializing the pipeline. To override settings in the │\n│ config, e.g. settings that point to local paths or that you want to │\n│ experiment with, you can override them as command line options. The │\n│ --code argument lets you pass in a Python file that can be used to │\n│ register custom functions that are referenced in the config. │\n│ convert Convert files into json or DocBin format for training. The resulting .spacy │\n│ file can be used with the train command and other experiment management │\n│ functions. │\n│ evaluate Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation │\n│ data in the binary .spacy format. The --gold-preproc option sets up the │\n│ evaluation examples with gold-standard sentences and tokens for the │\n│ predictions. Gold preprocessing helps the annotations align to the │\n│ tokenization, and may result in sequences of more consistent length. However, │\n│ it may reduce runtime accuracy due to train/test skew. To render a sample of │\n│ dependency parses in a HTML file, set as output directory as the │\n│ displacy_path argument. │\n│ find-function Find the module, path and line number to the file the registered │\n│ function is defined in, if available. │\n│ find-threshold Runs prediction trials for a trained model with varying thresholds to maximize │\n│ the specified metric. The search space for the threshold is traversed linearly │\n│ from 0 to 1 in `n_trials` steps. 
Results are displayed in a table on `stdout` │\n│ (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` │\n│ returns all results). │\n│ package Generate an installable Python package for a pipeline. Includes binary data, │\n│ meta and required installation files. A new directory will be created in the │\n│ specified output directory, and the data will be copied over. If │\n│ --create-meta is set and a meta.json already exists in the output directory, │\n│ the existing values will be used as the defaults in the command-line prompt. │\n│ After packaging, \"python -m build --sdist\" is run in the package directory, │\n│ which will create a .tar.gz archive that can be installed via \"pip install\". │\n│ pretrain Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, │\n│ using an approximate language-modelling objective. Two objective types │\n│ are available, vector-based and character-based. │\n│ train Train or update a spaCy pipeline. Requires data in spaCy's binary format. To │\n│ convert data from other formats, use the `spacy convert` command. The │\n│ config file includes all settings and hyperparameters used during training. │\n│ To override settings in the config, e.g. settings that point to local │\n│ paths or that you want to experiment with, you can override them as │\n│ command line options. For instance, --training.batch_size 128 overrides │\n│ the value of \"batch_size\" in the block \"\". The --code argument │\n│ lets you pass in a Python file that's imported before training. It can be │\n│ used to register custom functions and architectures that can then be │\n│ referenced in the config. │\n│ validate Validate the currently installed pipeline packages and spaCy version. Checks │\n│ if the installed packages are compatible and shows upgrade instructions if │\n│ available. Should be run after `pip install -U spacy`. │\n│ debug Suite of helpful commands for debugging and profiling. 
Includes │\n│ commands to check and validate your config files, training and evaluation data, │\n│ and custom model implementations. │\n│ benchmark Commands for benchmarking pipelines. │\n│ init Commands for initializing configs and pipeline packages. │\n│ project Command-line interface for spaCy projects and templates. │\n│ You'd typically start by cloning a project template to a local directory and │\n│ fetching its assets like datasets etc. See the project's project.yml for the │\n│ available commands. │\n╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n\n" + "root_help": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\n\n spaCy Command-line Interface\n\n DOCS: https://spacy.io/api/cli\n\nOptions:\n --install-completion Install completion for the current shell.\n --show-completion Show completion for the current shell, to copy it or\n customize the installation.\n --help Show this message and exit.\n\nCommands:\n download Download compatible trained pipeline from the default...\n info Print info about spaCy installation.\n apply Apply a trained pipeline to documents to get predictions.\n assemble Assemble a spaCy pipeline from a config file.\n convert Convert files into json or DocBin format for training.\n evaluate Evaluate a trained pipeline.\n find-function Find the module, path and line number to the file the...\n find-threshold Runs prediction trials for a trained model with varying...\n package Generate an installable Python package for a pipeline.\n pretrain Pre-train the 'token-to-vector' (tok2vec) layer of...\n train Train or update a spaCy pipeline.\n validate Validate the currently installed pipeline packages and...\n debug Suite of helpful commands for debugging and profiling.\n benchmark Commands for benchmarking pipelines.\n init Commands for initializing configs and pipeline packages.\n project Command-line interface for spaCy projects and templates.\n" } From 
1041f8b4269952e249863467754d4af9e01f771d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 23 Mar 2026 15:53:13 +0100 Subject: [PATCH 41/42] Debug: dump manifest diff in test_manifest_is_current --- spacy/tests/test_cli_launcher.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/tests/test_cli_launcher.py b/spacy/tests/test_cli_launcher.py index 1fd6e3a6fc6..931ec9100f7 100644 --- a/spacy/tests/test_cli_launcher.py +++ b/spacy/tests/test_cli_launcher.py @@ -52,9 +52,12 @@ def test_manifest_is_current(): [ sys.executable, "-c", + "import json; " "from spacy_cli.build_manifest import build_manifest; " "from spacy_cli.static import load_manifest; " - "assert build_manifest() == load_manifest()", + "b, l = build_manifest(), load_manifest(); " + "diffs = {k: (str(b[k])[:80], str(l[k])[:80]) for k in b if b[k] != l[k]}; " + "assert b == l, json.dumps(diffs, indent=2)", ], capture_output=True, text=True, From 66b1691dc2e7dc35fc6ee800fb3438ce634375c9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 23 Mar 2026 16:12:02 +0100 Subject: [PATCH 42/42] Debug: show per-key manifest diffs in test_manifest_is_current --- spacy/tests/test_cli_launcher.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/tests/test_cli_launcher.py b/spacy/tests/test_cli_launcher.py index 931ec9100f7..c9af62a509d 100644 --- a/spacy/tests/test_cli_launcher.py +++ b/spacy/tests/test_cli_launcher.py @@ -56,7 +56,12 @@ def test_manifest_is_current(): "from spacy_cli.build_manifest import build_manifest; " "from spacy_cli.static import load_manifest; " "b, l = build_manifest(), load_manifest(); " - "diffs = {k: (str(b[k])[:80], str(l[k])[:80]) for k in b if b[k] != l[k]}; " + "diffs = {}; " + "[diffs.update({f'{k}.{sk}': (repr(b[k][sk])[:120], repr(l[k][sk])[:120])}) " + "for k in b if isinstance(b[k], dict) and b[k] != l[k] " + "for sk in b[k] if b[k].get(sk) != l[k].get(sk)]; " + "[diffs.update({k: (repr(b[k])[:120], 
repr(l[k])[:120])}) " + "for k in b if not isinstance(b[k], dict) and b[k] != l[k]]; " "assert b == l, json.dumps(diffs, indent=2)", ], capture_output=True,