diff --git a/.github/scripts/translate_docs.py b/.github/scripts/translate_docs.py index c5af35fd6..f9879d4d1 100644 --- a/.github/scripts/translate_docs.py +++ b/.github/scripts/translate_docs.py @@ -12,7 +12,6 @@ import os import subprocess -import sys from pathlib import Path from anthropic import Anthropic @@ -20,6 +19,15 @@ EN_DIR = "docs/src/en" ES_DIR = "docs/src/es" +# Source files outside EN_DIR whose Spanish translation lives in ES_DIR. The +# English docs page for these is an mdbook {{#include}} of the source file, so +# the source file is the single source of truth and drives its es/ translation. +EXTERNAL_SOURCES = {"CONTRIBUTING.md": f"{ES_DIR}/contributing.md"} + +# English doc files that must not be translated directly — e.g. pages that are +# just an mdbook {{#include}} of a source handled via EXTERNAL_SOURCES above. +SKIP_EN_FILES = {f"{EN_DIR}/contributing.md"} + # Files to use as style/terminology reference (picked for breadth of patterns) REFERENCE_FILES = ["archs4.md", "blast.md", "info.md"] @@ -80,31 +88,46 @@ """ +def es_target(filepath): + """Map an English/source doc path to its Spanish counterpart path.""" + if filepath in EXTERNAL_SOURCES: + return EXTERNAL_SOURCES[filepath] + return filepath.replace(EN_DIR, ES_DIR, 1) + + def get_changed_files(before_sha, after_sha): - """Return dict of added/modified/deleted English doc files.""" + """Return dict of added/modified/deleted documentation source files. + + Watches the English docs directory plus any external source files + (e.g. the root CONTRIBUTING.md, which the English docs page includes). 
+ """ + watched = [EN_DIR, *EXTERNAL_SOURCES] # Check if before_sha is a valid commit - is_valid = subprocess.run( - ["git", "cat-file", "-t", before_sha], - capture_output=True, - text=True, - ).returncode == 0 + is_valid = ( + subprocess.run( + ["git", "cat-file", "-t", before_sha], + capture_output=True, + text=True, + ).returncode + == 0 + ) if not is_valid: # Initial push or invalid ref — treat all current files as new result = subprocess.run( - ["git", "ls-tree", "-r", "--name-only", after_sha, "--", EN_DIR], + ["git", "ls-tree", "-r", "--name-only", after_sha, "--", *watched], capture_output=True, text=True, check=True, ) return { - "added": [f for f in result.stdout.strip().split("\n") if f], + "added": [f for f in result.stdout.strip().split("\n") if f and f not in SKIP_EN_FILES], "modified": [], "deleted": [], } result = subprocess.run( - ["git", "diff", "--name-status", before_sha, after_sha, "--", EN_DIR], + ["git", "diff", "--name-status", before_sha, after_sha, "--", *watched], capture_output=True, text=True, check=True, @@ -125,6 +148,10 @@ def get_changed_files(before_sha, after_sha): elif status == "R": files["deleted"].append(parts[1]) files["added"].append(parts[2]) + + # Drop English pages that must not be translated directly (handled elsewhere). 
+ for key in files: + files[key] = [f for f in files[key] if f not in SKIP_EN_FILES] return files @@ -151,9 +178,7 @@ def load_reference_files(): def build_reference_block(references): """Format reference files into a single text block.""" - return "\n\n---\n\n".join( - f"=== {name} ===\n{content}" for name, content in references.items() - ) + return "\n\n---\n\n".join(f"=== {name} ===\n{content}" for name, content in references.items()) def clean_model_output(text): @@ -231,6 +256,7 @@ def translate_diff(client, diff_text, en_content, es_content, filename, ref_bloc def main(): + """Translate English docs changed between two commits into Spanish.""" before_sha = os.environ.get("BEFORE_SHA", "").strip() after_sha = os.environ.get("AFTER_SHA", "HEAD").strip() @@ -259,7 +285,7 @@ def main(): # --- Deletions --- for filepath in changed["deleted"]: - es_path = filepath.replace(EN_DIR, ES_DIR, 1) + es_path = es_target(filepath) if Path(es_path).exists(): Path(es_path).unlink() print(f"Deleted: {es_path}") @@ -270,7 +296,7 @@ def main(): filename = Path(filepath).name print(f"Translating new file: {filename} ...") translated = translate_new_file(client, en_content, filename, ref_block) - es_path = filepath.replace(EN_DIR, ES_DIR, 1) + es_path = es_target(filepath) Path(es_path).parent.mkdir(parents=True, exist_ok=True) Path(es_path).write_text(translated) print(f" -> Created: {es_path}") @@ -278,7 +304,7 @@ def main(): # --- Modified files --- for filepath in changed["modified"]: filename = Path(filepath).name - es_path = filepath.replace(EN_DIR, ES_DIR, 1) + es_path = es_target(filepath) en_content = Path(filepath).read_text() if not Path(es_path).exists(): @@ -292,9 +318,7 @@ def main(): continue es_content = Path(es_path).read_text() print(f"Applying edits to {filename} ...") - translated = translate_diff( - client, diff_text, en_content, es_content, filename, ref_block - ) + translated = translate_diff(client, diff_text, en_content, es_content, filename, ref_block) 
Path(es_path).parent.mkdir(parents=True, exist_ok=True) Path(es_path).write_text(translated) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 000000000..c8a62834b --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,25 @@ +name: Check Build + +on: + push: + branches: [main] + pull_request: + branches: [main] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + package: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Install uv + uses: astral-sh/setup-uv@v7 + - name: Build package + run: uv build + - name: Check package + run: uvx twine check --strict dist/*.whl diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 461b3dae0..6759d5e9f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,37 +1,72 @@ name: CI - tests on: - # Scheduled runs twice weekly. - # These runs execute tests on both Python versions, save the pytest output to a file, - # upload the file as an artifact, and commit the 3.12 report back to main. + # Scheduled runs twice weekly: save the pytest output to a file, upload it as + # an artifact, and commit the 3.12 report back to the branch. schedule: - cron: "0 16 * * 1,4" - # Push runs only when package code or tests change. - # These runs execute tests normally and fail immediately on test failure. + # Run post-merge on the integration branches only — pushes to a PR's feature + # branch are already covered by the pull_request event below, so scoping push + # to main/dev avoids running the suite twice for the same commit. push: + branches: [main, dev] paths: - "gget/**" - "tests/**" - # Avoid recursively triggering on committed pytest result files. + - "pyproject.toml" + # Avoid recursively triggering on the bot-committed pytest result files. - "!tests/pytest_results_py*.txt" + # Run on every pull request into the integration branches. 
+ pull_request: + branches: [main, dev] + paths: + - "gget/**" + - "tests/**" + - "pyproject.toml" # Manual runs behave like scheduled runs: - # save output, upload artifact, and optionally commit report back to main. + # save output, upload artifact, and commit report back. workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + permissions: - contents: write + contents: read jobs: - build: - name: Test on Python ${{ matrix.python }} - runs-on: ubuntu-22.04 + # Derive the test matrix from pyproject.toml ([tool.hatch.envs.hatch-test]), + # so the tested environments are defined in a single place and stay identical + # locally (`hatch test`) and in CI. + get-environments: + runs-on: ubuntu-latest + outputs: + envs: ${{ steps.get-envs.outputs.envs }} + steps: + - uses: actions/checkout@v4 + - name: Install uv + uses: astral-sh/setup-uv@v7 + - name: Get test environments from hatch + id: get-envs + run: | + ENVS_JSON=$(uvx hatch env show --json | jq -c 'to_entries + | map(select(.key | startswith("hatch-test")) | { name: .key, python: .value.python })') + echo "envs=${ENVS_JSON}" | tee "$GITHUB_OUTPUT" + + test: + needs: get-environments + name: ${{ matrix.env.name }} + runs-on: ubuntu-latest + permissions: + contents: write # commit pytest report back on scheduled/manual runs + id-token: write # codecov OIDC strategy: fail-fast: false matrix: - python: ["3.11", "3.12"] + env: ${{ fromJSON(needs.get-environments.outputs.envs) }} steps: - name: Checkout branch @@ -39,64 +74,74 @@ jobs: with: fetch-depth: 0 - - name: Setup python - uses: actions/setup-python@v5 + - name: Install uv + uses: astral-sh/setup-uv@v7 with: - python-version: ${{ matrix.python }} - - - name: Install dependencies - run: | - python -m pip install -r requirements.txt - python -m pip install -r dev-requirements.txt - - # Push behavior: - # run pytest normally and let this step fail the job immediately if tests fail. 
- - name: Run tests on push - if: github.event_name == 'push' - run: | - pytest -ra -v --tb=long --durations=10 \ - --cov=gget --cov-report=term-missing tests - - # Scheduled/manual behavior: - # run pytest, save full output to a file, and capture the real pytest exit code. - # - # Important: - # - GitHub bash shells may stop on non-zero commands before later lines run. - # - We temporarily disable errexit with "set +e" so failed tests do not prevent - # us from recording PIPESTATUS[0] and writing it to GITHUB_OUTPUT. - # - continue-on-error keeps later artifact/commit steps running. - - name: Run tests and save output for scheduled/manual runs + python-version: ${{ matrix.env.python }} + + # Builds the environment (project + test dependency-group, plus the + # cellxgene extra only where pyproject says it is available). + - name: Create hatch test environment + run: uvx hatch env create ${{ matrix.env.name }} + + # Push/PR: run tests and fail the job immediately on test failure. + - name: Run tests (push / pull_request) + if: github.event_name == 'push' || github.event_name == 'pull_request' + env: + MPLBACKEND: agg + run: uvx hatch run ${{ matrix.env.name }}:run-cov -ra -v --durations=10 + + # Scheduled/manual: save full output to a file and capture the real pytest + # exit code. "set +e" keeps a test failure from preventing the exit-code / + # artifact / report-commit handling below; continue-on-error does the same + # at the step level. 
+ - name: Run tests and save output (schedule / workflow_dispatch) id: pytest_saved if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' continue-on-error: true shell: bash + env: + MPLBACKEND: agg run: | set -o pipefail - OUT="tests/pytest_results_py${{ matrix.python }}.txt" - echo "Pytest results (Python ${{ matrix.python }}) - $(date -u +"%Y-%m-%dT%H:%M:%SZ")" > "$OUT" + OUT="tests/pytest_results_py${{ matrix.env.python }}.txt" + echo "Pytest results (Python ${{ matrix.env.python }}) - $(date -u +"%Y-%m-%dT%H:%M:%SZ")" > "$OUT" echo "" >> "$OUT" set +e - pytest -ra -v --tb=long --durations=10 \ - --cov=gget --cov-report=term-missing tests 2>&1 | tee -a "$OUT" + uvx hatch run ${{ matrix.env.name }}:run-cov -ra -v --durations=10 2>&1 | tee -a "$OUT" code=${PIPESTATUS[0]} set -e echo "exit_code=$code" >> "$GITHUB_OUTPUT" echo "pytest exit code: $code" - - # Do not fail here; a later step fails the job after artifacts/report handling. exit 0 + # Coverage upload is best-effort: a failure here must not mask the test + # result (which is handled by the steps above/below). + - name: Generate coverage report + if: always() + continue-on-error: true + run: | + test -f .coverage || uvx hatch run ${{ matrix.env.name }}:cov-combine + uvx hatch run ${{ matrix.env.name }}:coverage xml + + - name: Upload coverage to Codecov + if: always() + uses: codecov/codecov-action@v6 + with: + use_oidc: true + fail_ci_if_error: false + # Upload the saved pytest report as an artifact. - # Only do this once (3.12) to avoid duplicate artifacts from the matrix. + # Only once (3.12) to avoid duplicate artifacts from the matrix. 
- name: Upload pytest results artifact - if: always() && matrix.python == '3.12' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') + if: always() && matrix.env.python == '3.12' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') uses: actions/upload-artifact@v4 with: - name: pytest-results-py${{ matrix.python }} - path: tests/pytest_results_py${{ matrix.python }}.txt + name: pytest-results-py${{ matrix.env.python }} + path: tests/pytest_results_py${{ matrix.env.python }}.txt # Commit the saved pytest report back to the repository. # Safety guards: @@ -107,27 +152,27 @@ jobs: - name: Commit and push pytest results if: > always() && - matrix.python == '3.12' && + matrix.env.python == '3.12' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') shell: bash run: | set -euo pipefail - + BRANCH="${GITHUB_REF#refs/heads/}" echo "Current branch: $BRANCH" - + git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" - + git add tests/pytest_results_py*.txt - + if git diff --cached --quiet; then echo "No changes to commit." exit 0 fi - + git commit -m "CI: update pytest results ($BRANCH)" - + for attempt in 1 2 3 4 5; do echo "Push attempt $attempt..." git pull --rebase --autostash origin "$BRANCH" || true @@ -136,13 +181,12 @@ jobs: fi sleep $((attempt * 5)) done - + echo "Push failed after retries." exit 1 # After scheduled/manual runs, explicitly fail the job if pytest failed. - # This step is separate so that artifact upload and report commit can still happen - # even when tests fail. + # Separate so that artifact upload and report commit still happen on failure. 
- name: Fail job if pytest failed if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') shell: bash @@ -150,7 +194,6 @@ jobs: code="${{ steps.pytest_saved.outputs.exit_code }}" echo "Captured pytest exit code: ${code:-}" - # Missing output means something went wrong before exit code capture. if [ -z "${code:-}" ]; then echo "pytest exit code was not captured" exit 1 @@ -159,3 +202,17 @@ jobs: if [ "$code" != "0" ]; then exit "$code" fi + + # Single gate job so branch protection can require one stable check name + # instead of every matrix entry. See https://github.com/re-actors/alls-green. + check: + name: Tests pass + if: always() + needs: + - get-environments + - test + runs-on: ubuntu-latest + steps: + - uses: re-actors/alls-green@release/v1 + with: + jobs: ${{ toJSON(needs) }} diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 220a8f49d..46da29aba 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -3,7 +3,7 @@ on: push: branches: - main - + paths: - 'docs/**' diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 000000000..89decb39c --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,26 @@ +name: Release + +on: + release: + types: [published] + +# Use "trusted publishing", see https://docs.pypi.org/trusted-publishers/ +jobs: + release: + name: Upload release to PyPI + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/gget + permissions: + id-token: write # IMPORTANT: this permission is mandatory for trusted publishing + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Install uv + uses: astral-sh/setup-uv@v7 + - name: Build package + run: uv build + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/traffic.yml b/.github/workflows/traffic.yml index 29da6d30b..3d578671c 100644 --- 
a/.github/workflows/traffic.yml +++ b/.github/workflows/traffic.yml @@ -1,9 +1,9 @@ name: Repo Traffic Back Up on: - schedule: + schedule: # Runs every week - cron: "0 0 */7 * *" - + jobs: # This workflow stores repository traffic and clones past the default 2 week period traffic: @@ -15,14 +15,14 @@ jobs: - uses: actions/checkout@v2 with: ref: "traffic" - + # Calculates traffic and clones and stores them in a CSV file # This workflow is based on https://github.com/marketplace/actions/repository-traffic - - name: GitHub traffic + - name: GitHub traffic uses: sangonzal/repository-traffic-action@v.0.1.6 env: - TRAFFIC_ACTION_TOKEN: ${{ secrets.TRAFFIC_ACTION_TOKEN }} - + TRAFFIC_ACTION_TOKEN: ${{ secrets.TRAFFIC_ACTION_TOKEN }} + # Commits files to traffic branch - name: Commit changes uses: EndBug/add-and-commit@v4 diff --git a/.github/workflows/translate_docs.yml b/.github/workflows/translate_docs.yml index 583639ec6..656d9277c 100644 --- a/.github/workflows/translate_docs.yml +++ b/.github/workflows/translate_docs.yml @@ -5,6 +5,9 @@ on: branches: [main] paths: - 'docs/src/en/**' + # The English contributing page is an mdbook include of this file, so a + # change here must regenerate docs/src/es/contributing.md (see translate_docs.py). + - 'CONTRIBUTING.md' workflow_dispatch: # Only one translation run at a time; new pushes cancel in-progress runs. 
diff --git a/.gitignore b/.gitignore index 1e8e1f52b..c4bfe26d7 100644 --- a/.gitignore +++ b/.gitignore @@ -144,3 +144,5 @@ dmypy.json # VSCode settings .vscode/ +# uv lockfile (library: resolve fresh; hatch CI manages its own envs) +uv.lock diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..31447cbf5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,49 @@ +fail_fast: false +default_language_version: + python: python3 +default_stages: + - pre-commit + - pre-push +minimum_pre_commit_version: 2.16.0 +# Never reformat vendored binaries, bundled package data, test reference +# fixtures, or the auto-generated CI report — these are content/data whose +# exact bytes matter (CRLF in .pdb fixtures, exact-match test inputs, ...). +exclude: | + (?x)^( + gget/bins/ + | gget/constants/ + | tests/fixtures/ + | tests/pytest_results_py.*\.txt + ) +repos: + - repo: https://github.com/biomejs/pre-commit + rev: v2.4.16 + hooks: + - id: biome-format + - repo: https://github.com/tox-dev/pyproject-fmt + rev: v2.23.0 + hooks: + - id: pyproject-fmt + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.15.15 + hooks: + - id: ruff-check + types_or: [python, pyi, jupyter] + args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format + types_or: [python, pyi, jupyter] + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: detect-private-key + - id: check-ast + - id: end-of-file-fixer + - id: mixed-line-ending + args: [--fix=lf] + - id: trailing-whitespace + # Preserve Markdown hard line breaks (trailing double-space) in docs. 
+ args: [--markdown-linebreak-ext=md] + - id: check-case-conflict + # Check that there are no merge conflicts (could be generated by template sync) + - id: check-merge-conflict + args: [--assume-in-merge] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 01fe0ef36..feb45069f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -40,10 +40,10 @@ Commit the changes once you are happy with them. 1. Review the content for technical accuracy. 2. Copy-edit the changes/comments for grammar, spelling, and adherence to the general style of existing gget code. -3. Format your code using [black](https://black.readthedocs.io/en/stable/getting_started.html). +3. Format and lint your code with [pre-commit](https://pre-commit.com/) (powered by [ruff](https://docs.astral.sh/ruff/)). Install the hooks once with `prek install` (or `pre-commit install`) so they run automatically on every commit, or run them on demand with `prek run --all-files` (or `pre-commit run --all-files`). 4. Make sure the unit tests pass: - - Developer dependencies can be installed with `pip install -r dev-requirements.txt` - - Run existing unit tests from the gget repository root with `coverage run -m pytest -ra -v tests && coverage report --omit=main.py,tests*` + - The tested environments are defined in `pyproject.toml` under `[tool.hatch.envs.hatch-test]` (the single source of truth used by CI). Run the full matrix with `uvx hatch test`. + - For a quick single-environment run, install the test dependencies with `uv sync --group test` and run `uv run pytest -ra -v --cov=gget --cov-report=term-missing tests`. To also exercise the `gget cellxgene` module, install its extra (`uv sync --group test --extra cellxgene`) on Python 3.12/3.13 — its dependency has no wheels for newer Python versions yet, and that test skips itself when the dependency is absent. 5. 
Add new unit tests if applicable: - Arguments and expected results are stored in json files in ./tests/fixtures/ - Unit tests can be added to ./tests/test_*.py and will be automatically detected @@ -59,7 +59,7 @@ If you have any questions, feel free to start a [discussion](https://github.com/ When you're finished with the changes, [create a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request), also known as a PR. -‼️ Please make all PRs against the `dev` branch of the gget repository. +‼️ Please make all PRs against the `dev` branch of the gget repository. - Don't forget to [link PR to issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) if you are solving one. - Enable the checkbox to [allow maintainer edits](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/allowing-changes-to-a-pull-request-branch-created-from-a-fork) so the branch can be updated for a merge. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 85c2d56eb..000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -include LICENSE -include requirements.txt -recursive-include gget/bins * -recursive-include gget/constants * \ No newline at end of file diff --git a/README.md b/README.md index 945026107..119f6af0e 100644 --- a/README.md +++ b/README.md @@ -12,10 +12,10 @@ `gget` is part of the [scverse®](https://scverse.org) project and is fiscally sponsored by [NumFOCUS](https://numfocus.org/). If you like `gget` and want to support our mission, please consider making a tax-deductible [donation](https://opencollective.com/scverse/projects/scverse-gget/donate?interval=oneTime&amount=20&contributeAs=me).
- + ![alt text](https://github.com/pachterlab/gget/blob/main/figures/gget_overview.png?raw=true) - -If you use `gget` in a publication, please [cite*](https://pachterlab.github.io/gget/en/cite.html): + +If you use `gget` in a publication, please [cite*](https://pachterlab.github.io/gget/en/cite.html): ``` Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. https://doi.org/10.1093/bioinformatics/btac836 ``` @@ -42,7 +42,7 @@ For use in Jupyter Lab / Google Colab: # Python import gget ``` -# [🔗 Manual](https://pachterlab.github.io/gget) +# [🔗 Manual](https://pachterlab.github.io/gget) # 🪄 Quick start guide Command line: diff --git a/dev-requirements.txt b/dev-requirements.txt deleted file mode 100644 index d3f679228..000000000 --- a/dev-requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -coverage>=5.1 -pytest>=7.0.0 -pytest-cov>=6.2.1 -openai<=0.28.1 -cellxgene-census -parameterized==0.9.0 -bravado==11.0.3 diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md index 9ccbe4bed..ca34b5f9c 100644 --- a/docs/src/SUMMARY.md +++ b/docs/src/SUMMARY.md @@ -9,7 +9,7 @@ * [Quick Start Guide](en/quick_start_guide.md) # Manual -* [gget 8cube](en/8cube.md) +* [gget 8cube](en/8cube.md) * [gget alphafold](en/alphafold.md) * [gget archs4](en/archs4.md) * [gget bgee](en/bgee.md) @@ -25,12 +25,12 @@ * [gget info](en/info.md) * [gget muscle](en/muscle.md) * [gget mutate](en/mutate.md) -* [gget opentargets](en/opentargets.md) +* [gget opentargets](en/opentargets.md) * [gget pdb](en/pdb.md) * [gget ref](en/ref.md) * [gget search](en/search.md) * [gget setup](en/setup.md) -* [gget seq](en/seq.md) +* [gget seq](en/seq.md) * [gget virus](en/virus.md) --- @@ -44,14 +44,14 @@ # Español * [Introdución](es/introduction.md) * [¡Lo más reciente!](es/updates.md) -* [Dependientes y Noticias](es/dependents.md) +* [Dependientes y Noticias](es/dependents.md) # Guía del usario * [Instalación](es/installation.md) * [Guía de inicio 
rápido](es/quick_start_guide.md) # Manuál -* [gget 8cube](es/8cube.md) +* [gget 8cube](es/8cube.md) * [gget alphafold](es/alphafold.md) * [gget archs4](es/archs4.md) * [gget bgee](es/bgee.md) @@ -67,7 +67,7 @@ * [gget info](es/info.md) * [gget muscle](es/muscle.md) * [gget mutate](es/mutate.md) -* [gget opentargets](es/opentargets.md) +* [gget opentargets](es/opentargets.md) * [gget pdb](es/pdb.md) * [gget ref](es/ref.md) * [gget search](es/search.md) @@ -80,4 +80,3 @@ * [Guía de contribución](es/contributing.md) * [Codigo de conducto](es/code_of_conduct.md) * [Cómo citar](es/cite.md) - diff --git a/docs/src/en/8cube.md b/docs/src/en/8cube.md index b7dbf9e32..684ec7512 100644 --- a/docs/src/en/8cube.md +++ b/docs/src/en/8cube.md @@ -36,7 +36,7 @@ Gene symbols or Ensembl gene IDs. Multiple genes allowed. **Optional arguments** `-csv` `--csv` -Returns CSV instead of JSON (command-line only). +Returns CSV instead of JSON (command-line only). Python: Use `json=False` (default DataFrame) or `json=True` for JSON. `-o` `--out` diff --git a/docs/src/en/alphafold.md b/docs/src/en/alphafold.md index 27c26403f..b647e8936 100644 --- a/docs/src/en/alphafold.md +++ b/docs/src/en/alphafold.md @@ -13,10 +13,10 @@ Before using `gget alphafold` for the first time: `conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0` For Python version 3.11: `conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0` - + Recommendation: Follow with `conda update -qy conda` to update conda to the latest version afterwards. - -3. Run `gget setup alphafold` / `gget.setup("alphafold")` once (also see [`gget setup`](setup.md)). Running `gget setup alphafold` / `gget.setup("alphafold")` will download and install the latest version of AlphaFold2 hosted on the [AlphaFold GitHub Repo](https://github.com/deepmind/alphafold). You can rerun this command any time to update the software after a new AlphaFold release. + +3. 
Run `gget setup alphafold` / `gget.setup("alphafold")` once (also see [`gget setup`](setup.md)). Running `gget setup alphafold` / `gget.setup("alphafold")` will download and install the latest version of AlphaFold2 hosted on the [AlphaFold GitHub Repo](https://github.com/deepmind/alphafold). You can rerun this command any time to update the software after a new AlphaFold release. **Positional argument** `sequence` @@ -27,27 +27,27 @@ Amino acid sequence (str), or list of sequences (*gget alphafold will automatica The multimer model will continue recycling until the predictions stop changing, up to the limit set here. Default: 3. For higher accuracy, at the potential cost of longer inference times, set this to 20. -`-o` `--out` +`-o` `--out` Path to folder to save prediction results in (str). Default: "./[date_time]_gget_alphafold_prediction". - -**Flags** + +**Flags** `-mfm` `--multimer_for_monomer` Use multimer model for a monomer. -`-r` `--relax` -AMBER relax the best model. +`-r` `--relax` +AMBER relax the best model. -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. -Python: Use `verbose=False` to prevent progress information from being displayed. +Python: Use `verbose=False` to prevent progress information from being displayed. `plot` Python only. `plot=True` provides an interactive, 3D graphical overview of the predicted structure and alignment quality using [py3Dmol](https://pypi.org/project/py3Dmol/) and [matplotlib](https://matplotlib.org/) (default: True). `show_sidechains` Python only. `show_sidechains=True` includes side chains in the plot (default: True). 
- - + + ### Example ```bash # Generate new prediction from amino acid sequence @@ -83,7 +83,7 @@ gget.pdb("2K42", save=True) ### [🔗 gget alphafold FAQ](https://github.com/pachterlab/gget/discussions/39) # References -If you use `gget alphafold` in a publication, please cite the following articles: +If you use `gget alphafold` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/en/archs4.md b/docs/src/en/archs4.md index 373b1b7e2..23371433e 100644 --- a/docs/src/en/archs4.md +++ b/docs/src/en/archs4.md @@ -17,15 +17,15 @@ Alternatively: use flag `--ensembl` to input an Ensembl gene IDs, e.g. ENSG00000 'tissue' returns a tissue expression atlas calculated from human or mouse samples (as defined by 'species') in [ARCHS4](https://maayanlab.cloud/archs4/). `-s` `--species` -'human' (default) or 'mouse'. +'human' (default) or 'mouse'. Defines whether to use human or mouse samples from [ARCHS4](https://maayanlab.cloud/archs4/). (Only for tissue expression atlas.) -`-o` `--out` -Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out. +`-o` `--out` +Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out. Python: `save=True` will save the output in the current working directory. - -**Flags** + +**Flags** `-e` `--ensembl` Add this flag if `gene` is given as an Ensembl gene ID. @@ -33,11 +33,11 @@ Add this flag if `gene` is given as an Ensembl gene ID. Command-line only. Returns results in CSV format. Python: Use `json=True` to return output in JSON format. -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. 
-Python: Use `verbose=False` to prevent progress information from being displayed. - - +Python: Use `verbose=False` to prevent progress information from being displayed. + + ### Examples ```bash gget archs4 ACE2 @@ -49,10 +49,10 @@ gget.archs4("ACE2") → Returns the 100 most correlated genes to ACE2: | gene_symbol | pearson_correlation | -| -------------- |-------------------------| -| SLC5A1 | 0.579634 | -| CYP2C18 | 0.576577 | -| . . . | . . . | +| -------------- |-------------------------| +| SLC5A1 | 0.579634 | +| CYP2C18 | 0.576577 | +| . . . | . . . |

@@ -66,9 +66,9 @@ gget.archs4("ACE2", which="tissue") → Returns the tissue expression of ACE2 (by default, human data is used): | id | min | q1 | median | q3 | max | -| ------ |--------| ------ |--------| ------ |--------| +| ------ |--------| ------ |--------| ------ |--------| | System.Urogenital/Reproductive System.Kidney.RENAL CORTEX | 0.113644 | 8.274060 | 9.695840 | 10.51670 | 11.21970 | -| System.Digestive System.Intestine.INTESTINAL EPITHELIAL CELL | 0.113644 | 5.905560 | 9.570450 | 13.26470 | 13.83590 | +| System.Digestive System.Intestine.INTESTINAL EPITHELIAL CELL | 0.113644 | 5.905560 | 9.570450 | 13.26470 | 13.83590 | | . . . | . . . | . . . | . . . | . . . | . . . |

@@ -80,7 +80,7 @@ Check out [this tutorial](https://davetang.org/muse/2023/05/16/check-where-a-gen #### [More examples](https://github.com/pachterlab/gget_examples) # References -If you use `gget archs4` in a publication, please cite the following articles: +If you use `gget archs4` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/en/bgee.md b/docs/src/en/bgee.md index 6ea2b7833..7a49ee457 100644 --- a/docs/src/en/bgee.md +++ b/docs/src/en/bgee.md @@ -2,7 +2,7 @@ > Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag. # gget bgee 🐝 -Fetch orthology and gene expression data from [Bgee](https://www.bgee.org/) using Ensembl IDs. +Fetch orthology and gene expression data from [Bgee](https://www.bgee.org/) using Ensembl IDs. Return format: JSON/CSV (command-line) or data frame (Python). > If you are specifically interested in human gene expression data, consider using [gget opentargets](./opentargets.md) or [gget archs4](./archs4.md) instead. @@ -21,19 +21,19 @@ NOTE: Some of the species in [Bgee](https://www.bgee.org/) are not in Ensembl or `-t` `--type` Type of data to fetch. Options: `orthologs` (default), `expression`. -`-o` `--out` +`-o` `--out` Path to the JSON file the results will be saved in, e.g. path/to/directory/results.json. Default: Standard out. -**Flags** +**Flags** `-csv` `--csv` Command-line only. Returns the output in CSV format, instead of JSON format. Python: Use `json=True` to return output in JSON format. -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. 
Python: Use `verbose=False` to prevent progress information from being displayed. - - + + ### Examples **Get orthologs for a gene** @@ -102,11 +102,11 @@ gget.bgee(["ENSBTAG00000047356", "ENSBTAG00000018317"], type="expression") | BGEE:0000000 | anatomical entity and cellular component | 89.12 | high| expressed | | ... | ... | ... | ... | ... | - + #### [More examples](https://github.com/pachterlab/gget_examples) # References -If you use `gget bgee` in a publication, please cite the following articles: +If you use `gget bgee` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/en/blast.md b/docs/src/en/blast.md index 10c7ad2fa..4aee7a527 100644 --- a/docs/src/en/blast.md +++ b/docs/src/en/blast.md @@ -6,7 +6,7 @@ BLAST a nucleotide or amino acid sequence to any [BLAST](https://blast.ncbi.nlm. Return format: JSON (command-line) or data frame/CSV (Python). **Positional argument** -`sequence` +`sequence` Nucleotide or amino acid sequence, or path to FASTA or .txt file. **Optional arguments** @@ -25,8 +25,8 @@ Limits number of hits to return. Default: 50. `-e` `--expect` Defines the [expect value](https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=FAQ#expect) cutoff. Default: 10.0. -`-o` `--out` -Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out. +`-o` `--out` +Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out. Python: `save=True` will save the output in the current working directory. **Flags** @@ -40,13 +40,13 @@ Turns off MegaBLAST algorithm. Default: MegaBLAST on (blastn only). Command-line only. Returns results in CSV format. 
Python: Use `json=True` to return output in JSON format. -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. Python: Use `verbose=False` to prevent progress information from being displayed. `wrap_text` -Python only. `wrap_text=True` displays data frame with wrapped text for easy reading (default: False). - +Python only. `wrap_text=True` displays data frame with wrapped text for easy reading (default: False). + ### Example ```bash gget blast MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQFMWIIRKRIQLPSEKAIFLFVDKTVPQSR @@ -60,7 +60,7 @@ gget.blast("MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQFMWIIRKRI | Description | Scientific Name | Common Name | Taxid | Max Score | Total Score | Query Cover | ... | | -------------- |-------------------------| ------------------------| -------------- | ----------|-----|---|---| | PREDICTED: gamma-aminobutyric acid receptor-as...| Colobus angolensis palliatus | NaN | 336983 | 180 | 180 | 100% | ... | -| . . . | . . . | . . . | . . . | . . . | . . . | . . . | ... | +| . . . | . . . | . . . | . . . | . . . | . . . | . . . | ... |

**BLAST from .fa or .txt file:** @@ -71,12 +71,12 @@ gget blast fasta.fa # Python gget.blast("fasta.fa") ``` -→ Returns the BLAST results of the first sequence contained in the fasta.fa file. +→ Returns the BLAST results of the first sequence contained in the fasta.fa file. #### [More examples](https://github.com/pachterlab/gget_examples) # References -If you use `gget blast` in a publication, please cite the following articles: +If you use `gget blast` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/en/blat.md b/docs/src/en/blat.md index 169784c87..8f64580e3 100644 --- a/docs/src/en/blat.md +++ b/docs/src/en/blat.md @@ -2,34 +2,34 @@ > Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag. # gget blat 🎯 -Find the genomic location of a nucleotide or amino acid sequence using [BLAT](https://genome.ucsc.edu/cgi-bin/hgBlat). +Find the genomic location of a nucleotide or amino acid sequence using [BLAT](https://genome.ucsc.edu/cgi-bin/hgBlat). Return format: JSON (command-line) or data frame/CSV (Python). **Positional argument** -`sequence` +`sequence` Nucleotide or amino acid sequence, or path to FASTA or .txt file. **Optional arguments** -`-st` `--seqtype` -'DNA', 'protein', 'translated%20RNA', or 'translated%20DNA'. +`-st` `--seqtype` +'DNA', 'protein', 'translated%20RNA', or 'translated%20DNA'. Default: 'DNA' for nucleotide sequences; 'protein' for amino acid sequences. 
`-a` `--assembly` -'human' (hg38) (default), 'mouse' (mm39), 'zebrafinch' (taeGut2), +'human' (hg38) (default), 'mouse' (mm39), 'zebrafinch' (taeGut2), or any of the species assemblies available [here](https://genome.ucsc.edu/cgi-bin/hgBlat) (use short assembly name). -`-o` `--out` -Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out. +`-o` `--out` +Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out. Python: `save=True` will save the output in the current working directory. - + **Flags** `-csv` `--csv` Command-line only. Returns results in CSV format. Python: Use `json=True` to return output in JSON format. -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. -Python: Use `verbose=False` to prevent progress information from being displayed. +Python: Use `verbose=False` to prevent progress information from being displayed. ### Example @@ -49,9 +49,8 @@ gget.blat("MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQFMWIIRKRIQ #### [More examples](https://github.com/pachterlab/gget_examples) # References -If you use `gget blat` in a publication, please cite the following articles: +If you use `gget blat` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - Kent WJ. BLAT--the BLAST-like alignment tool. Genome Res. 2002 Apr;12(4):656-64. doi: 10.1101/gr.229202. PMID: 11932250; PMCID: PMC187518. 
- diff --git a/docs/src/en/cbio.md b/docs/src/en/cbio.md index d8b6c224d..da70c8e3f 100644 --- a/docs/src/en/cbio.md +++ b/docs/src/en/cbio.md @@ -4,18 +4,18 @@ # gget cbio 📖 Plot cancer genomics heatmaps using data from [cBioPortal](https://www.cbioportal.org/) using Ensembl IDs or gene names. -This module was written by [Sam Wagenaar](https://github.com/techno-sam). +This module was written by [Sam Wagenaar](https://github.com/techno-sam). **Positional argument** `subcommand` Either `search` or `plot` ### `search` subcommand (Python: `gget.cbio_search`) -Find cBioPortal study IDs by keyword. -Return format: JSON (command-line) or string list (Python). +Find cBioPortal study IDs by keyword. +Return format: JSON (command-line) or string list (Python). **Note: This does not return studies with mixed cancer types.** -**Positional argument** +**Positional argument** `keywords` Space-separated list of keywords to search for, e.g. breast lung. Python: Pass keywords as a list of strings. @@ -25,14 +25,14 @@ Plot cancer genomics heatmaps using data from cBioPortal. Return format: PNG (command-line and Python) **Required arguments** -`-s` `--study_ids` +`-s` `--study_ids` Space-separated list of cBioPortal study IDs, e.g. msk_impact_2017 egc_msk_2023. `-g` `--genes` Space-separated list of gene names or Ensembl IDs, e.g. NOTCH3 ENSG00000108375. **Optional arguments** -`-st` `--stratification` +`-st` `--stratification` Column to stratify the data by. Default: `tissue`. Options: - tissue @@ -41,8 +41,8 @@ Options: - study_id - sample -`-vt` `--variation_type` -Type of variation to plot. Default: `mutation_occurrences`. +`-vt` `--variation_type` +Type of variation to plot. Default: `mutation_occurrences`. 
Options: - mutation_occurrences - cna_nonbinary (Note: `stratification` must be 'sample' for this option) @@ -50,18 +50,18 @@ Options: - cna_occurrences - Consequence (Note: `stratification` must be 'sample' for this option) -`-f` `--filter` -Filter the data by a specific value in a specific column, e.g. `study_id:msk_impact_2017` +`-f` `--filter` +Filter the data by a specific value in a specific column, e.g. `study_id:msk_impact_2017` Python: `filter=(column, value)` `-dd` `--data_dir` Directory to store data files. Default: `./gget_cbio_cache`. -`-fd` `--figure_dir` +`-fd` `--figure_dir` Directory to output figures. Default: `./gget_cbio_figures`. `-fn` `--filename` -Filename for the output figure, relative to `figure_dir`. Default: auto-generated +Filename for the output figure, relative to `figure_dir`. Default: auto-generated Python: `figure_filename` `-t` `--title` @@ -71,23 +71,23 @@ Python: `figure_title` `-dpi` `--dpi` DPI of the output figure. Default: 100. -**Flags** +**Flags** -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. -Python: Use `verbose=False` to prevent progress information from being displayed. +Python: Use `verbose=False` to prevent progress information from being displayed. -`-nc` `--no_confirm` +`-nc` `--no_confirm` Command-line only. Skip download confirmation prompts. Python: Use `confirm_download=True` to enable download confirmation prompts. `-sh` `--show` Show the plot in a window (automatic in Jupyter notebooks). - - + + ### Examples -**Find all cBioPortal studies with cancer types matching specific keywords:** +**Find all cBioPortal studies with cancer types matching specific keywords:** ```bash gget cbio search esophag ovary ovarian ``` @@ -104,7 +104,7 @@ gget.cbio_search(['esophag', 'ovary', 'ovarian'])

-**Plot a heatmap of mutation occurrences for specific genes in a specific study:** +**Plot a heatmap of mutation occurrences for specific genes in a specific study:** ```bash gget cbio plot \ -s msk_impact_2017 \ @@ -131,7 +131,7 @@ gget.cbio_plot(

-**Plot a heatmap of mutation types for specific genes in a specific study:** +**Plot a heatmap of mutation types for specific genes in a specific study:** ```bash gget cbio plot \ -s msk_impact_2017 \ @@ -217,19 +217,18 @@ gget.cbio_plot( → Saves a heatmap of mutation types for the specified genes in the specified study, filtered by tissue, with the title "Intestinal Mutations" to `./gget_cbio_figures/intestinal_mutations.png`. ![Heatmap](https://raw.githubusercontent.com/pachterlab/gget/b32c01efefd55d37c19034ce96a86826e30ae3e5/docs/assets/gget_cbio_figure_4.png) - + #### [More examples](https://github.com/pachterlab/gget_examples) # References -If you use `gget cbio` in a publication, please cite the following articles: +If you use `gget cbio` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - Cerami E, Gao J, Dogrusoz U, Gross BE, Sumer SO, Aksoy BA, Jacobsen A, Byrne CJ, Heuer ML, Larsson E, Antipin Y, Reva B, Goldberg AP, Sander C, Schultz N. The cBio cancer genomics portal: an open platform for exploring multidimensional cancer genomics data. Cancer Discov. 2012 May;2(5):401-4. doi: [10.1158/2159-8290.CD-12-0095](https://doi.org/10.1158/2159-8290.cd-12-0095). Erratum in: Cancer Discov. 2012 Oct;2(10):960. PMID: 22588877; PMCID: PMC3956037. - + - Gao J, Aksoy BA, Dogrusoz U, Dresdner G, Gross B, Sumer SO, Sun Y, Jacobsen A, Sinha R, Larsson E, Cerami E, Sander C, Schultz N. Integrative analysis of complex cancer genomics and clinical profiles using the cBioPortal. Sci Signal. 2013 Apr 2;6(269):pl1. doi: [10.1126/scisignal.2004088](https://doi.org/10.1126/scisignal.2004088). PMID: 23550210; PMCID: PMC4160307. 
- + - de Bruijn I, Kundra R, Mastrogiacomo B, Tran TN, Sikina L, Mazor T, Li X, Ochoa A, Zhao G, Lai B, Abeshouse A, Baiceanu D, Ciftci E, Dogrusoz U, Dufilie A, Erkoc Z, Garcia Lara E, Fu Z, Gross B, Haynes C, Heath A, Higgins D, Jagannathan P, Kalletla K, Kumari P, Lindsay J, Lisman A, Leenknegt B, Lukasse P, Madela D, Madupuri R, van Nierop P, Plantalech O, Quach J, Resnick AC, Rodenburg SYA, Satravada BA, Schaeffer F, Sheridan R, Singh J, Sirohi R, Sumer SO, van Hagen S, Wang A, Wilson M, Zhang H, Zhu K, Rusk N, Brown S, Lavery JA, Panageas KS, Rudolph JE, LeNoue-Newton ML, Warner JL, Guo X, Hunter-Zinck H, Yu TV, Pilai S, Nichols C, Gardos SM, Philip J; AACR Project GENIE BPC Core Team, AACR Project GENIE Consortium; Kehl KL, Riely GJ, Schrag D, Lee J, Fiandalo MV, Sweeney SM, Pugh TJ, Sander C, Cerami E, Gao J, Schultz N. Analysis and Visualization of Longitudinal Genomic and Clinical Data from the AACR Project GENIE Biopharma Collaborative in cBioPortal. Cancer Res. 2023 Dec 1;83(23):3861-3867. doi: [10.1158/0008-5472.CAN-23-0816](https://doi.org/10.1158/0008-5472.CAN-23-0816). PMID: 37668528; PMCID: PMC10690089. - -- Please also cite the source of the data if you are using a publicly available dataset. +- Please also cite the source of the data if you are using a publicly available dataset. diff --git a/docs/src/en/cellxgene.md b/docs/src/en/cellxgene.md index 018175d34..44b137679 100644 --- a/docs/src/en/cellxgene.md +++ b/docs/src/en/cellxgene.md @@ -2,7 +2,7 @@ > Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag. # gget cellxgene 🍱 -Query data from [CZ CELLxGENE Discover](https://cellxgene.cziscience.com/) using the [CZ CELLxGENE Discover Census](https://github.com/chanzuckerberg/cellxgene-census). 
[CZ CELLxGENE Discover](https://cellxgene.cziscience.com/) provides ready-to-use single-cell RNA sequencing count matrices for certain tissues/diseases/genes/etc. +Query data from [CZ CELLxGENE Discover](https://cellxgene.cziscience.com/) using the [CZ CELLxGENE Discover Census](https://github.com/chanzuckerberg/cellxgene-census). [CZ CELLxGENE Discover](https://cellxgene.cziscience.com/) provides ready-to-use single-cell RNA sequencing count matrices for certain tissues/diseases/genes/etc. Returns: An AnnData object containing the count matrix and metadata of single-cell RNA sequencing data from the defined tissues/genes/etc. @@ -15,7 +15,7 @@ Non-human primates ('macaca_mulatta', 'callithrix_jacchus', 'pan_troglodytes') r `-g` `--gene` Str or list of gene name(s) or Ensembl ID(s). Default: None. - NOTE: Use `-e / --ensembl` (Python: `ensembl=True`) when providing Ensembl ID(s) instead of gene name(s). + NOTE: Use `-e / --ensembl` (Python: `ensembl=True`) when providing Ensembl ID(s) instead of gene name(s). NOTE: Gene symbols are case sensitive! Use canonical casing when passing gene symbols, e.g., 'PAX7' (human), 'Pax7' (mouse). See https://cellxgene.cziscience.com/gene-expression for examples of available genes. @@ -27,7 +27,7 @@ List of metadata columns to return (stored in AnnData.obs). Default: ['dataset_id', 'assay', 'suspension_type', 'sex', 'tissue_general', 'tissue', 'cell_type'] For more options, see: https://api.cellxgene.cziscience.com/curation/ui/#/ -> Schemas -> dataset -`-o` `--out` +`-o` `--out` Path to file to save generated AnnData .h5ad file (or .csv with `-mo / --meta_only`). Required when using from command line! @@ -38,7 +38,7 @@ Use when genes are provided as Ensembl IDs instead of gene names. `-mo` `--meta_only` Only returns metadata data frame (corresponds to AnnData.obs). -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. 
Python: Use `verbose=False` to prevent progress information from being displayed. @@ -104,7 +104,7 @@ Str or list of sex ontology ID(s) as defined in the [CELLxGENE dataset schema](h `--suspension_type` Str or list of suspension type(s) as defined in the [CELLxGENE dataset schema](https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema). Default: None. - + ### Examples ```bash gget cellxgene --gene ACE2 ABCA1 SLC5A1 --tissue lung --cell_type 'mucus secreting cell' 'neuroendocrine cell' -o example_adata.h5ad @@ -142,9 +142,8 @@ df Also see: [https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_gget_demo.html](https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_gget_demo.html) # References -If you use `gget cellxgene` in a publication, please cite the following articles: +If you use `gget cellxgene` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - Chanzuckerberg Initiative. (n.d.). CZ CELLxGENE Discover. Retrieved [insert date here], from [https://cellxgene.cziscience.com/](https://cellxgene.cziscience.com/) - diff --git a/docs/src/en/cite.md b/docs/src/en/cite.md index b090d2413..694d6172c 100644 --- a/docs/src/en/cite.md +++ b/docs/src/en/cite.md @@ -4,7 +4,7 @@ # Citation -If you use `gget` in a publication, please cite: +If you use `gget` in a publication, please cite: Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - If using `gget alphafold`, please also cite: @@ -13,32 +13,32 @@ Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference data And, if applicable: - Evans, R. et al. 
Protein complex prediction with AlphaFold-Multimer. bioRxiv 2021.10.04.463034; [https://doi.org/10.1101/2021.10.04.463034](https://doi.org/10.1101/2021.10.04.463034) -- If using `gget archs4`, please also cite: +- If using `gget archs4`, please also cite: - Lachmann A, Torre D, Keenan AB, Jagodnik KM, Lee HJ, Wang L, Silverstein MC, Ma’ayan A. Massive mining of publicly available RNA-seq data from human and mouse. Nature Communications 9. Article number: 1366 (2018), doi:10.1038/s41467-018-03751-6 - Bray NL, Pimentel H, Melsted P and Pachter L, Near optimal probabilistic RNA-seq quantification, Nature Biotechnology 34, p 525--527 (2016). [https://doi.org/10.1038/nbt.3519](https://doi.org/10.1038/nbt.3519) - If using `gget bgee`, please also cite: - Frederic B Bastian, Julien Roux, Anne Niknejad, Aurélie Comte, Sara S Fonseca Costa, Tarcisio Mendes de Farias, Sébastien Moretti, Gilles Parmentier, Valentine Rech de Laval, Marta Rosikiewicz, Julien Wollbrett, Amina Echchiki, Angélique Escoriza, Walid H Gharib, Mar Gonzales-Porta, Yohan Jarosz, Balazs Laurenczy, Philippe Moret, Emilie Person, Patrick Roelli, Komal Sanjeev, Mathieu Seppey, Marc Robinson-Rechavi (2021). The Bgee suite: integrated curated expression atlas and comparative transcriptomics in animals. Nucleic Acids Research, Volume 49, Issue D1, 8 January 2021, Pages D831–D847, [https://doi.org/10.1093/nar/gkaa793](https://doi.org/10.1093/nar/gkaa793) - + - If using `gget blast`, please also cite: - Altschul SF, Gish W, Miller W, Myers EW, Lipman DJ. Basic local alignment search tool. J Mol Biol. 1990 Oct 5;215(3):403-10. doi: 10.1016/S0022-2836(05)80360-2. PMID: 2231712. -- If using `gget blat`, please also cite: +- If using `gget blat`, please also cite: - Kent WJ. BLAT--the BLAST-like alignment tool. Genome Res. 2002 Apr;12(4):656-64. doi: 10.1101/gr.229202. PMID: 11932250; PMCID: PMC187518. 
- If using `gget cbio`, please also cite: - Cerami E, Gao J, Dogrusoz U, Gross BE, Sumer SO, Aksoy BA, Jacobsen A, Byrne CJ, Heuer ML, Larsson E, Antipin Y, Reva B, Goldberg AP, Sander C, Schultz N. The cBio cancer genomics portal: an open platform for exploring multidimensional cancer genomics data. Cancer Discov. 2012 May;2(5):401-4. doi: [10.1158/2159-8290.CD-12-0095](https://doi.org/10.1158/2159-8290.cd-12-0095). Erratum in: Cancer Discov. 2012 Oct;2(10):960. PMID: 22588877; PMCID: PMC3956037. - + - Gao J, Aksoy BA, Dogrusoz U, Dresdner G, Gross B, Sumer SO, Sun Y, Jacobsen A, Sinha R, Larsson E, Cerami E, Sander C, Schultz N. Integrative analysis of complex cancer genomics and clinical profiles using the cBioPortal. Sci Signal. 2013 Apr 2;6(269):pl1. doi: [10.1126/scisignal.2004088](https://doi.org/10.1126/scisignal.2004088). PMID: 23550210; PMCID: PMC4160307. - + - de Bruijn I, Kundra R, Mastrogiacomo B, Tran TN, Sikina L, Mazor T, Li X, Ochoa A, Zhao G, Lai B, Abeshouse A, Baiceanu D, Ciftci E, Dogrusoz U, Dufilie A, Erkoc Z, Garcia Lara E, Fu Z, Gross B, Haynes C, Heath A, Higgins D, Jagannathan P, Kalletla K, Kumari P, Lindsay J, Lisman A, Leenknegt B, Lukasse P, Madela D, Madupuri R, van Nierop P, Plantalech O, Quach J, Resnick AC, Rodenburg SYA, Satravada BA, Schaeffer F, Sheridan R, Singh J, Sirohi R, Sumer SO, van Hagen S, Wang A, Wilson M, Zhang H, Zhu K, Rusk N, Brown S, Lavery JA, Panageas KS, Rudolph JE, LeNoue-Newton ML, Warner JL, Guo X, Hunter-Zinck H, Yu TV, Pilai S, Nichols C, Gardos SM, Philip J; AACR Project GENIE BPC Core Team, AACR Project GENIE Consortium; Kehl KL, Riely GJ, Schrag D, Lee J, Fiandalo MV, Sweeney SM, Pugh TJ, Sander C, Cerami E, Gao J, Schultz N. Analysis and Visualization of Longitudinal Genomic and Clinical Data from the AACR Project GENIE Biopharma Collaborative in cBioPortal. Cancer Res. 2023 Dec 1;83(23):3861-3867. doi: [10.1158/0008-5472.CAN-23-0816](https://doi.org/10.1158/0008-5472.CAN-23-0816). 
PMID: 37668528; PMCID: PMC10690089. - + - Please also cite the source of the data if you are using a publicly available dataset. - + - If using `gget cellxgene`, please also cite: - Chanzuckerberg Initiative. (n.d.). CZ CELLxGENE Discover. Retrieved [insert date here], from [https://cellxgene.cziscience.com/](https://cellxgene.cziscience.com/) - + - If using `gget cosmic`, please also cite: - Tate JG, Bamford S, Jubb HC, Sondka Z, Beare DM, Bindal N, Boutselakis H, Cole CG, Creatore C, Dawson E, Fish P, Harsha B, Hathaway C, Jupe SC, Kok CY, Noble K, Ponting L, Ramshaw CC, Rye CE, Speedy HE, Stefancsik R, Thompson SL, Wang S, Ward S, Campbell PJ, Forbes SA. COSMIC: the Catalogue Of Somatic Mutations In Cancer. Nucleic Acids Res. 2019 Jan 8;47(D1):D941-D947. doi: [10.1093/nar/gky1015](https://doi.org/10.1093/nar/gky1015). PMID: 30371878; PMCID: PMC6323903. @@ -47,43 +47,43 @@ Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference data - If using `gget elm`, please also cite: - Laura Luebbert, Chi Hoang, Manjeet Kumar, Lior Pachter, Fast and scalable querying of eukaryotic linear motifs with gget elm, Bioinformatics, 2024, btae095, [https://doi.org/10.1093/bioinformatics/btae095](https://doi.org/10.1093/bioinformatics/btae095) - + - Manjeet Kumar, Sushama Michael, Jesús Alvarado-Valverde, Bálint Mészáros, Hugo Sámano‐Sánchez, András Zeke, Laszlo Dobson, Tamas Lazar, Mihkel Örd, Anurag Nagpal, Nazanin Farahi, Melanie Käser, Ramya Kraleti, Norman E Davey, Rita Pancsa, Lucía B Chemes, Toby J Gibson, The Eukaryotic Linear Motif resource: 2022 release, Nucleic Acids Research, Volume 50, Issue D1, 7 January 2022, Pages D497–D508, [https://doi.org/10.1093/nar/gkab975](https://doi.org/10.1093/nar/gkab975) - -- If using `gget enrichr`, please also cite: - - Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). 
[https://doi.org/10.1186/1471-2105-14-128 ](https://doi.org/10.1186/1471-2105-14-128) - Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377) +- If using `gget enrichr`, please also cite: + - Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). [https://doi.org/10.1186/1471-2105-14-128](https://doi.org/10.1186/1471-2105-14-128) + + - Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377) - Xie Z, Bailey A, Kuleshov MV, Clarke DJB., Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM, Jeon M, & Ma’ayan A. Gene set knowledge discovery with Enrichr. Current Protocols, 1, e90. 2021. doi: [10.1002/cpz1.90](https://doi.org/10.1002/cpz1.90). - + If working with non-human/mouse datasets, please also cite: - Kuleshov MV, Diaz JEL, Flamholz ZN, Keenan AB, Lachmann A, Wojciechowicz ML, Cagan RL, Ma'ayan A. modEnrichr: a suite of gene set enrichment analysis tools for model organisms. Nucleic Acids Res. 2019 Jul 2;47(W1):W183-W190. doi: [10.1093/nar/gkz347](https://doi.org/10.1093/nar/gkz347). PMID: 31069376; PMCID: PMC6602483.
- If using `gget info`, please also cite: - Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606. - + - Sayers EW, Beck J, Bolton EE, Brister JR, Chan J, Comeau DC, Connor R, DiCuccio M, Farrell CM, Feldgarden M, Fine AM, Funk K, Hatcher E, Hoeppner M, Kane M, Kannan S, Katz KS, Kelly C, Klimke W, Kim S, Kimchi A, Landrum M, Lathrop S, Lu Z, Malheiro A, Marchler-Bauer A, Murphy TD, Phan L, Prasad AB, Pujar S, Sawyer A, Schmieder E, Schneider VA, Schoch CL, Sharma S, Thibaud-Nissen F, Trawick BW, Venkatapathi T, Wang J, Pruitt KD, Sherry ST. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2024 Jan 5;52(D1):D33-D43. doi: [10.1093/nar/gkad1044](https://doi.org/10.1093/nar/gkad1044). PMID: 37994677; PMCID: PMC10767890. 
- + - The UniProt Consortium , UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, [https://doi.org/10.1093/nar/gkac1052](https://doi.org/10.1093/nar/gkac1052) - If using `gget muscle`, please also cite: - Edgar RC (2021), MUSCLE v5 enables improved estimates of phylogenetic tree confidence by ensemble bootstrapping, bioRxiv 2021.06.20.449169. [https://doi.org/10.1101/2021.06.20.449169](https://doi.org/10.1101/2021.06.20.449169) - + - If using `gget opentargets`, please also cite: - Ochoa D, Hercules A, Carmona M, Suveges D, Baker J, Malangone C, Lopez I, Miranda A, Cruz-Castillo C, Fumis L, Bernal-Llinares M, Tsukanov K, Cornu H, Tsirigos K, Razuvayevskaya O, Buniello A, Schwartzentruber J, Karim M, Ariano B, Martinez Osorio RE, Ferrer J, Ge X, Machlitt-Northen S, Gonzalez-Uriarte A, Saha S, Tirunagari S, Mehta C, Roldán-Romero JM, Horswell S, Young S, Ghoussaini M, Hulcoop DG, Dunham I, McDonagh EM. The next-generation Open Targets Platform: reimagined, redesigned, rebuilt. Nucleic Acids Res. 2023 Jan 6;51(D1):D1353-D1359. doi: [10.1093/nar/gkac1046](https://doi.org/10.1093/nar/gkac1046). PMID: 36399499; PMCID: PMC9825572. - + - If using `gget pdb`, please also cite: - Berman HM, Westbrook J, Feng Z, Gilliland G, Bhat TN, Weissig H, Shindyalov IN, Bourne PE. The Protein Data Bank. Nucleic Acids Res. 2000 Jan 1;28(1):235-42. doi: [10.1093/nar/28.1.235](https://doi.org/10.1093/nar/28.1.235). PMID: 10592235; PMCID: PMC102472. 
- If using `gget ref` or `gget search`, please also cite: - Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606. 
- + - If using `gget seq`, please also cite: - Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606. - + - The UniProt Consortium , UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, [https://doi.org/10.1093/nar/gkac1052](https://doi.org/10.1093/nar/gkac1052) - + ___ # Disclaimer `gget` is only as accurate as the databases/servers/APIs it queries from. The accuracy or reliability of the data is not guaranteed or warranted in any way and the providers disclaim liability of any kind whatsoever, including, without limitation, liability for quality, performance, merchantability and fitness for a particular purpose arising out of the use, or inability to use the data. 
diff --git a/docs/src/en/contributing.md b/docs/src/en/contributing.md index b22b375d0..9cdba4c60 100644 --- a/docs/src/en/contributing.md +++ b/docs/src/en/contributing.md @@ -1,73 +1 @@ -[ View page source on GitHub ](https://github.com/pachterlab/gget/blob/main/docs/src/en/contributing.md) - -# Welcome to gget's contributing guide - -Thank you for investing your time in contributing to our project! Any contribution you make will be reflected on the [gget repo](https://github.com/pachterlab/gget). ✨ - -Read our [Code of Conduct](./code_of_conduct.md) to keep our community approachable and respectable. - -In this guide you will get an overview of the contribution workflow from opening an issue or creating a pull request (PR) to reviewing and merging a PR. - -## Issues - -### Create a new issue - -If you spot a problem with gget or you have an idea for a new feature, [check if an issue already exists](https://github.com/pachterlab/gget/issues). If a related issue doesn't exist, you can open a new issue using the relevant [issue form](https://github.com/pachterlab/gget/issues/new/choose). - -### Solve an issue - -Scan through our [existing issues](https://github.com/pachterlab/gget/issues) to find one that interests you. You can narrow down the search using `labels` as filters. If you find an issue to work on, you are welcome to open a PR with a fix. - -## Contribute through pull requests - -### Getting started - -1. Fork the repository. -- Using GitHub Desktop: - - [Getting started with GitHub Desktop](https://docs.github.com/en/desktop/installing-and-configuring-github-desktop/getting-started-with-github-desktop) will guide you through setting up Desktop. - - Once Desktop is set up, you can use it to [fork the repo](https://docs.github.com/en/desktop/contributing-and-collaborating-using-github-desktop/cloning-and-forking-repositories-from-github-desktop)! 
- -- Using the command line: - - [Fork the repo](https://docs.github.com/en/github/getting-started-with-github/fork-a-repo#fork-an-example-repository) so that you can make your changes without affecting the original project until you're ready to merge them. - -2. Create a working branch and start with your changes! - -### Commit your update - -Commit the changes once you are happy with them. - -### ‼️ Self-review the following before creating a Pull Request ‼️ - -1. Review the content for technical accuracy. -2. Copy-edit the changes/comments for grammar, spelling, and adherence to the general style of existing gget code. -3. Format your code using [black](https://black.readthedocs.io/en/stable/getting_started.html). -4. Make sure the unit tests pass: - - Developer dependencies can be installed with `pip install -r dev-requirements.txt` - - Run existing unit tests from the gget repository root with `coverage run -m pytest -ra -v tests && coverage report --omit=main.py,tests*` -5. Add new unit tests if applicable: - - Arguments and expected results are stored in json files in ./tests/fixtures/ - - Unit tests can be added to ./tests/test_*.py and will be automatically detected -6. Make sure the edits are compatible with both the Python and the command line interface - - The command line interface and arguments are defined in ./gget/main.py -8. Add new modules/arguments to the documentation if applicable: - - The manual for each module can be added/edited in `./docs/src/en/*.md` (the Spanish version of the docs in `./docs/src/es/*.md` is automatically generated/updated, and does not need to be edited manually) - -If you have any questions, feel free to start a [discussion](https://github.com/pachterlab/gget/discussions) or create an issue as described above. 
- -### Pull Request - -When you're finished with the changes, [create a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request), also known as a PR. - -‼️ Please make all PRs against the `dev` branch of the gget repository. - -- Don't forget to [link PR to issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) if you are solving one. -- Enable the checkbox to [allow maintainer edits](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/allowing-changes-to-a-pull-request-branch-created-from-a-fork) so the branch can be updated for a merge. -- If you run into any merge issues, checkout this [git tutorial](https://github.com/skills/resolve-merge-conflicts) to help you resolve merge conflicts and other issues. - -Once you submit your PR, a gget team member will review your proposal. We may ask questions or request additional information. - -### Your PR is merged! - -Congratulations! 🎉 The gget team thanks you. ✨ - -Once your PR is merged, your contributions will be publicly visible on the [gget repo](https://github.com/pachterlab/gget). +{{#include ../../../CONTRIBUTING.md}} diff --git a/docs/src/en/cosmic.md b/docs/src/en/cosmic.md index 2b1829c54..9d2c34250 100644 --- a/docs/src/en/cosmic.md +++ b/docs/src/en/cosmic.md @@ -3,7 +3,7 @@ > Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag. # gget cosmic 🪐 Search for genes, mutations, and other factors associated with cancer using the [COSMIC](https://cancer.sanger.ac.uk/cosmic) (Catalogue Of Somatic Mutations In Cancer) database. -Return format: JSON (command-line) or data frame/CSV (Python) when `download_cosmic=False`. 
When `download_cosmic=True`, downloads the requested database into the specified folder. +Return format: JSON (command-line) or data frame/CSV (Python) when `download_cosmic=False`. When `download_cosmic=True`, downloads the requested database into the specified folder. This module was originally written in part by [@AubakirovArman](https://github.com/AubakirovArman) (information querying) and [@josephrich98](https://github.com/josephrich98) (database download). @@ -12,13 +12,13 @@ NOTE: License fees apply for the commercial use of COSMIC. You can read more abo NOTE: When using this module for the first time, first download a COSMIC database to obtain `cosmic_tsv_path` (see examples below). **Positional argument (for querying information)** -`searchterm` -Search term, which can be a mutation, or gene name (or Ensembl ID), or sample, etc. +`searchterm` +Search term, which can be a mutation, or gene name (or Ensembl ID), or sample, etc. Examples: 'EGFR', 'ENST00000275493', 'c.650A>T', 'p.Q217L', 'COSV51765119', 'BT2012100223LNCTB' (sample ID) NOTE: (Python only) Set to `None` when downloading COSMIC databases with `download_cosmic=True`. **Required argument (for querying information)** -`-ctp` `--cosmic_tsv_path` +`-ctp` `--cosmic_tsv_path` Path to the COSMIC database tsv file, e.g. 'path/to/CancerMutationCensus_AllData_v101_GRCh37.tsv'. This file is downloaded when downloading COSMIC databases using the arguments described below. NOTE: This is a required argument when `download_cosmic=False`. 
@@ -41,8 +41,8 @@ Creates a modified version of the COSMIC database for use with [`gget mutate`](m **Optional arguments (for downloading COSMIC databases)** `-cp` `--cosmic_project` 'cancer' (default), 'cancer_example', 'census', 'resistance', 'cell_line', 'genome_screen', or 'targeted_screen' -Type of COSMIC database to download: - +Type of COSMIC database to download: + | cosmic_project | Description | Notes | Size | |-----------------|-----------------------------------------------------------------------|------------------------------------------------------------------------------------|--------| | cancer | Cancer Mutation Census (CMC) (most commonly used COSMIC mutation set) | Only available for GRCh37. Most feature-rich schema (takes the longest to search). | 2 GB | @@ -82,18 +82,18 @@ Whether to remove duplicate rows from the modified database for use with `gget m (str) Name of the mutation_id column in the csv file created by `gget_mutate`. Default: "mutation_id" **Optional arguments (general)** -`-o` `--out` +`-o` `--out` Path to the file (or folder when downloading databases with the `download_cosmic` flag) the results will be saved in, e.g. 'path/to/results.json'. -Defaults: +Defaults: -> When `download_cosmic=False`: Results will be returned to standard out -> When `download_cosmic=True`: Database will be downloaded into current working directory **Flags (general)** -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. Python: Use `verbose=False` to prevent progress information from being displayed. - + ### Examples #### Download the COSMIC "cancer" database and query information ```bash @@ -143,7 +143,7 @@ gget.cosmic("EGFR", cosmic_tsv_path="Cosmic_MutantCensus_Tsv_v101_GRCh37/Cosmic_ # References -If you use `gget cosmic` in a publication, please cite the following articles: +If you use `gget cosmic` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). 
Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/en/dependents.md b/docs/src/en/dependents.md index b0d0e9296..9d496fec5 100644 --- a/docs/src/en/dependents.md +++ b/docs/src/en/dependents.md @@ -32,7 +32,7 @@ The following applications build on `gget`: > "Tools are grouped into families such as literature [...], genomics (biopython, **gget**), and machine learning (rdkit, pymol)." - [PerTurboAgent](https://www.biorxiv.org/content/10.1101/2025.05.25.656020v1) A Self-Planning Agent for Boosting Sequential Perturb-seq Experiments. - > "We [...] use packages **gget** and blitzgsea for data enrichment analysis" + > "We [...] use packages **gget** and blitzgsea for data enrichment analysis" - [Scientific skills for Claude](https://github.com/K-Dense-AI/claude-scientific-skills) by K-Dense-AI > " This repository contains 138 scientific skills organized across multiple domains. Each skill provides comprehensive documentation, code examples, and best practices for working with scientific libraries, databases, and tools. > 🧬 Bioinformatics & Genomics @@ -92,7 +92,7 @@ ____ - Shanmugampillai Jeyarajaguru Kabilan et al., [Molecular modelling approaches for the identification of potent Sodium-Glucose Cotransporter 2 inhibitors from Boerhavia diffusa for the potential treatment of chronic kidney disease.](https://doi.org/10.21203/rs.3.rs-4520611/v1) *Journal of Computer-Aided Molecular Design (under review)* (2024). DOI: 10.21203/rs.3.rs-4520611/v1 - Joseph M Rich et al., [The impact of package selection and versioning on single-cell RNA-seq analysis.](https://pmc.ncbi.nlm.nih.gov/articles/PMC11014608/#:~:text=10.1101/2024.04.04.588111) *bioRxiv* (2024). DOI: 10.1101/2024.04.04.588111 - Sanjay C. 
Nagi et al., [AnoPrimer: Primer Design in malaria vectors informed by range-wide genomic variation.](https://wellcomeopenresearch.org/articles/9-255/v1) *Wellcome Open Research* (2024). -- Yasmin Makki Mohialden et al., [A survey of the most recent Python packages for use in biology.](http://dx.doi.org/10.48047/NQ.2023.21.2.NQ23029) *NeuroQuantology* (2023). DOI: 10.48047/NQ.2023.21.2.NQ23029 +- Yasmin Makki Mohialden et al., [A survey of the most recent Python packages for use in biology.](http://dx.doi.org/10.48047/NQ.2023.21.2.NQ23029) *NeuroQuantology* (2023). DOI: 10.48047/NQ.2023.21.2.NQ23029 - Nicola A. Kearns et al., [Generation and molecular characterization of human pluripotent stem cell-derived pharyngeal foregut endoderm.](https://doi.org/10.1016/j.devcel.2023.08.024) *Cell Reports* (2023). DOI: 10.1016/j.devcel.2023.08.024 - Jonathan Rosenski et al., [Predicting gene knockout effects from expression data.](https://link.springer.com/article/10.1186/s12920-023-01446-6) *BMC Medical Genomics* (2023). DOI: 10.1186/s12920-023-01446-6 - Peter Overby et al., [Pharmacological or genetic inhibition of Scn9a protects beta-cells while reducing insulin secretion in type 1 diabetes.](https://doi.org/10.1101/2023.06.11.544521) *bioRxiv* (2023). DOI: 10.1101/2023.06.11.544521 @@ -113,4 +113,3 @@ ___ # 🚂 [gget code repository](https://github.com/pachterlab/gget/) traffic ![Plots showing the cumulative clones and views of the gget GitHub repository.](https://raw.githubusercontent.com/pachterlab/gget/traffic/plots/gget_cumulative_clones_views.png) Updates automatically every week on Sunday at 23:55 (UTC). - diff --git a/docs/src/en/diamond.md b/docs/src/en/diamond.md index 3675c1fdc..7077733c2 100644 --- a/docs/src/en/diamond.md +++ b/docs/src/en/diamond.md @@ -2,7 +2,7 @@ > Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. 
The manual for any gget tool can be called from the command-line using the `-h` `--help` flag. # gget diamond 💎 -Align multiple protein or translated DNA sequences using [DIAMOND](https://www.nature.com/articles/nmeth.3176) (DIAMOND is similar to BLAST, but this is a local computation). +Align multiple protein or translated DNA sequences using [DIAMOND](https://www.nature.com/articles/nmeth.3176) (DIAMOND is similar to BLAST, but this is a local computation). Return format: JSON (command-line) or data frame/CSV (Python). **Positional argument** @@ -20,7 +20,7 @@ Path to save DIAMOND database created from `reference` (str). Default: None -> Temporary db file will be deleted after alignment or saved in `out` if `out` is provided. `-s` `--sensitivity` -Sensitivity of alignment (str). Default: "very-sensitive". +Sensitivity of alignment (str). Default: "very-sensitive". One of the following: fast, mid-sensitive, sensitive, more-sensitive, very-sensitive, or ultra-sensitive. `-t` `--threads` @@ -29,20 +29,20 @@ Number of threads used (int). Default: 1. `-db` `--diamond_binary` Path to DIAMOND binary (str). Default: None -> Uses DIAMOND binary installed with `gget`. -`-o` `--out` -Path to the folder to save results in (str), e.g. "path/to/directory". Default: Standard out; temporary files are deleted. +`-o` `--out` +Path to the folder to save results in (str), e.g. "path/to/directory". Default: Standard out; temporary files are deleted. **Flags** `-x` `--translated` Perform translated alignment of nucleotide sequences to amino acid reference sequences. - + `-csv` `--csv` Command-line only. Returns results in CSV format. Python: Use `json=True` to return output in JSON format. -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. -Python: Use `verbose=False` to prevent progress information from being displayed. +Python: Use `verbose=False` to prevent progress information from being displayed. 
### Example ```bash @@ -53,7 +53,7 @@ gget diamond GGETISAWESQME ELVISISALIVE LQVEFRANKLIN PACHTERLABRQCKS -ref GGETIS # Python gget.diamond(["GGETISAWESQME", "ELVISISALIVE", "LQVEFRANKLIN", "PACHTERLABRQCKS"], reference=["GGETISAWESQMEELVISISALIVELQVEFRANKLIN", "PACHTERLABRQCKS"]) ``` -→ Returns results in JSON (command-line) or data frame/CSV (Python) format: +→ Returns results in JSON (command-line) or data frame/CSV (Python) format: |query_accession|subject_accession|identity_percentage|query_seq_length|subject_seq_length|length|mismatches|gap_openings|query_start|query_end|subject_start|subject_end|e-value |bit_score| |---------------|-----------------|-------------------|----------------|------------------|------|----------|------------|-----------|---------|-------------|-----------|--------|---------| @@ -64,7 +64,7 @@ gget.diamond(["GGETISAWESQME", "ELVISISALIVE", "LQVEFRANKLIN", "PACHTERLABRQCKS" #### [More examples](https://github.com/pachterlab/gget_examples) # References -If you use `gget diamond` in a publication, please cite the following articles: +If you use `gget diamond` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/en/elm.md b/docs/src/en/elm.md index 07e4ab423..915030cdf 100644 --- a/docs/src/en/elm.md +++ b/docs/src/en/elm.md @@ -2,12 +2,12 @@ > Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag. # gget elm 🎭 -Locally predict Eukaryotic Linear Motifs from an amino acid sequence or UniProt Acc using data from the [ELM database](http://elm.eu.org/). 
+Locally predict Eukaryotic Linear Motifs from an amino acid sequence or UniProt Acc using data from the [ELM database](http://elm.eu.org/). Return format: JSON (command-line) or data frame/CSV (Python). This module returns two data frames (or JSON formatted files) (see examples). -**ELM data can be downloaded & distributed for non-commercial use according to the [ELM Software License Agreement](http://elm.eu.org/media/Elm_academic_license.pdf).** +**ELM data can be downloaded & distributed for non-commercial use according to the [ELM Software License Agreement](http://elm.eu.org/media/Elm_academic_license.pdf).** -Before using `gget elm` for the first time, run `gget setup elm` (bash) / `gget.setup("elm")` (Python) once (also see [`gget setup`](setup.md)). +Before using `gget elm` for the first time, run `gget setup elm` (bash) / `gget.setup("elm")` (Python) once (also see [`gget setup`](setup.md)). **Positional argument** `sequence` @@ -16,7 +16,7 @@ When providing a Uniprot Acc, use flag `--uniprot` (Python: `uniprot=True`). **Optional arguments** `-s` `--sensitivity` -Sensitivity of DIAMOND alignment (str). Default: "very-sensitive". +Sensitivity of DIAMOND alignment (str). Default: "very-sensitive". One of the following: fast, mid-sensitive, sensitive, more-sensitive, very-sensitive, or ultra-sensitive. `-t` `--threads` @@ -25,23 +25,23 @@ Number of threads used in DIAMOND alignment (int). Default: 1. `-bin` `--diamond_binary` Path to DIAMOND binary (str). Default: None -> Uses DIAMOND binary installed with `gget`. -`-o` `--out` -Path to the folder to save results in (str), e.g. "path/to/directory". Default: Standard out; temporary files are deleted. +`-o` `--out` +Path to the folder to save results in (str), e.g. "path/to/directory". Default: Standard out; temporary files are deleted. **Flags** `-u` `--uniprot` Set to True if `sequence` is a Uniprot Acc instead of an amino acid sequence. 
-`-e` `--expand` -Expand the information returned in the regex data frame to include the protein names, organisms, and references that the motif was orignally validated on. +`-e` `--expand` +Expand the information returned in the regex data frame to include the protein names, organisms, and references that the motif was originally validated on. `-csv` `--csv` Command-line only. Returns results in CSV format. Python: Use `json=True` to return output in JSON format. -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. -Python: Use `verbose=False` to prevent progress information from being displayed. +Python: Use `verbose=False` to prevent progress information from being displayed. ### Examples Find ELMs in an amino acid sequence: @@ -54,7 +54,7 @@ gget elm -o gget_elm_results LIAQSIGQASFV gget.setup("elm") # Downloads/updates local ELM database ortholog_df, regex_df = gget.elm("LIAQSIGQASFV") ``` - + Find ELMs giving a UniProt Acc as input: ```bash gget setup elm # Downloads/updates local ELM database @@ -68,14 +68,14 @@ ortholog_df, regex_df = gget.elm("Q02410", uniprot=True, expand=True) → Returns two data frames (or JSON formatted dictionaries for command line) containing extensive information about linear motifs associated with orthologous proteins and motifs found in the input sequence directly based on their regex expressions: ortholog_df: - + |Ortholog_UniProt_Acc|ProteinName|class_accession|ELMIdentifier |FunctionalSiteName |Description |Organism |… | |:-----------------:|:---------:|:-------------:|:-------------:|:-----------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------:|:----------:|:-:| |Q02410 |APBA1_HUMAN|ELME000357 |LIG_CaMK_CASK_1|CASK CaMK domain binding ligand motif|Motif that mediates binding to the calmodulin-dependent protein kinase (CaMK) domain of the peripheral plasma membrane
protein CASK/Lin2.|Homo sapiens|… | |Q02410 |APBA1_HUMAN|ELME000091 |LIG_PDZ_Class_2|PDZ domain ligands |The C-terminal class 2 PDZ-binding motif is classically represented by a pattern such as |Homo sapiens|… | regex_df: - + |Instance_accession|ELMIdentifier |FunctionalSiteName |ELMType|Description |Instances (Matched Sequence)|Organism |… | |:----------------:|:----------------:|:-----------------------------:|:-----:|:-----------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------:|:----------------------------:|:-:| |ELME000321 |CLV_C14_Caspase3-7|Caspase cleavage motif |CLV |Caspase-3 and Caspase-7 cleavage site. |ERSDG |Mus musculus |… | @@ -87,13 +87,13 @@ regex_df: # Tutorials ### [🔗 General `gget elm` demo](https://github.com/pachterlab/gget_examples/blob/main/gget_elm_demo.ipynb) - + ### [🔗 A point mutation in BRCA2 is carcinogenic due to the loss of a protein interaction motif](https://github.com/pachterlab/gget_examples/blob/main/gget_elm_BRCA2_example.ipynb) - + ### [🔗 Filter `gget elm` results based on disordered protein regions](https://github.com/pachterlab/gget_examples/blob/main/gget_elm_IUPred3_tutorial.ipynb) # References -If you use `gget elm` in a publication, please cite the following articles: +If you use `gget elm` in a publication, please cite the following articles: - Laura Luebbert, Chi Hoang, Manjeet Kumar, Lior Pachter, Fast and scalable querying of eukaryotic linear motifs with gget elm, _Bioinformatics_, 2024, btae095, [https://doi.org/10.1093/bioinformatics/btae095](https://doi.org/10.1093/bioinformatics/btae095) diff --git a/docs/src/en/enrichr.md b/docs/src/en/enrichr.md index a529cb8cb..5df9e9496 100644 --- a/docs/src/en/enrichr.md +++ b/docs/src/en/enrichr.md @@ -4,7 +4,7 @@ # gget enrichr 💰 Perform an enrichment analysis on a list of genes using [Enrichr](https://maayanlab.cloud/Enrichr/) or 
[modEnrichr](https://maayanlab.cloud/modEnrichr/). Return format: JSON (command-line) or data frame/CSV (Python). - + **Positional argument** `genes` Short names (gene symbols) of genes to perform enrichment analysis on, e.g. PHF14 RBM3 MSL1 PHF21A. @@ -17,12 +17,12 @@ Supports any database listed [here](https://maayanlab.cloud/Enrichr/#libraries) 'pathway'       (KEGG_2021_Human) 'transcription'     (ChEA_2016) 'ontology'      (GO_Biological_Process_2021) -'diseases_drugs'   (GWAS_Catalog_2019) +'diseases_drugs'   (GWAS_Catalog_2019) 'celltypes'      (PanglaoDB_Augmented_2021) 'kinase_interactions'  (KEA_2015) - -NOTE: database shortcuts are not supported for species other than 'human' or 'mouse'. Click on the species databases listed below under `species` to view a list of databases available for each species. - + +NOTE: database shortcuts are not supported for species other than 'human' or 'mouse'. Click on the species databases listed below under `species` to view a list of databases available for each species. + **Optional arguments** `-s` `--species` Species to use as reference for the enrichment analysis. (Default: human) @@ -42,8 +42,8 @@ Short names (gene symbols) of background genes to perform enrichment analysis on Alternatively: use flag `--ensembl_background` to input a list of Ensembl gene IDs. See [this Tweetorial](https://x.com/ChiHoangCaltech/status/1689679611335155712?s=20) to learn why you should use a background gene list when performing an enrichment analysis. -`-o` `--out` -Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). (Default: Standard out.) +`-o` `--out` +Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). (Default: Standard out.) Python: `save=True` will save the output in the current working directory. `-ko` `--kegg_out` @@ -58,9 +58,9 @@ Python only. (width, height) of plot in inches. (Default: (10,10)) `ax` Python only. 
Pass a matplotlib axes object for plot customization. (Default: None) - + **Flags** -`-e` `--ensembl` +`-e` `--ensembl` Add this flag if `genes` are given as Ensembl gene IDs. `-e_b` `--ensembl_bkg` @@ -68,19 +68,19 @@ Add this flag if `background_list` are given as Ensembl gene IDs. `-bkg` `--background` If True, use set of > 20,000 default background genes listed [here](https://github.com/pachterlab/gget/blob/main/gget/constants/enrichr_bkg_genes.txt). - + `-csv` `--csv` Command-line only. Returns results in CSV format. Python: Use `json=True` to return output in JSON format. -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. -Python: Use `verbose=False` to prevent progress information from being displayed. - +Python: Use `verbose=False` to prevent progress information from being displayed. + `plot` Python only. `plot=True` provides a graphical overview of the first 15 results (default: False). - - + + ### Examples ```bash gget enrichr -db ontology ACE2 AGT AGTR1 @@ -110,10 +110,10 @@ gget.enrichr( genes = [ "PHF14", "RBM3", "MSL1", "PHF21A", "ARL10", "INSR", "JADE2", "P2RX7", "LINC00662", "CCDC101", "PPM1B", "KANSL1L", "CRYZL1", "ANAPC16", "TMCC1", - "CDH8", "RBM11", "CNPY2", "HSPA1L", "CUL2", "PLBD2", "LARP7", "TECPR2", + "CDH8", "RBM11", "CNPY2", "HSPA1L", "CUL2", "PLBD2", "LARP7", "TECPR2", "ZNF302", "CUX1", "MOB2", "CYTH2", "SEC22C", "EIF4E3", "ROBO2", "ADAMTS9-AS2", "CXXC1", "LINC01314", "ATF7", "ATP5F1" - ], + ], database = "ChEA_2022", background_list = [ "NSUN3","POLRMT","NLRX1","SFXN5","ZC3H12C","SLC25A39","ARSG", @@ -128,11 +128,11 @@ gget.enrichr( "ZFP787","ZFP655","RABEPK","ZFP650","4732466D17RIK","EXOSC4", "WDR42A","GPHN","2610528J11RIK","1110003E01RIK","MDH1","1200014M14RIK", "AW209491","MUT","1700123L14RIK","2610036D13RIK", - "PHF14", "RBM3", "MSL1", "PHF21A", "ARL10", "INSR", "JADE2", - "P2RX7", "LINC00662", "CCDC101", "PPM1B", "KANSL1L", "CRYZL1", - "ANAPC16", "TMCC1","CDH8", "RBM11", "CNPY2",
"HSPA1L", "CUL2", - "PLBD2", "LARP7", "TECPR2", "ZNF302", "CUX1", "MOB2", "CYTH2", - "SEC22C", "EIF4E3", "ROBO2", "ADAMTS9-AS2", "CXXC1", "LINC01314", "ATF7", + "PHF14", "RBM3", "MSL1", "PHF21A", "ARL10", "INSR", "JADE2", + "P2RX7", "LINC00662", "CCDC101", "PPM1B", "KANSL1L", "CRYZL1", + "ANAPC16", "TMCC1","CDH8", "RBM11", "CNPY2", "HSPA1L", "CUL2", + "PLBD2", "LARP7", "TECPR2", "ZNF302", "CUX1", "MOB2", "CYTH2", + "SEC22C", "EIF4E3", "ROBO2", "ADAMTS9-AS2", "CXXC1", "LINC01314", "ATF7", "ATP5F1","COX15","TMEM30A","NSMCE4A","TM2D2","RHBDD3","ATXN2","NFS1", "3110001I20RIK","BC038156","C330002I19RIK","ZFYVE20","POLI","TOMM70A", "LOC100047782","2410012H22RIK","RILP","A230062G08RIK", @@ -226,15 +226,15 @@ df |> [Using `gget enrichr` with background genes](https://github.com/pachterlab/gget_examples/blob/main/gget_enrichr_with_background_genes.ipynb) # References -If you use `gget enrichr` in a publication, please cite the following articles: +If you use `gget enrichr` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) -- Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). [https://doi.org/10.1186/1471-2105-14-128 ](https://doi.org/10.1186/1471-2105-14-128) +- Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). [https://doi.org/10.1186/1471-2105-14-128 ](https://doi.org/10.1186/1471-2105-14-128) -- Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A.
Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377) +- Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377) - Xie Z, Bailey A, Kuleshov MV, Clarke DJB., Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM, Jeon M, & Ma’ayan A. Gene set knowledge discovery with Enrichr. Current Protocols, 1, e90. 2021. doi: [10.1002/cpz1.90](https://doi.org/10.1002/cpz1.90). - + If working with non-human/mouse datasets, please also cite: - Kuleshov MV, Diaz JEL, Flamholz ZN, Keenan AB, Lachmann A, Wojciechowicz ML, Cagan RL, Ma'ayan A. modEnrichr: a suite of gene set enrichment analysis tools for model organisms. Nucleic Acids Res. 2019 Jul 2;47(W1):W183-W190. doi: [10.1093/nar/gkz347](https://doi.org/10.1093/nar/gkz347). PMID: 31069376; PMCID: PMC6602483. diff --git a/docs/src/en/gpt.md b/docs/src/en/gpt.md index 9d02ae483..bc07fba2f 100644 --- a/docs/src/en/gpt.md +++ b/docs/src/en/gpt.md @@ -2,7 +2,7 @@ > Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag. # gget gpt 💬 -Generates natural language text based on a given prompt using the [OpenAI](https://openai.com/) API's 'openai.ChatCompletion.create' endpoint. +Generates natural language text based on a given prompt using the [OpenAI](https://openai.com/) API's 'openai.ChatCompletion.create' endpoint. 
This module, including its source code, documentation and unit tests, were partly written by OpenAI's Chat-GTP3. NOTE: @@ -27,34 +27,34 @@ Your OpenAI API key (str) ([get your API key](https://platform.openai.com/accoun The name of the GPT model to use for generating the text (str). Default is "gpt-3.5-turbo". See https://platform.openai.com/docs/models/gpt-4 for more information on the available models. -`-temp` `--temperature` +`-temp` `--temperature` Value between 0 and 2 that controls the level of randomness and creativity in the generated text (float). Higher values result in more creative and varied text. Default is 1. -`-tp` `--top_p` +`-tp` `--top_p` Controls the diversity of the generated text as an alternative to sampling with temperature (float). Higher values result in more diverse and unexpected text. Default is 1. Note: OpenAI recommends altering this or temperature but not both. -`-s` `--stop` +`-s` `--stop` A sequence of tokens to mark the end of the generated text (str). Default is None. -`-mt` `--max_tokens` +`-mt` `--max_tokens` Controls the maximum length of the generated text, in tokens (int). Default is 200. -`-pp` `--presence_penalty` +`-pp` `--presence_penalty` Number between -2.0 and 2.0. Higher values result increase the model's likelihood to talk about new topics (float). Default is 0. -`-fp` `--frequency_penalty` +`-fp` `--frequency_penalty` Number between -2.0 and 2.0. Higher values decrease the model's likelihood to repeat the same line verbatim (float). Default is 0. -`-lb` `--logit_bias` +`-lb` `--logit_bias` A dictionary that specifies a bias towards certain tokens in the generated text (dict). Default is None. -`-o` `--out` +`-o` `--out` If provided, saves the generated text to a file with the specified path (str). Default: Standard out. - - + + ### Example ```bash gget gpt "How are you today GPT?" 
your_api_token diff --git a/docs/src/en/info.md b/docs/src/en/info.md index e1f4ff192..a76aebdb6 100644 --- a/docs/src/en/info.md +++ b/docs/src/en/info.md @@ -6,33 +6,33 @@ Fetch extensive gene and transcript metadata from [Ensembl](https://www.ensembl. Return format: JSON (command-line) or data frame/CSV (Python). **Positional argument** -`ens_ids` +`ens_ids` One or more Ensembl IDs (WormBase and Flybase IDs are also supported). -NOTE: Providing a list of more than 1,000 Ensembl IDs at once might result in a server error (to process more than 1,000 IDs, split the list of IDs into chunks of 1,000 IDs and run these separately). +NOTE: Providing a list of more than 1,000 Ensembl IDs at once might result in a server error (to process more than 1,000 IDs, split the list of IDs into chunks of 1,000 IDs and run these separately). **Optional arguments** -`-o` `--out` -Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out. +`-o` `--out` +Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out. Python: `save=True` will save the output in the current working directory. **Flags** `-n` `--ncbi` TURN OFF results from [NCBI](https://www.ncbi.nlm.nih.gov/). -Python: `ncbi=False` prevents data retrieval from NCBI (default: True). +Python: `ncbi=False` prevents data retrieval from NCBI (default: True). `-u` `--uniprot` TURN OFF results from [UniProt](https://www.uniprot.org/). -Python: `uniprot=False` prevents data retrieval from UniProt (default: True). +Python: `uniprot=False` prevents data retrieval from UniProt (default: True). `-pdb` `--pdb` INCLUDE [PDB](https://www.ebi.ac.uk/pdbe/) IDs in output (might increase runtime). -Python: `pdb=True` includes PDB IDs in the results (default: False). +Python: `pdb=True` includes PDB IDs in the results (default: False). `-csv` `--csv` Command-line only. Returns results in CSV format. 
Python: Use `json=True` to return output in JSON format. -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. Python: Use `verbose=False` to prevent progress information from being displayed. @@ -54,17 +54,16 @@ gget.info(["ENSG00000034713", "ENSG00000104853", "ENSG00000170296"]) | -------------- |-------------------------| ------------------------| -------------- | ----------|-----|----|----|----|----|----|----| | ENSG00000034713| P60520 | 11345 | GABARAPL2 | [ATG8, ATG8C, FLC3A, GABARAPL2, GATE-16, GATE16, GEF-2, GEF2] | Gamma-aminobutyric acid receptor-associated protein like 2 (GABA(A) receptor-associated protein-like 2)... | GABA type A receptor associated protein like 2 [Source:HGNC Symbol;Acc:HGNC:13291] | FUNCTION: Ubiquitin-like modifier involved in intra- Golgi traffic (By similarity). Modulates intra-Golgi transport through coupling between NSF activity and ... | Enables ubiquitin protein ligase binding activity. Involved in negative regulation of proteasomal protein catabolic process and protein... | protein_coding | ENST00000037243.7 |... | | . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | ... | - + #### [More examples](https://github.com/pachterlab/gget_examples) # References -If you use `gget info` in a publication, please cite the following articles: +If you use `gget info` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. 
[https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606. - + - Sayers EW, Beck J, Bolton EE, Brister JR, Chan J, Comeau DC, Connor R, DiCuccio M, Farrell CM, Feldgarden M, Fine AM, Funk K, Hatcher E, Hoeppner M, Kane M, Kannan S, Katz KS, Kelly C, Klimke W, Kim S, Kimchi A, Landrum M, Lathrop S, Lu Z, Malheiro A, Marchler-Bauer A, Murphy TD, Phan L, Prasad AB, Pujar S, Sawyer A, Schmieder E, Schneider VA, Schoch CL, Sharma S, Thibaud-Nissen F, Trawick BW, Venkatapathi T, Wang J, Pruitt KD, Sherry ST. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2024 Jan 5;52(D1):D33-D43. 
doi: [10.1093/nar/gkad1044](https://doi.org/10.1093/nar/gkad1044). PMID: 37994677; PMCID: PMC10767890. - -- The UniProt Consortium , UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, [https://doi.org/10.1093/nar/gkac1052](https://doi.org/10.1093/nar/gkac1052) +- The UniProt Consortium , UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, [https://doi.org/10.1093/nar/gkac1052](https://doi.org/10.1093/nar/gkac1052) diff --git a/docs/src/en/introduction.md b/docs/src/en/introduction.md index 244a05930..4bd02ca88 100644 --- a/docs/src/en/introduction.md +++ b/docs/src/en/introduction.md @@ -8,10 +8,10 @@ # Welcome! [](https://raw.githubusercontent.com/pachterlab/gget/main/figures/gget_overview.png) - + `gget` is a free, open-source command-line tool and Python package that enables efficient querying of genomic databases.
-`gget` consists of a collection of separate but interoperable modules, each designed to facilitate one type of database querying in a single line of code. +`gget` consists of a collection of separate but interoperable modules, each designed to facilitate one type of database querying in a single line of code.
`gget` is part of the [scverse®](https://scverse.org) project and is fiscally sponsored by [NumFOCUS](https://numfocus.org/). If you like `gget` and want to support our mission, please consider making a tax-deductible [donation](https://opencollective.com/scverse/projects/scverse-gget/donate?interval=oneTime&amount=20&contributeAs=me). @@ -63,7 +63,7 @@ These are the `gget` core modules. Click on any module to access detailed docume
-If you use `gget` in a publication, please [cite*](/gget/en/cite.md): +If you use `gget` in a publication, please [cite*](/gget/en/cite.md): ``` Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. https://doi.org/10.1093/bioinformatics/btac836 ``` @@ -73,7 +73,7 @@ Read the article here: [https://doi.org/10.1093/bioinformatics/btac836](https://
[![gget PyPI downloads over the last year](https://github.com/lauraluebbert/gget_downloads/raw/main/plots/downloads_gget_daily.png)](https://github.com/lauraluebbert/gget_downloads/tree/main) - +

@@ -98,4 +98,3 @@ Read the article here: [https://doi.org/10.1093/bioinformatics/btac836](https://     logo-okfn

- diff --git a/docs/src/en/muscle.md b/docs/src/en/muscle.md index 6d8c18b3b..16a512540 100644 --- a/docs/src/en/muscle.md +++ b/docs/src/en/muscle.md @@ -6,12 +6,12 @@ Align multiple nucleotide or amino acid sequences to each other using [Muscle5]( Return format: ClustalW formatted standard out or aligned FASTA (.afa). **Positional argument** -`fasta` +`fasta` List of sequences or path to FASTA or .txt file containing the nucleotide or amino acid sequences to be aligned. **Optional arguments** -`-o` `--out` -Path to the aligned FASTA file the results will be saved in, e.g. path/to/directory/results.afa. Default: Standard out. +`-o` `--out` +Path to the aligned FASTA file the results will be saved in, e.g. path/to/directory/results.afa. Default: Standard out. Python: `save=True` will save the output in the current working directory. **Flags** @@ -19,11 +19,11 @@ Python: `save=True` will save the output in the current working directory. Aligns input using the [Super5 algorithm](https://drive5.com/muscle5/Muscle5_SuppMat.pdf) instead of the [Parallel Perturbed Probcons (PPP) algorithm](https://drive5.com/muscle5/Muscle5_SuppMat.pdf) to decrease time and memory. Use for large inputs (a few hundred sequences). -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. -Python: Use `verbose=False` to prevent progress information from being displayed. - - +Python: Use `verbose=False` to prevent progress information from being displayed. + + ### Example ```bash gget muscle MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLAS MSSSSWLLLSLVEVTAAQSTIEQQAKTFLDKFHEAEDLFYQSLLAS @@ -40,7 +40,7 @@ gget muscle fasta.fa # Python gget.muscle("fasta.fa") ``` -→ Returns an overview of the aligned sequences with ClustalW coloring. (To return an aligned FASTA (.afa) file, use `--out` argument (or `save=True` in Jupyter Lab/Google Colab).) In the above example, the 'fasta.fa' includes several sequences to be aligned (e.g. 
isoforms returned from `gget seq`). +→ Returns an overview of the aligned sequences with ClustalW coloring. (To return an aligned FASTA (.afa) file, use `--out` argument (or `save=True` in Jupyter Lab/Google Colab).) In the above example, the 'fasta.fa' includes several sequences to be aligned (e.g. isoforms returned from `gget seq`). ![alt text](https://github.com/pachterlab/gget/blob/main/figures/example_muscle_return.png?raw=true) @@ -60,9 +60,8 @@ alv.view(msa) #### [More examples](https://github.com/pachterlab/gget_examples) # References -If you use `gget muscle` in a publication, please cite the following articles: +If you use `gget muscle` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - Edgar RC (2021), MUSCLE v5 enables improved estimates of phylogenetic tree confidence by ensemble bootstrapping, bioRxiv 2021.06.20.449169. [https://doi.org/10.1101/2021.06.20.449169](https://doi.org/10.1101/2021.06.20.449169) - diff --git a/docs/src/en/mutate.md b/docs/src/en/mutate.md index 3bb595780..35ca12ded 100644 --- a/docs/src/en/mutate.md +++ b/docs/src/en/mutate.md @@ -10,7 +10,7 @@ This module was written by [Joseph Rich](https://github.com/josephrich98). ** Update: The more complex functionality of gget mutate has been ported to https://github.com/pachterlab/kvar. kvar expands on this functionality in the context of screening for variants/mutations in sequencing data. If this sounds interesting to you, please check it out! ** **Positional argument** -`sequences` +`sequences` Path to the FASTA file containing the sequences to be mutated, e.g., 'path/to/seqs.fa'. Sequence identifiers following the '>' character must correspond to the identifiers in the seq_ID column of `mutations`. 
@@ -57,20 +57,20 @@ Name of the column containing the IDs of the sequences to be mutated in `mutatio `-mic` `--mut_id_column` Name of the column containing the IDs of each mutation in `mutations`. Default: Same as `mut_column`. - + **Optional mutant sequence generation/filtering arguments** `-k` `--k` Length of sequences flanking the mutation. Default: 30. If k > total length of the sequence, the entire sequence will be kept. - + **Optional general arguments** -`-o` `--out` +`-o` `--out` Path to output FASTA file containing the mutated sequences, e.g., 'path/to/output_fasta.fa'. -Default: None -> returns a list of the mutated sequences to standard out. -The identifiers (following the '>') of the mutated sequences in the output FASTA will be '>[seq_ID]_[mut_ID]'. +Default: None -> returns a list of the mutated sequences to standard out. +The identifiers (following the '>') of the mutated sequences in the output FASTA will be '>[seq_ID]_[mut_ID]'. **Optional general flags** -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. Python: Use `verbose=False` to prevent progress information from being displayed. @@ -94,7 +94,7 @@ gget mutate ATCGCTAAGCT TAGCTA -m 'c.4G>T' 'c.1_3inv' -o mut_fasta.fa # Python gget.mutate(["ATCGCTAAGCT", "TAGCTA"], ["c.4G>T", "c.1_3inv"], out="mut_fasta.fa") ``` -→ Saves 'mut_fasta.fa' file containing: +→ Saves 'mut_fasta.fa' file containing: ``` >seq1_mut1 ATCTCTAAGCT @@ -116,7 +116,6 @@ gget.mutate(["ATCGCTAAGCT", "TAGCTA"], "c.1_3inv", k=3) # References -If you use `gget mutate` in a publication, please cite the following articles: +If you use `gget mutate` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. 
[https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - diff --git a/docs/src/en/opentargets.md b/docs/src/en/opentargets.md index 44be2a9d6..b1fa72975 100644 --- a/docs/src/en/opentargets.md +++ b/docs/src/en/opentargets.md @@ -2,7 +2,7 @@ > Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag. # gget opentargets 🎯 -Fetch associated diseases or drugs from [OpenTargets](https://platform.opentargets.org/) using Ensembl IDs. +Fetch associated diseases or drugs from [OpenTargets](https://platform.opentargets.org/) using Ensembl IDs. Return format: JSON/CSV (command-line) or data frame (Python). This module was written by [Sam Wagenaar](https://github.com/techno-sam). @@ -12,8 +12,8 @@ This module was written by [Sam Wagenaar](https://github.com/techno-sam). Ensembl gene ID, e.g ENSG00000169194. **Optional arguments** -`-r` `--resource` -Defines the type of information to return in the output. Default: 'diseases'. +`-r` `--resource` +Defines the type of information to return in the output. Default: 'diseases'. Possible resources are: | Resource | Return Value | Valid Filters | Sources | @@ -27,35 +27,35 @@ Possible resources are: | `interactions` | Protein⇄protein interactions | `protein_a_id`
`protein_b_id`
`gene_b_id` | | `-l` `--limit` -Limit the number of results, e.g 10. Default: No limit. +Limit the number of results, e.g 10. Default: No limit. Note: Not compatible with the `tractability` and `depmap` resources. -`-o` `--out` +`-o` `--out` Path to the JSON file the results will be saved in, e.g. path/to/directory/results.json. Default: Standard out. Python: `save=True` will save the output in the current working directory. `--filters` Filter results by exact equality using returned OpenTargets column names. Pass multiple filters by repeating the flag, e.g. '--filter disease.id=EFO_0000274 --filter drug.id=CHEMBL1743081'. Nested fields use dot notation, matching the column names returned by the API. -**Flags** +**Flags** `-csv` `--csv` Command-line only. Returns the output in CSV format, instead of JSON format. Python: Use `json=True` to return output in JSON format. -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. -Python: Use `verbose=False` to prevent progress information from being displayed. +Python: Use `verbose=False` to prevent progress information from being displayed. `-or` `--or` Command-line only. Filters are combined with OR logic. Default: AND logic. `wrap_text` Python only. `wrap_text=True` displays data frame with wrapped text for easy reading (default: False). - - + + ### Examples -**Get associated diseases for a specific gene:** +**Get associated diseases for a specific gene:** ```bash gget opentargets ENSG00000169194 -r diseases -l 1 ``` @@ -72,7 +72,7 @@ gget.opentargets('ENSG00000169194', resource='diseases', limit=1)

-**Get associated drugs for a specific gene:** +**Get associated drugs for a specific gene:** ```bash gget opentargets ENSG00000169194 -r drugs -l 2 ``` @@ -93,7 +93,7 @@ gget.opentargets('ENSG00000169194', resource='drugs', limit=2)

-**Get tractability data for a specific gene:** +**Get tractability data for a specific gene:** ```bash gget opentargets ENSG00000169194 -r tractability ``` @@ -237,13 +237,12 @@ gget.opentargets( | 0.400 | 1 | intact | P35225 | ENSG00000169194 | IL13 | unspecified role | 9606 | Q86XT9 | ENSG00000149932 | TMEM219 | stimulator | 9606 | - + #### [More examples](https://github.com/pachterlab/gget_examples) # References -If you use `gget opentargets` in a publication, please cite the following articles: +If you use `gget opentargets` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - Ochoa D, Hercules A, Carmona M, Suveges D, Baker J, Malangone C, Lopez I, Miranda A, Cruz-Castillo C, Fumis L, Bernal-Llinares M, Tsukanov K, Cornu H, Tsirigos K, Razuvayevskaya O, Buniello A, Schwartzentruber J, Karim M, Ariano B, Martinez Osorio RE, Ferrer J, Ge X, Machlitt-Northen S, Gonzalez-Uriarte A, Saha S, Tirunagari S, Mehta C, Roldán-Romero JM, Horswell S, Young S, Ghoussaini M, Hulcoop DG, Dunham I, McDonagh EM. The next-generation Open Targets Platform: reimagined, redesigned, rebuilt. Nucleic Acids Res. 2023 Jan 6;51(D1):D1353-D1359. doi: [10.1093/nar/gkac1046](https://doi.org/10.1093/nar/gkac1046). PMID: 36399499; PMCID: PMC9825572. - diff --git a/docs/src/en/pdb.md b/docs/src/en/pdb.md index b81c77c68..4f1d749ad 100644 --- a/docs/src/en/pdb.md +++ b/docs/src/en/pdb.md @@ -12,7 +12,7 @@ PDB ID to be queried, e.g. '7S7U'. **Optional arguments** `-r` `--resource` Defines type of information to be returned. One of the following: - 'pdb': Returns the protein structure in PDB format (default). + 'pdb': Returns the protein structure in PDB format (default). 'entry': Information about PDB structures at the top level of PDB structure hierarchical data organization. 
'pubmed': Get PubMed annotations (data integrated from PubMed) for a given entry's primary citation. 'assembly': Information about PDB structures at the quaternary structure level. @@ -23,15 +23,15 @@ PDB ID to be queried, e.g. '7S7U'. 'branched_entity_instance': Get branched entity instance description (define chain ID as 'identifier'). 'polymer_entity_instance': Get polymer entity instance (a.k.a chain) data (define chain ID as 'identifier'). 'nonpolymer_entity_instance': Get non-polymer entity instance description (define chain ID as 'identifier'). - + `-i` `--identifier` Can be used to define assembly, entity or chain ID (default: None). Assembly/entity IDs are numbers (e.g. 1), and chain IDs are letters (e.g. 'A'). - -`-o` `--out` -Path to the file the results will be saved in, e.g. path/to/directory/7S7U.pdb or path/to/directory/7S7U_entry.json. Default: Standard out. + +`-o` `--out` +Path to the file the results will be saved in, e.g. path/to/directory/7S7U.pdb or path/to/directory/7S7U_entry.json. Default: Standard out. Python: `save=True` will save the output in the current working directory. - - + + ### Examples ```bash gget pdb 7S7U -o 7S7U.pdb @@ -44,10 +44,10 @@ gget.pdb("7S7U", save=True) **Find PDB crystal structures for a comparative analysis of protein structure:** ```bash -# Find PDB IDs associated with an Ensembl ID +# Find PDB IDs associated with an Ensembl ID gget info ENSG00000130234 -# Alternatively: Since many entries in the PDB do not have linked Ensembl IDs, +# Alternatively: Since many entries in the PDB do not have linked Ensembl IDs, # you will likely find more PDB entries by BLASTing the sequence agains the PDB. 
# Get the amino acid sequence of a transcript from an Ensembl ID @@ -61,10 +61,10 @@ gget pdb 7DQA -o 7DQA.pdb gget pdb 7CT5 -o 7CT5.pdb ``` ```python -# Find PDB IDs associated with an Ensembl ID +# Find PDB IDs associated with an Ensembl ID gget.info("ENSG00000130234") -# Alternatively: Since many entries in the PDB do not have linked Ensembl IDs, +# Alternatively: Since many entries in the PDB do not have linked Ensembl IDs, # you will likely find more PDB entries by BLASTing the sequence agains the PDB. # Get the amino acid sequence of a transcript from an Ensembl ID @@ -78,14 +78,12 @@ gget.pdb("7DQA", save=True) gget.pdb("7CT5", save=True) ``` → The use case above exemplifies how to find PDB files for comparative analysis of protein structure starting with Ensembl IDs or amino acid sequences. The fetched PDB files can also be compared to predicted structures generated by [`gget alphafold`](alphafold.md). PDB files can be viewed interactively in 3D [online](https://rcsb.org/3d-view), or using programs like [PyMOL](https://pymol.org/) or [Blender](https://www.blender.org/). To compare two PDB files, you can use [this website](https://rcsb.org/alignment). - + #### [More examples](https://github.com/pachterlab/gget_examples) # References -If you use `gget pdb` in a publication, please cite the following articles: +If you use `gget pdb` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - Berman HM, Westbrook J, Feng Z, Gilliland G, Bhat TN, Weissig H, Shindyalov IN, Bourne PE. The Protein Data Bank. Nucleic Acids Res. 2000 Jan 1;28(1):235-42. doi: [10.1093/nar/28.1.235](https://doi.org/10.1093/nar/28.1.235). PMID: 10592235; PMCID: PMC102472. 
- - diff --git a/docs/src/en/quick_start_guide.md b/docs/src/en/quick_start_guide.md index 7a533de79..84ad90928 100644 --- a/docs/src/en/quick_start_guide.md +++ b/docs/src/en/quick_start_guide.md @@ -97,4 +97,3 @@ gget$pdb("1R42", save=TRUE) gget$virus("Zika virus", host="Homo sapiens", nuc_completeness="complete") ``` #### [More examples](https://github.com/pachterlab/gget_examples) - diff --git a/docs/src/en/ref.md b/docs/src/en/ref.md index 0e6cb63a3..42f0e6b15 100644 --- a/docs/src/en/ref.md +++ b/docs/src/en/ref.md @@ -9,7 +9,7 @@ Return format: dictionary/JSON. `species` Species for which the FTPs will be fetched in the format genus_species, e.g. homo_sapiens. Supports all available vertebrate and invertebrate (plants, fungi, protists, and invertebrate metazoa) genomes from Ensembl, except bacteria. -Note: Not required when using flags `--list_species` or `--list_iv_species`. +Note: Not required when using flags `--list_species` or `--list_iv_species`. Supported shortcuts: 'human', 'mouse', 'human_grch37' (accesses the GRCh37 genome assembly) **Optional arguments** @@ -26,34 +26,34 @@ Possible entries are one or a combination (as comma-separated list) of the follo `-r` `--release` Defines the Ensembl release number from which the files are fetched, e.g. 104. Default: latest Ensembl release. -`-od` `--out_dir` +`-od` `--out_dir` Path to the directory where the FTPs will be saved, e.g. path/to/directory/. Default: Current working directory. -`-o` `--out` +`-o` `--out` Path to the JSON file the results will be saved in, e.g. path/to/directory/results.json. Default: Standard out. Python: `save=True` will save the output in the current working directory. **Flags** -`-l` `--list_species` +`-l` `--list_species` Lists all available vertebrate species. (Python: combine with `species=None`.) -`-liv` `--list_iv_species` +`-liv` `--list_iv_species` Lists all available invertebrate species. (Python: combine with `species=None`.) 
-`-ftp` `--ftp` +`-ftp` `--ftp` Returns only the requested FTP links. -`-d` `--download` +`-d` `--download` Command-line only. Downloads the requested FTPs to the directory specified by `out_dir` (requires [curl](https://curl.se/docs/) to be installed). -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. -Python: Use `verbose=False` to prevent progress information from being displayed. - - +Python: Use `verbose=False` to prevent progress information from being displayed. + + ### Examples -**Get the genome reference for a specific species:** +**Get the genome reference for a specific species:** ```bash gget ref -w gtf,dna homo_sapiens ``` @@ -93,7 +93,7 @@ gget ref --list_species -r 103 # Python gget.ref(species=None, list_species=True, release=103) ``` -→ Returns a list with all available genomes (checks if GTF and FASTAs are available) from Ensembl release 103. +→ Returns a list with all available genomes (checks if GTF and FASTAs are available) from Ensembl release 103. (If no release is specified, `gget ref` will always return information from the latest Ensembl release.)

@@ -111,7 +111,7 @@ kb ref \ #### [More examples](https://github.com/pachterlab/gget_examples) # References -If you use `gget ref` in a publication, please cite the following articles: +If you use `gget ref` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/en/search.md b/docs/src/en/search.md index 9065530db..712a2dabc 100644 --- a/docs/src/en/search.md +++ b/docs/src/en/search.md @@ -2,33 +2,33 @@ > Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag. # gget search 🔎 -Fetch genes and transcripts from [Ensembl](https://www.ensembl.org/) using free-form search terms. +Fetch genes and transcripts from [Ensembl](https://www.ensembl.org/) using free-form search terms. Results are matched based on the "gene name" and "description" sections in the Ensembl database. `gget` version >= 0.27.9 also includes results that match the Ensembl "synonym" section. Return format: JSON (command-line) or data frame/CSV (Python). **Positional argument** -`searchwords` +`searchwords` One or more free form search words, e.g. gaba nmda. (Note: Search is not case-sensitive.) -**Other required arguments** +**Other required arguments** `-s` `--species` Species or database to be searched. A species can be passed in the format 'genus_species', e.g. 'homo_sapiens' or 'arabidopsis_thaliana'. To pass a specific database, pass the name of the CORE database, e.g. 'mus_musculus_dba2j_core_105_1'. 
- + All available core databases can be found here: Vertebrates: [http://ftp.ensembl.org/pub/current/mysql/](http://ftp.ensembl.org/pub/current/mysql/) Invertebrates: [http://ftp.ensemblgenomes.org/pub/current/](http://ftp.ensemblgenomes.org/pub/current/) + select kingdom + go to mysql/ - + Supported shortcuts: 'human', 'mouse' **Optional arguments** -`-r` `--release` +`-r` `--release` Defines the Ensembl release number from which the files are fetched, e.g. 104. Default: None -> latest Ensembl release is used. - -Note: *The release argument does not apply to invertebrate species* (you can pass a specific core database (which includes a release number) to the `species` argument instead). For invertebrate species, Ensembl only stores databases from 10 releases prior to the current release. - -This argument is overwritten if a specific database (which includes a release number) is passed to the species argument. + +Note: *The release argument does not apply to invertebrate species* (you can pass a specific core database (which includes a release number) to the `species` argument instead). For invertebrate species, Ensembl only stores databases from 10 releases prior to the current release. + +This argument is overwritten if a specific database (which includes a release number) is passed to the species argument. `-t` `--id_type` 'gene' (default) or 'transcript' @@ -39,11 +39,11 @@ Returns genes or transcripts, respectively. 'or': Returns all genes that INCLUDE AT LEAST ONE of the searchwords in their name/description. 'and': Returns only genes that INCLUDE ALL of the searchwords in their name/description. -`-l` `--limit` +`-l` `--limit` Limits the number of search results, e.g. 10. Default: None. `-o` `--out` -Path to the csv the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out. +Path to the csv the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out. 
Python: `save=True` will save the output in the current working directory. **Flags** @@ -51,15 +51,15 @@ Python: `save=True` will save the output in the current working directory. Command-line only. Returns results in CSV format. Python: Use `json=True` to return output in JSON format. -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. -Python: Use `verbose=False` to prevent progress information from being displayed. +Python: Use `verbose=False` to prevent progress information from being displayed. `wrap_text` Python only. `wrap_text=True` displays data frame with wrapped text for easy reading (default: False). - - - + + + ### Example ```bash gget search -s human gaba gamma-aminobutyric @@ -74,13 +74,12 @@ gget.search(["gaba", "gamma-aminobutyric"], "homo_sapiens") | -------------- |-------------------------| ------------------------| -------------- | ----------|-----| | ENSG00000034713| GABARAPL2 | GABA type A receptor associated protein like 2 [Source:HGNC Symbol;Acc:HGNC:13291] | GABA type A receptor associated protein like 2 | protein_coding | https://uswest.ensembl.org/homo_sapiens/Gene/Summary?g=ENSG00000034713 | | . . . | . . . | . . . | . . . | . . . | . . . | - + #### [More examples](https://github.com/pachterlab/gget_examples) # References -If you use `gget search` in a publication, please cite the following articles: +If you use `gget search` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. 
[https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606. - diff --git a/docs/src/en/seq.md b/docs/src/en/seq.md index 1e7b3b897..bfe356c44 100644 --- a/docs/src/en/seq.md +++ b/docs/src/en/seq.md @@ -2,16 +2,16 @@ > Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag. # gget seq 🧬 -Fetch nucleotide or amino acid sequence(s) of a gene (and all its isoforms) or a transcript by Ensembl ID. 
+Fetch nucleotide or amino acid sequence(s) of a gene (and all its isoforms) or a transcript by Ensembl ID. Return format: FASTA. **Positional argument** -`ens_ids` +`ens_ids` One or more Ensembl IDs. **Optional arguments** -`-o` `--out` -Path to the file the results will be saved in, e.g. path/to/directory/results.fa. Default: Standard out. +`-o` `--out` +Path to the file the results will be saved in, e.g. path/to/directory/results.fa. Default: Standard out. Python: `save=True` will save the output in the current working directory. **Flags** @@ -20,11 +20,11 @@ Returns amino acid (instead of nucleotide) sequences. Nucleotide sequences are fetched from [Ensembl](https://www.ensembl.org/). Amino acid sequences are fetched from [UniProt](https://www.uniprot.org/). -`-iso` `--isoforms` +`-iso` `--isoforms` Returns the sequences of all known transcripts. (Only for gene IDs.) -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. Python: Use `verbose=False` to prevent progress information from being displayed. @@ -52,7 +52,7 @@ gget.seq("ENSG00000034713", translate=True, isoforms=True) #### [More examples](https://github.com/pachterlab/gget_examples) # References -If you use `gget seq` in a publication, please cite the following articles: +If you use `gget seq` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/en/setup.md b/docs/src/en/setup.md index 80752db11..e3ad9e1c3 100644 --- a/docs/src/en/setup.md +++ b/docs/src/en/setup.md @@ -15,12 +15,12 @@ gget module for which dependencies should be installed. `-o` `--out` Path to the folder downloaded files will be saved in (currently only applies to module = 'elm'). 
NOTE: Do NOT use this argument when downloading the files for use with `gget.elm`. -Default: None (downloaded files are saved inside the `gget` package installation folder). +Default: None (downloaded files are saved inside the `gget` package installation folder). **Flags** -`-q` `--quiet` +`-q` `--quiet` Command-line only. Prevents progress information from being displayed. -Python: Use `verbose=False` to prevent progress information from being displayed. +Python: Use `verbose=False` to prevent progress information from being displayed. ### Example diff --git a/docs/src/en/updates.md b/docs/src/en/updates.md index 14419bd29..0ff444f9d 100644 --- a/docs/src/en/updates.md +++ b/docs/src/en/updates.md @@ -6,7 +6,7 @@ - The `species` argument (both Python and command line) now accepts all five supported organisms; the CLI `choices`, help text, and docstrings list them. - Added early validation of the `species` argument that raises a clear `ValueError` listing the supported species, instead of failing later inside the Census API call. - Note: the new primate species require `census_version="2025-11-08"` (LTS) or newer. - + **Version ≥ 0.30.6** (Jun 10, 2026): - [`gget blat`](blat.md): Improved resilience against UCSC BLAT endpoint failures (fixes intermittently failing tests). - Added retry-with-exponential-backoff for transient failures (HTTP 429/5xx, network errors, and non-JSON 200 responses caused by UCSC rate-limiting or HTML error pages). Up to 4 attempts with 1.5s → 3s → 6s backoff. @@ -26,6 +26,11 @@ - `utils.get_uniprot_seqs`: Collect per-ID DataFrames in a list and `pd.concat(..., ignore_index=True)` once at the end, avoiding the O(n²) cost of growing a DataFrame inside the request loop. - Cached `utils.find_latest_ens_rel`, `utils.search_species_options`, `utils.ref_species_options`, and `utils.find_nv_kingdom` with `functools.lru_cache`. These hit Ensembl FTP listings that are stable for a release; repeated calls within one Python process are now free. 
- Added `utils.parallel_map`, a thin `ThreadPoolExecutor` wrapper for I/O-bound work. Used to fan out `utils.get_uniprot_seqs` across the input ID list — looking up N IDs is now bounded by ~`N / pool_size` UniProt round-trips instead of `N`. Pool size defaults to 8 and can be overridden via the `GGET_MAX_WORKERS` environment variable. +- Developer tooling / packaging: + - Migrated packaging to a single `pyproject.toml` (the [hatchling](https://hatch.pypa.io/) build backend); removed `setup.py`, `setup.cfg`, `requirements.txt`, `dev-requirements.txt`, and `MANIFEST.in`. Runtime dependencies and the `test` dependency group are now declared in `pyproject.toml`. + - The minimum supported Python version is now **3.12**. + - Added a [pre-commit](https://pre-commit.com/) configuration (lint + format via [ruff](https://docs.astral.sh/ruff/), plus standard hygiene hooks). Run `prek run --all-files` (or `pre-commit run --all-files`) before opening a PR. + - Modernized the test CI to use [uv](https://docs.astral.sh/uv/) and run on pull requests, and added package-build-check and PyPI trusted-publishing workflows. **Version ≥ 0.30.5** (May 23, 2026): - [`gget opentargets`](opentargets.md): Rewrote this module to reflect the new Open Targets API structure @@ -69,7 +74,7 @@ - [`gget pdb`](pdb.md): Added wwpdb mirror; falls back to rcsb if wwpdb fails. - [`gget cellxgene`](cellxgene.md): Improved argument handling; frontend unchanged. Fixes [issue 181](https://github.com/pachterlab/gget/issues/181). - [`gget setup`](setup.md)/[`gget alphafold`](alphafold.md): Fixed pip_cmd bug in gget.setup("alphafold") - + **Version ≥ 0.29.2** (Jul 03, 2025): - gget can now be installed using `uv pip install gget` - All package metadata (version, author, description, etc.) is now managed in setup.cfg for full compatibility with modern tools like uv, pip, and PyPI @@ -94,7 +99,7 @@ - Allow querying multiple genes at once. 
- [`gget diamond`](diamond.md): - Now supports translated alignment of nucleotide sequences to amino acid reference sequences using the `--translated` flag. -- [`gget elm`](elm.md): +- [`gget elm`](elm.md): - Improved server error handling. **Version ≥ 0.29.0** (Sep 25, 2024): @@ -122,12 +127,12 @@ - [`gget ref`](./ref.md): Can now fetch the GRCh37 genome assembly using `species='human_grch37'` - [`gget search`](./search.md): Adjust access of human data to the structure of Ensembl release 112 (fixes [issue 129](https://github.com/pachterlab/gget/issues/129)) -~~**Version ≥ 0.28.5** (May 29, 2024):~~ +~~**Version ≥ 0.28.5** (May 29, 2024):~~ - Yanked due to logging bug in `gget.setup("alphafold")` + inversion mutations in `gget mutate` only reverse the string instead of also computing the complementary strand - + **Version ≥ 0.28.4** (January 31, 2024): - [`gget setup`](./setup.md): Fix bug with filepath when running `gget.setup("elm")` on Windows OS. - + **Version ≥ 0.28.3** (January 22, 2024): - **[`gget search`](./search.md) and [`gget ref`](./ref.md) now also support fungi 🍄, protists 🌝, and invertebrate metazoa 🐝 🐜 🐌 🐙 (in addition to vertebrates and plants)** - **New module: [`gget cosmic`](./cosmic.md)** @@ -140,7 +145,7 @@ - [`gget setup`](./setup.md): Use the `out` argument to specify a directory the ELM database will be downloaded into. Completes [this feature request](https://github.com/pachterlab/gget/issues/119). - [`gget diamond`](./diamond.md): The DIAMOND command is now run with `--ignore-warnings` flag, allowing niche sequences such as amino acid sequences that only contain nucleotide characters and repeated sequences. This is also true for DIAMOND alignments performed within [`gget elm`](./elm.md). 
- **[`gget ref`](./ref.md) and [`gget search`](./search.md) back-end change: the current Ensembl release is fetched from the new [release file](https://ftp.ensembl.org/pub/VERSION) on the Ensembl FTP site to avoid errors during uploads of new releases.** -- [`gget search`](./search.md): +- [`gget search`](./search.md): - FTP link results (`--ftp`) are saved in txt file format instead of json. - Fix URL links to Ensembl gene summary for species with a subspecies name and invertebrates. - [`gget ref`](./ref.md): @@ -152,7 +157,7 @@ - Replace deprecated 'text' argument to find()-type methods whenever used with dependency `BeautifulSoup` - [`gget elm`](elm.md): Remove false positive and true negative instances from returned results - [`gget elm`](elm.md): Add `expand` argument - + **Version ≥ 0.28.0** (November 5, 2023): - Updated documentation of [`gget muscle`](./muscle.md) to add a tutorial on how to visualize sequences with varying sequence name lengths + slight change to returned visualization so it's a bit more robust to varying sequence names - [`gget muscle`](./muscle.md) now also allows a list of sequences as input (as an alternative to providing the path to a FASTA file) @@ -160,7 +165,7 @@ - [`gget seq`](./seq.md): Allow missing gene names (fixes [https://github.com/pachterlab/gget/issues/107](https://github.com/pachterlab/gget/issues/107)) - **[`gget enrichr`](enrichr.md): Use new arguments `kegg_out` and `kegg_rank` to create an image of the KEGG pathway with the genes from the enrichment analysis highlighted (thanks to [this PR](https://github.com/pachterlab/gget/pull/106) by [Noriaki Sato](https://github.com/noriakis))** - **New modules: [`gget elm`](elm.md) and [`gget diamond`](diamond.md)** - + **Version ≥ 0.27.9** (August 7, 2023): - **[`gget enrichr`](enrichr.md): Use new argument `background_list` to provide a list of background genes** - [`gget search`](search.md) now also searches [Ensembl](https://ensembl.org/) synonyms (in addition to gene 
descriptions and names) to return more comprehensive search results (thanks to [Samuel Klein](https://github.com/KleinSamuel) for the [suggestion](https://github.com/pachterlab/gget/issues/90)) @@ -185,11 +190,11 @@ **Version ≥ 0.27.4** (March 19, 2023): - **New module: [`gget gpt`](gpt.md)** - + **Version ≥ 0.27.3** (March 11, 2023): - [`gget info`](info.md) excludes PDB IDs by default to increase speed (PDB results can be included using flag `--pdb` / `pdb=True`). -**Version ≥ 0.27.2** (January 1, 2023): +**Version ≥ 0.27.2** (January 1, 2023): - Updated [`gget alphafold`](alphafold.md) to [DeepMind's AlphaFold v2.3.0](https://github.com/deepmind/alphafold/releases/tag/v2.3.0) (including new arguments `multimer_for_monomer` and `multimer_recycles`) **Version ≥ 0.27.0** (December 10, 2022): diff --git a/docs/src/en/virus.md b/docs/src/en/virus.md index be2cc0900..c60f67087 100644 --- a/docs/src/en/virus.md +++ b/docs/src/en/virus.md @@ -21,12 +21,12 @@ Add `--is_accession` when passing an NCBI accession number. Add `--is_sars_cov2` For SARS-CoV-2 and Alphainfluenza cached downloads, supports: - Single accession: `NC_045512.2` - - Space-separated list: `NC_045512.2 MN908947.3 MT020781.1` + - Space-separated list: `NC_045512.2 MN908947.3 MT020781.1` - Text file path: `accessions.txt` (one accession per line) Use flag `--download_all_accessions` to apply filters without searching for a specific virus. -**Optional arguments** +**Optional arguments** _Host filters_ @@ -87,7 +87,7 @@ Command line: `--annotated true` to fetch only that have been annotated with gen Python: `annotated=True` or `annotated=False` (`annotated=None` for no filter). `--lab_passaged` -'true' or 'false'. Filter for or against lab-passaged samples. +'true' or 'false'. Filter for or against lab-passaged samples. Command line: `--lab_passaged true` to fetch only lab-passaged samples, or `--lab_passaged false` to exclude them. 
Python: `lab_passaged=True` or `lab_passaged=False` (`lab_passaged=None` for no filter). @@ -205,8 +205,8 @@ Python: `merge_results=False` `-a` `--is_accession` Flag to indicate that the `virus` positional argument is an accession number, a space-separated list of accessions, or a path to a text file containing accession numbers (one per line). -`--download_all_accessions` -Use this flag when applying filters without searching for a specific virus (leave `virus` argument empty). +`--download_all_accessions` +Use this flag when applying filters without searching for a specific virus (leave `virus` argument empty). ⚠️ **WARNING**: If you do not specify additional filters, this flag downloads ALL available viral sequences from NCBI (entire Viruses taxonomy, taxon ID 10239). This is an extremely large dataset that can take many hours to download and require significant disk space. Use with caution and ensure you have adequate storage and bandwidth. When this flag is set, the `virus` argument is ignored. `--is_sars_cov2` @@ -227,7 +227,7 @@ Flag to keep all intermediate/temporary files generated during processing. By de `-q` `--quiet` Command-line only. Prevents progress information from being displayed. -Python: Use `verbose=False` to prevent progress information from being displayed. +Python: Use `verbose=False` to prevent progress information from being displayed. 
### Example @@ -278,8 +278,8 @@ gget virus "SARS-CoV-2" --host human --nuc_completeness complete --min_seq_lengt import gget gget.virus( - "SARS-CoV-2", - host="human", + "SARS-CoV-2", + host="human", nuc_completeness="complete", min_seq_length=29000, genbank_metadata=True, @@ -302,8 +302,8 @@ gget virus "Influenza A virus" --host human --nuc_completeness complete --max_se import gget gget.virus( - "Influenza A virus", - host="human", + "Influenza A virus", + host="human", nuc_completeness="complete", max_seq_length=15000, genbank_metadata=True, @@ -660,5 +660,3 @@ If you use `gget virus` in a publication, please cite the following articles: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - O’Leary, N.A., Cox, E., Holmes, J.B. et al (2024). Exploring and retrieving sequence and metadata for species across the tree of life with NCBI Datasets. Sci Data 11, 732. [https://doi.org/10.1038/s41597-024-03571-y](https://doi.org/10.1038/s41597-024-03571-y) - - diff --git a/docs/src/es/alphafold.md b/docs/src/es/alphafold.md index 7c1a67c9a..a93445385 100644 --- a/docs/src/es/alphafold.md +++ b/docs/src/es/alphafold.md @@ -13,9 +13,9 @@ Antes de usar `gget alphafold` por primera vez: `conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0` Para Python versión 3.11: `conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0` - + Recomendación: siga con `conda update -qy conda` para actualizar _conda_ a la última versión. -3. Corre `gget setup alphafold` / `gget.setup("alphafold")` (ver también [`gget setup`](setup.md)). Al ejecutar `gget setup alphafold` / `gget.setup("alphafold")` se descargará e instalará la última versión de AlphaFold2 alojada en el [AlphaFold GitHub Repo](https://github.com/deepmind/alphafold). 
Puede volver a ejecutar este comando en cualquier momento para actualizar el software cuando hay una nueva versión de AlphaFold. +3. Corre `gget setup alphafold` / `gget.setup("alphafold")` (ver también [`gget setup`](setup.md)). Al ejecutar `gget setup alphafold` / `gget.setup("alphafold")` se descargará e instalará la última versión de AlphaFold2 alojada en el [AlphaFold GitHub Repo](https://github.com/deepmind/alphafold). Puede volver a ejecutar este comando en cualquier momento para actualizar el software cuando hay una nueva versión de AlphaFold. **Parámetro posicional** `sequence` @@ -26,17 +26,17 @@ Secuencia de aminoácidos (str), o una lista de secuencias (*gget alphafold auto El algoritmo de multímero se reciclara hasta que las predicciones dejen de cambiar, el limite de ciclos esta indicado aqui. Por defecto: 3 Para obtener más exactitud, ajusta este limite a 20 (al costo de ejecuciones mas tardadas). -`-o` `--out` +`-o` `--out` Ruta a la carpeta para guardar los resultados de la predicción (str). Por defecto: "./[fecha_tiempo]_gget_alphafold_prediction". - -**Banderas** + +**Banderas** `-mfm` `--multimer_for_monomer` Usa el algoritmo de multímero para un monómero. -`-r` `--relax` +`-r` `--relax` Relaja el mejor modelo con el algoritmo AMBER. -`-q` `--quiet` +`-q` `--quiet` Uso limitado para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa. Para Python, usa `verbose=False`. @@ -45,8 +45,8 @@ Solo para Python. `plot=True` provée una visualización interactiva de la predi `show_sidechains` Solo para Python. `show_sidechains=True` incluye las cadenas laterales de proteínas en el esquema (por defecto: True). 
- - + + ### Ejemplo ```bash # Predice la estructura de una proteína derivada de su secuencia de aminoácidos @@ -82,12 +82,12 @@ gget.pdb("2K42", save=True) ### [🔗 gget alphafold - preguntas más frecuentes](https://github.com/pachterlab/gget/discussions/39) -# Citar +# Citar Si utiliza `gget alphafold` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - Jumper, J., Evans, R., Pritzel, A. et al. Highly accurate protein structure prediction with AlphaFold. Nature 596, 583–589 (2021). [https://doi.org/10.1038/s41586-021-03819-2](https://doi.org/10.1038/s41586-021-03819-2) -Y, si corresponde: +Y, si corresponde: - Evans, R. et al. Protein complex prediction with AlphaFold-Multimer. bioRxiv 2021.10.04.463034; [https://doi.org/10.1101/2021.10.04.463034](https://doi.org/10.1101/2021.10.04.463034) diff --git a/docs/src/es/archs4.md b/docs/src/es/archs4.md index 80dabd91a..27e0eb66a 100644 --- a/docs/src/es/archs4.md +++ b/docs/src/es/archs4.md @@ -17,27 +17,27 @@ Alternativamente: usa la bandera `--ensembl` para ingresar un ID tipo Ensembl, p 'tissue' produce un atlas de expresión tisular calculado de todas las muestras humanas o de ratón (según lo definido usando el parámetro `--species` (especies)) en [ARCHS4](https://maayanlab.cloud/archs4/). `-s` `--species` -'human' (humano; se usa por defecto) o 'mouse' (ratón). +'human' (humano; se usa por defecto) o 'mouse' (ratón). Define si se usan muestras humanas o de ratón de [ARCHS4](https://maayanlab.cloud/archs4/). (Solo aplica para el atlas de expresión tisular.) -`-o` `--out` +`-o` `--out` Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.csv (o .json). Por defecto: salida estándar (STDOUT). 
Para Python, use `save=True` para guardar los resultados en el directorio de trabajo actual. - -**Banderas** + +**Banderas** `-e` `--ensembl` -Usa esta bandera si `gene` se ingresa como ID tipo Ensembl. +Usa esta bandera si `gene` se ingresa como ID tipo Ensembl. `-csv` `--csv` -Solo para Terminal. Produce los resultados en formato CSV. -Para Python, usa `json=True` para obtener los resultados en formato JSON. +Solo para Terminal. Produce los resultados en formato CSV. +Para Python, usa `json=True` para obtener los resultados en formato JSON. -`-q` `--quiet` +`-q` `--quiet` Solo para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa. Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la ejecución del programa. - - + + ### Ejemplo ```bash gget archs4 ACE2 @@ -49,10 +49,10 @@ gget.archs4("ACE2") → Produce los 100 genes más correlacionados con el gen ACE2: | gene_symbol | pearson_correlation | -| -------------- |-------------------------| -| SLC5A1 | 0.579634 | -| CYP2C18 | 0.576577 | -| . . . | . . . | +| -------------- |-------------------------| +| SLC5A1 | 0.579634 | +| CYP2C18 | 0.576577 | +| . . . | . . . |

@@ -66,9 +66,9 @@ gget.archs4("ACE2", which="tissue") → Produce la expresión tisular de ACE2 (por defecto, se utilizan datos humanos): | id | min | q1 | median | q3 | max | -| ------ |--------| ------ |--------| ------ |--------| +| ------ |--------| ------ |--------| ------ |--------| | System.Urogenital/Reproductive System.Kidney.RENAL CORTEX | 0.113644 | 8.274060 | 9.695840 | 10.51670 | 11.21970 | -| System.Digestive System.Intestine.INTESTINAL EPITHELIAL CELL | 0.113644 | 5.905560 | 9.570450 | 13.26470 | 13.83590 | +| System.Digestive System.Intestine.INTESTINAL EPITHELIAL CELL | 0.113644 | 5.905560 | 9.570450 | 13.26470 | 13.83590 | | . . . | . . . | . . . | . . . | . . . | . . . |

@@ -79,7 +79,7 @@ Consulte [este tutorial](https://davetang.org/muse/2023/05/16/check-where-a-gene #### [Más ejemplos](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget archs4` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/es/bgee.md b/docs/src/es/bgee.md index aa13a4745..360458766 100644 --- a/docs/src/es/bgee.md +++ b/docs/src/es/bgee.md @@ -34,7 +34,7 @@ Python: Usa `json=True` para devolver la salida en formato JSON. `-q` `--quiet` Solo en línea de comandos. Evita que se muestre la información de progreso. Python: Usa `verbose=False` para evitar que se muestre la información de progreso. - + ### Ejemplos **Obtener ortólogos para un gen** @@ -93,7 +93,7 @@ import gget gget.bgee(["ENSBTAG00000047356", "ENSBTAG00000018317"], type="expression") ``` -→ Devuelve datos de expresión génica para los genes ENSBTAG00000047356 y ENSBTAG00000018317: +→ Devuelve datos de expresión génica para los genes ENSBTAG00000047356 y ENSBTAG00000018317: | anat_entity_id | anat_entity_name | score | score_confidence | expression_state | |----------------|-----------------------------|-------|------------------|------------------| @@ -102,10 +102,10 @@ gget.bgee(["ENSBTAG00000047356", "ENSBTAG00000018317"], type="expression") | BGEE:0000000 | anatomical entity and cellular component | 89.12 | high | expressed | | ... | ... | ... | ... | ... | - + #### [Más ejemplos](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget bgee` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. 
[https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/es/blast.md b/docs/src/es/blast.md index 694a33645..722afca40 100644 --- a/docs/src/es/blast.md +++ b/docs/src/es/blast.md @@ -6,7 +6,7 @@ BLAST una secuencia de nucleótidos o aminoácidos a cualquier base de datos [BL Produce: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python). **Parámetro posicional** -`sequence` +`sequence` Secuencia de nucleótidos o aminoácidos, o una ruta a un archivo tipo FASTA o .txt. **Parámetros optionales** @@ -25,7 +25,7 @@ Limita el número de resultados producidos. Por defecto: 50. `-e` `--expect` Define el umbral de ['expect value'](https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=FAQ#expect). Por defecto: 10.0. -`-o` `--out` +`-o` `--out` Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.csv (o .json). Por defecto: salida estándar (STDOUT). Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual. @@ -37,16 +37,16 @@ Activa el ['low complexity filter'](https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD Desactiva el algoritmo MegaBLAST. Por defecto: MegaBLAST esta activado (solo aplicable para blastn). `-csv` `--csv` -Solo para Terminal. Produce los resultados en formato CSV. +Solo para Terminal. Produce los resultados en formato CSV. Para Python, usa `json=True` para producir los resultados en formato JSON. -`-q` `--quiet` +`-q` `--quiet` Solo para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa. Para Python, usa `verbose=False` para imipidir la informacion de progreso de ser exhibida durante la ejecución del programa. `wrap_text` -Solo para Python. `wrap_text=True` muestra los resultados con texto envuelto para facilitar la lectura (por defecto: False). - +Solo para Python. 
`wrap_text=True` muestra los resultados con texto envuelto para facilitar la lectura (por defecto: False). + ### Por ejemplo ```bash gget blast MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQFMWIIRKRIQLPSEKAIFLFVDKTVPQSR @@ -60,7 +60,7 @@ gget.blast("MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQFMWIIRKRI | Description | Scientific Name | Common Name | Taxid | Max Score | Total Score | Query Cover | ... | | -------------- |-------------------------| ------------------------| -------------- | ----------|-----|---|---| | PREDICTED: gamma-aminobutyric acid receptor-as...| Colobus angolensis palliatus | NaN | 336983 | 180 | 180 | 100% | ... | -| . . . | . . . | . . . | . . . | . . . | . . . | . . . | ... | +| . . . | . . . | . . . | . . . | . . . | . . . | . . . | ... | **BLAST desde un archivo .fa o .txt:** @@ -75,7 +75,7 @@ gget.blast("fasta.fa") #### [Más ejemplos](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget blast` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/es/blat.md b/docs/src/es/blat.md index 21109750c..44e154f35 100644 --- a/docs/src/es/blat.md +++ b/docs/src/es/blat.md @@ -2,32 +2,32 @@ > Parámetros de Python són iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Banderas son parámetros de verdadero o falso (True/False) en Python. El manuál para cualquier modulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`. # gget blat 🎯 -Encuentra la ubicación genómica de una secuencia de nucleótidos o aminoácidos usando [BLAT](https://genome.ucsc.edu/cgi-bin/hgBlat). +Encuentra la ubicación genómica de una secuencia de nucleótidos o aminoácidos usando [BLAT](https://genome.ucsc.edu/cgi-bin/hgBlat). 
Produce: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python). **Parámetro posicional** -`sequence` +`sequence` Secuencia de nucleótidos o aminoácidos, o una ruta a un archivo tipo FASTA o .txt. **Parámetros optionales** -`-st` `--seqtype` -'DNA', 'protein', 'translated%20RNA', o 'translated%20DNA'. +`-st` `--seqtype` +'DNA', 'protein', 'translated%20RNA', o 'translated%20DNA'. Por defecto: 'DNA' para secuencias de nucleótidos; 'protein' para secuencias de aminoácidos. -`-a` `--assembly` -Ensamblaje del genoma. 'human' (hg38) (se usa por defecto), 'mouse' (mm39) (ratón), 'zebrafish' (taeGut2) (pinzón cebra), +`-a` `--assembly` +Ensamblaje del genoma. 'human' (hg38) (se usa por defecto), 'mouse' (mm39) (ratón), 'zebrafinch' (taeGut2) (pinzón cebra), o cualquiera de los ensamblajes de especies disponibles [aquí](https://genome.ucsc.edu/cgi-bin/hgBlat) (use el nombre corto del ensamblado, p. ej. 'hg38'). -`-o` `--out` +`-o` `--out` Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.csv (o .json). Por defecto: salida estándar (STDOUT). Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual. - + **Banderas** `-csv` `--csv` -Solo para Terminal. Produce los resultados en formato CSV. +Solo para Terminal. Produce los resultados en formato CSV. Para Python, usa `json=True` para producir los resultados en formato JSON. -`-q` `--quiet` +`-q` `--quiet` Solo para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa. Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la ejecución del programa. @@ -40,7 +40,7 @@ gget blat -a taeGut2 MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQ # Python gget.blat("MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQFMWIIRKRIQLPSEKAIFLFVDKTVPQSR", assembly="taeGut2") ``` -→ Produce los resultados de BLAT para el ensamblaje taeGut2 (pinzón cebra). 
En este ejemplo, `gget blat` automáticamente detecta esta secuencia como una secuencia de aminoácidos y, por lo tanto, establece el tipo de secuencia (`--seqtype`) como *proteína*. +→ Produce los resultados de BLAT para el ensamblaje taeGut2 (pinzón cebra). En este ejemplo, `gget blat` automáticamente detecta esta secuencia como una secuencia de aminoácidos y, por lo tanto, establece el tipo de secuencia (`--seqtype`) como *proteína*. | genome | query_size | aligned_start | aligned_end | matches | mismatches | %_aligned | ... | | -------------- |-------------------------| ------------------------| -------------- | ----------|-----|---|---| @@ -48,7 +48,7 @@ gget.blat("MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQFMWIIRKRIQ #### [Màs ejemplos](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget blat` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/es/cbio.md b/docs/src/es/cbio.md index dc22820ed..eb3aea1d4 100644 --- a/docs/src/es/cbio.md +++ b/docs/src/es/cbio.md @@ -104,7 +104,7 @@ gget.cbio_search(['esophag', 'ovary', 'ovarian'])

-**Graficar un mapa de calor de ocurrencias de mutaciones para genes específicos en un estudio específico:** +**Graficar un mapa de calor de ocurrencias de mutaciones para genes específicos en un estudio específico:** ```bash gget cbio plot \ -s msk_impact_2017 \ @@ -131,7 +131,7 @@ gget.cbio_plot(

-**Graficar un mapa de calor de tipos de mutaciones para genes específicos en un estudio específico:** +**Graficar un mapa de calor de tipos de mutaciones para genes específicos en un estudio específico:** ```bash gget cbio plot \ -s msk_impact_2017 \ @@ -217,19 +217,18 @@ gget.cbio_plot( → Guarda un mapa de calor de los tipos de mutaciones para los genes especificados en el estudio especificado, filtrado por tejido, con el título "Mutaciones intestinales" en ./gget_cbio_figures/intestinal_mutations.png. ![Heatmap](https://raw.githubusercontent.com/pachterlab/gget/b32c01efefd55d37c19034ce96a86826e30ae3e5/docs/assets/gget_cbio_figure_4.png) - + #### [Más ejemplos](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget cbio` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - Cerami E, Gao J, Dogrusoz U, Gross BE, Sumer SO, Aksoy BA, Jacobsen A, Byrne CJ, Heuer ML, Larsson E, Antipin Y, Reva B, Goldberg AP, Sander C, Schultz N. The cBio cancer genomics portal: an open platform for exploring multidimensional cancer genomics data. Cancer Discov. 2012 May;2(5):401-4. doi: [10.1158/2159-8290.CD-12-0095](https://doi.org/10.1158/2159-8290.cd-12-0095). Erratum in: Cancer Discov. 2012 Oct;2(10):960. PMID: 22588877; PMCID: PMC3956037. - + - Gao J, Aksoy BA, Dogrusoz U, Dresdner G, Gross B, Sumer SO, Sun Y, Jacobsen A, Sinha R, Larsson E, Cerami E, Sander C, Schultz N. Integrative analysis of complex cancer genomics and clinical profiles using the cBioPortal. Sci Signal. 2013 Apr 2;6(269):pl1. doi: [10.1126/scisignal.2004088](https://doi.org/10.1126/scisignal.2004088). PMID: 23550210; PMCID: PMC4160307. 
- + - de Bruijn I, Kundra R, Mastrogiacomo B, Tran TN, Sikina L, Mazor T, Li X, Ochoa A, Zhao G, Lai B, Abeshouse A, Baiceanu D, Ciftci E, Dogrusoz U, Dufilie A, Erkoc Z, Garcia Lara E, Fu Z, Gross B, Haynes C, Heath A, Higgins D, Jagannathan P, Kalletla K, Kumari P, Lindsay J, Lisman A, Leenknegt B, Lukasse P, Madela D, Madupuri R, van Nierop P, Plantalech O, Quach J, Resnick AC, Rodenburg SYA, Satravada BA, Schaeffer F, Sheridan R, Singh J, Sirohi R, Sumer SO, van Hagen S, Wang A, Wilson M, Zhang H, Zhu K, Rusk N, Brown S, Lavery JA, Panageas KS, Rudolph JE, LeNoue-Newton ML, Warner JL, Guo X, Hunter-Zinck H, Yu TV, Pilai S, Nichols C, Gardos SM, Philip J; AACR Project GENIE BPC Core Team, AACR Project GENIE Consortium; Kehl KL, Riely GJ, Schrag D, Lee J, Fiandalo MV, Sweeney SM, Pugh TJ, Sander C, Cerami E, Gao J, Schultz N. Analysis and Visualization of Longitudinal Genomic and Clinical Data from the AACR Project GENIE Biopharma Collaborative in cBioPortal. Cancer Res. 2023 Dec 1;83(23):3861-3867. doi: [10.1158/0008-5472.CAN-23-0816](https://doi.org/10.1158/0008-5472.CAN-23-0816). PMID: 37668528; PMCID: PMC10690089. - -- Please also cite the source of the data if you are using a publicly available dataset. +- Please also cite the source of the data if you are using a publicly available dataset. diff --git a/docs/src/es/cellxgene.md b/docs/src/es/cellxgene.md index a66ae41a0..35d99c837 100644 --- a/docs/src/es/cellxgene.md +++ b/docs/src/es/cellxgene.md @@ -13,7 +13,7 @@ Antes de usar `gget cellxgene` por primera vez, corre `gget setup cellxgene` / ` `-g` `--gene` Str o lista de genes de interés o ID(s) tipo Ensembl. Por defecto: None (ninguno). -Atención: Utilice la bandera `-e / --ensembl` (Python: `ensembl=True`) cuando ingrese ID(s) tipo Ensembl. +Atención: Utilice la bandera `-e / --ensembl` (Python: `ensembl=True`) cuando ingrese ID(s) tipo Ensembl. Atención: ¡Los símbolos de genes distinguen mayúsculas y minúsculas! 
Usa la capitalización canónica al pasar símbolos de genes; p. ej., ‘PAX7’ (humano), ‘Pax7’ (ratón). Ver https://cellxgene.cziscience.com/gene-expression para ejemplos de genes. @@ -22,21 +22,21 @@ Versión del CZ CELLxGENE Discover Census (str), p. ej. "2023-05-15", o "latest" `-cn` `--column_names` Lista de columnas de metadatos a obtener (almacenadas en AnnData.obs). -Por defecto: ['dataset_id', 'assay', 'suspension_type', 'sex', 'tissue_general', 'tissue', 'cell_type'] +Por defecto: ['dataset_id', 'assay', 'suspension_type', 'sex', 'tissue_general', 'tissue', 'cell_type'] Para más opciones, ver: https://api.cellxgene.cziscience.com/curation/ui/#/ -> 'Schemas' -> 'dataset' -`-o` `--out` +`-o` `--out` Ruta al archivo para guardar el objeto AnnData formato .h5ad (o .csv con bandera `-mo / --meta_only`). ¡Requerido cuando se usa desde Terminal! **Banderas** `-e` `--ensembl` -Usa esta bandera si `gene` se ingresa como ID tipo Ensembl. +Usa esta bandera si `gene` se ingresa como ID tipo Ensembl. `-mo` `--meta_only` Solo produce la tabla (Dataframe) con metadatos (corresponde a AnnData.obs). -`-q` `--quiet` +`-q` `--quiet` Solo para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa. Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la ejecución del programa. @@ -70,7 +70,7 @@ Str o lista de tejido(s) del tipo high-level. Por defecto: None. Tejidos y sus IDs de UBERON se enumeran [aquí](https://github.com/chanzuckerberg/single-cell-data-portal/blob/9b94ccb0a2e0a8f6182b213aa4852c491f6f6aff/backend/wmg/data/tissue_mapper.py). `--tissue_ontology_term_id` -Str o lista de ID(s) de 'tissue ontology term' como están definidos en el [esquema de datos del CELLxGENE](https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema). Por defecto: None. 
+Str o lista de ID(s) de 'tissue ontology term' como están definidos en el [esquema de datos del CELLxGENE](https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema). Por defecto: None. `--assay_ontology_term_id` Str o lista de ID(s) de 'assay ontology term' como están definidos en el [esquema de datos del CELLxGENE](https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema). Por defecto: None. @@ -81,7 +81,7 @@ Str o lista de 'assays' (métodos) como están definidos en el [esquema de datos `--cell_type_ontology_term_id` Str o lista de ID(s) de 'celltype ontology term' como están definidos en el [esquema de datos del CELLxGENE](https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema). Por defecto: None. -`--development_stage_ontology_term_id` +`--development_stage_ontology_term_id` Str o lista de ID(s) de 'development stage ontology term' como están definidos en el [esquema de datos del CELLxGENE](https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema). Por defecto: None. `--disease_ontology_term_id` @@ -102,7 +102,7 @@ Str o lista de ID(s) de 'sex ontology' como están definidos en el [esquema de d `--suspension_type` Str o lista de tipo(s) de suspensión como están definidos en el [esquema de datos del CELLxGENE](https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema). Por defecto: None. - + ### Ejemplo ```bash gget cellxgene --gene ACE2 ABCA1 SLC5A1 --tissue lung --cell_type 'mucus secreting cell' 'neuroendocrine cell' -o example_adata.h5ad @@ -139,7 +139,7 @@ df Ver también: [https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_gget_demo.html](https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_gget_demo.html) -# Citar +# Citar Si utiliza `gget cellxgene` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. 
[https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/es/cite.md b/docs/src/es/cite.md index 74d4449c7..62a15d272 100644 --- a/docs/src/es/cite.md +++ b/docs/src/es/cite.md @@ -4,7 +4,7 @@ # Citar -Si utiliza `gget` en una publicación, favor de citar: +Si utiliza `gget` en una publicación, favor de citar: Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. https://doi.org/10.1093/bioinformatics/btac836 - Si utiliza `gget alphafold`, favor de citar también: @@ -20,7 +20,7 @@ Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference data - Si utiliza `gget bgee`, favor de citar también: - Frederic B Bastian, Julien Roux, Anne Niknejad, Aurélie Comte, Sara S Fonseca Costa, Tarcisio Mendes de Farias, Sébastien Moretti, Gilles Parmentier, Valentine Rech de Laval, Marta Rosikiewicz, Julien Wollbrett, Amina Echchiki, Angélique Escoriza, Walid H Gharib, Mar Gonzales-Porta, Yohan Jarosz, Balazs Laurenczy, Philippe Moret, Emilie Person, Patrick Roelli, Komal Sanjeev, Mathieu Seppey, Marc Robinson-Rechavi (2021). The Bgee suite: integrated curated expression atlas and comparative transcriptomics in animals. Nucleic Acids Research, Volume 49, Issue D1, 8 January 2021, Pages D831–D847, [https://doi.org/10.1093/nar/gkaa793](https://doi.org/10.1093/nar/gkaa793) - + - Si utiliza `gget blast`, favor de citar también: - Altschul SF, Gish W, Miller W, Myers EW, Lipman DJ. Basic local alignment search tool. J Mol Biol. 1990 Oct 5;215(3):403-10. doi: 10.1016/S0022-2836(05)80360-2. PMID: 2231712. @@ -29,16 +29,16 @@ Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference data - Si utiliza `gget cbio`, favor de citar también: - Cerami E, Gao J, Dogrusoz U, Gross BE, Sumer SO, Aksoy BA, Jacobsen A, Byrne CJ, Heuer ML, Larsson E, Antipin Y, Reva B, Goldberg AP, Sander C, Schultz N. 
The cBio cancer genomics portal: an open platform for exploring multidimensional cancer genomics data. Cancer Discov. 2012 May;2(5):401-4. doi: [10.1158/2159-8290.CD-12-0095](https://doi.org/10.1158/2159-8290.cd-12-0095). Erratum in: Cancer Discov. 2012 Oct;2(10):960. PMID: 22588877; PMCID: PMC3956037. - + - Gao J, Aksoy BA, Dogrusoz U, Dresdner G, Gross B, Sumer SO, Sun Y, Jacobsen A, Sinha R, Larsson E, Cerami E, Sander C, Schultz N. Integrative analysis of complex cancer genomics and clinical profiles using the cBioPortal. Sci Signal. 2013 Apr 2;6(269):pl1. doi: [10.1126/scisignal.2004088](https://doi.org/10.1126/scisignal.2004088). PMID: 23550210; PMCID: PMC4160307. - + - de Bruijn I, Kundra R, Mastrogiacomo B, Tran TN, Sikina L, Mazor T, Li X, Ochoa A, Zhao G, Lai B, Abeshouse A, Baiceanu D, Ciftci E, Dogrusoz U, Dufilie A, Erkoc Z, Garcia Lara E, Fu Z, Gross B, Haynes C, Heath A, Higgins D, Jagannathan P, Kalletla K, Kumari P, Lindsay J, Lisman A, Leenknegt B, Lukasse P, Madela D, Madupuri R, van Nierop P, Plantalech O, Quach J, Resnick AC, Rodenburg SYA, Satravada BA, Schaeffer F, Sheridan R, Singh J, Sirohi R, Sumer SO, van Hagen S, Wang A, Wilson M, Zhang H, Zhu K, Rusk N, Brown S, Lavery JA, Panageas KS, Rudolph JE, LeNoue-Newton ML, Warner JL, Guo X, Hunter-Zinck H, Yu TV, Pilai S, Nichols C, Gardos SM, Philip J; AACR Project GENIE BPC Core Team, AACR Project GENIE Consortium; Kehl KL, Riely GJ, Schrag D, Lee J, Fiandalo MV, Sweeney SM, Pugh TJ, Sander C, Cerami E, Gao J, Schultz N. Analysis and Visualization of Longitudinal Genomic and Clinical Data from the AACR Project GENIE Biopharma Collaborative in cBioPortal. Cancer Res. 2023 Dec 1;83(23):3861-3867. doi: [10.1158/0008-5472.CAN-23-0816](https://doi.org/10.1158/0008-5472.CAN-23-0816). PMID: 37668528; PMCID: PMC10690089. - + - Please also cite the source of the data if you are using a publicly available dataset. - + - Si utiliza `gget cellxgene`, favor de citar también: - Chanzuckerberg Initiative. 
(n.d.). CZ CELLxGENE Discover. Retrieved [insert date here], from [https://cellxgene.cziscience.com/](https://cellxgene.cziscience.com/) - + - Si utiliza `gget cosmic`, favor de citar también: - Tate JG, Bamford S, Jubb HC, Sondka Z, Beare DM, Bindal N, Boutselakis H, Cole CG, Creatore C, Dawson E, Fish P, Harsha B, Hathaway C, Jupe SC, Kok CY, Noble K, Ponting L, Ramshaw CC, Rye CE, Speedy HE, Stefancsik R, Thompson SL, Wang S, Ward S, Campbell PJ, Forbes SA. COSMIC: the Catalogue Of Somatic Mutations In Cancer. Nucleic Acids Res. 2019 Jan 8;47(D1):D941-D947. doi: [10.1093/nar/gky1015](https://doi.org/10.1093/nar/gky1015). PMID: 30371878; PMCID: PMC6323903. @@ -47,41 +47,41 @@ Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference data - Si utiliza `gget elm`, favor de citar también: - Laura Luebbert, Chi Hoang, Manjeet Kumar, Lior Pachter, Fast and scalable querying of eukaryotic linear motifs with gget elm, Bioinformatics, 2024, btae095, [https://doi.org/10.1093/bioinformatics/btae095](https://doi.org/10.1093/bioinformatics/btae095) - + - Manjeet Kumar, Sushama Michael, Jesús Alvarado-Valverde, Bálint Mészáros, Hugo Sámano‐Sánchez, András Zeke, Laszlo Dobson, Tamas Lazar, Mihkel Örd, Anurag Nagpal, Nazanin Farahi, Melanie Käser, Ramya Kraleti, Norman E Davey, Rita Pancsa, Lucía B Chemes, Toby J Gibson, The Eukaryotic Linear Motif resource: 2022 release, Nucleic Acids Research, Volume 50, Issue D1, 7 January 2022, Pages D497–D508, [https://doi.org/10.1093/nar/gkab975](https://doi.org/10.1093/nar/gkab975) - -- Si utiliza `gget enrichr`, favor de citar también: - - Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). 
[https://doi.org/10.1186/1471-2105-14-128 ](https://doi.org/10.1186/1471-2105-14-128) - - Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377) +- Si utiliza `gget enrichr`, favor de citar también: + - Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). [https://doi.org/10.1186/1471-2105-14-128 ](https://doi.org/10.1186/1471-2105-14-128) + + - Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377) - Xie Z, Bailey A, Kuleshov MV, Clarke DJB., Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM, Jeon M, & Ma’ayan A. Gene set knowledge discovery with Enrichr. Current Protocols, 1, e90. 2021. doi: [10.1002/cpz1.90](https://doi.org/10.1002/cpz1.90). - + Si trabaja con conjuntos de datos no humanos/ratón, cite también: - Kuleshov MV, Diaz JEL, Flamholz ZN, Keenan AB, Lachmann A, Wojciechowicz ML, Cagan RL, Ma'ayan A. modEnrichr: a suite of gene set enrichment analysis tools for model organisms. Nucleic Acids Res. 2019 Jul 2;47(W1):W183-W190. doi: [10.1093/nar/gkz347](https://doi.org/10.1093/nar/gkz347). PMID: 31069376; PMCID: PMC6602483. 
- Si utiliza `gget info`, favor de citar también: - Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606. - + - Sayers EW, Beck J, Bolton EE, Brister JR, Chan J, Comeau DC, Connor R, DiCuccio M, Farrell CM, Feldgarden M, Fine AM, Funk K, Hatcher E, Hoeppner M, Kane M, Kannan S, Katz KS, Kelly C, Klimke W, Kim S, Kimchi A, Landrum M, Lathrop S, Lu Z, Malheiro A, Marchler-Bauer A, Murphy TD, Phan L, Prasad AB, Pujar S, Sawyer A, Schmieder E, Schneider VA, Schoch CL, Sharma S, Thibaud-Nissen F, Trawick BW, Venkatapathi T, Wang J, Pruitt KD, Sherry ST. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2024 Jan 5;52(D1):D33-D43. doi: [10.1093/nar/gkad1044](https://doi.org/10.1093/nar/gkad1044). 
PMID: 37994677; PMCID: PMC10767890. - + - The UniProt Consortium , UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, [https://doi.org/10.1093/nar/gkac1052](https://doi.org/10.1093/nar/gkac1052) - Si utiliza `gget muscle`, favor de citar también: - Edgar RC (2021), MUSCLE v5 enables improved estimates of phylogenetic tree confidence by ensemble bootstrapping, bioRxiv 2021.06.20.449169. [https://doi.org/10.1101/2021.06.20.449169](https://doi.org/10.1101/2021.06.20.449169) - + - Si utiliza `gget opentargets`, favor de citar también: - Ochoa D, Hercules A, Carmona M, Suveges D, Baker J, Malangone C, Lopez I, Miranda A, Cruz-Castillo C, Fumis L, Bernal-Llinares M, Tsukanov K, Cornu H, Tsirigos K, Razuvayevskaya O, Buniello A, Schwartzentruber J, Karim M, Ariano B, Martinez Osorio RE, Ferrer J, Ge X, Machlitt-Northen S, Gonzalez-Uriarte A, Saha S, Tirunagari S, Mehta C, Roldán-Romero JM, Horswell S, Young S, Ghoussaini M, Hulcoop DG, Dunham I, McDonagh EM. The next-generation Open Targets Platform: reimagined, redesigned, rebuilt. Nucleic Acids Res. 2023 Jan 6;51(D1):D1353-D1359. doi: [10.1093/nar/gkac1046](https://doi.org/10.1093/nar/gkac1046). PMID: 36399499; PMCID: PMC9825572. - + - Si utiliza `gget pdb`, favor de citar también: - Berman HM, Westbrook J, Feng Z, Gilliland G, Bhat TN, Weissig H, Shindyalov IN, Bourne PE. The Protein Data Bank. Nucleic Acids Res. 2000 Jan 1;28(1):235-42. doi: [10.1093/nar/28.1.235](https://doi.org/10.1093/nar/28.1.235). PMID: 10592235; PMCID: PMC102472. 
- Si utiliza `gget ref` o `gget search`, favor de citar también: - Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606. 
- + - Si utiliza `gget seq`, favor de citar también: - Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606. - + - The UniProt Consortium , UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, [https://doi.org/10.1093/nar/gkac1052](https://doi.org/10.1093/nar/gkac1052) ___ diff --git a/docs/src/es/contributing.md b/docs/src/es/contributing.md index 81a677182..be40cbc17 100644 --- a/docs/src/es/contributing.md +++ b/docs/src/es/contributing.md @@ -51,7 +51,7 @@ Confirme sus cambios una vez que esté satisfecho con ellos. - Los parámetros para la Terminal se definen en ./gget/main.py 8. 
Agregue módulos/argumentos nuevos a la documentación, si corresponde: - El manual de cada módulo se puede agregar/editar en `./docs/src/en/*.md` (la versión en español de la documentación en `./docs/src/es/*.md` se genera/actualiza automáticamente, y no necesita ser editada manualmente) - + Si tiene alguna pregunta, no dude en iniciar una [discusión](https://github.com/pachterlab/gget/discussions) o crear un Issue como se describe anteriormente. ### Crear un Pull Request (PR) diff --git a/docs/src/es/cosmic.md b/docs/src/es/cosmic.md index 41e631a0c..e779cb1a8 100644 --- a/docs/src/es/cosmic.md +++ b/docs/src/es/cosmic.md @@ -1,9 +1,9 @@ [ Ver el codigo fuente de la pagina en GitHub ](https://github.com/pachterlab/gget/blob/main/docs/src/es/cosmic.md) -> Parámetros de Python són iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Las banderas son parámetros de verdadero o falso (True/False) en Python. El manuál para cualquier modulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`. +> Parámetros de Python son iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Las banderas son parámetros de verdadero o falso (True/False) en Python. El manual para cualquier módulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`. # gget cosmic 🪐 Busca genes, mutaciones y otros factores asociados con el cáncer utilizando la base de datos [COSMIC](https://cancer.sanger.ac.uk/cosmic) (Catalogue Of Somatic Mutations In Cancer). -Formato de retorno: JSON (línea de comandos) o data frame/CSV (Python) cuando `download_cosmic=False`. Cuando `download_cosmic=True`, se descarga la base de datos solicitada en la carpeta especificada. +Formato de retorno: JSON (línea de comandos) o data frame/CSV (Python) cuando `download_cosmic=False`. Cuando `download_cosmic=True`, se descarga la base de datos solicitada en la carpeta especificada. 
Este módulo fue escrito originalmente en parte por [@AubakirovArman](https://github.com/AubakirovArman) (consultas de información) y [@josephrich98](https://github.com/josephrich98) (descarga de bases de datos). @@ -143,13 +143,9 @@ gget.cosmic("EGFR", cosmic_tsv_path="Cosmic_MutantCensus_Tsv_v101_GRCh37/Cosmic_ | ... | ... | ... | ... | ... | ... | ... | -# Citar +# Citar Si utiliza `gget cosmic` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - Tate JG, Bamford S, Jubb HC, Sondka Z, Beare DM, Bindal N, Boutselakis H, Cole CG, Creatore C, Dawson E, Fish P, Harsha B, Hathaway C, Jupe SC, Kok CY, Noble K, Ponting L, Ramshaw CC, Rye CE, Speedy HE, Stefancsik R, Thompson SL, Wang S, Ward S, Campbell PJ, Forbes SA. COSMIC: the Catalogue Of Somatic Mutations In Cancer. Nucleic Acids Res. 2019 Jan 8;47(D1):D941-D947. doi: [10.1093/nar/gky1015](https://doi.org/10.1093/nar/gky1015). PMID: 30371878; PMCID: PMC6323903. - - - - diff --git a/docs/src/es/dependents.md b/docs/src/es/dependents.md index 4013cc770..1b03d7f78 100644 --- a/docs/src/es/dependents.md +++ b/docs/src/es/dependents.md @@ -22,8 +22,8 @@ Las siguientes aplicaciones usan *gget*: - [https://mcpservers.org/servers/longevity-genie/holy-bio-mcp](https://mcpservers.org/servers/longevity-genie/holy-bio-mcp) - [https://biocontext.ai](https://biocontext.ai/registry/longevity-genie/gget-mcp) - [https://mcpmarket.com/zh/tools/skills/gget-bioinformatics-tool](https://mcpmarket.com/zh/tools/skills/gget-bioinformatics-tool) -- [PantheonOS](https://pantheonos.stanford.edu/) - Un sistema evolutivo de agentes biológicos multiagente diseñado para conciliar la generalidad con la especificidad de dominio, desarrollado en Stanford. 
+- [PantheonOS](https://pantheonos.stanford.edu/) + Un sistema evolutivo de agentes biológicos multiagente diseñado para conciliar la generalidad con la especificidad de dominio, desarrollado en Stanford. > "Acceso a bases de datos: utilizando las habilidades de **gget**, iSeq y cellxgene para acceder a una variedad de bases de datos, incluyendo SRA, GEO, Ensembl, UniProt, UCSC, Enrichr y CZI cellxgene." - [Biomni](https://biomni.stanford.edu/environment) Un agente de inteligencia artificial biomédica de propósito general que se está desarrollando en Stanford y Genentech. @@ -32,7 +32,7 @@ Las siguientes aplicaciones usan *gget*: > "Las herramientas se agrupan en familias como literatura [...], genómica (biopython, **gget**) y aprendizaje automático (rdkit, pymol)." - [PerTurboAgent](https://www.biorxiv.org/content/10.1101/2025.05.25.656020v1) Un agente de auto-planificación para potenciar experimentos secuenciales de Perturb-seq. - > "Nosotros [...] usamos los paquetes **gget** y blitzgsea para análisis de enriquecimiento de datos" + > "Nosotros [...] usamos los paquetes **gget** y blitzgsea para análisis de enriquecimiento de datos" - [Habilidades científicas para Claude](https://github.com/K-Dense-AI/claude-scientific-skills), desarrolladas por K-Dense-AI > " Este repositorio contiene 138 habilidades científicas organizadas en múltiples dominios. Cada habilidad proporciona documentación completa, ejemplos de código y mejores prácticas para trabajar con librerías científicas, bases de datos y herramientas. > 🧬 Bioinformática y Genómica @@ -92,7 +92,7 @@ ____ - Shanmugampillai Jeyarajaguru Kabilan et al., [Molecular modelling approaches for the identification of potent Sodium-Glucose Cotransporter 2 inhibitors from Boerhavia diffusa for the potential treatment of chronic kidney disease.](https://doi.org/10.21203/rs.3.rs-4520611/v1) *Journal of Computer-Aided Molecular Design (en revisión)* (2024). 
DOI: 10.21203/rs.3.rs-4520611/v1 - Joseph M Rich et al., [The impact of package selection and versioning on single-cell RNA-seq analysis.](https://pmc.ncbi.nlm.nih.gov/articles/PMC11014608/#:~:text=10.1101/2024.04.04.588111) *bioRxiv* (2024). DOI: 10.1101/2024.04.04.588111 - Sanjay C. Nagi et al., [AnoPrimer: Primer Design in malaria vectors informed by range-wide genomic variation.](https://wellcomeopenresearch.org/articles/9-255/v1) *Wellcome Open Research* (2024). -- Yasmin Makki Mohialden et al., [A survey of the most recent Python packages for use in biology.](http://dx.doi.org/10.48047/NQ.2023.21.2.NQ23029) *NeuroQuantology* (2023). DOI: 10.48047/NQ.2023.21.2.NQ23029 +- Yasmin Makki Mohialden et al., [A survey of the most recent Python packages for use in biology.](http://dx.doi.org/10.48047/NQ.2023.21.2.NQ23029) *NeuroQuantology* (2023). DOI: 10.48047/NQ.2023.21.2.NQ23029 - Nicola A. Kearns et al., [Generation and molecular characterization of human pluripotent stem cell-derived pharyngeal foregut endoderm.](https://doi.org/10.1016/j.devcel.2023.08.024) *Cell Reports* (2023). DOI: 10.1016/j.devcel.2023.08.024 - Jonathan Rosenski et al., [Predicting gene knockout effects from expression data.](https://link.springer.com/article/10.1186/s12920-023-01446-6) *BMC Medical Genomics* (2023). DOI: 10.1186/s12920-023-01446-6 - Peter Overby et al., [Pharmacological or genetic inhibition of Scn9a protects beta-cells while reducing insulin secretion in type 1 diabetes.](https://doi.org/10.1101/2023.06.11.544521) *bioRxiv* (2023). DOI: 10.1101/2023.06.11.544521 diff --git a/docs/src/es/diamond.md b/docs/src/es/diamond.md index ec8d4a802..6a6e85aaf 100644 --- a/docs/src/es/diamond.md +++ b/docs/src/es/diamond.md @@ -2,12 +2,12 @@ > Parámetros de Python són iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Las banderas son parámetros de verdadero o falso (True/False) en Python. 
El manuál para cualquier modulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`. # gget diamond 💎 -Alinee múltiples proteínas o secuencias de ADN traducidas usando [DIAMOND](https://www.nature.com/articles/nmeth.3176) (DIAMOND es similar a BLAST, pero este es un cálculo local). +Alinee múltiples proteínas o secuencias de ADN traducidas usando [DIAMOND](https://www.nature.com/articles/nmeth.3176) (DIAMOND es similar a BLAST, pero este es un cálculo local). Produce: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python). **Parámetro posicional** `query` -Secuencia(s) (str o lista) de aminoácidos, o una ruta a un archivo tipo FASTA. +Secuencia(s) (str o lista) de aminoácidos, o una ruta a un archivo tipo FASTA. **Parámetro requerido** `-ref` `--reference` @@ -20,7 +20,7 @@ Por defecto: None -> El archivo de base de datos DIAMOND temporal se eliminará `-s` `--sensitivity` Sensibilidad de la alineación (str). Por defecto: "very-sensitive" (muy sensible). -Uno de los siguientes: fast, mid-sensitive, sensitive, more-sensitive, very-sensitive, or ultra-sensitive. +Uno de los siguientes: fast, mid-sensitive, sensitive, more-sensitive, very-sensitive, or ultra-sensitive. `-t` `--threads` Número de hilos de procesamiento utilizados (int). Por defecto: 1. @@ -28,18 +28,18 @@ Número de hilos de procesamiento utilizados (int). Por defecto: 1. `-db` `--diamond_binary` Ruta al binario DIAMOND (str). Por defecto: None -> Utiliza el binario DIAMOND instalado automáticamente con `gget`. -`-o` `--out` -Ruta al archivo en el que se guardarán los resultados (str), p. ej. "ruta/al/directorio". Por defecto: salida estándar (STDOUT); los archivos temporales se eliminan. +`-o` `--out` +Ruta al archivo en el que se guardarán los resultados (str), p. ej. "ruta/al/directorio". Por defecto: salida estándar (STDOUT); los archivos temporales se eliminan. 
**Banderas** `-u` `--uniprot` -Use esta bandera cuando `sequence` es un ID de Uniprot en lugar de una secuencia de aminoácidos. +Use esta bandera cuando `sequence` es un ID de Uniprot en lugar de una secuencia de aminoácidos. `-csv` `--csv` -Solo para Terminal. Produce los resultados en formato CSV. +Solo para Terminal. Produce los resultados en formato CSV. Para Python, usa `json=True` para producir los resultados en formato JSON. -`-q` `--quiet` +`-q` `--quiet` Solo para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa. Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la ejecución del programa. @@ -63,7 +63,7 @@ gget.diamond(["GGETISAWESQME", "ELVISISALIVE", "LQVEFRANKLIN", "PACHTERLABRQCKS" #### [Màs ejemplos](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget diamond` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/es/elm.md b/docs/src/es/elm.md index dc32f739f..b5ce56ad8 100644 --- a/docs/src/es/elm.md +++ b/docs/src/es/elm.md @@ -2,12 +2,12 @@ > Parámetros de Python són iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Banderas son parámetros de verdadero o falso (True/False) en Python. El manuál para cualquier modulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`. # gget elm 🎭 -Prediga localmente motivos lineales eucarióticos (ELMs) a partir de una secuencia de aminoácidos o UniProt Acc utilizando datos de la [base de datos ELM](http://elm.eu.org/). -Produce: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python). Este módulo devuelve dos tipos de resultados (ver ejemplos). 
+Prediga localmente motivos lineales eucarióticos (ELMs) a partir de una secuencia de aminoácidos o UniProt Acc utilizando datos de la [base de datos ELM](http://elm.eu.org/). +Produce: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python). Este módulo devuelve dos tipos de resultados (ver ejemplos). **Los datos de ELM se pueden descargar y distribuir para uso no comercial de acuerdo con el [acuerdo de licencia de software de ELM](http://elm.eu.org/media/Elm_academic_license.pdf).** -Antes de usar `gget elm` por primera vez, ejecute `gget setup elm` / `gget.setup("elm")` una vez (consulte también [`gget setup`](setup.md)). +Antes de usar `gget elm` por primera vez, ejecute `gget setup elm` / `gget.setup("elm")` una vez (consulte también [`gget setup`](setup.md)). **Parámetro posicional** `sequence` @@ -25,21 +25,21 @@ Número de hilos de procesamiento utilizados en la alineación de secuencias con `-bin` `diamond_binary` Ruta al binario DIAMOND (str). Por defecto: None -> Utiliza el binario DIAMOND instalado automáticamente con `gget`. -`-o` `--out` +`-o` `--out` Ruta al archivo en el que se guardarán los resultados (str), p. ej. "ruta/al/directorio". Por defecto: salida estándar (STDOUT); los archivos temporales se eliminan. **Banderas** `-u` `--uniprot` -Use esta bandera cuando `sequence` es una Uniprot Acc en lugar de una secuencia de aminoácidos. +Use esta bandera cuando `sequence` es una Uniprot Acc en lugar de una secuencia de aminoácidos. `-e` `--expand` Amplíe la información devuelta en el marco de datos de expresiones regulares para incluir los nombres de proteínas, los organismos y las referencias en las que se validó originalmente el motivo. `-csv` `--csv` -Solo para Terminal. Produce los resultados en formato CSV. +Solo para Terminal. Produce los resultados en formato CSV. Para Python, usa `json=True` para producir los resultados en formato JSON. -`-q` `--quiet` +`-q` `--quiet` Solo para Terminal. 
Impide la información de progreso de ser exhibida durante la ejecución del programa. Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la ejecución del programa. @@ -54,8 +54,8 @@ gget elm -o gget_elm_results LIAQSIGQASFV gget.setup(“elm”) # Descarga/actualiza la base de datos ELM local ortholog_df, regex_df = gget.elm("LIAQSIGQASFV") ``` - -Encuentre ELM que proporcionen a una UniProt Acc: + +Encuentre ELM que proporcionen a una UniProt Acc: ```bash gget setup elm # Descarga/actualiza la base de datos ELM local gget elm -o gget_elm_results --uniprot Q02410 -e @@ -68,14 +68,14 @@ ortholog_df, regex_df = gget.elm("Q02410", uniprot=True, expand=True) → Produce dos resultados con información extensa sobre ELMs asociados con proteínas ortólogas y motivos encontrados en la secuencia de entrada directamente en función de sus expresiones regex: ortholog_df: - + |Ortholog_UniProt_Acc|ProteinName|class_accession|ELMIdentifier |FunctionalSiteName |Description |Organism |… | |:-----------------:|:---------:|:-------------:|:-------------:|:-----------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------:|:----------:|:-:| |Q02410 |APBA1_HUMAN|ELME000357 |LIG_CaMK_CASK_1|CASK CaMK domain binding ligand motif|Motif that mediates binding to the calmodulin-dependent protein kinase (CaMK) domain of the peripheral plasma membrane protein CASK/Lin2.|Homo sapiens|… | |Q02410 |APBA1_HUMAN|ELME000091 |LIG_PDZ_Class_2|PDZ domain ligands |The C-terminal class 2 PDZ-binding motif is classically represented by a pattern such as |Homo sapiens|… | regex_df: - + |Instance_accession|ELMIdentifier |FunctionalSiteName |ELMType|Description |Instances (Matched Sequence)|Organism |… | 
|:----------------:|:----------------:|:-----------------------------:|:-----:|:-----------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------:|:----------------------------:|:-:| |ELME000321 |CLV_C14_Caspase3-7|Caspase cleavage motif |CLV |Caspase-3 and Caspase-7 cleavage site. |ERSDG |Mus musculus |… | @@ -89,9 +89,8 @@ regex_df: #### [Màs ejemplos](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget elm` en una publicación, favor de citar los siguientes artículos: - Laura Luebbert, Chi Hoang, Manjeet Kumar, Lior Pachter, Fast and scalable querying of eukaryotic linear motifs with gget elm, _Bioinformatics_, 2024, btae095, [https://doi.org/10.1093/bioinformatics/btae095](https://doi.org/10.1093/bioinformatics/btae095) - Manjeet Kumar, Sushama Michael, Jesús Alvarado-Valverde, Bálint Mészáros, Hugo Sámano‐Sánchez, András Zeke, Laszlo Dobson, Tamas Lazar, Mihkel Örd, Anurag Nagpal, Nazanin Farahi, Melanie Käser, Ramya Kraleti, Norman E Davey, Rita Pancsa, Lucía B Chemes, Toby J Gibson, The Eukaryotic Linear Motif resource: 2022 release, Nucleic Acids Research, Volume 50, Issue D1, 7 January 2022, Pages D497–D508, [https://doi.org/10.1093/nar/gkab975](https://doi.org/10.1093/nar/gkab975) - diff --git a/docs/src/es/enrichr.md b/docs/src/es/enrichr.md index 7ab6c6bc2..f633e76f7 100644 --- a/docs/src/es/enrichr.md +++ b/docs/src/es/enrichr.md @@ -4,7 +4,7 @@ # gget enrichr 💰 Realice un análisis de enriquecimiento de una lista de genes utilizando [Enrichr](https://maayanlab.cloud/Enrichr/). Produce: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python). - + **Parámetro posicional** `genes` Lista de nombres cortos (símbolos) de los genes de interés para realizar el análisis de enriquecimiento, p. PHF14 RBM3 MSL1 PHF21A. 
@@ -17,10 +17,10 @@ Admite cualquier base de datos enumerada [aquí](https://maayanlab.cloud/Enrichr 'pathway'       (KEGG_2021_Human) 'transcription'     (ChEA_2016) 'ontology'      (GO_Biological_Process_2021) -'diseases_drugs'   (GWAS_Catalog_2019) +'diseases_drugs'   (GWAS_Catalog_2019) 'celltypes'      (PanglaoDB_Augmented_2021) 'kinase_interactions'  (KEA_2015) - + **Parámetros opcionales** `-s` `--species` Especies a utilizar como referencia para el análisis de enriquecimiento. (Por defecto: human) @@ -39,12 +39,12 @@ Opciones: Lista de nombres cortos (símbolos) de genes de 'background' (de fondo/control), p. NSUN3 POLRMT NLRX1. Alternativamente: usa la bandera `--ensembl_background` para ingresar IDs tipo Ensembl. -`-o` `--out` +`-o` `--out` Ruta al archivo en el que se guardarán los resultados, p. ruta/al/directorio/resultados.csv (o .json). Por defecto: salida estándar (STDOUT). Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual. `-ko` `--kegg_out` -Ruta al archivo png en el que se guardará la imágen de la vía de señalización celular KEGG, p. ej. ruta/al/directorio/KEGG.png. (Por defecto: None) +Ruta al archivo png en el que se guardará la imágen de la vía de señalización celular KEGG, p. ej. ruta/al/directorio/KEGG.png. (Por defecto: None) `-kr` `--kegg_rank` Rango de la ruta KEGG que se va a trazar. (Por defecto: 1) @@ -52,33 +52,33 @@ Rango de la ruta KEGG que se va a trazar. (Por defecto: 1) `figsize` Solo para Python. (ancho, alto) de la visualización en pulgadas. (Por defecto: (10,10)) -`ax` +`ax` Solo para Python. Ingresa un objeto de ejes matplotlib para personalizar la visualización.(Por defecto: None) - + **Banderas** -`-e` `--ensembl` -Usa esta bandera si `genes` se ingresa como una lista de IDs tipo Ensembl. +`-e` `--ensembl` +Usa esta bandera si `genes` se ingresa como una lista de IDs tipo Ensembl. 
`-e_b` `--ensembl_bkg` Usa esta bandera si `background_list` se ingresa como una lista de IDs tipo Ensembl. `-bkg` `--background` -Use un conjunto de 20,625 genes 'background' +Use un conjunto de 20,625 genes 'background' listados [aquí](https://github.com/pachterlab/gget/blob/main/gget/constants/enrichr_bkg_genes.txt). - + `-csv` `--csv` -Solo para Terminal. Produce los resultados en formato CSV. -Para Python, usa `json=True` produce los resultados en formato JSON. +Solo para Terminal. Produce los resultados en formato CSV. +Para Python, usa `json=True` produce los resultados en formato JSON. -`-q` `--quiet` +`-q` `--quiet` Solo para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa. Para Python, usa `verbose=False` para imipidir la información de progreso de ser exhibida durante la ejecución del programa. - + `plot` Solo para Python. `plot=True` provée la visualización de los primeros 15 resultados (por defecto: False). - - + + ### Ejemplo ```bash gget enrichr -db ontology ACE2 AGT AGTR1 @@ -107,10 +107,10 @@ gget.enrichr( genes = [ "PHF14", "RBM3", "MSL1", "PHF21A", "ARL10", "INSR", "JADE2", "P2RX7", "LINC00662", "CCDC101", "PPM1B", "KANSL1L", "CRYZL1", "ANAPC16", "TMCC1", - "CDH8", "RBM11", "CNPY2", "HSPA1L", "CUL2", "PLBD2", "LARP7", "TECPR2", + "CDH8", "RBM11", "CNPY2", "HSPA1L", "CUL2", "PLBD2", "LARP7", "TECPR2", "ZNF302", "CUX1", "MOB2", "CYTH2", "SEC22C", "EIF4E3", "ROBO2", "ADAMTS9-AS2", "CXXC1", "LINC01314", "ATF7", "ATP5F1" - ], + ], database = "ChEA_2022", background_list = [ "NSUN3","POLRMT","NLRX1","SFXN5","ZC3H12C","SLC25A39","ARSG", @@ -125,11 +125,11 @@ gget.enrichr( "ZFP787","ZFP655","RABEPK","ZFP650","4732466D17RIK","EXOSC4", "WDR42A","GPHN","2610528J11RIK","1110003E01RIK","MDH1","1200014M14RIK", "AW209491","MUT","1700123L14RIK","2610036D13RIK", - "PHF14", "RBM3", "MSL1", "PHF21A", "ARL10", "INSR", "JADE2", - "P2RX7", "LINC00662", "CCDC101", "PPM1B", "KANSL1L", "CRYZL1", - "ANAPC16", "TMCC1","CDH8", 
"RBM11", "CNPY2", "HSPA1L", "CUL2", - "PLBD2", "LARP7", "TECPR2", "ZNF302", "CUX1", "MOB2", "CYTH2", - "SEC22C", "EIF4E3", "ROBO2", "ADAMTS9-AS2", "CXXC1", "LINC01314", "ATF7", + "PHF14", "RBM3", "MSL1", "PHF21A", "ARL10", "INSR", "JADE2", + "P2RX7", "LINC00662", "CCDC101", "PPM1B", "KANSL1L", "CRYZL1", + "ANAPC16", "TMCC1","CDH8", "RBM11", "CNPY2", "HSPA1L", "CUL2", + "PLBD2", "LARP7", "TECPR2", "ZNF302", "CUX1", "MOB2", "CYTH2", + "SEC22C", "EIF4E3", "ROBO2", "ADAMTS9-AS2", "CXXC1", "LINC01314", "ATF7", "ATP5F1""COX15","TMEM30A","NSMCE4A","TM2D2","RHBDD3","ATXN2","NFS1", "3110001I20RIK","BC038156","C330002I19RIK","ZFYVE20","POLI","TOMM70A", "LOC100047782","2410012H22RIK","RILP","A230062G08RIK", @@ -164,7 +164,7 @@ gget.enrichr(["ZBP1", "IRF3", "RIPK1"], database="pathway", kegg_out="kegg.png", El siguiente ejemplo fue enviado por [Dylan Lawless](https://github.com/DylanLawless) a través de un [PR](https://github.com/pachterlab/gget/pull/54) (con ajustes de [Laura Luebbert](https://github.com/lauraluebbert)): **Use `gget enrichr` en R y cree unq visualización similar usando [ggplot](https://ggplot2.tidyverse.org/reference/ggplot.html).** -TENGA EN CUENTA el cambio de ejes en comparación con la visualización en Python. +TENGA EN CUENTA el cambio de ejes en comparación con la visualización en Python. ```r system("pip install gget") install.packages("reticulate") @@ -221,16 +221,16 @@ df |> #### [Más ejemplos](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget enrichr` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) -- Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). 
[https://doi.org/10.1186/1471-2105-14-128 ](https://doi.org/10.1186/1471-2105-14-128) +- Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). [https://doi.org/10.1186/1471-2105-14-128 ](https://doi.org/10.1186/1471-2105-14-128) -- Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377) +- Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377) - Xie Z, Bailey A, Kuleshov MV, Clarke DJB., Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM, Jeon M, & Ma’ayan A. Gene set knowledge discovery with Enrichr. Current Protocols, 1, e90. 2021. doi: [10.1002/cpz1.90](https://doi.org/10.1002/cpz1.90). - + Si trabaja con conjuntos de datos no humanos/ratón, cite también: - Kuleshov MV, Diaz JEL, Flamholz ZN, Keenan AB, Lachmann A, Wojciechowicz ML, Cagan RL, Ma'ayan A. modEnrichr: a suite of gene set enrichment analysis tools for model organisms. Nucleic Acids Res. 2019 Jul 2;47(W1):W183-W190. doi: [10.1093/nar/gkz347](https://doi.org/10.1093/nar/gkz347). PMID: 31069376; PMCID: PMC6602483. 
diff --git a/docs/src/es/gpt.md b/docs/src/es/gpt.md index 49e87693b..e9b9ba30e 100644 --- a/docs/src/es/gpt.md +++ b/docs/src/es/gpt.md @@ -27,33 +27,33 @@ Su clave API de OpenAI (str) ([obtenga su clave API](https://platform.openai.com El nombre del algoritmo GPT que se usará para generar el texto (str). Por defecto: "gpt-3.5-turbo". See https://platform.openai.com/docs/models/gpt-4 for more information on the available models. -`-temp` `--temperature` +`-temp` `--temperature` Valor entre 0 y 2 que controla el nivel de aleatoriedad y creatividad en el texto generado (float). Los valores más altos resultan en un texto más creativo y variado. Por defecto: 1. -`-tp` `--top_p` +`-tp` `--top_p` Controla la diversidad del texto generado como alternativa al muestreo con `--temperature` (float). Los valores más altos resultan en un texto más diverso e inesperado. Por defecto: 1. Tenga en cuenta que OpenAI recomienda modificar `--top_p` o el parámetro `--temperature`, pero no ambas. -`-s` `--stop` +`-s` `--stop` Una secuencia de tokens para marcar el final del texto generado (str). Por defecto: None. -`-mt` `--max_tokens` +`-mt` `--max_tokens` Controla la longitud máxima del texto generado, en tokens (int). Por defecto: 200. -`-pp` `--presence_penalty` +`-pp` `--presence_penalty` Número entre -2.0 y 2.0. Los valores más altos aumentan la probabilidad de que el modelo hable sobre temas nuevos (float). Por defecto: 0. -`-fp` `--frequency_penalty` +`-fp` `--frequency_penalty` Número entre -2.0 y 2.0. Los valores más altos reducen la probabilidad de que el modelo repita la misma línea palabra por palabra (float). Por defecto: 0. -`-lb` `--logit_bias` +`-lb` `--logit_bias` Un diccionario que especifica un sesgo hacia ciertos tokens en el texto generado (dict). Por defecto: None. -`-o` `--out` +`-o` `--out` Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.txt. Por defecto: salida estándar (STDOUT). 
- + ### Por ejemplo ```bash gget gpt "Cómo estás hoy GPT?" su_clave_api diff --git a/docs/src/es/info.md b/docs/src/es/info.md index d2009c868..b2a90a62f 100644 --- a/docs/src/es/info.md +++ b/docs/src/es/info.md @@ -6,33 +6,33 @@ Obtenga información detallada sobre genes y transcripciones de [Ensembl](https: Regresa: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python). **Parámetro posicional** -`ens_ids` +`ens_ids` Uno o más ID del tipo Ensembl. -NOTA: Proporcionar una lista de más de 1000 ID de Ensembl a la vez puede provocar un error del servidor (para procesar más de 1000 ID, divida la lista de ID en fragmentos de 1000 ID y ejecútelos por separado). +NOTA: Proporcionar una lista de más de 1000 ID de Ensembl a la vez puede provocar un error del servidor (para procesar más de 1000 ID, divida la lista de ID en fragmentos de 1000 ID y ejecútelos por separado). **Parámetros opcionales** -`-o` `--out` +`-o` `--out` Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.csv (o .json). Por defecto: salida estándar (STDOUT). Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual. **Banderas** `-n` `--ncbi` DESACTIVA los resultados de [NCBI](https://www.ncbi.nlm.nih.gov/). -Para Python: `ncbi=False` evita la incluida de datos de NCBI (por defecto: True). +Para Python: `ncbi=False` evita la incluida de datos de NCBI (por defecto: True). `-u` `--uniprot` DESACTIVA los resultados de [UniProt](https://www.uniprot.org/). -Para Python: `uniprot=False` evita la incluida de datos de UniProt (por defecto: True). +Para Python: `uniprot=False` evita la incluida de datos de UniProt (por defecto: True). `-pdb` `--pdb` INCLUYE [PDB](https://www.ebi.ac.uk/pdbe/) IDs en los resultados (podría aumentar el tiempo de ejecución). -Para Python: `pdb=True` incluye IDs de PDB en los resultados (por defecto: False). +Para Python: `pdb=True` incluye IDs de PDB en los resultados (por defecto: False).
`-csv` `--csv` -Solo para la Terminal. Regresa los resultados en formato CSV. +Solo para la Terminal. Regresa los resultados en formato CSV. Para Python, usa `json=True` para regresar los resultados en formato JSON. -`-q` `--quiet` +`-q` `--quiet` Solo para la Terminal. Impide la informacion de progreso de ser exhibida durante la corrida. Para Python, usa `verbose=False` para imipidir la informacion de progreso de ser exhibida durante la corrida. @@ -54,16 +54,16 @@ gget.info(["ENSG00000034713", "ENSG00000104853", "ENSG00000170296"]) | -------------- |-------------------------| ------------------------| -------------- | ----------|-----|----|----|----|----|----|----| | ENSG00000034713| P60520 | 11345 | GABARAPL2 | [ATG8, ATG8C, FLC3A, GABARAPL2, GATE-16, GATE16, GEF-2, GEF2] | Gamma-aminobutyric acid receptor-associated protein like 2 (GABA(A) receptor-associated protein-like 2)... | GABA type A receptor associated protein like 2 [Source:HGNC Symbol;Acc:HGNC:13291] | FUNCTION: Ubiquitin-like modifier involved in intra- Golgi traffic (By similarity). Modulates intra-Golgi transport through coupling between NSF activity and ... | Enables ubiquitin protein ligase binding activity. Involved in negative regulation of proteasomal protein catabolic process and protein... | protein_coding | ENST00000037243.7 |... | | . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | ... | - + #### [More examples](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget info` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. 
[https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606. - + - Sayers EW, Beck J, Bolton EE, Brister JR, Chan J, Comeau DC, Connor R, DiCuccio M, Farrell CM, Feldgarden M, Fine AM, Funk K, Hatcher E, Hoeppner M, Kane M, Kannan S, Katz KS, Kelly C, Klimke W, Kim S, Kimchi A, Landrum M, Lathrop S, Lu Z, Malheiro A, Marchler-Bauer A, Murphy TD, Phan L, Prasad AB, Pujar S, Sawyer A, Schmieder E, Schneider VA, Schoch CL, Sharma S, Thibaud-Nissen F, Trawick BW, Venkatapathi T, Wang J, Pruitt KD, Sherry ST. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2024 Jan 5;52(D1):D33-D43. 
doi: [10.1093/nar/gkad1044](https://doi.org/10.1093/nar/gkad1044). PMID: 37994677; PMCID: PMC10767890. - + - The UniProt Consortium , UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, [https://doi.org/10.1093/nar/gkac1052](https://doi.org/10.1093/nar/gkac1052) diff --git a/docs/src/es/installation.md b/docs/src/es/installation.md index 26a9899a4..f2e94bf7b 100644 --- a/docs/src/es/installation.md +++ b/docs/src/es/installation.md @@ -74,4 +74,3 @@ pip install . ``` o elimina el ejecutable de tu `PATH` del sistema. - Si sigues teniendo problemas, por favor [contáctanos](https://github.com/pachterlab/gget/issues). - diff --git a/docs/src/es/introduction.md b/docs/src/es/introduction.md index 599fbd508..e032e7078 100644 --- a/docs/src/es/introduction.md +++ b/docs/src/es/introduction.md @@ -8,7 +8,7 @@ [](https://raw.githubusercontent.com/pachterlab/gget/main/figures/gget_overview.png) # ¡Bienvenidos! - + `gget` es un programa gratuito de código fuente abierta de Terminal y Python que permite la consulta eficiente de bases de datos genómicas.
`gget` consiste en un conjunto de módulos separados pero interoperables, cada uno diseñado para facilitar un tipo de consulta de base de datos en una sola línea de código. @@ -65,7 +65,7 @@ Estos son los módulos principales de `gget`. Haga clic en cualquier módulo par
-Si usa `gget` en una publicación, por favor [cite*](cite.md): +Si usa `gget` en una publicación, por favor [cite*](cite.md): ``` Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. https://doi.org/10.1093/bioinformatics/btac836 ``` diff --git a/docs/src/es/muscle.md b/docs/src/es/muscle.md index b83e4b644..d3c33b304 100644 --- a/docs/src/es/muscle.md +++ b/docs/src/es/muscle.md @@ -6,11 +6,11 @@ Alinea múltiples secuencias de nucleótidos o aminoácidos usando el algoritmo Regresa: Salida estándar (STDOUT) en formato ClustalW o archivo de tipo 'aligned FASTA' (.afa). **Parámetro posicional** -`fasta` +`fasta` Lista de secuencias o ruta al archivo FASTA o .txt que contiene las secuencias de nucleótidos o aminoácidos que se van a alinear. **Parámetros opcionales** -`-o` `--out` +`-o` `--out` Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.afa. Por defecto: salida estándar (STDOUT). Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual. @@ -19,11 +19,11 @@ Para Python, usa `save=True` para guardar los resultados en el directorio de tra Alinea las secuencias usando el algoritmo [Super5](https://drive5.com/muscle5/Muscle5_SuppMat.pdf) en lugar del algoritmo [Parallel Perturbed Probcons (PPP)](https://drive5.com/muscle5/Muscle5_SuppMat.pdf) para disminuir el tiempo y la memoria usada durante la corrida. Use para ingresos grandes (unos cientos de secuencias). -`-q` `--quiet` +`-q` `--quiet` Solo para la Terminal. Impide la información de progreso de ser exhibida durante la corrida. Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la corrida.
- - + + ### Por ejemplo ```bash gget muscle MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLAS MSSSSWLLLSLVEVTAAQSTIEQQAKTFLDKFHEAEDLFYQSLLAS @@ -32,7 +32,7 @@ gget muscle MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLAS MSSSSWLLLSLVEVTAAQST # Python gget.muscle(["MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLAS", "MSSSSWLLLSLVEVTAAQSTIEQQAKTFLDKFHEAEDLFYQSLLAS"]) ``` - + ```bash gget muscle fasta.fa ``` @@ -59,7 +59,7 @@ alv.view(msa) #### [More examples](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget muscle` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/es/mutate.md b/docs/src/es/mutate.md index 8b2d58022..9bb0656f9 100644 --- a/docs/src/es/mutate.md +++ b/docs/src/es/mutate.md @@ -1,6 +1,6 @@ [ Ver el codigo fuente de la pagina en GitHub ](https://github.com/pachterlab/gget/blob/main/docs/src/es/mutate.md) -> Parámetros de Python són iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Banderas son parámetros de verdadero o falso (True/False) en Python. El manuál para cualquier modulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`. +> Parámetros de Python són iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Banderas son parámetros de verdadero o falso (True/False) en Python. El manuál para cualquier modulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`. # gget mutate 🧟 Recibe secuencias de nucleótidos y mutaciones (en [anotación de mutación estándar](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1867422/)) y devuelve versiones mutadas de las secuencias según las mutaciones proporcionadas. 
Resultado: Guarda las secuencias mutadas en formato FASTA (o devuelve una lista que contiene las secuencias mutadas si `out=None`). @@ -71,53 +71,53 @@ Si k > longitud total de la secuencia, se mantendrá toda la secuencia. `-msl` `--min_seq_len` Longitud mínima de la secuencia de salida mutante, por ejemplo, 100. Las secuencias mutantes más pequeñas que esto serán descartadas. Predeterminado: Ninguno -`-ma` `--max_ambiguous` +`-ma` `--max_ambiguous` Número máximo de caracteres 'N' (o 'n') permitidos en la secuencia de salida, por ejemplo, 10. Predeterminado: Ninguno (no se aplicará filtro de caracteres ambiguos) **Banderas opcionales para la generación/filtrado de secuencias mutantes** `-ofr` `--optimize_flanking_regions` -Elimina nucleótidos de cualquiera de los extremos de la secuencia mutante para asegurar (cuando sea posible) que la secuencia mutante no contenga ningún k-mer que también se encuentre en la secuencia de tipo salvaje/entrada. +Elimina nucleótidos de cualquiera de los extremos de la secuencia mutante para asegurar (cuando sea posible) que la secuencia mutante no contenga ningún k-mer que también se encuentre en la secuencia de tipo salvaje/entrada. `-rswk` `--remove_seqs_with_wt_kmers` Elimina las secuencias de salida donde al menos un k-mer también está presente en la secuencia de tipo salvaje/entrada en la misma región. Cuando se utiliza con `--optimize_flanking_regions`, solo se eliminarán las secuencias para las cuales un k-mer de tipo salvaje aún está presente después de la optimización. -`-mio` `--merge_identical_off` +`-mio` `--merge_identical_off` No fusionar secuencias mutantes idénticas en la salida (por defecto, las secuencias idénticas se fusionarán concatenando los encabezados de secuencia para todas las secuencias idénticas). 
-**Argumentos opcionales para generar salida adicional** +**Argumentos opcionales para generar salida adicional** Esta salida se activa utilizando la bandera `--update_df` y se almacenará en una copia del DataFrame `mutations`. -`-udf_o` `--update_df_out` +`-udf_o` `--update_df_out` Ruta al archivo csv de salida que contiene el DataFrame actualizado, por ejemplo, 'path/to/mutations_updated.csv'. Solo válido cuando se usa con `--update_df`. Predeterminado: Ninguno -> el nuevo archivo csv se guardará en el mismo directorio que el DataFrame `mutations` con el apéndice '_updated' -`-ts` `--translate_start` +`-ts` `--translate_start` (int o str) La posición en la secuencia de nucleótidos de entrada para comenzar a traducir, por ejemplo, 5. Si se proporciona una cadena, debe corresponder a un nombre de columna en `mutations` que contenga las posiciones de inicio del marco de lectura abierto para cada secuencia/mutación. Solo válido cuando se usa con `--translate`. Predeterminado: traduce desde el principio de cada secuencia -`-te` `--translate_end` +`-te` `--translate_end` (int o str) La posición en la secuencia de nucleótidos de entrada para finalizar la traducción, por ejemplo, 35. Si se proporciona una cadena, debe corresponder a un nombre de columna en `mutations` que contenga las posiciones de fin del marco de lectura abierto para cada secuencia/mutación. Solo válido cuando se usa con `--translate`. Predeterminado: traduce hasta el final de cada secuencia **Banderas opcionales para modificar salida adicional** -`-udf` `--update_df` +`-udf` `--update_df` Actualiza el DataFrame de entrada `mutations` para incluir columnas adicionales con el tipo de mutación, la secuencia de nucleótidos de tipo salvaje y la secuencia de nucleótidos mutante (solo válido si `mutations` es un archivo .csv o .tsv). 
-`-sfs` `--store_full_sequences` -Incluye las secuencias completas de tipo salvaje y mutantes en el DataFrame actualizado `mutations` (no solo la sub-secuencia con flancos de longitud k). Solo válido cuando se usa con `--update_df`. +`-sfs` `--store_full_sequences` +Incluye las secuencias completas de tipo salvaje y mutantes en el DataFrame actualizado `mutations` (no solo la sub-secuencia con flancos de longitud k). Solo válido cuando se usa con `--update_df`. + +`-tr` `--translate` +Agrega columnas adicionales al DataFrame actualizado `mutations` que contienen las secuencias de aminoácidos de tipo salvaje y mutantes. Solo válido cuando se usa con `--store_full_sequences`. -`-tr` `--translate` -Agrega columnas adicionales al DataFrame actualizado `mutations` que contienen las secuencias de aminoácidos de tipo salvaje y mutantes. Solo válido cuando se usa con `--store_full_sequences`. - **Argumentos generales opcionales** -`-o` `--out` +`-o` `--out` Ruta al archivo FASTA de salida que contiene las secuencias mutadas, por ejemplo, 'path/to/output_fasta.fa'. -Predeterminado: Ninguno -> devuelve una lista de las secuencias mutadas a la salida estándar. -Los identificadores (que siguen al '>') de las secuencias mutadas en el FASTA de salida serán '>[seq_ID]_[mut_ID]'. +Predeterminado: Ninguno -> devuelve una lista de las secuencias mutadas a la salida estándar. +Los identificadores (que siguen al '>') de las secuencias mutadas en el FASTA de salida serán '>[seq_ID]_[mut_ID]'. **Banderas generales opcionales** -`-q` `--quiet` +`-q` `--quiet` Solo en línea de comandos. Previene que se muestre información de progreso. Python: Usa `verbose=False` para prevenir que se muestre información de progreso. 
@@ -221,7 +221,7 @@ gget.mutate( | 1 | g.224411A>C | ENST00000193812 | 0 | 100 | | 8 | g.25111del | ENST00000174411 | 0 | 294 | | X | g.1011_1012insAA | ENST00000421914 | 9 | 1211 | -``` +``` → Guarda el archivo 'mut_fasta.fa' que contiene: ``` >1:g.224411A>C @@ -230,7 +230,7 @@ TGCTCTGCT GAGTCGAT >X:g.1011_1012insAA TTAGAACTT -``` +``` → Guarda el archivo 'mutations_updated.csv' que contiene: ``` @@ -242,8 +242,7 @@ TTAGAACTT ``` -# Citar +# Citar Si utiliza `gget mutate` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) - diff --git a/docs/src/es/opentargets.md b/docs/src/es/opentargets.md index 799b6c400..0c72d3066 100644 --- a/docs/src/es/opentargets.md +++ b/docs/src/es/opentargets.md @@ -12,8 +12,8 @@ Este módulo fue escrito por [Sam Wagenaar](https://github.com/techno-sam). ID de gen Ensembl, por ejemplo, ENSG00000169194. **Argumentos opcionales** -`-r` `--resource` -Define el tipo de información a devolver en la salida. Predeterminado: 'diseases' (enfermedades). +`-r` `--resource` +Define el tipo de información a devolver en la salida. Predeterminado: 'diseases' (enfermedades). Los recursos posibles son: | Recurso | Valor devuelto | Filtros válidos | Fuentes | @@ -27,24 +27,24 @@ Los recursos posibles son: | `interactions` | Interacciones proteína⇄proteína | `protein_a_id`
`protein_b_id`
`gene_b_id` | | `-l` `--limit` -Limitar el número de resultados, por ejemplo, 10. Predeterminado: Sin límite. +Limitar el número de resultados, por ejemplo, 10. Predeterminado: Sin límite. Nota: No es compatible con los recursos `tractability` y `depmap`. -`-o` `--out` +`-o` `--out` Ruta al archivo JSON donde se guardarán los resultados, por ejemplo, path/to/directory/results.json. Predeterminado: Salida estándar. Python: `save=True` guardará la salida en el directorio de trabajo actual. `--filters` Filtrar resultados por igualdad exacta usando nombres de columnas de OpenTargets devueltos. Pase múltiples filtros repitiendo la bandera, p. ej. '--filter disease.id=EFO_0000274 --filter drug.id=CHEMBL1743081'. Los campos anidados usan notación de punto, coincidiendo con los nombres de columna devueltos por la API. -**Banderas** +**Banderas** `-csv` `--csv` Solo en línea de comandos. Devuelve la salida en formato CSV, en lugar de formato JSON. Python: Use `json=True` para devolver la salida en formato JSON. -`-q` `--quiet` +`-q` `--quiet` Solo en línea de comandos. Evita que se muestre la información de progreso. -Python: Use `verbose=False` para evitar que se muestre la información de progreso. +Python: Use `verbose=False` para evitar que se muestre la información de progreso. `-or` `--or` Solo en línea de comandos. Los filtros se combinan con lógica OR. Predeterminado: lógica AND. @@ -54,7 +54,7 @@ Solo para Python. `wrap_text=True` muestra el marco de datos con texto ajustado ### Ejemplos -**Obtenga enfermedades asociadas a un gen específico:** +**Obtenga enfermedades asociadas a un gen específico:** ```bash gget opentargets ENSG00000169194 -r diseases -l 1 ``` @@ -71,7 +71,7 @@ gget.opentargets('ENSG00000169194', resource='diseases', limit=1)

-**Obtener medicamentos asociados para un gen específico:** +**Obtener medicamentos asociados para un gen específico:** ```bash gget opentargets ENSG00000169194 -r drugs -l 2 ``` @@ -92,7 +92,7 @@ gget.opentargets('ENSG00000169194', resource='drugs', limit=2)

-**Obtenga datos de trazabilidad para un gen específico:** +**Obtenga datos de trazabilidad para un gen específico:** ```bash gget opentargets ENSG00000169194 -r tractability ``` @@ -235,10 +235,10 @@ gget.opentargets( | 0.400 | 1 | intact | P35225 | ENSG00000169194 | IL13 | unspecified role | 9606 | Q86XT9 | ENSG00000149932 | TMEM219 | stimulator | 9606 | - + #### [Más ejemplos](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget opentargets` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/es/pdb.md b/docs/src/es/pdb.md index 3469577a6..459d79f42 100644 --- a/docs/src/es/pdb.md +++ b/docs/src/es/pdb.md @@ -3,7 +3,7 @@ > Parámetros de Python són iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Banderas son parámetros de verdadero o falso (True/False) en Python. El manuál para cualquier modulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`. # gget pdb 🔮 Obtenga la estructura o los metadatos de una proteína usando data de [RCSB Protein Data Bank (PDB)](https://www.rcsb.org/). -Regresa: El archivo 'pdb' se regresa en formato PDB. Todos los demás datos se regresan en formato JSON. +Regresa: El archivo 'pdb' se regresa en formato PDB. Todos los demás datos se regresan en formato JSON. **Parámetro posicional** `pdb_id` @@ -12,7 +12,7 @@ ID del tipo PDB, p. ej. '7S7U'. **Parámetros optionales** `-r` `--resource` Define el tipo de información a regresar. Uno de los siguientes: - 'pdb': Regresa la estructura de la proteína en formato PDB (regresa por defecto). + 'pdb': Regresa la estructura de la proteína en formato PDB (regresa por defecto). 
'entry': Regresa información sobre las estructuras PDB en el nivel superior de la organización de datos PDB jerárquicos. 'pubmed': Regresa anotaciones de PubMed (datos integrados de PubMed) para la cita principal de un ID PDB. 'assembly': Regresa información sobre estructuras PDB en el nivel de estructura cuaternaria. @@ -22,15 +22,15 @@ Define el tipo de información a regresar. Uno de los siguientes: 'uniprot': Regresa anotaciones UniProt para una entidad macromolecular (defina el ID de la entidad como `identifier`). 'branched_entity_instance': Regresa la descripción de instancia de entidad ramificada (defina el ID de cadena como `identifier`). 'polymer_entity_instance': Regresa datos de instancia de entidad polimérica (también conocida como cadena) (defina el ID de cadena como `identifier`). - 'nonpolymer_entity_instance': Regresa datos de instancia de entidad no polimérica (defina el ID de cadena como `identifier`). - + 'nonpolymer_entity_instance': Regresa datos de instancia de entidad no polimérica (defina el ID de cadena como `identifier`). + `-i` `--identifier` Este parámetro se puede utilizar para definir el ID de ensamblaje, entidad o cadena (po defecto: None). Los IDs de ensamblaje/entidad son números (p. ej., 1) y los IDs de cadena son letras (p. ej., 'A'). - -`-o` `--out` + +`-o` `--out` Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/7S7U.pdb (o 7S7U_entry.json). Por defecto: salida estándar (STDOUT). -Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual. - +Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual. + ### Por ejemplo ```bash gget pdb 7S7U -o 7S7U.pdb @@ -77,10 +77,10 @@ gget.pdb("7DQA", save=True) gget.pdb("7CT5", save=True) ``` → Este caso de uso ejemplifica cómo encontrar archivos PDB para un análisis comparativo de la estructura de las proteínas asociado con IDs de Ensembl o secuencias de aminoácidos. 
Los archivos PDB obtenidos también se pueden comparar con las estructuras predichas generadas por [`gget alphafold`](alphafold.md). Los archivos PDB se pueden ver de forma interactiva en 3D [aquí](https://rcsb.org/3d-view), o usando programas como [PyMOL](https://pymol.org/) o [Blender](https://www.blender.org/). Múltiple archivos PDB se pueden visualizar para comparación [aquí](https://rcsb.org/alignment). - + #### [Más ejemplos](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget pdb` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/es/ref.md b/docs/src/es/ref.md index 9b8744a59..f25a2edd1 100644 --- a/docs/src/es/ref.md +++ b/docs/src/es/ref.md @@ -8,44 +8,44 @@ Regresa: Resultados en formato JSON. **Parámetro posicional** `species` La especie por la cual que se buscará los FTP en el formato género_especies, p. ej. homo_sapiens. -Nota: No se requiere cuando se llama a la bandera `--list_species`. +Nota: No se requiere cuando se llama a la bandera `--list_species`. Accesos directos: 'human', 'mouse', 'human_grch37' (accede al ensamblaje del genoma GRCh37) **Parámetros optionales** `-w` `--which` -Define qué resultados devolver. Por defecto: 'all' -> Regresa todos los resultados disponibles. -Las entradas posibles son uno solo o una combinación de las siguientes (como lista separada por comas): +Define qué resultados devolver. Por defecto: 'all' -> Regresa todos los resultados disponibles. +Las entradas posibles son uno solo o una combinación de las siguientes (como lista separada por comas): 'gtf' - Regresa la anotación (GTF). 'cdna' - Regresa el transcriptoma (cDNA). 'dna' - Regresa el genoma (DNA). 'cds' - Regresa las secuencias codificantes correspondientes a los genes Ensembl. 
(No contiene UTR ni secuencia intrónica). -'cdrna' - Regresa secuencias de transcripción correspondientes a genes de ARN no codificantes (ncRNA). -'pep' - Regresa las traducciones de proteínas de los genes Ensembl. +'cdrna' - Regresa secuencias de transcripción correspondientes a genes de ARN no codificantes (ncRNA). +'pep' - Regresa las traducciones de proteínas de los genes Ensembl. `-r` `--release` Define el número de versión de Ensembl desde el que se obtienen los archivos, p. ej. 104. Default: latest Ensembl release. -`-od` `--out_dir` +`-od` `--out_dir` Ruta al directorio donde se guardarán los archivos FTP, p. ruta/al/directorio/. Por defecto: directorio de trabajo actual. -`-o` `--out` +`-o` `--out` Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.json. Por defecto: salida estándar (STDOUT). Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual. **Banderas** -`-l` `--list_species` +`-l` `--list_species` Enumera todas las especies disponibles. (Para Python: combina con `species=None`.) -`-ftp` `--ftp` +`-ftp` `--ftp` Regresa solo los enlaces FTP solicitados. -`-d` `--download` +`-d` `--download` Solo para Terminal. Descarga los FTP solicitados al directorio actual (requiere [curl](https://curl.se/docs/) para ser instalado). -`-q` `--quiet` +`-q` `--quiet` Solo para la Terminal. Impide la informacion de progreso de ser exhibida durante la corrida. Para Python, usa `verbose=False` para imipidir la informacion de progreso de ser exhibida durante la corrida. - + ### Por ejemplo **Use `gget ref` en combinación con [kallisto | bustools](https://www.kallistobus.tools/kb_usage/kb_ref/) para construir un índice de referencia:** ```bash @@ -67,8 +67,8 @@ gget.ref(species=None, list_species=True, release=103) (Si no se especifica ninguna versión, `gget ref` siempre devolverá información de la última versión de Ensembl).

- -**Obtenga la referencia del genoma para una especie específica:** + +**Obtenga la referencia del genoma para una especie específica:** ```bash gget ref -w gtf,dna homo_sapiens ``` @@ -100,7 +100,7 @@ gget.ref("homo_sapiens", which=["gtf", "dna"]) #### [Más ejemplos](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget ref` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/es/search.md b/docs/src/es/search.md index a41ee730a..8f824de67 100644 --- a/docs/src/es/search.md +++ b/docs/src/es/search.md @@ -2,36 +2,36 @@ > Parámetros de Python són iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Banderas son parámetros de verdadero o falso (True/False) en Python. El manuál para cualquier modulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`. # gget search 🔎 -Obtenga genes y transcripciones de [Ensembl](https://www.ensembl.org/) usando términos de búsqueda de forma libre. -Los resultados se comparan según las secciones "nombre del gen" y "descripción" en la base de datos de Ensembl. `gget` versión >= 0.27.9 también incluye resultados que coinciden con la sección "sinónimo" de Ensembl. +Obtenga genes y transcripciones de [Ensembl](https://www.ensembl.org/) usando términos de búsqueda de forma libre. +Los resultados se comparan según las secciones "nombre del gen" y "descripción" en la base de datos de Ensembl. `gget` versión >= 0.27.9 también incluye resultados que coinciden con la sección "sinónimo" de Ensembl. Regresa: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python). **Parámetro posicional** -`searchwords` +`searchwords` Una o más palabras de búsqueda de forma libre, p. ej. gaba nmda. 
(Nota: la búsqueda no distingue entre mayúsculas y minúsculas). -**Otros parámetros requeridos** +**Otros parámetros requeridos** `-s` `--species` -Especies o base de datos a buscar. +Especies o base de datos a buscar. Una especie se puede pasar en el formato 'género_especie', p. ej. 'homo_sapiens' o 'arabidopsis_thaliana'. Para pasar una base de datos específica, pase el nombre de la base de datos CORE, p. ej. 'mus_musculus_dba2j_core_105_1'. - + Todas las bases de datos disponibles para cada versión de Ensembl se pueden encontrar aquí: Vertebrados: [http://ftp.ensembl.org/pub/current/mysql/](http://ftp.ensembl.org/pub/current/mysql/) Invertebrados: [http://ftp.ensemblgenomes.org/pub/current/](http://ftp.ensemblgenomes.org/pub/current/) + selecciona reino animal + selecciona mysql/ - + Accesos directos: 'human', 'mouse' **Parámetros optionales** -`-r` `--release` +`-r` `--release` Define el número de versión de Ensembl desde el que se obtienen los archivos, p. ej. 104. Por defecto: None -> se usa la última versión de Ensembl. - -Nota: *No se aplica a las especies invertebrados* (en su lugar, puede pasar una base de datos de una especies específica (incluyen un número de versión) al argumento `species`). Para especies de invertebrados, Ensembl solo almacena bases de datos de 10 versiones anteriores a la versión actual. - + +Nota: *No se aplica a las especies invertebrados* (en su lugar, puede pasar una base de datos de una especies específica (incluyen un número de versión) al argumento `species`). Para especies de invertebrados, Ensembl solo almacena bases de datos de 10 versiones anteriores a la versión actual. + Este argumento se sobrescribe si se pasa una base de datos específica (que incluye un número de publicación) al argumento `species`. `-t` `--id_type` -'gene' (esto se use por defecto) o 'transcript' +'gene' (esto se use por defecto) o 'transcript' Regesa genes o transcripciones, respectivamente. 
`-ao` `--andor` @@ -39,26 +39,26 @@ Regesa genes o transcripciones, respectivamente. 'or' ('o'): Regresa todos los genes que INCLUYEN AL MENOS UNA de las palabras de búsqueda en su nombre/descripción. 'and' ('y'): Regresa solo los genes que INCLUYEN TODAS las palabras de búsqueda en su nombre/descripción. -`-l` `--limit` +`-l` `--limit` Limita el número de resultados de búsqueda, p. ej. 10. Por defecto: None. -`-o` `--out` +`-o` `--out` Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.csv (o .json). Por defecto: salida estándar (STDOUT). Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual. **Banderas** `-csv` `--csv` -Solo para la Terminal. Regresa los resultados en formato CSV. +Solo para la Terminal. Regresa los resultados en formato CSV. Para Python, usa `json=True` para regresar los resultados en formato JSON. -`-q` `--quiet` +`-q` `--quiet` Solo para la Terminal. Impide la informacion de progreso de ser exhibida durante la corrida. Para Python, usa `verbose=False` para imipidir la informacion de progreso de ser exhibida durante la corrida. `wrap_text` -Solo para Python. `wrap_text=True` muestra los resultados con texto envuelto para facilitar la lectura (por defecto: False). +Solo para Python. `wrap_text=True` muestra los resultados con texto envuelto para facilitar la lectura (por defecto: False). + - ### Por ejemplo ```bash gget search -s human gaba gamma-aminobutyric @@ -73,10 +73,10 @@ gget.search(["gaba", "gamma-aminobutyric"], "homo_sapiens") | -------------- |-------------------------| ------------------------| -------------- | ----------|-----| | ENSG00000034713| GABARAPL2 | GABA type A receptor associated protein like 2 [Source:HGNC Symbol;Acc:HGNC:13291] | GABA type A receptor associated protein like 2 | protein_coding | https://uswest.ensembl.org/homo_sapiens/Gene/Summary?g=ENSG00000034713 | | . . . | . . . | . . . | . . . | . . . | . . . 
| - + #### [Más ejemplos](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget search` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/es/seq.md b/docs/src/es/seq.md index d8495123f..8b4badc93 100644 --- a/docs/src/es/seq.md +++ b/docs/src/es/seq.md @@ -6,11 +6,11 @@ Obtenga la(s) secuencia(s) nucleótidos o aminoácidos de un gen (y todas sus is Regresa: Archivo de tipo FASTA. **Parámetro posicional** -`ens_ids` +`ens_ids` One or more Ensembl IDs. **Parámetros optionales** -`-o` `--out` +`-o` `--out` Ruta al archivo en el que se guardarán los resultados, p. ruta/al/directorio/resultados.fa. Por defecto: salida estándar (STDOUT). Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual. @@ -20,11 +20,11 @@ Regresa secuencias de aminoácidos (en lugar de nucleótidos). Las secuencias de nucleótidos se obtienen de [Ensembl](https://www.ensembl.org/). Las secuencias de aminoácidos se obtienen de [UniProt](https://www.uniprot.org/). -`-iso` `--isoforms` +`-iso` `--isoforms` Regresa las secuencias de todas las transcripciones conocidas. (Solo para IDs de genes). -`-q` `--quiet` +`-q` `--quiet` Solo para la Terminal. Impide la informacion de progreso de ser exhibida durante la corrida. Para Python, usa `verbose=False` para imipidir la informacion de progreso de ser exhibida durante la corrida. @@ -52,7 +52,7 @@ gget.seq("ENSG00000034713", translate=True, isoforms=True) #### [Más ejemplos](https://github.com/pachterlab/gget_examples) -# Citar +# Citar Si utiliza `gget seq` en una publicación, favor de citar los siguientes artículos: - Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. 
[https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836) diff --git a/docs/src/es/setup.md b/docs/src/es/setup.md index 954308947..bbe52f131 100644 --- a/docs/src/es/setup.md +++ b/docs/src/es/setup.md @@ -8,7 +8,7 @@ Función para instalar/descargar dependencias de terceros para un módulo de gge > **Nota:** Algunas dependencias (por ejemplo, `cellxgene-census`) pueden no ser compatibles con las versiones más recientes de Python. Si encuentras errores durante la instalación, intenta usar un entorno con una versión anterior de Python. **Parámetro posicional** -`module` +`module` Módulo gget para el que se deben instalar las dependencias. ### Por ejemplo @@ -20,4 +20,3 @@ gget setup alphafold gget.setup("alphafold") ``` → Instala todas las dependencias de terceros (modificadas) y descarga los parámetros del algoritmo (~4 GB) necesarios para ejecutar [`gget alphafold`](alphafold.md). - diff --git a/docs/src/es/updates.md b/docs/src/es/updates.md index 9f044b2ef..c9559b7c5 100644 --- a/docs/src/es/updates.md +++ b/docs/src/es/updates.md @@ -64,7 +64,7 @@ - [`gget pdb`](pdb.md): Agregado el sitio web de `wwpdb`, retrocede a `rcsb` si las solicitudes fallan. - [`gget cellxgene`](cellxgene.md): Mejora el manejo de argumentos; el frontend no cambia. - [`gget setup`](setup.md)/[`gget alphafold`](alphafold.md): Corrige el error pip_cmd en `gget.setup("alphafold")`. - + **Versión ≥ 0.29.2** (03 de julio de 2025): - Ahora se puede instalar `gget` usando `uv pip install gget` - Toda la metadata del paquete (versión, autor, descripción, etc.) ahora se gestiona en `setup.cfg` para una compatibilidad total con herramientas modernas como `uv`, `pip` y PyPI @@ -89,7 +89,7 @@ - Se permite la consulta de múltiples genes a la vez. - [`gget diamond`](diamond.md): - Ahora soporta alineamiento traducido de secuencias nucleotídicas contra secuencias de referencia de aminoácidos usando la opción `--translated`. 
-- [`gget elm`](elm.md): +- [`gget elm`](elm.md): - Mejorado el manejo de errores del servidor. **Versión ≥ 0.29.0** (25 de septiembre de 2024): @@ -110,14 +110,14 @@ - Pruebas unitarias reorganizadas para aumentar la velocidad y disminuir el código - Requisitos actualizados para [permitir versiones más nuevas de mysql-connector](https://github.com/pachterlab/gget/pull/159) - [Soporte para Numpy>= 2.0](https://github.com/pachterlab/gget/issues/157) - + **Versión ≥ 0.28.6 (2 de junio de 2024):** - **Nuevo módulo: [`gget mutate`](./mutate.md)** - [`gget cosmic`](./cosmic.md): Ahora puedes descargar bases de datos completas de COSMIC utilizando el argumento `download_cosmic` - [`gget ref`](./ref.md): Ahora puede obtener la ensambladura del genoma GRCh37 usando `species='human_grch37'` - [`gget search`](./search.md): Ajusta el acceso a los datos humanos a la estructura de la versión 112 de Ensembl (corrige [issue 129](https://github.com/pachterlab/gget/issues/129)) -~~**Version ≥ 0.28.5** (May 29, 2024):~~ +~~**Version ≥ 0.28.5** (May 29, 2024):~~ - Retirado debido a un error con 'logging' en `gget.setup("alphafold")` + mutaciones de inversión en `gget mutate` solo invierten la cadena en lugar de también calcular la hebra complementaria **Versión ≥ 0.28.4** (31 de enero de 2024): @@ -141,13 +141,13 @@ - [`gget ref`](./ref.md): - Cambios de back-end para aumentar la velocidad.
- Nuevo argumento: `list_iv_species` para enumerar todas las especies de invertebrados disponibles (se puede combinar con el argumento `release` para obtener todas las especies disponibles de una liberación específica de Ensembl) - + **Versión ≥ 0.28.2** (15 de noviembre de 2023): - [`gget info`](./info.md): devuelve un mensaje de error cuando el servidor NCBI falla por un motivo distinto a un error de recuperación (esto es un error en el lado del servidor en lugar de un error con `gget`) - Reemplace el argumento obsoleto 'texto' para los métodos de tipo find() siempre que se usen con la dependencia `BeautifulSoup` - [`gget elm`](elm.md): Elimina instancias de falsos positivos y verdaderos negativos de los resultados devueltos. - [`gget elm`](elm.md): agrega el argumento `expand` - + **Versión ≥ 0.28.0** (5 de noviembre de 2023): - Documentación actualizada de [`gget muscle`](./muscle.md) para agregar un tutorial sobre cómo visualizar secuencias con diferentes longitudes de nombres de secuencia + ligero cambio en la visualización devuelta para que sea un poco más sólida ante diferentes nombres de secuencia - [`gget muscle`](./muscle.md) ahora también permite una lista de secuencias como entrada (como alternativa a proporcionar la ruta a un archivo FASTA) @@ -155,11 +155,11 @@ - [`gget seq`](./seq.md): permite nombres de genes faltantes (corrige [https://github.com/pachterlab/gget/issues/107](https://github.com/pachterlab/gget/issues/107)) - Nuevos argumentos para [`gget enrichr`](enrichr.md): use el argumento `kegg_out` y `kegg_rank` para crear una imagen de la vía KEGG con los genes del análisis de enriquecimiento resaltados (gracias a [este PR](https://github.com/pachterlab/gget/pull/106) por [Noriaki Sato](https://github.com/noriakis)) - Nuevos módulos: [`gget elm`](elm.md) y [`gget diamond`](diamond.md) - + **Versión ≥ 0.27.9** (7 de agosto de 2023): - Nuevos argumentos para [`gget enrichr`](enrichr.md): use el argumento `background_list` para
proporcionar una lista de genes 'background' - [`gget search`](search.md) ahora también busca sinónimos [Ensembl](https://ensembl.org/) (además de nombres y descripciones de genes) para obtener resultados de búsqueda más completos (gracias a [Samuel Klein](https://github.com/KleinSamuel) por la [sugerencia](https://github.com/pachterlab/gget/issues/90)) - + **Versión ≥ 0.27.8** (12 de julio de 2023): - Nuevo argumento para [`gget search`](search.md): especifique la versión de Ensembl desde la cual se obtiene la información con `-r` `--release` - Se corrigió un [error](https://github.com/pachterlab/gget/issues/91) en [`gget pdb`](pdb.md) (este error se introdujo en la versión 0.27.5) @@ -179,7 +179,7 @@ - Todos los módulos gget ahora tienen una bandera `-q / --quiet` (para Python: `verbose=False`) para desactivar la información de progreso **Versión ≥ 0.27.4** (19 de marzo de 2023): -- Nuevo módulo: [`gget gpt`](gpt.md) +- Nuevo módulo: [`gget gpt`](gpt.md) **Versión ≥ 0.27.3** (11 de marzo de 2023): - [`gget info`](info.md) excluye los ID de PDB de forma predeterminada para aumentar la velocidad (los resultados de PDB se pueden incluir usando la marca `--pdb` / `pdb=True`). diff --git a/docs/src/es/virus.md b/docs/src/es/virus.md index bf18e444e..c4caafb44 100644 --- a/docs/src/es/virus.md +++ b/docs/src/es/virus.md @@ -27,7 +27,7 @@ Para descargas en caché de SARS-CoV-2 y Alphainfluenza, se admite: Use la opción `--download_all_accessions` para aplicar filtros sin buscar un virus específico.
-**Argumentos opcionales** +**Argumentos opcionales** _Filtros de hospedador_ @@ -279,8 +279,8 @@ gget virus "SARS-CoV-2" --host human --nuc_completeness complete --min_seq_lengt import gget gget.virus( - "SARS-CoV-2", - host="human", + "SARS-CoV-2", + host="human", nuc_completeness="complete", min_seq_length=29000, genbank_metadata=True, @@ -303,8 +303,8 @@ gget virus "Influenza A virus" --host human --nuc_completeness complete --max_se import gget gget.virus( - "Influenza A virus", - host="human", + "Influenza A virus", + host="human", nuc_completeness="complete", max_seq_length=15000, genbank_metadata=True, diff --git a/gget/__init__.py b/gget/__init__.py index 3c65663f0..506788c94 100644 --- a/gget/__init__.py +++ b/gget/__init__.py @@ -1,44 +1,39 @@ -from .gget_ref import ref -from .gget_search import search -from .gget_info import info -from .gget_seq import seq -from .gget_muscle import muscle +"""gget: efficient querying of genomic databases.""" + +import logging +from importlib.metadata import PackageNotFoundError, version + +from .gget_8cube import gene_expression, psi_block, specificity +from .gget_alphafold import alphafold +from .gget_archs4 import archs4 +from .gget_bgee import bgee from .gget_blast import blast from .gget_blat import blat -from .gget_enrichr import enrichr -from .gget_archs4 import archs4 -from .gget_alphafold import alphafold -from .gget_setup import setup -from .gget_pdb import pdb -from .gget_gpt import gpt +from .gget_cbio import cbio_plot, cbio_search from .gget_cellxgene import cellxgene -from .gget_elm import elm -from .gget_diamond import diamond from .gget_cosmic import cosmic +from .gget_diamond import diamond +from .gget_elm import elm +from .gget_enrichr import enrichr +from .gget_gpt import gpt +from .gget_info import info +from .gget_muscle import muscle from .gget_mutate import mutate from .gget_opentargets import opentargets -from .gget_cbio import cbio_plot, cbio_search -from .gget_bgee import bgee -from .gget_8cube 
import specificity, psi_block, gene_expression +from .gget_pdb import pdb +from .gget_ref import ref +from .gget_search import search +from .gget_seq import seq +from .gget_setup import setup from .gget_virus import virus -import logging - # Mute numexpr threads info logging.getLogger("numexpr").setLevel(logging.WARNING) - -# Get version number from the config file -try: - from importlib.metadata import version, PackageNotFoundError -except ImportError: - from importlib_metadata import version, PackageNotFoundError # For Python <3.8 - try: __version__ = version("gget") except PackageNotFoundError: __version__ = "unknown" - __author__ = "Laura Luebbert" __email__ = "lauralubbert@gmail.com" diff --git a/gget/compile.py b/gget/compile.py index 4cd028c98..bfd8f6c4c 100644 --- a/gget/compile.py +++ b/gget/compile.py @@ -1,7 +1,7 @@ import os +import platform import subprocess import sys -import platform from .constants import MUSCLE_GITHUB_LINK from .utils import set_up_logger @@ -11,21 +11,16 @@ # Get absolute package path PACKAGE_PATH = os.path.abspath(os.path.dirname(__file__)) # Path to muscle binary (only exists after 'compile_muscle' was executed) -MUSCLE_PATH = os.path.join( - PACKAGE_PATH, f"bins/compiled/muscle/src/{platform.system()}/muscle" -) +MUSCLE_PATH = os.path.join(PACKAGE_PATH, f"bins/compiled/muscle/src/{platform.system()}/muscle") def compile_muscle(): - """ - Compiles MUSCLE from source. + """Compiles MUSCLE from source. + Currently only supports Linux and Darwin. """ - if platform.system() != "Linux" and platform.system() != "Darwin": - raise OSError( - f"Muscle compiler currently only supports Linux and Darwin, not {platform.system()}.\n" - ) + raise OSError(f"Muscle compiler currently only supports Linux and Darwin, not {platform.system()}.\n") logger.info("Compiling MUSCLE binary from source... 
") @@ -56,16 +51,10 @@ def compile_muscle(): # Run make command if platform.system() == "Linux": - logger.warning( - "Compiling MUSCLE requires that g++, make, sed and git are installed." - ) + logger.warning("Compiling MUSCLE requires that g++, make, sed and git are installed.") if platform.system() == "Darwin": - logger.warning( - "Compiling MUSCLE requires that gcc v11, make, sed and git are installed." - ) - logger.warning( - "Please run 'brew install gcc' to install gcc v11 if the compile fails." - ) + logger.warning("Compiling MUSCLE requires that gcc v11, make, sed and git are installed.") + logger.warning("Please run 'brew install gcc' to install gcc v11 if the compile fails.") command2 = "make -s" diff --git a/gget/constants.py b/gget/constants.py index 463987f77..1ea0aa663 100644 --- a/gget/constants.py +++ b/gget/constants.py @@ -47,13 +47,11 @@ GET_BACKGROUND_ENRICHR_URL = "https://maayanlab.cloud/speedrichr/api/backgroundenrich" POST_ENRICHR_URLS = { - f"{typ}": f"https://maayanlab.cloud/{typ.capitalize()}Enrichr/addList" - for typ in ["fly", "yeast", "worm", "fish"] + f"{typ}": f"https://maayanlab.cloud/{typ.capitalize()}Enrichr/addList" for typ in ["fly", "yeast", "worm", "fish"] } POST_ENRICHR_URLS["human"] = POST_ENRICHR_URL GET_ENRICHR_URLS = { - f"{typ}": f"https://maayanlab.cloud/{typ.capitalize()}Enrichr/enrich" - for typ in ["fly", "yeast", "worm", "fish"] + f"{typ}": f"https://maayanlab.cloud/{typ.capitalize()}Enrichr/enrich" for typ in ["fly", "yeast", "worm", "fish"] } GET_ENRICHR_URLS["human"] = GET_ENRICHR_URL @@ -62,12 +60,8 @@ EXPRESSION_URL = "https://maayanlab.cloud/archs4/search/loadExpressionTissue.php?" 
# Download links for ELM database -ELM_INSTANCES_FASTA_DOWNLOAD = ( - "http://elm.eu.org/instances.fasta?q=*&taxon=&instance_logic=" -) -ELM_INSTANCES_TSV_DOWNLOAD = ( - "http://elm.eu.org/instances.tsv?q=*&taxon=&instance_logic=" -) +ELM_INSTANCES_FASTA_DOWNLOAD = "http://elm.eu.org/instances.fasta?q=*&taxon=&instance_logic=" +ELM_INSTANCES_TSV_DOWNLOAD = "http://elm.eu.org/instances.tsv?q=*&taxon=&instance_logic=" ELM_CLASSES_TSV_DOWNLOAD = "http://elm.eu.org/elms/elms_index.tsv" ELM_INTDOMAINS_TSV_DOWNLOAD = "http://elm.eu.org/interactiondomains.tsv" diff --git a/gget/gget_8cube.py b/gget/gget_8cube.py index f22036caf..1f685bb00 100644 --- a/gget/gget_8cube.py +++ b/gget/gget_8cube.py @@ -1,10 +1,11 @@ -import requests -import pandas as pd -import json as json_package import io +import json as json_package + +import pandas as pd +import requests -from .utils import set_up_logger from .constants import DEFAULT_REQUESTS_TIMEOUT +from .utils import set_up_logger logger = set_up_logger() @@ -25,9 +26,7 @@ def _convert_to_df(response_text, endpoint_name): try: return pd.read_csv(io.StringIO(response_text)) except Exception as e: - raise RuntimeError( - f"API '{endpoint_name}' returned non-CSV data: {e}\nResponse:\n{response_text}" - ) + raise RuntimeError(f"API '{endpoint_name}' returned non-CSV data: {e}\nResponse:\n{response_text}") from e def _save_output(df_or_json, name, json=False, verbose=True): @@ -58,8 +57,8 @@ def specificity( save=False, verbose=True, ): - """ - Retrieve gene-level specificity statistics from the 8cubeDB + """Retrieve gene-level specificity statistics from the 8cubeDB. + (https://eightcubedb.onrender.com/). This endpoint returns ψ (psi) and ζ (zeta) specificity metrics for one @@ -76,7 +75,8 @@ def specificity( gget_8cube_specificity.csv (or .json if json=True). - verbose If True, print progress information. Default: True. 
- Returns: + Returns + ------- A pandas DataFrame or JSON list containing: - gene_name - ensembl_id @@ -85,11 +85,12 @@ def specificity( - Psi_mean, Psi_std - Zeta_mean, Zeta_std - Raises: + Raises + ------ - ValueError If gene_list is not a list. - RuntimeError If the API request fails or returns invalid data. - """ + """ if not isinstance(gene_list, (list, tuple)): raise ValueError("`gene_list` must be a list.") @@ -130,8 +131,7 @@ def psi_block( save=False, verbose=True, ): - """ - Retrieve ψ_block (psi-block) specificity scores from the 8cubeDB. + """Retrieve ψ_block (psi-block) specificity scores from the 8cubeDB. ψ_block quantifies the specificity of a gene to a particular block within a partition. This endpoint supports block-wise @@ -146,15 +146,17 @@ def psi_block( or .json if json=True. - verbose If True, print progress information. Default: True. - Returns: + Returns + ------- A pandas DataFrame or JSON list containing ψ_block scores for each block label in the partition (e.g., "Male:NZOJ", "Female:B6J", etc.). - Raises: + Raises + ------ - ValueError If gene_list is not a list. - RuntimeError If the API request fails. - """ + """ if not isinstance(gene_list, (list, tuple)): raise ValueError("`gene_list` must be a list.") @@ -166,10 +168,7 @@ def psi_block( ] + [("gene_list", g) for g in processed] if verbose: - logger.info( - f"Fetching ψ-block scores for {len(processed)} genes " - f"({analysis_level}, {analysis_type})…" - ) + logger.info(f"Fetching ψ-block scores for {len(processed)} genes ({analysis_level}, {analysis_type})…") r = requests.get(PSI_BLOCK_URL, params=params, timeout=DEFAULT_REQUESTS_TIMEOUT) if not r.ok: @@ -200,8 +199,7 @@ def gene_expression( save=False, verbose=True, ): - """ - Retrieve normalized gene expression values from 8cubeDB. + """Retrieve normalized gene expression values from 8cubeDB. This endpoint returns mean and variance of normalized expression for the specified gene(s), computed over the selected partition. 
For example: @@ -217,15 +215,17 @@ def gene_expression( or .json if json=True. - verbose If True, print progress information. - Returns: + Returns + ------- A pandas DataFrame or JSON list with expression values and metadata for each partition block (columns vary depending on analysis_type). - Raises: + Raises + ------ - ValueError If gene_list is not a list. - RuntimeError If the API request fails or returns invalid/empty data. - """ + """ if not isinstance(gene_list, (list, tuple)): raise ValueError("`gene_list` must be a list.") @@ -237,16 +237,11 @@ def gene_expression( ] + [("gene_list", g) for g in processed] if verbose: - logger.info( - f"Fetching expression data for {len(processed)} genes " - f"({analysis_level}, {analysis_type})…" - ) + logger.info(f"Fetching expression data for {len(processed)} genes ({analysis_level}, {analysis_type})…") r = requests.get(GENE_EXPR_URL, params=params, timeout=DEFAULT_REQUESTS_TIMEOUT) if not r.ok: - raise RuntimeError( - f"Gene expression request failed ({r.status_code}): {r.text}" - ) + raise RuntimeError(f"Gene expression request failed ({r.status_code}): {r.text}") df = _convert_to_df(r.text, "gene_expression") diff --git a/gget/gget_alphafold.py b/gget/gget_alphafold.py index 6a574cb01..5455bbfc3 100644 --- a/gget/gget_alphafold.py +++ b/gget/gget_alphafold.py @@ -11,49 +11,43 @@ # Get current date and time for default foldername dt_string = datetime.now().strftime("%Y_%m_%d-%H%M") -from tqdm import tqdm -import os -import shutil -import sys -import enum -import glob -import json -import subprocess -import platform -import collections -import copy -from concurrent import futures -import random -from urllib import request -import matplotlib.pyplot as plt -import numpy as np -from IPython import display -from ipywidgets import GridspecLayout -from ipywidgets import Output - -from .utils import set_up_logger +import collections # noqa: E402 +import copy # noqa: E402 +import enum # noqa: E402 +import glob # noqa: E402 
+import json # noqa: E402 +import os # noqa: E402 +import platform # noqa: E402 +import random # noqa: E402 +import shutil # noqa: E402 +import subprocess # noqa: E402 +import sys # noqa: E402 +from concurrent import futures # noqa: E402 +from urllib import request # noqa: E402 + +import matplotlib.pyplot as plt # noqa: E402 +import numpy as np # noqa: E402 +from IPython import display # noqa: E402 +from ipywidgets import GridspecLayout, Output # noqa: E402 +from tqdm import tqdm # noqa: E402 + +from .utils import set_up_logger # noqa: E402 logger = set_up_logger() -TQDM_BAR_FORMAT = ( - "{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]" -) +TQDM_BAR_FORMAT = "{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]" -from .compile import PACKAGE_PATH +from .compile import PACKAGE_PATH # noqa: E402 # from .gget_setup import TMP_DISK -from .gget_setup import UUID, PARAMS_DIR +from .gget_setup import PARAMS_DIR, UUID # noqa: E402 STEREO_CHEM_DIR = os.path.join(PARAMS_DIR, "stereo_chemical_props.txt") # Path to jackhmmer binary -JACKHMMER_BINARY_PATH = os.path.join( - PACKAGE_PATH, f"bins/{platform.system()}/jackhmmer" -) +JACKHMMER_BINARY_PATH = os.path.join(PACKAGE_PATH, f"bins/{platform.system()}/jackhmmer") # Test pattern to find closest source -test_url_pattern = ( - "https://storage.googleapis.com/alphafold-colab{:s}/latest/uniref90_2022_01.fasta.1" -) +test_url_pattern = "https://storage.googleapis.com/alphafold-colab{:s}/latest/uniref90_2022_01.fasta.1" # Sequence validation parameters MIN_PER_SEQUENCE_LENGTH = 16 @@ -80,9 +74,7 @@ def plot_plddt_legend(): - """ - Function to plot the legend for pLDDT. - """ + """Function to plot the legend for pLDDT.""" thresh = [ "Very low (pLDDT < 50)", "Low (70 > pLDDT > 50)", @@ -109,17 +101,13 @@ def plot_plddt_legend(): def fetch(source): - """ - Support function for finding closest source. 
- """ + """Support function for finding closest source.""" request.urlretrieve(test_url_pattern.format(source)) return source def get_msa(fasta_path, msa_databases, total_jackhmmer_chunks): - """ - Function to search for MSA for the given sequence using chunked Jackhmmer search. - """ + """Function to search for MSA for the given sequence using chunked Jackhmmer search.""" from alphafold.data.tools import jackhmmer ## Run the search against chunks of genetic databases to save disk space @@ -150,9 +138,7 @@ def jackhmmer_chunk_callback(i): def clean_up(): - """ - Function to clean up temporary files after running gget alphafold. - """ + """Function to clean up temporary files after running gget alphafold.""" # # Remove fasta files with input sequences # files = glob.glob("target_*.fasta") # for f in files: @@ -196,8 +182,8 @@ def alphafold( show_sidechains=True, verbose=True, ): - """ - Predicts the structure of a protein using a slightly simplified version of AlphaFold v2.3.0 (https://doi.org/10.1038/s41586-021-03819-2) + """Predicts the structure of a protein using a slightly simplified version of AlphaFold v2.3.0 (https://doi.org/10.1038/s41586-021-03819-2). + published in the AlphaFold Colab notebook (https://colab.research.google.com/github/deepmind/alphafold/blob/main/notebooks/AlphaFold.ipynb). Args: @@ -229,32 +215,29 @@ def alphafold( If you use this function, please cite the gget (https://doi.org/10.1101/2022.05.17.492392) and AphaFold (https://doi.org/10.1038/s41586-021-03819-2) papers and, if applicable, the AlphaFold-Multimer paper (https://www.biorxiv.org/content/10.1101/2021.10.04.463034v1). """ - if platform.system() == "Windows": - logger.warning( - "gget setup alphafold and gget alphafold are not supported on Windows OS." 
- ) + logger.warning("gget setup alphafold and gget alphafold are not supported on Windows OS.") ## Check if third-party dependencies are installed # Check if openmm is installed try: - import simtk.openmm as openmm + import simtk.openmm as openmm # noqa: F401 except ImportError as e: raise ImportError( f""" Importing openmm resulted in the following error: {e} - Please install AlphaFold third-party dependency openmm by running the following command from the command line: - For Python version < 3.10: - 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1' - For Python version 3.10: - 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0' - For Python version 3.11: - 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0' + Please install AlphaFold third-party dependency openmm by running the following command from the command line: + For Python version < 3.10: + 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1' + For Python version 3.10: + 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0' + For Python version 3.11: + 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0' (Recommendation: Follow with 'conda update -qy conda' to update conda to the latest version afterwards.) """ - ) + ) from e # Check if AlphaFold is installed try: @@ -262,7 +245,7 @@ def alphafold( except ImportError: logger.error( """ - Some third-party dependencies are missing. Please run the following command: + Some third-party dependencies are missing. Please run the following command: >>> gget.setup('alphafold') or $ gget setup alphafold """ ) @@ -276,7 +259,7 @@ def alphafold( if pdb_out.decode() == "": logger.error( """ - Some third-party dependencies are missing. Please run the following command: + Some third-party dependencies are missing. 
Please run the following command: >>> gget.setup('alphafold') or $ gget setup alphafold """ ) @@ -286,7 +269,7 @@ def alphafold( if not os.path.exists(os.path.join(PARAMS_DIR, "params/")): logger.error( """ - The AlphaFold model parameters are missing. Please run the following command: + The AlphaFold model parameters are missing. Please run the following command: >>> gget.setup('alphafold') or $ gget setup alphafold """ ) @@ -295,24 +278,17 @@ def alphafold( if len(os.listdir(os.path.join(PARAMS_DIR, "params/"))) < 12: logger.error( """ - The AlphaFold model parameters are missing. Please run the following command: + The AlphaFold model parameters are missing. Please run the following command: >>> gget.setup('alphafold') or $ gget setup alphafold """ ) return ## Import AlphaFold functions - from alphafold.notebooks import notebook_utils - from alphafold.model import model - from alphafold.model import config - from alphafold.model import data - - from alphafold.data import feature_processing - from alphafold.data import msa_pairing - from alphafold.data import pipeline - from alphafold.data import pipeline_multimer - from alphafold.common import protein + from alphafold.data import feature_processing, msa_pairing, pipeline, pipeline_multimer + from alphafold.model import config, data, model + from alphafold.notebooks import notebook_utils try: from alphafold.relax import utils @@ -323,16 +299,16 @@ def alphafold( Importing openmm resulted in the following error: {e} - Please install AlphaFold third-party dependency openmm by running the following command from the command line: - For Python version < 3.10: - 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1' - For Python version 3.10: - 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0' - For Python version 3.11: - 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0' + Please install AlphaFold third-party dependency 
openmm by running the following command from the command line: + For Python version < 3.10: + 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1' + For Python version 3.10: + 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0' + For Python version 3.11: + 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0' (Recommendation: Follow with 'conda update -qy conda' to update conda to the latest version afterwards.) """ - ) + ) from e if relax: # Import AlphaFold relax package @@ -345,16 +321,16 @@ def alphafold( Importing openmm resulted in the following error: {e} - Please install AlphaFold third-party dependency openmm by running the following command from the command line: - For Python version < 3.10: - 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1' - For Python version 3.10: - 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0' - For Python version 3.11: - 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0' + Please install AlphaFold third-party dependency openmm by running the following command from the command line: + For Python version < 3.10: + 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1' + For Python version 3.10: + 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0' + For Python version 3.11: + 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0' (Recommendation: Follow with 'conda update -qy conda' to update conda to the latest version afterwards.) 
""" - ) + ) from e ## Move stereo_chemical_props.txt from gget bins to Alphafold package so it can be found # logger.info("Locate files containing stereochemical properties.") @@ -367,7 +343,7 @@ def alphafold( ## Validate input sequence(s) if verbose: - logger.info(f"Validating input sequence(s).") + logger.info("Validating input sequence(s).") # Handle command line passing path to FASTA as a list if isinstance(sequence, list) and len(sequence) == 1: @@ -381,7 +357,7 @@ def alphafold( titles = [] seqs = [] with open(sequence) as text_file: - for i, line in enumerate(text_file): + for i, line in enumerate(text_file): # noqa: B007 # Recognize a title line by the '>' character if line[0] == ">": # Append title line to titles list @@ -398,9 +374,7 @@ def alphafold( # Each second line will be a title line if i % 2 == 0: if line[0] != ">": - raise ValueError( - "Expected FASTA to start with a '>' character. " - ) + raise ValueError("Expected FASTA to start with a '>' character. ") else: # Append title line to titles list titles.append(line.strip()) @@ -413,10 +387,8 @@ def alphafold( else: seqs.append(line.strip()) else: - raise ValueError( - "File format not recognized. gget alphafold only supports '.txt' or '.fa' files. " - ) - elif type(sequence) == str and not "." in sequence: + raise ValueError("File format not recognized. gget alphafold only supports '.txt' or '.fa' files. ") + elif isinstance(sequence, str) and "." not in sequence: # Convert string to list seqs = [sequence] else: @@ -435,9 +407,7 @@ class ModelType(enum.Enum): if len(seqs) == 1: if multimer_for_monomer: if verbose: - logger.info( - "Using the multimer model for a single chain, as requested." 
- ) + logger.info("Using the multimer model for a single chain, as requested.") model_type_to_use = ModelType.MULTIMER else: if verbose: @@ -460,7 +430,7 @@ class ModelType(enum.Enum): if len(seqs[0]) > MAX_MONOMER_MODEL_LENGTH: raise ValueError( f""" - Input sequence is too long: {len(sequences[0])} amino acids, while the maximum for the monomer model is {MAX_MONOMER_MODEL_LENGTH}. + Input sequence is too long: {len(sequences[0])} amino acids, while the maximum for the monomer model is {MAX_MONOMER_MODEL_LENGTH}. You can try to run this sequence with the multimer model by using the flag [-mfm] ('multimer_for_monomer=True'). """ ) @@ -472,7 +442,7 @@ class ModelType(enum.Enum): ## Find the closest source if verbose: - logger.info(f"Finding closest source for reference database.") + logger.info("Finding closest source for reference database.") ex = futures.ThreadPoolExecutor(3) fs = [ex.submit(fetch, source) for source in ["", "-europe", "-asia"]] @@ -551,7 +521,7 @@ class ModelType(enum.Enum): # Save the target sequence in a fasta file fasta_path = os.path.join(abs_out_path, f"target_{sequence_index}.fasta") - with open(fasta_path, "wt") as f: + with open(fasta_path, "w") as f: f.write(f">query\n{sequence}") # Don't do redundant work for multiple copies of the same chain in the multimer @@ -570,45 +540,31 @@ class ModelType(enum.Enum): single_chain_msas = [] uniprot_msa = None for db_name, db_results in raw_msa_results.items(): - merged_msa = notebook_utils.merge_chunked_msa( - results=db_results, max_hits=MAX_HITS.get(db_name) - ) + merged_msa = notebook_utils.merge_chunked_msa(results=db_results, max_hits=MAX_HITS.get(db_name)) if merged_msa.sequences and db_name != "uniprot": single_chain_msas.append(merged_msa) msa_size = len(set(merged_msa.sequences)) if verbose: - logger.info( - f"{msa_size} unique sequences found in {db_name} for sequence {sequence_index}." 
- ) + logger.info(f"{msa_size} unique sequences found in {db_name} for sequence {sequence_index}.") elif merged_msa.sequences and db_name == "uniprot": uniprot_msa = merged_msa - notebook_utils.show_msa_info( - single_chain_msas=single_chain_msas, sequence_index=sequence_index - ) + notebook_utils.show_msa_info(single_chain_msas=single_chain_msas, sequence_index=sequence_index) # Turn the raw data into model features. feature_dict = {} feature_dict.update( - pipeline.make_sequence_features( - sequence=sequence, description="query", num_res=len(sequence) - ) + pipeline.make_sequence_features(sequence=sequence, description="query", num_res=len(sequence)) ) feature_dict.update(pipeline.make_msa_features(msas=single_chain_msas)) # Add empty placeholder features - feature_dict.update( - notebook_utils.empty_placeholder_template_features( - num_templates=0, num_res=len(sequence) - ) - ) + feature_dict.update(notebook_utils.empty_placeholder_template_features(num_templates=0, num_res=len(sequence))) # Construct the all_seq features only for heteromers, not homomers if model_type_to_use == ModelType.MULTIMER and len(set(sequences)) > 1: valid_feats = msa_pairing.MSA_FEATURES + ("msa_species_identifiers",) all_seq_features = { - f"{k}_all_seq": v - for k, v in pipeline.make_msa_features([uniprot_msa]).items() - if k in valid_feats + f"{k}_all_seq": v for k, v in pipeline.make_msa_features([uniprot_msa]).items() if k in valid_feats } feature_dict.update(all_seq_features) @@ -621,15 +577,11 @@ class ModelType(enum.Enum): elif model_type_to_use == ModelType.MULTIMER: all_chain_features = {} for chain_id, chain_features in features_for_chain.items(): - all_chain_features[chain_id] = pipeline_multimer.convert_monomer_features( - chain_features, chain_id - ) + all_chain_features[chain_id] = pipeline_multimer.convert_monomer_features(chain_features, chain_id) all_chain_features = pipeline_multimer.add_assembly_features(all_chain_features) - np_example = 
feature_processing.pair_and_merge( - all_chain_features=all_chain_features - ) + np_example = feature_processing.pair_and_merge(all_chain_features=all_chain_features) # Pad MSA to avoid zero-sized extra_msa np_example = pipeline_multimer.pad_msa(np_example, min_num_seq=512) @@ -663,12 +615,8 @@ class ModelType(enum.Enum): params = data.get_model_haiku_params(model_name, PARAMS_DIR) model_runner = model.RunModel(cfg, params) - processed_feature_dict = model_runner.process_features( - np_example, random_seed=0 - ) - prediction = model_runner.predict( - processed_feature_dict, random_seed=random.randrange(sys.maxsize) - ) + processed_feature_dict = model_runner.process_features(np_example, random_seed=0) + prediction = model_runner.predict(processed_feature_dict, random_seed=random.randrange(sys.maxsize)) if model_type_to_use == ModelType.MONOMER: if "predicted_aligned_error" in prediction: @@ -697,9 +645,7 @@ class ModelType(enum.Enum): processed_feature_dict, prediction, b_factors=b_factors, - remove_leading_feature_dimension=( - model_type_to_use == ModelType.MONOMER - ), + remove_leading_feature_dimension=(model_type_to_use == ModelType.MONOMER), ) unrelaxed_proteins[model_name] = unrelaxed_protein @@ -711,12 +657,10 @@ class ModelType(enum.Enum): ## AMBER relax the best model # Find the best model according to the mean pLDDT. 
- best_model_name = max( - ranking_confidences.keys(), key=lambda x: ranking_confidences[x] - ) + best_model_name = max(ranking_confidences.keys(), key=lambda x: ranking_confidences[x]) if relax: - pbar.set_description(f"AMBER relaxation") + pbar.set_description("AMBER relaxation") amber_relaxer = run_relax.AmberRelaxation( max_iterations=0, @@ -726,9 +670,7 @@ class ModelType(enum.Enum): max_outer_iterations=3, use_gpu=False, ) - relaxed_pdb, _, _ = amber_relaxer.process( - prot=unrelaxed_proteins[best_model_name] - ) + relaxed_pdb, _, _ = amber_relaxer.process(prot=unrelaxed_proteins[best_model_name]) else: logger.warning( "\nRunning model without relaxation stage. Use flag [--relax] ('relax=True') to include AMBER relaxation." diff --git a/gget/gget_archs4.py b/gget/gget_archs4.py index e03fd43e9..2ec2b948c 100644 --- a/gget/gget_archs4.py +++ b/gget/gget_archs4.py @@ -1,17 +1,17 @@ -import requests -import pandas as pd -import json as json_package import io +import json as json_package + +import pandas as pd +import requests from .utils import set_up_logger logger = set_up_logger() # Custom functions -from .gget_info import info - # Constants -from .constants import GENECORR_URL, EXPRESSION_URL +from .constants import EXPRESSION_URL, GENECORR_URL # noqa: E402 +from .gget_info import info # noqa: E402 def archs4( @@ -24,9 +24,9 @@ def archs4( save=False, verbose=True, ): - """ - Find the most correlated genes or the tissue expression atlas - of a gene of interest using data from the human and mouse RNA-seq + """Find the most correlated genes or the tissue expression atlas of a gene of interest. + + Uses data from the human and mouse RNA-seq database ARCHS4 (https://maayanlab.cloud/archs4/). Args: @@ -52,16 +52,12 @@ def archs4( # Check if 'which' argument is valid whichs = ["correlation", "tissue"] if which not in whichs: - raise ValueError( - f"'which' argument specified as {which}. 
Expected one of: {', '.join(whichs)}" - ) + raise ValueError(f"'which' argument specified as {which}. Expected one of: {', '.join(whichs)}") # Check if 'species' argument is valid sps = ["human", "mouse"] if species not in sps: - raise ValueError( - f"'species' argument specified as {species}. Expected one of: {', '.join(sps)}" - ) + raise ValueError(f"'species' argument specified as {species}. Expected one of: {', '.join(sps)}") ## Transform Ensembl IDs to gene symbols if ensembl: @@ -72,9 +68,7 @@ def archs4( # Check if Ensembl ID was found if isinstance(info_df, type(None)): - logger.error( - f"ID '{gene}' not found. Please double-check spelling/arguments and try again." - ) + logger.error(f"ID '{gene}' not found. Please double-check spelling/arguments and try again.") return gene_symbol = info_df.loc[gene]["ensembl_gene_name"] @@ -90,9 +84,7 @@ def archs4( if which == "correlation": if verbose: - logger.info( - f"Fetching the {gene_count} most correlated genes to {gene} from ARCHS4." 
- ) + logger.info(f"Fetching the {gene_count} most correlated genes to {gene} from ARCHS4.") ## Find most similar genes based on co-expression # Define number of correlated genes to return (+1 to account for Python indexing) @@ -120,9 +112,7 @@ def archs4( ) return else: - logger.error( - f"Gene correlation request for search term '{gene}' returned error: {corr_data['error']}" - ) + logger.error(f"Gene correlation request for search term '{gene}' returned error: {corr_data['error']}") return else: @@ -136,9 +126,7 @@ def archs4( if json: results_dict = json_package.loads(corr_df.to_json(orient="records")) if save: - with open( - f"gget_archs4_gene-correlation_{gene}.json", "w", encoding="utf-8" - ) as f: + with open(f"gget_archs4_gene-correlation_{gene}.json", "w", encoding="utf-8") as f: json_package.dump(results_dict, f, ensure_ascii=False, indent=4) return results_dict @@ -151,9 +139,7 @@ def archs4( if which == "tissue": if verbose: - logger.info( - f"Fetching the tissue expression atlas of {gene} from {species} ARCHS4 data." 
- ) + logger.info(f"Fetching the tissue expression atlas of {gene} from {species} ARCHS4 data.") ## Find tissue expression data ## Define API query @@ -195,17 +181,13 @@ def archs4( if json: results_dict = json_package.loads(tissue_exp_df.to_json(orient="records")) if save: - with open( - f"gget_archs4_tissue-expression_{gene}.json", "w", encoding="utf-8" - ) as f: + with open(f"gget_archs4_tissue-expression_{gene}.json", "w", encoding="utf-8") as f: json_package.dump(results_dict, f, ensure_ascii=False, indent=4) return results_dict else: if save: - tissue_exp_df.to_csv( - f"gget_archs4_tissue-expression_{gene}.csv", index=False - ) + tissue_exp_df.to_csv(f"gget_archs4_tissue-expression_{gene}.csv", index=False) return tissue_exp_df diff --git a/gget/gget_bgee.py b/gget/gget_bgee.py index 166d902fb..37398495b 100644 --- a/gget/gget_bgee.py +++ b/gget/gget_bgee.py @@ -1,19 +1,17 @@ -import pandas as pd import json as json_ -from .utils import set_up_logger, json_list_to_df, http_json, dig +from .utils import dig, http_json, json_list_to_df, set_up_logger logger = set_up_logger() def _bgee_species(gene_id: str, verbose=True): - """ - Get species ID from Bgee + """Get species ID from Bgee. + :param gene_id: Ensembl gene ID :param verbose: log progress - :return: species ID + :return: species ID. """ - if verbose: logger.info(f"Getting species ID for gene {gene_id} from Bgee") @@ -38,8 +36,7 @@ def _bgee_species(gene_id: str, verbose=True): def _bgee_orthologs(gene_id, json=False, verbose=True): - """ - Get orthologs for a gene from Bgee + """Get orthologs for a gene from Bgee. Args: @@ -51,9 +48,7 @@ def _bgee_orthologs(gene_id, json=False, verbose=True): """ # if single Ensembl ID passed as string, convert to list if isinstance(gene_id, list): - raise ValueError( - "One a single gene ID can be passed at a time for ortholog searches." 
- ) + raise ValueError("One a single gene ID can be passed at a time for ortholog searches.") # must first obtain species species = _bgee_species(gene_id, verbose=verbose) @@ -96,8 +91,7 @@ def _bgee_orthologs(gene_id, json=False, verbose=True): def _bgee_expression(gene_id, json=False, verbose=True): - """ - Get expression data from Bgee + """Get expression data from Bgee. Args: @@ -143,7 +137,10 @@ def _bgee_expression(gene_id, json=False, verbose=True): ) expression_data = dig( - payload, "data", "expressionData", "expressionCalls", + payload, + "data", + "expressionData", + "expressionCalls", context="Bgee API (expression)", ) @@ -173,8 +170,7 @@ def bgee( json=False, verbose=True, ): - """ - Get orthologs/expression data for a gene from Bgee (https://www.bgee.org/). + """Get orthologs/expression data for a gene from Bgee (https://www.bgee.org/). Args: type type of data to retrieve ('expression' or 'orthologs') @@ -189,6 +185,4 @@ def bgee( elif type == "orthologs": return _bgee_orthologs(gene_id, json=json, verbose=verbose) else: - raise ValueError( - f"Argument type should be 'expression' or 'orthologs', not '{type}'" - ) + raise ValueError(f"Argument type should be 'expression' or 'orthologs', not '{type}'") diff --git a/gget/gget_blast.py b/gget/gget_blast.py index 7afc48396..eb0183d81 100644 --- a/gget/gget_blast.py +++ b/gget/gget_blast.py @@ -1,24 +1,24 @@ -from io import StringIO - -import pandas as pd import json as json_package import time -from bs4 import BeautifulSoup +from io import StringIO +from urllib.parse import urlencode # Using urllib instead of requests here because requests does not # support long queries (queries very long here due to input sequence) -from urllib.request import urlopen, Request -from urllib.parse import urlencode +from urllib.request import Request, urlopen + +import pandas as pd +from bs4 import BeautifulSoup # Custom functions -from .utils import parse_blast_ref_page, wrap_cols_func, read_fasta, set_up_logger +from 
.utils import parse_blast_ref_page, read_fasta, set_up_logger, wrap_cols_func logger = set_up_logger() # Constants -from .constants import ( - BLAST_URL, +from .constants import ( # noqa: E402 BLAST_CLIENT, + BLAST_URL, ) @@ -35,8 +35,8 @@ def blast( json=False, save=False, ): - """ - BLAST a nucleotide or amino acid sequence against any BLAST DB. + """BLAST a nucleotide or amino acid sequence against any BLAST DB. + Args: - sequence Sequence (str) or path to FASTA file. (If more than one sequence in FASTA file, only the first will be submitted to BLAST.) @@ -91,16 +91,12 @@ def blast( _, seqs = read_fasta(sequence) else: - raise ValueError( - "File format not recognized. gget BLAST currently only supports '.txt' or '.fa' files. " - ) + raise ValueError("File format not recognized. gget BLAST currently only supports '.txt' or '.fa' files. ") # Set the first sequence from the fasta file as 'sequence' sequence = seqs[0] if len(seqs) > 1: - logger.warning( - "File contains more than one sequence. Only the first sequence will be submitted to BLAST." - ) + logger.warning("File contains more than one sequence. Only the first sequence will be submitted to BLAST.") # Convert sequence to upper case sequence = sequence.upper() @@ -134,16 +130,12 @@ def blast( else: # Check if the user specified database is valid if database not in dbs: - raise ValueError( - f"Database specified is {database}. Expected one of: {', '.join(dbs)}" - ) + raise ValueError(f"Database specified is {database}. Expected one of: {', '.join(dbs)}") else: if verbose: logger.info("Sequence recognized as nucleotide sequence.") - logger.info( - "BLAST will use program 'blastn' with user-specified database." 
- ) + logger.info("BLAST will use program 'blastn' with user-specified database.") # If sequence is an amino acid sequence, set program to blastp elif set(sequence) <= amino_acids: program = "blastp" @@ -157,47 +149,39 @@ def blast( else: # Check if the user specified database is valid if database not in dbs: - raise ValueError( - f"Database specified is {database}. Expected one of: {', '.join(dbs)}" - ) + raise ValueError(f"Database specified is {database}. Expected one of: {', '.join(dbs)}") else: if verbose: logger.info("Sequence recognized as amino acid sequence.") - logger.info( - "BLAST will use program 'blastp' with user-specified database." - ) + logger.info("BLAST will use program 'blastp' with user-specified database.") else: raise ValueError( f""" Sequence not automatically recognized as a nucleotide or amino acid sequence. Please specify 'program' and 'database'. - Program options: {', '.join(programs)} - Database options: {', '.join(dbs)} + Program options: {", ".join(programs)} + Database options: {", ".join(dbs)} """ ) else: # Check if the user specified program is valid if program not in programs: - raise ValueError( - f"Program specified is {program}. Expected one of: {', '.join(programs)}" - ) + raise ValueError(f"Program specified is {program}. Expected one of: {', '.join(programs)}") # Ask user to also specify database if database == "default": raise ValueError( f""" - User-specified program requires user-specified database. Please also specify argument 'database'. - Database options: {', '.join(dbs)} + User-specified program requires user-specified database. Please also specify argument 'database'. + Database options: {", ".join(dbs)} """ ) else: # Check if the user specified database is valid if database not in dbs: - raise ValueError( - f"Database specified is {database}. Expected one of: {', '.join(dbs)}" - ) + raise ValueError(f"Database specified is {database}. 
Expected one of: {', '.join(dbs)}") ## Translate filter arguments if low_comp_filt is False: @@ -246,14 +230,12 @@ def blast( if RTOE < 11: # Communicate RTOE if verbose: - logger.info(f"BLAST initiated. Estimated time to completion: 11 seconds.") + logger.info("BLAST initiated. Estimated time to completion: 11 seconds.") time.sleep(11) else: # Communicate RTOE if verbose: - logger.info( - f"BLAST initiated with search ID {RID}. Estimated time to completion: {RTOE} seconds." - ) + logger.info(f"BLAST initiated with search ID {RID}. Estimated time to completion: {RTOE} seconds.") time.sleep(int(RTOE)) ## Poll server for status and fetch search results @@ -295,9 +277,7 @@ def blast( continue elif status == "FAILED": - logger.error( - f"Search {RID} failed; please try again and/or report to blast-help@ncbi.nlm.nih.gov." - ) + logger.error(f"Search {RID} failed; please try again and/or report to blast-help@ncbi.nlm.nih.gov.") return elif status == "UNKNOWN": @@ -314,11 +294,7 @@ def blast( # Parse HTML results soup = BeautifulSoup(results, "html.parser") # Get the descriptions table - dsc_table = soup.find( - lambda tag: tag.name == "table" - and tag.has_attr("id") - and tag["id"] == "dscTable" - ) + dsc_table = soup.find(lambda tag: tag.name == "table" and tag.has_attr("id") and tag["id"] == "dscTable") if dsc_table is None: logger.error( diff --git a/gget/gget_blat.py b/gget/gget_blat.py index 5d08200cc..0cb5a4aed 100644 --- a/gget/gget_blat.py +++ b/gget/gget_blat.py @@ -1,11 +1,12 @@ import json as json_package import time from json.decoder import JSONDecodeError -import pandas as pd from urllib import request from urllib.error import HTTPError, URLError -from .utils import set_up_logger, read_fasta +import pandas as pd + +from .utils import read_fasta, set_up_logger logger = set_up_logger() @@ -22,8 +23,7 @@ def blat( save=False, verbose=True, ): - """ - BLAT a nucleotide or amino acid sequence against any BLAT UCSC assembly. 
+ """BLAT a nucleotide or amino acid sequence against any BLAT UCSC assembly. Args: - sequence Sequence (str) or path to fasta file containing one sequence. @@ -38,7 +38,6 @@ def blat( Returns a data frame with the BLAT results. """ - ## Clean up sequence # If the path to a fasta file was provided instead of a nucleotide sequence, # read the file and extract the first sequence @@ -47,24 +46,18 @@ def blat( _, seqs = read_fasta(sequence) else: - raise ValueError( - "File format not recognized. gget BLAT currently only supports '.txt' or '.fa' files. " - ) + raise ValueError("File format not recognized. gget BLAT currently only supports '.txt' or '.fa' files. ") # Set the first sequence from the fasta file as 'sequence' sequence = seqs[0] if len(seqs) > 1: if verbose: - logger.info( - "File contains more than one sequence. Only the first sequence will be submitted to BLAT." - ) + logger.info("File contains more than one sequence. Only the first sequence will be submitted to BLAT.") # Shorten sequence to length limit if necessary if len(sequence) > 8000: if verbose: - logger.info( - "Length of sequence is > 8000. Only the fist 8000 characters will be submitted to BLAT." - ) + logger.info("Length of sequence is > 8000. Only the fist 8000 characters will be submitted to BLAT.") sequence = sequence[:8000] # Convert sequence to upper case @@ -85,33 +78,27 @@ def blat( if set(sequence) <= nucleotides: seqtype = "DNA" if verbose: - logger.info( - f"Sequence recognized as nucleotide sequence. 'seqtype' will be set as {seqtype}." - ) + logger.info(f"Sequence recognized as nucleotide sequence. 'seqtype' will be set as {seqtype}.") # If sequence is an amino acid sequence, set seqtype to protein elif set(sequence) <= amino_acids: seqtype = "protein" if verbose: - logger.info( - f"Sequence recognized as amino acid sequence. 'seqtype' will be set as {seqtype}." - ) + logger.info(f"Sequence recognized as amino acid sequence. 
'seqtype' will be set as {seqtype}.") else: raise ValueError( f""" Sequence not automatically recognized as a nucleotide or amino acid sequence. Please specify 'seqtype'. - Seqtype options: {', '.join(seqtypes)} + Seqtype options: {", ".join(seqtypes)} """ ) else: # Check if the user specified seqtype is valid if seqtype not in seqtypes: - raise ValueError( - f"Seqtype specified is {seqtype}. Expected one of {', '.join(seqtypes)}" - ) + raise ValueError(f"Seqtype specified is {seqtype}. Expected one of {', '.join(seqtypes)}") ## Set assembly # Note: If assembly not found, defaults to hg38 @@ -133,17 +120,13 @@ def blat( if len(results["blat"]) == 0: if verbose: - logger.info( - f"No {seqtype} BLAT matches were found for this sequence in genome {results['genome']}." - ) + logger.info(f"No {seqtype} BLAT matches were found for this sequence in genome {results['genome']}.") return # Let user know if assembly was not found # If this is the case, BLAT automatically defaults to human (hg38) if results["genome"] != database: - logger.warning( - f"Assembly {database} not recognized. Defaulted to {results['genome']} instead." - ) + logger.warning(f"Assembly {database} not recognized. Defaulted to {results['genome']} instead.") ## Build data frame to resemble BLAT web search results # Define dataframe from dictionary @@ -153,7 +136,7 @@ def blat( df_dict.update({field: []}) for blat_result_list in results["blat"]: - for field, (i, result) in zip(results["fields"], enumerate(blat_result_list)): + for field, (_i, result) in zip(results["fields"], enumerate(blat_result_list), strict=False): df_dict[field].append(result) df = pd.DataFrame(df_dict) @@ -222,9 +205,9 @@ class _RetryableBlatError(Exception): def _fetch_blat_results(url, seqtype, database): - """ - Submit a BLAT request to UCSC and return the parsed JSON dict, or None - on a non-recoverable failure. 
Retries transient failures (5xx, network + """Submit a BLAT request to UCSC and return the parsed JSON dict, or None on a non-recoverable failure. + + Retries transient failures (5xx, network errors, non-JSON responses from rate-limiting / HTML error pages) with exponential backoff. The legacy "sequence too short or assembly invalid" message is replaced with the actual server response so failures are @@ -239,8 +222,7 @@ def _fetch_blat_results(url, seqtype, database): if attempt < _BLAT_MAX_ATTEMPTS: delay = _BLAT_BACKOFF_BASE_SECONDS * (2 ** (attempt - 1)) logger.warning( - f"BLAT attempt {attempt}/{_BLAT_MAX_ATTEMPTS} failed ({last_error}). " - f"Retrying in {delay:.1f}s." + f"BLAT attempt {attempt}/{_BLAT_MAX_ATTEMPTS} failed ({last_error}). Retrying in {delay:.1f}s." ) time.sleep(delay) @@ -278,10 +260,7 @@ def _fetch_blat_attempt(url, seqtype, database): code = r.getcode() if code != 200: - raise RuntimeError( - f"HTTP response status code {code}. " - "Please double-check arguments and try again.\n" - ) + raise RuntimeError(f"HTTP response status code {code}. Please double-check arguments and try again.\n") raw = r.read() try: @@ -290,13 +269,13 @@ def _fetch_blat_attempt(url, seqtype, database): preview = _preview_bytes(raw) # Non-JSON from a 200 response is almost always an HTML error / throttle # page from UCSC, which is worth retrying. 
- raise _RetryableBlatError(f"non-JSON response: {preview!r}") + raise _RetryableBlatError(f"non-JSON response: {preview!r}") from None def _safe_read_preview(response, limit=300): try: return _preview_bytes(response.read(), limit=limit) - except Exception: + except Exception: # noqa: BLE001 return "" diff --git a/gget/gget_cbio.py b/gget/gget_cbio.py index 427531656..ec70f7597 100644 --- a/gget/gget_cbio.py +++ b/gget/gget_cbio.py @@ -4,19 +4,18 @@ import math import os import subprocess -import pandas as pd +from collections import OrderedDict, defaultdict + +import matplotlib.pyplot as plt import numpy as np +import pandas as pd import requests +from matplotlib.colors import BoundaryNorm, ListedColormap, TwoSlopeNorm from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry -from collections import defaultdict, OrderedDict - -from .utils import set_up_logger - -import matplotlib.pyplot as plt -from matplotlib.colors import ListedColormap, BoundaryNorm, TwoSlopeNorm from .constants import CBIO_CANCER_TYPE_TO_TISSUE_DICTIONARY +from .utils import set_up_logger logger = set_up_logger() @@ -29,8 +28,8 @@ def _ints_between(start, end, max_count, min_count, verbose=False): - """ - Generate a list of integers between start and end (inclusive) with a maximum count of max_count and a minimum count min_count. + """Generate a list of integers between start and end (inclusive) with a maximum count of max_count and a minimum count min_count. + The list is guaranteed to contain start and end, and the spacing between the numbers will be as even as possible. If a perfect spacing is not possible, the spacing will omit a number rather than overcrowding. @@ -68,12 +67,10 @@ def _ints_between(start, end, max_count, min_count, verbose=False): def _describe_bytes(size): - """ - Describe a size in bytes in human-readable format. + """Describe a size in bytes in human-readable format. 
:param size: size in bytes """ - steps = ["bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"] unit = steps.pop(0) @@ -88,8 +85,7 @@ def _describe_bytes(size): def _download_file_from_git_lfs(target_path: str, oid: str, size: int, verbose=False): - """ - Download a single object from Git LFS. + """Download a single object from Git LFS. :param target_path: path to save the downloaded object :param oid: object ID @@ -105,7 +101,7 @@ def _download_file_from_git_lfs(target_path: str, oid: str, size: int, verbose=F lfs_metadata_json = json.dumps(lfs_metadata) try: - github_url = f"https://github.com/cBioPortal/datahub.git/info/lfs/objects/batch" + github_url = "https://github.com/cBioPortal/datahub.git/info/lfs/objects/batch" curl_command = [ "curl", @@ -142,7 +138,7 @@ def _download_file_from_git_lfs(target_path: str, oid: str, size: int, verbose=F if verbose: logger.info(f"Downloaded object {oid} to {target_path}") - except Exception as e: + except Exception as e: # noqa: BLE001 logger.error(f"Error downloading object {oid} to {target_path}: {e}") return False @@ -164,8 +160,7 @@ def add(self, target_path: str, oid: str, size: int): self.objects.append((target_path, (oid, size))) def download(self) -> bool: - """ - Download all objects in the plan. + """Download all objects in the plan. :return: True if all objects were downloaded successfully, False otherwise """ @@ -183,8 +178,7 @@ def download_cbioportal_data( out_dir=None, confirm_download=False, ) -> bool: - """ - Download data from cBioPortal studies. + """Download data from cBioPortal studies. 
Args: @@ -197,7 +191,6 @@ def download_cbioportal_data( :return: True if successfully downloaded all needed data, False otherwise """ - actual_out_dir = os.path.abspath(out_dir or "gget_cbio_cache") os.makedirs(actual_out_dir, exist_ok=True) @@ -237,9 +230,7 @@ def download_cbioportal_data( response = session.get(url, timeout=30) if not response.ok: - logger.error( - f"Failed to download {file_type} data for study {study_id}" - ) + logger.error(f"Failed to download {file_type} data for study {study_id}") if file_type not in optional_file_types: success = False continue @@ -254,9 +245,9 @@ def download_cbioportal_data( v = v.strip() fields[k] = v - assert ( - fields["version"] == "https://git-lfs.github.com/spec/v1" - ), f"Cannot handle git-lfs version {fields['version']}" + assert fields["version"] == "https://git-lfs.github.com/spec/v1", ( + f"Cannot handle git-lfs version {fields['version']}" + ) oid: str = fields["oid"].split(":")[1].strip() size: int = int(fields["size"]) @@ -264,14 +255,10 @@ def download_cbioportal_data( if plan: plan.add(filename, oid, size) else: - success &= _download_file_from_git_lfs( - filename, oid, size, verbose=verbose - ) + success &= _download_file_from_git_lfs(filename, oid, size, verbose=verbose) - except Exception as e: - logger.error( - f"Error downloading {file_type} data for study {study_id}: {e}" - ) + except Exception as e: # noqa: BLE001 + logger.error(f"Error downloading {file_type} data for study {study_id}: {e}") success = False if verbose and not confirm_download: @@ -280,9 +267,7 @@ def download_cbioportal_data( # If using a download plan AND there are actually objects to download, ask for confirmation if plan and plan.objects: do_download = ( - input( - f"Do you want to download {_describe_bytes(plan.total_size)} to {actual_out_dir}? (y/n) " - ) + input(f"Do you want to download {_describe_bytes(plan.total_size)} to {actual_out_dir}? 
(y/n) ") .lower() .strip() == "y" @@ -306,8 +291,7 @@ def _extract_study_name(name: str) -> str: def cbio_search(key_words): - """ - Find cBioPortal study IDs by keyword. + """Find cBioPortal study IDs by keyword. Args: key_words list of keywords to search for - use tissues related to tissue or cancer type of interest (e.g., esophag, ovarian, etc) @@ -316,13 +300,12 @@ def cbio_search(key_words): :return: list of study IDs that match the keywords """ - try: from bravado.client import SwaggerClient except ImportError: logger.error( """ - Some third-party dependencies are missing. Please run the following command: + Some third-party dependencies are missing. Please run the following command: >>> gget.setup('cbio') or $ gget setup cbio Alternative: Install the bravado package using pip (https://pypi.org/project/bravado). @@ -348,18 +331,14 @@ def cbio_search(key_words): studies = api.Studies.getAllStudiesUsingGET().result() cancer_type_acronym_dict = { - _extract_study_name(individual_study["name"]): individual_study["cancerTypeId"] - for individual_study in studies + _extract_study_name(individual_study["name"]): individual_study["cancerTypeId"] for individual_study in studies } cancer_type_acronym_dict = OrderedDict(sorted(cancer_type_acronym_dict.items())) cancer_id_list = [ cancer_type_acronym for cancer_type, cancer_type_acronym in cancer_type_acronym_dict.items() - if any( - key_word in cancer_type.lower() or key_word in cancer_type_acronym.lower() - for key_word in key_words - ) + if any(key_word in cancer_type.lower() or key_word in cancer_type_acronym.lower() for key_word in key_words) and cancer_type_acronym.lower() != "mixed" ] @@ -383,7 +362,7 @@ def _get_ensembl_gene_id(transcript_id: str, verbose=False): data = response.json() return data.get("Parent") - except Exception as e: + except Exception: # noqa: BLE001 if verbose: print(f"Error for: {transcript_id}") return "Unknown" @@ -394,7 +373,7 @@ def _get_ensembl_gene_id_bulk(transcript_ids): return {} 
try: - url = f"https://rest.ensembl.org/lookup/id/" + url = "https://rest.ensembl.org/lookup/id/" response = requests.post( url, json={"ids": transcript_ids}, @@ -407,9 +386,7 @@ def _get_ensembl_gene_id_bulk(transcript_ids): data = response.json() return { - transcript_id: data[transcript_id].get("Parent") - for transcript_id in transcript_ids - if data[transcript_id] + transcript_id: data[transcript_id].get("Parent") for transcript_id in transcript_ids if data[transcript_id] } except Exception as e: logger.error(f"Failed to fetch gene IDs from Ensembl: {e}") @@ -421,29 +398,21 @@ def _get_ensembl_gene_name_bulk(gene_ids): return {} try: - url = f"https://rest.ensembl.org/lookup/id/" - response = requests.post( - url, json={"ids": gene_ids}, headers={"Content-Type": "application/json"} - ) + url = "https://rest.ensembl.org/lookup/id/" + response = requests.post(url, json={"ids": gene_ids}, headers={"Content-Type": "application/json"}) if not response.ok: response.raise_for_status() data = response.json() - return { - gene_id: data[gene_id].get("display_name") - for gene_id in gene_ids - if data[gene_id] - } + return {gene_id: data[gene_id].get("display_name") for gene_id in gene_ids if data[gene_id]} except Exception as e: logger.error(f"Failed to fetch gene names from Ensembl: {e}") raise e -def _get_valid_ensembl_gene_id( - row, transcript_column: str = "seq_ID", gene_column: str = "gene_name" -): +def _get_valid_ensembl_gene_id(row, transcript_column: str = "seq_ID", gene_column: str = "gene_name"): ensembl_gene_id = _get_ensembl_gene_id(row[transcript_column]) if ensembl_gene_id == "Unknown": return row[gene_column] @@ -451,7 +420,7 @@ def _get_valid_ensembl_gene_id( def _get_valid_ensembl_gene_id_bulk(df: pd.DataFrame): - map_: Optional[dict[str, str]] = None + map_: dict[str, str] | None = None def f( row: pd.Series, @@ -495,17 +464,13 @@ def __init__( ensembl_transcripts = [gene for gene in genes if gene.startswith("ENST")] map_ = { - k: v - for k, v in 
_get_ensembl_gene_id_bulk(ensembl_transcripts).items() - if v != "Unknown" and v is not None + k: v for k, v in _get_ensembl_gene_id_bulk(ensembl_transcripts).items() if v != "Unknown" and v is not None } genes = [map_.get(gene, gene) for gene in genes] ensembl_gene_ids = [gene for gene in genes if gene.startswith("ENSG")] map_ = { - k: v - for k, v in _get_ensembl_gene_name_bulk(ensembl_gene_ids).items() - if v != "Unknown" and v is not None + k: v for k, v in _get_ensembl_gene_name_bulk(ensembl_gene_ids).items() if v != "Unknown" and v is not None } self.genes = [map_.get(gene, gene) for gene in genes] @@ -529,9 +494,7 @@ def __init__( "Entrez_Gene_Id", "Consequence", ] - self.column_for_merging: str = ( - "Hugo_Symbol" if self.merge_type == _SYMBOL else "Ensembl_Gene_ID" - ) + self.column_for_merging: str = "Hugo_Symbol" if self.merge_type == _SYMBOL else "Ensembl_Gene_ID" self.df_collection = {} self.big_combined_df = self._create_study_dataframes() @@ -540,9 +503,7 @@ def _create_single_study_dataframe(self, study_id: str) -> pd.DataFrame: data_folder = os.path.join(self.data_dir, study_id) mutation_df = pd.read_csv(os.path.join(data_folder, "mutations.txt"), sep="\t") - sample_df = pd.read_csv( - os.path.join(data_folder, "clinical_sample.txt"), sep="\t" - ) + sample_df = pd.read_csv(os.path.join(data_folder, "clinical_sample.txt"), sep="\t") self.df_collection[study_id]["mutations"] = mutation_df self.df_collection[study_id]["samples"] = sample_df @@ -556,13 +517,8 @@ def _create_single_study_dataframe(self, study_id: str) -> pd.DataFrame: ): mutation_df.rename(columns={"Gene": "Ensembl_Gene_ID"}, inplace=True) if self.remove_non_ensembl_genes: - mutation_df = mutation_df[ - mutation_df["Ensembl_Gene_ID"].str.startswith("ENSG") - ] - elif ( - "Transcript_ID" in mutation_df.columns - and mutation_df["Transcript_ID"].str.startswith("ENST").any() - ): + mutation_df = mutation_df[mutation_df["Ensembl_Gene_ID"].str.startswith("ENSG")] + elif "Transcript_ID" in 
mutation_df.columns and mutation_df["Transcript_ID"].str.startswith("ENST").any(): logger.info("Fetching gene IDs from Ensembl") mutation_df["Ensembl_Gene_ID"] = mutation_df.progress_apply( _get_valid_ensembl_gene_id_bulk(mutation_df), @@ -571,14 +527,10 @@ def _create_single_study_dataframe(self, study_id: str) -> pd.DataFrame: gene_column="Hugo_Symbol", ) if self.remove_non_ensembl_genes: - mutation_df = mutation_df[ - mutation_df["Ensembl_Gene_ID"].str.startswith("ENSG") - ] + mutation_df = mutation_df[mutation_df["Ensembl_Gene_ID"].str.startswith("ENSG")] else: self.merge_type = _SYMBOL - logger.warn( - "No Ensembl gene IDs found in the mutation data. Merging on gene symbol instead." - ) + logger.warn("No Ensembl gene IDs found in the mutation data. Merging on gene symbol instead.") def join_unique_string_values(series): if series.isnull().all(): @@ -613,9 +565,7 @@ def join_unique_string_values(series): self.columns_to_keep.remove("Entrez_Gene_Id") aggregated_df = ( - mutation_df.groupby(["Tumor_Sample_Barcode", self.column_for_merging]) - .agg(aggregation_dict) - .reset_index() + mutation_df.groupby(["Tumor_Sample_Barcode", self.column_for_merging]).agg(aggregation_dict).reset_index() ) if self.column_for_merging not in self.columns_to_keep: @@ -643,9 +593,7 @@ def join_unique_string_values(series): self.df_collection[study_id]["cna"] = cna_df # Exclude 'Hugo_Symbol' column - columns_to_transform = self.df_collection[study_id][ - "cna" - ].columns.difference(["Hugo_Symbol"]) + columns_to_transform = self.df_collection[study_id]["cna"].columns.difference(["Hugo_Symbol"]) # Apply binary transformation to the selected columns df_binary = self.df_collection[study_id]["cna"][columns_to_transform].map( @@ -653,9 +601,7 @@ def join_unique_string_values(series): ) # Add 'Hugo_Symbol' column back to the DataFrame - df_binary.insert( - 0, "Hugo_Symbol", self.df_collection[study_id]["cna"]["Hugo_Symbol"] - ) + df_binary.insert(0, "Hugo_Symbol", 
self.df_collection[study_id]["cna"]["Hugo_Symbol"]) # Reassign the transformed DataFrame to the collection self.df_collection[study_id]["cna_binary"] = df_binary @@ -690,22 +636,14 @@ def join_unique_string_values(series): melted_sv = melted_sv.drop_duplicates(subset=["Sample_Id", "Hugo_Symbol"]) # Count the occurrences of each Hugo_Symbol in each Sample_Id - sv_occurrences = ( - melted_sv.groupby(["Hugo_Symbol", "Sample_Id"]) - .size() - .reset_index(name="sv_occurrences") - ) + sv_occurrences = melted_sv.groupby(["Hugo_Symbol", "Sample_Id"]).size().reset_index(name="sv_occurrences") # Rename columns to match the desired output - sv_occurrences = sv_occurrences.rename( - columns={"Sample_Id": "Tumor_Sample_Barcode"} - ) + sv_occurrences = sv_occurrences.rename(columns={"Sample_Id": "Tumor_Sample_Barcode"}) final_df = pd.merge( final_df, - sv_occurrences[ - ["Hugo_Symbol", "Tumor_Sample_Barcode", "sv_occurrences"] - ], + sv_occurrences[["Hugo_Symbol", "Tumor_Sample_Barcode", "sv_occurrences"]], on=["Hugo_Symbol", "Tumor_Sample_Barcode"], how="outer", ) @@ -719,15 +657,13 @@ def join_unique_string_values(series): elif "SAMPLE_ID" in sample_df.columns: sample_identifier_column = "SAMPLE_ID" else: - raise AssertionError( - "Sample Identifier column not found in the sample dataframe" - ) + raise AssertionError("Sample Identifier column not found in the sample dataframe") columns_to_merge = [sample_identifier_column, "Cancer Type", "Cancer Type Detailed"] for column in columns_to_merge: if column not in sample_df.columns: columns_to_merge.remove(column) - + final_df = pd.merge( final_df, sample_df[columns_to_merge], @@ -746,9 +682,7 @@ def join_unique_string_values(series): ) final_df["tissue"] = ( - final_df["cancer_type"] - .map(CBIO_CANCER_TYPE_TO_TISSUE_DICTIONARY) - .fillna("unclassified") + final_df["cancer_type"].map(CBIO_CANCER_TYPE_TO_TISSUE_DICTIONARY).fillna("unclassified") ) # Drop the redundant SAMPLE_ID column @@ -766,7 +700,7 @@ def 
_create_study_dataframes(self) -> pd.DataFrame: # clean up data just in case (cut out comments) filename = f"{self.data_dir}/{study_id}/mutations.txt" - with open(filename, "r") as file: + with open(filename) as file: lines = file.readlines() changed = False @@ -782,7 +716,7 @@ def _create_study_dataframes(self) -> pd.DataFrame: final_df = self._create_single_study_dataframe(study_id=study_id) dataframes.append(final_df) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.error(f"Error processing study {study_id}: {e}") continue @@ -800,9 +734,9 @@ def plot_heatmap( figure_title=None, ): if variation_type == "cna_nonbinary" or variation_type == "Consequence": - assert ( - stratification == "sample" - ), "Stratification must be 'sample' for cna_nonbinary and Consequence variations" + assert stratification == "sample", ( + "Stratification must be 'sample' for cna_nonbinary and Consequence variations" + ) if variation_type != "cna_nonbinary": simple_merge_by_stratification: dict[str, list[str]] = { @@ -823,9 +757,7 @@ def plot_heatmap( if filter_category is None: # no filtering final_df = self.big_combined_df else: - final_df = self.big_combined_df[ - self.big_combined_df[filter_category] == filter_value - ] + final_df = self.big_combined_df[self.big_combined_df[filter_category] == filter_value] merge_on = list(set(merge_on).intersection(final_df.columns)) @@ -843,9 +775,7 @@ def plot_heatmap( unique_samples_info = final_df[available_cols].drop_duplicates() - hugo_mask = final_df["Hugo_Symbol"].isin( - [gene for gene in (self.genes) if not gene.startswith("ENSG")] - ) + hugo_mask = final_df["Hugo_Symbol"].isin([gene for gene in (self.genes) if not gene.startswith("ENSG")]) if self.merge_type == _ENSEMBL: ensg_mask = final_df["Ensembl_Gene_ID"].isin( @@ -868,16 +798,12 @@ def plot_heatmap( else: raise AssertionError(f"Invalid merge type: {self.merge_type}") - unexpressed_genes = [ - gene for gene in (self.genes) if gene not in existing_genes - ] + 
unexpressed_genes = [gene for gene in (self.genes) if gene not in existing_genes] # Get all unique Tumor_Sample_Barcode from the original DataFrame all_samples = final_df[merge_on].drop_duplicates() - all_samples = pd.merge( - all_samples, unique_samples_info, on=merge_on, how="left" - ) + all_samples = pd.merge(all_samples, unique_samples_info, on=merge_on, how="left") if variation_type not in columns_to_keep_copy: columns_to_keep_copy.append(variation_type) @@ -889,10 +815,7 @@ def plot_heatmap( "cancer_type_detailed", ] for column_name in must_keep: - if ( - column_name in merge_on - and column_name not in columns_to_keep_copy - ): + if column_name in merge_on and column_name not in columns_to_keep_copy: columns_to_keep_copy.append(column_name) # Merge the filtered genes DataFrame with all samples to ensure all samples are included @@ -906,11 +829,7 @@ def plot_heatmap( if stratification != "sample": # no filtering df_for_heatmap_very_final: pd.DataFrame = ( - merged_df.groupby([self.column_for_merging, stratification])[ - variation_type - ] - .sum() - .reset_index() + merged_df.groupby([self.column_for_merging, stratification])[variation_type].sum().reset_index() ) else: df_for_heatmap_very_final: pd.DataFrame = merged_df @@ -951,9 +870,7 @@ def plot_heatmap( pivot_df1 = pivot_df1[sorted_columns] if unexpressed_genes: - new_rows = pd.DataFrame( - np.nan, index=unexpressed_genes, columns=pivot_df1.columns - ) + new_rows = pd.DataFrame(np.nan, index=unexpressed_genes, columns=pivot_df1.columns) pivot_df1 = pd.concat([pivot_df1, new_rows]) title = f"Heatmap of Gene mutations per gene across {stratification}" @@ -967,12 +884,8 @@ def plot_heatmap( pivot_df1.rename(index=map_, inplace=True) else: # variation_type == "cna_nonbinary" - assert ( - stratification == "sample" - ), "stratification must be 'sample' for CNA data" - assert ( - filter_category == "study_id" - ), "filter_category must be 'study_id' for CNA data" + assert stratification == "sample", 
"stratification must be 'sample' for CNA data" + assert filter_category == "study_id", "filter_category must be 'study_id' for CNA data" pivot_df1 = self.df_collection[filter_value]["cna"].copy() pivot_df1.set_index("Hugo_Symbol", inplace=True) pivot_df1 = pivot_df1[pivot_df1.index.isin(self.genes)] @@ -990,13 +903,11 @@ def plot_heatmap( missing_genes = [g for g in self.genes if g not in existing] if missing_genes: new_rows = pd.DataFrame( - {col: np.nan for col in pivot_df1.columns}, + dict.fromkeys(pivot_df1.columns, np.nan), index=missing_genes, ) new_rows["Hugo_Symbol"] = missing_genes - pivot_df1 = pd.concat( - [pivot_df1, new_rows], ignore_index=True - ) + pivot_df1 = pd.concat([pivot_df1, new_rows], ignore_index=True) # Set 'Hugo_Symbol' back as index if needed pivot_df1 = pivot_df1.set_index("Hugo_Symbol") @@ -1016,9 +927,7 @@ def plot_heatmap( # limit to first 500 columns render_divider_lines = True render_column_ids = pivot_df1.shape[1] < 100 - if ( - pivot_df1.shape[1] > 372 - ): # 372 is fine, 373 is not. There's something wrong with pyplot... + if pivot_df1.shape[1] > 372: # 372 is fine, 373 is not. There's something wrong with pyplot... print("Warning: Too many columns to plot. 
Limiting to first 372 columns") pivot_df1 = pivot_df1.iloc[:, :372] render_divider_lines = False @@ -1030,12 +939,8 @@ def plot_heatmap( levels = list(range(min_value + 1, max_value + 1)) pivot_df1 = pivot_df1.fillna(min_value) - colors_list = plt.get_cmap("RdBu_r", max_value - min_value + 1)( - range(max_value - min_value + 1) - ) - colors_list = np.vstack( - ([[0.5, 0.5, 0.5, 0.3]], colors_list[1:]) - ) # Grey color for -3 + colors_list = plt.get_cmap("RdBu_r", max_value - min_value + 1)(range(max_value - min_value + 1)) + colors_list = np.vstack(([[0.5, 0.5, 0.5, 0.3]], colors_list[1:])) # Grey color for -3 cmap = ListedColormap(colors_list) # Define the norm with the diverging palette centered at 0 @@ -1044,16 +949,12 @@ def plot_heatmap( elif variation_type == "Consequence": consequences = list(self.big_combined_df["Consequence"].unique()) - colors_list = plt.get_cmap("tab20", len(consequences))( - range(len(consequences)) - ) + colors_list = plt.get_cmap("tab20", len(consequences))(range(len(consequences))) # if consequences contains nan, ensure the nan value is at the beginning if np.nan in consequences: colors_list = np.vstack(([[1.0, 1.0, 1.0, 0.3]], colors_list[:-1])) - consequences = [np.nan] + sorted( - v for v in consequences if not isinstance(v, float) - ) + consequences = [np.nan] + sorted(v for v in consequences if not isinstance(v, float)) else: consequences.sort() nas_present = False @@ -1068,9 +969,7 @@ def plot_heatmap( ) levels = list(range(min_value, max_value)) - string_to_int = { - consequence: i for i, consequence in enumerate(consequences) - } + string_to_int = {consequence: i for i, consequence in enumerate(consequences)} pivot_df1 = pivot_df1.map(lambda x: string_to_int[x]) @@ -1090,9 +989,7 @@ def plot_heatmap( # Create a custom colormap colors_list = plt.get_cmap("Reds", len(levels))(range(len(levels))) if nas_present: - colors_list = np.vstack( - ([[0.5, 0.5, 0.5, 0.3]], colors_list) - ) # Grey color for -1 + colors_list = 
np.vstack(([[0.5, 0.5, 0.5, 0.3]], colors_list)) # Grey color for -1 cmap = ListedColormap(colors_list) # Define the norm, with vmin set to -1 and vmax to max_value @@ -1197,8 +1094,7 @@ def cbio_plot( show=False, figure_title=None, ): - """ - Plot a heatmap of given genes in the given studies. + """Plot a heatmap of given genes in the given studies. Args: study_ids list of cBioPortal study IDs @@ -1226,9 +1122,7 @@ def cbio_plot( if verbose: logger.info("Downloading data") - if not download_cbioportal_data( - study_ids, verbose=verbose, out_dir=data_dir, confirm_download=confirm_download - ): + if not download_cbioportal_data(study_ids, verbose=verbose, out_dir=data_dir, confirm_download=confirm_download): logger.error("Failed to download data. Continuing with available studies") # return False diff --git a/gget/gget_cellxgene.py b/gget/gget_cellxgene.py index febaa9fc7..4b08bebad 100644 --- a/gget/gget_cellxgene.py +++ b/gget/gget_cellxgene.py @@ -16,12 +16,12 @@ def _listify(x): - """ - Return x as a 1-D list suitable for SOMA `in [...]` filters. + """Return x as a 1-D list suitable for SOMA `in [...]` filters. + - None -> None - "str" -> ["str"] - iterables -> list(iterable) - - scalars -> [scalar] + - scalars -> [scalar]. """ if x is None: return None @@ -35,8 +35,8 @@ def _listify(x): def _build_obs_filter(filters: dict, is_primary_data: bool): - """ - Build a SOMA obs value_filter string like: + """Build a SOMA obs value_filter string like: + "is_primary_data == True and tissue in ['lung'] and cell_type in ['muscle cell']" Only includes keys with non-empty values. """ @@ -81,8 +81,8 @@ def cellxgene( verbose=True, out=None, ): - """ - Query data from CZ CELLxGENE Discover (https://cellxgene.cziscience.com/) using the + """Query data from CZ CELLxGENE Discover (https://cellxgene.cziscience.com/) using the + CZ CELLxGENE Discover Census (https://github.com/chanzuckerberg/cellxgene-census). NOTE: Querying large datasets requires a large amount of RAM. 
Use the cell metadata attributes @@ -154,7 +154,7 @@ def cellxgene( "tissue_general", "tissue", "cell_type", - "disease" + "disease", ] # Check dependency @@ -163,7 +163,7 @@ def cellxgene( except ImportError: logger.error( """ - Some third-party dependencies are missing. Please run the following command: + Some third-party dependencies are missing. Please run the following command: >>> gget.setup('cellxgene') or $ gget setup cellxgene Alternative: Install the cellxgene-census package using pip (https://pypi.org/project/cellxgene-census). @@ -238,9 +238,7 @@ def cellxgene( var_value_filter = None if verbose: - logger.info( - "Fetching AnnData object from CZ CELLxGENE Discover. This might take a few minutes..." - ) + logger.info("Fetching AnnData object from CZ CELLxGENE Discover. This might take a few minutes...") with cellxgene_census.open_soma(census_version=census_version) as census: adata = cellxgene_census.get_anndata( census=census, diff --git a/gget/gget_cosmic.py b/gget/gget_cosmic.py index c96cc81a5..00bccf37f 100644 --- a/gget/gget_cosmic.py +++ b/gget/gget_cosmic.py @@ -1,32 +1,30 @@ -import requests -import pandas as pd -import subprocess +import base64 +import getpass +import gzip +import json as json_package import os import re -import json as json_package -import base64 import shutil +import subprocess import tarfile -import gzip -import getpass + +import pandas as pd # Constants -from .constants import COSMIC_GET_URL -from .utils import set_up_logger, get_latest_cosmic +from .utils import get_latest_cosmic, set_up_logger logger = set_up_logger() def is_valid_email(email): - """ - Check if an e-mail address is valid. 
- """ + """Check if an e-mail address is valid.""" email_pattern = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)") return re.match(email_pattern, email) is not None -def download_reference(download_link, tar_folder_path, file_path, verbose, email = None, password = None, unzip = False): +def download_reference(download_link, tar_folder_path, file_path, verbose, email=None, password=None, unzip=False): + """Download a COSMIC reference file using email/password authentication, extract the tar, and optionally unzip it.""" if not email: email = input("Please enter your COSMIC email: ") if not is_valid_email(email): @@ -56,19 +54,17 @@ def download_reference(download_link, tar_folder_path, file_path, verbose, email except json_package.JSONDecodeError: raise RuntimeError( "Failed to download file. Please double-check arguments (especially cosmic_version) and try again." - ) + ) from None try: true_download_url = response_data.get("url") except AttributeError: - raise AttributeError("Invalid username or password.") + raise AttributeError("Invalid username or password.") from None curl_command2 = ["curl", true_download_url, "--output", f"{tar_folder_path}.tar"] result2 = subprocess.run(curl_command2, capture_output=True, text=True) if result2.returncode != 0: - raise RuntimeError( - f"Failed to download file. Return code: {result2.returncode}\n{result2.stderr}" - ) + raise RuntimeError(f"Failed to download file. 
Return code: {result2.returncode}\n{result2.stderr}") with tarfile.open(f"{tar_folder_path}.tar", "r") as tar: tar.extractall(path=tar_folder_path) @@ -84,8 +80,17 @@ def download_reference(download_link, tar_folder_path, file_path, verbose, email def select_reference( - cosmic_project, reference_dir, grch_version, cosmic_version, verbose, email = None, password = None, unzip = True, overwrite = None + cosmic_project, + reference_dir, + grch_version, + cosmic_version, + verbose, + email=None, + password=None, + unzip=True, + overwrite=None, ): + """Resolve the download link and paths for the requested COSMIC project, then download and extract the database, returning the file path and overwrite flag.""" # if cosmic_project == "transcriptome": # download_link = f"https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted?path=grch{grch_version}/cosmic/v{cosmic_version}/Cosmic_Genes_Fasta_v{cosmic_version}_GRCh{grch_version}.tar&bucket=downloads" # tarred_folder = f"Cosmic_Genes_Fasta_v{cosmic_version}_GRCh{grch_version}" @@ -93,20 +98,16 @@ def select_reference( if cosmic_project == "cancer": if grch_version == 38: - logger.error( - "CancerMutationCensus data is only available for GRCh37. Define grch_version=37." - ) + logger.error("CancerMutationCensus data is only available for GRCh37. 
Define grch_version=37.") download_link = f"https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted?path=grch{grch_version}/cmc/v{cosmic_version}/CancerMutationCensus_AllData_Tsv_v{cosmic_version}_GRCh{grch_version}.tar&bucket=downloads" - tarred_folder = ( - f"CancerMutationCensus_AllData_Tsv_v{cosmic_version}_GRCh{grch_version}" - ) - contained_file = ( - f"CancerMutationCensus_AllData_v{cosmic_version}_GRCh{grch_version}.tsv" - ) + tarred_folder = f"CancerMutationCensus_AllData_Tsv_v{cosmic_version}_GRCh{grch_version}" + contained_file = f"CancerMutationCensus_AllData_v{cosmic_version}_GRCh{grch_version}.tsv" if str(cosmic_version) == "100": # special treatment due to v2 download_link = download_link.replace(".tar&bucket=downloads", "_v2.tar&bucket=downloads") tarred_folder += "_v2" - if str(cosmic_version) == "99" or str(cosmic_version) == "100": # special treatment due to link difference - path=GRCh37 instead of path=grch37 + if ( + str(cosmic_version) == "99" or str(cosmic_version) == "100" + ): # special treatment due to link difference - path=GRCh37 instead of path=grch37 download_link = download_link.replace(f"path=grch{grch_version}", f"path=GRCh{grch_version}") elif cosmic_project == "cell_line": @@ -121,21 +122,13 @@ def select_reference( elif cosmic_project == "resistance": download_link = f"https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted?path=grch{grch_version}/cosmic/v{cosmic_version}/Cosmic_ResistanceMutations_Tsv_v{cosmic_version}_GRCh{grch_version}.tar&bucket=downloads" - tarred_folder = ( - f"Cosmic_ResistanceMutations_Tsv_v{cosmic_version}_GRCh{grch_version}" - ) - contained_file = ( - f"Cosmic_ResistanceMutations_v{cosmic_version}_GRCh{grch_version}.tsv" - ) + tarred_folder = f"Cosmic_ResistanceMutations_Tsv_v{cosmic_version}_GRCh{grch_version}" + contained_file = f"Cosmic_ResistanceMutations_v{cosmic_version}_GRCh{grch_version}.tsv" elif cosmic_project == "genome_screen": download_link = 
f"https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted?path=grch{grch_version}/cosmic/v{cosmic_version}/Cosmic_GenomeScreensMutant_Tsv_v{cosmic_version}_GRCh{grch_version}.tar&bucket=downloads" - tarred_folder = ( - f"Cosmic_GenomeScreensMutant_Tsv_v{cosmic_version}_GRCh{grch_version}" - ) - contained_file = ( - f"Cosmic_GenomeScreensMutant_v{cosmic_version}_GRCh{grch_version}.tsv" - ) + tarred_folder = f"Cosmic_GenomeScreensMutant_Tsv_v{cosmic_version}_GRCh{grch_version}" + contained_file = f"Cosmic_GenomeScreensMutant_v{cosmic_version}_GRCh{grch_version}.tsv" elif cosmic_project == "targeted_screen": download_link = f"https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted?path=grch{grch_version}/cosmic/v{cosmic_version}/Cosmic_CompleteTargetedScreensMutant_Tsv_v{cosmic_version}_GRCh{grch_version}.tar&bucket=downloads" @@ -178,7 +171,7 @@ def select_reference( f"{tar_folder_path}.tar", download_link, ] - result = subprocess.run(curl_command, capture_output=True, text=True) + subprocess.run(curl_command, capture_output=True, text=True) with tarfile.open(f"{tar_folder_path}.tar", "r") as tar: tar.extractall(path=tar_folder_path) @@ -198,18 +191,20 @@ def select_reference( .lower() ) if proceed in ["yes", "y"]: - download_reference(download_link, tar_folder_path, file_path, verbose, email = email, password = password, unzip = unzip) + download_reference( + download_link, tar_folder_path, file_path, verbose, email=email, password=password, unzip=unzip + ) else: raise KeyboardInterrupt( - f"Database download canceled. Learn more about COSMIC at https://cancer.sanger.ac.uk/cosmic/download/cosmic." + "Database download canceled. Learn more about COSMIC at https://cancer.sanger.ac.uk/cosmic/download/cosmic." ) return file_path, overwrite def make_exact_match_mask(df, searchterm_lower, cols_to_check): - """ - Build a boolean mask for rows where any of the specified columns match the search term exactly. 
+ """Build a boolean mask for rows where any of the specified columns match the search term exactly. + Handles special case for ACCESSION_NUMBER to match both with and without version. Allows for columns in cols_to_check to be missing from the DataFrame. """ @@ -251,9 +246,7 @@ def make_exact_match_mask(df, searchterm_lower, cols_to_check): def query_local_cosmic(cosmic_tsv_path, cosmic_project, searchterm, limit): - """ - Search the local COSMIC mutation census file for matching entries. - """ + """Search the local COSMIC mutation census file for matching entries.""" df = pd.read_csv(cosmic_tsv_path, sep="\t", low_memory=False) searchterm_lower = searchterm.lower() results = [] @@ -261,25 +254,22 @@ def query_local_cosmic(cosmic_tsv_path, cosmic_project, searchterm, limit): def match_and_limit(mask, extract_fn): for _, row in df[mask].head(limit).iterrows(): results.append(extract_fn(row)) - - if cosmic_project in ["cancer", "cancer_example"]: + if cosmic_project in ["cancer", "cancer_example"]: # Columns to check for search term cols_to_check = [ - "GENE_NAME", - "ACCESSION_NUMBER", - "LEGACY_MUTATION_ID", - "Mutation CDS", + "GENE_NAME", + "ACCESSION_NUMBER", + "LEGACY_MUTATION_ID", + "Mutation CDS", "Mutation AA", - "GENOMIC_MUTATION_ID" - ] + "GENOMIC_MUTATION_ID", + ] mask = make_exact_match_mask(df, searchterm_lower, cols_to_check) - match_and_limit(mask, lambda row: { - col.replace(" ", "_"): row[col] - for col in row.index - if not col.startswith("__") - }) + match_and_limit( + mask, lambda row: {col.replace(" ", "_"): row[col] for col in row.index if not col.startswith("__")} + ) elif cosmic_project in ["census", "resistance", "cell_line", "genome_screen", "targeted_screen", "other"]: # Columns to check for search term @@ -292,24 +282,24 @@ def match_and_limit(mask, extract_fn): "GENOMIC_MUTATION_ID", "LEGACY_MUTATION_ID", "SAMPLE_NAME", - "MUTATION_CDS", - "MUTATION_AA", + "MUTATION_CDS", + "MUTATION_AA", "MUTATION_ID", - "COSMIC_STUDY_ID" - ] + 
"COSMIC_STUDY_ID", + ] mask = make_exact_match_mask(df, searchterm_lower, cols_to_check) - match_and_limit(mask, lambda row: { - col.replace(" ", "_"): row[col] - for col in row.index - if not col.startswith("__") - }) + match_and_limit( + mask, lambda row: {col.replace(" ", "_"): row[col] for col in row.index if not col.startswith("__")} + ) else: raise ValueError(f"Unsupported cosmic_project: {cosmic_project}") - + if len(results) == 0: - raise ValueError(f"No results were found for searchterm '{searchterm}' and cosmic_project '{cosmic_project}' in COSMIC database file (cosmic_tsv_path) '{cosmic_tsv_path}'.") + raise ValueError( + f"No results were found for searchterm '{searchterm}' and cosmic_project '{cosmic_project}' in COSMIC database file (cosmic_tsv_path) '{cosmic_tsv_path}'." + ) return results @@ -332,10 +322,10 @@ def cosmic( mutation_column="mutation", mut_id_column="mutation_id", out=None, - verbose=True + verbose=True, ): - """ - Search for genes, mutations, etc associated with cancers using the COSMIC + """Search for genes, mutations, etc associated with cancers using the COSMIC database. + (Catalogue Of Somatic Mutations In Cancer) database (https://cancer.sanger.ac.uk/cosmic). NOTE: Licence fees apply for the commercial use of COSMIC (https://www.cosmickb.org/licensing). @@ -373,7 +363,7 @@ def cosmic( Examples: EGFR, ENST00000275493, c.650A>T, p.Q217L, COSV51765119, BT2012100223LNCTB (sample ID) NOTE: Set to None when downloading COSMIC databases with download_cosmic=True. - cosmic_tsv_path (str) Path to the COSMIC mutation tsv file, e.g. 'path/to/CancerMutationCensus_AllData_v101_GRCh37.tsv'. - This file is downloaded when downloading COSMIC databases using the arguments described above. + This file is downloaded when downloading COSMIC databases using the arguments described above. NOTE: This is a required argument when download_cosmic=False. - limit (int) Number of hits to return. 
Default: 100 - json (True/False) If True, returns results in json format instead of data frame. Default: False @@ -387,7 +377,6 @@ def cosmic( - When download_cosmic=True: Database will be downloaded into current working directory - verbose (True/False) whether to print progress information. Default: True """ - if verbose: logger.info("NOTE: Licence fees apply for the commercial use of COSMIC (https://www.cosmickb.org/licensing).") @@ -396,7 +385,9 @@ def cosmic( if not cosmic_project: cosmic_project = "cancer" if verbose: - logger.info(f"No cosmic_project provided. Defaulting to cosmic_project '{cosmic_project}' (also works for 'cancer_example').") + logger.info( + f"No cosmic_project provided. Defaulting to cosmic_project '{cosmic_project}' (also works for 'cancer_example')." + ) mut_class_allowed = [ "cancer", @@ -412,11 +403,9 @@ def cosmic( f"Parameter 'cosmic_project' must be one of the following: {', '.join(mut_class_allowed)}.\n" ) - grch_allowed = ['37', '38'] + grch_allowed = ["37", "38"] if str(grch_version) not in grch_allowed: - raise ValueError( - f"Parameter 'grch_version' must be one of the following: {', '.join(grch_allowed)}.\n" - ) + raise ValueError(f"Parameter 'grch_version' must be one of the following: {', '.join(grch_allowed)}.\n") if not out: out = os.getcwd() @@ -427,21 +416,17 @@ def cosmic( if not cosmic_version: cosmic_version = get_latest_cosmic() if verbose: - logger.info( - f"Downloading data from latest COSMIC version (v{cosmic_version})." 
- ) + logger.info(f"Downloading data from latest COSMIC version (v{cosmic_version}).") ## Download requested database mutation_tsv_file, overwrite = select_reference( - cosmic_project, out, grch_version, cosmic_version, verbose, email = email, password = password + cosmic_project, out, grch_version, cosmic_version, verbose, email=email, password=password ) if gget_mutate and overwrite is not False: ## Create copy of results formatted for further use by gget mutate if verbose: - logger.info( - "Creating modified mutations file for use with gget mutate..." - ) + logger.info("Creating modified mutations file for use with gget mutate...") if cosmic_project == "cancer" or cosmic_project == "cancer_example": relevant_cols = [ @@ -494,23 +479,16 @@ def cosmic( # } # ) - from gget.gget_mutate import mutation_pattern, convert_chromosome_value_to_int_when_possible import numpy as np + from gget.gget_mutate import convert_chromosome_value_to_int_when_possible, mutation_pattern + # * uncomment to include strand information (tested not to be accurate for CMC) - df[["chromosome", "GENOME_POS"]] = df[ - "Mutation genome position GRCh37" - ].str.split(":", expand=True) - df["chromosome"] = df["chromosome"].apply( - convert_chromosome_value_to_int_when_possible - ) - df[["GENOME_START", "GENOME_STOP"]] = df["GENOME_POS"].str.split( - "-", expand=True - ) + df[["chromosome", "GENOME_POS"]] = df["Mutation genome position GRCh37"].str.split(":", expand=True) + df["chromosome"] = df["chromosome"].apply(convert_chromosome_value_to_int_when_possible) + df[["GENOME_START", "GENOME_STOP"]] = df["GENOME_POS"].str.split("-", expand=True) - df[["nucleotide_positions", "actual_mutation"]] = df[ - "mutation" - ].str.extract(mutation_pattern) + df[["nucleotide_positions", "actual_mutation"]] = df["mutation"].str.extract(mutation_pattern) sub_mask = df["actual_mutation"].str.contains(">") ins_mask = (df["actual_mutation"].str.contains("ins")) & ( @@ -520,16 +498,12 @@ def cosmic( ins_delins_mask 
= ins_mask | delins_mask sub_ins_delins_mask = sub_mask | ins_delins_mask - df.loc[sub_mask, "wt_allele_cds"] = ( - df.loc[sub_mask, "actual_mutation"].str.split(">").str[0] - ) - df.loc[sub_mask, "mut_allele_cds"] = ( - df.loc[sub_mask, "actual_mutation"].str.split(">").str[1] - ) + df.loc[sub_mask, "wt_allele_cds"] = df.loc[sub_mask, "actual_mutation"].str.split(">").str[0] + df.loc[sub_mask, "mut_allele_cds"] = df.loc[sub_mask, "actual_mutation"].str.split(">").str[1] - df.loc[ins_delins_mask, "mut_allele_cds"] = df.loc[ - ins_delins_mask, "actual_mutation" - ].str.extract(r"ins(.+)")[0] + df.loc[ins_delins_mask, "mut_allele_cds"] = df.loc[ins_delins_mask, "actual_mutation"].str.extract( + r"ins(.+)" + )[0] df["strand"] = np.nan @@ -545,13 +519,9 @@ def cosmic( ) df.loc[sub_mask, "actual_mutation_updated"] = ( - df.loc[sub_mask, "GENOMIC_WT_ALLELE_SEQ"] - + ">" - + df.loc[sub_mask, "GENOMIC_MUT_ALLELE_SEQ"] - ) - df.loc[ins_mask, "actual_mutation_updated"] = ( - "ins" + df.loc[ins_mask, "GENOMIC_MUT_ALLELE_SEQ"] + df.loc[sub_mask, "GENOMIC_WT_ALLELE_SEQ"] + ">" + df.loc[sub_mask, "GENOMIC_MUT_ALLELE_SEQ"] ) + df.loc[ins_mask, "actual_mutation_updated"] = "ins" + df.loc[ins_mask, "GENOMIC_MUT_ALLELE_SEQ"] df.loc[delins_mask, "actual_mutation_updated"] = ( "delins" + df.loc[delins_mask, "GENOMIC_MUT_ALLELE_SEQ"] ) @@ -577,14 +547,10 @@ def cosmic( + "_" + df["GENOME_STOP"].astype(str) + df["actual_mutation_final"], - "g." - + df["GENOME_START"].astype(str) - + df["actual_mutation_final"], + "g." 
+ df["GENOME_START"].astype(str) + df["actual_mutation_final"], ) - df.loc[ - df["Mutation genome position GRCh37"].isna(), "mutation_genome" - ] = np.nan + df.loc[df["Mutation genome position GRCh37"].isna(), "mutation_genome"] = np.nan df.drop( columns=[ @@ -635,9 +601,7 @@ def cosmic( df = df.drop(columns=["GENE_NAME", "MUTATION_ID"]) if remove_duplicates: - duplicate_count = ( - df.duplicated(subset=["seq_ID", "mutation"], keep=False).sum() // 2 - ) + duplicate_count = df.duplicated(subset=["seq_ID", "mutation"], keep=False).sum() // 2 print( f"Removing {duplicate_count} duplicate entries from the COSMIC csv for gget mutate: {duplicate_count}" ) @@ -657,9 +621,7 @@ def cosmic( df.to_csv(mutate_csv_out, index=False) if verbose: - logger.info( - f"Modified mutations file for use with gget mutate created at {mutate_csv_out}" - ) + logger.info(f"Modified mutations file for use with gget mutate created at {mutate_csv_out}") else: # Old code from when COSMIC was acccessible without an account: @@ -677,7 +639,7 @@ def cosmic( # raise ValueError( # f"'entity' argument specified as {entity}. Expected one of: {', '.join(sps)}" # ) - + # # Translate categories to match COSMIC data table IDs # if entity == "cancer": # entity = "disease" @@ -847,7 +809,7 @@ def cosmic( # counter = counter + 1 # if limit < counter: # break - + # Check if cosmic_tsv_path exists if not cosmic_tsv_path or not os.path.exists(cosmic_tsv_path): example_call_python = f"gget.cosmic(download_cosmic=True, searchterm=None, cosmic_project='{cosmic_project}', grch_version={grch_version}, cosmic_version={cosmic_version or get_latest_cosmic()})" @@ -868,7 +830,9 @@ def cosmic( else: cosmic_project = "other" if verbose: - logger.info(f"No cosmic_project provided. Defaulting to cosmic_project '{cosmic_project}' (incapsulates all mutation classes except 'cancer' and 'cancer_example').") + logger.info( + f"No cosmic_project provided. 
Defaulting to cosmic_project '{cosmic_project}' (incapsulates all mutation classes except 'cancer' and 'cancer_example')." + ) # Query local COSMIC database dicts = query_local_cosmic(cosmic_tsv_path, cosmic_project, searchterm, limit) diff --git a/gget/gget_diamond.py b/gget/gget_diamond.py index d5b1c6ab2..1dba1a60c 100644 --- a/gget/gget_diamond.py +++ b/gget/gget_diamond.py @@ -1,25 +1,20 @@ +import json as json_package +import os +import platform import subprocess import sys -import platform -import os -import pandas as pd import uuid -import json as json_package from .compile import PACKAGE_PATH -from .utils import tsv_to_df, create_tmp_fasta, remove_temp_files, set_up_logger +from .utils import create_tmp_fasta, remove_temp_files, set_up_logger, tsv_to_df logger = set_up_logger() # Path to precompiled diamond binary if platform.system() == "Windows": - PRECOMPILED_DIAMOND_PATH = os.path.join( - PACKAGE_PATH, f"bins/{platform.system()}/diamond.exe" - ) + PRECOMPILED_DIAMOND_PATH = os.path.join(PACKAGE_PATH, f"bins/{platform.system()}/diamond.exe") else: - PRECOMPILED_DIAMOND_PATH = os.path.join( - PACKAGE_PATH, f"bins/{platform.system()}/diamond" - ) + PRECOMPILED_DIAMOND_PATH = os.path.join(PACKAGE_PATH, f"bins/{platform.system()}/diamond") def diamond( @@ -34,8 +29,7 @@ def diamond( json=False, out=None, ): - """ - Align multiple protein or translated DNA sequences using DIAMOND (https://www.nature.com/articles/nmeth.3176). + """Align multiple protein or translated DNA sequences using DIAMOND (https://www.nature.com/articles/nmeth.3176). Args: - query Sequences (str or list) or path to FASTA file containing sequences to be aligned against the reference. 
@@ -128,14 +122,14 @@ def diamond( if translated: if verbose: - logger.info(f"Aligning nucleotide query to amino acid reference (blastx mode).") + logger.info("Aligning nucleotide query to amino acid reference (blastx mode).") diamond_program = "blastx" else: diamond_program = "blastp" # Run DIAMOND commands as separate subprocess calls (avoids shell=True security issues) if verbose: - logger.info(f"Creating DIAMOND database and initiating alignment...") + logger.info("Creating DIAMOND database and initiating alignment...") # Step 1: Check diamond version version_cmd = [diamond_bin, "version"] @@ -147,13 +141,7 @@ def diamond( raise RuntimeError("DIAMOND version check failed.") # Step 2: Create database - makedb_cmd = [ - diamond_bin, "makedb", - "--quiet", - "--in", ref_file, - "--db", db_path, - "--threads", str(threads) - ] + makedb_cmd = [diamond_bin, "makedb", "--quiet", "--in", ref_file, "--db", db_path, "--threads", str(threads)] with subprocess.Popen(makedb_cmd, stderr=subprocess.PIPE) as process: stderr = process.stderr.read().decode("utf-8") if stderr: @@ -163,17 +151,35 @@ def diamond( # Step 3: Run alignment align_cmd = [ - diamond_bin, diamond_program, - "--outfmt", "6", - "qseqid", "sseqid", "pident", "qlen", "slen", "length", - "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", + diamond_bin, + diamond_program, + "--outfmt", + "6", + "qseqid", + "sseqid", + "pident", + "qlen", + "slen", + "length", + "mismatch", + "gapopen", + "qstart", + "qend", + "sstart", + "send", + "evalue", + "bitscore", "--quiet", - "--query", in_file, - "--db", ref_file, - "--out", out_file, + "--query", + in_file, + "--db", + ref_file, + "--out", + out_file, f"--{sensitivity}", - "--threads", str(threads), - "--ignore-warnings" + "--threads", + str(threads), + "--ignore-warnings", ] with subprocess.Popen(align_cmd, stderr=subprocess.PIPE) as process: stderr = process.stderr.read().decode("utf-8") @@ -184,7 +190,7 @@ def diamond( raise 
RuntimeError("DIAMOND alignment failed.") else: if verbose: - logger.info(f"DIAMOND alignment complete.") + logger.info("DIAMOND alignment complete.") df_diamond = tsv_to_df( output, diff --git a/gget/gget_elm.py b/gget/gget_elm.py index cf766473d..806c2090c 100644 --- a/gget/gget_elm.py +++ b/gget/gget_elm.py @@ -1,26 +1,26 @@ -import pandas as pd -import numpy as np -import os import json as json_package +import os import re -from .utils import get_uniprot_seqs, tsv_to_df, set_up_logger +import numpy as np +import pandas as pd + +from .utils import get_uniprot_seqs, set_up_logger, tsv_to_df logger = set_up_logger() -from .constants import UNIPROT_REST_API -from .gget_diamond import diamond -from .gget_setup import ( - ELM_INSTANCES_FASTA, +from .constants import UNIPROT_REST_API # noqa: E402 +from .gget_diamond import diamond # noqa: E402 +from .gget_setup import ( # noqa: E402 ELM_CLASSES_TSV, + ELM_INSTANCES_FASTA, ELM_INSTANCES_TSV, ELM_INTDOMAINS_TSV, ) def motif_in_query(row): - """ - Checks if motif is in the overlapping region with the query sequence + """Checks if motif is in the overlapping region with the query sequence. Args: row - row in dataframe @@ -29,15 +29,13 @@ def motif_in_query(row): """ return ( True - if (row["motif_start_in_subject"] >= row["subject_start"]) - & (row["motif_end_in_subject"] <= row["subject_end"]) + if (row["motif_start_in_subject"] >= row["subject_start"]) & (row["motif_end_in_subject"] <= row["subject_end"]) else False ) def get_elm_instances(UniProtID): - """ - Get ELM instances and their information from local ELM tsv files. + """Get ELM instances and their information from local ELM tsv files. Args: - UniProtID UniProt Acc to search for in the accession column of ELM tsv files. 
@@ -47,9 +45,7 @@ def get_elm_instances(UniProtID): # Get matching rows from elm_instances.tsv # ELM Instances.tsv file contains 5 lines before headers and data df_full_instances = tsv_to_df(ELM_INSTANCES_TSV, skiprows=5) - df_instances_matching = df_full_instances[ - df_full_instances["Primary_Acc"] == UniProtID - ] + df_instances_matching = df_full_instances[df_full_instances["Primary_Acc"] == UniProtID] # Rename columns df_instances_matching = df_instances_matching.rename( columns={ @@ -90,8 +86,8 @@ def seq_workflow( verbose, diamond_binary, ): - """ - Alignment of sequence using DIAMOND to get UniProt Acc. Use the UniProt Acc to construct an ortholog dataframe similar to the UniProt workflow + """Alignment of sequence using DIAMOND to get UniProt Acc. Use the UniProt Acc to construct an ortholog dataframe similar to the UniProt workflow + except for additional columns for start, end and whether the motif overlaps the subject sequence. Args: @@ -133,10 +129,7 @@ def seq_workflow( # Construct df with elm instances from UniProt Acc returned from diamond # TODO double check that this gets info if more than one UniProt Acc matched if verbose: - uniprot_ids = [ - str(id).split("|")[1] - for id in df_diamond["subject_accession"].values - ] + uniprot_ids = [str(id).split("|")[1] for id in df_diamond["subject_accession"].values] logger.info( f"ORTHO Sequence {seq_number}/{len(sequences)}: DIAMOND found the following orthologous proteins: {', '.join(uniprot_ids)}. Retrieving ELMs for each UniProt Acc..." 
) @@ -147,20 +140,14 @@ def seq_workflow( # missing motifs other than the first one # df_elm["query_cover"] = df_diamond["length"].values[i] / seq_len * 100 df_elm["query_seq_length"] = df_diamond["query_seq_length"].values[i] - df_elm["subject_seq_length"] = df_diamond["subject_seq_length"].values[ - i - ] + df_elm["subject_seq_length"] = df_diamond["subject_seq_length"].values[i] df_elm["alignment_length"] = df_diamond["length"].values[i] - df_elm["identity_percentage"] = df_diamond[ - "identity_percentage" - ].values[i] + df_elm["identity_percentage"] = df_diamond["identity_percentage"].values[i] df_elm["query_start"] = int(df_diamond["query_start"].values[i]) df_elm["query_end"] = int(df_diamond["query_end"].values[i]) df_elm["subject_start"] = int(df_diamond["subject_start"].values[i]) df_elm["subject_end"] = int(df_diamond["subject_end"].values[i]) - df_elm["motif_inside_subject_query_overlap"] = df_elm.apply( - motif_in_query, axis=1 - ) + df_elm["motif_inside_subject_query_overlap"] = df_elm.apply(motif_in_query, axis=1) df = pd.concat([df, df_elm]) @@ -170,15 +157,16 @@ def seq_workflow( def regex_match(sequence): - """ - Compare ELM regex with input sequence and return all matching elms + """Compare ELM regex with input sequence and return all matching elms. 
Args: sequence - user input sequence (can be either amino acid seq or UniProt Acc) - Returns: + Returns + ------- df_final - dataframe containing regex matches TODO: Make sure this returns empty dataframe if no matches were found + """ # Get all motif regex patterns from elm db local file df_elm_classes = tsv_to_df(ELM_CLASSES_TSV, skiprows=5) @@ -199,7 +187,7 @@ def regex_match(sequence): df_final = pd.DataFrame() # Compare ELM regex with input sequence and return all matching elms - for elm_id, pattern in zip(elm_ids, regex_patterns): + for elm_id, pattern in zip(elm_ids, regex_patterns, strict=False): regex_matches = re.finditer(f"(?=({pattern}))", sequence) for match_string in regex_matches: @@ -214,7 +202,7 @@ def regex_match(sequence): elm_row.insert(loc=2, column="motif_start_in_query", value=int(start + 1)) elm_row.insert(loc=3, column="motif_end_in_query", value=int(end)) - elm_identifier = [str(x) for x in elm_row["ELMIdentifier"]][0] + [str(x) for x in elm_row["ELMIdentifier"]][0] # df_instances_matching = df_full_instances.loc[ # df_full_instances["ELMIdentifier"] == elm_identifier @@ -243,8 +231,8 @@ def elm( json=False, out=None, ): - """ - Locally predicts Eukaryotic Linear Motifs from an amino acid sequence or UniProt Acc using + """Locally predicts Eukaryotic Linear Motifs from an amino acid sequence or UniProt Acc using + data from the ELM database (http://elm.eu.org/). Args: @@ -276,7 +264,7 @@ def elm( or not os.path.exists(ELM_INTDOMAINS_TSV) ): raise FileNotFoundError( - f"Some or all ELM database files are missing. Please run 'gget setup elm' (Python: gget.setup('elm')) once to download the necessary files." + "Some or all ELM database files are missing. Please run 'gget setup elm' (Python: gget.setup('elm')) once to download the necessary files." 
) # Let users know when local ELM was last updated @@ -299,12 +287,12 @@ def elm( # If sequence is not a valid amino sequence, raise error if not set(sequence) <= amino_acids: logger.warning( - f"Input amino acid sequence contains invalid characters. If the input is a UniProt Acc, please use flag --uniprot (Python: uniprot=True)." + "Input amino acid sequence contains invalid characters. If the input is a UniProt Acc, please use flag --uniprot (Python: uniprot=True)." ) # Build ortholog dataframe if verbose: - logger.info(f"ORTHO Compiling ortholog information...") + logger.info("ORTHO Compiling ortholog information...") ortho_df = pd.DataFrame() if uniprot: ortho_df = get_elm_instances(sequence) @@ -317,9 +305,7 @@ def elm( if len(df_uniprot) > 0: # Only grab sequences where IDs match exactly - aa_seqs = df_uniprot[df_uniprot["uniprot_id"] == sequence][ - "sequence" - ].values + aa_seqs = df_uniprot[df_uniprot["uniprot_id"] == sequence]["sequence"].values if len(aa_seqs) == 0: raise ValueError( @@ -350,9 +336,7 @@ def elm( ) if len(ortho_df) == 0: - logger.warning( - "ORTHO No ELM database orthologs found for input sequence or UniProt Acc." 
- ) + logger.warning("ORTHO No ELM database orthologs found for input sequence or UniProt Acc.") # Reorder columns of ortholog data frame ortho_cols = [ @@ -393,28 +377,25 @@ def elm( ortho_df = ortho_df[ortho_cols] # Remove false positives and true negatives ortho_df = ortho_df[ - (ortho_df["InstanceLogic"] != "false positive") - & (ortho_df["InstanceLogic"] != "true negative") + (ortho_df["InstanceLogic"] != "false positive") & (ortho_df["InstanceLogic"] != "true negative") ] # Drop duplicate rows and reset the index ortho_df = ortho_df.drop_duplicates().reset_index(drop=True) # Build data frame containing regex motif matches if verbose: - logger.info(f"REGEX Finding regex motif matches...") + logger.info("REGEX Finding regex motif matches...") fetch_aa_failed = False if uniprot: # use amino acid sequence associated with UniProt Acc to do regex match # do not fetch sequence again if already done above - if not "df_uniprot" in locals(): + if "df_uniprot" not in locals(): df_uniprot = get_uniprot_seqs(UNIPROT_REST_API, sequence) if len(df_uniprot) > 0: # Only grab sequences where IDs match exactly - sequences = df_uniprot[df_uniprot["uniprot_id"] == sequence][ - "sequence" - ].values + sequences = df_uniprot[df_uniprot["uniprot_id"] == sequence]["sequence"].values if len(sequences) == 0: logger.warning( @@ -433,9 +414,7 @@ def elm( df_regex_matches = regex_match(sequence) if len(df_regex_matches) == 0: - logger.warning( - "REGEX No regex matches found for input sequence or UniProt Acc." 
- ) + logger.warning("REGEX No regex matches found for input sequence or UniProt Acc.") # Reorder regex columns if expand: @@ -492,8 +471,7 @@ def elm( df_regex_matches = df_regex_matches[regex_cols] # Remove false positives and true negatives df_regex_matches = df_regex_matches[ - (df_regex_matches["InstanceLogic"] != "false positive") - & (df_regex_matches["InstanceLogic"] != "true negative") + (df_regex_matches["InstanceLogic"] != "false positive") & (df_regex_matches["InstanceLogic"] != "true negative") ] # Drop duplicates and reset index df_regex_matches = df_regex_matches.drop_duplicates().reset_index(drop=True) diff --git a/gget/gget_enrichr.py b/gget/gget_enrichr.py index 8b1050dc3..9e98c4121 100644 --- a/gget/gget_enrichr.py +++ b/gget/gget_enrichr.py @@ -1,32 +1,29 @@ -import requests -import pandas as pd import json as json_package -import numpy as np +import textwrap # Plotting packages import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import requests from matplotlib.ticker import MaxNLocator -import textwrap +from .compile import PACKAGE_PATH from .constants import ( - POST_ENRICHR_URLS, + DEFAULT_REQUESTS_TIMEOUT, + GET_BACKGROUND_ENRICHR_URL, GET_ENRICHR_URLS, POST_BACKGROUND_ID_ENRICHR_URL, - GET_BACKGROUND_ENRICHR_URL, - DEFAULT_REQUESTS_TIMEOUT, + POST_ENRICHR_URLS, ) -from .compile import PACKAGE_PATH from .gget_info import info - from .utils import set_up_logger logger = set_up_logger() def ensembl_to_gene_names(ensembl_ids): - """ - Function to fetch gene names from a list of Ensembl IDs using gget info. - """ + """Function to fetch gene names from a list of Ensembl IDs using gget info.""" genes_v2 = [] # Remove version number if passed @@ -37,9 +34,7 @@ def ensembl_to_gene_names(ensembl_ids): for gene_id in ensembl_ids: # Check if Ensembl ID was found if gene_id not in info_df.index: - logger.warning( - f"ID '{gene_id}' not found. Please double-check spelling/arguments." - ) + logger.warning(f"ID '{gene_id}' not found. 
Please double-check spelling/arguments.") continue gene_symbol = info_df.loc[gene_id]["ensembl_gene_name"] @@ -54,6 +49,7 @@ def ensembl_to_gene_names(ensembl_ids): def clean_genes_list(genes_list): + """Remove NaNs, Nones, and 'nan' strings from a list of genes.""" # Remove any NaNs/Nones from the gene list genes_clean = [] for gene in genes_list: @@ -79,8 +75,7 @@ def enrichr( save=False, verbose=True, ): - """ - Perform an enrichment analysis on a list of genes using Enrichr (https://maayanlab.cloud/Enrichr/). + """Perform an enrichment analysis on a list of genes using Enrichr (https://maayanlab.cloud/Enrichr/). Args: - genes List of Entrez gene symbols to perform enrichment analysis on, passed as a list of strings, e.g. ['PHF14', 'RBM3', 'MSL1', 'PHF21A']. @@ -117,11 +112,8 @@ def enrichr( Returns a data frame with the Enrichr results. """ - if species not in ["human", "mouse", "fly", "yeast", "worm", "fish"]: - raise ValueError( - f"Argument 'species' must be one of 'human', 'mouse', 'fly', 'yeast', 'worm', or 'fish'." - ) + raise ValueError("Argument 'species' must be one of 'human', 'mouse', 'fly', 'yeast', 'worm', or 'fish'.") if species == "mouse": species = "human" @@ -161,61 +153,49 @@ def enrichr( # All available libraries: https://maayanlab.cloud/Enrichr/#libraries if species == "human": db_message = f""" - Please note that there might be a more appropriate database for your application. + Please note that there might be a more appropriate database for your application. Go to https://maayanlab.cloud/{species_enrichr}/#libraries for a full list of supported databases. """ else: db_message = f""" - Please note that there might be a more appropriate database for your application. + Please note that there might be a more appropriate database for your application. Go to https://maayanlab.cloud/{species_enrichr}/#stats for a full list of supported databases. 
""" if not isinstance(background, bool): raise ValueError( - f"Argument`background` must be a boolean True/False. If you are adding a background list, use the argument `background_list` instead." + "Argument`background` must be a boolean True/False. If you are adding a background list, use the argument `background_list` instead." ) # Handle database shortcuts if database == "pathway": database = "KEGG_2021_Human" if verbose: - logger.info( - f"Performing Enrichr analysis using database {database}. " + db_message - ) + logger.info(f"Performing Enrichr analysis using database {database}. " + db_message) elif database == "transcription": database = "ChEA_2016" if verbose: - logger.info( - f"Performing Enrichr analysis using database {database}. " + db_message - ) + logger.info(f"Performing Enrichr analysis using database {database}. " + db_message) elif database == "ontology": database = "GO_Biological_Process_2021" if verbose: - logger.info( - f"Performing Enrichr analysis using database {database}. " + db_message - ) + logger.info(f"Performing Enrichr analysis using database {database}. " + db_message) elif database == "diseases_drugs": database = "GWAS_Catalog_2019" if verbose: - logger.info( - f"Performing Enrichr analysis using database {database}. " + db_message - ) + logger.info(f"Performing Enrichr analysis using database {database}. " + db_message) elif database == "celltypes": database = "PanglaoDB_Augmented_2021" if verbose: - logger.info( - f"Performing Enrichr analysis using database {database}. " + db_message - ) + logger.info(f"Performing Enrichr analysis using database {database}. " + db_message) elif database == "kinase_interactions": database = "KEA_2015" if verbose: - logger.info( - f"Performing Enrichr analysis using database {database}. " + db_message - ) + logger.info(f"Performing Enrichr analysis using database {database}. 
" + db_message) else: database = database @@ -225,9 +205,7 @@ def enrichr( # To generate a KEGG pathway image, confirm that the database is a KEGG database and pykegg is installed if kegg_out: if not database.startswith("KEGG"): - logger.error( - "Please specify a KEGG database when generating a KEGG pathway image." - ) + logger.error("Please specify a KEGG database when generating a KEGG pathway image.") return try: import pykegg @@ -268,9 +246,7 @@ def enrichr( if ensembl: if verbose: - logger.info( - f"Performing Enrichr analysis on the following gene symbols: {', '.join(genes_clean)}" - ) + logger.info(f"Performing Enrichr analysis on the following gene symbols: {', '.join(genes_clean)}") # Join genes from list genes_clean_final = "\n".join(genes_clean) @@ -303,9 +279,7 @@ def enrichr( # If user gives a background list, use the user input instead of the default if background_list: if verbose: - logger.info( - f"Performing Enrichr analysis using user-defined background gene list." - ) + logger.info("Performing Enrichr analysis using user-defined background gene list.") if background: logger.warning( @@ -409,14 +383,14 @@ def enrichr( if species == "human": logger.error( f""" - Database {database} not found. Go to https://maayanlab.cloud/{species_enrichr}/#libraries + Database {database} not found. Go to https://maayanlab.cloud/{species_enrichr}/#libraries for a full list of supported databases. """ ) else: logger.error( f""" - Database {database} not found. Go to https://maayanlab.cloud/{species_enrichr}/#stats + Database {database} not found. Go to https://maayanlab.cloud/{species_enrichr}/#stats for a full list of supported databases. 
""" ) @@ -485,12 +459,8 @@ def enrichr( # Plot barplot # ax1.barh(np.arange(len(gene_counts)), gene_counts, color=cmap(c_values), align="center") - ax1.barh( - np.arange(len(gene_counts)), gene_counts, color=barcolor, align="center" - ) - ax1.set_yticks( - np.arange(len(gene_counts)), labels, linespacing=0.85, fontsize=fontsize - ) + ax1.barh(np.arange(len(gene_counts)), gene_counts, color=barcolor, align="center") + ax1.set_yticks(np.arange(len(gene_counts)), labels, linespacing=0.85, fontsize=fontsize) ax1.invert_yaxis() # Set x-limit to be gene count + 1 ax1.set_xlim(0, ax1.get_xlim()[1] + 1) @@ -509,9 +479,7 @@ def enrichr( s=20, ) # Change label and color of p-value axis - ax2.set_xlabel( - "$-log_{10}$(adjusted P value)", fontsize=fontsize, color=p_val_color - ) + ax2.set_xlabel("$-log_{10}$(adjusted P value)", fontsize=fontsize, color=p_val_color) ax2.spines["top"].set_color(p_val_color) ax2.tick_params(axis="x", colors=p_val_color, labelsize=fontsize) @@ -543,9 +511,7 @@ def enrichr( ax1.tick_params(axis="y", labelsize=fontsize) # Set title - ax1.set_title( - f"Enrichr results from database {database}", fontsize=fontsize + 2 - ) + ax1.set_title(f"Enrichr results from database {database}", fontsize=fontsize + 2) # Set axis margins ax1.margins(y=0, x=0) @@ -567,7 +533,7 @@ def enrichr( # Generate KEGG pathway image if kegg_out: candidate_rank = df[df["rank"] == kegg_rank].iloc[0, :] - kegg_img = pykegg.visualize( + pykegg.visualize( candidate_rank["path_name"], candidate_rank["overlapping_genes"], db=database, diff --git a/gget/gget_gpt.py b/gget/gget_gpt.py index 463ae721c..e22a7622a 100644 --- a/gget/gget_gpt.py +++ b/gget/gget_gpt.py @@ -17,10 +17,10 @@ def gpt( out=None, verbose=True, ): - """ - Generates natural language text based on a given prompt using the OpenAI API's 'openai.ChatCompletion.create' endpoint. + """Generates natural language text based on a given prompt using the OpenAI API's 'openai.ChatCompletion.create' endpoint. 
- Parameters: + Parameters + ---------- - prompt (str): The input prompt to generate text from. - api_key (str): Your OpenAI API key (see: https://platform.openai.com/account/api-keys). - model (str): The name of the GPT model to use for generating the text. Default is "gpt-3.5-turbo". @@ -40,7 +40,8 @@ def gpt( - out (str) If provided, saves the generated text to a file with the specified path. Default is None. - verbose True/False whether to print progress information. Default True. - Returns: + Returns + ------- - A string containing the generated text. NOTE: OpenAI API calls are only 'free' for the first three months after generating your OpenAI Account @@ -49,6 +50,7 @@ def gpt( See their pricing and FAQ here: https://openai.com/pricing This module, including its source code, documentation and unittests, were partly written by OpenAI's Chat-GTP3. + """ # Check if cellxgene_census is installed try: @@ -56,7 +58,7 @@ def gpt( except ImportError: logger.error( """ - Some third-party dependencies are missing. Please run the following command: + Some third-party dependencies are missing. Please run the following command: >>> gget.setup('gpt') or $ gget setup gpt Alternative: Install the openai package using pip (https://pypi.org/project/openai). 
@@ -100,9 +102,7 @@ def gpt( ) if verbose: - logger.info( - f"Total tokens used for API call to model '{model}': {response['usage']['total_tokens']}" - ) + logger.info(f"Total tokens used for API call to model '{model}': {response['usage']['total_tokens']}") texts = response["choices"][0]["message"]["content"] diff --git a/gget/gget_info.py b/gget/gget_info.py index d7497aa90..ddf69b8f3 100644 --- a/gget/gget_info.py +++ b/gget/gget_info.py @@ -1,27 +1,27 @@ +import json as json_package + import numpy as np import pandas as pd -import json as json_package import requests from bs4 import BeautifulSoup # Custom functions from .utils import ( - rest_query, - get_uniprot_info, - wrap_cols_func, get_pdb_ids, - set_up_logger, + get_uniprot_info, post_query, + set_up_logger, + wrap_cols_func, ) logger = set_up_logger() # Constants -from .constants import ( +from .constants import ( # noqa: E402 + DEFAULT_REQUESTS_TIMEOUT, ENSEMBL_REST_API, - UNIPROT_REST_API, NCBI_URL, - DEFAULT_REQUESTS_TIMEOUT, + UNIPROT_REST_API, ) @@ -38,8 +38,7 @@ def info( expand=False, ensembl_only=False, ): - """ - Fetch gene and transcript metadata using Ensembl IDs. + """Fetch gene and transcript metadata using Ensembl IDs. Args: - ens_ids One or more Ensembl IDs to look up (string or list of strings). @@ -66,9 +65,7 @@ def info( ) if ensembl_only: if verbose: - logger.warning( - "'ensembl_only' argument deprecated! Please use arguments 'ncbi=False' and 'uniprot=False'." - ) + logger.warning("'ensembl_only' argument deprecated! 
Please use arguments 'ncbi=False' and 'uniprot=False'.") # Set synonyms found by each database initially to none ncbi_synonyms = None @@ -84,11 +81,10 @@ def info( # Define Ensembl REST API server server = ENSEMBL_REST_API # Define type of returned content from REST - content_type = "application/json" ## Clean up Ensembl IDs # If single Ensembl ID passed as string, convert to list - if type(ens_ids) == str: + if isinstance(ens_ids, str): ens_ids = [ens_ids] # Remove Ensembl ID version if passed ens_ids_clean = [] @@ -124,7 +120,7 @@ def info( results_dict = post_query(server, endpoint, query) results_dict = {k: v for k, v in results_dict.items() if v is not None} - for ensembl_ID, df_temp in results_dict.items(): + for ensembl_ID, df_temp in results_dict.items(): # noqa: B007 try: # Add Ensembl ID with latest version number to df_temp df_temp["ensembl_id"] = str(df_temp["id"]) + "." + str(df_temp["version"]) @@ -133,11 +129,7 @@ def info( df_temp["ensembl_id"] = str(df_temp["id"]) # second pass for ids that were not found in the initial query - ens_ids_clean_tmp = [ - ensembl_ID - for ensembl_ID in ens_ids_clean - if ensembl_ID not in results_dict.keys() - ] + ens_ids_clean_tmp = [ensembl_ID for ensembl_ID in ens_ids_clean if ensembl_ID not in results_dict.keys()] if len(ens_ids_clean_tmp) > 0: # print(f"Second pass for ids: {ens_ids_clean_tmp}") @@ -146,12 +138,10 @@ def info( results_dict_new = post_query(server, endpoint, query) results_dict_new = {k: v for k, v in results_dict_new.items() if v is not None} - for ensembl_ID, df_temp in results_dict_new.items(): + for ensembl_ID, df_temp in results_dict_new.items(): # noqa: B007 try: # Add Ensembl ID with latest version number to df_temp - df_temp["ensembl_id"] = ( - str(df_temp["id"]) + "." + str(df_temp["version"]) - ) + df_temp["ensembl_id"] = str(df_temp["id"]) + "." 
+ str(df_temp["version"]) except KeyError: # Just add Ensembl ID if no version found df_temp["ensembl_id"] = str(df_temp["id"]) @@ -164,14 +154,10 @@ def info( ens_ids_clean_2.append(ensembl_ID) else: if verbose: - logger.warning( - f"ID '{ensembl_ID}' not found. Please double-check spelling/arguments and try again." - ) + logger.warning(f"ID '{ensembl_ID}' not found. Please double-check spelling/arguments and try again.") # rewrite results to be in the input order - results_dict = { - ensembl_ID: results_dict[ensembl_ID] for ensembl_ID in ens_ids_clean_2 - } + results_dict = {ensembl_ID: results_dict[ensembl_ID] for ensembl_ID in ens_ids_clean_2} master_dict.update(results_dict) @@ -199,15 +185,11 @@ def info( if fetch_uniprot is True: try: # Get gene names and descriptions from UniProt - df_uniprot = get_uniprot_info( - UNIPROT_REST_API, ens_id, verbose=verbose - ) + df_uniprot = get_uniprot_info(UNIPROT_REST_API, ens_id, verbose=verbose) - except Exception as e: + except Exception as e: # noqa: BLE001 if verbose: - logger.warning( - f"UniProt server request for ID '{ens_id}' returned the following error:\n{e}" - ) + logger.warning(f"UniProt server request for ID '{ens_id}' returned the following error:\n{e}") continue if not isinstance(df_uniprot, type(None)): @@ -224,9 +206,7 @@ def info( # Get uniprot synonyms and remove NaN values uni_synonyms = df_uniprot["uni_synonyms"].values[0] - uni_synonyms = [ - item for item in uni_synonyms if not (pd.isnull(item)) == True - ] + uni_synonyms = [item for item in uni_synonyms if not (pd.isnull(item))] # Transpose UniProt data frame and add Ensembl ID as column name df_uniprot = df_uniprot.T @@ -254,12 +234,9 @@ def info( # Check for error message in NCBI return if ( soup.find("li", class_="error icon") is not None - and "An error has occured" - in soup.find("li", class_="error icon").text.strip() + and "An error has occured" in soup.find("li", class_="error icon").text.strip() ): - error_message = soup.find( - "li", 
class_="error icon" - ).text.strip() + error_message = soup.find("li", class_="error icon").text.strip() logger.error( f"The NCBI server request for Ensembl ID '{ens_id}' returned the following error:\n{error_message}" @@ -272,9 +249,7 @@ def info( # Check if NCBI gene ID is available try: - ncbi_gene_id = soup.find("input", {"id": "gene-id-value"}).get( - "value" - ) + ncbi_gene_id = soup.find("input", {"id": "gene-id-value"}).get("value") except AttributeError: ncbi_gene_id = np.nan @@ -302,7 +277,7 @@ def info( except AttributeError: ncbi_synonyms = None - except Exception as e: + except Exception as e: # noqa: BLE001 logger.error( f"The NCBI server request for Ensembl ID '{ens_id}' returned the following error:\n{e}" ) @@ -329,7 +304,7 @@ def info( try: pdb_ids = get_pdb_ids(ens_id) - except Exception as e: + except Exception as e: # noqa: BLE001 if verbose: logger.warning( f"The PDBe server request for Ensembl ID '{ens_id}' returned the following error:\n{e}" @@ -352,7 +327,7 @@ def info( if ncbi_synonyms is not None and not isinstance(df_uniprot, type(None)): synonyms = list(set().union(uni_synonyms, ncbi_synonyms)) # Remove nan values - synonyms = [item for item in synonyms if not (pd.isnull(item)) == True] + synonyms = [item for item in synonyms if not (pd.isnull(item))] # Add only UniProt synonyms if NCBI syns not available elif ncbi_synonyms is None and not isinstance(df_uniprot, type(None)): @@ -370,7 +345,7 @@ def info( # Sort synonyms alphabetically (if sortable) try: synonyms = sorted(synonyms) - except: + except Exception: # noqa: BLE001 pass # Append dataframes with data from NCBI, UniProt and PDB from ens_id to df_temp @@ -448,34 +423,32 @@ def info( try: try: # Add Transcript ID with latest version if available - versioned_trans_id = ( - str(trans_dict["id"]) + "." + str(trans_dict["version"]) - ) + versioned_trans_id = str(trans_dict["id"]) + "." 
+ str(trans_dict["version"]) all_transcripts.append(versioned_trans_id) except KeyError: # Just add ID if no version found all_transcripts.append(trans_dict["id"]) - except: + except Exception: # noqa: BLE001 all_transcripts.append(np.nan) try: transcript_names.append(trans_dict["display_name"]) - except: + except Exception: # noqa: BLE001 transcript_names.append(np.nan) try: transcript_biotypes.append(trans_dict["biotype"]) - except: + except Exception: # noqa: BLE001 transcript_biotypes.append(np.nan) try: transcript_starts.append(trans_dict["start"]) - except: + except Exception: # noqa: BLE001 transcript_starts.append(np.nan) try: transcript_ends.append(trans_dict["end"]) - except: + except Exception: # noqa: BLE001 transcript_ends.append(np.nan) try: transcript_strands.append(trans_dict["strand"]) - except: + except Exception: # noqa: BLE001 transcript_strands.append(np.nan) data["all_transcripts"].append(all_transcripts) @@ -485,7 +458,7 @@ def info( data["transcript_starts"].append(transcript_starts) data["transcript_ends"].append(transcript_ends) - except: + except Exception: # noqa: BLE001 data["all_transcripts"].append(np.nan) data["transcript_biotypes"].append(np.nan) data["transcript_names"].append(np.nan) @@ -502,29 +475,27 @@ def info( try: try: # Add ID with latest version if available - versioned_id = ( - str(exon_dict["id"]) + "." + str(exon_dict["version"]) - ) + versioned_id = str(exon_dict["id"]) + "." 
+ str(exon_dict["version"]) all_exons.append(versioned_id) except KeyError: # Just add ID if no version found all_exons.append(exon_dict["id"]) - except: + except Exception: # noqa: BLE001 all_exons.append(np.nan) try: exon_starts.append(exon_dict["start"]) - except: + except Exception: # noqa: BLE001 exon_starts.append(np.nan) try: exon_ends.append(exon_dict["end"]) - except: + except Exception: # noqa: BLE001 exon_ends.append(np.nan) data["all_exons"].append(all_exons) data["exon_starts"].append(exon_starts) data["exon_ends"].append(exon_ends) - except: + except Exception: # noqa: BLE001 data["all_exons"].append(np.nan) data["exon_starts"].append(np.nan) data["exon_ends"].append(np.nan) @@ -538,37 +509,33 @@ def info( try: try: # Add ID with latest version if available - versioned_id = ( - str(transl_dict["id"]) + "." + str(transl_dict["version"]) - ) + versioned_id = str(transl_dict["id"]) + "." + str(transl_dict["version"]) all_translations.append(versioned_id) except KeyError: # Just add ID if no version found all_translations.append(transl_dict["id"]) - except: + except Exception: # noqa: BLE001 all_translations.append(np.nan) try: translation_starts.append(transl_dict["start"]) - except: + except Exception: # noqa: BLE001 translation_starts.append(np.nan) try: translation_ends.append(transl_dict["end"]) - except: + except Exception: # noqa: BLE001 translation_ends.append(np.nan) data["all_translations"].append(all_translations) data["translation_starts"].append(translation_starts) data["translation_ends"].append(translation_ends) - except: + except Exception: # noqa: BLE001 data["all_translations"].append(np.nan) data["translation_starts"].append(np.nan) data["translation_ends"].append(np.nan) # Append cleaned up info to df_final - df_final = pd.concat( - [df_final, pd.DataFrame.from_dict(data, orient="index", columns=ens_ids)] - ) + df_final = pd.concat([df_final, pd.DataFrame.from_dict(data, orient="index", columns=ens_ids)]) ## Transpose data frame so 
each row corresponds to one Ensembl ID df_final = df_final.T @@ -617,6 +584,7 @@ def info( transcript_strands or [], transcript_starts or [], transcript_ends or [], + strict=False, ): results_dict[ens_id]["all_transcripts"].append( { @@ -641,9 +609,7 @@ def info( # Build new dictionary entries results_dict[ens_id].update({"all_exons": []}) - for exon_id, exon_start, exon_end in zip( - exon_ids or [], exon_starts or [], exon_ends or [] - ): + for exon_id, exon_start, exon_end in zip(exon_ids or [], exon_starts or [], exon_ends or [], strict=False): results_dict[ens_id]["all_exons"].append( {"exon_id": exon_id, "exon_start": exon_start, "exon_end": exon_end} ) @@ -661,7 +627,7 @@ def info( # Build new dictionary entries results_dict[ens_id].update({"all_translations": []}) for translation_id, translation_start, translation_end in zip( - translation_ids or [], translation_starts or [], translation_ends or [] + translation_ids or [], translation_starts or [], translation_ends or [], strict=False ): results_dict[ens_id]["all_translations"].append( { diff --git a/gget/gget_muscle.py b/gget/gget_muscle.py index 26343376a..bd1f03a6f 100644 --- a/gget/gget_muscle.py +++ b/gget/gget_muscle.py @@ -1,31 +1,26 @@ +import itertools import os import platform import subprocess -import itertools import sys import time import uuid # Custom functions -from .compile import compile_muscle, MUSCLE_PATH, PACKAGE_PATH -from .utils import aa_colors, n_colors, create_tmp_fasta, set_up_logger +from .compile import MUSCLE_PATH, PACKAGE_PATH, compile_muscle +from .utils import aa_colors, create_tmp_fasta, n_colors, set_up_logger logger = set_up_logger() # Path to precompiled muscle binary if platform.system() == "Windows": - PRECOMPILED_MUSCLE_PATH = os.path.join( - PACKAGE_PATH, f"bins/{platform.system()}/muscle.win64.exe" - ) + PRECOMPILED_MUSCLE_PATH = os.path.join(PACKAGE_PATH, f"bins/{platform.system()}/muscle.win64.exe") else: - PRECOMPILED_MUSCLE_PATH = os.path.join( - PACKAGE_PATH, 
f"bins/{platform.system()}/muscle" - ) + PRECOMPILED_MUSCLE_PATH = os.path.join(PACKAGE_PATH, f"bins/{platform.system()}/muscle") def muscle(fasta, super5=False, out=None, verbose=True): - """ - Align multiple nucleotide or amino acid sequences against each other (using the Muscle v5 algorithm). + """Align multiple nucleotide or amino acid sequences against each other (using the Muscle v5 algorithm). Args: - fasta List of sequences or path to fasta file containing the sequences to be aligned. @@ -63,7 +58,7 @@ def muscle(fasta, super5=False, out=None, verbose=True): abs_out_path = os.path.abspath(out) # Compile muscle if it is not already compiled - if os.path.isfile(PRECOMPILED_MUSCLE_PATH) == False: + if not os.path.isfile(PRECOMPILED_MUSCLE_PATH): # Compile muscle compile_muscle() muscle_path = MUSCLE_PATH @@ -115,15 +110,14 @@ def muscle(fasta, super5=False, out=None, verbose=True): return else: if verbose: - logger.info( - f"MUSCLE alignment complete. Alignment time: {round(time.time() - start_time, 2)} seconds" - ) + logger.info(f"MUSCLE alignment complete. 
Alignment time: {round(time.time() - start_time, 2)} seconds") if out is None: ## Print cleaned up muscle output # Get the titles and sequences from the generated .afa file titles = [] seqs_master = [] + seqs = [] with open(abs_out_path) as aln_file: for i, line in enumerate(aln_file): # Recognize title lines by the '>' character diff --git a/gget/gget_mutate.py b/gget/gget_mutate.py index ff68f4d1c..353b146eb 100644 --- a/gget/gget_mutate.py +++ b/gget/gget_mutate.py @@ -1,13 +1,13 @@ -import pandas as pd +import os import re -from tqdm import tqdm + import numpy as np -import os -from typing import Union, List, Optional +import pandas as pd +from tqdm import tqdm tqdm.pandas() -from .utils import read_fasta, set_up_logger +from .utils import read_fasta, set_up_logger # noqa: E402 logger = set_up_logger() @@ -20,7 +20,9 @@ cosmic_incorrect_wt_base = 0 mut_idx_outside_seq = 0 -mutation_pattern = r"(?:c|g)\.([0-9_\-\+\*]+)([a-zA-Z>]+)" # more complex: r'c\.([0-9_\-\+\*\(\)\?]+)([a-zA-Z>\(\)0-9]+)' +mutation_pattern = ( + r"(?:c|g)\.([0-9_\-\+\*]+)([a-zA-Z>]+)" # more complex: r'c\.([0-9_\-\+\*\(\)\?]+)([a-zA-Z>\(\)0-9]+)' +) # Get complement complement = { @@ -110,6 +112,7 @@ def convert_chromosome_value_to_int_when_possible(val): + """Convert a chromosome value to an integer string when possible, otherwise return it as a string.""" try: # Try to convert the value to a float, then to an int, and finally to a string return str(int(float(val))) @@ -118,9 +121,8 @@ def convert_chromosome_value_to_int_when_possible(val): return str(val) -def merge_gtf_transcript_locations_into_cosmic_csv( - mutations, gtf_path, gtf_transcript_id_column -): +def merge_gtf_transcript_locations_into_cosmic_csv(mutations, gtf_path, gtf_transcript_id_column): + """Merge transcript start/end positions and strand from a GTF file into the mutations DataFrame.""" gtf_df = pd.read_csv( gtf_path, sep="\t", @@ -146,9 +148,7 @@ def merge_gtf_transcript_locations_into_cosmic_csv( 
gtf_df["transcript_id"] = gtf_df["attribute"].str.extract('transcript_id "([^"]+)"') - assert len(gtf_df["transcript_id"]) == len( - set(gtf_df["transcript_id"]) - ), "Duplicate transcript_id values found!" + assert len(gtf_df["transcript_id"]) == len(set(gtf_df["transcript_id"])), "Duplicate transcript_id values found!" # Filter out rows where transcript_id is NaN gtf_df = gtf_df.dropna(subset=["transcript_id"]) @@ -164,22 +164,20 @@ def merge_gtf_transcript_locations_into_cosmic_csv( merged_df = pd.merge(mutations, gtf_df, on=gtf_transcript_id_column, how="left") # Fill NaN values - merged_df["start_transcript_position"] = merged_df[ - "start_transcript_position" - ].fillna(0) - merged_df["end_transcript_position"] = merged_df["end_transcript_position"].fillna( - 9999999 - ) + merged_df["start_transcript_position"] = merged_df["start_transcript_position"].fillna(0) + merged_df["end_transcript_position"] = merged_df["end_transcript_position"].fillna(9999999) merged_df["strand"] = merged_df["strand"].fillna(".") return merged_df def get_sequence_length(seq_id, seq_dict): + """Return the length of the sequence stored under seq_id in seq_dict.""" return len(seq_dict.get(seq_id, "")) def get_nucleotide_at_position(seq_id, pos, seq_dict): + """Return the nucleotide at the given position in the sequence for seq_id, or None if out of range.""" full_seq = seq_dict.get(seq_id, "") if pos < len(full_seq): return full_seq[pos] @@ -187,12 +185,11 @@ def get_nucleotide_at_position(seq_id, pos, seq_dict): def translate_sequence(sequence, start, end): + """Translate a nucleotide sequence into an amino acid sequence between start and end.""" amino_acid_sequence = "" for i in range(start, end, 3): codon = sequence[i : i + 3].upper() - amino_acid = codon_to_amino_acid.get( - codon, "X" - ) # Use 'X' for unknown or incomplete codons + amino_acid = codon_to_amino_acid.get(codon, "X") # Use 'X' for unknown or incomplete codons amino_acid_sequence += amino_acid return 
amino_acid_sequence @@ -203,15 +200,15 @@ def translate_sequence(sequence, start, end): def remove_gt_after_semicolon(line): + """Remove leading '>' characters from each semicolon-separated part except the first.""" parts = line.split(";") # Remove '>' from the beginning of each part except the first part parts = [parts[0]] + [part.lstrip(">") for part in parts[1:]] return ";".join(parts) -def wt_fragment_and_mutant_fragment_share_kmer( - mutated_fragment: str, wildtype_fragment: str, k: int -) -> bool: +def wt_fragment_and_mutant_fragment_share_kmer(mutated_fragment: str, wildtype_fragment: str, k: int) -> bool: + """Return True if the mutated fragment shares any k-mer with the wildtype fragment.""" if len(mutated_fragment) <= k: if mutated_fragment in wildtype_fragment: return True @@ -227,9 +224,8 @@ def wt_fragment_and_mutant_fragment_share_kmer( def add_mutation_type(mutations, mut_column): - mutations["mutation_type_id"] = mutations[mut_column].str.extract(mutation_pattern)[ - 1 - ] + """Add a 'mutation_type' column to the mutations DataFrame based on the mutation notation.""" + mutations["mutation_type_id"] = mutations[mut_column].str.extract(mutation_pattern)[1] # Define conditions and choices for the mutation types conditions = [ @@ -262,15 +258,15 @@ def add_mutation_type(mutations, mut_column): def extract_sequence(row, seq_dict, seq_id_column="seq_ID"): + """Extract the subsequence spanning the mutation positions for a row, or None if positions are missing.""" if pd.isna(row["start_mutation_position"]) or pd.isna(row["end_mutation_position"]): return None - seq = seq_dict[row[seq_id_column]][ - int(row["start_mutation_position"]) : int(row["end_mutation_position"]) + 1 - ] + seq = seq_dict[row[seq_id_column]][int(row["start_mutation_position"]) : int(row["end_mutation_position"]) + 1] return seq def common_prefix_length(s1, s2): + """Return the length of the common prefix shared by s1 and s2.""" min_len = min(len(s1), len(s2)) for i in range(min_len): 
if s1[i] != s2[i]: @@ -280,6 +276,7 @@ def common_prefix_length(s1, s2): # Function to find the length of the common suffix with the prefix def common_suffix_length(s1, s2): + """Return the length of the common suffix shared by s1 and s2.""" min_len = min(len(s1), len(s2)) for i in range(min_len): if s1[-(i + 1)] != s2[-(i + 1)]: @@ -288,6 +285,7 @@ def common_suffix_length(s1, s2): def count_repeat_right_flank(mut_nucleotides, right_flank_region): + """Count the total overlap length of repeated mut_nucleotides at the start of the right flank region.""" total_overlap_len = 0 while right_flank_region.startswith(mut_nucleotides): total_overlap_len += len(mut_nucleotides) @@ -297,6 +295,7 @@ def count_repeat_right_flank(mut_nucleotides, right_flank_region): def count_repeat_left_flank(mut_nucleotides, left_flank_region): + """Count the total overlap length of repeated mut_nucleotides at the end of the left flank region.""" total_overlap_len = 0 while left_flank_region.endswith(mut_nucleotides): total_overlap_len += len(mut_nucleotides) @@ -306,6 +305,7 @@ def count_repeat_left_flank(mut_nucleotides, left_flank_region): def beginning_mut_nucleotides_with_right_flank(mut_nucleotides, right_flank_region): + """Return the overlap length between mut_nucleotides and the beginning of the right flank region.""" if mut_nucleotides == right_flank_region[: len(mut_nucleotides)]: return count_repeat_right_flank(mut_nucleotides, right_flank_region) else: @@ -314,6 +314,7 @@ def beginning_mut_nucleotides_with_right_flank(mut_nucleotides, right_flank_regi # Comparing end of mut_nucleotides to the end of left_flank_region def end_mut_nucleotides_with_left_flank(mut_nucleotides, left_flank_region): + """Return the overlap length between mut_nucleotides and the end of the left flank region.""" if mut_nucleotides == left_flank_region[-len(mut_nucleotides) :]: return count_repeat_left_flank(mut_nucleotides, left_flank_region) else: @@ -321,6 +322,7 @@ def 
end_mut_nucleotides_with_left_flank(mut_nucleotides, left_flank_region): def calculate_beginning_mutation_overlap_with_right_flank(row): + """Calculate the overlap between the beginning of a row's mutation and its right flank region.""" if row["mutation_type"] == "deletion": sequence_to_check = row["wt_nucleotides_ensembl"] else: @@ -331,12 +333,11 @@ def calculate_beginning_mutation_overlap_with_right_flank(row): else: original_sequence = row["right_flank_region"] - return beginning_mut_nucleotides_with_right_flank( - sequence_to_check, original_sequence - ) + return beginning_mut_nucleotides_with_right_flank(sequence_to_check, original_sequence) def calculate_end_mutation_overlap_with_left_flank(row): + """Calculate the overlap between the end of a row's mutation and its left flank region.""" if row["mutation_type"] == "deletion": sequence_to_check = row["wt_nucleotides_ensembl"] else: @@ -351,30 +352,30 @@ def calculate_end_mutation_overlap_with_left_flank(row): def mutate( - sequences: Union[str, List[str]], - mutations: Union[str, List[str]], + sequences: str | list[str], + mutations: str | list[str], mut_column: str = "mutation", seq_id_column: str = "seq_ID", - mut_id_column: Optional[str] = None, - gtf: Optional[str] = None, - gtf_transcript_id_column: Optional[str] = None, + mut_id_column: str | None = None, + gtf: str | None = None, + gtf_transcript_id_column: str | None = None, k: int = 30, - min_seq_len: Optional[int] = None, + min_seq_len: int | None = None, optimize_flanking_regions: bool = False, remove_seqs_with_wt_kmers: bool = False, - max_ambiguous: Optional[int] = None, + max_ambiguous: int | None = None, merge_identical: bool = True, update_df: bool = False, - update_df_out: Optional[str] = None, + update_df_out: str | None = None, store_full_sequences: bool = False, translate: bool = False, - translate_start: Union[int, str, None] = None, - translate_end: Union[int, str, None] = None, - out: Optional[str] = None, + translate_start: int | str | 
None = None, + translate_end: int | str | None = None, + out: str | None = None, verbose: bool = True, ): - """ - Takes in nucleotide sequences and mutations (in standard mutation annotation - see below) + """Takes in nucleotide sequences and mutations (in standard mutation annotation - see below) + and returns mutated versions of the input sequences according to the provided mutations. Reuiqred input argument: @@ -461,8 +462,14 @@ def mutate( Saves mutated sequences in fasta format (or, if out=None: when update_df is True, returns the mutation dataframe, otherwise returns a list containing the mutated sequences). """ - - global intronic_mutations, posttranslational_region_mutations, unknown_mutations, uncertain_mutations, ambiguous_position_mutations, cosmic_incorrect_wt_base, mut_idx_outside_seq + global \ + intronic_mutations, \ + posttranslational_region_mutations, \ + unknown_mutations, \ + uncertain_mutations, \ + ambiguous_position_mutations, \ + cosmic_incorrect_wt_base, \ + mut_idx_outside_seq columns_to_keep = [ "header", @@ -472,7 +479,7 @@ def mutate( "wt_sequence", "mutant_sequence", "start_mutation_position", - "end_mutation_position" + "end_mutation_position", ] # Load input sequences and their identifiers from fasta file @@ -481,7 +488,7 @@ def mutate( # Handle input sequences passed as a list elif isinstance(sequences, list): - titles = [f"seq{i+1}" for i in range(len(sequences))] + titles = [f"seq{i + 1}" for i in range(len(sequences))] seqs = sequences # Handle a single sequence passed as a string @@ -492,7 +499,7 @@ def mutate( else: raise ValueError( """ - Format of the input to the 'sequences' argument not recognized. + Format of the input to the 'sequences' argument not recognized. 'sequences' must be one of the following: - Path to the fasta file containing the sequences to be mutated (e.g. 'seqs.fa') - A list of sequences to be mutated (e.g. 
['ACTGCTAGCT', 'AGCTAGCT']) @@ -508,18 +515,14 @@ def mutate( mutations = pd.read_csv(mutations) for col in mutations.columns: if col not in columns_to_keep: - columns_to_keep.append( - col - ) # append "mutation_aa", "gene_name", "mutation_id" + columns_to_keep.append(col) # append "mutation_aa", "gene_name", "mutation_id" elif isinstance(mutations, str) and mutations.endswith(".tsv"): mutations_path = mutations mutations = pd.read_csv(mutations, sep="\t") for col in mutations.columns: if col not in columns_to_keep: - columns_to_keep.append( - col - ) # append "mutation_aa", "gene_name", "mutation_id" + columns_to_keep.append(col) # append "mutation_aa", "gene_name", "mutation_id" # Handle mutations passed as a list elif isinstance(mutations, list): @@ -531,14 +534,14 @@ def mutate( temp = pd.DataFrame() temp["mutation"] = mutations - temp["mut_ID"] = [f"mut{i+1}" for i in range(len(mutations))] - temp["seq_ID"] = [f"seq{i+1}" for i in range(len(mutations))] + temp["mut_ID"] = [f"mut{i + 1}" for i in range(len(mutations))] + temp["seq_ID"] = [f"seq{i + 1}" for i in range(len(mutations))] mutations = temp else: temp = pd.DataFrame() temp["mutation"] = [mutations[0]] * len(seqs) - temp["mut_ID"] = [f"mut{i+1}" for i in range(len(seqs))] - temp["seq_ID"] = [f"seq{i+1}" for i in range(len(seqs))] + temp["mut_ID"] = [f"mut{i + 1}" for i in range(len(seqs))] + temp["seq_ID"] = [f"seq{i + 1}" for i in range(len(seqs))] mutations = temp # Handle single mutation passed as a string @@ -546,8 +549,8 @@ def mutate( # This will work for one mutation for one sequence as well as one mutation for multiple sequences temp = pd.DataFrame() temp["mutation"] = [mutations] * len(seqs) - temp["mut_ID"] = [f"mut{i+1}" for i in range(len(seqs))] - temp["seq_ID"] = [f"seq{i+1}" for i in range(len(seqs))] + temp["mut_ID"] = [f"mut{i + 1}" for i in range(len(seqs))] + temp["seq_ID"] = [f"seq{i + 1}" for i in range(len(seqs))] mutations = temp elif isinstance(mutations, pd.DataFrame): @@ 
-556,7 +559,7 @@ def mutate( else: raise ValueError( """ - Format of the input to the 'mutations' argument not recognized. + Format of the input to the 'mutations' argument not recognized. 'mutations' must be one of the following: - Path to comma-separated csv file (e.g. 'mutations.csv') - A pandas DataFrame object @@ -570,7 +573,7 @@ def mutate( seq_dict = {} non_nuc_seqs = 0 - for title, seq in zip(titles, seqs): + for title, seq in zip(titles, seqs, strict=False): # Check that sequences are nucleotide sequences if not set(seq) <= nucleotides: non_nuc_seqs += 1 @@ -583,7 +586,7 @@ def mutate( logger.warning( f""" Non-nucleotide characters detected in {non_nuc_seqs} input sequences. gget mutate is currently only optimized for mutating nucleotide sequences. - Specifically inversion mutations might not be performed correctly. + Specifically inversion mutations might not be performed correctly. """ ) @@ -600,9 +603,7 @@ def mutate( mutations = mutations.dropna(subset=[seq_id_column]) # ensure seq_ID column is string type, and chromosome numbers don't have decimals - mutations[seq_id_column] = mutations[seq_id_column].apply( - convert_chromosome_value_to_int_when_possible - ) + mutations[seq_id_column] = mutations[seq_id_column].apply(convert_chromosome_value_to_int_when_possible) mutations = add_mutation_type(mutations, mut_column) @@ -615,16 +616,16 @@ def mutate( if 0 < len(seqs_not_found) < 20: logger.warning( f""" - The sequences with the following {len(seqs_not_found)} sequence ID(s) were not found: {", ".join(seqs_not_found[seq_id_column].values)} - These sequences and their corresponding mutations will not be included in the output. + The sequences with the following {len(seqs_not_found)} sequence ID(s) were not found: {", ".join(seqs_not_found[seq_id_column].values)} + These sequences and their corresponding mutations will not be included in the output. 
Ensure that the sequence IDs correspond to the string following the > character in the 'sequences' fasta file (do NOT include spaces or dots). """ ) elif len(seqs_not_found) > 0: logger.warning( f""" - The sequences corresponding to {len(seqs_not_found)} sequence IDs were not found. - These sequences and their corresponding mutations will not be included in the output. + The sequences corresponding to {len(seqs_not_found)} sequence IDs were not found. + These sequences and their corresponding mutations will not be included in the output. Ensure that the sequence IDs correspond to the string following the > character in the 'sequences' fasta file (do NOT include spaces or dots). """ ) @@ -634,7 +635,7 @@ def mutate( if len(mutations) < 1: raise ValueError( """ - None of the input sequences match the sequence IDs provided in 'mutations'. + None of the input sequences match the sequence IDs provided in 'mutations'. Ensure that the sequence IDs correspond to the string following the > character in the 'sequences' fasta file (do NOT include spaces or dots). 
""" ) @@ -645,9 +646,7 @@ def mutate( mut_id_column = mut_column mutations["mutant_sequence"] = "" - mutations["header"] = ( - ">" + mutations[seq_id_column] + ":" + mutations[mut_id_column] - ) + mutations["header"] = ">" + mutations[seq_id_column] + ":" + mutations[mut_id_column] # Calculate number of bad mutations uncertain_mutations = mutations[mut_column].str.contains(r"\?").sum() @@ -664,9 +663,7 @@ def mutate( mutations = mutations[~mask] # Extract nucleotide positions and mutation info from Mutation CDS - mutations[["nucleotide_positions", "actual_mutation"]] = mutations[ - mut_column - ].str.extract(mutation_pattern) + mutations[["nucleotide_positions", "actual_mutation"]] = mutations[mut_column].str.extract(mutation_pattern) # Filter out mutations that did not match the re unknown_mutations = mutations["nucleotide_positions"].isna().sum() @@ -681,15 +678,13 @@ def mutate( mutations["start_mutation_position"] = split_positions[0] if split_positions.shape[1] > 1: - mutations["end_mutation_position"] = split_positions[1].fillna( - split_positions[0] - ) + mutations["end_mutation_position"] = split_positions[1].fillna(split_positions[0]) else: mutations["end_mutation_position"] = mutations["start_mutation_position"] - mutations.loc[ - mutations["end_mutation_position"].isna(), "end_mutation_position" - ] = mutations["start_mutation_position"] + mutations.loc[mutations["end_mutation_position"].isna(), "end_mutation_position"] = mutations[ + "start_mutation_position" + ] mutations[["start_mutation_position", "end_mutation_position"]] = mutations[ ["start_mutation_position", "end_mutation_position"] @@ -700,14 +695,12 @@ def mutate( mutations["end_mutation_position"] -= 1 # don't forget to increment by 1 later # Calculate sequence length - mutations["sequence_length"] = mutations[seq_id_column].apply( - lambda x: get_sequence_length(x, seq_dict) - ) + mutations["sequence_length"] = mutations[seq_id_column].apply(lambda x: get_sequence_length(x, seq_dict)) # 
Filter out mutations with positions outside the sequence - index_error_mask = ( - mutations["start_mutation_position"] > mutations["sequence_length"] - ) | (mutations["end_mutation_position"] > mutations["sequence_length"]) + index_error_mask = (mutations["start_mutation_position"] > mutations["sequence_length"]) | ( + mutations["end_mutation_position"] > mutations["sequence_length"] + ) mut_idx_outside_seq = index_error_mask.sum() @@ -728,33 +721,15 @@ def mutate( if remove_seqs_with_wt_kmers: long_duplications = ( - (duplication_mask) - & ( - ( - mutations["end_mutation_position"] - - mutations["start_mutation_position"] - ) - >= k - ) + (duplication_mask) & ((mutations["end_mutation_position"] - mutations["start_mutation_position"]) >= k) ).sum() logger.info(f"Removing {long_duplications} duplications > k") mutations = mutations[ - ~( - (duplication_mask) - & ( - ( - mutations["end_mutation_position"] - - mutations["start_mutation_position"] - ) - >= k - ) - ) + ~((duplication_mask) & ((mutations["end_mutation_position"] - mutations["start_mutation_position"]) >= k)) ] # Create a mask for all non-substitution mutations - non_substitution_mask = ( - deletion_mask | delins_mask | insertion_mask | duplication_mask | inversion_mask - ) + non_substitution_mask = deletion_mask | delins_mask | insertion_mask | duplication_mask | inversion_mask # Extract the WT nucleotides for the substitution rows from reference fasta (i.e., Ensembl) start_positions = mutations.loc[substitution_mask, "start_mutation_position"].values @@ -763,27 +738,19 @@ def mutate( wt_nucleotides_substitution = np.array( [ get_nucleotide_at_position(seq_id, pos, seq_dict) - for seq_id, pos in zip( - mutations.loc[substitution_mask, seq_id_column], start_positions - ) + for seq_id, pos in zip(mutations.loc[substitution_mask, seq_id_column], start_positions, strict=False) ] ) - mutations.loc[substitution_mask, "wt_nucleotides_ensembl"] = ( - wt_nucleotides_substitution - ) + 
mutations.loc[substitution_mask, "wt_nucleotides_ensembl"] = wt_nucleotides_substitution # Extract the WT nucleotides for the substitution rows from the Mutation CDS (i.e., COSMIC) mutations["wt_nucleotides_cosmic"] = None - mutations.loc[substitution_mask, "wt_nucleotides_cosmic"] = mutations[ - "actual_mutation" - ].str[0] + mutations.loc[substitution_mask, "wt_nucleotides_cosmic"] = mutations["actual_mutation"].str[0] - congruent_wt_bases_mask = ( - mutations["wt_nucleotides_cosmic"] == mutations["wt_nucleotides_ensembl"] - ) | mutations[["wt_nucleotides_cosmic", "wt_nucleotides_ensembl"]].isna().any( - axis=1 - ) + congruent_wt_bases_mask = (mutations["wt_nucleotides_cosmic"] == mutations["wt_nucleotides_ensembl"]) | mutations[ + ["wt_nucleotides_cosmic", "wt_nucleotides_ensembl"] + ].isna().any(axis=1) cosmic_incorrect_wt_base = (~congruent_wt_bases_mask).sum() @@ -794,40 +761,33 @@ def mutate( return mutations if update_df else [] # Adjust the start and end positions for insertions - mutations.loc[ - insertion_mask, "start_mutation_position" - ] += 1 # in other cases, we want left flank to exclude the start of mutation site; but with insertion, the start of mutation site as it is denoted still belongs in the flank region - mutations.loc[ - insertion_mask, "end_mutation_position" - ] -= 1 # in this notation, the end position is one before the start position + mutations.loc[insertion_mask, "start_mutation_position"] += ( + 1 # in other cases, we want left flank to exclude the start of mutation site; but with insertion, the start of mutation site as it is denoted still belongs in the flank region + ) + mutations.loc[insertion_mask, "end_mutation_position"] -= ( + 1 # in this notation, the end position is one before the start position + ) # Extract the WT nucleotides for the non-substitution rows from the Mutation CDS (i.e., COSMIC) - mutations.loc[non_substitution_mask, "wt_nucleotides_ensembl"] = mutations.loc[ - non_substitution_mask - ].apply(lambda row: 
extract_sequence(row, seq_dict, seq_id_column), axis=1) + mutations.loc[non_substitution_mask, "wt_nucleotides_ensembl"] = mutations.loc[non_substitution_mask].apply( + lambda row: extract_sequence(row, seq_dict, seq_id_column), axis=1 + ) # Apply mutations to the sequences mutations["mut_nucleotides"] = None - mutations.loc[substitution_mask, "mut_nucleotides"] = mutations.loc[ - substitution_mask, "actual_mutation" - ].str[-1] + mutations.loc[substitution_mask, "mut_nucleotides"] = mutations.loc[substitution_mask, "actual_mutation"].str[-1] mutations.loc[deletion_mask, "mut_nucleotides"] = "" - mutations.loc[delins_mask, "mut_nucleotides"] = mutations.loc[ - delins_mask, "actual_mutation" - ].str.extract(r"delins([A-Z]+)")[0] - mutations.loc[insertion_mask, "mut_nucleotides"] = mutations.loc[ - insertion_mask, "actual_mutation" - ].str.extract(r"ins([A-Z]+)")[0] - mutations.loc[duplication_mask, "mut_nucleotides"] = mutations.loc[ - duplication_mask - ].apply(lambda row: row["wt_nucleotides_ensembl"], axis=1) - mutations.loc[inversion_mask, "mut_nucleotides"] = mutations.loc[ - inversion_mask - ].apply( - lambda row: "".join( - complement.get(nucleotide, "N") - for nucleotide in row["wt_nucleotides_ensembl"][::-1] - ), + mutations.loc[delins_mask, "mut_nucleotides"] = mutations.loc[delins_mask, "actual_mutation"].str.extract( + r"delins([A-Z]+)" + )[0] + mutations.loc[insertion_mask, "mut_nucleotides"] = mutations.loc[insertion_mask, "actual_mutation"].str.extract( + r"ins([A-Z]+)" + )[0] + mutations.loc[duplication_mask, "mut_nucleotides"] = mutations.loc[duplication_mask].apply( + lambda row: row["wt_nucleotides_ensembl"], axis=1 + ) + mutations.loc[inversion_mask, "mut_nucleotides"] = mutations.loc[inversion_mask].apply( + lambda row: "".join(complement.get(nucleotide, "N") for nucleotide in row["wt_nucleotides_ensembl"][::-1]), axis=1, ) @@ -840,31 +800,22 @@ def mutate( # Calculate the kmer bounds mutations["start_kmer_position_min"] = 
mutations["start_mutation_position"] - k - mutations["start_kmer_position"] = mutations["start_kmer_position_min"].combine( - 0, max - ) + mutations["start_kmer_position"] = mutations["start_kmer_position_min"].combine(0, max) mutations["end_kmer_position_max"] = mutations["end_mutation_position"] + k - mutations["end_kmer_position"] = mutations[ - ["end_kmer_position_max", "sequence_length"] - ].min( + mutations["end_kmer_position"] = mutations[["end_kmer_position_max", "sequence_length"]].min( axis=1 ) # don't forget to increment by 1 later on if gtf is not None: - assert mutations_path.endswith(".csv") or mutations_path.endswith( - ".tsv" - ), "Mutations must be a CSV or TSV file" + assert mutations_path.endswith(".csv") or mutations_path.endswith(".tsv"), "Mutations must be a CSV or TSV file" if ( - "start_transcript_position" not in mutations.columns - and "end_transcript_position" not in mutations.columns + "start_transcript_position" not in mutations.columns and "end_transcript_position" not in mutations.columns ): # * currently hard-coded column names, but optionally can be changed to arguments later mutations = merge_gtf_transcript_locations_into_cosmic_csv( mutations, gtf, gtf_transcript_id_column=gtf_transcript_id_column ) - columns_to_keep.extend( - ["start_transcript_position", "end_transcript_position", "strand"] - ) + columns_to_keep.extend(["start_transcript_position", "end_transcript_position", "strand"]) else: logger.warning( "Transcript positions already present in the input mutations file. Skipping GTF file merging." 
@@ -873,18 +824,10 @@ def mutate( # adjust start_transcript_position to be 0-index mutations["start_transcript_position"] -= 1 - mutations["start_kmer_position"] = mutations[ - ["start_kmer_position", "start_transcript_position"] - ].max(axis=1) - mutations["end_kmer_position"] = mutations[ - ["end_kmer_position", "end_transcript_position"] - ].min(axis=1) - - mut_apply = ( - (lambda *args, **kwargs: mutations.progress_apply(*args, **kwargs)) - if verbose - else mutations.apply - ) + mutations["start_kmer_position"] = mutations[["start_kmer_position", "start_transcript_position"]].max(axis=1) + mutations["end_kmer_position"] = mutations[["end_kmer_position", "end_transcript_position"]].min(axis=1) + + mut_apply = (lambda *args, **kwargs: mutations.progress_apply(*args, **kwargs)) if verbose else mutations.apply if update_df and store_full_sequences: # Extract flank sequences @@ -892,9 +835,7 @@ def mutate( tqdm.pandas(desc="Extracting full left flank sequences") mutations["left_flank_region_full"] = mut_apply( - lambda row: seq_dict[row[seq_id_column]][ - 0 : row["start_mutation_position"] - ], + lambda row: seq_dict[row[seq_id_column]][0 : row["start_mutation_position"]], axis=1, ) # ? vectorize @@ -902,9 +843,7 @@ def mutate( tqdm.pandas(desc="Extracting full right flank sequences") mutations["right_flank_region_full"] = mut_apply( - lambda row: seq_dict[row[seq_id_column]][ - row["end_mutation_position"] + 1 : row["sequence_length"] - ], + lambda row: seq_dict[row[seq_id_column]][row["end_mutation_position"] + 1 : row["sequence_length"]], axis=1, ) # ? vectorize @@ -912,9 +851,7 @@ def mutate( tqdm.pandas(desc="Extracting k-mer left flank sequences") mutations["left_flank_region"] = mut_apply( - lambda row: seq_dict[row[seq_id_column]][ - row["start_kmer_position"] : row["start_mutation_position"] - ], + lambda row: seq_dict[row[seq_id_column]][row["start_kmer_position"] : row["start_mutation_position"]], axis=1, ) # ? 
vectorize @@ -922,9 +859,7 @@ def mutate( tqdm.pandas(desc="Extracting k-mer right flank sequences") mutations["right_flank_region"] = mut_apply( - lambda row: seq_dict[row[seq_id_column]][ - row["end_mutation_position"] + 1 : row["end_kmer_position"] + 1 - ], + lambda row: seq_dict[row[seq_id_column]][row["end_mutation_position"] + 1 : row["end_kmer_position"] + 1], axis=1, ) # ? vectorize @@ -945,31 +880,25 @@ def mutate( if optimize_flanking_regions: # Apply the function for beginning of mut_nucleotides with right_flank_region - mutations.loc[ - non_substitution_mask, "beginning_mutation_overlap_with_right_flank" - ] = mutations.loc[non_substitution_mask].apply( - calculate_beginning_mutation_overlap_with_right_flank, axis=1 - ) + mutations.loc[non_substitution_mask, "beginning_mutation_overlap_with_right_flank"] = mutations.loc[ + non_substitution_mask + ].apply(calculate_beginning_mutation_overlap_with_right_flank, axis=1) # Apply the function for end of mut_nucleotides with left_flank_region - mutations.loc[non_substitution_mask, "end_mutation_overlap_with_left_flank"] = ( - mutations.loc[non_substitution_mask].apply( - calculate_end_mutation_overlap_with_left_flank, axis=1 - ) - ) + mutations.loc[non_substitution_mask, "end_mutation_overlap_with_left_flank"] = mutations.loc[ + non_substitution_mask + ].apply(calculate_end_mutation_overlap_with_left_flank, axis=1) # Calculate k-len(flank) (see above instructions) - mutations.loc[non_substitution_mask, "k_minus_left_flank_length"] = ( - k - mutations.loc[non_substitution_mask, "left_flank_region"].apply(len) - ) - mutations.loc[non_substitution_mask, "k_minus_right_flank_length"] = ( - k - mutations.loc[non_substitution_mask, "right_flank_region"].apply(len) - ) + mutations.loc[non_substitution_mask, "k_minus_left_flank_length"] = k - mutations.loc[ + non_substitution_mask, "left_flank_region" + ].apply(len) + mutations.loc[non_substitution_mask, "k_minus_right_flank_length"] = k - mutations.loc[ + 
non_substitution_mask, "right_flank_region" + ].apply(len) mutations.loc[non_substitution_mask, "updated_left_flank_start"] = np.maximum( - mutations.loc[ - non_substitution_mask, "beginning_mutation_overlap_with_right_flank" - ] + mutations.loc[non_substitution_mask, "beginning_mutation_overlap_with_right_flank"] - mutations.loc[non_substitution_mask, "k_minus_left_flank_length"], 0, ) @@ -979,12 +908,8 @@ def mutate( 0, ) - mutations["updated_left_flank_start"] = ( - mutations["updated_left_flank_start"].fillna(0).astype(int) - ) - mutations["updated_right_flank_end"] = ( - mutations["updated_right_flank_end"].fillna(0).astype(int) - ) + mutations["updated_left_flank_start"] = mutations["updated_left_flank_start"].fillna(0).astype(int) + mutations["updated_right_flank_end"] = mutations["updated_right_flank_end"].fillna(0).astype(int) else: mutations["updated_left_flank_start"] = 0 @@ -998,14 +923,12 @@ def mutate( ) # Create WT non-substitution k-mer sequences - mutations.loc[non_substitution_mask, "wt_sequence"] = mutations.loc[ - non_substitution_mask - ].apply( - lambda row: row["left_flank_region"][row["updated_left_flank_start"] :] - + row["wt_nucleotides_ensembl"] - + row["right_flank_region"][ - : len(row["right_flank_region"]) - row["updated_right_flank_end"] - ], + mutations.loc[non_substitution_mask, "wt_sequence"] = mutations.loc[non_substitution_mask].apply( + lambda row: ( + row["left_flank_region"][row["updated_left_flank_start"] :] + + row["wt_nucleotides_ensembl"] + + row["right_flank_region"][: len(row["right_flank_region"]) - row["updated_right_flank_end"]] + ), axis=1, ) @@ -1017,22 +940,18 @@ def mutate( ) # Create mutant non-substitution k-mer sequences - mutations.loc[non_substitution_mask, "mutant_sequence"] = mutations.loc[ - non_substitution_mask - ].apply( - lambda row: row["left_flank_region"][row["updated_left_flank_start"] :] - + row["mut_nucleotides"] - + row["right_flank_region"][ - : len(row["right_flank_region"]) - 
row["updated_right_flank_end"] - ], + mutations.loc[non_substitution_mask, "mutant_sequence"] = mutations.loc[non_substitution_mask].apply( + lambda row: ( + row["left_flank_region"][row["updated_left_flank_start"] :] + + row["mut_nucleotides"] + + row["right_flank_region"][: len(row["right_flank_region"]) - row["updated_right_flank_end"]] + ), axis=1, ) if remove_seqs_with_wt_kmers: if verbose: - tqdm.pandas( - desc="Removing mutant fragments that share a kmer with wt fragments" - ) + tqdm.pandas(desc="Removing mutant fragments that share a kmer with wt fragments") mutations["wt_fragment_and_mutant_fragment_share_kmer"] = mut_apply( lambda row: wt_fragment_and_mutant_fragment_share_kmer( @@ -1043,9 +962,7 @@ def mutate( axis=1, ) - mutations_overlapping_with_wt = mutations[ - "wt_fragment_and_mutant_fragment_share_kmer" - ].sum() + mutations_overlapping_with_wt = mutations["wt_fragment_and_mutant_fragment_share_kmer"].sum() mutations = mutations[~mutations["wt_fragment_and_mutant_fragment_share_kmer"]] @@ -1054,9 +971,7 @@ def mutate( # Create full sequences (substitution and non-substitution) mutations["mutant_sequence_full"] = ( - mutations["left_flank_region_full"] - + mutations["mut_nucleotides"] - + mutations["right_flank_region_full"] + mutations["left_flank_region_full"] + mutations["mut_nucleotides"] + mutations["right_flank_region_full"] ) # Calculate k-mer lengths and report the distribution @@ -1067,16 +982,12 @@ def mutate( max_length = mutations["mutant_sequence_kmer_length"].max() if min_seq_len: - rows_less_than_minimum = ( - mutations["mutant_sequence_kmer_length"] < min_seq_len - ).sum() + rows_less_than_minimum = (mutations["mutant_sequence_kmer_length"] < min_seq_len).sum() mutations = mutations[mutations["mutant_sequence_kmer_length"] >= min_seq_len] if verbose: - logger.info( - f"Removed {rows_less_than_minimum} mutant kmers with length less than {min_seq_len}..." 
- ) + logger.info(f"Removed {rows_less_than_minimum} mutant kmers with length less than {min_seq_len}...") if max_ambiguous is not None: # Get number of 'N' or 'n' occuring in the sequence @@ -1085,9 +996,7 @@ def mutate( mutations = mutations[mutations["num_N"] <= max_ambiguous] if verbose: - logger.info( - f"Removed {num_rows_with_N} mutant kmers containing more than {max_ambiguous} 'N's..." - ) + logger.info(f"Removed {num_rows_with_N} mutant kmers containing more than {max_ambiguous} 'N's...") # Drop the 'num_N' column after filtering mutations = mutations.drop(columns=["num_N"]) @@ -1097,16 +1006,14 @@ def mutate( bins = range(0, max_length + 6, 5) # Bin the lengths and count the number of elements in each bin - binned_lengths = pd.cut( - mutations["mutant_sequence_kmer_length"], bins=bins, right=False - ) + binned_lengths = pd.cut(mutations["mutant_sequence_kmer_length"], bins=bins, right=False) bin_counts = binned_lengths.value_counts().sort_index() # Display the report if verbose: logger.debug("Report of the number of elements in each bin of width 5:") logger.debug(bin_counts) - except Exception as e: + except Exception: # noqa: BLE001 pass # split_cols = mutations[mut_id_column].str.split("_", n=1, expand=True) @@ -1133,27 +1040,27 @@ def mutate( # good_mutations = good_mutations - num_rows_with_N report = f""" - {good_mutations} mutations correctly recorded ({good_mutations/total_mutations*100:.2f}%) - {intronic_mutations} intronic mutations found ({intronic_mutations/total_mutations*100:.2f}%) - {posttranslational_region_mutations} posttranslational region mutations found ({posttranslational_region_mutations/total_mutations*100:.2f}%) - {unknown_mutations} unknown mutations found ({unknown_mutations/total_mutations*100:.2f}%) - {uncertain_mutations} mutations with uncertain mutation found ({uncertain_mutations/total_mutations*100:.2f}%) - {ambiguous_position_mutations} mutations with ambiguous position found 
({ambiguous_position_mutations/total_mutations*100:.2f}%) - {cosmic_incorrect_wt_base} mutations with incorrect wildtype base found ({cosmic_incorrect_wt_base/total_mutations*100:.2f}%) - {mut_idx_outside_seq} mutations with indices outside of the sequence length found ({mut_idx_outside_seq/total_mutations*100:.2f}%) + {good_mutations} mutations correctly recorded ({good_mutations / total_mutations * 100:.2f}%) + {intronic_mutations} intronic mutations found ({intronic_mutations / total_mutations * 100:.2f}%) + {posttranslational_region_mutations} posttranslational region mutations found ({posttranslational_region_mutations / total_mutations * 100:.2f}%) + {unknown_mutations} unknown mutations found ({unknown_mutations / total_mutations * 100:.2f}%) + {uncertain_mutations} mutations with uncertain mutation found ({uncertain_mutations / total_mutations * 100:.2f}%) + {ambiguous_position_mutations} mutations with ambiguous position found ({ambiguous_position_mutations / total_mutations * 100:.2f}%) + {cosmic_incorrect_wt_base} mutations with incorrect wildtype base found ({cosmic_incorrect_wt_base / total_mutations * 100:.2f}%) + {mut_idx_outside_seq} mutations with indices outside of the sequence length found ({mut_idx_outside_seq / total_mutations * 100:.2f}%) """ if remove_seqs_with_wt_kmers: - report += f"""{long_duplications} duplications longer than k found ({long_duplications/total_mutations*100:.2f}%) - {mutations_overlapping_with_wt} mutations with overlapping kmers found ({mutations_overlapping_with_wt/total_mutations*100:.2f}%) + report += f"""{long_duplications} duplications longer than k found ({long_duplications / total_mutations * 100:.2f}%) + {mutations_overlapping_with_wt} mutations with overlapping kmers found ({mutations_overlapping_with_wt / total_mutations * 100:.2f}%) """ if min_seq_len: - report += f"""{rows_less_than_minimum} mutations with fragment length < k found ({rows_less_than_minimum/total_mutations*100:.2f}%) + report += 
f"""{rows_less_than_minimum} mutations with fragment length < k found ({rows_less_than_minimum / total_mutations * 100:.2f}%) """ if max_ambiguous is not None: - report += f"""{num_rows_with_N} mutations with Ns found ({num_rows_with_N/total_mutations*100:.2f}%) + report += f"""{num_rows_with_N} mutations with Ns found ({num_rows_with_N / total_mutations * 100:.2f}%) """ if good_mutations != total_mutations: @@ -1165,9 +1072,9 @@ def mutate( columns_to_keep.extend(["wt_sequence_aa_full", "mutant_sequence_aa_full"]) if not mutations_path: - assert ( - type(translate_start) != str and type(translate_end) != str - ), "translate_start and translate_end must be integers when translating sequences (or default None)." + assert not isinstance(translate_start, str) and not isinstance(translate_end, str), ( + "translate_start and translate_end must be integers when translating sequences (or default None)." + ) if translate_start is None: translate_start = 0 if translate_end is None: @@ -1177,38 +1084,24 @@ def mutate( if verbose: tqdm.pandas(desc="Translating WT amino acid sequences") - mutations["wt_sequence_aa_full"] = mutations[ - "wt_sequence_full" - ].progress_apply( - lambda x: translate_sequence( - x, start=translate_start, end=translate_end - ) + mutations["wt_sequence_aa_full"] = mutations["wt_sequence_full"].progress_apply( + lambda x: translate_sequence(x, start=translate_start, end=translate_end) ) else: mutations["wt_sequence_aa_full"] = mutations["wt_sequence_full"].apply( - lambda x: translate_sequence( - x, start=translate_start, end=translate_end - ) + lambda x: translate_sequence(x, start=translate_start, end=translate_end) ) if verbose: tqdm.pandas(desc="Translating mutant amino acid sequences") - mutations["mutant_sequence_aa_full"] = mutations[ - "mutant_sequence_full" - ].progress_apply( - lambda x: translate_sequence( - x, start=translate_start, end=translate_end - ) + mutations["mutant_sequence_aa_full"] = 
mutations["mutant_sequence_full"].progress_apply( + lambda x: translate_sequence(x, start=translate_start, end=translate_end) ) else: - mutations["mutant_sequence_aa_full"] = mutations[ - "mutant_sequence_full" - ].apply( - lambda x: translate_sequence( - x, start=translate_start, end=translate_end - ) + mutations["mutant_sequence_aa_full"] = mutations["mutant_sequence_full"].apply( + lambda x: translate_sequence(x, start=translate_start, end=translate_end) ) print(f"Translated mutated sequences: {mutations['wt_sequence_aa_full']}") @@ -1229,9 +1122,7 @@ def mutate( tqdm.pandas(desc="Translating WT amino acid sequences") mutations["wt_sequence_aa_full"] = mut_apply( - lambda row: translate_sequence( - row["wt_sequence_full"], row[translate_start], row[translate_end] - ), + lambda row: translate_sequence(row["wt_sequence_full"], row[translate_start], row[translate_end]), axis=1, ) @@ -1257,17 +1148,13 @@ def mutate( ) mutations = ( mutations.groupby("mutant_sequence", sort=False) - .agg( - lambda x: ";".join(x.astype(str)) - ) # Concatenate values with semicolons + .agg(lambda x: ";".join(x.astype(str))) # Concatenate values with semicolons .reset_index() ) else: mutations = ( - mutations.groupby("mutant_sequence", sort=False, group_keys=False)[ - "header" - ] + mutations.groupby("mutant_sequence", sort=False, group_keys=False)["header"] .apply(";".join) .reset_index() ) @@ -1296,9 +1183,7 @@ def mutate( empty_kmer_count = (mutations["mutant_sequence"] == "").sum() if empty_kmer_count > 0 and verbose: - logger.warning( - f"{empty_kmer_count} mutated sequences were empty and were not included in the output." 
- ) + logger.warning(f"{empty_kmer_count} mutated sequences were empty and were not included in the output.") mutations = mutations[mutations["mutant_sequence"] != ""] @@ -1307,9 +1192,7 @@ def mutate( if update_df: logger.info("Saving dataframe with updated mutation info...") saved_updated_df = True - logger.warning( - "File size can be very large if the number of mutations is large." - ) + logger.warning("File size can be very large if the number of mutations is large.") if not update_df_out: if not mutations_path: # logger.warning( @@ -1323,9 +1206,7 @@ def mutate( mutations.to_csv(update_df_out, index=False) print(f"Updated mutation info has been saved to {update_df_out}") - mutations["fasta_format"] = ( - ">" + mutations["header"] + "\n" + mutations["mutant_sequence"] + "\n" - ) + mutations["fasta_format"] = ">" + mutations["header"] + "\n" + mutations["mutant_sequence"] + "\n" if out: # Save mutated sequences in new fasta file @@ -1342,12 +1223,12 @@ def mutate( else: all_mut_seqs = [] all_mut_seqs.extend(mutations["mutant_sequence"].values) - + # Remove empty strings from final list of mutated sequences # (these are introduced when unknown mutations are encountered) while "" in all_mut_seqs: all_mut_seqs.remove("") - + if len(all_mut_seqs) > 0: return all_mut_seqs return [] diff --git a/gget/gget_opentargets.py b/gget/gget_opentargets.py index d09f0dc16..cdb704770 100644 --- a/gget/gget_opentargets.py +++ b/gget/gget_opentargets.py @@ -1,9 +1,10 @@ import json as json_ import textwrap + import pandas as pd -from .constants import OPENTARGETS_GRAPHQL_API, DEFAULT_REQUESTS_TIMEOUT -from .utils import set_up_logger, http_json, dig +from .constants import DEFAULT_REQUESTS_TIMEOUT, OPENTARGETS_GRAPHQL_API +from .utils import dig, http_json, set_up_logger logger = set_up_logger() # export GGET_LOGLEVEL=DEBUG @@ -132,7 +133,7 @@ diseaseFromSource depmapId geneEffect - } + } } } } @@ -155,7 +156,7 @@ speciesA { taxonId } - intB + intB targetB { id approvedSymbol @@ 
-170,13 +171,22 @@ } """ -OPENTARGETS_RESOURCES = {"diseases", "drugs", "tractability", "pharmacogenetics", "expression", "depmap", "interactions"} +OPENTARGETS_RESOURCES = { + "diseases", + "drugs", + "tractability", + "pharmacogenetics", + "expression", + "depmap", + "interactions", +} + def _collapse_singletons(obj): - """ - Recursively collapse: + """Recursively collapse nested single-element lists and single dicts with one key. + - nested single-element lists - - single dicts with one key → value + - single dicts with one key → value. """ # ------------------------- # Case 1: list @@ -189,7 +199,7 @@ def flatten(x): yield from flatten(el) else: yield el - + flat = list(flatten(obj)) flat = [el for el in flat if el is not None] @@ -209,7 +219,7 @@ def flatten(x): if len(obj) == 0: return None - + # if single key → collapse if len(obj) == 1: return next(iter(obj.values())) @@ -221,23 +231,26 @@ def flatten(x): # ------------------------- return obj + def _make_hashable(x): - if isinstance(x, dict): - return tuple(sorted((k, _make_hashable(v)) for k, v in x.items())) - elif isinstance(x, list): - return tuple(_make_hashable(v) for v in x) - elif isinstance(x, set): - return tuple(sorted(_make_hashable(v) for v in x)) - else: - return x - + if isinstance(x, dict): + return tuple(sorted((k, _make_hashable(v)) for k, v in x.items())) + elif isinstance(x, list): + return tuple(_make_hashable(v) for v in x) + elif isinstance(x, set): + return tuple(sorted(_make_hashable(v) for v in x)) + else: + return x + + def _unhash(x): - if isinstance(x, tuple): - # detect dict-like tuples - if all(isinstance(i, tuple) and len(i) == 2 for i in x): - return {k: _unhash(v) for k, v in x} - return [_unhash(v) for v in x] - return x + if isinstance(x, tuple): + # detect dict-like tuples + if all(isinstance(i, tuple) and len(i) == 2 for i in x): + return {k: _unhash(v) for k, v in x} + return [_unhash(v) for v in x] + return x + def opentargets( ensembl_id, @@ -248,8 +261,7 @@ def 
opentargets( filters=None, json=False, ): - """ - Query OpenTargets for data associated with a given Ensembl gene ID. + """Query OpenTargets for data associated with a given Ensembl gene ID. Args: @@ -272,7 +284,6 @@ def opentargets( Returns requested information in DataFrame format. """ - if resource == "diseases": query_string = QUERY_STRING_DISEASES rows_path = ["associatedDiseases", "rows"] @@ -290,12 +301,17 @@ def opentargets( rows_path = ["expressions"] elif resource == "depmap": query_string = QUERY_STRING_DEPMAP - rows_path = ["depMapEssentiality", "_FLATTEN_screens"] #* _FLATTEN_ indicates that we want to flatten the nested 'screens' field into the main table + rows_path = [ + "depMapEssentiality", + "_FLATTEN_screens", + ] # * _FLATTEN_ indicates that we want to flatten the nested 'screens' field into the main table elif resource == "interactions": query_string = QUERY_STRING_INTERACTIONS rows_path = ["interactions", "rows"] else: - raise ValueError(f"'resource' argument specified as {resource}. Expected one of: {', '.join(OPENTARGETS_RESOURCES)}") + raise ValueError( + f"'resource' argument specified as {resource}. Expected one of: {', '.join(OPENTARGETS_RESOURCES)}" + ) variables = {"ensemblId": ensembl_id} @@ -331,12 +347,12 @@ def opentargets( rows = [ { **{k: v for k, v in row.items() if k != row_key}, # keep everything except the nested field - **subdict # unpack the nested dict + **subdict, # unpack the nested dict } for row in rows for subdict in row[row_key] ] - + if len(rows) == 0: if verbose: logger.info(f"No {resource} data found for {ensembl_id}.") @@ -352,24 +368,24 @@ def opentargets( if limit is not None: df = df.head(limit) - + df = df.map(_unhash) df = df.map(_collapse_singletons) if filters is not None: for filter_key, filter_value in filters.items(): if filter_key not in df.columns: - raise ValueError(f"Filter key '{filter_key}' not found in data columns. 
Available columns: {', '.join(df.columns)}") + raise ValueError( + f"Filter key '{filter_key}' not found in data columns. Available columns: {', '.join(df.columns)}" + ) df = df[df[filter_key] == filter_value] if wrap_text: for col in df.columns: if df[col].dtype == object: - df[col] = df[col].apply( - lambda x: textwrap.fill(str(x), width=40) if isinstance(x, str) else x - ) - + df[col] = df[col].apply(lambda x: textwrap.fill(str(x), width=40) if isinstance(x, str) else x) + if json: return json_.loads(df.to_json(orient="records", force_ascii=False)) - + return df diff --git a/gget/gget_pdb.py b/gget/gget_pdb.py index 4975061be..b26ed0fa1 100644 --- a/gget/gget_pdb.py +++ b/gget/gget_pdb.py @@ -1,17 +1,16 @@ -from urllib.request import urlopen -from urllib.error import HTTPError import json +from urllib.error import HTTPError +from urllib.request import urlopen from .utils import set_up_logger logger = set_up_logger() -from .constants import RCSB_PDB_API +from .constants import RCSB_PDB_API # noqa: E402 def pdb(pdb_id, resource="pdb", identifier=None, save=False): - """ - Query RCSB PDB for the protein structutre/metadata of a given PDB ID. + """Query RCSB PDB for the protein structutre/metadata of a given PDB ID. Args: - pdb_id PDB ID to be queried (str), e.g. "7S7U". @@ -33,7 +32,6 @@ def pdb(pdb_id, resource="pdb", identifier=None, save=False): Returns requested information in JSON format (except for resource="pdb" which returns protein structure in PDB format). """ - # Check if resource argument is valid resources = [ "pdb", @@ -49,9 +47,7 @@ def pdb(pdb_id, resource="pdb", identifier=None, save=False): "nonpolymer_entity_instance", ] if resource not in resources: - raise ValueError( - f"'resource' argument specified as {resource}. Expected one of: {', '.join(resources)}" - ) + raise ValueError(f"'resource' argument specified as {resource}. 
Expected one of: {', '.join(resources)}") # Check if required identifiers are present if resource == "assembly" and identifier is None: @@ -92,7 +88,6 @@ def pdb(pdb_id, resource="pdb", identifier=None, save=False): # Submit URL request with fallback logic r = None - last_error = None code = None for url in urls: try: @@ -105,8 +100,7 @@ def pdb(pdb_id, resource="pdb", identifier=None, save=False): if code == 200: break - except HTTPError as e: - last_error = e + except HTTPError: continue if r is None or code != 200: @@ -123,9 +117,7 @@ def pdb(pdb_id, resource="pdb", identifier=None, save=False): f"{resource} for {pdb_id} chain {identifier} was not found. Please double-check arguments and try again." ) else: - logger.error( - f"{resource} for {pdb_id} was not found. Please double-check arguments and try again." - ) + logger.error(f"{resource} for {pdb_id} was not found. Please double-check arguments and try again.") return if resource != "pdb": diff --git a/gget/gget_ref.py b/gget/gget_ref.py index 865bd47ec..f65b8066c 100644 --- a/gget/gget_ref.py +++ b/gget/gget_ref.py @@ -1,28 +1,28 @@ -from bs4 import BeautifulSoup -import requests import json +import requests +from bs4 import BeautifulSoup + # Custom functions from .utils import ( - ref_species_options, find_latest_ens_rel, find_nv_kingdom, + ref_species_options, set_up_logger, ) logger = set_up_logger() -from .constants import ( +from .constants import ( # noqa: E402 + DEFAULT_REQUESTS_TIMEOUT, ENSEMBL_FTP_URL, - ENSEMBL_FTP_URL_NV, ENSEMBL_FTP_URL_GRCH37, - DEFAULT_REQUESTS_TIMEOUT, + ENSEMBL_FTP_URL_NV, ) def find_FTP_link(url, link_substring): - """ - Helper function for gget ref to find an FTP link, its release date and size. + """Helper function for gget ref to find an FTP link, its release date and size. Args: url - URL link to FTP subfolder (e.g. 
GTF) including species and release @@ -34,9 +34,7 @@ def find_FTP_link(url, link_substring): # Raise error if status code not "OK" Response if html.status_code != 200: - raise RuntimeError( - f"HTTP response status code {html.status_code}. Please try again.\n" - ) + raise RuntimeError(f"HTTP response status code {html.status_code}. Please try again.\n") soup = BeautifulSoup(html.text, "html.parser") @@ -67,8 +65,7 @@ def ref( list_iv_species=False, verbose=True, ): - """ - Fetch FTPs for reference genomes and annotations by species from Ensembl. + """Fetch FTPs for reference genomes and annotations by species from Ensembl. Args: - species Defines the species for which the reference should be fetched in the format "_", @@ -138,13 +135,9 @@ def ref( ) # Find all available species for GTFs for this Ensembl release - species_list_gtf = ref_species_options( - "gtf", database=ENSEMBL_FTP_URL_NV, release=release - ) + species_list_gtf = ref_species_options("gtf", database=ENSEMBL_FTP_URL_NV, release=release) # Find all available species for FASTAs for this Ensembl release - species_list_dna = ref_species_options( - "dna", database=ENSEMBL_FTP_URL_NV, release=release - ) + species_list_dna = ref_species_options("dna", database=ENSEMBL_FTP_URL_NV, release=release) # Find intersection of the two lists # (Only species which have GTF and FASTAs available can continue) @@ -158,7 +151,7 @@ def ref( ## Check 'which' parameter # If single which passed as string, convert to list - if type(which) == str: + if isinstance(which, str): which = [which] # Raise error if several values are passed and 'all' is included @@ -170,7 +163,7 @@ def ref( which_allowed = ["all", "gtf", "cdna", "dna", "cds", "ncrna", "pep"] if any(x not in which_allowed for x in which): raise ValueError( - f"Parameter 'which' must be 'all', or any one or a combination of the following: 'gtf', 'cdna', 'dna', 'cds', 'ncrna', 'pep'.\n" + "Parameter 'which' must be 'all', or any one or a combination of the following: 
'gtf', 'cdna', 'dna', 'cds', 'ncrna', 'pep'.\n" ) # Species shortcuts @@ -191,9 +184,7 @@ def ref( database = ENSEMBL_FTP_URL_GRCH37 ENS_rel = find_latest_ens_rel(ENSEMBL_FTP_URL) # Standard database - elif species in ref_species_options( - "dna", database=ENSEMBL_FTP_URL, release=release - ): + elif species in ref_species_options("dna", database=ENSEMBL_FTP_URL, release=release): database = ENSEMBL_FTP_URL # Find latest vertebrate Ensembl release ENS_rel = find_latest_ens_rel(database) @@ -204,24 +195,18 @@ def ref( ENS_rel = find_latest_ens_rel(database) # If release != None, use user-defined Ensembl release - if release != None: + if release is not None: # Warn user when release is higher than the latest release if release > ENS_rel: - logger.warning( - f"Provided Ensembl release number {release} is greater than the latest release ({ENS_rel})." - ) + logger.warning(f"Provided Ensembl release number {release} is greater than the latest release ({ENS_rel}).") ENS_rel = release if not grch37: ## Raise error if species not found (both FASTA and GTF have to be available) # Find all available species for genome FASTAs for this Ensembl release - species_list_dna = ref_species_options( - "dna", database=database, release=ENS_rel - ) + species_list_dna = ref_species_options("dna", database=database, release=ENS_rel) # Find all available species for GTFs for this Ensembl release - species_list_gtf = ref_species_options( - "gtf", database=database, release=ENS_rel - ) + species_list_gtf = ref_species_options("gtf", database=database, release=ENS_rel) # Find intersection of the two lists # (Only species which have GTF and FASTAs available can continue) species_list = list(set(species_list_gtf) & set(species_list_dna)) @@ -251,9 +236,7 @@ def ref( link_substring = f"{ENS_rel}.gtf.gz" # Get link, release date and dataset size - gtf_str, gtf_date, gtf_size = find_FTP_link( - url=gtf_search_url, link_substring=link_substring - ) + gtf_str, gtf_date, gtf_size = 
find_FTP_link(url=gtf_search_url, link_substring=link_substring) # Build the final download link if not isinstance(gtf_str, type(None)): gtf_url = gtf_search_url + gtf_str @@ -266,17 +249,13 @@ def ref( if "all" in which or "cdna" in which: if database == ENSEMBL_FTP_URL_NV: # Define location of cdna links - cdna_search_url = ( - database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/cdna/" - ) + cdna_search_url = database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/cdna/" else: # Define location of cdna links cdna_search_url = database + f"release-{ENS_rel}/fasta/{species}/cdna/" # Get link, release date and dataset size - cdna_str, cdna_date, cdna_size = find_FTP_link( - url=cdna_search_url, link_substring="cdna.all.fa" - ) + cdna_str, cdna_date, cdna_size = find_FTP_link(url=cdna_search_url, link_substring="cdna.all.fa") # Build the final download link if not isinstance(cdna_str, type(None)): cdna_url = cdna_search_url + cdna_str @@ -289,22 +268,16 @@ def ref( if "all" in which or "dna" in which: # Define location of dna links if database == ENSEMBL_FTP_URL_NV: - dna_search_url = ( - database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/dna/" - ) + dna_search_url = database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/dna/" else: dna_search_url = database + f"release-{ENS_rel}/fasta/{species}/dna/" # Get link, release date and dataset size - dna_str, dna_date, dna_size = find_FTP_link( - url=dna_search_url, link_substring=".dna.primary_assembly.fa" - ) + dna_str, dna_date, dna_size = find_FTP_link(url=dna_search_url, link_substring=".dna.primary_assembly.fa") # Get toplevel if primary assembly not available if dna_str is None: # Get link, release date and dataset size - dna_str, dna_date, dna_size = find_FTP_link( - url=dna_search_url, link_substring=".dna.toplevel.fa" - ) + dna_str, dna_date, dna_size = find_FTP_link(url=dna_search_url, link_substring=".dna.toplevel.fa") # Build the final download link if not isinstance(dna_str, type(None)): @@ 
-318,15 +291,11 @@ def ref( if "all" in which or "cds" in which: # Define location of cds links if database == ENSEMBL_FTP_URL_NV: - cds_search_url = ( - database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/cds/" - ) + cds_search_url = database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/cds/" else: cds_search_url = database + f"release-{ENS_rel}/fasta/{species}/cds/" # Get link, release date and dataset size - cds_str, cds_date, cds_size = find_FTP_link( - url=cds_search_url, link_substring="cds.all.fa" - ) + cds_str, cds_date, cds_size = find_FTP_link(url=cds_search_url, link_substring="cds.all.fa") # Build the final download link if not isinstance(cds_str, type(None)): cds_url = cds_search_url + cds_str @@ -339,9 +308,7 @@ def ref( if "all" in which or "ncrna" in which: # Define location of ncRNA links if database == ENSEMBL_FTP_URL_NV: - ncrna_search_url = ( - database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/ncrna/" - ) + ncrna_search_url = database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/ncrna/" else: ncrna_search_url = database + f"release-{ENS_rel}/fasta/{species}/ncrna/" @@ -373,15 +340,11 @@ def ref( if "all" in which or "pep" in which: # Define location of pep links if database == ENSEMBL_FTP_URL_NV: - pep_search_url = ( - database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/pep/" - ) + pep_search_url = database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/pep/" else: pep_search_url = database + f"release-{ENS_rel}/fasta/{species}/pep/" # Get link, release date and dataset size - pep_str, pep_date, pep_size = find_FTP_link( - url=pep_search_url, link_substring=".pep.all.fa" - ) + pep_str, pep_date, pep_size = find_FTP_link(url=pep_search_url, link_substring=".pep.all.fa") # Build the final download link if not isinstance(pep_str, type(None)): pep_url = pep_search_url + pep_str @@ -517,17 +480,13 @@ def ref( with open("gget_ref_results.json", "w", encoding="utf-8") as file: json.dump(ref_dict, file, ensure_ascii=False, 
indent=4) if verbose: - logger.info( - f"Fetching reference information for {species} from Ensembl release: {ENS_rel}." - ) + logger.info(f"Fetching reference information for {species} from Ensembl release: {ENS_rel}.") return ref_dict # If FTP==True, return only the specified URLs as a list if ftp: if verbose: - logger.info( - f"Fetching reference information for {species} from Ensembl release: {ENS_rel}." - ) + logger.info(f"Fetching reference information for {species} from Ensembl release: {ENS_rel}.") results = [] for return_val in which: if return_val == "all": diff --git a/gget/gget_search.py b/gget/gget_search.py index 221bb6d96..1ffbf236f 100644 --- a/gget/gget_search.py +++ b/gget/gget_search.py @@ -1,27 +1,29 @@ -import numpy as np -import pandas as pd import json as json_package -import mysql.connector as sql import time import warnings +import mysql.connector as sql +import numpy as np +import pandas as pd + warnings.simplefilter(action="ignore", category=UserWarning) # Custom functions -from .utils import ( - search_species_options, +from .utils import ( # noqa: E402 find_latest_ens_rel, - wrap_cols_func, find_nv_kingdom, + search_species_options, set_up_logger, + wrap_cols_func, ) logger = set_up_logger() -from gget.constants import ENSEMBL_FTP_URL, ENSEMBL_FTP_URL_NV +from gget.constants import ENSEMBL_FTP_URL, ENSEMBL_FTP_URL_NV # noqa: E402 def clean_cols(x): + """Collapse a list to its single unique value, or return x unchanged if not a list.""" if isinstance(x, list): unique_list = list(set(x)) if len(unique_list) == 1: @@ -45,8 +47,8 @@ def search( save=False, verbose=True, ): - """ - Function to query Ensembl for genes based on species and free form search terms. + """Function to query Ensembl for genes based on species and free form search terms. + Automatically fetches results from latest Ensembl release, unless user specifies database (see 'species' argument) or release database (see 'release' argument). 
@@ -81,9 +83,7 @@ def search( """ # Handle deprecated arguments if seqtype: - logger.error( - "'seqtype' argument deprecated! Please use argument 'id_type' instead." - ) + logger.error("'seqtype' argument deprecated! Please use argument 'id_type' instead.") return start_time = time.time() @@ -93,17 +93,13 @@ def search( id_types = ["gene", "transcript"] id_type = id_type.lower() if id_type not in id_types: - raise ValueError( - f"ID type (id_type) specified is '{id_type}'. Expected one of: {', '.join(id_types)}" - ) + raise ValueError(f"ID type (id_type) specified is '{id_type}'. Expected one of: {', '.join(id_types)}") # Check if 'andor' arg is valid andors = ["and", "or"] andor = andor.lower() if andor not in andors: - raise ValueError( - f"'andor' argument specified as {andor}. Expected one of {', '.join(andors)}" - ) + raise ValueError(f"'andor' argument specified as {andor}. Expected one of {', '.join(andors)}") ## Get database for specified species # Species shortcuts @@ -122,9 +118,7 @@ def search( if "core" in species: db = species if release: - logger.warning( - "Specified release overwritten because database name was provided." - ) + logger.warning("Specified release overwritten because database name was provided.") else: if release: ens_rel = release @@ -157,11 +151,7 @@ def search( db = f"homo_sapiens_core_{ens_rel}_38" # Check for ambiguous species matches in species other than mouse and human - elif ( - len(db) > 1 - and "mus_musculus" not in species - and "homo_sapiens" not in species - ): + elif len(db) > 1 and "mus_musculus" not in species and "homo_sapiens" not in species: logger.warning( f"Species matches more than one database. 
Defaulting to first database: {db[0]}.\n" "All available databases can be found here:\n" @@ -203,7 +193,7 @@ def search( ) connection_successful = True break - except Exception as e: + except Exception as e: # noqa: BLE001 last_exception = e # Continue to the next port if the connection is unsuccessful continue @@ -214,18 +204,16 @@ def search( raise RuntimeError( f""" The Ensembl server returned the following error: {str(last_exception)}. - This might be caused by the Ensembl release number being too low. + This might be caused by the Ensembl release number being too low. Please try again with a more recent release. """ ) else: - raise RuntimeError( - f"The Ensembl server returned the following error: {str(last_exception)}" - ) + raise RuntimeError(f"The Ensembl server returned the following error: {str(last_exception)}") ## Clean up list of searchwords # If single searchword passed as string, convert to list - if type(searchwords) == str: + if isinstance(searchwords, str): searchwords = [searchwords] ## Find genes @@ -233,10 +221,10 @@ def search( if id_type == "gene": query = f""" SELECT gene.stable_id AS 'ensembl_id', xref.display_label AS 'gene_name', gene.description AS 'ensembl_description', xref.description AS 'ext_ref_description', gene.biotype AS 'biotype', external_synonym.synonym AS 'synonym' - FROM gene - LEFT JOIN xref ON gene.display_xref_id = xref.xref_id - LEFT JOIN external_synonym ON gene.display_xref_id = external_synonym.xref_id - LEFT JOIN gene_attrib ON gene.gene_id = gene_attrib.gene_id + FROM gene + LEFT JOIN xref ON gene.display_xref_id = xref.xref_id + LEFT JOIN external_synonym ON gene.display_xref_id = external_synonym.xref_id + LEFT JOIN gene_attrib ON gene.gene_id = gene_attrib.gene_id WHERE (gene.description LIKE '%{searchword}%' OR xref.description LIKE '%{searchword}%' OR xref.display_label LIKE '%{searchword}%' OR external_synonym.synonym LIKE '%{searchword}%' OR gene_attrib.value LIKE '%{searchword}%') """ @@ -268,10 +256,10 @@ 
def search( if id_type == "transcript": query = f""" SELECT transcript.stable_id AS 'ensembl_id', xref.display_label AS 'gene_name', transcript.description AS 'ensembl_description', xref.description AS 'ext_ref_description', transcript.biotype AS 'biotype', external_synonym.synonym AS 'synonym' - FROM transcript - LEFT JOIN xref ON transcript.display_xref_id = xref.xref_id - LEFT JOIN external_synonym ON transcript.display_xref_id = external_synonym.xref_id - LEFT JOIN transcript_attrib ON transcript.transcript_id = transcript_attrib.transcript_id + FROM transcript + LEFT JOIN xref ON transcript.display_xref_id = xref.xref_id + LEFT JOIN external_synonym ON transcript.display_xref_id = external_synonym.xref_id + LEFT JOIN transcript_attrib ON transcript.transcript_id = transcript_attrib.transcript_id WHERE (transcript.description LIKE '%{searchword}%' OR xref.description LIKE '%{searchword}%' OR xref.display_label LIKE '%{searchword}%' OR external_synonym.synonym LIKE '%{searchword}%' OR transcript_attrib.value LIKE '%{searchword}%') """ @@ -317,12 +305,11 @@ def search( # Keep synonyms always of type list for consistency df["synonym"] = [ - np.sort(syn).tolist() if isinstance(syn, list) else np.sort([syn]).tolist() - for syn in df["synonym"].values + np.sort(syn).tolist() if isinstance(syn, list) else np.sort([syn]).tolist() for syn in df["synonym"].values ] # If limit is not None, keep only the first {limit} rows - if limit != None: + if limit is not None: # Print number of genes/transcripts found versus fetched if verbose: logger.info(f"Returning {limit} matches of {len(df)} total matches found.") @@ -342,33 +329,19 @@ def search( clean_db = "_".join(db.split("_")[:3]).replace("_core", "") ## Find kingdom for non-vertebrate species - kingdom = find_nv_kingdom( - clean_db, release=find_latest_ens_rel(database=ENSEMBL_FTP_URL_NV) - ) + kingdom = find_nv_kingdom(clean_db, release=find_latest_ens_rel(database=ENSEMBL_FTP_URL_NV)) if kingdom: # Add URL to gene 
summary on Ensembl for invertebrates - df["url"] = ( - f"https://{kingdom}.ensembl.org/" - + clean_db - + "/Gene/Summary?g=" - + df["ensembl_id"] - ) + df["url"] = f"https://{kingdom}.ensembl.org/" + clean_db + "/Gene/Summary?g=" + df["ensembl_id"] else: # Add URL to gene summary on Ensembl for vertebrates - df["url"] = ( - "https://useast.ensembl.org/" - + clean_db - + "/Gene/Summary?g=" - + df["ensembl_id"] - ) + df["url"] = "https://useast.ensembl.org/" + clean_db + "/Gene/Summary?g=" + df["ensembl_id"] if wrap_text: df_wrapped = df.copy() - wrap_cols_func( - df_wrapped, ["ensembl_description", "ext_ref_description", "url"] - ) + wrap_cols_func(df_wrapped, ["ensembl_description", "ext_ref_description", "url"]) if json: results_dict = json_package.loads(df.to_json(orient="records")) diff --git a/gget/gget_seq.py b/gget/gget_seq.py index 535f739fc..15bd3a859 100644 --- a/gget/gget_seq.py +++ b/gget/gget_seq.py @@ -1,13 +1,10 @@ -import numpy as np - # Custom functions -from .utils import rest_query, get_uniprot_seqs, set_up_logger, post_query +from .utils import get_uniprot_seqs, post_query, rest_query, set_up_logger logger = set_up_logger() -from .gget_info import info - # Constants -from .constants import ENSEMBL_REST_API, UNIPROT_REST_API +from .constants import ENSEMBL_REST_API, UNIPROT_REST_API # noqa: E402 +from .gget_info import info # noqa: E402 def seq( @@ -19,9 +16,9 @@ def seq( seqtype=None, verbose=True, ): - """ - Fetch nucleotide or amino acid sequence (FASTA) of a gene - (and all its isoforms) or transcript by Ensembl, WormBase or FlyBase ID. + """Fetch nucleotide or amino acid sequence (FASTA) of a gene or transcript. + + Fetches the gene (and all its isoforms) or transcript by Ensembl, WormBase or FlyBase ID. Args: - ens_ids One or more Ensembl IDs (passed as string or list of strings). @@ -41,9 +38,7 @@ def seq( """ # Handle deprecated arguments if seqtype: - logger.error( - "'seqtype' argument deprecated! 
Please use True/False argument 'translate' instead." - ) + logger.error("'seqtype' argument deprecated! Please use True/False argument 'translate' instead.") return if transcribe: translate = transcribe @@ -51,7 +46,7 @@ def seq( ## Clean up arguments # Clean up Ensembl IDs # If single Ensembl ID passed as string, convert to list - if type(ens_ids) == str: + if isinstance(ens_ids, str): ens_ids = [ens_ids] # Remove Ensembl ID version if passed ens_ids_clean = [] @@ -109,16 +104,12 @@ def seq( actual_results_dict[ensembl_ID] = {"seq": df_temp} if verbose: - logger.info( - f"Requesting nucleotide sequence of {ensembl_ID} from Ensembl." - ) + logger.info(f"Requesting nucleotide sequence of {ensembl_ID} from Ensembl.") missing_ids = set(ens_ids_clean) - set(actual_results_dict.keys()) for missing in missing_ids: - logger.error( - f"ID {missing} not found. Please double-check spelling/arguments and try again." - ) + logger.error(f"ID {missing} not found. Please double-check spelling/arguments and try again.") # Add results to master dict master_dict.update(actual_results_dict) @@ -131,9 +122,7 @@ def seq( results_dict = {ensembl_ID: {}} # Get ID type (gene, transcript, ...) using gget info - info_df = info( - ensembl_ID, verbose=False, pdb=False, ncbi=False, uniprot=False - ) + info_df = info(ensembl_ID, verbose=False, pdb=False, ncbi=False, uniprot=False) # Check if Ensembl ID was found if isinstance(info_df, type(None)): @@ -147,9 +136,7 @@ def seq( # If the ID is a gene, get the IDs of all its transcripts if ens_ID_type == "Gene": if verbose: - logger.info( - f"Requesting nucleotide sequences of all transcripts of {ensembl_ID} from Ensembl." 
- ) + logger.info(f"Requesting nucleotide sequences of all transcripts of {ensembl_ID} from Ensembl.") for transcipt_id in info_df.loc[ensembl_ID]["all_transcripts"]: # Remove version number for Ensembl IDs (not for flybase/wormbase IDs) @@ -170,14 +157,11 @@ def seq( df_temp.pop(key, None) # Add results to main dict - results_dict[ensembl_ID].update( - {f"{transcipt_id}": df_temp} - ) + results_dict[ensembl_ID].update({f"{transcipt_id}": df_temp}) except RuntimeError: logger.error( - f"ID {transcipt_id} not found. " - "Please double-check spelling/arguments and try again." + f"ID {transcipt_id} not found. Please double-check spelling/arguments and try again." ) # If isoform true, but ID is not a gene; ignore the isoform parameter @@ -199,15 +183,12 @@ def seq( # Add results to main dict results_dict[ensembl_ID].update({"seq": df_temp}) - logger.info( - f"Requesting nucleotide sequence of {ensembl_ID} from Ensembl." - ) + logger.info(f"Requesting nucleotide sequence of {ensembl_ID} from Ensembl.") logger.warning("The isoform option only applies to gene IDs.") except RuntimeError: logger.error( - f"ID {ensembl_ID} not found. " - "Please double-check spelling/arguments and try again." + f"ID {ensembl_ID} not found. Please double-check spelling/arguments and try again." ) # Add results to master dict @@ -220,12 +201,7 @@ def seq( fasta.append(">" + ens_ID + " " + master_dict[ens_ID][key]["desc"]) fasta.append(master_dict[ens_ID][key]["seq"]) else: - fasta.append( - ">" - + master_dict[ens_ID][key]["id"] - + " " - + master_dict[ens_ID][key]["desc"] - ) + fasta.append(">" + master_dict[ens_ID][key]["id"] + " " + master_dict[ens_ID][key]["desc"]) fasta.append(master_dict[ens_ID][key]["seq"]) ## Fetch amino acid sequences from UniProt @@ -236,15 +212,11 @@ def seq( for ensembl_ID in ens_ids_clean: # Get ID type (gene, transcript, ...) 
using gget info - info_df = info( - ensembl_ID, verbose=False, pdb=False, ncbi=False, uniprot=False - ) + info_df = info(ensembl_ID, verbose=False, pdb=False, ncbi=False, uniprot=False) # Check that Ensembl ID was found if isinstance(info_df, type(None)): - logger.warning( - f"ID '{ensembl_ID}' not found. Please double-check spelling/arguments." - ) + logger.warning(f"ID '{ensembl_ID}' not found. Please double-check spelling/arguments.") continue ens_ID_type = info_df.loc[ensembl_ID]["object_type"] @@ -285,9 +257,7 @@ def seq( trans_ids.append(ensembl_ID) if verbose: - logger.info( - f"Requesting amino acid sequence of {ensembl_ID} from UniProt." - ) + logger.info(f"Requesting amino acid sequence of {ensembl_ID} from UniProt.") else: logger.warning( @@ -303,15 +273,11 @@ def seq( for ensembl_ID in ens_ids_clean: # Get ID type (gene, transcript, ...) using gget info - info_df = info( - ensembl_ID, verbose=False, pdb=False, ncbi=False, uniprot=False - ) + info_df = info(ensembl_ID, verbose=False, pdb=False, ncbi=False, uniprot=False) # Check that Ensembl ID was found if isinstance(info_df, type(None)): - logger.warning( - f"ID '{ensembl_ID}' not found. Please double-check spelling/arguments." - ) + logger.warning(f"ID '{ensembl_ID}' not found. Please double-check spelling/arguments.") continue ens_ID_type = info_df.loc[ensembl_ID]["object_type"] @@ -347,9 +313,7 @@ def seq( trans_ids.append(ensembl_ID) if verbose: - logger.info( - f"Requesting amino acid sequence of {ensembl_ID} from UniProt." 
- ) + logger.info(f"Requesting amino acid sequence of {ensembl_ID} from UniProt.") logger.warning("The isoform option only applies to gene IDs.") else: @@ -380,6 +344,7 @@ def seq( df_uniprot["organism"].values, df_uniprot["sequence_length"].values, df_uniprot["sequence"].values, + strict=False, ): fasta.append( ">" diff --git a/gget/gget_setup.py b/gget/gget_setup.py index a2a24d0d8..650988449 100644 --- a/gget/gget_setup.py +++ b/gget/gget_setup.py @@ -1,22 +1,22 @@ -import os +import importlib import logging +import os +import pathlib +import platform import shutil -import sys import subprocess -import platform -import uuid +import sys import tempfile -import pathlib -import importlib +import uuid -from .utils import set_up_logger, check_file_for_error_message +from .utils import check_file_for_error_message, set_up_logger logger = set_up_logger() -from .compile import PACKAGE_PATH -from .constants import ( - ELM_INSTANCES_FASTA_DOWNLOAD, +from .compile import PACKAGE_PATH # noqa: E402 +from .constants import ( # noqa: E402 ELM_CLASSES_TSV_DOWNLOAD, + ELM_INSTANCES_FASTA_DOWNLOAD, ELM_INSTANCES_TSV_DOWNLOAD, ELM_INTDOMAINS_TSV_DOWNLOAD, ) @@ -37,9 +37,7 @@ # # Path to temporary mounted disk (global) # TMP_DISK = "" # Model parameters -PARAMS_URL = ( - "https://storage.googleapis.com/alphafold/alphafold_params_colab_2022-12-06.tar" -) +PARAMS_URL = "https://storage.googleapis.com/alphafold/alphafold_params_colab_2022-12-06.tar" PARAMS_DIR = os.path.join(PACKAGE_PATH, "bins/alphafold/") PARAMS_PATH = os.path.join(PARAMS_DIR, "params_temp.tar") @@ -60,11 +58,11 @@ def _install(package: str, import_name: str, verbose: bool = True): if process.wait() != 0: if stderr: sys.stderr.write(stderr) - logger.error( - f"{package} installation with '{cmd_str}' (https://pypi.org/project/{package}) failed." 
- ) + logger.error(f"{package} installation with '{cmd_str}' (https://pypi.org/project/{package}) failed.") if cmd == pip_cmds[-1]: - logger.error(f"All installation attempts for {package} have failed. Note: Some dependencies (e.g., cellxgene-census) may not support the latest Python versions. If you encounter installation errors, try using an earlier Python version.") + logger.error( + f"All installation attempts for {package} have failed. Note: Some dependencies (e.g., cellxgene-census) may not support the latest Python versions. If you encounter installation errors, try using an earlier Python version." + ) return else: if verbose: @@ -82,7 +80,9 @@ def _install(package: str, import_name: str, verbose: bool = True): ) # Retry with pip if import after uv installation failed if cmd == pip_cmds[-1]: - logger.error(f"All installation attempts for {package} have failed. Note: Some dependencies (e.g., cellxgene-census) may not support the latest Python versions. If you encounter installation errors, try using an earlier Python version.") + logger.error( + f"All installation attempts for {package} have failed. Note: Some dependencies (e.g., cellxgene-census) may not support the latest Python versions. If you encounter installation errors, try using an earlier Python version." + ) return else: if verbose: @@ -91,8 +91,8 @@ def _install(package: str, import_name: str, verbose: bool = True): def setup(module, verbose=True, out=None): - """ - Function to install third-party dependencies for a specified gget module. + """Function to install third-party dependencies for a specified gget module. + Some modules require pip to be installed (https://pip.pypa.io/en/stable/installation). Some modules require curl to be installed (https://everything.curl.dev/get). 
@@ -105,9 +105,7 @@ def setup(module, verbose=True, out=None): """ supported_modules = ["alphafold", "cellxgene", "elm", "gpt", "cbio"] if module not in supported_modules: - raise ValueError( - f"'module' argument specified as {module}. Expected one of: {', '.join(supported_modules)}" - ) + raise ValueError(f"'module' argument specified as {module}. Expected one of: {', '.join(supported_modules)}") if module == "gpt": _install("openai<=0.28.1", "openai", verbose=verbose) @@ -120,18 +118,14 @@ def setup(module, verbose=True, out=None): logger.info( "ELM data can be downloaded & distributed for non-commercial use according to the following license: http://elm.eu.org/media/Elm_academic_license.pdf" ) - logger.info( - "Downloading ELM database files (requires curl to be installed)..." - ) + logger.info("Downloading ELM database files (requires curl to be installed)...") if out is not None: elm_files_out = os.path.abspath(out) elm_instances_fasta = os.path.join(elm_files_out, "elm_instances.fasta") elm_classes_tsv = os.path.join(elm_files_out, "elms_classes.tsv") elm_instances_tsv = os.path.join(elm_files_out, "elm_instances.tsv") - elm_intdomains_tsv = os.path.join( - elm_files_out, "elm_interaction_domains.tsv" - ) + elm_intdomains_tsv = os.path.join(elm_files_out, "elm_interaction_domains.tsv") # Create folder for ELM files (if it does not exist) if not os.path.exists(elm_files_out): @@ -199,16 +193,11 @@ def setup(module, verbose=True, out=None): missing.append(label) if missing: - raise RuntimeError( - "ELM database files download failed; missing files: " - + ", ".join(missing) - ) + raise RuntimeError("ELM database files download failed; missing files: " + ", ".join(missing)) elif module == "alphafold": if platform.system() == "Windows": - logger.error( - "gget setup alphafold and gget alphafold are not supported on Windows OS." 
- ) + logger.error("gget setup alphafold and gget alphafold are not supported on Windows OS.") return ## Ask user to install openmm if not already installed @@ -229,19 +218,19 @@ def setup(module, verbose=True, out=None): except ImportError as e: raise ImportError( f""" - Trying to import openmm resulted in the following error: + Trying to import openmm resulted in the following error: {e} - Please install AlphaFold third-party dependency openmm by running the following command from the command line: - For Python version < 3.10: - 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1' - For Python version 3.10: - 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0' - For Python version 3.11: - 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0' + Please install AlphaFold third-party dependency openmm by running the following command from the command line: + For Python version < 3.10: + 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1' + For Python version 3.10: + 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0' + For Python version 3.11: + 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0' (Recommendation: Follow with 'conda update -qy conda' to update conda to the latest version afterwards.) 
""" - ) + ) from e ## Install py3Dmol _install("py3Dmol", "py3Dmol", verbose=verbose) @@ -257,9 +246,7 @@ def setup(module, verbose=True, out=None): os.environ.setdefault("UV_HTTP_TIMEOUT", "300") # Define AlphaFold folder name and location - alphafold_folder = os.path.join( - tempfile.gettempdir(), f"tmp_alphafold_{uuid.uuid4()}" - ) + alphafold_folder = os.path.join(tempfile.gettempdir(), f"tmp_alphafold_{uuid.uuid4()}") pathlib.Path(alphafold_folder).mkdir(parents=True, exist_ok=True) # Clean (unescaped) jackhmmer cache dir; we’ll patch file contents via Python @@ -268,7 +255,7 @@ def setup(module, verbose=True, out=None): # Core AlphaFold dependencies (Colab/CPU friendly set) alphafold_deps = [ "absl-py>=2.1,<3", - "dm-haiku<=0.0.12", # dont upgrade to avoid clash with jax + "dm-haiku<=0.0.12", # dont upgrade to avoid clash with jax "dm-tree>=0.1.8", "filelock>=3.12", "jax==0.4.26", @@ -278,7 +265,7 @@ def setup(module, verbose=True, out=None): "jmp>=0.0.4", "ml-collections>=0.1,<1", "ml-dtypes>=0.3.1,<0.6", - "numpy>=1.26,<2", # keeps TF 2.17 CPU happy + "numpy>=1.26,<2", # keeps TF 2.17 CPU happy "opt-einsum>=3.4,<4", "pillow>=10,<12", "protobuf<4", @@ -300,7 +287,7 @@ def setup(module, verbose=True, out=None): # Patch jackhmmer.py jack_py = os.path.join(alphafold_folder, "alphafold", "data", "tools", "jackhmmer.py") - with open(jack_py, "r", encoding="utf-8") as f: + with open(jack_py, encoding="utf-8") as f: txt = f.read() txt = txt.replace("/tmp/ramdisk", jack_dir) @@ -315,16 +302,10 @@ def setup(module, verbose=True, out=None): f.write(txt) # Base deps first (NumPy/TF/JAX in a known good combo) - subprocess.run( - [*pip_upgrade.split(), "numpy>=1.26,<2", "tensorflow-cpu>=2.17,<2.18"], - check=True - ) + subprocess.run([*pip_upgrade.split(), "numpy>=1.26,<2", "tensorflow-cpu>=2.17,<2.18"], check=True) # The rest of the deps - subprocess.run( - [*pip_upgrade.split(), *alphafold_deps], - check=True - ) + subprocess.run([*pip_upgrade.split(), 
*alphafold_deps], check=True) # Install AF itself without bringing in its pinned requirements subprocess.run(f'{pip_nodeps} "{alphafold_folder}"', check=True, shell=True) @@ -334,7 +315,7 @@ def setup(module, verbose=True, out=None): # Show any captured stderr from our last step, if available try: sys.stderr.write(str(e) + "\n") - except Exception: + except Exception: # noqa: BLE001 pass shutil.rmtree(alphafold_folder, ignore_errors=True) return @@ -344,6 +325,7 @@ def setup(module, verbose=True, out=None): try: import alphafold as AlphaFold + if verbose: logger.info("AlphaFold installed succesfully.") except ImportError as e: @@ -359,9 +341,7 @@ def setup(module, verbose=True, out=None): if verbose: logger.info("Installing pdbfixer from source (requires pip and git).") - pdbfixer_folder = os.path.join( - tempfile.gettempdir(), f"tmp_pdbfixer_{uuid.uuid4()}" - ) + pdbfixer_folder = os.path.join(tempfile.gettempdir(), f"tmp_pdbfixer_{uuid.uuid4()}") try: if openmm.__version__ == "7.5.1": @@ -369,7 +349,7 @@ def setup(module, verbose=True, out=None): PDBFIXER_VERSION = "v1.7" else: PDBFIXER_VERSION = "v1.8.1" - except: + except Exception: # noqa: BLE001 PDBFIXER_VERSION = "v1.8.1" pip_cmd = "uv pip install" if shutil.which("uv") else "pip install -q" @@ -398,7 +378,7 @@ def setup(module, verbose=True, out=None): pdb_out, err = process.communicate() if pdb_out.decode() != "": - logger.info(f"pdbfixer installed succesfully.") + logger.info("pdbfixer installed succesfully.") else: logger.error("pdbfixer installation failed.") return @@ -418,19 +398,17 @@ def setup(module, verbose=True, out=None): # The double-quotation marks allow white spaces in the path, but this does not work for Windows command = f""" curl -# -o {PARAMS_PATH} {PARAMS_URL} \\ - && tar --extract --file={PARAMS_PATH} --directory={PARAMS_DIR+'params/'} --preserve-permissions \\ + && tar --extract --file={PARAMS_PATH} --directory={PARAMS_DIR + "params/"} --preserve-permissions \\ && rm {PARAMS_PATH} """ 
else: command = f""" curl -# -o '{PARAMS_PATH}' '{PARAMS_URL}' \\ - && tar --extract --file='{PARAMS_PATH}' --directory='{PARAMS_DIR+'params/'}' --preserve-permissions \\ + && tar --extract --file='{PARAMS_PATH}' --directory='{PARAMS_DIR + "params/"}' --preserve-permissions \\ && rm '{PARAMS_PATH}' """ - with subprocess.Popen( - command, shell=True, stderr=subprocess.PIPE - ) as process: + with subprocess.Popen(command, shell=True, stderr=subprocess.PIPE) as process: stderr = process.stderr.read().decode("utf-8") # Log the standard error if it is not empty if stderr: diff --git a/gget/gget_virus.py b/gget/gget_virus.py index 5970878ed..b6bf49383 100644 --- a/gget/gget_virus.py +++ b/gget/gget_virus.py @@ -1,37 +1,40 @@ +import calendar +import gc # For garbage collection to manage memory +import http.client +import json +import logging # For logging level checks import os +import platform # For OS detection import re -import json -import sys # For accessing command line arguments -import time # For adding delays between requests -import logging # For logging level checks -import shutil # For directory operations -import subprocess # For executing external commands -import traceback # For error traceback logging -import platform # For OS detection -import stat # For file permission constants -import gc # For garbage collection to manage memory -import pandas as pd # For data manipulation and CSV output -import requests # For HTTP requests to NCBI API -import zipfile # For extracting downloaded ZIP files -from tqdm import tqdm # For progress bar display +import shutil # For directory operations +import stat # For file permission constants +import subprocess # For executing external commands +import sys # For accessing command line arguments +import time # For adding delays between requests +import traceback # For error traceback logging +import xml.etree.ElementTree as ET # For XML parsing +import zipfile # For extracting downloaded ZIP files from datetime import 
datetime # For date handling -from dateutil import parser # For flexible date parsing -import xml.etree.ElementTree as ET # For XML parsing -import http.client +from urllib.parse import quote + +import pandas as pd # For data manipulation and CSV output +import requests # For HTTP requests to NCBI API import urllib3 -from urllib3.util.retry import Retry +from dateutil import parser # For flexible date parsing from requests.adapters import HTTPAdapter -from urllib.parse import quote -import calendar +from tqdm import tqdm # For progress bar display +from urllib3.util.retry import Retry -# Internal imports for logging, unique ID generation, and FASTA parsing -from .utils import set_up_logger, FastaIO -from .constants import NCBI_API_BASE, NCBI_EUTILS_BASE_EFETCH, NCBI_EUTILS_BASE_ESEARCH from .compile import PACKAGE_PATH +from .constants import NCBI_API_BASE, NCBI_EUTILS_BASE_EFETCH, NCBI_EUTILS_BASE_ESEARCH + +# Internal imports for logging, unique ID generation, and FASTA parsing +from .utils import FastaIO, set_up_logger # Optional psutil import for memory monitoring try: import psutil + PSUTIL_AVAILABLE = True except ImportError: PSUTIL_AVAILABLE = False @@ -67,7 +70,7 @@ GENBANK_MAX_BATCH_SIZE_WARNING = 500 # Warn user if batch size exceeds this GENBANK_RETRY_ATTEMPTS = 5 # Number of retry attempts for GenBank requests GENBANK_XML_CHUNK_SIZE = 10000 # Rows to process before writing to CSV -GENBANK_COMPLEXITY = 1 # Complexity level with only the accessions requested. All levels explained here: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch +GENBANK_COMPLEXITY = 1 # Complexity level with only the accessions requested. All levels explained here: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch # Resolve API key from environment variable NCBI_API_KEY. # Users can also pass an api_key argument directly to the virus() function / CLI --api_key. 
@@ -94,33 +97,62 @@ # Virus Detection Identifiers SARS_COV2_IDENTIFIERS = { - 'sarscov2', 'sars2', '2697049', 'sarscov', - 'severeacuterespiratorysyndromecoronavirus2', - 'covid19', 'covid', 'coronavirusdisease', 'ncov', 'hcov19' + "sarscov2", + "sars2", + "2697049", + "sarscov", + "severeacuterespiratorysyndromecoronavirus2", + "covid19", + "covid", + "coronavirusdisease", + "ncov", + "hcov19", } ALPHAINFLUENZA_IDENTIFIERS = { - 'alphainfluenza', 'alphainfluenzavirus', 'alphainfluenzavirusinfluenzae', - 'influenzaavirus', 'influenzaa', 'flua', - '197911', # Alphainfluenza genus - '2955291', # Alphainfluenzavirus influenzae species - '11320' # Influenza A virus + "alphainfluenza", + "alphainfluenzavirus", + "alphainfluenzavirusinfluenzae", + "influenzaavirus", + "influenzaa", + "flua", + "197911", # Alphainfluenza genus + "2955291", # Alphainfluenzavirus influenzae species + "11320", # Influenza A virus } # Default taxon for Alphainfluenza downloads (most comprehensive cached data) ALPHAINFLUENZA_DEFAULT_TAXON = "Alphainfluenzavirus influenzae" # Progress Indicator Keywords (for subprocess monitoring) -PROGRESS_INDICATORS = ['%', '=', 'downloading', 'fetching', 'MB', 'GB', 'bytes'] +PROGRESS_INDICATORS = ["%", "=", "downloading", "fetching", "MB", "GB", "bytes"] # Protein/Gene Keywords for Header Parsing PROTEIN_KEYWORDS = [ - 'hemagglutinin', 'neuraminidase', 'polymerase', 'nucleoprotein', - 'matrix protein', 'nonstructural protein', 'ns1', 'ns2', - 'spike', 'envelope', 'membrane', 'nucleocapsid', - 'orf', 'nsp', 'pp1a', 'pp1ab', - 'segment 1', 'segment 2', 'segment 3', 'segment 4', - 'segment 5', 'segment 6', 'segment 7', 'segment 8', + "hemagglutinin", + "neuraminidase", + "polymerase", + "nucleoprotein", + "matrix protein", + "nonstructural protein", + "ns1", + "ns2", + "spike", + "envelope", + "membrane", + "nucleocapsid", + "orf", + "nsp", + "pp1a", + "pp1ab", + "segment 1", + "segment 2", + "segment 3", + "segment 4", + "segment 5", + "segment 6", + "segment 
7", + "segment 8", ] # Date Parsing Configuration @@ -156,120 +188,122 @@ # MEMORY MONITORING HELPERS # ============================================================================= + def _get_memory_usage(): - """ - Get current memory usage information for debugging. - - Returns: + """Get current memory usage information for debugging. + + Returns + ------- dict: Dictionary with memory stats including: - rss_mb: Resident Set Size in MB (actual RAM used) - vms_mb: Virtual Memory Size in MB - percent: Percent of total system memory used - available_mb: Available system memory in MB - + Note: Falls back to /proc/self/status on Linux if psutil is not available. + """ if PSUTIL_AVAILABLE: try: process = psutil.Process() mem_info = process.memory_info() sys_mem = psutil.virtual_memory() - + return { - 'rss_mb': mem_info.rss / (1024 * 1024), - 'vms_mb': mem_info.vms / (1024 * 1024), - 'percent': process.memory_percent(), - 'available_mb': sys_mem.available / (1024 * 1024), - 'total_mb': sys_mem.total / (1024 * 1024), - 'system_percent': sys_mem.percent, - 'psutil_available': True + "rss_mb": mem_info.rss / (1024 * 1024), + "vms_mb": mem_info.vms / (1024 * 1024), + "percent": process.memory_percent(), + "available_mb": sys_mem.available / (1024 * 1024), + "total_mb": sys_mem.total / (1024 * 1024), + "system_percent": sys_mem.percent, + "psutil_available": True, } - except Exception as e: + except Exception: # noqa: BLE001 pass # Fall through to /proc fallback - + # Fallback for Linux: read from /proc/self/status - result = { - 'rss_mb': None, - 'vms_mb': None, - 'percent': None, - 'available_mb': None, - 'psutil_available': False - } - + result = {"rss_mb": None, "vms_mb": None, "percent": None, "available_mb": None, "psutil_available": False} + try: - with open('/proc/self/status', 'r') as f: + with open("/proc/self/status") as f: for line in f: - if line.startswith('VmRSS:'): + if line.startswith("VmRSS:"): # VmRSS is in kB rss_kb = int(line.split()[1]) - 
result['rss_mb'] = rss_kb / 1024 - elif line.startswith('VmSize:'): + result["rss_mb"] = rss_kb / 1024 + elif line.startswith("VmSize:"): vms_kb = int(line.split()[1]) - result['vms_mb'] = vms_kb / 1024 + result["vms_mb"] = vms_kb / 1024 except (FileNotFoundError, PermissionError, ValueError): pass # Not on Linux or can't read /proc - + # Try to get system memory from /proc/meminfo try: - with open('/proc/meminfo', 'r') as f: + with open("/proc/meminfo") as f: for line in f: - if line.startswith('MemAvailable:'): + if line.startswith("MemAvailable:"): avail_kb = int(line.split()[1]) - result['available_mb'] = avail_kb / 1024 - elif line.startswith('MemTotal:'): + result["available_mb"] = avail_kb / 1024 + elif line.startswith("MemTotal:"): total_kb = int(line.split()[1]) - result['total_mb'] = total_kb / 1024 + result["total_mb"] = total_kb / 1024 except (FileNotFoundError, PermissionError, ValueError): pass - + # Calculate percent if we have both values - if result.get('rss_mb') and result.get('total_mb'): - result['percent'] = (result['rss_mb'] / result['total_mb']) * 100 - + if result.get("rss_mb") and result.get("total_mb"): + result["percent"] = (result["rss_mb"] / result["total_mb"]) * 100 + return result def _log_memory_usage(context=""): - """ - Log current memory usage with context information. - + """Log current memory usage with context information. + Args: context (str): Description of where in the code this is being called. 
""" mem = _get_memory_usage() - - if not mem.get('psutil_available'): + + if not mem.get("psutil_available"): logger.debug("Memory monitoring: psutil not available (install with 'pip install psutil' for memory debugging)") return - - if mem.get('rss_mb') is not None: - logger.info("📊 MEMORY [%s]: Process RSS=%.1f MB (%.1f%%), System: %.1f%% used, %.1f MB available of %.1f MB total", - context, - mem['rss_mb'], - mem.get('percent', 0), - mem.get('system_percent', 0), - mem.get('available_mb', 0), - mem.get('total_mb', 0)) + + if mem.get("rss_mb") is not None: + logger.info( + "📊 MEMORY [%s]: Process RSS=%.1f MB (%.1f%%), System: %.1f%% used, %.1f MB available of %.1f MB total", + context, + mem["rss_mb"], + mem.get("percent", 0), + mem.get("system_percent", 0), + mem.get("available_mb", 0), + mem.get("total_mb", 0), + ) else: - logger.debug("Memory monitoring: Unable to get memory info - %s", mem.get('error', 'unknown error')) + logger.debug("Memory monitoring: Unable to get memory info - %s", mem.get("error", "unknown error")) def _force_garbage_collection(context=""): - """ - Force garbage collection and log the results. - + """Force garbage collection and log the results. + Args: context (str): Description of where in the code this is being called. 
""" before = _get_memory_usage() collected = gc.collect() after = _get_memory_usage() - - if before.get('rss_mb') is not None and after.get('rss_mb') is not None: - freed = before['rss_mb'] - after['rss_mb'] - logger.info("🗑️ GC [%s]: Collected %d objects, freed %.1f MB (%.1f MB -> %.1f MB)", - context, collected, freed, before['rss_mb'], after['rss_mb']) + + if before.get("rss_mb") is not None and after.get("rss_mb") is not None: + freed = before["rss_mb"] - after["rss_mb"] + logger.info( + "🗑️ GC [%s]: Collected %d objects, freed %.1f MB (%.1f MB -> %.1f MB)", + context, + collected, + freed, + before["rss_mb"], + after["rss_mb"], + ) else: logger.debug("GC [%s]: Collected %d objects", context, collected) @@ -281,17 +315,13 @@ def _force_garbage_collection(context=""): # Set up logger for this module logger = set_up_logger() timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") -random_suffix = os.urandom(3).hex() # random suffix for naming uniqueness +random_suffix = os.urandom(3).hex() # random suffix for naming uniqueness # Path to precompiled datasets binary if platform.system() == "Windows": - PRECOMPILED_DATASETS_PATH = os.path.join( - PACKAGE_PATH, "bins", "Windows", "datasets.exe" - ) + PRECOMPILED_DATASETS_PATH = os.path.join(PACKAGE_PATH, "bins", "Windows", "datasets.exe") else: - PRECOMPILED_DATASETS_PATH = os.path.join( - PACKAGE_PATH, "bins", platform.system(), "datasets" - ) + PRECOMPILED_DATASETS_PATH = os.path.join(PACKAGE_PATH, "bins", platform.system(), "datasets") # Cache for the datasets path to avoid repeated checks _datasets_path_cache = None @@ -300,6 +330,7 @@ def _force_garbage_collection(context=""): # HELPER FUNCTIONS FOR RETRIES AND ERROR TRACKING # ============================================================================= + def _retry_with_exponential_backoff( operation_name, operation_func, @@ -310,13 +341,12 @@ def _retry_with_exponential_backoff( retryable_exceptions=(requests.exceptions.ConnectionError, 
requests.exceptions.HTTPError), failed_commands=None, ): - """ - Execute an operation with exponential backoff retry logic. - + """Execute an operation with exponential backoff retry logic. + This is a reusable helper that consolidates the exponential backoff retry pattern used throughout the module. It handles retryable exceptions with configurable delays and logging. - + Args: operation_name (str): Name of the operation for logging (e.g., "batch_10"). operation_func (callable): Function to execute, should raise an exception on failure. @@ -325,34 +355,40 @@ def _retry_with_exponential_backoff( backoff_multiplier (float): Multiplier for exponential backoff. retryable_exceptions (tuple): Exception types to retry on. failed_commands (dict, optional): Dictionary to track failed operations. - - Returns: + + Returns + ------- tuple: (success, result, error_info) - success (bool): True if operation succeeded. - result: Return value of operation_func (or None if failed). - error_info (dict): Details about the failure (if any). + """ retry_delay = initial_delay last_exception = None - + for attempt in range(max_retries): try: result = operation_func() return True, result, None - + except retryable_exceptions as e: last_exception = e is_retryable = True - + # For HTTPError, check if it's a server error (5xx) - if isinstance(e, requests.exceptions.HTTPError) and hasattr(e, 'response') and e.response: + if isinstance(e, requests.exceptions.HTTPError) and hasattr(e, "response") and e.response: is_retryable = 500 <= e.response.status_code < 600 - + if attempt < max_retries - 1 and is_retryable: capped_delay = min(retry_delay, max_delay) logger.warning( "⚠️ %s failed (attempt %d/%d): %s. 
Retrying in %.1f seconds...", - operation_name, attempt + 1, max_retries, e, capped_delay + operation_name, + attempt + 1, + max_retries, + e, + capped_delay, ) time.sleep(capped_delay) retry_delay *= backoff_multiplier @@ -360,28 +396,27 @@ def _retry_with_exponential_backoff( else: # Out of retries or non-retryable error break - - except Exception as e: + + except Exception as e: # noqa: BLE001 # Non-retryable exception types last_exception = e break - + # Operation failed after retries error_info = { - 'error': str(last_exception), - 'exception_type': type(last_exception).__name__, + "error": str(last_exception), + "exception_type": type(last_exception).__name__, } - + return False, None, error_info def _track_failed_operation(failed_commands, operation_type, batch_info, error_info): - """ - Track a failed operation in the failed_commands dictionary. - + """Track a failed operation in the failed_commands dictionary. + This ensures consistent error tracking across all operation types for later reporting in the command summary. - + Args: failed_commands (dict): Dictionary to track failures. operation_type (str): Type of operation ('metadata_batch', 'sequence_batch', 'pagination', etc.). @@ -390,32 +425,33 @@ def _track_failed_operation(failed_commands, operation_type, batch_info, error_i """ if failed_commands is None: return - + if operation_type not in failed_commands: failed_commands[operation_type] = [] - + failure_record = {**batch_info, **error_info} failed_commands[operation_type].append(failure_record) logger.debug("Tracked failed %s: %s", operation_type, failure_record) def _validate_datasets_binary(path): - """ - Validate that a datasets binary exists and is functional. - + """Validate that a datasets binary exists and is functional. + Args: path (str): Path to the datasets binary to validate. - - Returns: + + Returns + ------- bool: True if the binary exists and runs successfully, False otherwise. 
+ """ if not path: return False - + # Check if the file exists (for bundled binary) or is in PATH (for system binary) if not os.path.isfile(path) and not shutil.which(path): return False - + # Verify the binary actually works try: result = subprocess.run( @@ -430,9 +466,8 @@ def _validate_datasets_binary(path): def _clear_datasets_cache(): - """ - Clear the cached datasets path, forcing re-detection on next call. - + """Clear the cached datasets path, forcing re-detection on next call. + This is useful when the environment changes (e.g., user installs/uninstalls the datasets CLI) or when the cached binary becomes unavailable. """ @@ -442,8 +477,7 @@ def _clear_datasets_cache(): def _get_datasets_path(): - """ - Get the path to the NCBI datasets CLI binary. + """Get the path to the NCBI datasets CLI binary. This helper first checks if datasets is available in the system PATH. If found, it uses the system-installed version. Otherwise, it falls back @@ -453,14 +487,17 @@ def _get_datasets_path(): invalidated if the cached binary becomes unavailable (e.g., deleted or environment changed), triggering re-detection. - Returns: + Returns + ------- str: Path to the datasets binary ("datasets" for system PATH, or full path for bundled). - Raises: + Raises + ------ RuntimeError: If no working datasets binary is available. + """ global _datasets_path_cache - + # If we have a cached path, validate it's still working if _datasets_path_cache is not None: if _validate_datasets_binary(_datasets_path_cache): @@ -468,11 +505,11 @@ def _get_datasets_path(): else: # Cached binary is no longer valid, clear cache and re-detect logger.warning( - "⚠️ Previously cached datasets binary at '%s' is no longer available. " - "Re-detecting...", _datasets_path_cache + "⚠️ Previously cached datasets binary at '%s' is no longer available. 
Re-detecting...", + _datasets_path_cache, ) _clear_datasets_cache() - + # First, check if datasets is available in the system PATH datasets_path = shutil.which("datasets") if datasets_path: @@ -484,17 +521,15 @@ def _get_datasets_path(): timeout=SUBPROCESS_VERSION_TIMEOUT, ) if result.returncode == 0: - logger.info( - "✅ Using system-installed NCBI datasets CLI: %s", result.stdout.strip() - ) + logger.info("✅ Using system-installed NCBI datasets CLI: %s", result.stdout.strip()) _datasets_path_cache = datasets_path return datasets_path except (subprocess.TimeoutExpired, OSError): pass # System binary didn't work, try bundled - + # Fall back to the bundled binary datasets_path = PRECOMPILED_DATASETS_PATH - + # Check if the precompiled binary exists if not os.path.isfile(datasets_path): raise RuntimeError( @@ -503,16 +538,14 @@ def _get_datasets_path(): "or install the NCBI datasets CLI manually: " "https://www.ncbi.nlm.nih.gov/datasets/docs/v2/download-and-install/" ) - + # On non-Windows systems, ensure the binary is executable if platform.system() != "Windows": try: os.chmod(datasets_path, os.stat(datasets_path).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) except OSError as e: - raise RuntimeError( - f"Failed to make NCBI datasets binary executable: {e}" - ) - + raise RuntimeError(f"Failed to make NCBI datasets binary executable: {e}") from e + # Verify the bundled binary works try: result = subprocess.run( @@ -522,30 +555,25 @@ def _get_datasets_path(): timeout=SUBPROCESS_VERSION_TIMEOUT, ) if result.returncode == 0: - logger.info( - "✅ Using bundled NCBI datasets CLI: %s", result.stdout.strip() - ) + logger.info("✅ Using bundled NCBI datasets CLI: %s", result.stdout.strip()) _datasets_path_cache = datasets_path return datasets_path except (subprocess.TimeoutExpired, OSError) as e: - raise RuntimeError( - f"Failed to verify bundled NCBI datasets binary at {datasets_path}: {e}" - ) - - raise RuntimeError( - f"NCBI datasets binary at {datasets_path} failed 
verification." - ) + raise RuntimeError(f"Failed to verify bundled NCBI datasets binary at {datasets_path}: {e}") from e + + raise RuntimeError(f"NCBI datasets binary at {datasets_path} failed verification.") def _get_datasets_version(): - """ - Get the version of the NCBI datasets CLI if available. - + """Get the version of the NCBI datasets CLI if available. + Attempts to retrieve the version string from the datasets binary. Returns None if datasets is not available or version check fails. - - Returns: + + Returns + ------- str or None: Version string from datasets (e.g., "16.11.0") or None if unavailable. + """ try: datasets_path = _get_datasets_path() @@ -562,28 +590,29 @@ def _get_datasets_version(): return version_output except (RuntimeError, subprocess.TimeoutExpired, OSError) as e: logger.debug("Could not retrieve datasets version: %s", e) - + return None def _get_gget_version(): - """ - Get the version of gget. - - Returns: + """Get the version of gget. + + Returns + ------- str: Version string (e.g., "1.2.0") or "unknown" if not available. + """ try: from . import __version__ + return __version__ except (ImportError, AttributeError): return "unknown" def _get_modified_virus_name(virus_name, attempt=1): - """ - Modify the virus name for retry attempts when the NCBI server is unreachable. - + """Modify the virus name for retry attempts when the NCBI server is unreachable. + This function generates alternative virus names to try when the initial query fails due to server unreachability. The modification strategies are: 1. (attempt=1) If the name contains parentheses, remove them and their contents. @@ -592,14 +621,15 @@ def _get_modified_virus_name(virus_name, attempt=1): (e.g., "Dengue" -> "Dengue virus") 3. (attempt=2) If the name ends with "virus" without a space, add a space. (e.g., "Denguevirus" -> "Dengue virus") - + Args: virus_name (str): Original virus name that failed. attempt (int): Which modification attempt this is (1 or 2). 
- - Returns: + + Returns + ------- str or None: Modified virus name to retry, or None if no modification is possible. - + Example: >>> _get_modified_virus_name("Lassa virus (LASV)", attempt=1) 'Lassa virus' @@ -611,160 +641,148 @@ def _get_modified_virus_name(virus_name, attempt=1): 'Dengue virus' >>> _get_modified_virus_name("Dengue virus", attempt=2) None # Already contains "virus" properly + """ if not virus_name: return None - + virus_lower = virus_name.lower().strip() - + # Attempt 1: Try removing parenthetical content if attempt == 1: # Check if there are parentheses to remove - if '(' in virus_name and ')' in virus_name: + if "(" in virus_name and ")" in virus_name: # Remove parenthetical content (e.g., "(LASV)" or "(strain XYZ)") - modified = re.sub(r'\s*\([^)]*\)\s*', ' ', virus_name).strip() + modified = re.sub(r"\s*\([^)]*\)\s*", " ", virus_name).strip() # Clean up any double spaces - modified = re.sub(r'\s+', ' ', modified) + modified = re.sub(r"\s+", " ", modified) if modified and modified.lower() != virus_lower: - logger.debug("Modified virus name by removing parentheses: '%s' -> '%s'", - virus_name, modified) + logger.debug("Modified virus name by removing parentheses: '%s' -> '%s'", virus_name, modified) return modified return None - + # Attempt 2: Try adding "virus" suffix or spacing if attempt == 2: # Check if the name already contains "virus" anywhere (case-insensitive) if "virus" in virus_lower: # Add a space before "virus" only if there isn't one already idx = virus_name.lower().rfind("virus") - if idx > 0 and virus_name[idx - 1] != ' ': + if idx > 0 and virus_name[idx - 1] != " ": modified = virus_name[:idx] + " " + virus_name[idx:] - logger.debug("Modified virus name by adding space before 'virus': '%s' -> '%s'", - virus_name, modified) + logger.debug("Modified virus name by adding space before 'virus': '%s' -> '%s'", virus_name, modified) return modified # Already has "virus" correctly spaced in the name, no modification needed return None - 
+ # Name doesn't contain "virus" anywhere, so append " virus" modified = virus_name + " virus" - logger.debug("Modified virus name by appending ' virus': '%s' -> '%s'", - virus_name, modified) + logger.debug("Modified virus name by appending ' virus': '%s' -> '%s'", virus_name, modified) return modified - + return None def _parse_accession_input(accession_input): - """ - Parse accession input which can be: + """Parse accession input which can be: + 1. Single accession: 'NC_045512.2' 2. Space-separated accessions: 'NC_045512.2 MN908947.3 MT020781.1' - 3. Path to text file: '/path/to/accessions.txt' (one accession per line) - + 3. Path to text file: '/path/to/accessions.txt' (one accession per line). + Args: accession_input (str): The accession input string. - - Returns: + + Returns + ------- dict: A dictionary with keys: - 'type': 'single', 'list', or 'file' - 'accessions': list of accession strings (for 'list' type) or single accession (for 'single') - 'file_path': file path (for 'file' type only) - 'is_file': True if input is a file path - - Raises: + + Raises + ------ ValueError: If file path doesn't exist or file is empty. 
- + Example: - >>> _parse_accession_input('NC_045512.2') + >>> _parse_accession_input("NC_045512.2") {'type': 'single', 'accessions': 'NC_045512.2', 'file_path': None, 'is_file': False} - - >>> _parse_accession_input('NC_045512.2 MN908947.3') + + >>> _parse_accession_input("NC_045512.2 MN908947.3") {'type': 'list', 'accessions': ['NC_045512.2', 'MN908947.3'], 'file_path': None, 'is_file': False} - - >>> _parse_accession_input('/path/to/accessions.txt') + + >>> _parse_accession_input("/path/to/accessions.txt") {'type': 'file', 'accessions': ['NC_045512.2', 'MN908947.3', ...], 'file_path': '/path/to/accessions.txt', 'is_file': True} + """ accession_input = accession_input.strip() - + # Check if input is a file path if os.path.isfile(accession_input): logger.info("Parsing accession numbers from file: %s", accession_input) try: - with open(accession_input, 'r') as f: + with open(accession_input) as f: accessions = [line.strip() for line in f if line.strip()] - + if not accessions: raise ValueError(f"Accession file {accession_input} is empty.") - + logger.info("Loaded %d accession(s) from file", len(accessions)) - return { - 'type': 'file', - 'accessions': accessions, - 'file_path': accession_input, - 'is_file': True - } - except IOError as e: - raise ValueError(f"Error reading accession file {accession_input}: {e}") - + return {"type": "file", "accessions": accessions, "file_path": accession_input, "is_file": True} + except OSError as e: + raise ValueError(f"Error reading accession file {accession_input}: {e}") from e + # Check if input is space-separated accessions - if ' ' in accession_input: + if " " in accession_input: accessions = accession_input.split() logger.info("Parsed %d accession(s) from space-separated input", len(accessions)) - return { - 'type': 'list', - 'accessions': accessions, - 'file_path': None, - 'is_file': False - } - + return {"type": "list", "accessions": accessions, "file_path": None, "is_file": False} + # Single accession logger.debug("Single 
accession input: %s", accession_input) - return { - 'type': 'single', - 'accessions': accession_input, - 'file_path': None, - 'is_file': False - } + return {"type": "single", "accessions": accession_input, "file_path": None, "is_file": False} def _parse_baseline_file(baseline_path): - """ - Parse a baseline metadata file to extract accession numbers for deduplication. - + """Parse a baseline metadata file to extract accession numbers for deduplication. + Supports multiple file formats: - CSV: Looks for 'accession' column (case-insensitive) - JSONL: Looks for 'accession' key in each JSON object - JSON: Looks for 'accession' key in a list of objects - Text: Treats each non-empty line as an accession number - + Accession numbers are normalized (stripped, lowercased) for consistent comparison. - + Args: baseline_path (str): Path to the baseline metadata file. - - Returns: + + Returns + ------- set: Set of normalized accession numbers from the baseline file. - - Raises: + + Raises + ------ FileNotFoundError: If the baseline file does not exist. ValueError: If no accessions could be extracted. + """ if not baseline_path or not os.path.exists(baseline_path): raise FileNotFoundError(f"Baseline file not found: {baseline_path}") - + baseline_accessions = set() file_ext = os.path.splitext(baseline_path)[1].lower() logger.info("Parsing baseline file: %s (format: %s)", baseline_path, file_ext or "auto-detect") - + try: - if file_ext == '.csv': + if file_ext == ".csv": # CSV format: look for 'accession' column df = pd.read_csv(baseline_path, low_memory=False) # Case-insensitive column name search acc_col = None for col in df.columns: - if col.strip().lower() == 'accession': + if col.strip().lower() == "accession": acc_col = col break if acc_col is None: @@ -772,33 +790,31 @@ def _parse_baseline_file(baseline_path): f"Baseline CSV file '{baseline_path}' has no 'accession' column. 
" f"Available columns: {list(df.columns)}" ) - baseline_accessions = set( - str(acc).strip().lower() for acc in df[acc_col].dropna() if str(acc).strip() - ) - - elif file_ext == '.jsonl': + baseline_accessions = {str(acc).strip().lower() for acc in df[acc_col].dropna() if str(acc).strip()} + + elif file_ext == ".jsonl": # JSONL format: one JSON object per line - with open(baseline_path, 'r', encoding='utf-8') as f: + with open(baseline_path, encoding="utf-8") as f: for line_num, line in enumerate(f, 1): line = line.strip() if not line: continue try: record = json.loads(line) - acc = record.get('accession', '') + acc = record.get("accession", "") if acc and str(acc).strip(): baseline_accessions.add(str(acc).strip().lower()) except json.JSONDecodeError: logger.debug("Skipping invalid JSON on line %d of baseline file", line_num) - - elif file_ext == '.json': + + elif file_ext == ".json": # JSON format: list of objects - with open(baseline_path, 'r', encoding='utf-8') as f: + with open(baseline_path, encoding="utf-8") as f: data = json.load(f) if isinstance(data, list): for record in data: if isinstance(record, dict): - acc = record.get('accession', '') + acc = record.get("accession", "") if acc and str(acc).strip(): baseline_accessions.add(str(acc).strip().lower()) elif isinstance(data, dict): @@ -812,52 +828,50 @@ def _parse_baseline_file(baseline_path): df = pd.read_csv(baseline_path, low_memory=False) acc_col = None for col in df.columns: - if col.strip().lower() == 'accession': + if col.strip().lower() == "accession": acc_col = col break if acc_col is not None: - baseline_accessions = set( - str(acc).strip().lower() for acc in df[acc_col].dropna() if str(acc).strip() - ) + baseline_accessions = {str(acc).strip().lower() for acc in df[acc_col].dropna() if str(acc).strip()} logger.debug("Auto-detected CSV format with 'accession' column") else: raise ValueError("No accession column found, trying text format") except (ValueError, pd.errors.ParserError): # Fall back to 
text format: one accession per line - with open(baseline_path, 'r', encoding='utf-8') as f: + with open(baseline_path, encoding="utf-8") as f: for line in f: line = line.strip() - if line and not line.startswith('#'): + if line and not line.startswith("#"): # Take first whitespace-delimited token as accession acc = line.split()[0] baseline_accessions.add(acc.lower()) logger.debug("Parsed as text format (one accession per line)") - + except (FileNotFoundError, ValueError): raise except Exception as e: raise ValueError(f"Failed to parse baseline file '{baseline_path}': {e}") from e - + if not baseline_accessions: raise ValueError( - f"No accessions found in baseline file '{baseline_path}'. " - f"Ensure the file contains accession numbers." + f"No accessions found in baseline file '{baseline_path}'. Ensure the file contains accession numbers." ) - + logger.info("✅ Loaded %d accessions from baseline file", len(baseline_accessions)) return baseline_accessions def _deduplicate_metadata_against_baseline(metadata_dict, baseline_accessions): - """ - Remove metadata records whose accessions are already in the baseline set. - + """Remove metadata records whose accessions are already in the baseline set. + Args: metadata_dict (dict): Dictionary mapping accession -> metadata. baseline_accessions (set): Set of normalized accession numbers from baseline. 
- - Returns: + + Returns + ------- tuple: (new_metadata_dict, skipped_count) + """ new_metadata = {} skipped_count = 0 @@ -868,14 +882,12 @@ def _deduplicate_metadata_against_baseline(metadata_dict, baseline_accessions): else: new_metadata[acc] = meta - logger.info("Deduplication results: %d new, %d skipped (already in baseline)", - len(new_metadata), skipped_count) + logger.info("Deduplication results: %d new, %d skipped (already in baseline)", len(new_metadata), skipped_count) return new_metadata, skipped_count def _save_partial_metadata(metadata_dict, outfolder, virus_clean, reason="api_failure"): - """ - Save partial metadata to CSV for recovery via --baseline. + """Save partial metadata to CSV for recovery via --baseline. Args: metadata_dict (dict): Dictionary mapping accession -> metadata. @@ -883,8 +895,10 @@ def _save_partial_metadata(metadata_dict, outfolder, virus_clean, reason="api_fa virus_clean (str): Sanitized virus name for the filename. reason (str): Reason for saving (for the filename). - Returns: + Returns + ------- str or None: Path to the saved partial metadata file. 
+ """ if not metadata_dict: return None @@ -895,48 +909,56 @@ def _save_partial_metadata(metadata_dict, outfolder, virus_clean, reason="api_fa try: rows = [] for acc, meta in metadata_dict.items(): - row = {'accession': acc} - for key in ['virus_name', 'length', 'completeness', 'releaseDate', - 'location', 'sourceDatabase', 'isolateName']: + row = {"accession": acc} + for key in [ + "virus_name", + "length", + "completeness", + "releaseDate", + "location", + "sourceDatabase", + "isolateName", + ]: if key in meta: row[key] = meta[key] - host_info = meta.get('host', {}) + host_info = meta.get("host", {}) if isinstance(host_info, dict): - row['host'] = host_info.get('organism_name', '') + row["host"] = host_info.get("organism_name", "") elif host_info: - row['host'] = str(host_info) + row["host"] = str(host_info) rows.append(row) df = pd.DataFrame(rows) df.to_csv(partial_file, index=False) logger.info("Partial metadata saved: %s (%d records)", partial_file, len(df)) return partial_file - except Exception as e: + except Exception as e: # noqa: BLE001 logger.warning("Failed to save partial metadata: %s", e) return None def _merge_baseline_with_new(baseline_path, new_metadata_list, output_path): - """ - Merge baseline metadata with newly fetched metadata into a single CSV. + """Merge baseline metadata with newly fetched metadata into a single CSV. Args: baseline_path (str): Path to the baseline metadata file. new_metadata_list (list): List of new metadata dictionaries. output_path (str): Path for the merged CSV output. - Returns: + Returns + ------- bool: True if merge was successful, False otherwise. 
+ """ try: # Load baseline data file_ext = os.path.splitext(baseline_path)[1].lower() - - if file_ext == '.csv': + + if file_ext == ".csv": baseline_df = pd.read_csv(baseline_path, low_memory=False) - elif file_ext == '.jsonl': + elif file_ext == ".jsonl": records = [] - with open(baseline_path, 'r', encoding='utf-8') as f: + with open(baseline_path, encoding="utf-8") as f: for line in f: line = line.strip() if line: @@ -945,106 +967,113 @@ def _merge_baseline_with_new(baseline_path, new_metadata_list, output_path): except json.JSONDecodeError: continue baseline_df = pd.DataFrame(records) - elif file_ext == '.json': - with open(baseline_path, 'r', encoding='utf-8') as f: + elif file_ext == ".json": + with open(baseline_path, encoding="utf-8") as f: data = json.load(f) if isinstance(data, list): baseline_df = pd.DataFrame(data) elif isinstance(data, dict): - baseline_df = pd.DataFrame(list(data.values()) if all(isinstance(v, dict) for v in data.values()) else [data]) + baseline_df = pd.DataFrame( + list(data.values()) if all(isinstance(v, dict) for v in data.values()) else [data] + ) else: baseline_df = pd.DataFrame() else: accessions = [] - with open(baseline_path, 'r', encoding='utf-8') as f: + with open(baseline_path, encoding="utf-8") as f: for line in f: line = line.strip() - if line and not line.startswith('#'): + if line and not line.startswith("#"): accessions.append(line.split()[0]) - baseline_df = pd.DataFrame({'accession': accessions}) - + baseline_df = pd.DataFrame({"accession": accessions}) + # Create new DataFrame from new metadata if new_metadata_list: new_df = pd.DataFrame(new_metadata_list) else: new_df = pd.DataFrame() - + # Merge: concatenate baseline + new merged_df = pd.concat([baseline_df, new_df], ignore_index=True, sort=False) acc_col = None for col in merged_df.columns: - if col.strip().lower() == 'accession': + if col.strip().lower() == "accession": acc_col = col break if acc_col: - merged_df = merged_df.drop_duplicates(subset=[acc_col], 
keep='last') - + merged_df = merged_df.drop_duplicates(subset=[acc_col], keep="last") + merged_df.to_csv(output_path, index=False) logger.info("Merged output saved: %s (%d total records)", output_path, len(merged_df)) return True - - except Exception as e: + + except Exception as e: # noqa: BLE001 logger.error("❌ Failed to merge baseline with new metadata: %s", e) return False def _calculate_max_accessions_per_batch(base_url_length): - """ - Calculate the maximum number of accessions that can fit in a single API URL. - + """Calculate the maximum number of accessions that can fit in a single API URL. + The NCBI API URL format for multiple accessions is: https://api.ncbi.nlm.nih.gov/datasets/v2/virus/accession/ACC1%2CACC2%2CACC3/dataset_report - + Args: base_url_length (int): Length of the base URL without accessions. - - Returns: + + Returns + ------- int: Maximum number of accessions per batch. - + Example: >>> _calculate_max_accessions_per_batch(80) 100 # Approximate, depends on accession lengths + """ # Calculate available space for accessions - available_length = MAX_URL_LENGTH - base_url_length - BUFFER_SIZE - + available_length = MAX_URL_LENGTH - base_url_length - BUFFER_SIZE + # Each accession takes: average accession length + URL-encoded comma (%2C = 3 chars) chars_per_accession = ACCESSION_AVG_LENGTH + len(ACCESSION_URL_ENCODING) - + max_accessions = max(1, available_length // chars_per_accession) - logger.debug("Calculated max accessions per batch: %d (URL limit: %d, base URL: %d)", - max_accessions, MAX_URL_LENGTH, base_url_length) - + logger.debug( + "Calculated max accessions per batch: %d (URL limit: %d, base URL: %d)", + max_accessions, + MAX_URL_LENGTH, + base_url_length, + ) + return max_accessions def _batch_accessions_for_url(accessions, base_url_length): - """ - Split a list of accessions into batches that fit within URL length limits. - + """Split a list of accessions into batches that fit within URL length limits. 
+ Args: accessions (list): List of accession numbers. base_url_length (int): Length of the base URL without accessions. - - Returns: + + Returns + ------- list: List of accession batches (each batch is a list of accessions). - + Example: - >>> batches = _batch_accessions_for_url(['NC_045512.2', 'MN908947.3', ...], 80) + >>> batches = _batch_accessions_for_url(["NC_045512.2", "MN908947.3", ...], 80) >>> len(batches) # Number of batches needed 3 + """ max_per_batch = _calculate_max_accessions_per_batch(base_url_length) - + batches = [] for i in range(0, len(accessions), max_per_batch): - batch = accessions[i:i + max_per_batch] + batch = accessions[i : i + max_per_batch] batches.append(batch) - - logger.info("Split %d accessions into %d batches (max %d per batch)", - len(accessions), len(batches), max_per_batch) - + + logger.info("Split %d accessions into %d batches (max %d per batch)", len(accessions), len(batches), max_per_batch) + return batches @@ -1059,18 +1088,17 @@ def _fetch_metadata_for_accession_list( failed_commands=None, temp_output_dir=None, ): - """ - Fetch metadata for a list of accessions, handling URL length limits with retries. - + """Fetch metadata for a list of accessions, handling URL length limits with retries. + This function fetches metadata for multiple accessions by: 1. Splitting the accession list into batches that fit within URL limits 2. Making separate API calls for each batch with exponential backoff retries 3. Combining all results into a single list 4. Continuing processing even if some batches fail (graceful degradation) - + The NCBI API URL format for multiple accessions is: https://api.ncbi.nlm.nih.gov/datasets/v2/virus/accession/ACC1%2CACC2%2CACC3/dataset_report - + Args: accessions (list): List of accession numbers to fetch. host (str, optional): Host organism filter. @@ -1081,48 +1109,59 @@ def _fetch_metadata_for_accession_list( refseq_only (bool, optional): RefSeq only filter. 
failed_commands (dict, optional): Dictionary to track failed operations. temp_output_dir (str, optional): Directory for temporary files. - - Returns: - list: Combined list of metadata records from all batches. + + Returns + ------- + list: Combined list of metadata records from all batches. Returns partial results even if some batches fail. - - Raises: + + Raises + ------ RuntimeError: If all batches fail to fetch. + """ if not accessions: logger.warning("No accessions provided to fetch metadata for") return [] - + # Initialize failed_commands tracking if not already done - if failed_commands is not None and 'api_batches' not in failed_commands: - failed_commands['api_batches'] = [] - + if failed_commands is not None and "api_batches" not in failed_commands: + failed_commands["api_batches"] = [] + # Calculate base URL length for batch sizing # BUFFER_SIZE accounts for query parameters (filters) added by fetch_virus_metadata base_url_length = len(f"{NCBI_API_BASE}/virus/accession//dataset_report") - + # Split accessions into URL-safe batches batches = _batch_accessions_for_url(accessions, base_url_length) - + all_reports = [] failed_batches = [] aggregated_deferred_filters = None # Track deferred filters from batches - - logger.info("Fetching metadata for %d accessions in %d batch(es) with exponential backoff retries", - len(accessions), len(batches)) - - for batch_num, batch in tqdm(enumerate(batches, 1), total=len(batches), desc="Fetching accession batches", unit="batch", disable=len(batches)==1): - logger.info("Processing accession batch %d/%d (%d accessions)", - batch_num, len(batches), len(batch)) - + + logger.info( + "Fetching metadata for %d accessions in %d batch(es) with exponential backoff retries", + len(accessions), + len(batches), + ) + + for batch_num, batch in tqdm( + enumerate(batches, 1), + total=len(batches), + desc="Fetching accession batches", + unit="batch", + disable=len(batches) == 1, + ): + logger.info("Processing accession batch %d/%d (%d 
accessions)", batch_num, len(batches), len(batch)) + # Join accessions with URL-encoded comma for the API URL accession_string = ACCESSION_URL_ENCODING.join(batch) - + # Define the fetch operation for retries def fetch_batch_metadata(): - """Callable for retry helper""" + """Callable for retry helper.""" return fetch_virus_metadata( - virus=accession_string, + virus=accession_string, # noqa: B023 accession=True, # This is an accession-based query host=host, geographic_location=geographic_location, @@ -1133,7 +1172,7 @@ def fetch_batch_metadata(): failed_commands=failed_commands, temp_output_dir=temp_output_dir, ) - + # Use exponential backoff helper for batch retries success, batch_result, error_info = _retry_with_exponential_backoff( operation_name=f"Accession batch {batch_num}/{len(batches)} ({len(batch)} accessions)", @@ -1141,10 +1180,14 @@ def fetch_batch_metadata(): max_retries=API_MAX_RETRIES, initial_delay=API_INITIAL_RETRY_DELAY, backoff_multiplier=API_RETRY_BACKOFF_MULTIPLIER, - retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError, requests.exceptions.Timeout), + retryable_exceptions=( + requests.exceptions.ConnectionError, + requests.exceptions.HTTPError, + requests.exceptions.Timeout, + ), failed_commands=failed_commands, ) - + # Unpack the tuple result from fetch_virus_metadata batch_reports = None batch_deferred_filters = None @@ -1154,13 +1197,13 @@ def fetch_batch_metadata(): else: # Backward compatibility if result is just a list batch_reports = batch_result - + # If batch_reports is a file path (string), read reports from it # This happens when fetch_virus_metadata streams to disk if isinstance(batch_reports, str) and os.path.isfile(batch_reports): file_reports = [] try: - with open(batch_reports, 'r', encoding='utf-8') as bf: + with open(batch_reports, encoding="utf-8") as bf: for line in bf: line = line.strip() if line: @@ -1168,10 +1211,10 @@ def fetch_batch_metadata(): file_reports.append(json.loads(line)) 
except json.JSONDecodeError: continue - except IOError: + except OSError: file_reports = [] batch_reports = file_reports - + if success and batch_reports: all_reports.extend(batch_reports) # Track deferred filters (should be the same across all batches if any) @@ -1181,9 +1224,9 @@ def fetch_batch_metadata(): tqdm.write(f"✅ Batch {batch_num}: Retrieved {len(batch_reports)} records") else: # Batch failed or returned empty - error_msg = error_info['error'] if error_info else "No data returned" + error_msg = error_info["error"] if error_info else "No data returned" tqdm.write(f"❌ Batch {batch_num} failed after {API_MAX_RETRIES} retries: {error_msg}") - + # Build URL with applied filters for manual retry base_url = f"{NCBI_API_BASE}/virus/accession/{accession_string}/dataset_report" query_params = [] @@ -1199,55 +1242,61 @@ def fetch_batch_metadata(): query_params.append(f"filter.geo_location={geographic_location.replace('_', ' ')}") if min_release_date: query_params.append(f"filter.released_since={min_release_date}T00:00:00.000Z") - + api_url = base_url + ("?" 
+ "&".join(query_params) if query_params else "") - + failed_batch_info = { - 'batch_num': batch_num, - 'accession_count': len(batch), - 'accessions': batch, - 'api_url': api_url, + "batch_num": batch_num, + "accession_count": len(batch), + "accessions": batch, + "api_url": api_url, } failed_batches.append(failed_batch_info) - + # Track in failed_commands for later reporting _track_failed_operation( failed_commands, - 'api_batches', + "api_batches", failed_batch_info, - error_info if error_info else {'error': 'No data returned', 'exception_type': 'EmptyResponse'} + error_info if error_info else {"error": "No data returned", "exception_type": "EmptyResponse"}, ) - + # Add a small delay between batches to respect rate limits if batch_num < len(batches): time.sleep(EUTILS_INTER_BATCH_DELAY) - + # Log summary if failed_batches: - logger.warning("⚠️ %d out of %d accession batches failed to fetch metadata", - len(failed_batches), len(batches)) + logger.warning("⚠️ %d out of %d accession batches failed to fetch metadata", len(failed_batches), len(batches)) for fb in failed_batches: - logger.debug("Failed batch %d (%d accessions): %s", - fb['batch_num'], fb['accession_count'], fb['accessions'][:3]) - + logger.debug( + "Failed batch %d (%d accessions): %s", fb["batch_num"], fb["accession_count"], fb["accessions"][:3] + ) + # Continue with partial results if at least some batches succeeded if all_reports: - logger.info("Successfully retrieved %d total metadata records from %d batches", - len(all_reports), len(batches) - len(failed_batches)) + logger.info( + "Successfully retrieved %d total metadata records from %d batches", + len(all_reports), + len(batches) - len(failed_batches), + ) if failed_batches: - logger.warning("⚠️ Continuing pipeline with partial results (%d/%d batches succeeded)", - len(batches) - len(failed_batches), len(batches)) + logger.warning( + "⚠️ Continuing pipeline with partial results (%d/%d batches succeeded)", + len(batches) - len(failed_batches), + 
len(batches), + ) if aggregated_deferred_filters: logger.info("Deferred filters will be applied in metadata filtering stage: %s", aggregated_deferred_filters) return all_reports, aggregated_deferred_filters - + # Only raise if ALL batches failed if failed_batches: raise RuntimeError( f"All {len(batches)} accession batches failed to fetch metadata. " f"Last error: {failed_batches[-1]['accessions']}" ) - + # Fallback (shouldn't reach here) logger.warning("No accession batches were processed") return [], None @@ -1265,14 +1314,13 @@ def _try_modified_virus_names( failed_commands, _retry_attempt, error_type="Error", - temp_output_dir=None + temp_output_dir=None, ): - """ - Try fetching virus metadata with modified virus names. - + """Try fetching virus metadata with modified virus names. + This helper function iterates through available retry strategies (modification of virus name) and attempts to fetch metadata with each modified name. - + Args: virus: Original virus name. accession: Accession filter. @@ -1285,20 +1333,26 @@ def _try_modified_virus_names( failed_commands: List to track failed commands. _retry_attempt: Current retry attempt number. error_type: String describing the error type for logging. - - Returns: + + Returns + ------- list or None: The fetched metadata if successful, None if all retries failed. + """ if _retry_attempt >= 2 or accession or virus.isdigit(): return None - + # Try modification strategies in order for attempt_num in range(_retry_attempt + 1, 3): # Try remaining attempts (1 and/or 2) modified_virus = _get_modified_virus_name(virus, attempt=attempt_num) if modified_virus: - logger.warning("%s with virus name '%s'. " - "Retrying with modified name: '%s' (strategy %d)", - error_type, virus, modified_virus, attempt_num) + logger.warning( + "%s with virus name '%s'. 
Retrying with modified name: '%s' (strategy %d)", + error_type, + virus, + modified_virus, + attempt_num, + ) try: return fetch_virus_metadata( virus=modified_virus, @@ -1317,7 +1371,7 @@ def _try_modified_virus_names( logger.warning("Retry with modified virus name '%s' failed", modified_virus) # Continue to try next strategy continue - + # All retry strategies exhausted logger.warning("All retry strategies failed") return None @@ -1336,18 +1390,17 @@ def fetch_virus_metadata( _retry_attempt=0, temp_output_dir=None, ): - """ - Fetch virus metadata using NCBI Datasets API. - + """Fetch virus metadata using NCBI Datasets API. + This function retrieves metadata for virus sequences from the NCBI Datasets API using either taxon-based or accession-based queries. It handles pagination automatically to retrieve all available results. - + When the server is unreachable, this function will automatically retry with modified virus names: 1. First retry: Remove parenthetical content (e.g., "(LASV)") 2. Second retry: Add " virus" suffix or fix spacing - + Args: virus (str): Virus taxon name/ID or accession number. accession (bool): Whether virus parameter is an accession number. @@ -1359,26 +1412,28 @@ def fetch_virus_metadata( refseq_only (bool, optional): Limit to RefSeq genomes only. failed_commands (dict, optional): Dictionary to track failed operations. _retry_attempt (int): Internal counter for retry attempts (0=original, 1=first retry, 2=second retry). - - Returns: + + Returns + ------- list: List of virus metadata records from the API response. - - Raises: + + Raises + ------ RuntimeError: If the API request fails. - + Note: Metadata is streamed to a temporary JSONL file during fetching to reduce RAM usage for large datasets. If temp_output_dir is provided, the file is saved there; otherwise it's saved in the system temp directory. 
+ """ - metadata_file = None temp_metadata_file = None - + # Save original filter values before URL encoding (for deferred local filtering) original_geographic_location = geographic_location original_host = host - + # Choose the appropriate API endpoint based on whether we're querying by accession or taxon if accession: # For accession numbers (e.g., NC_045512.2), use the accession-specific endpoint @@ -1388,7 +1443,7 @@ def fetch_virus_metadata( logger.debug("Using accession endpoint for virus: %s", virus) params = {} else: - # For taxon names/IDs (e.g., 'Zika Virus', 'influenza'), use the taxon endpoint + # For taxon names/IDs (e.g., 'Zika Virus', 'influenza'), use the taxon endpoint url = f"{NCBI_API_BASE}/virus/taxon/{virus}/dataset_report" logger.debug("Using taxon endpoint for virus: %s", virus) params = {} @@ -1397,50 +1452,50 @@ def fetch_virus_metadata( # These filters are applied server-side before results are returned if refseq_only: # Limit results to RefSeq database entries only - params['filter.refseq_only'] = 'true' + params["filter.refseq_only"] = "true" logger.debug("Applied RefSeq-only filter") - + if annotated is True: # Only return sequences that have been annotated with gene/protein information - params['filter.annotated_only'] = 'true' + params["filter.annotated_only"] = "true" logger.debug("Applied annotated-only filter") - + if complete_only: # Only return complete genome sequences (not partial sequences) - params['filter.complete_only'] = 'true' + params["filter.complete_only"] = "true" logger.debug("Applied complete-only filter") - + if host: # Filter by host organism name, replacing underscores with spaces for API compatibility - host = host.strip('"\'-_<|>`\'') - host = host.replace('-', '+').replace('_', '+').replace(' ', '+') - params['filter.host'] = host + host = host.strip("\"'-_<|>`'") # noqa: B005 + host = host.replace("-", "+").replace("_", "+").replace(" ", "+") + params["filter.host"] = host logger.debug("Applied host filter: 
%s", host) - + if geographic_location: # Filter by geographic location, replacing underscores with spaces for API compatibility geographic_location = geographic_location.strip('"-_<|>`') geographic_location = geographic_location.replace("'", "%27").replace("`", "%27") - geographic_location = geographic_location.replace('-', '+').replace('_', '+').replace(' ', '+') - params['filter.geo_location'] = geographic_location + geographic_location = geographic_location.replace("-", "+").replace("_", "+").replace(" ", "+") + params["filter.geo_location"] = geographic_location logger.debug("Applied geographic location filter: %s", geographic_location) if min_release_date: # Convert date to ISO format expected by the API (YYYY-MM-DDTHH:MM:SS.sssZ) - params['filter.released_since'] = f"{min_release_date}T00:00:00.000Z" + params["filter.released_since"] = f"{min_release_date}T00:00:00.000Z" logger.debug("Applied minimum release date filter: %s", min_release_date) # Set page size to maximum allowed to minimize the number of API calls needed # The NCBI API supports pagination for large result sets - params['page_size'] = API_PAGE_SIZE + params["page_size"] = API_PAGE_SIZE logger.debug("Set page size to maximum: %d records per request", API_PAGE_SIZE) - + # Initialize variables for handling paginated results total_records_streamed = 0 # Counter for records written to temp file (NOT held in RAM) - page_token = None # Token for accessing subsequent pages - page_count = 0 # Track number of pages processed for logging - pages_pbar = None # Progress bar for pagination (created when we know total pages) - + page_token = None # Token for accessing subsequent pages + page_count = 0 # Track number of pages processed for logging + pages_pbar = None # Progress bar for pagination (created when we know total pages) + # Create a temporary file to stream metadata as it arrives from the API # This prevents large datasets from consuming all system RAM # Save in output temp directory @@ -1448,23 
+1503,23 @@ def fetch_virus_metadata( temp_metadata_file = os.path.join(temp_output_dir, f"gget_metadata_{timestamp}_{random_suffix}.jsonl") metadata_file = None try: - metadata_file = open(temp_metadata_file, 'w', encoding='utf-8') + metadata_file = open(temp_metadata_file, "w", encoding="utf-8") logger.info("Streaming API metadata to temporary file: %s", temp_metadata_file) - except IOError as e: + except OSError as e: logger.warning("Could not open temporary metadata file for streaming: %s. Metadata will be held in RAM.", e) temp_metadata_file = None - + # Main pagination loop - continue until all pages are retrieved loop = True while loop: page_count += 1 - + # Add pagination token if we're not on the first page if page_token: - params['page_token'] = page_token - + params["page_token"] = page_token + def fetch_single_page(): - """Callable that fetches a single page of results""" + """Callable that fetches a single page of results.""" # Build the query string manually to preserve '+' characters in filter values # The requests library would URL-encode '+' to '%2B', but NCBI API expects literal '+' query_parts = [] @@ -1473,13 +1528,13 @@ def fetch_single_page(): encoded_value = quote(str(value), safe="+:") query_parts.append(f"{key}={encoded_value}") full_url = url + ("?" 
+ "&".join(query_parts) if query_parts else "") - - # Make the HTTP GET request to the NCBI API + + # Make the HTTP GET request to the NCBI API logger.debug("Making API request to: %s", url) logger.debug("Request parameters: %s", params) response = requests.get(full_url, timeout=API_REQUEST_TIMEOUT) logger.debug("Explicit URL request sent: %s", response.url) - + # Raise an exception if the HTTP request failed (4xx or 5xx status codes) response.raise_for_status() @@ -1491,9 +1546,9 @@ def fetch_single_page(): f"NCBI API returned non-JSON response (HTTP {response.status_code}): {response.text[:200]}" ) from e logger.debug("Received response with %d bytes", len(response.content)) - + return data - + # Use exponential backoff helper for single page fetch success, page_data, error_info = _retry_with_exponential_backoff( operation_name=f"API page {page_count}", @@ -1501,32 +1556,41 @@ def fetch_single_page(): max_retries=API_MAX_RETRIES, initial_delay=API_INITIAL_RETRY_DELAY, backoff_multiplier=API_RETRY_BACKOFF_MULTIPLIER, - retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError, requests.exceptions.Timeout), + retryable_exceptions=( + requests.exceptions.ConnectionError, + requests.exceptions.HTTPError, + requests.exceptions.Timeout, + ), failed_commands=failed_commands, ) - + # If the initial page fetch failed, try filter removal strategies FIRST, then page size reduction if not success and _retry_attempt == 0: - logger.debug("⚠️ Page fetch failed with page_size=%d. Trying filter removal strategies...", params['page_size']) - + logger.debug( + "⚠️ Page fetch failed with page_size=%d. 
Trying filter removal strategies...", params["page_size"] + ) + # Helper to close temp files before retry def close_temp_files(): nonlocal pages_pbar, metadata_file if pages_pbar: pages_pbar.close() pages_pbar = None - if metadata_file: + if metadata_file: # noqa: B023 try: - metadata_file.close() + metadata_file.close() # noqa: B023 except OSError as e: logger.debug("Failed to clean up metadata_file: %s", e) - + # STRATEGY 1: If geo_location filter exists, try without it (keeping host) if not success and geographic_location: logger.warning("🔄 FETCH FAILED - ATTEMPTING WITHOUT GEOGRAPHIC FILTER") - logger.warning("Retrying without the geographic_location filter '%s' (will be applied later)...", original_geographic_location) + logger.warning( + "Retrying without the geographic_location filter '%s' (will be applied later)...", + original_geographic_location, + ) close_temp_files() - + try: retry_result = fetch_virus_metadata( virus=virus, @@ -1541,26 +1605,31 @@ def close_temp_files(): _retry_attempt=1, # Mark as retry to prevent infinite loops temp_output_dir=temp_output_dir, ) - + # Handle None return (signals chunking needed) - propagate it if retry_result is None: - logger.warning("Retry without geographic filter returned None (dataset too large for single request)") + logger.warning( + "Retry without geographic filter returned None (dataset too large for single request)" + ) else: retry_reports = retry_result[0] if isinstance(retry_result, tuple) else retry_result if retry_reports is not None: logger.info("✅ Successfully retrieved records without geographic filter") - logger.info("Geographic location filter '%s' will be applied during metadata filtering", original_geographic_location) - return retry_reports, {'geographic_location': original_geographic_location} - except Exception as retry_error: + logger.info( + "Geographic location filter '%s' will be applied during metadata filtering", + original_geographic_location, + ) + return retry_reports, 
{"geographic_location": original_geographic_location} + except Exception as retry_error: # noqa: BLE001 logger.warning("Retry without geographic filter failed: %s", retry_error) - + # STRATEGY 2: If BOTH geo_location and host filters exist, try without both # Skip this strategy for "all viruses" (taxon 10239) since downloading ~15M unfiltered records is not viable as a retry strategy - chunked download handles it if not success and geographic_location and host and virus != NCBI_ALL_VIRUSES_TAXID: logger.warning("🔄 ATTEMPTING WITHOUT BOTH GEOGRAPHIC AND HOST FILTERS") logger.warning("Retrying without both filters (will be applied later)...") close_temp_files() - + try: retry_result = fetch_virus_metadata( virus=virus, @@ -1575,29 +1644,39 @@ def close_temp_files(): _retry_attempt=1, # Mark as retry to prevent infinite loops temp_output_dir=temp_output_dir, ) - + # Handle None return (signals chunking needed) - propagate it if retry_result is None: - logger.warning("Retry without both filters returned None (dataset too large for single request)") + logger.warning( + "Retry without both filters returned None (dataset too large for single request)" + ) else: retry_reports = retry_result[0] if isinstance(retry_result, tuple) else retry_result if retry_reports is not None: logger.info("✅ Successfully retrieved records without geographic and host filters") - logger.info("Geographic location filter '%s' will be applied during metadata filtering", original_geographic_location) + logger.info( + "Geographic location filter '%s' will be applied during metadata filtering", + original_geographic_location, + ) logger.info("Host filter '%s' will be applied during metadata filtering", original_host) - return retry_reports, {'geographic_location': original_geographic_location, 'host': original_host} - except Exception as retry_error: + return retry_reports, { + "geographic_location": original_geographic_location, + "host": original_host, + } + except Exception as retry_error: # 
noqa: BLE001 logger.warning("Retry without both filters failed: %s", retry_error) elif not success and geographic_location and host and virus == NCBI_ALL_VIRUSES_TAXID: - logger.info("Skipping unfiltered retry for 'all viruses' taxon (dataset too large) - will use chunked download") - + logger.info( + "Skipping unfiltered retry for 'all viruses' taxon (dataset too large) - will use chunked download" + ) + # STRATEGY 3: If host filter exists (whether or not geo_location was tried), try without host only # Skip for "all viruses" taxon - the API also fails with just geo filter for this taxon if not success and host and virus != NCBI_ALL_VIRUSES_TAXID: logger.warning("🔄 ATTEMPTING WITHOUT HOST FILTER ONLY") logger.warning("Retrying without the host filter '%s' (will be applied later)...", original_host) close_temp_files() - + try: retry_result = fetch_virus_metadata( virus=virus, @@ -1612,7 +1691,7 @@ def close_temp_files(): _retry_attempt=1, # Mark as retry to prevent infinite loops temp_output_dir=temp_output_dir, ) - + # Handle None return (signals chunking needed) - propagate it if retry_result is None: logger.warning("Retry without host filter returned None (dataset too large for single request)") @@ -1621,35 +1700,39 @@ def close_temp_files(): if retry_reports is not None: logger.info("✅ Successfully retrieved records without host filter") logger.info("Host filter '%s' will be applied during metadata filtering", original_host) - return retry_reports, {'host': original_host} - except Exception as retry_error: + return retry_reports, {"host": original_host} + except Exception as retry_error: # noqa: BLE001 logger.warning("❌ Retry without host filter failed: %s", retry_error) - + # STRATEGY 4: If all filter removal strategies failed, try reducing page size # Skip for all-viruses taxon since the issue is query scope, not page size - if not success and params['page_size'] > MIN_PAGE_SIZE_FALLBACK and virus != NCBI_ALL_VIRUSES_TAXID: + if not success and 
params["page_size"] > MIN_PAGE_SIZE_FALLBACK and virus != NCBI_ALL_VIRUSES_TAXID: logger.info("All filter removal strategies failed. Trying smaller page sizes...") - + # Re-open temp file for continued attempts try: - metadata_file = open(temp_metadata_file, 'a', encoding='utf-8') - except IOError: + metadata_file = open(temp_metadata_file, "a", encoding="utf-8") + except OSError: metadata_file = None - - current_page_size = params['page_size'] + + current_page_size = params["page_size"] page_size_retry_count = 0 - + while not success and current_page_size > MIN_PAGE_SIZE_FALLBACK: # Decrease page size for next retry current_page_size = max(MIN_PAGE_SIZE_FALLBACK, current_page_size - PAGE_SIZE_FALLBACK_DECREMENT) page_size_retry_count += 1 - - logger.debug("📉 Attempting retry #%d with page_size=%d (page %d)", - page_size_retry_count, current_page_size, page_count) - + + logger.debug( + "📉 Attempting retry #%d with page_size=%d (page %d)", + page_size_retry_count, + current_page_size, + page_count, + ) + # Update params with new page size - params['page_size'] = current_page_size - + params["page_size"] = current_page_size + # Retry the fetch with the smaller page size success, page_data, error_info = _retry_with_exponential_backoff( operation_name=f"API page {page_count} (page_size={current_page_size})", @@ -1657,71 +1740,81 @@ def close_temp_files(): max_retries=API_MAX_RETRIES, initial_delay=API_INITIAL_RETRY_DELAY, backoff_multiplier=API_RETRY_BACKOFF_MULTIPLIER, - retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError, requests.exceptions.Timeout), + retryable_exceptions=( + requests.exceptions.ConnectionError, + requests.exceptions.HTTPError, + requests.exceptions.Timeout, + ), failed_commands=failed_commands, ) - + if success: - logger.debug("✅ Successfully fetched page with page_size=%d after %d retry attempt(s)", - current_page_size, page_size_retry_count) + logger.debug( + "✅ Successfully fetched page with page_size=%d 
after %d retry attempt(s)", + current_page_size, + page_size_retry_count, + ) # Update the global page_size for remaining pages if successful - params['page_size'] = current_page_size - + params["page_size"] = current_page_size + # If progress bar exists, recalculate total pages based on new page size if pages_pbar is not None and page_data: - total_count = page_data.get('total_count', 0) + total_count = page_data.get("total_count", 0) if total_count > 0: new_total_pages = (total_count + current_page_size - 1) // current_page_size - logger.debug("📊 Recalculating progress bar: page_size changed to %d, total pages now: %d", - current_page_size, new_total_pages) + logger.debug( + "📊 Recalculating progress bar: page_size changed to %d, total pages now: %d", + current_page_size, + new_total_pages, + ) pages_pbar.total = new_total_pages break - + # If still failed after trying all page sizes down to minimum if not success: logger.warning("⚠️ All page size fallback attempts failed (page %d)", page_count) - + # Handle page fetch result if success and page_data: # Extract the virus reports from the response - reports = page_data.get('reports', []) + reports = page_data.get("reports", []) # Create progress bar on first page when we know total pages # Use current page_size (which may have been reduced) for accurate total page calculation if pages_pbar is None and page_count == 1: - total_pages = page_data.get('total_count', 0) - current_page_size = params['page_size'] + total_pages = page_data.get("total_count", 0) + current_page_size = params["page_size"] if total_pages > 0: total_pages = (total_pages + current_page_size - 1) // current_page_size pages_pbar = tqdm(total=max(total_pages, 1), desc="Fetching pages", unit="page", leave=False) if pages_pbar: pages_pbar.update(1) pages_pbar.set_postfix({"records": total_records_streamed}) - + # Stream reports to temporary file if available if metadata_file and reports: try: for report in reports: - 
metadata_file.write(json.dumps(report) + '\n') + metadata_file.write(json.dumps(report) + "\n") metadata_file.flush() # Ensure data is written to disk logger.debug("Streamed %d records to temporary metadata file", len(reports)) - except IOError as e: + except OSError as e: logger.warning("Error writing to temporary metadata file: %s", e) - + # Track count only - records are on disk, NOT held in RAM total_records_streamed += len(reports) - + # Check if there are more pages to retrieve - next_page_token = page_data.get('next_page_token') + next_page_token = page_data.get("next_page_token") if not next_page_token: if pages_pbar: pages_pbar.close() loop = False break - + # Set up for the next page page_token = next_page_token logger.debug("Next page token received, continuing pagination...") - + else: # Page fetch failed after retries or returned empty data # Handle case where success=True but page_data is empty (error_info will be None) @@ -1746,12 +1839,15 @@ def close_temp_files(): metadata_file.close() except OSError as e: logger.debug("Failed to clean up metadata_file: %s", e) - + # FALLBACK 1: If geo_location filter exists, try without it if geographic_location: logger.warning("🔄 EMPTY RESPONSE - ATTEMPTING WITHOUT GEOGRAPHIC FILTER") - logger.warning("Retrying without the geographic_location filter '%s' (will be applied locally)...", original_geographic_location) - + logger.warning( + "Retrying without the geographic_location filter '%s' (will be applied locally)...", + original_geographic_location, + ) + try: retry_result = fetch_virus_metadata( virus=virus, @@ -1766,30 +1862,39 @@ def close_temp_files(): _retry_attempt=1, # Mark as retry to prevent infinite loops temp_output_dir=temp_output_dir, ) - + # Handle None return (signals chunking needed) - propagate it if retry_result is None: - logger.warning("Retry without geographic filter returned None (dataset too large for single request)") + logger.warning( + "Retry without geographic filter returned None 
(dataset too large for single request)" + ) else: retry_reports = retry_result[0] if isinstance(retry_result, tuple) else retry_result if retry_reports is not None: logger.info("✅ Successfully retrieved records without geographic filter") - logger.info("Geographic location filter '%s' will be applied during metadata filtering", original_geographic_location) - return retry_reports, {'geographic_location': original_geographic_location} - except Exception as retry_error: + logger.info( + "Geographic location filter '%s' will be applied during metadata filtering", + original_geographic_location, + ) + return retry_reports, {"geographic_location": original_geographic_location} + except Exception as retry_error: # noqa: BLE001 logger.warning("Retry without geographic filter failed: %s", retry_error) - + # FALLBACK 2: If host filter exists and geo retry failed or wasn't tried if host: logger.warning("🔄 ATTEMPTING WITHOUT HOST FILTER") - logger.warning("Retrying without the host filter '%s' (will be applied locally)...", original_host) - + logger.warning( + "Retrying without the host filter '%s' (will be applied locally)...", original_host + ) + try: retry_result = fetch_virus_metadata( virus=virus, accession=accession, host=None, # Remove host filter - geographic_location=None if geographic_location else None, # Also remove geo if present + geographic_location=None + if geographic_location + else None, # Also remove geo if present annotated=annotated, complete_only=complete_only, min_release_date=min_release_date, @@ -1798,71 +1903,80 @@ def close_temp_files(): _retry_attempt=1, temp_output_dir=temp_output_dir, ) - + # Handle None return (signals chunking needed) - propagate it if retry_result is None: - logger.warning("Retry without host filter returned None (dataset too large for single request)") + logger.warning( + "Retry without host filter returned None (dataset too large for single request)" + ) else: retry_reports = retry_result[0] if isinstance(retry_result, 
tuple) else retry_result if retry_reports is not None: - deferred = {'host': original_host} + deferred = {"host": original_host} if geographic_location: - deferred['geographic_location'] = original_geographic_location + deferred["geographic_location"] = original_geographic_location logger.info("✅ Successfully retrieved records without filters") - logger.info("Deferred filters will be applied during metadata filtering: %s", list(deferred.keys())) + logger.info( + "Deferred filters will be applied during metadata filtering: %s", + list(deferred.keys()), + ) return retry_reports, deferred - except Exception as retry_error: + except Exception as retry_error: # noqa: BLE001 logger.warning("Retry without host filter failed: %s", retry_error) - + # All fallback strategies exhausted or we're already in retry mode error_msg = f"API request returned no data for {virus}. The dataset may be empty or unavailable. Please verify the virus name and filters, or try again later." if failed_commands is not None: - failed_commands['empty_response'] = {'error': error_msg} + failed_commands["empty_response"] = {"error": error_msg} logger.error(error_msg) raise RuntimeError(error_msg) from None - + last_exception = error_info - - if isinstance(last_exception.get('exception_type'), str) and last_exception['exception_type'] == 'Timeout': + + if isinstance(last_exception.get("exception_type"), str) and last_exception["exception_type"] == "Timeout": # For pagination timeouts, we can continue with partial results if page_count > 1 and total_records_streamed > 0: # We have collected some pages already logger.warning("⚠️ Request timed out while fetching additional pages (page %d)", page_count) logger.info("Continuing with %d records collected so far...", total_records_streamed) - + # Track timeout in failed_commands for user reference if failed_commands is not None: - if 'pagination_timeouts' not in failed_commands: - failed_commands['pagination_timeouts'] = [] - 
failed_commands['pagination_timeouts'].append({ - 'page': page_count, - 'error': 'API request timeout', - 'url': url, - 'records_retrieved': total_records_streamed, - }) - + if "pagination_timeouts" not in failed_commands: + failed_commands["pagination_timeouts"] = [] + failed_commands["pagination_timeouts"].append( + { + "page": page_count, + "error": "API request timeout", + "url": url, + "records_retrieved": total_records_streamed, + } + ) + # Break pagination loop and return partial results loop = False break else: # Handle timeout error with specific guidance for known problematic filters - error_msg = f"Request timed out while fetching virus metadata: {last_exception.get('error', 'Unknown')}" - + error_msg = ( + f"Request timed out while fetching virus metadata: {last_exception.get('error', 'Unknown')}" + ) + # Track API timeout information for summary if failed_commands is not None: - failed_commands['api_timeout'] = { - 'error': 'API request timeout', - 'url': url, - 'alternative_command': None + failed_commands["api_timeout"] = { + "error": "API request timeout", + "url": url, + "alternative_command": None, } - + # Log the timeout error before raising logger.error("=" * 80) logger.error("REQUEST TIMEOUT") logger.error("=" * 80) logger.error(error_msg) logger.error("=" * 80) - + # Close temporary file and progress bar before raising exception if pages_pbar: pages_pbar.close() @@ -1871,26 +1985,28 @@ def close_temp_files(): metadata_file.close() except OSError as e: logger.debug("Failed to clean up metadata_file: %s", e) - + raise RuntimeError(error_msg) from None - - elif last_exception.get('exception_type') == 'ConnectionError': + + elif last_exception.get("exception_type") == "ConnectionError": # For pagination connection errors, continue with partial results if available if page_count > 1 and total_records_streamed > 0: logger.warning("⚠️ Connection error while fetching additional pages (page %d)", page_count) logger.info("Continuing with %d records 
collected so far...", total_records_streamed) - + # Track error in failed_commands if failed_commands is not None: - if 'pagination_errors' not in failed_commands: - failed_commands['pagination_errors'] = [] - failed_commands['pagination_errors'].append({ - 'page': page_count, - 'error_type': 'ConnectionError', - 'error': last_exception.get('error', 'Unknown'), - 'records_retrieved': total_records_streamed, - }) - + if "pagination_errors" not in failed_commands: + failed_commands["pagination_errors"] = [] + failed_commands["pagination_errors"].append( + { + "page": page_count, + "error_type": "ConnectionError", + "error": last_exception.get("error", "Unknown"), + "records_retrieved": total_records_streamed, + } + ) + loop = False break else: @@ -1908,75 +2024,93 @@ def close_temp_files(): failed_commands=failed_commands, _retry_attempt=_retry_attempt, error_type="Connection error", - temp_output_dir=temp_output_dir + temp_output_dir=temp_output_dir, ) if retry_result is not None: return retry_result - + # Log the connection error before raising - error_msg = f"Connection error while fetching virus metadata: {last_exception.get('error', 'Unknown')}" + error_msg = ( + f"Connection error while fetching virus metadata: {last_exception.get('error', 'Unknown')}" + ) logger.error("=" * 80) logger.error("CONNECTION ERROR") logger.error("=" * 80) logger.error(error_msg) logger.error("Please check your internet connection and try again.") logger.error("=" * 80) - + # Close temporary file before raising exception if metadata_file: try: metadata_file.close() except OSError as e: logger.debug("Failed to clean up metadata_file: %s", e) - + raise RuntimeError(error_msg) from None - - elif last_exception.get('exception_type') == 'HTTPError': + + elif last_exception.get("exception_type") == "HTTPError": # For pagination HTTP errors, continue with partial results if available if page_count > 1 and total_records_streamed > 0: - logger.warning("⚠️ HTTP error while fetching additional 
pages (page %d): %s", page_count, last_exception.get('error')) + logger.warning( + "⚠️ HTTP error while fetching additional pages (page %d): %s", + page_count, + last_exception.get("error"), + ) logger.info("Continuing with %d records collected so far...", total_records_streamed) - + # Track error in failed_commands if failed_commands is not None: - if 'pagination_errors' not in failed_commands: - failed_commands['pagination_errors'] = [] - failed_commands['pagination_errors'].append({ - 'page': page_count, - 'error_type': 'HTTPError', - 'error': last_exception.get('error', 'Unknown'), - 'records_retrieved': total_records_streamed, - }) - + if "pagination_errors" not in failed_commands: + failed_commands["pagination_errors"] = [] + failed_commands["pagination_errors"].append( + { + "page": page_count, + "error_type": "HTTPError", + "error": last_exception.get("error", "Unknown"), + "records_retrieved": total_records_streamed, + } + ) + loop = False break else: # Handle HTTP errors with specific guidance for known issues error_msg = f"HTTP error while fetching virus metadata: {last_exception.get('error', 'Unknown')}" - + # Check for specific server error patterns (5xx errors indicate server unreachability) - is_server_error = '500' in last_exception.get('error', '') or '502' in last_exception.get('error', '') or '503' in last_exception.get('error', '') or '504' in last_exception.get('error', '') - + is_server_error = ( + "500" in last_exception.get("error", "") + or "502" in last_exception.get("error", "") + or "503" in last_exception.get("error", "") + or "504" in last_exception.get("error", "") + ) + if is_server_error: # Special handling for "all viruses" query # If this is the first page and we're querying all viruses without date filters, # the dataset is too large for NCBI to handle - need to chunk by date - if virus == NCBI_ALL_VIRUSES_TAXID and not accession and page_count == 1 and not min_release_date: + if ( + virus == NCBI_ALL_VIRUSES_TAXID + and not 
accession + and page_count == 1 + and not min_release_date + ): logger.warning("⚠️ NCBI API cannot handle 'all viruses' query in a single request") logger.info("🔄 Automatically switching to date-chunked download strategy...") logger.info("This will split the download into yearly chunks to avoid server overload") - + # Close temporary file before returning None if metadata_file: try: metadata_file.close() except OSError as e: logger.debug("Failed to clean up metadata_file: %s", e) - + # Return None to signal that chunking is needed # The calling function will handle the chunking strategy return None - + # Special handling for numeric taxon IDs that fail with 500 errors # These are often transient issues with NCBI's server if virus.isdigit(): @@ -1986,7 +2120,7 @@ def close_temp_files(): logger.info(" 1. Wait a few minutes and try again") logger.info(" 2. Try using the virus name instead of the taxon ID") logger.info(" 3. Consider using more specific filters to reduce the dataset size") - + # Try retrying with modified virus names (skip for numeric IDs since they won't have modified versions) if not virus.isdigit(): retry_result = _try_modified_virus_names( @@ -2001,55 +2135,59 @@ def close_temp_files(): failed_commands=failed_commands, _retry_attempt=_retry_attempt, error_type="Server error (5xx)", - temp_output_dir=temp_output_dir + temp_output_dir=temp_output_dir, ) if retry_result is not None: return retry_result - + # Note: Retry without geographic_location is now handled in the page size # reduction loop above, after the first smaller page size fails - + error_msg += ( - f"\n\n🔧 SERVER ERROR DETECTED: " - f"NCBI's API is experiencing temporary server-side issues. " - f"This could be due to the specific virus/taxon ID or a genuine server problem. " - f"All page size and filter removal retries have been exhausted." + "\n\n🔧 SERVER ERROR DETECTED: " + "NCBI's API is experiencing temporary server-side issues. 
" + "This could be due to the specific virus/taxon ID or a genuine server problem. " + "All page size and filter removal retries have been exhausted." ) - + # Log the error details before raising logger.error("=" * 80) logger.error("API REQUEST FAILED") logger.error("=" * 80) logger.error(error_msg) logger.error("=" * 80) - + # Close temporary file before raising exception if metadata_file: try: metadata_file.close() except OSError as e: logger.debug("Failed to clean up metadata_file: %s", e) - + raise RuntimeError(error_msg) from None - + else: # Handle any other request-related errors # For pagination errors with partial results, continue if page_count > 1 and total_records_streamed > 0: - logger.warning("⚠️ Error while fetching additional pages (page %d): %s", page_count, last_exception.get('error')) + logger.warning( + "⚠️ Error while fetching additional pages (page %d): %s", page_count, last_exception.get("error") + ) logger.info("Continuing with %d records collected so far...", total_records_streamed) - + # Track error in failed_commands if failed_commands is not None: - if 'pagination_errors' not in failed_commands: - failed_commands['pagination_errors'] = [] - failed_commands['pagination_errors'].append({ - 'page': page_count, - 'error_type': last_exception.get('exception_type', 'Unknown'), - 'error': last_exception.get('error', 'Unknown'), - 'records_retrieved': total_records_streamed, - }) - + if "pagination_errors" not in failed_commands: + failed_commands["pagination_errors"] = [] + failed_commands["pagination_errors"].append( + { + "page": page_count, + "error_type": last_exception.get("exception_type", "Unknown"), + "error": last_exception.get("error", "Unknown"), + "records_retrieved": total_records_streamed, + } + ) + if pages_pbar: pages_pbar.close() loop = False @@ -2061,16 +2199,16 @@ def close_temp_files(): logger.error("=" * 80) logger.error(error_msg) logger.error("=" * 80) - + # Close temporary file before raising exception if metadata_file: 
try: metadata_file.close() except OSError as e: logger.debug("Failed to clean up metadata_file: %s", e) - + raise RuntimeError(error_msg) from None - + # Close the temporary metadata file if it was created if metadata_file: try: @@ -2078,51 +2216,53 @@ def close_temp_files(): logger.debug("✅ Closed temporary metadata file: %s", temp_metadata_file) logger.debug(" This file is being used to reduce RAM usage during API metadata fetching") logger.debug(" It will be kept in: %s", temp_output_dir) - except IOError as e: + except OSError as e: logger.warning("Error closing temporary metadata file: %s", e) - + # Log the final results summary - logger.info("Successfully retrieved %d virus records from NCBI API across %d pages", - total_records_streamed, page_count) - + logger.info( + "Successfully retrieved %d virus records from NCBI API across %d pages", total_records_streamed, page_count + ) + if temp_metadata_file and os.path.exists(temp_metadata_file): file_size_mb = os.path.getsize(temp_metadata_file) / (1024 * 1024) logger.info("Temporary metadata file size: %.2f MB", file_size_mb) - + # Return the temp file path instead of holding all records in RAM. The caller will stream from this file to build metadata_dict. return temp_metadata_file, None # (temp_file_path, deferred_filters) - None means no deferred filters def fetch_virus_metadata_chunked( - virus, - accession=False, - host=None, - geographic_location=None, + virus, + accession=False, + host=None, + geographic_location=None, annotated=None, complete_only=False, min_release_date=None, max_release_date=None, refseq_only=False, failed_commands=None, - temp_output_dir=None + temp_output_dir=None, ): - """ - Fetch virus metadata using a chunked date-range strategy for very large datasets. - + """Fetch virus metadata using a chunked date-range strategy for very large datasets. + This function is used as a fallback when the standard fetch_virus_metadata fails due to dataset size limitations. 
It breaks down the request into yearly chunks starting from a reasonable start date or user's min_release_date to the present. - + Because the NCBI API currently cannot handle broad taxon queries (e.g., taxon 10239 for all viruses) with server-side filters like host or geographic_location, this function makes UNFILTERED requests and tracks those filters as deferred, to be applied later during metadata filtering. - + Args: Same as fetch_virus_metadata. - - Returns: + + Returns + ------- tuple: (list of virus metadata records, dict of deferred_filters or None) - - Raises: + + Raises + ------ RuntimeError: If any chunk fails to download. + """ - logger.info("=" * 80) logger.info("📦 CHUNKED DOWNLOAD MODE ACTIVATED") logger.info("=" * 80) @@ -2130,63 +2270,70 @@ def fetch_virus_metadata_chunked( logger.info("Splitting download into yearly chunks to ensure successful completion.") logger.info("This may take a while, but ensures all data is retrieved.") logger.info("=" * 80) - + # Since the API cannot handle broad taxon queries with filters (returns 500), we make unfiltered requests and track filters as deferred for post-hoc application. 
deferred_filters = {} if host: - deferred_filters['host'] = host + deferred_filters["host"] = host logger.info("Host filter '%s' will be deferred and applied during metadata filtering", host) if geographic_location: - deferred_filters['geographic_location'] = geographic_location - logger.info("Geographic location filter '%s' will be deferred and applied during metadata filtering", geographic_location) - + deferred_filters["geographic_location"] = geographic_location + logger.info( + "Geographic location filter '%s' will be deferred and applied during metadata filtering", + geographic_location, + ) + if deferred_filters: logger.info("Filters deferred to post-download filtering: %s", list(deferred_filters.keys())) - + # Define date range for chunking # If user specified min_release_date, use it; otherwise start from default year if min_release_date: # Extract year from user's min_release_date - start_year = int(min_release_date.split('-')[0]) + start_year = int(min_release_date.split("-")[0]) logger.info(f"Starting from user-specified year: {start_year}") else: # Start from default year as most valuable viral sequence data is from then onwards start_year = CHUNKED_DOWNLOAD_START_YEAR logger.info("Starting from year %d (default for 'all viruses' downloads)", CHUNKED_DOWNLOAD_START_YEAR) - + current_date = datetime.now() current_year = current_date.year - + # If max_release_date is specified, limit the end year to avoid downloading unnecessary data end_year = current_year if max_release_date: try: - end_year = int(max_release_date.split('-')[0]) - logger.info("Limiting chunked download to year %d based on max_release_date '%s'", end_year, max_release_date) + end_year = int(max_release_date.split("-")[0]) + logger.info( + "Limiting chunked download to year %d based on max_release_date '%s'", end_year, max_release_date + ) # max_release_date will be applied by the caller's metadata filtering step (already in the filters dict), so no need to add it to deferred_filters 
here except (ValueError, IndexError): - logger.warning("Could not parse max_release_date '%s' for year limit, downloading to current year", max_release_date) + logger.warning( + "Could not parse max_release_date '%s' for year limit, downloading to current year", max_release_date + ) end_year = current_year - + all_reports = [] chunk_temp_files = [] # Track temp file paths from each chunk total_records_count = 0 # Track total records without holding in RAM total_chunks = end_year - start_year + 1 - + logger.info(f"Will process {total_chunks} year(s) from {start_year} to {end_year}") logger.info("=" * 80) - + for year in tqdm(range(start_year, end_year + 1), total=total_chunks, desc="Fetching yearly chunks", unit="year"): chunk_start = f"{year}-01-01" chunk_end = f"{year}-12-31" - + # For the current year, use today's date as the end if year == current_year: chunk_end = current_date.strftime("%Y-%m-%d") - + chunk_num = year - start_year + 1 tqdm.write(f"📥 Chunk {chunk_num}/{total_chunks}: Fetching data for year {year} ({chunk_start} to {chunk_end})") - + try: # Fetch metadata for this date chunk WITHOUT host/geo filters # (the API currently cannot handle them for broad taxon queries) @@ -2200,9 +2347,9 @@ def fetch_virus_metadata_chunked( min_release_date=chunk_start, refseq_only=refseq_only, failed_commands=failed_commands, - temp_output_dir=temp_output_dir + temp_output_dir=temp_output_dir, ) - + # Handle tuple return (reports, deferred_filters) chunk_reports = None chunk_deferred_filters = None @@ -2211,30 +2358,30 @@ def fetch_virus_metadata_chunked( chunk_reports, chunk_deferred_filters = chunk_result else: chunk_reports = chunk_result - + # If we got None, it means even this chunk is too large if chunk_reports is None: logger.error(f"❌ Chunk for year {year} returned None (dataset too large even for yearly chunk)") logger.error("This is unexpected and may indicate an API issue") raise RuntimeError(f"Year {year} chunk failed - dataset too large even when split by 
year") - + # Merge any deferred filters from the chunk itself (e.g., if annotated was deferred) if chunk_deferred_filters: for k, v in chunk_deferred_filters.items(): if k not in deferred_filters: deferred_filters[k] = v logger.debug("Chunk %d added deferred filter: %s=%s", chunk_num, k, v) - + # Handle chunk_reports which can be a file path (string) or a list if isinstance(chunk_reports, str) and os.path.isfile(chunk_reports): # Count records in the chunk file without loading into RAM chunk_count = 0 try: - with open(chunk_reports, 'r', encoding='utf-8') as cf: + with open(chunk_reports, encoding="utf-8") as cf: for line in cf: if line.strip(): chunk_count += 1 - except IOError: + except OSError: chunk_count = 0 chunk_temp_files.append(chunk_reports) total_records_count += chunk_count @@ -2243,162 +2390,168 @@ def fetch_virus_metadata_chunked( chunk_count = len(chunk_reports) all_reports.extend(chunk_reports) total_records_count += chunk_count - - tqdm.write(f"✅ Chunk {chunk_num}/{total_chunks}: Retrieved {chunk_count:,} records (total: {total_records_count:,})") - + + tqdm.write( + f"✅ Chunk {chunk_num}/{total_chunks}: Retrieved {chunk_count:,} records (total: {total_records_count:,})" + ) + # Add a small delay between chunks to be respectful to NCBI servers if year < end_year: time.sleep(CHUNKED_DOWNLOAD_INTER_CHUNK_DELAY) - + except Exception as e: logger.error(f"❌ Failed to fetch chunk for year {year}: {e}") raise RuntimeError(f"Chunked download failed at year {year}") from e - + logger.info("") logger.info("=" * 80) - logger.info(f"✅ CHUNKED DOWNLOAD COMPLETE") + logger.info("✅ CHUNKED DOWNLOAD COMPLETE") logger.info(f" Total records retrieved: {total_records_count:,}") logger.info(f" Total chunks processed: {total_chunks}") if deferred_filters: logger.info(" Deferred filters to apply: %s", deferred_filters) logger.info("=" * 80) - + # If we have chunk temp files, merge them into a single JSONL and return the path if chunk_temp_files: # Create a merged temp 
file path merged_temp_file = os.path.join(temp_output_dir, f"gget_metadata_chunked_{timestamp}_{random_suffix}.jsonl") try: - with open(merged_temp_file, 'w', encoding='utf-8') as outf: + with open(merged_temp_file, "w", encoding="utf-8") as outf: # First, write any in-memory reports (from small chunks) for report in all_reports: - outf.write(json.dumps(report) + '\n') + outf.write(json.dumps(report) + "\n") # Then append contents of chunk temp files for chunk_file in chunk_temp_files: - with open(chunk_file, 'r', encoding='utf-8') as inf: + with open(chunk_file, encoding="utf-8") as inf: for line in inf: if line.strip(): - outf.write(line if line.endswith('\n') else line + '\n') + outf.write(line if line.endswith("\n") else line + "\n") logger.info("Merged %d chunk files into: %s", len(chunk_temp_files), merged_temp_file) return merged_temp_file, deferred_filters if deferred_filters else None - except IOError as e: + except OSError as e: logger.warning("Failed to merge chunk files: %s. Falling back to in-memory.", e) # Fall through to return all_reports if merge fails - + return all_reports, deferred_filters if deferred_filters else None def is_sars_cov2_query(virus, accession=False): - """ - Check if the query is for SARS-CoV-2 to enable optimized cached downloads. - + """Check if the query is for SARS-CoV-2 to enable optimized cached downloads. + Args: virus (str): Virus taxon name/ID or accession number. accession (bool): Whether virus parameter is an accession number. - - Returns: + + Returns + ------- bool: True if this is a SARS-CoV-2 query. 
+ """ if accession: # When in accession mode, let the user explicitly set is_sars_cov2=True # rather than trying to detect it return False - + # Check for common SARS-CoV-2 identifiers in taxon names - virus_lower = virus.lower().replace('-', '').replace('_', '').replace(' ', '') - + virus_lower = virus.lower().replace("-", "").replace("_", "").replace(" ", "") + # Check if the query matches any SARS-CoV-2 identifier for identifier in SARS_COV2_IDENTIFIERS: if identifier in virus_lower: logger.info("Detected SARS-CoV-2 query: %s matches %s", virus, identifier) return True - + # logger.info("=== Not a SARS-CoV-2 query: %s", virus) return False def is_alphainfluenza_query(virus, accession=False): - """ - Check if the query is for Alphainfluenza to enable optimized cached downloads. - + """Check if the query is for Alphainfluenza to enable optimized cached downloads. + Cached packages are available for: - Alphainfluenza (genus, taxid: 197911) - Alphainfluenzavirus influenzae (species, taxid: 2955291) - Influenza A virus (no-rank, taxid: 11320) - + Args: virus (str): Virus taxon name/ID or accession number. accession (bool): Whether virus parameter is an accession number. - - Returns: + + Returns + ------- bool: True if this is an Alphainfluenza query. 
+ """ if accession: # When in accession mode, let the user explicitly set is_alphainfluenza=True # rather than trying to detect it return False - + # Check for common Alphainfluenza identifiers in taxon names - virus_lower = virus.lower().replace('-', '').replace('_', '').replace(' ', '') - + virus_lower = virus.lower().replace("-", "").replace("_", "").replace(" ", "") + # Check if the query matches any Alphainfluenza identifier for identifier in ALPHAINFLUENZA_IDENTIFIERS: if identifier in virus_lower: logger.info("Detected Alphainfluenza query: %s matches %s", virus, identifier) return True - + # logger.info("=== Not an Alphainfluenza query: %s", virus) return False def process_cached_download(zip_file, virus_type="virus"): - """ - Process a cached download ZIP file and extract sequences with metadata. - + """Process a cached download ZIP file and extract sequences with metadata. + This helper function extracts sequences from a cached ZIP download and loads the rich metadata from data_report.jsonl (if available). The metadata is essential for post-download filtering operations. - + NCBI cached downloads typically include: - genomic.fna: FASTA sequences - data_report.jsonl: Rich metadata with virus genome information - dataset_catalog.json: List of files in the package - + Args: zip_file (str): Path to the downloaded ZIP file. virus_type (str): Type of virus for logging messages. - - Returns: + + Returns + ------- tuple: (sequences, metadata_dict, success) - sequences: List of all sequence records from the cached download. - metadata_dict: Dictionary mapping accessions to metadata (rich metadata from data_report.jsonl if available, or basic metadata from FASTA headers). - success: Boolean indicating if processing was successful. - - Raises: + + Raises + ------ RuntimeError: If no valid sequences are found in the cached data. 
+ """ if not zip_file or not os.path.exists(zip_file): return None, None, False - + # Extract directory path from zip file name extract_dir = os.path.splitext(zip_file)[0] _unzip_file(zip_file, extract_dir) - + if not os.path.exists(extract_dir): logger.warning("Extraction directory not found: %s", extract_dir) return None, None, False - + logger.info("🔬 PROCESSING CACHED DATA...") logger.info("Extracted cached data to: %s", extract_dir) - + # Find and load metadata from data_report.jsonl (rich metadata from NCBI) metadata_files = [] fasta_files = [] - for root, dirs, files in os.walk(extract_dir): + for root, _dirs, files in os.walk(extract_dir): for file in files: - if file == 'data_report.jsonl': + if file == "data_report.jsonl": metadata_files.append(os.path.join(root, file)) - elif file.endswith(('.fasta', '.fa', '.fna')): + elif file.endswith((".fasta", ".fa", ".fna")): fasta_files.append(os.path.join(root, file)) - + # Write rich metadata from data_report.jsonl to a temp JSONL file (memory-efficient) # Instead of building a dict of millions of records in RAM, we stream to disk and let the caller load/filter from the file with _load_metadata_dict_from_temp_jsonl cached_metadata_jsonl_path = None @@ -2407,214 +2560,223 @@ def process_cached_download(zip_file, virus_type="virus"): logger.info("Found %d metadata file(s) in cached download", len(metadata_files)) # Create temp JSONL path next to the zip file cached_metadata_jsonl_path = os.path.join(extract_dir, "_cached_metadata_internal.jsonl") - + try: - with open(cached_metadata_jsonl_path, 'w', encoding='utf-8') as out_jsonl: + with open(cached_metadata_jsonl_path, "w", encoding="utf-8") as out_jsonl: for metadata_file in metadata_files: try: # Get file size for progress bar estimation file_size = os.path.getsize(metadata_file) file_size_mb = file_size / BYTES_PER_MB logger.debug("Streaming metadata file to temp JSONL: %s (%.1f MB)", metadata_file, file_size_mb) - - with open(metadata_file, 'r', 
encoding='utf-8') as f: + + with open(metadata_file, encoding="utf-8") as f: # Use tqdm to show progress while reading the file pbar = tqdm( total=file_size, - unit='B', + unit="B", unit_scale=True, unit_divisor=1024, desc="Processing metadata", ncols=80, - leave=True + leave=True, ) - + for line in f: if line.strip(): # Update progress based on bytes read - pbar.update(len(line.encode('utf-8'))) - + pbar.update(len(line.encode("utf-8"))) + report = json.loads(line) # Extract accession from the report - accession = report.get('accession', '') + accession = report.get("accession", "") if not accession: continue - + cached_metadata_record_count += 1 - + # Update progress bar description with record count if cached_metadata_record_count % 10000 == 0: - pbar.set_description(f"Processing metadata ({cached_metadata_record_count:,} records)") - + pbar.set_description( + f"Processing metadata ({cached_metadata_record_count:,} records)" + ) + # Transform the NCBI report format to our internal metadata format # This mirrors the logic in load_metadata_from_api_reports metadata = { - 'accession': accession, - 'length': report.get('length'), - 'geneCount': report.get('geneCount'), - 'completeness': report.get('completeness', '').lower(), + "accession": accession, + "length": report.get("length"), + "geneCount": report.get("geneCount"), + "completeness": report.get("completeness", "").lower(), } - + # Extract virus info - virus_info = report.get('virus', {}) - metadata['virusName'] = virus_info.get('organismName') - metadata['virusTaxId'] = virus_info.get('taxId') - metadata['virusPangolinClassification'] = virus_info.get('pangolinClassification') - + virus_info = report.get("virus", {}) + metadata["virusName"] = virus_info.get("organismName") + metadata["virusTaxId"] = virus_info.get("taxId") + metadata["virusPangolinClassification"] = virus_info.get("pangolinClassification") + # Extract host info - host_info = report.get('host', {}) - metadata['hostName'] = 
host_info.get('organismName') - metadata['hostTaxId'] = host_info.get('taxId') - - # Extract isolate info - isolate_info = report.get('isolate', {}) - metadata['isolateName'] = isolate_info.get('name') + host_info = report.get("host", {}) + metadata["hostName"] = host_info.get("organismName") + metadata["hostTaxId"] = host_info.get("taxId") + + # Extract isolate info + isolate_info = report.get("isolate", {}) + metadata["isolateName"] = isolate_info.get("name") # Store isolate as nested dict to match filter_metadata_only expectations - metadata['isolate'] = { - 'collectionDate': isolate_info.get('collectionDate'), - 'source': isolate_info.get('source'), + metadata["isolate"] = { + "collectionDate": isolate_info.get("collectionDate"), + "source": isolate_info.get("source"), } - + # Extract location info - location_info = report.get('location', {}) - metadata['location'] = location_info.get('geographicLocation') - metadata['region'] = location_info.get('geographicRegion') - + location_info = report.get("location", {}) + metadata["location"] = location_info.get("geographicLocation") + metadata["region"] = location_info.get("geographicRegion") + # Extract other fields - metadata['releaseDate'] = report.get('releaseDate') - metadata['isAnnotated'] = report.get('isAnnotated', False) - metadata['sourceDatabase'] = report.get('sourceDatabase') - metadata['isLabHost'] = report.get('isLabHost', False) - + metadata["releaseDate"] = report.get("releaseDate") + metadata["isAnnotated"] = report.get("isAnnotated", False) + metadata["sourceDatabase"] = report.get("sourceDatabase") + metadata["isLabHost"] = report.get("isLabHost", False) + # Gene and protein counts - metadata['proteinCount'] = report.get('proteinCount') - metadata['maturePeptideCount'] = report.get('maturePeptideCount') - + metadata["proteinCount"] = report.get("proteinCount") + metadata["maturePeptideCount"] = report.get("maturePeptideCount") + # Extract segment - metadata['segment'] = report.get('segment') - + 
metadata["segment"] = report.get("segment") + # Extract vaccine strain flag - metadata['isVaccineStrain'] = report.get('isVaccineStrain', False) + metadata["isVaccineStrain"] = report.get("isVaccineStrain", False) + + submitter_info = report.get("submitter", {}) + metadata["submitterName"] = submitter_info.get("names") + metadata["submitterCountry"] = submitter_info.get("country") + metadata["submitterInstitution"] = submitter_info.get("affiliation") - submitter_info = report.get('submitter', {}) - metadata['submitterName'] = submitter_info.get('names') - metadata['submitterCountry'] = submitter_info.get('country') - metadata['submitterInstitution'] = submitter_info.get('affiliation') - # Write transformed record to temp JSONL (one line per record) out_jsonl.write(json.dumps(metadata) + "\\n") - + pbar.close() - - logger.info("✅ Streamed %d metadata records from %s to temp JSONL", - cached_metadata_record_count, metadata_file) - except Exception as e: + + logger.info( + "✅ Streamed %d metadata records from %s to temp JSONL", + cached_metadata_record_count, + metadata_file, + ) + except Exception as e: # noqa: BLE001 logger.warning("❌ Failed to process metadata file %s: %s", metadata_file, e) continue - except Exception as e: + except Exception as e: # noqa: BLE001 logger.warning("❌ Failed to create cached metadata JSONL: %s", e) cached_metadata_jsonl_path = None else: logger.warning("No data_report.jsonl found in cached download. Post-download filters may be limited.") - + if not fasta_files: logger.error("❌ No FASTA files found in cached data.") raise RuntimeError("No FASTA files found in cached data") - + for fasta_file in fasta_files: file_size = os.path.getsize(fasta_file) file_size_mb = file_size / BYTES_PER_MB if file_size_mb < MIN_VALID_FASTA_SIZE_MB: - logger.warning("⚠️ FASTA file %s is smaller than expected (%.1f MB). It may not contain valid sequences.", fasta_file, file_size_mb) + logger.warning( + "⚠️ FASTA file %s is smaller than expected (%.1f MB). 
It may not contain valid sequences.", + fasta_file, + file_size_mb, + ) else: logger.info("✅ Cached FASTA file available for streaming: %s (%.1f MB)", fasta_file, file_size_mb) - + # If no rich metadata was loaded, create minimal metadata from FASTA headers if not cached_metadata_jsonl_path or cached_metadata_record_count == 0: logger.info("Creating basic metadata from FASTA headers (no data_report.jsonl available)") logger.info("Streaming FASTA files to extract minimal metadata...") - + cached_metadata_jsonl_path = os.path.join(extract_dir, "_cached_metadata_internal.jsonl") cached_metadata_record_count = 0 seen_fasta_accessions = set() - + try: - with open(cached_metadata_jsonl_path, 'w', encoding='utf-8') as out_jsonl: + with open(cached_metadata_jsonl_path, "w", encoding="utf-8") as out_jsonl: for fasta_file in fasta_files: try: file_size = os.path.getsize(fasta_file) - - with open(fasta_file, 'r', encoding='utf-8') as f: + + with open(fasta_file, encoding="utf-8") as f: pbar = tqdm( total=file_size, - unit='B', + unit="B", unit_scale=True, unit_divisor=1024, desc="Extracting FASTA metadata", ncols=80, - leave=True + leave=True, ) - + current_accession = None sequence_length = 0 description = "" - + for line in f: - pbar.update(len(line.encode('utf-8'))) - - if line.startswith('>'): + pbar.update(len(line.encode("utf-8"))) + + if line.startswith(">"): # Save previous sequence if exists if current_accession and current_accession not in seen_fasta_accessions: seen_fasta_accessions.add(current_accession) metadata = { - 'accession': current_accession, - 'description': description, - 'length': sequence_length, - 'source': 'cached_fasta_header' + "accession": current_accession, + "description": description, + "length": sequence_length, + "source": "cached_fasta_header", } out_jsonl.write(json.dumps(metadata) + "\\n") cached_metadata_record_count += 1 - + # Parse new header header = line[1:].strip() current_accession = header.split()[0] description = header 
sequence_length = 0 - + else: # Count bases in sequence (not including whitespace) sequence_length += len(line.strip()) - + # Save last sequence if current_accession and current_accession not in seen_fasta_accessions: seen_fasta_accessions.add(current_accession) metadata = { - 'accession': current_accession, - 'description': description, - 'length': sequence_length, - 'source': 'cached_fasta_header' + "accession": current_accession, + "description": description, + "length": sequence_length, + "source": "cached_fasta_header", } out_jsonl.write(json.dumps(metadata) + "\\n") cached_metadata_record_count += 1 - + pbar.close() - + logger.info("✅ Extracted metadata for sequences from %s", fasta_file) - - except Exception as e: + + except Exception as e: # noqa: BLE001 logger.warning("❌ Failed to extract metadata from FASTA %s: %s", fasta_file, e) continue - + logger.info("Created basic metadata for %d sequences", cached_metadata_record_count) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.warning("❌ Failed to create FASTA metadata JSONL: %s", e) cached_metadata_jsonl_path = None - + logger.info("🎉 CACHED DATA LOADING SUCCESSFUL!") logger.debug("Cached %s sequences will be streamed on-demand (not loaded to RAM)", virus_type) if metadata_files: logger.info("Rich metadata available from data_report.jsonl for post-download filtering") - + # Return the cached FASTA file path and the path to the metadata JSONL (not loaded to RAM) # Sequences and metadata will be streamed on-demand when needed cached_fasta_file = fasta_files[0] if fasta_files else None @@ -2622,38 +2784,40 @@ def process_cached_download(zip_file, virus_type="virus"): def _monitor_subprocess_with_progress(process, cmd, timeout=None, progress_timeout=None): - """ - Monitor a subprocess with progress tracking and timeout handling. - + """Monitor a subprocess with progress tracking and timeout handling. + This helper function monitors a running subprocess. 
When stdout/stderr are piped, it checks for progress indicators. When they're not piped (output goes to console), it simply polls for completion. - + Args: process: subprocess.Popen instance to monitor. cmd (list): Command that was executed (for error reporting). timeout (int): Maximum total execution time in seconds. Defaults to DOWNLOAD_OVERALL_TIMEOUT. progress_timeout (int): Maximum time without progress in seconds. Defaults to DOWNLOAD_PROGRESS_TIMEOUT. - - Returns: + + Returns + ------- subprocess.CompletedProcess: Result of the completed process. - - Raises: + + Raises + ------ subprocess.TimeoutExpired: If timeout conditions are met. + """ # Apply default timeouts if not specified if timeout is None: timeout = DOWNLOAD_OVERALL_TIMEOUT if progress_timeout is None: progress_timeout = DOWNLOAD_PROGRESS_TIMEOUT - + start_time = time.time() last_progress = start_time - + while True: # Check if process has finished retcode = process.poll() if retcode is not None: break - + # Only check for progress if stderr was captured (is not None) if process.stderr is not None: # Read stderr without blocking @@ -2661,12 +2825,12 @@ def _monitor_subprocess_with_progress(process, cmd, timeout=None, progress_timeo if stderr: # Log the stderr for debugging # logger.debug("Progress output: %s", stderr.strip()) - + # If we see any progress indicator, update the last_progress time if any(indicator.lower() in stderr.lower() for indicator in PROGRESS_INDICATORS): last_progress = time.time() # logger.debug("Progress detected, updating last_progress time") - + # Check timeout conditions: # 1. Less than total timeout, continue # 2. 
If more than total timeout but progress in last progress_timeout, continue @@ -2674,13 +2838,13 @@ def _monitor_subprocess_with_progress(process, cmd, timeout=None, progress_timeo current_time = time.time() total_time = current_time - start_time time_since_progress = current_time - last_progress - + if total_time > timeout and time_since_progress > progress_timeout: process.kill() raise subprocess.TimeoutExpired(cmd, timeout) - + time.sleep(DOWNLOAD_PROGRESS_CHECK_INTERVAL) # Prevent CPU spin - + # Only call communicate if process was created with pipes, otherwise just wait if process.stdout is not None or process.stderr is not None: stdout, stderr = process.communicate() @@ -2688,31 +2852,19 @@ def _monitor_subprocess_with_progress(process, cmd, timeout=None, progress_timeo stdout = None stderr = None process.wait() - - return subprocess.CompletedProcess( - args=cmd, - returncode=retcode, - stdout=stdout, - stderr=stderr - ) + + return subprocess.CompletedProcess(args=cmd, returncode=retcode, stdout=stdout, stderr=stderr) def _download_optimized_cached( - virus_type, - strategies, - zip_path, - outdir, - use_accession=False, - accession=None, - requested_filters=None + virus_type, strategies, zip_path, outdir, use_accession=False, accession=None, requested_filters=None ): - """ - Execute optimized cached download strategies with fallback. - + """Execute optimized cached download strategies with fallback. + This is a generic implementation of the hierarchical fallback download pattern used for both SARS-CoV-2 and Alphainfluenza. It tries each strategy in order until one succeeds, with comprehensive error handling and logging. - + Args: virus_type (str): Type of virus for error messages ('SARS-CoV-2', 'Alphainfluenza', etc.). strategies (list): List of tuples (strategy_name, cmd, applied_filters). @@ -2721,45 +2873,47 @@ def _download_optimized_cached( use_accession (bool): Whether using accession-based download. 
accession (str, optional): Accession number if using accession-based download. requested_filters (dict, optional): Dictionary of originally requested filters. - - Returns: + + Returns + ------- tuple: (zip_path, applied_filters, missing_filters) - zip_path (str): Path to the successfully downloaded ZIP file. - applied_filters (list): List of filter names applied in successful strategy. - missing_filters (list): List of filter names not applied (need post-processing). - - Raises: + + Raises + ------ RuntimeError: If all strategies fail or datasets CLI is not available. - + Example: >>> strategies = [ ... ("Strategy 1 (specific)", ["datasets", "download", ...], ["complete-only"]), - ... ("Strategy 2 (general)", ["datasets", "download", ...], []) + ... ("Strategy 2 (general)", ["datasets", "download", ...], []), ... ] >>> zip_file, applied, missing = _download_optimized_cached( ... "SARS-CoV-2", strategies, "/path/to/output.zip", "/output/dir" ... ) + """ - # Get the path to the datasets CLI binary (uses precompiled binary bundled with gget) datasets_path = _get_datasets_path() - + last_error = None - + for strategy_name, cmd, applied_filters in strategies: # Replace "datasets" with the actual path to the binary if cmd and cmd[0] == "datasets": cmd = [datasets_path] + cmd[1:] - + logger.info("🔄 Trying optimised strategy download with %s...", strategy_name) - + if applied_filters: logger.info("Applied filters: %s", ", ".join(applied_filters)) else: logger.info("No specific filters applied") - + logger.debug("Command: %s", " ".join(cmd)) - + try: # Log the exact command being executed cmd_str = " ".join(cmd) @@ -2768,12 +2922,7 @@ def _download_optimized_cached( # Start subprocess for progress monitoring # Note: We don't use cwd=outdir because the command already includes full paths try: - process = subprocess.Popen( - cmd, - stdout=None, - stderr=None, - text=True - ) + process = subprocess.Popen(cmd, stdout=None, stderr=None, text=True) except FileNotFoundError as 
fnf_error: # Datasets binary not found - this shouldn't happen if bundled correctly error_msg = ( @@ -2784,32 +2933,36 @@ def _download_optimized_cached( ) logger.error(error_msg) raise RuntimeError(error_msg) from fnf_error - + # Monitor progress with timeout handling using helper function result = _monitor_subprocess_with_progress(process, cmd) - + # Check if the command was successful if result.returncode == 0 and os.path.exists(zip_path): file_size = os.path.getsize(zip_path) - + # Check if file is too small (likely empty result) - if so, try next strategy. It's not zero since the folder always comes with a generic (readme) files. if file_size < MIN_VALID_ZIP_SIZE: - logger.warning("⚠️ %s resulted in file that's too small (%.2f MB, < 100 KB minimum). Trying next strategy...", - strategy_name, file_size / 1024 / 1024) + logger.warning( + "⚠️ %s resulted in file that's too small (%.2f MB, < 100 KB minimum). Trying next strategy...", + strategy_name, + file_size / 1024 / 1024, + ) # Clean up invalid file try: os.remove(zip_path) except OSError: pass continue - - logger.info("✅ %s successful: %s (%.2f MB)", - strategy_name, os.path.basename(zip_path), file_size / 1024 / 1024) - + + logger.info( + "✅ %s successful: %s (%.2f MB)", strategy_name, os.path.basename(zip_path), file_size / 1024 / 1024 + ) + # Log any important output from the datasets CLI # if result.stdout: # logger.debug("datasets CLI output: %s", result.stdout.strip()) - + # Check which filters from the original request weren't applied in this strategy if requested_filters: requested_filter_list = [] @@ -2823,16 +2976,18 @@ def _download_optimized_cached( else: logger.debug("Non-boolean filter detected, adding key=value: %s=%s", key, value) requested_filter_list.append(f"{key}={value}") - + missing_filters = [f for f in requested_filter_list if f not in applied_filters] if missing_filters: logger.warning("⚠️ Some requested filters were not applied in successful strategy:") - logger.warning(" Filters 
applied: %s", ", ".join(applied_filters) if applied_filters else "none") + logger.warning( + " Filters applied: %s", ", ".join(applied_filters) if applied_filters else "none" + ) logger.warning(" Filters missing: %s", ", ".join(missing_filters)) logger.warning(" These filters will need to be applied through post-processing") else: missing_filters = [] - + return zip_path, applied_filters, missing_filters else: # Strategy failed, prepare error message @@ -2841,7 +2996,7 @@ def _download_optimized_cached( error_msg += f": {result.stderr.strip()}" logger.warning("%s", error_msg) last_error = error_msg - + # If this was an accession download that failed, provide specific guidance if use_accession: error_msg = ( @@ -2850,50 +3005,50 @@ def _download_optimized_cached( f"If you're not sure, try without the is_{virus_type.lower().replace('-', '_').replace(' ', '_')} flag." ) raise RuntimeError(error_msg) - + # Clean up failed download file if it exists if os.path.exists(zip_path): try: os.remove(zip_path) except OSError: pass - continue # Try next strategy - + continue # Try next strategy + except subprocess.TimeoutExpired: error_msg = f"{strategy_name} timed out after 30 minutes" logger.warning("%s", error_msg) last_error = error_msg continue - + except subprocess.CalledProcessError as e: error_msg = f"{strategy_name} execution failed: {e}" logger.warning("%s", error_msg) last_error = error_msg continue - - except Exception as e: + + except Exception as e: # noqa: BLE001 error_msg = f"{strategy_name} unexpected error: {e}" logger.warning("%s", error_msg) last_error = error_msg continue - + # All strategies failed logger.warning("🚨 All cached download strategies failed. Last error: %s", last_error) - + # Provide helpful guidance based on virus type example_taxon = "SARS-CoV-2" if "sars" in virus_type.lower() else virus_type guidance_messages = [ "🔧 TROUBLESHOOTING SUGGESTIONS:", "1. Check your internet connection", "2. 
Try running the command manually to see detailed error messages:", - f" {datasets_path} download virus genome taxon \"{example_taxon}\" --filename test.zip", + f' {datasets_path} download virus genome taxon "{example_taxon}" --filename test.zip', "3. NCBI servers may be temporarily unavailable - try again later", - f"4. Consider using the general API method by removing {virus_type} specific terms from your query" + f"4. Consider using the general API method by removing {virus_type} specific terms from your query", ] - + for msg in guidance_messages: logger.info(msg) - + # Raise error with the last failure details raise RuntimeError( f"All {virus_type} cached download strategies failed. " @@ -2911,21 +3066,20 @@ def download_sars_cov2_optimized( accession=None, use_accession=False, ): - """ - Download SARS-CoV-2 sequences using NCBI's optimized cached data packages. - + """Download SARS-CoV-2 sequences using NCBI's optimized cached data packages. + NCBI provides pre-computed, highly compressed cached packages for SARS-CoV-2 that offer faster and more reliable downloads than the general API endpoints. This function uses the datasets CLI to download these optimized packages with hierarchical fallback from specific to general cached files. - + Download strategies (in order of precedence): 1. If use_accession=True: Direct accession download using accession endpoint. 2. If use_accession=False: a. Specific lineage + complete + host filters using taxon endpoint. b. Complete genomes only using taxon endpoint. c. All SARS-CoV-2 genomes using taxon endpoint (default fallback). - + Args: host (str, optional): Host organism filter (optimized for 'human'). complete_only (bool, optional): Whether to download only complete genomes. @@ -2934,34 +3088,36 @@ def download_sars_cov2_optimized( lineage (str, optional): SARS-CoV-2 lineage filter (e.g., 'B.1.1.7', 'P.1'). accession (str, optional): Specific SARS-CoV-2 accession or taxon ID. 
use_accession (bool): Whether to use accession endpoint. Defaults to False. - - Returns: + + Returns + ------- str: Path to the downloaded ZIP file containing sequences and metadata. - - Raises: + + Raises + ------ RuntimeError: If the datasets CLI is not available or download fails. + """ - # Determine filter specificity for logging filter_count = sum(1 for param in [host, complete_only, annotated, lineage] if param is not None) if filter_count > 0: logger.info("Attempting SARS-CoV-2 cached download with %d specific filters", filter_count) else: logger.info("Attempting general SARS-CoV-2 cached download (no specific filters)") - + # Determine output directory if not outdir: outdir = os.getcwd() logger.debug("No output directory specified, using current directory: %s", outdir) - + # Ensure output directory exists os.makedirs(outdir, exist_ok=True) logger.debug("Output directory ready: %s", outdir) - + # Create descriptive filename with timestamp and random suffix zip_filename = f"sars_cov_2_{timestamp}_{random_suffix}.zip" zip_path = os.path.join(outdir, zip_filename) - + # Define which filters are available for this download logger.debug("Available filters for SARS-CoV-2 download:") if complete_only: @@ -2972,42 +3128,43 @@ def download_sars_cov2_optimized( logger.debug("- host filter: %s", host) if annotated: logger.debug("- annotated filter") - + # Define fallback strategies in order of preference strategies = [] - + if use_accession: # Parse the accession input to handle single, space-separated, or file-based accessions parsed = _parse_accession_input(accession) - - if parsed['is_file']: + + if parsed["is_file"]: # File-based input: use --inputfile flag - cmd1 = ["datasets", "download", "virus", "genome", "accession", - "--inputfile", parsed['file_path']] + cmd1 = ["datasets", "download", "virus", "genome", "accession", "--inputfile", parsed["file_path"]] cmd1.extend(["--filename", zip_path]) strategies.append(("Strategy 1 (accessions from file)", cmd1, 
[f"inputfile={parsed['file_path']}"])) - logger.debug("Using accession input file: %s", parsed['file_path']) - elif parsed['type'] == 'list': + logger.debug("Using accession input file: %s", parsed["file_path"]) + elif parsed["type"] == "list": # Space-separated accessions: pass as arguments - cmd1 = ["datasets", "download", "virus", "genome", "accession"] + parsed['accessions'] + cmd1 = ["datasets", "download", "virus", "genome", "accession"] + parsed["accessions"] cmd1.extend(["--filename", zip_path]) - strategies.append(("Strategy 1 (multiple accessions)", cmd1, [f"accessions={', '.join(parsed['accessions'][:3])}..."])) - logger.debug("Using multiple accessions: %s", ", ".join(parsed['accessions'])) + strategies.append( + ("Strategy 1 (multiple accessions)", cmd1, [f"accessions={', '.join(parsed['accessions'][:3])}..."]) + ) + logger.debug("Using multiple accessions: %s", ", ".join(parsed["accessions"])) else: # Single accession - cmd1 = ["datasets", "download", "virus", "genome", "accession", parsed['accessions']] + cmd1 = ["datasets", "download", "virus", "genome", "accession", parsed["accessions"]] cmd1.extend(["--filename", zip_path]) strategies.append(("Strategy 1 (direct accession)", cmd1, [f"accession={parsed['accessions']}"])) - logger.debug("Using single accession: %s", parsed['accessions']) + logger.debug("Using single accession: %s", parsed["accessions"]) elif lineage or complete_only or host or annotated: # Strategy 1: Try with specific filters using taxon endpoint cmd1 = ["datasets", "download", "virus", "genome", "taxon", "SARS-CoV-2"] filters1 = [] - + if complete_only: cmd1.append("--complete-only") filters1.append("complete-only") - + if lineage: cmd1.extend(["--lineage", lineage]) filters1.append(f"lineage={lineage}") @@ -3015,32 +3172,76 @@ def download_sars_cov2_optimized( if host: cmd1.extend(["--host", host]) filters1.append(f"host={host}") - + if annotated: cmd1.append("--annotated") filters1.append("annotated") cmd1.extend(["--filename", 
zip_path]) strategies.append(("Strategy 1 (specific filters)", cmd1, filters1)) - + # Strategy 2: Try complete-only and host if it was requested (without lineage) if complete_only and host and lineage: # Only add this if we had lineage in strategy 1 - cmd2 = ["datasets", "download", "virus", "genome", "taxon", "SARS-CoV-2", "--complete-only", "--host", host, "--filename", zip_path] + cmd2 = [ + "datasets", + "download", + "virus", + "genome", + "taxon", + "SARS-CoV-2", + "--complete-only", + "--host", + host, + "--filename", + zip_path, + ] strategies.append(("Strategy 2 (complete-only and host)", cmd2, ["complete-only", f"host={host}"])) # Strategy 3: Try complete-only if it was requested - if complete_only and (host or lineage): - cmd3 = ["datasets", "download", "virus", "genome", "taxon", "SARS-CoV-2", "--complete-only", "--filename", zip_path] + if complete_only and (host or lineage): + cmd3 = [ + "datasets", + "download", + "virus", + "genome", + "taxon", + "SARS-CoV-2", + "--complete-only", + "--filename", + zip_path, + ] strategies.append(("Strategy 3 (complete-only)", cmd3, ["complete-only"])) - # Strategy 4: Try host if it was requested - if host and (complete_only or lineage): - cmd4 = ["datasets", "download", "virus", "genome", "taxon", "SARS-CoV-2", "--host", host, "--filename", zip_path] + # Strategy 4: Try host if it was requested + if host and (complete_only or lineage): + cmd4 = [ + "datasets", + "download", + "virus", + "genome", + "taxon", + "SARS-CoV-2", + "--host", + host, + "--filename", + zip_path, + ] strategies.append(("Strategy 4 (host)", cmd4, [f"host={host}"])) - # Strategy 5: Try lineage if it was requested - if lineage and (complete_only or host): - cmd5 = ["datasets", "download", "virus", "genome", "taxon", "SARS-CoV-2", "--lineage", lineage, "--filename", zip_path] + # Strategy 5: Try lineage if it was requested + if lineage and (complete_only or host): + cmd5 = [ + "datasets", + "download", + "virus", + "genome", + "taxon", + 
"SARS-CoV-2", + "--lineage", + lineage, + "--filename", + zip_path, + ] strategies.append(("Strategy 5 (lineage)", cmd5, [f"lineage={lineage}"])) # Strategy 6: General SARS-CoV-2 package (no filters) @@ -3048,13 +3249,8 @@ def download_sars_cov2_optimized( strategies.append(("Strategy 6 (general package)", cmd6, [])) # Use the common download function with all strategies - requested_filters_dict = { - 'complete-only': complete_only, - 'lineage': lineage, - 'host': host, - 'annotated': annotated - } - + requested_filters_dict = {"complete-only": complete_only, "lineage": lineage, "host": host, "annotated": annotated} + return _download_optimized_cached( virus_type="SARS-CoV-2", strategies=strategies, @@ -3062,7 +3258,7 @@ def download_sars_cov2_optimized( outdir=outdir, use_accession=use_accession, accession=accession, - requested_filters=requested_filters_dict + requested_filters=requested_filters_dict, ) @@ -3074,25 +3270,24 @@ def download_alphainfluenza_optimized( accession=None, use_accession=False, ): - """ - Download Alphainfluenza sequences using NCBI's optimized cached data packages. - + """Download Alphainfluenza sequences using NCBI's optimized cached data packages. + NCBI provides pre-computed, highly compressed cached packages for Alphainfluenza that offer faster and more reliable downloads than the general API endpoints. This function uses the datasets CLI to download these optimized packages with hierarchical fallback from specific to general cached files. - + Cached packages are available for the following Alphainfluenza taxonomic nodes: 1. Alphainfluenza (genus, taxid: 197911) 2. Alphainfluenzavirus influenzae (species, taxid: 2955291) 3. Influenza A virus (no-rank, taxid: 11320) - + For each taxon, filtered sets are available: 1. All genomes 2. Human host only 3. Human host only & complete 4. Complete only - + Args: host (str, optional): Host organism filter (optimized for 'human'). 
complete_only (bool, optional): Whether to download only complete genomes. @@ -3100,37 +3295,39 @@ def download_alphainfluenza_optimized( outdir (str, optional): Output directory for downloaded files. accession (str, optional): Specific Alphainfluenza accession or taxon ID. use_accession (bool): Whether to use accession endpoint. Defaults to False. - - Returns: + + Returns + ------- str: Path to the downloaded ZIP file containing sequences and metadata. - - Raises: + + Raises + ------ RuntimeError: If the datasets CLI is not available or download fails. + """ - # Determine filter specificity for logging filter_count = sum(1 for param in [host, complete_only, annotated] if param is not None) if filter_count > 0: logger.info("Attempting Alphainfluenza cached download with %d specific filters", filter_count) else: logger.info("Attempting general Alphainfluenza cached download (no specific filters)") - + # Determine output directory if not outdir: outdir = os.getcwd() logger.debug("No output directory specified, using current directory: %s", outdir) - + # Ensure output directory exists before passing path to datasets CLI os.makedirs(outdir, exist_ok=True) logger.debug("Output directory ready: %s", outdir) - + # Create descriptive filename with timestamp and random suffix zip_filename = f"alphainfluenza_{timestamp}_{random_suffix}.zip" zip_path = os.path.join(outdir, zip_filename) - + # Ensure the parent directory exists (in case outdir has subdirectories) os.makedirs(os.path.dirname(zip_path), exist_ok=True) - + # Define which filters are available for this download logger.debug("Available filters for Alphainfluenza download:") if complete_only: @@ -3139,42 +3336,43 @@ def download_alphainfluenza_optimized( logger.debug("- host filter: %s", host) if annotated: logger.debug("- annotated filter") - + # Define fallback strategies in order of preference strategies = [] - + # Default taxon to use (most specific: Alphainfluenzavirus influenzae species) # This taxon ID has 
the most comprehensive cached data default_taxon = ALPHAINFLUENZA_DEFAULT_TAXON - + if use_accession: # Parse the accession input to handle single, space-separated, or file-based accessions parsed = _parse_accession_input(accession) - - if parsed['is_file']: + + if parsed["is_file"]: # File-based input: use --inputfile flag - cmd1 = ["datasets", "download", "virus", "genome", "accession", - "--inputfile", parsed['file_path']] + cmd1 = ["datasets", "download", "virus", "genome", "accession", "--inputfile", parsed["file_path"]] cmd1.extend(["--filename", zip_path]) strategies.append(("Strategy 1 (accessions from file)", cmd1, [f"inputfile={parsed['file_path']}"])) - logger.debug("Using accession input file: %s", parsed['file_path']) - elif parsed['type'] == 'list': + logger.debug("Using accession input file: %s", parsed["file_path"]) + elif parsed["type"] == "list": # Space-separated accessions: pass as arguments - cmd1 = ["datasets", "download", "virus", "genome", "accession"] + parsed['accessions'] + cmd1 = ["datasets", "download", "virus", "genome", "accession"] + parsed["accessions"] cmd1.extend(["--filename", zip_path]) - strategies.append(("Strategy 1 (multiple accessions)", cmd1, [f"accessions={', '.join(parsed['accessions'][:3])}..."])) - logger.debug("Using multiple accessions: %s", ", ".join(parsed['accessions'])) + strategies.append( + ("Strategy 1 (multiple accessions)", cmd1, [f"accessions={', '.join(parsed['accessions'][:3])}..."]) + ) + logger.debug("Using multiple accessions: %s", ", ".join(parsed["accessions"])) else: # Single accession - cmd1 = ["datasets", "download", "virus", "genome", "accession", parsed['accessions']] + cmd1 = ["datasets", "download", "virus", "genome", "accession", parsed["accessions"]] cmd1.extend(["--filename", zip_path]) strategies.append(("Strategy 1 (direct accession)", cmd1, [f"accession={parsed['accessions']}"])) - logger.debug("Using single accession: %s", parsed['accessions']) + logger.debug("Using single accession: 
%s", parsed["accessions"]) elif complete_only or host or annotated: # Strategy 1: Try with specific filters using taxon endpoint cmd1 = ["datasets", "download", "virus", "genome", "taxon", default_taxon] filters1 = [] - + if complete_only: cmd1.append("--complete-only") filters1.append("complete-only") @@ -3182,27 +3380,60 @@ def download_alphainfluenza_optimized( if host: cmd1.extend(["--host", host]) filters1.append(f"host={host}") - + if annotated: cmd1.append("--annotated") filters1.append("annotated") cmd1.extend(["--filename", zip_path]) strategies.append(("Strategy 1 (specific filters)", cmd1, filters1)) - + # Strategy 2: Try complete-only and host if both were requested if complete_only and host: - cmd2 = ["datasets", "download", "virus", "genome", "taxon", default_taxon, "--complete-only", "--host", host, "--filename", zip_path] + cmd2 = [ + "datasets", + "download", + "virus", + "genome", + "taxon", + default_taxon, + "--complete-only", + "--host", + host, + "--filename", + zip_path, + ] strategies.append(("Strategy 2 (complete-only and host)", cmd2, ["complete-only", f"host={host}"])) # Strategy 3: Try complete-only if it was requested - if complete_only and (host or annotated): - cmd3 = ["datasets", "download", "virus", "genome", "taxon", default_taxon, "--complete-only", "--filename", zip_path] + if complete_only and (host or annotated): + cmd3 = [ + "datasets", + "download", + "virus", + "genome", + "taxon", + default_taxon, + "--complete-only", + "--filename", + zip_path, + ] strategies.append(("Strategy 3 (complete-only)", cmd3, ["complete-only"])) - # Strategy 4: Try host if it was requested - if host and (complete_only or annotated): - cmd4 = ["datasets", "download", "virus", "genome", "taxon", default_taxon, "--host", host, "--filename", zip_path] + # Strategy 4: Try host if it was requested + if host and (complete_only or annotated): + cmd4 = [ + "datasets", + "download", + "virus", + "genome", + "taxon", + default_taxon, + "--host", + host, + 
"--filename", + zip_path, + ] strategies.append(("Strategy 4 (host)", cmd4, [f"host={host}"])) # Strategy 5: General Alphainfluenza package (no filters) @@ -3210,12 +3441,8 @@ def download_alphainfluenza_optimized( strategies.append(("Strategy 5 (general package)", cmd5, [])) # Use the common download function with all strategies - requested_filters_dict = { - 'complete-only': complete_only, - 'host': host, - 'annotated': annotated - } - + requested_filters_dict = {"complete-only": complete_only, "host": host, "annotated": annotated} + return _download_optimized_cached( virus_type="Alphainfluenza", strategies=strategies, @@ -3223,140 +3450,159 @@ def download_alphainfluenza_optimized( outdir=outdir, use_accession=use_accession, accession=accession, - requested_filters=requested_filters_dict + requested_filters=requested_filters_dict, ) def download_sequences_by_accessions(accessions, outdir=None, batch_size=200, failed_commands=None, api_key=None): - """ - Download virus genome sequences for a specific list of accession numbers. - + """Download virus genome sequences for a specific list of accession numbers. + This function downloads sequences for a pre-filtered list of accessions, using NCBI E-utilities API with batching to avoid URL length limitations. Large requests are automatically split into smaller batches. - + Args: accessions (list): List of accession numbers to download. outdir (str, optional): Output directory for downloaded files. batch_size (int): Maximum number of accessions per batch. Defaults to 200. failed_commands (dict, optional): Dictionary to track failed operations. api_key (str, optional): NCBI API key for higher rate limits (10 req/sec vs 3). - - Returns: + + Returns + ------- str: Path to the downloaded FASTA file containing sequences. - - Raises: + + Raises + ------ RuntimeError: If the download request fails. ValueError: If no accessions are provided. 
+ """ - if not accessions: raise ValueError("No accessions provided for download") - + logger.info("Downloading sequences for %d accessions using E-utilities API", len(accessions)) - logger.debug("Accession list: %s", accessions[:5] + ['...'] if len(accessions) > 5 else accessions) - + logger.debug("Accession list: %s", accessions[:5] + ["..."] if len(accessions) > 5 else accessions) + # Determine output directory - use current working directory if not specified if not outdir: outdir = os.getcwd() logger.debug("No output directory specified, using current directory: %s", outdir) - + # Ensure output directory exists os.makedirs(outdir, exist_ok=True) logger.debug("Ensured output directory exists: %s", outdir) - + # Create output FASTA file path fasta_path = os.path.join(outdir, f"virus_sequences_{timestamp}_{random_suffix}.fasta") logger.debug("Saving sequences to: %s", fasta_path) - + # For large datasets, prefer the EPost + EFetch History Server pipeline # This is NCBI's recommended approach and is significantly faster if len(accessions) > batch_size: - logger.info("Large request detected (%d accessions). Trying EPost+EFetch History Server pipeline...", - len(accessions)) + logger.info( + "Large request detected (%d accessions). 
Trying EPost+EFetch History Server pipeline...", len(accessions) + ) try: _download_sequences_epost_efetch(accessions, fasta_path, failed_commands) - except Exception as epost_error: + except Exception as epost_error: # noqa: BLE001 logger.warning("EPost+EFetch pipeline failed: %s", epost_error) logger.info("Falling back to direct batched E-utilities requests...") # Reset the file in case partial data was written if os.path.exists(fasta_path): os.remove(fasta_path) - return _download_sequences_batched(accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, batch_size, failed_commands, api_key=api_key) - + return _download_sequences_batched( + accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, batch_size, failed_commands, api_key=api_key + ) + # Check for missing sequences and retry them via direct batched download downloaded_accs = set() try: - with open(fasta_path, 'r', encoding='utf-8') as f: + with open(fasta_path, encoding="utf-8") as f: for line in f: - if line.startswith('>'): + if line.startswith(">"): acc = line[1:].split()[0].strip() downloaded_accs.add(acc) - except IOError: + except OSError: pass - + requested_set = set(accessions) missing_accs = requested_set - downloaded_accs - + if missing_accs: - logger.warning("⚠️ EPost+EFetch pipeline missed %d/%d sequences. " - "Retrying missing accessions via direct batched download...", - len(missing_accs), len(accessions)) - + logger.warning( + "⚠️ EPost+EFetch pipeline missed %d/%d sequences. 
" + "Retrying missing accessions via direct batched download...", + len(missing_accs), + len(accessions), + ) + # Retry the missing accessions by appending to the existing FASTA file temp_retry_path = fasta_path + ".retry_tmp" try: _download_sequences_batched( - list(missing_accs), NCBI_EUTILS_BASE_EFETCH, - temp_retry_path, batch_size, failed_commands, api_key=api_key + list(missing_accs), + NCBI_EUTILS_BASE_EFETCH, + temp_retry_path, + batch_size, + failed_commands, + api_key=api_key, ) # Append recovered sequences to the main FASTA file if os.path.exists(temp_retry_path) and os.path.getsize(temp_retry_path) > 0: - with open(fasta_path, 'a', encoding='utf-8') as main_f: - with open(temp_retry_path, 'r', encoding='utf-8') as retry_f: + with open(fasta_path, "a", encoding="utf-8") as main_f: + with open(temp_retry_path, encoding="utf-8") as retry_f: main_f.write(retry_f.read()) # Count recovered sequences recovered = 0 - with open(temp_retry_path, 'r', encoding='utf-8') as retry_f: + with open(temp_retry_path, encoding="utf-8") as retry_f: for line in retry_f: - if line.startswith('>'): + if line.startswith(">"): recovered += 1 - logger.info("✅ Recovered %d/%d missing sequences via direct download", - recovered, len(missing_accs)) - except Exception as retry_error: - logger.warning("⚠️ Retry of missing sequences failed: %s. " - "Proceeding with %d/%d sequences.", - retry_error, len(downloaded_accs), len(accessions)) + logger.info( + "✅ Recovered %d/%d missing sequences via direct download", recovered, len(missing_accs) + ) + except Exception as retry_error: # noqa: BLE001 + logger.warning( + "⚠️ Retry of missing sequences failed: %s. 
Proceeding with %d/%d sequences.", + retry_error, + len(downloaded_accs), + len(accessions), + ) finally: if os.path.exists(temp_retry_path): os.remove(temp_retry_path) - + return fasta_path - + # For smaller requests, use single request - return _download_sequences_single_batch(accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, failed_commands, api_key=api_key) + return _download_sequences_single_batch( + accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, failed_commands, api_key=api_key + ) def _download_sequences_epost_efetch(accessions, fasta_path, failed_commands=None, api_key=None): - """ - Download FASTA sequences using NCBI EPost + EFetch History Server pipeline. - + """Download FASTA sequences using NCBI EPost + EFetch History Server pipeline. + This is NCBI's recommended approach for large datasets. It uploads accession IDs to the History Server via EPost, then retrieves FASTA sequences in batches using the WebEnv/query_key reference. This avoids URL length limitations and is significantly faster than individual batched requests. - + Args: accessions (list): List of accession numbers to download. fasta_path (str): Path where FASTA file should be saved. failed_commands (dict, optional): Dictionary to track failed operations. api_key (str, optional): NCBI API key for higher rate limits (10 req/sec vs 3). - - Returns: + + Returns + ------- str: Path to the saved FASTA file. - - Raises: + + Raises + ------ RuntimeError: If EPost fails or no sequences are retrieved. + """ # Resolve API key: argument > module-level env var if api_key is None: @@ -3366,64 +3612,64 @@ def _download_sequences_epost_efetch(accessions, fasta_path, failed_commands=Non # Step 1: Upload accessions to NCBI History Server via EPost web_env, query_key = _epost_accessions(accessions, api_key=api_key) - + if not web_env or not query_key: raise RuntimeError( "EPost failed: could not upload accessions to NCBI History Server. " "The server may be temporarily unavailable." 
) - + # Step 2: Fetch FASTA sequences in batches using the History Server reference total = len(accessions) retmax = EFETCH_FASTA_RETMAX total_downloaded = 0 batch_failures = 0 - + # Determine inter-batch delay based on API key availability delay = EUTILS_INTER_BATCH_DELAY_WITH_KEY if api_key else EUTILS_INTER_BATCH_DELAY - + logger.info("Fetching FASTA sequences in batches of %d (total: %d)", retmax, total) - + try: - with open(fasta_path, 'w', encoding='utf-8') as fasta_handle: + with open(fasta_path, "w", encoding="utf-8") as fasta_handle: for retstart in range(0, total, retmax): batch_num = (retstart // retmax) + 1 total_batches = (total + retmax - 1) // retmax - - logger.debug("EFetch FASTA batch %d/%d (retstart=%d, retmax=%d)", - batch_num, total_batches, retstart, retmax) - + + logger.debug( + "EFetch FASTA batch %d/%d (retstart=%d, retmax=%d)", batch_num, total_batches, retstart, retmax + ) + # Define the fetch operation for retry helper def _fetch_fasta_batch(rs=retstart): params = { - 'db': 'nucleotide', - 'WebEnv': web_env, - 'query_key': query_key, - 'retstart': rs, - 'retmax': retmax, - 'rettype': 'fasta', - 'retmode': 'text', + "db": "nucleotide", + "WebEnv": web_env, + "query_key": query_key, + "retstart": rs, + "retmax": retmax, + "rettype": "fasta", + "retmode": "text", } if api_key: - params['api_key'] = api_key - + params["api_key"] = api_key + response = requests.get( NCBI_EUTILS_BASE_EFETCH, params=params, timeout=EUTILS_TIMEOUT, - headers={'User-Agent': 'gget/1.0'} + headers={"User-Agent": "gget/1.0"}, ) response.raise_for_status() - + # Validate FASTA content text = response.text.strip() - if not text or not text.startswith('>'): + if not text or not text.startswith(">"): raise RuntimeError( - f"Invalid FASTA response for batch at retstart={rs}: " - f"response starts with '{text[:50]}'" + f"Invalid FASTA response for batch at retstart={rs}: response starts with '{text[:50]}'" ) return text - + # Use exponential backoff retry success, 
fasta_text, error_info = _retry_with_exponential_backoff( operation_name=f"EFetch FASTA batch {batch_num}/{total_batches}", @@ -3439,38 +3685,46 @@ def _fetch_fasta_batch(rs=retstart): ), failed_commands=failed_commands, ) - + if success: # Write FASTA data to file fasta_handle.write(fasta_text) - if not fasta_text.endswith('\n'): - fasta_handle.write('\n') - - seq_count = fasta_text.count('>') + if not fasta_text.endswith("\n"): + fasta_handle.write("\n") + + seq_count = fasta_text.count(">") total_downloaded += seq_count - logger.debug("Batch %d/%d: wrote %d sequences (total: %d)", - batch_num, total_batches, seq_count, total_downloaded) + logger.debug( + "Batch %d/%d: wrote %d sequences (total: %d)", + batch_num, + total_batches, + seq_count, + total_downloaded, + ) else: batch_failures += 1 - logger.warning("❌ Batch %d/%d failed after retries: %s", - batch_num, total_batches, - error_info.get('error', 'unknown')) - + logger.warning( + "❌ Batch %d/%d failed after retries: %s", + batch_num, + total_batches, + error_info.get("error", "unknown"), + ) + # Track the failure _track_failed_operation( failed_commands, - 'sequence_batches', - {'batch_num': batch_num, 'retstart': retstart, 'retmax': retmax}, - error_info if error_info else {'error': 'unknown'} + "sequence_batches", + {"batch_num": batch_num, "retstart": retstart, "retmax": retmax}, + error_info if error_info else {"error": "unknown"}, ) - + # Respect NCBI rate limits if retstart + retmax < total: time.sleep(delay) - - except IOError as e: + + except OSError as e: raise RuntimeError(f"Failed to write FASTA file {fasta_path}: {e}") from e - + # Validate results if total_downloaded == 0: # Clean up empty file @@ -3480,74 +3734,72 @@ def _fetch_fasta_batch(rs=retstart): f"EPost+EFetch pipeline downloaded 0 sequences out of {total} requested. " f"All {batch_failures} batches failed." 
) - + file_size_mb = os.path.getsize(fasta_path) / BYTES_PER_MB - logger.info("✅ EPost+EFetch pipeline complete: %d sequences downloaded (%.2f MB)", - total_downloaded, file_size_mb) - + logger.info("✅ EPost+EFetch pipeline complete: %d sequences downloaded (%.2f MB)", total_downloaded, file_size_mb) + if batch_failures > 0: - logger.warning("⚠️ %d batch(es) failed during download. %d/%d sequences retrieved.", - batch_failures, total_downloaded, total) - + logger.warning( + "⚠️ %d batch(es) failed during download. %d/%d sequences retrieved.", batch_failures, total_downloaded, total + ) + return fasta_path -def _download_sequences_single_batch(accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, failed_commands=None, api_key=None): - """ - Download sequences in a single E-utilities request with exponential backoff retries. - +def _download_sequences_single_batch( + accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, failed_commands=None, api_key=None +): + """Download sequences in a single E-utilities request with exponential backoff retries. + This function handles downloading virus sequences for a list of accessions using a single HTTP request to NCBI E-utilities. It's optimized for smaller batches (< 200 accessions) to avoid URL length limitations. Includes exponential backoff retries for transient failures. - + Args: accessions (list): List of accession numbers to download. NCBI_EUTILS_BASE_EFETCH (str): Base URL for NCBI E-utilities API. fasta_path (str): Path where FASTA file should be saved. failed_commands (dict, optional): Dictionary to track failed operations. - - Returns: + + Returns + ------- str: Path to the saved FASTA file. 
- - Raises: + + Raises + ------ RuntimeError: If the download fails after retries or response is invalid - + Note: - Validates FASTA format before saving - Includes extended timeout for large datasets - Implements exponential backoff retries for transient failures - Automatically falls back to batching if URL is too long - + Example: - >>> accessions = ['NC_045512.2', 'MN908947.3'] - >>> path = _download_sequences_single_batch(accessions, BASE_URL, 'output.fasta') + >>> accessions = ["NC_045512.2", "MN908947.3"] + >>> path = _download_sequences_single_batch(accessions, BASE_URL, "output.fasta") + """ - # Build accession string (E-utils supports comma-separated IDs) accession_string = ",".join(accessions) - + def execute_request(): - params = { - 'db': 'nucleotide', - 'id': accession_string, - 'rettype': 'fasta', - 'retmode': 'text' - } + params = {"db": "nucleotide", "id": accession_string, "rettype": "fasta", "retmode": "text"} if api_key: - params['api_key'] = api_key + params["api_key"] = api_key logger.debug("E-utilities URL: %s", NCBI_EUTILS_BASE_EFETCH) response = requests.get(NCBI_EUTILS_BASE_EFETCH, params=params, timeout=EUTILS_TIMEOUT) response.raise_for_status() - + # Verify we got FASTA data - if not response.text.strip().startswith('>'): + if not response.text.strip().startswith(">"): raise RuntimeError(f"Invalid FASTA response: {response.text[:100]}") - + return response.text - + logger.info("Initiating E-utilities request for %d accessions", len(accessions)) - + # Use exponential backoff helper for retries success, response_text, error_info = _retry_with_exponential_backoff( operation_name=f"E-utilities request ({len(accessions)} accessions)", @@ -3555,63 +3807,80 @@ def execute_request(): max_retries=API_MAX_RETRIES, initial_delay=API_INITIAL_RETRY_DELAY, backoff_multiplier=API_RETRY_BACKOFF_MULTIPLIER, - retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError, requests.exceptions.Timeout), + retryable_exceptions=( + 
requests.exceptions.ConnectionError, + requests.exceptions.HTTPError, + requests.exceptions.Timeout, + ), failed_commands=failed_commands, ) - + if not success: # Check for specific URL length error - error_msg = error_info['error'] + error_msg = error_info["error"] if "414" in error_msg or "Request-URI Too Long" in error_msg: logger.info("URL too long error detected. Retrying with batch processing...") # Retry with smaller batches (half of default) - return _download_sequences_batched(accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, batch_size=EUTILS_DEFAULT_BATCH_SIZE // 2, failed_commands=failed_commands, api_key=api_key) - + return _download_sequences_batched( + accessions, + NCBI_EUTILS_BASE_EFETCH, + fasta_path, + batch_size=EUTILS_DEFAULT_BATCH_SIZE // 2, + failed_commands=failed_commands, + api_key=api_key, + ) + # Log and track the failure logger.error("❌ E-utilities request failed after %d retries: %s", API_MAX_RETRIES, error_msg) - + # Track failed operation for later reporting in command summary retry_url = f"{NCBI_EUTILS_BASE_EFETCH}?db=nucleotide&id={accession_string}&rettype=fasta&retmode=text" _track_failed_operation( - failed_commands, - 'sequence_fetch', + failed_commands, + "sequence_fetch", { - 'operation': 'single_batch_download', - 'accession_count': len(accessions), - 'retry_url': retry_url, + "operation": "single_batch_download", + "accession_count": len(accessions), + "retry_url": retry_url, }, - error_info + error_info, ) - - raise RuntimeError(f"❌ Failed to download virus sequences via E-utilities after {API_MAX_RETRIES} retries: {error_msg}") from None - + + raise RuntimeError( + f"❌ Failed to download virus sequences via E-utilities after {API_MAX_RETRIES} retries: {error_msg}" + ) from None + # Save to file try: # Count sequences in response - sequence_count = response_text.count('>') + sequence_count = response_text.count(">") logger.info("Received %d sequences from E-utilities", sequence_count) - + # Write FASTA data to file - with 
open(fasta_path, 'w', encoding='utf-8') as f: + with open(fasta_path, "w", encoding="utf-8") as f: f.write(response_text) - - logger.info("Successfully saved sequences to: %s (%.2f MB)", - fasta_path, len(response_text.encode('utf-8')) / 1024 / 1024) + + logger.info( + "Successfully saved sequences to: %s (%.2f MB)", + fasta_path, + len(response_text.encode("utf-8")) / 1024 / 1024, + ) return fasta_path - - except IOError as e: + + except OSError as e: logger.error("❌ Failed to save FASTA file: %s", e) raise RuntimeError(f"❌ Failed to save downloaded sequences: {e}") from e -def _download_sequences_batched(accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, batch_size, failed_commands=None, api_key=None): - """ - Download sequences using multiple batched E-utilities requests with incremental file writing. - +def _download_sequences_batched( + accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, batch_size, failed_commands=None, api_key=None +): + """Download sequences using multiple batched E-utilities requests with incremental file writing. + This function handles large sequence downloads by splitting them into smaller batches and writing results incrementally to avoid memory issues. It includes robust error handling with automatic exponential backoff retries for failed batches. - + Key features: - Batched requests to avoid URL length limits - Exponential backoff retries for each batch @@ -3619,7 +3888,7 @@ def _download_sequences_batched(accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, - Automatic retry with smaller batch sizes for URL length failures - Progress tracking and detailed logging - Graceful handling of partial failures (continues after batch failures) - + Args: accessions (list): List of accession numbers to download. NCBI_EUTILS_BASE_EFETCH (str): Base URL for NCBI E-utilities API. @@ -3627,64 +3896,61 @@ def _download_sequences_batched(accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, batch_size (int): Number of accessions per batch. 
failed_commands (dict, optional): Dictionary to track failed operations. api_key (str, optional): NCBI API key for higher rate limits (10 req/sec vs 3). - - Returns: + + Returns + ------- str: Path to the saved FASTA file containing all downloaded sequences - - Raises: + + Raises + ------ RuntimeError: If all batches fail or no sequences are downloaded - + Note: - Respects NCBI rate limits with 0.5s delays between batches - Implements exponential backoff for individual batch retries - Automatically reduces batch size for URL length errors - Continues processing even if some batches fail - Writes sequences immediately to reduce memory usage - + Example: - >>> large_accession_list = ['NC_045512.2', 'MN908947.3', ...] # 1000+ accessions - >>> path = _download_sequences_batched(large_accession_list, BASE_URL, 'out.fasta', 200) + >>> large_accession_list = ["NC_045512.2", "MN908947.3", ...] # 1000+ accessions + >>> path = _download_sequences_batched(large_accession_list, BASE_URL, "out.fasta", 200) + """ - # Initialize failed_commands tracking if not already done - if failed_commands is not None and 'sequence_batches' not in failed_commands: - failed_commands['sequence_batches'] = [] - + if failed_commands is not None and "sequence_batches" not in failed_commands: + failed_commands["sequence_batches"] = [] + # Split accessions into batches - batches = [accessions[i:i + batch_size] for i in range(0, len(accessions), batch_size)] - logger.info("Downloading %d accessions in %d batches of size %d", - len(accessions), len(batches), batch_size) - + batches = [accessions[i : i + batch_size] for i in range(0, len(accessions), batch_size)] + logger.info("Downloading %d accessions in %d batches of size %d", len(accessions), len(batches), batch_size) + total_downloaded = 0 batch_failed_count = 0 - + # Open file once and write batches incrementally to avoid storing all data in memory try: - with open(fasta_path, 'w', encoding='utf-8') as f: - for batch_num, batch_accessions in 
tqdm(enumerate(batches, 1), total=len(batches), desc="Downloading batches", unit="batch"): - + with open(fasta_path, "w", encoding="utf-8") as f: + for batch_num, batch_accessions in tqdm( + enumerate(batches, 1), total=len(batches), desc="Downloading batches", unit="batch" + ): # Build accession string for this batch accession_string = ",".join(batch_accessions) - + def download_batch(): - """Callable for retry helper function""" - params = { - 'db': 'nucleotide', - 'id': accession_string, - 'rettype': 'fasta', - 'retmode': 'text' - } + """Callable for retry helper function.""" + params = {"db": "nucleotide", "id": accession_string, "rettype": "fasta", "retmode": "text"} # noqa: B023 if api_key: - params['api_key'] = api_key + params["api_key"] = api_key response = requests.get(NCBI_EUTILS_BASE_EFETCH, params=params, timeout=EUTILS_TIMEOUT) response.raise_for_status() - + # Verify we got FASTA data - if not response.text.strip().startswith('>'): + if not response.text.strip().startswith(">"): raise RuntimeError(f"Invalid FASTA response: {response.text[:100]}") - + return response.text - + # Use exponential backoff helper for batch retries success, batch_response_text, error_info = _retry_with_exponential_backoff( operation_name=f"Batch {batch_num}/{len(batches)} ({len(batch_accessions)} accessions)", @@ -3692,65 +3958,80 @@ def download_batch(): max_retries=API_MAX_RETRIES, initial_delay=API_INITIAL_RETRY_DELAY, backoff_multiplier=API_RETRY_BACKOFF_MULTIPLIER, - retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError, requests.exceptions.Timeout), + retryable_exceptions=( + requests.exceptions.ConnectionError, + requests.exceptions.HTTPError, + requests.exceptions.Timeout, + ), failed_commands=failed_commands, ) - + if success: # Count sequences in this batch - batch_sequence_count = batch_response_text.count('>') + batch_sequence_count = batch_response_text.count(">") total_downloaded += batch_sequence_count - + # Write sequences 
immediately to file (incremental write) f.write(batch_response_text) - if not batch_response_text.endswith('\n'): - f.write('\n') # Ensure proper line endings between batches + if not batch_response_text.endswith("\n"): + f.write("\n") # Ensure proper line endings between batches f.flush() # Force write to disk immediately - + # Update progress bar description with current stats - batch_size_mb = len(batch_response_text.encode('utf-8')) / BYTES_PER_MB - logger.debug(f"✓ Batch {batch_num}: Downloaded {batch_sequence_count} sequences ({batch_size_mb:.2f} MB)") - + batch_size_mb = len(batch_response_text.encode("utf-8")) / BYTES_PER_MB + logger.debug( + f"✓ Batch {batch_num}: Downloaded {batch_sequence_count} sequences ({batch_size_mb:.2f} MB)" + ) + else: # Batch failed after retries - error_msg = error_info['error'] + error_msg = error_info["error"] batch_failed_count += 1 - + # Check for URL length error if "414" in error_msg and batch_size > EUTILS_MIN_BATCH_SIZE_FOR_SPLIT: - tqdm.write(f"⚠️ WARNING: Batch {batch_num} URL too long (size={batch_size}). Retrying with smaller batch...") + tqdm.write( + f"⚠️ WARNING: Batch {batch_num} URL too long (size={batch_size}). Retrying with smaller batch..." 
+ ) # Recursively retry this batch with smaller size by splitting it further temp_batch_path = f"temp_batch_{batch_num}.fasta" try: _download_sequences_batched( - batch_accessions, NCBI_EUTILS_BASE_EFETCH, temp_batch_path, batch_size // 2, failed_commands, api_key=api_key + batch_accessions, + NCBI_EUTILS_BASE_EFETCH, + temp_batch_path, + batch_size // 2, + failed_commands, + api_key=api_key, ) # Read the temporary file and append to main file - with open(temp_batch_path, 'r', encoding='utf-8') as temp_f: + with open(temp_batch_path, encoding="utf-8") as temp_f: batch_content = temp_f.read() f.write(batch_content) - if not batch_content.endswith('\n'): - f.write('\n') + if not batch_content.endswith("\n"): + f.write("\n") f.flush() # Count sequences in this recovered batch - recovered_count = batch_content.count('>') + recovered_count = batch_content.count(">") total_downloaded += recovered_count batch_failed_count -= 1 # This batch succeeded after retry - tqdm.write(f"✓ Recovered batch {batch_num} with smaller size: {recovered_count} sequences") + tqdm.write( + f"✓ Recovered batch {batch_num} with smaller size: {recovered_count} sequences" + ) os.remove(temp_batch_path) # Clean up temp file - except Exception as file_error: + except Exception as file_error: # noqa: BLE001 tqdm.write(f"❌ Failed to recover batch {batch_num}: {file_error}") # Track the failed batch _track_failed_operation( failed_commands, - 'sequence_batches', + "sequence_batches", { - 'batch_num': batch_num, - 'accession_count': len(batch_accessions), - 'accessions': batch_accessions, - 'retry_url': f"{NCBI_EUTILS_BASE_EFETCH}?db=nucleotide&id={accession_string}&rettype=fasta&retmode=text", + "batch_num": batch_num, + "accession_count": len(batch_accessions), + "accessions": batch_accessions, + "retry_url": f"{NCBI_EUTILS_BASE_EFETCH}?db=nucleotide&id={accession_string}&rettype=fasta&retmode=text", }, - error_info + error_info, ) continue else: @@ -3758,56 +4039,60 @@ def download_batch(): 
tqdm.write(f"❌ Batch {batch_num} failed after {API_MAX_RETRIES} retries: {error_msg}") _track_failed_operation( failed_commands, - 'sequence_batches', + "sequence_batches", { - 'batch_num': batch_num, - 'accession_count': len(batch_accessions), - 'accessions': batch_accessions, - 'retry_url': f"{NCBI_EUTILS_BASE_EFETCH}?db=nucleotide&id={accession_string}&rettype=fasta&retmode=text", + "batch_num": batch_num, + "accession_count": len(batch_accessions), + "accessions": batch_accessions, + "retry_url": f"{NCBI_EUTILS_BASE_EFETCH}?db=nucleotide&id={accession_string}&rettype=fasta&retmode=text", }, - error_info + error_info, ) continue - + # Add small delay between requests to be respectful to NCBI servers if batch_num < len(batches): # Don't delay after the last batch time.sleep(EUTILS_INTER_BATCH_DELAY) - + # Check if we downloaded anything if total_downloaded == 0: raise RuntimeError("❌ All batches failed. No sequences were downloaded.") - + if batch_failed_count > 0: - logger.warning(f"⚠️ WARNING: {batch_failed_count} out of {len(batches)} batches failed. Successfully downloaded {total_downloaded} sequences.") - tqdm.write(f"⚠️ WARNING: {batch_failed_count} out of {len(batches)} batches failed. Successfully downloaded {total_downloaded} sequences.") - + logger.warning( + f"⚠️ WARNING: {batch_failed_count} out of {len(batches)} batches failed. Successfully downloaded {total_downloaded} sequences." + ) + tqdm.write( + f"⚠️ WARNING: {batch_failed_count} out of {len(batches)} batches failed. Successfully downloaded {total_downloaded} sequences." 
+ ) + file_size = os.path.getsize(fasta_path) - logger.info("Successfully saved %d sequences to: %s (%.2f MB)", - total_downloaded, fasta_path, file_size / BYTES_PER_MB) + logger.info( + "Successfully saved %d sequences to: %s (%.2f MB)", total_downloaded, fasta_path, file_size / BYTES_PER_MB + ) return fasta_path - - except IOError as e: + + except OSError as e: logger.error("❌ Failed to write FASTA file: %s", e) raise RuntimeError(f"❌ Failed to save downloaded sequences: {e}") from e def _unzip_file(zip_file_path, extract_to_path): - """ - Extract a ZIP file to a specified directory. - + """Extract a ZIP file to a specified directory. + Args: zip_file_path (str): Path to the ZIP file. extract_to_path (str): Target directory for extraction. """ os.makedirs(extract_to_path, exist_ok=True) logger.debug("Created extraction directory: %s", extract_to_path) - + try: with zipfile.ZipFile(zip_file_path, "r") as zip_ref: zip_ref.extractall(extract_to_path) file_list = zip_ref.namelist() logger.info("Extracted %d files from %s", len(file_list), zip_file_path) - + except zipfile.BadZipFile as e: raise zipfile.BadZipFile(f"Invalid or corrupted ZIP file: {zip_file_path}") from e except PermissionError as e: @@ -3817,22 +4102,24 @@ def _unzip_file(zip_file_path, extract_to_path): def _parse_date(date_str, filtername=""): - """ - Parse various date formats into a datetime object. - + """Parse various date formats into a datetime object. + Args: date_str (str): Date string to parse (various formats accepted). filtername (str): Name of the filter/field for error reporting. - - Returns: + + Returns + ------- datetime: Parsed datetime object, or None if parsing fails. - - Raises: + + Raises + ------ ValueError: If date parsing fails - + Note: Uses a default date of year 1500 for incomplete date strings to ensure proper comparison behavior with minimum date filters. 
+ """ try: # Use dateutil parser for flexible date parsing @@ -3840,7 +4127,7 @@ def _parse_date(date_str, filtername=""): parsed_date = parser.parse(date_str, default=datetime(DATE_PARSE_DEFAULT_YEAR, 1, 1)) logger.debug("Successfully parsed date '%s' as %s", date_str, parsed_date) return parsed_date - + except (ValueError, TypeError) as exc: error_msg = ( f"Invalid date detected for argument {filtername}: '{date_str}'.\n" @@ -3859,53 +4146,55 @@ def _parse_date(date_str, filtername=""): def _parse_partial_date_for_range_check(date_str, for_min_comparison=True, filtername=""): - """ - Parse partial dates with range-aware handling for comparison. - + """Parse partial dates with range-aware handling for comparison. + When comparing partial dates (year-only or year-month) against specific dates, we need to handle them based on the comparison direction: - + - For min_collection_date comparisons: use the END of the partial range (e.g., "2015" -> 2015-12-31, "2015-06" -> 2015-06-30, "2021/2022" -> 2022-12-31) This ensures records from that year/month are included if they COULD be >= min. - + - For max_collection_date comparisons: use the START of the partial range (e.g., "2015" -> 2015-01-01, "2015-06" -> 2015-06-01, "2021/2022" -> 2021-01-01) This ensures records from that year/month are included if they COULD be <= max. - + Args: date_str (str): Date string to parse (various formats). for_min_comparison (bool): True if comparing against min date, False for max date. filtername (str): Name of the filter for error messages. - - Returns: + + Returns + ------- datetime: Parsed datetime object with partial dates adjusted appropriately. - - Raises: + + Raises + ------ ValueError: If date parsing fails. 
- """ + + """ if not date_str or not date_str.strip(): raise ValueError(f"Empty date string for {filtername}") - + date_str = date_str.strip() - + # Detect date precision based on format # Year-only: "2015" (4 digits) # Year-range: "2021/2022" or "2021-2022" (two 4-digit years) # Year-month: "2015-06", "2015/06", "Jun 2015", etc. # Full date: "2015-06-15", "2015/06/15", "Jun 15, 2015", etc. - - year_only_pattern = r'^(\d{4})$' - year_month_pattern = r'^(\d{4})[-/](\d{1,2})$' - year_range_pattern = r'^(\d{4})[-/](\d{4})$' + + year_only_pattern = r"^(\d{4})$" + year_month_pattern = r"^(\d{4})[-/](\d{1,2})$" + year_range_pattern = r"^(\d{4})[-/](\d{4})$" # NCBI API returns date ranges as "[2021 TO 2022]" or "[2021-06 TO 2022-03]" - bracket_range_pattern = r'^\[(.+?)\s+TO\s+(.+?)\]$' - + bracket_range_pattern = r"^\[(.+?)\s+TO\s+(.+?)\]$" + year_match = re.match(year_only_pattern, date_str) year_month_match = re.match(year_month_pattern, date_str) year_range_match = re.match(year_range_pattern, date_str) bracket_range_match = re.match(bracket_range_pattern, date_str, re.IGNORECASE) - + try: if bracket_range_match: # Bracket range from NCBI API like "[2021 TO 2022]" or "[2021-06 TO 2022-03]" @@ -3916,16 +4205,18 @@ def _parse_partial_date_for_range_check(date_str, for_min_comparison=True, filte end_result = _parse_partial_date_for_range_check( range_end_str, for_min_comparison=True, filtername=filtername ) - logger.debug("Parsed bracket-range date '%s' as %s (end of range for min comparison)", - date_str, end_result) + logger.debug( + "Parsed bracket-range date '%s' as %s (end of range for min comparison)", date_str, end_result + ) return end_result else: # For max comparison, use the START of the range start_result = _parse_partial_date_for_range_check( range_start_str, for_min_comparison=False, filtername=filtername ) - logger.debug("Parsed bracket-range date '%s' as %s (start of range for max comparison)", - date_str, start_result) + logger.debug( + "Parsed 
bracket-range date '%s' as %s (start of range for max comparison)", date_str, start_result + ) return start_result elif year_range_match: @@ -3935,13 +4226,11 @@ def _parse_partial_date_for_range_check(date_str, for_min_comparison=True, filte if for_min_comparison: # For min comparison, use end of the range (Dec 31 of end year) result = datetime(year_end, 12, 31) - logger.debug("Parsed year-range date '%s' as %s (end of range for min comparison)", - date_str, result) + logger.debug("Parsed year-range date '%s' as %s (end of range for min comparison)", date_str, result) else: # For max comparison, use start of the range (Jan 1 of start year) result = datetime(year_start, 1, 1) - logger.debug("Parsed year-range date '%s' as %s (start of range for max comparison)", - date_str, result) + logger.debug("Parsed year-range date '%s' as %s (start of range for max comparison)", date_str, result) return result elif year_match: @@ -3950,15 +4239,13 @@ def _parse_partial_date_for_range_check(date_str, for_min_comparison=True, filte if for_min_comparison: # For min comparison, use end of year (Dec 31) result = datetime(year, 12, 31) - logger.debug("Parsed year-only date '%s' as %s (end of year for min comparison)", - date_str, result) + logger.debug("Parsed year-only date '%s' as %s (end of year for min comparison)", date_str, result) else: # For max comparison, use start of year (Jan 1) result = datetime(year, 1, 1) - logger.debug("Parsed year-only date '%s' as %s (start of year for max comparison)", - date_str, result) + logger.debug("Parsed year-only date '%s' as %s (start of year for max comparison)", date_str, result) return result - + elif year_month_match: # Year-month date like "2015-06" year = int(year_month_match.group(1)) @@ -3967,18 +4254,16 @@ def _parse_partial_date_for_range_check(date_str, for_min_comparison=True, filte # For min comparison, use end of month _, last_day = calendar.monthrange(year, month) result = datetime(year, month, last_day) - 
logger.debug("Parsed year-month date '%s' as %s (end of month for min comparison)", - date_str, result) + logger.debug("Parsed year-month date '%s' as %s (end of month for min comparison)", date_str, result) else: # For max comparison, use start of month result = datetime(year, month, 1) - logger.debug("Parsed year-month date '%s' as %s (start of month for max comparison)", - date_str, result) + logger.debug("Parsed year-month date '%s' as %s (start of month for max comparison)", date_str, result) return result else: # Full date - use standard parsing return _parse_date(date_str, filtername=filtername) - + except (ValueError, TypeError) as exc: error_msg = ( f"Invalid date detected for argument {filtername}: '{date_str}'.\n" @@ -3990,40 +4275,40 @@ def _parse_partial_date_for_range_check(date_str, for_min_comparison=True, filte def _write_fasta_record(handle, record): - """ - Write a single FASTA record to an open file handle. - + """Write a single FASTA record to an open file handle. + Args: handle: Open file handle for writing. record: FastaRecord object with id, description, and seq attributes. """ - if hasattr(record, 'description') and record.description: + if hasattr(record, "description") and record.description: handle.write(f">{record.id} {record.description}\n") else: handle.write(f">{record.id}\n") seq_str = str(record.seq) for i in range(0, len(seq_str), 70): - handle.write(seq_str[i:i+70] + '\n') + handle.write(seq_str[i : i + 70] + "\n") def _stream_copy_fasta(input_path, output_path, accession_set=None): - """ - Stream-copy FASTA records from input to output, optionally filtering by accession set. - + """Stream-copy FASTA records from input to output, optionally filtering by accession set. + This avoids loading all sequences into RAM — only one record at a time is in memory. For large datasets (millions of sequences), this is critical to avoid out-of-memory errors. - + Args: input_path (str): Path to input FASTA file. 
output_path (str): Path to output FASTA file. accession_set (set, optional): If provided, only copy records whose ID is in this set. - - Returns: + + Returns + ------- int: Number of records written. + """ count = 0 skipped = 0 - with open(output_path, 'w', encoding='utf-8') as out_handle: + with open(output_path, "w", encoding="utf-8") as out_handle: for record in FastaIO.parse(input_path, "fasta"): if accession_set is not None and record.id not in accession_set: skipped += 1 @@ -4032,7 +4317,7 @@ def _stream_copy_fasta(input_path, output_path, accession_set=None): count += 1 if count % FASTA_STREAM_LOG_INTERVAL == 0: logger.debug("Streamed %d FASTA records so far...", count) - + if accession_set is not None: logger.info("Stream-copied %d FASTA records (%d skipped by accession filter)", count, skipped) else: @@ -4041,51 +4326,52 @@ def _stream_copy_fasta(input_path, output_path, accession_set=None): def _load_metadata_dict_from_temp_jsonl(temp_file_path): - """ - Stream metadata records from a temporary JSONL file and build metadata_dict directly. - + """Stream metadata records from a temporary JSONL file and build metadata_dict directly. + This function reads the JSONL file line-by-line, converting each raw API report to the internal metadata format. This avoids loading the entire raw API response list into memory at once. - + The conversion logic mirrors load_metadata_from_api_reports() but processes records one at a time from disk. - + Args: temp_file_path (str): Path to the temporary JSONL file containing raw API reports. - - Returns: + + Returns + ------- dict: Dictionary mapping accession numbers to metadata dictionaries. Same format as load_metadata_from_api_reports(). 
+ """ metadata_dict = {} processed_count = 0 skipped_count = 0 - + if not temp_file_path or not os.path.exists(temp_file_path): logger.warning("Temporary metadata file not found: %s", temp_file_path) return metadata_dict - + file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024) logger.info("Loading metadata from temp file: %s (%.2f MB)", temp_file_path, file_size_mb) - - with open(temp_file_path, 'r', encoding='utf-8') as f: + + with open(temp_file_path, encoding="utf-8") as f: for line_num, line in enumerate(f, 1): line = line.strip() if not line: continue - + try: report = json.loads(line) except json.JSONDecodeError: skipped_count += 1 logger.debug("Skipping malformed JSON at line %d", line_num) continue - + # Extract the accession number accession = report.get("accession") - + if accession: processed_count += 1 - + # Transform API report format to internal format (same as load_metadata_from_api_reports) metadata = { "accession": accession, @@ -4103,8 +4389,8 @@ def _load_metadata_dict_from_temp_jsonl(temp_file_path): "sourceDatabase": report.get("source_database", ""), "isolateName": report.get("isolate", {}).get("name", ""), "isolate": { - 'collectionDate': report.get("isolate", {}).get("collection_date", ""), - 'source': report.get("isolate", {}).get("source", ""), + "collectionDate": report.get("isolate", {}).get("collection_date", ""), + "source": report.get("isolate", {}).get("source", ""), }, "virusTaxId": report.get("virus", {}).get("tax_id", None), "virusName": report.get("virus", {}).get("organism_name", ""), @@ -4119,62 +4405,62 @@ def _load_metadata_dict_from_temp_jsonl(temp_file_path): "submitterCountry": report.get("submitter", {}).get("country", ""), "submitterInstitution": report.get("submitter", {}).get("affiliation", ""), } - + metadata_dict[accession] = metadata else: skipped_count += 1 - + # Log progress for large files if processed_count > 0 and processed_count % 500000 == 0: logger.info(" ... 
processed %d records from temp file", processed_count) - - logger.info("Loaded %d metadata records from temp file (skipped %d)", - processed_count, skipped_count) - + + logger.info("Loaded %d metadata records from temp file (skipped %d)", processed_count, skipped_count) + return metadata_dict def _load_cached_metadata_from_jsonl(jsonl_path): - """ - Load cached metadata from a JSONL file where records are already in internal format. - + """Load cached metadata from a JSONL file where records are already in internal format. + Unlike _load_metadata_dict_from_temp_jsonl (which transforms raw API format), this function loads records that are already transformed (from process_cached_download). Each line is a JSON object with 'accession' key and all metadata fields directly. - + Args: jsonl_path (str): Path to the cached metadata JSONL file. - - Returns: + + Returns + ------- dict: Dictionary mapping accession numbers to metadata dictionaries. + """ metadata_dict = {} processed_count = 0 - + if not jsonl_path or not os.path.exists(jsonl_path): logger.warning("Cached metadata JSONL file not found: %s", jsonl_path) return metadata_dict - + file_size_mb = os.path.getsize(jsonl_path) / (1024 * 1024) logger.info("Loading cached metadata from JSONL: %s (%.2f MB)", jsonl_path, file_size_mb) - - with open(jsonl_path, 'r', encoding='utf-8') as f: + + with open(jsonl_path, encoding="utf-8") as f: for line in f: line = line.strip() if not line: continue - + try: metadata = json.loads(line) except json.JSONDecodeError: continue - + accession = metadata.get("accession") if accession: metadata_dict[accession] = metadata processed_count += 1 - + if processed_count % 500000 == 0: logger.info(" ... 
loaded %d cached metadata records", processed_count) - + logger.info("Loaded %d cached metadata records from JSONL", processed_count) return metadata_dict @@ -4190,13 +4476,12 @@ def _stream_filter_cached_metadata_from_jsonl( min_release_date=None, applied_strategy_filters=None, ): - """ - Stream cached metadata from a JSONL file, applying filters on-the-fly. - + """Stream cached metadata from a JSONL file, applying filters on-the-fly. + This is the memory-efficient equivalent of loading ALL records into a dict and then calling filter_cached_metadata_for_unused_filters(). Only records that pass ALL filters are kept in memory. - + Args: jsonl_path (str): Path to the cached metadata JSONL file. host (str, optional): Host organism filter. @@ -4207,192 +4492,192 @@ def _stream_filter_cached_metadata_from_jsonl( refseq_only (bool, optional): RefSeq only filter. min_release_date (str, optional): Minimum release date filter. applied_strategy_filters (list, optional): Filters already applied server-side. 
- - Returns: + + Returns + ------- tuple: (metadata_dict, total_records, filter_stats) - metadata_dict: dict mapping accession to metadata (only passing records) - total_records: total number of records scanned - filter_stats: dict with counts of records filtered by each category + """ if applied_strategy_filters is None: applied_strategy_filters = [] - + # Determine which filters to actually apply filters_active = {} - if 'host' not in applied_strategy_filters and host: - filters_active['host'] = host - if 'complete-only' not in applied_strategy_filters and complete_only: - filters_active['complete_only'] = True - if 'annotated' not in applied_strategy_filters and annotated: - filters_active['annotated'] = True - if 'lineage' not in applied_strategy_filters and lineage: - filters_active['lineage'] = lineage + if "host" not in applied_strategy_filters and host: + filters_active["host"] = host + if "complete-only" not in applied_strategy_filters and complete_only: + filters_active["complete_only"] = True + if "annotated" not in applied_strategy_filters and annotated: + filters_active["annotated"] = True + if "lineage" not in applied_strategy_filters and lineage: + filters_active["lineage"] = lineage if geographic_location: - filters_active['geographic_location'] = geographic_location + filters_active["geographic_location"] = geographic_location if refseq_only: - filters_active['refseq_only'] = True + filters_active["refseq_only"] = True if min_release_date: - filters_active['min_release_date'] = min_release_date - + filters_active["min_release_date"] = min_release_date + # Parse min_release_date once min_release_date_parsed = None - if 'min_release_date' in filters_active: + if "min_release_date" in filters_active: min_release_date_parsed = _parse_date(min_release_date, filtername="min_release_date") - + metadata_dict = {} total_records = 0 filter_stats = { - 'host': 0, - 'complete_only': 0, - 'annotated': 0, - 'lineage': 0, - 'geographic_location': 0, - 
'refseq_only': 0, - 'min_release_date': 0, + "host": 0, + "complete_only": 0, + "annotated": 0, + "lineage": 0, + "geographic_location": 0, + "refseq_only": 0, + "min_release_date": 0, } - + if not jsonl_path or not os.path.exists(jsonl_path): logger.warning("Cached metadata JSONL not found for streaming filter: %s", jsonl_path) return metadata_dict, 0, filter_stats - + file_size_mb = os.path.getsize(jsonl_path) / (1024 * 1024) logger.info("Stream-filtering cached metadata from JSONL: %s (%.2f MB)", jsonl_path, file_size_mb) if filters_active: logger.info("Active filters: %s", list(filters_active.keys())) else: logger.info("No filters to apply — loading all records") - - with open(jsonl_path, 'r', encoding='utf-8') as f: + + with open(jsonl_path, encoding="utf-8") as f: for line in f: line = line.strip() if not line: continue - + try: metadata = json.loads(line) except json.JSONDecodeError: continue - + accession = metadata.get("accession") if not accession: continue - + total_records += 1 - + # Apply filters — skip record if any filter fails skip = False - + # Host filter - if not skip and 'host' in filters_active: - host_name = metadata.get('hostName', '') + if not skip and "host" in filters_active: + host_name = metadata.get("hostName", "") if not host_name or host.lower() not in host_name.lower(): - filter_stats['host'] += 1 + filter_stats["host"] += 1 skip = True - + # Complete-only filter - if not skip and 'complete_only' in filters_active: - completeness = metadata.get('completeness', '') - if not completeness or completeness.lower() != 'complete': - filter_stats['complete_only'] += 1 + if not skip and "complete_only" in filters_active: + completeness = metadata.get("completeness", "") + if not completeness or completeness.lower() != "complete": + filter_stats["complete_only"] += 1 skip = True - + # Annotated filter - if not skip and 'annotated' in filters_active: - is_annotated = metadata.get('isAnnotated', False) + if not skip and "annotated" in 
filters_active: + is_annotated = metadata.get("isAnnotated", False) if not is_annotated: - filter_stats['annotated'] += 1 + filter_stats["annotated"] += 1 skip = True - + # Lineage filter - if not skip and 'lineage' in filters_active: - virus_pangolin = metadata.get('virusPangolinClassification', '') + if not skip and "lineage" in filters_active: + virus_pangolin = metadata.get("virusPangolinClassification", "") if not virus_pangolin or lineage.lower() not in str(virus_pangolin).lower(): - filter_stats['lineage'] += 1 + filter_stats["lineage"] += 1 skip = True - + # Geographic location filter - if not skip and 'geographic_location' in filters_active: - geo_loc = metadata.get('location', '') or '' - geo_region = metadata.get('region', '') or '' - virus_name = metadata.get('virusName', '') or '' + if not skip and "geographic_location" in filters_active: + geo_loc = metadata.get("location", "") or "" + geo_region = metadata.get("region", "") or "" + virus_name = metadata.get("virusName", "") or "" geo_filter = geographic_location.lower() loc_matches = geo_loc and geo_filter in geo_loc.lower() region_matches = geo_region and geo_filter in geo_region.lower() virus_name_matches = virus_name and geo_filter in virus_name.lower() if not loc_matches and not region_matches and not virus_name_matches: - filter_stats['geographic_location'] += 1 + filter_stats["geographic_location"] += 1 skip = True - + # RefSeq only filter - if not skip and 'refseq_only' in filters_active: - is_refseq = metadata.get('sourceDatabase', '').lower() == 'refseq' + if not skip and "refseq_only" in filters_active: + is_refseq = metadata.get("sourceDatabase", "").lower() == "refseq" if not is_refseq: - filter_stats['refseq_only'] += 1 + filter_stats["refseq_only"] += 1 skip = True - + # Minimum release date filter - if not skip and 'min_release_date' in filters_active and min_release_date_parsed: - release_date_str = metadata.get('releaseDate', '') + if not skip and "min_release_date" in filters_active 
and min_release_date_parsed: + release_date_str = metadata.get("releaseDate", "") if not release_date_str: - filter_stats['min_release_date'] += 1 + filter_stats["min_release_date"] += 1 skip = True else: try: release_date = _parse_date(release_date_str, filtername="releaseDate") if release_date and release_date < min_release_date_parsed: - filter_stats['min_release_date'] += 1 + filter_stats["min_release_date"] += 1 skip = True except (ValueError, TypeError): - filter_stats['min_release_date'] += 1 + filter_stats["min_release_date"] += 1 skip = True - + if not skip: metadata_dict[accession] = metadata - + # Log progress every 1M records if total_records % 1000000 == 0: - logger.info(" ... scanned %d records, %d passing filters so far", - total_records, len(metadata_dict)) - - logger.info("Stream-filter complete: scanned %d records, %d passed all filters", - total_records, len(metadata_dict)) + logger.info(" ... scanned %d records, %d passing filters so far", total_records, len(metadata_dict)) + + logger.info("Stream-filter complete: scanned %d records, %d passed all filters", total_records, len(metadata_dict)) if any(v > 0 for v in filter_stats.values()): - logger.info("Filter statistics: %s", - {k: v for k, v in filter_stats.items() if v > 0}) - + logger.info("Filter statistics: %s", {k: v for k, v in filter_stats.items() if v > 0}) + return metadata_dict, total_records, filter_stats def load_metadata_from_api_reports(api_reports): - """ - Load metadata from API response reports into a dictionary. - + """Load metadata from API response reports into a dictionary. + This function transforms the raw API response format into a standardized internal metadata format that can be used by the filtering functions. It maps API field names to the expected internal field names and handles missing or null values appropriately. - + Args: api_reports (list): List of virus metadata reports from the NCBI API. 
- - Returns: + + Returns + ------- dict: Dictionary mapping accession numbers to metadata dictionaries Key: accession number (str) Value: metadata dictionary with standardized field names + """ metadata_dict = {} processed_count = 0 skipped_count = 0 - + logger.debug("Processing %d API reports into metadata dictionary", len(api_reports)) - + for report in api_reports: # Extract the accession number - this serves as our unique identifier accession = report.get("accession") - + if accession: processed_count += 1 - + # Transform API report format to match expected internal metadata format # Map API fields to expected internal field names with appropriate defaults metadata = { @@ -4400,7 +4685,9 @@ def load_metadata_from_api_reports(api_reports): "length": report.get("length"), # Sequence length in nucleotides # "source": "NCBI_REST_API", "geneCount": report.get("gene_count"), # Number of genes annotated - "completeness": (report.get("completeness") or "").lower(), # Completeness status (e.g., complete, partial) + "completeness": ( + report.get("completeness") or "" + ).lower(), # Completeness status (e.g., complete, partial) "host": report.get("host", {}), # Host organism details "hostName": report.get("host", {}).get("organism_name", ""), # Host organism name "hostTaxId": report.get("host", {}).get("tax_id", None), # Host taxonomy ID @@ -4414,69 +4701,68 @@ def load_metadata_from_api_reports(api_reports): "isolateName": report.get("isolate", {}).get("name", ""), # Isolate name "isolate": { # 'name': report.get("isolate", {}).get("name", ""), - 'collectionDate': report.get("isolate", {}).get("collection_date", ""), - 'source': report.get("isolate", {}).get("source", ""), + "collectionDate": report.get("isolate", {}).get("collection_date", ""), + "source": report.get("isolate", {}).get("source", ""), }, "virusTaxId": report.get("virus", {}).get("tax_id", None), # Virus taxonomy and classification "virusName": report.get("virus", {}).get("organism_name", ""), # Virus 
name "isAnnotated": report.get("is_annotated", False), # Whether sequence is annotated "releaseDate": report.get("release_date", ""), # When sequence was released - # "sraAccessions": report.get("sra_accessions", []), # SRA read data accessions - # "bioprojects": report.get("bioprojects", []), # Associated BioProject IDs - # "biosample": report.get("biosample"), # BioSample ID + # "sraAccessions": report.get("sra_accessions", []), # SRA read data accessions + # "bioprojects": report.get("bioprojects", []), # Associated BioProject IDs + # "biosample": report.get("biosample"), # BioSample ID "proteinCount": report.get("protein_count"), # Number of proteins "maturePeptideCount": report.get("mature_peptide_count"), # Number of mature peptides "segment": report.get("segment"), # Virus segment identifier (e.g., 'HA', 'NA', 'PB1') "isVaccineStrain": report.get("is_vaccine_strain", False), # Whether this is a vaccine strain - "virusPangolinClassification" : report.get("virus", {}).get("pangolin_classification", {}), # Pangolin lineage classification - "submitterName" : report.get("submitter", {}).get("names", ""), # Submitter names - "submitterCountry" : report.get("submitter", {}).get("country", ""), # Submitter country - "submitterInstitution" : report.get("submitter", {}).get("affiliation", "") # Submitter institution + "virusPangolinClassification": report.get("virus", {}).get( + "pangolin_classification", {} + ), # Pangolin lineage classification + "submitterName": report.get("submitter", {}).get("names", ""), # Submitter names + "submitterCountry": report.get("submitter", {}).get("country", ""), # Submitter country + "submitterInstitution": report.get("submitter", {}).get("affiliation", ""), # Submitter institution } - + # Store the metadata using accession as the key metadata_dict[accession] = metadata - # logger.debug("Processed metadata for accession: %s (length: %s, host: %s)", - # accession, - # metadata.get("length"), + # logger.debug("Processed metadata for 
accession: %s (length: %s, host: %s)", + # accession, + # metadata.get("length"), # metadata.get("host", {}).get("organism_name", "Unknown")) - + else: # Skip reports without accession numbers skipped_count += 1 logger.warning("Skipping API report without accession number: %s", report) - - logger.info("Processed %d metadata records, skipped %d records without accessions", - processed_count, skipped_count) - - return metadata_dict + logger.info("Processed %d metadata records, skipped %d records without accessions", processed_count, skipped_count) + return metadata_dict def _check_protein_requirements(record, metadata, has_proteins, proteins_complete): - """ - Check if a sequence meets protein/gene requirements based on FASTA header. - + """Check if a sequence meets protein/gene requirements based on FASTA header. + This function validates whether a virus sequence contains required proteins or genes by checking the FASTA header. For segmented viruses (like influenza), this checks segment/protein labels in the sequence description. - + The function extracts the protein/segment portion of the header by: 1. Splitting the description on the isolate name (if available in metadata) 2. Splitting by semicolons to get individual protein/segment parts 3. 
Using regex to match protein names (case-insensitive, handles quotes/parentheses) - + Args: record: FastaRecord object containing the sequence and description metadata (dict): Metadata dictionary for this accession has_proteins (str/list/None): Required protein(s)/gene(s) to check for Can be a single string or list of strings proteins_complete (bool): Whether proteins must be marked as "complete" in header - - Returns: + + Returns + ------- bool: True if protein requirements are met, False otherwise - + Example: >>> # Check for HA segment >>> _check_protein_requirements(record, metadata, "HA", False) @@ -4484,12 +4770,12 @@ def _check_protein_requirements(record, metadata, has_proteins, proteins_complet >>> # Check for multiple segments, requiring complete >>> _check_protein_requirements(record, metadata, ["HA", "NA"], True) True # Only if both HA and NA are present AND marked "complete" + """ - # If no protein filter specified and proteins_complete is False, pass through if has_proteins is None and not proteins_complete: return True - + # If only proteins_complete is True but no specific proteins required, # we can't check completion status without knowing which proteins to look for if has_proteins is None and proteins_complete: @@ -4501,11 +4787,11 @@ def _check_protein_requirements(record, metadata, has_proteins, proteins_complet logger.debug("Sequence %s has no protein/gene annotations", record.id) return False return True - + # Convert single string to list for uniform processing if isinstance(has_proteins, str): has_proteins = [has_proteins] - + try: # Extract the protein/segment portion of the header # If isolate name exists in metadata, split on it to get just the protein info @@ -4516,75 +4802,69 @@ def _check_protein_requirements(record, metadata, has_proteins, proteins_complet # If sample name was not added to metadata, # whole header will be searched for protein/segment names prot_header = record.description - + # Split header into parts by semicolon 
for checking individual annotations prot_parts = prot_header.split(";") - + # Check that ALL required proteins are present for protein in has_proteins: # Dynamically create regex for each protein with case insensitivity # Handles optional quotes, parentheses around protein names regex = rf"(?i)\b['\",]?\(?{re.escape(protein)}\)?['\",]?\b" - + if proteins_complete: # Only keeping sequences for which proteins are marked as "complete" - if not any( - re.search(regex, part) and "complete" in part.lower() - for part in prot_parts - ): - logger.debug("Sequence %s: protein '%s' not found or not complete", - record.id, protein) + if not any(re.search(regex, part) and "complete" in part.lower() for part in prot_parts): + logger.debug("Sequence %s: protein '%s' not found or not complete", record.id, protein) return False else: # Just check if protein name appears anywhere in header parts if not any(re.search(regex, part) for part in prot_parts): - logger.debug("Sequence %s: required protein '%s' not found in header", - record.id, protein) + logger.debug("Sequence %s: required protein '%s' not found in header", record.id, protein) return False - - logger.debug("Sequence %s passed protein requirements: %s (complete=%s)", - record.id, has_proteins, proteins_complete) + + logger.debug( + "Sequence %s passed protein requirements: %s (complete=%s)", record.id, has_proteins, proteins_complete + ) return True - - except Exception as e: + + except Exception as e: # noqa: BLE001 logger.warning( - f"The 'has_proteins' filter could not be applied to sequence {record.id} " - f"due to the following error:\n{e}" + f"The 'has_proteins' filter could not be applied to sequence {record.id} due to the following error:\n{e}" ) # On error, exclude the sequence (conservative approach) return False def _extract_protein_info_from_header(description, metadata=None): - """ - Extract protein/segment information from FASTA header. - + """Extract protein/segment information from FASTA header. 
+ This function extracts the protein/segment portion of the FASTA description by splitting on the isolate name (if available in metadata). This is particularly important for segmented viruses like influenza. - + The extraction logic matches the original Laura_OG implementation: 1. If isolate name exists in metadata, split description on it and take the last part 2. Otherwise, use the entire description as the protein/segment info - + Args: description (str): FASTA header/description line metadata (dict, optional): Metadata dictionary that may contain isolate name - - Returns: + + Returns + ------- str: Extracted protein/segment information, or pd.NA if extraction fails - + Example: >>> _extract_protein_info_from_header( - ... "NC_001234 A/California/07/2009 HA; complete cds", - ... {"isolate": {"name": "A/California/07/2009"}} + ... "NC_001234 A/California/07/2009 HA; complete cds", {"isolate": {"name": "A/California/07/2009"}} ... ) " HA; complete cds" + """ - if not description: return pd.NA - + try: # If isolate name exists in metadata, split on it to get just the protein info if metadata is not None: @@ -4592,12 +4872,12 @@ def _extract_protein_info_from_header(description, metadata=None): if isolate_name: prot_header = description.split(isolate_name)[-1] return prot_header - + # If sample name was not added to metadata, # whole header will be added as protein/segment description return description - - except Exception: + + except Exception: # noqa: BLE001 return pd.NA @@ -4609,17 +4889,16 @@ def filter_sequences( proteins_complete=False, output_fasta_path=None, ): - """ - Apply sequence-dependent filters to downloaded sequences. - + """Apply sequence-dependent filters to downloaded sequences. + Applies filters requiring actual sequence data (ambiguous character counting, protein/feature analysis). Metadata-only filters should be applied by filter_metadata_only before downloading sequences. 
- + When output_fasta_path is provided, filtered sequences are streamed directly to the output file instead of accumulating in memory. This is critical for large datasets (millions of sequences) that would otherwise exhaust system RAM. - + Args: fna_file (str): Path to FASTA file containing sequences. metadata_dict (dict): Dictionary mapping accession numbers to metadata. @@ -4628,29 +4907,31 @@ def filter_sequences( proteins_complete (bool): Whether proteins must be complete. output_fasta_path (str, optional): Path to write filtered sequences directly. When provided, sequences are streamed to disk instead of held in memory. - - Returns: + + Returns + ------- tuple: (filtered_count, filtered_metadata, protein_headers) - filtered_count (int): Number of sequences that passed all filters. - filtered_metadata (list): Metadata dicts for sequences passing filters. - protein_headers (list): Protein/segment info from FASTA headers. + """ logger.info("Applying sequence-dependent filters...") - logger.debug("Sequence filters: max_ambiguous=%s, complete=%s, streaming=%s", - max_ambiguous_chars, proteins_complete, output_fasta_path is not None) - + logger.debug( + "Sequence filters: max_ambiguous=%s, complete=%s, streaming=%s", + max_ambiguous_chars, + proteins_complete, + output_fasta_path is not None, + ) + # Initialize lists to store filtered results (metadata is small, kept in memory) - filtered_metadata = [] # Will store corresponding metadata dictionaries - protein_headers = [] # Will store protein/segment information from FASTA headers - filtered_count = 0 # Count of sequences passing all filters - + filtered_metadata = [] # Will store corresponding metadata dictionaries + protein_headers = [] # Will store protein/segment information from FASTA headers + filtered_count = 0 # Count of sequences passing all filters + # Counters for logging filter statistics total_sequences = 0 - filter_stats = { - 'seq_length': 0, - 'ambiguous_chars': 0, - 'proteins': 0 - } + filter_stats 
= {"seq_length": 0, "ambiguous_chars": 0, "proteins": 0} # Read and process sequences from the FASTA file # When output_fasta_path is set, write passing records directly to disk @@ -4658,27 +4939,27 @@ def filter_sequences( output_handle = None try: if output_fasta_path: - output_handle = open(output_fasta_path, 'w', encoding='utf-8') + output_handle = open(output_fasta_path, "w", encoding="utf-8") logger.info("Streaming filtered sequences directly to: %s", output_fasta_path) - + for record in FastaIO.parse(fna_file, "fasta"): total_sequences += 1 record_passes = True - + # Normalize accession by taking only the first part (before space) - record_accession = record.id.split()[0] if hasattr(record, 'id') else str(record) - + record_accession = record.id.split()[0] if hasattr(record, "id") else str(record) + # Count ambiguous characters (N's) if max_ambiguous_chars is not None: - ambiguous_count = record.seq.upper().count('N') + ambiguous_count = record.seq.upper().count("N") if ambiguous_count > max_ambiguous_chars: - filter_stats['ambiguous_chars'] += 1 + filter_stats["ambiguous_chars"] += 1 record_passes = False continue - + # Get metadata for this record to check protein information record_metadata = metadata_dict.get(record_accession, {}) - + if proteins_complete: protein_count = record_metadata.get("proteinCount", 0) gene_count = record_metadata.get("geneCount", 0) @@ -4686,21 +4967,20 @@ def filter_sequences( if gene_count is None or gene_count == 0: logger.debug("Sequence %s has no protein/gene annotations", record.id) record_passes = False - filter_stats['proteins'] += 1 + filter_stats["proteins"] += 1 continue - + # If sequence passed all filters, keep it and its metadata if record_passes: filtered_count += 1 filtered_metadata.append(record_metadata) - + # Write directly to output file if streaming (memory-efficient) if output_handle: _write_fasta_record(output_handle, record) - + if filtered_count % FASTA_STREAM_LOG_INTERVAL == 0: - logger.debug("Processed 
%d sequences, %d passed filters so far...", - total_sequences, filtered_count) + logger.debug("Processed %d sequences, %d passed filters so far...", total_sequences, filtered_count) finally: if output_handle: output_handle.close() @@ -4708,11 +4988,11 @@ def filter_sequences( # Log filtering results logger.info("Sequence filter results:") logger.info("- Total sequences processed: %d", total_sequences) - logger.info("- Filtered out because of sequence length: %d", filter_stats['seq_length']) - logger.info("- Filtered out because of number of ambiguous characters: %d", filter_stats['ambiguous_chars']) - logger.info("- Filtered out because of protein requirements: %d", filter_stats['proteins']) + logger.info("- Filtered out because of sequence length: %d", filter_stats["seq_length"]) + logger.info("- Filtered out because of number of ambiguous characters: %d", filter_stats["ambiguous_chars"]) + logger.info("- Filtered out because of protein requirements: %d", filter_stats["proteins"]) logger.info("- Sequences passing all filters: %d", filtered_count) - + return filtered_count, filtered_metadata, protein_headers, filter_stats @@ -4743,30 +5023,28 @@ def save_command_summary( total_after_genbank_filter=None, total_after_sequence_filter=None, ): - """ - Save a summary file documenting the command execution and results. - + """Save a summary file documenting the command execution and results. + Creates a comprehensive summary including command line, statistics, output files, and any errors encountered. 
""" - # Get versions if not provided if gget_version is None: gget_version = _get_gget_version() - + summary_file = os.path.join(outfolder, "command_summary.txt") - + try: - with open(summary_file, 'w', encoding='utf-8') as f: + with open(summary_file, "w", encoding="utf-8") as f: # Header f.write("=" * 80 + "\n") f.write("GGET VIRUS COMMAND SUMMARY\n") f.write("=" * 80 + "\n\n") - + # Timestamp f.write(f"Execution Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") f.write(f"Output Folder: {outfolder}\n\n") - + # Version information f.write("-" * 80 + "\n") f.write("SOFTWARE VERSIONS\n") @@ -4775,13 +5053,13 @@ def save_command_summary( if datasets_version is not None: f.write(f"{datasets_version}\n") f.write("\n") - + # Command line f.write("-" * 80 + "\n") f.write("COMMAND LINE\n") f.write("-" * 80 + "\n") f.write(f"{command_line}\n\n") - + # Execution status f.write("-" * 80 + "\n") f.write("EXECUTION STATUS\n") @@ -4796,7 +5074,7 @@ def save_command_summary( f.write("✗ Command failed\n") if error_message: f.write(f"Error: {error_message}\n\n") - + # Runtime if runtime_seconds is not None: f.write("-" * 80 + "\n") @@ -4805,31 +5083,33 @@ def save_command_summary( hours, remainder = divmod(int(runtime_seconds), 3600) minutes, seconds = divmod(remainder, 60) if hours > 0: - f.write(f"Total wall-clock time: {hours}h {minutes}m {seconds}s ({runtime_seconds:.1f} seconds)\n\n") + f.write( + f"Total wall-clock time: {hours}h {minutes}m {seconds}s ({runtime_seconds:.1f} seconds)\n\n" + ) elif minutes > 0: f.write(f"Total wall-clock time: {minutes}m {seconds}s ({runtime_seconds:.1f} seconds)\n\n") else: f.write(f"Total wall-clock time: {runtime_seconds:.1f} seconds\n\n") - + # Memory usage if memory_info: f.write("-" * 80 + "\n") f.write("MEMORY USAGE\n") f.write("-" * 80 + "\n") - if memory_info.get('rss_mb') is not None: + if memory_info.get("rss_mb") is not None: f.write(f"Process RSS (resident memory): {memory_info['rss_mb']:.1f} MB\n") - if 
memory_info.get('vms_mb') is not None: + if memory_info.get("vms_mb") is not None: f.write(f"Process VMS (virtual memory): {memory_info['vms_mb']:.1f} MB\n") - if memory_info.get('percent') is not None: + if memory_info.get("percent") is not None: f.write(f"Process memory percent: {memory_info['percent']:.1f}%\n") - if memory_info.get('total_mb') is not None: + if memory_info.get("total_mb") is not None: f.write(f"System total memory: {memory_info['total_mb']:.0f} MB\n") - if memory_info.get('available_mb') is not None: + if memory_info.get("available_mb") is not None: f.write(f"System available memory: {memory_info['available_mb']:.0f} MB\n") - if memory_info.get('system_percent') is not None: + if memory_info.get("system_percent") is not None: f.write(f"System memory used: {memory_info['system_percent']:.1f}%\n") f.write("\n") - + # Statistics f.write("-" * 80 + "\n") f.write("SEQUENCE STATISTICS\n") @@ -4848,14 +5128,14 @@ def save_command_summary( if total_after_sequence_filter is not None: f.write(f"After sequence filtering: {total_after_sequence_filter}\n") f.write(f"Final sequences (after all filters): {total_final_sequences}\n\n") - + # Filter breakdown by stage - any_filter_stats = (metadata_filter_stats or genbank_filter_stats or sequence_filter_stats) + any_filter_stats = metadata_filter_stats or genbank_filter_stats or sequence_filter_stats if any_filter_stats: f.write("-" * 80 + "\n") f.write("FILTER BREAKDOWN BY STAGE\n") f.write("-" * 80 + "\n") - + if metadata_filter_stats: active_meta = {k: v for k, v in metadata_filter_stats.items() if v > 0} if active_meta: @@ -4864,7 +5144,7 @@ def save_command_summary( f.write(f" {filter_name}: {count}\n") else: f.write("\nMetadata filtering: no records excluded\n") - + if genbank_filter_stats: active_gb = {k: v for k, v in genbank_filter_stats.items() if v > 0} if active_gb: @@ -4873,7 +5153,7 @@ def save_command_summary( f.write(f" {filter_name}: {count}\n") else: f.write("\nGenBank metadata filtering: no 
records excluded\n") - + if sequence_filter_stats: active_seq = {k: v for k, v in sequence_filter_stats.items() if v > 0} if active_seq: @@ -4882,9 +5162,9 @@ def save_command_summary( f.write(f" {filter_name}: {count}\n") else: f.write("\nSequence filtering: no records excluded\n") - + f.write("\n") - + # Partial metadata recovery information if partial_metadata_file: f.write("-" * 80 + "\n") @@ -4892,20 +5172,22 @@ def save_command_summary( f.write("-" * 80 + "\n") f.write(f"Partial metadata saved: {partial_metadata_file}\n") if recovery_command: - f.write(f"\nRecovery command:\n") + f.write("\nRecovery command:\n") f.write(f" {recovery_command}\n") f.write("\n") - + # Detailed statistics from metadata if filtered_metadata and len(filtered_metadata) > 0: f.write("-" * 80 + "\n") f.write("DETAILED STATISTICS\n") f.write("-" * 80 + "\n") - + # Unique hosts hosts = set() for meta in filtered_metadata: - host_name = meta.get('host', {}).get('organism_name') if isinstance(meta.get('host'), dict) else None + host_name = ( + meta.get("host", {}).get("organism_name") if isinstance(meta.get("host"), dict) else None + ) if host_name: hosts.add(host_name) f.write(f"Unique hosts: {len(hosts)}\n") @@ -4917,11 +5199,11 @@ def save_command_summary( for host in sorted(hosts)[:20]: f.write(f" - {host}\n") f.write("\n") - + # Unique geographic locations locations = set() for meta in filtered_metadata: - location = meta.get('location') + location = meta.get("location") if location: locations.add(location) f.write(f"Unique geographic locations: {len(locations)}\n") @@ -4933,37 +5215,37 @@ def save_command_summary( for loc in sorted(locations)[:20]: f.write(f" - {loc}\n") f.write("\n") - + # Sequence length statistics - lengths = [meta.get('length') for meta in filtered_metadata if meta.get('length')] + lengths = [meta.get("length") for meta in filtered_metadata if meta.get("length")] if lengths: f.write(f"Sequence length range: {min(lengths)} - {max(lengths)} bp\n") 
f.write(f"Average sequence length: {sum(lengths) / len(lengths):.0f} bp\n\n") - + # Completeness breakdown completeness_counts = {} for meta in filtered_metadata: - comp = meta.get('completeness', 'unknown') + comp = meta.get("completeness", "unknown") completeness_counts[comp] = completeness_counts.get(comp, 0) + 1 f.write("Completeness breakdown:\n") for comp, count in sorted(completeness_counts.items()): f.write(f" - {comp}: {count}\n") f.write("\n") - + # Source database breakdown source_counts = {} for meta in filtered_metadata: - source = meta.get('sourceDatabase', 'unknown') + source = meta.get("sourceDatabase", "unknown") source_counts[source] = source_counts.get(source, 0) + 1 f.write("Source database breakdown:\n") for source, count in sorted(source_counts.items()): f.write(f" - {source}: {count}\n") f.write("\n") - + # Submitter countries countries = set() for meta in filtered_metadata: - country = meta.get('submitterCountry') + country = meta.get("submitterCountry") if country: countries.add(country) f.write(f"Unique submitter countries: {len(countries)}\n") @@ -4975,7 +5257,7 @@ def save_command_summary( for country in sorted(countries)[:20]: f.write(f" - {country}\n") f.write("\n") - + # Output files f.write("-" * 80 + "\n") f.write("OUTPUT FILES\n") @@ -4990,101 +5272,113 @@ def save_command_summary( else: f.write("No output files generated\n") f.write("\n") - + # Failed operations - if any occurred if failed_commands: has_failures = False - + # Check for API timeouts - if failed_commands.get('api_timeout'): + if failed_commands.get("api_timeout"): if not has_failures: f.write("-" * 80 + "\n") f.write("⚠️ FAILED OPERATIONS - MANUAL RETRY REQUIRED\n") f.write("-" * 80 + "\n") has_failures = True - timeout_info = failed_commands['api_timeout'] - f.write(f"\n📍 API TIMEOUT:\n") + timeout_info = failed_commands["api_timeout"] + f.write("\n📍 API TIMEOUT:\n") f.write(f" Error: {timeout_info.get('error', 'Unknown')}\n") f.write(f" URL: 
{timeout_info.get('url', 'Unknown')}\n") - f.write(f" Recommendation: Try again later or use different filters\n\n") - + f.write(" Recommendation: Try again later or use different filters\n\n") + # Check for empty API response - if failed_commands.get('empty_response'): + if failed_commands.get("empty_response"): if not has_failures: f.write("-" * 80 + "\n") f.write("⚠️ FAILED OPERATIONS - MANUAL RETRY REQUIRED\n") f.write("-" * 80 + "\n") has_failures = True - empty_resp_info = failed_commands['empty_response'] - f.write(f"\n📍 EMPTY API RESPONSE:\n") + empty_resp_info = failed_commands["empty_response"] + f.write("\n📍 EMPTY API RESPONSE:\n") f.write(f" Error: {empty_resp_info.get('error', 'Unknown')}\n") - f.write(f" Recommendation: Check your virus identifier or try different filter parameters\n\n") - + f.write(" Recommendation: Check your virus identifier or try different filter parameters\n\n") + # Check for failed API batches - if failed_commands.get('api_batches'): + if failed_commands.get("api_batches"): if not has_failures: f.write("-" * 80 + "\n") f.write("⚠️ FAILED OPERATIONS - MANUAL RETRY REQUIRED\n") f.write("-" * 80 + "\n") has_failures = True f.write(f"\n📍 FAILED METADATA BATCHES ({len(failed_commands['api_batches'])} batches):\n") - for batch_info in failed_commands['api_batches'][:5]: # Show first 5 - f.write(f"\n Batch {batch_info.get('batch_num', '?')}: {batch_info.get('accession_count', '?')} accessions\n") + for batch_info in failed_commands["api_batches"][:5]: # Show first 5 + f.write( + f"\n Batch {batch_info.get('batch_num', '?')}: {batch_info.get('accession_count', '?')} accessions\n" + ) f.write(f" Error: {batch_info.get('error', 'Unknown')}\n") f.write(f" API URL: {batch_info.get('api_url', 'Unknown')}\n") - if len(failed_commands['api_batches']) > 5: + if len(failed_commands["api_batches"]) > 5: f.write(f"\n ... 
and {len(failed_commands['api_batches']) - 5} more failed batches\n") f.write("\n") - + # Check for pagination errors/timeouts - if failed_commands.get('pagination_timeouts') or failed_commands.get('pagination_errors'): + if failed_commands.get("pagination_timeouts") or failed_commands.get("pagination_errors"): has_failures = True - if not (failed_commands.get('api_batches') or failed_commands.get('api_timeout')): + if not (failed_commands.get("api_batches") or failed_commands.get("api_timeout")): f.write("-" * 80 + "\n") f.write("⚠️ FAILED OPERATIONS - PARTIAL RESULTS OBTAINED\n") f.write("-" * 80 + "\n") - - if failed_commands.get('pagination_timeouts'): + + if failed_commands.get("pagination_timeouts"): f.write(f"\n📍 PAGINATION TIMEOUTS ({len(failed_commands['pagination_timeouts'])} pages):\n") - for page_info in failed_commands['pagination_timeouts'][:3]: - f.write(f" Page {page_info.get('page', '?')}: {page_info.get('records_retrieved', 0)} records retrieved\n") + for page_info in failed_commands["pagination_timeouts"][:3]: + f.write( + f" Page {page_info.get('page', '?')}: {page_info.get('records_retrieved', 0)} records retrieved\n" + ) f.write(f" Error: {page_info.get('error', 'Unknown')}\n") - - if failed_commands.get('pagination_errors'): + + if failed_commands.get("pagination_errors"): f.write(f"\n📍 PAGINATION ERRORS ({len(failed_commands['pagination_errors'])} pages):\n") - for page_info in failed_commands['pagination_errors'][:3]: - f.write(f" Page {page_info.get('page', '?')}: {page_info.get('error_type', 'Unknown')} error\n") + for page_info in failed_commands["pagination_errors"][:3]: + f.write( + f" Page {page_info.get('page', '?')}: {page_info.get('error_type', 'Unknown')} error\n" + ) f.write(f" Error: {page_info.get('error', 'Unknown')}\n") - + # Check for sequence download failures - if failed_commands.get('sequence_batches'): + if failed_commands.get("sequence_batches"): has_failures = True - if not (failed_commands.get('api_batches') or 
failed_commands.get('api_timeout') or - failed_commands.get('pagination_timeouts') or failed_commands.get('pagination_errors')): + if not ( + failed_commands.get("api_batches") + or failed_commands.get("api_timeout") + or failed_commands.get("pagination_timeouts") + or failed_commands.get("pagination_errors") + ): f.write("-" * 80 + "\n") f.write("⚠️ FAILED OPERATIONS - MANUAL RETRY AVAILABLE\n") f.write("-" * 80 + "\n") - - f.write(f"\n📍 FAILED SEQUENCE DOWNLOAD BATCHES ({len(failed_commands['sequence_batches'])} batches):\n") - for batch_info in failed_commands['sequence_batches'][:5]: + + f.write( + f"\n📍 FAILED SEQUENCE DOWNLOAD BATCHES ({len(failed_commands['sequence_batches'])} batches):\n" + ) + for batch_info in failed_commands["sequence_batches"][:5]: f.write(f"\n Batch {batch_info.get('batch_num', '?')}\n") f.write(f" Error: {batch_info.get('error', 'Unknown')}\n") f.write(f" Retry URL: {batch_info.get('retry_url', 'Unknown')}\n") - if len(failed_commands['sequence_batches']) > 5: + if len(failed_commands["sequence_batches"]) > 5: f.write(f"\n ... and {len(failed_commands['sequence_batches']) - 5} more failed batches\n") - + # Check for single sequence fetch failures - if failed_commands.get('sequence_fetch'): + if failed_commands.get("sequence_fetch"): has_failures = True f.write(f"\n📍 SEQUENCE FETCH FAILURES ({len(failed_commands['sequence_fetch'])} operations):\n") - for fetch_info in failed_commands['sequence_fetch'][:3]: + for fetch_info in failed_commands["sequence_fetch"][:3]: f.write(f"\n Operation: {fetch_info.get('operation', 'Unknown')}\n") f.write(f" Accessions: {fetch_info.get('accession_count', '?')}\n") f.write(f" Error: {fetch_info.get('error', 'Unknown')}\n") f.write(f" Retry URL: {fetch_info.get('retry_url', 'Unknown')}\n") - if len(failed_commands['sequence_fetch']) > 3: + if len(failed_commands["sequence_fetch"]) > 3: f.write(f"\n ... 
and {len(failed_commands['sequence_fetch']) - 3} more failures\n") - + if has_failures: f.write("\n💡 RECOVERY INSTRUCTIONS:\n") f.write(" 1. Copy the URL from above and paste it into your browser\n") @@ -5092,10 +5386,10 @@ def save_command_summary( f.write(" 3. Retry the command with updated filters (e.g., stricter date ranges)\n") f.write(" 4. If the issue persists, NCBI servers may be temporarily unavailable\n") if partial_metadata_file: - f.write(f"\n 5. RESUME with baseline deduplication:\n") + f.write("\n 5. RESUME with baseline deduplication:\n") f.write(f" {recovery_command}\n") f.write("\n") - + # Footer f.write("=" * 80 + "\n") f.write("END OF SUMMARY\n") @@ -5104,158 +5398,155 @@ def save_command_summary( logger.info("=" * 60) logger.info("✅ Command summary saved: %s", summary_file) return summary_file - - except Exception as e: + + except Exception as e: # noqa: BLE001 logger.error("Failed to save command summary: %s", e) logger.error("Traceback: %s", traceback.format_exc()) return None def merge_metadata_csvs(genbank_csv_path, standard_csv_path): - """ - Merge standard metadata CSV into GenBank metadata CSV. - + """Merge standard metadata CSV into GenBank metadata CSV. + Where GenBank data is missing, fills in values from the standard metadata CSV. Does not overwrite any existing data in the GenBank CSV. 
- + Args: genbank_csv_path (str): Path to the GenBank metadata CSV file standard_csv_path (str): Path to the standard metadata CSV file - - Returns: + + Returns + ------- bool: True if merge was successful, False otherwise + """ try: if not os.path.exists(standard_csv_path): logger.debug("Standard metadata CSV not found, skipping merge: %s", standard_csv_path) return False - + logger.info("Merging standard metadata into GenBank metadata...") - + # Read both CSV files - use dtype=str to avoid type conversion issues genbank_df = pd.read_csv(genbank_csv_path, dtype=str) standard_df = pd.read_csv(standard_csv_path, dtype=str) - + logger.debug("GenBank CSV: %d rows × %d columns", len(genbank_df), len(genbank_df.columns)) logger.debug("Standard CSV: %d rows × %d columns", len(standard_df), len(standard_df.columns)) - + # Create a mapping from accession to standard metadata for quick lookup standard_by_accession = {} - if 'accession' in standard_df.columns: + if "accession" in standard_df.columns: for _, row in standard_df.iterrows(): - acc = row['accession'] - if pd.notna(acc) and str(acc).strip() and str(acc) != 'nan': + acc = row["accession"] + if pd.notna(acc) and str(acc).strip() and str(acc) != "nan": standard_by_accession[str(acc)] = row - + logger.debug("Indexed %d accessions from standard metadata", len(standard_by_accession)) - + # Fill missing values in genbank_df from standard_df rows_updated = 0 columns_updated = 0 - + for idx, row in genbank_df.iterrows(): - accession = str(row['accession']).strip() if pd.notna(row['accession']) else None - - if accession and accession != 'nan' and accession in standard_by_accession: + accession = str(row["accession"]).strip() if pd.notna(row["accession"]) else None + + if accession and accession != "nan" and accession in standard_by_accession: standard_row = standard_by_accession[accession] - + # For each column in genbank_df, if the value is NaN/empty, fill from standard for col in genbank_df.columns: if col in 
standard_row.index: genbank_val = str(row[col]).strip() if pd.notna(row[col]) else None standard_val = str(standard_row[col]).strip() if pd.notna(standard_row[col]) else None - + # Fill if genbank is empty but standard has data - if (not genbank_val or genbank_val == 'nan') and standard_val and standard_val != 'nan': + if (not genbank_val or genbank_val == "nan") and standard_val and standard_val != "nan": genbank_df.at[idx, col] = standard_val columns_updated += 1 - + if columns_updated > 0: rows_updated += 1 - + # Save the merged dataframe back to the genbank CSV - genbank_df.to_csv(genbank_csv_path, index=False, encoding='utf-8') - - logger.info("✅ Metadata merge complete: updated %d cells across %d rows", - columns_updated, rows_updated) + genbank_df.to_csv(genbank_csv_path, index=False, encoding="utf-8") + + logger.info("✅ Metadata merge complete: updated %d cells across %d rows", columns_updated, rows_updated) logger.debug("Merged GenBank CSV: %d rows × %d columns", len(genbank_df), len(genbank_df.columns)) - + return True - - except Exception as e: + + except Exception as e: # noqa: BLE001 logger.warning("❌ Failed to merge metadata CSVs: %s", e) logger.debug("Exception details:", exc_info=True) return False def save_metadata_to_csv(filtered_metadata, protein_headers, output_metadata_file): - """ - Save filtered metadata to a CSV file with a specific column order. - + """Save filtered metadata to a CSV file with a specific column order. + This function creates a comprehensive CSV file containing all relevant metadata for the filtered virus sequences. The output format is designed to be compatible with downstream analysis tools like Delphy. 
- + Args: filtered_metadata (list): List of metadata dictionaries for filtered sequences protein_headers (list): List of protein/segment information extracted from headers output_metadata_file (str): Path to the output CSV file - + Note: The column order is specifically designed to match requirements for phylogenetic analysis tools and provides a standardized format. """ - logger.info("Preparing metadata for CSV output...") - logger.debug("Processing %d metadata records with %d protein headers", - len(filtered_metadata), len(protein_headers)) + logger.debug("Processing %d metadata records with %d protein headers", len(filtered_metadata), len(protein_headers)) # Define the column order for the output CSV # This order prioritizes the most commonly used fields and matches # the format expected by downstream analysis tools columns = [ - "accession", # Primary identifier (lowercase for Delphy compatibility) - "Organism Name", # Virus species/strain name - "GenBank/RefSeq", # Source database (GenBank or RefSeq) - "Submitters", # Names of sequence submitters - "Organization", # Submitting organization/institution - "Submitter Country", # Country of submitting organization - "Release date", # Date when sequence was released to public databases - "Isolate", # Isolate/sample identifier - "Virus Lineage", # Taxonomic lineage of the virus - "Length", # Sequence length in base pairs - "Nuc Completeness", # Completeness status (complete/partial) + "accession", # Primary identifier (lowercase for Delphy compatibility) + "Organism Name", # Virus species/strain name + "GenBank/RefSeq", # Source database (GenBank or RefSeq) + "Submitters", # Names of sequence submitters + "Organization", # Submitting organization/institution + "Submitter Country", # Country of submitting organization + "Release date", # Date when sequence was released to public databases + "Isolate", # Isolate/sample identifier + "Virus Lineage", # Taxonomic lineage of the virus + "Length", # Sequence length in base 
pairs + "Nuc Completeness", # Completeness status (complete/partial) "Proteins/Segments", # Protein/segment information from FASTA headers - "Segment", # Virus segment identifier (e.g., 'HA', 'NA', '4', '6') + "Segment", # Virus segment identifier (e.g., 'HA', 'NA', '4', '6') "Is Vaccine Strain", # Whether this sequence is from a vaccine strain "Geographic Region", # Geographic region where sample was collected - "Geographic Location",# Specific geographic location - "Host", # Host organism name - "Host Lineage", # Taxonomic lineage of host organism - "Lab Host", # Whether sample was lab-passaged - "Tissue/Specimen/Source", # Sample source/tissue type - "Collection Date", # Date when sample was collected - "Sample Name", # Sample identifier - "Annotated", # Whether sequence has annotation data - "SRA Accessions", # Associated SRA (sequencing) accessions - "Bioprojects", # Associated BioProject identifiers - "Biosample", # BioSample identifier - "Protein count", # Number of proteins annotated - "Gene count", # Number of genes annotated - "Mature Peptide Count", # Number of mature peptides annotated + "Geographic Location", # Specific geographic location + "Host", # Host organism name + "Host Lineage", # Taxonomic lineage of host organism + "Lab Host", # Whether sample was lab-passaged + "Tissue/Specimen/Source", # Sample source/tissue type + "Collection Date", # Date when sample was collected + "Sample Name", # Sample identifier + "Annotated", # Whether sequence has annotation data + "SRA Accessions", # Associated SRA (sequencing) accessions + "Bioprojects", # Associated BioProject identifiers + "Biosample", # BioSample identifier + "Protein count", # Number of proteins annotated + "Gene count", # Number of genes annotated + "Mature Peptide Count", # Number of mature peptides annotated # Additional GenBank columns - "definition", # GenBank sequence definition - "strain", # Strain information - "isolation_source", # Source of isolation - "create_date", # GenBank 
creation date - "update_date", # GenBank update date - "assembly_name", # Assembly name - "authors", # Publication authors - "title", # Publication title - "journal", # Publication journal - "pubmed_id", # PubMed ID - "reference_count", # Number of references - "comment", # Additional comments + "definition", # GenBank sequence definition + "strain", # Strain information + "isolation_source", # Source of isolation + "create_date", # GenBank creation date + "update_date", # GenBank update date + "assembly_name", # Assembly name + "authors", # Publication authors + "title", # Publication title + "journal", # Publication journal + "pubmed_id", # PubMed ID + "reference_count", # Number of references + "comment", # Additional comments ] logger.debug("Using column order: %s", columns) @@ -5263,63 +5554,54 @@ def save_metadata_to_csv(filtered_metadata, protein_headers, output_metadata_fil # Process metadata in chunks for memory efficiency on large datasets total_records = len(filtered_metadata) chunk_size = METADATA_CSV_CHUNK_SIZE - + logger.info("Processing %d metadata records (chunk_size=%d)...", total_records, chunk_size) - + def _build_row(i, metadata): """Build a single row dictionary from metadata.""" return { # Primary identifiers "accession": metadata.get("accession", pd.NA), "Organism Name": metadata.get("virus", {}).get("organism_name", pd.NA), - # Database and submission information "GenBank/RefSeq": metadata.get("sourceDatabase", pd.NA), - "Submitters": ", ".join(metadata.get("submitter", {}).get("names", [])) if metadata.get("submitter", {}).get("names") else pd.NA, + "Submitters": ", ".join(metadata.get("submitter", {}).get("names", [])) + if metadata.get("submitter", {}).get("names") + else pd.NA, "Organization": metadata.get("submitter", {}).get("affiliation", pd.NA), "Submitter Country": metadata.get("submitter", {}).get("country", pd.NA), "Release date": metadata.get("releaseDate", "").split("T")[0] if metadata.get("releaseDate") else pd.NA, - # Sample 
and isolate information "Isolate": metadata.get("isolate", {}).get("name", pd.NA), "Sample Name": metadata.get("isolate", {}).get("name", pd.NA), - # Virus classification "Virus Lineage": metadata.get("virus", {}).get("lineage", []), - # Sequence characteristics "Length": metadata.get("length", pd.NA), "Nuc Completeness": metadata.get("completeness", pd.NA), "Proteins/Segments": protein_headers[i] if i < len(protein_headers) else pd.NA, - "Segment": metadata.get("segment", pd.NA), + "Segment": metadata.get("segment", pd.NA), "Is Vaccine Strain": metadata.get("isVaccineStrain", metadata.get("is_vaccine_strain", pd.NA)), - # Geographic information "Geographic Region": metadata.get("region", pd.NA), "Geographic Location": metadata.get("location", pd.NA), - # Host information "Host": metadata.get("host", {}).get("organism_name", pd.NA), "Host Lineage": metadata.get("host", {}).get("lineage", []), "Lab Host": metadata.get("labHost", pd.NA), - # Sample source information "Tissue/Specimen/Source": metadata.get("isolate", {}).get("source", pd.NA), "Collection Date": metadata.get("isolate", {}).get("collection_date", pd.NA), - # Annotation and quality information "Annotated": metadata.get("isAnnotated", pd.NA), - # Associated database records "SRA Accessions": metadata.get("sraAccessions", []), "Bioprojects": metadata.get("bioprojects", []), "Biosample": metadata.get("biosample", pd.NA), - # Counts "Gene count": metadata.get("geneCount"), "Protein count": metadata.get("proteinCount"), "Mature Peptide Count": metadata.get("maturePeptideCount"), - # GenBank-specific columns (not available from NCBI API metadata) "definition": pd.NA, "strain": pd.NA, @@ -5339,32 +5621,32 @@ def _build_row(i, metadata): try: first_chunk = True rows_written = 0 - + for chunk_start in range(0, total_records, chunk_size): chunk_end = min(chunk_start + chunk_size, total_records) chunk_data = [] - + for i in range(chunk_start, chunk_end): chunk_data.append(_build_row(i, filtered_metadata[i])) - + 
df_chunk = pd.DataFrame(chunk_data, columns=columns) - + # First chunk writes header, subsequent chunks append without header if first_chunk: - df_chunk.to_csv(output_metadata_file, index=False, mode='w') + df_chunk.to_csv(output_metadata_file, index=False, mode="w") first_chunk = False else: - df_chunk.to_csv(output_metadata_file, index=False, mode='a', header=False) - + df_chunk.to_csv(output_metadata_file, index=False, mode="a", header=False) + rows_written += len(df_chunk) - + # Free chunk memory del chunk_data del df_chunk - + if total_records > chunk_size: logger.debug("CSV progress: %d/%d rows written", rows_written, total_records) - + logger.info("Successfully saved metadata CSV to: %s", output_metadata_file) logger.debug("CSV file contains %d rows and %d columns", rows_written, len(columns)) except Exception as e: @@ -5373,27 +5655,27 @@ def _build_row(i, metadata): def check_min_max(min_val, max_val, filtername, date=False): - """ - Validate that minimum and maximum values are in the correct order. - + """Validate that minimum and maximum values are in the correct order. + Args: min_val: Minimum value (can be numeric or date string). max_val: Maximum value (can be numeric or date string). filtername (str): Name of the filter for error reporting. date (bool): Whether the values are dates that need parsing. 
- - Raises: + + Raises + ------ ValueError: If minimum value is greater than maximum value - + Example: check_min_max(100, 50, "sequence length") # Raises ValueError check_min_max(100, 200, "sequence length") # No error + """ # Only perform validation if both values are provided if min_val is not None and max_val is not None: - logger.debug("Validating min/max values for %s: min=%s, max=%s", - filtername, min_val, max_val) - + logger.debug("Validating min/max values for %s: min=%s, max=%s", filtername, min_val, max_val) + if date: try: min_val = _parse_date(min_val) @@ -5402,19 +5684,20 @@ def check_min_max(min_val, max_val, filtername, date=False): except Exception as e: logger.error("❌ Failed to parse dates for validation: %s", e) raise ValueError(f"Invalid date format in {filtername} filters") from e - + if min_val > max_val: error_msg = f"Min value ({min_val}) cannot be greater than max value ({max_val}) for {filtername}." logger.error("❌ Validation failed: %s", error_msg) raise ValueError(error_msg) - + logger.debug("Min/max validation passed for %s", filtername) # ============================================================================= -# ESEARCH PRE-FILTERING +# ESEARCH PRE-FILTERING # ============================================================================= + def _esearch_prefilter_genbank( virus_taxid, metadata_filtered_accessions, @@ -5426,13 +5709,12 @@ def _esearch_prefilter_genbank( max_seq_length=None, api_key=None, ): - """ - Use NCBI ESearch to pre-filter accessions BEFORE fetching full GenBank XML. - + """Use NCBI ESearch to pre-filter accessions BEFORE fetching full GenBank XML. + Instead of fetching full GenBank XML for ALL metadata-filtered accessions, we use ESearch to find only the subset that might pass the GenBank- dependent filters. This typically reduces the set from hundreds of thousands to a few hundred, making the GenBank fetch near-instant. 
- + Strategy: Build an ESearch query that is a SUPERSET of what the exact GenBank XML parsing will select. Use broad matching to avoid false negatives. Then intersect with the metadata-filtered accessions and fetch GenBank XML only for the intersection. - + Args: virus_taxid (str/int): NCBI taxonomy ID for the virus (e.g. 11676 for HIV-1). metadata_filtered_accessions (list): Accessions that passed metadata filters. @@ -5443,37 +5725,39 @@ def _esearch_prefilter_genbank( min_seq_length (int, optional): Minimum sequence length. max_seq_length (int, optional): Maximum sequence length. api_key (str, optional): NCBI API key for higher rate limits. - - Returns: + + Returns + ------- set: Set of accession numbers that might pass GenBank filters, or None if pre-filtering couldn't be performed (fall back to full fetch). + """ # Build ESearch query terms query_parts = [] - + # Always include organism constraint if virus_taxid: query_parts.append(f"txid{virus_taxid}[Organism]") - + # NOTE: We deliberately do NOT use "proviral" as an ESearch term here. # ESearch free-text "proviral" only matches sequences that contain the WORD - # "proviral" in their GenBank text (title, comment, keywords). Many sequences that ARE proviral (especially complete genomes like AF004394.1, U69593.1) do NOT contain this word in their text. The NCBI Virus web interface uses a different mechanism (structured metadata/mol_type qualifier) to identify proviral sequences. + # "proviral" in their GenBank text (title, comment, keywords). Many sequences that ARE proviral (especially complete genomes like AF004394.1, U69593.1) do NOT contain this word in their text. The NCBI Virus web interface uses a different mechanism (structured metadata/mol_type qualifier) to identify proviral sequences. # The provirus filter is correctly applied later in filter_genbank_metadata() by checking the actual GenBank XML features/qualifiers. 
- + # Has_proteins filter: search for protein name if has_proteins: # Use broad "All Fields" matching for protein name to ensure superset # The exact filtering will happen later on the GenBank XML query_parts.append(f'"{has_proteins}"') - + # Genotype filter: search for genotype string if genotype: query_parts.append(f'"{genotype}"') - + # Molecule type filter if gen_mol_type: query_parts.append(f'"{gen_mol_type}"[Molecule Type]') - + # Sequence length filter if min_seq_length and max_seq_length: query_parts.append(f"{min_seq_length}:{max_seq_length}[SLEN]") @@ -5481,107 +5765,111 @@ def _esearch_prefilter_genbank( query_parts.append(f"{min_seq_length}:99999999[SLEN]") elif max_seq_length: query_parts.append(f"1:{max_seq_length}[SLEN]") - + # Need at least organism + one GenBank filter for pre-filtering to be useful if len(query_parts) < 2: logger.info("ESearch pre-filter: not enough filter criteria for useful pre-filtering") return None - + search_query = " AND ".join(query_parts) logger.info("ESearch pre-filter query: %s", search_query) - + try: # Step 1: ESearch to get count and WebEnv params = { - 'db': 'nucleotide', - 'term': search_query, - 'retmax': 0, - 'usehistory': 'y', + "db": "nucleotide", + "term": search_query, + "retmax": 0, + "usehistory": "y", } if api_key: - params['api_key'] = api_key - - response = requests.get(NCBI_EUTILS_BASE_ESEARCH, params=params, timeout=60, - headers={'User-Agent': 'gget/1.0'}) + params["api_key"] = api_key + + response = requests.get(NCBI_EUTILS_BASE_ESEARCH, params=params, timeout=60, headers={"User-Agent": "gget/1.0"}) response.raise_for_status() root = ET.fromstring(response.text) - - count_elem = root.find('.//Count') - web_env_elem = root.find('.//WebEnv') - query_key_elem = root.find('.//QueryKey') - + + count_elem = root.find(".//Count") + web_env_elem = root.find(".//WebEnv") + query_key_elem = root.find(".//QueryKey") + if count_elem is None or web_env_elem is None: logger.warning("ESearch pre-filter: could not 
parse response") return None - + total_count = int(count_elem.text) web_env = web_env_elem.text query_key = query_key_elem.text - + logger.info("ESearch pre-filter: found %d accessions matching GenBank criteria", total_count) - + # If the pre-filter results are TOO large (>50K), it's not useful # Fall back to the full fetch method if total_count > 50000: - logger.info("ESearch pre-filter: %d results is too large for effective pre-filtering (>50K), skipping", total_count) + logger.info( + "ESearch pre-filter: %d results is too large for effective pre-filtering (>50K), skipping", total_count + ) return None - + if total_count == 0: logger.info("ESearch pre-filter: NO accessions match the GenBank criteria") return set() - + # Step 2: Fetch all matching accessions using EFetch rettype=acc (lightweight!) all_esearch_accessions = [] batch_size = 10000 - + for retstart in range(0, total_count, batch_size): time.sleep(0.35 if not api_key else 0.1) - + fetch_params = { - 'db': 'nucleotide', - 'WebEnv': web_env, - 'query_key': query_key, - 'retmax': batch_size, - 'retstart': retstart, - 'rettype': 'acc', - 'retmode': 'text', + "db": "nucleotide", + "WebEnv": web_env, + "query_key": query_key, + "retmax": batch_size, + "retstart": retstart, + "rettype": "acc", + "retmode": "text", } if api_key: - fetch_params['api_key'] = api_key - - resp = requests.get(NCBI_EUTILS_BASE_EFETCH, params=fetch_params, timeout=120, - headers={'User-Agent': 'gget/1.0'}) + fetch_params["api_key"] = api_key + + resp = requests.get( + NCBI_EUTILS_BASE_EFETCH, params=fetch_params, timeout=120, headers={"User-Agent": "gget/1.0"} + ) resp.raise_for_status() - - batch_accs = [a.strip() for a in resp.text.strip().split('\n') if a.strip()] + + batch_accs = [a.strip() for a in resp.text.strip().split("\n") if a.strip()] all_esearch_accessions.extend(batch_accs) - logger.debug("ESearch pre-filter: fetched %d accessions (retstart=%d)", - len(batch_accs), retstart) - + logger.debug("ESearch pre-filter: fetched 
%d accessions (retstart=%d)", len(batch_accs), retstart) + logger.info("ESearch pre-filter: retrieved %d accession numbers total", len(all_esearch_accessions)) - + # Step 3: Intersect with our metadata-filtered accessions esearch_set = set(all_esearch_accessions) metadata_set = set(metadata_filtered_accessions) intersection = esearch_set & metadata_set - + logger.info("ESearch pre-filter RESULTS:") logger.info(" ESearch matches: %d", len(esearch_set)) logger.info(" Metadata-filtered: %d", len(metadata_set)) logger.info(" Intersection (candidates): %d", len(intersection)) - logger.info(" Reduction: %.1f%% (from %d to %d accessions for GenBank fetch)", - (1 - len(intersection) / len(metadata_set)) * 100 if metadata_set else 0, - len(metadata_set), len(intersection)) - + logger.info( + " Reduction: %.1f%% (from %d to %d accessions for GenBank fetch)", + (1 - len(intersection) / len(metadata_set)) * 100 if metadata_set else 0, + len(metadata_set), + len(intersection), + ) + return intersection - + except requests.exceptions.RequestException as e: logger.warning("ESearch pre-filter failed (network error): %s. Falling back to full fetch.", e) return None except ET.ParseError as e: logger.warning("ESearch pre-filter failed (parse error): %s. Falling back to full fetch.", e) return None - except Exception as e: + except Exception as e: # noqa: BLE001 logger.warning("ESearch pre-filter failed (unexpected): %s. Falling back to full fetch.", e) return None @@ -5590,82 +5878,79 @@ def _esearch_prefilter_genbank( # EPOST + EFETCH HELPER FUNCTIONS (NCBI-recommended for large datasets) # ============================================================================= + def _epost_accessions(accessions, api_key=None): - """ - Upload accession numbers to NCBI History Server using EPost. - + """Upload accession numbers to NCBI History Server using EPost. 
+ EPost allows uploading large numbers of UIDs to the server, which assigns them a WebEnv and query_key for subsequent EFetch requests. This avoids URL length limitations that restrict direct efetch calls to ~200 accessions. - + Args: accessions (list): List of accession numbers to upload. api_key (str, optional): NCBI API key for higher rate limits. - - Returns: + + Returns + ------- tuple: (web_env, query_key) for use in subsequent EFetch calls, or (None, None) if upload failed. + """ logger.info("Uploading %d accessions to NCBI History Server via EPost...", len(accessions)) - + epost_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi" - + # For EPost, we send IDs in the POST body to avoid URL length limits # Can handle tens of thousands of accessions in one request accession_string = ",".join(accessions) - + params = { - 'db': 'nucleotide', + "db": "nucleotide", } if api_key: - params['api_key'] = api_key - + params["api_key"] = api_key + # POST body contains the accession list - data = { - 'id': accession_string - } - - headers = {'User-Agent': 'gget/1.0'} - + data = {"id": accession_string} + + headers = {"User-Agent": "gget/1.0"} + try: # Make POST request with accessions in body - response = requests.post( - epost_url, - params=params, - data=data, - headers=headers, - timeout=EUTILS_TIMEOUT - ) + response = requests.post(epost_url, params=params, data=data, headers=headers, timeout=EUTILS_TIMEOUT) response.raise_for_status() - + # Parse the XML response to extract WebEnv and query_key # Example response: # # 1 # NCID_01_... 
# - + root = ET.fromstring(response.text) - - query_key_elem = root.find('.//QueryKey') - web_env_elem = root.find('.//WebEnv') - + + query_key_elem = root.find(".//QueryKey") + web_env_elem = root.find(".//WebEnv") + if query_key_elem is not None and web_env_elem is not None: query_key = query_key_elem.text web_env = web_env_elem.text - logger.info("✅ EPost successful: QueryKey=%s, WebEnv=%s...", - query_key, web_env[:30] if len(web_env) > 30 else web_env) + logger.info( + "✅ EPost successful: QueryKey=%s, WebEnv=%s...", + query_key, + web_env[:30] if len(web_env) > 30 else web_env, + ) return web_env, query_key else: # Check for error message - error_elem = root.find('.//ERROR') + error_elem = root.find(".//ERROR") if error_elem is not None: logger.error("❌ EPost error: %s", error_elem.text) else: logger.error("❌ EPost failed: Could not parse WebEnv/QueryKey from response") logger.debug("Response: %s", response.text[:500]) return None, None - + except requests.exceptions.RequestException as e: logger.error("❌ EPost request failed: %s", e) return None, None @@ -5675,13 +5960,12 @@ def _epost_accessions(accessions, api_key=None): def _efetch_with_history(web_env, query_key, retstart, retmax, api_key=None, failed_log_path=None): - """ - Fetch GenBank records using History Server reference (WebEnv/query_key). - + """Fetch GenBank records using History Server reference (WebEnv/query_key). + This is the NCBI-recommended method for large datasets. After uploading UIDs via EPost, use this function to retrieve records in batches using pagination (retstart/retmax). - + Args: web_env (str): WebEnv string from EPost. query_key (str): Query key from EPost. @@ -5689,25 +5973,26 @@ def _efetch_with_history(web_env, query_key, retstart, retmax, api_key=None, fai retmax (int): Maximum number of records to retrieve in this batch. api_key (str, optional): NCBI API key for higher rate limits. failed_log_path (str, optional): Path to log failed requests. 
- - Returns: + + Returns + ------- tuple: (metadata_dict, xml_text) where metadata_dict maps accessions to parsed metadata, and xml_text is the raw XML response. + """ - params = { - 'db': 'nucleotide', - 'WebEnv': web_env, - 'query_key': query_key, - 'retstart': retstart, - 'retmax': retmax, - 'rettype': 'gb', - 'retmode': 'xml', - 'complexity': GENBANK_COMPLEXITY, + "db": "nucleotide", + "WebEnv": web_env, + "query_key": query_key, + "retstart": retstart, + "retmax": retmax, + "rettype": "gb", + "retmode": "xml", + "complexity": GENBANK_COMPLEXITY, } if api_key: - params['api_key'] = api_key - + params["api_key"] = api_key + # Create a requests.Session with retry logic session = requests.Session() try: @@ -5715,7 +6000,7 @@ def _efetch_with_history(web_env, query_key, retstart, retmax, api_key=None, fai total=GENBANK_RETRY_ATTEMPTS, backoff_factor=HTTP_INITIAL_BACKOFF, status_forcelist=HTTP_RETRY_STATUS_CODES, - allowed_methods=frozenset(['GET', 'POST']) + allowed_methods=frozenset(["GET", "POST"]), ) except TypeError: # Fallback for older urllib3 versions @@ -5723,44 +6008,45 @@ def _efetch_with_history(web_env, query_key, retstart, retmax, api_key=None, fai total=GENBANK_RETRY_ATTEMPTS, backoff_factor=HTTP_INITIAL_BACKOFF, status_forcelist=HTTP_RETRY_STATUS_CODES, - method_whitelist=frozenset(['GET', 'POST']) + method_whitelist=frozenset(["GET", "POST"]), ) - + adapter = HTTPAdapter(max_retries=retry_strategy) session.mount("https://", adapter) session.mount("http://", adapter) - - headers = {'Connection': 'close', 'User-Agent': 'gget/1.0'} - + + headers = {"Connection": "close", "User-Agent": "gget/1.0"} + max_attempts = HTTP_MAX_LOCAL_RETRIES attempt = 0 backoff = HTTP_INITIAL_BACKOFF - + while attempt < max_attempts: try: - logger.debug("EFetch with history: retstart=%d, retmax=%d (attempt %d)", - retstart, retmax, attempt + 1) - + logger.debug("EFetch with history: retstart=%d, retmax=%d (attempt %d)", retstart, retmax, attempt + 1) + response = 
session.get(NCBI_EUTILS_BASE_EFETCH, params=params, timeout=EUTILS_TIMEOUT, headers=headers) response.raise_for_status() - + # Verify we got XML data - if not response.text.strip().startswith(' 100 # Use EPost for any significant number optimized_batch_size = 500 if use_epost_method else batch_size # EPost method allows larger batches - + # Optimize delay based on API key: 10 req/sec with key vs 3 req/sec without # With API key: 0.1s delay allows ~10 req/sec # Without API key: 0.35s delay allows ~3 req/sec effective_delay = 0.1 if api_key else delay - logger.info("Using delay of %.2fs between requests (API key: %s)", - effective_delay, "yes" if api_key else "no") - + logger.info("Using delay of %.2fs between requests (API key: %s)", effective_delay, "yes" if api_key else "no") + try: # Open temp file for incremental XML writing - xml_file = open(temp_xml_path, 'w', encoding='utf-8') + xml_file = open(temp_xml_path, "w", encoding="utf-8") xml_file.write("\n") - + # Open temp JSONL file for incremental metadata writing - metadata_jsonl_file = open(temp_metadata_jsonl_path, 'w', encoding='utf-8') - + metadata_jsonl_file = open(temp_metadata_jsonl_path, "w", encoding="utf-8") + if use_epost_method: # ===== EPost + EFetch with History Server ===== EPOST_CHUNK_SIZE = 2000 # Tuned to avoid History Server session timeout - + if len(accessions) > EPOST_CHUNK_SIZE: - epost_chunks = [accessions[i:i + EPOST_CHUNK_SIZE] - for i in range(0, len(accessions), EPOST_CHUNK_SIZE)] + epost_chunks = [ + accessions[i : i + EPOST_CHUNK_SIZE] for i in range(0, len(accessions), EPOST_CHUNK_SIZE) + ] else: epost_chunks = [accessions] - - logger.info("Using optimized EPost+EFetch workflow (efetch_batch=%d, epost_chunks=%d of up to %d each)", - optimized_batch_size, len(epost_chunks), EPOST_CHUNK_SIZE) - + + logger.info( + "Using optimized EPost+EFetch workflow (efetch_batch=%d, epost_chunks=%d of up to %d each)", + optimized_batch_size, + len(epost_chunks), + EPOST_CHUNK_SIZE, + ) + # 
Calculate total EFetch batches across all EPost chunks for progress tracking overall_batch_num = 0 total_batches_all_chunks = sum( - (len(chunk) + optimized_batch_size - 1) // optimized_batch_size - for chunk in epost_chunks + (len(chunk) + optimized_batch_size - 1) // optimized_batch_size for chunk in epost_chunks ) - + # Determine GC and memory logging frequency based on total batches gc_frequency = max(20, total_batches_all_chunks // 20) memory_log_frequency = max(50, total_batches_all_chunks // 10) - + epost_failures = [] # Track chunks that fail EPost for direct-URL fallback - + for chunk_idx, chunk_accessions in enumerate(epost_chunks): - logger.info("EPost chunk %d/%d: uploading %d accessions to History Server...", - chunk_idx + 1, len(epost_chunks), len(chunk_accessions)) - + logger.info( + "EPost chunk %d/%d: uploading %d accessions to History Server...", + chunk_idx + 1, + len(epost_chunks), + len(chunk_accessions), + ) + # Step 1: Upload this chunk to NCBI History Server via EPost web_env, query_key = _epost_accessions(chunk_accessions, api_key=api_key) - + if web_env and query_key: - logger.info("✅ EPost chunk %d/%d successful: uploaded %d accessions", - chunk_idx + 1, len(epost_chunks), len(chunk_accessions)) - + logger.info( + "✅ EPost chunk %d/%d successful: uploaded %d accessions", + chunk_idx + 1, + len(epost_chunks), + len(chunk_accessions), + ) + # Step 2: Fetch data for this chunk using WebEnv/query_key with pagination chunk_total = len(chunk_accessions) num_batches = (chunk_total + optimized_batch_size - 1) // optimized_batch_size - + for batch_num in range(num_batches): overall_batch_num += 1 retstart = batch_num * optimized_batch_size - logger.info("Processing GenBank batch %d/%d (chunk %d/%d, retstart=%d, retmax=%d)", - overall_batch_num, total_batches_all_chunks, - chunk_idx + 1, len(epost_chunks), - retstart, optimized_batch_size) - + logger.info( + "Processing GenBank batch %d/%d (chunk %d/%d, retstart=%d, retmax=%d)", + 
overall_batch_num, + total_batches_all_chunks, + chunk_idx + 1, + len(epost_chunks), + retstart, + optimized_batch_size, + ) + try: batch_metadata, batch_xml_text = _efetch_with_history( web_env=web_env, @@ -5926,9 +6237,9 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p retstart=retstart, retmax=optimized_batch_size, api_key=api_key, - failed_log_path=failed_log_path + failed_log_path=failed_log_path, ) - + if batch_metadata: # Stream parsed metadata to temp JSONL (not held in RAM) for acc, meta in batch_metadata.items(): @@ -5940,7 +6251,7 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p del batch_metadata else: batch_count = 0 - + if batch_xml_text: cleaned_xml = _clean_xml_declarations(batch_xml_text) xml_file.write(cleaned_xml + "\n") @@ -5948,56 +6259,64 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p xml_written = True del batch_xml_text del cleaned_xml - logger.info("Batch %d/%d: Successfully retrieved metadata for %d accessions", - overall_batch_num, total_batches_all_chunks, - batch_count) + logger.info( + "Batch %d/%d: Successfully retrieved metadata for %d accessions", + overall_batch_num, + total_batches_all_chunks, + batch_count, + ) else: - logger.warning("Batch %d/%d returned no data", - overall_batch_num, total_batches_all_chunks) + logger.warning( + "Batch %d/%d returned no data", overall_batch_num, total_batches_all_chunks + ) # Track failed accessions for potential retry batch_start = batch_num * optimized_batch_size batch_end = min(batch_start + optimized_batch_size, chunk_total) failed_batches.append(chunk_accessions[batch_start:batch_end]) - + # Periodic garbage collection if overall_batch_num % gc_frequency == 0: _force_garbage_collection(f"after batch {overall_batch_num}/{total_batches_all_chunks}") - + # Periodic memory logging if overall_batch_num % memory_log_frequency == 0: _log_memory_usage(f"GenBank batch 
{overall_batch_num}/{total_batches_all_chunks}") - + # Delay between requests (respect NCBI rate limits) if overall_batch_num < total_batches_all_chunks and effective_delay > 0: time.sleep(effective_delay) - - except Exception as e: - logger.error("⚠️ Batch %d/%d failed: %s", - overall_batch_num, total_batches_all_chunks, e) + + except Exception as e: # noqa: BLE001 + logger.error("⚠️ Batch %d/%d failed: %s", overall_batch_num, total_batches_all_chunks, e) batch_start = batch_num * optimized_batch_size batch_end = min(batch_start + optimized_batch_size, chunk_total) failed_batches.append(chunk_accessions[batch_start:batch_end]) continue - + # Brief delay between EPost chunks to be respectful to NCBI if chunk_idx < len(epost_chunks) - 1: time.sleep(1.0) - + else: - logger.warning("EPost chunk %d/%d failed, will use direct fetch for %d accessions", - chunk_idx + 1, len(epost_chunks), len(chunk_accessions)) + logger.warning( + "EPost chunk %d/%d failed, will use direct fetch for %d accessions", + chunk_idx + 1, + len(epost_chunks), + len(chunk_accessions), + ) epost_failures.extend(chunk_accessions) - + # If any EPost chunks failed entirely, fall back to direct URL method for those if epost_failures: - logger.info("Falling back to direct URL method for %d accessions from failed EPost chunks", - len(epost_failures)) - direct_batches = [epost_failures[i:i + batch_size] - for i in range(0, len(epost_failures), batch_size)] + logger.info( + "Falling back to direct URL method for %d accessions from failed EPost chunks", len(epost_failures) + ) + direct_batches = [epost_failures[i : i + batch_size] for i in range(0, len(epost_failures), batch_size)] for dbatch_num, dbatch_accessions in enumerate(direct_batches, 1): try: batch_metadata, batch_xml_text = _fetch_genbank_batch( - dbatch_accessions, failed_log_path=failed_log_path) + dbatch_accessions, failed_log_path=failed_log_path + ) if batch_metadata: for acc, meta in batch_metadata.items(): 
metadata_jsonl_file.write(json.dumps({"accession": acc, "metadata": meta}) + "\n") @@ -6014,42 +6333,46 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p del cleaned_xml if dbatch_num < len(direct_batches) and effective_delay > 0: time.sleep(effective_delay) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.error("⚠️ Direct fallback batch %d failed: %s", dbatch_num, e) failed_batches.append(dbatch_accessions) continue - + # Check if we got ANY data from EPost method if total_metadata_written == 0 and not epost_failures: logger.warning("EPost method returned no data, falling back to direct fetch method") use_epost_method = False # Fall through to traditional method - + if not use_epost_method: # ===== FALLBACK METHOD: Traditional direct URL batching ===== logger.info("Using traditional direct URL method (batch_size=%d)", batch_size) - + # Split accessions into batches to avoid URL length limits if len(accessions) > batch_size: - batches = [accessions[i:i + batch_size] for i in range(0, len(accessions), batch_size)] - logger.info("Processing %d accessions in %d batches of size %d", - len(accessions), len(batches), batch_size) + batches = [accessions[i : i + batch_size] for i in range(0, len(accessions), batch_size)] + logger.info( + "Processing %d accessions in %d batches of size %d", len(accessions), len(batches), batch_size + ) else: batches = [accessions] logger.info("Processing %d accessions in 1 batch", len(accessions)) - + # Determine GC and memory logging frequency based on total batches gc_frequency = max(50, len(batches) // 20) # GC roughly every 5% of batches memory_log_frequency = max(100, len(batches) // 10) # Log memory every 10% - + # Process each batch for batch_num, batch_accessions in enumerate(batches, 1): - logger.info("Processing GenBank batch %d/%d (%d accessions)", - batch_num, len(batches), len(batch_accessions)) - + logger.info( + "Processing GenBank batch %d/%d (%d accessions)", 
batch_num, len(batches), len(batch_accessions) + ) + try: # Fetch GenBank XML data using E-utilities efetch - batch_metadata, batch_xml_text = _fetch_genbank_batch(batch_accessions, failed_log_path=failed_log_path) - + batch_metadata, batch_xml_text = _fetch_genbank_batch( + batch_accessions, failed_log_path=failed_log_path + ) + if batch_metadata: # Stream parsed metadata to temp JSONL (not held in RAM) for acc, meta in batch_metadata.items(): @@ -6068,37 +6391,38 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p xml_file.write(cleaned_xml + "\n") xml_file.flush() # Ensure data is written to disk xml_written = True - + # Clear batch_xml_text from memory del batch_xml_text del cleaned_xml - - logger.info("Batch %d: Successfully retrieved metadata for %d accessions", - batch_num, batch_count) + + logger.info( + "Batch %d: Successfully retrieved metadata for %d accessions", batch_num, batch_count + ) else: # Batch failed, add to failed_batches for retry logger.warning("Batch %d returned no data, will retry later", batch_num) failed_batches.append(batch_accessions) - + # Periodic garbage collection to prevent memory buildup if batch_num % gc_frequency == 0: _force_garbage_collection(f"after batch {batch_num}/{len(batches)}") - + # Periodic memory logging if batch_num % memory_log_frequency == 0: _log_memory_usage(f"GenBank batch {batch_num}/{len(batches)}") - + # Add delay between requests to be respectful to NCBI servers if batch_num < len(batches) and effective_delay > 0: logger.debug("Adding %.1f second delay before next batch", effective_delay) time.sleep(effective_delay) - - except Exception as e: + + except Exception as e: # noqa: BLE001 logger.error("⚠️ Batch %d failed: %s", batch_num, e) failed_batches.append(batch_accessions) logger.info("Added batch %d to retry list", batch_num) continue - + # Retry failed batches at the end if failed_batches: logger.info("Retrying %d failed batches", len(failed_batches)) @@ -6123,24 
+6447,29 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p del xml del cleaned_xml logger.info("Successfully retried batch with %d accessions", len(batch_accessions)) - except Exception as e: + except Exception: # noqa: BLE001 logger.warning("Final retry failed for batch: %s", batch_accessions) - + if retry_success: - logger.info("Successfully recovered %d/%d failed batches on retry", len(retry_success), len(failed_batches)) + logger.info( + "Successfully recovered %d/%d failed batches on retry", len(retry_success), len(failed_batches) + ) # ===== DETECT AND RETRY SILENTLY DROPPED ACCESSIONS ===== # The NCBI history server sometimes silently drops individual accessions from batch responses without raising errors. Detect these and retry them individually with direct URL fetch to maximize completeness. # This also catches accessions lost due to EPost/EFetch position mismatch (server internal ordering differs from posting order) after batch retries. silently_dropped = set(accessions) - seen_accessions if silently_dropped: - logger.info("🔄 Detected %d accessions silently dropped by NCBI history server — retrying with direct fetch", - len(silently_dropped)) + logger.info( + "🔄 Detected %d accessions silently dropped by NCBI history server — retrying with direct fetch", + len(silently_dropped), + ) dropped_list = sorted(silently_dropped) # Use batch size of 200 for direct URL retry (efficient for large sets, small enough to avoid URL length limits ~8KB for 200 accessions) direct_batch_size = min(200, len(dropped_list)) - direct_batches = [dropped_list[i:i + direct_batch_size] - for i in range(0, len(dropped_list), direct_batch_size)] + direct_batches = [ + dropped_list[i : i + direct_batch_size] for i in range(0, len(dropped_list), direct_batch_size) + ] recovered_count = 0 for dbatch_num, dbatch_accessions in enumerate(direct_batches, 1): try: @@ -6162,31 +6491,32 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, 
genbank_full_csv_p del cleaned_xml if dbatch_num < len(direct_batches) and effective_delay > 0: time.sleep(effective_delay) - except Exception as e: - logger.warning("Direct retry failed for dropped accessions %s: %s", - dbatch_accessions, e) + except Exception as e: # noqa: BLE001 + logger.warning("Direct retry failed for dropped accessions %s: %s", dbatch_accessions, e) if recovered_count: - logger.info("✅ Recovered %d/%d silently dropped accessions via direct fetch", - recovered_count, len(silently_dropped)) + logger.info( + "✅ Recovered %d/%d silently dropped accessions via direct fetch", + recovered_count, + len(silently_dropped), + ) else: - logger.warning("Could not recover any of the %d silently dropped accessions", - len(silently_dropped)) + logger.warning("Could not recover any of the %d silently dropped accessions", len(silently_dropped)) # Close XML wrapper xml_file.write("\n") xml_file.close() xml_file = None - + # Close metadata JSONL file if metadata_jsonl_file is not None: metadata_jsonl_file.close() metadata_jsonl_file = None - + # Move temp file to final location if xml_written: shutil.move(temp_xml_path, genbank_full_xml_path) logger.debug("Saved full GenBank XML to: %s", genbank_full_xml_path) - + # Convert XML to CSV (memory-efficient chunked processing) _log_memory_usage("before XML to CSV conversion") _genbank_xml_to_csv(genbank_full_xml_path, genbank_full_csv_path) @@ -6195,7 +6525,7 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p logger.warning("No GenBank XML content retrieved to save") if os.path.exists(temp_xml_path): os.remove(temp_xml_path) - + except Exception as e: logger.error("Error during GenBank metadata fetch: %s", e) raise @@ -6218,16 +6548,17 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p except OSError as e: logger.debug("Failed to clean up temp_xml_path (os.remove): %s", e) - logger.info("GenBank metadata retrieval complete: %d/%d accessions processed", 
- total_metadata_written, len(accessions)) - + logger.info( + "GenBank metadata retrieval complete: %d/%d accessions processed", total_metadata_written, len(accessions) + ) + # Load all metadata from temp JSONL file into dict for return # This is fine because GenBank metadata is fetched only for the post-filter subset # (typically 1K-50K records, not millions) all_metadata = {} if os.path.exists(temp_metadata_jsonl_path) and total_metadata_written > 0: try: - with open(temp_metadata_jsonl_path, 'r', encoding='utf-8') as f: + with open(temp_metadata_jsonl_path, encoding="utf-8") as f: for line in f: line = line.strip() if not line: @@ -6238,23 +6569,23 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p if acc and meta: all_metadata[acc] = meta logger.info("Loaded %d GenBank metadata records from temp file", len(all_metadata)) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.error("Failed to load GenBank metadata from temp JSONL: %s", e) - + # Clean up temp JSONL file if os.path.exists(temp_metadata_jsonl_path): try: os.remove(temp_metadata_jsonl_path) except OSError as e: logger.debug("Failed to clean up temp_metadata_jsonl_path (os.remove): %s", e) - + # Final memory log and GC _force_garbage_collection("GenBank fetch complete") _log_memory_usage("GenBank fetch complete") if not all_metadata: logger.warning("No GenBank metadata was successfully retrieved") - + missing_accessions = set(accessions) - set(all_metadata.keys()) if missing_accessions: logger.info("❌ The following accessions could not be downloaded:") @@ -6264,39 +6595,39 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p if len(missing_accessions) > 10: logger.info(f" ... 
and {len(missing_accessions) - 10} more") logger.info(f"A log of failed batches and efetch URLs is saved at: {failed_log_path}") - + # Return both metadata and the path to the failed batches log for summary tracking return all_metadata, failed_log_path if os.path.exists(failed_log_path) else None def _fetch_genbank_batch(accessions, failed_log_path=None): - """ - Fetch GenBank metadata for a single batch of accessions. - + """Fetch GenBank metadata for a single batch of accessions. + Includes retry logic with exponential backoff and automatic batch splitting for problematic requests. - + Args: accessions (list): List of accession numbers for this batch. failed_log_path (str, optional): Path to log file for failed batches. - - Returns: + + Returns + ------- tuple: (metadata_dict, xml_text) where metadata_dict maps accessions to parsed metadata, and xml_text is the raw XML response. + """ - # Build E-utilities efetch URL for GenBank XML format # base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" accession_string = ",".join(accessions) - + params = { - 'db': 'nucleotide', # Nucleotide database (includes virus sequences) - 'id': accession_string, # Comma-separated accession numbers - 'rettype': 'gb', # GenBank format - 'retmode': 'xml', # XML output for structured parsing - 'complexity': GENBANK_COMPLEXITY, + "db": "nucleotide", # Nucleotide database (includes virus sequences) + "id": accession_string, # Comma-separated accession numbers + "rettype": "gb", # GenBank format + "retmode": "xml", # XML output for structured parsing + "complexity": GENBANK_COMPLEXITY, } - + # Create a requests.Session with urllib3 Retry/HTTPAdapter for robust retries session = requests.Session() try: @@ -6304,7 +6635,7 @@ def _fetch_genbank_batch(accessions, failed_log_path=None): total=GENBANK_RETRY_ATTEMPTS, backoff_factor=HTTP_INITIAL_BACKOFF, status_forcelist=HTTP_RETRY_STATUS_CODES, - allowed_methods=frozenset(['GET', 'POST']) + allowed_methods=frozenset(["GET", 
"POST"]), ) except TypeError: # Fallback for older urllib3 versions that use method_whitelist @@ -6312,14 +6643,14 @@ def _fetch_genbank_batch(accessions, failed_log_path=None): total=GENBANK_RETRY_ATTEMPTS, backoff_factor=HTTP_INITIAL_BACKOFF, status_forcelist=HTTP_RETRY_STATUS_CODES, - method_whitelist=frozenset(['GET', 'POST']) + method_whitelist=frozenset(["GET", "POST"]), ) adapter = HTTPAdapter(max_retries=retry_strategy) session.mount("https://", adapter) session.mount("http://", adapter) - headers = {'Connection': 'close', 'User-Agent': 'gget/1.0'} + headers = {"Connection": "close", "User-Agent": "gget/1.0"} # Local retry loop for transient chunk/connection errors with exponential backoff max_attempts = HTTP_MAX_LOCAL_RETRIES attempt = 0 @@ -6330,7 +6661,10 @@ def _fetch_genbank_batch(accessions, failed_log_path=None): try: logger.debug("Making E-utilities request for %d accessions (attempt %d)", len(accessions), attempt + 1) logger.debug("Request URL: %s", NCBI_EUTILS_BASE_EFETCH) - logger.debug("Request parameters: %s", {k: (v[:50] + '...' if isinstance(v, str) and len(v) > 50 else v) for k, v in params.items()}) + logger.debug( + "Request parameters: %s", + {k: (v[:50] + "..." if isinstance(v, str) and len(v) > 50 else v) for k, v in params.items()}, + ) # Use POST instead of GET for EFetch to avoid 414 URI Too Long errors. # NCBI E-utilities supports POST for all requests, and POST puts the @@ -6341,7 +6675,7 @@ def _fetch_genbank_batch(accessions, failed_log_path=None): response.raise_for_status() # Verify we got XML data - if not response.text.strip().startswith('>> xml = '\n\n...' >>> _clean_xml_declarations(xml) '...' + """ cleaned_lines = [] for line in xml_text.splitlines(): @@ -6445,31 +6787,31 @@ def _clean_xml_declarations(xml_text): def _local_name(tag): - """ - Return the local name of an XML tag (strip namespace if present). - + """Return the local name of an XML tag (strip namespace if present). 
+ XML tags may include namespace prefixes (e.g., '{http://namespace}TagName'). This helper function extracts just the tag name without the namespace. - + Args: tag (str): XML tag string, potentially with namespace. - - Returns: + + Returns + ------- str: Tag name without namespace prefix. - + Example: - >>> _local_name('{http://www.ncbi.nlm.nih.gov}GBSeq') + >>> _local_name("{http://www.ncbi.nlm.nih.gov}GBSeq") 'GBSeq' - >>> _local_name('GBSeq') + >>> _local_name("GBSeq") 'GBSeq' + """ - return tag.split('}')[-1] if '}' in tag else tag + return tag.split("}")[-1] if "}" in tag else tag def _genbank_xml_to_csv(xml_path, csv_path, chunk_size=None): - """ - Convert GenBank XML to CSV with streaming and dynamic qualifier columns. - + """Convert GenBank XML to CSV with streaming and dynamic qualifier columns. + Args: xml_path (str): Path to input GenBank XML file. csv_path (str): Path to output CSV file. @@ -6478,16 +6820,15 @@ def _genbank_xml_to_csv(xml_path, csv_path, chunk_size=None): # Apply default chunk size if not specified if chunk_size is None: chunk_size = GENBANK_XML_CHUNK_SIZE - + qualifier_names = set() rows = [] header_written = False - csv_file = open(csv_path, "w", newline='', encoding='utf-8') - writer = None + csv_file = open(csv_path, "w", newline="", encoding="utf-8") # Stream-parse XML - for event, elem in ET.iterparse(xml_path, events=("end",)): + for _event, elem in ET.iterparse(xml_path, events=("end",)): if _local_name(elem.tag) == "GBSeq": # # Skip protein sequences (AA type) - we only want nucleotide sequences # moltype_elem = elem.findtext(".//GBSeq_moltype", "").strip() @@ -6495,7 +6836,7 @@ def _genbank_xml_to_csv(xml_path, csv_path, chunk_size=None): # logger.debug("Skipping protein sequence (AA type) in XML to CSV conversion") # elem.clear() # continue - + accession = elem.findtext(".//GBSeq_accession-version", "").strip() sequence = elem.findtext(".//GBSeq_sequence", "").strip() features = elem.findall(".//GBFeature") @@ -6540,7 +6881,7 
@@ def _genbank_xml_to_csv(xml_path, csv_path, chunk_size=None): "Interval_to": interval_to, "Interval_accession": interval_acc, "order": order_flag, - "sequence": "" # leave blank for now + "sequence": "", # leave blank for now } for qn in qualifier_names: @@ -6579,9 +6920,8 @@ def _genbank_xml_to_csv(xml_path, csv_path, chunk_size=None): def _save_genbank_xml_and_csv(xml_content, xml_file_name, csv_file_name): - """ - Save GenBank XML content and convert to CSV. - + """Save GenBank XML content and convert to CSV. + Args: xml_content (str): Raw XML content from E-utilities. xml_file_name (str): Path for XML output file. @@ -6608,24 +6948,25 @@ def _save_genbank_xml_and_csv(xml_content, xml_file_name, csv_file_name): def _parse_genbank_xml(xml_content): - """ - Parse GenBank XML response and extract high-level metadata fields. + """Parse GenBank XML response and extract high-level metadata fields. + This function processes the GenBank XML format returned by E-utilities efetch and extracts key metadata including collection dates, geographic information, host details, publication references, and sequence features. - + Args: xml_content (str): Raw XML content from E-utilities efetch. - - Returns: + + Returns + ------- dict: Dictionary mapping accession numbers to metadata dictionaries - + Note: Uses xml.etree.ElementTree for parsing to avoid external dependencies. The GenBank XML schema is documented by NCBI and contains structured information about sequence records. 
+ """ - # Parse the XML content try: root = ET.fromstring(xml_content) @@ -6635,127 +6976,127 @@ def _parse_genbank_xml(xml_content): logger.error("❌ XML parsing failed: %s", e) logger.debug("XML content preview: %s", xml_content[:500]) raise RuntimeError(f"Invalid XML format in GenBank response: {e}") from e - + metadata_dict = {} - + # Process each GenBank sequence record in the XML - for gbseq in root.findall('.//GBSeq'): + for gbseq in root.findall(".//GBSeq"): try: # # Skip protein sequences (AA type) - we only want nucleotide sequences # moltype_elem = gbseq.find('GBSeq_moltype') # if moltype_elem is not None and moltype_elem.text == 'AA': # logger.debug("Skipping protein sequence (AA type)") # continue - + # Extract accession number as the primary key - accession_elem = gbseq.find('GBSeq_accession-version') + accession_elem = gbseq.find("GBSeq_accession-version") if accession_elem is None: - accession_elem = gbseq.find('GBSeq_primary-accession') - + accession_elem = gbseq.find("GBSeq_primary-accession") + if accession_elem is None: logger.warning("Skipping GenBank record without accession number") continue - + accession = accession_elem.text logger.debug("Processing GenBank record: %s", accession) - + # Initialize metadata dictionary for this record metadata = { - 'accession': accession, - 'genbank_data': {} # Store GenBank-specific fields + "accession": accession, + "genbank_data": {}, # Store GenBank-specific fields } - + # Extract basic sequence information - length_elem = gbseq.find('GBSeq_length') - metadata['genbank_data']['sequence_length'] = int(length_elem.text) if length_elem is not None else None - - organism_elem = gbseq.find('GBSeq_organism') - metadata['genbank_data']['organism'] = organism_elem.text if organism_elem is not None else "" - - definition_elem = gbseq.find('GBSeq_definition') - metadata['genbank_data']['definition'] = definition_elem.text if definition_elem is not None else "" - + length_elem = gbseq.find("GBSeq_length") + 
metadata["genbank_data"]["sequence_length"] = int(length_elem.text) if length_elem is not None else None + + organism_elem = gbseq.find("GBSeq_organism") + metadata["genbank_data"]["organism"] = organism_elem.text if organism_elem is not None else "" + + definition_elem = gbseq.find("GBSeq_definition") + metadata["genbank_data"]["definition"] = definition_elem.text if definition_elem is not None else "" + # Extract taxonomy information - taxonomy_elem = gbseq.find('GBSeq_taxonomy') - metadata['genbank_data']['taxonomy'] = taxonomy_elem.text if taxonomy_elem is not None else "" - + taxonomy_elem = gbseq.find("GBSeq_taxonomy") + metadata["genbank_data"]["taxonomy"] = taxonomy_elem.text if taxonomy_elem is not None else "" + # Extract creation and update dates - create_date_elem = gbseq.find('GBSeq_create-date') - metadata['genbank_data']['create_date'] = create_date_elem.text if create_date_elem is not None else "" - - update_date_elem = gbseq.find('GBSeq_update-date') - metadata['genbank_data']['update_date'] = update_date_elem.text if update_date_elem is not None else "" - + create_date_elem = gbseq.find("GBSeq_create-date") + metadata["genbank_data"]["create_date"] = create_date_elem.text if create_date_elem is not None else "" + + update_date_elem = gbseq.find("GBSeq_update-date") + metadata["genbank_data"]["update_date"] = update_date_elem.text if update_date_elem is not None else "" + # Extract references (publications) references = [] - for ref in gbseq.findall('.//GBReference'): + for ref in gbseq.findall(".//GBReference"): ref_data = {} - - title_elem = ref.find('GBReference_title') - ref_data['title'] = title_elem.text if title_elem is not None else "" - - authors_elem = ref.find('GBReference_authors') + + title_elem = ref.find("GBReference_title") + ref_data["title"] = title_elem.text if title_elem is not None else "" + + authors_elem = ref.find("GBReference_authors") if authors_elem is not None: - authors = [a.text for a in 
authors_elem.findall('GBAuthor') if a.text] - ref_data['authors'] = ', '.join(authors) + authors = [a.text for a in authors_elem.findall("GBAuthor") if a.text] + ref_data["authors"] = ", ".join(authors) else: - ref_data['authors'] = "" + ref_data["authors"] = "" # ref_data['authors'] = authors_elem.text if authors_elem is not None else "" - - journal_elem = ref.find('GBReference_journal') - ref_data['journal'] = journal_elem.text if journal_elem is not None else "" - - pubmed_elem = ref.find('GBReference_pubmed') - ref_data['pubmed_id'] = pubmed_elem.text if pubmed_elem is not None else "" - + + journal_elem = ref.find("GBReference_journal") + ref_data["journal"] = journal_elem.text if journal_elem is not None else "" + + pubmed_elem = ref.find("GBReference_pubmed") + ref_data["pubmed_id"] = pubmed_elem.text if pubmed_elem is not None else "" + if any(ref_data.values()): # Only add if we got some reference data references.append(ref_data) - - metadata['genbank_data']['references'] = references - + + metadata["genbank_data"]["references"] = references + # Extract features (collection_date, geographic location, host, etc.) 
features_data = {} gene_count = 0 mature_peptide_count = 0 products = [] - - for feature in gbseq.findall('.//GBFeature'): - feature_key_elem = feature.find('GBFeature_key') + + for feature in gbseq.findall(".//GBFeature"): + feature_key_elem = feature.find("GBFeature_key") if feature_key_elem is None: continue - + feature_key = feature_key_elem.text - + # Count genes and mature peptides - if feature_key == 'gene': + if feature_key == "gene": gene_count += 1 - elif feature_key == 'mat_peptide': + elif feature_key == "mat_peptide": mature_peptide_count += 1 - + # Extract qualifiers for this feature feature_qualifiers = {} has_proviral = False - for qual in feature.findall('.//GBQualifier'): - qual_name_elem = qual.find('GBQualifier_name') - qual_value_elem = qual.find('GBQualifier_value') - + for qual in feature.findall(".//GBQualifier"): + qual_name_elem = qual.find("GBQualifier_name") + qual_value_elem = qual.find("GBQualifier_value") + if qual_name_elem is not None: qual_name = qual_name_elem.text # Handle qualifiers without values (e.g., proviral) qual_value = qual_value_elem.text if qual_value_elem is not None else "" feature_qualifiers[qual_name] = qual_value - + # Track proviral flag (presence indicates proviral) - if qual_name == 'proviral': + if qual_name == "proviral": has_proviral = True - + # Collect product names for has_proteins filter - if qual_name == 'product' and qual_value: + if qual_name == "product" and qual_value: products.append(qual_value) - + if has_proviral: - feature_qualifiers['_has_proviral'] = True - + feature_qualifiers["_has_proviral"] = True + if feature_qualifiers: # Store multiple features of same type (e.g., multiple CDS) if feature_key not in features_data: @@ -6764,238 +7105,233 @@ def _parse_genbank_xml(xml_content): features_data[feature_key].append(feature_qualifiers) else: features_data[feature_key] = [features_data[feature_key], feature_qualifiers] - + # Extract specific fields of interest from source feature - 
source_feature = features_data.get('source', {}) + source_feature = features_data.get("source", {}) if isinstance(source_feature, list): source_feature = source_feature[0] # Use first source if multiple - - metadata['genbank_data']['collection_date'] = source_feature.get('collection_date', '') - metadata['genbank_data']['geographic_location'] = source_feature.get('geo_loc_name', '') - metadata['genbank_data']['host'] = source_feature.get('host', '') - metadata['genbank_data']['isolation_source'] = source_feature.get('isolation_source', '') - metadata['genbank_data']['strain'] = source_feature.get('strain', '') - metadata['genbank_data']['isolate'] = source_feature.get('isolate', '') + + metadata["genbank_data"]["collection_date"] = source_feature.get("collection_date", "") + metadata["genbank_data"]["geographic_location"] = source_feature.get("geo_loc_name", "") + metadata["genbank_data"]["host"] = source_feature.get("host", "") + metadata["genbank_data"]["isolation_source"] = source_feature.get("isolation_source", "") + metadata["genbank_data"]["strain"] = source_feature.get("strain", "") + metadata["genbank_data"]["isolate"] = source_feature.get("isolate", "") # metadata['genbank_data']['collected_by'] = source_feature.get('collected_by', '') # metadata['genbank_data']['specimen_voucher'] = source_feature.get('specimen_voucher', '') - + # Extract additional GenBank-specific fields for filtering - metadata['genbank_data']['proviral'] = source_feature.get('_has_proviral', False) or 'proviral' in source_feature - metadata['genbank_data']['mol_type'] = source_feature.get('mol_type', '') - metadata['genbank_data']['serotype'] = source_feature.get('serotype', '') - metadata['genbank_data']['gene_count'] = gene_count - metadata['genbank_data']['mature_peptide_count'] = mature_peptide_count - metadata['genbank_data']['products'] = products - + metadata["genbank_data"]["proviral"] = ( + source_feature.get("_has_proviral", False) or "proviral" in source_feature + ) + 
metadata["genbank_data"]["mol_type"] = source_feature.get("mol_type", "") + metadata["genbank_data"]["serotype"] = source_feature.get("serotype", "") + metadata["genbank_data"]["gene_count"] = gene_count + metadata["genbank_data"]["mature_peptide_count"] = mature_peptide_count + metadata["genbank_data"]["products"] = products + # Extract genotype from note field or serotype - genotype = '' - note_value = source_feature.get('note', '') + genotype = "" + note_value = source_feature.get("note", "") if note_value: # Look for genotype pattern in note (e.g., "genotype: IV" or "genotype=H5N1") - genotype_match = re.search(r'genotype[:\s=]+([^\s;,]+)', note_value, re.IGNORECASE) + genotype_match = re.search(r"genotype[:\s=]+([^\s;,]+)", note_value, re.IGNORECASE) if genotype_match: genotype = genotype_match.group(1).strip() - if not genotype and source_feature.get('serotype'): - genotype = source_feature.get('serotype', '') - metadata['genbank_data']['genotype'] = genotype - + if not genotype and source_feature.get("serotype"): + genotype = source_feature.get("serotype", "") + metadata["genbank_data"]["genotype"] = genotype + # Store all features for potential future use - metadata['genbank_data']['all_features'] = features_data - + metadata["genbank_data"]["all_features"] = features_data + # Extract comment field (often contains additional metadata) - comment_elem = gbseq.find('GBSeq_comment') + comment_elem = gbseq.find("GBSeq_comment") comment_text = comment_elem.text if comment_elem is not None else "" - metadata['genbank_data']['comment'] = comment_text - + metadata["genbank_data"]["comment"] = comment_text + # Parse assembly name from comment if present (used in some studies) assembly_name = "" if comment_text: - assembly_match = re.search(r'Assembly Name :: (\S+)', comment_text) + assembly_match = re.search(r"Assembly Name :: (\S+)", comment_text) if assembly_match: assembly_name = assembly_match.group(1) - metadata['genbank_data']['assembly_name'] = assembly_name - 
+ metadata["genbank_data"]["assembly_name"] = assembly_name + # Store the metadata for this accession metadata_dict[accession] = metadata - - logger.debug("Extracted GenBank metadata for %s: organism=%s, collection_date=%s, geographic_location=%s", - accession, - metadata['genbank_data']['organism'], - metadata['genbank_data']['collection_date'], - metadata['genbank_data']['geographic_location']) - - except Exception as e: - logger.warning("❌ Failed to parse GenBank record %s: %s", - accession if 'accession' in locals() else 'unknown', e) + + logger.debug( + "Extracted GenBank metadata for %s: organism=%s, collection_date=%s, geographic_location=%s", + accession, + metadata["genbank_data"]["organism"], + metadata["genbank_data"]["collection_date"], + metadata["genbank_data"]["geographic_location"], + ) + + except Exception as e: # noqa: BLE001 + logger.warning( + "❌ Failed to parse GenBank record %s: %s", accession if "accession" in locals() else "unknown", e + ) continue - + logger.info("✅ Successfully parsed GenBank metadata for %d records", len(metadata_dict)) return metadata_dict def save_genbank_metadata_to_csv(genbank_metadata, output_file, virus_metadata=None): - """ - Save GenBank metadata to a CSV file with the same column headers as the standard metadata CSV. - + """Save GenBank metadata to a CSV file with the same column headers as the standard metadata CSV. + Args: genbank_metadata (dict): Dictionary mapping accessions to GenBank metadata output_file (str): Path to the output CSV file virus_metadata (list, optional): List of virus metadata dictionaries to merge - + Note: The CSV format uses the same column headers as save_metadata_to_csv to ensure consistency between the two output files, making them directly comparable. 
""" - logger.info("Preparing GenBank metadata for CSV output...") logger.debug("Processing %d GenBank records", len(genbank_metadata)) - + # Use the same column order as save_metadata_to_csv for consistency columns = [ - "accession", # Primary identifier (lowercase for Delphy compatibility) - "Organism Name", # Virus species/strain name - "GenBank/RefSeq", # Source database (GenBank or RefSeq) - "Submitters", # Names of sequence submitters - "Organization", # Submitting organization/institution - "Submitter Country", # Country of submitting organization - "Release date", # Date when sequence was released to public databases - "Isolate", # Isolate/sample identifier - "Virus Lineage", # Taxonomic lineage of the virus - "Length", # Sequence length in base pairs - "Nuc Completeness", # Completeness status (complete/partial) + "accession", # Primary identifier (lowercase for Delphy compatibility) + "Organism Name", # Virus species/strain name + "GenBank/RefSeq", # Source database (GenBank or RefSeq) + "Submitters", # Names of sequence submitters + "Organization", # Submitting organization/institution + "Submitter Country", # Country of submitting organization + "Release date", # Date when sequence was released to public databases + "Isolate", # Isolate/sample identifier + "Virus Lineage", # Taxonomic lineage of the virus + "Length", # Sequence length in base pairs + "Nuc Completeness", # Completeness status (complete/partial) "Proteins/Segments", # Protein/segment information from FASTA headers - "Segment", # Virus segment identifier (e.g., 'HA', 'NA', '4', '6') + "Segment", # Virus segment identifier (e.g., 'HA', 'NA', '4', '6') "Is Vaccine Strain", # Whether this sequence is from a vaccine strain "Geographic Region", # Geographic region where sample was collected - "Geographic Location",# Specific geographic location - "Host", # Host organism name - "Host Lineage", # Taxonomic lineage of host organism - "Lab Host", # Whether sample was lab-passaged - 
"Tissue/Specimen/Source", # Sample source/tissue type - "Collection Date", # Date when sample was collected - "Sample Name", # Sample identifier - "Annotated", # Whether sequence has annotation data - "SRA Accessions", # Associated SRA (sequencing) accessions - "Bioprojects", # Associated BioProject identifiers - "Biosample", # BioSample identifier - "Protein count", # Number of proteins annotated - "Gene count", # Number of genes annotated - "Mature Peptide Count", # Number of mature peptides annotated + "Geographic Location", # Specific geographic location + "Host", # Host organism name + "Host Lineage", # Taxonomic lineage of host organism + "Lab Host", # Whether sample was lab-passaged + "Tissue/Specimen/Source", # Sample source/tissue type + "Collection Date", # Date when sample was collected + "Sample Name", # Sample identifier + "Annotated", # Whether sequence has annotation data + "SRA Accessions", # Associated SRA (sequencing) accessions + "Bioprojects", # Associated BioProject identifiers + "Biosample", # BioSample identifier + "Protein count", # Number of proteins annotated + "Gene count", # Number of genes annotated + "Mature Peptide Count", # Number of mature peptides annotated # Additional GenBank columns - "definition", # GenBank sequence definition - "strain", # Strain information - "isolation_source", # Source of isolation - "create_date", # GenBank creation date - "update_date", # GenBank update date - "assembly_name", # Assembly name - "authors", # Publication authors - "title", # Publication title - "journal", # Publication journal - "pubmed_id", # PubMed ID - "reference_count", # Number of references - "comment", # Additional comments + "definition", # GenBank sequence definition + "strain", # Strain information + "isolation_source", # Source of isolation + "create_date", # GenBank creation date + "update_date", # GenBank update date + "assembly_name", # Assembly name + "authors", # Publication authors + "title", # Publication title + 
"journal", # Publication journal + "pubmed_id", # PubMed ID + "reference_count", # Number of references + "comment", # Additional comments ] - + logger.debug("Using column order: %s", columns) - + # Prepare data for DataFrame creation data_for_df = [] - + for accession, metadata in genbank_metadata.items(): logger.debug("Processing GenBank metadata for accession: %s", accession) - - genbank_data = metadata.get('genbank_data', {}) - + + genbank_data = metadata.get("genbank_data", {}) + # Extract publication information (use first reference if available) - references = genbank_data.get('references', []) + references = genbank_data.get("references", []) first_ref = references[0] if references else {} - + # Build the row dictionary with the same column structure as save_metadata_to_csv row = { # Primary identifier "accession": accession, - # Organism and database information - "Organism Name": genbank_data.get('organism', pd.NA), - "GenBank/RefSeq": metadata.get('sourceDatabase', pd.NA), - + "Organism Name": genbank_data.get("organism", pd.NA), + "GenBank/RefSeq": metadata.get("sourceDatabase", pd.NA), # Submission information - "Submitters": metadata.get('submitter', {}).get('names', []) if metadata.get('submitter', {}).get('names') else pd.NA, - "Organization": metadata.get('submitter', {}).get('affiliation', pd.NA), - "Submitter Country": metadata.get('submitter', {}).get('country', pd.NA), - "Release date": metadata.get('releaseDate', '').split('T')[0] if metadata.get('releaseDate') else pd.NA, - + "Submitters": metadata.get("submitter", {}).get("names", []) + if metadata.get("submitter", {}).get("names") + else pd.NA, + "Organization": metadata.get("submitter", {}).get("affiliation", pd.NA), + "Submitter Country": metadata.get("submitter", {}).get("country", pd.NA), + "Release date": metadata.get("releaseDate", "").split("T")[0] if metadata.get("releaseDate") else pd.NA, # Sample and isolate information - "Isolate": genbank_data.get('isolate', pd.NA), - "Sample 
Name": genbank_data.get('isolate', pd.NA), - + "Isolate": genbank_data.get("isolate", pd.NA), + "Sample Name": genbank_data.get("isolate", pd.NA), # Virus classification - "Virus Lineage": genbank_data.get('taxonomy', pd.NA), - + "Virus Lineage": genbank_data.get("taxonomy", pd.NA), # Sequence characteristics - "Length": genbank_data.get('sequence_length', pd.NA), - "Nuc Completeness": metadata.get('completeness', pd.NA), + "Length": genbank_data.get("sequence_length", pd.NA), + "Nuc Completeness": metadata.get("completeness", pd.NA), "Proteins/Segments": pd.NA, # Not available from GenBank XML parsing - "Segment": metadata.get('segment', pd.NA), # Virus segment identifier - "Is Vaccine Strain": metadata.get('isVaccineStrain', metadata.get('is_vaccine_strain', pd.NA)), - + "Segment": metadata.get("segment", pd.NA), # Virus segment identifier + "Is Vaccine Strain": metadata.get("isVaccineStrain", metadata.get("is_vaccine_strain", pd.NA)), # Geographic information - "Geographic Region": metadata.get('region', pd.NA), - "Geographic Location": genbank_data.get('geographic_location', pd.NA), - + "Geographic Region": metadata.get("region", pd.NA), + "Geographic Location": genbank_data.get("geographic_location", pd.NA), # Host information - "Host": genbank_data.get('host', pd.NA), - "Host Lineage": metadata.get('host', {}).get('lineage', []) if isinstance(metadata.get('host'), dict) else pd.NA, - "Lab Host": metadata.get('labHost', pd.NA), - + "Host": genbank_data.get("host", pd.NA), + "Host Lineage": metadata.get("host", {}).get("lineage", []) + if isinstance(metadata.get("host"), dict) + else pd.NA, + "Lab Host": metadata.get("labHost", pd.NA), # Sample source information - "Tissue/Specimen/Source": genbank_data.get('isolation_source', pd.NA), - "Collection Date": genbank_data.get('collection_date', pd.NA), - + "Tissue/Specimen/Source": genbank_data.get("isolation_source", pd.NA), + "Collection Date": genbank_data.get("collection_date", pd.NA), # Annotation and quality 
information - "Annotated": metadata.get('isAnnotated', pd.NA), - + "Annotated": metadata.get("isAnnotated", pd.NA), # Associated database records - "SRA Accessions": metadata.get('sraAccessions', pd.NA), - "Bioprojects": metadata.get('bioprojects', pd.NA), - "Biosample": metadata.get('biosample', pd.NA), - + "SRA Accessions": metadata.get("sraAccessions", pd.NA), + "Bioprojects": metadata.get("bioprojects", pd.NA), + "Biosample": metadata.get("biosample", pd.NA), # Counts - "Gene count": metadata.get('geneCount', pd.NA), - "Protein count": metadata.get('proteinCount', pd.NA), - "Mature Peptide Count": metadata.get('maturePeptideCount', pd.NA), - + "Gene count": metadata.get("geneCount", pd.NA), + "Protein count": metadata.get("proteinCount", pd.NA), + "Mature Peptide Count": metadata.get("maturePeptideCount", pd.NA), # GenBank-specific columns - "definition": genbank_data.get('definition', pd.NA), - "strain": genbank_data.get('strain', pd.NA), - "isolation_source": genbank_data.get('isolation_source', pd.NA), - "create_date": genbank_data.get('create_date', pd.NA), - "update_date": genbank_data.get('update_date', pd.NA), - "assembly_name": genbank_data.get('assembly_name', pd.NA), - "authors": first_ref.get('authors', pd.NA), - "title": first_ref.get('title', pd.NA), - "journal": first_ref.get('journal', pd.NA), - "pubmed_id": first_ref.get('pubmed_id', pd.NA), + "definition": genbank_data.get("definition", pd.NA), + "strain": genbank_data.get("strain", pd.NA), + "isolation_source": genbank_data.get("isolation_source", pd.NA), + "create_date": genbank_data.get("create_date", pd.NA), + "update_date": genbank_data.get("update_date", pd.NA), + "assembly_name": genbank_data.get("assembly_name", pd.NA), + "authors": first_ref.get("authors", pd.NA), + "title": first_ref.get("title", pd.NA), + "journal": first_ref.get("journal", pd.NA), + "pubmed_id": first_ref.get("pubmed_id", pd.NA), "reference_count": len(references) if references else pd.NA, - "comment": 
genbank_data.get('comment', pd.NA), + "comment": genbank_data.get("comment", pd.NA), } - + data_for_df.append(row) - + logger.info("Creating DataFrame with %d rows and %d columns", len(data_for_df), len(columns)) - + # Create DataFrame with the specified column order df = pd.DataFrame(data_for_df, columns=columns) # Write DataFrame to CSV file try: - df.to_csv(output_file, index=False, encoding='utf-8') + df.to_csv(output_file, index=False, encoding="utf-8") logger.info("✅ GenBank metadata successfully saved to: %s", output_file) - logger.info("CSV file contains %d rows and %d columns", len(df), len(df.columns)) + logger.info("CSV file contains %d rows and %d columns", len(df), len(df.columns)) except Exception as e: logger.error("❌ Failed to save GenBank metadata CSV: %s", e) raise RuntimeError(f"❌ Failed to save GenBank metadata to {output_file}: {e}") from e - + def filter_cached_metadata_for_unused_filters( metadata_dict, @@ -7008,13 +7344,12 @@ def filter_cached_metadata_for_unused_filters( min_release_date=None, applied_strategy_filters=None, ): - """ - Apply filters that were not used in the cached download strategy. - + """Apply filters that were not used in the cached download strategy. + This is Step 3 of the cached download pipeline. It applies: 1. Server-side filters not used in the successful cached strategy: host, complete_only, annotated, lineage 2. API-only filters that couldn't be applied server-side: geographic_location, refseq_only, min_release_date - + Args: metadata_dict (dict): Dictionary mapping accession numbers to metadata from cached download. host (str, optional): Host organism filter (not applied if in applied_strategy_filters). @@ -7026,129 +7361,135 @@ def filter_cached_metadata_for_unused_filters( min_release_date (str, optional): Minimum release date filter (API-only, always applied if specified). applied_strategy_filters (list, optional): List of filter names that were applied during cached strategy. 
Includes: 'host', 'complete-only', 'annotated', 'lineage' - - Returns: + + Returns + ------- tuple: (filtered_accessions, filtered_metadata_list) - """ - if applied_strategy_filters is None and geographic_location is None and refseq_only is None and min_release_date is None: + """ + if ( + applied_strategy_filters is None + and geographic_location is None + and refseq_only is None + and min_release_date is None + ): logger.debug("No filters specified for post-cached-download filtering. Returning all metadata unchanged.") return list(metadata_dict.keys()), list(metadata_dict.values()) - + if applied_strategy_filters is None: applied_strategy_filters = [] - - logger.info("="*60) + + logger.info("=" * 60) logger.info("STEP 3b: Applying post-cached-download filters") - logger.info("="*60) + logger.info("=" * 60) logger.debug("Filters applied during cached strategy: %s", applied_strategy_filters) - + # Determine which filters to apply based on what wasn't used in strategy filters_to_apply = {} - + # Server-side filters (only apply if not already applied in strategy) - if 'host' not in applied_strategy_filters and host: - filters_to_apply['host'] = host + if "host" not in applied_strategy_filters and host: + filters_to_apply["host"] = host logger.debug("Will apply host filter: %s (not used in cached strategy)", host) - - if 'complete-only' not in applied_strategy_filters and complete_only: - filters_to_apply['complete_only'] = complete_only + + if "complete-only" not in applied_strategy_filters and complete_only: + filters_to_apply["complete_only"] = complete_only logger.debug("Will apply complete-only filter (not used in cached strategy)") - - if 'annotated' not in applied_strategy_filters and annotated: - filters_to_apply['annotated'] = annotated + + if "annotated" not in applied_strategy_filters and annotated: + filters_to_apply["annotated"] = annotated logger.debug("Will apply annotated filter (not used in cached strategy)") - - if 'lineage' not in 
applied_strategy_filters and lineage: - filters_to_apply['lineage'] = lineage + + if "lineage" not in applied_strategy_filters and lineage: + filters_to_apply["lineage"] = lineage logger.debug("Will apply lineage filter: %s (not used in cached strategy)", lineage) - + # API-only filters (always apply if specified since they're never in cached strategy) if geographic_location: - filters_to_apply['geographic_location'] = geographic_location + filters_to_apply["geographic_location"] = geographic_location logger.debug("Will apply geographic_location filter: %s (API-only)", geographic_location) - + if refseq_only: - filters_to_apply['refseq_only'] = refseq_only + filters_to_apply["refseq_only"] = refseq_only logger.debug("Will apply refseq_only filter (API-only)") - + if min_release_date: - filters_to_apply['min_release_date'] = min_release_date + filters_to_apply["min_release_date"] = min_release_date logger.debug("Will apply min_release_date filter: %s (API-only)", min_release_date) - + # If no filters to apply, return all metadata unchanged if not filters_to_apply: logger.debug("No post-cached-download filters to apply. 
All %d records will proceed.", len(metadata_dict)) return list(metadata_dict.keys()), list(metadata_dict.values()) - + logger.info("Applying post-cached-download filters: %s", list(filters_to_apply.keys())) - + # Parse min_release_date once for efficiency min_release_date_parsed = None - if 'min_release_date' in filters_to_apply: + if "min_release_date" in filters_to_apply: min_release_date_parsed = _parse_date(min_release_date, filtername="min_release_date") - + # Apply filters to each metadata record filtered_accessions = [] filtered_metadata_list = [] filter_stats = { - 'host': 0, - 'complete_only': 0, - 'annotated': 0, - 'lineage': 0, - 'geographic_location': 0, - 'refseq_only': 0, - 'min_release_date': 0, + "host": 0, + "complete_only": 0, + "annotated": 0, + "lineage": 0, + "geographic_location": 0, + "refseq_only": 0, + "min_release_date": 0, } - + for accession, metadata in metadata_dict.items(): # Apply each filter - if any fails, skip this record - + # Host filter - if 'host' in filters_to_apply: - host_name = metadata.get('hostName', '') + if "host" in filters_to_apply: + host_name = metadata.get("hostName", "") if not host_name: logger.debug("Skipping %s: missing host metadata", accession) - filter_stats['host'] += 1 + filter_stats["host"] += 1 continue if host.lower() not in host_name.lower(): logger.debug("Skipping %s: host '%s' does not match '%s'", accession, host_name, host) - filter_stats['host'] += 1 + filter_stats["host"] += 1 continue - + # Complete-only filter - if 'complete_only' in filters_to_apply: - nuc_completeness = metadata.get('completeness', '') - if not nuc_completeness or nuc_completeness.lower() != 'complete': + if "complete_only" in filters_to_apply: + nuc_completeness = metadata.get("completeness", "") + if not nuc_completeness or nuc_completeness.lower() != "complete": logger.debug("Skipping %s: completeness '%s' != 'complete'", accession, nuc_completeness) - filter_stats['complete_only'] += 1 + filter_stats["complete_only"] += 
1 continue - + # Annotated filter - if 'annotated' in filters_to_apply: - is_annotated = metadata.get('isAnnotated', False) + if "annotated" in filters_to_apply: + is_annotated = metadata.get("isAnnotated", False) if not is_annotated: logger.debug("Skipping %s: not annotated", accession) - filter_stats['annotated'] += 1 + filter_stats["annotated"] += 1 continue - + # Lineage filter (SARS-CoV-2 specific) - if 'lineage' in filters_to_apply: - virus_pangolin = metadata.get('virusPangolinClassification', '') + if "lineage" in filters_to_apply: + virus_pangolin = metadata.get("virusPangolinClassification", "") if not virus_pangolin or lineage.lower() not in virus_pangolin.lower(): logger.debug("Skipping %s: lineage '%s' does not match '%s'", accession, virus_pangolin, lineage) - filter_stats['lineage'] += 1 + filter_stats["lineage"] += 1 continue - + # Geographic location filter (API-only) - if 'geographic_location' in filters_to_apply: - geo_loc = metadata.get('location', '') or '' - geo_region = metadata.get('region', '') or '' - virus_name = metadata.get('virusName', '') or '' + if "geographic_location" in filters_to_apply: + geo_loc = metadata.get("location", "") or "" + geo_region = metadata.get("region", "") or "" + virus_name = metadata.get("virusName", "") or "" # Only skip if ALL location sources are empty if not geo_loc and not geo_region and not virus_name: logger.debug("Skipping %s: missing location, region, and virusName metadata", accession) - filter_stats['geographic_location'] += 1 + filter_stats["geographic_location"] += 1 continue # Check if filter matches location, region, or virusName (fallback for older records) geo_filter = geographic_location.lower() @@ -7157,46 +7498,54 @@ def filter_cached_metadata_for_unused_filters( # Also check virusName as fallback (e.g., "B/USA/65/2002" contains location in name) virus_name_matches = virus_name and geo_filter in virus_name.lower() if not loc_matches and not region_matches and not virus_name_matches: - 
logger.debug("Skipping %s: geo_location '%s', region '%s', virusName '%s' do not match '%s'", accession, geo_loc, geo_region, virus_name, geographic_location) - filter_stats['geographic_location'] += 1 + logger.debug( + "Skipping %s: geo_location '%s', region '%s', virusName '%s' do not match '%s'", + accession, + geo_loc, + geo_region, + virus_name, + geographic_location, + ) + filter_stats["geographic_location"] += 1 continue - + # RefSeq only filter (API-only) - if 'refseq_only' in filters_to_apply: - is_refseq = metadata.get('sourceDatabase', '').lower() == 'refseq' + if "refseq_only" in filters_to_apply: + is_refseq = metadata.get("sourceDatabase", "").lower() == "refseq" if not is_refseq: logger.debug("Skipping %s: not RefSeq (refseq_only=True)", accession) - filter_stats['refseq_only'] += 1 + filter_stats["refseq_only"] += 1 continue - + # Minimum release date filter (API-only) - if 'min_release_date' in filters_to_apply: - release_date_str = metadata.get('releaseDate', '') + if "min_release_date" in filters_to_apply: + release_date_str = metadata.get("releaseDate", "") if not release_date_str: logger.debug("Skipping %s: missing release date metadata", accession) - filter_stats['min_release_date'] += 1 + filter_stats["min_release_date"] += 1 continue release_date = _parse_date(release_date_str.split("T")[0], filtername="release_date") if not release_date or (min_release_date_parsed and release_date < min_release_date_parsed): logger.debug("Skipping %s: release date %s < min %s", accession, release_date, min_release_date_parsed) - filter_stats['min_release_date'] += 1 + filter_stats["min_release_date"] += 1 continue - + # All filters passed filtered_accessions.append(accession) filtered_metadata_list.append(metadata) - + # Log comprehensive filtering statistics - logger.info("✅ Post-cached-download filtering complete: %d -> %d records", - len(metadata_dict), len(filtered_accessions)) - + logger.info( + "✅ Post-cached-download filtering complete: %d -> %d 
records", len(metadata_dict), len(filtered_accessions) + ) + total_filtered = sum(filter_stats.values()) if total_filtered > 0: logger.info("Filter statistics (records excluded):") for filter_name, count in filter_stats.items(): if count > 0: logger.info(" %s: %d records", filter_name, count) - + return filtered_accessions, filtered_metadata_list @@ -7227,23 +7576,42 @@ def filter_metadata_only( geographic_location=None, host=None, ): - """ - Filter metadata records based on metadata-only criteria. - + """Filter metadata records based on metadata-only criteria. + Applies filters that can be evaluated using only metadata, reducing the number of accessions before downloading sequences. Sequence-dependent filters are deferred to post-download filtering. - + Args: metadata_dict (dict): Dictionary mapping accession numbers to metadata. (other args): Filter criteria - same as filter_sequences. - - Returns: + + Returns + ------- tuple: (filtered_accessions, filtered_metadata_list) + """ - logger.info("Starting metadata-only filtering process...") - logger.debug("Applying metadata-only filters: seq_length(%s-%s), completeness(%s), lab_passaged(%s), annotated(%s), submitter_country(%s), collection_date(%s-%s), source_database(%s), max_release_date(%s), protein_count(%s-%s), segment(%s), vaccine_strain(%s), submitter_name(%s), submitter_institution(%s), isolate(%s)", min_seq_length, max_seq_length, nuc_completeness, lab_passaged, annotated, submitter_country, min_collection_date, max_collection_date, source_database, max_release_date, min_protein_count, max_protein_count, segment, vaccine_strain, submitter_name, submitter_institution, isolate) + logger.debug( + "Applying metadata-only filters: seq_length(%s-%s), completeness(%s), lab_passaged(%s), annotated(%s), submitter_country(%s), collection_date(%s-%s), source_database(%s), max_release_date(%s), protein_count(%s-%s), segment(%s), vaccine_strain(%s), submitter_name(%s), submitter_institution(%s), isolate(%s)", + 
min_seq_length, + max_seq_length, + nuc_completeness, + lab_passaged, + annotated, + submitter_country, + min_collection_date, + max_collection_date, + source_database, + max_release_date, + min_protein_count, + max_protein_count, + segment, + vaccine_strain, + submitter_name, + submitter_institution, + isolate, + ) # Convert date filters to datetime objects for proper comparison # Parse user-provided filter dates with appropriate partial date handling: @@ -7252,19 +7620,23 @@ def filter_metadata_only( min_collection_date = ( _parse_partial_date_for_range_check( min_collection_date, for_min_comparison=False, filtername="min_collection_date" - ) if min_collection_date else None + ) + if min_collection_date + else None ) max_collection_date = ( _parse_partial_date_for_range_check( max_collection_date, for_min_comparison=True, filtername="max_collection_date" - ) if max_collection_date else None + ) + if max_collection_date + else None ) max_release_date = ( - _parse_partial_date_for_range_check( - max_release_date, for_min_comparison=True, filtername="max_release_date" - ) if max_release_date else None + _parse_partial_date_for_range_check(max_release_date, for_min_comparison=True, filtername="max_release_date") + if max_release_date + else None ) - + if min_collection_date: logger.debug("Parsed min_collection_date: %s", min_collection_date) if max_collection_date: @@ -7275,55 +7647,55 @@ def filter_metadata_only( # Initialize lists to store filtered results filtered_accessions = [] filtered_metadata_list = [] - + # Counters for logging filter statistics total_sequences = len(metadata_dict) filter_stats = { - 'seq_length': 0, + "seq_length": 0, # 'gene_count': 0, - 'completeness': 0, - 'lab_passaged': 0, - 'annotated': 0, - 'submitter_country': 0, - 'collection_date': 0, - 'source_database': 0, - 'release_date': 0, + "completeness": 0, + "lab_passaged": 0, + "annotated": 0, + "submitter_country": 0, + "collection_date": 0, + "source_database": 0, + "release_date": 0, 
# 'mature_peptide_count': 0, - 'protein_count': 0, - 'segment': 0, - 'vaccine_strain': 0, - 'submitter_name': 0, - 'submitter_institution': 0, - 'isolate': 0, - 'isolation_source': 0, - 'geographic_location': 0, - 'host': 0, + "protein_count": 0, + "segment": 0, + "vaccine_strain": 0, + "submitter_name": 0, + "submitter_institution": 0, + "isolate": 0, + "isolation_source": 0, + "geographic_location": 0, + "host": 0, } logger.info("Processing %d metadata records...", total_sequences) - + for accession, metadata in metadata_dict.items(): # logger.debug("Processing metadata for: %s", accession) - + # Apply filters sequentially - each filter can exclude the record # If any filter fails, we continue to the next record - + # FILTER 1: Sequence length filters if min_seq_length is not None or max_seq_length is not None: sequence_length = metadata.get("length") if sequence_length is None: logger.debug("Skipping %s: missing length metadata", accession) - filter_stats['seq_length'] += 1 + filter_stats["seq_length"] += 1 continue - + if min_seq_length is not None and sequence_length < min_seq_length: logger.debug("Skipping %s: length %d < min %d", accession, sequence_length, min_seq_length) - filter_stats['seq_length'] += 1 + filter_stats["seq_length"] += 1 continue - + if max_seq_length is not None and sequence_length > max_seq_length: logger.debug("Skipping %s: length %d > max %d", accession, sequence_length, max_seq_length) - filter_stats['seq_length'] += 1 + filter_stats["seq_length"] += 1 continue # FILTER 2: Gene count filters @@ -7333,12 +7705,12 @@ def filter_metadata_only( # logger.debug("Skipping %s: missing gene count metadata", accession) # filter_stats['gene_count'] += 1 # continue - + # if min_gene_count is not None and gene_count < min_gene_count: # logger.debug("Skipping %s: gene count %d < min %d", accession, gene_count, min_gene_count) # filter_stats['gene_count'] += 1 # continue - + # if max_gene_count is not None and gene_count > max_gene_count: # 
logger.debug("Skipping %s: gene count %d > max %d", accession, gene_count, max_gene_count) # filter_stats['gene_count'] += 1 @@ -7349,13 +7721,14 @@ def filter_metadata_only( completeness_status = metadata.get("completeness") if completeness_status is None: logger.debug("Skipping %s: missing completeness metadata", accession) - filter_stats['completeness'] += 1 + filter_stats["completeness"] += 1 continue - + if completeness_status.lower() != nuc_completeness.lower(): - logger.debug("Skipping %s: completeness '%s' != required '%s'", - accession, completeness_status, nuc_completeness) - filter_stats['completeness'] += 1 + logger.debug( + "Skipping %s: completeness '%s' != required '%s'", accession, completeness_status, nuc_completeness + ) + filter_stats["completeness"] += 1 continue # FILTER 4: Lab passaging status filter @@ -7363,14 +7736,14 @@ def filter_metadata_only( from_lab = metadata.get("isLabHost") if not from_lab: logger.debug("Skipping %s: not lab-passaged (required)", accession) - filter_stats['lab_passaged'] += 1 + filter_stats["lab_passaged"] += 1 continue if lab_passaged is False: from_lab = metadata.get("isLabHost") if from_lab: logger.debug("Skipping %s: is lab-passaged (excluded)", accession) - filter_stats['lab_passaged'] += 1 + filter_stats["lab_passaged"] += 1 continue # FILTER 5: Annotation status filter @@ -7380,135 +7753,151 @@ def filter_metadata_only( is_annotated = metadata.get("isAnnotated") if is_annotated: logger.debug("Skipping %s: is annotated (excluded when annotated=False)", accession) - filter_stats['annotated'] += 1 + filter_stats["annotated"] += 1 continue # FILTER 6: Submitter country filter if submitter_country is not None: - submitter_country_value = "_".join( - (metadata.get("submitterCountry") or "").split(" ") - ).lower() - + submitter_country_value = "_".join((metadata.get("submitterCountry") or "").split(" ")).lower() + if not submitter_country_value: logger.debug("Skipping %s: missing submitter country", accession) - 
filter_stats['submitter_country'] += 1 + filter_stats["submitter_country"] += 1 continue - + if submitter_country_value != submitter_country.lower(): - logger.debug("Skipping %s: submitter country '%s' != required '%s'", - accession, submitter_country_value, submitter_country.lower()) - filter_stats['submitter_country'] += 1 + logger.debug( + "Skipping %s: submitter country '%s' != required '%s'", + accession, + submitter_country_value, + submitter_country.lower(), + ) + filter_stats["submitter_country"] += 1 continue # FILTER 7: Submitter name filter if submitter_name is not None: # Convert submitter_name to list if it's a string submitter_name_list = [submitter_name] if isinstance(submitter_name, str) else submitter_name - + # Get submitter name from metadata metadata_submitter_name = metadata.get("submitterName", "") if not metadata_submitter_name: - filter_stats['submitter_name'] += 1 + filter_stats["submitter_name"] += 1 continue - + # Build set of acceptable submitter name values (case-insensitive) acceptable_names = {s.lower().strip() for s in submitter_name_list} - + # Check if metadata submitter name matches any acceptable value (case-insensitive) metadata_submitter_name_lower = str(metadata_submitter_name).lower().strip() if metadata_submitter_name_lower not in acceptable_names: - logger.debug("Skipping %s: submitter name '%s' not in required list %s", - accession, metadata_submitter_name, submitter_name_list) - filter_stats['submitter_name'] += 1 + logger.debug( + "Skipping %s: submitter name '%s' not in required list %s", + accession, + metadata_submitter_name, + submitter_name_list, + ) + filter_stats["submitter_name"] += 1 continue # FILTER 8: Submitter institution filter if submitter_institution is not None: # Convert submitter_institution to list if it's a string - submitter_institution_list = [submitter_institution] if isinstance(submitter_institution, str) else submitter_institution - + submitter_institution_list = ( + [submitter_institution] if 
isinstance(submitter_institution, str) else submitter_institution + ) + # Get submitter institution from metadata metadata_submitter_institution = metadata.get("submitterInstitution", "") if not metadata_submitter_institution: - filter_stats['submitter_institution'] += 1 + filter_stats["submitter_institution"] += 1 continue - + # Build set of acceptable submitter institution values (case-insensitive) acceptable_institutions = {s.lower().strip() for s in submitter_institution_list} - + # Check if metadata submitter institution matches any acceptable value (case-insensitive) metadata_submitter_institution_lower = str(metadata_submitter_institution).lower().strip() if metadata_submitter_institution_lower not in acceptable_institutions: - logger.debug("Skipping %s: submitter institution '%s' not in required list %s", - accession, metadata_submitter_institution, submitter_institution_list) - filter_stats['submitter_institution'] += 1 + logger.debug( + "Skipping %s: submitter institution '%s' not in required list %s", + accession, + metadata_submitter_institution, + submitter_institution_list, + ) + filter_stats["submitter_institution"] += 1 continue # FILTER 9: isolate filter if isolate is not None: # Convert isolate to list if it's a string isolate_list = [isolate] if isinstance(isolate, str) else isolate - + # Get isolate from metadata metadata_isolate = metadata.get("isolateName", "") if not metadata_isolate: - filter_stats['isolate'] += 1 + filter_stats["isolate"] += 1 continue - + # Build set of acceptable isolate values (case-insensitive) acceptable_isolates = {s.lower().strip() for s in isolate_list} - + # Check if metadata isolate matches any acceptable value (case-insensitive) metadata_isolate_lower = str(metadata_isolate).lower().strip() if metadata_isolate_lower not in acceptable_isolates: - logger.debug("Skipping %s: isolate '%s' not in required list %s", - accession, metadata_isolate, isolate_list) - filter_stats['isolate'] += 1 + logger.debug( + "Skipping 
%s: isolate '%s' not in required list %s", accession, metadata_isolate, isolate_list + ) + filter_stats["isolate"] += 1 continue # FILTER 10: isolation source filter if isolation_source is not None: # Convert isolation_source to list if it's a string isolation_source_list = [isolation_source] if isinstance(isolation_source, str) else isolation_source - + # Get isolation source from metadata metadata_isolation_source = metadata.get("isolate", {}).get("source", "") if not metadata_isolation_source: - filter_stats['isolation_source'] += 1 + filter_stats["isolation_source"] += 1 continue - + # Build set of acceptable isolation source values (case-insensitive) acceptable_isolation_sources = {s.lower().strip() for s in isolation_source_list} - + # Check if metadata isolation source matches any acceptable value (case-insensitive) metadata_isolation_source_lower = str(metadata_isolation_source).lower().strip() if metadata_isolation_source_lower not in acceptable_isolation_sources: - logger.debug("Skipping %s: isolation source '%s' not in required list %s", - accession, metadata_isolation_source, isolation_source_list) - filter_stats['isolation_source'] += 1 + logger.debug( + "Skipping %s: isolation source '%s' not in required list %s", + accession, + metadata_isolation_source, + isolation_source_list, + ) + filter_stats["isolation_source"] += 1 continue - # FILTER 11: Collection date range filter if min_collection_date is not None or max_collection_date is not None: date_str = metadata.get("isolate", {}).get("collectionDate", "") - + # Skip records with empty or missing collection date if not date_str: logger.debug("Skipping %s: missing or empty collection date", accession) - filter_stats['collection_date'] += 1 + filter_stats["collection_date"] += 1 continue - + # Handle partial dates (year-only or year-month) appropriately for range checks # For min_collection_date: use end of partial range (record COULD be >= min) # For max_collection_date: use start of partial range 
(record COULD be <= max) - + skip_record = False - + if min_collection_date: try: # Parse with for_min_comparison=True to use end of range for partial dates @@ -7517,20 +7906,25 @@ def filter_metadata_only( ) except ValueError: logger.debug("Skipping %s: invalid collection date format '%s'", accession, date_str) - filter_stats['collection_date'] += 1 + filter_stats["collection_date"] += 1 continue - + if date_for_min is None: logger.debug("Skipping %s: missing or invalid collection date '%s'", accession, date_str) - filter_stats['collection_date'] += 1 + filter_stats["collection_date"] += 1 continue - + if date_for_min < min_collection_date: - logger.debug("Skipping %s: collection date %s (from '%s') < min %s", - accession, date_for_min, date_str, min_collection_date) - filter_stats['collection_date'] += 1 + logger.debug( + "Skipping %s: collection date %s (from '%s') < min %s", + accession, + date_for_min, + date_str, + min_collection_date, + ) + filter_stats["collection_date"] += 1 skip_record = True - + if not skip_record and max_collection_date: try: # Parse with for_min_comparison=False to use start of range for partial dates @@ -7539,35 +7933,40 @@ def filter_metadata_only( ) except ValueError: logger.debug("Skipping %s: invalid collection date format '%s'", accession, date_str) - filter_stats['collection_date'] += 1 + filter_stats["collection_date"] += 1 continue - + if date_for_max is None: logger.debug("Skipping %s: missing or invalid collection date '%s'", accession, date_str) - filter_stats['collection_date'] += 1 + filter_stats["collection_date"] += 1 continue - + if date_for_max > max_collection_date: - logger.debug("Skipping %s: collection date %s (from '%s') > max %s", - accession, date_for_max, date_str, max_collection_date) - filter_stats['collection_date'] += 1 + logger.debug( + "Skipping %s: collection date %s (from '%s') > max %s", + accession, + date_for_max, + date_str, + max_collection_date, + ) + filter_stats["collection_date"] += 1 
skip_record = True - + if skip_record: continue # FILTER 12: Maximum release date filter if max_release_date is not None: release_date_str = metadata.get("releaseDate") - + if not release_date_str: logger.debug("Skipping %s: missing release date", accession) - filter_stats['release_date'] += 1 + filter_stats["release_date"] += 1 continue - + # Strip time portion if present (e.g., "2024-02-14T00:00:00Z" -> "2024-02-14") release_date_str_clean = release_date_str.split("T")[0] - + try: # For max comparison, use START of range for partial dates # so that records that COULD be within range are included @@ -7576,39 +7975,44 @@ def filter_metadata_only( ) except ValueError: logger.debug("Skipping %s: invalid release date format '%s'", accession, release_date_str) - filter_stats['release_date'] += 1 + filter_stats["release_date"] += 1 continue - + if release_date_value is None: logger.debug("Skipping %s: invalid release date '%s'", accession, release_date_str) - filter_stats['release_date'] += 1 + filter_stats["release_date"] += 1 continue - + if release_date_value > max_release_date: - logger.debug("Skipping %s: release date %s (from '%s') > max %s", - accession, release_date_value, release_date_str, max_release_date) - filter_stats['release_date'] += 1 + logger.debug( + "Skipping %s: release date %s (from '%s') > max %s", + accession, + release_date_value, + release_date_str, + max_release_date, + ) + filter_stats["release_date"] += 1 continue # FILTER 13: Mature peptide count filters # if min_mature_peptide_count is not None or max_mature_peptide_count is not None: # mature_peptide_count = metadata.get("maturePeptideCount") - + # if mature_peptide_count is None: # logger.debug("Skipping %s: missing mature peptide count", accession) # filter_stats['mature_peptide_count'] += 1 # continue - - # if (min_mature_peptide_count is not None and + + # if (min_mature_peptide_count is not None and # mature_peptide_count < min_mature_peptide_count): - # logger.debug("Skipping %s: 
mature peptide count %d < min %d", + # logger.debug("Skipping %s: mature peptide count %d < min %d", # accession, mature_peptide_count, min_mature_peptide_count) # filter_stats['mature_peptide_count'] += 1 # continue - - # if (max_mature_peptide_count is not None and + + # if (max_mature_peptide_count is not None and # mature_peptide_count > max_mature_peptide_count): - # logger.debug("Skipping %s: mature peptide count %d > max %d", + # logger.debug("Skipping %s: mature peptide count %d > max %d", # accession, mature_peptide_count, max_mature_peptide_count) # filter_stats['mature_peptide_count'] += 1 # continue @@ -7616,45 +8020,44 @@ def filter_metadata_only( # FILTER 14: Protein count filters if min_protein_count is not None or max_protein_count is not None: protein_count = metadata.get("proteinCount") - + if protein_count is None: logger.debug("Skipping %s: missing protein count", accession) - filter_stats['protein_count'] += 1 + filter_stats["protein_count"] += 1 continue - + if min_protein_count is not None and protein_count < min_protein_count: - logger.debug("Skipping %s: protein count %d < min %d", - accession, protein_count, min_protein_count) - filter_stats['protein_count'] += 1 + logger.debug("Skipping %s: protein count %d < min %d", accession, protein_count, min_protein_count) + filter_stats["protein_count"] += 1 continue - + if max_protein_count is not None and protein_count > max_protein_count: - logger.debug("Skipping %s: protein count %d > max %d", - accession, protein_count, max_protein_count) - filter_stats['protein_count'] += 1 + logger.debug("Skipping %s: protein count %d > max %d", accession, protein_count, max_protein_count) + filter_stats["protein_count"] += 1 continue # FILTER 15: Segment filter - simple case-insensitive matching if segment is not None: # Convert segment to list if it's a string segment_list = [segment] if isinstance(segment, str) else segment - + # Get segment from metadata metadata_segment = metadata.get("segment") - + if 
not metadata_segment: - filter_stats['segment'] += 1 + filter_stats["segment"] += 1 continue - + # Build set of acceptable segment values (case-insensitive) acceptable_segments = {s.lower().strip() for s in segment_list} - + # Check if metadata segment matches any acceptable value (case-insensitive) metadata_segment_lower = str(metadata_segment).lower().strip() if metadata_segment_lower not in acceptable_segments: - logger.debug("Skipping %s: segment '%s' not in required list %s", - accession, metadata_segment, segment_list) - filter_stats['segment'] += 1 + logger.debug( + "Skipping %s: segment '%s' not in required list %s", accession, metadata_segment, segment_list + ) + filter_stats["segment"] += 1 continue # FILTER 16: Vaccine strain filter @@ -7663,12 +8066,12 @@ def filter_metadata_only( if vaccine_strain: if not is_vaccine: logger.debug("Skipping %s: not a vaccine strain (required)", accession) - filter_stats['vaccine_strain'] += 1 + filter_stats["vaccine_strain"] += 1 continue - if vaccine_strain == False: + if not vaccine_strain: if is_vaccine: logger.debug("Skipping %s: is a vaccine strain (not allowed)", accession) - filter_stats['vaccine_strain'] += 1 + filter_stats["vaccine_strain"] += 1 continue # FILTER 17: Source database filter @@ -7676,104 +8079,119 @@ def filter_metadata_only( source_db = (metadata.get("sourceDatabase") or "").lower() if not source_db: logger.debug("Skipping %s: missing source database", accession) - filter_stats['source_database'] += 1 + filter_stats["source_database"] += 1 continue - + if source_db != source_database.lower(): - logger.debug("Skipping %s: source database '%s' != required '%s'", - accession, source_db, source_database.lower()) - filter_stats['source_database'] += 1 + logger.debug( + "Skipping %s: source database '%s' != required '%s'", accession, source_db, source_database.lower() + ) + filter_stats["source_database"] += 1 continue # FILTER 18: Geographic location filter (deferred from API when server-side filter 
fails) if geographic_location is not None: # Convert geographic_location to list if it's a string geo_location_list = [geographic_location] if isinstance(geographic_location, str) else geographic_location - + # Get geographic location from metadata - stored in "location", "region", and "virusName" fields metadata_location = metadata.get("location", "") or "" metadata_region = metadata.get("region", "") or "" metadata_virus_name = metadata.get("virusName", "") or "" - + # Only skip if ALL location sources are empty if not metadata_location and not metadata_region and not metadata_virus_name: logger.debug("Skipping %s: missing location, region, and virusName", accession) - filter_stats['geographic_location'] += 1 + filter_stats["geographic_location"] += 1 continue - + # Build set of acceptable location values (case-insensitive) # Normalize the filter values: remove special chars and create variations acceptable_locations = set() for loc in geo_location_list: loc_normalized = loc.lower().strip() # Remove common separators and create variations - loc_normalized = loc_normalized.replace('-', ' ').replace('_', ' ').replace('+', ' ') + loc_normalized = loc_normalized.replace("-", " ").replace("_", " ").replace("+", " ") acceptable_locations.add(loc_normalized) # Also add version without spaces for matching - acceptable_locations.add(loc_normalized.replace(' ', '')) - + acceptable_locations.add(loc_normalized.replace(" ", "")) + # Normalize metadata location and region for comparison metadata_location_lower = str(metadata_location).lower().strip() - metadata_location_normalized = metadata_location_lower.replace('-', ' ').replace('_', ' ') - metadata_location_no_spaces = metadata_location_normalized.replace(' ', '') - + metadata_location_normalized = metadata_location_lower.replace("-", " ").replace("_", " ") + metadata_location_no_spaces = metadata_location_normalized.replace(" ", "") + metadata_region_lower = str(metadata_region).lower().strip() - 
metadata_region_normalized = metadata_region_lower.replace('-', ' ').replace('_', ' ') - metadata_region_no_spaces = metadata_region_normalized.replace(' ', '') + metadata_region_normalized = metadata_region_lower.replace("-", " ").replace("_", " ") + metadata_region_no_spaces = metadata_region_normalized.replace(" ", "") metadata_virus_name_lower = str(metadata_virus_name).lower().strip() - + # Check for partial/substring match in location, region, OR virusName location_match = False for acceptable_loc in acceptable_locations: # Check location field - if metadata_location_normalized and (acceptable_loc in metadata_location_normalized or acceptable_loc in metadata_location_no_spaces): + if metadata_location_normalized and ( + acceptable_loc in metadata_location_normalized or acceptable_loc in metadata_location_no_spaces + ): location_match = True break # Also check if metadata location is contained in acceptable location - if metadata_location_normalized and (metadata_location_normalized in acceptable_loc or metadata_location_no_spaces in acceptable_loc): + if metadata_location_normalized and ( + metadata_location_normalized in acceptable_loc or metadata_location_no_spaces in acceptable_loc + ): location_match = True break # Check region field - if metadata_region_normalized and (acceptable_loc in metadata_region_normalized or acceptable_loc in metadata_region_no_spaces): + if metadata_region_normalized and ( + acceptable_loc in metadata_region_normalized or acceptable_loc in metadata_region_no_spaces + ): location_match = True break # Also check if metadata region is contained in acceptable location - if metadata_region_normalized and (metadata_region_normalized in acceptable_loc or metadata_region_no_spaces in acceptable_loc): + if metadata_region_normalized and ( + metadata_region_normalized in acceptable_loc or metadata_region_no_spaces in acceptable_loc + ): location_match = True break # Check virusName as fallback (for older records where location is in the 
name) if metadata_virus_name_lower and acceptable_loc in metadata_virus_name_lower: location_match = True break - + if not location_match: - logger.debug("Skipping %s: location '%s', region '%s', virusName '%s' do not match required '%s'", - accession, metadata_location, metadata_region, metadata_virus_name, geo_location_list) - filter_stats['geographic_location'] += 1 + logger.debug( + "Skipping %s: location '%s', region '%s', virusName '%s' do not match required '%s'", + accession, + metadata_location, + metadata_region, + metadata_virus_name, + geo_location_list, + ) + filter_stats["geographic_location"] += 1 continue # FILTER 19: Host filter (deferred from API when server-side filter fails) if host is not None: # Convert host to list if it's a string host_list = [host] if isinstance(host, str) else host - + # Get host from metadata - stored in "hostName" field metadata_host = metadata.get("hostName", "") or "" - + if not metadata_host: logger.debug("Skipping %s: missing hostName", accession) - filter_stats['host'] += 1 + filter_stats["host"] += 1 continue - + # Build set of acceptable host values (case-insensitive) acceptable_hosts = set() for h in host_list: h_normalized = h.lower().strip() acceptable_hosts.add(h_normalized) - + # Normalize metadata host for comparison metadata_host_lower = str(metadata_host).lower().strip() - + # Check for partial/substring match in host host_match = False for acceptable_host in acceptable_hosts: @@ -7781,22 +8199,23 @@ def filter_metadata_only( if acceptable_host in metadata_host_lower or metadata_host_lower in acceptable_host: host_match = True break - + if not host_match: - logger.debug("Skipping %s: hostName '%s' does not match required '%s'", - accession, metadata_host, host_list) - filter_stats['host'] += 1 + logger.debug( + "Skipping %s: hostName '%s' does not match required '%s'", accession, metadata_host, host_list + ) + filter_stats["host"] += 1 continue # If we reach this point, the metadata record has passed all 
filters filtered_accessions.append(accession) filtered_metadata_list.append(metadata) - + logger.debug("Metadata %s passed all filters", accession) # Log comprehensive filtering statistics num_filtered = len(filtered_accessions) - + if num_filtered == 0: # Simplified output when nothing passes filters logger.info("=================================") @@ -7814,7 +8233,7 @@ def filter_metadata_only( logger.info("Metadata-only filtering complete:") logger.info(" Total metadata records: %d", total_sequences) logger.info(" Records passing filters: %d", num_filtered) - + # Log detailed filter statistics if any records were filtered out total_filtered = sum(filter_stats.values()) if total_filtered > 0: @@ -7822,7 +8241,7 @@ def filter_metadata_only( for filter_name, count in filter_stats.items(): if count > 0: logger.info(" %s: %d records", filter_name, count) - + return filtered_accessions, filtered_metadata_list, filter_stats @@ -7838,13 +8257,12 @@ def filter_genbank_metadata( gen_mol_type=None, env_source=None, ): - """ - Filter accessions based on GenBank-specific metadata fields. - + """Filter accessions based on GenBank-specific metadata fields. + This function filters accessions using metadata extracted from GenBank XML that is not available in the standard NCBI Datasets API. These fields require fetching full GenBank records via E-utilities. - + Filtering is done using the parsed genbank_metadata dictionary where each accession maps to metadata containing a 'genbank_data' sub-dictionary with: - gene_count: Number of gene features @@ -7856,7 +8274,7 @@ def filter_genbank_metadata( - isolation_source: Sample isolation source (for env_source filter) - host: Host information (used to determine env_source when empty) - comment: GenBank comment field (may contain env_source keywords) - + Args: genbank_metadata (dict): Dictionary mapping accession numbers to metadata with 'genbank_data' sub-dictionary from _parse_genbank_xml. 
@@ -7874,29 +8292,39 @@ def filter_genbank_metadata( env_source (str or list, optional): Environmental source keywords to match. Searches isolation_source and note fields for keywords like 'sewage', 'ocean water', 'sea', etc. when host is empty or not human. - - Returns: + + Returns + ------- list: List of accession numbers that passed all GenBank-based filters. - + Example: >>> genbank_filtered = filter_genbank_metadata( ... genbank_metadata=genbank_data, ... min_gene_count=5, ... provirus=False, - ... genotype=['H5N1', 'H5N8'], - ... has_proteins='hemagglutinin', + ... genotype=["H5N1", "H5N8"], + ... has_proteins="hemagglutinin", ... ) + """ - logger.info("Starting GenBank metadata filtering...") - logger.debug("GenBank filters: gene_count(%s-%s), mature_peptide(%s-%s), provirus=%s, genotype=%s, has_proteins=%s, mol_type=%s, env_source=%s", - min_gene_count, max_gene_count, min_mature_peptide_count, max_mature_peptide_count, - provirus, genotype, has_proteins, gen_mol_type, env_source) - + logger.debug( + "GenBank filters: gene_count(%s-%s), mature_peptide(%s-%s), provirus=%s, genotype=%s, has_proteins=%s, mol_type=%s, env_source=%s", + min_gene_count, + max_gene_count, + min_mature_peptide_count, + max_mature_peptide_count, + provirus, + genotype, + has_proteins, + gen_mol_type, + env_source, + ) + if not genbank_metadata: logger.warning("No GenBank metadata provided for filtering") return [] - + # Convert single string filters to lists for uniform processing if isinstance(genotype, str): genotype = [genotype] @@ -7904,86 +8332,108 @@ def filter_genbank_metadata( has_proteins = [has_proteins] if isinstance(env_source, str): env_source = [env_source] - + # Environmental source keywords to search for - env_keywords = ['sewage', 'wastewater', 'ocean', 'sea', 'river', 'lake', - 'pond', 'water', 'soil', 'environment', 'feces', 'fecal', - 'stool', 'manure', 'avian', 'bird', 'poultry', 'swine', 'pig'] - + env_keywords = [ + "sewage", + "wastewater", + "ocean", + 
"sea", + "river", + "lake", + "pond", + "water", + "soil", + "environment", + "feces", + "fecal", + "stool", + "manure", + "avian", + "bird", + "poultry", + "swine", + "pig", + ] + # Filter statistics filter_stats = { - 'gene_count': 0, - 'mature_peptide_count': 0, - 'provirus': 0, - 'genotype': 0, - 'has_proteins': 0, - 'mol_type': 0, - 'env_source': 0, - 'no_genbank_data': 0, + "gene_count": 0, + "mature_peptide_count": 0, + "provirus": 0, + "genotype": 0, + "has_proteins": 0, + "mol_type": 0, + "env_source": 0, + "no_genbank_data": 0, } - + filtered_accessions = [] total_sequences = len(genbank_metadata) - + for accession, metadata in genbank_metadata.items(): # Get genbank_data sub-dictionary - gb_data = metadata.get('genbank_data', {}) + gb_data = metadata.get("genbank_data", {}) if not gb_data: logger.debug("Skipping %s: no genbank_data", accession) - filter_stats['no_genbank_data'] += 1 + filter_stats["no_genbank_data"] += 1 continue - + # FILTER 1: Gene count if min_gene_count is not None or max_gene_count is not None: - gene_count = gb_data.get('gene_count', 0) - + gene_count = gb_data.get("gene_count", 0) + if min_gene_count is not None and gene_count < min_gene_count: logger.debug("Skipping %s: gene count %d < min %d", accession, gene_count, min_gene_count) - filter_stats['gene_count'] += 1 + filter_stats["gene_count"] += 1 continue - + if max_gene_count is not None and gene_count > max_gene_count: logger.debug("Skipping %s: gene count %d > max %d", accession, gene_count, max_gene_count) - filter_stats['gene_count'] += 1 + filter_stats["gene_count"] += 1 continue - + # FILTER 2: Mature peptide count if min_mature_peptide_count is not None or max_mature_peptide_count is not None: - mat_count = gb_data.get('mature_peptide_count', 0) - + mat_count = gb_data.get("mature_peptide_count", 0) + if min_mature_peptide_count is not None and mat_count < min_mature_peptide_count: - logger.debug("Skipping %s: mature peptide count %d < min %d", accession, mat_count, 
min_mature_peptide_count) - filter_stats['mature_peptide_count'] += 1 + logger.debug( + "Skipping %s: mature peptide count %d < min %d", accession, mat_count, min_mature_peptide_count + ) + filter_stats["mature_peptide_count"] += 1 continue - + if max_mature_peptide_count is not None and mat_count > max_mature_peptide_count: - logger.debug("Skipping %s: mature peptide count %d > max %d", accession, mat_count, max_mature_peptide_count) - filter_stats['mature_peptide_count'] += 1 + logger.debug( + "Skipping %s: mature peptide count %d > max %d", accession, mat_count, max_mature_peptide_count + ) + filter_stats["mature_peptide_count"] += 1 continue - + # FILTER 3: Proviral status if provirus is not None: - is_proviral = gb_data.get('proviral', False) - + is_proviral = gb_data.get("proviral", False) + if provirus and not is_proviral: logger.debug("Skipping %s: not proviral (required)", accession) - filter_stats['provirus'] += 1 + filter_stats["provirus"] += 1 continue - + if provirus is False and is_proviral: logger.debug("Skipping %s: is proviral (excluded)", accession) - filter_stats['provirus'] += 1 + filter_stats["provirus"] += 1 continue - + # FILTER 4: Genotype (from serotype or note field) if genotype is not None: - record_genotype = (gb_data.get('genotype', '') or '').lower().strip() - + record_genotype = (gb_data.get("genotype", "") or "").lower().strip() + if not record_genotype: logger.debug("Skipping %s: missing genotype", accession) - filter_stats['genotype'] += 1 + filter_stats["genotype"] += 1 continue - + # Check if any requested genotype matches (case-insensitive, partial match) genotype_match = False for g in genotype: @@ -7991,17 +8441,17 @@ def filter_genbank_metadata( if g_lower in record_genotype or record_genotype in g_lower: genotype_match = True break - + if not genotype_match: logger.debug("Skipping %s: genotype '%s' not in required %s", accession, record_genotype, genotype) - filter_stats['genotype'] += 1 + filter_stats["genotype"] += 1 
continue - + # FILTER 5: Has proteins (check product names) if has_proteins is not None: - products = gb_data.get('products', []) + products = gb_data.get("products", []) products_lower = [p.lower() for p in products] - + # Check that AT LEAST ONE required protein is present any_protein_found = False for protein in has_proteins: @@ -8012,40 +8462,41 @@ def filter_genbank_metadata( break if any_protein_found: break - + if not any_protein_found: - logger.debug("Skipping %s: none of required proteins %s found in %s", - accession, has_proteins, products[:5]) - filter_stats['has_proteins'] += 1 + logger.debug( + "Skipping %s: none of required proteins %s found in %s", accession, has_proteins, products[:5] + ) + filter_stats["has_proteins"] += 1 continue - + # FILTER 6: Molecule type (gen_mol_type) if gen_mol_type is not None: - mol_type = (gb_data.get('mol_type', '') or '').lower().strip() - + mol_type = (gb_data.get("mol_type", "") or "").lower().strip() + if not mol_type: logger.debug("Skipping %s: missing mol_type", accession) - filter_stats['mol_type'] += 1 + filter_stats["mol_type"] += 1 continue - + gen_mol_type_lower = gen_mol_type.lower().strip() - + # Define molecule type mappings for common aliases # dsDNA/ssRNA etc are structural classifications while GenBank uses "genomic DNA/RNA" mol_type_mappings = { - 'dsdna': ['genomic dna', 'dna'], - 'ssdna': ['genomic dna', 'dna'], - 'dsrna': ['genomic rna', 'rna'], - 'ssrna': ['genomic rna', 'rna', 'mrna', 'viral crna'], - 'dna': ['genomic dna', 'dna'], - 'rna': ['genomic rna', 'rna', 'mrna', 'viral crna'], - 'genomic dna': ['genomic dna', 'dna'], - 'genomic rna': ['genomic rna', 'rna'], + "dsdna": ["genomic dna", "dna"], + "ssdna": ["genomic dna", "dna"], + "dsrna": ["genomic rna", "rna"], + "ssrna": ["genomic rna", "rna", "mrna", "viral crna"], + "dna": ["genomic dna", "dna"], + "rna": ["genomic rna", "rna", "mrna", "viral crna"], + "genomic dna": ["genomic dna", "dna"], + "genomic rna": ["genomic rna", "rna"], } - 
+ # Check if the filter matches directly or via mapping mol_type_match = False - + # First check direct/partial match if gen_mol_type_lower in mol_type or mol_type in gen_mol_type_lower: mol_type_match = True @@ -8056,33 +8507,33 @@ def filter_genbank_metadata( if mapped_val in mol_type: mol_type_match = True break - + if not mol_type_match: logger.debug("Skipping %s: mol_type '%s' != required '%s'", accession, mol_type, gen_mol_type) - filter_stats['mol_type'] += 1 + filter_stats["mol_type"] += 1 continue - + # FILTER 7: Environmental source (env_source) if env_source is not None: # Get isolation_source, note, and check host - isolation_source = (gb_data.get('isolation_source', '') or '').lower() - host = (gb_data.get('host', '') or '').lower() - comment = (gb_data.get('comment', '') or '').lower() - all_features = gb_data.get('all_features', {}) - + isolation_source = (gb_data.get("isolation_source", "") or "").lower() + host = (gb_data.get("host", "") or "").lower() + comment = (gb_data.get("comment", "") or "").lower() + all_features = gb_data.get("all_features", {}) + # Get note from source feature - source_feature = all_features.get('source', {}) + source_feature = all_features.get("source", {}) if isinstance(source_feature, list): source_feature = source_feature[0] - note = (source_feature.get('note', '') or '').lower() - + note = (source_feature.get("note", "") or "").lower() + # Combine all text to search search_text = f"{isolation_source} {note} {comment}" - + # If host is specified and is human, skip env_source filtering # (env_source is for non-human/environmental samples) - is_human_host = 'human' in host or 'homo sapiens' in host - + is_human_host = "human" in host or "homo sapiens" in host + env_match = False for env_term in env_source: env_term_lower = env_term.lower().strip() @@ -8096,21 +8547,20 @@ def filter_genbank_metadata( break if env_match: break - + # Only filter out if explicitly looking for environmental and not found if not env_match and 
not is_human_host: - logger.debug("Skipping %s: env_source '%s' not found in isolation_source/note", - accession, env_source) - filter_stats['env_source'] += 1 + logger.debug("Skipping %s: env_source '%s' not found in isolation_source/note", accession, env_source) + filter_stats["env_source"] += 1 continue - + # If we reach here, the record passed all filters filtered_accessions.append(accession) logger.debug("GenBank metadata %s passed all filters", accession) - + # Log filtering summary num_filtered = len(filtered_accessions) - + if num_filtered == 0: logger.info("=================================") logger.info("GenBank filtering complete: 0 of %d records passed filters", total_sequences) @@ -8126,14 +8576,14 @@ def filter_genbank_metadata( logger.info("GenBank filtering complete:") logger.info(" Total GenBank records: %d", total_sequences) logger.info(" Records passing filters: %d", num_filtered) - + total_excluded = sum(filter_stats.values()) if total_excluded > 0: logger.info("Filter statistics (records excluded):") for filter_name, count in filter_stats.items(): if count > 0: logger.info(" %s: %d records", filter_name, count) - + return filtered_accessions, filter_stats @@ -8178,18 +8628,17 @@ def virus( isolate=None, genotype=None, isolation_source=None, - env_source=None, + env_source=None, submitter_name=None, submitter_institution=None, - gen_mol_type=None, + gen_mol_type=None, # assembly_completeness=None, api_key=None, baseline_metadata=None, merge_results=True, verbose=True, - ): - """ - Download a virus genome dataset from the NCBI Virus database (https://www.ncbi.nlm.nih.gov/labs/virus/). +): + """Download a virus genome dataset from the NCBI Virus database (https://www.ncbi.nlm.nih.gov/labs/virus/). 
This is the main function that orchestrates the entire virus data retrieval process, now optimized to download sequences only after all metadata-based filtering: @@ -8211,29 +8660,30 @@ def virus( if api_key: logger.info("Using NCBI API key for higher rate limits (10 req/sec)") else: - logger.info("No NCBI API key provided. Using default rate limit (3 req/sec). " - "Set NCBI_API_KEY env var or pass --api_key for faster requests.") + logger.info( + "No NCBI API key provided. Using default rate limit (3 req/sec). " + "Set NCBI_API_KEY env var or pass --api_key for faster requests." + ) # Save the original logger level and set it based on verbose parameter original_logger_level = logger.level if not verbose: logger.setLevel(logging.CRITICAL) - + logger.info("Starting virus data retrieval process...") - + # Capture the command line for summary command_line = " ".join(sys.argv) if len(sys.argv) > 0 else "virus (called programmatically)" - + # Track wall-clock runtime _virus_start_time = time.time() - + # Initialize variables for tracking results total_api_records = 0 total_after_metadata_filter = 0 total_final_sequences = 0 output_files_dict = {} final_metadata_for_summary = [] - filtered_sequences = [] refseq_only = False # Initialize filter stats for each stage (populated by filter functions) @@ -8242,156 +8692,177 @@ def virus( sequence_filter_stats = {} total_after_genbank_filter = None total_after_sequence_filter = None - + # Initialize failed commands tracker for tracking all types of failures failed_commands = { - 'api_timeout': None, - 'empty_response': None, - 'sequence_batches': [], - 'genbank_batches': [], - 'api_batches': [], - 'pagination_timeouts': [], - 'pagination_errors': [], - 'sequence_fetch': [], + "api_timeout": None, + "empty_response": None, + "sequence_batches": [], + "genbank_batches": [], + "api_batches": [], + "pagination_timeouts": [], + "pagination_errors": [], + "sequence_fetch": [], } - + # Track if GenBank metadata was successfully 
retrieved genbank_success = False genbank_error_msg = None if download_all_accessions: logger.info("ATTENTION: Download all accessions mode is active.") - logger.info("This will download ALL virus accessions from NCBI, which can be a very large dataset and take a long time.") - virus = NCBI_ALL_VIRUSES_TAXID # NCBI taxonomy ID for all Viruses + logger.info( + "This will download ALL virus accessions from NCBI, which can be a very large dataset and take a long time." + ) + virus = NCBI_ALL_VIRUSES_TAXID # NCBI taxonomy ID for all Viruses is_accession = False - logger.info("Overriding virus query and accession tag to fetch all viruses using taxon ID: %s. Filters remain unchanged.", virus) + logger.info( + "Overriding virus query and accession tag to fetch all viruses using taxon ID: %s. Filters remain unchanged.", + virus, + ) - logger.info("Query parameters: virus='%s', is_accession=%s, outfolder='%s'", - virus, is_accession, outfolder) - logger.debug("Applied filters: host=%s, seq_length=(%s-%s), gene_count=(%s-%s), completeness=%s, annotated=%s, source_db(%s), keep_temp=%s, lab_passaged=%s, geographic_location=%s, submitter_country=%s, submitter_name=%s, submitter_institution=%s, collection_date=(%s-%s), release_date=(%s-%s), protein_count=(%s-%s), mature_peptide_count=(%s-%s), max_ambiguous=%s, has_proteins=%s, proteins_complete=%s, segment=%s, vaccine_strain=%s, lineage=%s, provirus=%s, isolate=%s, genotype=%s, isolation_source=%s, env_source=%s, gen_mol_type=%s, genbank_metadata=%s, genbank_batch_size=%s", - host, min_seq_length, max_seq_length, min_gene_count, max_gene_count, nuc_completeness, annotated, source_database, keep_temp, lab_passaged, geographic_location, submitter_country, submitter_name, submitter_institution, min_collection_date, max_collection_date, min_release_date, max_release_date, min_protein_count, max_protein_count, min_mature_peptide_count, max_mature_peptide_count, max_ambiguous_chars, has_proteins, proteins_complete, segment, 
vaccine_strain, lineage, provirus, isolate, genotype, isolation_source, env_source, gen_mol_type, genbank_metadata, genbank_batch_size) + logger.info("Query parameters: virus='%s', is_accession=%s, outfolder='%s'", virus, is_accession, outfolder) + logger.debug( + "Applied filters: host=%s, seq_length=(%s-%s), gene_count=(%s-%s), completeness=%s, annotated=%s, source_db(%s), keep_temp=%s, lab_passaged=%s, geographic_location=%s, submitter_country=%s, submitter_name=%s, submitter_institution=%s, collection_date=(%s-%s), release_date=(%s-%s), protein_count=(%s-%s), mature_peptide_count=(%s-%s), max_ambiguous=%s, has_proteins=%s, proteins_complete=%s, segment=%s, vaccine_strain=%s, lineage=%s, provirus=%s, isolate=%s, genotype=%s, isolation_source=%s, env_source=%s, gen_mol_type=%s, genbank_metadata=%s, genbank_batch_size=%s", + host, + min_seq_length, + max_seq_length, + min_gene_count, + max_gene_count, + nuc_completeness, + annotated, + source_database, + keep_temp, + lab_passaged, + geographic_location, + submitter_country, + submitter_name, + submitter_institution, + min_collection_date, + max_collection_date, + min_release_date, + max_release_date, + min_protein_count, + max_protein_count, + min_mature_peptide_count, + max_mature_peptide_count, + max_ambiguous_chars, + has_proteins, + proteins_complete, + segment, + vaccine_strain, + lineage, + provirus, + isolate, + genotype, + isolation_source, + env_source, + gen_mol_type, + genbank_metadata, + genbank_batch_size, + ) # SECTION 1: INPUT VALIDATION AND OUTPUT DIRECTORY SETUP # Validate and normalize input arguments before proceeding logger.info("=" * 60) logger.info("STEP 1: VALIDATING INPUT ARGUMENTS AND OUTPUT DIRECTORY SETUP...") logger.info("=" * 60) - + # Validate virus parameter if virus is None or (isinstance(virus, str) and virus.strip() == ""): - raise ValueError( - "Argument 'virus' must be a non-empty string (virus name, taxon ID, or accession number)." 
- ) - + raise ValueError("Argument 'virus' must be a non-empty string (virus name, taxon ID, or accession number).") + # Validate that both host and env_source filters are not used together, as they are mutually exclusive if host is not None and env_source is not None: - raise ValueError("Both 'host' and 'env_source' filters are specified. If there is a host, there is no environmental source. Use only one of these filters for results.") - + raise ValueError( + "Both 'host' and 'env_source' filters are specified. If there is a host, there is no environmental source. Use only one of these filters for results." + ) + # Normalize host parameter: convert "human" to "Homo sapiens" for NCBI API compatibility if host is not None and host.strip().lower() == "human": logger.debug("Normalizing host 'human' to 'Homo sapiens' for NCBI API compatibility") host = "Homo sapiens" - + # Validate nucleotide completeness argument if nuc_completeness is not None: nuc_completeness = nuc_completeness.lower() # Normalize to lowercase if nuc_completeness not in ["partial", "complete"]: - raise ValueError( - "Argument 'nuc_completeness' must be 'partial', 'complete', or None." - ) + raise ValueError("Argument 'nuc_completeness' must be 'partial', 'complete', or None.") logger.debug("Nucleotide completeness filter set to: %s", nuc_completeness) # Validate source database argument if source_database is not None: source_database = source_database.lower() # Normalize to lowercase if source_database not in ["refseq", "genbank"]: - raise ValueError( - "Argument 'source_database' must be 'refseq', 'genbank', or None." 
- ) + raise ValueError("Argument 'source_database' must be 'refseq', 'genbank', or None.") elif source_database == "refseq": refseq_only = True logger.debug("Source database filter set to RefSeq only") logger.debug("Source database filter set to: %s", source_database) - + # Validate boolean arguments with proper type checking if annotated is not None and not isinstance(annotated, bool): - raise TypeError( - "Argument 'annotated' must be a boolean (True or False) or None." - ) - + raise TypeError("Argument 'annotated' must be a boolean (True or False) or None.") + if lab_passaged is not None and not isinstance(lab_passaged, bool): - raise TypeError( - "Argument 'lab_passaged' must be a boolean (True or False) or None." - ) - + raise TypeError("Argument 'lab_passaged' must be a boolean (True or False) or None.") + if proteins_complete is not None and not isinstance(proteins_complete, bool): - raise TypeError( - "Argument 'proteins_complete' must be a boolean (True or False)." - ) + raise TypeError("Argument 'proteins_complete' must be a boolean (True or False).") # if refseq_only is not None and not isinstance(refseq_only, bool): # raise TypeError( # "Argument 'refseq_only' must be a boolean (True or False)." - # ) - + # ) + if keep_temp is not None and not isinstance(keep_temp, bool): - raise TypeError( - "Argument 'keep_temp' must be a boolean (True or False)." - ) + raise TypeError("Argument 'keep_temp' must be a boolean (True or False).") if is_accession is not None and not isinstance(is_accession, bool): - raise TypeError( - "Argument 'is_accession' must be a boolean (True or False)." - ) - + raise TypeError("Argument 'is_accession' must be a boolean (True or False).") + if genbank_metadata is not None and not isinstance(genbank_metadata, bool): - raise TypeError( - "Argument 'genbank_metadata' must be a boolean (True or False)." 
- ) + raise TypeError("Argument 'genbank_metadata' must be a boolean (True or False).") if vaccine_strain is not None and not isinstance(vaccine_strain, bool): - raise TypeError( - "Argument 'vaccine_strain' must be a boolean (True or False) or None." - ) + raise TypeError("Argument 'vaccine_strain' must be a boolean (True or False) or None.") if provirus is not None and not isinstance(provirus, bool): - raise TypeError( - "Argument 'provirus' must be a boolean (True or False) or None." - ) - + raise TypeError("Argument 'provirus' must be a boolean (True or False) or None.") + if genbank_batch_size is not None: if not isinstance(genbank_batch_size, int) or genbank_batch_size <= 0: - raise ValueError( - "Argument 'genbank_batch_size' must be a positive integer." - ) + raise ValueError("Argument 'genbank_batch_size' must be a positive integer.") if genbank_batch_size > GENBANK_MAX_BATCH_SIZE_WARNING: - logger.warning("Large genbank_batch_size (%d) may cause API timeouts. Consider using smaller batches.", genbank_batch_size) - + logger.warning( + "Large genbank_batch_size (%d) may cause API timeouts. 
Consider using smaller batches.", + genbank_batch_size, + ) + if genbank_metadata: logger.info("GenBank metadata retrieval enabled (batch_size=%d)", genbank_batch_size) else: # Check if any GenBank-dependent filters are specified genbank_dependent_filters = { - 'provirus': provirus, - 'genotype': genotype, - 'has_proteins': has_proteins, - 'gen_mol_type': gen_mol_type, - 'env_source': env_source, - 'min_gene_count': min_gene_count, - 'max_gene_count': max_gene_count, - 'min_mature_peptide_count': min_mature_peptide_count, - 'max_mature_peptide_count': max_mature_peptide_count, + "provirus": provirus, + "genotype": genotype, + "has_proteins": has_proteins, + "gen_mol_type": gen_mol_type, + "env_source": env_source, + "min_gene_count": min_gene_count, + "max_gene_count": max_gene_count, + "min_mature_peptide_count": min_mature_peptide_count, + "max_mature_peptide_count": max_mature_peptide_count, } active_genbank_filters = [k for k, v in genbank_dependent_filters.items() if v is not None] - + if active_genbank_filters: - logger.info("GenBank-dependent filters detected: %s", ', '.join(active_genbank_filters)) + logger.info("GenBank-dependent filters detected: %s", ", ".join(active_genbank_filters)) logger.info("Automatically enabling GenBank metadata retrieval (-g flag)") genbank_metadata = True else: logger.debug("GenBank metadata retrieval disabled") - # Convert integer virus identifiers to strings for API compatibility if isinstance(virus, int): virus = str(virus) @@ -8440,20 +8911,23 @@ def virus( ############## # Prepare output directory and all used file paths - virus_clean = virus.replace(' ', '_').replace('/', '_').replace('-', '_') + virus_clean = virus.replace(" ", "_").replace("/", "_").replace("-", "_") # Create and prepare output directory structure if outfolder is None: currentfolder = os.getcwd() - outfolder = os.path.join(currentfolder, "gget_virus_output" , f"{virus_clean}_{timestamp}") - logger.info("No output folder specified, creating a 
subdirectory in current directory named 'gget_virus_output' and placing results in a folder named: %s", outfolder) + outfolder = os.path.join(currentfolder, "gget_virus_output", f"{virus_clean}_{timestamp}") + logger.info( + "No output folder specified, creating a subdirectory in current directory named 'gget_virus_output' and placing results in a folder named: %s", + outfolder, + ) else: logger.info("Using specified output folder: %s", outfolder) - + # Ensure output folder exists os.makedirs(outfolder, exist_ok=True) logger.debug("Output folder ready: %s", outfolder) - + # Create temporary directory for intermediate processing # This will be cleaned up at the end regardless of success or failure temp_dir = os.path.join(outfolder, f"tmp_{timestamp}_{random_suffix}") @@ -8465,13 +8939,13 @@ def virus( genbank_full_xml_path = os.path.join(outfolder, f"{virus_clean}_genbank_metadata_full.xml") genbank_full_csv_path = os.path.join(outfolder, f"{virus_clean}_genbank_metadata_full.csv") output_api_metadata_jsonl = os.path.join(outfolder, f"{virus_clean}_api_metadata.jsonl") - + # SECTION 1b: BASELINE METADATA LOADING FOR DEDUPLICATION # If a baseline file is provided, load accessions for deduplication baseline_accessions = None baseline_skipped_count = 0 partial_metadata_file = None # Will be set if API fails and partial metadata is saved - + if baseline_metadata is not None: logger.info("=" * 60) logger.info("STEP 1b: LOADING BASELINE METADATA FOR DEDUPLICATION") @@ -8494,7 +8968,7 @@ def virus( # SECTION 2: CHECKING FOR CACHED DATA PROCESSING logger.info("=" * 60) - logger.info("STEP 2: CHECKING FOR SARS-CoV-2 AND INFLUENZA A QUERIES TO APPLY OPTIMIZED CACHED PATHWAY") + logger.info("STEP 2: CHECKING FOR SARS-CoV-2 AND INFLUENZA A QUERIES TO APPLY OPTIMIZED CACHED PATHWAY") logger.info("=" * 60) # Initialize variables to track cached download results cached_fasta_file = None # Path to cached FASTA file (sequences streamed on-demand) @@ -8506,22 +8980,22 @@ def virus( # 
For SARS-CoV-2 queries, use cached data packages with hierarchical fallback if _skip_cache: logger.info("⏭️ SKIPPING CACHED PATHWAY - Using API method directly (via _skip_cache flag)") - elif (is_sars_cov2 or is_sars_cov2_query(virus, is_accession)): + elif is_sars_cov2 or is_sars_cov2_query(virus, is_accession): logger.info("DETECTED SARS-CoV-2 QUERY - USING CACHED DATA PACKAGE PATHWAY") logger.info("SARS-CoV-2 queries will use NCBI's optimized cached data packages") logger.info("with hierarchical fallback from specific to general cached files.") - + # Use the download_sars_cov2_optimized function which handles fallback strategies internally params = { - 'host': host, - 'complete_only': (nuc_completeness == "complete"), - 'annotated': annotated, - 'outdir': outfolder, - 'lineage': lineage, - 'accession': virus, - 'use_accession': is_accession + "host": host, + "complete_only": (nuc_completeness == "complete"), + "annotated": annotated, + "outdir": outfolder, + "lineage": lineage, + "accession": virus, + "use_accession": is_accession, } - + try: download_result = download_sars_cov2_optimized(**params) # Unpack tuple: (zip_path, applied_filters, missing_filters) @@ -8530,20 +9004,22 @@ def virus( missing_filters = download_result[2] cached_zip_file = zip_file # Track for cleanup datasets_version = _get_datasets_version() - + cached_fasta_file, cached_metadata_dict, used_cached_download = process_cached_download( zip_file, virus_type="SARS-CoV-2" ) if used_cached_download: - logger.info("Cached download completed. Server-side filters (host, complete_only, annotated, lineage) applied.") + logger.info( + "Cached download completed. Server-side filters (host, complete_only, annotated, lineage) applied." 
+ ) logger.info("All other filters will be applied in the unified filtering pipeline.") logger.debug("Applied filters: %s", applied_filters) logger.debug("Missing filters (to apply in Step 3b): %s", missing_filters) - except Exception as cache_error: + except Exception as cache_error: # noqa: BLE001 logger.warning("SARS-CoV-2 cached download failed after all strategies: %s", cache_error) logger.info("🔄 Retrying with normal API download method (_skip_cache=True)...") # Retry the entire virus() call with _skip_cache=True to use the normal API pathway - _virus_func = globals()['virus'] + _virus_func = globals()["virus"] return _virus_func( virus=virus, is_accession=is_accession, @@ -8593,26 +9069,25 @@ def virus( else: logger.info("No SARS-CoV-2 query detected.") - # SECTION 2b: ALPHAINFLUENZA CACHED DATA PROCESSING # For Alphainfluenza queries, use cached data packages with hierarchical fallback if _skip_cache: logger.info("⏭️ SKIPPING CACHED PATHWAY - Using API method directly (via _skip_cache flag)") - elif (is_alphainfluenza or is_alphainfluenza_query(virus, is_accession)): + elif is_alphainfluenza or is_alphainfluenza_query(virus, is_accession): logger.info("DETECTED ALPHAINFLUENZA QUERY - USING CACHED DATA PACKAGES") logger.info("Alphainfluenza queries will use NCBI's optimized cached data packages") logger.info("with hierarchical fallback from specific to general cached files.") - + # Use the download_alphainfluenza_optimized function which handles fallback strategies internally params = { - 'host': host, - 'complete_only': (nuc_completeness == "complete"), - 'annotated': annotated, - 'outdir': outfolder, - 'accession': virus, - 'use_accession': is_accession + "host": host, + "complete_only": (nuc_completeness == "complete"), + "annotated": annotated, + "outdir": outfolder, + "accession": virus, + "use_accession": is_accession, } - + try: download_result = download_alphainfluenza_optimized(**params) # Unpack tuple: (zip_path, applied_filters, missing_filters) 
@@ -8621,7 +9096,7 @@ def virus( missing_filters = download_result[2] cached_zip_file = zip_file # Track for cleanup datasets_version = _get_datasets_version() - + cached_fasta_file, cached_metadata_dict, used_cached_download = process_cached_download( zip_file, virus_type="Alphainfluenza" ) @@ -8630,13 +9105,13 @@ def virus( logger.info("All other filters will be applied in the unified filtering pipeline.") logger.debug("Applied filters: %s", applied_filters) logger.debug("Missing filters (to apply in Step 3b): %s", missing_filters) - except Exception as cache_error: + except Exception as cache_error: # noqa: BLE001 logger.warning("Alphainfluenza cached download failed after all strategies: %s", cache_error) logger.info("🔄 Retrying with normal API download method (_skip_cache=True)...") # Retry the entire virus() call with _skip_cache=True to use the normal API pathway # Note: We use globals()['virus'] because the local parameter 'virus' (a string) # shadows the function name in this scope. 
- _virus_func = globals()['virus'] + _virus_func = globals()["virus"] return _virus_func( virus=virus, is_accession=is_accession, @@ -8685,24 +9160,24 @@ def virus( ) else: logger.info("No Alphainfluenza query detected.") - + # Initialize deferred_filters for tracking filters that couldn't be applied server-side deferred_filters = None try: - # SECTION 3: METADATA RETRIEVAL AND FILTERING + # SECTION 3: METADATA RETRIEVAL AND FILTERING # Check if we're using cached download data if used_cached_download and cached_metadata_dict: logger.info("=" * 60) logger.info("STEP 3: Applying metadata filters for cached download") logger.info("=" * 60) logger.info("Using metadata from cached download (skipping API metadata fetch)") - + # cached_metadata_dict is now a file path (not a dict) # Use streaming filter to load only records passing filters into memory if isinstance(cached_metadata_dict, str) and os.path.isfile(cached_metadata_dict): logger.info("Memory-efficient path: streaming cached metadata JSONL with on-the-fly filtering") - + metadata_dict, total_api_records, cache_filter_stats = _stream_filter_cached_metadata_from_jsonl( cached_metadata_dict, host=host, @@ -8714,30 +9189,31 @@ def virus( min_release_date=min_release_date, applied_strategy_filters=applied_filters, ) - logger.info("Loaded %d records passing filters (from %d total in cache)", - len(metadata_dict), total_api_records) - + logger.info( + "Loaded %d records passing filters (from %d total in cache)", len(metadata_dict), total_api_records + ) + # Copy the cached JSONL to the output API metadata location for consistency try: shutil.copy(cached_metadata_dict, output_api_metadata_jsonl) logger.info("✅ Saved cached metadata JSONL: %s", output_api_metadata_jsonl) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.warning("❌ Failed to copy cached metadata JSONL: %s", e) else: # Fallback: cached_metadata_dict is already a dict (legacy path) metadata_dict = cached_metadata_dict 
total_api_records = len(metadata_dict) logger.info("Loaded %d metadata records from cached download", total_api_records) - + # Save metadata JSONL for consistency try: with open(output_api_metadata_jsonl, "w", encoding="utf-8") as f: for md in metadata_dict.values(): f.write(json.dumps(md) + "\n") logger.info("✅ Saved cached metadata JSONL: %s", output_api_metadata_jsonl) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.warning("❌ Failed to save cached metadata JSONL: %s", e) - + # Apply post-cached-download filters (legacy in-memory path) logger.debug("Using applied_filters from cached strategy: %s", applied_filters) filtered_accessions_step3, filtered_metadata_step3 = filter_cached_metadata_for_unused_filters( @@ -8751,44 +9227,58 @@ def virus( min_release_date=min_release_date, applied_strategy_filters=applied_filters, ) - metadata_dict = {acc: md for acc, md in zip(filtered_accessions_step3, filtered_metadata_step3)} - + metadata_dict = dict(zip(filtered_accessions_step3, filtered_metadata_step3, strict=False)) + logger.info("After post-cached-download filtering: %d records remain", len(metadata_dict)) - + # Baseline deduplication for cached path baseline_skipped_count = 0 if baseline_accessions is not None and metadata_dict: - logger.info("Deduplicating cached metadata against baseline (%d accessions)...", len(baseline_accessions)) + logger.info( + "Deduplicating cached metadata against baseline (%d accessions)...", len(baseline_accessions) + ) metadata_dict, baseline_skipped_count = _deduplicate_metadata_against_baseline( metadata_dict, baseline_accessions ) - logger.info("After baseline deduplication: %d new records (skipped %d)", - len(metadata_dict), baseline_skipped_count) - + logger.info( + "After baseline deduplication: %d new records (skipped %d)", + len(metadata_dict), + baseline_skipped_count, + ) + else: # Regular API metadata fetch logger.info("=" * 60) logger.info("STEP 3: Fetching virus metadata from NCBI API") 
logger.info("=" * 60) api_annotated_filter = annotated if annotated is True else None - api_complete_filter = True if nuc_completeness=="complete" else False + api_complete_filter = True if nuc_completeness == "complete" else False + + logger.debug( + "Applying server-side filters: host=%s, geo_location=%s, annotated=%s, complete_only=%s, min_release_date=%s, refseq_only=%s", + host, + geographic_location, + annotated, + api_complete_filter, + min_release_date, + refseq_only, + ) - logger.debug("Applying server-side filters: host=%s, geo_location=%s, annotated=%s, complete_only=%s, min_release_date=%s, refseq_only=%s", host, geographic_location, annotated, api_complete_filter, min_release_date, refseq_only) - # Track deferred filters that couldn't be applied server-side deferred_filters = None - + try: # Check if this is a multi-accession query (list or file) use_batched_fetch = False if is_accession: parsed_accessions = _parse_accession_input(virus) - if parsed_accessions['type'] in ('list', 'file'): + if parsed_accessions["type"] in ("list", "file"): use_batched_fetch = True - accession_list = parsed_accessions['accessions'] - logger.info("Detected %d accessions from %s input", - len(accession_list), parsed_accessions['type']) - + accession_list = parsed_accessions["accessions"] + logger.info( + "Detected %d accessions from %s input", len(accession_list), parsed_accessions["type"] + ) + if use_batched_fetch: # Multiple accessions - use batched fetching api_result = _fetch_metadata_for_accession_list( @@ -8849,7 +9339,7 @@ def virus( api_reports, deferred_filters = api_result else: api_reports = api_result - + # Log deferred filters if any if deferred_filters: logger.info("=" * 60) @@ -8863,39 +9353,42 @@ def virus( # Ensure output folder exists for summary file os.makedirs(outfolder, exist_ok=True) logger.debug("Ensured output folder exists for error summary: %s", outfolder) - + # Save partial metadata if any was collected before the failure # Check for streaming 
temp file from fetch_virus_metadata partial_metadata_dict = {} temp_metadata_glob = os.path.join(temp_dir, "gget_metadata_*.jsonl") import glob as _glob + temp_metadata_files = _glob.glob(temp_metadata_glob) for tmf in temp_metadata_files: try: - with open(tmf, 'r', encoding='utf-8') as f: + with open(tmf, encoding="utf-8") as f: for line in f: line = line.strip() if line: try: record = json.loads(line) - acc = record.get('accession', {}) + acc = record.get("accession", {}) if isinstance(acc, dict): - acc = acc.get('accession', '') + acc = acc.get("accession", "") if acc: partial_metadata_dict[str(acc)] = record except json.JSONDecodeError: continue - except Exception: + except Exception: # noqa: BLE001 continue - + if partial_metadata_dict: # Convert raw API reports to internal format try: partial_internal = load_metadata_from_api_reports(list(partial_metadata_dict.values())) - except Exception: - partial_internal = {acc: {'accession': acc} for acc in partial_metadata_dict} - - partial_metadata_file = _save_partial_metadata(partial_internal, outfolder, virus_clean, reason="api_failure") + except Exception: # noqa: BLE001 + partial_internal = {acc: {"accession": acc} for acc in partial_metadata_dict} + + partial_metadata_file = _save_partial_metadata( + partial_internal, outfolder, virus_clean, reason="api_failure" + ) if partial_metadata_file: logger.info("=" * 60) logger.info("💾 PARTIAL METADATA SAVED FOR RECOVERY") @@ -8903,10 +9396,14 @@ def virus( logger.info(" Records: %d", len(partial_internal)) logger.info("") logger.info(" Recovery command:") - logger.info(" gget virus %s --baseline %s --merge-results -o %s", - virus, partial_metadata_file, outfolder) + logger.info( + " gget virus %s --baseline %s --merge-results -o %s", + virus, + partial_metadata_file, + outfolder, + ) logger.info("=" * 60) - + # Save a summary file documenting the failure, then exit gracefully logger.error("Failed to fetch virus metadata from NCBI API") save_command_summary( @@ -8922,7 
+9419,9 @@ def virus( error_message=str(e), failed_commands=failed_commands, partial_metadata_file=partial_metadata_file if partial_metadata_dict else None, - recovery_command=f"gget virus {virus} --baseline {partial_metadata_file} --merge-results -o {outfolder}" if partial_metadata_dict and partial_metadata_file else None, + recovery_command=f"gget virus {virus} --baseline {partial_metadata_file} --merge-results -o {outfolder}" + if partial_metadata_dict and partial_metadata_file + else None, ) return None @@ -8941,7 +9440,7 @@ def virus( datasets_version=datasets_version, success=True, error_message="No virus records found matching the specified criteria (API returned 0 records)", - failed_commands=failed_commands + failed_commands=failed_commands, ) return @@ -8951,7 +9450,7 @@ def virus( # - A list of report dicts for smaller/accession-based queries _log_memory_usage("after API fetch") logger.debug("Converting API metadata to internal format...") - + if isinstance(api_reports, str) and os.path.isfile(api_reports): # Stream from temp JSONL file - avoids loading raw API reports into RAM logger.info("Loading metadata from streamed temp file (memory-efficient path)...") @@ -8972,7 +9471,7 @@ def virus( datasets_version=datasets_version, success=True, error_message="No virus records found matching the specified criteria (API returned 0 records)", - failed_commands=failed_commands + failed_commands=failed_commands, ) return else: @@ -8981,7 +9480,7 @@ def virus( metadata_dict = load_metadata_from_api_reports(api_reports) # Delete api_reports after conversion - no longer needed del api_reports - + _force_garbage_collection("after api_reports conversion") _log_memory_usage("after api_reports cleanup") @@ -8992,40 +9491,44 @@ def virus( for md in metadata_dict.values(): f.write(json.dumps(md) + "\n") logger.info("✅ Saved API metadata JSONL: %s", output_api_metadata_jsonl) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.warning("❌ Failed to save 
API metadata JSONL: %s", e) - # SECTION 3b: BASELINE DEDUPLICATION + # SECTION 3b: BASELINE DEDUPLICATION # If baseline accessions were loaded, remove them from the metadata baseline_skipped_count = 0 if baseline_accessions is not None and metadata_dict: logger.info("=" * 60) logger.info("STEP 3b: DEDUPLICATING AGAINST BASELINE") logger.info("=" * 60) - logger.info("API returned %d records. Comparing against %d baseline accessions...", - len(metadata_dict), len(baseline_accessions)) - + logger.info( + "API returned %d records. Comparing against %d baseline accessions...", + len(metadata_dict), + len(baseline_accessions), + ) + metadata_dict, baseline_skipped_count = _deduplicate_metadata_against_baseline( metadata_dict, baseline_accessions ) - + logger.info("Deduplication complete:") logger.info(" - Total from API: %d", total_api_records) logger.info(" - Already in baseline (skipped): %d", baseline_skipped_count) logger.info(" - New accessions to process: %d", len(metadata_dict)) - + if not metadata_dict: logger.warning("All API records already exist in the baseline file.") logger.info("No new sequences to download.") - + if merge_results and baseline_metadata: # Copy baseline to output as the merged result merged_output = os.path.join(outfolder, f"{virus_clean}_merged.csv") import shutil as _shutil + _shutil.copy2(baseline_metadata, merged_output) logger.info("✅ Baseline copied as merged output: %s", merged_output) - output_files_dict['Merged Metadata'] = merged_output - + output_files_dict["Merged Metadata"] = merged_output + save_command_summary( outfolder=outfolder, command_line=command_line, @@ -9044,7 +9547,7 @@ def virus( ) return - # SECTION 4: METADATA-ONLY FILTERING + # SECTION 4: METADATA-ONLY FILTERING logger.info("=" * 60) logger.info("STEP 4: Applying metadata-only filters") logger.info("=" * 60) @@ -9054,12 +9557,14 @@ def virus( "max_seq_length": max_seq_length, # "min_gene_count": min_gene_count, # "max_gene_count": max_gene_count, - 
"nuc_completeness": nuc_completeness if nuc_completeness and nuc_completeness.lower() == 'partial' else None, #only for partial cases + "nuc_completeness": nuc_completeness + if nuc_completeness and nuc_completeness.lower() == "partial" + else None, # only for partial cases "lab_passaged": lab_passaged, "submitter_country": submitter_country, "min_collection_date": min_collection_date, "max_collection_date": max_collection_date, - "source_database": source_database if source_database and source_database.lower() == 'genbank' else None, + "source_database": source_database if source_database and source_database.lower() == "genbank" else None, "max_release_date": max_release_date, # "min_mature_peptide_count": min_mature_peptide_count, # "max_mature_peptide_count": max_mature_peptide_count, @@ -9074,8 +9579,8 @@ def virus( "isolate": isolate, "isolation_source": isolation_source, # Add deferred filters if server-side filter failed - "geographic_location": deferred_filters.get('geographic_location') if deferred_filters else None, - "host": deferred_filters.get('host') if deferred_filters else None, + "geographic_location": deferred_filters.get("geographic_location") if deferred_filters else None, + "host": deferred_filters.get("host") if deferred_filters else None, } all_metadata_filters_none = all(v is None for k, v in filters.items()) @@ -9091,7 +9596,9 @@ def virus( filtered_metadata = list(metadata_dict.values()) logger.info("All %d sequences will proceed to sequence download", len(filtered_accessions)) else: - filtered_accessions, filtered_metadata, metadata_filter_stats = filter_metadata_only(metadata_dict, **filters) + filtered_accessions, filtered_metadata, metadata_filter_stats = filter_metadata_only( + metadata_dict, **filters + ) if not filtered_accessions: pass # No sequences passed metadata filters total_after_metadata_filter = 0 @@ -9114,7 +9621,7 @@ def virus( metadata_filter_stats=metadata_filter_stats, ) return - + total_after_metadata_filter = 
len(filtered_accessions) _log_memory_usage("after metadata filtering") @@ -9125,29 +9632,31 @@ def virus( for md in filtered_metadata: f.write(json.dumps(md) + "\n") logger.info("✅ Saved filtered metadata JSONL: %s", output_metadata_jsonl) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.warning("❌ Failed to save filtered metadata JSONL: %s", e) - # SECTION 4.5: EARLY GENBANK METADATA FETCHING AND FILTERING - # This step fetches GenBank metadata and applies GenBank-dependent filters BEFORE downloading sequences, dramatically reducing the number of sequences to download. - + # SECTION 4.5: EARLY GENBANK METADATA FETCHING AND FILTERING + # This step fetches GenBank metadata and applies GenBank-dependent filters BEFORE downloading sequences, dramatically reducing the number of sequences to download. + # Track GenBank data for later steps (saving to CSV) genbank_data_prefetch = None genbank_prefetch_done = False - + # Check if any GenBank-dependent filters are specified - genbank_dependent_filters_active = any([ - provirus is not None, - genotype is not None, - has_proteins is not None, - gen_mol_type is not None, - env_source is not None, - min_gene_count is not None, - max_gene_count is not None, - min_mature_peptide_count is not None, - max_mature_peptide_count is not None, - ]) - + genbank_dependent_filters_active = any( + [ + provirus is not None, + genotype is not None, + has_proteins is not None, + gen_mol_type is not None, + env_source is not None, + min_gene_count is not None, + max_gene_count is not None, + min_mature_peptide_count is not None, + max_mature_peptide_count is not None, + ] + ) + if genbank_dependent_filters_active and filtered_accessions: logger.info("=" * 60) logger.info("STEP 4.5: Early GenBank metadata fetch and filtering (OPTIMIZATION)") @@ -9156,36 +9665,36 @@ def virus( logger.info("sequence download to reduce the number of sequences to download.") logger.info("This can dramatically speed up processing for large 
datasets.") _log_memory_usage("before early GenBank fetch") - + try: # Create temp paths for GenBank data genbank_prefetch_xml = os.path.join(temp_dir, f"{virus_clean}_genbank_prefetch.xml") genbank_prefetch_csv = os.path.join(temp_dir, f"{virus_clean}_genbank_prefetch.csv") - + # ESearch pre-filtering accessions_to_fetch = list(set(filtered_accessions)) - + # Get virus taxid from metadata for ESearch query virus_taxid_for_esearch = None if filtered_metadata: - virus_taxid_for_esearch = filtered_metadata[0].get('virusTaxId') + virus_taxid_for_esearch = filtered_metadata[0].get("virusTaxId") if not virus_taxid_for_esearch: # Try to get from nested virus dict - virus_dict = filtered_metadata[0].get('virus', {}) + virus_dict = filtered_metadata[0].get("virus", {}) if isinstance(virus_dict, dict): - virus_taxid_for_esearch = virus_dict.get('tax_id') - + virus_taxid_for_esearch = virus_dict.get("tax_id") + # Only attempt ESearch pre-filtering if we have enough accessions # to make it worthwhile (>1000) and we have GenBank-dependent filters esearch_prefilter_threshold = 1000 if len(accessions_to_fetch) > esearch_prefilter_threshold and virus_taxid_for_esearch: logger.info("=" * 60) - logger.info("ESearch PRE-FILTERING: Narrowing %d accessions to likely candidates", - len(accessions_to_fetch)) - logger.info("This avoids fetching full GenBank XML for all %d accessions", - len(accessions_to_fetch)) + logger.info( + "ESearch PRE-FILTERING: Narrowing %d accessions to likely candidates", len(accessions_to_fetch) + ) + logger.info("This avoids fetching full GenBank XML for all %d accessions", len(accessions_to_fetch)) logger.info("=" * 60) - + esearch_candidates = _esearch_prefilter_genbank( virus_taxid=virus_taxid_for_esearch, metadata_filtered_accessions=accessions_to_fetch, @@ -9197,7 +9706,7 @@ def virus( max_seq_length=max_seq_length, api_key=api_key, ) - + if esearch_candidates is not None: if len(esearch_candidates) == 0: # ESearch says NO accessions match @@ -9206,7 
+9715,7 @@ def virus( filtered_accessions = [] filtered_metadata = [] genbank_data_prefetch = {} - + save_command_summary( outfolder=outfolder, command_line=command_line, @@ -9228,19 +9737,22 @@ def virus( if not keep_temp and os.path.isdir(temp_dir): try: shutil.rmtree(temp_dir) - except Exception: + except Exception: # noqa: BLE001 pass return else: # Narrow to only the candidates accessions_to_fetch = list(esearch_candidates) - logger.info("✅ ESearch pre-filter successful: narrowed from %d to %d accessions", - len(filtered_accessions), len(accessions_to_fetch)) + logger.info( + "✅ ESearch pre-filter successful: narrowed from %d to %d accessions", + len(filtered_accessions), + len(accessions_to_fetch), + ) else: logger.info("ESearch pre-filter: could not pre-filter, using full accession list") - + logger.info("Fetching GenBank metadata for %d accessions...", len(accessions_to_fetch)) - + # Fetch GenBank metadata genbank_data_prefetch, genbank_failed_log = fetch_genbank_metadata( accessions=accessions_to_fetch, @@ -9248,14 +9760,14 @@ def virus( genbank_full_csv_path=genbank_prefetch_csv, batch_size=genbank_batch_size, delay=GENBANK_INTER_BATCH_DELAY, - api_key=api_key + api_key=api_key, ) - + if genbank_data_prefetch: genbank_prefetch_done = True logger.info("Successfully retrieved GenBank metadata for %d accessions", len(genbank_data_prefetch)) _log_memory_usage("after early GenBank fetch") - + # Apply GenBank-dependent filters logger.info("Applying GenBank-dependent filters...") filters_genbank_early = { @@ -9269,38 +9781,50 @@ def virus( "min_mature_peptide_count": min_mature_peptide_count, "max_mature_peptide_count": max_mature_peptide_count, } - + genbank_filtered_accessions_early, genbank_filter_stats = filter_genbank_metadata( genbank_metadata=genbank_data_prefetch, **filters_genbank_early, ) - + if genbank_filtered_accessions_early: # Calculate reduction before_count = len(filtered_accessions) after_count = len(genbank_filtered_accessions_early) 
reduction_pct = ((before_count - after_count) / before_count) * 100 if before_count > 0 else 0 - + logger.info("=" * 60) logger.info("🎯 EARLY GENBANK FILTERING RESULTS:") logger.info(" Before GenBank filters: %d accessions", before_count) logger.info(" After GenBank filters: %d accessions", after_count) - logger.info(" Reduction: %.1f%% (%d accessions filtered out)", reduction_pct, before_count - after_count) - logger.info(" This means we'll download %d sequences instead of %d!", after_count, before_count) + logger.info( + " Reduction: %.1f%% (%d accessions filtered out)", + reduction_pct, + before_count - after_count, + ) + logger.info( + " This means we'll download %d sequences instead of %d!", after_count, before_count + ) logger.info("=" * 60) - + # Update filtered_accessions and filtered_metadata genbank_filtered_set_early = set(genbank_filtered_accessions_early) filtered_accessions = [acc for acc in filtered_accessions if acc in genbank_filtered_set_early] - filtered_metadata = [md for md in filtered_metadata if md['accession'] in genbank_filtered_set_early] - + filtered_metadata = [ + md for md in filtered_metadata if md["accession"] in genbank_filtered_set_early + ] + # Also filter genbank_data_prefetch to only include passing accessions - genbank_data_prefetch = {acc: genbank_data_prefetch[acc] for acc in genbank_filtered_accessions_early if acc in genbank_data_prefetch} - + genbank_data_prefetch = { + acc: genbank_data_prefetch[acc] + for acc in genbank_filtered_accessions_early + if acc in genbank_data_prefetch + } + # Update total_after_metadata_filter to reflect GenBank filtering # Note: This is now total after BOTH metadata + GenBank filtering total_after_genbank_filter = len(filtered_accessions) - + _force_garbage_collection("after early GenBank filtering") _log_memory_usage("after early GenBank filtering cleanup") else: @@ -9309,7 +9833,7 @@ def virus( filtered_accessions = [] filtered_metadata = [] genbank_data_prefetch = {} - + # Save command 
summary and return early save_command_summary( outfolder=outfolder, @@ -9333,21 +9857,23 @@ def virus( if not keep_temp and os.path.isdir(temp_dir): try: shutil.rmtree(temp_dir) - except Exception: + except Exception: # noqa: BLE001 pass return else: logger.warning("Failed to retrieve GenBank metadata. Proceeding without early filtering.") logger.warning("GenBank-dependent filters will be applied after sequence download (slower).") genbank_prefetch_done = False - - except Exception as e: + + except Exception as e: # noqa: BLE001 logger.warning("Early GenBank fetch failed: %s", e) - logger.warning("Proceeding without early GenBank filtering. Filters will be applied after sequence download.") + logger.warning( + "Proceeding without early GenBank filtering. Filters will be applied after sequence download." + ) genbank_prefetch_done = False genbank_data_prefetch = None - # SECTION 5: DOWNLOAD SEQUENCES FOR FILTERED ACCESSIONS ONLY + # SECTION 5: DOWNLOAD SEQUENCES FOR FILTERED ACCESSIONS ONLY logger.info("=" * 60) logger.info("STEP 5: Downloading sequences for filtered accessions") logger.info("=" * 60) @@ -9356,24 +9882,24 @@ def virus( if used_cached_download and cached_fasta_file: logger.info("Using sequences from cached download (skipping sequence download)") logger.info("Streaming and filtering cached FASTA file on-demand...") - + # Create filtered accessions set for faster lookup filtered_acc_set = set(filtered_accessions) - + # Stream through cached FASTA and write only filtered sequences # This avoids loading the entire file into RAM fna_file = os.path.join(temp_dir, f"{virus_clean}_cached_sequences.fasta") filtered_count = 0 - + try: # Generator expression: yields only sequences in filtered_accessions filtered_records = (r for r in FastaIO.parse(cached_fasta_file, "fasta") if r.id in filtered_acc_set) FastaIO.write(filtered_records, fna_file, "fasta") - + # Count the written sequences for _ in FastaIO.parse(fna_file, "fasta"): filtered_count += 1 - + 
logger.info("✅ Streamed and wrote %d filtered sequences from cached FASTA", filtered_count) logger.info(" Output: %s", fna_file) except Exception as e: @@ -9381,17 +9907,19 @@ def virus( raise RuntimeError(f"Failed to process cached FASTA file: {e}") from e else: # Regular sequence download - fna_file = download_sequences_by_accessions(filtered_accessions, outdir=temp_dir, failed_commands=failed_commands, api_key=api_key) + fna_file = download_sequences_by_accessions( + filtered_accessions, outdir=temp_dir, failed_commands=failed_commands, api_key=api_key + ) if not os.path.exists(fna_file): raise RuntimeError(f"❌ Download failed: FASTA file not found at {fna_file}") logger.info("Downloaded FASTA file: %s (%.2f MB)", fna_file, os.path.getsize(fna_file) / 1024 / 1024) - # SECTION 6: SEQUENCE-DEPENDENT FILTERING + # SECTION 6: SEQUENCE-DEPENDENT FILTERING logger.info("=" * 60) logger.info("STEP 6: Applying sequence-dependent filters and saving results") logger.info("=" * 60) - filters_seq={ + filters_seq = { "max_ambiguous_chars": max_ambiguous_chars, # "has_proteins": has_proteins, "proteins_complete": proteins_complete, @@ -9420,7 +9948,7 @@ def virus( ) # Clean up filtered_metadata_dict after use del filtered_metadata_dict - + # metadata_dict is no longer needed after this point # filtered_metadata_final and filtered_accessions contain all we need try: @@ -9430,7 +9958,7 @@ def virus( _force_garbage_collection("after sequence filtering") _log_memory_usage("after sequence filtering cleanup") - # SECTION 7: SAVING FINAL OUTPUT FILES + # SECTION 7: SAVING FINAL OUTPUT FILES logger.info("=" * 60) logger.info("STEP 7: Saving final output files") logger.info("=" * 60) @@ -9439,11 +9967,15 @@ def virus( # FASTA was already written during Section 6 (streaming to output_fasta_file) if os.path.exists(output_fasta_file): - logger.info("✅ FASTA file saved: %s (%.2f MB)", output_fasta_file, os.path.getsize(output_fasta_file) / 1024 / 1024) - output_files_dict['FASTA 
Sequences'] = output_fasta_file + logger.info( + "✅ FASTA file saved: %s (%.2f MB)", + output_fasta_file, + os.path.getsize(output_fasta_file) / 1024 / 1024, + ) + output_files_dict["FASTA Sequences"] = output_fasta_file else: logger.error("❌ FASTA file not found at expected location: %s", output_fasta_file) - + # Track final metadata for summary final_metadata_for_summary = filtered_metadata_final @@ -9452,8 +9984,12 @@ def virus( with open(output_metadata_jsonl, "w", encoding="utf-8") as file: for metadata in filtered_metadata_final: file.write(json.dumps(metadata) + "\n") - logger.info("✅ JSONL metadata file saved: %s (%.2f MB)", output_metadata_jsonl, os.path.getsize(output_metadata_jsonl) / 1024 / 1024) - output_files_dict['JSONL Metadata'] = output_metadata_jsonl + logger.info( + "✅ JSONL metadata file saved: %s (%.2f MB)", + output_metadata_jsonl, + os.path.getsize(output_metadata_jsonl) / 1024 / 1024, + ) + output_files_dict["JSONL Metadata"] = output_metadata_jsonl except Exception as e: logger.error("❌ Failed to save JSONL metadata file: %s", e) raise @@ -9462,20 +9998,24 @@ def virus( try: save_metadata_to_csv(filtered_metadata_final, protein_headers, output_metadata_csv) if os.path.exists(output_metadata_csv): - logger.info("✅ CSV metadata file saved: %s (%.2f MB)", output_metadata_csv, os.path.getsize(output_metadata_csv) / 1024 / 1024) - output_files_dict['CSV Metadata'] = output_metadata_csv + logger.info( + "✅ CSV metadata file saved: %s (%.2f MB)", + output_metadata_csv, + os.path.getsize(output_metadata_csv) / 1024 / 1024, + ) + output_files_dict["CSV Metadata"] = output_metadata_csv else: logger.error("❌ Failed to create CSV file: %s", output_metadata_csv) except Exception as e: logger.error("❌ Failed to save CSV metadata file: %s", e) raise - + # SECTION 7b: BASELINE MERGE/NO-MERGE OUTPUT if baseline_accessions is not None and baseline_metadata: logger.info("=" * 60) logger.info("STEP 7b: Baseline merge/split output") logger.info("=" * 60) - + 
if merge_results: # Merge new results with baseline into a single file merged_csv_path = os.path.join(outfolder, f"{virus_clean}_merged.csv") @@ -9483,7 +10023,7 @@ def virus( baseline_metadata, filtered_metadata_final, merged_csv_path ) if merge_success: - output_files_dict['Merged Metadata (CSV)'] = merged_csv_path + output_files_dict["Merged Metadata (CSV)"] = merged_csv_path logger.info("✅ Merged CSV: %s", merged_csv_path) else: logger.warning("⚠️ Merge failed. New-only output is available at: %s", output_metadata_csv) @@ -9491,71 +10031,74 @@ def virus( # No-merge mode: label the new-only output clearly new_csv_path = os.path.join(outfolder, f"{virus_clean}_new.csv") baseline_ref_path = os.path.join(outfolder, f"{virus_clean}_baseline_provided.csv") - + # Rename existing CSV to _new if os.path.exists(output_metadata_csv): shutil.copy2(output_metadata_csv, new_csv_path) - output_files_dict['New Metadata (CSV)'] = new_csv_path + output_files_dict["New Metadata (CSV)"] = new_csv_path logger.info("✅ New-only CSV: %s (%d sequences)", new_csv_path, len(filtered_metadata_final)) - + # Copy baseline as reference shutil.copy2(baseline_metadata, baseline_ref_path) - output_files_dict['Baseline Provided (CSV)'] = baseline_ref_path + output_files_dict["Baseline Provided (CSV)"] = baseline_ref_path logger.info("✅ Baseline reference: %s", baseline_ref_path) else: logger.info("Skipping this step since no sequences passed all filters") - + # Clean up before GenBank fetch # filtered_metadata_final contains all we need - clear other references # Note: filtered_metadata may still reference same objects, but that's okay _force_garbage_collection("before GenBank fetch") - # SECTION 8: GENBANK METADATA RETRIEVAL (OPTIONAL) + # SECTION 8: GENBANK METADATA RETRIEVAL (OPTIONAL) logger.info("=" * 60) logger.info("STEP 8: Fetching detailed GenBank metadata") logger.info("=" * 60) _log_memory_usage("STEP 8 start") if genbank_metadata and total_final_sequences > 0: logger.info("GenBank 
metadata retrieval requested...") - + # Check if we already have GenBank data from early pre-fetch (Step 4.5) if genbank_prefetch_done and genbank_data_prefetch: logger.info("Using pre-fetched GenBank data from Step 4.5 (no re-fetch needed)") genbank_data = genbank_data_prefetch - + # Save GenBank metadata to final output location try: # Copy temp XML to final location if it exists genbank_prefetch_xml = os.path.join(temp_dir, f"{virus_clean}_genbank_prefetch.xml") genbank_prefetch_csv = os.path.join(temp_dir, f"{virus_clean}_genbank_prefetch.csv") - + if os.path.exists(genbank_prefetch_xml): shutil.copy(genbank_prefetch_xml, genbank_full_xml_path) if os.path.exists(genbank_prefetch_csv): shutil.copy(genbank_prefetch_csv, genbank_full_csv_path) - + # Save GenBank metadata to CSV save_genbank_metadata_to_csv( genbank_metadata=genbank_data, output_file=genbank_csv_path, - virus_metadata=filtered_metadata_final + virus_metadata=filtered_metadata_final, + ) + logger.info( + "✅ GenBank metadata CSV saved: %s (%.2f MB)", + genbank_csv_path, + os.path.getsize(genbank_csv_path) / 1024 / 1024, ) - logger.info("✅ GenBank metadata CSV saved: %s (%.2f MB)", - genbank_csv_path, os.path.getsize(genbank_csv_path) / 1024 / 1024) - + # Merge with standard metadata CSV if it exists if os.path.exists(output_metadata_csv): merge_metadata_csvs(genbank_csv_path, output_metadata_csv) - - output_files_dict['GenBank CSV Metadata'] = genbank_csv_path + + output_files_dict["GenBank CSV Metadata"] = genbank_csv_path if os.path.exists(genbank_full_xml_path): - output_files_dict['GenBank Full XML'] = genbank_full_xml_path + output_files_dict["GenBank Full XML"] = genbank_full_xml_path if os.path.exists(genbank_full_csv_path): - output_files_dict['GenBank Full CSV'] = genbank_full_csv_path + output_files_dict["GenBank Full CSV"] = genbank_full_csv_path genbank_success = True - - except Exception as e: + + except Exception as e: # noqa: BLE001 logger.error("❌ Failed to save pre-fetched GenBank 
data: %s", e) genbank_error_msg = str(e) else: @@ -9565,64 +10108,69 @@ def virus( # Use filtered_accessions (from metadata) instead of iterating # in-memory sequences - avoids loading all sequences into RAM final_accessions = list(filtered_accessions) - + if final_accessions: logger.info("Fetching GenBank metadata for %d sequences...", len(final_accessions)) # Fetch GenBank metadata genbank_data, genbank_failed_log = fetch_genbank_metadata( accessions=list(set(final_accessions)), # Remove duplicates - genbank_full_xml_path=genbank_full_xml_path, genbank_full_csv_path=genbank_full_csv_path, + genbank_full_xml_path=genbank_full_xml_path, + genbank_full_csv_path=genbank_full_csv_path, batch_size=genbank_batch_size, delay=GENBANK_INTER_BATCH_DELAY, - api_key=api_key + api_key=api_key, ) - + # Parse GenBank failed batches log if it exists if genbank_failed_log and os.path.exists(genbank_failed_log): try: - with open(genbank_failed_log, 'r') as flog: + with open(genbank_failed_log) as flog: content = flog.read() # Parse the log file to extract failed batch information - batch_pattern = r'FAILED_BATCH: \[([^\]]+)\][\s\S]*?URL: ([^\n]+)' + batch_pattern = r"FAILED_BATCH: \[([^\]]+)\][\s\S]*?URL: ([^\n]+)" matches = re.findall(batch_pattern, content) for accessions_str, url in matches: # Clean up accessions string - batch_accessions = [acc.strip().strip("'").strip('"') for acc in accessions_str.split(',')] - failed_commands['genbank_batches'].append({ - 'accessions': batch_accessions, - 'retry_url': url.strip() - }) - except Exception as parse_error: + batch_accessions = [ + acc.strip().strip("'").strip('"') for acc in accessions_str.split(",") + ] + failed_commands["genbank_batches"].append( + {"accessions": batch_accessions, "retry_url": url.strip()} + ) + except Exception as parse_error: # noqa: BLE001 logger.debug("Could not parse GenBank failed batches log: %s", parse_error) - + if genbank_data and not genbank_dependent_filters_active: # No GenBank filters - just save 
the data save_genbank_metadata_to_csv( genbank_metadata=genbank_data, output_file=genbank_csv_path, - virus_metadata=filtered_metadata_final + virus_metadata=filtered_metadata_final, ) - logger.info("✅ GenBank metadata CSV saved: %s (%.2f MB)", - genbank_csv_path, os.path.getsize(genbank_csv_path) / 1024 / 1024) - + logger.info( + "✅ GenBank metadata CSV saved: %s (%.2f MB)", + genbank_csv_path, + os.path.getsize(genbank_csv_path) / 1024 / 1024, + ) + # Merge with standard metadata CSV if it exists if os.path.exists(output_metadata_csv): merge_metadata_csvs(genbank_csv_path, output_metadata_csv) - - output_files_dict['GenBank CSV Metadata'] = genbank_csv_path + + output_files_dict["GenBank CSV Metadata"] = genbank_csv_path if os.path.exists(genbank_full_xml_path): - output_files_dict['GenBank Full XML'] = genbank_full_xml_path + output_files_dict["GenBank Full XML"] = genbank_full_xml_path if os.path.exists(genbank_full_csv_path): - output_files_dict['GenBank Full CSV'] = genbank_full_csv_path + output_files_dict["GenBank Full CSV"] = genbank_full_csv_path genbank_success = True elif genbank_data and genbank_dependent_filters_active: # GenBank filters needed - this is the fallback path when pre-fetch failed logger.info("GenBank metadata retrieved. 
Applying filters (fallback path)...") _log_memory_usage("before fallback GenBank filtering") _force_garbage_collection("before fallback filtering") - - filters_genbank={ + + filters_genbank = { "provirus": provirus, "has_proteins": has_proteins, "genotype": genotype, @@ -9641,54 +10189,68 @@ def virus( ) if genbank_filtered_accessions: - logger.info("After applying GenBank-based filters, %d sequences remain", len(genbank_filtered_accessions)) - + logger.info( + "After applying GenBank-based filters, %d sequences remain", + len(genbank_filtered_accessions), + ) + genbank_filtered_set = set(genbank_filtered_accessions) total_after_genbank_filter = len(genbank_filtered_accessions) # Re-filter FASTA by streaming from output # file through accession filter, instead of holding all sequences in RAM temp_refiltered_fasta = output_fasta_file + ".tmp" - refiltered_count = _stream_copy_fasta(output_fasta_file, temp_refiltered_fasta, genbank_filtered_set) + refiltered_count = _stream_copy_fasta( + output_fasta_file, temp_refiltered_fasta, genbank_filtered_set + ) shutil.move(temp_refiltered_fasta, output_fasta_file) total_final_sequences = refiltered_count - - filtered_metadata_final = [md for md in filtered_metadata_final if md['accession'] in genbank_filtered_set] - genbank_data_filtered = {acc: genbank_data[acc] for acc in genbank_filtered_accessions if acc in genbank_data} - + + filtered_metadata_final = [ + md for md in filtered_metadata_final if md["accession"] in genbank_filtered_set + ] + genbank_data_filtered = { + acc: genbank_data[acc] + for acc in genbank_filtered_accessions + if acc in genbank_data + } + del genbank_data _force_garbage_collection("after fallback GenBank filtering") save_metadata_to_csv(filtered_metadata_final, protein_headers, output_metadata_csv) - + try: with open(output_metadata_jsonl, "w", encoding="utf-8") as f: for md in filtered_metadata_final: f.write(json.dumps(md) + "\n") - except Exception as e: + except Exception as e: # noqa: BLE001 
logger.warning("❌ Failed to update JSONL metadata file: %s", e) - + save_genbank_metadata_to_csv( genbank_metadata=genbank_data_filtered, output_file=genbank_csv_path, - virus_metadata=filtered_metadata_final + virus_metadata=filtered_metadata_final, ) - logger.info("✅ GenBank metadata CSV saved: %s (%.2f MB)", - genbank_csv_path, os.path.getsize(genbank_csv_path) / 1024 / 1024) - + logger.info( + "✅ GenBank metadata CSV saved: %s (%.2f MB)", + genbank_csv_path, + os.path.getsize(genbank_csv_path) / 1024 / 1024, + ) + if os.path.exists(output_metadata_csv): merge_metadata_csvs(genbank_csv_path, output_metadata_csv) - - output_files_dict['FASTA Sequences'] = output_fasta_file - output_files_dict['CSV Metadata'] = output_metadata_csv - output_files_dict['GenBank CSV Metadata'] = genbank_csv_path + + output_files_dict["FASTA Sequences"] = output_fasta_file + output_files_dict["CSV Metadata"] = output_metadata_csv + output_files_dict["GenBank CSV Metadata"] = genbank_csv_path if os.path.exists(genbank_full_xml_path): - output_files_dict['GenBank Full XML'] = genbank_full_xml_path + output_files_dict["GenBank Full XML"] = genbank_full_xml_path if os.path.exists(genbank_full_csv_path): - output_files_dict['GenBank Full CSV'] = genbank_full_csv_path + output_files_dict["GenBank Full CSV"] = genbank_full_csv_path final_metadata_for_summary = filtered_metadata_final genbank_success = True - + del genbank_data_filtered del genbank_filtered_accessions del genbank_filtered_set @@ -9703,28 +10265,30 @@ def virus( os.remove(output_metadata_csv) if os.path.exists(output_metadata_jsonl): os.remove(output_metadata_jsonl) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.error("❌ Failed to apply GenBank-based filters: %s", e) - logger.warning("Continuing with previously filtered results without GenBank-based filtering.") + logger.warning( + "Continuing with previously filtered results without GenBank-based filtering." 
+ ) else: logger.warning("No GenBank metadata was retrieved") genbank_error_msg = "No GenBank metadata was retrieved" else: logger.warning("No accession numbers found for GenBank metadata lookup") genbank_error_msg = "No accession numbers found for GenBank metadata lookup" - - except Exception as genbank_error: + + except Exception as genbank_error: # noqa: BLE001 logger.error("❌ GenBank metadata retrieval failed: %s", genbank_error) logger.warning("Continuing without GenBank metadata - standard output files are still available") genbank_error_msg = str(genbank_error) - + _log_memory_usage("GenBank processing complete") _force_garbage_collection("after GenBank processing") logger.info("GenBank metadata processing completed") else: logger.info("Skipping this step since GenBank metadata retrieval was not requested.") - # SECTION 9: FINAL SUMMARY + # SECTION 9: FINAL SUMMARY # Provide comprehensive summary of the results if total_final_sequences > 0: logger.info("=" * 60) @@ -9744,11 +10308,16 @@ def virus( # Check if GenBank metadata CSV was created if genbank_metadata: if genbank_success and os.path.exists(genbank_csv_path): - logger.info(" 📊 Metadata (including Genbank information): %s", os.path.basename(genbank_csv_path)) + logger.info( + " 📊 Metadata (including Genbank information): %s", os.path.basename(genbank_csv_path) + ) if os.path.exists(genbank_full_xml_path): logger.info(" 🧬 GenBank-only full XML: %s", os.path.basename(genbank_full_xml_path)) if os.path.exists(genbank_full_csv_path): - logger.info(" 🧬 GenBank-only full CSV (readable XML format): %s", os.path.basename(genbank_full_csv_path)) + logger.info( + " 🧬 GenBank-only full CSV (readable XML format): %s", + os.path.basename(genbank_full_csv_path), + ) else: logger.warning("") logger.warning("⚠️ GenBank metadata was requested but NOT saved due to errors:") @@ -9756,7 +10325,7 @@ def virus( logger.warning(" Standard metadata files are still available.") logger.info("=" * 60) - + # Save command summary 
save_command_summary( outfolder=outfolder, @@ -9793,7 +10362,7 @@ def virus( logger.warning(" - Trying a broader virus query term") logger.warning(" - Removing some of the more restrictive filters") logger.warning("=" * 60) - + # Save command summary even when no sequences pass save_command_summary( outfolder=outfolder, @@ -9817,9 +10386,11 @@ def virus( except Exception as e: # Handle any unexpected errors during processing error_msg = str(e) - + # Check if this is a server-side issue that we can provide guidance for - if any(indicator in error_msg.lower() for indicator in ['timeout', '500 server error', 'internal server error']): + if any( + indicator in error_msg.lower() for indicator in ["timeout", "500 server error", "internal server error"] + ): logger.error("=" * 80) logger.error("SERVER-SIDE ERROR DETECTED") logger.error("=" * 80) @@ -9828,17 +10399,17 @@ def virus( logger.error("") logger.error("Error details: %s", e) logger.error("") - + # Provide alternative commands based on the problematic parameters if geographic_location: logger.error("🔧 SUGGESTED SOLUTION:") logger.error("The geographic location filter may be causing server issues.") logger.error("Try running without the geographic filter and filter manually afterward:") logger.error("") - + # Build alternative command cmd_parts = [f"gget.virus('{virus}'"] - + # Add all non-problematic filters if host: cmd_parts.append(f"host='{host}'") @@ -9873,25 +10444,25 @@ def virus( cmd_parts.append(f"has_proteins={has_proteins}") else: cmd_parts.append(f"has_proteins='{has_proteins}'") - + cmd_parts.append(f"outfolder='{virus_clean}_data'") - + alternative_cmd = ", ".join(cmd_parts) + ")" logger.error("📋 ALTERNATIVE COMMAND:") logger.error(" %s", alternative_cmd) logger.error("") logger.error("After download completes, filter the output CSV file by the") logger.error("'Geographic Location' column to get sequences from '%s'.", geographic_location) - - elif any(x in virus.lower() for x in ['sars-cov-2', 
'covid', 'influenza']) and not host: + + elif any(x in virus.lower() for x in ["sars-cov-2", "covid", "influenza"]) and not host: logger.error("🔧 SUGGESTED SOLUTION:") logger.error("Large datasets like '%s' may cause server timeouts.", virus) logger.error("Try adding a host filter to reduce the dataset size:") logger.error("") - + # Build alternative command with host filter cmd_parts = [f"gget.virus('{virus}'", "host='human'"] - + # Add existing filters if min_seq_length: cmd_parts.append(f"min_seq_length={min_seq_length}") @@ -9901,20 +10472,20 @@ def virus( cmd_parts.append(f"nuc_completeness='{nuc_completeness}'") if annotated is not None: cmd_parts.append(f"annotated={annotated}") - + cmd_parts.append(f"outfolder='{virus_clean}_data'") - + alternative_cmd = ", ".join(cmd_parts) + ")" logger.error("📋 ALTERNATIVE COMMAND:") logger.error(" %s", alternative_cmd) - + else: logger.error("🔧 SUGGESTED SOLUTIONS:") logger.error("1. Wait a few minutes and try again (server issues are often temporary)") logger.error("2. Try using more specific filters to reduce dataset size") logger.error("3. Use host='human' filter if studying human pathogens") logger.error("4. 
Add date range filters to limit the time period") - + logger.error("=" * 80) else: # For non-server errors, show the original error message @@ -9922,7 +10493,7 @@ def virus( logger.error("Error type: %s", type(e).__name__) if logger.getEffectiveLevel() <= logging.DEBUG: logger.debug("Full traceback:\n%s", traceback.format_exc()) - + # Save command summary with error information save_command_summary( outfolder=outfolder if outfolder else os.getcwd(), @@ -9942,9 +10513,9 @@ def virus( genbank_filter_stats=genbank_filter_stats, sequence_filter_stats=sequence_filter_stats, ) - + raise - + # SECTION 10: CLEANUP finally: # Always clean up temporary files, regardless of success or failure @@ -9958,9 +10529,9 @@ def virus( if genbank_metadata and genbank_success and os.path.exists(output_metadata_csv): os.remove(output_metadata_csv) logger.debug("✅ Cleaned up temporary directory: %s", temp_dir) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.warning("Failed to clean up temporary directory %s: %s", temp_dir, e) - + # Clean up cached download files (zip file and extracted directory) # This ensures the folder structure is identical whether using cached or API-based downloads logger.debug("Checking for cached download files to clean up...") @@ -9977,11 +10548,11 @@ def virus( if os.path.exists(cached_extract_dir) and os.path.isdir(cached_extract_dir): shutil.rmtree(cached_extract_dir) logger.debug("✅ Cleaned up cached extracted directory: %s", cached_extract_dir) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.warning("Failed to clean up cached download files: %s", e) elif cached_zip_file and keep_temp: logger.debug("Preserving cached download files as per user request: %s", cached_zip_file) - + if keep_temp and os.path.exists(output_api_metadata_jsonl): logger.debug("Preserving temporary directory as per user request: %s", temp_dir) shutil.move(output_api_metadata_jsonl, os.path.join(temp_dir, 
os.path.basename(output_api_metadata_jsonl))) @@ -9991,15 +10562,16 @@ def virus( if total_final_sequences == 0 and os.path.exists(output_api_metadata_jsonl): try: os.remove(output_api_metadata_jsonl) - logger.debug("Removed filtered metadata JSONL due to no passing sequences: %s", output_api_metadata_jsonl) - except Exception as e: - logger.warning("Failed to remove filtered metadata JSONL even though no sequence passed all filters: %s", e) + logger.debug( + "Removed filtered metadata JSONL due to no passing sequences: %s", output_api_metadata_jsonl + ) + except Exception as e: # noqa: BLE001 + logger.warning( + "Failed to remove filtered metadata JSONL even though no sequence passed all filters: %s", e + ) - - - logger.info("NCBI virus data retrieval process completed.") - + # Restore the original logger level logger.setLevel(original_logger_level) diff --git a/gget/main.py b/gget/main.py index b51adc92e..a7a57c6c5 100644 --- a/gget/main.py +++ b/gget/main.py @@ -1,51 +1,51 @@ import argparse import sys from datetime import datetime -from typing import Optional import pandas as pd # Get current date and time for alphafold default foldername dt_string = datetime.now().strftime("%Y_%m_%d-%H_%M") -import os -import json -import subprocess +import json # noqa: E402 +import os # noqa: E402 +import subprocess # noqa: E402 -from .utils import set_up_logger +from .utils import set_up_logger # noqa: E402 logger = set_up_logger() -from .__init__ import __version__ +from .__init__ import __version__ # noqa: E402 +from .gget_alphafold import alphafold # noqa: E402 +from .gget_archs4 import archs4 # noqa: E402 +from .gget_bgee import bgee # noqa: E402 +from .gget_blast import blast # noqa: E402 +from .gget_blat import blat # noqa: E402 +from .gget_cbio import cbio_plot, cbio_search # noqa: E402 +from .gget_cellxgene import cellxgene # noqa: E402 +from .gget_cosmic import cosmic # noqa: E402 +from .gget_diamond import diamond # noqa: E402 +from .gget_elm import elm # noqa: 
E402 +from .gget_enrichr import enrichr # noqa: E402 +from .gget_gpt import gpt # noqa: E402 +from .gget_info import info # noqa: E402 +from .gget_muscle import muscle # noqa: E402 +from .gget_mutate import mutate # noqa: E402 +from .gget_opentargets import OPENTARGETS_RESOURCES, opentargets # noqa: E402 +from .gget_pdb import pdb # noqa: E402 # Module functions -from .gget_ref import ref -from .gget_search import search -from .gget_info import info -from .gget_seq import seq -from .gget_muscle import muscle -from .gget_blast import blast -from .gget_blat import blat -from .gget_enrichr import enrichr -from .gget_archs4 import archs4 -from .gget_alphafold import alphafold -from .gget_setup import setup -from .gget_pdb import pdb -from .gget_gpt import gpt -from .gget_cellxgene import cellxgene -from .gget_elm import elm -from .gget_diamond import diamond -from .gget_cosmic import cosmic -from .gget_mutate import mutate -from .gget_opentargets import opentargets, OPENTARGETS_RESOURCES -from .gget_cbio import cbio_plot, cbio_search -from .gget_bgee import bgee -from .gget_8cube import specificity, psi_block, gene_expression -from .gget_virus import virus +from .gget_ref import ref # noqa: E402 +from .gget_search import search # noqa: E402 +from .gget_seq import seq # noqa: E402 +from .gget_setup import setup # noqa: E402 +from .gget_virus import virus # noqa: E402 # Custom formatter for help messages that preserved the text formatting and adds the default value to the end of the help message class CustomHelpFormatter(argparse.RawTextHelpFormatter): + """Help formatter that preserves text formatting and appends default values to help messages.""" + def _get_help_string(self, action): help_str = action.help if action.help else "" if ( @@ -63,28 +63,33 @@ def _get_help_string(self, action): def convert_to_list(*args): + """Return the given arguments as a list.""" args_list = list(args) return args_list def int_or_str(value): + """Return value as an int if possible, 
otherwise return it unchanged.""" try: return int(value) except ValueError: return value - + + def str_to_bool_or_none(value): - if value is None or value.lower() in ('none', 'null', ''): + """Convert a string to None, True, False, or return it unchanged.""" + if value is None or value.lower() in ("none", "null", ""): return None - if value.lower() in ('true', 'yes', 't', '1'): + if value.lower() in ("true", "yes", "t", "1"): return True - if value.lower() in ('false', 'no', 'f', '0'): + if value.lower() in ("false", "no", "f", "0"): return False # If it's not a clear boolean/None, treat as a string or raise error - return value + return value def parse_opentargets_filter(filter_arg): + """Parse a COLUMN=VALUE OpenTargets filter argument into a (key, value) tuple.""" if "=" not in filter_arg: raise argparse.ArgumentTypeError( "OpenTargets filters must be passed as COLUMN=VALUE, e.g. 'disease.id=EFO_0000274'." @@ -95,34 +100,24 @@ def parse_opentargets_filter(filter_arg): filter_value = filter_value.strip() if not filter_key: - raise argparse.ArgumentTypeError( - "OpenTargets filter column name cannot be empty." - ) + raise argparse.ArgumentTypeError("OpenTargets filter column name cannot be empty.") return filter_key, int_or_str(str_to_bool_or_none(filter_value)) def main(): - """ - Function containing argparse parsers and arguments to allow the use of gget from the terminal. 
- """ + """Function containing argparse parsers and arguments to allow the use of gget from the terminal.""" # Define parent parser - parent_parser = argparse.ArgumentParser( - description=f"gget v{__version__}", add_help=False - ) + parent_parser = argparse.ArgumentParser(description=f"gget v{__version__}", add_help=False) # Initiate subparsers parent_subparsers = parent_parser.add_subparsers(dest="command") # Define parent (not sure why I need both parent parser and parent, but otherwise it does not work) parent = argparse.ArgumentParser(add_help=False) # Add custom help argument to parent parser - parent_parser.add_argument( - "-h", "--help", action="store_true", help="Print manual." - ) + parent_parser.add_argument("-h", "--help", action="store_true", help="Print manual.") # Add custom version argument to parent parser - parent_parser.add_argument( - "-v", "--version", action="store_true", help="Print version." - ) + parent_parser.add_argument("-v", "--version", action="store_true", help="Print version.") ## gget ref subparser ref_desc = "Fetch FTPs for reference genomes and annotations by species." @@ -248,9 +243,7 @@ def main(): ) ## gget search subparser - search_desc = ( - "Fetch gene and transcript IDs from Ensembl using free-form search terms." - ) + search_desc = "Fetch gene and transcript IDs from Ensembl using free-form search terms." parser_gget = parent_subparsers.add_parser( "search", parents=[parent], @@ -460,10 +453,7 @@ def main(): "--out", type=str, required=False, - help=( - "Path to folder to save results in, e.g. path/to/directory.\n" - "Default: Standard out." - ), + help=("Path to folder to save results in, e.g. path/to/directory.\nDefault: Standard out."), ) # gget diamond parser diamond_desc = "Align multiple protein or translated DNA sequences using DIAMOND." @@ -636,8 +626,7 @@ def main(): type=str, required=False, help=( - "Path to file the results will be saved as, e.g. path/to/directory/results.json.\n" - "Default: Standard out." 
+ "Path to file the results will be saved as, e.g. path/to/directory/results.json.\nDefault: Standard out." ), ) parser_info.add_argument( @@ -697,9 +686,7 @@ def main(): default=False, action="store_true", required=False, - help=( - "Returns amino acid sequences from UniProt. (Otherwise returns nucleotide sequences from Ensembl.)" - ), + help=("Returns amino acid sequences from UniProt. (Otherwise returns nucleotide sequences from Ensembl.)"), ) parser_seq.add_argument( "-iso", @@ -752,7 +739,9 @@ def main(): ) ## gget muscle subparser - muscle_desc = "Align multiple nucleotide or amino acid sequences against each other (using the Muscle v5 algorithm)." + muscle_desc = ( + "Align multiple nucleotide or amino acid sequences against each other (using the Muscle v5 algorithm)." + ) parser_muscle = parent_subparsers.add_parser( "muscle", parents=[parent], @@ -929,9 +918,7 @@ def main(): ) ## gget blat subparser - blat_desc = ( - "BLAT a nucleotide or amino acid sequence against any BLAT UCSC assembly." - ) + blat_desc = "BLAT a nucleotide or amino acid sequence against any BLAT UCSC assembly." parser_blat = parent_subparsers.add_parser( "blat", parents=[parent], @@ -1190,10 +1177,7 @@ def main(): default=100, type=int, required=False, - help=( - "Number of correlated genes to return (default: 100).\n" - "(Only for gene correlation.)" - ), + help=("Number of correlated genes to return (default: 100).\n(Only for gene correlation.)"), ) parser_archs4.add_argument( "-s", @@ -1527,9 +1511,7 @@ def main(): ) # cellxgene parser arguments - cellxgene_desc = ( - "Query data from CZ CELLxGENE Discover (https://cellxgene.cziscience.com/)." - ) + cellxgene_desc = "Query data from CZ CELLxGENE Discover (https://cellxgene.cziscience.com/)." 
parser_cellxgene = parent_subparsers.add_parser( "cellxgene", parents=[parent], @@ -2156,9 +2138,7 @@ def main(): add_help=True, formatter_class=CustomHelpFormatter, ) - parser_cbio_subparsers = parser_cbio.add_subparsers( - dest="subcommand", help="Subcommand to execute." - ) + parser_cbio_subparsers = parser_cbio.add_subparsers(dest="subcommand", help="Subcommand to execute.") parser_cbio_search = parser_cbio_subparsers.add_parser( "search", description="Search for genes in cBioPortal.", @@ -2367,9 +2347,7 @@ def main(): formatter_class=CustomHelpFormatter, ) - parser_cube_spec.add_argument( - "genes", nargs="+", help="Gene symbols or Ensembl IDs." - ) + parser_cube_spec.add_argument("genes", nargs="+", help="Gene symbols or Ensembl IDs.") parser_cube_spec.add_argument( "-csv", @@ -2398,9 +2376,7 @@ def main(): formatter_class=CustomHelpFormatter, ) - parser_cube_psib.add_argument( - "genes", nargs="+", help="Gene symbols or Ensembl IDs." - ) + parser_cube_psib.add_argument("genes", nargs="+", help="Gene symbols or Ensembl IDs.") parser_cube_psib.add_argument( "-al", @@ -2443,13 +2419,9 @@ def main(): formatter_class=CustomHelpFormatter, ) - parser_cube_expr.add_argument( - "genes", nargs="+", help="Gene symbols or Ensembl IDs." - ) + parser_cube_expr.add_argument("genes", nargs="+", help="Gene symbols or Ensembl IDs.") - parser_cube_expr.add_argument( - "-al", "--analysis_level", required=True, help="Analysis level, e.g. 'Kidney'." - ) + parser_cube_expr.add_argument("-al", "--analysis_level", required=True, help="Analysis level, e.g. 'Kidney'.") parser_cube_expr.add_argument( "-at", @@ -2475,7 +2447,7 @@ def main(): action="store_false", help="Does not print progress information.", ) - + ## gget virus subparser virus_desc = "Download virus genome datasets and associated GenBank metadata from the NCBI Virus database." parser_virus = parent_subparsers.add_parser( @@ -2493,11 +2465,11 @@ def main(): nargs="?", default=None, help="Virus taxon name/ID to query, e.g. 
'SARS-CoV-2', 'zika virus', or taxon ID '1335626'.\n" - "When using --is_accession flag, can also be:\n" - " - Single accession: 'NC_038294.1'\n" - " - Space-separated accessions: 'NC_038294.1 NC_045512.2'\n" - " - Path to text file: 'accessions.txt' (one accession per line)\n" - "For SARS-CoV-2 and Alphainfluenza cached downloads, multiple accessions are fully supported.", + "When using --is_accession flag, can also be:\n" + " - Single accession: 'NC_038294.1'\n" + " - Space-separated accessions: 'NC_038294.1 NC_045512.2'\n" + " - Path to text file: 'accessions.txt' (one accession per line)\n" + "For SARS-CoV-2 and Alphainfluenza cached downloads, multiple accessions are fully supported.", ) parser_virus.add_argument( "-a", @@ -2506,8 +2478,8 @@ def main(): action="store_true", required=False, help="Treat the virus argument as an accession number (single, space-separated list, or text file path with one accession per line).\n" - "Single: 'NC_038294.1' | List: 'NC_038294.1 NC_045512.2' | File: 'accessions.txt'\n" - "For SARS-CoV-2 and Alphainfluenza cached downloads, multiple accessions are fully supported.", + "Single: 'NC_038294.1' | List: 'NC_038294.1 NC_045512.2' | File: 'accessions.txt'\n" + "For SARS-CoV-2 and Alphainfluenza cached downloads, multiple accessions are fully supported.", ) parser_virus.add_argument( "-o", @@ -2577,7 +2549,7 @@ def main(): parser_virus.add_argument( "--annotated", type=str_to_bool_or_none, - nargs='?', + nargs="?", const=True, required=False, default=None, @@ -2685,7 +2657,7 @@ def main(): "--vaccine_strain", default=None, type=str_to_bool_or_none, - nargs='?', + nargs="?", const=True, required=False, help="Vaccine strain filter: 'true' or 'false' or None. True will only keep sequences marked as vaccine strains. False filters out vaccine strains. 
and None (Default) will not filter based on vaccine strain status.", @@ -2693,7 +2665,7 @@ def main(): parser_virus.add_argument( "--lab_passaged", type=str_to_bool_or_none, - nargs='?', + nargs="?", const=True, required=False, default=None, @@ -2702,7 +2674,7 @@ def main(): parser_virus.add_argument( "--provirus", type=str_to_bool_or_none, - nargs='?', + nargs="?", const=True, required=False, default=None, @@ -2797,9 +2769,9 @@ def main(): default=None, dest="baseline_metadata", help="Path to a baseline metadata file (CSV/JSONL/JSON/text) containing accessions to skip.\n" - "Only new accessions (not in baseline) will be downloaded.\n" - "Useful for incremental updates or resuming after API failures.\n" - "CSV files must have an 'accession' column. Text files: one accession per line.", + "Only new accessions (not in baseline) will be downloaded.\n" + "Useful for incremental updates or resuming after API failures.\n" + "CSV files must have an 'accession' column. Text files: one accession per line.", ) parser_virus.add_argument( "--merge-results", @@ -2816,7 +2788,7 @@ def main(): dest="no_merge", required=False, help="When using --baseline, output new results separately from baseline.\n" - "Creates {virus}_new.csv (new sequences only) and {virus}_baseline_provided.csv (reference).", + "Creates {virus}_new.csv (new sequences only) and {virus}_baseline_provided.csv (reference).", ) parser_virus.add_argument( "--api_key", @@ -2824,9 +2796,9 @@ def main(): required=False, default=None, help="NCBI API key for higher E-utilities rate limits (10 requests/sec vs 3/sec without).\n" - "Obtain a free key from https://www.ncbi.nlm.nih.gov/account/settings/\n" - "Can also be set via the NCBI_API_KEY environment variable.\n" - "If not provided, requests continue at the lower default rate limit.", + "Obtain a free key from https://www.ncbi.nlm.nih.gov/account/settings/\n" + "Can also be set via the NCBI_API_KEY environment variable.\n" + "If not provided, requests continue at the 
lower default rate limit.", ) parser_virus.add_argument( "-q", @@ -2844,14 +2816,12 @@ def main(): if args.help: # Retrieve all subparsers from the parent parser subparsers_actions = [ - action - for action in parent_parser._actions - if isinstance(action, argparse._SubParsersAction) + action for action in parent_parser._actions if isinstance(action, argparse._SubParsersAction) ] for subparsers_action in subparsers_actions: # Get all subparsers and print help for choice, subparser in subparsers_action.choices.items(): - print("Subparser '{}'".format(choice)) + print(f"Subparser '{choice}'") print(subparser.format_help()) sys.exit(0) @@ -3139,14 +3109,10 @@ def main(): if args.command == "archs4": # Handle deprecated flags for backwards compatibility if args.gene_deprecated and args.gene: - logger.warning( - "The [-g][--gene] argument is deprecated, using positional argument [gene] instead." - ) + logger.warning("The [-g][--gene] argument is deprecated, using positional argument [gene] instead.") if args.gene_deprecated and not args.gene: args.gene = args.gene_deprecated - logger.warning( - "The [-g][--gene] argument is deprecated, please use positional argument [gene] instead." - ) + logger.warning("The [-g][--gene] argument is deprecated, please use positional argument [gene] instead.") if not args.gene_deprecated and not args.gene: parser_archs4.error("the following arguments are required: gene") @@ -3191,14 +3157,10 @@ def main(): if args.command == "muscle": # Handle deprecated flags for backwards compatibility if args.fasta_deprecated and args.fasta: - logger.warning( - "The [-fa][--fasta] argument is deprecated, using positional argument [fasta] instead." - ) + logger.warning("The [-fa][--fasta] argument is deprecated, using positional argument [fasta] instead.") if args.fasta_deprecated and not args.fasta: args.fasta = args.fasta_deprecated - logger.warning( - "The [-fa][--fasta] argument is deprecated, please use positional argument [fasta] instead." 
- ) + logger.warning("The [-fa][--fasta] argument is deprecated, please use positional argument [fasta] instead.") if not args.fasta_deprecated and not args.fasta: parser_muscle.error("the following arguments are required: fasta") @@ -3250,9 +3212,7 @@ def main(): if args.command == "ref": # Return all vertebrate available species if args.list_species: - species_list = ref( - species=None, release=args.release, list_species=args.list_species - ) + species_list = ref(species=None, release=args.release, list_species=args.list_species) # Save in specified directory if -o specified if args.out: directory = "/".join(args.out.split("/")[:-1]) @@ -3266,9 +3226,7 @@ def main(): # Return all invertebrate available species elif args.list_iv_species: - species_list = ref( - species=None, release=args.release, list_iv_species=args.list_iv_species - ) + species_list = ref(species=None, release=args.release, list_iv_species=args.list_iv_species) # Save in specified directory if -o specified if args.out: directory = "/".join(args.out.split("/")[:-1]) @@ -3282,9 +3240,7 @@ def main(): # Handle deprecated flags for backwards compatibility if args.species_deprecated and args.species: - logger.warning( - "The [-s][--species] argument is deprecated, using positional argument [species] instead." - ) + logger.warning("The [-s][--species] argument is deprecated, using positional argument [species] instead.") if args.species_deprecated and not args.species: args.species = args.species_deprecated logger.warning( @@ -3292,11 +3248,7 @@ def main(): ) # Raise error if neither species nor list flag passed - if ( - args.species is None - and args.list_species is False - and args.list_iv_species is False - ): + if args.species is None and args.list_species is False and args.list_iv_species is False: parser_ref.error( "the following arguments are required: species \n" "'gget ref --list_species' -> lists out all available vertebrate species. 
\n" @@ -3441,14 +3393,10 @@ def main(): if args.command == "enrichr": # Handle deprecated flags for backwards compatibility if args.genes_deprecated and args.genes: - logger.warning( - "The [-g][--genes] argument is deprecated, using positional argument [genes] instead." - ) + logger.warning("The [-g][--genes] argument is deprecated, using positional argument [genes] instead.") if args.genes_deprecated and not args.genes: args.genes = args.genes_deprecated - logger.warning( - "The [-g][--genes] argument is deprecated, please use positional argument [genes] instead." - ) + logger.warning("The [-g][--genes] argument is deprecated, please use positional argument [genes] instead.") if not args.genes_deprecated and not args.genes: parser_enrichr.error("the following arguments are required: genes") @@ -3471,9 +3419,7 @@ def main(): for gene in args.background_list: bkg_genes_clean.append(gene.split(",")) # Flatten bkg_genes_clean - bkg_genes_clean_final = [ - item for sublist in bkg_genes_clean for item in sublist - ] + bkg_genes_clean_final = [item for sublist in bkg_genes_clean for item in sublist] # Remove empty strings resulting from split while "" in genes_clean_final: bkg_genes_clean_final.remove("") @@ -3523,14 +3469,10 @@ def main(): if args.command == "info": # Handle deprecated flags for backwards compatibility if args.id_deprecated and args.ens_ids: - logger.warning( - "The [-id][--ens_ids] argument is deprecated, using positional argument [ens_ids] instead." - ) + logger.warning("The [-id][--ens_ids] argument is deprecated, using positional argument [ens_ids] instead.") if args.id_deprecated and not args.ens_ids: args.ens_ids = args.id_deprecated - logger.warning( - "The [-id][--genes] argument is deprecated, please use arguments [ens_ids] instead." 
- ) + logger.warning("The [-id][--genes] argument is deprecated, please use arguments [ens_ids] instead.") if args.ensembl_only: logger.warning( "The [-eo][--ensembl_only] argument is deprecated, please use arguments [ncbi] and [uniprot] instead." @@ -3591,9 +3533,7 @@ def main(): if args.command == "seq": # Handle deprecated flags for backwards compatibility if args.id_deprecated and args.ens_ids: - logger.warning( - "The [-id][--ens_ids] argument is deprecated, using positional argument [ens_ids] instead." - ) + logger.warning("The [-id][--ens_ids] argument is deprecated, using positional argument [ens_ids] instead.") if args.id_deprecated and not args.ens_ids: args.ens_ids = args.id_deprecated logger.warning( @@ -3624,7 +3564,7 @@ def main(): ) # Save in specified directory if -o specified - if args.out and seq_results != None: + if args.out and seq_results is not None: directory = "/".join(args.out.split("/")[:-1]) if directory != "": os.makedirs(directory, exist_ok=True) @@ -3635,7 +3575,7 @@ def main(): # Print results if no directory specified else: - if seq_results != None: + if seq_results is not None: for seq_res in seq_results: print(seq_res) @@ -3722,18 +3662,12 @@ def main(): if args.csv: opentargets_results.to_csv(f, index=False) else: - opentargets_results.to_json( - f, orient="records", force_ascii=False, indent=4 - ) + opentargets_results.to_json(f, orient="records", force_ascii=False, indent=4) else: if args.csv: opentargets_results.to_csv(sys.stdout, index=False) else: - print( - opentargets_results.to_json( - orient="records", force_ascii=False, indent=4 - ) - ) + print(opentargets_results.to_json(orient="records", force_ascii=False, indent=4)) ## cbio return if args.command == "cbio": @@ -3776,25 +3710,19 @@ def main(): if args.csv: bgee_results.to_csv(f, index=False) else: - bgee_results.to_json( - f, orient="records", force_ascii=False, indent=4 - ) + bgee_results.to_json(f, orient="records", force_ascii=False, indent=4) else: if args.csv: 
bgee_results.to_csv(sys.stdout, index=False) else: - print( - bgee_results.to_json(orient="records", force_ascii=False, indent=4) - ) + print(bgee_results.to_json(orient="records", force_ascii=False, indent=4)) ## 8cube return if args.command == "8cube": - from .gget_8cube import specificity, psi_block, gene_expression + from .gget_8cube import gene_expression, psi_block, specificity if args.cube_command is None: - parser_8cube.error( - "Please specify a subcommand: specificity, psi_block, or expression" - ) + parser_8cube.error("Please specify a subcommand: specificity, psi_block, or expression") # SPECIFICITY if args.cube_command == "specificity": @@ -3811,7 +3739,7 @@ def main(): if directory: os.makedirs(directory, exist_ok=True) - if not args.csv: # args.csv stores False + if not args.csv: # args.csv stores False pd.DataFrame(results).to_csv(args.out, index=False) else: with open(args.out, "w", encoding="utf-8") as f: @@ -3819,7 +3747,7 @@ def main(): return # Print to STDOUT - if not args.csv: # args.csv stores False + if not args.csv: # args.csv stores False pd.DataFrame(results).to_csv(sys.stdout, index=False) else: print(json.dumps(results, ensure_ascii=False, indent=4)) @@ -3841,14 +3769,14 @@ def main(): if directory: os.makedirs(directory, exist_ok=True) - if not args.csv: # args.csv stores False + if not args.csv: # args.csv stores False pd.DataFrame(results).to_csv(args.out, index=False) else: with open(args.out, "w", encoding="utf-8") as f: json.dump(results, f, ensure_ascii=False, indent=4) return - if not args.csv: # args.csv stores False + if not args.csv: # args.csv stores False pd.DataFrame(results).to_csv(sys.stdout, index=False) else: print(json.dumps(results, ensure_ascii=False, indent=4)) @@ -3870,14 +3798,14 @@ def main(): if directory: os.makedirs(directory, exist_ok=True) - if not args.csv: # args.csv stores False + if not args.csv: # args.csv stores False pd.DataFrame(results).to_csv(args.out, index=False) else: with open(args.out, "w", 
encoding="utf-8") as f: json.dump(results, f, ensure_ascii=False, indent=4) return - if not args.csv: # args.csv stores False + if not args.csv: # args.csv stores False pd.DataFrame(results).to_csv(sys.stdout, index=False) else: print(json.dumps(results, ensure_ascii=False, indent=4)) @@ -3887,48 +3815,48 @@ def main(): if args.command == "virus": # Parse has_proteins argument - convert comma-separated string to list has_proteins_arg = args.has_proteins - if has_proteins_arg and ',' in has_proteins_arg: - has_proteins_arg = [p.strip() for p in has_proteins_arg.split(',')] - + if has_proteins_arg and "," in has_proteins_arg: + has_proteins_arg = [p.strip() for p in has_proteins_arg.split(",")] + segment_arg = args.segment - if segment_arg and ',' in segment_arg: - segment_arg = [s.strip() for s in segment_arg.split(',')] + if segment_arg and "," in segment_arg: + segment_arg = [s.strip() for s in segment_arg.split(",")] isolate_arg = args.isolate - if isolate_arg and ',' in isolate_arg: - isolate_arg = [i.strip() for i in isolate_arg.split(',')] + if isolate_arg and "," in isolate_arg: + isolate_arg = [i.strip() for i in isolate_arg.split(",")] submitter_name_arg = args.submitter_name - if submitter_name_arg and ',' in submitter_name_arg: - submitter_name_arg = [a.strip() for a in submitter_name_arg.split(',')] + if submitter_name_arg and "," in submitter_name_arg: + submitter_name_arg = [a.strip() for a in submitter_name_arg.split(",")] submitter_institution_arg = args.submitter_institution - if submitter_institution_arg and ',' in submitter_institution_arg: - submitter_institution_arg = [i.strip() for i in submitter_institution_arg.split(',')] + if submitter_institution_arg and "," in submitter_institution_arg: + submitter_institution_arg = [i.strip() for i in submitter_institution_arg.split(",")] submitter_country_arg = args.submitter_country - if submitter_country_arg and ',' in submitter_country_arg: - submitter_country_arg = [c.strip() for c in 
submitter_country_arg.split(',')] - + if submitter_country_arg and "," in submitter_country_arg: + submitter_country_arg = [c.strip() for c in submitter_country_arg.split(",")] + env_source_arg = args.env_source - if env_source_arg and ',' in env_source_arg: - env_source_arg = [e.strip() for e in env_source_arg.split(',')] + if env_source_arg and "," in env_source_arg: + env_source_arg = [e.strip() for e in env_source_arg.split(",")] lineage_arg = args.lineage - if lineage_arg and ',' in lineage_arg: - lineage_arg = [l.strip() for l in lineage_arg.split(',')] + if lineage_arg and "," in lineage_arg: + lineage_arg = [l.strip() for l in lineage_arg.split(",")] genotype_arg = args.genotype - if genotype_arg and ',' in genotype_arg: - genotype_arg = [g.strip() for g in genotype_arg.split(',')] + if genotype_arg and "," in genotype_arg: + genotype_arg = [g.strip() for g in genotype_arg.split(",")] isolation_source_arg = args.isolation_source - if isolation_source_arg and ',' in isolation_source_arg: - isolation_source_arg = [i.strip() for i in isolation_source_arg.split(',')] + if isolation_source_arg and "," in isolation_source_arg: + isolation_source_arg = [i.strip() for i in isolation_source_arg.split(",")] gen_mol_type_arg = args.gen_mol_type - if gen_mol_type_arg and ',' in gen_mol_type_arg: - gen_mol_type_arg = [g.strip() for g in gen_mol_type_arg.split(',')] + if gen_mol_type_arg and "," in gen_mol_type_arg: + gen_mol_type_arg = [g.strip() for g in gen_mol_type_arg.split(",")] # Determine merge_results: --no-merge overrides --merge-results merge_results_arg = True # default diff --git a/gget/utils.py b/gget/utils.py index 399194473..461de07cf 100644 --- a/gget/utils.py +++ b/gget/utils.py @@ -1,44 +1,42 @@ -from bs4 import BeautifulSoup -import requests - import concurrent.futures +import functools import json -import re +import logging import os import time import uuid -import functools -import pandas as pd + import numpy as np -from IPython.display import 
display, HTML -import logging +import pandas as pd +import requests +from bs4 import BeautifulSoup +from IPython.display import HTML, display # from datetime import datetime # Mute numexpr threads info logging.getLogger("numexpr").setLevel(logging.WARNING) -from .constants import ( - ENSEMBL_FTP_URL, - ENSEMBL_FTP_URL_NV, - ENS_TO_PDB_API, +from .constants import ( # noqa: E402 COSMIC_RELEASE_URL, DEFAULT_REQUESTS_TIMEOUT, + ENS_TO_PDB_API, + ENSEMBL_FTP_URL, + ENSEMBL_FTP_URL_NV, ) def set_up_logger(): + """Configure and return the module logger using the GGET_LOGLEVEL environment variable.""" logging_level_name = os.getenv("GGET_LOGLEVEL", "INFO") logging_level = logging.getLevelName(logging_level_name) - if type(logging_level) != int: # unknown log level + if not isinstance(logging_level, int): # unknown log level logging_level = logging.INFO logger = logging.getLogger(__name__) logger.setLevel(logging_level) if not logger.hasHandlers(): - formatter = logging.Formatter( - "%(asctime)s - %(levelname)s - %(message)s", "%H:%M:%S" - ) + formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s", "%H:%M:%S") console_handler = logging.StreamHandler() console_handler.setFormatter(formatter) @@ -63,15 +61,13 @@ def set_up_logger(): def flatten(xss): - """ - Function to flatten a list of lists. - """ + """Function to flatten a list of lists.""" return [x for xs in xss for x in xs] def parallel_map(fn, items, *, max_workers=None): - """ - Apply `fn` to each item using a thread pool and return the results + """Apply `fn` to each item using a thread pool and return the results + in input order. Designed for I/O-bound work — typically per-ID HTTP calls — where the per-call latency is dominated by network RTT. 
@@ -105,8 +101,8 @@ def http_json( backoff=1.0, **kwargs, ): - """ - Issue an HTTP request and return the parsed JSON body, raising a + """Issue an HTTP request and return the parsed JSON body, raising a + RuntimeError with consistent context if the request fails or the body is not valid JSON. @@ -142,15 +138,13 @@ def http_json( # to the caller without retry. if response.status_code < 500: body = response.text[:200] if response.text else "" - raise RuntimeError( - f"{label} returned HTTP {response.status_code}. Body: {body}" - ) + raise RuntimeError(f"{label} returned HTTP {response.status_code}. Body: {body}") last_exc = None last_status = response.status_code last_body = response.text[:200] if response.text else "" if attempt < attempts - 1: - delay = backoff * (2 ** attempt) + delay = backoff * (2**attempt) logger.warning( f"{label}: transient failure (%s); retrying in %.1fs (attempt %d/%d).", last_exc or f"HTTP {last_status}", @@ -161,17 +155,13 @@ def http_json( time.sleep(delay) if last_exc is not None: - raise RuntimeError( - f"{label} request failed after {attempts} attempts: {last_exc}" - ) from last_exc - raise RuntimeError( - f"{label} returned HTTP {last_status} after {attempts} attempts. Body: {last_body}" - ) + raise RuntimeError(f"{label} request failed after {attempts} attempts: {last_exc}") from last_exc + raise RuntimeError(f"{label} returned HTTP {last_status} after {attempts} attempts. Body: {last_body}") def dig(obj, *path, context=""): - """ - Walk a nested key path through `obj` and return the resulting value. + """Walk a nested key path through `obj` and return the resulting value. + Raises RuntimeError with `context` if any intermediate key is missing or any intermediate value is not a dict. 
Use to make `response["data"]["target"]`-style access fail with a clear message @@ -182,26 +172,24 @@ def dig(obj, *path, context=""): if not isinstance(cur, dict) or key not in cur: traversed = ".".join(path[:i]) or "" label = f"{context}: " if context else "" - raise RuntimeError( - f"{label}expected key '{key}' under {traversed} in response." - ) + raise RuntimeError(f"{label}expected key '{key}' under {traversed} in response.") cur = cur[key] return cur def get_latest_cosmic(): + """Fetch and return the latest COSMIC release version number.""" html = requests.get(COSMIC_RELEASE_URL) if html.status_code != 200: - raise RuntimeError( - f"The COSMIC server returned error status code {html.status_code}. Please try again." - ) + raise RuntimeError(f"The COSMIC server returned error status code {html.status_code}. Please try again.") soup = BeautifulSoup(html.text, "html.parser") return int(soup.find("div", class_="news").get("id").split("v")[-1]) def check_file_for_error_message(filepath, filename, download_path): - with open(filepath, "r", encoding="utf-8") as file: + """Raise a ValueError if the downloaded file contains a known server error message.""" + with open(filepath, encoding="utf-8") as file: content = file.read().strip() # Define common error indicators @@ -217,7 +205,7 @@ def check_file_for_error_message(filepath, filename, download_path): if any(keyword in content for keyword in error_keywords): raise ValueError( f""" - The {filename} downloaded from {download_path} + The {filename} downloaded from {download_path} contains an error message instead of valid data.\n Error message:\n{content}\n Please try again. If the problem persists, please report it here: https://github.com/pachterlab/gget/issues/new?template=issue_report.yml @@ -226,11 +214,10 @@ def check_file_for_error_message(filepath, filename, download_path): def read_fasta(fasta): - """ + """Return titles and seqs from a fasta file as two list objects. + Args: - fasta (str) Path to fasta file. 
- - Returns titles and seqs from fasta file as two list objects. """ titles = [] seqs = [] @@ -269,11 +256,10 @@ def read_fasta(fasta): def n_colors(nucleotide): - """ - Returns a string format to print the nucleotide + """Returns a string format to print the nucleotide + with its appropriate background color according to the Clustal Colour Scheme. """ - # Raw python background colors # References: # https://stackabuse.com/how-to-print-colored-text-in-python/ @@ -305,7 +291,7 @@ def n_colors(nucleotide): # If the nucleotide does not fall into the defined color categories, # make it white (e.g. "-") - if bkg_color == None: + if bkg_color is None: bkg_color = raw_colors["white"] if letter_color is not None and letter_color in ["blue", "red"]: @@ -319,12 +305,11 @@ def n_colors(nucleotide): def aa_colors(amino_acid): - """ - Returns a string format to print the amino acid + """Returns a string format to print the amino acid + with its appropriate background color according to the Clustal Colour Scheme: - http://www.jalview.org/help/html/colourSchemes/clustal.html + http://www.jalview.org/help/html/colourSchemes/clustal.html. """ - # Raw python background colors # References: # https://stackabuse.com/how-to-print-colored-text-in-python/ @@ -364,7 +349,7 @@ def aa_colors(amino_acid): # If the amino acid does not fall into the defined color categories, # make it white (e.g. "-") - if bkg_color == None: + if bkg_color is None: bkg_color = raw_colors["white"] if letter_color is not None and letter_color in [ @@ -399,9 +384,7 @@ def _fetch_uniprot_for_id(server, id_): ) payload = r.json() if len(payload["results"]) > 0: - logger.warning( - f"No reviewed UniProt results were found for ID {id_}. Returning all unreviewed results." - ) + logger.warning(f"No reviewed UniProt results were found for ID {id_}. 
Returning all unreviewed results.") if not len(payload["results"]) > 0: logger.warning(f"No UniProt sequences were found for ID {id_}.") @@ -435,8 +418,7 @@ def _fetch_uniprot_for_id(server, id_): def get_uniprot_seqs(server, ensembl_ids): - """ - Retrieve UniProt sequences based on Ensemsbl, WormBase or FlyBase identifiers. + """Retrieve UniProt sequences based on Ensemsbl, WormBase or FlyBase identifiers. Args: - server Link to UniProt REST API server. @@ -444,17 +426,14 @@ def get_uniprot_seqs(server, ensembl_ids): Returns data frame with UniProt ID, gene name, organism, sequence, sequence length, and query ID. """ - # If a single UniProt ID is passed as string, convert to list - if type(ensembl_ids) == str: + if isinstance(ensembl_ids, str): ensembl_ids = [ensembl_ids] # Fan out per-ID requests across a thread pool. Each call is independent # and entirely I/O-bound, so the wall-clock saving on a list of IDs is # roughly the pool size. Override with GGET_MAX_WORKERS env var. - results = parallel_map( - lambda id_: _fetch_uniprot_for_id(server, id_), ensembl_ids - ) + results = parallel_map(lambda id_: _fetch_uniprot_for_id(server, id_), ensembl_ids) per_id_dfs = [df for df in results if df is not None] if per_id_dfs: return pd.concat(per_id_dfs, ignore_index=True) @@ -462,8 +441,7 @@ def get_uniprot_seqs(server, ensembl_ids): def get_uniprot_info(server, ensembl_id, verbose=True): - """ - Retrieve UniProt synonyms and description based on Ensemsbl identifiers. + """Retrieve UniProt synonyms and description based on Ensemsbl identifiers. Args: - server Link to UniProt REST API server. 
@@ -520,7 +498,7 @@ def get_uniprot_info(server, ensembl_id, verbose=True): for i in np.arange(len(json["results"])): try: gene_names.append(json["results"][i]["genes"][0]["geneName"]["value"]) - except: + except Exception: # noqa: BLE001 gene_names.append(np.nan) df["primary_gene_name"] = gene_names @@ -531,7 +509,7 @@ def get_uniprot_info(server, ensembl_id, verbose=True): try: for syn in json["results"][i]["genes"][0]["synonyms"]: uni_syn_temp.append(syn["value"]) - except: + except Exception: # noqa: BLE001 uni_syn_temp.append(np.nan) uni_synonyms.append(uni_syn_temp) df["uni_synonyms"] = uni_synonyms @@ -540,12 +518,8 @@ def get_uniprot_info(server, ensembl_id, verbose=True): protein_names = [] for i in np.arange(len(json["results"])): try: - protein_names.append( - json["results"][i]["proteinDescription"]["recommendedName"][ - "fullName" - ]["value"] - ) - except: + protein_names.append(json["results"][i]["proteinDescription"]["recommendedName"]["fullName"]["value"]) + except Exception: # noqa: BLE001 protein_names.append(np.nan) df["protein_names"] = protein_names @@ -561,7 +535,7 @@ def get_uniprot_info(server, ensembl_id, verbose=True): des_temp = np.unique(np.array(des_temp)) # Append all descriptions to a single string object des_temp = " ".join(des_temp) - except: + except Exception: # noqa: BLE001 des_temp.append(np.nan) descriptions.append(des_temp) @@ -577,7 +551,7 @@ def get_uniprot_info(server, ensembl_id, verbose=True): if comment_json["commentType"] == "SUBCELLULAR LOCATION": for location_dict in comment_json["subcellularLocations"]: subcel_locs.append(location_dict["location"]["value"]) - except: + except Exception: # noqa: BLE001 pass subcel_locs_final.append(subcel_locs) @@ -601,16 +575,16 @@ def get_uniprot_info(server, ensembl_id, verbose=True): syn_lists = df[column].values try: flat_list = [item for sublist in syn_lists for item in sublist] - final_df[column] = [list({value: "" for value in flat_list})] + final_df[column] = 
[list(dict.fromkeys(flat_list, ""))] - except: + except Exception: # noqa: BLE001 final_df[column] = [syn_lists] else: val_list = df[column].values try: - final_df[column] = [list({value: "" for value in val_list})] - except: + final_df[column] = [list(dict.fromkeys(val_list, ""))] + except Exception: # noqa: BLE001 final_df[column] = [val_list] # Try to clean up the entries (so they are not a bunch of lists of one item) @@ -618,8 +592,8 @@ def get_uniprot_info(server, ensembl_id, verbose=True): if len(final_df[column]) == 1 and column != "uni_synonyms": try: final_df[column] = final_df[column][0] - except: - None + except Exception: # noqa: BLE001 + pass return final_df @@ -724,14 +698,13 @@ def get_uniprot_info(server, ensembl_id, verbose=True): def get_pdb_ids(ens_id): - """ - Function to fetch all PDB IDs linked to an Ensembl ID. - using the PDBe API https://wwwdev.ebi.ac.uk/pdbe/aggregated-api/mappings/ensembl_to_pdb/[ens_id] + """Function to fetch all PDB IDs linked to an Ensembl ID. + + using the PDBe API https://wwwdev.ebi.ac.uk/pdbe/aggregated-api/mappings/ensembl_to_pdb/[ens_id]. API documentation: https://www.ebi.ac.uk/pdbe/aggregated-api/#/SIFTS/get_ensembl_to_pdb_mappings_api_mappings_ensembl_to_pdb__gene_id__get """ - res = requests.get(ENS_TO_PDB_API + ens_id) if not res.ok: @@ -748,12 +721,12 @@ def get_pdb_ids(ens_id): for entry in pdb_dict: pdb_ids.append(entry["pdb_id"]) - return sorted(list(set(pdb_ids))) + return sorted(set(pdb_ids)) def wrap_cols_func(df, cols): - """ - Function to wrap columns cols of a + """Function to wrap columns cols of a + data frame df for easier reading. """ for col in cols: @@ -763,8 +736,7 @@ def wrap_cols_func(df, cols): def rest_query(server, query, content_type): - """ - Function to perform a REST API query. + """Function to perform a REST API query. Args: - server Server to query. @@ -773,13 +745,11 @@ def rest_query(server, query, content_type): Returns server output. 
""" - r = requests.get(server + query, headers={"Content-Type": content_type}) if not r.ok: raise RuntimeError( - f"{server} returned error status code {r.status_code}. " - "Please double-check arguments and try again.\n" + f"{server} returned error status code {r.status_code}. Please double-check arguments and try again.\n" ) if content_type == "application/json": @@ -789,8 +759,7 @@ def rest_query(server, query, content_type): def post_query(server, endpoint, query): - """ - Function to perform a POST API query. + """Function to perform a POST API query. :param server: Server to query . :param endpoint: Server endpoint @@ -798,23 +767,18 @@ def post_query(server, endpoint, query): :return: server output """ - - r = requests.post( - server + endpoint, json=query, headers={"Content-Type": "application/json"} - ) + r = requests.post(server + endpoint, json=query, headers={"Content-Type": "application/json"}) if not r.ok: raise RuntimeError( - f"{server} returned error status code {r.status_code}. " - "Please double-check arguments and try again.\n" + f"{server} returned error status code {r.status_code}. Please double-check arguments and try again.\n" ) return r.json() def graphql_query(server, query, variables): - """ - Function to perform a GraphQL API query. + """Function to perform a GraphQL API query. Args: - server Server to query. @@ -823,25 +787,20 @@ def graphql_query(server, query, variables): Returns server output. """ - r = requests.post(server, json={"query": query, "variables": variables}) if not r.ok: - logger.debug( - f"Server: {server}, Query: {query}, Variables: {variables}, Response: {r.text}" - ) + logger.debug(f"Server: {server}, Query: {query}, Variables: {variables}, Response: {r.text}") raise RuntimeError( - f"{server} returned error status code {r.status_code}. " - "Please double-check arguments and try again.\n" + f"{server} returned error status code {r.status_code}. 
Please double-check arguments and try again.\n" ) return r.json() -@functools.lru_cache(maxsize=None) +@functools.cache def find_latest_ens_rel(database=ENSEMBL_FTP_URL): - """ - Returns the latest Ensembl release number. + """Returns the latest Ensembl release number. Args: - database Link to Ensembl database. @@ -870,18 +829,15 @@ def find_latest_ens_rel(database=ENSEMBL_FTP_URL): html = requests.get(database + "VERSION") if html.status_code != 200: - raise RuntimeError( - f"The Ensembl FTP server returned error status code {html.status_code}. Please try again." - ) + raise RuntimeError(f"The Ensembl FTP server returned error status code {html.status_code}. Please try again.") ENS_rel = int(html.text) return ENS_rel -@functools.lru_cache(maxsize=None) +@functools.cache def search_species_options(database=ENSEMBL_FTP_URL, release=None): - """ - Function to find all available species core databases for gget search. + """Function to find all available species core databases for gget search. Args: - release Ensembl release for which the databases are fetched. @@ -894,11 +850,9 @@ def search_species_options(database=ENSEMBL_FTP_URL, release=None): ENS_rel = find_latest_ens_rel(database) # If release != None, use user-defined Ensembl release - if release != None: + if release is not None: if release > ENS_rel: - logger.warning( - f"Provided Ensembl release number {release} is greater than the latest release ({ENS_rel})." - ) + logger.warning(f"Provided Ensembl release number {release} is greater than the latest release ({ENS_rel}).") ENS_rel = release ## Find all available databases @@ -929,9 +883,7 @@ def search_species_options(database=ENSEMBL_FTP_URL, release=None): # Raise error if status code not "OK" Response if html.status_code != 200: - raise RuntimeError( - f"The Ensembl server returned error status code {html.status_code}. Please try again." - ) + raise RuntimeError(f"The Ensembl server returned error status code {html.status_code}. 
Please try again.") soup = BeautifulSoup(html.text, "html.parser") @@ -944,8 +896,9 @@ def search_species_options(database=ENSEMBL_FTP_URL, release=None): return databases -@functools.lru_cache(maxsize=None) +@functools.cache def find_nv_kingdom(species, release): + """Return the Ensembl non-vertebrate kingdom that contains the given species for a release.""" kds = ["plants", "protists", "metazoa", "fungi"] for kingdom in kds: url = ENSEMBL_FTP_URL_NV + f"release-{release}/{kingdom}/fasta/" @@ -953,9 +906,7 @@ def find_nv_kingdom(species, release): # Raise error if status code not "OK" Response if html.status_code != 200: - raise RuntimeError( - f"The Ensembl server returned error status code {html.status_code}. Please try again." - ) + raise RuntimeError(f"The Ensembl server returned error status code {html.status_code}. Please try again.") # Parse the html and generate a clean list of the available genomes soup = BeautifulSoup(html.text, "html.parser") @@ -969,10 +920,9 @@ def find_nv_kingdom(species, release): return kingdom -@functools.lru_cache(maxsize=None) +@functools.cache def ref_species_options(which, database=ENSEMBL_FTP_URL, release=None): - """ - Function to find all available species for gget ref. + """Function to find all available species for gget ref. Args: - which Which type of file to check for. @@ -987,12 +937,10 @@ def ref_species_options(which, database=ENSEMBL_FTP_URL, release=None): ENS_rel = find_latest_ens_rel(database) # If release != None, use user-defined Ensembl release - if release != None: + if release is not None: # Warn user if user-defined release is higher than the latest release if release > ENS_rel: - logger.warning( - f"Provided Ensembl release number {release} is greater than the latest release ({ENS_rel})." 
- ) + logger.warning(f"Provided Ensembl release number {release} is greater than the latest release ({ENS_rel}).") ENS_rel = release # Handle structure of non-vertebrate database @@ -1034,9 +982,7 @@ def ref_species_options(which, database=ENSEMBL_FTP_URL, release=None): # Raise error if status code not "OK" Response if html.status_code != 200: - raise RuntimeError( - f"The Ensembl server returned error status code {html.status_code}. Please try again." - ) + raise RuntimeError(f"The Ensembl server returned error status code {html.status_code}. Please try again.") # Parse the html and generate a clean list of the available genomes soup = BeautifulSoup(html.text, "html.parser") @@ -1052,8 +998,8 @@ def ref_species_options(which, database=ENSEMBL_FTP_URL, release=None): def parse_blast_ref_page(handle): - """ - Extract RID and RTOE from the NCBI 'please wait' page (handle). + """Extract RID and RTOE from the NCBI 'please wait' page (handle). + RTOE = 'Estimated time fo completion.' RID = 'Request ID'. @@ -1064,7 +1010,6 @@ def parse_blast_ref_page(handle): Biopython License Agreement and BSD 3-Clause License https://github.com/biopython/biopython/blob/171697883aca6894f8367f8f20f1463ce7784d0c/LICENSE.rst """ - # Decode handle string = handle.read().decode() @@ -1107,9 +1052,7 @@ def parse_blast_ref_page(handle): msg = string[i:].split("<", 1)[0].split("\n", 1)[0].strip() raise ValueError(f"Error message from NCBI: {msg}") # Raise general error, if the error layout was not recognized - raise ValueError( - "No request ID and no estimated time to completion were found in the NCBI 'please wait' page." 
- ) + raise ValueError("No request ID and no estimated time to completion were found in the NCBI 'please wait' page.") # Raise error if RTOE was found but RID was not elif not rid: raise ValueError( @@ -1126,12 +1069,11 @@ def parse_blast_ref_page(handle): except ValueError: raise ValueError( f"A non-integer estimated time to completion was found in the NCBI 'please wait' page: '{rtoe}'." - ) + ) from None def tsv_to_df(tsv_file, headers=None, skiprows=None): - """ - Convert tsv file to dataframe format. + """Convert tsv file to dataframe format. Args: - tsv_file File to be converted @@ -1143,12 +1085,11 @@ def tsv_to_df(tsv_file, headers=None, skiprows=None): return df except pd.errors.EmptyDataError: - raise RuntimeError(f"tsv to data frame reformatting failed.") + raise RuntimeError("tsv to data frame reformatting failed.") from None def create_tmp_fasta(sequences): - """ - Create temporary FASTA file from str or list of sequences. + """Create temporary FASTA file from str or list of sequences. Args: - sequences List of user input amino acid sequences @@ -1158,7 +1099,7 @@ def create_tmp_fasta(sequences): # Generate random ID random_id = str(uuid.uuid4()) - if type(sequences) == str: + if isinstance(sequences, str): sequences = [sequences] with open(f"tmp_{random_id}.fa", "w") as f: @@ -1169,8 +1110,7 @@ def create_tmp_fasta(sequences): def remove_temp_files(files_to_delete): - """ - Delete temporary files. + """Delete temporary files. Args: - files_to_delete List of paths to files to delete. @@ -1181,8 +1121,7 @@ def remove_temp_files(files_to_delete): def json_list_to_df(json_list, columns) -> pd.DataFrame: - """ - Convert list of JSON objects to data frame. + """Convert list of JSON objects to data frame. Args: @@ -1192,7 +1131,6 @@ def json_list_to_df(json_list, columns) -> pd.DataFrame: Returns data frame with columns as specified in keys. 
""" - tmp_columns = [[] for _ in range(len(columns))] for json_obj in json_list: @@ -1206,7 +1144,7 @@ def json_list_to_df(json_list, columns) -> pd.DataFrame: value = value[k] tmp_columns[i].append(value) - return pd.DataFrame({key[0]: value for key, value in zip(columns, tmp_columns)}) + return pd.DataFrame({key[0]: value for key, value in zip(columns, tmp_columns, strict=False)}) # FASTA parsing functionality @@ -1219,8 +1157,10 @@ def json_list_to_df(json_list, columns) -> pd.DataFrame: # functionality specifically for FASTA files, maintaining compatibility with # the original BioPython API while removing the external dependency. + class FastaRecord: - """Simple FASTA record class compatible with BioPython SeqIO.SeqRecord""" + """Simple FASTA record class compatible with BioPython SeqIO.SeqRecord.""" + def __init__(self, seq, id, description=""): self.seq = seq self.id = id @@ -1228,35 +1168,35 @@ def __init__(self, seq, id, description=""): class FastaIO: - """Simple FASTA parser and writer, compatible with BioPython SeqIO interface""" - + """Simple FASTA parser and writer, compatible with BioPython SeqIO interface.""" + @staticmethod def parse(filename, format=None): - """Parse FASTA file and yield records. Compatible with SeqIO.parse()""" + """Parse FASTA file and yield records. 
Compatible with SeqIO.parse().""" if format and format.lower() != "fasta": raise ValueError(f"Unsupported format: {format}") - - with open(filename, 'r', encoding='utf-8') as handle: + + with open(filename, encoding="utf-8") as handle: current_id = None current_description = "" current_seq = [] - + for line in handle: line = line.strip() if not line: continue - - if line.startswith('>'): + + if line.startswith(">"): # Yield previous record if exists if current_id is not None: - seq_str = ''.join(current_seq) + seq_str = "".join(current_seq) yield FastaRecord(seq_str, current_id, current_description) - + # Start new record header = line[1:] # Remove '>' - if ' ' in header: - current_id = header.split(' ', 1)[0] - current_description = header.split(' ', 1)[1] + if " " in header: + current_id = header.split(" ", 1)[0] + current_description = header.split(" ", 1)[1] else: current_id = header current_description = "" @@ -1264,27 +1204,27 @@ def parse(filename, format=None): else: # Accumulate sequence current_seq.append(line) - + # Yield final record if exists if current_id is not None: - seq_str = ''.join(current_seq) + seq_str = "".join(current_seq) yield FastaRecord(seq_str, current_id, current_description) - + @staticmethod def write(records, filename, format=None): - """Write records to FASTA file. Compatible with SeqIO.write()""" + """Write records to FASTA file. 
Compatible with SeqIO.write().""" if format and format.lower() != "fasta": raise ValueError(f"Unsupported format: {format}") - - with open(filename, 'w', encoding='utf-8') as handle: + + with open(filename, "w", encoding="utf-8") as handle: for record in records: # Write header - if hasattr(record, 'description') and record.description: + if hasattr(record, "description") and record.description: handle.write(f">{record.id} {record.description}\n") else: handle.write(f">{record.id}\n") - + # Write sequence (wrap at 70 characters) seq_str = str(record.seq) for i in range(0, len(seq_str), 70): - handle.write(seq_str[i:i+70] + '\n') + handle.write(seq_str[i : i + 70] + "\n") diff --git a/pyproject.toml b/pyproject.toml index 8fe2f47af..0947f81fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,128 @@ [build-system] -requires = ["setuptools>=42", "wheel"] -build-backend = "setuptools.build_meta" +build-backend = "hatchling.build" +requires = [ "hatchling" ] + +[project] +name = "gget" +version = "0.30.6" +description = "Efficient querying of genomic databases." 
+readme = "README.md" +keywords = [ "gget" ] +license = "BSD-2-Clause" +license-files = [ "LICENSE" ] +maintainers = [ + { name = "Laura Luebbert", email = "lauralubbert@gmail.com" }, +] +authors = [ + { name = "Laura Luebbert", email = "lauralubbert@gmail.com" }, +] +requires-python = ">=3.12" +classifiers = [ + "Environment :: Console", + "Framework :: Jupyter", + "Intended Audience :: Science/Research", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Topic :: Utilities", +] +dependencies = [ + "beautifulsoup4>=4.10", + "ipython", + "ipywidgets", + "lxml", + "matplotlib", + "mysql-connector-python>=8.0.32", + "numpy>=1.17.2", + "pandas>=1", + "requests>=2.22", + "tqdm", +] +# Optional feature dependency for `gget cellxgene` (install: pip install gget[cellxgene]). +# No wheels for the newest Python versions yet (e.g. 3.14 via tiledbsoma). +optional-dependencies.cellxgene = [ "cellxgene-census" ] +# https://docs.pypi.org/project_metadata/#project-urls +urls.Documentation = "https://pachterlab.github.io/gget" +urls.Homepage = "https://github.com/pachterlab/gget" +urls.Source = "https://github.com/pachterlab/gget" +scripts.gget = "gget.main:main" + +[dependency-groups] +dev = [ "pre-commit" ] +test = [ + "bravado==11.0.3", + "coverage>=7", + "openai<=0.28.1", + "parameterized==0.9", + "pytest>=7", + "pytest-cov>=6.2.1", +] + +[tool.hatch] +build.targets.wheel.packages = [ "gget" ] +envs.default.installer = "uv" +envs.hatch-test.matrix = [ + { python = [ "3.12", "3.13", "3.14" ] }, +] +# cellxgene-census (the `cellxgene` extra) has no wheels for the newest Python +# versions yet (e.g. 3.14 via tiledbsoma), so install it only where available; +# the gget cellxgene test skips itself when the dependency is absent. 
+envs.hatch-test.overrides.matrix.python.features = [ + { value = "cellxgene", if = [ "3.12", "3.13" ] }, +] +# pyproject.toml is the single source of truth for the environments CI tests: +# the workflow reads this matrix via `hatch env show --json`. +envs.hatch-test.default-args = [ "tests" ] +envs.hatch-test.dependency-groups = [ "test" ] + +[tool.ruff] +line-length = 120 +src = [ "gget" ] +extend-include = [ "*.ipynb" ] +format.docstring-code-format = true +lint.select = [ + "B", # flake8-bugbear + "BLE", # flake8-blind-except + "C4", # flake8-comprehensions + "D", # pydocstyle + "E", # Error detected by Pycodestyle + "F", # Errors detected by Pyflakes + "I", # isort + "RUF100", # Report unused noqa directives + "TID", # flake8-tidy-imports + "UP", # pyupgrade + "W", # Warning detected by Pycodestyle +] +lint.ignore = [ + "B008", # Errors from function calls in argument defaults. These are fine when the result is immutable. + "D100", # Missing docstring in public module + "D104", # Missing docstring in public package + "D105", # __magic__ methods are often self-explanatory, allow missing docstrings + "D107", # Missing docstring in __init__ + # Disable one in each pair of mutually incompatible rules + "D203", # We don’t want a blank line before a class docstring + "D213", # <> We want docstrings to start immediately after the opening triple quote + "D400", # first line should end with a period [Bug: doesn’t work with single-line docstrings] + "D401", # First line should be in imperative mood; try rephrasing + "E501", # line too long -> we accept long comment lines; formatter gets rid of long code lines + "E731", # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient + "E741", # allow I, O, l as variable names -> I is the identity matrix +] +lint.per-file-ignores."*/__init__.py" = [ "F401" ] +lint.per-file-ignores."docs/*" = [ "I" ] +lint.per-file-ignores."tests/*" = [ "D" ] +lint.pydocstyle.convention = "numpy" + 
+[tool.pytest] +ini_options.testpaths = [ "tests" ] +ini_options.addopts = [ "-ra" ] + +[tool.coverage] +run.omit = [ + "**/test_*.py", + "gget/main.py", +] +run.source = [ "gget" ] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index d9b0ef104..000000000 --- a/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -# Also add new dependencies to setup.cfg -numpy>=1.17.2 -pandas>=1.0.0 -requests>=2.22.0 -ipython -matplotlib -mysql-connector-python>=8.0.32 -beautifulsoup4>=4.10.0 -ipywidgets -tqdm -lxml diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index fffdbc011..000000000 --- a/setup.cfg +++ /dev/null @@ -1,43 +0,0 @@ -[metadata] -name = gget -version = 0.30.7 -author = Laura Luebbert -author_email = lauralubbert@gmail.com -maintainer = Laura Luebbert -maintainer_email = lauralubbert@gmail.com -description = Efficient querying of genomic databases. -long_description = file: README.md -long_description_content_type = text/markdown -license = BSD-2 -url = https://github.com/scverse/gget -keywords = gget -classifiers = - Environment :: Console - Framework :: Jupyter - Intended Audience :: Science/Research - License :: OSI Approved :: BSD License - Operating System :: OS Independent - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Programming Language :: Python :: 3.10 - Programming Language :: Python :: 3.11 - Programming Language :: Python :: 3.12 - Topic :: Scientific/Engineering :: Bio-Informatics - Topic :: Utilities - -[options] -python_requires = >=3.8 -packages = find: -include_package_data = True -zip_safe = False -install_requires = - numpy>=1.17.2 - pandas>=1.0.0 - requests>=2.22.0 - ipython - matplotlib - mysql-connector-python>=8.0.32 - beautifulsoup4>=4.10.0 - ipywidgets - tqdm - lxml diff --git a/setup.py b/setup.py deleted file mode 100644 index 0ff30dc7d..000000000 --- a/setup.py +++ /dev/null @@ -1,9 +0,0 @@ -from setuptools import setup, find_packages - -setup( - 
packages=find_packages(include=["gget", "gget.*"]), - include_package_data=True, - entry_points={ - "console_scripts": ["gget=gget.main:main"], - }, -) diff --git a/tests/from_json.py b/tests/from_json.py index 2cc562a9d..1b41cbcf6 100644 --- a/tests/from_json.py +++ b/tests/from_json.py @@ -1,14 +1,16 @@ from __future__ import annotations import unittest + unittest.TestCase.maxDiff = 10_000 # from typing import Callable, Any, Optional, Union -import logging -import pandas as pd -import sys -import json -import hashlib +import hashlib # noqa: E402 +import json # noqa: E402 +import logging # noqa: E402 +import sys # noqa: E402 + +import pandas as pd # noqa: E402 # Here's a question: how many errors does Copilot know? Answer: see below. _KNOWN_ERRORS = { @@ -66,16 +68,17 @@ def assert_equal(self: unittest.TestCase): result_to_test = do_call(func, td[test]["args"]) if test == "test_cosmic_defaults": # special case for cosmic import numpy as np + expected_result = pd.DataFrame(expected_result[0]) expected_result = expected_result.replace({None: np.nan}) # result_to_test.equals(expected_result) pd.testing.assert_frame_equal(result_to_test, expected_result, check_dtype=False) return - + # If result is a DataFrame, convert to list if isinstance(result_to_test, pd.DataFrame): result_to_test = result_to_test.dropna(axis=1).values.tolist() - + self.assertEqual(result_to_test, expected_result) return assert_equal @@ -117,10 +120,7 @@ def assert_none(self: unittest.TestCase): self.assertIn( expected_log, joined, - msg=( - f"Expected log substring {expected_log!r} not found. " - f"Captured: {joined}" - ), + msg=(f"Expected log substring {expected_log!r} not found. 
Captured: {joined}"), ) else: result_to_test = do_call(func, td[test]["args"]) @@ -154,9 +154,7 @@ def assert_equal_nested(self: unittest.TestCase): result_to_test = do_call(func, td[test]["args"]) # If result is a DataFrame, convert to json (nested dataframes prevent easy listification) if isinstance(result_to_test, pd.DataFrame): - result_to_test = json.loads( - result_to_test.to_json(orient="records", force_ascii=False) - ) + result_to_test = json.loads(result_to_test.to_json(orient="records", force_ascii=False)) self.assertEqual(result_to_test, expected_result) @@ -170,9 +168,7 @@ def assert_equal_json_hash_nested(self: unittest.TestCase): result_to_test = do_call(func, td[test]["args"]) # If result is a DataFrame, convert to json (nested dataframes prevent easy listification) if isinstance(result_to_test, pd.DataFrame): - result_to_test = json.loads( - result_to_test.to_json(orient="records", force_ascii=False) - ) + result_to_test = json.loads(result_to_test.to_json(orient="records", force_ascii=False)) result_to_test = json.dumps(result_to_test) result_to_test = hashlib.md5(result_to_test.encode()).hexdigest() @@ -181,6 +177,7 @@ def assert_equal_json_hash_nested(self: unittest.TestCase): return assert_equal_json_hash_nested + def _assert_equal_json_with_keys(name, td, func): def assert_equal_json_with_keys(self: unittest.TestCase): def normalize(x): @@ -194,10 +191,10 @@ def normalize(x): if all(isinstance(i, list) and len(i) == 2 for i in x): try: x = {i[0]: i[1] for i in x} - except Exception: + except Exception: # noqa: BLE001 try: x = {i[1]: i[0] for i in x} - except Exception: + except Exception: # noqa: BLE001 pass # Collapse singleton wrappers such as: @@ -218,16 +215,14 @@ def normalize(x): return x return x - + test = name - + expected_result = td[test]["expected_result"] result_to_test = do_call(func, td[test]["args"]) if isinstance(result_to_test, pd.DataFrame): result_to_test = json.loads( - result_to_test.dropna(axis=1, how="all").to_json( - 
orient="records", force_ascii=False - ) + result_to_test.dropna(axis=1, how="all").to_json(orient="records", force_ascii=False) ) result_to_test = normalize(result_to_test) @@ -240,19 +235,12 @@ def normalize(x): keys = list(expected_result[0].keys()) # Convert list-of-lists → list-of-dicts - if ( - isinstance(result_to_test, list) - and len(result_to_test) > 0 - and isinstance(result_to_test[0], list) - ): - result_to_test = [dict(zip(keys, row)) for row in result_to_test] + if isinstance(result_to_test, list) and len(result_to_test) > 0 and isinstance(result_to_test[0], list): + result_to_test = [dict(zip(keys, row, strict=False)) for row in result_to_test] # Optional: float rounding def round_dict(d, ndigits=10): - return { - k: round(v, ndigits) if isinstance(v, float) else v - for k, v in d.items() - } + return {k: round(v, ndigits) if isinstance(v, float) else v for k, v in d.items()} result_to_test = [round_dict(r) for r in result_to_test] expected_result = [round_dict(r) for r in expected_result] @@ -261,12 +249,13 @@ def round_dict(d, ndigits=10): return assert_equal_json_with_keys + def _error(name, td, func): try: # noinspection PyPep8Naming Error = td[name]["expected_result"] except KeyError: - raise ValueError("Error test must have an 'expected_result' key.") + raise ValueError("Error test must have an 'expected_result' key.") from None if Error not in _KNOWN_ERRORS: raise ValueError(f"Unknown error type: {Error}") @@ -275,9 +264,7 @@ def _error(name, td, func): Error = _KNOWN_ERRORS[Error] if "expected_msg" not in td[name]: - print( - f"^ Warning: 'error' test should have an 'expected_msg' key, but test '{name}' lacks one." 
- ) + print(f"^ Warning: 'error' test should have an 'expected_msg' key, but test '{name}' lacks one.") def error(self: unittest.TestCase): test = name @@ -286,9 +273,7 @@ def error(self: unittest.TestCase): the_exception = cm.exception if "expected_msg" in td[test]: - self.assertEqual( - td[test]["expected_msg"], str(the_exception), f"Error message mismatch" - ) + self.assertEqual(td[test]["expected_msg"], str(the_exception), "Error message mismatch") return error @@ -329,9 +314,7 @@ def __new__(cls, name, bases, dct): type_ = v["type"] if type_ == "code_defined": if k not in dct: - raise ValueError( - f"Test {k} is not defined in code, despite being of type 'code_defined'." - ) + raise ValueError(f"Test {k} is not defined in code, despite being of type 'code_defined'.") continue if type_ in local_types: if not k.startswith("test_"): @@ -360,9 +343,7 @@ def inner(*args, **kwargs): print(f"Loaded test {k} of type {type_} from json.") else: if k not in dct: - raise ValueError( - f"Unknown test type: {type_} and no test method defined." 
- ) + raise ValueError(f"Unknown test type: {type_} and no test method defined.") print(f"Unknown test type: {type_}", file=sys.stderr) return super().__new__(cls, name, bases, dct) diff --git a/tests/test_8cube.py b/tests/test_8cube.py index a26bfdafc..0a3d14a51 100644 --- a/tests/test_8cube.py +++ b/tests/test_8cube.py @@ -1,8 +1,9 @@ -import unittest import json import os +import unittest + +from gget.gget_8cube import gene_expression, psi_block, specificity -from gget.gget_8cube import specificity, psi_block, gene_expression from .from_json import from_json # Load JSON fixture @@ -19,9 +20,7 @@ gene_expression_tests = {k: v for k, v in fixture.items() if "gene_expression" in k} -class TestSpecificity( - unittest.TestCase, metaclass=from_json(specificity_tests, specificity) -): +class TestSpecificity(unittest.TestCase, metaclass=from_json(specificity_tests, specificity)): """Tests for specificity()""" pass @@ -33,9 +32,7 @@ class TestPsiBlock(unittest.TestCase, metaclass=from_json(psi_block_tests, psi_b pass -class TestGeneExpression( - unittest.TestCase, metaclass=from_json(gene_expression_tests, gene_expression) -): +class TestGeneExpression(unittest.TestCase, metaclass=from_json(gene_expression_tests, gene_expression)): """Tests for gene_expression()""" pass diff --git a/tests/test_archs4.py b/tests/test_archs4.py index c9847cadc..c6336bc91 100644 --- a/tests/test_archs4.py +++ b/tests/test_archs4.py @@ -1,6 +1,8 @@ -import unittest import json +import unittest + from gget.gget_archs4 import archs4 + from .from_json import from_json # Load dictionary containing arguments and expected results diff --git a/tests/test_bgee.py b/tests/test_bgee.py index 58af3cb49..08903199a 100644 --- a/tests/test_bgee.py +++ b/tests/test_bgee.py @@ -1,6 +1,8 @@ -import unittest import json +import unittest + from gget.gget_bgee import bgee + from .from_json import from_json # Load dictionary containing arguments and expected results diff --git a/tests/test_blast.py 
b/tests/test_blast.py index 9a668c34f..8fea295e6 100644 --- a/tests/test_blast.py +++ b/tests/test_blast.py @@ -1,12 +1,14 @@ -import unittest import json +import unittest + from gget.gget_blast import blast + from .from_json import from_json # Load dictionary containing arguments and expected results with open("./tests/fixtures/test_blast.json") as json_file: blast_dict = json.load(json_file) + class TestBlast(unittest.TestCase, metaclass=from_json(blast_dict, blast)): pass # all tests are loaded from json - \ No newline at end of file diff --git a/tests/test_blat.py b/tests/test_blat.py index d871c27bb..adcd114b4 100644 --- a/tests/test_blat.py +++ b/tests/test_blat.py @@ -1,6 +1,8 @@ -import unittest import json +import unittest + from gget.gget_blat import blat + from .from_json import from_json # Load dictionary containing arguments and expected results diff --git a/tests/test_cbio.py b/tests/test_cbio.py index 9e9629691..66040f191 100644 --- a/tests/test_cbio.py +++ b/tests/test_cbio.py @@ -1,9 +1,10 @@ -import hashlib +import json import os import unittest -import json -from gget.gget_cbio import download_cbioportal_data, cbio_search -from .from_json import from_json, do_call + +from gget.gget_cbio import cbio_search, download_cbioportal_data + +from .from_json import do_call, from_json # Load dictionary containing arguments and expected results with open("./tests/fixtures/test_cbio_search.json") as json_file: @@ -13,9 +14,7 @@ cb_dict = json.load(json_file) -class TestCbioSearch( - unittest.TestCase, metaclass=from_json(cb_search_dict, cbio_search) -): +class TestCbioSearch(unittest.TestCase, metaclass=from_json(cb_search_dict, cbio_search)): pass # all tests are loaded from json @@ -24,9 +23,7 @@ def cbio_download(self: unittest.TestCase): test = name expected_result = td[test]["expected_result"] - if not isinstance(expected_result, dict) and not isinstance( - expected_result, bool - ): + if not isinstance(expected_result, dict) and not 
isinstance(expected_result, bool): raise ValueError("Expected result must be a dictionary or a boolean") result = do_call(func, td[test]["args"]) @@ -35,7 +32,7 @@ def cbio_download(self: unittest.TestCase): # # check that all files downloaded # self.assertTrue(result) - for file_name, expected_hash in expected_result.items(): + for file_name, _expected_hash in expected_result.items(): if os.path.exists(file_name): # # check non-empty if os.path.getsize(file_name) == 0: @@ -57,8 +54,6 @@ def cbio_download(self: unittest.TestCase): class TestCbio( unittest.TestCase, - metaclass=from_json( - cb_dict, download_cbioportal_data, {"cbio_download": _cbio_download} - ), + metaclass=from_json(cb_dict, download_cbioportal_data, {"cbio_download": _cbio_download}), ): pass # all tests are loaded from json diff --git a/tests/test_cellxgene.py b/tests/test_cellxgene.py index 07acf7301..29b77e67d 100644 --- a/tests/test_cellxgene.py +++ b/tests/test_cellxgene.py @@ -1,7 +1,14 @@ -import unittest -import pandas as pd +import importlib.util import json -from gget.gget_cellxgene import cellxgene, SUPPORTED_SPECIES +import unittest + +from gget.gget_cellxgene import SUPPORTED_SPECIES, cellxgene + +# cellxgene-census has no wheels for some newer Python versions (e.g. 3.14, via +# its tiledbsoma dependency). The live integration tests below need it, so they +# skip when it is unavailable; the validation tests do not need it (the species +# allowlist check raises before the optional dependency is imported) and always run. +_HAS_CELLXGENE_CENSUS = importlib.util.find_spec("cellxgene_census") is not None # Load dictionary containing arguments and expected results with open("./tests/fixtures/test_cellxgene.json") as json_file: @@ -9,9 +16,7 @@ def repr_dict(adata): - """ - Function to convert the items/structure of an AnnData object to a dictionary. 
- """ + """Convert the items/structure of an AnnData object to a dictionary.""" d = {} for attr in ( "n_obs", @@ -35,6 +40,7 @@ def repr_dict(adata): return d +@unittest.skipUnless(_HAS_CELLXGENE_CENSUS, "cellxgene-census is not installed") class TestCellxgene(unittest.TestCase): def test_cellxgene_adata(self): test = "test_cellxgene_adata" diff --git a/tests/test_compile.py b/tests/test_compile.py index 619fef4e7..ed0a71c9a 100644 --- a/tests/test_compile.py +++ b/tests/test_compile.py @@ -1,12 +1,9 @@ +import contextlib +import os import unittest # Used here to mock different operating systems -from unittest.mock import patch -from unittest.mock import MagicMock - -import os -import shutil -import contextlib +from unittest.mock import MagicMock, patch from gget.compile import compile_muscle @@ -23,6 +20,7 @@ def test_compiler_windows(self): with contextlib.redirect_stdout(open(os.devnull, "w")): compile_muscle() + ## The make command requires different programs for each OS, so these tests do not work universally # class TestCompilerLinux(unittest.TestCase): # def test_compiler_linux(self): diff --git a/tests/test_cosmic.py b/tests/test_cosmic.py index 5a5b33561..9a7b3fe93 100644 --- a/tests/test_cosmic.py +++ b/tests/test_cosmic.py @@ -1,11 +1,11 @@ -import unittest +import json import os import pathlib as pl -import pandas as pd -import json import time +import unittest from gget.gget_cosmic import cosmic + # from gget.utils import get_latest_cosmic from .from_json import from_json @@ -28,10 +28,12 @@ class TestCaseBase(unittest.TestCase): def assertIsFile(self, path): if not pl.Path(path).resolve().is_file(): - raise AssertionError("File does not exist: %s" % str(path)) + raise AssertionError(f"File does not exist: {path}") -class TestCosmicWorkflow(TestCaseBase, metaclass=from_json(cosmic_dict, cosmic, pre_test=lambda: time.sleep(sleep_time))): +class TestCosmicWorkflow( + TestCaseBase, metaclass=from_json(cosmic_dict, cosmic, pre_test=lambda: 
time.sleep(sleep_time)) +): """ Combined test class to: 1. Download COSMIC cancer_example data diff --git a/tests/test_diamond.py b/tests/test_diamond.py index 5bf602744..24db19e64 100644 --- a/tests/test_diamond.py +++ b/tests/test_diamond.py @@ -1,7 +1,8 @@ -import unittest import json +import unittest from gget.gget_diamond import diamond + from .from_json import from_json # Load dictionary containing arguments and expected results diff --git a/tests/test_elm.py b/tests/test_elm.py index 983a74a52..de8f67c35 100644 --- a/tests/test_elm.py +++ b/tests/test_elm.py @@ -1,5 +1,5 @@ -import unittest import json +import unittest from gget.gget_elm import elm from gget.gget_setup import setup as gget_setup @@ -10,16 +10,14 @@ gget_setup(module="elm") + class TestELM(unittest.TestCase): def test_elm_uniprot_id_in_elm(self): test = "test1" expected_result = elm_dict[test]["expected_result"] result1, result2 = elm(**elm_dict[test]["args"]) - result_to_test = ( - result1.dropna(axis=1).values.tolist() - + result2.dropna(axis=1).values.tolist()[15:20] - ) + result_to_test = result1.dropna(axis=1).values.tolist() + result2.dropna(axis=1).values.tolist()[15:20] self.assertListEqual(result_to_test, expected_result) @@ -28,10 +26,7 @@ def test_elm_uniprot_id_new(self): expected_result = elm_dict[test]["expected_result"] result1, result2 = elm(**elm_dict[test]["args"]) - result_to_test = ( - result1.dropna(axis=1).values.tolist() - + result2.dropna(axis=1).values.tolist()[15:20] - ) + result_to_test = result1.dropna(axis=1).values.tolist() + result2.dropna(axis=1).values.tolist()[15:20] self.assertListEqual(result_to_test, expected_result) @@ -40,9 +35,6 @@ def test_elm_uniprot_aminoacidseq(self): expected_result = elm_dict[test]["expected_result"] result1, result2 = elm(**elm_dict[test]["args"]) - result_to_test = ( - result1.dropna(axis=1).values.tolist() - + result2.dropna(axis=1).values.tolist()[15:20] - ) + result_to_test = result1.dropna(axis=1).values.tolist() + 
result2.dropna(axis=1).values.tolist()[15:20] self.assertListEqual(result_to_test, expected_result) diff --git a/tests/test_enrichr.py b/tests/test_enrichr.py index 5e0da6c85..3df50fa34 100644 --- a/tests/test_enrichr.py +++ b/tests/test_enrichr.py @@ -1,9 +1,11 @@ -import unittest -import pandas as pd import json +import math +import unittest + import matplotlib import matplotlib.pyplot as plt -import math +import pandas as pd + from .from_json import from_json # Prevent matplotlib from opening windows @@ -31,10 +33,7 @@ def test_enrichr_background(self): # If result is a DataFrame, convert to list if isinstance(result_to_test, pd.DataFrame): result_to_test = result_to_test.values.tolist()[:20] - result_to_test = [ - list(map(lambda x: x if x != math.inf else "inf", i)) - for i in result_to_test - ] + result_to_test = [[x if x != math.inf else "inf" for x in i] for i in result_to_test] self.assertListEqual(result_to_test, expected_result) @@ -45,10 +44,7 @@ def test_enrichr_background_ensembl(self): # If result is a DataFrame, convert to list if isinstance(result_to_test, pd.DataFrame): result_to_test = result_to_test.values.tolist() - result_to_test = [ - list(map(lambda x: x if x != math.inf else "inf", i)) - for i in result_to_test - ] + result_to_test = [[x if x != math.inf else "inf" for x in i] for i in result_to_test] self.assertListEqual(result_to_test, expected_result) diff --git a/tests/test_gpt.py b/tests/test_gpt.py index 824a70227..d3853a557 100644 --- a/tests/test_gpt.py +++ b/tests/test_gpt.py @@ -1,5 +1,6 @@ import unittest from unittest.mock import patch + from gget.gget_gpt import gpt diff --git a/tests/test_info.py b/tests/test_info.py index 359613532..c4157bb11 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -1,9 +1,11 @@ -import unittest # import unittest.mock # import pandas as pd import json +import unittest + # import time from gget.gget_info import info + from .from_json import from_json # Load dictionary containing arguments 
and expected results @@ -13,9 +15,11 @@ # Sleep time in seconds (wait [sleep_time] seconds between server requests to avoid 502 errors for WB and FB IDs) # sleep_time = 15 + class TestInfo(unittest.TestCase, metaclass=from_json(info_dict, info)): pass # all tests are loaded from json + # # todo convert to json loading once wormbase & flybase IDs are fixed. At that point, the json test framework will need a way to handle the ANY values # class TestInfo(unittest.TestCase): # maxDiff = None diff --git a/tests/test_muscle.py b/tests/test_muscle.py index 10a1202de..5ba50e509 100644 --- a/tests/test_muscle.py +++ b/tests/test_muscle.py @@ -1,10 +1,10 @@ +import contextlib +import filecmp +import os import unittest # Library to test functions that have calls to print() from unittest import mock -import os -import contextlib -import filecmp from gget.gget_muscle import muscle @@ -28,7 +28,7 @@ def test_muscle_nt(self): ) def tearDown(self): - super(TestMuscle, self).tearDown() + super().tearDown() # Delete temporary result file os.remove("tests/fixtures/tmp.afa") @@ -49,11 +49,6 @@ def test_muscle_nt_txt(self): "The reference and muscle nucleotide alignment are not the same.", ) - def tearDown(self): - super(TestMuscle, self).tearDown() - # Delete temporary result file - os.remove("tests/fixtures/tmp.afa") - class TestMuscleSuper(unittest.TestCase): def test_muscle_nt_super5(self): @@ -74,7 +69,7 @@ def test_muscle_nt_super5(self): ) def tearDown(self): - super(TestMuscleSuper, self).tearDown() + super().tearDown() # Delete temporary result file os.remove("tests/fixtures/tmp.afa") @@ -98,7 +93,7 @@ def test_muscle_aa(self): ) def tearDown(self): - super(TestMuscleAA, self).tearDown() + super().tearDown() # Delete temporary result file os.remove("tests/fixtures/tmp.afa") @@ -127,7 +122,7 @@ def test_muscle_seqs_as_input(self): ) def tearDown(self): - super(TestMuscleSeqsInput, self).tearDown() + super().tearDown() # Delete temporary result file 
os.remove("tests/fixtures/tmp.afa") @@ -151,7 +146,7 @@ def test_muscle_aa_super5(self): ) def tearDown(self): - super(TestMuscleAASuper, self).tearDown() + super().tearDown() # Delete temporary result file os.remove("tests/fixtures/tmp.afa") @@ -167,6 +162,4 @@ def test_muscle_print_nt(self): muscle(fasta) # print_mock.assert_called_with("\n") # print_mock.assert_called_with("test1\n", "\x1b[38;5;15m\x1b[48;5;9mA\x1b[0;0m") - print_mock.assert_called_with( - "test2\n", "\t", "\x1b[38;5;15m\x1b[48;5;9mA\x1b[0;0m" - ) + print_mock.assert_called_with("test2\n", "\t", "\x1b[38;5;15m\x1b[48;5;9mA\x1b[0;0m") diff --git a/tests/test_mutate.py b/tests/test_mutate.py index e990ad8f9..5d8f7250a 100644 --- a/tests/test_mutate.py +++ b/tests/test_mutate.py @@ -1,16 +1,15 @@ import json - -import pytest +import os +import tempfile import unittest + import gget import pandas as pd -import os -import tempfile -from .from_json import from_json, do_call +import pytest + +from .from_json import do_call, from_json -LONG_SEQUENCE = ( - "CCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCG" -) +LONG_SEQUENCE = "CCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCG" EXTRA_LONG_SEQUENCE = "CCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCG" LONG_SEQUENCE_WITH_N = "CCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCNCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCG" @@ -46,7 +45,7 @@ def create_temp_files(): temp_fasta_file = tempfile.NamedTemporaryFile(delete=False, suffix=".fasta") with open(temp_fasta_file.name, "w") as fasta_file: - for seq_id, sequence in zip(seq_ID_list, sequence_list): + for seq_id, sequence in zip(seq_ID_list, sequence_list, strict=False): fasta_file.write(f">{seq_id}\n") fasta_file.write(f"{sequence}\n") @@ -65,15 +64,9 @@ def assert_global_variables_zero( number_index_errors=0, ): assert 
gget.gget_mutate.intronic_mutations == number_intronic_position_mutations - assert ( - gget.gget_mutate.posttranslational_region_mutations - == number_posttranslational_region_mutations - ) + assert gget.gget_mutate.posttranslational_region_mutations == number_posttranslational_region_mutations assert gget.gget_mutate.uncertain_mutations == number_uncertain_mutations - assert ( - gget.gget_mutate.ambiguous_position_mutations - == number_ambiguous_position_mutations - ) + assert gget.gget_mutate.ambiguous_position_mutations == number_ambiguous_position_mutations assert gget.gget_mutate.mut_idx_outside_seq == number_index_errors @@ -88,9 +81,7 @@ def _recursive_replace(v, old: str, new: str, exact=False): return v.replace(old, new) elif isinstance(v, dict): return { - _recursive_replace(k, old, new, exact=exact): _recursive_replace( - v, old, new, exact=exact - ) + _recursive_replace(k, old, new, exact=exact): _recursive_replace(v, old, new, exact=exact) for k, v in v.items() } elif isinstance(v, list): @@ -212,9 +203,7 @@ class TestMutate( def test_csv_of_mutations(create_temp_files): mutation_temp_csv_file, sequence_temp_fasta_path = create_temp_files - result = gget.mutate( - sequences=sequence_temp_fasta_path, mutations=mutation_temp_csv_file - ) + result = gget.mutate(sequences=sequence_temp_fasta_path, mutations=mutation_temp_csv_file) assert result == [ "GCCCCACCCCGCCCCTCCCCGCCCCACCCCACCCCTCCCCGCCCCACCCCGCCCCTCCCCG", diff --git a/tests/test_opentargets.py b/tests/test_opentargets.py index 3879b0921..e73ced1cc 100644 --- a/tests/test_opentargets.py +++ b/tests/test_opentargets.py @@ -1,6 +1,8 @@ -import unittest import json +import unittest + from gget.gget_opentargets import opentargets + from .from_json import from_json # Load dictionary containing arguments and expected results diff --git a/tests/test_pdb.py b/tests/test_pdb.py index ecaacfeb6..a805e09a1 100644 --- a/tests/test_pdb.py +++ b/tests/test_pdb.py @@ -1,8 +1,8 @@ -import unittest -import pandas as 
pd -import json import filecmp +import json import os +import unittest + from gget.gget_pdb import pdb # Load dictionary containing arguments and expected results @@ -79,7 +79,7 @@ def test_pdb_pdb(self): ) def tearDown(self): - super(TestPDB, self).tearDown() + super().tearDown() # Delete temporary result file try: os.remove("4ACQ.pdb") diff --git a/tests/test_ref.py b/tests/test_ref.py index 6111e4dfe..c6ab3e0e8 100644 --- a/tests/test_ref.py +++ b/tests/test_ref.py @@ -1,6 +1,8 @@ -import unittest import json +import unittest + from gget.gget_ref import ref + from .from_json import from_json # Load dictionary containing arguments and expected results diff --git a/tests/test_search.py b/tests/test_search.py index b6b911cbf..f284c5368 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -1,7 +1,8 @@ -import unittest -import pandas as pd import json +import unittest + from gget.gget_search import search + from .from_json import from_json # Load dictionary containing arguments and expected results diff --git a/tests/test_seq.py b/tests/test_seq.py index 94893be2c..8d34e4ed7 100644 --- a/tests/test_seq.py +++ b/tests/test_seq.py @@ -1,7 +1,7 @@ -import unittest -import pandas as pd import json import time +import unittest + from gget.gget_seq import seq # Load dictionary containing arguments and expected results @@ -11,6 +11,7 @@ # Sleep time in seconds (wait [sleep_time] seconds between server requests to avoid 502 errors for WB and FB IDs) sleep_time = 10 + # todo convert to json loading once wormbase & flybase IDs are fixed. 
At that point, the json test framework will need a way to handle the ANY values class TestSeq(unittest.TestCase): def test_seq_gene(self): diff --git a/tests/test_utils.py b/tests/test_utils.py index 125b36578..1eda8078d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,33 +1,30 @@ import unittest -import numpy as np + +from gget.constants import ENSEMBL_FTP_URL_NV, ENSEMBL_REST_API, UNIPROT_REST_API from gget.utils import ( - n_colors, aa_colors, - get_uniprot_seqs, + find_latest_ens_rel, get_uniprot_info, + get_uniprot_seqs, + n_colors, + read_fasta, + ref_species_options, rest_query, - find_latest_ens_rel, search_species_options, - ref_species_options, - read_fasta, ) -from gget.constants import UNIPROT_REST_API, ENSEMBL_REST_API, ENSEMBL_FTP_URL_NV - from .fixtures import ( - LATEST_ENS_RELEASE, - SPECIES_OPTIONS, IV_SPECIES_OPTIONS, - REF_SPECIES_OPTIONS, + LATEST_ENS_RELEASE, REF_IV_SPECIES_OPTIONS, + REF_SPECIES_OPTIONS, + SPECIES_OPTIONS, ) class TestUtils(unittest.TestCase): def test_read_fasta(self): - result_to_test1, result_to_test2 = read_fasta( - "tests/fixtures/muscle_nt_test.fa" - ) + result_to_test1, result_to_test2 = read_fasta("tests/fixtures/muscle_nt_test.fa") result_to_test = result_to_test1 + result_to_test2 expected_result1 = [ @@ -55,9 +52,7 @@ def test_aa_colors(self): self.assertEqual(result_to_test, expected_result) def test_get_uniprot_seqs(self): - df = get_uniprot_seqs( - UNIPROT_REST_API, ["ENST00000392653.3", "ENST00000392657.7"] - ) + df = get_uniprot_seqs(UNIPROT_REST_API, ["ENST00000392653.3", "ENST00000392657.7"]) result_to_test = df.values.tolist() expected_result = [ [ @@ -179,9 +174,7 @@ def test_ref_species_options(self): self.assertEqual(result_to_test, expected_result) def test_ref_iv_species_options(self): - result_to_test = ref_species_options( - database=ENSEMBL_FTP_URL_NV, which="dna", release=55 - ) + result_to_test = ref_species_options(database=ENSEMBL_FTP_URL_NV, which="dna", release=55) 
expected_result = REF_IV_SPECIES_OPTIONS self.assertEqual(result_to_test, expected_result) diff --git a/tests/test_virus.py b/tests/test_virus.py index 1be574528..12798ac11 100644 --- a/tests/test_virus.py +++ b/tests/test_virus.py @@ -129,75 +129,76 @@ Total: 186 tests """ -import unittest + +import calendar +import functools import json import os -import re import shutil import subprocess import tempfile import time -import functools +import unittest import zipfile -import calendar from datetime import datetime import pandas as pd - from gget.gget_virus import ( - virus, - _get_datasets_path, + _batch_accessions_for_url, + _calculate_max_accessions_per_batch, + _clean_xml_declarations, _clear_datasets_cache, - _get_modified_virus_name, - _track_failed_operation, - _validate_datasets_binary, + _deduplicate_metadata_against_baseline, + _force_garbage_collection, + _genbank_xml_to_csv, + _get_datasets_path, _get_datasets_version, _get_gget_version, + _get_memory_usage, + _get_modified_virus_name, + _local_name, + _merge_baseline_with_new, _parse_accession_input, _parse_baseline_file, - _deduplicate_metadata_against_baseline, - _save_partial_metadata, - _merge_baseline_with_new, - _calculate_max_accessions_per_batch, - _batch_accessions_for_url, - _retry_with_exponential_backoff, _parse_date, + _parse_genbank_xml, _parse_partial_date_for_range_check, - _clean_xml_declarations, - _local_name, + _retry_with_exponential_backoff, + _save_partial_metadata, + _stream_copy_fasta, + _track_failed_operation, _unzip_file, - _get_memory_usage, - _force_garbage_collection, - is_sars_cov2_query, - is_alphainfluenza_query, - load_metadata_from_api_reports, + _validate_datasets_binary, + _write_fasta_record, check_min_max, - filter_metadata_only, - filter_genbank_metadata, filter_cached_metadata_for_unused_filters, - _write_fasta_record, - _stream_copy_fasta, + filter_genbank_metadata, + filter_metadata_only, filter_sequences, - save_command_summary, + is_alphainfluenza_query, + 
is_sars_cov2_query, + load_metadata_from_api_reports, merge_metadata_csvs, - save_metadata_to_csv, - _genbank_xml_to_csv, - _parse_genbank_xml, + save_command_summary, save_genbank_metadata_to_csv, + save_metadata_to_csv, + virus, ) + from .from_json import from_json def retry_on_network_error(max_retries=3, delay=5): """Decorator to retry tests that may fail due to network issues. - + This is useful for tests that make real API calls to NCBI, which can occasionally time out or fail due to network flakiness. - + Args: max_retries: Maximum number of retry attempts (default: 3) delay: Seconds to wait between retries (default: 5) """ + def decorator(test_func): @functools.wraps(test_func) def wrapper(*args, **kwargs): @@ -208,7 +209,7 @@ def wrapper(*args, **kwargs): except Exception as e: # Only retry on network-related errors error_msg = str(e).lower() - if any(keyword in error_msg for keyword in ['timeout', 'timed out', 'connection', 'network']): + if any(keyword in error_msg for keyword in ["timeout", "timed out", "connection", "network"]): last_exception = e if attempt < max_retries - 1: time.sleep(delay) @@ -217,9 +218,12 @@ def wrapper(*args, **kwargs): raise # If all retries failed, raise the last exception raise last_exception + return wrapper + return decorator + # Load dictionary containing arguments and expected results with open("./tests/fixtures/test_virus.json") as json_file: virus_dict = json.load(json_file) @@ -227,14 +231,14 @@ def wrapper(*args, **kwargs): class TestVirus(unittest.TestCase, metaclass=from_json(virus_dict, virus)): """Test suite for gget.virus module. - + This comprehensive test suite covers: - + 1. Input Validation (19 JSON-defined tests): - Type checking for boolean, string, and integer parameters - Value validation (completeness, batch sizes, virus names) - Range validation (min/max pairs for dates, lengths, counts) - + 2. 
Functional Tests (18 code-defined tests): - Basic file creation and accession downloads - Individual filter functionality (host, completeness, length, annotated, refseq) @@ -245,172 +249,174 @@ class TestVirus(unittest.TestCase, metaclass=from_json(virus_dict, virus)): - GenBank metadata retrieval - Multiple filter combinations - Integer virus ID handling - + 3. Data Quality & Verification Tests (6 code-defined tests): - Relationship checks: FASTA/CSV/JSONL count consistency - Filter verification: Host and release date filter effectiveness - Schema validation: Expected metadata columns exist - Completeness filter verification - Multi-filter relationship checks - + Coverage: 85% of parameters tested (29/34), with 43 total test cases. See module docstring for detailed parameter coverage analysis. """ - + @classmethod def setUpClass(cls): """Set up test fixtures that are shared across all tests.""" cls.test_output_dir = "test_virus_output" - + @classmethod def tearDownClass(cls): """Clean up after all tests have run.""" # Clean up test output directory if it exists if os.path.exists(cls.test_output_dir): shutil.rmtree(cls.test_output_dir) - + def setUp(self): """Set up before each test method.""" # Create a fresh test output directory if os.path.exists(self.test_output_dir): shutil.rmtree(self.test_output_dir) os.makedirs(self.test_output_dir, exist_ok=True) - + def tearDown(self): """Clean up after each test method.""" # Clean up test output directory if os.path.exists(self.test_output_dir): try: shutil.rmtree(self.test_output_dir) - except Exception: + except Exception: # noqa: BLE001 pass # Ignore cleanup errors - + def _check_output_files(self, virus_name, outfolder): """Helper method to check if expected output files were created. 
- + Args: virus_name: Name of the virus (used in file naming) outfolder: Output folder where files should be created - + Returns: dict: Dictionary with file paths and existence status """ # Clean virus name for file naming (replace spaces with underscores) virus_clean = virus_name.replace(" ", "_") - + expected_files = { "fasta": os.path.join(outfolder, f"{virus_clean}_sequences.fasta"), "csv": os.path.join(outfolder, f"{virus_clean}_metadata.csv"), - "jsonl": os.path.join(outfolder, f"{virus_clean}_metadata.jsonl") + "jsonl": os.path.join(outfolder, f"{virus_clean}_metadata.jsonl"), } - + results = {} for file_type, file_path in expected_files.items(): results[file_type] = { "path": file_path, "exists": os.path.exists(file_path), - "size": os.path.getsize(file_path) if os.path.exists(file_path) else 0 + "size": os.path.getsize(file_path) if os.path.exists(file_path) else 0, } - + return results - + def _count_fasta_sequences(self, fasta_file): """Count the number of sequences in a FASTA file. - + Args: fasta_file: Path to FASTA file - + Returns: int: Number of sequences """ count = 0 if os.path.exists(fasta_file): - with open(fasta_file, 'r') as f: + with open(fasta_file) as f: for line in f: - if line.startswith('>'): + if line.startswith(">"): count += 1 return count - + def _count_jsonl_records(self, jsonl_file): """Count the number of records in a JSONL file. - + Args: jsonl_file: Path to JSONL file - + Returns: int: Number of records """ count = 0 if os.path.exists(jsonl_file): - with open(jsonl_file, 'r') as f: + with open(jsonl_file) as f: for line in f: if line.strip(): count += 1 return count - + def _count_csv_records(self, csv_file): """Count the number of records in a CSV file (excluding header). 
- + Args: csv_file: Path to CSV file - + Returns: int: Number of records (excluding header) """ count = 0 if os.path.exists(csv_file): - with open(csv_file, 'r') as f: + with open(csv_file) as f: # Skip header next(f, None) for line in f: if line.strip(): count += 1 return count - + def _parse_csv_metadata(self, csv_file): """Parse CSV metadata file and return records as list of dicts. - + Args: csv_file: Path to CSV file - + Returns: list: List of dictionaries containing metadata records """ import csv + records = [] if os.path.exists(csv_file): - with open(csv_file, 'r', encoding='utf-8') as f: + with open(csv_file, encoding="utf-8") as f: reader = csv.DictReader(f) for row in reader: records.append(row) return records - + def _get_csv_columns(self, csv_file): """Get column names from CSV file. - + Args: csv_file: Path to CSV file - + Returns: list: List of column names """ import csv + if os.path.exists(csv_file): - with open(csv_file, 'r', encoding='utf-8') as f: + with open(csv_file, encoding="utf-8") as f: reader = csv.DictReader(f) return reader.fieldnames return [] - + # ========================================================================= # FUNCTIONAL TESTS: Basic file creation and filter functionality # ========================================================================= # These tests verify that the virus function creates output files # correctly and that individual filters work as expected. - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_specific_accession_file_creation(self): """Test that files are created when downloading a specific accession. 
- + Downloads SARS-CoV-2 reference sequence (NC_045512.2) and verifies: - Function returns None (writes to disk) - All three output files created (FASTA, CSV, JSONL) @@ -419,124 +425,100 @@ def test_virus_specific_accession_file_creation(self): """ virus_name = "NC_045512.2" outfolder = self.test_output_dir - + # Run the function (should create files, returns None) - result = virus( - virus=virus_name, - is_accession=True, - outfolder=outfolder - ) - + result = virus(virus=virus_name, is_accession=True, outfolder=outfolder) + # Check that function returns None self.assertIsNone(result) - + # Check that output files were created files = self._check_output_files(virus_name, outfolder) - + # Assert all files exist - self.assertTrue(files["fasta"]["exists"], - f"FASTA file not created: {files['fasta']['path']}") - self.assertTrue(files["csv"]["exists"], - f"CSV file not created: {files['csv']['path']}") - self.assertTrue(files["jsonl"]["exists"], - f"JSONL file not created: {files['jsonl']['path']}") - + self.assertTrue(files["fasta"]["exists"], f"FASTA file not created: {files['fasta']['path']}") + self.assertTrue(files["csv"]["exists"], f"CSV file not created: {files['csv']['path']}") + self.assertTrue(files["jsonl"]["exists"], f"JSONL file not created: {files['jsonl']['path']}") + # Assert files are not empty self.assertGreater(files["fasta"]["size"], 0, "FASTA file is empty") self.assertGreater(files["csv"]["size"], 0, "CSV file is empty") self.assertGreater(files["jsonl"]["size"], 0, "JSONL file is empty") - + # Count sequences (should be 1 for a specific accession) seq_count = self._count_fasta_sequences(files["fasta"]["path"]) self.assertGreaterEqual(seq_count, 1, "No sequences found in FASTA file") - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_with_host_filter(self): """Test that host filter works and creates appropriate files.""" virus_name = "Zika virus" outfolder = self.test_output_dir - - result = virus( - virus=virus_name, - 
host="human", - outfolder=outfolder - ) - + + result = virus(virus=virus_name, host="human", outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) self.assertTrue(files["fasta"]["exists"], "FASTA file not created with host filter") self.assertTrue(files["csv"]["exists"], "CSV file not created with host filter") self.assertTrue(files["jsonl"]["exists"], "JSONL file not created with host filter") - + # Verify that files contain data self.assertGreater(files["fasta"]["size"], 0, "FASTA file is empty with host filter") - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_with_completeness_filter(self): """Test that completeness filter works correctly.""" virus_name = "Zika virus" outfolder = self.test_output_dir - - result = virus( - virus=virus_name, - nuc_completeness="complete", - outfolder=outfolder - ) - + + result = virus(virus=virus_name, nuc_completeness="complete", outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) self.assertTrue(files["fasta"]["exists"], "FASTA file not created with completeness filter") self.assertGreater(files["fasta"]["size"], 0, "FASTA file is empty with completeness filter") - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_with_length_filters(self): """Test that sequence length filters work correctly.""" virus_name = "Zika virus" outfolder = self.test_output_dir - - result = virus( - virus=virus_name, - min_seq_length=10000, - max_seq_length=11000, - outfolder=outfolder - ) - + + result = virus(virus=virus_name, min_seq_length=10000, max_seq_length=11000, outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) self.assertTrue(files["fasta"]["exists"], "FASTA file not created with length filters") - + # Verify sequences are within expected length range # This would require parsing the FASTA file, which we do with count seq_count = 
self._count_fasta_sequences(files["fasta"]["path"]) self.assertGreater(seq_count, 0, "No sequences passed length filters") - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_with_annotated_filter(self): """Test that annotated filter works correctly.""" virus_name = "Zika virus" outfolder = self.test_output_dir - - result = virus( - virus=virus_name, - annotated=True, - outfolder=outfolder - ) - + + result = virus(virus=virus_name, annotated=True, outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) self.assertTrue(files["fasta"]["exists"], "FASTA file not created with annotated filter") self.assertGreater(files["fasta"]["size"], 0, "FASTA file is empty with annotated filter") - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_with_multiple_filters(self): """Test that multiple filters can be combined correctly.""" virus_name = "Zika virus" outfolder = self.test_output_dir - + result = virus( virus=virus_name, host="human", @@ -544,226 +526,215 @@ def test_virus_with_multiple_filters(self): min_seq_length=10500, max_seq_length=11000, annotated=True, - outfolder=outfolder + outfolder=outfolder, ) - + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) self.assertTrue(files["fasta"]["exists"], "FASTA file not created with multiple filters") self.assertTrue(files["csv"]["exists"], "CSV file not created with multiple filters") self.assertTrue(files["jsonl"]["exists"], "JSONL file not created with multiple filters") - + # Check that filters reduced the dataset (should have some sequences) seq_count = self._count_fasta_sequences(files["fasta"]["path"]) self.assertGreater(seq_count, 0, "No sequences passed multiple filters") - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_integer_virus_id(self): """Test that integer virus IDs are handled correctly. - + Tests using Zika virus taxon ID (64320) as integer input. 
Verifies that integer IDs are properly converted and files created. """ virus_id = 64320 # Zika virus taxon ID outfolder = self.test_output_dir - - result = virus( - virus=virus_id, - outfolder=outfolder - ) - + + result = virus(virus=virus_id, outfolder=outfolder) + self.assertIsNone(result) - + # Check files with string version of virus ID virus_clean = str(virus_id) expected_fasta = os.path.join(outfolder, f"{virus_clean}_sequences.fasta") - self.assertTrue(os.path.exists(expected_fasta), - f"FASTA file not created for integer virus ID: {expected_fasta}") - + self.assertTrue( + os.path.exists(expected_fasta), f"FASTA file not created for integer virus ID: {expected_fasta}" + ) + # ========================================================================= # DATA QUALITY & VERIFICATION TESTS # ========================================================================= # These tests verify data consistency, filter effectiveness, and that # API/data source changes would be detected. They go beyond simple file # existence checks to validate actual data quality. - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_relationship_check_counts_match(self): """Test that FASTA sequence count matches CSV and JSONL record counts. - + Downloads a specific accession and verifies: - Number of FASTA sequences = number of CSV records = number of JSONL records - No data loss between different output formats - At least one record in all files - + This catches: Format conversion bugs, data loss, parsing errors. 
""" virus_name = "NC_045512.2" outfolder = self.test_output_dir - - result = virus( - virus=virus_name, - is_accession=True, - outfolder=outfolder - ) - + + result = virus(virus=virus_name, is_accession=True, outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) - + # Count records in each file type fasta_count = self._count_fasta_sequences(files["fasta"]["path"]) csv_count = self._count_csv_records(files["csv"]["path"]) jsonl_count = self._count_jsonl_records(files["jsonl"]["path"]) - + # All counts should match - self.assertEqual(fasta_count, csv_count, - f"FASTA count ({fasta_count}) does not match CSV count ({csv_count})") - self.assertEqual(fasta_count, jsonl_count, - f"FASTA count ({fasta_count}) does not match JSONL count ({jsonl_count})") - self.assertEqual(csv_count, jsonl_count, - f"CSV count ({csv_count}) does not match JSONL count ({jsonl_count})") - + self.assertEqual(fasta_count, csv_count, f"FASTA count ({fasta_count}) does not match CSV count ({csv_count})") + self.assertEqual( + fasta_count, jsonl_count, f"FASTA count ({fasta_count}) does not match JSONL count ({jsonl_count})" + ) + self.assertEqual(csv_count, jsonl_count, f"CSV count ({csv_count}) does not match JSONL count ({jsonl_count})") + # Should have at least one record self.assertGreater(fasta_count, 0, "No records found in output files") - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_host_filter_verification(self): """Test that host filter actually filters by host in metadata. - + Downloads Zika virus with host="human" filter and verifies: - Records are returned (filter doesn't break the query) - Host column exists in metadata - If host data is populated, it matches the filter criterion - + Note: Host filter is applied server-side by NCBI API. The returned records should all match, but the Host field in CSV may be empty or have various formats (scientific names, common names). 
- + This catches: Broken host filters, API changes in filtering behavior. """ virus_name = "Zika virus" host = "human" outfolder = self.test_output_dir - - result = virus( - virus=virus_name, - host=host, - outfolder=outfolder - ) - + + result = virus(virus=virus_name, host=host, outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) - + # Parse CSV metadata records = self._parse_csv_metadata(files["csv"]["path"]) - + # Should have some records self.assertGreater(len(records), 0, "No records returned with host filter") - + # Check that host column exists if records: - self.assertIn("Host", records[0].keys(), - "Host column not found in metadata") - + self.assertIn("Host", records[0].keys(), "Host column not found in metadata") + # Note: Host filter is applied server-side by NCBI API # The returned records should all match, but the Host field in CSV # may be empty or have various formats (scientific names, common names) # We verify the filter worked by checking that records were returned # (if filter was broken, we'd get all hosts or an error) - + # Count non-empty host values - non_empty_hosts = sum(1 for record in records - if record.get("Host", "").strip()) - + non_empty_hosts = sum(1 for record in records if record.get("Host", "").strip()) + # If we have host data populated, verify it matches if non_empty_hosts > 0: host_lower = host.lower() # Also check for "Homo sapiens" which is scientific name for human - matching_hosts = sum(1 for record in records - if host_lower in record.get("Host", "").lower() - or "homo sapiens" in record.get("Host", "").lower()) - + matching_hosts = sum( + 1 + for record in records + if host_lower in record.get("Host", "").lower() or "homo sapiens" in record.get("Host", "").lower() + ) + # If host data is populated, at least 50% should match if non_empty_hosts > 0: match_percentage = (matching_hosts / non_empty_hosts) * 100 - self.assertGreater(match_percentage, 50, - f"Only 
{match_percentage:.1f}% of populated host fields match filter '{host}'") - + self.assertGreater( + match_percentage, + 50, + f"Only {match_percentage:.1f}% of populated host fields match filter '{host}'", + ) + @retry_on_network_error(max_retries=3, delay=5) def test_virus_release_date_filter_verification(self): """Test that release date filter is applied correctly in metadata. - + Downloads Mumps virus with min_release_date="2024-12-31" and verifies: - Records are returned (API is working) - Release date field exists in metadata - All release dates are on or after 2024-12-31 - Count matches expected API results - + This test compares against the direct API call: curl -X GET "https://api.ncbi.nlm.nih.gov/datasets/v2/virus/taxon/mumps%20virus/dataset_report?filter.released_since=2024-12-31T00:00:00.000Z" - + This catches: Release date filter bugs, date parsing errors, API filter issues. """ - import requests from datetime import datetime - + + import requests + virus_name = "mumps virus" min_release_date = "2024-12-31" outfolder = self.test_output_dir - + # First, get the expected count from direct API call using full timestamp format api_url = "https://api.ncbi.nlm.nih.gov/datasets/v2/virus/taxon/mumps%20virus/dataset_report" params = {"filter.released_since": "2024-12-31T00:00:00.000Z", "page_size": 1000} - + try: - response = requests.get(api_url, params=params, headers={'accept': 'application/json'}) + response = requests.get(api_url, params=params, headers={"accept": "application/json"}) response.raise_for_status() api_data = response.json() - expected_count = len(api_data.get('reports', [])) - except Exception as e: + expected_count = len(api_data.get("reports", [])) + except Exception as e: # noqa: BLE001 self.skipTest(f"Could not fetch API data for comparison: {e}") - + # Run virus function with same filter - result = virus( - virus=virus_name, - min_release_date=min_release_date, - outfolder=outfolder - ) - + result = virus(virus=virus_name, 
min_release_date=min_release_date, outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) - + # Verify files were created - self.assertTrue(os.path.exists(files["csv"]["path"]), - "CSV file not created with release date filter") - self.assertTrue(os.path.exists(files["fasta"]["path"]), - "FASTA file not created with release date filter") - + self.assertTrue(os.path.exists(files["csv"]["path"]), "CSV file not created with release date filter") + self.assertTrue(os.path.exists(files["fasta"]["path"]), "FASTA file not created with release date filter") + # Parse CSV metadata records = self._parse_csv_metadata(files["csv"]["path"]) - + # Should have records matching API count (allowing for small variance due to timing) self.assertGreater(len(records), 0, "No records returned with release date filter") - self.assertEqual(len(records), expected_count, - f"Record count ({len(records)}) doesn't match API count ({expected_count})") - + self.assertEqual( + len(records), expected_count, f"Record count ({len(records)}) doesn't match API count ({expected_count})" + ) + # Check that release date column exists release_date_field = None for possible_field in ["Release date", "Release Date", "ReleaseDate", "release_date"]: if possible_field in records[0].keys(): release_date_field = possible_field break - - self.assertIsNotNone(release_date_field, - f"Release date field not found. Available fields: {list(records[0].keys())}") - + + self.assertIsNotNone( + release_date_field, f"Release date field not found. 
Available fields: {list(records[0].keys())}" + ) + # Parse filter date for comparison (inclusive - on or after this date) filter_date = datetime.strptime(min_release_date, "%Y-%m-%d") - + # Verify all release dates are on or after the filter date (inclusive) invalid_dates = [] for record in records: @@ -773,117 +744,118 @@ def test_virus_release_date_filter_verification(self): # Parse ISO date format (YYYY-MM-DD) record_date = datetime.strptime(date_str, "%Y-%m-%d") if record_date < filter_date: - invalid_dates.append((record.get('Accession', 'unknown'), date_str)) + invalid_dates.append((record.get("Accession", "unknown"), date_str)) except ValueError as e: # If date parsing fails, that's also a test failure self.fail(f"Could not parse release date '{date_str}': {e}") - - self.assertEqual(len(invalid_dates), 0, - f"Found {len(invalid_dates)} records with release dates before {min_release_date}: {invalid_dates[:5]}") - + + self.assertEqual( + len(invalid_dates), + 0, + f"Found {len(invalid_dates)} records with release dates before {min_release_date}: {invalid_dates[:5]}", + ) + @retry_on_network_error(max_retries=3, delay=5) def test_virus_metadata_schema_validation(self): """Test that expected metadata columns exist in CSV output. - + Downloads a specific accession and verifies: - CSV contains expected essential columns (accession, length, host) - At least 5 columns present (reasonable metadata breadth) - Column names are properly formatted - + This catches: API schema changes, missing metadata fields, field name changes that would break downstream analysis tools. 
""" virus_name = "NC_045512.2" outfolder = self.test_output_dir - - result = virus( - virus=virus_name, - is_accession=True, - outfolder=outfolder - ) - + + result = virus(virus=virus_name, is_accession=True, outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) - + # Get column names from CSV columns = self._get_csv_columns(files["csv"]["path"]) - + # Check for expected essential columns (these should always be present) # Using case-insensitive checking since column names might vary columns_lower = [col.lower() for col in columns] - + expected_columns = [ "accession", # Or GenBank Accession - "length", # Or Sequence Length - "host", # Host information + "length", # Or Sequence Length + "host", # Host information ] - + missing_columns = [] for expected in expected_columns: found = any(expected in col_lower for col_lower in columns_lower) if not found: missing_columns.append(expected) - - self.assertEqual(len(missing_columns), 0, - f"Missing expected metadata columns: {missing_columns}. " - f"Available columns: {columns}") - + + self.assertEqual( + len(missing_columns), + 0, + f"Missing expected metadata columns: {missing_columns}. Available columns: {columns}", + ) + # Verify we have a reasonable number of columns (at least 5) - self.assertGreaterEqual(len(columns), 5, - f"Only {len(columns)} columns found, expected at least 5") - + self.assertGreaterEqual(len(columns), 5, f"Only {len(columns)} columns found, expected at least 5") + @retry_on_network_error(max_retries=3, delay=5) def test_virus_completeness_filter_verification(self): """Test that completeness filter returns appropriate sequences. - + Downloads Zika virus with nuc_completeness="complete" and verifies: - Records are returned (filter works) - If completeness field exists, validates values - Falls back to checking length field exists - + This catches: Broken completeness filters, metadata field changes, filter logic errors. 
""" virus_name = "Zika virus" outfolder = self.test_output_dir - - result = virus( - virus=virus_name, - nuc_completeness="complete", - outfolder=outfolder - ) - + + result = virus(virus=virus_name, nuc_completeness="complete", outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) - + # Parse CSV metadata records = self._parse_csv_metadata(files["csv"]["path"]) - + # Should have some records self.assertGreater(len(records), 0, "No records returned with completeness filter") - + # Check if there's a completeness or length field if records: # Look for completeness-related fields completeness_field = None - for possible_field in ["Completeness", "Nuc_Completeness", "Nucleotide Completeness", - "Genome Coverage", "completeness"]: + for possible_field in [ + "Completeness", + "Nuc_Completeness", + "Nucleotide Completeness", + "Genome Coverage", + "completeness", + ]: if possible_field in records[0].keys(): completeness_field = possible_field break - + # If completeness field exists, verify values if completeness_field: - complete_count = sum(1 for record in records - if "complete" in record.get(completeness_field, "").lower()) - + complete_count = sum( + 1 for record in records if "complete" in record.get(completeness_field, "").lower() + ) + # At least 50% should be marked as complete if complete_count > 0: complete_percentage = (complete_count / len(records)) * 100 - self.assertGreater(complete_percentage, 50, - f"Only {complete_percentage:.1f}% marked as complete") + self.assertGreater(complete_percentage, 50, f"Only {complete_percentage:.1f}% marked as complete") else: # If no explicit completeness field, check length field exists # (complete genomes should have consistent lengths) @@ -892,162 +864,154 @@ def test_virus_completeness_filter_verification(self): if possible_field in records[0].keys(): length_field = possible_field break - - self.assertIsNotNone(length_field, - "Neither completeness nor length field 
found in metadata") - + + self.assertIsNotNone(length_field, "Neither completeness nor length field found in metadata") + @retry_on_network_error(max_retries=3, delay=5) def test_virus_multiple_filters_relationship_check(self): """Test relationship checks work correctly with multiple filters applied. - + Downloads Zika virus with multiple filters (host, completeness, length) and verifies: - FASTA/CSV/JSONL counts still match with complex filtering - At least one record passes all filters - No data loss when multiple filters interact - + This catches: Filter interaction bugs, data loss with complex queries, inconsistent filtering across output formats. """ virus_name = "Zika virus" outfolder = self.test_output_dir - + result = virus( - virus=virus_name, - host="human", - nuc_completeness="complete", - min_seq_length=10000, - outfolder=outfolder + virus=virus_name, host="human", nuc_completeness="complete", min_seq_length=10000, outfolder=outfolder ) - + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) - + # Count records in each file type fasta_count = self._count_fasta_sequences(files["fasta"]["path"]) csv_count = self._count_csv_records(files["csv"]["path"]) jsonl_count = self._count_jsonl_records(files["jsonl"]["path"]) - + # All counts should match even with filters - self.assertEqual(fasta_count, csv_count, - f"FASTA count ({fasta_count}) does not match CSV count ({csv_count}) with multiple filters") - self.assertEqual(fasta_count, jsonl_count, - f"FASTA count ({fasta_count}) does not match JSONL count ({jsonl_count}) with multiple filters") - + self.assertEqual( + fasta_count, + csv_count, + f"FASTA count ({fasta_count}) does not match CSV count ({csv_count}) with multiple filters", + ) + self.assertEqual( + fasta_count, + jsonl_count, + f"FASTA count ({fasta_count}) does not match JSONL count ({jsonl_count}) with multiple filters", + ) + # Should have at least one record self.assertGreater(fasta_count, 0, "No records found 
with multiple filters applied") # ========================================================================= # ADDITIONAL FUNCTIONAL TESTS: Testing previously untested parameters # ========================================================================= - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_with_geographic_location_filter(self): """Test that geographic location filter works correctly. - + Downloads Zika virus sequences from Brazil and verifies: - Files are created successfully - Records are returned - Geographic location metadata field exists - + This catches: Geographic location filter bugs, API parameter issues. """ virus_name = "Zika virus" outfolder = self.test_output_dir - - result = virus( - virus=virus_name, - geographic_location="Brazil", - outfolder=outfolder - ) - + + result = virus(virus=virus_name, geographic_location="Brazil", outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) self.assertTrue(files["fasta"]["exists"], "FASTA file not created with geographic location filter") self.assertTrue(files["csv"]["exists"], "CSV file not created with geographic location filter") - + # Parse CSV metadata records = self._parse_csv_metadata(files["csv"]["path"]) - + # Should have some records (Brazil had Zika outbreak) self.assertGreater(len(records), 0, "No records returned with geographic location filter") - + # Check that geographic location fields exist if records: geo_fields = ["Geographic Location", "Geographic Region", "Geo String"] has_geo_field = any(field in records[0].keys() for field in geo_fields) - self.assertTrue(has_geo_field, - f"No geographic location field found. Available fields: {list(records[0].keys())}") - + self.assertTrue( + has_geo_field, f"No geographic location field found. 
Available fields: {list(records[0].keys())}" + ) + @retry_on_network_error(max_retries=3, delay=5) def test_virus_with_protein_count_filters(self): """Test that protein count filters work correctly. - + Downloads Zika virus with protein count filters and verifies: - Files are created successfully - Records are returned - Protein count field exists in metadata - + This catches: Protein count filter bugs, metadata field issues. """ virus_name = "Zika virus" outfolder = self.test_output_dir - - result = virus( - virus=virus_name, - min_protein_count=1, - max_protein_count=20, - outfolder=outfolder - ) - + + result = virus(virus=virus_name, min_protein_count=1, max_protein_count=20, outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) self.assertTrue(files["fasta"]["exists"], "FASTA file not created with protein count filters") - + # Parse CSV metadata records = self._parse_csv_metadata(files["csv"]["path"]) - + # Should have some records self.assertGreater(len(records), 0, "No records returned with protein count filters") - + # Check that protein count field exists if records: - self.assertIn("Protein count", records[0].keys(), - f"Protein count field not found. Available fields: {list(records[0].keys())}") - + self.assertIn( + "Protein count", + records[0].keys(), + f"Protein count field not found. Available fields: {list(records[0].keys())}", + ) + @retry_on_network_error(max_retries=3, delay=5) def test_virus_with_source_database_filter(self): """Test that source database filter works correctly. - + Downloads Zika virus from GenBank database and verifies: - Files are created successfully - Records are returned - Source database field exists in metadata - + This catches: Source database filter bugs, API parameter issues. 
""" virus_name = "Zika virus" outfolder = self.test_output_dir - - result = virus( - virus=virus_name, - source_database="GenBank", - outfolder=outfolder - ) - + + result = virus(virus=virus_name, source_database="GenBank", outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) self.assertTrue(files["fasta"]["exists"], "FASTA file not created with source database filter") - + # Parse CSV metadata records = self._parse_csv_metadata(files["csv"]["path"]) - + # Should have some records self.assertGreater(len(records), 0, "No records returned with source database filter") - + # Check that source database field exists if records: db_field = None @@ -1055,183 +1019,161 @@ def test_virus_with_source_database_filter(self): if possible_field in records[0].keys(): db_field = possible_field break - - self.assertIsNotNone(db_field, - f"Source database field not found. Available fields: {list(records[0].keys())}") - + + self.assertIsNotNone( + db_field, f"Source database field not found. Available fields: {list(records[0].keys())}" + ) @retry_on_network_error(max_retries=3, delay=5) def test_virus_with_lab_passaged_filter(self): """Test that lab_passaged filter works correctly. - + Downloads Zika virus with lab_passaged=False filter and verifies: - Files are created successfully - Records are returned - + Note: Lab passaged data may be sparse, so we mainly verify the filter doesn't break the query. - + This catches: Lab passaged filter bugs, API parameter issues. 
""" virus_name = "Zika virus" outfolder = self.test_output_dir - - result = virus( - virus=virus_name, - lab_passaged=False, - outfolder=outfolder - ) - + + result = virus(virus=virus_name, lab_passaged=False, outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) self.assertTrue(files["fasta"]["exists"], "FASTA file not created with lab_passaged filter") - + # Should create files (even if no lab passaged field in results) self.assertGreater(files["fasta"]["size"], 0, "FASTA file is empty with lab_passaged filter") - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_with_collection_date_filters(self): """Test that collection date filters don't break the query. - + Downloads Zika virus with collection date range and verifies: - Function completes without errors - + Note: Collection date data is often sparse, filters may return no results. This test just ensures the filter doesn't cause errors. """ virus_name = "Zika virus" outfolder = self.test_output_dir - + # This will complete without error even if no results match result = virus( - virus=virus_name, - min_collection_date="2016-01-01", - max_collection_date="2016-12-31", - outfolder=outfolder + virus=virus_name, min_collection_date="2016-01-01", max_collection_date="2016-12-31", outfolder=outfolder ) - + # Function should complete successfully self.assertIsNone(result) - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_with_max_ambiguous_chars_filter(self): """Test that max_ambiguous_chars filter works correctly. - + Downloads Zika virus with max_ambiguous_chars filter and verifies: - Files are created successfully - Records are returned - Filter doesn't break the query - + This catches: Max ambiguous chars filter bugs, sequence quality filtering issues. 
""" virus_name = "Zika virus" outfolder = self.test_output_dir - - result = virus( - virus=virus_name, - max_ambiguous_chars=100, - outfolder=outfolder - ) - + + result = virus(virus=virus_name, max_ambiguous_chars=100, outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) self.assertTrue(files["fasta"]["exists"], "FASTA file not created with max_ambiguous_chars filter") - + # Should have some records (most sequences have some ambiguous bases) seq_count = self._count_fasta_sequences(files["fasta"]["path"]) self.assertGreater(seq_count, 0, "No sequences passed max_ambiguous_chars filter") - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_with_has_proteins_filter(self): """Test that has_proteins filter works correctly. - + Downloads Zika virus requiring specific proteins and verifies: - Files are created successfully - Records are returned - Filter doesn't break the query - + This catches: has_proteins filter bugs, protein filtering logic issues. """ virus_name = "Zika virus" outfolder = self.test_output_dir - + # Test with a common protein (polyprotein is typical for Zika) - result = virus( - virus=virus_name, - has_proteins="polyprotein", - outfolder=outfolder - ) - + result = virus(virus=virus_name, has_proteins="polyprotein", outfolder=outfolder) + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) self.assertTrue(files["fasta"]["exists"], "FASTA file not created with has_proteins filter") - + # Should have some records (polyprotein is common in Zika) seq_count = self._count_fasta_sequences(files["fasta"]["path"]) self.assertGreater(seq_count, 0, "No sequences passed has_proteins filter") - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_with_genbank_metadata_retrieval(self): """Test that GenBank metadata retrieval works correctly. 
- + Downloads a single accession with genbank_metadata=True and verifies: - Function completes without errors - Standard files are created - GenBank metadata CSV file is created - + This catches: GenBank metadata retrieval bugs, batch processing issues. """ virus_name = "NC_045512.2" outfolder = self.test_output_dir - + result = virus( - virus=virus_name, - is_accession=True, - genbank_metadata=True, - genbank_batch_size=10, - outfolder=outfolder + virus=virus_name, is_accession=True, genbank_metadata=True, genbank_batch_size=10, outfolder=outfolder ) - + self.assertIsNone(result) - + files = self._check_output_files(virus_name, outfolder) self.assertTrue(files["fasta"]["exists"], "FASTA file not created with genbank_metadata") - + # Check for GenBank metadata file genbank_csv = os.path.join(outfolder, f"{virus_name}_genbank_metadata.csv") - self.assertTrue(os.path.exists(genbank_csv), - f"GenBank metadata CSV not created: {genbank_csv}") - + self.assertTrue(os.path.exists(genbank_csv), f"GenBank metadata CSV not created: {genbank_csv}") + # Verify GenBank CSV has data - self.assertGreater(os.path.getsize(genbank_csv), 0, - "GenBank metadata CSV is empty") + self.assertGreater(os.path.getsize(genbank_csv), 0, "GenBank metadata CSV is empty") # ========================================================================= # DATASETS CLI TESTS: Testing NCBI datasets CLI check and setup # ========================================================================= # These tests verify the datasets CLI detection and installation functionality - + def test_get_datasets_path_returns_valid_path(self): """Test that _get_datasets_path returns a valid path to the datasets CLI. - + The function should return a path to either: 1. The system-installed datasets CLI (if available) 2. The bundled datasets binary (fallback) - + This catches: Detection logic bugs, path resolution issues. 
""" # _get_datasets_path should always return a valid path # (either system CLI or bundled binary) datasets_path = _get_datasets_path() - + # Should return a non-empty string self.assertIsInstance(datasets_path, str) - self.assertTrue(len(datasets_path) > 0, - "_get_datasets_path should return a non-empty path") - + self.assertTrue(len(datasets_path) > 0, "_get_datasets_path should return a non-empty path") + # The returned path should be executable result = subprocess.run( [datasets_path, "--version"], @@ -1239,57 +1181,54 @@ def test_get_datasets_path_returns_valid_path(self): text=True, timeout=5, ) - self.assertEqual(result.returncode, 0, - f"datasets CLI at {datasets_path} should be executable") - + self.assertEqual(result.returncode, 0, f"datasets CLI at {datasets_path} should be executable") + def test_get_datasets_path_uses_bundled_binary(self): """Test that _get_datasets_path falls back to bundled binary when system CLI is missing. - + When the system-installed datasets CLI is not in PATH, the function should fall back to the bundled binary included with gget. - + Note: _get_datasets_path caches its result, so this test clears the cache before testing the fallback behavior. - + This catches: Bundled binary fallback logic, path resolution issues. 
""" import gget.gget_virus as gget_virus_module - + # Save original PATH and cache original_path = os.environ.get("PATH", "") original_cache = gget_virus_module._datasets_path_cache - + try: # Clear the cache to force re-detection _clear_datasets_cache() - + # Set PATH to empty to simulate system datasets not being found os.environ["PATH"] = "" - + # Should still return a valid path (to bundled binary) datasets_path = _get_datasets_path() - + # Should return a non-empty string path to bundled binary self.assertIsInstance(datasets_path, str) - self.assertTrue(len(datasets_path) > 0, - "Should return path to bundled binary") - + self.assertTrue(len(datasets_path) > 0, "Should return path to bundled binary") + # Path should contain 'bins' indicating bundled binary - self.assertIn("bins", datasets_path, - f"Path should be bundled binary, got: {datasets_path}") - + self.assertIn("bins", datasets_path, f"Path should be bundled binary, got: {datasets_path}") + finally: # Restore original PATH and cache os.environ["PATH"] = original_path gget_virus_module._datasets_path_cache = original_cache - + def test_datasets_cli_version_output(self): """Test that the datasets CLI returns a valid version string. - - When available (system or bundled), the datasets CLI should return a - version string that can be parsed. This helps ensure the CLI is properly + + When available (system or bundled), the datasets CLI should return a + version string that can be parsed. This helps ensure the CLI is properly functional. - + This catches: Corrupted installations, version parsing issues. 
""" # Use _get_datasets_path() to get either system or bundled binary @@ -1304,71 +1243,67 @@ def test_datasets_cli_version_output(self): cli_available = result.returncode == 0 except (FileNotFoundError, subprocess.TimeoutExpired, RuntimeError): cli_available = False - + if not cli_available: self.skipTest("NCBI datasets CLI not available - skipping version test") - + # Version output should not be empty and should contain version info version_output = result.stdout.strip() - self.assertTrue(len(version_output) > 0, - "Version output should not be empty") + self.assertTrue(len(version_output) > 0, "Version output should not be empty") # NCBI datasets typically outputs version like "datasets version: X.Y.Z" or just "X.Y.Z" self.assertTrue( any(char.isdigit() for char in version_output), - f"Version output should contain version numbers: {version_output}" + f"Version output should contain version numbers: {version_output}", ) - # ========================================================================= # MULTI-ACCESSION TESTS: Testing new multi-accession functionality # ========================================================================= # These tests verify the new multi-accession support added in recent commits - + def test_parse_accession_input_single(self): """Test parsing of single accession number. - + Tests _parse_accession_input with a single accession identifier and verifies: - Returns correct type ('single') - Accession value is preserved - is_file flag is False - + This catches: Single accession parsing bugs, input validation issues. 
""" - from gget.gget_virus import _parse_accession_input - - result = _parse_accession_input('NC_045512.2') - - self.assertEqual(result['type'], 'single', "Should identify single accession") - self.assertEqual(result['accessions'], 'NC_045512.2', "Should preserve accession value") - self.assertFalse(result['is_file'], "Single accession should not be marked as file") - self.assertIsNone(result['file_path'], "Single accession should have no file_path") - + + result = _parse_accession_input("NC_045512.2") + + self.assertEqual(result["type"], "single", "Should identify single accession") + self.assertEqual(result["accessions"], "NC_045512.2", "Should preserve accession value") + self.assertFalse(result["is_file"], "Single accession should not be marked as file") + self.assertIsNone(result["file_path"], "Single accession should have no file_path") + def test_parse_accession_input_space_separated(self): """Test parsing of space-separated accessions. - + Tests _parse_accession_input with space-separated accessions and verifies: - Returns correct type ('list') - Accessions list is created with correct count - All accessions are preserved without whitespace - is_file flag is False - + This catches: Space-separated parsing bugs, whitespace handling issues. 
""" - from gget.gget_virus import _parse_accession_input - - result = _parse_accession_input('NC_045512.2 MN908947.3 MT020781.1') - - self.assertEqual(result['type'], 'list', "Should identify list of accessions") - self.assertIsInstance(result['accessions'], list, "Should return list type") - self.assertEqual(len(result['accessions']), 3, "Should parse 3 accessions") - self.assertEqual(result['accessions'][0], 'NC_045512.2', "First accession should match") - self.assertEqual(result['accessions'][1], 'MN908947.3', "Second accession should match") - self.assertEqual(result['accessions'][2], 'MT020781.1', "Third accession should match") - self.assertFalse(result['is_file'], "Space-separated should not be marked as file") - + + result = _parse_accession_input("NC_045512.2 MN908947.3 MT020781.1") + + self.assertEqual(result["type"], "list", "Should identify list of accessions") + self.assertIsInstance(result["accessions"], list, "Should return list type") + self.assertEqual(len(result["accessions"]), 3, "Should parse 3 accessions") + self.assertEqual(result["accessions"][0], "NC_045512.2", "First accession should match") + self.assertEqual(result["accessions"][1], "MN908947.3", "Second accession should match") + self.assertEqual(result["accessions"][2], "MT020781.1", "Third accession should match") + self.assertFalse(result["is_file"], "Space-separated should not be marked as file") + def test_parse_accession_input_from_file(self): """Test parsing of accessions from a file. - + Tests _parse_accession_input with a file path and verifies: - Returns correct type ('file') - Accessions list is created from file content @@ -1376,214 +1311,204 @@ def test_parse_accession_input_from_file(self): - is_file flag is True - file_path is preserved - Empty lines are skipped - + This catches: File parsing bugs, whitespace/empty line issues, file I/O errors. 
""" - from gget.gget_virus import _parse_accession_input import tempfile - + # Create a temporary file with accessions - with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f: + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f: f.write("NC_045512.2\n") f.write(" MN908947.3 \n") # Test whitespace handling f.write("\n") # Empty line f.write("MT020781.1\n") temp_file = f.name - + try: result = _parse_accession_input(temp_file) - - self.assertEqual(result['type'], 'file', "Should identify file input") - self.assertIsInstance(result['accessions'], list, "Should return list type") - self.assertEqual(len(result['accessions']), 3, "Should parse 3 accessions (empty line skipped)") - self.assertEqual(result['accessions'][0], 'NC_045512.2', "First accession should match") - self.assertEqual(result['accessions'][1], 'MN908947.3', "Second accession should be stripped of whitespace") - self.assertEqual(result['accessions'][2], 'MT020781.1', "Third accession should match") - self.assertTrue(result['is_file'], "File input should be marked as file") - self.assertEqual(result['file_path'], temp_file, "File path should be preserved") + + self.assertEqual(result["type"], "file", "Should identify file input") + self.assertIsInstance(result["accessions"], list, "Should return list type") + self.assertEqual(len(result["accessions"]), 3, "Should parse 3 accessions (empty line skipped)") + self.assertEqual(result["accessions"][0], "NC_045512.2", "First accession should match") + self.assertEqual(result["accessions"][1], "MN908947.3", "Second accession should be stripped of whitespace") + self.assertEqual(result["accessions"][2], "MT020781.1", "Third accession should match") + self.assertTrue(result["is_file"], "File input should be marked as file") + self.assertEqual(result["file_path"], temp_file, "File path should be preserved") finally: os.unlink(temp_file) - + def test_parse_accession_input_empty_file_raises_error(self): """Test that 
parsing empty file raises ValueError. - + Tests _parse_accession_input with an empty file and verifies: - Raises ValueError - Error message is informative - + This catches: Empty file validation bugs, error handling issues. """ - from gget.gget_virus import _parse_accession_input import tempfile - + # Create an empty temporary file - with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f: + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f: temp_file = f.name - + try: with self.assertRaises(ValueError): _parse_accession_input(temp_file) finally: os.unlink(temp_file) - + def test_parse_accession_input_nonexistent_file_raises_error(self): """Test that parsing nonexistent file raises ValueError. - + Tests _parse_accession_input with a nonexistent file path and verifies: - Raises ValueError (not FileNotFoundError - treated as single accession) - + Note: A nonexistent file path will be treated as a single accession string since _parse_accession_input checks os.path.isfile() first. """ - from gget.gget_virus import _parse_accession_input - + # Nonexistent file path will be treated as single accession - result = _parse_accession_input('/nonexistent/file/path.txt') - + result = _parse_accession_input("/nonexistent/file/path.txt") + # Should treat it as a single accession string since file doesn't exist - self.assertEqual(result['type'], 'single', "Nonexistent file treated as single accession") - self.assertEqual(result['accessions'], '/nonexistent/file/path.txt', "Should preserve path as accession") - + self.assertEqual(result["type"], "single", "Nonexistent file treated as single accession") + self.assertEqual(result["accessions"], "/nonexistent/file/path.txt", "Should preserve path as accession") + def test_calculate_max_accessions_per_batch(self): """Test calculation of maximum accessions per batch. 
- + Tests _calculate_max_accessions_per_batch and verifies: - Returns positive integer - At least 1 accession per batch - Respects URL length limit - Smaller base URL allows more accessions - + This catches: Batch size calculation bugs, URL limit logic errors. """ - from gget.gget_virus import _calculate_max_accessions_per_batch, MAX_URL_LENGTH, BUFFER_SIZE, ACCESSION_AVG_LENGTH - + from gget.gget_virus import ACCESSION_AVG_LENGTH, BUFFER_SIZE, MAX_URL_LENGTH + # Test with different base URL lengths base_url_small = 50 base_url_large = 500 - + max_acc_small = _calculate_max_accessions_per_batch(base_url_small) max_acc_large = _calculate_max_accessions_per_batch(base_url_large) - + # Both should be positive integers self.assertIsInstance(max_acc_small, int, "Should return integer") self.assertIsInstance(max_acc_large, int, "Should return integer") self.assertGreater(max_acc_small, 0, "Should allow at least 1 accession") self.assertGreater(max_acc_large, 0, "Should allow at least 1 accession") - + # Larger base URL should allow fewer accessions - self.assertGreater(max_acc_small, max_acc_large, - "Smaller base URL should allow more accessions") - + self.assertGreater(max_acc_small, max_acc_large, "Smaller base URL should allow more accessions") + # Verify the calculation makes sense # With 2000 char limit, 200 char buffer, typical accession is 11 chars + 3 for %2C expected_rough = (MAX_URL_LENGTH - base_url_small - BUFFER_SIZE) // (ACCESSION_AVG_LENGTH + 3) self.assertEqual(max_acc_small, expected_rough, "Calculation should match expected formula") - + def test_batch_accessions_for_url(self): """Test batching of accessions for URL length limits. - + Tests _batch_accessions_for_url with large accession list and verifies: - Returns list of batches - All accessions are included - No duplicate accessions - Each batch respects URL limit - Batching is consistent - + This catches: Batching algorithm bugs, URL limit violations, data loss. 
""" - from gget.gget_virus import _batch_accessions_for_url, MAX_URL_LENGTH - + from gget.gget_virus import MAX_URL_LENGTH + # Create large list of accessions that will need multiple batches accessions = [f"NC_{100000 + i}.1" for i in range(1000)] base_url_length = 100 - + batches = _batch_accessions_for_url(accessions, base_url_length) - + # Should have multiple batches for 1000 accessions self.assertIsInstance(batches, list, "Should return list of batches") self.assertGreater(len(batches), 1, "Should split into multiple batches for 1000 accessions") - + # All accessions should be included all_batched = [acc for batch in batches for acc in batch] self.assertEqual(len(all_batched), len(accessions), "All accessions should be included") - + # No duplicates self.assertEqual(len(set(all_batched)), len(accessions), "Should not have duplicates") - + # Verify order is preserved self.assertEqual(all_batched, accessions, "Accession order should be preserved") - + # Verify each batch respects URL limit for batch_num, batch in enumerate(batches, 1): batch_url_length = base_url_length + sum(len(acc) + 3 for acc in batch) - self.assertLessEqual(batch_url_length, MAX_URL_LENGTH, - f"Batch {batch_num} exceeds URL limit ({batch_url_length} > {MAX_URL_LENGTH})") - + self.assertLessEqual( + batch_url_length, + MAX_URL_LENGTH, + f"Batch {batch_num} exceeds URL limit ({batch_url_length} > {MAX_URL_LENGTH})", + ) + @retry_on_network_error(max_retries=3, delay=5) def test_virus_multi_accession_space_separated(self): """Test virus function with space-separated accessions. - + Tests the virus() function with --is_accession flag and space-separated accessions and verifies: - Function completes without errors - Command summary is created (shows processing happened) - Function doesn't crash on multi-accession input - + This catches: Multi-accession parsing bugs, integration issues with virus() function. - + Note: API may return 0 results for some accession combinations, which is acceptable. 
The key is that the command processes without crashing. """ outfolder = self.test_output_dir - + # Test with space-separated accessions - result = virus( - virus='MN908947.3 NC_045512.2', - is_accession=True, - outfolder=outfolder - ) - + result = virus(virus="MN908947.3 NC_045512.2", is_accession=True, outfolder=outfolder) + # Function should complete successfully self.assertIsNone(result) - + # Command summary should be created - summary_files = [f for f in os.listdir(outfolder) if f.startswith('command_summary')] + summary_files = [f for f in os.listdir(outfolder) if f.startswith("command_summary")] self.assertGreater(len(summary_files), 0, "Command summary should be created") - + @retry_on_network_error(max_retries=3, delay=5) def test_virus_multi_accession_file_input(self): """Test virus function with file-based accessions. - + Tests the virus() function with --is_accession flag and file input and verifies: - Function completes without errors - Command summary is created - Correctly reads accessions from file - + This catches: File reading bugs, multi-accession file processing issues. 
""" - import tempfile - + outfolder = self.test_output_dir - + # Create temporary accessions file - accessions_file = os.path.join(outfolder, 'test_accessions.txt') - with open(accessions_file, 'w') as f: + accessions_file = os.path.join(outfolder, "test_accessions.txt") + with open(accessions_file, "w") as f: f.write("MN908947.3\n") f.write("NC_045512.2\n") - + # Test with file input - result = virus( - virus=accessions_file, - is_accession=True, - outfolder=outfolder - ) - + result = virus(virus=accessions_file, is_accession=True, outfolder=outfolder) + # Function should complete successfully self.assertIsNone(result) - + # Command summary should be created - summary_files = [f for f in os.listdir(outfolder) if f.startswith('command_summary')] + summary_files = [f for f in os.listdir(outfolder) if f.startswith("command_summary")] self.assertGreater(len(summary_files), 0, "Command summary should be created for file input") - + # Clean up if os.path.exists(accessions_file): os.unlink(accessions_file) @@ -1592,36 +1517,34 @@ def test_virus_multi_accession_file_input(self): # EXPONENTIAL BACKOFF HELPER FUNCTION TESTS # ========================================================================= # These tests verify the core retry logic without making real API calls - + def test_retry_helper_successful_operation(self): """Test successful operation on first attempt (no retries needed).""" - from gget.gget_virus import _retry_with_exponential_backoff - + def successful_op(): return {"result": "success"} - + success, result, error_info = _retry_with_exponential_backoff( operation_name="test_success", operation_func=successful_op, ) - + self.assertTrue(success, "Expected success=True") self.assertEqual(result, {"result": "success"}, "Expected correct result") self.assertIsNone(error_info, "Expected no error_info on success") - + def test_retry_helper_success_after_retry(self): """Test operation that fails once then succeeds.""" import requests - from gget.gget_virus import 
_retry_with_exponential_backoff - + attempt_count = [0] # Use list to allow modification in nested function - + def flaky_op(): attempt_count[0] += 1 if attempt_count[0] == 1: raise requests.exceptions.ConnectionError("Temporary connection issue") return {"result": "succeeded after retry"} - + start_time = time.time() success, result, error_info = _retry_with_exponential_backoff( operation_name="test_flaky", @@ -1632,27 +1555,26 @@ def flaky_op(): retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError), ) elapsed = time.time() - start_time - + self.assertTrue(success, "Expected success=True after retry") self.assertEqual(result, {"result": "succeeded after retry"}, "Expected correct result") self.assertEqual(attempt_count[0], 2, f"Expected 2 attempts, got {attempt_count[0]}") self.assertGreaterEqual(elapsed, 0.05, f"Expected at least 0.05s delay, got {elapsed}s") - + def test_retry_helper_exponential_backoff_timing(self): """Test that exponential backoff increases delays properly.""" import requests - from gget.gget_virus import _retry_with_exponential_backoff - + attempt_count = [0] - + def always_fails(): attempt_count[0] += 1 raise requests.exceptions.ConnectionError("Persistent connection issue") - + initial_delay = 0.05 backoff_multiplier = 2.0 max_retries = 3 - + start_time = time.time() success, result, error_info = _retry_with_exponential_backoff( operation_name="test_backoff", @@ -1663,24 +1585,24 @@ def always_fails(): retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError), ) elapsed = time.time() - start_time - + # The loop runs max_retries times with delays between attempts expected_min_delay = initial_delay * (1 + backoff_multiplier) - + self.assertFalse(success, "Expected success=False when all retries fail") self.assertEqual(attempt_count[0], max_retries, f"Expected {max_retries} attempts") - self.assertGreaterEqual(elapsed, expected_min_delay * 0.8, - f"Delay too short: {elapsed}s 
vs {expected_min_delay}s") - + self.assertGreaterEqual( + elapsed, expected_min_delay * 0.8, f"Delay too short: {elapsed}s vs {expected_min_delay}s" + ) + def test_retry_helper_failed_commands_tracking(self): """Test that failed_commands dict is properly populated.""" - from gget.gget_virus import _retry_with_exponential_backoff - + def failing_op(): raise ConnectionError("Test error message") - + failed_commands = {"custom_errors": []} - + success, result, error_info = _retry_with_exponential_backoff( operation_name="test_tracking", operation_func=failing_op, @@ -1688,23 +1610,22 @@ def failing_op(): initial_delay=0.01, failed_commands=failed_commands, ) - + self.assertFalse(success, "Expected operation to fail") self.assertIsNotNone(error_info, "Expected error_info to be populated") self.assertIn("exception_type", error_info, "Expected exception_type in error_info") self.assertIn("error", error_info, "Expected error message in error_info") - + def test_retry_helper_non_retryable_exception(self): """Test that non-retryable exceptions fail immediately.""" import requests - from gget.gget_virus import _retry_with_exponential_backoff - + attempt_count = [0] - + def non_retryable_op(): attempt_count[0] += 1 raise ValueError("This exception is not retryable") - + start_time = time.time() success, result, error_info = _retry_with_exponential_backoff( operation_name="test_non_retryable", @@ -1714,24 +1635,23 @@ def non_retryable_op(): retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError), ) elapsed = time.time() - start_time - + self.assertFalse(success, "Expected operation to fail") self.assertEqual(attempt_count[0], 1, f"Expected only 1 attempt, got {attempt_count[0]}") self.assertLess(elapsed, 0.1, f"Expected immediate failure, but took {elapsed:.2f}s") - + def test_retry_helper_custom_retryable_exceptions(self): """Test with custom retryable exceptions.""" import requests - from gget.gget_virus import 
_retry_with_exponential_backoff - + attempt_count = [0] - + def custom_failing_op(): attempt_count[0] += 1 if attempt_count[0] == 1: raise requests.exceptions.Timeout("Request timed out") return {"result": "success"} - + success, result, error_info = _retry_with_exponential_backoff( operation_name="test_custom_retryable", operation_func=custom_failing_op, @@ -1739,7 +1659,7 @@ def custom_failing_op(): initial_delay=0.01, retryable_exceptions=(requests.exceptions.Timeout, requests.exceptions.ConnectionError), ) - + self.assertTrue(success, "Expected retry to succeed with Timeout in retryable_exceptions") self.assertEqual(attempt_count[0], 2, f"Expected 2 attempts, got {attempt_count[0]}") @@ -1981,7 +1901,7 @@ def test_clean_xml_declarations(self): def test_clean_xml_declarations_no_declarations(self): """Test _clean_xml_declarations with no declarations to remove.""" - xml = 'data' + xml = "data" result = _clean_xml_declarations(xml) self.assertEqual(result, xml) @@ -2047,7 +1967,7 @@ def test_force_garbage_collection_runs(self): def test_parse_baseline_file_csv(self): """Test _parse_baseline_file with CSV format.""" - with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: f.write("accession,length,host\n") f.write("NC_045512.2,29903,human\n") f.write("MN908947.3,29903,human\n") @@ -2063,7 +1983,7 @@ def test_parse_baseline_file_csv(self): def test_parse_baseline_file_jsonl(self): """Test _parse_baseline_file with JSONL format.""" - with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f: f.write('{"accession": "NC_045512.2", "length": 29903}\n') f.write('{"accession": "MN908947.3", "length": 29903}\n') path = f.name @@ -2076,12 +1996,10 @@ def test_parse_baseline_file_jsonl(self): def test_parse_baseline_file_json(self): """Test _parse_baseline_file with JSON 
array format.""" - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: import json as json_mod - json_mod.dump([ - {"accession": "NC_045512.2"}, - {"accession": "MN908947.3"} - ], f) + + json_mod.dump([{"accession": "NC_045512.2"}, {"accession": "MN908947.3"}], f) path = f.name try: result = _parse_baseline_file(path) @@ -2091,7 +2009,7 @@ def test_parse_baseline_file_json(self): def test_parse_baseline_file_text(self): """Test _parse_baseline_file with plain text format.""" - with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f: f.write("NC_045512.2\n") f.write("MN908947.3\n") f.write("# comment line\n") @@ -2112,7 +2030,7 @@ def test_parse_baseline_file_nonexistent_raises(self): def test_parse_baseline_file_empty_raises(self): """Test _parse_baseline_file raises for empty file.""" - with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: f.write("accession\n") # header only, no data path = f.name try: @@ -2196,10 +2114,12 @@ def test_merge_baseline_with_new_csv(self): baseline_path = os.path.join(tmpdir, "baseline.csv") output_path = os.path.join(tmpdir, "merged.csv") # Create baseline CSV - pd.DataFrame([ - {"accession": "ACC1", "length": 100}, - {"accession": "ACC2", "length": 200}, - ]).to_csv(baseline_path, index=False) + pd.DataFrame( + [ + {"accession": "ACC1", "length": 100}, + {"accession": "ACC2", "length": 200}, + ] + ).to_csv(baseline_path, index=False) # New metadata new_records = [ {"accession": "ACC3", "length": 300}, @@ -2215,9 +2135,11 @@ def test_merge_baseline_with_new_deduplicates(self): with tempfile.TemporaryDirectory() as tmpdir: baseline_path = os.path.join(tmpdir, "baseline.csv") output_path = os.path.join(tmpdir, "merged.csv") - 
pd.DataFrame([ - {"accession": "ACC1", "length": 100}, - ]).to_csv(baseline_path, index=False) + pd.DataFrame( + [ + {"accession": "ACC1", "length": 100}, + ] + ).to_csv(baseline_path, index=False) new_records = [{"accession": "ACC1", "length": 999}] _merge_baseline_with_new(baseline_path, new_records, output_path) df = pd.read_csv(output_path) @@ -2286,10 +2208,24 @@ def test_load_metadata_from_api_reports_empty(self): def test_load_metadata_from_api_reports_multiple(self): """Test load_metadata_from_api_reports with multiple reports.""" api_reports = [ - {"accession": "ACC1", "length": 100, "completeness": "COMPLETE", - "host": {}, "location": {}, "isolate": {}, "virus": {}}, - {"accession": "ACC2", "length": 200, "completeness": "PARTIAL", - "host": {}, "location": {}, "isolate": {}, "virus": {}}, + { + "accession": "ACC1", + "length": 100, + "completeness": "COMPLETE", + "host": {}, + "location": {}, + "isolate": {}, + "virus": {}, + }, + { + "accession": "ACC2", + "length": 200, + "completeness": "PARTIAL", + "host": {}, + "location": {}, + "isolate": {}, + "virus": {}, + }, ] result = load_metadata_from_api_reports(api_reports) self.assertEqual(len(result), 2) @@ -2424,9 +2360,7 @@ def test_filter_metadata_only_source_database(self): def test_filter_metadata_only_collection_date_range(self): """Test filter_metadata_only with collection date range.""" meta = self._make_test_metadata() - accs, metas, _ = filter_metadata_only( - meta, min_collection_date="2021-01-01", max_collection_date="2021-12-31" - ) + accs, metas, _ = filter_metadata_only(meta, min_collection_date="2021-01-01", max_collection_date="2021-12-31") self.assertEqual(len(accs), 1) self.assertIn("ACC3", accs) @@ -2670,9 +2604,7 @@ def test_filter_genbank_metadata_env_source(self): def test_filter_genbank_metadata_combined(self): """Test filter_genbank_metadata with multiple filters.""" meta = self._make_genbank_metadata() - result, _ = filter_genbank_metadata( - meta, min_gene_count=5, 
genotype="H5N1", has_proteins="hemagglutinin" - ) + result, _ = filter_genbank_metadata(meta, min_gene_count=5, genotype="H5N1", has_proteins="hemagglutinin") self.assertEqual(len(result), 2) self.assertIn("ACC1", result) self.assertIn("ACC3", result) @@ -2690,9 +2622,7 @@ def test_filter_cached_no_filters(self): def test_filter_cached_host_not_in_strategy(self): """Test filter_cached_metadata_for_unused_filters applies host when not in strategy.""" meta = self._make_test_metadata() - accs, metas = filter_cached_metadata_for_unused_filters( - meta, host="Homo sapiens", applied_strategy_filters=[] - ) + accs, metas = filter_cached_metadata_for_unused_filters(meta, host="Homo sapiens", applied_strategy_filters=[]) self.assertEqual(len(accs), 2) self.assertIn("ACC1", accs) self.assertIn("ACC3", accs) @@ -2709,17 +2639,13 @@ def test_filter_cached_host_in_strategy_skipped(self): def test_filter_cached_complete_only(self): """Test filter_cached_metadata_for_unused_filters with complete_only.""" meta = self._make_test_metadata() - accs, metas = filter_cached_metadata_for_unused_filters( - meta, complete_only=True, applied_strategy_filters=[] - ) + accs, metas = filter_cached_metadata_for_unused_filters(meta, complete_only=True, applied_strategy_filters=[]) self.assertEqual(len(accs), 2) # ACC1 and ACC3 are complete def test_filter_cached_annotated(self): """Test filter_cached_metadata_for_unused_filters with annotated.""" meta = self._make_test_metadata() - accs, metas = filter_cached_metadata_for_unused_filters( - meta, annotated=True, applied_strategy_filters=[] - ) + accs, metas = filter_cached_metadata_for_unused_filters(meta, annotated=True, applied_strategy_filters=[]) self.assertEqual(len(accs), 2) self.assertNotIn("ACC2", accs) @@ -2735,9 +2661,7 @@ def test_filter_cached_geographic_location(self): def test_filter_cached_refseq_only(self): """Test filter_cached_metadata_for_unused_filters with refseq_only.""" meta = self._make_test_metadata() - accs, metas = 
filter_cached_metadata_for_unused_filters( - meta, refseq_only=True, applied_strategy_filters=[] - ) + accs, metas = filter_cached_metadata_for_unused_filters(meta, refseq_only=True, applied_strategy_filters=[]) self.assertEqual(len(accs), 1) self.assertIn("ACC2", accs) @@ -2764,7 +2688,7 @@ def test_filter_metadata_only_nuc_completeness_partial(self): def test_filter_metadata_only_annotated_true(self): """Test filter_metadata_only with annotated=True passes all (handled server-side). - + Note: annotated=True is handled server-side by the API, so the client-side filter_metadata_only does NOT filter on annotated=True. All records pass. """ @@ -2779,9 +2703,10 @@ def test_filter_metadata_only_annotated_true(self): def test_write_fasta_record_with_description(self): """Test _write_fasta_record writes correct FASTA format with description.""" - from gget.utils import FastaRecord import io + from gget.utils import FastaRecord + record = FastaRecord(seq="ATCGATCGATCG", id="ACC001", description="Test virus isolate") handle = io.StringIO() _write_fasta_record(handle, record) @@ -2792,9 +2717,10 @@ def test_write_fasta_record_with_description(self): def test_write_fasta_record_without_description(self): """Test _write_fasta_record writes correct FASTA format without description.""" - from gget.utils import FastaRecord import io + from gget.utils import FastaRecord + record = FastaRecord(seq="ATCG", id="ACC002", description="") handle = io.StringIO() _write_fasta_record(handle, record) @@ -2805,15 +2731,16 @@ def test_write_fasta_record_without_description(self): def test_write_fasta_record_long_sequence_wraps(self): """Test _write_fasta_record wraps long sequences at 70 characters.""" - from gget.utils import FastaRecord import io + from gget.utils import FastaRecord + # Create a sequence longer than 70 characters long_seq = "A" * 150 record = FastaRecord(seq=long_seq, id="ACC003", description="") handle = io.StringIO() _write_fasta_record(handle, record) - lines = 
handle.getvalue().strip().split('\n') + lines = handle.getvalue().strip().split("\n") # First line is header, then sequence lines self.assertEqual(lines[0], ">ACC003") @@ -2831,7 +2758,7 @@ def test_stream_copy_fasta_all_records(self): input_path = os.path.join(tmpdir, "input.fasta") output_path = os.path.join(tmpdir, "output.fasta") - with open(input_path, 'w') as f: + with open(input_path, "w") as f: f.write(">ACC1\nATCGATCG\n>ACC2\nGGGGAAAA\n>ACC3\nTTTTCCCC\n") count = _stream_copy_fasta(input_path, output_path) @@ -2840,7 +2767,7 @@ def test_stream_copy_fasta_all_records(self): # Verify output has all 3 records with open(output_path) as f: - headers = [l for l in f if l.startswith('>')] + headers = [l for l in f if l.startswith(">")] self.assertEqual(len(headers), 3) def test_stream_copy_fasta_with_accession_filter(self): @@ -2849,7 +2776,7 @@ def test_stream_copy_fasta_with_accession_filter(self): input_path = os.path.join(tmpdir, "input.fasta") output_path = os.path.join(tmpdir, "output.fasta") - with open(input_path, 'w') as f: + with open(input_path, "w") as f: f.write(">ACC1\nATCGATCG\n>ACC2\nGGGGAAAA\n>ACC3\nTTTTCCCC\n") count = _stream_copy_fasta(input_path, output_path, accession_set={"ACC1", "ACC3"}) @@ -2872,7 +2799,7 @@ def test_filter_sequences_max_ambiguous_chars(self): output_path = os.path.join(tmpdir, "filtered.fasta") # ACC1 has 0 N's, ACC2 has 5 N's, ACC3 has 20 N's - with open(fasta_path, 'w') as f: + with open(fasta_path, "w") as f: f.write(">ACC1\nATCGATCGATCG\n") f.write(">ACC2\nATNNNNNCG\n") f.write(">ACC3\n" + "N" * 20 + "\n") @@ -2884,13 +2811,14 @@ def test_filter_sequences_max_ambiguous_chars(self): } count, filtered_meta, protein_headers, stats = filter_sequences( - fasta_path, metadata_dict, + fasta_path, + metadata_dict, max_ambiguous_chars=10, output_fasta_path=output_path, ) self.assertEqual(count, 2) # ACC1 and ACC2 pass - self.assertEqual(stats['ambiguous_chars'], 1) # ACC3 filtered out + 
self.assertEqual(stats["ambiguous_chars"], 1) # ACC3 filtered out def test_filter_sequences_no_filters(self): """Test filter_sequences passes all records when no filters applied.""" @@ -2898,7 +2826,7 @@ def test_filter_sequences_no_filters(self): fasta_path = os.path.join(tmpdir, "test.fasta") output_path = os.path.join(tmpdir, "filtered.fasta") - with open(fasta_path, 'w') as f: + with open(fasta_path, "w") as f: f.write(">ACC1\nATCGATCG\n>ACC2\nGGGGAAAA\n") metadata_dict = { @@ -2907,7 +2835,8 @@ def test_filter_sequences_no_filters(self): } count, filtered_meta, protein_headers, stats = filter_sequences( - fasta_path, metadata_dict, + fasta_path, + metadata_dict, output_fasta_path=output_path, ) @@ -2919,7 +2848,7 @@ def test_filter_sequences_proteins_complete(self): fasta_path = os.path.join(tmpdir, "test.fasta") output_path = os.path.join(tmpdir, "filtered.fasta") - with open(fasta_path, 'w') as f: + with open(fasta_path, "w") as f: f.write(">ACC1\nATCGATCG\n>ACC2\nGGGGAAAA\n") metadata_dict = { @@ -2928,13 +2857,14 @@ def test_filter_sequences_proteins_complete(self): } count, filtered_meta, protein_headers, stats = filter_sequences( - fasta_path, metadata_dict, + fasta_path, + metadata_dict, proteins_complete=True, output_fasta_path=output_path, ) self.assertEqual(count, 1) # Only ACC1 has proteins - self.assertEqual(stats['proteins'], 1) # ACC2 filtered out + self.assertEqual(stats["proteins"], 1) # ACC2 filtered out # ========================================================================= # SAVE COMMAND SUMMARY TESTS @@ -2996,14 +2926,18 @@ def test_merge_metadata_csvs_fills_missing(self): standard_path = os.path.join(tmpdir, "standard.csv") # GenBank CSV with missing host - pd.DataFrame([ - {"accession": "ACC1", "Host": "", "Length": "29903"}, - ]).to_csv(genbank_path, index=False) + pd.DataFrame( + [ + {"accession": "ACC1", "Host": "", "Length": "29903"}, + ] + ).to_csv(genbank_path, index=False) # Standard CSV with host data - pd.DataFrame([ - 
{"accession": "ACC1", "Host": "Homo sapiens", "Length": "29903"}, - ]).to_csv(standard_path, index=False) + pd.DataFrame( + [ + {"accession": "ACC1", "Host": "Homo sapiens", "Length": "29903"}, + ] + ).to_csv(standard_path, index=False) result = merge_metadata_csvs(genbank_path, standard_path) self.assertTrue(result) @@ -3027,13 +2961,17 @@ def test_merge_metadata_csvs_no_overwrite(self): genbank_path = os.path.join(tmpdir, "genbank.csv") standard_path = os.path.join(tmpdir, "standard.csv") - pd.DataFrame([ - {"accession": "ACC1", "Host": "chicken", "Length": "29903"}, - ]).to_csv(genbank_path, index=False) + pd.DataFrame( + [ + {"accession": "ACC1", "Host": "chicken", "Length": "29903"}, + ] + ).to_csv(genbank_path, index=False) - pd.DataFrame([ - {"accession": "ACC1", "Host": "human", "Length": "29903"}, - ]).to_csv(standard_path, index=False) + pd.DataFrame( + [ + {"accession": "ACC1", "Host": "human", "Length": "29903"}, + ] + ).to_csv(standard_path, index=False) merge_metadata_csvs(genbank_path, standard_path) @@ -3202,7 +3140,7 @@ def test_genbank_xml_to_csv_basic(self): """ - with open(xml_path, 'w') as f: + with open(xml_path, "w") as f: f.write(xml_content) _genbank_xml_to_csv(xml_path, csv_path) @@ -3241,9 +3179,7 @@ def test_save_genbank_metadata_to_csv_basic(self): "assembly_name": "ASM985889v3", "taxonomy": "Viruses; Riboviria", "comment": "", - "references": [ - {"title": "Paper", "authors": "Wu F", "journal": "Nature", "pubmed_id": "123"} - ], + "references": [{"title": "Paper", "authors": "Wu F", "journal": "Nature", "pubmed_id": "123"}], }, }, } @@ -3267,5 +3203,5 @@ def test_save_genbank_metadata_to_csv_empty(self): self.assertEqual(len(df), 0) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main()